// Anime4K 3D Graphics Anti-Aliased Upscaler (Ultra Small)
// Based on Anime4K project by bloc97

//!BGFX EFFECT
//!VERSION 1
//!NAME Anime4K 3D AA Upscale US
//!CATEGORY Anime4K
//!DESCRIPTION 2x upscaler with anti-aliasing for 3D graphics content. Ultra small variant offers fastest performance with basic quality.
//!USE MulAdd

// Working-precision aliases used by every pass below. The MF* names resolve to
// 16-bit float types when BG_FP16 is defined and to the regular 32-bit float
// types otherwise, so the network code is written once for both precisions.
#ifdef BG_FP16
// Compiler warning 3557 is silenced for the 16-bit build only.
// NOTE(review): presumably the "loop only executes for N iteration(s), forcing
// loop to unroll" warning - confirm against the shader compiler in use.
#pragma warning(disable: 3557)
#define MF float16_t
#define MF2 float16_t2
#define MF3 float16_t3
#define MF4 float16_t4
#define MF3x4 float16_t3x4
#define MF4x4 float16_t4x4
#else
// Full-precision fallback: the aliases map one-to-one onto the float types.
#define MF float
#define MF2 float2
#define MF3 float3
#define MF4 float4
#define MF3x4 float3x4
#define MF4x4 float4x4
#endif


// Source image. Read by Pass 1 (per-texel gathers) and by Pass 3 (linear-filtered
// base colour).
//!TEXTURE
Texture2D INPUT;

// Final image, declared at twice the input width and height. Written by Pass 3.
//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;

// Point-filtered sampler: used for every gather and for the tex2 neighbour
// fetches in Pass 3.
//!SAMPLER
//!FILTER POINT
SamplerState sam;

// Linear-filtered sampler: used only in Pass 3 to read INPUT at output-pixel
// positions.
//!SAMPLER
//!FILTER LINEAR
SamplerState sam1;

// Intermediate 4-channel feature map at input resolution.
// Written by Pass 1, read by Pass 2.
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R16G16B16A16_FLOAT
Texture2D tex1;

// Intermediate 4-channel feature map at input resolution.
// Written by Pass 2, read by Pass 3.
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R16G16B16A16_FLOAT
Texture2D tex2;


//!COMMON

// Multiply-accumulate for a 3-component input: returns v * m + a, where v is
// treated as a row vector and each row of m is scaled by one component of v.
//   v - input vector (3 components)
//   m - 3x4 weight matrix, one row per input component
//   a - running accumulator / bias that the product is added to
MF4 MulAdd(MF3 v, MF3x4 m, MF4 a) {
	// Rows are accumulated in order (0, 1, 2) and `a` is added last.
	MF4 acc = v.x * m[0];
	acc = acc + v.y * m[1];
	acc = acc + v.z * m[2];
	return acc + a;
}

// Multiply-accumulate for a 4-component input: returns v * m + a, where v is
// treated as a row vector and each row of m is scaled by one component of v.
//   v - input vector (4 components)
//   m - 4x4 weight matrix, one row per input component
//   a - running accumulator / bias that the product is added to
MF4 MulAdd(MF4 v, MF4x4 m, MF4 a) {
	// Rows are accumulated in order (0, 1, 2, 3) and `a` is added last.
	MF4 acc = v.x * m[0];
	acc = acc + v.y * m[1];
	acc = acc + v.z * m[2];
	acc = acc + v.w * m[3];
	return acc + a;
}


//!PASS 1
//!DESC Feature Extraction Conv-4x3x3x3
//!IN INPUT
//!OUT tex1
//!BLOCK_SIZE 16
//!NUM_THREADS 64

// Pass 1: 3x3 convolution from the RGB of INPUT to the 4-channel map tex1.
//
// Each thread owns a 2x2 group of tex1 texels whose top-left corner is `origin`.
// It first loads the 4x4 INPUT neighbourhood covering all four 3x3 windows
// (four gathers per colour channel), then evaluates the convolution once per
// texel in the group.
//   blockStart - top-left texel of the block this thread group processes
//   threadId   - thread index inside the group; only .x is used
void Pass1(uint2 blockStart, uint3 threadId) {
	// TileSwizzle8x8 is defined outside this file; presumably it maps the linear
	// thread index to a 2D position inside an 8x8 tile. Doubling it gives each
	// thread its own 2x2 footprint.
	const uint2 origin = (TileSwizzle8x8(threadId.x) << 1) + blockStart;
	const uint2 inputSize = GetInputSize();
	if (any(origin >= inputSize)) {
		return;
	}

	// NOTE(review): GetInputPt() is used as the size of one input texel in
	// texture coordinates - confirm against its definition.
	const float2 inputPt = GetInputPt();
	uint gx, gy, ox, oy;

	// taps[x][y] holds the INPUT colour at texel origin + (x - 1, y - 1).
	MF3 taps[4][4];
	[unroll]
	for (gx = 0; gx < 4; gx += 2) {
		[unroll]
		for (gy = 0; gy < 4; gy += 2) {
			// Sampling at a texel corner makes the gather return the four texels
			// that meet at that corner.
			const float2 corner = (origin + uint2(gx, gy)) * inputPt;
			const MF4 reds = INPUT.GatherRed(sam, corner);
			const MF4 greens = INPUT.GatherGreen(sam, corner);
			const MF4 blues = INPUT.GatherBlue(sam, corner);

			// Gather component order: w = top-left, z = top-right,
			// x = bottom-left, y = bottom-right.
			taps[gx][gy] = MF3(reds.w, greens.w, blues.w);
			taps[gx + 1][gy] = MF3(reds.z, greens.z, blues.z);
			taps[gx][gy + 1] = MF3(reds.x, greens.x, blues.x);
			taps[gx + 1][gy + 1] = MF3(reds.y, greens.y, blues.y);
		}
	}

	[unroll]
	for (ox = 0; ox < 2; ++ox) {
		[unroll]
		for (oy = 0; oy < 2; ++oy) {
			const uint2 outPos = origin + uint2(ox, oy);

			// The (0, 0) texel was already validated by the early return above;
			// the other three may fall outside when the size is odd.
			const bool isAnchor = (ox == 0 && oy == 0);
			if (!isAnchor && any(outPos >= inputSize)) {
				continue;
			}

			// Bias, then one 3x4 weight matrix per tap of the 3x3 window
			// (x offset outermost, y offset innermost).
			MF4 acc = MF4(0.025272772, 0.014345055, -0.009859513, 0.000597734);
			acc = MulAdd(taps[ox][oy], MF3x4(0.10005958, 0.30363804, -0.24045889, -0.003466652, 0.25860623, 0.47408342, -0.58965975, 0.058167808, 0.17228158, 0.43657768, -0.3982826, -0.022539442), acc);
			acc = MulAdd(taps[ox][oy + 1], MF3x4(-0.23593923, 0.4692322, 0.04355681, 0.009586428, -0.37485301, 0.5885971, 0.3236714, -0.08301241, -0.3188667, 0.5608897, 0.3396368, 0.059106056), acc);
			acc = MulAdd(taps[ox][oy + 2], MF3x4(-0.15485556, -0.11745722, 0.042440087, 0.5313071, -0.24682014, 0.00033858762, -0.08202063, 0.84100145, -0.15803772, -0.11368423, -0.09765383, 0.6991758), acc);
			acc = MulAdd(taps[ox + 1][oy], MF3x4(0.21323937, 0.07442176, -0.10949712, -0.05313448, 0.44871446, 0.16815953, 0.07202329, -0.05763504, 0.12998791, 0.06934043, 0.044557367, -0.00978054), acc);
			acc = MulAdd(taps[ox + 1][oy + 1], MF3x4(0.40295616, -0.7156766, 0.7321813, -0.54544497, 0.44781828, -1.1244348, 0.7786728, -0.91297877, 0.52567977, -0.81486106, 0.56867415, -0.68681335), acc);
			acc = MulAdd(taps[ox + 1][oy + 2], MF3x4(0.020084642, -0.072761856, -0.13040084, 0.063976064, 0.18822637, -0.096821584, -0.06842927, 0.18078656, 0.05295053, -0.18540566, -0.1239999, 0.0156137515), acc);
			acc = MulAdd(taps[ox + 2][oy], MF3x4(-0.6254935, 0.0074730455, 0.21930416, 0.028796878, -0.82789946, 0.051125027, 0.25597844, 0.049207535, -0.68400925, -0.015768895, 0.233402, 0.021760475), acc);
			acc = MulAdd(taps[ox + 2][oy + 1], MF3x4(0.21823564, -0.15992375, -0.14845636, -0.031485636, 0.13821888, -0.27466524, -0.094343, -0.07067512, 0.20875643, -0.20346795, -0.12910774, -0.052383807), acc);
			acc = MulAdd(taps[ox + 2][oy + 2], MF3x4(0.001368614, 0.17603171, -0.36661625, -0.0043979343, 0.1381601, 0.27952382, -0.6743216, 0.0067374213, -0.023204552, 0.21662682, -0.3795221, -0.025739884), acc);

			tex1[outPos] = acc;
		}
	}
}


//!PASS 2
//!DESC Feature Processing Conv-4x3x3x8
//!IN tex1
//!OUT tex2
//!BLOCK_SIZE 16
//!NUM_THREADS 64

// Pass 2: 3x3 convolution from tex1 to tex2, both 4 channels wide.
//
// Every tex1 value contributes twice: once through its positive part max(x, 0)
// and once through its negated negative part max(-x, 0), each with its own
// weight matrix (hence eight effective input channels per tap).
// Each thread owns a 2x2 group of tex2 texels whose top-left corner is `origin`.
//   blockStart - top-left texel of the block this thread group processes
//   threadId   - thread index inside the group; only .x is used
void Pass2(uint2 blockStart, uint3 threadId) {
	// TileSwizzle8x8 is defined outside this file; presumably it maps the linear
	// thread index to a 2D position inside an 8x8 tile.
	const uint2 origin = (TileSwizzle8x8(threadId.x) << 1) + blockStart;
	const uint2 inputSize = GetInputSize();
	if (any(origin >= inputSize)) {
		return;
	}

	// NOTE(review): GetInputPt() is used as the size of one input texel in
	// texture coordinates - confirm against its definition.
	const float2 inputPt = GetInputPt();
	uint gx, gy, ox, oy;

	// taps[x][y] holds the tex1 value at texel origin + (x - 1, y - 1).
	MF4 taps[4][4];
	[unroll]
	for (gx = 0; gx < 4; gx += 2) {
		[unroll]
		for (gy = 0; gy < 4; gy += 2) {
			// Sampling at a texel corner makes the gather return the four texels
			// that meet at that corner.
			const float2 corner = (origin + uint2(gx, gy)) * inputPt;
			const MF4 ch0 = tex1.GatherRed(sam, corner);
			const MF4 ch1 = tex1.GatherGreen(sam, corner);
			const MF4 ch2 = tex1.GatherBlue(sam, corner);
			const MF4 ch3 = tex1.GatherAlpha(sam, corner);

			// Gather component order: w = top-left, z = top-right,
			// x = bottom-left, y = bottom-right.
			taps[gx][gy] = MF4(ch0.w, ch1.w, ch2.w, ch3.w);
			taps[gx + 1][gy] = MF4(ch0.z, ch1.z, ch2.z, ch3.z);
			taps[gx][gy + 1] = MF4(ch0.x, ch1.x, ch2.x, ch3.x);
			taps[gx + 1][gy + 1] = MF4(ch0.y, ch1.y, ch2.y, ch3.y);
		}
	}

	[unroll]
	for (ox = 0; ox < 2; ++ox) {
		[unroll]
		for (oy = 0; oy < 2; ++oy) {
			const uint2 outPos = origin + uint2(ox, oy);

			// The (0, 0) texel was already validated by the early return above;
			// the other three may fall outside when the size is odd.
			const bool isAnchor = (ox == 0 && oy == 0);
			if (!isAnchor && any(outPos >= inputSize)) {
				continue;
			}

			// Bias, then the nine positive-part taps followed by the nine
			// negative-part taps (x offset outermost, y offset innermost).
			MF4 acc = MF4(-0.012601902, -0.0121468, -0.027073797, -0.0223602);
			acc = MulAdd(max(taps[ox][oy], 0), MF4x4(-0.08796357, 0.028130328, 0.073414765, -0.029320398, -0.07826724, 0.012752971, 0.06304871, 0.082551956, -0.052348416, 0.010077275, 0.0803755, 0.16395038, -0.08238233, -0.0012038432, -0.1297045, -0.1087021), acc);
			acc = MulAdd(max(taps[ox][oy + 1], 0), MF4x4(0.044162463, -0.019727755, -0.05845153, -0.23984948, 0.08363732, -0.06774037, 0.0234879, 0.02139741, 0.0028723166, -0.07549135, 0.0744662, 0.109019615, 0.03763121, -0.060664024, -0.03823593, -0.015655363), acc);
			acc = MulAdd(max(taps[ox][oy + 2], 0), MF4x4(-0.026882887, 0.124355234, -0.005225512, 0.053853527, -0.004761375, 0.07739831, 0.007993726, -0.024238527, -0.035357814, 0.022114292, -0.026158875, 0.047122046, -0.021067293, 0.041959677, 0.008588816, -0.006613815), acc);
			acc = MulAdd(max(taps[ox + 1][oy], 0), MF4x4(-0.037601672, 0.010898833, 0.05053419, -0.0118405875, 0.052177202, 0.013291429, -0.20246609, -0.07192325, -0.05164381, -0.011278074, -0.12394048, -0.037769064, 0.24392918, 0.03289724, 0.018663784, 0.04071627), acc);
			acc = MulAdd(max(taps[ox + 1][oy + 1], 0), MF4x4(-0.17768572, -0.003431817, 0.024597375, -0.067222916, -0.15119793, -0.049984362, 0.0588867, 0.20031504, -0.028296817, -0.17337173, 0.02136566, 0.07842319, -0.10203611, 0.02128208, 0.20057699, 0.026265312), acc);
			acc = MulAdd(max(taps[ox + 1][oy + 2], 0), MF4x4(-0.018206367, -0.36731398, -0.07842714, -0.08946319, 0.05601789, -0.13398123, -0.09766525, 0.0051633804, -0.004821273, -0.060362365, -0.08751827, -0.01924666, -0.01642196, -0.084792316, -0.021546558, -0.01531331), acc);
			acc = MulAdd(max(taps[ox + 2][oy], 0), MF4x4(-0.003315341, 0.003464535, 0.023609636, -0.029517155, 0.023121882, -0.033598952, 0.032658506, 0.072380014, 0.038630765, -0.020992903, -0.09003304, 0.048244834, 0.17752261, -0.023978172, 0.7178278, 0.09461632), acc);
			acc = MulAdd(max(taps[ox + 2][oy + 1], 0), MF4x4(0.010277829, -0.0462686, -0.024897251, -0.02214524, 0.1262903, -0.15583614, -0.50100106, -0.04074772, 0.0612536, -0.17066137, -0.15715116, -0.020877155, -0.062031068, 0.4314311, -0.008700501, -0.030722365), acc);
			acc = MulAdd(max(taps[ox + 2][oy + 2], 0), MF4x4(-0.12062004, 0.055291675, 0.041176047, -0.034254536, -0.04062085, 0.14750236, 0.100433215, 0.024384778, -0.02506444, -0.0012329774, 0.06715311, 0.013158619, -0.07343181, 0.08929479, 0.015891392, 0.0014893904), acc);
			acc = MulAdd(max(-taps[ox][oy], 0), MF4x4(-0.00028356185, 0.008408778, 0.046833538, -0.110735945, 0.050230157, -0.023995856, -0.06471944, -0.12666705, 0.121487044, -0.040447604, -0.13425831, -0.035763647, 0.06327994, 0.04542948, 0.12984566, 0.041735172), acc);
			acc = MulAdd(max(-taps[ox][oy + 1], 0), MF4x4(-0.09654193, 0.055733874, 0.14149562, 0.20103204, -0.04256184, 0.041129943, -0.0997907, 0.030775042, 0.017492702, 0.053436417, -0.13472094, -0.037674613, -0.09461306, 0.07363193, 0.025130237, -0.020962669), acc);
			acc = MulAdd(max(-taps[ox][oy + 2], 0), MF4x4(0.003966979, -0.077911004, -0.025530541, -0.08657802, 0.047928706, -0.12820454, -0.034780253, 0.070523396, 0.0991259, -0.07432318, -0.035848588, 0.026542934, -0.005886989, -0.048655648, 0.014799456, -0.033676937), acc);
			acc = MulAdd(max(-taps[ox + 1][oy], 0), MF4x4(0.0040423325, 0.011639387, 0.014709128, -0.100935176, -0.03094238, -0.0058094636, 0.1256023, 0.086693585, -0.00840243, -0.02635784, -0.2395783, 0.0055595445, -0.104565054, 0.05285065, 0.092289336, 0.12696597), acc);
			acc = MulAdd(max(-taps[ox + 1][oy + 1], 0), MF4x4(-0.097862415, 0.035469674, -0.12026435, -0.25865972, 0.12508512, -0.00648921, -0.1848096, -0.24143967, -0.009432349, -0.035211377, -0.05589267, -0.11565712, 0.015937572, 0.02717122, -0.09954979, -0.081140056), acc);
			acc = MulAdd(max(-taps[ox + 1][oy + 2], 0), MF4x4(-0.09073428, 0.31426015, 0.087145604, -0.00073830306, 0.013578701, 0.032616604, 0.038264107, 0.07236385, -0.012257218, 0.040580798, 0.08520396, 0.004167174, 0.02280993, 0.113494344, 0.027510444, 0.029490784), acc);
			acc = MulAdd(max(-taps[ox + 2][oy], 0), MF4x4(-0.02391937, 0.0039571812, -0.026116686, -0.025334306, 0.06904104, 0.011511556, -0.14147542, 0.01224604, 0.03788813, -0.041387778, -0.1523622, 0.03650455, 0.04693732, 0.03091366, 0.2839756, 0.1779714), acc);
			acc = MulAdd(max(-taps[ox + 2][oy + 1], 0), MF4x4(-0.026292996, 0.020397607, 0.09354275, 0.00044126343, -0.047845, 0.11368384, 0.18426466, 0.12002076, -0.034070846, 0.042704806, -0.041553736, 0.04446022, -0.006331844, 0.16227855, 0.07832003, -0.07068554), acc);
			acc = MulAdd(max(-taps[ox + 2][oy + 2], 0), MF4x4(-0.026658786, -0.0079359505, -0.04125044, -0.10622727, 0.06254047, -0.36537018, -0.10755624, 0.011665703, 0.025558028, -0.087151, -0.06987865, 0.00023839885, 0.03247968, -0.053188834, -0.004876301, -0.06005079), acc);

			tex2[outPos] = acc;
		}
	}
}


//!PASS 3
//!DESC Reconstruction Conv-4x3x3x4, Depth-to-Space
//!IN INPUT, tex2
//!OUT OUTPUT
//!BLOCK_SIZE 16
//!NUM_THREADS 64

// Pass 3: final 3x3 convolution over tex2 followed by depth-to-space.
//
// Each thread handles one tex2 texel (gxy >> 1) and writes the 2x2 group of
// OUTPUT pixels that starts at gxy. The convolution yields four scalars, one
// per output pixel of the group; each scalar is added to all three colour
// channels of the linearly filtered INPUT colour at that output pixel.
// Only the positive part max(x, 0) of each tex2 value feeds this layer.
//   blockStart - top-left OUTPUT pixel of the block this thread group processes
//   threadId   - thread index inside the group; only .x is used
void Pass3(uint2 blockStart, uint3 threadId) {
	// TileSwizzle8x8 is defined outside this file; presumably it maps the linear
	// thread index to a 2D position inside an 8x8 tile. Doubling it gives each
	// thread its own 2x2 OUTPUT footprint.
	uint2 gxy = (TileSwizzle8x8(threadId.x) << 1) + blockStart;

	const uint2 outputSize = GetOutputSize();
	if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
		return;
	}

	// NOTE(review): GetInputPt()/GetOutputPt() are used as the size of one
	// input/output texel in texture coordinates - confirm against their definitions.
	float2 inputPt = GetInputPt();
	float2 outputPt = GetOutputPt();

	// Centre of the tex2 texel that covers this 2x2 output group.
	float2 pos = ((gxy >> 1) + 0.5f) * inputPt;

	// One gather at the texel's top-left corner returns the centre texel and its
	// left, upper and upper-left neighbours.
	float2 tpos = pos - 0.5f * inputPt;
	const MF4 sr = tex2.GatherRed(sam, tpos);
	const MF4 sg = tex2.GatherGreen(sam, tpos);
	const MF4 sb = tex2.GatherBlue(sam, tpos);
	const MF4 sa = tex2.GatherAlpha(sam, tpos);

	// a..i is the 3x3 neighbourhood, x offset outermost and y offset innermost:
	//   a(-1,-1) d(0,-1) g(+1,-1)
	//   b(-1, 0) e(0, 0) h(+1, 0)
	//   c(-1,+1) f(0,+1) i(+1,+1)
	// Texture-coordinate offsets are built as float2, never MF2: under BG_FP16
	// an MF2 would round the texel size to 16-bit precision before it is added
	// to the full-precision position.
	MF4 a = MF4(sr.w, sg.w, sb.w, sa.w);
	MF4 b = MF4(sr.x, sg.x, sb.x, sa.x);
	MF4 c = tex2.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0);
	MF4 d = MF4(sr.z, sg.z, sb.z, sa.z);
	MF4 e = MF4(sr.y, sg.y, sb.y, sa.y);
	MF4 f = tex2.SampleLevel(sam, pos + float2(0, inputPt.y), 0);
	MF4 g = tex2.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0);
	MF4 h = tex2.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
	MF4 i = tex2.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0);

	// Bias, then one 4x4 weight matrix per tap.
	MF4 result = { -3.1127936e-05, 3.3726166e-05, 4.8580805e-05, -9.541029e-06 };
	result = MulAdd(max(a, 0), MF4x4(-0.00055252935, 0.0011350953, -0.0016148019, 0.0014946404, -0.30635214, -0.017596753, -0.0036547943, 0.016236471, 0.005174489, 0.0030302007, 0.00019672248, 0.0006430973, 0.0007490077, -0.0031795658, -6.158733e-05, 0.0006820584), result);
	result = MulAdd(max(b, 0), MF4x4(0.15602079, 0.011071071, -0.0027609533, -0.0034318874, -0.0039016667, 0.016504101, -0.27816474, -0.008282344, 0.19063498, 0.012465078, 0.010091085, -0.004841106, -0.11758087, -0.012808949, 0.0067606894, 0.005216566), result);
	result = MulAdd(max(c, 0), MF4x4(0.013258877, -0.014989483, 0.22402754, 0.013204027, 0.00016207264, -0.00042593342, -0.00333761, -0.0012207513, 0.0033727325, -0.007841196, 0.16044731, 0.00594871, -0.0028581345, 0.012616562, -0.15928285, -0.011812331), result);
	result = MulAdd(max(d, 0), MF4x4(-0.0048872055, -0.0011780986, -0.0029523429, 0.00082424335, -0.0024385185, -0.26525813, 0.013532772, -0.0008381766, 0.0024996721, 0.0022899017, -0.0017697349, -0.0010618394, 0.0024938583, 0.005421073, 0.0028740794, -0.007808829), result);
	result = MulAdd(max(e, 0), MF4x4(-0.08293415, 0.2659366, -0.010839574, 0.023423964, 0.01725351, -0.009252893, -0.011632222, -0.308242, 0.0001496815, 0.16104282, -0.0069378703, 0.00842848, 0.085917845, -0.18407243, -0.006601597, -0.027134055), result);
	result = MulAdd(max(f, 0), MF4x4(-0.033873428, -0.011743531, -0.230377, 0.116242796, -0.0018527015, -0.00853698, 0.0059901997, -0.006155517, -0.009841329, 0.006163952, 0.014816026, 0.18667653, 0.016977048, -0.0017093032, 0.19695279, -0.061764043), result);
	result = MulAdd(max(g, 0), MF4x4(-0.0003514533, -0.0069080726, 0.0052108583, -0.0016346197, -0.0016860099, 0.006002445, -0.0022835485, -0.0028219873, 0.0005367275, 0.0005437954, 0.00059865275, -0.00014915364, -0.0032214937, -0.00052043283, -0.0031621973, 0.0055843857), result);
	result = MulAdd(max(h, 0), MF4x4(-0.006905302, -0.20389622, 0.01891904, -0.018114902, 0.00724176, 0.011335843, -0.0028616642, 0.016452003, -0.00013852821, -0.00039706306, 0.0011838446, 0.0028873065, 0.012857878, 0.16889338, -0.014114007, 0.009388666), result);
	result = MulAdd(max(i, 0), MF4x4(0.0040798862, 0.002933288, -0.016012201, -0.14650294, -0.0017411204, 0.0017980475, 0.00056705566, -0.0003218331, -0.0014291195, -0.0062614805, 0.00082543516, -0.00397049, -0.004496662, 0.0008032309, 0.0049529593, 0.117166765), result);

	// Depth-to-space: the four result components map to the 2x2 output group as
	//   x -> (0, 0)   y -> (1, 0)
	//   z -> (0, 1)   w -> (1, 1)
	// The group is walked clockwise so that gxy/pos change by one step at a time.

	// Move from the input texel centre to the centre of the top-left output pixel.
	pos -= 0.5f * outputPt;
	OUTPUT[gxy] = MF4(result.x + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);

	// Top-right.
	++gxy.x;
	pos.x += outputPt.x;
	OUTPUT[gxy] = MF4(result.y + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);

	// Bottom-right.
	++gxy.y;
	pos.y += outputPt.y;
	OUTPUT[gxy] = MF4(result.w + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);

	// Bottom-left.
	--gxy.x;
	pos.x -= outputPt.x;
	OUTPUT[gxy] = MF4(result.z + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
}
