// Anime4K CNN Effect - Anime4K Upscale GAN x2 S
// Based on Anime4K project by bloc97

//!BGFX EFFECT
//!VERSION 1
//!NAME Anime4K Upscale GAN x2 S
//!CATEGORY Anime4K
//!DESCRIPTION GAN-based 2x upscaler for anime. Small variant for fast performance.
//!CAPABILITY FP16
//!USE MulAdd

//!TEXTURE
Texture2D INPUT;

//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;

//!SAMPLER
//!FILTER POINT
SamplerState sam;

//!SAMPLER
//!FILTER LINEAR
SamplerState sam1;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R16G16B16A16_FLOAT
Texture2D tex1;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R16G16B16A16_FLOAT
Texture2D tex2;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R16G16B16A16_FLOAT
Texture2D tex3;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R16G16B16A16_FLOAT
Texture2D tex4;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R16G16B16A16_FLOAT
Texture2D tex5;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R16G16B16A16_FLOAT
Texture2D tex6;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R16G16B16A16_FLOAT
Texture2D tex7;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R16G16B16A16_FLOAT
Texture2D tex8;


//!PASS 1
//!DESC Conv-4x3x3x3
//!IN INPUT
//!OUT tex1
//!BLOCK_SIZE 16
//!NUM_THREADS 64

void Pass1(uint2 blockStart, uint3 threadId) {
	// First layer (pass DESC: Conv-4x3x3x3): a 3x3 convolution over the RGB of
	// INPUT that yields four feature channels per texel, written to tex1.
	// Each thread owns a 2x2 quad of destination texels, so it needs a 4x4
	// window of source texels; that window is fetched with four 2x2 gathers
	// per colour channel.
	// NOTE(review): TileSwizzle8x8 presumably maps the flat thread index to a
	// 2D offset inside the block - confirm against the framework.
	const uint2 quadOrigin = blockStart + (TileSwizzle8x8(threadId.x) << 1);
	const uint2 texSize = GetInputSize();
	if (any(quadOrigin >= texSize)) {
		return;
	}
	// NOTE(review): presumably the size of one input texel in UV space - confirm.
	const float2 pt = GetInputPt();

	uint gx, gy;

	// window[x][y]; window[1][1] is the source texel matching quadOrigin.
	MF3 window[4][4];
	[unroll]
	for (gx = 0; gx < 4; gx += 2) {
		[unroll]
		for (gy = 0; gy < 4; gy += 2) {
			const float2 gatherUv = (quadOrigin + uint2(gx, gy)) * pt;
			const MF4 reds = INPUT.GatherRed(sam, gatherUv);
			const MF4 greens = INPUT.GatherGreen(sam, gatherUv);
			const MF4 blues = INPUT.GatherBlue(sam, gatherUv);

			// Component layout of a gather inside its 2x2 footprint:
			//   w z
			//   x y
			window[gx][gy] = MF3(reds.w, greens.w, blues.w);
			window[gx + 1][gy] = MF3(reds.z, greens.z, blues.z);
			window[gx][gy + 1] = MF3(reds.x, greens.x, blues.x);
			window[gx + 1][gy + 1] = MF3(reds.y, greens.y, blues.y);
		}
	}

	uint dx, dy;

	[unroll]
	for (dx = 0; dx < 2; ++dx) {
		[unroll]
		for (dy = 0; dy < 2; ++dy) {
			const uint2 destPos = quadOrigin + uint2(dx, dy);

			// The quad origin passed the bounds test above; only the other
			// three texels of the quad can fall outside the texture.
			const bool isQuadOrigin = (dx == 0 && dy == 0);
			if (!isQuadOrigin && any(destPos >= texSize)) {
				continue;
			}

			// Bias first, then the nine taps of the 3x3 kernel. The tap order is
			// kept fixed because it determines floating-point rounding.
			MF4 acc = MF4(-0.035786826, -3.2876174e-05, -0.029245647, 0.0141837);
			acc = MulAdd(window[dx][dy], MF3x4(0.21797048, -0.212819, 0.04459435, -0.04423212, 0.33138385, -0.17247623, -0.120917134, 0.23732775, 0.19597639, -0.33451796, -0.021611832, -0.017377583), acc);
			acc = MulAdd(window[dx][dy + 1], MF3x4(-0.12904494, -0.01359655, -0.40096298, 0.32336384, 0.25585845, 0.23335338, -0.4461792, 0.6704216, -0.13310009, 0.05402756, -0.5437191, 0.32286412), acc);
			acc = MulAdd(window[dx][dy + 2], MF3x4(0.062060427, 0.07804567, -0.016457668, 0.25662076, -0.1567372, -0.04152728, 0.15387323, -0.12621297, 0.097600766, 0.023655256, 0.052513056, 0.30542207), acc);
			acc = MulAdd(window[dx + 1][dy], MF3x4(-0.18701962, -0.4233291, -0.086120665, -0.16739355, -0.63525766, -0.6932253, -0.1777197, -0.5140771, -0.19856504, -0.4475936, 0.12013144, -0.11179723), acc);
			acc = MulAdd(window[dx + 1][dy + 1], MF3x4(-0.21761869, 0.65340257, 0.25189772, -0.20664653, 0.05614669, 0.81569123, 0.26439375, -0.22282092, -0.20241423, 0.71137106, 0.041106064, -0.558707), acc);
			acc = MulAdd(window[dx + 1][dy + 2], MF3x4(0.014729233, -0.09996152, 0.22300848, -0.04927536, -0.08988005, -0.12005097, -0.04899431, -0.18048033, -0.17237821, -0.03483246, 0.33783346, 0.22711775), acc);
			acc = MulAdd(window[dx + 2][dy], MF3x4(-0.010091276, -0.11388358, 0.15959989, 0.16021152, 0.353214, -0.3420636, 0.39659426, 0.14725044, 0.048077144, -0.06667417, 0.047712438, 0.1991372), acc);
			acc = MulAdd(window[dx + 2][dy + 1], MF3x4(-0.17764397, 0.014430492, -0.009073561, 0.052957222, -0.26687172, 0.21589288, 0.29830712, 0.15975259, -0.3100123, -0.03535766, 0.18167259, 0.07284526), acc);
			acc = MulAdd(window[dx + 2][dy + 2], MF3x4(0.22984034, 0.11556983, -0.26964244, -0.31616172, 0.059412085, 0.10849835, -0.3704685, -0.16312528, 0.3656624, 0.11611945, -0.3790553, -0.4223729), acc);

			tex1[destPos] = acc;
		}
	}
}


//!PASS 2
//!DESC Conv-4x3x3x8, Conv-4x1x1x24
//!IN tex1
//!OUT tex3, tex4
//!BLOCK_SIZE 8
//!NUM_THREADS 64

void Pass2(uint2 blockStart, uint3 threadId) {
	// Pass 2: reads the 3x3 neighbourhood of tex1 around this thread's texel and
	// produces two 4-channel feature maps, written to tex3 and tex4.
	// One thread handles one texel.
	// NOTE(review): TileSwizzle8x8 presumably maps the flat thread index to a
	// 2D offset inside the 8x8 block - confirm against the framework.
	uint2 gxy = TileSwizzle8x8(threadId.x) + blockStart;
	uint2 inputSize = GetInputSize();
	// Threads that land outside the texture do nothing.
	if (gxy.x >= inputSize.x || gxy.y >= inputSize.y) {
		return;
	}

	// NOTE(review): assumes GetInputPt() is the UV size of one texel, which makes
	// pos the UV of this texel's centre - confirm.
	float2 inputPt = GetInputPt();
	float2 pos = (gxy + 0.5f) * inputPt;

	// 3x3 taps, named column by column (each column runs top to bottom):
	// [ a, d, g ]
	// [ b, e, h ]
	// [ c, f, i ]
	// NOTE(review): taps next to the texture border rely on the address mode of
	// `sam`, which is not visible in this function.
	MF4 a = tex1.SampleLevel(sam, pos - inputPt, 0);
	MF4 b = tex1.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0);
	MF4 c = tex1.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0);
	MF4 d = tex1.SampleLevel(sam, pos + float2(0, -inputPt.y), 0);
	MF4 e = tex1.SampleLevel(sam, pos, 0);
	MF4 f = tex1.SampleLevel(sam, pos + float2(0, inputPt.y), 0);
	MF4 g = tex1.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0);
	MF4 h = tex1.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
	MF4 i = tex1.SampleLevel(sam, pos + inputPt, 0);

	// Negative halves: max(-x, 0) keeps the magnitude of the negative components.
	// These must be computed before the taps themselves are clamped below.
	MF4 na = max(-a, 0);
	MF4 nb = max(-b, 0);
	MF4 nc = max(-c, 0);
	MF4 nd = max(-d, 0);
	MF4 ne = max(-e, 0);
	MF4 nf = max(-f, 0);
	MF4 ng = max(-g, 0);
	MF4 nh = max(-h, 0);
	MF4 ni = max(-i, 0);

	// Positive halves, clamped in place. Together with the n* values every tap
	// contributes 8 channels (the "3x3x8" in this pass's DESC).
	a = max(a, 0);
	b = max(b, 0);
	c = max(c, 0);
	d = max(d, 0);
	e = max(e, 0);
	f = max(f, 0);
	g = max(g, 0);
	h = max(h, 0);
	i = max(i, 0);

	// First 3x3 convolution: bias, then the 9 positive and 9 negative half-taps.
	// The result is only consumed by target2 below; it is never stored.
	MF4 conv2d_2_tf = { -0.041068032, 0.02181786, -0.02366552, 0.07215206 };
	conv2d_2_tf = MulAdd(a, MF4x4(0.02899383, 0.12331602, 0.1755303, 0.14228395, -0.23719487, 0.28783783, -0.15755224, 0.16501419, 0.09971766, -0.112085044, 0.15989542, 0.013457646, -0.21386063, -0.10184436, 0.2920392, 0.11544854), conv2d_2_tf);
	conv2d_2_tf = MulAdd(b, MF4x4(-0.09577094, 0.052495796, 0.5072853, -0.16720837, -0.030821526, -0.13200149, 0.061197, 0.09785798, 0.097248554, -0.056709435, -0.12684566, 0.25153175, 0.12550084, 0.5723225, -0.061046973, 0.2737185), conv2d_2_tf);
	conv2d_2_tf = MulAdd(c, MF4x4(0.14275773, 0.3116807, 0.020866666, -0.029567914, 0.054051064, -0.018836629, 0.16237853, 0.23302408, 0.23014219, -0.20245266, -0.040263597, 0.10550008, -0.1419676, -0.07544839, -0.04724355, 0.06713984), conv2d_2_tf);
	conv2d_2_tf = MulAdd(d, MF4x4(-0.36056906, 0.21647012, -0.21559654, -0.1321654, 0.26311335, -0.35098836, 0.08977303, -0.2912846, -0.03221502, -0.33539286, 0.55078757, 0.14826211, 0.12334663, 0.031169238, 0.0626983, 0.13543329), conv2d_2_tf);
	conv2d_2_tf = MulAdd(e, MF4x4(0.032711882, 0.53162986, 0.1736962, 0.22126123, 0.13229683, 0.12998195, -0.08843839, 0.3830243, -0.29015037, -0.13158421, 0.2987182, 0.0039998284, -0.4924434, -0.34931743, 0.3501415, -0.015819922), conv2d_2_tf);
	conv2d_2_tf = MulAdd(f, MF4x4(0.039777573, -0.039639533, -0.27015024, -0.33144557, -0.11338446, -0.19242573, 0.48813564, -0.24602202, 0.120988116, -0.12362437, 0.23984735, -0.33717445, 0.14359151, -0.09583342, -0.015998919, -0.19725454), conv2d_2_tf);
	conv2d_2_tf = MulAdd(g, MF4x4(0.17751572, -0.14914338, -0.24518701, 0.22713365, 0.10613938, 0.12027283, 0.1582502, 0.011725502, -0.02418084, 0.106176965, 0.10111444, 0.07009088, 0.017611375, 0.369643, -0.21788761, -0.15093188), conv2d_2_tf);
	conv2d_2_tf = MulAdd(h, MF4x4(0.0863035, -0.43148708, 0.0994751, 0.17801163, -0.42566994, -0.2744198, -0.028655952, -0.2481176, -0.26144302, -0.26753834, 0.11043684, -0.48341632, 0.41320416, 0.25118062, -0.31461874, 0.36563694), conv2d_2_tf);
	conv2d_2_tf = MulAdd(i, MF4x4(-0.04845539, -0.2790916, -0.1626853, 0.18036526, 0.2368911, -0.5688802, 0.05240968, -0.034105603, -0.14011742, -0.37861058, -0.096871816, -0.27824572, 0.41195226, 0.23514003, 0.12282304, 0.28447765), conv2d_2_tf);
	conv2d_2_tf = MulAdd(na, MF4x4(-0.13261828, -0.13148594, 0.05470859, -0.114724025, 0.17642413, -0.05585294, 0.44086194, -0.10915775, -0.23456413, -0.18385538, -0.4193869, 0.2708079, 0.03720121, 0.15744475, 0.092449814, -0.0922205), conv2d_2_tf);
	conv2d_2_tf = MulAdd(nb, MF4x4(-0.14146912, 0.386554, -0.15197717, 0.1682067, -0.33229175, 0.18661757, 0.142476, -0.05811066, -0.12433686, 0.20817612, 0.17710523, 0.24227881, -0.3699883, -0.14644128, -0.066485085, -0.010829679), conv2d_2_tf);
	conv2d_2_tf = MulAdd(nc, MF4x4(-0.02267665, -0.21349631, 0.05916224, 0.07111888, -0.3317847, -0.044436328, -0.08067249, -0.13602455, -0.2652356, -0.13666181, 0.022768881, -0.21616152, 0.10042784, 0.13159652, -0.062913835, -0.12882891), conv2d_2_tf);
	conv2d_2_tf = MulAdd(nd, MF4x4(-0.21270499, 0.14776433, 0.26771793, 0.41242316, -0.22445452, 0.3885536, -0.36809587, 0.09838256, 0.030300573, -0.016225152, -0.41985163, -0.32797396, 0.3021247, -0.2566993, 0.24282119, 0.071926266), conv2d_2_tf);
	conv2d_2_tf = MulAdd(ne, MF4x4(-0.14173156, 0.10360139, 0.03603846, 0.23004, -0.37078354, -0.7556456, 0.43359467, -0.42839774, -0.08143208, -0.061868757, -0.017048405, -0.1806454, 0.07700074, -0.028751602, -0.49057922, -0.07150736), conv2d_2_tf);
	conv2d_2_tf = MulAdd(nf, MF4x4(-0.21411006, -0.039522924, -0.11006789, 0.30172586, -0.019509817, 0.34646508, 0.03348711, 0.3949624, 0.09367525, 0.11841692, 0.064099714, 0.30587056, 0.00071666663, 0.09569139, 0.07905173, -0.043038815), conv2d_2_tf);
	conv2d_2_tf = MulAdd(ng, MF4x4(-0.1082019, -0.081530154, 0.1997084, 0.0064345463, -0.002075576, 0.0122295255, -0.21594198, -0.20039533, 0.023058774, 0.061136324, -0.043233447, 0.018114857, -0.12538326, -0.008044748, 0.08879177, 0.29855737), conv2d_2_tf);
	conv2d_2_tf = MulAdd(nh, MF4x4(0.06425974, -0.162355, -0.07716668, -0.1783711, 0.08560717, 0.42500424, 0.15796345, 0.25115898, 0.39673963, 0.24484198, -0.16364126, 0.45589596, -0.54474986, -0.41130677, 0.15731613, -0.13945425), conv2d_2_tf);
	conv2d_2_tf = MulAdd(ni, MF4x4(-0.4015527, -0.22220162, 0.088239804, -0.16343592, -0.05973259, -0.053600565, -0.11719207, 0.340347, 0.07810557, 0.06943392, 0.07088433, 0.36863637, -0.16925047, -0.09059371, -0.086145744, -0.26417965), conv2d_2_tf);

	// Second 3x3 convolution over the same 18 half-taps with its own weights.
	// Stored to tex3 and also fed into target2.
	MF4 target1 = { 0.022193057, 0.0031918385, 0.04232464, -0.0056721596 };
	target1 = MulAdd(a, MF4x4(-0.016554115, 0.41586095, -0.11134646, 0.041401796, -0.032285847, 0.07744446, 0.012422875, 0.08027069, -0.11944374, -0.4644861, -0.1625419, 0.09757052, 0.08459575, -0.32677624, -0.15526624, 0.13285875), target1);
	target1 = MulAdd(b, MF4x4(-0.05147117, -0.31841335, -0.07968151, -0.037866592, -0.1438723, 0.21164599, 0.042448167, 0.1660907, -0.03240849, 0.2866945, -0.123190455, -0.2005157, -0.100519955, -0.04109891, -0.14908177, -0.20055951), target1);
	target1 = MulAdd(c, MF4x4(-0.33594802, 0.17970876, -0.08458461, 0.22198248, 0.041744266, 0.053618595, -0.64927346, 0.43071616, -0.042823542, 0.36384553, 0.13817975, -0.23117469, -0.009722301, 0.043797005, -0.006320899, -0.056160737), target1);
	target1 = MulAdd(d, MF4x4(0.020939048, 0.15744017, -0.18557346, 0.2221421, 0.13683408, -0.17577636, -0.1028824, -0.05909411, -0.11116942, -0.23898265, 0.013275228, -0.10834194, -0.23541391, -0.045599524, 0.13663499, -0.061863456), target1);
	target1 = MulAdd(e, MF4x4(-0.9347821, -1.0879762, 0.029261602, 0.0058627487, 0.37568024, 0.07800278, 0.22918043, -0.22581682, -0.24621771, 0.0565432, -0.01175261, 0.20289935, -0.18791674, -0.34127015, -0.20261073, 0.24382167), target1);
	target1 = MulAdd(f, MF4x4(-0.42576772, -0.9465751, 0.36503372, 0.047452617, -0.03021601, 0.19896118, -0.9916106, 0.68441176, -0.097055614, -0.039465737, -0.3072724, 0.3834049, 0.044579748, 0.10185175, -0.07127564, 0.053964186), target1);
	target1 = MulAdd(g, MF4x4(-0.12718496, -0.20010719, -0.13560185, -0.28841987, -0.18198563, 0.06924996, 0.15375975, 0.007953754, -0.03143177, 0.24778824, -0.41971257, -0.15984616, 0.06914517, -0.15320878, -0.058414314, -0.1829401), target1);
	target1 = MulAdd(h, MF4x4(-0.05676951, -0.39852038, -0.0008664457, 0.073233515, -0.110736564, -0.12950265, -0.32641715, 0.05254214, -0.0013476483, 0.04590487, -0.6886247, -0.029103741, 0.13570555, -0.06356145, 0.26564398, 0.16304392), target1);
	target1 = MulAdd(i, MF4x4(-0.14373688, 0.2627747, 0.19523594, -0.04094942, -0.027800431, 0.080428846, -0.21676755, 0.22764, -0.08686052, -0.14352795, 0.012905041, 0.12002593, 0.096998215, -0.0822731, 0.25796455, 0.3244333), target1);
	target1 = MulAdd(na, MF4x4(0.13717347, -0.2534293, -0.08265135, 0.02238695, 0.061414074, -0.12315743, -0.105848454, -0.0324352, -0.019163579, 0.5106144, 0.111571215, -0.17051223, 0.14541212, 0.26512033, 0.17036803, -0.05180038), target1);
	target1 = MulAdd(nb, MF4x4(0.10731618, -0.011980742, -0.06125307, -0.043496255, 0.06382452, -0.53873694, -0.21860467, 0.076045096, 0.014617647, -0.12188417, -0.23983037, 0.20181973, -0.03130421, -0.23090406, 0.07917799, 0.11006313), target1);
	target1 = MulAdd(nc, MF4x4(-0.07749841, -0.17617406, -0.2105074, 0.20204528, 0.31133667, 0.045247886, 0.38000366, -0.23678038, 0.14622565, -0.077519946, 0.04709938, 0.28799757, -0.02295692, 0.021911716, 0.037108235, -0.050266817), target1);
	target1 = MulAdd(nd, MF4x4(-0.04620016, -0.053893, 0.07671593, -0.08702991, -0.31122503, 0.08491399, 0.39734617, 0.10588835, 0.1706988, -0.0030106953, -0.23740743, 0.119870976, 0.04136371, -0.08475979, -0.26021543, -0.26772037), target1);
	target1 = MulAdd(ne, MF4x4(0.013240527, 0.27298495, 0.061895885, -0.1766251, -0.35479823, -0.5952594, -0.2486822, 0.40527418, 0.017724868, -0.64586586, -0.056991536, -0.22597985, 0.1953091, -0.09300436, 0.28394333, -0.17164071), target1);
	target1 = MulAdd(nf, MF4x4(-0.0437722, 0.20237646, 0.1734046, 0.12661959, 0.3563361, 0.20119205, 0.49104276, -0.62781703, 0.10580526, 0.09021795, 0.2986983, 0.05439145, -0.030656314, -0.06551242, 0.06034035, 0.24646781), target1);
	target1 = MulAdd(ng, MF4x4(0.07150872, 0.2634299, -0.15512806, 0.032365914, -0.04214553, -0.32488832, -0.029638838, -0.11298656, 0.016363487, -0.20394005, 0.13789146, -0.1160082, -0.29543686, 0.056006238, 0.022565948, -0.0209169), target1);
	target1 = MulAdd(nh, MF4x4(-0.08222271, 0.1397535, 0.18386504, -0.029725704, 0.19525485, -0.26657727, 0.3193575, 0.39357802, 0.13274485, 0.063030235, 0.5509124, 0.076320685, -0.24871972, -0.23029849, -0.29287627, 0.0009975942), target1);
	target1 = MulAdd(ni, MF4x4(-0.11978757, -0.115064315, -0.32878634, -0.091591395, 0.011527068, -0.07584138, 0.20703748, -0.16326526, -0.07295838, -0.088844456, 0.0057264403, 0.08162376, -0.17551814, 0.10645812, -0.1522622, -0.18409562), target1);

	// Per-texel mix with no neighbour taps: the centre tap (e / ne) plus the
	// positive and negative halves of conv2d_2_tf and target1. That is 6 x 4 = 24
	// input channels, matching "Conv-4x1x1x24" in this pass's DESC.
	MF4 target2 = { 0.08279582, -0.12997188, 0.08899629, 0.018068794 };
	target2 = MulAdd(e, MF4x4(-0.5713254, 0.59251165, -0.14328027, 0.3463698, -0.6896771, -0.14296922, -0.3860265, 0.4501756, -0.39508528, 0.40213254, -0.16835114, -0.0029681697, 0.06473641, 0.18837942, 0.18787977, -0.14020114), target2);
	target2 = MulAdd(ne, MF4x4(0.08934268, -0.28500432, 0.45083842, 0.16448207, 0.10745752, -0.07937402, 0.17439699, -0.4361477, 0.35800517, -0.16299683, -0.112771064, 0.46456474, -0.016184373, -0.2676676, -0.09250065, 0.30093423), target2);
	target2 = MulAdd(max(conv2d_2_tf, 0), MF4x4(-0.23437534, 0.30892932, -0.3382499, -0.11436098, -0.09584061, 0.010766669, -0.6745943, 0.19373886, 0.19484869, 0.0063928245, 0.20636424, -0.6427624, 0.22710505, 0.580292, -0.56174964, -0.15055792), target2);
	target2 = MulAdd(max(-conv2d_2_tf, 0), MF4x4(-0.4264334, -0.43369257, 0.29302827, -0.2763896, 0.20638986, 0.066474296, 0.18825729, 0.14629841, -0.70805573, 0.3601201, -0.49326342, 0.4604217, -0.3331877, -0.30442527, 0.33416224, 0.08233912), target2);
	target2 = MulAdd(max(target1, 0), MF4x4(-0.043108743, 0.32130125, -0.13206981, 0.56653565, -0.069573626, -0.32312635, 0.17708589, 0.12717012, -0.39452434, 0.7504042, -0.563233, -0.38678297, -0.20246895, 0.399379, -0.1829332, -0.4856879), target2);
	target2 = MulAdd(max(-target1, 0), MF4x4(0.46322855, -0.14412759, 0.26863632, -0.37377957, 0.18703142, 0.12013766, -0.010468053, 0.36067548, 0.29069972, -0.5482968, 0.1952737, 0.42751312, 0.47847852, -0.13346007, 0.35286024, 0.23347002), target2);

	// target1 is stored unclamped, so later readers can still split it into
	// positive and negative halves.
	tex3[gxy] = target1;
	tex4[gxy] = target2;
}


//!PASS 3
//!DESC Conv-4x3x3x8, Conv-4x1x1x32
//!IN tex4, tex3
//!OUT tex2, tex5
//!BLOCK_SIZE 8
//!NUM_THREADS 64

void Pass3(uint2 blockStart, uint3 threadId) {
	// Pass 3: reads the 3x3 neighbourhood of tex4 plus the centre texel of tex3
	// and produces two 4-channel feature maps, written to tex2 and tex5.
	// One thread handles one texel.
	// NOTE(review): TileSwizzle8x8 presumably maps the flat thread index to a
	// 2D offset inside the 8x8 block - confirm against the framework.
	uint2 gxy = TileSwizzle8x8(threadId.x) + blockStart;
	uint2 inputSize = GetInputSize();
	// Threads that land outside the texture do nothing.
	if (gxy.x >= inputSize.x || gxy.y >= inputSize.y) {
		return;
	}

	// NOTE(review): assumes GetInputPt() is the UV size of one texel, which makes
	// pos the UV of this texel's centre - confirm.
	float2 inputPt = GetInputPt();
	float2 pos = (gxy + 0.5f) * inputPt;

	// 3x3 taps, named column by column (each column runs top to bottom):
	// [ a, d, g ]
	// [ b, e, h ]
	// [ c, f, i ]
	// NOTE(review): taps next to the texture border rely on the address mode of
	// `sam`, which is not visible in this function.
	MF4 a = tex4.SampleLevel(sam, pos - inputPt, 0);
	MF4 b = tex4.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0);
	MF4 c = tex4.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0);
	MF4 d = tex4.SampleLevel(sam, pos + float2(0, -inputPt.y), 0);
	MF4 e = tex4.SampleLevel(sam, pos, 0);
	MF4 f = tex4.SampleLevel(sam, pos + float2(0, inputPt.y), 0);
	MF4 g = tex4.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0);
	MF4 h = tex4.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
	MF4 i = tex4.SampleLevel(sam, pos + inputPt, 0);

	// Negative halves: max(-x, 0) keeps the magnitude of the negative components.
	// These must be computed before the taps themselves are clamped below.
	MF4 na = max(-a, 0);
	MF4 nb = max(-b, 0);
	MF4 nc = max(-c, 0);
	MF4 nd = max(-d, 0);
	MF4 ne = max(-e, 0);
	MF4 nf = max(-f, 0);
	MF4 ng = max(-g, 0);
	MF4 nh = max(-h, 0);
	MF4 ni = max(-i, 0);

	// Positive halves, clamped in place. Together with the n* values every tap
	// contributes 8 channels (the "3x3x8" in this pass's DESC).
	a = max(a, 0);
	b = max(b, 0);
	c = max(c, 0);
	d = max(d, 0);
	e = max(e, 0);
	f = max(f, 0);
	g = max(g, 0);
	h = max(h, 0);
	i = max(i, 0);

	// First 3x3 convolution: bias, then the 9 positive and 9 negative half-taps.
	// The result is only consumed by target2 below; it is never stored.
	MF4 conv2d_5_tf = { -0.0375635, -0.08823075, 0.0025748173, 0.014370204 };
	conv2d_5_tf = MulAdd(a, MF4x4(0.014682038, -0.12901896, -0.16721351, -0.14512789, 0.1975804, 0.31713018, -0.13655594, -0.07817547, -0.1379136, 0.012892589, 0.23835693, 0.18214643, 0.15153849, -0.16835038, 0.2145134, -0.10536737), conv2d_5_tf);
	conv2d_5_tf = MulAdd(b, MF4x4(0.020937767, 0.19783083, -0.54175997, 0.037820112, 0.2667656, 0.22040194, 0.37909588, 0.18100308, 0.020120522, -0.60052997, -0.043528315, -0.25213948, -0.15584327, 0.27506578, -0.092381746, 0.32063565), conv2d_5_tf);
	conv2d_5_tf = MulAdd(c, MF4x4(0.122979, -0.16768639, -0.31459492, -0.0615338, 0.2467096, 0.39879864, 0.30217072, 0.05501944, -0.036550965, 0.30801496, -0.21168339, -0.13092734, -0.10309731, 0.02561574, -0.28071794, 0.111772805), conv2d_5_tf);
	conv2d_5_tf = MulAdd(d, MF4x4(0.30419037, -0.27610013, -0.20951773, -0.4682423, 0.013910727, 0.45360255, 0.26947716, -0.28788614, -0.3465049, -0.027093071, 0.19358, -0.0759516, 0.05402844, 0.23829742, 0.14955573, 0.10131891), conv2d_5_tf);
	conv2d_5_tf = MulAdd(e, MF4x4(-0.18213613, 0.1460758, -0.13212326, -0.33431244, -0.038493834, -0.399577, 0.29018825, 0.046454914, 0.5486579, -0.37918556, -0.09230001, -0.06452045, -0.27307686, 0.16817085, -0.3927623, 0.4070809), conv2d_5_tf);
	conv2d_5_tf = MulAdd(f, MF4x4(0.3655112, 0.42978507, -0.20408633, -0.17724891, 0.018163562, 0.16742137, -0.20677765, -0.18758915, 0.08664044, 0.15635273, 0.04482592, -0.10135638, -0.042055663, 0.0120497495, -0.061840538, -0.23626032), conv2d_5_tf);
	conv2d_5_tf = MulAdd(g, MF4x4(0.29038852, -0.14159334, -0.07436412, -0.13352816, -0.3326411, 0.31299374, 0.2287002, 0.2508818, 0.26760912, -0.0037750339, 0.0058190194, -0.024687344, -0.1777058, -0.015039313, -0.07848877, -0.2052551), conv2d_5_tf);
	conv2d_5_tf = MulAdd(h, MF4x4(0.33255517, 0.45893422, 0.20505154, -0.11818784, -0.0353625, -0.2725971, 0.15468855, 0.14384854, -0.01441209, 0.12198328, -0.07893593, 0.0810518, 0.323934, -0.29967225, -0.24283892, -0.11573156), conv2d_5_tf);
	conv2d_5_tf = MulAdd(i, MF4x4(0.17880976, -0.20802346, 0.028815132, 0.22950941, 0.22764732, 0.32852155, -0.16896188, -0.22661959, 0.06486004, 0.00723564, -0.022966828, -0.05319699, 0.03109079, -0.00031444168, -0.16299056, -0.120937996), conv2d_5_tf);
	conv2d_5_tf = MulAdd(na, MF4x4(0.023376284, 0.029397544, -0.23599954, 0.15093243, -0.058068898, -0.022674788, 0.016787661, -0.100131355, -0.06670702, -0.0654595, 0.060609553, -0.24878198, 0.1184957, 0.12865701, -0.110585764, 0.027937055), conv2d_5_tf);
	conv2d_5_tf = MulAdd(nb, MF4x4(-0.21986784, -0.044010285, 0.07705757, -0.06578579, -0.34479773, -0.27297345, 0.07099886, 0.043877546, -0.3284597, 0.60647607, -0.13495111, 0.39562428, 0.12766926, -0.26691958, -0.13183068, 0.19720052), conv2d_5_tf);
	conv2d_5_tf = MulAdd(nc, MF4x4(-0.15688242, 0.02787055, 0.11245185, 0.010610981, 0.31926978, 0.6880586, -0.08503132, 0.2515481, -0.24620119, -0.3889153, 0.07599151, -0.04537119, -0.55283034, -0.170027, -0.14118128, -0.30742723), conv2d_5_tf);
	conv2d_5_tf = MulAdd(nd, MF4x4(0.037949517, 0.0026801233, 0.013419875, -0.07403992, -0.17499912, 0.012353954, 0.15956756, -0.14248073, -0.0017226954, 0.052071165, -0.19224213, 0.00033604537, -0.1924897, -0.21002872, -0.23516886, -0.09922695), conv2d_5_tf);
	conv2d_5_tf = MulAdd(ne, MF4x4(-0.21850063, -0.22287996, -0.046637002, -0.28330007, -0.106190234, 0.027529838, 0.5553775, 0.3273539, 0.0110251075, 0.0067749587, 0.18001638, 0.18281236, 0.19831169, -0.03785556, 0.06003045, -0.12625378), conv2d_5_tf);
	conv2d_5_tf = MulAdd(nf, MF4x4(-0.44703564, -0.2896555, 0.72527117, 0.29206118, -0.004199225, 0.46381885, 0.049183566, 0.14319502, -0.3226642, -0.39931563, 0.23164241, 0.10428929, -0.598285, -0.21007223, -0.36386037, 0.09704366), conv2d_5_tf);
	conv2d_5_tf = MulAdd(ng, MF4x4(0.0462183, -0.063166276, 0.14364852, 0.212176, 0.17403619, -0.09878261, 0.0017970221, -0.31676117, -0.1104441, -0.073732674, -0.12653485, -0.20641124, 0.024175802, 0.005339486, -0.08178427, -0.2761102), conv2d_5_tf);
	conv2d_5_tf = MulAdd(nh, MF4x4(-0.19256714, -0.246452, 0.3358081, -0.16956173, -0.2549593, 0.21122634, -0.06487135, -0.051329695, 0.110607915, -0.09860077, 0.1355533, -0.1489809, 0.023808947, 0.29945812, -0.056281622, 0.0020249223), conv2d_5_tf);
	conv2d_5_tf = MulAdd(ni, MF4x4(-0.34458768, -0.074856885, -0.01856148, 0.06707525, -0.3314005, -0.16196185, 0.33313355, 0.20943385, -0.266928, -0.27552158, 0.018665945, 0.013205852, -0.33579, -0.16876023, -0.031895302, -0.13143763), conv2d_5_tf);

	// Second 3x3 convolution over the same 18 half-taps with its own weights.
	// Stored to tex2 and also fed into target2.
	MF4 target1 = { 0.09810561, 0.044599928, -0.0019709724, 0.064204566 };
	target1 = MulAdd(a, MF4x4(0.23653865, 0.034179572, 0.2680533, 0.03070888, -0.34707117, 0.05323393, 0.20052955, -0.09135351, 0.031460114, -0.23158966, 0.08698448, -0.120006196, -0.11532645, -0.08093671, 0.0037868635, 0.10042472), target1);
	target1 = MulAdd(b, MF4x4(-0.018171439, -0.12269748, 0.09214298, 0.07735124, -0.38116398, 0.2625897, 0.045807257, 0.06052568, 0.15468815, -0.40968472, 0.37565818, 0.032876365, 0.058758568, 0.17787455, 0.11352259, 0.23624317), target1);
	target1 = MulAdd(c, MF4x4(-0.094512895, 0.15499377, -0.15345438, -0.18841587, -0.07849487, 0.037030153, -0.17632313, 0.10438565, -0.18453433, -0.079957336, 0.10274841, 0.07198532, -0.04770108, 0.16846456, 0.31273615, -0.13635644), target1);
	target1 = MulAdd(d, MF4x4(0.13088372, -0.008759914, 0.1716414, 0.082108594, -0.51469034, 0.18175006, -0.16164891, 0.1918173, 0.21287642, -0.094005, 0.20578988, 0.13113159, 0.07577773, 0.09737444, -0.08676422, -0.059179075), target1);
	target1 = MulAdd(e, MF4x4(-0.28462783, 0.42669204, 0.3224737, -0.29510942, -0.12424295, -0.16050552, -0.12770653, 0.0930919, -0.22179118, 0.33128613, -0.42117682, -0.14691186, 0.41048542, -0.040950067, -0.13896315, -0.24155742), target1);
	target1 = MulAdd(f, MF4x4(0.15060697, -0.088174045, 0.27417374, 0.0397946, 0.0078119785, 0.091031335, 0.008468849, -0.04850853, 0.03755719, -0.005380725, 0.13488528, -0.21345685, 0.12456556, 0.17801593, -0.21285392, -0.2111536), target1);
	target1 = MulAdd(g, MF4x4(0.13265789, 0.0058933417, -0.35399312, -0.10547572, 0.014682838, 0.03247095, -0.046823166, -0.086899005, 0.022227641, -0.10579067, 0.13096501, -0.020894872, 0.08426519, 0.068370126, -0.051551163, -0.02995364), target1);
	target1 = MulAdd(h, MF4x4(-0.19551872, 0.16199462, 0.31150326, 0.082667254, 0.20023693, -0.22914512, -0.29721177, -0.2741043, 0.08894789, -0.06843645, -0.019058365, -0.06370645, 0.11551113, 0.011740334, -0.17567629, -0.05505456), target1);
	target1 = MulAdd(i, MF4x4(0.043439314, 0.19573408, -0.17608817, 0.043509595, 0.22829561, 0.059223037, 0.05529666, -0.16555707, 0.2754871, 0.042527672, 0.09646824, 0.07046857, 0.10173791, 0.04030276, -0.0544029, -0.26882443), target1);
	target1 = MulAdd(na, MF4x4(0.022059897, -0.04408266, -0.18699357, -0.09142074, 0.044572234, -0.14162005, 0.108728774, -0.08984615, -0.14737117, 0.12838708, -0.0019777226, 0.21070306, -0.111902215, 0.23080471, 0.0134878885, 0.07111553), target1);
	target1 = MulAdd(nb, MF4x4(0.12182694, 0.063630685, 0.110018775, -0.03879438, 0.333222, -0.45207745, 0.3209222, 0.123050354, -0.40609705, 0.48236838, 0.14323111, -0.12578699, 0.0015041681, -0.019454073, 0.07013497, 0.093687624), target1);
	target1 = MulAdd(nc, MF4x4(0.07142873, -0.32094324, 0.3302099, -0.3693182, 0.15444939, -0.14791024, 0.07907135, -0.111387216, 0.045319714, -0.12518585, 0.13145387, 0.09406553, 0.038564056, -0.3085204, 0.39396307, 0.12083835), target1);
	target1 = MulAdd(nd, MF4x4(0.16042647, -0.16409212, 0.105187505, 0.14153793, 0.269689, -0.14337258, 0.0915773, -0.26669213, -0.059172913, 0.1121628, -0.06627627, -0.29320538, -0.038348313, 0.060661227, -0.09798249, -0.027975965), target1);
	target1 = MulAdd(ne, MF4x4(-0.4110324, -0.06847458, -0.22187959, -0.17196147, -0.2673298, 0.15388274, -0.20157869, 0.45323396, 0.419686, -0.15836199, -0.08358049, 0.2121381, -0.33858112, 0.06060976, -0.0400928, 0.047277283), target1);
	target1 = MulAdd(nf, MF4x4(0.040201366, 0.12845124, 0.6901938, -0.009195482, 0.014911491, -0.06885409, -0.08029354, 0.1280681, 0.13877457, 0.0048243836, -0.13357066, 0.02874182, -0.07086705, -0.08369575, 0.070227675, 0.1674778), target1);
	target1 = MulAdd(ng, MF4x4(-0.009859274, -0.06701725, 0.25491804, -0.035013054, 0.15333284, -0.055876795, -0.22912641, -0.30044466, 0.05092424, 0.15086575, -0.062285095, 0.05064704, 0.02725196, 0.0008295126, -0.24010411, -0.0076930025), target1);
	target1 = MulAdd(nh, MF4x4(-0.033275966, -0.25090593, 0.2981365, 0.12117296, -0.04844607, 0.12529893, 0.041575357, -0.10317985, 0.048691675, 0.13610789, -0.15120777, -0.21308705, -0.019387634, 0.20519307, -0.09056782, -0.04757386), target1);
	target1 = MulAdd(ni, MF4x4(-0.010075166, -0.08621876, -0.19569752, 0.1553574, -0.115346536, -0.009765705, -0.37459797, -0.017294222, -0.18065308, 0.052127127, 0.045157496, 0.11466202, 0.036598917, 0.1750653, -0.18558112, 0.13441156), target1);

	// Centre texel of tex3 (no neighbour taps); it is split into positive and
	// negative halves inline below.
	MF4 conv2d_1_tf = tex3.SampleLevel(sam, pos, 0);

	// Per-texel mix with no neighbour taps: the centre tap (e / ne) plus the
	// positive and negative halves of conv2d_5_tf, conv2d_1_tf and target1. That
	// is 8 x 4 = 32 input channels, matching "Conv-4x1x1x32" in this pass's DESC.
	MF4 target2 = { -0.04242169, -0.0033301958, -0.016717333, -0.0006306486 };
	target2 = MulAdd(e, MF4x4(0.16594207, 0.47900248, 0.15186168, -0.38448718, -0.33396608, -0.12204449, -0.21397614, 0.22567725, 0.2399077, 0.16945037, 0.072409995, -0.015192162, -0.5004075, -0.10852234, 0.14456534, 0.36797065), target2);
	target2 = MulAdd(ne, MF4x4(-0.03527082, -0.13062008, 0.2529196, 0.16799021, 0.2743078, 0.22924475, 0.4391596, -0.34473032, -0.08008852, 0.14463465, -0.30243787, 0.0352092, 0.49160767, 0.18479864, -0.13473135, -0.40414095), target2);
	target2 = MulAdd(max(conv2d_5_tf, 0), MF4x4(0.14367065, 0.058683306, 0.091011606, 0.15336677, -0.119622074, 0.04199915, -0.19148684, -0.103310175, 0.116265774, -0.105254985, 0.6245667, -0.26108894, 0.18143174, -0.1839799, 0.048575178, -0.55331755), target2);
	target2 = MulAdd(max(-conv2d_5_tf, 0), MF4x4(0.35027766, 0.03997352, -0.023643266, -0.3330187, -0.10459313, -0.4023968, 0.07325048, -0.09424643, 0.06866858, 0.53465986, -0.44508684, 0.18428375, -0.23138772, 0.027757954, 0.17421234, 0.026670102), target2);
	target2 = MulAdd(max(conv2d_1_tf, 0), MF4x4(-0.4365351, 0.22217907, -0.6871689, 0.045348447, 0.15043557, -0.48645085, -0.29547492, 0.057184387, -0.03682008, 0.3751258, -0.3201267, -0.17569698, 0.3118066, -0.3671979, 0.41987854, -0.122571744), target2);
	target2 = MulAdd(max(-conv2d_1_tf, 0), MF4x4(0.44111615, -0.40698248, 0.0016049108, -0.25277275, -0.28967234, 0.016609022, 0.5386827, 0.069790244, -0.51845384, 0.024502689, -0.026591584, 0.17351557, 0.12391694, 0.08250939, -0.08813545, 0.43510008), target2);
	target2 = MulAdd(max(target1, 0), MF4x4(-0.15770161, -0.27004284, -0.56035084, 0.15914616, 0.22454856, 0.3096621, 0.45845222, -0.008859915, 0.10483775, 0.14181131, 0.026368458, -0.0063670245, 0.24472655, -0.038785648, -0.14339298, -0.10899222), target2);
	target2 = MulAdd(max(-target1, 0), MF4x4(-0.034405068, -0.2823658, 0.050728954, -0.08360402, -0.11867297, -0.20057304, -0.011291816, 0.08128843, 0.07198962, 0.41366118, -0.40760013, -0.05193347, -0.31802976, 0.11970909, 0.09838232, -0.08124989), target2);

	// target1 is stored unclamped, so later readers can still split it into
	// positive and negative halves.
	tex2[gxy] = target1;
	tex5[gxy] = target2;
}


//!PASS 4
//!DESC Conv-4x3x3x8, Conv-4x1x1x40
//!IN tex5, tex2, tex3
//!OUT tex4, tex6
//!BLOCK_SIZE 8
//!NUM_THREADS 64

void Pass4(uint2 blockStart, uint3 threadId) {
	uint2 gxy = TileSwizzle8x8(threadId.x) + blockStart;
	uint2 inputSize = GetInputSize();
	if (gxy.x >= inputSize.x || gxy.y >= inputSize.y) {
		return;
	}

	float2 inputPt = GetInputPt();
	float2 pos = (gxy + 0.5f) * inputPt;

	// [ a, d, g ]
	// [ b, e, h ]
	// [ c, f, i ]
	MF4 a = tex5.SampleLevel(sam, pos - inputPt, 0);
	MF4 b = tex5.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0);
	MF4 c = tex5.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0);
	MF4 d = tex5.SampleLevel(sam, pos + float2(0, -inputPt.y), 0);
	MF4 e = tex5.SampleLevel(sam, pos, 0);
	MF4 f = tex5.SampleLevel(sam, pos + float2(0, inputPt.y), 0);
	MF4 g = tex5.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0);
	MF4 h = tex5.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
	MF4 i = tex5.SampleLevel(sam, pos + inputPt, 0);

	MF4 na = max(-a, 0);
	MF4 nb = max(-b, 0);
	MF4 nc = max(-c, 0);
	MF4 nd = max(-d, 0);
	MF4 ne = max(-e, 0);
	MF4 nf = max(-f, 0);
	MF4 ng = max(-g, 0);
	MF4 nh = max(-h, 0);
	MF4 ni = max(-i, 0);

	a = max(a, 0);
	b = max(b, 0);
	c = max(c, 0);
	d = max(d, 0);
	e = max(e, 0);
	f = max(f, 0);
	g = max(g, 0);
	h = max(h, 0);
	i = max(i, 0);

	MF4 conv2d_8_tf = { 0.017177593, -0.03303642, 0.018293152, -0.0153594585 };
	conv2d_8_tf = MulAdd(a, MF4x4(-0.43036512, 0.052133385, 0.1917228, -0.0080327755, -0.13650647, 0.23129214, -0.03926996, -0.07268268, -0.039649602, -0.04959827, 0.04222682, 0.00578327, -0.6177682, -0.5984116, -0.055091057, -0.41249448), conv2d_8_tf);
	conv2d_8_tf = MulAdd(b, MF4x4(-0.41248822, 0.42497736, 0.3476831, 0.11943562, 0.071097784, 0.1390214, 0.05519766, -0.13476476, -0.36376685, 0.058813993, -0.05142066, 0.059006505, -0.17129485, 0.18402734, 0.412061, -0.38983205), conv2d_8_tf);
	conv2d_8_tf = MulAdd(c, MF4x4(-0.19183454, -0.11911039, 0.20892574, 0.1218832, -0.23423564, 0.10342528, 0.09782025, 0.027760351, -0.08676245, 0.07389133, 0.009934853, 0.015378812, 0.28361297, -0.23730409, -0.10037592, -0.24095006), conv2d_8_tf);
	conv2d_8_tf = MulAdd(d, MF4x4(0.035607535, -0.3156877, -0.013944192, 0.22095163, 0.20762561, -0.26094976, 0.049627785, -0.20424393, 0.07220507, 0.14855692, -0.04763761, 0.09102831, -0.6707187, 0.044909656, 0.73606086, 0.3112647), conv2d_8_tf);
	conv2d_8_tf = MulAdd(e, MF4x4(0.28717026, -0.027964758, 0.19860156, -0.18898363, -0.10064204, 0.05297523, 0.014720102, -0.10856063, -0.517343, -0.17088185, 0.21192405, 0.040609106, 0.07515164, -0.22581428, 0.54721195, 0.40544033), conv2d_8_tf);
	conv2d_8_tf = MulAdd(f, MF4x4(-0.021332845, -0.28534392, -0.053418603, -0.5890941, 0.3246433, 0.255651, 0.07088422, -0.10737213, -0.116894506, 0.13120323, 0.09616092, -0.0067616547, 0.085571416, 0.14623387, -0.26895332, -0.12028506), conv2d_8_tf);
	conv2d_8_tf = MulAdd(g, MF4x4(-0.052351072, -0.73936135, -0.07819111, -0.35983723, 0.13252614, -0.3479261, -0.07381629, 0.008948218, 0.0053645126, -0.039163757, -0.061387096, 0.0041966103, -0.22976315, -0.10269704, 0.5676015, -0.2502383), conv2d_8_tf);
	conv2d_8_tf = MulAdd(h, MF4x4(0.09443165, 0.13924311, 0.15899155, -0.029454758, 0.002642519, 0.4178081, -0.19227526, 0.25177202, -0.26731998, -0.14999937, -0.15141752, -0.16183105, -0.4617529, -0.43337283, 0.2787283, -0.72364557), conv2d_8_tf);
	conv2d_8_tf = MulAdd(i, MF4x4(0.18768649, -0.33622888, 0.10795176, -0.3965141, -0.1887279, 0.2281405, -0.45963305, -0.16073631, -0.015594818, 0.07035953, -0.16940016, -0.28909472, -0.017725285, -0.35240498, 0.30173686, 0.20117418), conv2d_8_tf);
	conv2d_8_tf = MulAdd(na, MF4x4(0.03129677, -0.04133618, -0.011259672, 0.03561297, 0.0852418, 0.04584553, 0.19103919, 0.09809102, -0.14594959, -0.4438363, 0.16297287, -0.20317835, 0.115456745, -0.06761671, 0.15409957, 0.04450018), conv2d_8_tf);
	conv2d_8_tf = MulAdd(nb, MF4x4(0.039826628, -0.45614466, 0.0642495, 0.05919764, -0.44811794, 0.30939403, -0.09915154, 0.1356114, 0.24242148, -0.5744648, 0.051002555, 0.2401494, -0.24656531, -0.025525048, 0.0022000005, 0.16019441), conv2d_8_tf);
	conv2d_8_tf = MulAdd(nc, MF4x4(-0.30609047, -0.44622147, -0.1323853, 0.27586594, 0.28131932, -0.1788347, -0.13601942, -0.056978267, 0.1390773, 0.023616405, 0.23695482, 0.014369665, 0.1065836, 0.2862605, 0.12936947, -0.08392774), conv2d_8_tf);
	conv2d_8_tf = MulAdd(nd, MF4x4(-0.21285766, -0.19791842, -0.08064578, -0.15698087, -0.6196114, -0.30824217, -0.048959345, 0.30395007, -0.41899, -0.3358852, -0.097170554, 0.28982377, 0.087944746, 0.15887393, 0.12179637, -0.33221152), conv2d_8_tf);
	conv2d_8_tf = MulAdd(ne, MF4x4(-0.13241346, 0.035703655, -0.4474765, 0.110112734, -0.27055773, 0.41301596, -0.6500781, -0.15217184, -0.2048386, 0.011350564, -0.45242086, 0.4019483, -0.13381444, -0.34816414, -0.5594909, 0.06767518), conv2d_8_tf);
	conv2d_8_tf = MulAdd(nf, MF4x4(-0.16038893, 0.035530727, -0.029575568, 0.4231352, 0.024787677, 0.63239074, -0.039876997, -0.025136393, -0.51243687, 0.05607693, -0.26631242, 0.089419514, -0.051774174, 0.08727033, -0.055868924, -0.0934304), conv2d_8_tf);
	conv2d_8_tf = MulAdd(ng, MF4x4(0.08607903, 0.10347359, -0.08568057, -0.04361689, -0.09244961, 0.032459106, 0.07126668, 0.40926656, -0.17473985, -0.2854381, -0.07475363, -0.16183083, 0.22286943, 0.068349905, -0.07890174, -0.18732166), conv2d_8_tf);
	conv2d_8_tf = MulAdd(nh, MF4x4(0.17825048, -0.31030193, -0.21215369, 0.015413245, -0.0980228, -0.3963089, -0.09465454, -0.39197174, 0.22134416, -0.10105557, 0.3249675, -0.027290137, -0.10875647, -0.2393993, -0.015305307, 0.21288091), conv2d_8_tf);
	conv2d_8_tf = MulAdd(ni, MF4x4(0.26367134, -0.11709682, 0.10634492, -0.13768406, 0.5535611, 0.6967819, -0.31092402, -0.5262172, 0.14721805, -0.05149995, 0.22435789, -0.21493623, 0.27388602, -0.14029293, -0.1060113, 0.083680965), conv2d_8_tf);

	MF4 target1 = { 0.034884464, 0.055267137, 0.03452981, 0.012002485 };
	target1 = MulAdd(a, MF4x4(0.43671334, -0.16534646, -0.13688485, -0.008512402, -0.10336664, -0.08822921, -0.116312236, -0.038849946, -0.035221335, 0.019403309, 0.060067646, -0.025432155, 0.090118125, -0.117073216, 0.16502255, 0.034231257), target1);
	target1 = MulAdd(b, MF4x4(0.17112842, -0.023511292, -0.2592198, -0.07303919, 0.048081987, -0.054403186, -0.060226068, -0.2663483, 0.16908844, -0.11529753, -0.036192283, 0.05631556, -0.12996213, 0.32429552, -0.17090482, 0.37093237), target1);
	target1 = MulAdd(c, MF4x4(-0.0398796, -0.21753207, -0.014232783, 0.04652695, 0.06361906, 0.11714849, -0.116917215, -0.0088206185, -0.15661797, 0.11036933, 0.043800946, 0.0088503305, 0.15252474, -0.21677117, -0.26665527, 0.11332868), target1);
	target1 = MulAdd(d, MF4x4(0.14935064, 0.03734691, 0.08192101, -0.28615516, 0.19225292, 0.09485945, -0.018961852, -0.04503368, -0.14962928, 0.14281853, 0.015293623, -0.0051231394, 0.31510183, 0.28869596, 0.1890055, -0.07833456), target1);
	target1 = MulAdd(e, MF4x4(0.2734724, 0.37409434, -0.2611236, 0.06528365, -0.1886752, 0.045421556, 0.25771844, 0.14760634, -0.02859129, -0.071093805, -0.1635561, 0.06800318, 0.44370538, 0.43510497, 0.15145455, -0.029246451), target1);
	target1 = MulAdd(f, MF4x4(0.17102292, 0.33519942, 0.2755555, -0.24724208, 0.042192735, -0.6907692, -0.10582406, 0.2008313, 0.04859614, -0.24115612, 0.015256011, -0.029317714, -0.057466604, -0.1004556, 0.24814546, -0.22135083), target1);
	target1 = MulAdd(g, MF4x4(0.20959556, 0.113371, -0.021680012, -0.054057337, -0.017139604, -0.082443535, -0.03216185, 0.13644056, -0.105473205, -0.033690784, 0.030838218, 0.013347346, 0.49752173, -0.14028637, -0.23801191, 0.059374087), target1);
	target1 = MulAdd(h, MF4x4(0.054281052, 0.04908332, 0.065993994, -0.09818599, 0.17124225, -0.22669722, -0.090717405, 0.20086871, 0.05861675, 0.09584638, 0.18013628, 0.026234226, 0.32684898, 0.28582916, -0.03517119, -0.21534745), target1);
	target1 = MulAdd(i, MF4x4(0.2143339, -0.009243758, -0.043321237, -0.18695052, 0.0707111, -0.052678097, 0.04782485, 0.06970353, -0.029827276, 0.10827879, 0.049044352, -0.09452859, -0.08516196, 0.11786405, -0.18170272, -0.117841594), target1);
	target1 = MulAdd(na, MF4x4(-0.23180094, 0.079831, -0.17606014, -0.06691572, 0.13079396, -0.054930445, 0.025274629, 0.059386294, 0.18818773, 0.071563244, -0.19136675, 0.031156426, 0.12569802, 0.057418842, -0.022066243, 0.09572557), target1);
	target1 = MulAdd(nb, MF4x4(0.13405065, -0.038109858, 0.19447789, -0.121862344, -0.5014013, 0.030394621, -0.11468341, 0.24658446, -0.2861801, 0.11453208, 0.17080295, 0.32403797, 0.01776269, 0.21879151, -0.1487332, -0.13659461), target1);
	target1 = MulAdd(nc, MF4x4(-0.16852567, 0.37488598, 0.103131816, 0.15805401, -0.5529941, -0.0106922565, 0.14309406, 0.018851891, 0.18253598, -0.18453355, -0.14344332, 0.14581451, 0.00017439971, -0.22823274, -0.02480218, -0.28830686), target1);
	target1 = MulAdd(nd, MF4x4(-0.036933262, -0.105577976, 0.02778643, 0.21757011, -0.0051288083, 0.036500473, 0.12934865, -0.18750058, 0.05384686, -0.14823805, 0.12996665, -0.0717687, 0.15035072, 0.00028661545, -0.4272515, 0.102082215), target1);
	target1 = MulAdd(ne, MF4x4(0.3707243, -0.34236187, -0.037726954, 0.19196671, 0.101593964, 0.3211922, -0.30584693, -0.09473774, -0.012873282, -0.26314828, -0.3015266, -0.05155332, -0.23810461, -0.17289765, 0.16493215, 0.07951415), target1);
	target1 = MulAdd(nf, MF4x4(-0.054548983, 0.20742553, -0.17368966, -0.11417929, -0.14998713, 0.14250377, 0.08688373, -0.39742398, -0.29795423, 0.3917638, -0.24611169, -0.007993072, -0.052766692, -0.05993209, -0.017495412, 0.2881331), target1);
	target1 = MulAdd(ng, MF4x4(-0.05283335, 0.081839375, 0.013510656, -0.097930856, -0.09817993, -0.10169309, -0.024573473, -0.061191153, 0.14742163, 0.12549889, 0.21033141, -0.11116201, -0.046900082, 0.052657153, -0.10784069, 0.0005640972), target1);
	target1 = MulAdd(nh, MF4x4(0.036850937, -0.004740191, -0.105057694, 0.16894996, -0.39845806, -0.11454543, 0.044997875, 0.10780206, -0.15164936, -0.030377366, -0.015979659, -0.16242398, -0.045865484, 0.04037505, -0.03663904, 0.24529697), target1);
	target1 = MulAdd(ni, MF4x4(0.0041185757, 0.0843081, 0.07231875, 0.100667596, -0.31684703, -0.2574812, -0.03461963, 0.11267055, -0.22542828, -0.104221806, -0.095156625, -0.08219916, 0.18497708, -0.08431334, -0.074380755, 0.07518058), target1);

	MF4 conv2d_1_tf = tex3.SampleLevel(sam, pos, 0);
	MF4 conv2d_4_tf = tex2.SampleLevel(sam, pos, 0);

	MF4 target2 = { -0.048349448, -0.027946962, -0.014499015, -0.017825816 };
	target2 = MulAdd(e, MF4x4(0.09644354, -0.12061228, -0.15139145, 0.010084075, 0.19283041, -0.15289722, 0.0028078665, 0.15971705, -0.03884288, -0.06906346, -0.04772131, 0.32280502, -0.42069855, 0.21643022, -0.8389786, -0.50325495), target2);
	target2 = MulAdd(ne, MF4x4(0.18034904, 0.037142154, 0.41413367, 0.08413125, -0.14397736, -0.4820656, 0.32794252, 0.2589487, 0.46948192, 0.26964813, -0.07420985, -0.16767345, 0.086358115, -0.10306444, 0.36070088, 0.1681583), target2);
	target2 = MulAdd(max(conv2d_8_tf, 0), MF4x4(0.35362276, 0.012461055, -0.77784586, 0.09078976, 0.19976044, 0.17758635, -0.37238386, -0.03503108, 0.13998942, -0.37809366, 0.016560063, 0.3934089, -0.25227416, -0.123653956, -0.05106222, 0.005900442), target2);
	target2 = MulAdd(max(-conv2d_8_tf, 0), MF4x4(0.057956465, -0.049570814, 0.0606723, -0.20321843, -0.26415482, -0.27723017, 0.116116256, 0.091267794, -0.14814565, 0.25946814, 0.17341542, 0.14638402, 0.2880723, 0.10809813, 0.025261842, -0.34984475), target2);
	target2 = MulAdd(max(conv2d_1_tf, 0), MF4x4(0.05510083, 0.17530598, -0.20630372, -0.027601322, 0.017287979, 0.1857018, -0.41756013, -0.14747128, 0.36301833, 0.13361412, 0.021245379, 0.08700895, -0.15968269, -0.32113054, 0.019964505, -0.15953153), target2);
	target2 = MulAdd(max(-conv2d_1_tf, 0), MF4x4(-0.12913038, -0.21853726, -0.14845535, -0.2878481, 0.060428645, -0.12468173, -0.0068141054, 0.044517014, -0.3603185, -0.21329117, -0.029232644, 0.033500195, 0.4367195, -0.048263986, 0.36913735, -0.015526651), target2);
	target2 = MulAdd(max(conv2d_4_tf, 0), MF4x4(0.15424874, 0.09803074, -0.4081566, -0.24807191, -0.21617292, -0.26116055, -0.19488858, 0.13665622, -0.23223704, 0.13516016, -0.19990326, -0.09589857, 0.2877168, -0.18335378, -0.12726076, -0.01706245), target2);
	target2 = MulAdd(max(-conv2d_4_tf, 0), MF4x4(0.17850566, 0.11283147, 0.0941847, 0.07064274, 0.23485339, 0.053585358, 0.038221374, -0.052291602, -0.085393615, -0.43200582, -0.3899717, -3.6526293e-05, -0.1805902, 0.15160961, -0.25388122, -0.10506431), target2);
	target2 = MulAdd(max(target1, 0), MF4x4(0.10518986, 0.4441116, -0.16333202, -0.15620118, -0.025791602, -0.2971725, 0.27621722, 0.15761738, 0.008179799, 0.4354704, 0.8792617, 0.98227674, 0.27862114, -0.28962052, 0.08527341, 0.06820025), target2);
	target2 = MulAdd(max(-target1, 0), MF4x4(-0.002976883, -0.220515, -0.2764896, 0.03840775, 0.09852327, 0.09890841, 0.6333531, 0.05949176, -0.12757486, 0.12711844, -0.103355624, -0.2612116, -0.92972547, 0.20546664, 0.43557793, 0.14573197), target2);

	tex4[gxy] = target1;
	tex6[gxy] = target2;
}


//!PASS 5
//!DESC Conv-4x3x3x8, Conv-4x1x1x48
//!IN tex6, tex3, tex2, tex4
//!OUT tex1, tex5, tex7
//!BLOCK_SIZE 8
//!NUM_THREADS 64

// Pass 5 entry point. For the texel assigned to the calling thread it:
//   1. reads the 3x3 neighbourhood of tex6 and splits every tap into its
//      positive part max(x, 0) and its negative part max(-x, 0);
//   2. accumulates two 4-channel results (target1, target2) over those
//      18 half-taps, each chain starting from a constant initial vector;
//   3. accumulates a third 4-channel result (target3) from the centre tap of
//      tex6, the same-position texels of tex3, tex2 and tex4, and the
//      target1/target2 values just computed, again split into +/- parts;
//   4. stores target1 -> tex1, target2 -> tex5, target3 -> tex7.
//
// blockStart: added to the per-thread offset to form the texel coordinate.
// threadId:   only .x is used, as the argument of TileSwizzle8x8.
//
// NOTE(review): MulAdd, TileSwizzle8x8, GetInputSize and GetInputPt are not
// defined in this file. MulAdd(v, m, acc) presumably returns mul(v, m) + acc;
// confirm against the helper pulled in by the MulAdd directive in the header.
// NOTE(review): the numeric literals are presumably trained network weights
// emitted by a converter; treat their values and the order of each MulAdd
// chain as data rather than hand-editable code - confirm before changing.
void Pass5(uint2 blockStart, uint3 threadId) {
	// Texel handled by this thread: the block origin plus the offset that
	// TileSwizzle8x8 derives from the flat thread index.
	uint2 gxy = TileSwizzle8x8(threadId.x) + blockStart;
	uint2 inputSize = GetInputSize();
	// Threads whose texel lies outside the GetInputSize() extent do nothing.
	if (gxy.x >= inputSize.x || gxy.y >= inputSize.y) {
		return;
	}

	// Sampling coordinate of this texel. The +0.5 targets the texel centre,
	// assuming inputPt is the reciprocal of the input size - TODO confirm.
	float2 inputPt = GetInputPt();
	float2 pos = (gxy + 0.5f) * inputPt;

	// 3x3 neighbourhood of tex6 around pos, one inputPt step per tap. The
	// letters below show where each tap sits (columns run left to right,
	// rows top to bottom). sam is the sampler declared with FILTER POINT.
	// [ a, d, g ]
	// [ b, e, h ]
	// [ c, f, i ]
	MF4 a = tex6.SampleLevel(sam, pos - inputPt, 0);
	MF4 b = tex6.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0);
	MF4 c = tex6.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0);
	MF4 d = tex6.SampleLevel(sam, pos + float2(0, -inputPt.y), 0);
	MF4 e = tex6.SampleLevel(sam, pos, 0);
	MF4 f = tex6.SampleLevel(sam, pos + float2(0, inputPt.y), 0);
	MF4 g = tex6.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0);
	MF4 h = tex6.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
	MF4 i = tex6.SampleLevel(sam, pos + inputPt, 0);

	// Negative parts of every tap: max(-x, 0). These must be taken before
	// a..i are overwritten with their positive parts just below.
	MF4 na = max(-a, 0);
	MF4 nb = max(-b, 0);
	MF4 nc = max(-c, 0);
	MF4 nd = max(-d, 0);
	MF4 ne = max(-e, 0);
	MF4 nf = max(-f, 0);
	MF4 ng = max(-g, 0);
	MF4 nh = max(-h, 0);
	MF4 ni = max(-i, 0);

	// Positive parts of every tap: max(x, 0), stored back into a..i.
	a = max(a, 0);
	b = max(b, 0);
	c = max(c, 0);
	d = max(d, 0);
	e = max(e, 0);
	f = max(f, 0);
	g = max(g, 0);
	h = max(h, 0);
	i = max(i, 0);

	// First output (written to tex1). The initializer is the starting value of
	// the accumulation (presumably the layer bias); each following line folds
	// one half-tap in through its own 4x4 matrix: positive parts a..i first,
	// then negative parts na..ni.
	MF4 target1 = { -0.10319947, 0.010868113, 0.0143356435, -0.007343647 };
	target1 = MulAdd(a, MF4x4(0.099030726, -0.06836123, 0.08793171, -0.08440806, 0.1367897, -0.18130925, -0.061028607, -0.0036578078, -0.2664728, 0.11683366, -0.106817886, 0.054352235, -0.037010342, -0.04099114, -0.024939198, 0.17543977), target1);
	target1 = MulAdd(b, MF4x4(-0.005120602, 0.033574037, 0.15293613, 0.14662915, 0.16131143, 0.14048538, -0.07979977, -0.09974233, 0.12065904, -0.027316207, 0.05308134, -0.39921048, -0.11916608, 0.05068417, -0.064156584, 0.0906338), target1);
	target1 = MulAdd(c, MF4x4(0.19719984, 0.031454016, 0.057130553, -0.08133089, -0.48387995, -0.20429122, -0.2968695, 0.17029694, 0.2686546, -0.32400158, 0.23564363, -0.12357238, -0.039444853, -0.25260264, -0.045210194, 0.009996893), target1);
	target1 = MulAdd(d, MF4x4(0.24888185, -0.16971394, 0.23991539, -0.20469886, -0.05449719, -0.22697294, -0.19475369, -0.14052935, 0.15595771, 0.09519395, -0.18674417, -0.19258659, -0.18656066, -0.07679601, 0.04305061, -0.052698307), target1);
	target1 = MulAdd(e, MF4x4(0.26016366, 0.37886587, 0.29538265, 0.13591415, 0.08657945, 0.2248858, 0.13191143, -0.27878642, 0.38287383, -0.24528888, 0.16275367, -0.4445379, -0.15009366, 0.21030647, 0.04707718, -0.36865705), target1);
	target1 = MulAdd(f, MF4x4(0.00060599507, -0.063061595, 0.09708327, 0.18096425, -0.18803552, -0.15204777, -0.21307996, 0.25915486, 0.180343, 0.15965502, 0.4193544, 0.11587751, -0.01724538, -0.0003311443, 0.118263096, 0.3388005), target1);
	target1 = MulAdd(g, MF4x4(-0.11013732, -0.24454343, 0.11523979, 0.16267157, 0.037852544, -0.018723588, -0.044225607, 0.010824283, -0.09449054, -0.43009904, 0.17163227, 0.058022983, 0.3704038, -0.124312826, -0.04090871, -0.41738933), target1);
	target1 = MulAdd(h, MF4x4(-0.08466185, -0.032986447, -0.12251885, -0.061746452, -0.28120902, -0.03351265, -0.07977477, 0.035497896, -0.40911916, -0.265343, 0.18400514, 0.18039864, 0.2885377, 0.17138512, -0.2672905, -0.17658347), target1);
	target1 = MulAdd(i, MF4x4(0.14892288, 0.054083705, 0.074718416, 0.011234817, -0.1644216, 0.10958687, 0.016626561, 0.13260235, 0.15622494, 0.028492622, 0.16308293, 0.0817191, 0.004302441, -0.03425889, 0.019733155, 0.20729025), target1);
	target1 = MulAdd(na, MF4x4(-0.10912273, 0.18627015, -0.12923245, -0.007432667, -0.15062776, 0.1132029, -0.039932206, -0.048926212, -0.19350322, -0.052288085, -0.062460408, 0.06341913, -0.22352171, 0.12735958, -0.030772611, 0.10314876), target1);
	target1 = MulAdd(nb, MF4x4(0.055571638, -0.29345444, -0.05150461, 0.038981512, -0.20368473, -0.1620652, 0.2212063, 0.16812243, -0.25869122, -0.055914585, 0.1699279, 0.09515419, -0.051229157, 0.029384349, 0.2958992, 0.33411613), target1);
	target1 = MulAdd(nc, MF4x4(-0.16893966, -0.11777383, -0.1890183, 0.3100362, 0.32964075, 0.1503138, 0.23687156, -0.1966872, -0.34989685, 0.018697567, -0.054476835, 0.2467992, 0.1404086, 0.042806204, 0.22713056, -0.07194008), target1);
	target1 = MulAdd(nd, MF4x4(0.1294499, 0.08734431, -0.27748963, -0.30450672, 0.347131, 0.10832939, 0.094416045, -0.021583052, -0.03705905, 0.13216147, 0.060019907, 0.17617045, -0.31731188, 0.055844136, -0.32436728, 0.09127553), target1);
	target1 = MulAdd(ne, MF4x4(-0.37301856, -0.59706587, 0.14188358, -0.11759082, -0.123990245, 0.17104799, -0.22897844, 0.044174567, 0.08194783, 0.5041956, 0.080176726, 0.30695775, 0.14737315, 0.06887362, -0.14944588, 0.041438155), target1);
	target1 = MulAdd(nf, MF4x4(0.028311472, -0.12458831, 0.09180698, 0.21692544, 0.26750755, -0.095768556, 0.37605208, -0.09700436, -0.43799365, -0.2001086, -0.22588708, 0.21119161, 0.017415013, 0.15119827, -0.015756091, -0.097044095), target1);
	target1 = MulAdd(ng, MF4x4(0.07018085, 0.07628864, 0.03961951, 0.032012466, 0.09119677, -0.11489552, 0.086640276, -0.10799725, -0.09006475, 0.18994014, 0.015971951, 0.025477583, 0.034011904, -0.07448855, -0.090691224, -0.08970111), target1);
	target1 = MulAdd(nh, MF4x4(-0.036299143, 0.14122474, -0.1863209, 0.1802412, 0.25498003, 0.12084085, -0.15148233, -0.15718026, 0.00034174722, 0.13090368, -0.17938401, -0.064941354, -0.42650834, -0.24431564, 0.1735792, -0.08763975), target1);
	target1 = MulAdd(ni, MF4x4(-0.018800588, -0.09828807, 0.022626605, 0.19307971, 0.2295834, 0.021806285, 0.17869954, -0.089709155, 0.039047185, 0.1444108, -0.058205944, -0.0141449645, 0.10705844, 0.17592433, -0.017586943, 0.100735694), target1);

	// Second output (written to tex5): same 18 half-taps of tex6, with its own
	// initial vector and its own set of 4x4 matrices.
	MF4 target2 = { -0.0891901, 0.05071113, -0.026449949, -0.0051819966 };
	target2 = MulAdd(a, MF4x4(-0.034931988, -0.10314893, 0.050731838, 0.008667428, 0.093605734, 0.18763398, 0.1329972, 0.32109565, 0.018679736, 0.16050446, -0.21393016, -0.5850818, -0.03595686, -0.06816087, 0.058053996, 0.14945738), target2);
	target2 = MulAdd(b, MF4x4(0.13086358, 0.1037956, 0.024482725, 0.28596595, 0.03427747, 0.03360277, -0.08412939, -0.09863662, -0.14649919, 0.049508557, -0.040583454, -0.3193693, 0.09898459, -0.055807225, -0.13826977, -0.24508655), target2);
	target2 = MulAdd(c, MF4x4(0.022690594, -0.049172435, -0.043048073, 0.28297383, -0.12327597, 0.12841734, 0.19118458, -0.14444864, 0.25481266, -0.1530131, -0.32560238, 0.28813502, 0.07987849, -0.081693284, 0.023993304, 0.051493756), target2);
	target2 = MulAdd(d, MF4x4(-0.21383128, 0.10948106, 0.29768178, 0.5630563, -0.097254336, 0.3000293, 0.27545682, -0.10354583, 0.064267136, -0.0722382, 0.16716443, -0.29272497, 0.124174535, -0.09405645, -0.07759505, -0.63239044), target2);
	target2 = MulAdd(e, MF4x4(-0.049770556, -0.2611922, -0.11767422, -0.056895554, -0.10655438, 0.15822971, -0.15873717, -0.034663625, -0.22618848, -0.037567407, 0.8648974, 0.15630767, 0.24981938, 0.15488663, -0.01769864, -0.05102535), target2);
	target2 = MulAdd(f, MF4x4(0.021745246, -0.019828277, -0.2533036, 0.08191131, 0.21484213, 0.07265768, 0.13022637, 0.12640825, 0.3097948, 0.1656624, 0.29834095, 0.26926345, 0.1445516, -0.096134044, 0.23720652, 0.104119554), target2);
	target2 = MulAdd(g, MF4x4(-0.0026226363, -0.11969785, -0.07630252, 0.48163646, 0.020707106, 0.098053664, 0.15194124, -0.067455925, -0.0072260266, -0.063311785, -0.13165388, -0.2720021, 0.056918275, -0.46139827, 0.062053606, -0.2062505), target2);
	target2 = MulAdd(h, MF4x4(0.18370466, -0.21412961, -0.08481129, 0.012198226, -0.08129054, 0.5550795, 0.047955874, 0.2502166, -0.07373375, 0.28914857, -0.0046189106, -0.014052611, -0.1366542, -0.4555943, -0.053266894, 0.4447608), target2);
	target2 = MulAdd(i, MF4x4(-0.028673984, -0.05453405, -0.118545935, -0.069395766, 0.17180833, 0.17611517, 0.13780451, 0.28597325, -0.07254466, 0.05339366, 0.0095731495, 0.17107281, 0.08671597, -0.06200009, -0.06297748, 0.08674916), target2);
	target2 = MulAdd(na, MF4x4(-0.040299665, 0.095958404, 0.052906267, -0.48397818, -0.1331588, -0.0012678325, -0.042020816, -0.33833674, -0.012395556, 0.07671447, -0.15005252, -0.083733305, 0.12279073, 0.13883469, -0.10359484, -0.31333458), target2);
	target2 = MulAdd(nb, MF4x4(0.14495945, -0.12174993, -0.11281622, -0.018538697, -0.14329918, 0.12817283, -0.046540275, -0.1030246, -0.1832771, -0.30401602, -0.33390167, -0.052471336, 0.12632851, 0.23514742, 0.0011784412, -0.49560672), target2);
	target2 = MulAdd(nc, MF4x4(0.08295849, 0.044828687, 0.27639604, 0.039427668, 0.02818349, -0.06210292, -0.27352595, 0.19817229, -0.18440844, -0.06898423, 0.0017214341, -0.18130824, -0.0071537187, 0.03517007, -0.2113949, 0.025240164), target2);
	target2 = MulAdd(nd, MF4x4(-0.2006673, -0.041704424, 0.16268894, -0.25376207, 0.07905478, -0.17365594, 0.10044552, -0.20418073, 0.085226685, -0.16344517, -0.11064805, -0.2824042, 0.00095205643, 0.31177342, -0.3084233, -0.0908839), target2);
	target2 = MulAdd(ne, MF4x4(0.26129997, 0.3127755, 0.06982181, 0.23317924, -0.05344337, 0.008762884, 0.20765801, 0.13311344, -0.021598162, 0.0038430444, -0.40633947, 0.09444498, -0.097569115, 0.1161639, 0.051482536, -0.13007577), target2);
	target2 = MulAdd(nf, MF4x4(0.1168701, 0.10319956, -0.26231092, 0.13755418, -0.31545812, 0.21018027, -0.2570223, 0.11072984, 0.169098, -0.092338, 0.19418359, -0.24841106, 0.2179265, 0.26306525, -0.030364338, 0.011455713), target2);
	target2 = MulAdd(ng, MF4x4(0.013165953, -0.027480505, 0.019355817, -0.22797722, 0.10252238, -0.13104701, 0.043106645, -0.113860615, 0.077017605, 0.16079858, -0.13723075, 0.08403468, 0.07229952, -0.07288171, 0.153157, -0.30485252), target2);
	target2 = MulAdd(nh, MF4x4(-0.18590495, -0.02694476, 0.14553905, 0.135362, 0.033088487, -0.49798432, -0.11869643, 0.15896079, 0.09456545, -0.14991766, -0.15788183, -0.13954063, -0.1400199, 0.47176227, 0.1710854, 0.24664737), target2);
	target2 = MulAdd(ni, MF4x4(0.15082799, -0.1990422, -0.07347236, 0.106623515, -0.054368034, -0.10389193, -0.0711653, -0.022524087, -0.056636613, -0.07881972, 0.09727487, -0.16494693, 0.13156064, 0.176482, 0.11008391, 0.16038191), target2);

	// Same-position texels of the three remaining input textures. The
	// conv2d_N_tf names presumably identify layer outputs of the source
	// network - confirm against the model definition.
	MF4 conv2d_1_tf = tex3.SampleLevel(sam, pos, 0);
	MF4 conv2d_4_tf = tex2.SampleLevel(sam, pos, 0);
	MF4 conv2d_7_tf = tex4.SampleLevel(sam, pos, 0);

	// Third output (written to tex7): a per-texel combination with no spatial
	// neighbours. Inputs are the +/- parts of the tex6 centre tap (e, ne),
	// target1, conv2d_1_tf, conv2d_4_tf, conv2d_7_tf and target2. target1 and
	// target2 are the in-register values computed above, not read back from
	// tex1/tex5.
	MF4 target3 = { 0.06043013, -0.057747327, -0.0260778, 0.034383494 };
	target3 = MulAdd(e, MF4x4(-0.2967133, -0.18581349, -0.03749059, 0.30880052, -0.11064016, -0.23309472, 0.05572459, 0.04502667, -0.12098995, 0.1875494, 0.17095889, 0.008563628, -0.16092524, 0.03845401, 0.1908294, 0.10556762), target3);
	target3 = MulAdd(ne, MF4x4(0.23697758, 0.11629349, 0.19466121, -0.41413772, -0.20402254, 0.0062864223, -0.13700421, -0.10543815, -0.03498975, 0.02710536, -0.32383642, 0.12299909, -0.06849518, -0.005379719, 0.15714374, -0.15514039), target3);
	target3 = MulAdd(max(target1, 0), MF4x4(-0.17502604, -0.24644612, -0.13557185, -0.16728596, -0.024457034, -0.28457522, 0.13460088, -0.21639405, 0.057475664, 0.1473123, 0.19220911, -0.12668033, 0.67518485, -0.36505973, -0.16904399, -0.010216019), target3);
	target3 = MulAdd(max(-target1, 0), MF4x4(-0.15164074, 0.2532923, -0.13278177, -0.11557631, -0.23019886, 0.115244605, 0.010407434, 0.044481948, -0.36745974, 0.6252675, -0.7489445, 0.31991, 0.04725299, 0.32507753, 0.3035176, -0.18355042), target3);
	target3 = MulAdd(max(conv2d_1_tf, 0), MF4x4(0.11328097, -0.09094802, -0.03745151, 0.12965176, 0.0051720524, 0.028558291, -0.047848992, 0.23055501, 0.18047509, -0.07151716, 0.05670166, -0.008592144, -0.092078224, -0.013172229, -0.017855234, -0.07338865), target3);
	target3 = MulAdd(max(-conv2d_1_tf, 0), MF4x4(0.123723745, -0.06312486, 0.0427355, -0.11981472, 0.028110307, 0.2275076, -0.019800344, -0.10352946, -0.23628815, 0.24896589, -0.07624697, -0.21491022, -0.13148311, 0.27282125, -0.053250857, -0.15992334), target3);
	target3 = MulAdd(max(conv2d_4_tf, 0), MF4x4(-0.23408101, 0.20139061, 0.0035646914, 0.16009186, -0.1912387, -0.0066828816, -0.13681525, -0.22325766, -0.056139376, -0.0638933, 0.0681208, 0.041838214, -0.016192758, 0.19360517, -0.21080317, 0.113634475), target3);
	target3 = MulAdd(max(-conv2d_4_tf, 0), MF4x4(0.1369719, 0.18950021, 0.019468868, -0.08180063, -0.31615034, 0.028354429, -0.1489749, -0.096815735, 0.22448029, 0.16501611, -0.11709639, -0.047612794, 0.10514418, -0.07882259, 0.2664075, 0.19011621), target3);
	target3 = MulAdd(max(conv2d_7_tf, 0), MF4x4(0.13804765, 0.01748137, 0.18502045, 0.058146507, -0.5661739, 0.14128609, -0.25875592, -0.6150388, -0.031642724, 0.3204696, -0.021026978, -0.3983191, 0.08609409, 0.0042772954, -0.3754959, -0.19454613), target3);
	target3 = MulAdd(max(-conv2d_7_tf, 0), MF4x4(0.09550674, 0.26413566, -0.15292425, -0.13285659, 0.14078279, 0.08191184, 0.066060774, -0.02605145, -0.08946464, 0.11715431, 0.05521046, -0.03218011, -0.31606913, -0.011917866, 0.11926112, 0.145299), target3);
	target3 = MulAdd(max(target2, 0), MF4x4(0.71071726, -0.8614542, -0.050295915, 0.41341305, -0.38318273, 0.1269644, 0.46467987, -0.15950991, -0.75483114, 0.6358254, -0.19257315, -0.5991311, 0.10807353, 0.083646335, 0.032484207, -0.20280145), target3);
	target3 = MulAdd(max(-target2, 0), MF4x4(-0.21395132, 0.37320906, 0.30284703, 0.054482624, 0.10859697, 0.21301107, -0.09715497, -0.047609363, 0.40013343, -0.22015318, 0.09944949, 0.4283713, 0.1767619, 0.15653327, -0.01787549, 0.22862214), target3);

	// Store the raw (signed) results; the +/- split is redone by whichever
	// code reads these textures, as this pass does for its own inputs.
	tex1[gxy] = target1;
	tex5[gxy] = target2;
	tex7[gxy] = target3;
}


//!PASS 6
//!DESC Conv-4x3x3x8, Conv-4x1x1x56
//!IN tex7, tex1, tex3, tex2, tex4, tex5
//!OUT tex6, tex8
//!BLOCK_SIZE 8
//!NUM_THREADS 64

void Pass6(uint2 blockStart, uint3 threadId) {
	uint2 gxy = TileSwizzle8x8(threadId.x) + blockStart;
	uint2 inputSize = GetInputSize();
	if (gxy.x >= inputSize.x || gxy.y >= inputSize.y) {
		return;
	}

	float2 inputPt = GetInputPt();
	float2 pos = (gxy + 0.5f) * inputPt;

	// [ a, d, g ]
	// [ b, e, h ]
	// [ c, f, i ]
	MF4 a = tex7.SampleLevel(sam, pos - inputPt, 0);
	MF4 b = tex7.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0);
	MF4 c = tex7.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0);
	MF4 d = tex7.SampleLevel(sam, pos + float2(0, -inputPt.y), 0);
	MF4 e = tex7.SampleLevel(sam, pos, 0);
	MF4 f = tex7.SampleLevel(sam, pos + float2(0, inputPt.y), 0);
	MF4 g = tex7.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0);
	MF4 h = tex7.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
	MF4 i = tex7.SampleLevel(sam, pos + inputPt, 0);

	MF4 na = max(-a, 0);
	MF4 nb = max(-b, 0);
	MF4 nc = max(-c, 0);
	MF4 nd = max(-d, 0);
	MF4 ne = max(-e, 0);
	MF4 nf = max(-f, 0);
	MF4 ng = max(-g, 0);
	MF4 nh = max(-h, 0);
	MF4 ni = max(-i, 0);

	a = max(a, 0);
	b = max(b, 0);
	c = max(c, 0);
	d = max(d, 0);
	e = max(e, 0);
	f = max(f, 0);
	g = max(g, 0);
	h = max(h, 0);
	i = max(i, 0);

	MF4 conv2d_13_tf = { -0.03207076, 0.045260444, 0.040100798, -0.014172305 };
	conv2d_13_tf = MulAdd(a, MF4x4(0.122954965, 0.18889557, -0.050585095, -0.09285047, 0.041825704, -0.10147826, -0.0524878, 0.042394586, 0.26654795, -0.052367304, 0.32582784, 0.23248254, -0.18429202, -0.036516707, 0.034441825, 0.13747402), conv2d_13_tf);
	conv2d_13_tf = MulAdd(b, MF4x4(0.39325443, 0.12691088, -0.14018032, 0.2601387, -0.0128762275, 0.09533191, -0.15545139, -0.064879976, 0.4752176, -0.46358192, -0.048625924, 0.07356933, -0.030162415, -0.09837143, -0.34081137, 0.09620003), conv2d_13_tf);
	conv2d_13_tf = MulAdd(c, MF4x4(0.11647179, 0.020975508, -0.06064534, -0.1789612, 0.057696175, 0.11116113, -0.015037568, -0.024370348, -0.03656938, -0.2899815, -0.10285936, 0.055147626, 0.19246738, 0.30268162, -0.4149779, -0.0402745), conv2d_13_tf);
	conv2d_13_tf = MulAdd(d, MF4x4(-0.009147066, -0.17453548, 0.23320405, -0.009745345, 0.080975994, 0.07396582, -0.13413322, 0.17224005, -0.19477916, 0.16737588, 0.5310824, -0.48741058, 0.3713329, -0.061815146, -0.19980642, 0.25318542), conv2d_13_tf);
	conv2d_13_tf = MulAdd(e, MF4x4(0.34857947, 0.09298978, 0.20253287, 1.0750674, 0.074417695, 0.15859176, 0.17113946, 0.3587233, -0.3720992, 0.5499863, -0.3334931, -0.7303378, 0.28977355, -0.40827954, -0.15625797, 0.44504634), conv2d_13_tf);
	conv2d_13_tf = MulAdd(f, MF4x4(0.00963027, -0.103650935, -0.15111534, -0.054710496, 0.068436116, -0.04733752, -0.014022155, -0.06435892, 0.46522453, 0.06746723, -0.13256127, -0.354952, 0.036626723, -0.2881872, -0.20110025, 0.18387023), conv2d_13_tf);
	conv2d_13_tf = MulAdd(g, MF4x4(-0.042692482, -0.08184722, 0.29142103, 0.10918554, 0.022569105, -0.03967552, -0.029662814, 0.16549924, -0.06727612, 0.49291298, 0.12881728, -0.02918886, -0.01579875, -0.12708642, -0.21163678, -0.24313599), conv2d_13_tf);
	conv2d_13_tf = MulAdd(h, MF4x4(-0.044082023, -0.047357306, -0.044077095, 0.20591871, -0.015887344, 0.05115381, -0.19811073, -0.035676513, 0.019275555, 0.4578326, 0.5141636, 0.0702626, 0.13119744, -0.17745942, -0.1892288, -0.062224492), conv2d_13_tf);
	conv2d_13_tf = MulAdd(i, MF4x4(0.06651709, -0.016656881, -0.0052546742, 0.014599082, -0.032204926, 0.09341175, -0.010483702, -0.04786155, 0.23358113, 0.13316281, 0.21748747, 0.04741849, -0.11040673, 0.06230487, 0.16795471, -0.104242735), conv2d_13_tf);
	conv2d_13_tf = MulAdd(na, MF4x4(-0.06844235, -0.01974277, 0.03758873, 0.0437811, -0.057502225, -0.076013766, 0.05226354, 0.16626364, -0.15094693, -0.06513261, -0.07178063, -0.25390542, -0.046331745, 0.048600584, -0.09498597, -0.029823082), conv2d_13_tf);
	conv2d_13_tf = MulAdd(nb, MF4x4(0.055906143, -0.09671702, -0.022703249, -0.074096285, -0.18490121, -0.14549334, 0.42093202, 0.087242134, -0.29526195, 0.31182536, 0.044069793, -0.17393354, -0.17096926, -0.15162584, 0.25237793, 0.047123164), conv2d_13_tf);
	conv2d_13_tf = MulAdd(nc, MF4x4(-0.0007076463, 0.0037513115, -0.044519257, 0.05986656, -0.12090617, 0.17659539, -0.07153321, 0.043799683, -0.050228495, -0.04225425, 0.24785443, 0.19911547, -0.05966556, -0.19790268, 0.20703633, 0.0048266468), conv2d_13_tf);
	conv2d_13_tf = MulAdd(nd, MF4x4(0.21739465, -0.046017647, -0.17681813, 0.21452186, 0.230653, -0.47062522, -0.23921433, 0.39329913, -0.036690675, 0.3303968, -0.47879925, -0.16289225, -0.1494594, 0.27207994, 0.1856394, -0.47609702), conv2d_13_tf);
	conv2d_13_tf = MulAdd(ne, MF4x4(0.3214577, -0.023753606, 0.21297608, -0.7130707, 0.050221473, 0.9629573, 0.5004743, 0.10413513, 0.10723351, -0.07022509, 0.23218232, -0.5185978, -0.6921137, 0.0619471, 0.16877905, -0.60311705), conv2d_13_tf);
	conv2d_13_tf = MulAdd(nf, MF4x4(0.0079998905, -0.066334635, 0.24110058, 0.06277327, -0.099571265, 0.28088686, 0.089555554, 0.049777288, -0.12143259, 0.19382764, 0.028673613, 0.14329565, -0.10053404, -0.07129261, -0.06196109, -0.54130787), conv2d_13_tf);
	conv2d_13_tf = MulAdd(ng, MF4x4(0.0602462, -0.21520244, -0.17295553, 0.01296868, 0.09711833, 0.051904213, -0.20535164, -0.17658444, 0.27075645, 0.0784139, 0.13146368, -1.7129825e-05, -0.06117924, 0.24631894, -0.01026257, 0.0030612787), conv2d_13_tf);
	conv2d_13_tf = MulAdd(nh, MF4x4(0.19062799, 0.122910775, 0.09640838, 0.06539721, 0.057701044, -0.20118104, -0.06261069, 0.107874714, 0.0973878, -0.20830666, -0.108459, -0.10059624, -0.08533175, -0.025608283, -0.07584223, -0.26741856), conv2d_13_tf);
	conv2d_13_tf = MulAdd(ni, MF4x4(-0.1459836, -0.092159286, 0.05037609, 0.07709965, -0.18563168, -0.017586546, -0.16244653, -0.017426869, -0.20880185, -0.26068223, 0.037480514, 0.056800563, 0.14884543, 0.13592677, -0.1492276, 0.023280073), conv2d_13_tf);

	MF4 conv2d_11_tf = tex1.SampleLevel(sam, pos, 0);
	MF4 conv2d_1_tf = tex3.SampleLevel(sam, pos, 0);
	MF4 conv2d_4_tf = tex2.SampleLevel(sam, pos, 0);
	MF4 conv2d_7_tf = tex4.SampleLevel(sam, pos, 0);
	MF4 conv2d_10_tf = tex5.SampleLevel(sam, pos, 0);

	MF4 n_conv2d_11_tf = max(-conv2d_11_tf, 0);
	MF4 n_conv2d_1_tf = max(-conv2d_1_tf, 0);
	MF4 n_conv2d_4_tf = max(-conv2d_4_tf, 0);
	MF4 n_conv2d_7_tf = max(-conv2d_7_tf, 0);
	MF4 n_conv2d_10_tf = max(-conv2d_10_tf, 0);
	MF4 n_conv2d_13_tf = max(-conv2d_13_tf, 0);

	conv2d_11_tf = max(conv2d_11_tf, 0);
	conv2d_1_tf = max(conv2d_1_tf, 0);
	conv2d_4_tf = max(conv2d_4_tf, 0);
	conv2d_7_tf = max(conv2d_7_tf, 0);
	conv2d_10_tf = max(conv2d_10_tf, 0);
	conv2d_13_tf = max(conv2d_13_tf, 0);

	MF4 target1 = { -0.022439916, 0.020257013, 0.041364692, 0.0141367195 };
	target1 = MulAdd(e, MF4x4(0.06499131, -0.18188648, -0.3129073, 0.46508536, 0.12730247, -0.0048228996, -0.29037076, -0.040671512, -0.37960687, -0.014975028, 0.051478356, -0.17510629, 0.24467152, -0.3726265, -0.05205153, 0.29063764), target1);
	target1 = MulAdd(ne, MF4x4(-0.036466975, -0.021365412, 0.19166216, 0.2391551, 0.38419026, 0.16602032, 0.06468244, 0.7733659, 0.004007756, 0.03079535, -0.0030497843, -0.2033753, -0.3095698, 0.40909737, 0.067926906, -0.16948561), target1);
	target1 = MulAdd(conv2d_11_tf, MF4x4(-0.07662823, 0.021806711, 0.05107831, 0.09089961, -0.051882017, -0.00308805, -0.08946813, -0.085923605, 0.13135786, -0.040860962, -0.12652986, -0.17011258, -0.23838595, 0.16027555, -0.27720237, 0.3512776), target1);
	target1 = MulAdd(n_conv2d_11_tf, MF4x4(0.054664467, -0.012412156, -0.11934643, -0.20614244, 0.005247195, -0.07548066, 0.1898925, -0.08086777, -0.27888495, 0.08055913, 0.2733805, 0.05444851, 0.22015096, -0.15712278, 0.070828624, -0.12955543), target1);
	target1 = MulAdd(conv2d_1_tf, MF4x4(-0.19064794, 0.10234088, -0.07635815, 0.15928909, 0.25309163, -0.0055202493, -0.04807871, 0.1251584, -0.19122045, 0.050241888, 0.020203145, 0.12914757, 0.20982412, -0.042472344, 0.12709813, -0.10014193), target1);
	target1 = MulAdd(n_conv2d_1_tf, MF4x4(-0.025030518, -0.077239156, 0.12003885, -0.07962912, -0.17808792, -0.027223784, 0.13286914, -0.026946044, 0.044607714, -0.045288526, 0.12821364, -0.19116278, 0.053770527, -0.05832497, -0.14832996, -0.08657012), target1);
	target1 = MulAdd(conv2d_4_tf, MF4x4(0.17286317, -0.029046731, -0.06853154, -0.080361344, -0.14082976, -0.076902896, 0.08296736, -0.17621617, 0.10048785, -0.01766402, -0.06414528, -0.012933831, 0.13066664, -0.05233094, 0.09176876, 0.0053013414), target1);
	target1 = MulAdd(n_conv2d_4_tf, MF4x4(0.09860572, 0.0578288, 0.05035504, 0.017596964, 0.055266783, -0.084020205, 0.1214565, -0.04180339, -0.16650584, 0.02645373, 0.08516016, 0.123672284, -0.11207144, 0.03805417, 0.017909998, 0.08631275), target1);
	target1 = MulAdd(conv2d_7_tf, MF4x4(0.08567236, 0.11860556, -0.2603184, 0.04399533, -0.13169551, -0.14144541, 0.11864987, -0.19813964, -0.14435594, 0.0943669, 0.318387, -0.039731313, -0.05394642, 0.018096905, 0.11445131, -0.07224858), target1);
	target1 = MulAdd(n_conv2d_7_tf, MF4x4(-0.066673055, -0.0079072425, 0.15320915, 0.1241549, -0.03786454, 0.02686796, 0.062339537, 0.0921351, 0.24909046, -0.13677734, -0.08606315, -0.1311618, -0.11268947, 0.017006561, -0.010060483, -0.016905207), target1);
	target1 = MulAdd(conv2d_10_tf, MF4x4(0.11682704, -0.06385352, 0.048959445, 0.2103904, -0.24271931, -0.114691064, 0.106675364, -0.16527846, 0.20034032, -0.19069487, 0.13964948, -0.2999216, -0.05324707, 0.03835898, 0.002079623, -0.042824514), target1);
	target1 = MulAdd(n_conv2d_10_tf, MF4x4(0.021089941, 0.058709584, -0.026687654, 0.061108842, 0.13278545, 0.0154480925, -0.1858288, 0.07775379, -0.013820952, 0.04138522, 0.040989578, 0.19044249, -0.05938495, 0.049729984, 0.022488212, 0.13883443), target1);
	target1 = MulAdd(conv2d_13_tf, MF4x4(-0.12241166, 0.24528268, -0.5302565, 0.045535725, -0.054705787, -0.038350295, -0.0833044, 0.18413262, -0.16520579, 0.087780885, -0.42400438, 0.30506396, -0.05254002, 0.0068022306, -0.6969388, 1.901328), target1);
	target1 = MulAdd(n_conv2d_13_tf, MF4x4(-0.12879479, -0.13513997, -0.068150125, 0.34132335, 0.08568371, 0.086309135, -0.10726202, 0.053040955, -0.007894386, 0.0694188, 0.13861355, -0.06504751, 0.1669743, -0.06529014, -0.048758753, -0.10337064), target1);

	MF4 target2 = { -0.032370187, 0.008661155, 0.020123083, 0.04574251 };
	target2 = MulAdd(e, MF4x4(0.10813235, 0.05466766, -0.20426773, 0.03014769, -0.23742639, -0.18808678, -0.08507936, 0.11070251, -0.24421449, -0.047370236, -0.034263644, -0.36471045, 0.022079159, -0.13425855, -0.43840396, 0.14318791), target2);
	target2 = MulAdd(ne, MF4x4(0.006743051, 0.07216438, 0.14125177, 0.06620228, 0.42031923, 0.2496421, -0.07731219, -0.013831615, 0.15525927, 0.090886295, 0.019504324, -0.048566148, -0.21346657, 0.022109412, 0.26717573, -0.11774596), target2);
	target2 = MulAdd(conv2d_11_tf, MF4x4(-0.28528357, -0.17186452, -0.20616518, 0.034786828, -0.10506841, -0.12335915, 0.07619831, -0.23998813, 0.19965814, 0.103892386, 0.04367025, -0.19183081, -0.16918147, -0.056264214, 0.20310691, 0.3341895), target2);
	target2 = MulAdd(n_conv2d_11_tf, MF4x4(0.20581162, 0.02040467, 0.35530564, -0.15494272, -0.010262163, 0.07301455, -0.074129246, 0.17339204, -0.00919498, -0.11473048, 0.042003002, -0.050515488, 0.24150477, 0.14734265, -0.102072336, -0.03404688), target2);
	target2 = MulAdd(conv2d_1_tf, MF4x4(-0.022791447, -0.005725081, 0.057149626, 0.013613261, 0.017012713, 0.0022030922, 0.06826359, -0.1473429, -0.055662345, 0.015804563, 0.07033723, 0.0380571, -0.030761583, -0.06867299, -0.0004780991, -0.10686876), target2);
	target2 = MulAdd(n_conv2d_1_tf, MF4x4(0.11448204, 0.08165584, 0.56496936, 0.2275344, 0.050801918, 0.115319155, 0.11518415, 0.05895198, 0.06831797, 0.08119943, 0.34825838, -0.048232127, 0.028284775, -0.03452888, 0.1979006, -0.041894354), target2);
	target2 = MulAdd(conv2d_4_tf, MF4x4(0.11946663, 0.03388757, -0.13882776, -0.14631757, -0.07182763, -0.08768853, 0.14146432, 0.10330784, -0.012143934, -0.022009725, -0.15579993, -0.050503176, -0.016312446, -0.054338187, -0.07755307, -0.07889432), target2);
	target2 = MulAdd(n_conv2d_4_tf, MF4x4(-0.02631465, 0.05617023, 0.13298586, 0.045326687, -0.11627329, -0.087329924, -0.05144727, -0.13488398, 0.06281482, 0.054220017, 0.25243595, 0.002556835, -0.03581036, 0.10341262, 0.10574532, 0.15461436), target2);
	target2 = MulAdd(conv2d_7_tf, MF4x4(0.07718563, 0.038919166, -0.06910819, -0.059710544, -0.09481636, -0.1109951, 0.5187051, 0.045543563, -0.048131686, 0.072409846, 0.4892963, -0.086976275, -0.07343929, -0.12501429, 0.26566443, 0.08579102), target2);
	target2 = MulAdd(n_conv2d_7_tf, MF4x4(0.005692247, 0.042074066, 0.13430944, 0.10093059, 0.023651319, 0.019474167, -0.13077211, -0.07782639, 0.072300054, 0.011820138, -0.1379879, -0.033925157, 0.012152839, 0.005247593, 0.15555158, -0.10433893), target2);
	target2 = MulAdd(conv2d_10_tf, MF4x4(-0.14903626, -0.0649052, 0.103872776, 0.18057188, 0.02891697, 0.13026263, 0.45847327, 0.09324349, -0.039312128, -0.05299939, 0.4332103, -0.25727344, 0.006733611, 0.05955007, 0.24531682, 0.053989712), target2);
	target2 = MulAdd(n_conv2d_10_tf, MF4x4(0.111072116, 0.11529407, -0.26600304, -0.032266896, 0.09633932, 0.0094333775, 0.060893714, -0.08118885, -0.03830528, 0.0037902966, -0.11128639, 0.13511918, 0.06553124, 0.054722965, 0.08178846, 0.06025588), target2);
	target2 = MulAdd(conv2d_13_tf, MF4x4(0.095904954, 0.0008960944, 0.35145932, 0.28108585, -0.011538731, -0.09239871, -0.21972048, -0.0820484, 0.112448506, -0.10381135, 0.09701949, 0.023723679, 0.04458077, 0.04700858, -0.056815177, 0.33785793), target2);
	target2 = MulAdd(n_conv2d_13_tf, MF4x4(0.08533725, 0.05978557, -0.40020186, -0.13684823, -0.0074113654, 0.1310689, 0.12906975, 0.11596462, 0.007170312, 0.13460107, 0.08450185, -0.019635776, 0.0966497, 0.021586724, -0.06784809, 0.12102399), target2);

	tex6[gxy] = target1;
	tex8[gxy] = target2;
}


//!PASS 7
//!DESC Conv-3x3x3x16
//!IN tex6, tex8, INPUT
//!OUT OUTPUT
//!BLOCK_SIZE 8
//!NUM_THREADS 64

// Final pass: each thread writes a single OUTPUT pixel.
//
// tex6 and tex8 are each sampled with the linear sampler `sam1` on a 3x3 grid
// of positions spaced one output texel apart and centred on the pixel being
// written. Every sample is split into max(x, 0) and max(-x, 0); the resulting
// 36 four-component vectors are folded, one MF4x3 weight matrix each, into a
// three-component accumulator that starts from a constant bias. The
// accumulator is then added to the INPUT colour sampled at the same position
// and stored with alpha 1.
//
// NOTE(review): MulAdd is pulled in by the `USE MulAdd` directive; presumably
// MulAdd(v, m, acc) evaluates mul(v, m) + acc - confirm against its definition.
//
// blockStart: offset added to the swizzled thread coordinate to obtain the
//             output pixel coordinate.
// threadId:   only .x is read; TileSwizzle8x8 turns it into a 2D coordinate.
void Pass7(uint2 blockStart, uint3 threadId) {
	const uint2 gxy = TileSwizzle8x8(threadId.x) + blockStart;

	// Threads that land outside the output image have nothing to write.
	const uint2 outputSize = GetOutputSize();
	if (any(gxy >= outputSize)) {
		return;
	}

	const float2 outputPt = GetOutputPt();
	const float2 pos = (gxy + 0.5f) * outputPt;

	// Coordinates one output texel before / after the centre on each axis.
	const float xPrev = pos.x - outputPt.x;
	const float xNext = pos.x + outputPt.x;
	const float yPrev = pos.y - outputPt.y;
	const float yNext = pos.y + outputPt.y;

	// Tap layout (columns advance in x, rows advance in y):
	// [ a, d, g ]
	// [ b, e, h ]
	// [ c, f, i ]
	const float2 uvA = float2(xPrev, yPrev);
	const float2 uvB = float2(xPrev, pos.y);
	const float2 uvC = float2(xPrev, yNext);
	const float2 uvD = float2(pos.x, yPrev);
	const float2 uvE = pos;
	const float2 uvF = float2(pos.x, yNext);
	const float2 uvG = float2(xNext, yPrev);
	const float2 uvH = float2(xNext, pos.y);
	const float2 uvI = float2(xNext, yNext);

	// tex6 taps: x1 holds max(sample, 0), nx1 holds max(-sample, 0).
	const MF4 rawA1 = tex6.SampleLevel(sam1, uvA, 0);
	const MF4 a1 = max(rawA1, 0);
	const MF4 na1 = max(-rawA1, 0);

	const MF4 rawB1 = tex6.SampleLevel(sam1, uvB, 0);
	const MF4 b1 = max(rawB1, 0);
	const MF4 nb1 = max(-rawB1, 0);

	const MF4 rawC1 = tex6.SampleLevel(sam1, uvC, 0);
	const MF4 c1 = max(rawC1, 0);
	const MF4 nc1 = max(-rawC1, 0);

	const MF4 rawD1 = tex6.SampleLevel(sam1, uvD, 0);
	const MF4 d1 = max(rawD1, 0);
	const MF4 nd1 = max(-rawD1, 0);

	const MF4 rawE1 = tex6.SampleLevel(sam1, uvE, 0);
	const MF4 e1 = max(rawE1, 0);
	const MF4 ne1 = max(-rawE1, 0);

	const MF4 rawF1 = tex6.SampleLevel(sam1, uvF, 0);
	const MF4 f1 = max(rawF1, 0);
	const MF4 nf1 = max(-rawF1, 0);

	const MF4 rawG1 = tex6.SampleLevel(sam1, uvG, 0);
	const MF4 g1 = max(rawG1, 0);
	const MF4 ng1 = max(-rawG1, 0);

	const MF4 rawH1 = tex6.SampleLevel(sam1, uvH, 0);
	const MF4 h1 = max(rawH1, 0);
	const MF4 nh1 = max(-rawH1, 0);

	const MF4 rawI1 = tex6.SampleLevel(sam1, uvI, 0);
	const MF4 i1 = max(rawI1, 0);
	const MF4 ni1 = max(-rawI1, 0);

	// tex8 taps, same coordinates and same split.
	const MF4 rawA2 = tex8.SampleLevel(sam1, uvA, 0);
	const MF4 a2 = max(rawA2, 0);
	const MF4 na2 = max(-rawA2, 0);

	const MF4 rawB2 = tex8.SampleLevel(sam1, uvB, 0);
	const MF4 b2 = max(rawB2, 0);
	const MF4 nb2 = max(-rawB2, 0);

	const MF4 rawC2 = tex8.SampleLevel(sam1, uvC, 0);
	const MF4 c2 = max(rawC2, 0);
	const MF4 nc2 = max(-rawC2, 0);

	const MF4 rawD2 = tex8.SampleLevel(sam1, uvD, 0);
	const MF4 d2 = max(rawD2, 0);
	const MF4 nd2 = max(-rawD2, 0);

	const MF4 rawE2 = tex8.SampleLevel(sam1, uvE, 0);
	const MF4 e2 = max(rawE2, 0);
	const MF4 ne2 = max(-rawE2, 0);

	const MF4 rawF2 = tex8.SampleLevel(sam1, uvF, 0);
	const MF4 f2 = max(rawF2, 0);
	const MF4 nf2 = max(-rawF2, 0);

	const MF4 rawG2 = tex8.SampleLevel(sam1, uvG, 0);
	const MF4 g2 = max(rawG2, 0);
	const MF4 ng2 = max(-rawG2, 0);

	const MF4 rawH2 = tex8.SampleLevel(sam1, uvH, 0);
	const MF4 h2 = max(rawH2, 0);
	const MF4 nh2 = max(-rawH2, 0);

	const MF4 rawI2 = tex8.SampleLevel(sam1, uvI, 0);
	const MF4 i2 = max(rawI2, 0);
	const MF4 ni2 = max(-rawI2, 0);

	// Accumulator starts at the layer bias. The order of the MulAdd calls is
	// kept fixed so the floating-point summation order does not change.
	MF3 residual = { 0.00016753975, -0.00019302216, -0.0001663917 };

	// Positive parts of the tex6 taps.
	residual = MulAdd(a1, MF4x3(0.03277269, -0.005261106, 0.017171703, 0.07399743, 0.06816794, 0.09821277, -0.013628815, -0.09454006, -0.2801339, -0.020518344, -0.008617738, -0.010507532), residual);
	residual = MulAdd(b1, MF4x3(-0.0728787, -0.05837346, -0.06754399, -0.14260155, -0.11570593, -0.156841, -0.0050546993, 0.22888114, 0.21504444, 9.040898e-05, -0.023274591, -0.013553191), residual);
	residual = MulAdd(c1, MF4x3(0.051917054, 0.05906303, 0.06952429, 0.0525386, 0.088182524, 0.058972485, -0.089566976, -0.11995993, -0.060805317, -0.0016516607, 0.014582383, 0.0018667864), residual);
	residual = MulAdd(d1, MF4x3(-0.010195044, -0.0016970673, -0.007473783, 0.0048292056, 0.00090277405, -0.018349117, 0.33494812, 0.21826924, 0.07975424, 0.0313906, 0.023605483, 0.019729096), residual);
	residual = MulAdd(e1, MF4x3(-0.04102709, -0.057343815, -0.053828835, -0.20089269, -0.14614193, -0.16869506, -0.48148197, -0.112935685, 0.15368614, 0.013808743, 0.019406663, 0.016180169), residual);
	residual = MulAdd(f1, MF4x3(-0.06643282, -0.06502517, -0.07856252, 0.018638078, -0.022319186, -0.0067943106, 0.036783714, -0.05270904, -0.0070206574, 0.016395729, -0.004902533, 0.008296518), residual);
	residual = MulAdd(g1, MF4x3(0.12068605, 0.15490896, 0.16064006, 0.065336704, 0.053270113, 0.041463483, -0.070910245, -0.16710983, -0.100286275, -0.00765049, 0.002855491, 0.005510328), residual);
	residual = MulAdd(h1, MF4x3(0.020581165, 0.009990014, 0.020439452, 0.053358912, 0.019578407, 0.07360501, -0.01833616, 0.024298528, 0.09730532, 0.007911377, -0.0008312725, 0.0012658008), residual);
	residual = MulAdd(i1, MF4x3(-0.029880082, -0.048992317, -0.06480165, 0.057186406, 0.02767247, 0.02632289, 0.3017522, 0.107764654, -0.082682736, -0.018429814, -0.0037222179, -0.008986925), residual);

	// Positive parts of the tex8 taps.
	residual = MulAdd(a2, MF4x3(0.05568391, 0.07143306, 0.11425685, 0.09023551, 0.07943949, 0.09341015, -0.006964977, 0.005051686, -0.0066025057, 0.016425984, 0.016140617, 0.017426701), residual);
	residual = MulAdd(b2, MF4x3(-0.05428328, -0.05257552, -0.055414293, 0.07355084, 0.02099847, 0.02532324, -0.0059588687, 0.0026828237, 0.012020099, -0.02094392, -0.008076426, -0.004007557), residual);
	residual = MulAdd(c2, MF4x3(0.078949526, 0.060797416, 0.06175456, 0.038563624, 0.1133258, 0.097543724, 0.009481104, 0.010644464, 0.017376821, -0.025299812, -0.034176692, -0.024242869), residual);
	residual = MulAdd(d2, MF4x3(0.097633384, 0.08206449, 0.07688493, -0.13658656, -0.07185774, -0.046447344, 0.023979248, 0.007561647, 0.013846933, -0.05918984, -0.061709706, -0.05624362), residual);
	residual = MulAdd(e2, MF4x3(-0.06739334, -0.08787811, -0.11320143, -0.21294294, -0.20553987, -0.212303, 0.03569362, 0.005086715, -0.008558981, -0.029743299, -0.01592082, -0.023579126), residual);
	residual = MulAdd(f2, MF4x3(-0.06479095, -0.07233743, -0.0707415, 0.042067222, 0.020530105, -0.013605897, -0.024686582, -0.019044759, -0.028663088, -0.02459999, -0.022106387, -0.037910707), residual);
	residual = MulAdd(g2, MF4x3(0.00047730867, 0.0074251383, -0.019326044, -0.0079797115, -0.028213829, -0.04960014, -0.007960453, 0.006997611, 0.008396939, 0.06343004, 0.049828995, 0.03993323), residual);
	residual = MulAdd(h2, MF4x3(0.041342042, 0.04802731, 0.05910926, -0.06663181, -0.017722478, -0.063366435, -0.0066454113, -0.007623568, -0.0052808253, 0.019400312, 0.023122162, 0.014149712), residual);
	residual = MulAdd(i2, MF4x3(-0.02667231, 0.00326689, 0.028842116, 0.1206443, 0.059932612, 0.11402581, -0.019962605, -0.012744165, -0.0043374747, 0.0076787886, -0.0029834688, 0.016930124), residual);

	// Negated-negative parts of the tex6 taps.
	residual = MulAdd(na1, MF4x3(-0.048204165, -0.040773313, -0.048701975, -0.10603768, -0.0444273, -0.05195404, 0.0075067757, -0.018593295, -0.021308444, -0.03957737, -0.009982081, 0.010517069), residual);
	residual = MulAdd(nb1, MF4x3(0.04416329, 0.0061665634, 0.006213014, 0.08318984, 0.10827006, 0.066440694, 0.020778455, 0.039835304, 0.043959253, 0.21019539, 0.20858723, 0.17247656), residual);
	residual = MulAdd(nc1, MF4x3(-0.023037061, -0.040597446, -0.03936031, 0.038322993, -0.006460271, 0.008364464, 0.0013878595, -0.017040763, -0.008046535, 0.04411088, 0.0034189504, -0.00865711), residual);
	residual = MulAdd(nd1, MF4x3(-0.04620107, -0.010026264, -0.018166702, -0.13721117, -0.13748127, -0.15809298, -0.015785996, -0.005124028, -0.02296112, 0.14735141, 0.17641969, 0.18629177), residual);
	residual = MulAdd(ne1, MF4x3(0.06815282, 0.12910986, 0.1348522, 0.3159465, 0.39939725, 0.35339746, -0.003487101, 0.01400649, 0.03802699, -0.61086726, -0.60257083, -0.57637924), residual);
	residual = MulAdd(nf1, MF4x3(0.051779903, 0.040781803, 0.057703253, 0.08762279, 0.058650948, 0.14592434, -0.0027639035, 0.019435523, 0.007374421, 0.14841707, 0.15387256, 0.18617661), residual);
	residual = MulAdd(ng1, MF4x3(0.0061518056, -0.036338966, -0.01811052, -0.0409911, -0.10952732, -0.06394289, -0.03781909, -0.036061246, -0.017401218, 0.036531474, -0.009453272, -0.0205337), residual);
	residual = MulAdd(nh1, MF4x3(0.011860616, -0.01409049, -0.0038651319, -0.026641136, 0.052935697, 0.024065036, -0.00801134, -0.021182325, -0.03668359, 0.17521855, 0.1884243, 0.21842308), residual);
	residual = MulAdd(ni1, MF4x3(-0.04098353, -0.010698699, -0.042900108, -0.3209868, -0.37843677, -0.40212557, 0.016307857, 0.010040624, 0.0025999267, -0.008670373, 0.0011820213, -0.021262378), residual);

	// Negated-negative parts of the tex8 taps.
	residual = MulAdd(na2, MF4x3(0.109322615, 0.072824165, 0.111781776, 0.056546386, -0.00393398, 0.004904314, 0.18162459, 0.1963156, 0.18083604, -0.11325025, 0.03739349, -0.034167226), residual);
	residual = MulAdd(nb2, MF4x3(-0.16535625, -0.19053574, -0.19740228, -0.09285224, -0.18288574, -0.16264571, -0.15362014, -0.11303279, 0.023057505, -0.019013347, 0.025035419, 0.046823245), residual);
	residual = MulAdd(nc2, MF4x3(0.051271398, 0.06677435, 0.071102865, -0.24909541, -0.24379867, -0.26372898, -0.051355038, 0.16958164, 0.12556365, -0.078110464, -0.09428601, -0.12403035), residual);
	residual = MulAdd(nd2, MF4x3(-0.20382409, -0.21728146, -0.25310788, 0.0863418, 0.16670556, 0.13722113, 0.09728048, -0.05204764, -0.13571848, 0.011384012, -0.12616627, -0.121069506), residual);
	residual = MulAdd(ne2, MF4x3(0.048272748, 0.056282464, 0.053991128, 0.24383838, 0.30037045, 0.2993122, -0.10345337, -0.28334868, -0.36417452, 0.289455, 0.26967737, 0.30849114), residual);
	residual = MulAdd(nf2, MF4x3(0.08048932, 0.10012804, 0.13864101, 0.028471693, -0.10722793, -0.110060275, -0.09971538, -0.011243501, 0.17263469, 0.0536668, 0.08396721, 0.058851402), residual);
	residual = MulAdd(ng2, MF4x3(-0.02470257, -0.0099621, 0.0018576515, -0.07751234, -0.0431258, -0.03958112, 0.07120911, 0.05517916, 0.18740316, -0.043790314, -0.0959628, -0.070550814), residual);
	residual = MulAdd(nh2, MF4x3(0.10409344, 0.08135716, 0.04320299, 0.09303134, 0.073921256, 0.07716563, 0.09312593, 0.03623192, 0.06660019, -0.12193945, -0.16342056, -0.15565647), residual);
	residual = MulAdd(ni2, MF4x3(0.068098865, 0.07742245, 0.04117883, -0.07239023, -0.0048315763, -0.0029638975, -0.053049978, 0.121163346, 0.048760712, -0.033619802, -0.010043663, -0.012648383), residual);

	// Add the accumulated value to the INPUT colour at this position. The
	// sample is held as float3, matching the original expression's operand types.
	const float3 inputRgb = INPUT.SampleLevel(sam1, pos, 0).rgb;
	OUTPUT[gxy] = MF4(residual + inputRgb, 1);
}
