// CuNNy 6x8C NVL DN - Convolutional Neural Network Upscaler
// Based on CuNNy by funnyplanter - https://github.com/funnyplanter/CuNNy
// Ported to BGFX format for BorderlessGaming

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// 
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
// 
// You should have received a copy of the GNU General Public License
// along with this program.  If not, see <http://www.gnu.org/licenses/>.

//!BGFX EFFECT
//!VERSION 1
//!NAME CuNNy 6x8C NVL DN
//!CATEGORY Neural Network
//!DESCRIPTION A large CNN-based 2x upscaler with integrated denoising. Best for noisy or compressed sources. Performance impact: moderate to high.
//!USE MulAdd
//!CAPABILITY BG_FP16


//!TEXTURE
Texture2D INPUT;

//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;

//!SAMPLER
//!FILTER POINT
SamplerState SP;

//!SAMPLER
//!FILTER LINEAR
SamplerState SL;

//!COMMON
// Short aliases used by every pass. MF4 / MF4x4 are not defined in this file;
// presumably they are supplied by the effect framework — TODO confirm.
#define V4 MF4
#define M4 MF4x4
// O(t, p): fetch texture `t` with the point sampler SP at mip level 0, at
// `pos + p * pt`. `pos` and `pt` are NOT macro arguments: they must be local
// variables in scope wherever the macro is expanded.
#define O(t, p) t.SampleLevel(SP, pos + p * pt, 0)

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t0;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t1;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t2;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t3;

//!PASS 1
//!DESC Feature extraction - Input layer
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN INPUT
//!OUT t0, t1

// l0(x, y): sample INPUT at offset (x, y) through O(), then collapse its RGB
// to one scalar: a dot product with three constant weights plus a constant
// offset. Inherits O()'s dependency on `pos` and `pt`.
#define l0(x, y) (dot(MF3(-2.035e-01, -4.051e-01, -9.041e-02), O(INPUT, float2(x, y)).rgb) + MF(4.315e-01))

// Pass 1, first output group (Pass1 stores the result in t0).
// Takes the nine scalar taps produced by l0() and returns a 4-component
// vector: a constant starting vector plus each tap multiplied by its own
// constant 4-component weight vector, accumulated in tap order 0..8.
V4 f0(MF s0_0, MF s0_1, MF s0_2, MF s0_3, MF s0_4, MF s0_5, MF s0_6, MF s0_7, MF s0_8) {
	V4 acc = V4(-8.698e-03, -1.051e-02, -2.456e-02, 8.033e-03);

	// The accumulation order is kept fixed so rounding matches tap for tap.
	acc = mad(s0_0, V4(-5.333e-02, 1.506e-02, -4.863e-02, 5.352e-03), acc);
	acc = mad(s0_1, V4(3.064e-02, -1.241e-03, 3.831e-02, 1.406e-01), acc);
	acc = mad(s0_2, V4(-3.234e-03, 3.668e-03, -1.982e-02, -1.101e-01), acc);
	acc = mad(s0_3, V4(-4.854e-01, 5.950e-01, -9.253e-02, -3.601e-01), acc);
	acc = mad(s0_4, V4(-1.239e-01, 1.942e-01, 4.939e-01, 5.723e-01), acc);
	acc = mad(s0_5, V4(-1.746e-02, 8.397e-04, 4.267e-02, -7.863e-02), acc);
	acc = mad(s0_6, V4(5.387e-01, -5.918e-01, -5.641e-02, 2.018e-02), acc);
	acc = mad(s0_7, V4(9.656e-02, -2.095e-01, -1.788e-01, -1.900e-01), acc);
	acc = mad(s0_8, V4(1.780e-02, -3.460e-03, -1.204e-01, -1.629e-02), acc);

	return acc;
}

// Pass 1, second output group (Pass1 stores the result in t1).
// Same structure as the first group, with its own constants: a constant
// starting vector plus each of the nine scalar taps multiplied by a constant
// 4-component weight vector, accumulated in tap order 0..8.
V4 f1(MF s0_0, MF s0_1, MF s0_2, MF s0_3, MF s0_4, MF s0_5, MF s0_6, MF s0_7, MF s0_8) {
	V4 acc = V4(1.285e-01, 1.293e-01, -1.923e-02, 1.313e-02);

	// The accumulation order is kept fixed so rounding matches tap for tap.
	acc = mad(s0_0, V4(1.238e-02, 8.036e-03, 3.125e-01, 1.440e-01), acc);
	acc = mad(s0_1, V4(-5.165e-02, -8.626e-03, -3.096e-01, 4.477e-02), acc);
	acc = mad(s0_2, V4(3.169e-04, 1.974e-02, -2.705e-01, -5.500e-02), acc);
	acc = mad(s0_3, V4(-2.838e-02, 3.386e-03, 1.727e-01, 4.413e-01), acc);
	acc = mad(s0_4, V4(1.497e-01, 1.326e-02, 3.072e-01, -5.548e-01), acc);
	acc = mad(s0_5, V4(1.428e-04, -3.819e-01, -2.438e-01, 5.685e-03), acc);
	acc = mad(s0_6, V4(-3.416e-02, 2.989e-03, 2.969e-02, 2.118e-02), acc);
	acc = mad(s0_7, V4(5.299e-01, 4.041e-03, -9.012e-02, -7.878e-02), acc);
	acc = mad(s0_8, V4(-5.243e-02, 9.452e-03, 7.145e-02, 3.160e-02), acc);

	return acc;
}

// Pass 1 entry point. Each invocation handles one input texel: it gathers the
// nine l0() taps around that texel from INPUT and writes the two resulting
// 4-component vectors to t0 and t1.
//   blockStart - offset added to the swizzled thread position
//   tid        - thread id; only tid.x is used, remapped by TileSwizzle8x8
void Pass1(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` are read implicitly by the O()/l0() macros, so these two
	// locals must keep exactly these names.
	float2 pt = float2(GetInputPt());
	uint2 texel = TileSwizzle8x8(tid.x) + blockStart;
	uint2 extent = GetInputSize();

	// Invocations that land outside the input write nothing.
	if (any(texel >= extent)) {
		return;
	}

	// Texel centre scaled by pt (presumably one input texel in UV units — confirm).
	float2 pos = (texel + 0.5) * pt;

	// Taps listed row by row: y = -1, then y = 0, then y = +1.
	MF tap0 = l0(-1.0, -1.0);
	MF tap1 = l0(0.0, -1.0);
	MF tap2 = l0(1.0, -1.0);

	MF tap3 = l0(-1.0, 0.0);
	MF tap4 = l0(0.0, 0.0);
	MF tap5 = l0(1.0, 0.0);

	MF tap6 = l0(-1.0, 1.0);
	MF tap7 = l0(0.0, 1.0);
	MF tap8 = l0(1.0, 1.0);

	t0[texel] = f0(tap0, tap1, tap2, tap3, tap4, tap5, tap6, tap7, tap8);
	t1[texel] = f1(tap0, tap1, tap2, tap3, tap4, tap5, tap6, tap7, tap8);
}

//!PASS 2
//!DESC Convolution layer 1 - Feature processing
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0, t1
//!OUT t2, t3

// Pass 2 tap fetchers: l0 reads t0 and l1 reads t1 at offset (x, y) through
// O(), so both rely on `pos` and `pt` being in scope at the call site.
// NOTE(review): l0 is also defined by other passes with no #undef; presumably
// each pass is compiled as its own unit — confirm.
#define l0(x, y) V4(O(t0, float2(x, y)))
#define l1(x, y) V4(O(t1, float2(x, y)))

// Pass 2, first output group (Pass2 stores the result in t2).
// Inputs are four sets of nine V4 taps. Pass2 supplies them in tap order
// (-1,-1) (0,-1) (1,-1) (-1,0) (0,0) (1,0) (-1,1) (0,1) (1,1) as:
//   s0_* = max(t0 tap, 0)      s1_* = -max(-t0 tap, 0)
//   s2_* = max(t1 tap, 0)      s3_* = -max(-t1 tap, 0)
// Starting from a constant vector, every tap is folded into the accumulator
// with its own constant 4x4 matrix via MulAdd, in the fixed order below.
// NOTE(review): MulAdd is pulled in by the `//!USE MulAdd` directive and is not
// defined in this file; presumably it returns the tap transformed by the
// matrix plus `r` — confirm. The literals are presumably trained network
// weights; do not reorder or hand-edit them.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
	// Constant starting value of the accumulator.
	V4 r = { 2.009e-02, -6.020e-02, -2.095e-01, -2.428e-04 };
	r = MulAdd(s0_0, M4(1.415e-01, -1.943e-02, -8.607e-02, 1.720e-02, -5.503e-02, -7.077e-02, 6.357e-02, 1.637e-01, -3.997e-04, 3.993e-01, 1.124e-02, -7.681e-02, 2.275e-01, 2.744e-01, 3.333e-02, -7.161e-02), r);
	r = MulAdd(s0_1, M4(9.960e-02, 3.193e-01, 1.675e-01, -3.303e-01, 2.128e-01, -4.128e-01, -2.347e-01, 7.342e-01, 1.727e-01, 3.478e-02, -1.685e-01, 2.954e-01, 3.302e-01, -4.103e-01, -1.933e-01, 6.543e-01), r);
	r = MulAdd(s0_2, M4(3.761e-01, -1.545e-01, -1.035e-01, 4.556e-01, 2.239e-01, 2.557e-01, -2.634e-01, -3.533e-01, 1.636e-01, -6.189e-02, 7.688e-02, 4.012e-01, 2.164e-01, -1.716e-01, -1.205e-01, 8.663e-02), r);
	r = MulAdd(s0_3, M4(-1.181e-02, 1.876e-01, -6.568e-02, -5.398e-02, 1.158e-01, -1.222e-01, 5.085e-02, 2.418e-01, 1.176e-01, -3.773e-01, -2.300e-02, 1.588e-02, 3.060e-02, 6.070e-02, -7.153e-02, 1.807e-04), r);
	r = MulAdd(s0_4, M4(2.148e-01, -3.282e-01, -1.202e-01, -5.740e-02, -8.745e-02, 4.563e-01, 1.371e-01, -2.197e-02, 1.796e-01, -8.210e-02, 4.639e-01, -2.013e-02, -1.657e-01, 1.274e-01, 1.544e-01, -3.108e-01), r);
	r = MulAdd(s0_5, M4(5.544e-01, 1.682e-01, -5.642e-01, 2.636e-01, -5.510e-01, -1.371e-01, 1.450e-01, -4.443e-01, 1.705e-01, -1.304e-01, -2.385e-02, -7.088e-02, -3.489e-01, 6.447e-02, 6.309e-02, -1.137e-01), r);
	r = MulAdd(s0_6, M4(-1.218e-01, -2.795e-02, -1.160e-01, 8.723e-02, -2.646e-02, -2.134e-01, 1.170e-02, 1.284e-01, -1.001e-02, 1.919e-01, -9.593e-02, -2.529e-01, -8.665e-02, 1.473e-01, 3.237e-01, -9.264e-02), r);
	r = MulAdd(s0_7, M4(-2.280e-01, -1.011e-01, -1.681e-01, -1.644e-01, 7.984e-02, 4.135e-02, 9.764e-02, 1.405e-01, -3.485e-01, 2.078e-01, 8.260e-02, -6.538e-02, -1.472e-01, -2.301e-02, -1.911e-02, -6.720e-04), r);
	r = MulAdd(s0_8, M4(-2.326e-01, -1.645e-01, -1.724e-01, -1.018e-01, 1.470e-01, 2.117e-01, 3.018e-01, -7.956e-02, -1.260e-01, -2.090e-01, -7.409e-02, 8.810e-03, 7.153e-02, 1.818e-02, 8.868e-03, -1.993e-03), r);
	r = MulAdd(s1_0, M4(-4.601e-02, -1.224e-01, 1.492e-01, 1.063e-01, 1.004e-01, 3.807e-02, -7.326e-02, 3.358e-02, 2.690e-02, 3.908e-02, 7.352e-02, -1.512e-01, -1.342e-02, 3.690e-01, -2.220e-01, -9.966e-02), r);
	r = MulAdd(s1_1, M4(-4.769e-01, 1.653e-01, -2.428e-01, 3.893e-02, 8.017e-01, -5.618e-01, 7.747e-02, 2.968e-01, 2.540e-01, -3.845e-01, -2.625e-01, 4.699e-01, 4.378e-01, -9.630e-01, -5.949e-02, 8.363e-01), r);
	r = MulAdd(s1_2, M4(-4.463e-01, -1.382e-01, -3.918e-01, -4.812e-01, 3.207e-01, -2.003e-01, 1.050e-01, 1.800e-01, 2.242e-01, -1.199e-01, -4.862e-02, 1.616e-01, 1.341e-01, -4.310e-01, -1.937e-01, 2.326e-01), r);
	r = MulAdd(s1_3, M4(2.147e-02, -2.783e-01, 8.845e-02, 2.262e-01, 1.024e-01, 2.714e-01, -4.063e-02, -7.636e-03, 1.980e-02, -1.420e-01, -2.109e-02, 3.946e-02, 8.302e-02, 5.234e-01, -2.883e-01, -1.296e-01), r);
	r = MulAdd(s1_4, M4(-5.593e-02, 7.059e-02, -1.420e-01, 8.657e-02, 2.728e-01, -8.770e-02, 1.477e-01, -1.427e-01, -1.250e-01, 6.256e-02, -3.762e-01, -1.046e-01, -3.094e-01, 5.179e-01, -3.617e-02, -2.885e-01), r);
	r = MulAdd(s1_5, M4(-1.382e-01, 2.495e-02, 3.461e-01, -5.939e-01, 3.019e-01, -8.266e-02, -3.544e-01, 2.824e-01, -1.029e-01, -3.089e-04, 1.307e-01, -6.184e-02, -1.293e-01, -4.540e-02, -3.406e-01, -1.988e-01), r);
	r = MulAdd(s1_6, M4(-6.472e-02, -2.037e-01, -1.320e-02, 1.285e-01, -1.378e-01, -3.275e-02, -8.836e-02, 9.191e-02, 1.123e-01, 1.736e-01, -2.116e-01, 2.340e-02, -1.655e-01, 1.227e-01, 2.700e-01, -1.485e-01), r);
	r = MulAdd(s1_7, M4(1.229e-01, 2.141e-02, 1.523e-01, 7.696e-02, -1.830e-01, -1.036e-01, -9.236e-02, -1.558e-01, -2.906e-01, -3.243e-02, -2.723e-02, -8.754e-02, -2.951e-01, 2.923e-02, -9.960e-02, -9.770e-03), r);
	r = MulAdd(s1_8, M4(9.990e-02, 3.377e-01, 1.933e-01, -1.242e-01, -1.515e-01, -6.217e-02, 4.245e-02, -6.189e-02, -9.236e-02, -2.214e-01, -6.575e-02, 6.456e-03, 1.795e-02, 4.278e-02, -6.188e-02, 1.573e-02), r);
	r = MulAdd(s2_0, M4(1.491e-01, -9.449e-02, -1.071e-01, 3.575e-02, -6.811e-04, -4.960e-02, -5.181e-04, 7.298e-02, 3.839e-02, -2.178e-02, -1.703e-02, 1.046e-01, -4.208e-02, 9.251e-03, 3.523e-02, -2.881e-01), r);
	r = MulAdd(s2_1, M4(3.309e-02, 6.440e-02, 2.337e-01, 1.524e-01, -1.725e-02, 3.153e-02, 9.155e-03, -5.480e-02, 4.245e-02, -1.042e-01, -1.053e-01, 1.759e-01, -1.713e-01, -1.269e-01, -1.378e-02, -6.438e-02), r);
	r = MulAdd(s2_2, M4(-1.648e-01, -6.306e-03, 8.299e-02, -1.812e-01, 6.679e-02, -6.670e-02, -8.122e-02, 1.284e-01, -7.879e-02, 9.292e-02, 7.607e-02, -2.007e-01, 2.088e-02, 5.058e-02, -1.943e-01, 1.150e-01), r);
	r = MulAdd(s2_3, M4(9.643e-02, 2.546e-01, -5.823e-02, 6.665e-02, -6.931e-02, 1.891e-02, 1.862e-01, 1.711e-02, 1.426e-01, 2.191e-01, -2.335e-01, 2.214e-01, -5.269e-02, 2.464e-02, -9.016e-02, -7.974e-02), r);
	r = MulAdd(s2_4, M4(3.501e-02, -1.022e-02, 1.990e-01, -7.641e-02, -1.492e-01, 1.836e-01, 8.877e-02, -2.226e-01, -2.686e-01, 1.809e-01, 2.979e-01, -2.644e-01, 1.681e-01, -3.821e-01, -1.366e-01, 7.164e-01), r);
	r = MulAdd(s2_5, M4(2.277e-01, -1.757e-01, -1.340e-02, 1.040e-01, -7.691e-02, 1.556e-01, -1.029e-02, -9.558e-02, 2.705e-01, 4.410e-02, 1.649e-01, 6.361e-02, 1.790e-01, 2.952e-01, -5.692e-02, -3.225e-02), r);
	r = MulAdd(s2_6, M4(-5.395e-02, 8.925e-02, 6.361e-02, -4.226e-02, -3.947e-02, -1.783e-01, -3.393e-02, 5.163e-02, 4.873e-02, 2.768e-01, -1.197e-01, -1.120e-01, -1.770e-02, 9.506e-02, 1.098e-01, -1.090e-01), r);
	r = MulAdd(s2_7, M4(-2.177e-01, 7.495e-02, 8.441e-02, -1.043e-01, 2.036e-01, -1.763e-01, 1.748e-01, 1.660e-01, 1.121e-01, -2.253e-01, -1.790e-01, -1.439e-01, -1.083e-01, 5.504e-02, 2.336e-01, -9.936e-02), r);
	r = MulAdd(s2_8, M4(-1.184e-01, -8.465e-02, 5.144e-02, 2.009e-02, 7.120e-02, 9.413e-02, 4.370e-02, -5.484e-02, -1.443e-01, -8.664e-02, -4.848e-02, 1.242e-01, 5.112e-02, 2.200e-01, -5.659e-03, 5.474e-02), r);
	r = MulAdd(s3_0, M4(6.419e-01, -3.023e-01, 2.127e-01, 3.398e-01, -1.325e+00, -2.531e-01, -9.418e-01, -1.673e+00, -2.186e-02, -1.045e-01, -4.623e-02, 9.273e-02, 5.411e-02, 2.154e-01, 1.728e-01, -1.901e-01), r);
	r = MulAdd(s3_1, M4(-1.247e-01, 2.308e-01, -5.479e-01, -4.446e-02, -2.334e-01, -7.716e-02, -1.220e-01, -3.670e-01, 2.478e-02, -4.998e-02, -7.881e-02, 1.175e-01, -6.152e-01, 8.178e-02, -8.520e-02, -2.934e-01), r);
	r = MulAdd(s3_2, M4(-8.153e-01, 1.575e-01, 7.752e-01, 9.207e-01, 3.695e-02, -2.903e-01, -1.513e-01, 6.705e-03, -5.714e-02, 1.421e-01, 4.701e-02, -8.445e-02, 1.149e-01, -1.975e-01, -2.647e-01, 3.480e-01), r);
	r = MulAdd(s3_3, M4(3.278e-02, 4.846e-01, -6.441e-01, -5.745e-02, 1.015e+00, -5.113e-01, -2.161e+00, 9.403e-01, 1.408e-01, 2.036e-01, -8.272e-02, 1.226e-01, -1.586e-01, -6.721e-02, -1.057e-01, -5.341e-02), r);
	r = MulAdd(s3_4, M4(-1.353e+00, 7.765e-01, -7.567e-01, -9.945e-01, 4.103e-01, 1.065e-01, -1.313e-02, 2.815e-01, -3.549e-02, -1.802e-01, 3.446e-01, -3.407e-01, 3.526e-01, -3.278e-01, -2.182e-01, 7.167e-01), r);
	r = MulAdd(s3_5, M4(2.939e-01, 1.015e+00, -1.901e-01, 7.339e-01, -1.241e-01, 8.653e-02, 3.034e-01, 4.420e-01, 1.446e-02, 3.259e-04, 1.860e-01, -1.794e-02, -1.243e-01, -7.423e-02, -1.561e-01, -3.792e-01), r);
	r = MulAdd(s3_6, M4(-2.100e-02, 1.608e-01, 6.866e-02, 8.797e-02, 6.613e-01, -5.224e-01, -9.743e-01, 1.941e-01, -3.859e-02, 9.605e-02, -1.248e-02, -4.475e-03, 8.367e-02, 2.368e-01, 1.273e-01, -9.406e-02), r);
	r = MulAdd(s3_7, M4(1.635e-01, 3.213e-01, -1.108e-01, 5.477e-01, 2.577e-01, -4.309e-01, 2.367e-01, 7.649e-02, -1.462e-02, -1.543e-01, -1.300e-01, -3.833e-02, -4.089e-02, -2.145e-02, 2.984e-01, 5.385e-02), r);
	r = MulAdd(s3_8, M4(1.851e-01, -4.212e-01, -1.136e-01, -1.345e-01, -2.134e-01, -4.054e-01, -2.896e-01, -5.105e-02, -4.118e-02, -6.056e-02, 1.869e-02, 6.431e-02, 2.077e-02, -1.061e-01, -5.865e-02, 2.204e-01), r);
	return r;
}

// Pass 2, second output group (Pass2 stores the result in t3).
// Receives the same 36 taps as the first group, in the same order:
//   s0_* = max(t0 tap, 0)      s1_* = -max(-t0 tap, 0)
//   s2_* = max(t1 tap, 0)      s3_* = -max(-t1 tap, 0)
// Starting from its own constant vector, every tap is folded into the
// accumulator with its own constant 4x4 matrix via MulAdd, in the fixed order
// below.
// NOTE(review): MulAdd is pulled in by the `//!USE MulAdd` directive and is not
// defined in this file; presumably it returns the tap transformed by the
// matrix plus `r` — confirm. The literals are presumably trained network
// weights; do not reorder or hand-edit them.
V4 f1(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
	// Constant starting value of the accumulator.
	V4 r = { -1.228e-02, -4.362e-02, 4.248e-02, -2.610e-02 };
	r = MulAdd(s0_0, M4(1.340e-01, -2.739e-02, -5.723e-01, 5.792e-02, -2.152e-01, 5.695e-02, 1.657e-01, 1.371e-01, -1.275e-01, 2.891e-01, -2.673e-01, 9.725e-02, -5.268e-02, 5.140e-02, -3.580e-01, 4.427e-01), r);
	r = MulAdd(s0_1, M4(2.613e-01, 3.597e-01, -3.318e-01, -1.844e-02, -1.190e-02, -2.073e-01, -1.665e-01, -3.505e-01, 8.562e-02, -3.558e-01, 1.160e-01, 1.326e-03, -7.102e-02, -4.067e-01, 3.810e-01, 1.910e-01), r);
	r = MulAdd(s0_2, M4(4.934e-02, -5.566e-01, 6.730e-01, -1.377e-01, 1.549e-02, 2.280e-01, -3.924e-01, 1.057e-01, 6.662e-03, -2.799e-01, 9.640e-02, -8.985e-02, 1.093e-02, -1.567e-01, 8.349e-02, -1.169e-01), r);
	r = MulAdd(s0_3, M4(9.578e-02, -9.551e-02, 4.883e-02, 1.361e-01, -2.646e-01, 9.763e-02, -2.427e-01, 4.072e-01, -3.877e-01, 1.953e-01, 1.251e-01, 4.511e-02, 6.040e-02, 1.248e-02, -2.899e-02, 2.293e-01), r);
	r = MulAdd(s0_4, M4(1.141e-01, -3.109e-01, -2.500e-01, -2.775e-01, -8.209e-02, 7.398e-02, 2.509e-01, 4.549e-02, 7.525e-02, 4.394e-01, 3.993e-01, 1.754e-01, 1.776e-01, 2.971e-01, -3.441e-01, -5.120e-01), r);
	r = MulAdd(s0_5, M4(-6.703e-03, 5.090e-01, -1.500e-01, 2.635e-01, 1.989e-01, -6.816e-01, 4.591e-01, -3.330e-01, 4.645e-03, -1.490e-01, -1.983e-01, -1.401e-01, 1.173e-01, -1.492e-02, 3.411e-01, -3.166e-02), r);
	r = MulAdd(s0_6, M4(-3.800e-02, -1.477e-01, 5.108e-02, 2.905e-01, -7.616e-03, -1.992e-01, -1.272e-01, 1.880e-01, -3.035e-02, 2.286e-01, 8.193e-02, 2.364e-01, -4.650e-02, 6.422e-03, 2.178e-01, -3.440e-01), r);
	r = MulAdd(s0_7, M4(1.507e-02, -1.787e-01, 2.303e-01, -8.268e-02, -8.501e-02, -6.999e-02, -2.088e-01, 1.476e-01, 1.189e-01, 1.203e-01, -2.417e-01, -2.783e-01, 5.869e-02, 2.074e-01, -1.446e-01, 1.392e-01), r);
	r = MulAdd(s0_8, M4(9.754e-02, 1.925e-01, -3.828e-03, -2.816e-01, -1.213e-01, -2.292e-02, 2.923e-01, -2.626e-01, -1.298e-01, -9.575e-02, 2.873e-02, -9.571e-02, 9.200e-02, -1.673e-01, -1.188e-01, 8.730e-02), r);
	r = MulAdd(s1_0, M4(-2.350e-01, 1.587e-01, 3.174e-01, 1.542e-01, 1.652e-01, -9.002e-02, -4.229e-01, 9.736e-02, 5.939e-02, 3.149e-01, -1.059e-01, 1.857e-01, -3.531e-01, 2.776e-01, -2.455e-01, 4.585e-01), r);
	r = MulAdd(s1_1, M4(-3.605e-01, -1.254e-01, -4.447e-01, -4.215e-01, 2.543e-01, -1.086e-02, 3.330e-01, 1.645e-01, 4.044e-02, -1.669e-01, 1.289e-01, -2.380e-02, -3.433e-01, -3.620e-01, 1.712e-01, 1.668e-01), r);
	r = MulAdd(s1_2, M4(1.756e-02, -2.704e-03, -3.697e-01, 3.403e-01, -3.247e-02, -2.006e-01, -3.319e-02, -2.798e-01, -1.027e-01, -1.990e-01, -4.246e-01, 7.520e-02, 1.787e-01, -3.869e-01, 7.570e-02, -3.073e-01), r);
	r = MulAdd(s1_3, M4(-8.614e-02, 6.455e-02, -1.730e-01, 5.939e-02, -1.393e-02, -2.230e-02, 1.723e-01, 4.100e-01, 1.529e-02, -1.270e-01, -1.502e-01, 3.934e-01, -3.637e-01, -6.028e-02, 1.548e-01, 3.118e-01), r);
	r = MulAdd(s1_4, M4(-1.500e-01, -1.187e-01, 4.891e-01, 1.696e-01, 1.480e-01, 2.560e-01, -2.057e-01, -4.255e-01, 1.289e-01, -1.118e-02, -1.087e-02, -1.905e-01, 1.239e-01, 2.520e-02, -1.996e-02, -3.386e-01), r);
	r = MulAdd(s1_5, M4(1.052e-01, -2.239e-02, 3.975e-01, -1.405e-01, -1.267e-02, 4.224e-01, -2.635e-01, 9.741e-02, 1.863e-02, -2.286e-01, 3.328e-01, -2.667e-01, -6.544e-02, -3.208e-01, -2.388e-01, 4.362e-02), r);
	r = MulAdd(s1_6, M4(-5.319e-03, -1.865e-01, -4.191e-02, 2.387e-01, -2.874e-02, -1.661e-01, 1.281e-01, 2.304e-01, -2.066e-01, -1.021e-01, -1.088e-01, 2.609e-01, -1.502e-01, -1.255e-01, 1.263e-01, -4.176e-01), r);
	r = MulAdd(s1_7, M4(-6.257e-02, -1.212e-01, -2.666e-01, 5.840e-03, 1.525e-01, 2.046e-01, 2.017e-01, 4.602e-02, 3.198e-02, 9.142e-02, 6.658e-02, -1.799e-01, 5.912e-02, 2.451e-01, 4.026e-01, 2.306e-02), r);
	r = MulAdd(s1_8, M4(-3.453e-02, 1.717e-01, 1.058e-01, -2.543e-01, -8.375e-03, 1.824e-03, -1.615e-01, -2.510e-01, 4.237e-02, -7.161e-02, 1.259e-01, -9.696e-02, 3.739e-02, 2.963e-02, -3.003e-01, 5.154e-02), r);
	r = MulAdd(s2_0, M4(-3.399e-02, 4.481e-02, -1.857e-01, 2.356e-01, -3.322e-02, -5.841e-02, 6.648e-02, -1.404e-01, -1.145e-02, -6.816e-02, -1.611e-02, 5.459e-02, -5.165e-03, 2.280e-01, -7.486e-02, -1.055e-01), r);
	r = MulAdd(s2_1, M4(5.683e-02, -1.365e-01, -7.714e-03, 2.856e-02, 5.374e-02, -1.428e-01, 1.258e-01, -7.951e-02, 9.614e-02, -2.378e-01, 2.039e-01, -1.078e-01, -5.929e-02, 5.041e-02, -1.935e-01, -1.077e-01), r);
	r = MulAdd(s2_2, M4(-7.949e-02, 3.859e-02, 1.619e-01, -1.051e-01, 8.312e-02, 2.784e-02, -1.049e-01, 9.604e-02, -9.492e-02, 1.578e-01, -4.464e-02, 1.584e-03, -2.066e-03, 5.707e-02, -2.157e-02, 8.817e-02), r);
	r = MulAdd(s2_3, M4(1.038e-01, 8.953e-02, 5.480e-02, 2.716e-01, -7.691e-02, -3.918e-02, -9.394e-02, 9.078e-02, -1.943e-01, -7.785e-02, -2.547e-01, 8.458e-01, -1.954e-02, 1.182e-01, -1.381e-01, -3.832e-01), r);
	r = MulAdd(s2_4, M4(-2.084e-01, 3.375e-02, -1.733e-01, -7.274e-02, -9.431e-02, 1.635e-01, -1.894e-01, -3.871e-02, 4.172e-02, 2.193e-01, -3.097e-01, -4.415e-01, -1.866e-01, -3.925e-01, 2.134e-01, 6.298e-01), r);
	r = MulAdd(s2_5, M4(1.212e-01, -5.359e-02, -2.222e-02, -2.707e-01, -9.405e-02, 6.387e-02, 8.948e-03, 1.820e-01, -8.107e-02, 1.164e-01, -2.881e-01, -8.921e-02, -1.152e-01, 5.029e-01, -5.062e-01, 7.837e-02), r);
	r = MulAdd(s2_6, M4(-5.824e-02, 9.913e-02, 9.971e-02, 1.557e-01, 7.915e-02, -8.769e-02, 7.980e-02, -3.432e-02, -1.692e-01, -9.786e-02, 1.282e-01, 1.709e-01, -1.265e-01, -6.808e-02, 7.225e-02, 1.675e-01), r);
	r = MulAdd(s2_7, M4(-4.990e-02, -6.602e-03, -1.068e-01, -1.430e-02, 8.055e-02, -2.758e-02, -6.937e-02, -4.834e-02, -1.227e-02, 8.519e-02, 7.793e-01, -1.867e-01, -3.421e-01, -7.152e-02, -1.059e-01, 1.740e-01), r);
	r = MulAdd(s2_8, M4(8.256e-02, -7.590e-03, 6.570e-02, -1.256e-01, -8.610e-02, 2.026e-01, -3.546e-02, 1.104e-01, 1.248e-01, -1.588e-01, 5.464e-01, -6.521e-02, 1.475e-01, -1.531e-01, 3.597e-01, -1.642e-01), r);
	r = MulAdd(s3_0, M4(6.340e-01, 3.940e-01, 1.113e+00, 1.007e+00, -1.212e+00, 2.533e-01, -1.397e+00, 6.065e-01, -7.533e-02, -2.773e-02, -1.402e-02, 4.141e-02, 8.358e-02, 7.306e-02, 7.087e-03, -1.155e-01), r);
	r = MulAdd(s3_1, M4(1.419e+00, 4.487e-01, -7.031e-01, -8.197e-01, 2.210e-01, -1.184e+00, 8.590e-01, -4.940e-01, 1.331e-01, -3.564e-01, 2.108e-01, -8.584e-02, -2.572e-01, -1.731e-02, 2.636e-01, -1.133e-01), r);
	r = MulAdd(s3_2, M4(9.915e-01, -7.961e-01, 5.330e-01, -4.510e-01, -5.253e-01, 1.903e-01, -4.662e-01, 3.206e-01, 2.827e-02, 2.187e-02, -3.303e-02, 1.886e-02, -1.535e-02, -1.334e-02, 6.018e-03, 1.582e-01), r);
	r = MulAdd(s3_3, M4(4.868e-02, 5.590e-01, 5.528e-02, 4.772e-01, 2.589e-01, -4.851e-01, 9.148e-01, 7.832e-01, -3.434e-01, -5.368e-02, -1.723e-01, 1.019e+00, 2.202e-01, 1.859e-01, 5.046e-03, -7.167e-01), r);
	r = MulAdd(s3_4, M4(2.415e+00, 1.390e+00, 6.607e-01, -2.473e-01, 2.204e-01, -1.690e-01, -6.748e-01, -6.208e-01, 5.492e-02, 1.191e-01, -3.462e-01, -4.150e-01, 2.729e-01, -2.437e-01, 1.929e-01, 1.347e-01), r);
	r = MulAdd(s3_5, M4(9.691e-01, -8.869e-01, 3.704e-01, -5.191e-02, -1.921e-02, 1.358e+00, -4.369e-01, 4.558e-01, -5.294e-02, 3.324e-02, -1.043e-01, -1.216e-01, 1.593e-01, 5.623e-01, -4.612e-01, -1.042e-01), r);
	r = MulAdd(s3_6, M4(-1.293e-01, 4.959e-01, 1.363e-01, 5.027e-02, -5.043e-01, -4.472e-01, -6.873e-02, -6.303e-02, -2.425e-01, 1.265e-01, -1.273e-01, 2.420e-01, 6.472e-03, 5.157e-02, 1.899e-01, 6.324e-02), r);
	r = MulAdd(s3_7, M4(-2.009e-01, 5.446e-01, -2.478e-01, -9.895e-03, 6.806e-02, -3.905e-01, -3.134e-01, -4.657e-01, -9.783e-02, -1.021e-01, 3.844e-01, -1.041e-01, -4.335e-03, 3.746e-01, -1.352e-01, 5.207e-02), r);
	r = MulAdd(s3_8, M4(3.874e-01, 2.981e-02, 2.205e-01, -1.412e-01, 3.674e-01, 8.717e-01, -6.576e-01, 4.285e-01, -4.677e-02, 1.490e-02, 1.870e-01, -1.712e-02, 1.799e-01, 2.057e-01, -9.212e-02, -6.896e-02), r);
	return r;
}

// Pass 2 entry point. Each invocation handles one texel: it reads the nine
// taps around that texel from t0 and from t1, splits every tap into a
// non-negative half and a non-positive half, and writes the two results of
// f0/f1 to t2 and t3.
//   blockStart - offset added to the swizzled thread position
//   tid        - thread id; only tid.x is used, remapped by TileSwizzle8x8
void Pass2(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` are read implicitly by the O()/l0()/l1() macros, so these
	// two locals must keep exactly these names.
	float2 pt = float2(GetInputPt());
	uint2 texel = TileSwizzle8x8(tid.x) + blockStart;
	uint2 extent = GetInputSize();

	// Invocations that land outside the input write nothing.
	if (any(texel >= extent)) {
		return;
	}

	// Texel centre scaled by pt (presumably one input texel in UV units — confirm).
	float2 pos = (texel + 0.5) * pt;

	// Raw taps from t0, row by row: y = -1, then y = 0, then y = +1.
	V4 u0 = l0(-1.0, -1.0);
	V4 u1 = l0(0.0, -1.0);
	V4 u2 = l0(1.0, -1.0);
	V4 u3 = l0(-1.0, 0.0);
	V4 u4 = l0(0.0, 0.0);
	V4 u5 = l0(1.0, 0.0);
	V4 u6 = l0(-1.0, 1.0);
	V4 u7 = l0(0.0, 1.0);
	V4 u8 = l0(1.0, 1.0);

	// Raw taps from t1, same order.
	V4 v0 = l1(-1.0, -1.0);
	V4 v1 = l1(0.0, -1.0);
	V4 v2 = l1(1.0, -1.0);
	V4 v3 = l1(-1.0, 0.0);
	V4 v4 = l1(0.0, 0.0);
	V4 v5 = l1(1.0, 0.0);
	V4 v6 = l1(-1.0, 1.0);
	V4 v7 = l1(0.0, 1.0);
	V4 v8 = l1(1.0, 1.0);

	// t0 taps clamped from below at zero (the f0/f1 "s0" inputs).
	V4 uPos0 = max(u0, 0.0);
	V4 uPos1 = max(u1, 0.0);
	V4 uPos2 = max(u2, 0.0);
	V4 uPos3 = max(u3, 0.0);
	V4 uPos4 = max(u4, 0.0);
	V4 uPos5 = max(u5, 0.0);
	V4 uPos6 = max(u6, 0.0);
	V4 uPos7 = max(u7, 0.0);
	V4 uPos8 = max(u8, 0.0);

	// t0 taps with the positive part removed (the "s1" inputs).
	V4 uNeg0 = -max(-u0, 0.0);
	V4 uNeg1 = -max(-u1, 0.0);
	V4 uNeg2 = -max(-u2, 0.0);
	V4 uNeg3 = -max(-u3, 0.0);
	V4 uNeg4 = -max(-u4, 0.0);
	V4 uNeg5 = -max(-u5, 0.0);
	V4 uNeg6 = -max(-u6, 0.0);
	V4 uNeg7 = -max(-u7, 0.0);
	V4 uNeg8 = -max(-u8, 0.0);

	// t1 taps clamped from below at zero (the "s2" inputs).
	V4 vPos0 = max(v0, 0.0);
	V4 vPos1 = max(v1, 0.0);
	V4 vPos2 = max(v2, 0.0);
	V4 vPos3 = max(v3, 0.0);
	V4 vPos4 = max(v4, 0.0);
	V4 vPos5 = max(v5, 0.0);
	V4 vPos6 = max(v6, 0.0);
	V4 vPos7 = max(v7, 0.0);
	V4 vPos8 = max(v8, 0.0);

	// t1 taps with the positive part removed (the "s3" inputs).
	V4 vNeg0 = -max(-v0, 0.0);
	V4 vNeg1 = -max(-v1, 0.0);
	V4 vNeg2 = -max(-v2, 0.0);
	V4 vNeg3 = -max(-v3, 0.0);
	V4 vNeg4 = -max(-v4, 0.0);
	V4 vNeg5 = -max(-v5, 0.0);
	V4 vNeg6 = -max(-v6, 0.0);
	V4 vNeg7 = -max(-v7, 0.0);
	V4 vNeg8 = -max(-v8, 0.0);

	t2[texel] = f0(uPos0, uPos1, uPos2, uPos3, uPos4, uPos5, uPos6, uPos7, uPos8, uNeg0, uNeg1, uNeg2, uNeg3, uNeg4, uNeg5, uNeg6, uNeg7, uNeg8, vPos0, vPos1, vPos2, vPos3, vPos4, vPos5, vPos6, vPos7, vPos8, vNeg0, vNeg1, vNeg2, vNeg3, vNeg4, vNeg5, vNeg6, vNeg7, vNeg8);
	t3[texel] = f1(uPos0, uPos1, uPos2, uPos3, uPos4, uPos5, uPos6, uPos7, uPos8, uNeg0, uNeg1, uNeg2, uNeg3, uNeg4, uNeg5, uNeg6, uNeg7, uNeg8, vPos0, vPos1, vPos2, vPos3, vPos4, vPos5, vPos6, vPos7, vPos8, vNeg0, vNeg1, vNeg2, vNeg3, vNeg4, vNeg5, vNeg6, vNeg7, vNeg8);
}

//!PASS 3
//!DESC Convolution layer 2 - Feature processing
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t2, t3
//!OUT t0, t1

// Pass 3 tap fetchers: l0 reads t2 and l1 reads t3 at offset (x, y) through
// O(), so both rely on `pos` and `pt` being in scope at the call site.
// NOTE(review): l0/l1 are also defined by other passes with no #undef;
// presumably each pass is compiled as its own unit — confirm.
#define l0(x, y) V4(O(t2, float2(x, y)))
#define l1(x, y) V4(O(t3, float2(x, y)))

// Pass 3, first output group.
// Takes four sets of nine V4 taps (s0_*..s3_*) and returns one V4: starting
// from a constant vector, every tap is folded into the accumulator with its
// own constant 4x4 matrix via MulAdd, in the fixed order below.
// NOTE(review): the tap sets are presumably built by Pass3 from t2/t3 the same
// way Pass2 builds them from t0/t1 (positive and negative halves) — confirm
// against Pass3. MulAdd is pulled in by the `//!USE MulAdd` directive and is
// not defined in this file; presumably it returns the tap transformed by the
// matrix plus `r` — confirm. The literals are presumably trained network
// weights; do not reorder or hand-edit them.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
	// Constant starting value of the accumulator.
	V4 r = { 3.966e-02, 2.168e-04, 3.712e-03, 7.866e-03 };
	r = MulAdd(s0_0, M4(-1.867e-01, -2.287e-03, -1.191e-01, -1.249e-02, -8.376e-02, 2.333e-01, 3.011e-02, 8.727e-02, -1.528e-01, 4.522e-02, -1.092e-01, 1.235e-01, 1.179e-01, 1.085e-01, 2.075e-01, -4.524e-02), r);
	r = MulAdd(s0_1, M4(-6.251e-02, 9.622e-02, -1.299e-01, -8.819e-02, 8.242e-02, -4.033e-01, -1.512e-01, -8.449e-02, -2.673e-01, 3.868e-01, 2.615e-02, 1.069e-01, 2.055e-01, -1.800e-01, 7.375e-03, 8.391e-03), r);
	r = MulAdd(s0_2, M4(1.410e-01, -8.185e-02, 2.465e-02, 3.546e-02, -7.339e-02, -4.492e-02, -6.633e-02, -1.005e-02, 5.993e-02, 5.536e-02, -6.278e-03, -8.258e-02, -1.498e-01, 5.872e-02, 1.560e-01, -7.461e-02), r);
	r = MulAdd(s0_3, M4(-2.116e-01, 2.332e-01, 2.930e-01, -1.691e-01, -9.505e-02, -8.336e-02, 1.302e-01, -8.985e-03, 1.795e-01, 4.858e-01, -5.144e-02, 1.663e-01, 2.647e-01, -3.083e-01, -1.612e-01, 9.123e-02), r);
	r = MulAdd(s0_4, M4(2.233e-01, 3.415e-01, 2.541e-01, 1.521e-01, 2.939e-01, 5.545e-02, 2.474e-01, -1.522e-01, 1.945e-01, 1.302e-01, 6.035e-02, 1.204e-01, -5.938e-02, -1.101e-01, -9.751e-02, -2.144e-01), r);
	r = MulAdd(s0_5, M4(-1.567e-01, 7.505e-02, 7.642e-02, -1.079e-01, 3.845e-02, -9.225e-02, 1.250e-02, -9.117e-02, 3.753e-01, -1.417e-01, -7.198e-02, 3.818e-01, -5.320e-02, -2.646e-01, 2.586e-03, 1.312e-01), r);
	r = MulAdd(s0_6, M4(-9.201e-02, 1.449e-01, 2.745e-01, -2.933e-02, 8.872e-02, -7.589e-02, 1.074e-01, -1.002e-01, -8.457e-03, -3.762e-01, 5.415e-02, 1.822e-01, 2.688e-01, -2.326e-01, -5.302e-01, -6.336e-02), r);
	r = MulAdd(s0_7, M4(1.119e-01, -1.561e-02, -6.547e-03, -9.972e-02, -1.073e-01, 9.063e-02, 3.843e-02, 6.815e-02, 3.779e-01, -1.196e-01, -1.353e-01, -2.813e-02, 1.060e-01, 2.387e-01, 1.267e-02, 1.083e-01), r);
	r = MulAdd(s0_8, M4(-9.346e-02, 1.792e-02, -1.428e-01, 3.228e-02, -1.043e-01, -1.120e-01, -3.488e-03, -4.356e-02, -2.876e-01, -7.053e-02, -4.502e-02, -5.783e-02, -2.254e-01, 5.783e-02, 3.232e-01, 7.605e-04), r);
	r = MulAdd(s1_0, M4(9.940e-02, 9.588e-03, -3.584e-01, 1.252e-01, -1.917e-01, 1.258e-01, 2.297e-01, 1.390e-01, 4.639e-02, -1.121e-01, -1.118e-01, -4.381e-02, -1.695e-01, 1.821e-01, 2.465e-01, -9.049e-02), r);
	r = MulAdd(s1_1, M4(5.143e-02, 1.069e-02, -8.068e-02, -8.982e-02, -2.494e-02, -7.513e-02, 1.598e-02, 3.467e-02, -1.174e-01, 2.261e-01, -8.818e-02, -1.923e-02, -3.776e-02, -3.819e-01, -8.594e-02, 4.435e-03), r);
	r = MulAdd(s1_2, M4(-3.623e-02, -5.439e-02, -9.351e-03, 1.235e-01, -9.703e-03, -1.335e-01, 1.036e-01, 2.097e-02, 1.458e-02, 3.203e-02, 1.967e-01, -1.818e-02, -9.724e-02, -7.890e-02, 3.670e-02, -4.865e-02), r);
	r = MulAdd(s1_3, M4(-2.127e-01, -2.056e-02, 3.076e-01, 5.685e-02, 9.937e-02, -2.134e-01, 3.166e-02, -4.301e-02, 6.663e-02, -5.974e-02, 8.903e-02, 3.410e-02, 4.111e-02, 1.169e-01, 4.898e-02, 5.654e-02), r);
	r = MulAdd(s1_4, M4(1.874e-01, 1.435e-01, -4.010e-01, 6.335e-05, 3.853e-01, -4.015e-01, -4.051e-01, -2.454e-01, -2.675e-02, 2.236e-01, 2.143e-01, -4.164e-02, -2.142e-01, -1.742e-01, 4.441e-01, -2.792e-01), r);
	r = MulAdd(s1_5, M4(-1.686e-01, -3.803e-02, 1.583e-01, 9.059e-02, -1.233e-01, 7.512e-02, 1.737e-01, 6.937e-02, 1.250e-01, -9.459e-02, -2.433e-01, 2.107e-01, 8.135e-02, -1.054e-01, -1.870e-01, -1.011e-01), r);
	r = MulAdd(s1_6, M4(1.078e-01, -1.143e-01, 2.303e-01, 1.447e-01, -6.396e-02, -1.863e-01, -1.454e-01, -1.476e-01, 2.346e-02, -1.712e-02, 3.948e-02, -2.445e-02, -7.822e-02, 5.100e-01, 1.676e-01, 5.526e-02), r);
	r = MulAdd(s1_7, M4(2.939e-01, -3.193e-01, 1.130e-01, 1.019e-01, -2.836e-01, -3.252e-01, -1.221e-01, -1.886e-01, -1.541e-02, -3.348e-02, -1.192e-01, 5.553e-02, 2.747e-01, 3.810e-01, -1.083e-01, 8.926e-02), r);
	r = MulAdd(s1_8, M4(-1.914e-01, -1.810e-01, 3.616e-01, 1.443e-01, -2.356e-01, 1.987e-02, 2.206e-01, -7.203e-03, -4.873e-02, 1.449e-02, 2.429e-03, 4.135e-03, 1.601e-01, 1.436e-02, -2.503e-01, -1.747e-01), r);
	r = MulAdd(s2_0, M4(5.158e-02, -5.768e-01, 1.059e-01, 6.356e-02, 5.453e-02, 1.182e-01, 1.025e-01, -4.581e-03, -4.476e-02, -7.326e-02, 3.128e-02, -1.658e-02, 7.979e-02, -1.015e-01, -7.756e-03, 5.261e-02), r);
	r = MulAdd(s2_1, M4(-9.209e-02, 5.020e-04, 1.687e-01, 2.337e-01, 5.893e-02, -1.190e-02, 2.134e-01, 2.056e-01, -8.468e-02, 1.393e-01, 2.666e-01, 8.482e-02, 9.005e-03, 1.830e-01, 1.487e-01, 1.541e-01), r);
	r = MulAdd(s2_2, M4(-1.577e-02, 1.800e-01, -4.463e-02, 1.857e-01, -2.505e-02, -4.244e-02, 3.534e-02, 1.091e-02, 3.562e-02, -1.613e-02, 7.525e-02, -6.411e-02, -1.120e-01, 2.570e-01, 1.793e-01, 3.232e-01), r);
	r = MulAdd(s2_3, M4(1.081e-01, -3.272e-01, 9.989e-02, -1.714e-01, 1.272e-02, -8.817e-02, -6.541e-01, 9.601e-03, 5.846e-03, -1.214e-01, 4.286e-02, -1.424e-04, 2.707e-02, -1.255e-01, -1.766e-01, 7.553e-02), r);
	r = MulAdd(s2_4, M4(1.707e-01, 2.406e-02, 7.128e-01, 3.512e-01, 4.649e-01, -2.976e-02, -2.139e-01, -1.608e-01, 1.348e-01, -1.933e-01, -1.625e-01, -1.502e-01, 3.457e-02, -2.310e-01, -1.418e-01, -1.607e-01), r);
	r = MulAdd(s2_5, M4(2.141e-01, -3.424e-01, -2.787e-01, 1.588e-01, -8.585e-03, 1.987e-01, 8.631e-02, -2.482e-02, -1.159e-01, 3.130e-02, 6.300e-02, -1.646e-01, 2.076e-01, 3.123e-01, 7.450e-02, 8.300e-01), r);
	r = MulAdd(s2_6, M4(-1.156e-01, 5.885e-01, 2.790e-01, 1.944e-01, -2.609e-02, 1.192e-01, 1.064e-01, 2.452e-01, 6.769e-02, 7.500e-02, 1.444e-01, 1.657e-02, 1.032e-04, 6.885e-02, -3.119e-01, 1.838e-02), r);
	r = MulAdd(s2_7, M4(2.686e-01, -3.624e-01, -3.820e-01, -1.910e-02, 2.541e-01, -3.535e-02, 3.461e-02, 6.867e-02, 1.792e-01, -4.944e-02, 9.518e-02, -9.353e-02, 1.577e-01, -2.950e-01, -5.366e-01, -9.244e-02), r);
	r = MulAdd(s2_8, M4(-5.502e-02, -3.642e-02, 2.950e-01, 1.710e-02, 1.386e-03, -4.643e-03, -5.468e-02, 1.526e-01, -3.839e-02, 2.502e-02, 6.890e-02, -5.190e-02, -3.046e-01, -8.298e-02, -2.710e-01, -4.361e-02), r);
	r = MulAdd(s3_0, M4(3.470e-02, -1.794e-01, 4.939e-03, 5.871e-02, -3.644e-02, 2.163e-01, 1.659e-01, 1.729e-02, -7.065e-02, -4.364e-02, -1.401e-01, -6.392e-02, 2.933e-02, -2.708e-01, -6.419e-02, 2.552e-02), r);
	r = MulAdd(s3_1, M4(2.013e-01, 1.346e-01, 1.245e-01, 2.012e-01, 1.159e-01, 2.536e-02, 3.348e-01, 2.151e-01, -1.428e-01, 3.330e-02, 5.570e-02, 1.265e-01, -1.421e-01, -2.205e-01, -2.407e-02, -1.310e-02), r);
	r = MulAdd(s3_2, M4(1.897e-01, 1.382e-01, 8.547e-02, 1.685e-02, 3.906e-02, -1.645e-02, 8.177e-02, -1.090e-01, 2.720e-02, -1.483e-01, -1.471e-01, -3.796e-02, -9.755e-02, 2.098e-01, -1.363e-01, 8.741e-04), r);
	r = MulAdd(s3_3, M4(-8.746e-02, -2.411e-01, -5.564e-01, 1.288e-01, -1.560e-03, -1.454e-01, -3.086e-01, -1.364e-01, -6.448e-02, 2.139e-02, -6.920e-02, 1.125e-01, 2.256e-02, -1.337e-01, 2.318e-01, -9.337e-02), r);
	r = MulAdd(s3_4, M4(1.330e-01, 7.059e-02, 2.570e-01, -3.958e-02, 2.678e-01, -1.782e-01, -4.249e-01, -7.859e-02, -7.410e-03, 4.629e-02, 1.869e-01, 6.028e-02, -2.636e-01, -8.746e-01, 4.294e-01, -7.591e-01), r);
	r = MulAdd(s3_5, M4(6.538e-02, 3.535e-01, 3.381e-01, 1.665e-01, -1.464e-01, -4.395e-02, 6.332e-02, 6.707e-02, 4.085e-02, 3.123e-02, -8.943e-02, -1.124e-01, 7.129e-01, -2.572e-01, -3.428e-02, -3.166e-01), r);
	r = MulAdd(s3_6, M4(3.447e-02, 1.964e-02, -1.246e-02, -4.894e-02, -3.053e-02, 6.215e-02, 1.432e-01, 4.347e-02, 9.639e-02, 2.140e-01, 1.791e-01, 1.898e-01, -5.042e-02, 2.674e-01, 1.714e-01, 8.155e-02), r);
	r = MulAdd(s3_7, M4(3.324e-02, -2.281e-01, 8.075e-02, 8.469e-02, 7.409e-02, 4.094e-02, 1.500e-01, 7.922e-02, 1.632e-01, 3.876e-02, -1.316e-01, 1.200e-01, 3.408e-01, 1.752e-01, -1.315e-01, 1.251e-01), r);
	r = MulAdd(s3_8, M4(-1.043e-01, -2.295e-01, -2.968e-01, -7.326e-02, 9.514e-02, 1.595e-02, -2.148e-02, -7.380e-02, -3.873e-02, 5.639e-02, 5.948e-02, 6.939e-02, 4.680e-01, 2.191e-01, -1.091e-01, -4.211e-02), r);
	return r;
}

// Pass 3, second output group: the value Pass3 stores into t1.
//
// Starts from a constant 4-component vector and folds in the 36 input vectors
// one at a time with MulAdd(input, M4(...), r), in a fixed order.
// MulAdd comes from the framework (`//!USE MulAdd`); presumably it computes
// mul(input, matrix) + r -- confirm against the framework source.
//
// Parameter naming, as supplied by Pass3:
//   s0_k / s1_k : max(v, 0) and -max(-v, 0) of the l0 sample at tap k
//   s2_k / s3_k : max(v, 0) and -max(-v, 0) of the l1 sample at tap k
//   k = 0..8 walks the 3x3 offsets row by row: (-1,-1), (0,-1), (1,-1),
//   (-1,0), (0,0), (1,0), (-1,1), (0,1), (1,1).
//
// The constants look like generated network weights; do not reorder or
// reformat the accumulation, since that would change floating-point rounding.
V4 f1(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
	// Initial accumulator value (presumably the layer bias).
	V4 r = { 2.464e-02, -1.503e-02, -3.365e-02, 7.710e-03 };
	// s0_*: nine taps of the first input, non-negative part.
	r = MulAdd(s0_0, M4(1.510e-01, 1.269e-02, -1.267e-01, 1.446e-01, 3.963e-02, 3.673e-02, -3.129e-02, 2.499e-02, 1.567e-02, 1.863e-01, -4.694e-03, -1.522e-01, -1.215e-01, 1.422e-01, 1.161e-01, -1.114e-01), r);
	r = MulAdd(s0_1, M4(-2.120e-02, 1.988e-01, 1.436e-01, -1.165e-01, 1.754e-01, 6.868e-02, -1.784e-01, 1.453e-02, 1.257e-02, -3.559e-01, 6.345e-02, 7.528e-02, -1.168e-01, -3.730e-01, -1.436e-02, 2.597e-02), r);
	r = MulAdd(s0_2, M4(-1.577e-01, 4.407e-03, -3.120e-02, -3.013e-03, -1.327e-01, -1.630e-01, -8.175e-02, -1.577e-02, -2.674e-01, -5.831e-02, 1.139e-01, -9.667e-02, 1.499e-01, -3.017e-02, -7.416e-02, 8.048e-02), r);
	r = MulAdd(s0_3, M4(-3.099e-01, 1.730e-01, 4.941e-02, 3.643e-01, 5.996e-03, -1.379e-01, 3.862e-01, 1.959e-01, 3.855e-01, 2.959e-01, -1.075e-01, -3.195e-01, 5.520e-02, -2.633e-02, 6.609e-01, -2.863e-01), r);
	r = MulAdd(s0_4, M4(-1.581e-01, 5.127e-01, 5.755e-01, -1.758e-02, 2.351e-02, -1.945e-01, -3.701e-01, -2.035e-01, 1.510e+00, -6.021e-01, 2.540e-01, 1.989e-03, -2.473e-01, 6.479e-01, -1.236e-01, -2.509e-02), r);
	r = MulAdd(s0_5, M4(6.567e-02, 4.649e-02, 1.349e-01, 4.919e-02, 4.185e-02, -1.002e-02, 9.981e-02, -2.859e-02, -2.278e-01, 4.727e-01, 1.868e-01, -2.588e-01, 8.428e-02, 7.575e-03, -3.525e-01, 3.996e-03), r);
	r = MulAdd(s0_6, M4(-1.744e-01, -9.151e-02, 1.860e-01, 2.731e-01, -6.986e-02, -3.020e-02, -1.869e-01, -1.327e-02, -1.042e-01, -2.540e-02, -2.241e-01, 3.422e-01, -1.022e-01, 1.238e-01, 2.314e-01, -3.349e-01), r);
	r = MulAdd(s0_7, M4(-2.791e-02, 1.601e-01, 2.001e-01, -7.415e-02, 1.847e-01, -9.655e-02, -3.229e-02, 2.887e-02, -9.137e-02, -3.329e-02, 2.700e-01, 9.334e-02, 9.282e-02, -6.146e-02, 1.428e-01, 1.684e-01), r);
	r = MulAdd(s0_8, M4(1.795e-01, -2.872e-02, 3.678e-02, 6.662e-02, -1.044e-01, -3.932e-02, 2.314e-02, -7.340e-02, 9.445e-02, 1.306e-01, -2.439e-01, 9.100e-02, 3.984e-02, -9.294e-02, -6.069e-02, -2.545e-02), r);
	// s1_*: nine taps of the first input, non-positive part.
	r = MulAdd(s1_0, M4(4.324e-01, 2.259e-01, -4.051e-02, 4.797e-03, -1.294e-01, 2.765e-02, 1.875e-01, 5.255e-02, -1.390e-02, 2.073e-01, 6.489e-02, -1.416e-01, -2.698e-01, 2.849e-02, 1.129e-01, 6.424e-02), r);
	r = MulAdd(s1_1, M4(-1.527e-01, -3.081e-01, -2.367e-01, -1.744e-01, -1.040e-02, -1.817e-01, 1.216e-01, -5.997e-03, -1.056e-01, 1.047e-02, -9.173e-02, 6.123e-02, 1.587e-01, 3.438e-01, 8.663e-02, 1.487e-01), r);
	r = MulAdd(s1_2, M4(-1.173e-01, -1.156e-01, -4.805e-02, 5.150e-02, -8.245e-02, 7.996e-02, -1.812e-01, 1.072e-01, -4.140e-02, -1.859e-01, 9.617e-03, 9.194e-02, 9.678e-03, 4.886e-02, -7.522e-02, 1.882e-02), r);
	r = MulAdd(s1_3, M4(4.124e-02, -1.781e-01, -8.895e-01, 1.978e-02, -1.229e-01, -1.255e-01, 1.666e-01, 3.719e-01, -1.012e-01, -4.286e-02, 8.884e-02, -1.577e-01, -1.258e-01, 1.669e-01, 4.306e-01, 2.863e-01), r);
	r = MulAdd(s1_4, M4(-3.640e-01, 7.624e-02, -5.308e-01, -3.999e-02, 6.426e-02, -2.244e-01, -2.932e-01, -2.347e-01, 3.475e-01, -1.236e-01, 1.469e-01, 8.772e-02, 9.278e-02, 5.106e-01, -7.446e-02, -5.061e-02), r);
	r = MulAdd(s1_5, M4(2.733e-01, 2.298e-01, -4.516e-02, 1.068e-01, -6.146e-02, 1.248e-01, 1.512e-01, -1.409e-01, -2.574e-02, 1.303e-01, -6.689e-02, -1.138e-01, 3.539e-02, 3.450e-03, -3.933e-02, -2.456e-02), r);
	r = MulAdd(s1_6, M4(-1.740e-01, 1.962e-01, -2.318e-01, 4.245e-02, 1.837e-01, 1.318e-02, 6.216e-02, -8.214e-02, 3.329e-02, 4.938e-02, -1.139e-01, 2.849e-02, 7.127e-02, -1.978e-01, -6.718e-02, 3.379e-01), r);
	r = MulAdd(s1_7, M4(1.036e-01, -1.074e-01, 1.211e-01, -1.502e-01, 3.505e-01, 2.469e-02, 3.160e-02, 1.840e-01, -5.042e-02, 1.114e-02, 1.469e-02, -2.675e-02, -6.888e-02, -1.076e-01, -1.764e-01, -1.250e-03), r);
	r = MulAdd(s1_8, M4(-1.227e-01, 3.169e-02, -1.346e-01, 2.663e-02, -3.523e-01, -4.952e-02, 2.526e-02, -3.065e-01, 1.105e-01, -1.705e-02, -2.442e-02, 4.761e-02, 2.978e-01, -1.184e-01, -2.559e-02, -1.439e-01), r);
	// s2_*: nine taps of the second input, non-negative part.
	r = MulAdd(s2_0, M4(-6.434e-02, -1.450e-01, 6.691e-02, -1.498e-02, -2.586e-01, -1.415e-01, 2.408e-02, 1.405e-02, -9.741e-02, -2.278e-02, -7.323e-03, 1.029e-01, 1.971e-01, -7.889e-02, 1.621e-01, -1.481e-01), r);
	r = MulAdd(s2_1, M4(7.306e-02, 8.261e-03, -1.106e-01, -6.119e-02, 9.048e-02, 3.668e-01, -4.932e-02, 1.419e-01, -1.065e-01, 1.921e-01, -6.933e-03, 1.397e-01, 1.131e-01, -2.959e-01, 2.377e-01, -8.244e-02), r);
	r = MulAdd(s2_2, M4(-6.484e-02, -8.438e-02, -8.452e-02, 7.652e-02, 1.060e-01, 2.751e-01, -4.448e-02, 6.228e-02, 2.452e-02, 1.715e-01, -2.210e-01, 1.880e-01, -1.089e-01, -4.575e-01, 1.402e-01, 2.589e-02), r);
	r = MulAdd(s2_3, M4(5.019e-02, 1.179e-01, -3.322e-01, -1.521e-01, 1.682e-01, 5.881e-02, -4.209e-01, -8.599e-02, 1.338e-02, 1.764e-02, -1.383e-01, 7.389e-02, 3.577e-02, 7.338e-02, -2.762e-01, -9.510e-02), r);
	r = MulAdd(s2_4, M4(-1.853e-01, -4.620e-01, -1.324e-01, -2.907e-01, 4.004e-01, 6.572e-02, -3.924e-01, 3.585e-02, 1.534e-02, 2.261e-01, -1.448e-01, 7.334e-02, -4.871e-01, 2.474e-01, 6.138e-01, 1.618e-01), r);
	r = MulAdd(s2_5, M4(-3.986e-02, -5.998e-02, -1.165e-01, 2.199e-01, 1.416e-01, -2.026e-01, -1.034e-01, -8.049e-03, 2.830e-01, 1.312e-03, -1.952e-01, 1.926e-01, 1.404e-01, -1.712e-01, 2.704e-01, 5.702e-02), r);
	r = MulAdd(s2_6, M4(1.954e-01, 1.098e-01, -4.060e-01, 6.263e-01, 9.565e-02, 3.321e-02, -9.177e-02, 1.727e-01, -3.629e-02, -3.498e-02, -1.498e-01, -1.020e-02, 7.289e-02, -1.008e-01, -6.138e-02, -7.229e-02), r);
	r = MulAdd(s2_7, M4(4.694e-01, 4.689e-02, -2.026e-02, -1.420e-01, -1.815e-01, 1.337e-01, -3.029e-03, 8.901e-03, -7.604e-02, 6.520e-03, -9.244e-02, 1.180e-01, 1.924e-01, 4.576e-03, 9.215e-02, 1.393e-01), r);
	r = MulAdd(s2_8, M4(-2.694e-01, -7.673e-02, -3.907e-02, 2.054e-01, -7.508e-02, 4.244e-02, 8.589e-03, -2.004e-02, 1.029e-01, -2.785e-04, -1.329e-01, 1.480e-01, 1.240e-01, 1.682e-01, 4.375e-02, 5.794e-02), r);
	// s3_*: nine taps of the second input, non-positive part.
	r = MulAdd(s3_0, M4(1.908e-01, -9.550e-02, -6.231e-02, -8.961e-02, -2.150e-01, -5.266e-02, -5.877e-02, 6.574e-02, 1.206e-01, -5.647e-02, 1.819e-01, 9.125e-03, -1.411e-02, -7.630e-02, 1.694e-01, -6.670e-02), r);
	r = MulAdd(s3_1, M4(-7.247e-02, -1.664e-02, 3.111e-02, -8.914e-02, 1.373e-02, -7.042e-03, 2.124e-01, 8.056e-03, 1.248e-01, 1.013e-01, 1.053e-01, 8.886e-02, -3.534e-01, -5.092e-02, -8.903e-02, 1.120e-01), r);
	r = MulAdd(s3_2, M4(-1.274e-01, 1.296e-01, 6.739e-02, -1.061e-01, 1.488e-02, 1.174e-01, -2.220e-02, 3.190e-02, -8.067e-03, 6.182e-03, 3.938e-02, -1.144e-02, 1.720e-02, 2.232e-02, -1.033e-01, 1.938e-01), r);
	r = MulAdd(s3_3, M4(2.119e-01, -7.107e-03, -1.771e-01, 3.717e-03, -9.809e-02, -2.060e-01, -1.242e-01, 1.331e-01, 1.656e-01, -4.429e-02, 3.430e-01, -1.180e-01, -8.243e-02, 2.318e-01, -3.745e-01, 2.954e-02), r);
	r = MulAdd(s3_4, M4(5.462e-02, 2.234e-01, 5.211e-02, 4.298e-02, 2.002e-01, -6.029e-02, 8.922e-02, -1.422e-01, 4.850e-03, 2.692e-02, 1.883e-03, -5.867e-02, -1.761e-01, -2.994e-01, -1.180e-01, 5.357e-01), r);
	r = MulAdd(s3_5, M4(-1.020e-01, -9.731e-03, 4.965e-02, 1.040e-01, 3.933e-02, -7.445e-02, -8.691e-02, 2.101e-02, 1.135e-01, -4.423e-02, 7.911e-02, -7.177e-04, 7.153e-03, -2.231e-01, -2.036e-01, 2.961e-01), r);
	r = MulAdd(s3_6, M4(-8.583e-02, -1.500e-01, -4.673e-02, -1.242e-01, 5.128e-02, 1.491e-03, 2.211e-01, 4.382e-03, -1.899e-01, -1.419e-02, 1.903e-01, -1.301e-01, 6.635e-02, -2.247e-02, -5.734e-02, 3.099e-01), r);
	r = MulAdd(s3_7, M4(1.511e-01, -1.528e-01, 1.166e-01, 1.147e-01, -3.419e-01, 3.498e-02, 1.819e-02, -1.488e-01, -1.092e-01, -4.590e-02, 2.240e-01, -2.029e-01, 3.950e-02, 2.020e-02, 1.403e-02, 4.349e-01), r);
	r = MulAdd(s3_8, M4(4.921e-02, 8.799e-02, -6.688e-03, 9.084e-02, 3.443e-02, 3.622e-02, 8.327e-02, -2.267e-02, -1.895e-02, -2.932e-02, 8.468e-02, -4.335e-02, -5.082e-02, 8.529e-02, 7.031e-02, 2.080e-01), r);
	return r;
}

// Pass 3 entry point. Each thread handles one texel: it gathers a 3x3
// neighbourhood through the pass-local l0/l1 macros, splits every sample into
// its non-negative part (max(v, 0)) and non-positive part (-max(-v, 0)), and
// writes f0(...) to t0 and f1(...) to t1 at that texel.
//
// blockStart : origin of this thread group's tile, added to the swizzled offset.
// tid        : thread id; only tid.x is used, as the index fed to TileSwizzle8x8.
void Pass3(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep these names: the O() sampling macro refers to them.
	float2 pt = float2(GetInputPt());
	uint2 coord = TileSwizzle8x8(tid.x) + blockStart;
	uint2 extent = GetInputSize();
	// Threads that land outside the input have nothing to do.
	if (coord.x >= extent.x || coord.y >= extent.y) {
		return;
	}
	// Texel centre expressed in units of `pt`.
	float2 pos = (coord + 0.5) * pt;

	// First input (l0). For every tap: sample, keep the non-positive part in
	// s1_k, then clamp the sample itself to its non-negative part.
	V4 s0_0 = l0(-1.0, -1.0);
	V4 s1_0 = -max(-s0_0, 0.0);
	s0_0 = max(s0_0, 0.0);
	V4 s0_1 = l0(0.0, -1.0);
	V4 s1_1 = -max(-s0_1, 0.0);
	s0_1 = max(s0_1, 0.0);
	V4 s0_2 = l0(1.0, -1.0);
	V4 s1_2 = -max(-s0_2, 0.0);
	s0_2 = max(s0_2, 0.0);
	V4 s0_3 = l0(-1.0, 0.0);
	V4 s1_3 = -max(-s0_3, 0.0);
	s0_3 = max(s0_3, 0.0);
	V4 s0_4 = l0(0.0, 0.0);
	V4 s1_4 = -max(-s0_4, 0.0);
	s0_4 = max(s0_4, 0.0);
	V4 s0_5 = l0(1.0, 0.0);
	V4 s1_5 = -max(-s0_5, 0.0);
	s0_5 = max(s0_5, 0.0);
	V4 s0_6 = l0(-1.0, 1.0);
	V4 s1_6 = -max(-s0_6, 0.0);
	s0_6 = max(s0_6, 0.0);
	V4 s0_7 = l0(0.0, 1.0);
	V4 s1_7 = -max(-s0_7, 0.0);
	s0_7 = max(s0_7, 0.0);
	V4 s0_8 = l0(1.0, 1.0);
	V4 s1_8 = -max(-s0_8, 0.0);
	s0_8 = max(s0_8, 0.0);

	// Second input (l1), same treatment: s3_k holds the non-positive part,
	// s2_k ends up as the non-negative part.
	V4 s2_0 = l1(-1.0, -1.0);
	V4 s3_0 = -max(-s2_0, 0.0);
	s2_0 = max(s2_0, 0.0);
	V4 s2_1 = l1(0.0, -1.0);
	V4 s3_1 = -max(-s2_1, 0.0);
	s2_1 = max(s2_1, 0.0);
	V4 s2_2 = l1(1.0, -1.0);
	V4 s3_2 = -max(-s2_2, 0.0);
	s2_2 = max(s2_2, 0.0);
	V4 s2_3 = l1(-1.0, 0.0);
	V4 s3_3 = -max(-s2_3, 0.0);
	s2_3 = max(s2_3, 0.0);
	V4 s2_4 = l1(0.0, 0.0);
	V4 s3_4 = -max(-s2_4, 0.0);
	s2_4 = max(s2_4, 0.0);
	V4 s2_5 = l1(1.0, 0.0);
	V4 s3_5 = -max(-s2_5, 0.0);
	s2_5 = max(s2_5, 0.0);
	V4 s2_6 = l1(-1.0, 1.0);
	V4 s3_6 = -max(-s2_6, 0.0);
	s2_6 = max(s2_6, 0.0);
	V4 s2_7 = l1(0.0, 1.0);
	V4 s3_7 = -max(-s2_7, 0.0);
	s2_7 = max(s2_7, 0.0);
	V4 s2_8 = l1(1.0, 1.0);
	V4 s3_8 = -max(-s2_8, 0.0);
	s2_8 = max(s2_8, 0.0);

	// Both outputs consume the same 36 vectors; only the weights inside f0/f1 differ.
	t0[coord] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);
	t1[coord] = f1(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);
}

//!PASS 4
//!DESC Convolution layer 3 - Feature processing
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0, t1
//!OUT t2, t3

// Pass-4 input taps: fetch t0 / t1 at a (dx, dy) offset through the shared O() macro.
#define l0(dx, dy) V4(O(t0, float2(dx, dy)))
#define l1(dx, dy) V4(O(t1, float2(dx, dy)))

// Pass 4, first output group: the value Pass4 stores into t2.
//
// Starts from a constant 4-component vector and folds in the 36 input vectors
// one at a time with MulAdd(input, M4(...), r), in a fixed order.
// MulAdd comes from the framework (`//!USE MulAdd`); presumably it computes
// mul(input, matrix) + r -- confirm against the framework source.
//
// Parameter naming, as supplied by Pass4:
//   s0_k / s1_k : max(v, 0) and -max(-v, 0) of the t0 sample at tap k
//   s2_k / s3_k : max(v, 0) and -max(-v, 0) of the t1 sample at tap k
//   k = 0..8 walks the 3x3 offsets row by row: (-1,-1), (0,-1), (1,-1),
//   (-1,0), (0,0), (1,0), (-1,1), (0,1), (1,1).
//
// The constants look like generated network weights; do not reorder or
// reformat the accumulation, since that would change floating-point rounding.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
	// Initial accumulator value (presumably the layer bias).
	V4 r = { 1.554e-02, -1.502e-02, 3.403e-02, -3.054e-03 };
	// s0_*: nine taps of t0, non-negative part.
	r = MulAdd(s0_0, M4(-1.303e-01, -2.502e-01, 6.500e-03, 2.601e-01, -3.627e-02, 1.172e-02, -2.737e-02, 1.167e-02, -1.626e-01, -3.004e-02, 2.364e-02, -2.347e-02, -4.186e-02, 1.951e-01, 2.632e-02, 1.146e-01), r);
	r = MulAdd(s0_1, M4(-1.030e-01, -1.744e-01, -7.392e-03, 1.713e-02, 7.134e-02, -2.497e-04, -1.421e-01, 1.345e-01, -1.077e-01, -2.417e-01, 7.019e-02, -3.896e-01, -7.303e-02, -4.439e-03, -4.887e-02, 9.957e-02), r);
	r = MulAdd(s0_2, M4(-3.608e-02, 1.177e-02, 1.276e-01, -1.034e-01, 3.357e-02, 1.218e-01, -6.970e-02, 2.349e-02, -8.838e-03, -1.774e-01, -1.413e-01, -9.342e-02, -4.418e-02, -7.144e-02, 4.315e-02, 6.959e-02), r);
	r = MulAdd(s0_3, M4(-2.867e-01, -2.233e-01, 4.131e-01, 6.114e-01, 7.602e-02, 1.683e-01, -1.640e-01, 5.004e-02, -1.479e-01, -1.302e-01, 1.253e-01, -5.586e-02, 1.083e-02, 2.092e-01, 4.253e-01, 5.892e-02), r);
	r = MulAdd(s0_4, M4(2.991e-01, -2.867e-01, 3.591e-01, 1.815e-01, 1.301e-01, 2.781e-02, 6.110e-02, -7.526e-02, -2.236e-01, -2.383e-01, -1.365e-01, -1.511e-01, -3.541e-02, 2.105e-01, 1.880e-01, -2.169e-01), r);
	r = MulAdd(s0_5, M4(1.310e-01, 6.178e-03, 6.162e-02, 8.473e-02, -4.512e-02, 2.741e-01, -2.587e-01, -1.368e-01, 1.448e-01, -2.241e-01, 1.788e-01, 1.691e-01, -7.749e-02, -6.264e-02, 4.554e-02, 6.527e-02), r);
	r = MulAdd(s0_6, M4(9.763e-02, 4.812e-02, -1.276e-01, -2.642e-01, -2.232e-02, -2.877e-02, -1.684e-01, 1.152e-01, -9.840e-02, 1.224e-01, 3.182e-02, -6.762e-02, 8.287e-02, 3.558e-01, -4.662e-02, -2.938e-01), r);
	r = MulAdd(s0_7, M4(1.670e-01, -1.360e-02, -9.542e-02, -1.691e-01, -1.819e-01, -9.168e-03, 9.087e-02, -7.566e-02, -9.845e-03, 1.043e-01, -1.144e-01, 2.102e-02, -7.557e-02, -2.903e-02, 2.711e-02, 4.163e-02), r);
	r = MulAdd(s0_8, M4(3.748e-02, 7.026e-02, -5.355e-02, -2.032e-01, -1.054e-01, 7.008e-02, -3.887e-03, 9.327e-02, -5.416e-02, -4.324e-02, -8.562e-02, -2.246e-01, 4.115e-02, -5.455e-02, -5.699e-02, 9.352e-02), r);
	// s1_*: nine taps of t0, non-positive part.
	r = MulAdd(s1_0, M4(-1.647e-01, -6.626e-02, -1.987e-01, 1.533e-01, -4.810e-02, -1.097e-02, -1.609e-01, -2.368e-01, 1.036e-02, 5.420e-02, 5.011e-02, 1.669e-01, -6.079e-02, -1.636e-01, -1.879e-03, -2.154e-01), r);
	r = MulAdd(s1_1, M4(-2.303e-01, -1.604e-01, 5.711e-02, 1.264e-01, -3.227e-02, -3.792e-01, -1.013e-01, -1.919e-01, 2.945e-02, 1.079e-01, 2.416e-01, -1.813e-01, -8.157e-02, -2.102e-01, -1.462e-01, 1.594e-01), r);
	r = MulAdd(s1_2, M4(-5.792e-02, -1.373e-02, 1.286e-01, 4.894e-02, -9.753e-02, -2.114e-01, -1.218e-02, -1.496e-01, 4.394e-02, 1.692e-02, -7.685e-02, -1.331e-01, 4.462e-03, -5.792e-02, 1.508e-01, 1.907e-01), r);
	r = MulAdd(s1_3, M4(-3.344e-01, 2.221e-01, 3.409e-01, 2.959e-01, 2.695e-01, -5.308e-02, -2.027e-01, 1.681e-01, -7.817e-02, 1.927e-02, 1.200e-01, -1.694e-01, 4.014e-01, 2.818e-01, 1.119e-01, -2.400e-01), r);
	r = MulAdd(s1_4, M4(-2.868e-02, -7.143e-02, 2.904e-01, 5.075e-02, -1.590e-01, -3.739e-01, 1.705e-01, -1.525e-01, -7.932e-02, -8.959e-02, -1.228e-01, 1.294e-01, 1.669e-01, -2.851e-01, 2.203e-02, -6.589e-01), r);
	r = MulAdd(s1_5, M4(-1.570e-02, 2.385e-02, -1.108e-02, 9.343e-02, -2.494e-01, -3.927e-01, 4.257e-02, 2.139e-01, 1.506e-01, 1.220e-02, 1.834e-01, -9.939e-02, -1.581e-01, 6.915e-02, -1.147e-01, -2.094e-01), r);
	r = MulAdd(s1_6, M4(7.238e-02, 2.064e-02, -8.644e-02, 1.141e-01, 9.264e-02, -1.623e-01, -5.621e-02, 2.077e-03, -1.771e-01, 6.011e-02, 1.249e-01, 2.444e-02, 1.023e-02, 2.457e-02, 1.956e-01, 1.182e-01), r);
	r = MulAdd(s1_7, M4(-1.855e-02, 1.438e-01, -5.501e-02, -2.316e-01, -7.153e-02, -1.020e-02, -1.145e-01, -4.723e-01, 1.101e-01, -2.734e-02, -8.558e-03, 5.762e-01, 8.280e-02, -1.551e-01, 8.434e-02, 2.051e-01), r);
	r = MulAdd(s1_8, M4(1.539e-02, 1.436e-02, 8.719e-03, -6.027e-02, -1.844e-01, -1.623e-01, 8.752e-02, 7.324e-02, 8.527e-02, 6.751e-03, -8.172e-02, 5.645e-02, -9.335e-02, 9.705e-03, -1.859e-03, 2.190e-01), r);
	// s2_*: nine taps of t1, non-negative part.
	r = MulAdd(s2_0, M4(1.063e-02, -5.882e-02, 7.294e-02, 1.135e-01, 6.834e-03, 2.521e-01, 3.334e-02, -7.202e-02, 1.982e-02, 8.106e-04, 5.125e-02, -3.961e-02, 1.600e-02, -1.903e-01, 8.996e-02, 1.905e-01), r);
	r = MulAdd(s2_1, M4(1.227e-01, 1.424e-01, 1.137e-01, 3.396e-02, -7.578e-02, 9.724e-02, -2.649e-02, -7.468e-02, 2.632e-02, -2.053e-01, -2.161e-02, -8.906e-02, -5.289e-02, -1.019e-02, -5.841e-02, -2.570e-02), r);
	r = MulAdd(s2_2, M4(3.359e-02, 5.073e-02, 1.054e-01, -3.593e-03, -7.366e-02, 1.187e-02, 2.007e-03, 2.112e-02, 8.835e-02, 7.359e-02, 2.478e-02, 4.758e-02, -8.954e-02, -2.354e-02, -1.852e-01, 1.836e-01), r);
	r = MulAdd(s2_3, M4(-2.403e-01, -3.137e-01, 4.761e-02, -1.840e-01, 1.012e-01, 5.392e-02, -1.457e-01, -6.425e-02, -6.436e-03, -2.480e-02, 9.199e-02, -8.296e-02, -2.894e-01, 1.054e-01, 1.282e-01, -7.266e-02), r);
	r = MulAdd(s2_4, M4(3.247e-01, -1.603e-01, 6.251e-02, -1.106e-01, -4.083e-02, 2.252e-01, -2.616e-01, 1.397e-01, -1.153e-01, -4.065e-02, 5.923e-02, -4.491e-03, 6.616e-02, -2.146e-01, 6.314e-02, 5.401e-01), r);
	r = MulAdd(s2_5, M4(7.356e-02, 3.269e-03, 1.710e-01, -2.914e-02, -7.138e-02, 2.294e-02, 1.634e-01, 1.120e-01, -7.388e-02, -1.768e-02, -3.813e-02, -3.019e-02, -4.569e-02, -1.257e-01, -1.341e-01, 4.040e-03), r);
	r = MulAdd(s2_6, M4(-1.479e-01, 2.260e-02, 5.173e-02, 1.889e-01, -1.911e-01, -6.621e-02, 1.666e-01, 1.329e-01, -1.320e-01, 3.452e-02, 1.232e-01, -1.545e-02, 7.646e-02, 4.924e-02, -9.582e-02, -1.941e-01), r);
	r = MulAdd(s2_7, M4(1.873e-01, -1.340e-02, -1.278e-01, 9.443e-02, 4.326e-02, 1.467e-01, 7.788e-02, 1.345e-01, 8.715e-02, -1.063e-01, -1.265e-01, 1.358e-01, 3.897e-02, -9.523e-03, -6.335e-02, 2.356e-02), r);
	r = MulAdd(s2_8, M4(-1.881e-02, -5.854e-02, -1.554e-02, -1.388e-01, -5.206e-03, 4.616e-02, -4.462e-02, 8.522e-03, -4.023e-02, -1.133e-01, 8.830e-02, -7.407e-02, -1.715e-02, -3.550e-02, 2.474e-02, 9.114e-02), r);
	// s3_*: nine taps of t1, non-positive part.
	r = MulAdd(s3_0, M4(7.529e-02, 2.230e-01, 1.972e-02, -1.296e-02, 1.146e-01, -6.835e-02, 4.912e-02, 1.397e-02, 1.189e-01, 6.960e-02, -4.406e-02, -7.066e-02, -1.058e-01, 6.757e-02, 1.539e-01, 1.892e-01), r);
	r = MulAdd(s3_1, M4(-6.716e-02, 2.716e-01, 1.435e-01, -2.363e-01, 5.216e-02, -3.579e-02, 1.132e-01, -8.205e-03, 3.452e-01, 1.436e-01, -4.125e-02, 2.391e-01, -1.578e-01, 1.164e-02, 1.621e-01, -7.879e-02), r);
	r = MulAdd(s3_2, M4(-1.167e-01, 1.343e-01, -1.091e-01, -1.064e-01, -1.269e-02, -8.151e-02, 7.006e-02, -3.599e-02, 1.215e-01, 2.535e-02, -8.567e-02, 7.653e-02, 1.470e-02, 1.479e-01, -6.068e-02, 8.576e-02), r);
	r = MulAdd(s3_3, M4(-2.659e-01, 5.315e-02, 8.693e-02, 1.562e-01, 2.764e-01, -4.758e-01, -2.118e-02, -1.585e-01, 1.634e-01, 1.815e-01, 3.742e-02, 1.125e-01, -1.832e-01, 3.244e-01, 1.500e-01, 1.319e-01), r);
	r = MulAdd(s3_4, M4(-7.748e-02, -1.417e-01, -2.502e-02, -5.102e-01, 5.740e-02, -1.569e-01, -1.604e-01, 1.076e-01, 9.692e-02, 1.335e-01, 1.066e-01, 2.489e-01, -1.600e-01, -2.911e-02, 9.011e-04, -1.260e-01), r);
	r = MulAdd(s3_5, M4(-5.218e-02, 1.823e-01, 2.527e-02, 5.142e-02, 3.503e-02, -1.782e-01, 1.947e-01, 1.810e-01, 1.059e-01, 3.631e-02, -1.122e-01, -1.281e-01, -6.201e-02, 5.464e-02, -2.633e-01, -2.567e-01), r);
	r = MulAdd(s3_6, M4(-3.602e-01, 8.443e-02, 3.335e-02, 2.454e-01, 2.898e-02, -5.114e-01, 1.577e-01, 1.251e-01, -6.350e-02, 4.388e-02, 4.373e-02, 3.042e-03, 4.010e-02, 4.499e-02, -8.740e-02, -1.123e-01), r);
	r = MulAdd(s3_7, M4(-1.979e-01, 3.968e-02, -6.856e-02, 9.250e-02, 5.656e-02, -2.908e-01, 4.040e-01, 1.679e-01, 3.232e-01, 2.436e-01, -2.430e-01, 3.446e-01, -1.092e-01, 8.027e-02, 5.210e-03, -1.632e-01), r);
	r = MulAdd(s3_8, M4(-8.864e-02, 5.409e-02, -7.792e-02, -6.164e-02, -3.213e-02, 5.911e-02, -3.344e-02, -1.642e-01, -3.467e-02, 1.005e-01, -4.552e-02, -5.140e-02, -3.020e-02, -5.049e-02, 5.460e-02, 2.551e-01), r);
	return r;
}

// Pass 4, second output group: the value Pass4 stores into t3.
//
// Same structure as the pass-4 f0 (constant start vector, then 36 MulAdd
// accumulations in a fixed order) but with its own set of constants.
// MulAdd comes from the framework (`//!USE MulAdd`); presumably it computes
// mul(input, matrix) + r -- confirm against the framework source.
//
// Parameter naming, as supplied by Pass4:
//   s0_k / s1_k : max(v, 0) and -max(-v, 0) of the t0 sample at tap k
//   s2_k / s3_k : max(v, 0) and -max(-v, 0) of the t1 sample at tap k
//   k = 0..8 walks the 3x3 offsets row by row: (-1,-1), (0,-1), (1,-1),
//   (-1,0), (0,0), (1,0), (-1,1), (0,1), (1,1).
//
// The constants look like generated network weights; do not reorder or
// reformat the accumulation, since that would change floating-point rounding.
V4 f1(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
	// Initial accumulator value (presumably the layer bias).
	V4 r = { -1.519e-02, -1.626e-03, -6.363e-03, 2.015e-02 };
	// s0_*: nine taps of t0, non-negative part.
	r = MulAdd(s0_0, M4(2.210e-01, -8.771e-02, 8.433e-02, -1.834e-02, 4.487e-03, 1.229e-01, 9.812e-02, 1.122e-02, 6.535e-02, 2.964e-02, -5.238e-02, -8.784e-03, 1.608e-01, -3.298e-01, -7.243e-02, -2.594e-02), r);
	r = MulAdd(s0_1, M4(1.251e-01, 2.624e-03, 1.901e-01, -4.362e-02, -2.175e-02, 6.931e-02, 8.964e-03, -1.462e-02, 4.903e-02, 4.953e-03, 4.655e-02, -7.294e-02, 1.417e-01, -8.032e-02, 3.567e-02, -2.951e-02), r);
	r = MulAdd(s0_2, M4(6.153e-02, -1.150e-01, 1.440e-01, -5.493e-02, 4.033e-02, 1.981e-03, -2.974e-02, 2.702e-02, 5.450e-02, -7.033e-02, -2.222e-02, -7.965e-02, 1.456e-02, 2.782e-02, 2.746e-02, 1.040e-02), r);
	r = MulAdd(s0_3, M4(2.015e-01, 3.217e-01, 7.861e-02, -8.683e-03, 3.937e-02, -4.713e-03, -3.380e-02, 2.001e-03, 1.475e-02, 4.002e-02, -1.472e-02, -6.966e-02, 1.221e-01, -1.903e-01, -2.130e-02, 2.612e-02), r);
	r = MulAdd(s0_4, M4(-1.416e-01, 6.991e-02, 5.720e-02, 8.791e-02, 1.773e-01, -3.038e-01, -3.310e-01, -1.970e-01, -1.473e-01, 2.420e-01, 3.486e-02, -5.738e-02, -2.204e-01, -5.488e-02, -2.149e-01, 4.573e-01), r);
	r = MulAdd(s0_5, M4(-4.676e-02, 6.991e-02, 8.723e-02, -4.131e-02, -3.696e-03, -9.530e-02, -2.305e-01, 7.145e-02, 1.770e-01, 1.801e-01, 7.689e-02, -2.927e-01, 2.088e-01, -6.714e-02, 6.918e-02, -8.667e-02), r);
	r = MulAdd(s0_6, M4(2.136e-01, -2.888e-03, 6.237e-02, -2.379e-01, -1.859e-01, -4.943e-02, 9.806e-02, 3.477e-02, 1.773e-01, -1.017e-02, -8.464e-02, -4.892e-02, 1.725e-01, -2.379e-02, -1.774e-01, 6.270e-02), r);
	r = MulAdd(s0_7, M4(9.334e-02, 4.811e-03, 1.178e-01, -8.021e-02, 3.474e-02, -4.447e-02, 2.635e-02, 3.954e-02, 6.511e-02, -1.431e-01, -3.595e-02, -2.550e-01, -1.586e-01, -2.793e-01, -5.439e-02, 1.146e-01), r);
	r = MulAdd(s0_8, M4(-1.065e-02, -2.644e-02, 2.093e-02, 8.023e-02, 3.019e-02, 1.242e-02, -3.142e-02, -1.020e-01, 1.166e-01, -4.816e-02, -7.099e-02, -4.466e-02, 1.021e-01, 1.844e-02, -5.070e-02, 5.314e-02), r);
	// s1_*: nine taps of t0, non-positive part.
	r = MulAdd(s1_0, M4(-4.590e-02, -2.057e-01, -1.242e-01, -3.319e-02, -2.894e-01, 3.706e-01, 1.807e-01, 4.483e-02, -3.240e-03, 6.281e-02, -1.038e-01, 1.921e-02, 2.339e-01, -3.037e-01, 1.067e-01, -7.859e-02), r);
	r = MulAdd(s1_1, M4(-1.438e-01, 1.188e-02, 1.465e-01, -5.182e-02, -1.534e-01, 9.311e-02, 2.122e-01, -5.038e-03, -1.265e-01, -5.506e-03, -3.281e-02, -1.296e-02, -2.746e-02, -1.885e-02, 1.191e-01, -6.267e-03), r);
	r = MulAdd(s1_2, M4(-8.901e-03, -1.093e-01, 8.306e-02, -6.763e-02, -8.580e-02, 1.989e-01, 2.396e-01, 9.082e-02, 6.451e-03, -9.354e-02, -7.606e-02, 4.595e-02, -6.431e-02, 2.274e-01, 2.200e-01, 8.497e-02), r);
	r = MulAdd(s1_3, M4(-8.861e-02, 1.255e-01, 3.949e-02, -6.585e-02, -4.185e-02, 4.627e-01, -3.145e-02, 2.407e-02, -5.978e-02, -7.530e-02, -9.670e-03, -1.027e-02, -9.589e-02, 9.377e-02, 2.228e-01, 2.195e-02), r);
	r = MulAdd(s1_4, M4(-3.904e-01, 3.109e-03, -1.475e-01, 3.837e-01, -1.894e-01, 6.479e-02, -6.395e-02, -1.995e-02, -2.757e-01, 8.815e-02, -7.492e-02, 1.081e-01, -3.250e-01, 2.217e-02, -2.679e-01, 2.825e-01), r);
	r = MulAdd(s1_5, M4(-3.827e-02, -8.505e-02, -4.066e-02, -6.333e-02, -7.558e-02, 2.457e-01, 1.232e-01, -2.482e-01, 3.403e-02, 4.594e-02, -4.212e-03, -1.739e-01, -2.179e-01, -1.252e-01, 2.949e-02, 2.564e-01), r);
	r = MulAdd(s1_6, M4(6.617e-02, -9.007e-02, 8.929e-02, -1.230e-01, -1.098e-01, -1.735e-02, 6.949e-02, -4.675e-02, -6.309e-02, -7.797e-02, -6.646e-02, -5.128e-03, 2.148e-01, 1.261e-01, -1.065e-01, 1.382e-01), r);
	r = MulAdd(s1_7, M4(5.742e-02, -1.194e-01, 8.199e-02, -5.755e-02, -6.747e-03, 2.054e-01, 1.251e-01, 3.603e-02, -1.160e-01, -1.202e-01, -1.329e-01, -1.194e-02, -1.654e-01, -2.280e-02, -2.143e-01, 9.542e-02), r);
	r = MulAdd(s1_8, M4(8.072e-02, -2.271e-02, -2.824e-02, -9.697e-03, -2.783e-01, -2.375e-02, 1.056e-01, -4.308e-02, -9.954e-02, -5.603e-02, 1.340e-02, 6.323e-02, -9.490e-02, 1.534e-01, -9.825e-02, 1.031e-01), r);
	// s2_*: nine taps of t1, non-negative part.
	r = MulAdd(s2_0, M4(-6.657e-02, 1.213e-01, -4.044e-03, -1.773e-02, -1.002e-01, -9.754e-02, -8.548e-02, -7.991e-03, -1.089e-02, 2.650e-02, 7.231e-02, 1.553e-02, 1.862e-02, -4.304e-02, -2.659e-02, -9.413e-02), r);
	r = MulAdd(s2_1, M4(-1.215e-01, 2.919e-01, -1.007e-01, 5.449e-02, -3.993e-02, -2.603e-01, -9.447e-02, 4.317e-02, 1.249e-02, 6.626e-03, 3.549e-02, 2.056e-02, -1.929e-01, 2.127e-01, 1.262e-01, 1.266e-01), r);
	r = MulAdd(s2_2, M4(-1.348e-01, 9.263e-02, 4.868e-02, 3.141e-02, -3.260e-02, -5.119e-02, 1.882e-02, -2.687e-02, 1.175e-01, 7.400e-03, 5.936e-02, -3.558e-02, -1.845e-01, 1.852e-01, 1.900e-01, 3.633e-02), r);
	r = MulAdd(s2_3, M4(-1.500e-01, 5.300e-01, 1.362e-01, 1.648e-02, -2.638e-01, -1.051e-01, -8.124e-02, 6.321e-02, 1.936e-02, -9.783e-03, -2.973e-03, -3.255e-02, -8.121e-03, 1.057e-01, 3.040e-02, 1.927e-03), r);
	r = MulAdd(s2_4, M4(-3.394e-01, -9.811e-02, 1.051e-01, 7.235e-02, 7.202e-02, -4.507e-02, -3.230e-01, 7.066e-02, -9.951e-02, -3.135e-02, -3.983e-02, -1.663e-02, -1.988e-01, -1.760e-01, 7.408e-02, 7.511e-03), r);
	r = MulAdd(s2_5, M4(-1.656e-01, -5.325e-02, 8.364e-02, -1.237e-01, -2.843e-02, -1.214e-01, 2.234e-02, -8.794e-02, -5.301e-02, -9.052e-02, -7.931e-03, 1.106e-01, -5.915e-02, 2.891e-01, 1.154e-02, 4.972e-01), r);
	r = MulAdd(s2_6, M4(-1.144e-01, 1.519e-01, -1.635e-02, -4.138e-02, -2.222e-01, -2.007e-02, -9.045e-02, 7.578e-02, -1.112e-02, 7.288e-02, -6.133e-02, 4.914e-02, -3.029e-02, 6.290e-02, -3.176e-02, -5.098e-02), r);
	r = MulAdd(s2_7, M4(9.470e-03, -3.087e-02, -4.088e-02, -1.636e-01, -1.954e-01, -3.066e-02, -6.252e-01, 3.643e-01, -8.264e-02, -9.051e-02, -4.227e-02, -1.096e-01, -1.828e-01, 9.511e-02, -1.038e-01, 1.368e-01), r);
	r = MulAdd(s2_8, M4(-1.110e-01, 1.403e-02, 6.434e-02, 4.785e-02, -9.627e-02, -9.932e-02, -3.143e-02, 1.898e-01, 5.175e-02, 1.199e-01, 1.578e-01, 1.062e-02, -1.138e-01, 8.965e-02, 4.713e-03, 1.549e-01), r);
	// s3_*: nine taps of t1, non-positive part.
	r = MulAdd(s3_0, M4(3.829e-02, -1.337e-01, -1.104e-03, -7.664e-03, 6.715e-02, 1.739e-01, 1.395e-02, 1.015e-02, -6.929e-02, -4.876e-02, -3.160e-02, 4.958e-03, 3.909e-02, -7.175e-02, 6.043e-02, -1.690e-03), r);
	r = MulAdd(s3_1, M4(-4.348e-02, -3.422e-01, -1.035e-01, 5.749e-02, -2.770e-02, -6.681e-02, 9.643e-02, -3.363e-02, -1.086e-01, 1.263e-01, -3.565e-02, 1.481e-01, 2.105e-01, -1.008e-01, 2.541e-02, -1.773e-03), r);
	r = MulAdd(s3_2, M4(-1.641e-01, -1.775e-01, -3.140e-02, -2.266e-02, 5.426e-02, 1.135e-01, 1.251e-01, -6.043e-02, -1.022e-01, 4.385e-01, 7.205e-03, 9.644e-02, 9.829e-02, -5.496e-02, -2.885e-02, -8.019e-02), r);
	r = MulAdd(s3_3, M4(-1.623e-01, -4.012e-02, 4.193e-02, 5.826e-02, 7.017e-02, 3.917e-02, 1.997e-01, -8.713e-02, -2.845e-02, 8.698e-02, -1.084e-02, 3.293e-02, 3.392e-04, 6.589e-02, -7.126e-02, 2.960e-02), r);
	r = MulAdd(s3_4, M4(-3.154e-01, -1.805e-01, -2.569e-01, 1.802e-02, 5.784e-01, 3.153e-02, 1.085e-01, -8.445e-02, -1.383e-01, 3.813e-01, -4.014e-01, 2.353e-01, 7.655e-02, -5.224e-01, -2.345e-01, -5.558e-02), r);
	r = MulAdd(s3_5, M4(-7.284e-02, -4.643e-01, -1.979e-01, -2.940e-01, 3.934e-02, 1.468e-01, 2.169e-01, -1.865e-01, 3.773e-02, 1.863e-01, -2.202e-01, 3.661e-01, 2.442e-01, -6.375e-02, -1.982e-01, -2.276e-02), r);
	r = MulAdd(s3_6, M4(-6.857e-02, -1.182e-01, -5.203e-02, -4.814e-02, 1.110e-01, 5.270e-02, 1.293e-01, -6.715e-02, -1.529e-01, 1.844e-01, -1.089e-01, 6.857e-02, 1.382e-01, 3.184e-02, 2.557e-02, -2.965e-02), r);
	r = MulAdd(s3_7, M4(1.598e-01, -3.311e-01, -1.057e-01, -4.271e-01, 2.381e-01, -4.726e-02, 1.948e-01, -8.839e-02, -9.368e-02, 1.343e-01, -1.369e-01, 8.824e-02, 7.348e-02, 3.476e-02, -2.949e-02, 6.313e-03), r);
	r = MulAdd(s3_8, M4(1.153e-01, -1.471e-01, -5.845e-02, -2.276e-01, 1.669e-01, -4.299e-02, 7.663e-02, -3.158e-02, -1.171e-01, 1.190e-01, -5.942e-02, 1.771e-01, 1.996e-03, 2.404e-02, -3.151e-02, -3.041e-02), r);
	return r;
}

// Pass 4 entry point. Each thread handles one texel: it point-samples a 3x3
// neighbourhood of t0 (via l0) and t1 (via l1), splits every sample into its
// non-negative part (max(v, 0)) and non-positive part (-max(-v, 0)), and
// writes f0(...) to t2 and f1(...) to t3 at that texel.
//
// blockStart : origin of this thread group's tile, added to the swizzled offset.
// tid        : thread id; only tid.x is used, as the index fed to TileSwizzle8x8.
void Pass4(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep these names: the O() sampling macro refers to them.
	float2 pt = float2(GetInputPt());
	uint2 coord = TileSwizzle8x8(tid.x) + blockStart;
	uint2 extent = GetInputSize();
	// Threads that land outside the input have nothing to do.
	if (coord.x >= extent.x || coord.y >= extent.y) {
		return;
	}
	// Texel centre expressed in units of `pt`.
	float2 pos = (coord + 0.5) * pt;

	// t0 taps. For every tap: sample, keep the non-positive part in s1_k,
	// then clamp the sample itself to its non-negative part.
	V4 s0_0 = l0(-1.0, -1.0);
	V4 s1_0 = -max(-s0_0, 0.0);
	s0_0 = max(s0_0, 0.0);
	V4 s0_1 = l0(0.0, -1.0);
	V4 s1_1 = -max(-s0_1, 0.0);
	s0_1 = max(s0_1, 0.0);
	V4 s0_2 = l0(1.0, -1.0);
	V4 s1_2 = -max(-s0_2, 0.0);
	s0_2 = max(s0_2, 0.0);
	V4 s0_3 = l0(-1.0, 0.0);
	V4 s1_3 = -max(-s0_3, 0.0);
	s0_3 = max(s0_3, 0.0);
	V4 s0_4 = l0(0.0, 0.0);
	V4 s1_4 = -max(-s0_4, 0.0);
	s0_4 = max(s0_4, 0.0);
	V4 s0_5 = l0(1.0, 0.0);
	V4 s1_5 = -max(-s0_5, 0.0);
	s0_5 = max(s0_5, 0.0);
	V4 s0_6 = l0(-1.0, 1.0);
	V4 s1_6 = -max(-s0_6, 0.0);
	s0_6 = max(s0_6, 0.0);
	V4 s0_7 = l0(0.0, 1.0);
	V4 s1_7 = -max(-s0_7, 0.0);
	s0_7 = max(s0_7, 0.0);
	V4 s0_8 = l0(1.0, 1.0);
	V4 s1_8 = -max(-s0_8, 0.0);
	s0_8 = max(s0_8, 0.0);

	// t1 taps, same treatment: s3_k holds the non-positive part,
	// s2_k ends up as the non-negative part.
	V4 s2_0 = l1(-1.0, -1.0);
	V4 s3_0 = -max(-s2_0, 0.0);
	s2_0 = max(s2_0, 0.0);
	V4 s2_1 = l1(0.0, -1.0);
	V4 s3_1 = -max(-s2_1, 0.0);
	s2_1 = max(s2_1, 0.0);
	V4 s2_2 = l1(1.0, -1.0);
	V4 s3_2 = -max(-s2_2, 0.0);
	s2_2 = max(s2_2, 0.0);
	V4 s2_3 = l1(-1.0, 0.0);
	V4 s3_3 = -max(-s2_3, 0.0);
	s2_3 = max(s2_3, 0.0);
	V4 s2_4 = l1(0.0, 0.0);
	V4 s3_4 = -max(-s2_4, 0.0);
	s2_4 = max(s2_4, 0.0);
	V4 s2_5 = l1(1.0, 0.0);
	V4 s3_5 = -max(-s2_5, 0.0);
	s2_5 = max(s2_5, 0.0);
	V4 s2_6 = l1(-1.0, 1.0);
	V4 s3_6 = -max(-s2_6, 0.0);
	s2_6 = max(s2_6, 0.0);
	V4 s2_7 = l1(0.0, 1.0);
	V4 s3_7 = -max(-s2_7, 0.0);
	s2_7 = max(s2_7, 0.0);
	V4 s2_8 = l1(1.0, 1.0);
	V4 s3_8 = -max(-s2_8, 0.0);
	s2_8 = max(s2_8, 0.0);

	// Both outputs consume the same 36 vectors; only the weights inside f0/f1 differ.
	t2[coord] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);
	t3[coord] = f1(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);
}

//!PASS 5
//!DESC Convolution layer 4 - Feature processing
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t2, t3
//!OUT t0, t1

// Pass-5 input taps: fetch t2 / t3 at a (dx, dy) offset through the shared O() macro.
#define l0(dx, dy) V4(O(t2, float2(dx, dy)))
#define l1(dx, dy) V4(O(t3, float2(dx, dy)))

// Pass 5, first output group (Pass5 stores the result in t0).
// Inputs are the 36 vectors prepared by Pass5: for tap index k = 0..8 (3x3 window,
// row-major from (-1,-1) to (1,1)):
//   s0_k = max(t2 tap, 0)      s1_k = -max(-(t2 tap), 0)
//   s2_k = max(t3 tap, 0)      s3_k = -max(-(t3 tap), 0)
// Each input is combined with its own constant 4x4 matrix through MulAdd and chained
// into the accumulator r, which is returned.
// NOTE(review): MulAdd comes from the effect's `//!USE MulAdd` helper; presumably it
// computes mul(s, M) + r -- confirm against the helper's definition.
// The constants are generated weights; keep the statement order unchanged.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
	// Initial accumulator value (presumably the per-channel bias of this layer).
	V4 r = { 7.981e-03, 1.017e-02, -9.502e-03, -2.618e-02 };
	// s0_*: non-negative part of the t2 taps.
	r = MulAdd(s0_0, M4(-4.603e-02, 1.457e-02, 4.018e-02, 1.284e-01, -2.497e-02, 1.041e-02, 8.365e-02, -2.611e-02, 2.223e-02, -8.476e-03, -1.068e-01, -1.092e-02, -2.023e-02, -6.393e-02, 3.540e-02, -3.517e-02), r);
	r = MulAdd(s0_1, M4(4.999e-02, -3.245e-02, 3.582e-03, -1.362e-01, 5.032e-02, 2.189e-03, -1.182e-01, -5.752e-02, -3.133e-02, -4.474e-02, 6.192e-02, -5.508e-02, 2.607e-02, 3.363e-02, -4.378e-02, -6.372e-02), r);
	r = MulAdd(s0_2, M4(-1.613e-01, -8.100e-02, -7.770e-03, 1.425e-02, -1.075e-02, 4.560e-02, -1.378e-02, 2.426e-02, 4.124e-02, 6.632e-02, 1.771e-01, 1.226e-01, -8.028e-02, 3.032e-02, 2.187e-02, -7.814e-03), r);
	r = MulAdd(s0_3, M4(4.616e-02, 1.362e-01, -1.578e-01, -7.459e-02, -1.762e-01, 3.738e-02, -2.270e-02, -1.593e-01, -9.981e-02, 7.580e-02, 1.020e-01, -1.905e-02, -4.813e-02, -3.603e-02, -6.406e-02, 5.516e-02), r);
	r = MulAdd(s0_4, M4(-3.955e-01, 4.862e-03, 4.408e-01, 3.431e-01, 1.827e-01, 4.348e-02, 1.825e-02, 3.428e-02, 9.285e-02, 2.980e-01, -1.655e-01, 1.925e-02, 1.656e-01, -7.685e-02, -2.193e-01, -6.239e-02), r);
	r = MulAdd(s0_5, M4(1.214e-01, -1.069e-02, -2.443e-01, 4.160e-03, -2.923e-03, -3.846e-01, 1.242e-01, 1.638e-02, 1.392e-01, 2.218e-01, 8.119e-02, 6.071e-02, -3.691e-02, -2.882e-01, 1.464e-01, 7.349e-02), r);
	r = MulAdd(s0_6, M4(-1.240e-01, 7.950e-03, 6.145e-02, -8.886e-02, 5.262e-02, 9.563e-02, -2.241e-02, -5.722e-02, -1.371e-02, 4.435e-02, 1.125e-01, 1.029e-01, -1.878e-02, -3.662e-03, 2.522e-02, -6.313e-02), r);
	r = MulAdd(s0_7, M4(-2.502e-02, 1.062e-01, -2.685e-01, 4.436e-02, 2.446e-02, 7.840e-02, -6.193e-02, -7.416e-05, 2.145e-02, 2.132e-01, -1.304e-01, 1.844e-01, -3.718e-04, -8.233e-02, 1.137e-01, -1.111e-02), r);
	r = MulAdd(s0_8, M4(6.311e-02, 1.239e-02, 3.474e-02, 6.830e-02, 7.880e-02, 1.460e-01, 1.436e-02, 5.359e-03, 1.001e-01, 1.663e-01, 3.111e-02, 1.435e-02, 1.698e-02, 1.415e-02, 6.806e-02, 2.488e-03), r);
	// s1_*: non-positive part of the t2 taps.
	r = MulAdd(s1_0, M4(1.176e-02, -4.177e-02, -5.899e-02, -8.043e-02, -5.556e-02, -5.232e-02, 1.104e-01, -1.912e-02, -1.999e-02, 3.132e-02, 3.493e-02, -4.262e-02, 8.312e-02, 9.728e-02, -4.373e-02, 9.260e-02), r);
	r = MulAdd(s1_1, M4(-1.069e-01, 4.645e-02, 2.282e-01, -5.721e-02, -3.651e-02, -3.742e-03, -8.886e-03, -5.040e-02, -5.751e-02, 4.869e-02, 9.988e-03, 1.313e-02, -3.140e-02, 1.363e-01, -9.975e-02, -3.207e-02), r);
	r = MulAdd(s1_2, M4(-1.288e-01, -2.886e-02, 9.747e-02, 2.324e-02, 1.364e-02, -8.983e-02, 5.610e-02, 2.811e-02, -2.433e-02, 2.233e-01, -1.660e-02, 7.249e-02, -4.680e-02, 2.193e-01, -2.173e-01, -8.384e-02), r);
	r = MulAdd(s1_3, M4(-1.302e-03, -2.432e-02, 1.779e-01, 8.593e-02, -1.982e-01, 5.378e-02, -5.749e-03, -3.947e-02, -2.771e-02, 6.767e-02, -1.146e-01, -3.119e-02, 6.422e-02, 9.432e-02, -3.233e-01, 9.318e-02), r);
	r = MulAdd(s1_4, M4(-2.855e-01, 1.026e-01, 2.524e-01, 2.993e-01, 2.177e-01, -1.882e-01, -5.235e-02, -6.145e-03, 2.709e-02, 2.778e-01, 2.826e-01, -2.878e-01, 3.057e-01, 1.621e-01, -2.407e-01, -3.343e-01), r);
	r = MulAdd(s1_5, M4(-3.628e-02, 5.554e-03, 1.063e-01, 4.774e-02, 3.746e-02, -1.545e-01, -4.221e-02, 6.273e-02, 7.253e-02, 5.197e-01, -5.520e-02, -6.371e-02, 1.401e-01, 2.903e-02, -6.322e-02, -1.457e-01), r);
	r = MulAdd(s1_6, M4(-3.768e-02, 8.166e-03, 3.330e-01, 9.689e-02, -3.890e-02, 1.159e-01, 6.738e-02, -8.253e-02, -7.114e-03, -1.422e-02, 1.549e-01, 6.271e-02, -1.100e-02, 2.598e-02, -2.596e-01, -8.673e-02), r);
	r = MulAdd(s1_7, M4(-1.372e-01, 6.481e-02, -1.001e-01, 7.663e-02, 5.323e-02, 1.137e-01, 9.721e-02, 3.063e-01, 7.966e-02, 1.892e-01, -2.452e-01, 8.129e-02, 6.147e-02, -4.174e-02, -8.889e-02, -2.223e-01), r);
	r = MulAdd(s1_8, M4(-7.513e-02, 1.446e-02, 1.697e-01, 7.281e-02, 2.272e-02, 1.891e-01, -2.171e-01, -8.146e-02, 4.419e-02, 9.160e-02, 3.967e-02, 3.398e-02, 1.016e-02, 1.109e-02, -1.280e-02, 6.043e-03), r);
	// s2_*: non-negative part of the t3 taps.
	r = MulAdd(s2_0, M4(-1.125e-02, -4.668e-02, 8.017e-03, -1.144e-01, -1.167e-01, 1.599e-02, 1.427e-01, -8.549e-02, -8.563e-02, -7.150e-03, 1.269e-01, -1.134e-02, -1.111e-02, -9.739e-02, 5.041e-02, -7.451e-02), r);
	r = MulAdd(s2_1, M4(-4.166e-02, -2.516e-02, 9.528e-02, 5.857e-02, 9.985e-02, -1.021e-02, 1.122e-02, 1.325e-02, -3.881e-02, 5.275e-02, -1.425e-01, 2.573e-01, 3.958e-02, -1.059e-01, 1.155e-01, 4.783e-02), r);
	r = MulAdd(s2_2, M4(-9.030e-03, -9.384e-03, 1.231e-01, -2.887e-02, 6.405e-02, -2.676e-02, -7.671e-03, 1.489e-02, -3.528e-02, -2.811e-01, 1.087e-01, 2.132e-02, -4.533e-02, 1.584e-02, 7.144e-02, 3.402e-02), r);
	r = MulAdd(s2_3, M4(1.897e-01, 6.806e-03, -2.573e-01, 1.147e-02, -4.295e-02, 5.574e-02, -1.997e-01, -1.539e-01, 2.371e-01, 8.366e-02, -5.064e-02, 7.570e-02, 2.415e-01, -9.769e-02, 1.666e-02, 1.138e-01), r);
	r = MulAdd(s2_4, M4(4.319e-01, -1.018e-02, -2.282e-01, -6.357e-02, -2.734e-02, -2.508e-01, 7.848e-02, 6.481e-02, 1.501e-01, 1.706e-01, -7.193e-02, -4.792e-01, -7.641e-02, -7.082e-02, -1.236e-01, -6.017e-02), r);
	r = MulAdd(s2_5, M4(-7.273e-02, -2.068e-01, -9.371e-02, -2.801e-02, 1.659e-01, 5.531e-02, -1.428e-02, 2.927e-02, -1.047e-01, -2.108e-01, 1.224e-01, 1.807e-01, -5.600e-02, -1.184e-02, -8.557e-02, 2.045e-02), r);
	r = MulAdd(s2_6, M4(6.962e-02, -9.497e-02, -1.894e-02, 7.397e-02, 4.261e-02, 1.095e-02, 1.084e-01, 8.961e-02, 1.426e-01, 5.959e-02, -2.532e-02, -7.401e-02, 9.025e-02, -2.152e-02, 5.712e-02, -7.736e-02), r);
	r = MulAdd(s2_7, M4(1.248e-01, -1.912e-01, 2.197e-01, -3.152e-02, 1.601e-02, -5.634e-02, 1.397e-01, 3.612e-01, 5.845e-02, 1.990e-01, 1.667e-01, 1.460e-01, -1.803e-02, -1.255e-01, 2.606e-02, -3.095e-02), r);
	r = MulAdd(s2_8, M4(-7.043e-02, -1.660e-01, -1.409e-02, 2.977e-02, 9.866e-02, 6.648e-02, -8.271e-02, -1.705e-02, -5.470e-02, -4.730e-02, -9.331e-02, -9.813e-02, -2.882e-02, -7.116e-02, 3.656e-02, 7.024e-02), r);
	// s3_*: non-positive part of the t3 taps.
	r = MulAdd(s3_0, M4(3.335e-02, -1.035e-01, -4.027e-02, -9.856e-02, 1.123e-02, -4.545e-02, -1.428e-02, -4.436e-02, -1.125e-02, 5.338e-02, 1.272e-01, 7.738e-02, 2.460e-01, 6.170e-02, -2.828e-01, -9.801e-02), r);
	r = MulAdd(s3_1, M4(-1.291e-02, -9.460e-02, 1.647e-01, 3.956e-02, 6.269e-02, -1.042e-01, 1.587e-01, 1.304e-01, 8.502e-03, 3.894e-02, 4.633e-02, 1.400e-01, -1.945e-02, 3.277e-01, 1.464e-01, 1.050e-01), r);
	r = MulAdd(s3_2, M4(-3.341e-02, -1.665e-02, 7.029e-02, 4.459e-02, 5.240e-02, -1.783e-01, 1.174e-01, 7.980e-02, -6.808e-04, 3.650e-02, -9.933e-02, -2.446e-02, -7.836e-02, 1.854e-01, 5.613e-02, -1.825e-02), r);
	r = MulAdd(s3_3, M4(-2.069e-01, -4.267e-02, 2.725e-01, 5.264e-03, -4.267e-02, -9.650e-03, -3.641e-02, -5.011e-02, 1.346e-01, 1.087e-01, -1.854e-01, 1.758e-01, 4.033e-01, -1.891e-01, 1.411e-01, -1.705e-01), r);
	r = MulAdd(s3_4, M4(3.056e-01, -5.330e-01, 5.010e-02, 2.460e-01, -2.118e-02, 1.479e-01, 1.269e-01, 2.600e-01, 9.058e-02, -2.206e-01, 2.627e-02, -2.579e-01, -5.702e-03, -2.285e-01, 2.033e-01, -1.174e-01), r);
	r = MulAdd(s3_5, M4(6.399e-02, -1.645e-01, -1.331e-01, 5.684e-02, 4.928e-02, -2.339e-01, 2.013e-02, -4.284e-02, -2.017e-01, -4.847e-02, 2.648e-02, -1.329e-02, -1.203e-01, -1.062e-01, -7.317e-02, 7.343e-02), r);
	r = MulAdd(s3_6, M4(1.753e-02, -9.153e-03, 9.648e-02, 1.480e-01, 2.131e-02, 6.322e-03, 1.606e-01, 2.550e-01, 1.438e-02, 9.530e-02, -1.749e-02, -3.708e-03, 1.606e-01, 6.072e-02, 1.209e-01, -1.087e-01), r);
	r = MulAdd(s3_7, M4(4.474e-02, -1.546e-01, -1.312e-02, 4.897e-02, 9.376e-02, 8.851e-02, 8.660e-02, -7.482e-03, 1.519e-01, 8.864e-02, 5.592e-02, -1.059e-02, -1.083e-01, -1.601e-01, 2.599e-01, 6.959e-02), r);
	r = MulAdd(s3_8, M4(-1.033e-01, -1.514e-01, 3.964e-02, 5.707e-02, -2.455e-02, -1.163e-01, 2.173e-01, 1.447e-01, -1.753e-02, -3.387e-02, 2.775e-02, -3.772e-02, -1.142e-01, 5.344e-02, -1.796e-01, -1.290e-01), r);
	return r;
}

// Pass 5, second output group (Pass5 stores the result in t1).
// Takes the same 36 inputs as this pass's f0 (s0/s1 = sign-split t2 taps,
// s2/s3 = sign-split t3 taps, index 0..8 = 3x3 window in row-major order) but
// applies its own set of constant 4x4 matrices and its own starting vector.
// NOTE(review): MulAdd comes from the effect's `//!USE MulAdd` helper; presumably it
// computes mul(s, M) + r -- confirm against the helper's definition.
// The constants are generated weights; keep the statement order unchanged.
V4 f1(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
	// Initial accumulator value (presumably the per-channel bias of this layer).
	V4 r = { -1.969e-02, 1.581e-02, -1.910e-02, -1.871e-02 };
	// s0_*: non-negative part of the t2 taps.
	r = MulAdd(s0_0, M4(4.499e-03, -1.196e-02, -1.650e-02, 6.240e-02, -4.222e-02, -1.463e-02, -6.762e-02, -3.560e-02, 1.501e-01, -1.203e-01, 1.276e-01, 3.161e-02, 8.032e-02, -5.788e-02, 3.964e-02, -1.081e-01), r);
	r = MulAdd(s0_1, M4(7.720e-02, -1.253e-01, 1.192e-01, 4.363e-02, 2.300e-03, 3.121e-02, -9.608e-02, -1.262e-01, -1.082e-02, -5.831e-02, 1.486e-01, 1.558e-01, 2.582e-02, -7.162e-02, -1.317e-01, -1.325e-01), r);
	r = MulAdd(s0_2, M4(-3.036e-02, -1.122e-01, -2.954e-02, 4.476e-02, 5.942e-02, 1.889e-02, 2.699e-02, 3.370e-02, 7.309e-02, 7.204e-03, 8.554e-02, -3.684e-02, 2.101e-03, 5.032e-02, 4.602e-02, -4.234e-02), r);
	r = MulAdd(s0_3, M4(1.287e-01, -6.731e-02, -1.535e-01, 6.860e-02, 1.347e-01, -6.583e-02, -2.647e-02, -7.921e-03, 1.514e-01, -4.460e-02, -5.672e-02, -1.412e-01, 2.153e-01, -8.017e-03, 3.155e-03, 9.885e-02), r);
	r = MulAdd(s0_4, M4(1.192e-01, -1.237e-01, -1.446e-01, -2.197e-01, 5.265e-02, -1.721e-02, 1.762e-01, -1.570e-01, -2.728e-01, -9.284e-02, -1.318e-01, -2.472e-01, 2.642e-01, 3.686e-02, 2.810e-01, -7.129e-02), r);
	r = MulAdd(s0_5, M4(1.372e-01, -3.096e-01, 1.258e-01, 1.579e-01, -1.576e-01, 3.233e-01, -7.987e-02, -1.414e-01, -6.258e-02, 8.063e-02, -9.703e-02, 1.087e-01, 2.968e-02, 3.011e-02, 3.135e-01, 5.113e-02), r);
	r = MulAdd(s0_6, M4(-4.781e-02, -1.868e-02, -1.984e-02, 3.647e-02, -7.447e-02, -1.965e-02, -7.801e-02, -2.734e-02, 5.353e-02, 7.222e-02, -2.888e-02, -5.486e-02, 5.216e-02, -2.872e-02, -1.073e-02, 3.894e-02), r);
	r = MulAdd(s0_7, M4(-8.822e-02, 2.075e-01, -9.096e-02, -5.276e-03, 3.016e-01, 9.170e-02, -3.208e-02, 1.088e-01, -1.142e-01, 3.082e-01, -1.133e-01, -2.093e-01, 2.744e-01, 2.188e-02, 2.380e-02, 9.535e-02), r);
	r = MulAdd(s0_8, M4(-1.286e-02, -8.243e-02, 2.199e-02, 2.645e-01, 6.284e-02, 1.616e-02, -5.037e-02, 8.813e-02, -2.444e-02, -8.535e-02, 3.516e-02, 3.767e-02, -8.980e-02, 1.333e-01, 2.389e-02, 4.367e-02), r);
	// s1_*: non-positive part of the t2 taps.
	r = MulAdd(s1_0, M4(-1.972e-02, -4.719e-02, 4.859e-02, -8.123e-03, 7.200e-02, -2.635e-02, -1.962e-02, -6.082e-02, 7.614e-02, 6.818e-03, 6.225e-02, 7.733e-02, -2.530e-02, -2.384e-02, -1.177e-01, -4.357e-02), r);
	r = MulAdd(s1_1, M4(1.055e-01, -8.694e-02, 4.911e-03, -5.235e-02, 2.067e-02, -7.520e-02, -1.095e-01, -2.507e-01, 4.948e-02, -5.254e-02, -7.734e-02, 3.959e-01, -5.456e-02, -8.501e-02, -2.084e-01, -7.064e-02), r);
	r = MulAdd(s1_2, M4(-4.418e-02, -2.485e-03, -9.979e-02, -1.702e-02, -1.339e-01, 2.062e-02, -6.812e-02, -3.193e-01, -8.136e-02, 8.432e-02, -9.392e-02, -7.457e-02, -4.301e-02, -9.250e-02, -3.137e-01, -1.783e-01), r);
	r = MulAdd(s1_3, M4(7.296e-02, 6.799e-03, -8.333e-04, 4.303e-02, 2.939e-01, -1.032e-01, 2.746e-02, 1.708e-02, 1.057e-01, -7.090e-02, -1.024e-01, 1.230e-03, 3.034e-02, -7.408e-02, -1.711e-01, 2.089e-01), r);
	r = MulAdd(s1_4, M4(2.376e-01, -4.740e-01, -2.568e-01, 2.305e-01, 1.795e-01, 3.196e-03, 2.923e-01, -2.949e-01, -2.451e-01, -1.050e-01, 6.901e-03, -1.139e-01, 1.644e-01, -1.711e-03, -2.378e-02, 1.237e-01), r);
	r = MulAdd(s1_5, M4(1.772e-01, -1.673e-01, 2.690e-01, -1.356e-03, 6.326e-03, 1.908e-01, 6.859e-02, -1.588e-02, 5.134e-02, 1.322e-01, -1.345e-01, 2.626e-01, 6.353e-02, -6.034e-03, -4.632e-03, 6.808e-02), r);
	r = MulAdd(s1_6, M4(-1.538e-01, -2.396e-02, 9.343e-02, -9.545e-02, 2.179e-01, -6.521e-02, 3.206e-02, -1.402e-01, 8.921e-02, -1.290e-02, -8.212e-03, -3.336e-03, -1.101e-01, -7.274e-02, -1.222e-01, -3.584e-03), r);
	r = MulAdd(s1_7, M4(-2.568e-01, 3.896e-01, -1.382e-01, -1.016e-01, 6.448e-01, 2.466e-02, -1.558e-01, -1.596e-01, 9.313e-03, 2.957e-01, -1.509e-01, -1.101e-03, 1.536e-01, -6.831e-02, -2.091e-02, -5.669e-03), r);
	r = MulAdd(s1_8, M4(2.950e-02, -3.207e-02, 3.291e-02, 1.807e-02, 2.214e-01, -6.876e-02, -1.293e-01, 2.144e-01, -8.780e-02, -1.075e-01, 1.157e-02, 1.124e-01, -1.431e-01, 7.471e-02, 2.634e-02, -1.739e-01), r);
	// s2_*: non-negative part of the t3 taps.
	r = MulAdd(s2_0, M4(-3.641e-02, -1.460e-01, 3.591e-02, 7.587e-02, -1.678e-02, 3.743e-02, -9.497e-02, -4.368e-02, -7.434e-02, 6.365e-02, -5.406e-02, -1.009e-01, -4.776e-02, -1.209e-01, 1.313e-01, -1.629e-02), r);
	r = MulAdd(s2_1, M4(-2.724e-02, -4.254e-02, -1.241e-02, -1.851e-01, -1.199e-01, 3.992e-02, -1.018e-01, -2.485e-01, 2.846e-02, 3.044e-02, -2.142e-01, -1.481e-01, 5.311e-02, -6.084e-02, 2.746e-02, 2.253e-01), r);
	r = MulAdd(s2_2, M4(2.912e-02, -8.517e-02, 4.616e-02, 2.585e-02, -3.418e-02, 7.831e-02, -6.313e-02, -6.349e-02, -2.185e-01, 4.734e-03, -1.576e-01, -2.940e-01, 3.779e-02, -4.732e-02, -2.341e-02, 9.825e-02), r);
	r = MulAdd(s2_3, M4(-1.595e-01, 1.192e-01, -1.748e-02, 1.593e-01, 2.061e-01, -1.857e-01, 1.617e-01, 1.555e-02, -1.745e-02, 4.421e-02, -4.839e-02, -1.656e-01, -1.701e-01, 2.096e-01, 2.222e-01, -1.145e-01), r);
	r = MulAdd(s2_4, M4(-7.095e-02, 2.911e-01, 1.119e-01, 6.460e-01, -1.009e-01, 2.390e-01, 4.439e-01, -2.016e-01, 3.076e-01, 1.578e-01, -8.802e-02, 2.667e-01, -8.825e-02, -4.537e-01, -2.085e-01, -3.020e-01), r);
	r = MulAdd(s2_5, M4(5.685e-02, -2.135e-01, 3.319e-02, -1.256e-03, 9.093e-03, 2.351e-01, 8.694e-02, 1.826e-02, 1.843e-02, 1.012e-01, 9.358e-02, -1.900e-01, -5.482e-02, -1.073e-01, 2.982e-02, 8.203e-02), r);
	r = MulAdd(s2_6, M4(-2.368e-01, -9.192e-03, 2.210e-02, 1.076e-01, -9.849e-02, 3.512e-02, 1.548e-02, -3.966e-02, 3.754e-01, 1.172e-01, -6.346e-02, 1.927e-01, 1.465e-01, -1.343e-01, 1.473e-01, 1.476e-01), r);
	r = MulAdd(s2_7, M4(-1.001e-01, -2.172e-01, 3.278e-01, 3.514e-01, -2.047e-01, 2.959e-01, -2.066e-01, -2.066e-01, 2.820e-01, 2.278e-01, 3.879e-02, -9.134e-02, 1.094e-01, 3.713e-02, 7.681e-02, 1.422e-01), r);
	r = MulAdd(s2_8, M4(1.533e-01, -9.170e-02, -1.468e-02, -1.166e-01, -1.566e-02, 2.613e-02, -6.614e-02, 1.829e-01, 1.087e-02, 5.517e-02, 6.255e-02, 1.459e-01, 4.011e-02, -6.970e-02, 5.322e-02, -1.088e-02), r);
	// s3_*: non-positive part of the t3 taps.
	r = MulAdd(s3_0, M4(-1.905e-02, -1.010e-01, 1.092e-01, 4.898e-02, -2.340e-02, 3.379e-02, 9.514e-02, 5.005e-03, 3.866e-03, -3.382e-02, -3.593e-02, 2.363e-02, -1.810e-01, 1.544e-01, 6.000e-02, 4.837e-02), r);
	r = MulAdd(s3_1, M4(-1.749e-02, -1.130e-01, 6.763e-02, -4.278e-02, -3.698e-02, 7.479e-02, 1.882e-01, -1.991e-02, 9.916e-02, -3.566e-02, -2.936e-02, -6.237e-03, 9.864e-02, -1.650e-02, 8.939e-02, 6.387e-01), r);
	r = MulAdd(s3_2, M4(1.503e-03, -3.209e-02, 1.688e-02, 9.354e-02, -6.807e-02, 1.355e-01, 1.820e-01, -4.962e-02, 7.103e-02, -2.163e-02, -5.474e-03, 8.376e-02, -4.150e-03, -1.194e-01, -3.023e-02, 3.833e-02), r);
	r = MulAdd(s3_3, M4(5.056e-02, 6.071e-02, 2.378e-01, -1.013e-01, -2.159e-02, 1.218e-01, 2.809e-02, -6.215e-02, -1.733e-01, -3.470e-04, -7.359e-02, 7.317e-03, -5.394e-01, 5.426e-01, 3.015e-01, -3.820e-01), r);
	r = MulAdd(s3_4, M4(-6.020e-02, -2.055e-01, 3.680e-01, -1.910e-01, -8.192e-02, 2.159e-01, -3.798e-02, -2.656e-01, 5.141e-02, 2.568e-01, -3.443e-01, 2.042e-01, -3.336e-01, -6.124e-01, 2.211e-01, -2.492e-01), r);
	r = MulAdd(s3_5, M4(-3.453e-02, -4.960e-02, 7.981e-02, 2.635e-01, -7.613e-02, 2.725e-01, 1.583e-02, -1.533e-01, -6.782e-02, -1.673e-01, -1.790e-01, -1.456e-01, 1.161e-01, -2.882e-01, -4.865e-02, -8.904e-02), r);
	r = MulAdd(s3_6, M4(-6.294e-02, 3.362e-02, 5.988e-02, 2.554e-02, -2.094e-01, 6.535e-02, 6.381e-02, -7.398e-02, 1.185e-01, -8.304e-02, -2.793e-02, -1.274e-01, 6.660e-02, 1.347e-02, 3.585e-02, -4.377e-02), r);
	r = MulAdd(s3_7, M4(5.355e-02, -1.068e-01, -2.474e-02, 2.486e-01, 2.395e-01, -1.376e-01, 5.042e-02, 1.474e-01, 1.529e-01, 8.491e-02, 7.636e-02, -8.829e-03, 1.284e-01, -1.563e-01, 2.101e-01, -1.310e-01), r);
	r = MulAdd(s3_8, M4(4.926e-02, -1.433e-01, -2.143e-02, -6.901e-02, -9.212e-02, 2.929e-01, 2.215e-01, 8.423e-02, 5.067e-02, 5.809e-02, 1.200e-01, 8.764e-02, 9.560e-02, -2.676e-01, -2.228e-01, -9.947e-02), r);
	return r;
}

// Pass 5 entry point: evaluates this pass's f0/f1 for one pixel and stores the
// results in t0 and t1.
//   blockStart - added to the swizzled thread offset to form the pixel coordinate
//   tid        - thread id; only tid.x is used (passed to TileSwizzle8x8)
void Pass5(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep these names: the O() sampling macro refers to them.
	float2 pt = float2(GetInputPt());
	uint2 px = blockStart + TileSwizzle8x8(tid.x);
	uint2 dims = GetInputSize();
	// Threads that fall outside the GetInputSize() extent write nothing.
	if (!(px.x < dims.x && px.y < dims.y)) {
		return;
	}
	// Sample position of this pixel's centre.
	float2 pos = (px + 0.5) * pt;

	// Raw 3x3 window fetched through l0, row-major from (-1,-1) to (1,1).
	V4 a0 = l0(-1.0, -1.0);
	V4 a1 = l0(0.0, -1.0);
	V4 a2 = l0(1.0, -1.0);
	V4 a3 = l0(-1.0, 0.0);
	V4 a4 = l0(0.0, 0.0);
	V4 a5 = l0(1.0, 0.0);
	V4 a6 = l0(-1.0, 1.0);
	V4 a7 = l0(0.0, 1.0);
	V4 a8 = l0(1.0, 1.0);

	// Sign split of the l0 taps: s0_k = max(x, 0), s1_k = -max(-x, 0).
	V4 s0_0 = max(a0, 0.0), s1_0 = -max(-a0, 0.0);
	V4 s0_1 = max(a1, 0.0), s1_1 = -max(-a1, 0.0);
	V4 s0_2 = max(a2, 0.0), s1_2 = -max(-a2, 0.0);
	V4 s0_3 = max(a3, 0.0), s1_3 = -max(-a3, 0.0);
	V4 s0_4 = max(a4, 0.0), s1_4 = -max(-a4, 0.0);
	V4 s0_5 = max(a5, 0.0), s1_5 = -max(-a5, 0.0);
	V4 s0_6 = max(a6, 0.0), s1_6 = -max(-a6, 0.0);
	V4 s0_7 = max(a7, 0.0), s1_7 = -max(-a7, 0.0);
	V4 s0_8 = max(a8, 0.0), s1_8 = -max(-a8, 0.0);

	// Same 3x3 window fetched through l1.
	V4 b0 = l1(-1.0, -1.0);
	V4 b1 = l1(0.0, -1.0);
	V4 b2 = l1(1.0, -1.0);
	V4 b3 = l1(-1.0, 0.0);
	V4 b4 = l1(0.0, 0.0);
	V4 b5 = l1(1.0, 0.0);
	V4 b6 = l1(-1.0, 1.0);
	V4 b7 = l1(0.0, 1.0);
	V4 b8 = l1(1.0, 1.0);

	// Sign split of the l1 taps: s2_k = max(x, 0), s3_k = -max(-x, 0).
	V4 s2_0 = max(b0, 0.0), s3_0 = -max(-b0, 0.0);
	V4 s2_1 = max(b1, 0.0), s3_1 = -max(-b1, 0.0);
	V4 s2_2 = max(b2, 0.0), s3_2 = -max(-b2, 0.0);
	V4 s2_3 = max(b3, 0.0), s3_3 = -max(-b3, 0.0);
	V4 s2_4 = max(b4, 0.0), s3_4 = -max(-b4, 0.0);
	V4 s2_5 = max(b5, 0.0), s3_5 = -max(-b5, 0.0);
	V4 s2_6 = max(b6, 0.0), s3_6 = -max(-b6, 0.0);
	V4 s2_7 = max(b7, 0.0), s3_7 = -max(-b7, 0.0);
	V4 s2_8 = max(b8, 0.0), s3_8 = -max(-b8, 0.0);

	// Both output groups consume the identical 36 inputs.
	t0[px] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);
	t1[px] = f1(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);
}

//!PASS 6
//!DESC Convolution layer 5 - Feature processing
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0, t1
//!OUT t2, t3

// Tap helpers for this pass: l0/l1 fetch t0/t1 through the O() macro (SampleLevel
// with sampler SP at pos + offset * pt) and convert the result to V4.
// (x, y) is the tap offset; it is scaled by pt inside O().
#define l0(x, y) V4(O(t0, float2(x, y)))
#define l1(x, y) V4(O(t1, float2(x, y)))

// Pass 6, first output group (Pass6 stores the result in t2).
// Inputs are the 36 vectors prepared by Pass6: for tap index k = 0..8 (3x3 window,
// row-major from (-1,-1) to (1,1)):
//   s0_k = max(t0 tap, 0)      s1_k = -max(-(t0 tap), 0)
//   s2_k = max(t1 tap, 0)      s3_k = -max(-(t1 tap), 0)
// Each input is combined with its own constant 4x4 matrix through MulAdd and chained
// into the accumulator r, which is returned.
// NOTE(review): MulAdd comes from the effect's `//!USE MulAdd` helper; presumably it
// computes mul(s, M) + r -- confirm against the helper's definition.
// The constants are generated weights; keep the statement order unchanged.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
	// Initial accumulator value (presumably the per-channel bias of this layer).
	V4 r = { 1.622e-02, 8.633e-03, 2.282e-02, -1.091e-02 };
	// s0_*: non-negative part of the t0 taps.
	r = MulAdd(s0_0, M4(-5.629e-02, -1.499e-02, 2.729e-02, 5.240e-02, -2.085e-02, 1.943e-02, 2.721e-01, 1.895e-02, 1.703e-05, -2.185e-03, 5.241e-02, 1.372e-01, 9.478e-02, -1.754e-01, -5.204e-02, -3.875e-02), r);
	r = MulAdd(s0_1, M4(-1.752e-01, 8.544e-03, 2.009e-01, 1.081e-01, -6.113e-02, -1.264e-02, -1.099e-01, 7.721e-02, -8.883e-02, 6.624e-02, 1.543e-01, -1.470e-01, 7.084e-02, 2.041e-02, -1.353e-01, 5.514e-02), r);
	r = MulAdd(s0_2, M4(-8.100e-02, 7.181e-02, 1.823e-02, 1.137e-01, -7.627e-03, 1.157e-02, 6.381e-03, -2.029e-02, -1.793e-01, 2.163e-01, 1.071e-01, 1.975e-01, 4.782e-03, -1.038e-01, -2.722e-02, -2.863e-02), r);
	r = MulAdd(s0_3, M4(-1.987e-01, 2.234e-02, 1.629e-01, 1.411e-01, 1.686e-01, -1.169e-01, -1.153e-01, 5.303e-02, -1.401e-01, 7.446e-02, 1.651e-02, 1.049e-01, 6.766e-02, -4.508e-03, -4.739e-02, -1.971e-02), r);
	r = MulAdd(s0_4, M4(-6.077e-01, -1.132e-01, 5.496e-02, -1.474e-01, -4.083e-02, -2.358e-02, 5.782e-02, -8.903e-03, -2.187e-01, -5.799e-01, -2.327e-01, 3.352e-01, 4.096e-01, 1.543e-01, 2.634e-01, 1.353e-02), r);
	r = MulAdd(s0_5, M4(-1.090e-01, -1.697e-02, -6.169e-02, 8.472e-02, 3.849e-02, -4.439e-02, -2.872e-02, 2.331e-02, -2.056e-01, 2.056e-01, -6.852e-02, 2.337e-03, -6.904e-02, 9.751e-02, 2.718e-02, 2.761e-02), r);
	r = MulAdd(s0_6, M4(-4.874e-02, 2.765e-02, -2.766e-02, 9.790e-02, -3.487e-02, 9.293e-02, 6.253e-02, 2.296e-02, -6.212e-02, 1.142e-01, 5.876e-02, 1.080e-01, -5.299e-02, -8.509e-04, -9.618e-02, -6.980e-03), r);
	r = MulAdd(s0_7, M4(-2.509e-02, -1.062e-01, 4.788e-02, 9.037e-02, 6.434e-03, -4.993e-02, 1.378e-02, -2.191e-02, -2.489e-02, 1.125e-01, -3.071e-02, 1.494e-01, 1.074e-02, -1.213e-02, -6.753e-02, -1.152e-01), r);
	r = MulAdd(s0_8, M4(-6.282e-02, 7.023e-03, -1.272e-02, 4.021e-02, -4.950e-02, 7.900e-02, -3.829e-03, 5.379e-02, -7.884e-02, 4.758e-02, -2.827e-02, 2.264e-02, 3.407e-02, -1.753e-01, 5.579e-03, 3.211e-02), r);
	// s1_*: non-positive part of the t0 taps.
	r = MulAdd(s1_0, M4(2.093e-02, -4.623e-02, -3.265e-02, 1.873e-02, 3.220e-02, -1.627e-01, 5.046e-02, -4.882e-02, -6.228e-02, 2.889e-02, 5.429e-03, 2.945e-02, -4.053e-02, -6.110e-02, -4.289e-02, 1.538e-02), r);
	r = MulAdd(s1_1, M4(1.844e-02, -5.925e-02, -1.384e-01, -1.712e-01, 2.836e-02, -7.524e-02, -1.785e-01, -6.518e-02, -3.521e-02, -1.454e-02, 1.426e-01, -4.671e-02, 4.228e-02, 4.236e-02, -3.918e-02, 3.684e-02), r);
	r = MulAdd(s1_2, M4(-5.506e-02, -1.322e-02, -8.521e-02, 2.606e-02, 2.439e-02, -3.190e-02, 8.334e-03, -1.499e-03, -5.366e-02, -1.778e-02, 5.984e-02, -5.725e-02, 2.785e-02, -3.899e-02, -4.964e-02, 5.202e-02), r);
	r = MulAdd(s1_3, M4(-2.135e-02, 1.172e-02, -3.127e-03, 4.214e-02, -1.414e-01, -7.592e-02, -9.769e-01, 3.319e-01, -3.444e-02, 3.007e-02, -3.268e-02, 6.656e-03, 4.946e-02, 7.209e-02, 5.624e-03, 4.819e-02), r);
	r = MulAdd(s1_4, M4(2.301e-02, -8.416e-03, -1.886e-01, -4.043e-01, 5.243e-02, 8.550e-02, -1.203e-01, -2.122e-01, -8.371e-03, -2.264e-01, -1.174e-01, -1.152e-01, -2.725e-02, 1.957e-01, 5.036e-02, 1.214e-01), r);
	r = MulAdd(s1_5, M4(9.059e-02, -1.391e-01, -2.427e-01, -1.716e-03, 1.048e-01, -9.487e-02, -8.739e-02, 1.464e-02, -5.532e-02, -8.633e-02, -1.491e-01, 1.422e-01, -7.080e-02, 2.580e-01, -1.783e-02, 1.028e-01), r);
	r = MulAdd(s1_6, M4(5.407e-02, -4.321e-02, -6.909e-02, -5.464e-04, 2.971e-02, 6.365e-02, -3.780e-02, 3.135e-04, -1.352e-02, 5.407e-02, -2.180e-02, 1.456e-02, -2.334e-02, -6.107e-03, 3.552e-02, 1.831e-02), r);
	r = MulAdd(s1_7, M4(9.306e-02, -3.273e-01, 6.469e-02, -6.293e-02, 7.617e-02, -5.466e-02, 5.817e-04, -1.038e-01, -8.348e-03, -4.016e-02, 5.163e-02, 5.153e-02, -4.913e-02, -3.528e-02, -1.018e-02, -4.440e-02), r);
	r = MulAdd(s1_8, M4(-1.901e-02, -1.003e-02, 4.926e-03, -5.469e-02, 4.712e-02, 6.210e-02, 6.828e-03, 8.601e-03, -4.973e-02, 1.034e-01, 8.064e-02, -4.831e-02, -8.915e-03, 1.338e-01, -2.552e-02, 1.284e-01), r);
	// s2_*: non-negative part of the t1 taps.
	r = MulAdd(s2_0, M4(1.099e-01, -3.907e-02, -5.627e-02, 3.726e-02, -4.089e-02, 3.905e-02, -7.333e-02, -5.201e-02, 1.199e-01, -3.077e-02, 7.080e-02, -6.518e-02, 5.478e-02, 3.929e-02, 3.922e-02, 5.502e-02), r);
	r = MulAdd(s2_1, M4(-1.650e-01, 1.957e-02, -1.799e-01, 2.149e-01, 8.199e-02, -1.061e-01, -4.542e-02, -1.756e-01, -3.644e-02, -2.350e-02, -1.891e-01, -1.270e-01, 7.510e-02, 6.960e-02, 1.670e-01, -3.395e-01), r);
	r = MulAdd(s2_2, M4(1.296e-01, -1.354e-02, -1.690e-01, 1.539e-01, 4.985e-02, -5.993e-02, 1.355e-03, 1.321e-02, 5.868e-02, -6.005e-03, 7.050e-03, -4.139e-02, 2.584e-02, -5.562e-02, 1.896e-03, -6.420e-02), r);
	r = MulAdd(s2_3, M4(-3.750e-03, 9.356e-02, 8.346e-02, -9.586e-02, 5.356e-02, 1.038e-01, 6.120e-02, 7.898e-02, 2.210e-01, -5.833e-02, -2.771e-01, -6.909e-02, 1.841e-01, -2.505e-02, -1.723e-01, 6.775e-02), r);
	r = MulAdd(s2_4, M4(-1.675e-01, 2.077e-01, 3.639e-01, 1.616e-01, -1.849e-02, -6.565e-02, 2.531e-01, -3.022e-01, -2.568e-01, 1.240e-01, -3.299e-01, 2.133e-01, 2.786e-01, -2.316e-01, -4.497e-02, -2.981e-01), r);
	r = MulAdd(s2_5, M4(-1.918e-01, 1.659e-01, 1.114e-01, 1.974e-01, 3.208e-02, -2.558e-02, -1.265e-02, -2.035e-02, 2.564e-01, -3.624e-02, -6.179e-02, -1.233e-01, -3.454e-02, -7.588e-02, 2.185e-02, -5.032e-02), r);
	r = MulAdd(s2_6, M4(1.514e-02, -9.920e-03, 3.117e-02, -5.511e-02, 4.066e-02, 3.938e-02, -5.380e-03, -5.667e-02, -1.978e-02, -4.687e-02, 1.253e-01, -1.195e-02, -7.667e-03, 5.952e-02, 2.288e-02, 3.814e-02), r);
	r = MulAdd(s2_7, M4(-3.610e-02, -3.112e-02, 3.272e-02, 7.061e-02, 3.913e-02, -1.046e-01, -1.197e-03, 2.646e-02, 9.618e-03, 1.146e-01, 7.210e-02, -1.143e-01, -1.136e-03, 6.210e-02, -1.122e-02, -2.587e-02), r);
	r = MulAdd(s2_8, M4(-7.697e-02, -9.952e-04, -7.539e-03, -1.513e-02, -9.029e-03, 1.408e-02, 2.055e-06, -1.059e-02, 7.768e-02, 3.124e-02, 3.368e-02, -1.155e-02, -1.546e-02, -2.931e-03, 2.106e-02, -3.257e-02), r);
	// s3_*: non-positive part of the t1 taps.
	r = MulAdd(s3_0, M4(7.144e-03, -1.322e-02, -1.944e-01, -1.612e-02, -4.237e-02, 1.456e-01, -5.996e-02, -2.145e-01, 4.330e-02, -3.849e-02, 8.443e-02, -4.098e-03, 1.960e-03, 5.152e-03, 9.973e-02, 8.078e-02), r);
	r = MulAdd(s3_1, M4(1.113e-01, -3.005e-02, 1.199e-01, -4.029e-02, 9.934e-02, -1.540e-01, -3.818e-01, 3.199e-02, -3.314e-02, 3.015e-02, -8.632e-02, 7.195e-02, -8.405e-02, 1.511e-02, -1.364e-01, -4.779e-02), r);
	r = MulAdd(s3_2, M4(-7.198e-02, -9.730e-03, -7.659e-02, -9.198e-02, -1.291e-02, -2.118e-02, -7.500e-02, 3.164e-03, -3.136e-02, 2.908e-02, 6.306e-02, -3.065e-03, 1.294e-04, -2.250e-02, 6.295e-02, 1.235e-02), r);
	r = MulAdd(s3_3, M4(-3.799e-03, 3.301e-02, 1.588e-01, 2.691e-02, 1.612e-01, 2.476e-01, -6.205e-02, 2.352e-01, 9.168e-02, 7.156e-03, 2.315e-02, -2.909e-02, 3.882e-02, 3.297e-02, -1.062e-01, 9.778e-02), r);
	r = MulAdd(s3_4, M4(-1.450e-01, 8.155e-02, 6.351e-02, -7.599e-02, -9.454e-04, -4.971e-01, 1.665e-01, -5.560e-01, -1.282e-01, 1.987e-01, 7.292e-02, 6.917e-02, 2.889e-02, 2.158e-01, 1.941e-01, -1.472e-01), r);
	r = MulAdd(s3_5, M4(-6.303e-03, 4.578e-02, 5.086e-02, -1.684e-02, -5.308e-02, 2.573e-01, 6.806e-02, 2.070e-02, 1.006e-01, -1.195e-01, 9.802e-03, -1.123e-01, 1.097e-02, 6.926e-03, 7.695e-02, 1.168e-01), r);
	r = MulAdd(s3_6, M4(3.211e-02, -1.982e-02, -2.188e-02, -4.794e-02, 2.477e-02, 3.063e-02, 7.311e-02, -9.376e-02, 6.516e-02, -1.927e-02, 4.276e-02, 7.122e-04, -4.200e-02, 2.947e-02, 8.275e-02, 3.763e-02), r);
	r = MulAdd(s3_7, M4(-1.918e-02, -3.916e-03, -6.341e-03, 5.559e-02, 1.027e-01, -7.839e-02, -3.427e-02, 9.477e-03, 1.564e-03, -2.435e-02, -3.752e-02, -4.801e-02, 4.764e-02, -1.354e-01, 1.214e-02, -5.935e-02), r);
	r = MulAdd(s3_8, M4(2.244e-02, -2.103e-02, 2.209e-02, -3.679e-02, -1.259e-02, 8.911e-02, -2.068e-02, -8.888e-03, 3.137e-02, -4.821e-02, -2.994e-03, -7.204e-03, -6.453e-02, -7.934e-02, 7.787e-03, -5.808e-03), r);
	return r;
}

// Pass 6, second output group (Pass6 stores the result in t3).
// Takes the same 36 inputs as this pass's f0 (s0/s1 = sign-split t0 taps,
// s2/s3 = sign-split t1 taps, index 0..8 = 3x3 window in row-major order) but
// applies its own set of constant 4x4 matrices and its own starting vector.
// NOTE(review): MulAdd comes from the effect's `//!USE MulAdd` helper; presumably it
// computes mul(s, M) + r -- confirm against the helper's definition.
// The constants are generated weights; keep the statement order unchanged.
V4 f1(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
	// Initial accumulator value (presumably the per-channel bias of this layer).
	V4 r = { -6.201e-03, -2.350e-02, 4.995e-03, 5.325e-03 };
	// s0_*: non-negative part of the t0 taps.
	r = MulAdd(s0_0, M4(-1.176e-01, 4.590e-02, -1.097e-03, 9.562e-03, -9.896e-02, -4.315e-02, 5.445e-03, 5.506e-02, -1.072e-01, 2.441e-02, 1.076e-02, -7.966e-02, -1.692e-01, 3.632e-02, 4.413e-03, -4.337e-02), r);
	r = MulAdd(s0_1, M4(-1.753e-01, 1.540e-01, 9.123e-02, 1.894e-01, 3.174e-02, 1.360e-01, -9.307e-02, 1.550e-01, -3.076e-01, -7.511e-02, -9.321e-03, 2.660e-01, 1.373e-01, 2.495e-02, 1.941e-01, 2.048e-02), r);
	r = MulAdd(s0_2, M4(-7.759e-02, 7.841e-02, -7.251e-02, 2.212e-01, -2.399e-02, -1.360e-02, -1.016e-02, 2.063e-03, 9.256e-03, -1.288e-02, 8.587e-02, -2.566e-02, 9.888e-02, 8.241e-02, -4.956e-02, -1.440e-01), r);
	r = MulAdd(s0_3, M4(-1.812e-01, 4.365e-02, 1.120e-01, -1.079e-01, -7.096e-02, 1.734e-01, 5.045e-02, -6.046e-02, 7.682e-02, 2.865e-02, -4.742e-03, -2.903e-02, 1.958e-01, 1.039e-02, -4.816e-02, -6.483e-03), r);
	r = MulAdd(s0_4, M4(-2.251e-01, 5.948e-02, 2.301e-01, 3.526e-01, 1.676e-01, 3.284e-01, 7.006e-02, -4.188e-01, -1.429e-01, 4.799e-01, 4.885e-02, 1.954e-01, -1.919e-01, -3.617e-01, -1.317e-01, -1.693e-01), r);
	r = MulAdd(s0_5, M4(5.904e-02, 1.668e-01, 2.446e-01, -6.086e-03, 2.400e-02, 6.791e-03, -1.382e-02, 4.357e-02, -6.655e-02, -2.124e-01, -1.177e-01, 7.355e-02, -5.483e-03, -3.423e-02, 2.298e-01, -8.469e-02), r);
	r = MulAdd(s0_6, M4(2.939e-03, 4.728e-02, 1.274e-01, -9.988e-02, -6.243e-03, -8.422e-03, 1.984e-02, -1.120e-02, -3.664e-02, 1.984e-02, 5.726e-02, -1.255e-01, 1.866e-02, 3.333e-02, -4.936e-02, 4.108e-02), r);
	r = MulAdd(s0_7, M4(-1.240e-02, 1.239e-01, 1.241e-01, 5.387e-02, -1.040e-01, -2.150e-04, -1.027e-02, 1.470e-01, -2.384e-02, -2.690e-02, 2.359e-01, 5.578e-02, 2.181e-02, -2.860e-02, -4.156e-02, 5.233e-03), r);
	r = MulAdd(s0_8, M4(-1.985e-02, 1.875e-02, 2.490e-02, 4.427e-02, 1.119e-02, -1.343e-02, -1.967e-02, 2.461e-02, 1.923e-02, -1.645e-02, 8.968e-02, 1.569e-02, 6.168e-02, 8.987e-02, -7.952e-02, -2.058e-02), r);
	// s1_*: non-positive part of the t0 taps.
	r = MulAdd(s1_0, M4(-7.795e-02, -3.943e-02, -9.027e-02, 4.130e-02, -4.783e-02, 3.800e-02, -1.264e-01, 3.568e-02, -4.561e-02, -5.500e-03, -3.950e-03, 2.980e-02, 1.306e-02, 4.398e-02, -1.101e-01, 6.269e-02), r);
	r = MulAdd(s1_1, M4(7.817e-03, -8.656e-02, -1.610e-01, 1.343e-01, 7.768e-03, 6.015e-02, -2.118e-01, 9.987e-02, -9.086e-02, 3.293e-02, -1.073e-02, 2.126e-02, 9.909e-02, 7.013e-02, -1.638e-02, 1.370e-02), r);
	r = MulAdd(s1_2, M4(1.524e-02, 1.213e-01, -2.121e-01, -7.601e-03, -1.088e-03, -7.068e-03, -3.429e-02, -1.854e-02, -1.256e-01, -5.000e-02, -4.249e-03, 6.956e-02, 1.810e-01, 1.228e-01, -1.183e-01, -2.285e-02), r);
	r = MulAdd(s1_3, M4(-3.223e-02, -3.501e-02, 5.832e-02, -1.608e-01, 3.728e-01, 5.507e-01, -3.432e-02, -1.058e-01, -5.823e-02, 5.583e-03, 4.258e-02, -4.061e-02, 5.032e-02, -1.031e-02, 6.090e-02, -6.812e-02), r);
	r = MulAdd(s1_4, M4(-5.041e-02, -6.582e-01, 1.553e-01, 1.593e-01, 1.460e-01, 2.310e-02, 3.522e-02, -4.788e-01, -1.449e-01, -4.072e-02, 1.242e-01, -1.751e-02, 3.372e-02, -1.114e-01, -5.362e-02, -2.117e-02), r);
	r = MulAdd(s1_5, M4(1.978e-01, 2.884e-01, 5.209e-02, -1.751e-01, 3.227e-02, 1.168e-02, -9.727e-03, 4.108e-02, 2.009e-02, 7.821e-02, 2.300e-01, 1.131e-01, 7.034e-02, 5.115e-02, -2.973e-01, 1.802e-01), r);
	r = MulAdd(s1_6, M4(1.958e-02, 2.796e-02, 6.192e-02, -5.750e-02, -6.072e-02, 8.721e-02, 4.716e-02, 4.567e-02, -4.567e-02, -2.416e-02, 2.033e-02, -3.503e-02, 3.544e-03, 2.999e-02, -1.305e-02, 3.346e-02), r);
	r = MulAdd(s1_7, M4(8.872e-03, 8.004e-02, -7.152e-02, 4.066e-02, -3.603e-02, -9.614e-02, 3.228e-02, 9.803e-03, -6.331e-02, -4.420e-02, 1.642e-01, -1.447e-02, 4.909e-02, 5.909e-02, -8.621e-02, 1.548e-01), r);
	r = MulAdd(s1_8, M4(-1.711e-02, 5.139e-02, -7.837e-02, -7.919e-03, -1.636e-02, -4.681e-03, 2.828e-03, 2.137e-02, -1.023e-01, -7.483e-03, 7.474e-02, 1.323e-01, 1.304e-01, 2.446e-02, -9.139e-02, -9.162e-02), r);
	// s2_*: non-negative part of the t1 taps.
	r = MulAdd(s2_0, M4(7.268e-02, 3.978e-02, -1.480e-01, 1.105e-01, 1.312e-01, 1.599e-02, -4.579e-02, 9.830e-02, 5.525e-02, -7.153e-02, 4.305e-02, -3.304e-02, -4.484e-02, -7.667e-05, -1.446e-02, 3.995e-03), r);
	r = MulAdd(s2_1, M4(1.060e-01, 7.116e-02, -2.697e-01, -2.968e-01, 7.944e-02, -1.363e-03, 7.398e-02, -3.584e-01, -2.039e-02, -3.787e-02, -2.126e-01, 3.468e-02, -3.340e-01, -2.642e-01, -1.053e-01, 4.584e-02), r);
	r = MulAdd(s2_2, M4(8.515e-02, 7.926e-02, -1.782e-01, 3.409e-01, -4.535e-02, 6.948e-02, 4.167e-03, -8.690e-02, -1.100e-03, -1.622e-02, -2.110e-02, -4.446e-03, -2.315e-02, 1.463e-03, 4.195e-03, -7.673e-02), r);
	r = MulAdd(s2_3, M4(7.574e-02, -6.551e-02, -7.931e-02, 1.847e-02, -1.222e-01, -2.567e-02, 1.781e-02, 6.826e-02, 1.751e-01, 6.480e-02, 9.014e-02, 5.389e-02, -6.614e-02, 1.289e-01, 3.795e-02, 7.552e-02), r);
	r = MulAdd(s2_4, M4(1.655e-01, 1.347e-01, -4.840e-02, -4.500e-02, -2.469e-01, -1.613e-02, 9.173e-02, 2.510e-02, 6.241e-01, 9.625e-02, 3.608e-01, -9.156e-02, -3.869e-01, -2.592e-01, 3.916e-01, -2.647e-01), r);
	r = MulAdd(s2_5, M4(3.298e-02, 2.479e-01, 9.145e-02, 1.734e-01, 1.213e-02, 3.894e-02, -4.250e-02, 1.374e-01, 1.559e-01, -3.204e-02, -1.421e-01, -2.360e-01, -1.211e-01, -1.204e-02, 1.006e-01, 1.015e-01), r);
	r = MulAdd(s2_6, M4(-3.940e-02, -3.056e-02, 9.430e-03, -2.775e-03, -1.404e-03, -5.216e-02, 1.604e-02, -2.766e-02, 3.187e-02, 9.107e-02, -1.513e-01, 1.491e-01, 2.200e-02, -5.174e-03, -4.308e-02, -1.033e-02), r);
	r = MulAdd(s2_7, M4(-4.013e-02, 3.924e-02, 3.782e-02, 2.474e-02, 6.905e-02, -1.493e-02, 7.994e-04, -1.260e-01, -3.816e-02, 8.267e-02, -2.321e-01, -4.363e-03, 5.037e-02, -9.344e-02, -3.923e-02, 2.305e-02), r);
	r = MulAdd(s2_8, M4(-9.598e-02, 6.235e-02, -3.485e-02, 1.507e-01, 2.140e-02, -1.713e-02, -2.150e-02, 5.550e-03, -7.057e-02, -4.752e-02, -8.683e-02, 9.942e-02, 1.652e-02, -2.513e-03, 6.316e-03, -1.445e-02), r);
	// s3_*: non-positive part of the t1 taps.
	r = MulAdd(s3_0, M4(7.494e-03, 6.167e-02, -2.229e-02, 9.588e-02, 2.098e-01, -2.744e-01, -1.121e-01, -1.093e-01, -3.600e-02, 7.007e-03, 6.640e-02, -6.241e-02, -2.350e-02, -2.487e-02, -8.227e-03, -9.150e-02), r);
	r = MulAdd(s3_1, M4(3.763e-02, 2.717e-02, 2.647e-01, -3.230e-01, 4.209e-01, 3.723e-01, -1.271e-01, -1.294e-01, -3.973e-02, -2.565e-02, -4.823e-02, 2.920e-01, -3.086e-02, 2.765e-02, -8.713e-02, 1.211e-01), r);
	r = MulAdd(s3_2, M4(-8.572e-02, -2.810e-03, -4.112e-02, 1.389e-01, 1.504e-03, -1.636e-03, -4.064e-03, 7.007e-02, -6.103e-02, -9.792e-03, 6.503e-02, -2.098e-02, -1.478e-02, -1.370e-02, 5.957e-02, -2.460e-02), r);
	r = MulAdd(s3_3, M4(-7.273e-02, -7.381e-02, -1.120e-02, 2.998e-02, -1.515e-01, 2.393e-02, 1.901e-01, -2.623e-02, 9.012e-02, 3.699e-02, -5.944e-02, 5.235e-02, -4.027e-02, 1.674e-01, -4.495e-03, 3.007e-02), r);
	r = MulAdd(s3_4, M4(-1.656e-01, -2.356e-02, -1.028e-01, 1.295e-01, -4.564e-01, -2.254e-01, 2.951e-01, -1.610e-02, 1.831e-01, 2.144e-01, -1.461e-01, -4.598e-02, -2.185e-01, -2.248e-01, 5.653e-01, -4.681e-01), r);
	r = MulAdd(s3_5, M4(-4.814e-02, 6.573e-02, 2.865e-02, 1.553e-01, -7.495e-02, -4.398e-02, -1.738e-01, 1.548e-01, -4.272e-03, 3.478e-02, -2.813e-02, -9.149e-02, -7.011e-02, -3.629e-03, 6.780e-02, 8.682e-02), r);
	r = MulAdd(s3_6, M4(-4.552e-03, -7.880e-03, 3.240e-02, -4.527e-02, -4.243e-02, -3.649e-02, -3.803e-02, 6.218e-02, 4.543e-03, 1.675e-02, -1.154e-02, 3.727e-03, -6.186e-02, 3.390e-02, 2.030e-02, 3.279e-02), r);
	r = MulAdd(s3_7, M4(3.833e-02, -3.550e-04, 4.441e-02, -7.695e-02, 1.958e-01, -5.135e-02, -1.626e-01, -1.099e-01, 7.221e-02, 3.797e-02, -8.113e-02, -1.506e-01, -1.960e-01, 7.023e-02, 4.237e-02, 1.152e-01), r);
	r = MulAdd(s3_8, M4(-4.346e-02, -3.773e-03, 6.954e-02, 6.227e-03, 7.212e-02, -4.309e-02, -1.550e-02, -9.249e-02, -1.252e-02, -3.851e-03, -1.582e-02, -2.023e-02, -3.647e-02, 6.547e-02, 5.062e-02, 4.362e-02), r);
	return r;
}

// Pass 6 entry point. Each thread handles one texel of the input-sized grid:
// it reads the 3x3 neighbourhood through the l0 and l1 macros, splits every
// tap into a non-negative and a non-positive half, and writes f0/f1 of those
// 36 vectors to t2 and t3.
void Pass6(uint2 blockStart, uint3 tid) {
	const float2 pt = float2(GetInputPt());
	const uint2 texel = TileSwizzle8x8(tid.x) + blockStart;
	const uint2 extent = GetInputSize();
	if (any(texel >= extent)) {
		return;
	}
	// `pos` and `pt` must keep these names: the sampling macros refer to them.
	const float2 pos = (texel + 0.5) * pt;

	// Index k walks the window row by row: k = 0 is offset (-1, -1),
	// k = 4 is the centre texel and k = 8 is offset (1, 1).
	V4 posA[9];
	V4 negA[9];
	V4 posB[9];
	V4 negB[9];
	[unroll]
	for (uint k = 0; k < 9; ++k) {
		const float dx = float(k % 3) - 1.0;
		const float dy = float(k / 3) - 1.0;
		const V4 a = l0(dx, dy);
		const V4 b = l1(dx, dy);
		negA[k] = -max(-a, 0.0);
		posA[k] = max(a, 0.0);
		negB[k] = -max(-b, 0.0);
		posB[k] = max(b, 0.0);
	}

	t2[texel] = f0(
		posA[0], posA[1], posA[2], posA[3], posA[4], posA[5], posA[6], posA[7], posA[8],
		negA[0], negA[1], negA[2], negA[3], negA[4], negA[5], negA[6], negA[7], negA[8],
		posB[0], posB[1], posB[2], posB[3], posB[4], posB[5], posB[6], posB[7], posB[8],
		negB[0], negB[1], negB[2], negB[3], negB[4], negB[5], negB[6], negB[7], negB[8]);
	t3[texel] = f1(
		posA[0], posA[1], posA[2], posA[3], posA[4], posA[5], posA[6], posA[7], posA[8],
		negA[0], negA[1], negA[2], negA[3], negA[4], negA[5], negA[6], negA[7], negA[8],
		posB[0], posB[1], posB[2], posB[3], posB[4], posB[5], posB[6], posB[7], posB[8],
		negB[0], negB[1], negB[2], negB[3], negB[4], negB[5], negB[6], negB[7], negB[8]);
}

//!PASS 7
//!DESC Convolution layer 6 - Feature processing
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t2, t3
//!OUT t0, t1

// Tap readers for pass 7: point-sample t2 (l0) or t3 (l1) at pos + (x, y) * pt
// through the O() macro. They only work where locals named `pos` and `pt` are
// in scope, because O() refers to both by name.
#define l0(x, y) V4(O(t2, float2(x, y)))
#define l1(x, y) V4(O(t3, float2(x, y)))

// Pass 7, first output vector. Starts from a constant 4-component offset and
// folds in one MulAdd per input vector, in argument order, each paired with
// its own hard-coded M4 constant matrix. The accumulation order is fixed;
// reordering the statements could change the floating-point result.
//
// Inputs as supplied by Pass7: sK_0..sK_8 are the nine taps of a 3x3 window in
// row-major order (sK_0 = offset (-1,-1), sK_4 = centre, sK_8 = offset (1,1)).
//   s0_*: t2 taps as max(v, 0)        s1_*: t2 taps as -max(-v, 0)
//   s2_*: t3 taps as max(v, 0)        s3_*: t3 taps as -max(-v, 0)
// Returns the accumulated vector; Pass7 stores it in t0.
//
// NOTE(review): MulAdd is the helper requested in the effect header and is not
// defined in this section; presumably it computes mul(s, M) + r -- confirm.
// The constants look like generated network weights; avoid hand edits.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
	// Constant starting value of the accumulator.
	V4 r = { -3.105e-03, 3.230e-03, 4.884e-04, 1.938e-03 };
	// s0_*: non-negative half of the t2 taps.
	r = MulAdd(s0_0, M4(-1.952e-02, -2.301e-02, -3.736e-04, -1.581e-02, 7.341e-02, -3.569e-02, 5.963e-03, 1.284e-01, 6.945e-03, 1.208e-02, -2.852e-02, 1.210e-02, -5.234e-02, 3.450e-02, 1.034e-02, 9.300e-04), r);
	r = MulAdd(s0_1, M4(-1.048e-01, 4.626e-02, 1.610e-02, -1.378e-02, -1.910e-01, 2.419e-02, 7.012e-01, -2.998e-01, -8.037e-02, -4.437e-02, -3.772e-02, -1.620e-02, -3.646e-01, -2.323e-02, 1.019e-02, 3.517e-02), r);
	r = MulAdd(s0_2, M4(-9.399e-02, -3.329e-02, 1.219e-02, -8.421e-03, -3.055e-02, 3.756e-02, 3.237e-02, 1.406e-01, -1.392e-02, -1.962e-02, 1.136e-01, 2.379e-02, -6.186e-03, -4.576e-02, -4.557e-02, -1.548e-01), r);
	r = MulAdd(s0_3, M4(1.638e-02, 8.104e-02, -3.369e-02, 2.663e-02, 4.984e-02, 2.547e-01, -3.319e-02, 1.098e-01, -8.594e-03, 4.489e-03, 9.319e-03, -1.642e-02, 3.309e-02, -2.018e-02, -4.158e-03, -2.341e-02), r);
	r = MulAdd(s0_4, M4(4.271e-02, -2.245e-01, 5.310e-02, -1.032e-01, 2.033e-02, -5.645e-01, -1.033e-01, -1.308e-01, 1.039e-01, -5.559e-02, 2.827e-02, -1.009e-01, 3.317e-01, -1.548e-01, -6.898e-02, -2.608e-01), r);
	r = MulAdd(s0_5, M4(5.093e-02, 1.337e-01, 2.877e-02, 1.042e-01, -1.935e-02, 5.340e-02, 3.398e-02, -1.656e-02, 1.024e-02, 2.018e-01, -5.609e-02, 2.052e-01, -1.878e-02, -1.618e-01, 3.110e-02, -1.025e-01), r);
	r = MulAdd(s0_6, M4(-1.982e-03, -4.609e-02, 2.637e-02, -9.266e-03, 2.498e-04, 2.218e-02, -1.013e-02, 2.184e-02, 5.965e-03, 1.045e-02, 1.644e-03, 2.716e-02, 3.749e-03, -3.710e-02, -1.464e-02, -1.227e-02), r);
	r = MulAdd(s0_7, M4(-1.146e-02, -3.264e-03, -1.180e-02, -2.658e-02, -2.606e-02, -3.221e-02, -1.237e-02, -2.961e-02, 9.372e-03, -4.673e-02, -5.785e-04, -4.285e-02, 1.433e-03, -7.620e-02, -2.407e-03, 2.959e-02), r);
	r = MulAdd(s0_8, M4(-3.948e-04, 6.024e-02, 9.655e-04, 2.124e-02, -1.357e-02, -2.965e-02, -1.354e-03, -6.947e-03, 3.782e-02, 4.020e-02, -2.875e-02, 3.749e-02, -5.428e-03, -3.603e-02, -3.256e-02, -2.771e-02), r);
	// s1_*: non-positive half of the t2 taps.
	r = MulAdd(s1_0, M4(3.111e-03, -4.528e-02, 2.406e-02, -3.269e-02, 2.000e-02, -5.895e-02, 3.012e-02, -2.212e-02, 1.071e-01, 1.928e-02, -3.492e-02, 1.053e-01, -6.362e-02, 4.470e-02, 2.578e-02, -1.040e-02), r);
	r = MulAdd(s1_1, M4(1.134e-01, 6.078e-02, -4.550e-03, -7.141e-03, -1.449e-01, 7.662e-02, 1.104e-01, -3.543e-02, 4.676e-01, 1.248e-02, -3.675e-01, -1.655e-01, 7.049e-02, 2.875e-02, -3.009e-02, 9.027e-02), r);
	r = MulAdd(s1_2, M4(-5.208e-02, -3.105e-02, -3.721e-02, 1.006e-01, -3.864e-02, 2.810e-02, 1.604e-02, 8.082e-02, 8.273e-02, 7.069e-02, -2.601e-02, 8.756e-02, -2.213e-02, -3.431e-02, -3.394e-02, -4.138e-02), r);
	r = MulAdd(s1_3, M4(6.901e-02, -1.813e-02, -6.033e-02, -5.048e-02, 1.642e-02, 6.266e-02, -4.282e-02, 2.908e-02, -5.339e-02, 2.241e-02, -7.009e-02, -2.048e-02, 5.016e-02, 1.243e-02, -2.653e-02, 8.097e-02), r);
	r = MulAdd(s1_4, M4(-8.214e-03, -6.216e-02, 3.436e-02, 3.719e-02, 1.412e-01, -1.256e-01, -9.253e-02, 1.326e-01, 8.852e-01, -4.551e-02, -7.485e-01, -1.673e-01, -9.588e-03, -1.411e-01, -4.338e-02, -4.232e-01), r);
	r = MulAdd(s1_5, M4(7.913e-02, 5.560e-01, 9.992e-02, 5.410e-01, -1.700e-03, -1.288e-02, -3.230e-02, -8.186e-02, 1.255e-01, 4.736e-02, -5.084e-01, 7.951e-02, 1.181e-01, -8.278e-02, 3.674e-02, -1.468e-01), r);
	r = MulAdd(s1_6, M4(-6.616e-03, -4.503e-02, 2.533e-02, -1.964e-02, -8.381e-03, -6.342e-03, 6.659e-03, -1.064e-02, 4.090e-02, 7.180e-02, 5.233e-02, 5.864e-03, -1.308e-03, 5.536e-03, -2.743e-03, -1.219e-02), r);
	r = MulAdd(s1_7, M4(2.071e-02, 7.370e-02, 2.681e-04, 9.920e-03, -2.067e-02, 1.362e-01, 1.855e-02, 5.200e-04, 3.511e-01, -2.001e-01, -1.940e-01, 9.383e-02, -3.934e-02, -1.915e-01, 2.669e-02, 1.094e-02), r);
	r = MulAdd(s1_8, M4(-2.372e-02, 1.062e-01, -1.496e-02, 1.175e-02, -1.135e-02, 6.667e-03, 3.389e-03, 1.929e-02, -1.125e-02, 1.674e-01, -3.336e-02, -6.202e-02, 6.653e-03, 1.235e-02, -1.633e-02, 5.038e-02), r);
	// s2_*: non-negative half of the t3 taps.
	r = MulAdd(s2_0, M4(-3.964e-02, -3.119e-03, -9.673e-03, -6.366e-03, 8.531e-02, -3.670e-02, -5.924e-03, 1.261e-02, -6.665e-02, -1.278e-02, -1.696e-02, -2.727e-02, -3.882e-02, 1.510e-03, 2.840e-02, -1.340e-02), r);
	r = MulAdd(s2_1, M4(-1.639e-01, -1.744e-02, -5.910e-03, -1.800e-02, 4.918e-02, 2.777e-02, 4.527e-02, -1.255e-01, -1.224e-01, -4.651e-02, 3.581e-02, -2.700e-03, -6.049e-02, 2.921e-02, 4.759e-02, -3.412e-03), r);
	r = MulAdd(s2_2, M4(-1.017e-01, 3.087e-02, 4.443e-01, 7.643e-02, -5.113e-02, 3.162e-02, -2.544e-02, 1.116e-01, 2.750e-03, 6.519e-02, -1.121e-02, 5.005e-02, -6.372e-02, -2.334e-02, 6.144e-02, 6.538e-02), r);
	r = MulAdd(s2_3, M4(2.036e-01, -2.519e-02, -1.132e-02, -6.775e-02, -1.205e-02, 2.868e-02, -1.777e-02, -1.051e-02, 7.531e-02, -8.572e-02, 5.924e-02, -8.820e-02, 1.963e-01, -4.408e-02, -4.202e-02, -4.132e-02), r);
	r = MulAdd(s2_4, M4(-4.319e-01, 2.154e-02, 4.534e-01, 1.691e-02, -5.194e-02, 8.830e-03, 4.441e-02, 2.165e-01, -4.723e-02, 2.877e-01, 2.503e-02, 2.071e-01, 2.739e-01, 1.595e-01, -1.141e-01, 4.121e-01), r);
	r = MulAdd(s2_5, M4(3.627e-01, 8.034e-02, 2.531e-01, 1.357e-01, -5.579e-03, -5.171e-02, 4.784e-02, -1.580e-01, -1.597e-03, -2.143e-02, 1.145e-02, -2.409e-02, 1.135e-01, 1.942e-02, -2.637e-02, -1.371e-01), r);
	r = MulAdd(s2_6, M4(5.398e-02, 8.680e-03, 1.624e-02, -2.777e-02, -1.992e-02, 9.359e-03, 2.106e-02, 1.651e-02, -3.371e-02, -3.989e-02, 2.268e-02, -1.706e-02, 7.451e-03, -3.325e-02, -2.215e-02, -1.584e-02), r);
	r = MulAdd(s2_7, M4(1.898e-02, -7.606e-02, -8.297e-02, 5.946e-03, -2.822e-02, 1.434e-02, 6.815e-03, -7.690e-02, -2.483e-02, 1.220e-02, 2.584e-02, -1.723e-02, -1.801e-02, 2.864e-01, 2.826e-02, -1.121e-02), r);
	r = MulAdd(s2_8, M4(5.351e-02, 1.077e-01, -6.458e-02, 6.860e-02, -1.110e-02, 3.929e-02, 1.543e-02, 6.122e-02, -8.551e-03, 4.159e-02, 2.833e-03, 3.752e-02, 2.189e-02, 2.252e-04, -1.454e-02, 8.179e-02), r);
	// s3_*: non-positive half of the t3 taps.
	r = MulAdd(s3_0, M4(-1.542e-02, -7.969e-03, -1.764e-02, 1.333e-02, 1.272e-01, -2.018e-02, -1.554e-02, 1.398e-02, -6.583e-02, 4.630e-02, -5.456e-02, -5.594e-02, -1.606e-01, -2.573e-02, 4.592e-02, -4.465e-02), r);
	r = MulAdd(s3_1, M4(-4.177e-02, -3.885e-02, -5.615e-02, -3.797e-02, -5.923e-02, -2.316e-02, 1.191e-02, -1.548e-01, -6.239e-01, -8.157e-02, 1.527e-01, 8.326e-02, -2.888e-01, -6.520e-02, 3.209e-02, 1.560e-02), r);
	r = MulAdd(s3_2, M4(-2.764e-03, 2.741e-02, 5.627e-02, 1.597e-02, 1.443e-02, 4.968e-02, 1.132e-02, 9.153e-02, -5.382e-03, -1.644e-02, 5.247e-03, 3.311e-02, -4.528e-02, 2.605e-02, 4.794e-02, 7.640e-02), r);
	r = MulAdd(s3_3, M4(2.045e-02, -1.614e-03, 2.289e-03, -4.305e-02, -1.017e-01, 5.861e-02, -1.314e-02, 2.477e-02, -1.999e-01, -1.155e-01, 1.294e-01, -2.392e-02, 1.875e-01, 2.845e-02, -3.870e-02, 5.380e-02), r);
	r = MulAdd(s3_4, M4(5.933e-02, -1.103e-02, 7.176e-02, -3.689e-02, 5.613e-02, -3.270e-01, 3.240e-02, -2.858e-01, -1.259e+00, 2.597e-01, 3.525e-01, 1.430e-02, 1.284e-01, 8.753e-02, 7.569e-02, -5.311e-02), r);
	r = MulAdd(s3_5, M4(2.279e-02, 1.895e-01, -6.586e-02, 1.515e-01, -2.923e-02, -1.046e-01, 7.716e-03, -1.229e-01, 2.289e-02, -7.755e-02, 1.265e-01, 8.574e-02, 1.189e-02, 6.164e-02, -7.733e-03, 5.218e-03), r);
	r = MulAdd(s3_6, M4(-8.498e-03, -3.724e-02, 2.832e-02, -7.279e-03, -3.663e-02, 2.402e-02, 2.821e-02, 1.569e-02, 2.319e-02, 2.036e-02, 2.252e-02, -1.611e-02, 9.843e-03, -3.099e-02, 3.160e-03, -5.084e-02), r);
	r = MulAdd(s3_7, M4(-2.465e-02, 1.290e-02, 2.281e-02, -5.200e-04, -6.553e-02, -1.768e-01, 9.698e-03, -6.422e-02, -1.386e-02, -1.391e-01, -5.888e-02, 5.580e-03, 2.841e-02, 2.990e-02, -5.399e-02, 5.650e-03), r);
	r = MulAdd(s3_8, M4(-4.902e-03, 9.385e-03, -8.622e-03, 1.815e-02, -8.117e-03, 4.919e-02, 1.259e-02, 4.528e-02, 2.606e-02, 4.409e-02, -1.766e-02, 1.950e-02, -1.739e-02, 7.717e-03, -1.473e-02, -1.569e-02), r);
	return r;
}

// Pass 7, second output vector. Same structure as the pass's f0 but with its
// own constant offset and M4 matrices: one MulAdd per input vector, applied in
// argument order. The accumulation order is fixed; reordering the statements
// could change the floating-point result.
//
// Inputs as supplied by Pass7: sK_0..sK_8 are the nine taps of a 3x3 window in
// row-major order (sK_0 = offset (-1,-1), sK_4 = centre, sK_8 = offset (1,1)).
//   s0_*: t2 taps as max(v, 0)        s1_*: t2 taps as -max(-v, 0)
//   s2_*: t3 taps as max(v, 0)        s3_*: t3 taps as -max(-v, 0)
// Returns the accumulated vector; Pass7 stores it in t1.
//
// NOTE(review): MulAdd is the helper requested in the effect header and is not
// defined in this section; presumably it computes mul(s, M) + r -- confirm.
// The constants look like generated network weights; avoid hand edits.
V4 f1(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
	// Constant starting value of the accumulator.
	V4 r = { 4.076e-03, -1.858e-02, 9.260e-03, 6.390e-04 };
	// s0_*: non-negative half of the t2 taps.
	r = MulAdd(s0_0, M4(-9.510e-02, 5.523e-03, -5.580e-02, -3.107e-02, 2.588e-02, -3.076e-02, 4.525e-02, 2.069e-02, 1.302e-01, 1.260e-02, 3.877e-02, -1.878e-02, -1.553e-01, -3.705e-02, 2.666e-02, -1.631e-02), r);
	r = MulAdd(s0_1, M4(1.518e-01, 5.025e-04, 1.008e-01, 2.029e-02, -1.343e-01, 1.633e-01, 3.819e-01, 6.217e-03, -7.098e-02, 2.839e-02, 1.971e-02, -4.067e-02, -7.846e-02, -2.545e-01, 1.126e-01, 1.475e-02), r);
	r = MulAdd(s0_2, M4(-6.206e-02, -4.029e-02, 2.049e-02, -1.331e-02, 4.682e-02, 3.132e-03, 1.294e-01, -3.039e-02, 5.065e-02, -2.711e-03, 7.522e-02, 1.499e-01, 1.383e-03, -2.086e-02, -5.117e-02, -8.129e-02), r);
	r = MulAdd(s0_3, M4(-7.973e-02, 2.775e-02, -1.490e-02, -7.830e-02, 8.944e-02, -7.283e-02, 1.841e-01, -4.484e-02, -3.632e-02, 1.255e-02, 3.882e-02, -2.862e-02, -2.390e-01, -2.774e-02, 6.677e-02, 1.939e-02), r);
	r = MulAdd(s0_4, M4(2.243e-01, 8.911e-02, -1.718e-01, 1.485e-01, 9.759e-02, 2.321e-01, -4.390e-01, 2.092e-01, 4.278e-02, 1.205e-01, -1.966e-01, 2.130e-02, -2.223e-01, 3.818e-01, -5.426e-01, -3.762e-01), r);
	r = MulAdd(s0_5, M4(-2.560e-02, -7.789e-02, -1.339e-01, 4.073e-02, -4.376e-03, -1.462e-01, -2.860e-02, 1.660e-01, -3.137e-02, 2.393e-01, 2.252e-02, 5.165e-02, 6.906e-02, -1.712e-02, 3.349e-02, -3.350e-01), r);
	r = MulAdd(s0_6, M4(3.665e-03, 8.648e-02, -5.364e-02, 3.068e-02, -2.822e-02, -7.398e-02, 5.950e-03, 6.154e-03, 1.010e-02, 1.587e-02, -7.437e-03, 1.494e-02, 6.585e-02, 4.852e-02, 1.699e-02, -2.453e-02), r);
	r = MulAdd(s0_7, M4(-6.942e-02, -1.231e-01, 1.254e-01, -7.903e-03, 1.278e-02, -4.337e-02, 1.014e-01, -3.921e-02, 3.086e-02, 8.863e-02, 3.278e-03, -1.790e-02, -4.250e-02, -6.413e-02, 6.324e-02, 9.679e-02), r);
	r = MulAdd(s0_8, M4(-4.634e-02, 3.222e-03, 3.770e-02, -6.919e-02, 6.432e-03, 3.093e-02, -1.804e-02, -3.677e-02, 1.142e-03, 5.227e-02, -1.075e-01, 1.375e-01, 1.826e-02, 1.763e-02, 4.677e-02, -2.110e-02), r);
	// s1_*: non-positive half of the t2 taps.
	r = MulAdd(s1_0, M4(-1.817e-02, 1.265e-02, -2.043e-02, 1.326e-02, -1.695e-01, 1.255e-01, -8.576e-02, 7.959e-03, 7.193e-02, 7.724e-02, -4.627e-02, -2.431e-02, -1.319e-01, -5.651e-02, 5.140e-02, 1.678e-02), r);
	r = MulAdd(s1_1, M4(3.337e-02, 4.582e-02, -2.165e-02, 9.076e-03, -4.945e-02, -8.773e-03, 9.112e-02, 1.320e-01, 3.413e-02, 3.273e-01, -2.814e-01, 1.595e-02, 1.343e-01, -6.513e-02, 4.512e-02, -8.970e-02), r);
	r = MulAdd(s1_2, M4(3.505e-02, 2.270e-02, 7.114e-02, 1.198e-01, 3.787e-02, 1.018e-02, 6.472e-02, -2.210e-02, 2.939e-02, 8.307e-02, 2.993e-03, 1.123e-01, -9.273e-02, 2.409e-02, -4.903e-02, -4.636e-02), r);
	r = MulAdd(s1_3, M4(-4.843e-02, -6.013e-02, 2.715e-02, -9.149e-03, 1.452e-01, 3.518e-02, 7.180e-02, 3.286e-03, -2.588e-01, -1.638e-01, 1.521e-01, -1.203e-02, -2.689e-01, 4.285e-02, -1.837e-01, -5.794e-03), r);
	r = MulAdd(s1_4, M4(2.749e-01, -3.031e-01, 1.152e-01, -7.245e-02, 9.716e-02, 8.837e-02, -3.117e-01, 5.664e-02, -1.557e-01, 3.682e-01, 1.259e-01, 3.908e-02, -8.916e-02, 2.177e-01, 1.254e-01, -2.862e-01), r);
	r = MulAdd(s1_5, M4(3.740e-02, 1.899e-01, -9.864e-02, 6.740e-01, -8.316e-03, 1.469e-02, -4.439e-02, 2.380e-02, 1.310e-02, 2.651e-01, -3.290e-02, -4.670e-02, -3.505e-02, -1.649e-01, -2.098e-01, 1.243e-01), r);
	r = MulAdd(s1_6, M4(-3.589e-02, 2.351e-02, -2.010e-02, 1.626e-02, -4.976e-02, -8.924e-02, -1.295e-03, 2.366e-02, -1.079e-02, 9.095e-02, -5.359e-02, -2.689e-02, 1.871e-02, -4.332e-03, -3.191e-02, -2.244e-02), r);
	r = MulAdd(s1_7, M4(-4.806e-02, 1.472e-02, 6.391e-02, -5.629e-03, 3.986e-02, -8.815e-02, 1.416e-01, -6.209e-02, -2.343e-03, 3.541e-01, -7.232e-02, 1.099e-01, 2.329e-03, 7.972e-02, -7.797e-02, 1.009e-02), r);
	r = MulAdd(s1_8, M4(-1.305e-02, -6.657e-02, 4.004e-02, 3.393e-03, 1.116e-02, 4.225e-03, -1.438e-02, -1.464e-02, 6.804e-02, -1.095e-02, -1.145e-01, 2.725e-01, -2.005e-02, 6.875e-02, 7.349e-02, 1.354e-02), r);
	// s2_*: non-negative half of the t3 taps.
	r = MulAdd(s2_0, M4(5.085e-02, 3.046e-02, -8.986e-03, -1.066e-02, -6.250e-02, 6.891e-02, -9.668e-02, 1.813e-02, 1.333e-01, -6.920e-02, 4.096e-02, -2.500e-02, -1.387e-01, -3.212e-02, -7.621e-03, 5.839e-03), r);
	r = MulAdd(s2_1, M4(-6.667e-02, 1.181e-02, 7.953e-02, 7.288e-02, 2.128e-02, 2.170e-01, -3.280e-02, 5.377e-02, 6.852e-03, -7.527e-03, 9.964e-02, 7.642e-02, 3.760e-02, -4.753e-02, 7.111e-02, -1.569e-02), r);
	r = MulAdd(s2_2, M4(2.790e-02, 3.059e-02, 2.660e-01, 1.046e-01, 2.424e-02, -2.548e-02, 3.858e-02, 2.470e-02, -1.736e-03, 1.112e-03, 6.293e-03, -3.455e-02, 5.024e-03, -3.655e-02, 7.447e-02, 3.024e-02), r);
	r = MulAdd(s2_3, M4(-8.148e-02, 1.468e-01, -2.009e-02, 6.602e-02, 1.842e-02, -2.225e-01, 2.157e-01, -1.562e-02, -1.049e-01, 9.012e-02, -3.863e-02, 7.084e-02, -7.823e-02, 8.032e-03, 4.661e-02, 1.247e-01), r);
	r = MulAdd(s2_4, M4(2.322e-01, 6.905e-02, -6.217e-01, -2.876e-01, 2.005e-01, -8.268e-01, 1.071e-02, -1.958e-01, 1.116e-01, 1.250e-01, -5.871e-02, -1.294e-01, 3.019e-01, 3.933e-01, 3.487e-01, -2.436e-01), r);
	r = MulAdd(s2_5, M4(-9.753e-02, 6.593e-02, -2.563e-01, 2.381e-01, 4.691e-02, 1.796e-01, 1.375e-03, -3.460e-02, -3.481e-02, -2.220e-02, 3.476e-03, 9.687e-02, -4.415e-02, 1.811e-01, -1.381e-01, -6.685e-02), r);
	r = MulAdd(s2_6, M4(-1.212e-03, 4.823e-02, -1.680e-03, 1.062e-02, -3.830e-02, -5.797e-02, 1.147e-02, 2.973e-02, 1.650e-02, -2.618e-02, 5.139e-02, 1.433e-02, -1.063e-01, -2.012e-01, 1.667e-01, 3.318e-02), r);
	r = MulAdd(s2_7, M4(-2.061e-02, 5.721e-02, 4.963e-02, 3.164e-02, 1.512e-02, -4.045e-02, 1.890e-01, -3.224e-02, -2.952e-02, -1.150e-01, 3.362e-02, -3.598e-03, 2.868e-02, -2.058e-01, 2.494e-01, -1.499e-01), r);
	r = MulAdd(s2_8, M4(-3.157e-02, -1.385e-01, 1.396e-02, 6.336e-02, -2.137e-02, -9.049e-02, 1.104e-02, -3.260e-03, 2.508e-03, -4.677e-02, -1.825e-02, -3.115e-03, -3.979e-02, -1.426e-01, 2.571e-02, 1.175e-01), r);
	// s3_*: non-positive half of the t3 taps.
	r = MulAdd(s3_0, M4(1.485e-01, -6.051e-03, 3.529e-02, -2.369e-02, -2.982e-01, 1.121e-01, -1.680e-01, -2.738e-02, 1.186e-01, -8.329e-02, 5.089e-02, -2.131e-02, 1.111e-01, -4.026e-02, 9.861e-02, -2.885e-02), r);
	r = MulAdd(s3_1, M4(-1.694e-01, 2.375e-02, -3.937e-02, -3.478e-02, 2.193e-02, 7.665e-02, -5.437e-02, 4.724e-02, -6.213e-02, -4.031e-01, 1.861e-01, 9.586e-03, -4.383e-02, -1.106e-01, 1.108e-01, 4.909e-02), r);
	r = MulAdd(s3_2, M4(6.812e-02, -2.311e-02, 5.210e-02, 9.546e-02, 2.433e-02, 3.416e-02, 2.978e-02, 2.772e-02, -5.155e-03, -2.713e-02, 2.212e-02, -6.056e-03, 5.356e-02, -5.571e-02, 7.526e-02, 6.467e-02), r);
	r = MulAdd(s3_3, M4(-1.633e-01, -2.742e-02, -7.204e-03, -8.113e-03, -2.917e-01, -1.351e-01, 5.153e-02, -8.188e-02, -1.977e-01, -1.460e-01, -1.173e-01, -1.349e-02, 7.836e-02, 1.091e-01, -5.539e-02, 9.694e-02), r);
	r = MulAdd(s3_4, M4(2.165e-01, 1.383e-01, -1.362e-01, 8.388e-02, 9.544e-02, -4.350e-01, -3.249e-01, 1.559e-01, 1.919e-01, -4.631e-01, -4.345e-01, -3.247e-01, 7.224e-02, 2.959e-01, -2.107e-01, -2.463e-01), r);
	r = MulAdd(s3_5, M4(-4.089e-02, 2.202e-01, 6.955e-02, -1.248e-01, 1.140e-02, -7.158e-03, -3.404e-02, 1.370e-01, -4.043e-03, -7.008e-02, 7.084e-02, 8.809e-02, 1.570e-02, 6.716e-02, -9.100e-04, -1.166e-02), r);
	r = MulAdd(s3_6, M4(2.194e-02, 6.765e-02, -9.062e-03, 1.480e-02, 1.792e-02, 4.489e-02, -1.333e-02, 3.524e-02, 3.035e-02, 1.197e-01, -5.057e-02, -2.569e-03, -5.396e-02, -3.068e-02, 3.150e-02, -3.015e-02), r);
	r = MulAdd(s3_7, M4(-7.630e-02, -1.487e-01, 1.509e-01, -2.381e-02, 2.594e-02, -2.307e-02, 1.260e-01, -3.836e-02, -2.424e-02, 3.167e-02, 3.592e-03, 8.270e-02, -5.491e-02, -1.358e-01, 9.294e-02, 2.589e-02), r);
	r = MulAdd(s3_8, M4(-4.926e-04, -1.151e-01, -4.633e-02, 5.883e-02, -5.406e-03, 1.852e-02, -2.089e-02, -5.929e-02, -1.291e-02, -2.909e-02, -1.351e-02, -1.849e-02, -1.130e-02, -7.846e-02, -9.407e-03, -1.708e-02), r);
	return r;
}

// Pass 7 entry point. Each thread handles one texel of the input-sized grid:
// it reads the 3x3 neighbourhood of t2 (via l0) and t3 (via l1), splits every
// tap into a non-negative and a non-positive half, and writes f0/f1 of those
// 36 vectors to t0 and t1.
void Pass7(uint2 blockStart, uint3 tid) {
	const float2 pt = float2(GetInputPt());
	const uint2 texel = TileSwizzle8x8(tid.x) + blockStart;
	const uint2 extent = GetInputSize();
	if (any(texel >= extent)) {
		return;
	}
	// `pos` and `pt` must keep these names: the O() macro behind l0/l1 uses them.
	const float2 pos = (texel + 0.5) * pt;

	// Index k walks the window row by row: k = 0 is offset (-1, -1),
	// k = 4 is the centre texel and k = 8 is offset (1, 1).
	V4 posA[9];
	V4 negA[9];
	V4 posB[9];
	V4 negB[9];
	[unroll]
	for (uint k = 0; k < 9; ++k) {
		const float dx = float(k % 3) - 1.0;
		const float dy = float(k / 3) - 1.0;
		const V4 a = l0(dx, dy);
		const V4 b = l1(dx, dy);
		negA[k] = -max(-a, 0.0);
		posA[k] = max(a, 0.0);
		negB[k] = -max(-b, 0.0);
		posB[k] = max(b, 0.0);
	}

	t0[texel] = f0(
		posA[0], posA[1], posA[2], posA[3], posA[4], posA[5], posA[6], posA[7], posA[8],
		negA[0], negA[1], negA[2], negA[3], negA[4], negA[5], negA[6], negA[7], negA[8],
		posB[0], posB[1], posB[2], posB[3], posB[4], posB[5], posB[6], posB[7], posB[8],
		negB[0], negB[1], negB[2], negB[3], negB[4], negB[5], negB[6], negB[7], negB[8]);
	t1[texel] = f1(
		posA[0], posA[1], posA[2], posA[3], posA[4], posA[5], posA[6], posA[7], posA[8],
		negA[0], negA[1], negA[2], negA[3], negA[4], negA[5], negA[6], negA[7], negA[8],
		posB[0], posB[1], posB[2], posB[3], posB[4], posB[5], posB[6], posB[7], posB[8],
		negB[0], negB[1], negB[2], negB[3], negB[4], negB[5], negB[6], negB[7], negB[8]);
}

//!PASS 8
//!DESC Output shuffle - Pixel reconstruction
//!BLOCK_SIZE 16
//!NUM_THREADS 64
//!IN INPUT, t0, t1
//!OUT OUTPUT

// Tap readers for pass 8: point-sample t0 (l0) or t1 (l1) at pos + (x, y) * pt
// through the O() macro. They only work where locals named `pos` and `pt` are
// in scope, because O() refers to both by name.
#define l0(x, y) V4(O(t0, float2(x, y)))
#define l1(x, y) V4(O(t1, float2(x, y)))

// Pass 8 output function. Starts from a constant 4-component offset, folds in
// one MulAdd per input vector in argument order (each with its own hard-coded
// M4 constant matrix), and returns tanh of the sum, so every component lies in
// [-1, 1]. The accumulation order is fixed; reordering the statements could
// change the floating-point result.
//
// Inputs as supplied by Pass8: sK_0..sK_8 are the nine taps of a 3x3 window in
// row-major order (sK_0 = offset (-1,-1), sK_4 = centre, sK_8 = offset (1,1)).
//   s0_*: t0 taps as max(v, 0)        s1_*: t0 taps as -max(-v, 0)
//   s2_*: t1 taps as max(v, 0)        s3_*: t1 taps as -max(-v, 0)
// Pass8 adds the four returned components to the first rgb2yuv component of
// the four pixels in a 2x2 output quad (x, y, w, z in its write order).
//
// NOTE(review): MulAdd is the helper requested in the effect header and is not
// defined in this section; presumably it computes mul(s, M) + r -- confirm.
// The constants look like generated network weights; avoid hand edits.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
	// Constant starting value of the accumulator.
	V4 r = { 6.897e-04, 1.064e-03, 8.808e-04, 1.561e-03 };
	// s0_*: non-negative half of the t0 taps.
	r = MulAdd(s0_0, M4(1.911e-02, 7.153e-03, -2.976e-04, -7.179e-03, -1.377e-02, -6.451e-03, 2.290e-02, -1.537e-02, 2.230e-02, 1.335e-02, -2.970e-03, -1.058e-02, 2.051e-02, 6.362e-03, -2.487e-02, -6.941e-04), r);
	r = MulAdd(s0_1, M4(-2.149e-02, 8.997e-03, 5.809e-02, 4.138e-02, -3.275e-02, -2.813e-02, 1.701e-02, -8.705e-03, 2.320e-02, 9.248e-03, -3.233e-02, -6.138e-03, 4.763e-02, 2.689e-02, -5.537e-02, -2.760e-02), r);
	r = MulAdd(s0_2, M4(-1.414e-02, -3.483e-02, -7.279e-03, 1.521e-02, -8.604e-03, -5.970e-02, -3.428e-02, -1.553e-02, 3.362e-03, 3.209e-02, 2.829e-03, -1.605e-02, 1.413e-02, 6.860e-02, 3.001e-02, -1.223e-03), r);
	r = MulAdd(s0_3, M4(-6.645e-02, -8.320e-03, 4.523e-02, 5.457e-02, 1.400e-02, 3.198e-02, 1.427e-02, 1.483e-02, -1.421e-01, -2.657e-02, 4.797e-02, 7.217e-02, -3.955e-02, -3.721e-02, 2.393e-02, 2.795e-02), r);
	r = MulAdd(s0_4, M4(-1.653e-02, -1.199e-01, -4.775e-01, -3.048e-01, 5.775e-02, -2.554e-02, 1.568e-01, 3.992e-02, -1.368e-01, -2.363e-01, -5.499e-02, -1.315e-01, -2.829e-02, 1.148e-02, -4.298e-02, -2.297e-02), r);
	r = MulAdd(s0_5, M4(2.388e-02, 9.640e-02, -6.766e-04, -1.894e-01, -1.938e-02, -1.417e-02, -6.162e-03, 1.409e-02, 1.217e-02, -8.992e-03, 2.264e-02, 8.724e-02, 7.060e-03, 1.580e-02, 1.648e-02, 4.897e-02), r);
	r = MulAdd(s0_6, M4(7.503e-03, -2.970e-03, -5.068e-03, -1.602e-02, 1.288e-02, 9.609e-03, -1.090e-02, 1.443e-03, -1.047e-02, 1.581e-02, -7.625e-03, -7.056e-02, -2.884e-02, -3.020e-02, -2.549e-02, -4.848e-02), r);
	r = MulAdd(s0_7, M4(2.075e-02, 1.442e-02, 4.774e-02, 2.273e-02, 1.258e-02, 1.637e-02, -7.148e-02, -5.842e-02, 1.055e-02, -3.364e-02, 1.822e-01, 2.194e-01, -2.953e-02, -2.531e-02, 4.235e-02, 4.981e-02), r);
	r = MulAdd(s0_8, M4(-6.428e-03, 9.970e-03, 1.333e-02, 5.654e-02, -9.648e-03, 2.059e-04, -1.464e-02, -3.448e-02, 3.807e-03, 1.338e-02, -2.262e-02, -1.434e-02, 6.271e-03, -9.572e-03, 6.071e-03, 1.341e-02), r);
	// s1_*: non-positive half of the t0 taps.
	r = MulAdd(s1_0, M4(1.419e-02, 5.825e-03, 3.581e-03, -2.137e-03, 2.863e-03, 7.173e-03, 4.501e-02, -2.820e-02, 3.248e-02, -7.079e-03, 1.837e-02, 2.565e-02, 1.775e-02, 1.246e-02, -3.047e-02, 1.125e-02), r);
	r = MulAdd(s1_1, M4(-4.685e-02, 1.148e-03, 7.450e-02, 5.961e-02, -7.743e-02, -1.134e-02, -2.803e-02, -8.041e-03, 1.388e-01, 1.785e-01, -2.667e-02, -3.292e-02, 2.367e-02, 1.317e-02, -5.723e-02, 6.215e-03), r);
	r = MulAdd(s1_2, M4(-9.482e-03, -5.237e-02, -5.906e-04, 2.193e-02, 2.896e-03, -6.707e-02, -9.075e-03, -1.458e-02, -4.769e-05, 2.592e-02, 1.115e-02, -9.638e-03, 2.494e-02, 7.106e-02, 2.839e-02, 5.956e-03), r);
	r = MulAdd(s1_3, M4(-6.437e-02, -4.261e-03, 6.120e-02, 6.179e-02, 3.619e-02, 6.905e-02, -7.007e-02, -9.677e-02, 1.439e-02, -5.513e-02, -1.084e-01, 9.009e-02, -2.376e-01, -7.979e-02, -7.095e-03, 8.423e-02), r);
	r = MulAdd(s1_4, M4(-3.465e-02, -1.174e-01, -4.971e-01, -2.725e-01, -1.110e-01, -2.241e-01, -1.919e-01, 1.115e+00, 1.600e-01, 1.870e-01, -3.286e-01, -5.513e-01, -4.899e-01, 9.921e-01, -1.312e-01, -2.666e-01), r);
	r = MulAdd(s1_5, M4(6.483e-03, 4.674e-02, 1.070e-02, -2.163e-01, -4.141e-03, -2.429e-02, 4.874e-02, -1.576e-01, -3.007e-02, 2.606e-02, 4.476e-02, 6.729e-02, 5.863e-02, -6.616e-02, 7.878e-03, 9.787e-02), r);
	r = MulAdd(s1_6, M4(2.207e-02, 5.671e-03, 2.283e-03, -1.648e-02, 5.681e-03, -1.943e-03, 1.594e-02, 2.148e-02, -3.016e-02, 3.046e-04, 4.328e-02, -4.072e-02, -1.499e-02, -6.713e-03, -8.285e-02, 1.440e-03), r);
	r = MulAdd(s1_7, M4(-4.522e-02, -1.953e-02, 3.004e-01, 1.255e-01, 6.459e-03, 3.954e-02, -1.323e-01, -1.315e-01, -1.715e-02, -5.240e-02, 1.653e-01, 2.243e-01, -2.645e-02, -2.654e-02, 3.446e-02, 1.267e-02), r);
	r = MulAdd(s1_8, M4(1.441e-02, 2.868e-03, 4.941e-02, 2.607e-01, -5.114e-03, -6.024e-03, 3.945e-03, -4.669e-02, 6.142e-03, 4.197e-03, -4.871e-03, 2.252e-04, 1.890e-02, 1.142e-02, -1.040e-02, 7.153e-02), r);
	// s2_*: non-negative half of the t1 taps.
	r = MulAdd(s2_0, M4(4.156e-03, 3.821e-03, -1.529e-03, 2.462e-03, -2.837e-02, -1.254e-02, -1.678e-02, -5.421e-03, -8.727e-03, -8.809e-03, -3.158e-02, -2.025e-02, -1.411e-02, 7.584e-03, 8.450e-02, 4.221e-02), r);
	r = MulAdd(s2_1, M4(1.418e-02, -7.004e-03, -2.935e-02, -2.645e-02, -1.518e-02, -3.918e-02, -6.617e-02, -5.273e-02, -9.888e-02, -1.090e-02, 1.380e-01, 7.376e-02, -1.161e-02, -1.487e-02, 2.550e-02, 6.274e-02), r);
	r = MulAdd(s2_2, M4(5.309e-03, 2.695e-02, 1.267e-02, -3.495e-03, 9.392e-03, 2.603e-02, 2.278e-02, 3.305e-03, -1.302e-02, -1.123e-01, -1.547e-02, 3.941e-02, 1.298e-03, -1.396e-02, 1.964e-03, 7.526e-03), r);
	r = MulAdd(s2_3, M4(-1.822e-02, 8.636e-03, 8.368e-03, 1.102e-02, 6.230e-02, -2.289e-02, -1.296e-02, -1.681e-02, 4.747e-02, -1.882e-02, 9.110e-03, -1.084e-02, -2.726e-02, -2.000e-02, -1.814e-01, -7.070e-02), r);
	r = MulAdd(s2_4, M4(-4.428e-02, -6.799e-02, -5.391e-03, -2.368e-02, 3.822e-01, 3.324e-01, 2.766e-01, 1.771e-01, 1.929e-01, 2.418e-01, -2.039e-01, 5.701e-03, 1.467e-01, 4.835e-02, 1.470e-01, -7.146e-02), r);
	r = MulAdd(s2_5, M4(-9.316e-03, -2.872e-03, 7.287e-03, 5.032e-02, -2.617e-02, 9.548e-02, -3.550e-02, 9.550e-02, -4.338e-02, -2.402e-02, -4.187e-02, -2.445e-01, -5.388e-03, 3.789e-02, -2.563e-02, 2.686e-02), r);
	r = MulAdd(s2_6, M4(9.817e-03, 4.676e-03, -6.399e-03, 7.844e-03, -1.019e-02, -8.003e-04, 5.167e-02, 1.419e-02, -9.988e-03, -2.480e-03, 2.983e-02, -6.955e-03, -4.089e-02, -1.849e-02, -3.747e-02, -2.679e-02), r);
	r = MulAdd(s2_7, M4(3.122e-02, 1.641e-02, 5.238e-02, -1.886e-03, -1.368e-02, -1.047e-02, 4.063e-02, 8.081e-02, -2.436e-02, -1.314e-02, 7.406e-02, 1.255e-01, 1.328e-03, -1.480e-02, -2.203e-02, -6.794e-03), r);
	r = MulAdd(s2_8, M4(-1.214e-02, 3.663e-02, -5.298e-02, 4.172e-02, -1.636e-03, -1.193e-02, 1.591e-02, 1.546e-02, -4.897e-03, -2.570e-02, 1.029e-02, -1.321e-02, 1.692e-03, -4.016e-03, 1.854e-02, -2.154e-02), r);
	// s3_*: non-positive half of the t1 taps.
	r = MulAdd(s3_0, M4(1.653e-02, 8.081e-04, -8.249e-04, -2.749e-03, -4.287e-02, -1.117e-02, -1.953e-02, -1.392e-02, -9.860e-03, -1.139e-02, -3.431e-02, -2.482e-02, 2.073e-03, 2.388e-02, 8.476e-02, 5.234e-02), r);
	r = MulAdd(s3_1, M4(1.545e-02, 2.697e-03, -2.547e-02, -1.587e-02, 2.739e-02, -5.155e-02, -8.569e-02, -6.176e-02, -4.554e-02, -2.669e-02, 1.469e-01, 7.837e-02, -3.883e-02, -1.063e-02, 3.383e-02, 7.495e-02), r);
	r = MulAdd(s3_2, M4(1.549e-02, -4.319e-03, 3.554e-02, -1.405e-02, 3.542e-03, 5.563e-02, 1.556e-02, -6.819e-03, -2.139e-02, -4.968e-02, -2.142e-02, 3.475e-02, -1.378e-02, -3.919e-02, -8.013e-03, -5.496e-03), r);
	r = MulAdd(s3_3, M4(-3.531e-03, 9.257e-03, 1.262e-02, 4.105e-03, 4.193e-02, -2.331e-02, -2.424e-02, -2.262e-02, 5.437e-02, -3.147e-02, 2.348e-02, -2.547e-02, -1.490e-01, 2.063e-02, -1.968e-01, -2.503e-02), r);
	r = MulAdd(s3_4, M4(-6.533e-02, -2.179e-02, 2.316e-02, -8.762e-03, 2.356e-01, 2.350e-01, 3.014e-01, 1.560e-01, 2.691e-01, 3.561e-01, -4.286e-01, -1.469e-01, 5.332e-01, -3.363e-01, 2.388e-01, -2.940e-01), r);
	r = MulAdd(s3_5, M4(3.350e-01, -2.028e-01, -8.521e-02, -1.395e-02, -4.912e-02, 1.973e-02, -3.872e-02, 1.125e-01, -6.922e-02, -1.093e-01, -1.874e-02, -2.666e-01, -4.931e-02, 8.861e-02, -4.383e-02, 8.293e-03), r);
	r = MulAdd(s3_6, M4(1.665e-02, -1.267e-03, 2.164e-03, 3.524e-03, -1.150e-02, -4.903e-03, 2.395e-02, 2.391e-03, -1.383e-02, 1.477e-03, 1.728e-02, -1.356e-02, 2.222e-03, 6.370e-03, -7.935e-02, -8.551e-03), r);
	r = MulAdd(s3_7, M4(2.945e-02, 3.503e-02, 1.334e-02, 6.900e-02, -1.146e-02, -1.218e-02, 5.461e-02, 7.172e-02, -2.653e-02, -2.229e-02, 1.076e-01, 1.016e-01, -7.640e-03, -5.440e-03, 5.615e-02, -5.270e-02), r);
	r = MulAdd(s3_8, M4(1.949e-02, -1.039e-01, 6.569e-01, -3.909e-01, -1.983e-03, -1.781e-02, -3.701e-03, -1.970e-02, -1.160e-02, -2.080e-02, -1.361e-02, 1.851e-03, -1.366e-02, -2.093e-02, -4.601e-03, -1.360e-02), r);
	// Squash the sum before it is added to the sampled image in Pass8.
	return tanh(r);
}

// Pass 8 entry point. Each thread produces one 2x2 quad of OUTPUT: it reads
// the 3x3 neighbourhood of t0 (via l0) and t1 (via l1) around the matching
// input texel, evaluates f0 once, and adds one component of the result to the
// first rgb2yuv component of each of the four linearly sampled INPUT pixels.
void Pass8(uint2 blockStart, uint3 tid) {
	const float2 pt = float2(GetInputPt());
	// Swizzled coordinate doubled: this is the quad's top-left output pixel.
	const uint2 quad = (TileSwizzle8x8(tid.x) << 1) + blockStart;
	const uint2 extent = GetOutputSize();
	if (any(quad >= extent)) {
		return;
	}
	// Centre of the input texel under the quad. `pos` and `pt` must keep these
	// names: the O() macro behind l0/l1 uses them.
	const float2 pos = ((quad >> 1) + 0.5) * pt;

	// Index k walks the window row by row: k = 0 is offset (-1, -1),
	// k = 4 is the centre texel and k = 8 is offset (1, 1).
	V4 posA[9];
	V4 negA[9];
	V4 posB[9];
	V4 negB[9];
	[unroll]
	for (uint k = 0; k < 9; ++k) {
		const float dx = float(k % 3) - 1.0;
		const float dy = float(k / 3) - 1.0;
		const V4 a = l0(dx, dy);
		const V4 b = l1(dx, dy);
		negA[k] = -max(-a, 0.0);
		posA[k] = max(a, 0.0);
		negB[k] = -max(-b, 0.0);
		posB[k] = max(b, 0.0);
	}

	const V4 r = f0(
		posA[0], posA[1], posA[2], posA[3], posA[4], posA[5], posA[6], posA[7], posA[8],
		negA[0], negA[1], negA[2], negA[3], negA[4], negA[5], negA[6], negA[7], negA[8],
		posB[0], posB[1], posB[2], posB[3], posB[4], posB[5], posB[6], posB[7], posB[8],
		negB[0], negB[1], negB[2], negB[3], negB[4], negB[5], negB[6], negB[7], negB[8]);

	// Colour space conversion matrices.
	static const MF3x3 rgb2yuv = { 0.299, 0.587, 0.114, -0.169, -0.331, 0.5, 0.5, -0.419, -0.081 };
	static const MF3x3 yuv2rgb = { 1, -0.00093, 1.401687, 1, -0.3437, -0.71417, 1, 1.77216, 0.00099 };
	const float2 opt = float2(GetOutputPt());

	// Sample positions of the four quad pixels, derived step by step
	// (right, then down, then back left) so each value is built from the
	// previous one.
	const float2 uv00 = pos - 0.5f * opt;
	const float2 uv10 = float2(uv00.x + opt.x, uv00.y);
	const float2 uv11 = float2(uv10.x, uv10.y + opt.y);
	const float2 uv01 = float2(uv11.x - opt.x, uv11.y);

	// Top-left pixel takes r.x.
	MF3 yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, uv00, 0).rgb);
	OUTPUT[quad] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + r.x), yuv.yz)), 1);

	// Top-right pixel takes r.y.
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, uv10, 0).rgb);
	OUTPUT[quad + uint2(1, 0)] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + r.y), yuv.yz)), 1);

	// Bottom-right pixel takes r.w.
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, uv11, 0).rgb);
	OUTPUT[quad + uint2(1, 1)] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + r.w), yuv.yz)), 1);

	// Bottom-left pixel takes r.z.
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, uv01, 0).rgb);
	OUTPUT[quad + uint2(0, 1)] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + r.z), yuv.yz)), 1);
}
