// RAVU (Rapid and Accurate Video Upscaling) - Radius 4 Luma variant
// 2x edge-adaptive upscaler using structure tensor analysis
// Uses larger radius 4 kernel for highest quality edge preservation

//!BGFX EFFECT
//!VERSION 1
//!NAME RAVU R4
//!CATEGORY Upscaling
//!DESCRIPTION 2x edge-adaptive upscaler with radius 4 kernel. Uses structure tensor analysis for intelligent interpolation. Largest kernel size for maximum quality. Luma-only processing with chroma reconstruction.

// Source image. Listed as an input (//!IN) of both passes.
//!TEXTURE
Texture2D INPUT;

// Point-filtered sampler named after INPUT.
// NOTE(review): presumably the sampler the texture(INPUT, ...) helper from
// prescalers.hlsli resolves to - confirm there.
//!SAMPLER
//!FILTER POINT
SamplerState sam_INPUT;

// Final image, twice the input size in each dimension. Written by pass 2.
//!TEXTURE
//!WIDTH  INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;

// Linear-filtered sampler; pass 2 uses it to sample INPUT when rebuilding the
// colour of each output pixel.
//!SAMPLER
//!FILTER LINEAR
SamplerState sam_INPUT_LINEAR;

// Kernel weight lookup table loaded from ravu_lut4_f16.dds (four half-float
// channels per texel). The passes read it at 8 fixed x positions and at one of
// 648 y positions selected from the quantized edge parameters.
//!TEXTURE
//!SOURCE ravu_lut4_f16.dds
//!FORMAT R16G16B16A16_FLOAT
Texture2D ravu_lut4;

// Sampler named after the LUT texture.
//!SAMPLER
//!FILTER LINEAR
SamplerState sam_ravu_lut4;

// Single-channel intermediate at input resolution: output of pass 1, input of
// pass 2.
//!TEXTURE
//!FORMAT R16_FLOAT
//!WIDTH  INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
Texture2D ravu_int11;

// Point-filtered sampler named after the intermediate texture.
//!SAMPLER
//!FILTER POINT
SamplerState sam_ravu_int11;

//!COMMON
#include "prescalers.hlsli"

// Number of the final pass. Each pass defines CURRENT_PASS and compares it with
// this value (#if CURRENT_PASS == LAST_PASS) so that only the last pass
// compiles the early-out against the output size.
#define LAST_PASS 2

//!PASS 1
//!DESC RAVU R4 Pass 1 - Compute diagonal interpolation weights
//!IN INPUT, ravu_lut4
//!OUT ravu_int11
//!BLOCK_SIZE 32, 8
//!NUM_THREADS 32, 8

// Shared memory for neighborhood samples.
// 585 = 39 * 15: the 32x8 thread group plus 7 extra samples in each dimension
// (the kernel window is 8x8), stored as inp0[x * 15 + y].
shared float inp0[585];

// Pass number, compared with LAST_PASS inside Pass1.
#define CURRENT_PASS 1

// Reduces a sampled colour to one scalar: dot product of its rgb with rgb2y
// (rgb2y is defined outside this file section).
#define GET_SAMPLE(x) dot(x.rgb, rgb2y)
// GLSL-style store shim: the image argument is ignored and only the .x
// component of the value is forwarded to imageStoreOverride.
#define imageStore(out_image, pos, val) imageStoreOverride(pos, val.x)
// Pass 1 store target of the imageStore macro: writes the scalar result for
// texel `pos` into the intermediate texture ravu_int11.
void imageStoreOverride(uint2 pos, float value) {
	ravu_int11[pos] = value;
}

// Samples INPUT at normalized position `pos` and reduces the colour to a
// scalar through GET_SAMPLE. texture() and vec4 are defined outside this
// file section.
#define INPUT_tex(pos) GET_SAMPLE(vec4(texture(INPUT, pos)))
// Input dimensions and the per-texel step used to turn texel coordinates into
// sampling positions, as reported by GetInputSize()/GetInputPt().
// NOTE(review): GetInputPt() is presumably 1 / GetInputSize() - confirm.
static const float2 INPUT_size = float2(GetInputSize());
static const float2 INPUT_pt = float2(GetInputPt());

// Sampling shorthand for the weight LUT. The visible pass body calls
// texture(ravu_lut4, ...) directly rather than this macro.
#define ravu_lut4_tex(pos) (vec4(texture(ravu_lut4, pos)))

// HOOKED_* names used by the pass body are aliases of the INPUT_* equivalents.
#define HOOKED_tex(pos) INPUT_tex(pos)
#define HOOKED_size INPUT_size
#define HOOKED_pt INPUT_pt

// Pass 1 entry point: one thread per input texel.
//
// Each thread reads an 8x8 window of input luma covering texel offsets -3..+4
// in both directions (so the window is centred half a texel right/down of the
// thread's own texel), estimates the local edge orientation from a weighted
// structure tensor, picks a row of the weight LUT from the quantized
// angle/strength/coherence, and stores the weighted sum of the 64 window
// samples, clamped to [0, 1], into ravu_int11 at the thread's global position.
//
// blockStart and threadId are only referenced inside the
// `#if CURRENT_PASS == LAST_PASS` section, which is compiled out in this pass.
// The gl_* identifiers, barrier(), texture(), mix(), mod(), atan() and the
// vec/ivec types are defined outside this file section (NOTE(review):
// presumably by prescalers.hlsli - confirm).
void Pass1(uint2 blockStart, uint3 threadId) {
	// Texel coordinate of the group's first thread.
	ivec2 group_base = ivec2(gl_WorkGroupID) * ivec2(gl_WorkGroupSize);
	// Index of this thread's window origin in inp0 (layout is x * 15 + y).
	int local_pos = int(gl_LocalInvocationID.x) * 15 + int(gl_LocalInvocationID.y);

	// Load input samples into shared memory: the threads cooperatively fill all
	// 585 (39 x 15) entries, each thread striding by the group's thread count.
	{
		for (int id = int(gl_LocalInvocationIndex); id < 585; id += int(gl_WorkGroupSize.x * gl_WorkGroupSize.y)) {
			uint x = (uint)id / 15, y = (uint)id % 15;
			// Entry (x, y) is sampled at texel coordinate group_base + (x, y) - 2.5,
			// scaled by the per-texel step.
			inp0[id] =
				HOOKED_tex(HOOKED_pt * vec2(float(group_base.x + x) + (-2.5), float(group_base.y + y) + (-2.5))).x;
		}
	}
	// NOTE(review): presumably a workgroup-wide sync so every inp0 entry is
	// written before any thread reads its window - confirm barrier()'s definition.
	barrier();

	// Compiled out here: CURRENT_PASS is 1 and LAST_PASS is 2.
#if CURRENT_PASS == LAST_PASS
	uint2 destPos = blockStart + threadId.xy * 2;
	uint2 outputSize = GetOutputSize();
	if (destPos.x >= outputSize.x || destPos.y >= outputSize.y) {
		return;
	}
#endif

	// Load luma samples from shared memory.
	// Naming: lumaN is window sample (i, j) with N = i * 8 + j, read from
	// offset i * 15 + j, where i is the x offset and j the y offset (0..7).
	// The four window corners (N = 0, 7, 56, 63) are not needed by the gradients.
	{
		float luma57 = inp0[local_pos + 106];
		float luma58 = inp0[local_pos + 107];
		float luma59 = inp0[local_pos + 108];
		float luma60 = inp0[local_pos + 109];
		float luma61 = inp0[local_pos + 110];
		float luma62 = inp0[local_pos + 111];
		float luma8 = inp0[local_pos + 15];
		float luma9 = inp0[local_pos + 16];
		float luma10 = inp0[local_pos + 17];
		float luma11 = inp0[local_pos + 18];
		float luma12 = inp0[local_pos + 19];
		float luma1 = inp0[local_pos + 1];
		float luma13 = inp0[local_pos + 20];
		float luma14 = inp0[local_pos + 21];
		float luma15 = inp0[local_pos + 22];
		float luma2 = inp0[local_pos + 2];
		float luma16 = inp0[local_pos + 30];
		float luma17 = inp0[local_pos + 31];
		float luma18 = inp0[local_pos + 32];
		float luma19 = inp0[local_pos + 33];
		float luma20 = inp0[local_pos + 34];
		float luma21 = inp0[local_pos + 35];
		float luma22 = inp0[local_pos + 36];
		float luma23 = inp0[local_pos + 37];
		float luma3 = inp0[local_pos + 3];
		float luma24 = inp0[local_pos + 45];
		float luma25 = inp0[local_pos + 46];
		float luma26 = inp0[local_pos + 47];
		float luma27 = inp0[local_pos + 48];
		float luma28 = inp0[local_pos + 49];
		float luma4 = inp0[local_pos + 4];
		float luma29 = inp0[local_pos + 50];
		float luma30 = inp0[local_pos + 51];
		float luma31 = inp0[local_pos + 52];
		float luma5 = inp0[local_pos + 5];
		float luma32 = inp0[local_pos + 60];
		float luma33 = inp0[local_pos + 61];
		float luma34 = inp0[local_pos + 62];
		float luma35 = inp0[local_pos + 63];
		float luma36 = inp0[local_pos + 64];
		float luma37 = inp0[local_pos + 65];
		float luma38 = inp0[local_pos + 66];
		float luma39 = inp0[local_pos + 67];
		float luma6 = inp0[local_pos + 6];
		float luma40 = inp0[local_pos + 75];
		float luma41 = inp0[local_pos + 76];
		float luma42 = inp0[local_pos + 77];
		float luma43 = inp0[local_pos + 78];
		float luma44 = inp0[local_pos + 79];
		float luma45 = inp0[local_pos + 80];
		float luma46 = inp0[local_pos + 81];
		float luma47 = inp0[local_pos + 82];
		float luma48 = inp0[local_pos + 90];
		float luma49 = inp0[local_pos + 91];
		float luma50 = inp0[local_pos + 92];
		float luma51 = inp0[local_pos + 93];
		float luma52 = inp0[local_pos + 94];
		float luma53 = inp0[local_pos + 95];
		float luma54 = inp0[local_pos + 96];
		float luma55 = inp0[local_pos + 97];

		// Compute structure tensor for edge detection.
		// abd accumulates (gx*gx, gx*gy, gy*gy) over the inner 6x6 points
		// (i, j in 1..6) of the window.
		vec3 abd = vec3(0.0, 0.0, 0.0);
		float gx, gy;

		// Gradient computation with Gaussian weighting.
		// gx differentiates along i (x), gy along j (y). Points on the border of
		// the 6x6 region (index 1 or 6) use the 2-point central difference
		// (f[+1] - f[-1]) / 2; the others use the 4-point stencil
		// (-f[+2] + 8 f[+1] - 8 f[-1] + f[-2]) / 12.

		// i = 1, j = 1..6
		gx = (luma17 - luma1) / 2.0;
		gy = (luma10 - luma8) / 2.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.011007348802298533;
		gx = (luma18 - luma2) / 2.0;
		gy = (-luma12 + 8.0 * luma11 - 8.0 * luma9 + luma8) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.018148050104365175;
		gx = (luma19 - luma3) / 2.0;
		gy = (-luma13 + 8.0 * luma12 - 8.0 * luma10 + luma9) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.0233025575973275;
		gx = (luma20 - luma4) / 2.0;
		gy = (-luma14 + 8.0 * luma13 - 8.0 * luma11 + luma10) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.0233025575973275;
		gx = (luma21 - luma5) / 2.0;
		gy = (-luma15 + 8.0 * luma14 - 8.0 * luma12 + luma11) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.018148050104365175;
		gx = (luma22 - luma6) / 2.0;
		gy = (luma15 - luma13) / 2.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.011007348802298533;
		// i = 2, j = 1..6
		gx = (-luma33 + 8.0 * luma25 - 8.0 * luma9 + luma1) / 12.0;
		gy = (luma18 - luma16) / 2.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.018148050104365175;
		gx = (-luma34 + 8.0 * luma26 - 8.0 * luma10 + luma2) / 12.0;
		gy = (-luma20 + 8.0 * luma19 - 8.0 * luma17 + luma16) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.02992107622879854;
		gx = (-luma35 + 8.0 * luma27 - 8.0 * luma11 + luma3) / 12.0;
		gy = (-luma21 + 8.0 * luma20 - 8.0 * luma18 + luma17) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.03841942237242872;
		gx = (-luma36 + 8.0 * luma28 - 8.0 * luma12 + luma4) / 12.0;
		gy = (-luma22 + 8.0 * luma21 - 8.0 * luma19 + luma18) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.03841942237242872;
		gx = (-luma37 + 8.0 * luma29 - 8.0 * luma13 + luma5) / 12.0;
		gy = (-luma23 + 8.0 * luma22 - 8.0 * luma20 + luma19) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.02992107622879854;
		gx = (-luma38 + 8.0 * luma30 - 8.0 * luma14 + luma6) / 12.0;
		gy = (luma23 - luma21) / 2.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.018148050104365175;
		// i = 3, j = 1..6
		gx = (-luma41 + 8.0 * luma33 - 8.0 * luma17 + luma9) / 12.0;
		gy = (luma26 - luma24) / 2.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.0233025575973275;
		gx = (-luma42 + 8.0 * luma34 - 8.0 * luma18 + luma10) / 12.0;
		gy = (-luma28 + 8.0 * luma27 - 8.0 * luma25 + luma24) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.03841942237242872;
		gx = (-luma43 + 8.0 * luma35 - 8.0 * luma19 + luma11) / 12.0;
		gy = (-luma29 + 8.0 * luma28 - 8.0 * luma26 + luma25) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.04933151482066013;
		gx = (-luma44 + 8.0 * luma36 - 8.0 * luma20 + luma12) / 12.0;
		gy = (-luma30 + 8.0 * luma29 - 8.0 * luma27 + luma26) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.04933151482066013;
		gx = (-luma45 + 8.0 * luma37 - 8.0 * luma21 + luma13) / 12.0;
		gy = (-luma31 + 8.0 * luma30 - 8.0 * luma28 + luma27) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.03841942237242872;
		gx = (-luma46 + 8.0 * luma38 - 8.0 * luma22 + luma14) / 12.0;
		gy = (luma31 - luma29) / 2.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.0233025575973275;
		// i = 4, j = 1..6
		gx = (-luma49 + 8.0 * luma41 - 8.0 * luma25 + luma17) / 12.0;
		gy = (luma34 - luma32) / 2.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.0233025575973275;
		gx = (-luma50 + 8.0 * luma42 - 8.0 * luma26 + luma18) / 12.0;
		gy = (-luma36 + 8.0 * luma35 - 8.0 * luma33 + luma32) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.03841942237242872;
		gx = (-luma51 + 8.0 * luma43 - 8.0 * luma27 + luma19) / 12.0;
		gy = (-luma37 + 8.0 * luma36 - 8.0 * luma34 + luma33) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.04933151482066013;
		gx = (-luma52 + 8.0 * luma44 - 8.0 * luma28 + luma20) / 12.0;
		gy = (-luma38 + 8.0 * luma37 - 8.0 * luma35 + luma34) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.04933151482066013;
		gx = (-luma53 + 8.0 * luma45 - 8.0 * luma29 + luma21) / 12.0;
		gy = (-luma39 + 8.0 * luma38 - 8.0 * luma36 + luma35) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.03841942237242872;
		gx = (-luma54 + 8.0 * luma46 - 8.0 * luma30 + luma22) / 12.0;
		gy = (luma39 - luma37) / 2.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.0233025575973275;
		// i = 5, j = 1..6
		gx = (-luma57 + 8.0 * luma49 - 8.0 * luma33 + luma25) / 12.0;
		gy = (luma42 - luma40) / 2.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.018148050104365175;
		gx = (-luma58 + 8.0 * luma50 - 8.0 * luma34 + luma26) / 12.0;
		gy = (-luma44 + 8.0 * luma43 - 8.0 * luma41 + luma40) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.02992107622879854;
		gx = (-luma59 + 8.0 * luma51 - 8.0 * luma35 + luma27) / 12.0;
		gy = (-luma45 + 8.0 * luma44 - 8.0 * luma42 + luma41) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.03841942237242872;
		gx = (-luma60 + 8.0 * luma52 - 8.0 * luma36 + luma28) / 12.0;
		gy = (-luma46 + 8.0 * luma45 - 8.0 * luma43 + luma42) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.03841942237242872;
		gx = (-luma61 + 8.0 * luma53 - 8.0 * luma37 + luma29) / 12.0;
		gy = (-luma47 + 8.0 * luma46 - 8.0 * luma44 + luma43) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.02992107622879854;
		gx = (-luma62 + 8.0 * luma54 - 8.0 * luma38 + luma30) / 12.0;
		gy = (luma47 - luma45) / 2.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.018148050104365175;
		// i = 6, j = 1..6
		gx = (luma57 - luma41) / 2.0;
		gy = (luma50 - luma48) / 2.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.011007348802298533;
		gx = (luma58 - luma42) / 2.0;
		gy = (-luma52 + 8.0 * luma51 - 8.0 * luma49 + luma48) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.018148050104365175;
		gx = (luma59 - luma43) / 2.0;
		gy = (-luma53 + 8.0 * luma52 - 8.0 * luma50 + luma49) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.0233025575973275;
		gx = (luma60 - luma44) / 2.0;
		gy = (-luma54 + 8.0 * luma53 - 8.0 * luma51 + luma50) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.0233025575973275;
		gx = (luma61 - luma45) / 2.0;
		gy = (-luma55 + 8.0 * luma54 - 8.0 * luma52 + luma51) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.018148050104365175;
		gx = (luma62 - luma46) / 2.0;
		gy = (luma55 - luma53) / 2.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.011007348802298533;

		// Eigenvalue decomposition for edge direction.
		// The tensor is the symmetric 2x2 matrix [[a, b], [b, d]]; T is its trace,
		// D its determinant, and L1 >= L2 are its eigenvalues.
		float a = abd.x, b = abd.y, d = abd.z;
		float T = a + d, D = a * d - b * b;
		// max(..., 0.0) keeps the discriminant non-negative before the sqrt.
		float delta = sqrt(max(T * T / 4.0 - D, 0.0));
		float L1 = T / 2.0 + delta, L2 = T / 2.0 - delta;
		float sqrtL1 = sqrt(L1), sqrtL2 = sqrt(L2);
		// Orientation wrapped by mod(..., pi); forced to 0 when |b| is below
		// 1.192092896e-7 (2^-23).
		float theta = mix(mod(atan(L1 - a, b) + 3.141592653589793, 3.141592653589793), 0.0, abs(b) < 1.192092896e-7);
		float lambda = sqrtL1;
		// Relative difference of the eigenvalue square roots; forced to 0 when
		// their sum is below the same epsilon, which avoids dividing by ~0.
		float mu = mix((sqrtL1 - sqrtL2) / (sqrtL1 + sqrtL2), 0.0, sqrtL1 + sqrtL2 < 1.192092896e-7);

		// Quantize edge parameters for LUT lookup:
		//   angle     - theta in steps of pi / 24
		//   strength  - floor(log2(lambda * 2000)), clamped to 0..8 (9 levels)
		//   coherence - 0, 1 or 2 for mu < 0.25, mu in [0.25, 0.5), mu >= 0.5
		// coord_y addresses the centre of one of 648 (= 24 * 9 * 3) LUT rows.
		float angle = floor(theta * 24.0 / 3.141592653589793);
		float strength = clamp(floor(log2(lambda * 2000.0 + 1.192092896e-7)), 0.0, 8.0);
		float coherence = mix(mix(0.0, 1.0, mu >= 0.25), 2.0, mu >= 0.5);
		float coord_y = ((angle * 9.0 + strength) * 3.0 + coherence + 0.5) / 648.0;

		// Sample LUT weights and compute interpolated result.
		// The LUT row is read at 8 x positions ((k + 0.5) / 8), giving 4 weights
		// each (32 in total). Every weight is applied to a pair of window samples
		// at offsets p and 112 - p, i.e. mirrored through the window centre, so
		// the 32 weights cover all 64 samples.
		float res = 0.0;
		vec4 w;
		w = texture(ravu_lut4, vec2(0.0625, coord_y));
		res += (inp0[local_pos + 0] + inp0[local_pos + 112]) * w[0];
		res += (inp0[local_pos + 1] + inp0[local_pos + 111]) * w[1];
		res += (inp0[local_pos + 2] + inp0[local_pos + 110]) * w[2];
		res += (inp0[local_pos + 3] + inp0[local_pos + 109]) * w[3];
		w = texture(ravu_lut4, vec2(0.1875, coord_y));
		res += (inp0[local_pos + 4] + inp0[local_pos + 108]) * w[0];
		res += (inp0[local_pos + 5] + inp0[local_pos + 107]) * w[1];
		res += (inp0[local_pos + 6] + inp0[local_pos + 106]) * w[2];
		res += (inp0[local_pos + 7] + inp0[local_pos + 105]) * w[3];
		w = texture(ravu_lut4, vec2(0.3125, coord_y));
		res += (inp0[local_pos + 15] + inp0[local_pos + 97]) * w[0];
		res += (inp0[local_pos + 16] + inp0[local_pos + 96]) * w[1];
		res += (inp0[local_pos + 17] + inp0[local_pos + 95]) * w[2];
		res += (inp0[local_pos + 18] + inp0[local_pos + 94]) * w[3];
		w = texture(ravu_lut4, vec2(0.4375, coord_y));
		res += (inp0[local_pos + 19] + inp0[local_pos + 93]) * w[0];
		res += (inp0[local_pos + 20] + inp0[local_pos + 92]) * w[1];
		res += (inp0[local_pos + 21] + inp0[local_pos + 91]) * w[2];
		res += (inp0[local_pos + 22] + inp0[local_pos + 90]) * w[3];
		w = texture(ravu_lut4, vec2(0.5625, coord_y));
		res += (inp0[local_pos + 30] + inp0[local_pos + 82]) * w[0];
		res += (inp0[local_pos + 31] + inp0[local_pos + 81]) * w[1];
		res += (inp0[local_pos + 32] + inp0[local_pos + 80]) * w[2];
		res += (inp0[local_pos + 33] + inp0[local_pos + 79]) * w[3];
		w = texture(ravu_lut4, vec2(0.6875, coord_y));
		res += (inp0[local_pos + 34] + inp0[local_pos + 78]) * w[0];
		res += (inp0[local_pos + 35] + inp0[local_pos + 77]) * w[1];
		res += (inp0[local_pos + 36] + inp0[local_pos + 76]) * w[2];
		res += (inp0[local_pos + 37] + inp0[local_pos + 75]) * w[3];
		w = texture(ravu_lut4, vec2(0.8125, coord_y));
		res += (inp0[local_pos + 45] + inp0[local_pos + 67]) * w[0];
		res += (inp0[local_pos + 46] + inp0[local_pos + 66]) * w[1];
		res += (inp0[local_pos + 47] + inp0[local_pos + 65]) * w[2];
		res += (inp0[local_pos + 48] + inp0[local_pos + 64]) * w[3];
		w = texture(ravu_lut4, vec2(0.9375, coord_y));
		res += (inp0[local_pos + 49] + inp0[local_pos + 63]) * w[0];
		res += (inp0[local_pos + 50] + inp0[local_pos + 62]) * w[1];
		res += (inp0[local_pos + 51] + inp0[local_pos + 61]) * w[2];
		res += (inp0[local_pos + 52] + inp0[local_pos + 60]) * w[3];
		// Clamp to [0, 1] and store; the imageStore macro ignores out_image and
		// forwards to imageStoreOverride, which writes ravu_int11.
		res = clamp(res, 0.0, 1.0);
		imageStore(out_image, ivec2(gl_GlobalInvocationID), res);
	}
}

//!PASS 2
//!DESC RAVU R4 Pass 2 - Final output composition
//!IN INPUT, ravu_lut4, ravu_int11
//!OUT OUTPUT
//!BLOCK_SIZE 64, 16
//!NUM_THREADS 32, 8

// Dual shared memory buffers for intermediate and input data.
// Pass2 fills inp0 from ravu_int11 (the pass 1 result) and inp1 from the
// luma of INPUT. Both hold 585 = 39 * 15 entries indexed as x * 15 + y.
shared float inp0[585];
shared float inp1[585];

// Equals LAST_PASS, so Pass2 compiles the early-out against the output size.
#define CURRENT_PASS 2

// Reduces a sampled colour to one scalar: dot product of its rgb with rgb2y
// (rgb2y is defined outside this file section).
#define GET_SAMPLE(x) dot(x.rgb, rgb2y)
// GLSL-style store shim: the image argument is ignored and only the .x
// component of the value is forwarded to imageStoreOverride.
#define imageStore(out_image, pos, val) imageStoreOverride(pos, val.x)
// Pass 2 store target of the imageStore macro: writes one full-colour output
// pixel. `value` supplies the first channel; the other two come from INPUT,
// sampled with the linear sampler at HOOKED_map(pos) and converted with rgb2uv.
// The combined triple is converted with yuv2rgb and stored with alpha 1.
// HOOKED_map, rgb2uv and yuv2rgb are defined outside this file section.
void imageStoreOverride(uint2 pos, float value) {
	const float3 sourceRgb = INPUT.SampleLevel(sam_INPUT_LINEAR, HOOKED_map(pos), 0).rgb;
	const float2 chroma = mul(rgb2uv, sourceRgb);
	const float3 yuv = float3(value, chroma);
	OUTPUT[pos] = float4(mul(yuv2rgb, yuv), 1.0);
}

// Samples INPUT at normalized position `pos` and reduces the colour to a
// scalar through GET_SAMPLE. texture() and vec4 are defined outside this
// file section.
#define INPUT_tex(pos) GET_SAMPLE(vec4(texture(INPUT, pos)))
// Input dimensions and the per-texel step used to turn texel coordinates into
// sampling positions, as reported by GetInputSize()/GetInputPt().
// NOTE(review): GetInputPt() is presumably 1 / GetInputSize() - confirm.
static const float2 INPUT_size = float2(GetInputSize());
static const float2 INPUT_pt = float2(GetInputPt());

// Sampling shorthand for the weight LUT.
#define ravu_lut4_tex(pos) (vec4(texture(ravu_lut4, pos)))

// Reads the single channel of the pass 1 intermediate texture.
#define ravu_int11_tex(pos) (float(texture(ravu_int11, pos).x))
// ravu_int11 is declared with the input's width and height, so its size is
// taken from GetInputSize() and its per-texel step is the reciprocal.
static const float2 ravu_int11_size = float2(GetInputSize().x, GetInputSize().y);
static const float2 ravu_int11_pt = float2(1.0 / (ravu_int11_size.x), 1.0 / (ravu_int11_size.y));

// HOOKED_* names used by the pass body are aliases of the INPUT_* equivalents.
#define HOOKED_tex(pos) INPUT_tex(pos)
#define HOOKED_size INPUT_size
#define HOOKED_pt INPUT_pt

void Pass2(uint2 blockStart, uint3 threadId) {
	ivec2 group_base = ivec2(gl_WorkGroupID) * ivec2(gl_WorkGroupSize);
	int local_pos = int(gl_LocalInvocationID.x) * 15 + int(gl_LocalInvocationID.y);

	// Load intermediate results from pass 1
	{
		for (int id = int(gl_LocalInvocationIndex); id < 585; id += int(gl_WorkGroupSize.x * gl_WorkGroupSize.y)) {
			uint x = (uint)id / 15, y = (uint)id % 15;
			inp0[id] =
				ravu_int11_tex(ravu_int11_pt * vec2(float(group_base.x + x) + (-3.5), float(group_base.y + y) + (-3.5)))
					.x;
		}
	}
	// Load original input samples
	{
		for (int id = int(gl_LocalInvocationIndex); id < 585; id += int(gl_WorkGroupSize.x * gl_WorkGroupSize.y)) {
			uint x = (uint)id / 15, y = (uint)id % 15;
			inp1[id] =
				HOOKED_tex(HOOKED_pt * vec2(float(group_base.x + x) + (-2.5), float(group_base.y + y) + (-2.5))).x;
		}
	}
	barrier();

#if CURRENT_PASS == LAST_PASS
	uint2 destPos = blockStart + threadId.xy * 2;
	uint2 outputSize = GetOutputSize();
	if (destPos.x >= outputSize.x || destPos.y >= outputSize.y) {
		return;
	}
#endif

	// Output pixel at position (0, 1)
	{
		float luma16 = inp0[local_pos + 18];
		float luma9 = inp0[local_pos + 19];
		float luma2 = inp0[local_pos + 20];
		float luma32 = inp0[local_pos + 32];
		float luma25 = inp0[local_pos + 33];
		float luma18 = inp0[local_pos + 34];
		float luma11 = inp0[local_pos + 35];
		float luma4 = inp0[local_pos + 36];
		float luma48 = inp0[local_pos + 46];
		float luma41 = inp0[local_pos + 47];
		float luma34 = inp0[local_pos + 48];
		float luma27 = inp0[local_pos + 49];
		float luma20 = inp0[local_pos + 50];
		float luma13 = inp0[local_pos + 51];
		float luma6 = inp0[local_pos + 52];
		float luma57 = inp0[local_pos + 61];
		float luma50 = inp0[local_pos + 62];
		float luma43 = inp0[local_pos + 63];
		float luma36 = inp0[local_pos + 64];
		float luma29 = inp0[local_pos + 65];
		float luma22 = inp0[local_pos + 66];
		float luma15 = inp0[local_pos + 67];
		float luma59 = inp0[local_pos + 77];
		float luma52 = inp0[local_pos + 78];
		float luma45 = inp0[local_pos + 79];
		float luma38 = inp0[local_pos + 80];
		float luma31 = inp0[local_pos + 81];
		float luma61 = inp0[local_pos + 93];
		float luma54 = inp0[local_pos + 94];
		float luma47 = inp0[local_pos + 95];
		float luma24 = inp1[local_pos + 17];
		float luma17 = inp1[local_pos + 18];
		float luma10 = inp1[local_pos + 19];
		float luma3 = inp1[local_pos + 20];
		float luma40 = inp1[local_pos + 31];
		float luma33 = inp1[local_pos + 32];
		float luma26 = inp1[local_pos + 33];
		float luma19 = inp1[local_pos + 34];
		float luma12 = inp1[local_pos + 35];
		float luma5 = inp1[local_pos + 36];
		float luma8 = inp1[local_pos + 3];
		float luma49 = inp1[local_pos + 46];
		float luma42 = inp1[local_pos + 47];
		float luma35 = inp1[local_pos + 48];
		float luma28 = inp1[local_pos + 49];
		float luma1 = inp1[local_pos + 4];
		float luma21 = inp1[local_pos + 50];
		float luma14 = inp1[local_pos + 51];
		float luma58 = inp1[local_pos + 61];
		float luma51 = inp1[local_pos + 62];
		float luma44 = inp1[local_pos + 63];
		float luma37 = inp1[local_pos + 64];
		float luma30 = inp1[local_pos + 65];
		float luma23 = inp1[local_pos + 66];
		float luma60 = inp1[local_pos + 77];
		float luma53 = inp1[local_pos + 78];
		float luma46 = inp1[local_pos + 79];
		float luma39 = inp1[local_pos + 80];
		float luma62 = inp1[local_pos + 93];
		float luma55 = inp1[local_pos + 94];
		vec3 abd = vec3(0.0, 0.0, 0.0);
		float gx, gy;
		gx = (luma17 - luma1) / 2.0;
		gy = (luma10 - luma8) / 2.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.011007348802298533;
		gx = (luma18 - luma2) / 2.0;
		gy = (-luma12 + 8.0 * luma11 - 8.0 * luma9 + luma8) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.018148050104365175;
		gx = (luma19 - luma3) / 2.0;
		gy = (-luma13 + 8.0 * luma12 - 8.0 * luma10 + luma9) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.0233025575973275;
		gx = (luma20 - luma4) / 2.0;
		gy = (-luma14 + 8.0 * luma13 - 8.0 * luma11 + luma10) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.0233025575973275;
		gx = (luma21 - luma5) / 2.0;
		gy = (-luma15 + 8.0 * luma14 - 8.0 * luma12 + luma11) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.018148050104365175;
		gx = (luma22 - luma6) / 2.0;
		gy = (luma15 - luma13) / 2.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.011007348802298533;
		gx = (-luma33 + 8.0 * luma25 - 8.0 * luma9 + luma1) / 12.0;
		gy = (luma18 - luma16) / 2.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.018148050104365175;
		gx = (-luma34 + 8.0 * luma26 - 8.0 * luma10 + luma2) / 12.0;
		gy = (-luma20 + 8.0 * luma19 - 8.0 * luma17 + luma16) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.02992107622879854;
		gx = (-luma35 + 8.0 * luma27 - 8.0 * luma11 + luma3) / 12.0;
		gy = (-luma21 + 8.0 * luma20 - 8.0 * luma18 + luma17) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.03841942237242872;
		gx = (-luma36 + 8.0 * luma28 - 8.0 * luma12 + luma4) / 12.0;
		gy = (-luma22 + 8.0 * luma21 - 8.0 * luma19 + luma18) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.03841942237242872;
		gx = (-luma37 + 8.0 * luma29 - 8.0 * luma13 + luma5) / 12.0;
		gy = (-luma23 + 8.0 * luma22 - 8.0 * luma20 + luma19) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.02992107622879854;
		gx = (-luma38 + 8.0 * luma30 - 8.0 * luma14 + luma6) / 12.0;
		gy = (luma23 - luma21) / 2.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.018148050104365175;
		gx = (-luma41 + 8.0 * luma33 - 8.0 * luma17 + luma9) / 12.0;
		gy = (luma26 - luma24) / 2.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.0233025575973275;
		gx = (-luma42 + 8.0 * luma34 - 8.0 * luma18 + luma10) / 12.0;
		gy = (-luma28 + 8.0 * luma27 - 8.0 * luma25 + luma24) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.03841942237242872;
		gx = (-luma43 + 8.0 * luma35 - 8.0 * luma19 + luma11) / 12.0;
		gy = (-luma29 + 8.0 * luma28 - 8.0 * luma26 + luma25) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.04933151482066013;
		gx = (-luma44 + 8.0 * luma36 - 8.0 * luma20 + luma12) / 12.0;
		gy = (-luma30 + 8.0 * luma29 - 8.0 * luma27 + luma26) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.04933151482066013;
		gx = (-luma45 + 8.0 * luma37 - 8.0 * luma21 + luma13) / 12.0;
		gy = (-luma31 + 8.0 * luma30 - 8.0 * luma28 + luma27) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.03841942237242872;
		gx = (-luma46 + 8.0 * luma38 - 8.0 * luma22 + luma14) / 12.0;
		gy = (luma31 - luma29) / 2.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.0233025575973275;
		gx = (-luma49 + 8.0 * luma41 - 8.0 * luma25 + luma17) / 12.0;
		gy = (luma34 - luma32) / 2.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.0233025575973275;
		gx = (-luma50 + 8.0 * luma42 - 8.0 * luma26 + luma18) / 12.0;
		gy = (-luma36 + 8.0 * luma35 - 8.0 * luma33 + luma32) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.03841942237242872;
		gx = (-luma51 + 8.0 * luma43 - 8.0 * luma27 + luma19) / 12.0;
		gy = (-luma37 + 8.0 * luma36 - 8.0 * luma34 + luma33) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.04933151482066013;
		gx = (-luma52 + 8.0 * luma44 - 8.0 * luma28 + luma20) / 12.0;
		gy = (-luma38 + 8.0 * luma37 - 8.0 * luma35 + luma34) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.04933151482066013;
		gx = (-luma53 + 8.0 * luma45 - 8.0 * luma29 + luma21) / 12.0;
		gy = (-luma39 + 8.0 * luma38 - 8.0 * luma36 + luma35) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.03841942237242872;
		gx = (-luma54 + 8.0 * luma46 - 8.0 * luma30 + luma22) / 12.0;
		gy = (luma39 - luma37) / 2.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.0233025575973275;
		gx = (-luma57 + 8.0 * luma49 - 8.0 * luma33 + luma25) / 12.0;
		gy = (luma42 - luma40) / 2.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.018148050104365175;
		gx = (-luma58 + 8.0 * luma50 - 8.0 * luma34 + luma26) / 12.0;
		gy = (-luma44 + 8.0 * luma43 - 8.0 * luma41 + luma40) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.02992107622879854;
		gx = (-luma59 + 8.0 * luma51 - 8.0 * luma35 + luma27) / 12.0;
		gy = (-luma45 + 8.0 * luma44 - 8.0 * luma42 + luma41) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.03841942237242872;
		gx = (-luma60 + 8.0 * luma52 - 8.0 * luma36 + luma28) / 12.0;
		gy = (-luma46 + 8.0 * luma45 - 8.0 * luma43 + luma42) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.03841942237242872;
		gx = (-luma61 + 8.0 * luma53 - 8.0 * luma37 + luma29) / 12.0;
		gy = (-luma47 + 8.0 * luma46 - 8.0 * luma44 + luma43) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.02992107622879854;
		gx = (-luma62 + 8.0 * luma54 - 8.0 * luma38 + luma30) / 12.0;
		gy = (luma47 - luma45) / 2.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.018148050104365175;
		gx = (luma57 - luma41) / 2.0;
		gy = (luma50 - luma48) / 2.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.011007348802298533;
		gx = (luma58 - luma42) / 2.0;
		gy = (-luma52 + 8.0 * luma51 - 8.0 * luma49 + luma48) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.018148050104365175;
		gx = (luma59 - luma43) / 2.0;
		gy = (-luma53 + 8.0 * luma52 - 8.0 * luma50 + luma49) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.0233025575973275;
		gx = (luma60 - luma44) / 2.0;
		gy = (-luma54 + 8.0 * luma53 - 8.0 * luma51 + luma50) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.0233025575973275;
		gx = (luma61 - luma45) / 2.0;
		gy = (-luma55 + 8.0 * luma54 - 8.0 * luma52 + luma51) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.018148050104365175;
		gx = (luma62 - luma46) / 2.0;
		gy = (luma55 - luma53) / 2.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.011007348802298533;
		float a = abd.x, b = abd.y, d = abd.z;
		float T = a + d, D = a * d - b * b;
		float delta = sqrt(max(T * T / 4.0 - D, 0.0));
		float L1 = T / 2.0 + delta, L2 = T / 2.0 - delta;
		float sqrtL1 = sqrt(L1), sqrtL2 = sqrt(L2);
		float theta = mix(mod(atan(L1 - a, b) + 3.141592653589793, 3.141592653589793), 0.0, abs(b) < 1.192092896e-7);
		float lambda = sqrtL1;
		float mu = mix((sqrtL1 - sqrtL2) / (sqrtL1 + sqrtL2), 0.0, sqrtL1 + sqrtL2 < 1.192092896e-7);
		float angle = floor(theta * 24.0 / 3.141592653589793);
		float strength = clamp(floor(log2(lambda * 2000.0 + 1.192092896e-7)), 0.0, 8.0);
		float coherence = mix(mix(0.0, 1.0, mu >= 0.25), 2.0, mu >= 0.5);
		float coord_y = ((angle * 9.0 + strength) * 3.0 + coherence + 0.5) / 648.0;
		float res = 0.0;
		vec4 w;
		w = texture(ravu_lut4, vec2(0.0625, coord_y));
		res += (inp0[local_pos + 4] + inp0[local_pos + 109]) * w[0];
		res += (inp1[local_pos + 4] + inp1[local_pos + 93]) * w[1];
		res += (inp0[local_pos + 20] + inp0[local_pos + 93]) * w[2];
		res += (inp1[local_pos + 20] + inp1[local_pos + 77]) * w[3];
		w = texture(ravu_lut4, vec2(0.1875, coord_y));
		res += (inp0[local_pos + 36] + inp0[local_pos + 77]) * w[0];
		res += (inp1[local_pos + 36] + inp1[local_pos + 61]) * w[1];
		res += (inp0[local_pos + 52] + inp0[local_pos + 61]) * w[2];
		res += (inp1[local_pos + 52] + inp1[local_pos + 45]) * w[3];
		w = texture(ravu_lut4, vec2(0.3125, coord_y));
		res += (inp1[local_pos + 3] + inp1[local_pos + 94]) * w[0];
		res += (inp0[local_pos + 19] + inp0[local_pos + 94]) * w[1];
		res += (inp1[local_pos + 19] + inp1[local_pos + 78]) * w[2];
		res += (inp0[local_pos + 35] + inp0[local_pos + 78]) * w[3];
		w = texture(ravu_lut4, vec2(0.4375, coord_y));
		res += (inp1[local_pos + 35] + inp1[local_pos + 62]) * w[0];
		res += (inp0[local_pos + 51] + inp0[local_pos + 62]) * w[1];
		res += (inp1[local_pos + 51] + inp1[local_pos + 46]) * w[2];
		res += (inp0[local_pos + 67] + inp0[local_pos + 46]) * w[3];
		w = texture(ravu_lut4, vec2(0.5625, coord_y));
		res += (inp0[local_pos + 18] + inp0[local_pos + 95]) * w[0];
		res += (inp1[local_pos + 18] + inp1[local_pos + 79]) * w[1];
		res += (inp0[local_pos + 34] + inp0[local_pos + 79]) * w[2];
		res += (inp1[local_pos + 34] + inp1[local_pos + 63]) * w[3];
		w = texture(ravu_lut4, vec2(0.6875, coord_y));
		res += (inp0[local_pos + 50] + inp0[local_pos + 63]) * w[0];
		res += (inp1[local_pos + 50] + inp1[local_pos + 47]) * w[1];
		res += (inp0[local_pos + 66] + inp0[local_pos + 47]) * w[2];
		res += (inp1[local_pos + 66] + inp1[local_pos + 31]) * w[3];
		w = texture(ravu_lut4, vec2(0.8125, coord_y));
		res += (inp1[local_pos + 17] + inp1[local_pos + 80]) * w[0];
		res += (inp0[local_pos + 33] + inp0[local_pos + 80]) * w[1];
		res += (inp1[local_pos + 33] + inp1[local_pos + 64]) * w[2];
		res += (inp0[local_pos + 49] + inp0[local_pos + 64]) * w[3];
		w = texture(ravu_lut4, vec2(0.9375, coord_y));
		res += (inp1[local_pos + 49] + inp1[local_pos + 48]) * w[0];
		res += (inp0[local_pos + 65] + inp0[local_pos + 48]) * w[1];
		res += (inp1[local_pos + 65] + inp1[local_pos + 32]) * w[2];
		res += (inp0[local_pos + 81] + inp0[local_pos + 32]) * w[3];
		res = clamp(res, 0.0, 1.0);
		imageStore(out_image, ivec2(gl_GlobalInvocationID) * 2 + ivec2(0, 1), res);
	}

	// Output pixel at position (1, 0)
	{
		float luma62 = inp0[local_pos + 108];
		float luma55 = inp0[local_pos + 109];
		float luma8 = inp0[local_pos + 18];
		float luma1 = inp0[local_pos + 19];
		float luma24 = inp0[local_pos + 32];
		float luma17 = inp0[local_pos + 33];
		float luma10 = inp0[local_pos + 34];
		float luma3 = inp0[local_pos + 35];
		float luma40 = inp0[local_pos + 46];
		float luma33 = inp0[local_pos + 47];
		float luma26 = inp0[local_pos + 48];
		float luma19 = inp0[local_pos + 49];
		float luma12 = inp0[local_pos + 50];
		float luma5 = inp0[local_pos + 51];
		float luma49 = inp0[local_pos + 61];
		float luma42 = inp0[local_pos + 62];
		float luma35 = inp0[local_pos + 63];
		float luma28 = inp0[local_pos + 64];
		float luma21 = inp0[local_pos + 65];
		float luma14 = inp0[local_pos + 66];
		float luma58 = inp0[local_pos + 76];
		float luma51 = inp0[local_pos + 77];
		float luma44 = inp0[local_pos + 78];
		float luma37 = inp0[local_pos + 79];
		float luma30 = inp0[local_pos + 80];
		float luma23 = inp0[local_pos + 81];
		float luma60 = inp0[local_pos + 92];
		float luma53 = inp0[local_pos + 93];
		float luma46 = inp0[local_pos + 94];
		float luma39 = inp0[local_pos + 95];
		float luma16 = inp1[local_pos + 17];
		float luma9 = inp1[local_pos + 18];
		float luma2 = inp1[local_pos + 19];
		float luma32 = inp1[local_pos + 31];
		float luma25 = inp1[local_pos + 32];
		float luma18 = inp1[local_pos + 33];
		float luma11 = inp1[local_pos + 34];
		float luma4 = inp1[local_pos + 35];
		float luma48 = inp1[local_pos + 45];
		float luma41 = inp1[local_pos + 46];
		float luma34 = inp1[local_pos + 47];
		float luma27 = inp1[local_pos + 48];
		float luma20 = inp1[local_pos + 49];
		float luma13 = inp1[local_pos + 50];
		float luma6 = inp1[local_pos + 51];
		float luma57 = inp1[local_pos + 60];
		float luma50 = inp1[local_pos + 61];
		float luma43 = inp1[local_pos + 62];
		float luma36 = inp1[local_pos + 63];
		float luma29 = inp1[local_pos + 64];
		float luma22 = inp1[local_pos + 65];
		float luma15 = inp1[local_pos + 66];
		float luma59 = inp1[local_pos + 76];
		float luma52 = inp1[local_pos + 77];
		float luma45 = inp1[local_pos + 78];
		float luma38 = inp1[local_pos + 79];
		float luma31 = inp1[local_pos + 80];
		float luma61 = inp1[local_pos + 92];
		float luma54 = inp1[local_pos + 93];
		float luma47 = inp1[local_pos + 94];
		vec3 abd = vec3(0.0, 0.0, 0.0);
		float gx, gy;
		gx = (luma17 - luma1) / 2.0;
		gy = (luma10 - luma8) / 2.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.011007348802298533;
		gx = (luma18 - luma2) / 2.0;
		gy = (-luma12 + 8.0 * luma11 - 8.0 * luma9 + luma8) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.018148050104365175;
		gx = (luma19 - luma3) / 2.0;
		gy = (-luma13 + 8.0 * luma12 - 8.0 * luma10 + luma9) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.0233025575973275;
		gx = (luma20 - luma4) / 2.0;
		gy = (-luma14 + 8.0 * luma13 - 8.0 * luma11 + luma10) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.0233025575973275;
		gx = (luma21 - luma5) / 2.0;
		gy = (-luma15 + 8.0 * luma14 - 8.0 * luma12 + luma11) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.018148050104365175;
		gx = (luma22 - luma6) / 2.0;
		gy = (luma15 - luma13) / 2.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.011007348802298533;
		gx = (-luma33 + 8.0 * luma25 - 8.0 * luma9 + luma1) / 12.0;
		gy = (luma18 - luma16) / 2.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.018148050104365175;
		gx = (-luma34 + 8.0 * luma26 - 8.0 * luma10 + luma2) / 12.0;
		gy = (-luma20 + 8.0 * luma19 - 8.0 * luma17 + luma16) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.02992107622879854;
		gx = (-luma35 + 8.0 * luma27 - 8.0 * luma11 + luma3) / 12.0;
		gy = (-luma21 + 8.0 * luma20 - 8.0 * luma18 + luma17) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.03841942237242872;
		gx = (-luma36 + 8.0 * luma28 - 8.0 * luma12 + luma4) / 12.0;
		gy = (-luma22 + 8.0 * luma21 - 8.0 * luma19 + luma18) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.03841942237242872;
		gx = (-luma37 + 8.0 * luma29 - 8.0 * luma13 + luma5) / 12.0;
		gy = (-luma23 + 8.0 * luma22 - 8.0 * luma20 + luma19) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.02992107622879854;
		gx = (-luma38 + 8.0 * luma30 - 8.0 * luma14 + luma6) / 12.0;
		gy = (luma23 - luma21) / 2.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.018148050104365175;
		gx = (-luma41 + 8.0 * luma33 - 8.0 * luma17 + luma9) / 12.0;
		gy = (luma26 - luma24) / 2.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.0233025575973275;
		gx = (-luma42 + 8.0 * luma34 - 8.0 * luma18 + luma10) / 12.0;
		gy = (-luma28 + 8.0 * luma27 - 8.0 * luma25 + luma24) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.03841942237242872;
		gx = (-luma43 + 8.0 * luma35 - 8.0 * luma19 + luma11) / 12.0;
		gy = (-luma29 + 8.0 * luma28 - 8.0 * luma26 + luma25) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.04933151482066013;
		gx = (-luma44 + 8.0 * luma36 - 8.0 * luma20 + luma12) / 12.0;
		gy = (-luma30 + 8.0 * luma29 - 8.0 * luma27 + luma26) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.04933151482066013;
		gx = (-luma45 + 8.0 * luma37 - 8.0 * luma21 + luma13) / 12.0;
		gy = (-luma31 + 8.0 * luma30 - 8.0 * luma28 + luma27) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.03841942237242872;
		gx = (-luma46 + 8.0 * luma38 - 8.0 * luma22 + luma14) / 12.0;
		gy = (luma31 - luma29) / 2.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.0233025575973275;
		gx = (-luma49 + 8.0 * luma41 - 8.0 * luma25 + luma17) / 12.0;
		gy = (luma34 - luma32) / 2.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.0233025575973275;
		gx = (-luma50 + 8.0 * luma42 - 8.0 * luma26 + luma18) / 12.0;
		gy = (-luma36 + 8.0 * luma35 - 8.0 * luma33 + luma32) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.03841942237242872;
		gx = (-luma51 + 8.0 * luma43 - 8.0 * luma27 + luma19) / 12.0;
		gy = (-luma37 + 8.0 * luma36 - 8.0 * luma34 + luma33) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.04933151482066013;
		gx = (-luma52 + 8.0 * luma44 - 8.0 * luma28 + luma20) / 12.0;
		gy = (-luma38 + 8.0 * luma37 - 8.0 * luma35 + luma34) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.04933151482066013;
		gx = (-luma53 + 8.0 * luma45 - 8.0 * luma29 + luma21) / 12.0;
		gy = (-luma39 + 8.0 * luma38 - 8.0 * luma36 + luma35) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.03841942237242872;
		gx = (-luma54 + 8.0 * luma46 - 8.0 * luma30 + luma22) / 12.0;
		gy = (luma39 - luma37) / 2.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.0233025575973275;
		gx = (-luma57 + 8.0 * luma49 - 8.0 * luma33 + luma25) / 12.0;
		gy = (luma42 - luma40) / 2.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.018148050104365175;
		gx = (-luma58 + 8.0 * luma50 - 8.0 * luma34 + luma26) / 12.0;
		gy = (-luma44 + 8.0 * luma43 - 8.0 * luma41 + luma40) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.02992107622879854;
		gx = (-luma59 + 8.0 * luma51 - 8.0 * luma35 + luma27) / 12.0;
		gy = (-luma45 + 8.0 * luma44 - 8.0 * luma42 + luma41) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.03841942237242872;
		gx = (-luma60 + 8.0 * luma52 - 8.0 * luma36 + luma28) / 12.0;
		gy = (-luma46 + 8.0 * luma45 - 8.0 * luma43 + luma42) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.03841942237242872;
		gx = (-luma61 + 8.0 * luma53 - 8.0 * luma37 + luma29) / 12.0;
		gy = (-luma47 + 8.0 * luma46 - 8.0 * luma44 + luma43) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.02992107622879854;
		gx = (-luma62 + 8.0 * luma54 - 8.0 * luma38 + luma30) / 12.0;
		gy = (luma47 - luma45) / 2.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.018148050104365175;
		gx = (luma57 - luma41) / 2.0;
		gy = (luma50 - luma48) / 2.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.011007348802298533;
		gx = (luma58 - luma42) / 2.0;
		gy = (-luma52 + 8.0 * luma51 - 8.0 * luma49 + luma48) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.018148050104365175;
		gx = (luma59 - luma43) / 2.0;
		gy = (-luma53 + 8.0 * luma52 - 8.0 * luma50 + luma49) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.0233025575973275;
		gx = (luma60 - luma44) / 2.0;
		gy = (-luma54 + 8.0 * luma53 - 8.0 * luma51 + luma50) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.0233025575973275;
		gx = (luma61 - luma45) / 2.0;
		gy = (-luma55 + 8.0 * luma54 - 8.0 * luma52 + luma51) / 12.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.018148050104365175;
		gx = (luma62 - luma46) / 2.0;
		gy = (luma55 - luma53) / 2.0;
		abd += vec3(gx * gx, gx * gy, gy * gy) * 0.011007348802298533;
		float a = abd.x, b = abd.y, d = abd.z;
		float T = a + d, D = a * d - b * b;
		float delta = sqrt(max(T * T / 4.0 - D, 0.0));
		float L1 = T / 2.0 + delta, L2 = T / 2.0 - delta;
		float sqrtL1 = sqrt(L1), sqrtL2 = sqrt(L2);
		float theta = mix(mod(atan(L1 - a, b) + 3.141592653589793, 3.141592653589793), 0.0, abs(b) < 1.192092896e-7);
		float lambda = sqrtL1;
		float mu = mix((sqrtL1 - sqrtL2) / (sqrtL1 + sqrtL2), 0.0, sqrtL1 + sqrtL2 < 1.192092896e-7);
		float angle = floor(theta * 24.0 / 3.141592653589793);
		float strength = clamp(floor(log2(lambda * 2000.0 + 1.192092896e-7)), 0.0, 8.0);
		float coherence = mix(mix(0.0, 1.0, mu >= 0.25), 2.0, mu >= 0.5);
		float coord_y = ((angle * 9.0 + strength) * 3.0 + coherence + 0.5) / 648.0;
		float res = 0.0;
		vec4 w;
		w = texture(ravu_lut4, vec2(0.0625, coord_y));
		res += (inp1[local_pos + 3] + inp1[local_pos + 108]) * w[0];
		res += (inp0[local_pos + 19] + inp0[local_pos + 108]) * w[1];
		res += (inp1[local_pos + 19] + inp1[local_pos + 92]) * w[2];
		res += (inp0[local_pos + 35] + inp0[local_pos + 92]) * w[3];
		w = texture(ravu_lut4, vec2(0.1875, coord_y));
		res += (inp1[local_pos + 35] + inp1[local_pos + 76]) * w[0];
		res += (inp0[local_pos + 51] + inp0[local_pos + 76]) * w[1];
		res += (inp1[local_pos + 51] + inp1[local_pos + 60]) * w[2];
		res += (inp0[local_pos + 67] + inp0[local_pos + 60]) * w[3];
		w = texture(ravu_lut4, vec2(0.3125, coord_y));
		res += (inp0[local_pos + 18] + inp0[local_pos + 109]) * w[0];
		res += (inp1[local_pos + 18] + inp1[local_pos + 93]) * w[1];
		res += (inp0[local_pos + 34] + inp0[local_pos + 93]) * w[2];
		res += (inp1[local_pos + 34] + inp1[local_pos + 77]) * w[3];
		w = texture(ravu_lut4, vec2(0.4375, coord_y));
		res += (inp0[local_pos + 50] + inp0[local_pos + 77]) * w[0];
		res += (inp1[local_pos + 50] + inp1[local_pos + 61]) * w[1];
		res += (inp0[local_pos + 66] + inp0[local_pos + 61]) * w[2];
		res += (inp1[local_pos + 66] + inp1[local_pos + 45]) * w[3];
		w = texture(ravu_lut4, vec2(0.5625, coord_y));
		res += (inp1[local_pos + 17] + inp1[local_pos + 94]) * w[0];
		res += (inp0[local_pos + 33] + inp0[local_pos + 94]) * w[1];
		res += (inp1[local_pos + 33] + inp1[local_pos + 78]) * w[2];
		res += (inp0[local_pos + 49] + inp0[local_pos + 78]) * w[3];
		w = texture(ravu_lut4, vec2(0.6875, coord_y));
		res += (inp1[local_pos + 49] + inp1[local_pos + 62]) * w[0];
		res += (inp0[local_pos + 65] + inp0[local_pos + 62]) * w[1];
		res += (inp1[local_pos + 65] + inp1[local_pos + 46]) * w[2];
		res += (inp0[local_pos + 81] + inp0[local_pos + 46]) * w[3];
		w = texture(ravu_lut4, vec2(0.8125, coord_y));
		res += (inp0[local_pos + 32] + inp0[local_pos + 95]) * w[0];
		res += (inp1[local_pos + 32] + inp1[local_pos + 79]) * w[1];
		res += (inp0[local_pos + 48] + inp0[local_pos + 79]) * w[2];
		res += (inp1[local_pos + 48] + inp1[local_pos + 63]) * w[3];
		w = texture(ravu_lut4, vec2(0.9375, coord_y));
		res += (inp0[local_pos + 64] + inp0[local_pos + 63]) * w[0];
		res += (inp1[local_pos + 64] + inp1[local_pos + 47]) * w[1];
		res += (inp0[local_pos + 80] + inp0[local_pos + 47]) * w[2];
		res += (inp1[local_pos + 80] + inp1[local_pos + 31]) * w[3];
		res = clamp(res, 0.0, 1.0);
		imageStore(out_image, ivec2(gl_GlobalInvocationID) * 2 + ivec2(1, 0), res);
	}

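	// The two remaining output pixels of this invocation are written without
	// interpolation: offset (1, 1) takes inp0[local_pos + 64] and offset (0, 0)
	// takes inp1[local_pos + 48].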
	float res;
	res = inp0[local_pos + 64];
	imageStore(out_image, ivec2(gl_GlobalInvocationID) * 2 + ivec2(1, 1), res);
	res = inp1[local_pos + 48];
	imageStore(out_image, ivec2(gl_GlobalInvocationID) * 2 + ivec2(0, 0), res);
}
