// RAVU 3x Upscaling Shader (R4 Variant, Luma)
// Rapid and Accurate Video Upscaling - Triple scale version
// Edge-adaptive upscaler using radius 4 kernel for 3x scaling

//!BGFX EFFECT
//!VERSION 1
//!NAME RAVU 3x R4
//!CATEGORY Upscaling
//!DESCRIPTION 3x edge-adaptive upscaler with radius 4 kernel. Processes luminance channel only. Highest quality preset.

//!TEXTURE
// Source image. Read by Pass1 (tile load) and by imageStoreOverride (chroma fetch).
Texture2D INPUT;

//!SAMPLER
//!FILTER POINT
// Point-filtered sampler. Not referenced by name in this file; presumably picked up
// through the texture() macro from prescalers.hlsli when sampling INPUT - TODO confirm.
SamplerState sam_INPUT;

//!TEXTURE
//!WIDTH  INPUT_WIDTH * 3
//!HEIGHT INPUT_HEIGHT * 3
// Destination image, three times the input size on each axis (see WIDTH/HEIGHT above).
// Written by imageStoreOverride.
Texture2D OUTPUT;

//!SAMPLER
//!FILTER LINEAR
// Linear-filtered sampler used by imageStoreOverride to fetch the source colour
// at each output pixel position.
SamplerState sam_INPUT_LINEAR;

//!TEXTURE
//!SOURCE ravu_3x_lut4_f16.dds
//!FORMAT R16G16B16A16_FLOAT
// Kernel weight lookup table, loaded from the DDS file named above.
// Pass1 samples it at 50 fixed x positions and a row chosen per pixel (coord_y).
Texture2D ravu_3x_lut4;

//!SAMPLER
//!FILTER LINEAR
// Sampler for the lookup table. Not referenced by name in this file; presumably
// paired with ravu_3x_lut4 by the texture() macro - TODO confirm.
SamplerState sam_ravu_3x_lut4;

//!COMMON
// Shared helpers and GLSL-compatibility names used below (vec2/vec3/vec4, mix, texture,
// gl_* identifiers, rgb2y, rgb2uv, yuv2rgb, HOOKED_map, ...) are expected to come from here.
#include "prescalers.hlsli"

// Index of the final pass. Pass1 compares CURRENT_PASS against this to decide whether
// to compile in its output bounds check.
#define LAST_PASS 1

//!PASS 1
//!DESC RAVU-3x Upscale (luma, r4)
//!IN INPUT, ravu_3x_lut4
//!OUT OUTPUT
//!BLOCK_SIZE 96, 24
//!NUM_THREADS 32, 8
// Tile of luma samples staged by Pass1 before filtering: 38 x 14 = 532 entries,
// i.e. the 32 x 8 thread group plus a 6-sample apron for the 7x7 window.
// Laid out column-major: index = x * 14 + y.
shared float inp[532];

// Index of this pass; compared against LAST_PASS inside Pass1.
#define CURRENT_PASS 1

// Collapses a colour to one scalar by projecting its rgb onto rgb2y
// (rgb2y is defined outside this file; presumably the luma weights).
#define GET_SAMPLE(x) dot(x.rgb, rgb2y)
// Redirects GLSL-style imageStore() calls to imageStoreOverride; the image argument is
// discarded and only the .x component of the value is forwarded.
#define imageStore(out_image, pos, val) imageStoreOverride(pos, val.x)
// Writes one output pixel: combines the supplied luma with chroma taken from the
// source image and stores the result as opaque RGB.
//   pos   - destination pixel coordinate in OUTPUT
//   value - luma for that pixel
void imageStoreOverride(uint2 pos, float value) {
	// Fetch the source colour at the position corresponding to this output pixel.
	const float3 sourceRgb = INPUT.SampleLevel(sam_INPUT_LINEAR, HOOKED_map(pos), 0).rgb;
	// Keep only its chroma; the luma channel is replaced by the upscaled value.
	const float2 chroma = mul(rgb2uv, sourceRgb);
	const float3 yuv = float3(value, chroma);
	OUTPUT[pos] = float4(mul(yuv2rgb, yuv), 1.0);
}

// Samples INPUT at normalized position `pos` and reduces the colour to a scalar via GET_SAMPLE.
#define INPUT_tex(pos) GET_SAMPLE(vec4(texture(INPUT, pos)))
// Input dimensions and their per-pixel step, as reported by GetInputSize()/GetInputPt()
// (helpers defined outside this file; INPUT_pt is presumably 1 / INPUT_size - TODO confirm).
static const float2 INPUT_size = float2(GetInputSize());
static const float2 INPUT_pt = float2(GetInputPt());

// Samples the weight lookup table. Not used in the code visible here: Pass1 calls
// texture(ravu_3x_lut4, ...) directly.
#define ravu_3x_lut4_tex(pos) (vec4(texture(ravu_3x_lut4, pos)))

// HOOKED_* aliases map the names used by the pass body onto the INPUT texture.
#define HOOKED_tex(pos) INPUT_tex(pos)
#define HOOKED_size INPUT_size
#define HOOKED_pt INPUT_pt

// RAVU 3x luma pass. Each thread handles one input pixel and writes the 3x3 block of
// output pixels that replaces it.
//
//   blockStart - output-space origin of the block handled by this thread group
//   threadId   - thread index within the group
//
// Steps:
//   1. The group cooperatively stages a 38 x 14 tile of luma into `inp`.
//   2. Each thread reads its 7x7 window (luma0..luma48, index = column * 7 + row).
//   3. A weighted structure tensor over the inner 5x5 gives edge angle, strength and coherence.
//   4. Those three values select a row of the weight lookup table.
//   5. The window is convolved with that row's weights to produce the 8 non-centre outputs;
//      the centre output is the original sample.
void Pass1(uint2 blockStart, uint3 threadId) {
	ivec2 group_base = ivec2(gl_WorkGroupID) * ivec2(gl_WorkGroupSize);
	// Top-left corner of this thread's 7x7 window inside the tile (stride 14 per column).
	int local_pos = int(gl_LocalInvocationID.x) * 14 + int(gl_LocalInvocationID.y);
	// Cooperative tile load: every thread fetches entries id, id + groupSize, ... until all 532 are filled.
	for (int id = int(gl_LocalInvocationIndex); id < 532; id += int(gl_WorkGroupSize.x * gl_WorkGroupSize.y)) {
		uint x = (uint)id / 14, y = (uint)id % 14;
		// -2.5 = -3 (window reaches 3 samples before the thread's own pixel) + 0.5
		// (presumably the texel-centre offset, given HOOKED_pt is a per-pixel step).
		inp[id] = HOOKED_tex(HOOKED_pt * vec2(float(group_base.x + x) + (-2.5), float(group_base.y + y) + (-2.5))).x;
	}
	// All tile writes must be complete before any thread reads its window.
	barrier();
#if CURRENT_PASS == LAST_PASS
	// Skip threads whose 3x3 block starts outside the output. This comes after the barrier
	// so that every thread still takes part in the tile load.
	uint2 destPos = blockStart + threadId.xy * 3;
	uint2 outputSize = GetOutputSize();
	if (destPos.x >= outputSize.x || destPos.y >= outputSize.y) {
		return;
	}
#endif

	// Sample luminance values from shared memory (7x7 kernel)
	float luma0 = inp[local_pos + 0];
	float luma1 = inp[local_pos + 1];
	float luma2 = inp[local_pos + 2];
	float luma3 = inp[local_pos + 3];
	float luma4 = inp[local_pos + 4];
	float luma5 = inp[local_pos + 5];
	float luma6 = inp[local_pos + 6];
	float luma7 = inp[local_pos + 14];
	float luma8 = inp[local_pos + 15];
	float luma9 = inp[local_pos + 16];
	float luma10 = inp[local_pos + 17];
	float luma11 = inp[local_pos + 18];
	float luma12 = inp[local_pos + 19];
	float luma13 = inp[local_pos + 20];
	float luma14 = inp[local_pos + 28];
	float luma15 = inp[local_pos + 29];
	float luma16 = inp[local_pos + 30];
	float luma17 = inp[local_pos + 31];
	float luma18 = inp[local_pos + 32];
	float luma19 = inp[local_pos + 33];
	float luma20 = inp[local_pos + 34];
	float luma21 = inp[local_pos + 42];
	float luma22 = inp[local_pos + 43];
	float luma23 = inp[local_pos + 44];
	float luma24 = inp[local_pos + 45];
	float luma25 = inp[local_pos + 46];
	float luma26 = inp[local_pos + 47];
	float luma27 = inp[local_pos + 48];
	float luma28 = inp[local_pos + 56];
	float luma29 = inp[local_pos + 57];
	float luma30 = inp[local_pos + 58];
	float luma31 = inp[local_pos + 59];
	float luma32 = inp[local_pos + 60];
	float luma33 = inp[local_pos + 61];
	float luma34 = inp[local_pos + 62];
	float luma35 = inp[local_pos + 70];
	float luma36 = inp[local_pos + 71];
	float luma37 = inp[local_pos + 72];
	float luma38 = inp[local_pos + 73];
	float luma39 = inp[local_pos + 74];
	float luma40 = inp[local_pos + 75];
	float luma41 = inp[local_pos + 76];
	float luma42 = inp[local_pos + 84];
	float luma43 = inp[local_pos + 85];
	float luma44 = inp[local_pos + 86];
	float luma45 = inp[local_pos + 87];
	float luma46 = inp[local_pos + 88];
	float luma47 = inp[local_pos + 89];
	float luma48 = inp[local_pos + 90];

	// Structure tensor computation for edge detection.
	// Central differences are taken at each of the inner 5x5 taps (gx across columns,
	// gy across rows) and accumulated as (gx^2, gx*gy, gy^2) with a fixed per-tap weight
	// that is largest at the window centre.
	vec3 abd = vec3(0.0, 0.0, 0.0);
	float gx, gy;
	gx = (luma15 - luma1) / 2.0;
	gy = (luma9 - luma7) / 2.0;
	abd += vec3(gx * gx, gx * gy, gy * gy) * 0.02324683987829437;
	gx = (luma16 - luma2) / 2.0;
	gy = (luma10 - luma8) / 2.0;
	abd += vec3(gx * gx, gx * gy, gy * gy) * 0.033823952439922346;
	gx = (luma17 - luma3) / 2.0;
	gy = (luma11 - luma9) / 2.0;
	abd += vec3(gx * gx, gx * gy, gy * gy) * 0.038327559383903906;
	gx = (luma18 - luma4) / 2.0;
	gy = (luma12 - luma10) / 2.0;
	abd += vec3(gx * gx, gx * gy, gy * gy) * 0.033823952439922346;
	gx = (luma19 - luma5) / 2.0;
	gy = (luma13 - luma11) / 2.0;
	abd += vec3(gx * gx, gx * gy, gy * gy) * 0.02324683987829437;
	gx = (luma22 - luma8) / 2.0;
	gy = (luma16 - luma14) / 2.0;
	abd += vec3(gx * gx, gx * gy, gy * gy) * 0.033823952439922346;
	gx = (luma23 - luma9) / 2.0;
	gy = (luma17 - luma15) / 2.0;
	abd += vec3(gx * gx, gx * gy, gy * gy) * 0.04921356040854137;
	gx = (luma24 - luma10) / 2.0;
	gy = (luma18 - luma16) / 2.0;
	abd += vec3(gx * gx, gx * gy, gy * gy) * 0.055766269846849466;
	gx = (luma25 - luma11) / 2.0;
	gy = (luma19 - luma17) / 2.0;
	abd += vec3(gx * gx, gx * gy, gy * gy) * 0.04921356040854137;
	gx = (luma26 - luma12) / 2.0;
	gy = (luma20 - luma18) / 2.0;
	abd += vec3(gx * gx, gx * gy, gy * gy) * 0.033823952439922346;
	gx = (luma29 - luma15) / 2.0;
	gy = (luma23 - luma21) / 2.0;
	abd += vec3(gx * gx, gx * gy, gy * gy) * 0.038327559383903906;
	gx = (luma30 - luma16) / 2.0;
	gy = (luma24 - luma22) / 2.0;
	abd += vec3(gx * gx, gx * gy, gy * gy) * 0.055766269846849466;
	gx = (luma31 - luma17) / 2.0;
	gy = (luma25 - luma23) / 2.0;
	abd += vec3(gx * gx, gx * gy, gy * gy) * 0.06319146241026467;
	gx = (luma32 - luma18) / 2.0;
	gy = (luma26 - luma24) / 2.0;
	abd += vec3(gx * gx, gx * gy, gy * gy) * 0.055766269846849466;
	gx = (luma33 - luma19) / 2.0;
	gy = (luma27 - luma25) / 2.0;
	abd += vec3(gx * gx, gx * gy, gy * gy) * 0.038327559383903906;
	gx = (luma36 - luma22) / 2.0;
	gy = (luma30 - luma28) / 2.0;
	abd += vec3(gx * gx, gx * gy, gy * gy) * 0.033823952439922346;
	gx = (luma37 - luma23) / 2.0;
	gy = (luma31 - luma29) / 2.0;
	abd += vec3(gx * gx, gx * gy, gy * gy) * 0.04921356040854137;
	gx = (luma38 - luma24) / 2.0;
	gy = (luma32 - luma30) / 2.0;
	abd += vec3(gx * gx, gx * gy, gy * gy) * 0.055766269846849466;
	gx = (luma39 - luma25) / 2.0;
	gy = (luma33 - luma31) / 2.0;
	abd += vec3(gx * gx, gx * gy, gy * gy) * 0.04921356040854137;
	gx = (luma40 - luma26) / 2.0;
	gy = (luma34 - luma32) / 2.0;
	abd += vec3(gx * gx, gx * gy, gy * gy) * 0.033823952439922346;
	gx = (luma43 - luma29) / 2.0;
	gy = (luma37 - luma35) / 2.0;
	abd += vec3(gx * gx, gx * gy, gy * gy) * 0.02324683987829437;
	gx = (luma44 - luma30) / 2.0;
	gy = (luma38 - luma36) / 2.0;
	abd += vec3(gx * gx, gx * gy, gy * gy) * 0.033823952439922346;
	gx = (luma45 - luma31) / 2.0;
	gy = (luma39 - luma37) / 2.0;
	abd += vec3(gx * gx, gx * gy, gy * gy) * 0.038327559383903906;
	gx = (luma46 - luma32) / 2.0;
	gy = (luma40 - luma38) / 2.0;
	abd += vec3(gx * gx, gx * gy, gy * gy) * 0.033823952439922346;
	gx = (luma47 - luma33) / 2.0;
	gy = (luma41 - luma39) / 2.0;
	abd += vec3(gx * gx, gx * gy, gy * gy) * 0.02324683987829437;

	// Eigenvalue decomposition of the 2x2 tensor [[a, b], [b, d]] via trace and determinant.
	float a = abd.x, b = abd.y, d = abd.z;
	float T = a + d, D = a * d - b * b;
	float delta = sqrt(max(T * T / 4.0 - D, 0.0));
	float L1 = T / 2.0 + delta, L2 = T / 2.0 - delta;
	// The smaller eigenvalue is non-negative in exact arithmetic, but rounding in
	// a * d - b * b (near zero for a perfectly straight edge) can leave L2 marginally
	// below zero. Clamp it so the square root cannot yield NaN and poison `mu`, which
	// would otherwise drop the most coherent edges into the lowest coherence bucket.
	float sqrtL1 = sqrt(L1), sqrtL2 = sqrt(max(L2, 0.0));
	// Dominant orientation folded into [0, pi); forced to 0 when the off-diagonal term vanishes.
	float theta = mix(mod(atan(L1 - a, b) + 3.141592653589793, 3.141592653589793), 0.0, abs(b) < 1.192092896e-7);
	float lambda = sqrtL1;
	// Anisotropy in [0, 1]; forced to 0 for a flat window to avoid dividing by ~0.
	float mu = mix((sqrtL1 - sqrtL2) / (sqrtL1 + sqrtL2), 0.0, sqrtL1 + sqrtL2 < 1.192092896e-7);

	// LUT coordinate calculation: 24 angle bins x 3 strength bins x 3 coherence bins = 216 rows.
	float angle = floor(theta * 24.0 / 3.141592653589793);
	// NOTE(review): the strength/coherence thresholds must match the ones the lookup table
	// was generated with - confirm against the LUT's source weights.
	float strength = mix(mix(0.0, 1.0, lambda >= 0.005), 2.0, lambda >= 0.02);
	float coherence = mix(mix(0.0, 1.0, mu >= 0.25), 2.0, mu >= 0.5);
	// +0.5 addresses the centre of the selected row.
	float coord_y = ((angle * 3.0 + strength) * 3.0 + coherence + 0.5) / 216.0;

	// Weighted sample accumulation from LUT.
	// Each row holds 50 texels, addressed at their centres (0.01, 0.03, ..., 0.99): two
	// vec4 weight sets (w0, w1) for each of the first 25 taps. Tap k and its point-mirror
	// 48 - k share one texel pair; the mirrored tap uses the other set with components
	// reversed (.wzyx). The centre tap (24) has no mirror.
	vec4 res0 = vec4(0.0, 0.0, 0.0, 0.0), res1 = vec4(0.0, 0.0, 0.0, 0.0);
	vec4 w0, w1;
	w0 = texture(ravu_3x_lut4, vec2(0.01, coord_y));
	w1 = texture(ravu_3x_lut4, vec2(0.03, coord_y));
	res0 += luma0 * w0 + luma48 * w1.wzyx;
	res1 += luma0 * w1 + luma48 * w0.wzyx;
	w0 = texture(ravu_3x_lut4, vec2(0.05, coord_y));
	w1 = texture(ravu_3x_lut4, vec2(0.07, coord_y));
	res0 += luma1 * w0 + luma47 * w1.wzyx;
	res1 += luma1 * w1 + luma47 * w0.wzyx;
	w0 = texture(ravu_3x_lut4, vec2(0.09, coord_y));
	w1 = texture(ravu_3x_lut4, vec2(0.11, coord_y));
	res0 += luma2 * w0 + luma46 * w1.wzyx;
	res1 += luma2 * w1 + luma46 * w0.wzyx;
	w0 = texture(ravu_3x_lut4, vec2(0.13, coord_y));
	w1 = texture(ravu_3x_lut4, vec2(0.15, coord_y));
	res0 += luma3 * w0 + luma45 * w1.wzyx;
	res1 += luma3 * w1 + luma45 * w0.wzyx;
	w0 = texture(ravu_3x_lut4, vec2(0.17, coord_y));
	w1 = texture(ravu_3x_lut4, vec2(0.19, coord_y));
	res0 += luma4 * w0 + luma44 * w1.wzyx;
	res1 += luma4 * w1 + luma44 * w0.wzyx;
	w0 = texture(ravu_3x_lut4, vec2(0.21, coord_y));
	w1 = texture(ravu_3x_lut4, vec2(0.23, coord_y));
	res0 += luma5 * w0 + luma43 * w1.wzyx;
	res1 += luma5 * w1 + luma43 * w0.wzyx;
	w0 = texture(ravu_3x_lut4, vec2(0.25, coord_y));
	w1 = texture(ravu_3x_lut4, vec2(0.27, coord_y));
	res0 += luma6 * w0 + luma42 * w1.wzyx;
	res1 += luma6 * w1 + luma42 * w0.wzyx;
	w0 = texture(ravu_3x_lut4, vec2(0.29, coord_y));
	w1 = texture(ravu_3x_lut4, vec2(0.31, coord_y));
	res0 += luma7 * w0 + luma41 * w1.wzyx;
	res1 += luma7 * w1 + luma41 * w0.wzyx;
	w0 = texture(ravu_3x_lut4, vec2(0.33, coord_y));
	w1 = texture(ravu_3x_lut4, vec2(0.35, coord_y));
	res0 += luma8 * w0 + luma40 * w1.wzyx;
	res1 += luma8 * w1 + luma40 * w0.wzyx;
	w0 = texture(ravu_3x_lut4, vec2(0.37, coord_y));
	w1 = texture(ravu_3x_lut4, vec2(0.39, coord_y));
	res0 += luma9 * w0 + luma39 * w1.wzyx;
	res1 += luma9 * w1 + luma39 * w0.wzyx;
	w0 = texture(ravu_3x_lut4, vec2(0.41, coord_y));
	w1 = texture(ravu_3x_lut4, vec2(0.43, coord_y));
	res0 += luma10 * w0 + luma38 * w1.wzyx;
	res1 += luma10 * w1 + luma38 * w0.wzyx;
	w0 = texture(ravu_3x_lut4, vec2(0.45, coord_y));
	w1 = texture(ravu_3x_lut4, vec2(0.47, coord_y));
	res0 += luma11 * w0 + luma37 * w1.wzyx;
	res1 += luma11 * w1 + luma37 * w0.wzyx;
	w0 = texture(ravu_3x_lut4, vec2(0.49, coord_y));
	w1 = texture(ravu_3x_lut4, vec2(0.51, coord_y));
	res0 += luma12 * w0 + luma36 * w1.wzyx;
	res1 += luma12 * w1 + luma36 * w0.wzyx;
	w0 = texture(ravu_3x_lut4, vec2(0.53, coord_y));
	w1 = texture(ravu_3x_lut4, vec2(0.55, coord_y));
	res0 += luma13 * w0 + luma35 * w1.wzyx;
	res1 += luma13 * w1 + luma35 * w0.wzyx;
	w0 = texture(ravu_3x_lut4, vec2(0.57, coord_y));
	w1 = texture(ravu_3x_lut4, vec2(0.59, coord_y));
	res0 += luma14 * w0 + luma34 * w1.wzyx;
	res1 += luma14 * w1 + luma34 * w0.wzyx;
	w0 = texture(ravu_3x_lut4, vec2(0.61, coord_y));
	w1 = texture(ravu_3x_lut4, vec2(0.63, coord_y));
	res0 += luma15 * w0 + luma33 * w1.wzyx;
	res1 += luma15 * w1 + luma33 * w0.wzyx;
	w0 = texture(ravu_3x_lut4, vec2(0.65, coord_y));
	w1 = texture(ravu_3x_lut4, vec2(0.67, coord_y));
	res0 += luma16 * w0 + luma32 * w1.wzyx;
	res1 += luma16 * w1 + luma32 * w0.wzyx;
	w0 = texture(ravu_3x_lut4, vec2(0.69, coord_y));
	w1 = texture(ravu_3x_lut4, vec2(0.71, coord_y));
	res0 += luma17 * w0 + luma31 * w1.wzyx;
	res1 += luma17 * w1 + luma31 * w0.wzyx;
	w0 = texture(ravu_3x_lut4, vec2(0.73, coord_y));
	w1 = texture(ravu_3x_lut4, vec2(0.75, coord_y));
	res0 += luma18 * w0 + luma30 * w1.wzyx;
	res1 += luma18 * w1 + luma30 * w0.wzyx;
	w0 = texture(ravu_3x_lut4, vec2(0.77, coord_y));
	w1 = texture(ravu_3x_lut4, vec2(0.79, coord_y));
	res0 += luma19 * w0 + luma29 * w1.wzyx;
	res1 += luma19 * w1 + luma29 * w0.wzyx;
	w0 = texture(ravu_3x_lut4, vec2(0.81, coord_y));
	w1 = texture(ravu_3x_lut4, vec2(0.83, coord_y));
	res0 += luma20 * w0 + luma28 * w1.wzyx;
	res1 += luma20 * w1 + luma28 * w0.wzyx;
	w0 = texture(ravu_3x_lut4, vec2(0.85, coord_y));
	w1 = texture(ravu_3x_lut4, vec2(0.87, coord_y));
	res0 += luma21 * w0 + luma27 * w1.wzyx;
	res1 += luma21 * w1 + luma27 * w0.wzyx;
	w0 = texture(ravu_3x_lut4, vec2(0.89, coord_y));
	w1 = texture(ravu_3x_lut4, vec2(0.91, coord_y));
	res0 += luma22 * w0 + luma26 * w1.wzyx;
	res1 += luma22 * w1 + luma26 * w0.wzyx;
	w0 = texture(ravu_3x_lut4, vec2(0.93, coord_y));
	w1 = texture(ravu_3x_lut4, vec2(0.95, coord_y));
	res0 += luma23 * w0 + luma25 * w1.wzyx;
	res1 += luma23 * w1 + luma25 * w0.wzyx;
	w0 = texture(ravu_3x_lut4, vec2(0.97, coord_y));
	w1 = texture(ravu_3x_lut4, vec2(0.99, coord_y));
	res0 += luma24 * w0;
	res1 += luma24 * w1;
	res0 = clamp(res0, 0.0, 1.0);
	res1 = clamp(res1, 0.0, 1.0);

	// Output 3x3 block: res0 supplies the first four positions, the original sample is
	// kept at the centre, and res1 supplies the last four.
	imageStore(out_image, ivec2(gl_GlobalInvocationID) * 3 + ivec2(0, 0), res0[0]);
	imageStore(out_image, ivec2(gl_GlobalInvocationID) * 3 + ivec2(0, 1), res0[1]);
	imageStore(out_image, ivec2(gl_GlobalInvocationID) * 3 + ivec2(0, 2), res0[2]);
	imageStore(out_image, ivec2(gl_GlobalInvocationID) * 3 + ivec2(1, 0), res0[3]);
	imageStore(out_image, ivec2(gl_GlobalInvocationID) * 3 + ivec2(1, 1), luma24);
	imageStore(out_image, ivec2(gl_GlobalInvocationID) * 3 + ivec2(1, 2), res1[0]);
	imageStore(out_image, ivec2(gl_GlobalInvocationID) * 3 + ivec2(2, 0), res1[1]);
	imageStore(out_image, ivec2(gl_GlobalInvocationID) * 3 + ivec2(2, 1), res1[2]);
	imageStore(out_image, ivec2(gl_GlobalInvocationID) * 3 + ivec2(2, 2), res1[3]);
}
