// RAVU 3x Upscaling Shader (R3 Variant, RGB)
// Rapid and Accurate Video Upscaling - Triple scale version
// Edge-adaptive upscaler using radius 3 kernel for 3x scaling, full RGB processing

//!BGFX EFFECT
//!VERSION 1
//!NAME RAVU 3x R3 RGB
//!CATEGORY Upscaling
//!DESCRIPTION 3x edge-adaptive upscaler with radius 3 kernel. Full RGB processing for better color accuracy. Medium quality preset.

// Source image for the effect; read by Pass 1 (listed in its //!IN directive).
//!TEXTURE
Texture2D INPUT;

// Point-filtered sampler. Which texture fetches use it is decided by the
// `texture` helper, which is not defined in this file (presumably prescalers.hlsli).
//!SAMPLER
//!FILTER POINT
SamplerState sam_INPUT;

// Destination image, three times the input size in each dimension.
//!TEXTURE
//!WIDTH  INPUT_WIDTH * 3
//!HEIGHT INPUT_HEIGHT * 3
Texture2D OUTPUT;

// Kernel weight lookup table loaded from a DDS file as four half-float channels.
// Pass 1 reads it at 26 distinct horizontal coordinates and 216 possible rows.
//!TEXTURE
//!SOURCE ravu_3x_lut3.dds
//!FORMAT R16G16B16A16_FLOAT
Texture2D ravu_3x_lut3;

// Linear-filtered sampler, named after the lookup table.
// NOTE(review): Pass 1 only requests coordinates of the form (i + 0.5) / N, i.e.
// what look like texel centres — confirm the LUT is 26 x 216 so that no blending occurs.
//!SAMPLER
//!FILTER LINEAR
SamplerState sam_ravu_3x_lut3;

//!COMMON
#include "prescalers.hlsli"

// Index of the final pass. Pass1 compares CURRENT_PASS against this value to
// decide whether to compile in its output bounds check.
#define LAST_PASS 1

//!PASS 1
//!DESC RAVU-3x Upscale (rgb, r3)
//!IN INPUT, ravu_3x_lut3
//!OUT OUTPUT
//!BLOCK_SIZE 96, 24
//!NUM_THREADS 32, 8
// RGB-to-luma weights: Pass1 takes dot(rgb, color_primary) to obtain the luma
// used for gradient estimation. The values match the Rec. 709 luma coefficients.
static const vec3 color_primary = vec3(0.2126, 0.7152, 0.0722);
// Outer product of a 4-component weight vector and a 3-component colour.
// The result is a 4x3 matrix whose row i equals r[i] * l, so each row is the
// colour scaled by one of the four weights.
float4x3 outerProduct(float3 l, float4 r) {
	const float4x1 weightColumn = float4x1(r);
	const float1x3 colourRow = float1x3(l);
	return mul(weightColumn, colourRow);
}
// Sample cache filled by the load loop at the top of Pass1 and read by every
// thread after the barrier. Entry id holds the sample at column id / 12 and
// row id % 12 of the cached tile (432 = 36 columns * 12 rows).
// `shared` is not an HLSL keyword; presumably prescalers.hlsli maps it to groupshared.
shared vec3 inp[432];
// Luma of the matching inp[] entry: dot(inp[id], color_primary).
shared float inp_luma[432];

// Index of this pass; compared against LAST_PASS inside Pass1.
#define CURRENT_PASS 1

// Identity hook applied to every fetched input sample.
#define GET_SAMPLE(x) x
// GLSL-style imageStore shim: the image argument is dropped and the write is
// always routed to OUTPUT through imageStoreOverride.
#define imageStore(out_image, pos, val) imageStoreOverride(pos, val)
// Writes one pixel of OUTPUT at integer position pos.
void imageStoreOverride(uint2 pos, float4 value) { OUTPUT[pos] = value; }

// Fetches INPUT at normalized coordinate pos. `texture` is not defined in this
// file (presumably provided by prescalers.hlsli).
#define INPUT_tex(pos) GET_SAMPLE(vec4(texture(INPUT, pos)))
// Input metrics obtained from GetInputSize() / GetInputPt(), which are defined
// elsewhere. Pass1 multiplies INPUT_pt by (texel index + 0.5) to form a fetch
// coordinate, so it is presumably the size of one input texel in UV units.
static const float2 INPUT_size = float2(GetInputSize());
static const float2 INPUT_pt = float2(GetInputPt());

// Fetch helper for the weight table. Not referenced by the visible code, which
// calls texture(ravu_3x_lut3, ...) directly.
#define ravu_3x_lut3_tex(pos) (vec4(texture(ravu_3x_lut3, pos)))

// mpv-style HOOKED_* names aliased to the INPUT texture.
#define HOOKED_tex(pos) INPUT_tex(pos)
#define HOOKED_size INPUT_size
#define HOOKED_pt INPUT_pt

// Pass 1: each thread produces one 3x3 block of OUTPUT pixels from a 5x5
// neighbourhood of cached INPUT samples.
//
// Steps, in order:
//   1. The thread group cooperatively loads 432 INPUT samples and their luma
//      into inp[] / inp_luma[] (12 entries per column, index = x * 12 + y).
//   2. A weighted 2x2 structure tensor is accumulated from central-difference
//      luma gradients at the nine inner samples of the neighbourhood.
//   3. The tensor's eigen-decomposition yields an angle, a strength and a
//      coherence, each quantised (24 x 3 x 3 = 216 combinations) to pick a LUT row.
//   4. 26 LUT texels of that row give four weights each; eight output pixels
//      are weighted sums over the 25 samples, and the centre output pixel is
//      the centre sample copied unchanged.
//
// blockStart / threadId: supplied by the host framework. Here they are used
// only for the output bounds check; all other indexing goes through the
// gl_* identifiers, which are not defined in this file (presumably
// prescalers.hlsli maps them onto these parameters).
void Pass1(uint2 blockStart, uint3 threadId) {
	ivec2 group_base = ivec2(gl_WorkGroupID) * ivec2(gl_WorkGroupSize);
	// Index of this thread's top-left neighbourhood sample inside inp[].
	int local_pos = int(gl_LocalInvocationID.x) * 12 + int(gl_LocalInvocationID.y);
	// Cooperative load: threads stride through the 432 cache entries together.
	for (int id = int(gl_LocalInvocationIndex); id < 432; id += int(gl_WorkGroupSize.x * gl_WorkGroupSize.y)) {
		uint x = (uint)id / 12, y = (uint)id % 12;
		// -1.5 = -2 texels of border + 0.5 to address the texel centre.
		inp[id] = HOOKED_tex(HOOKED_pt * vec2(float(group_base.x + x) + (-1.5), float(group_base.y + y) + (-1.5))).xyz;
		inp_luma[id] = dot(inp[id], color_primary);
	}
	barrier();
#if CURRENT_PASS == LAST_PASS
	// Threads whose block starts outside the output image exit here. This is
	// placed after the barrier so every thread takes part in the load above.
	uint2 destPos = blockStart + threadId.xy * 3;
	uint2 outputSize = GetOutputSize();
	if (destPos.x >= outputSize.x || destPos.y >= outputSize.y) {
		return;
	}
#endif

	// Sample luminance values for edge detection.
	// lumaN is neighbourhood sample N, stored at local_pos + (N / 5) * 12 + N % 5;
	// the four corners and the centre are not needed by the gradients below.
	float luma1 = inp_luma[local_pos + 1];
	float luma2 = inp_luma[local_pos + 2];
	float luma3 = inp_luma[local_pos + 3];
	float luma5 = inp_luma[local_pos + 12];
	float luma6 = inp_luma[local_pos + 13];
	float luma7 = inp_luma[local_pos + 14];
	float luma8 = inp_luma[local_pos + 15];
	float luma9 = inp_luma[local_pos + 16];
	float luma10 = inp_luma[local_pos + 24];
	float luma11 = inp_luma[local_pos + 25];
	float luma12 = inp_luma[local_pos + 26];
	float luma13 = inp_luma[local_pos + 27];
	float luma14 = inp_luma[local_pos + 28];
	float luma15 = inp_luma[local_pos + 36];
	float luma16 = inp_luma[local_pos + 37];
	float luma17 = inp_luma[local_pos + 38];
	float luma18 = inp_luma[local_pos + 39];
	float luma19 = inp_luma[local_pos + 40];
	float luma21 = inp_luma[local_pos + 49];
	float luma22 = inp_luma[local_pos + 50];
	float luma23 = inp_luma[local_pos + 51];

	// Structure tensor computation for edge detection.
	// For each inner sample N: gx = (luma[N + 5] - luma[N - 5]) / 2 and
	// gy = (luma[N + 1] - luma[N - 1]) / 2, accumulated as (gx^2, gx*gy, gy^2)
	// with a per-sample weight that is largest at the centre sample (N = 12).
	vec3 abd = vec3(0.0, 0.0, 0.0);
	float gx, gy;
	gx = (luma11 - luma1) / 2.0;
	gy = (luma7 - luma5) / 2.0;
	abd += vec3(gx * gx, gx * gy, gy * gy) * 0.1018680644198163;
	gx = (luma12 - luma2) / 2.0;
	gy = (luma8 - luma6) / 2.0;
	abd += vec3(gx * gx, gx * gy, gy * gy) * 0.11543163961422666;
	gx = (luma13 - luma3) / 2.0;
	gy = (luma9 - luma7) / 2.0;
	abd += vec3(gx * gx, gx * gy, gy * gy) * 0.1018680644198163;
	gx = (luma16 - luma6) / 2.0;
	gy = (luma12 - luma10) / 2.0;
	abd += vec3(gx * gx, gx * gy, gy * gy) * 0.11543163961422666;
	gx = (luma17 - luma7) / 2.0;
	gy = (luma13 - luma11) / 2.0;
	abd += vec3(gx * gx, gx * gy, gy * gy) * 0.13080118386382833;
	gx = (luma18 - luma8) / 2.0;
	gy = (luma14 - luma12) / 2.0;
	abd += vec3(gx * gx, gx * gy, gy * gy) * 0.11543163961422666;
	gx = (luma21 - luma11) / 2.0;
	gy = (luma17 - luma15) / 2.0;
	abd += vec3(gx * gx, gx * gy, gy * gy) * 0.1018680644198163;
	gx = (luma22 - luma12) / 2.0;
	gy = (luma18 - luma16) / 2.0;
	abd += vec3(gx * gx, gx * gy, gy * gy) * 0.11543163961422666;
	gx = (luma23 - luma13) / 2.0;
	gy = (luma19 - luma17) / 2.0;
	abd += vec3(gx * gx, gx * gy, gy * gy) * 0.1018680644198163;

	// Eigenvalue decomposition of the symmetric tensor [[a, b], [b, d]]:
	// T is its trace, D its determinant, L1 >= L2 its eigenvalues.
	float a = abd.x, b = abd.y, d = abd.z;
	float T = a + d, D = a * d - b * b;
	float delta = sqrt(max(T * T / 4.0 - D, 0.0));
	float L1 = T / 2.0 + delta, L2 = T / 2.0 - delta;
	// L2 is non-negative in exact arithmetic, but rounding can push it slightly
	// below zero when the determinant is (nearly) zero, i.e. for a perfectly
	// coherent edge. sqrt of a negative value is NaN, which would make both
	// `mu >= ...` tests below false and select coherence 0 for exactly the
	// edges that should get the highest bucket, so clamp before the root.
	float sqrtL1 = sqrt(L1), sqrtL2 = sqrt(max(L2, 0.0));
	// The mix(x, y, condition) calls select y when the condition holds (GLSL
	// semantics; assumes prescalers.hlsli provides the boolean overload).
	// theta is wrapped into [0, pi) and forced to 0 when b is negligible.
	float theta = mix(mod(atan(L1 - a, b) + 3.141592653589793, 3.141592653589793), 0.0, abs(b) < 1.192092896e-7);
	float lambda = sqrtL1;
	// Relative eigenvalue spread; forced to 0 when both roots are negligible to
	// avoid dividing by (almost) zero.
	float mu = mix((sqrtL1 - sqrtL2) / (sqrtL1 + sqrtL2), 0.0, sqrtL1 + sqrtL2 < 1.192092896e-7);

	// LUT coordinate calculation: 24 angle buckets x 3 strength buckets x
	// 3 coherence buckets = 216 rows, addressed at (row + 0.5) / 216.
	float angle = floor(theta * 24.0 / 3.141592653589793);
	float strength = mix(mix(0.0, 1.0, lambda >= 0.005), 2.0, lambda >= 0.02);
	float coherence = mix(mix(0.0, 1.0, mu >= 0.25), 2.0, mu >= 0.5);
	float coord_y = ((angle * 3.0 + strength) * 3.0 + coherence + 0.5) / 216.0;

	// Weighted sample accumulation from LUT (RGB processing).
	// The horizontal coordinates are (2k + 1) / 52 for k = 0..25. Each pair of
	// fetches (w0, w1) carries the eight weights of one sample; the sample at
	// the mirrored neighbourhood position (offset 52 - offset) reuses the same
	// pair with the two vectors swapped and their components reversed. The
	// centre sample (offset 26) is its own mirror and is added once.
	mat4x3 res0 = 0.0, res1 = 0.0;
	vec4 w0, w1;
	w0 = texture(ravu_3x_lut3, vec2(0.019230769230769232, coord_y));
	w1 = texture(ravu_3x_lut3, vec2(0.057692307692307696, coord_y));
	res0 += outerProduct(inp[local_pos + 0], w0) + outerProduct(inp[local_pos + 52], w1.wzyx);
	res1 += outerProduct(inp[local_pos + 0], w1) + outerProduct(inp[local_pos + 52], w0.wzyx);
	w0 = texture(ravu_3x_lut3, vec2(0.09615384615384616, coord_y));
	w1 = texture(ravu_3x_lut3, vec2(0.1346153846153846, coord_y));
	res0 += outerProduct(inp[local_pos + 1], w0) + outerProduct(inp[local_pos + 51], w1.wzyx);
	res1 += outerProduct(inp[local_pos + 1], w1) + outerProduct(inp[local_pos + 51], w0.wzyx);
	w0 = texture(ravu_3x_lut3, vec2(0.17307692307692307, coord_y));
	w1 = texture(ravu_3x_lut3, vec2(0.21153846153846154, coord_y));
	res0 += outerProduct(inp[local_pos + 2], w0) + outerProduct(inp[local_pos + 50], w1.wzyx);
	res1 += outerProduct(inp[local_pos + 2], w1) + outerProduct(inp[local_pos + 50], w0.wzyx);
	w0 = texture(ravu_3x_lut3, vec2(0.25, coord_y));
	w1 = texture(ravu_3x_lut3, vec2(0.28846153846153844, coord_y));
	res0 += outerProduct(inp[local_pos + 3], w0) + outerProduct(inp[local_pos + 49], w1.wzyx);
	res1 += outerProduct(inp[local_pos + 3], w1) + outerProduct(inp[local_pos + 49], w0.wzyx);
	w0 = texture(ravu_3x_lut3, vec2(0.3269230769230769, coord_y));
	w1 = texture(ravu_3x_lut3, vec2(0.36538461538461536, coord_y));
	res0 += outerProduct(inp[local_pos + 4], w0) + outerProduct(inp[local_pos + 48], w1.wzyx);
	res1 += outerProduct(inp[local_pos + 4], w1) + outerProduct(inp[local_pos + 48], w0.wzyx);
	w0 = texture(ravu_3x_lut3, vec2(0.40384615384615385, coord_y));
	w1 = texture(ravu_3x_lut3, vec2(0.4423076923076923, coord_y));
	res0 += outerProduct(inp[local_pos + 12], w0) + outerProduct(inp[local_pos + 40], w1.wzyx);
	res1 += outerProduct(inp[local_pos + 12], w1) + outerProduct(inp[local_pos + 40], w0.wzyx);
	w0 = texture(ravu_3x_lut3, vec2(0.4807692307692308, coord_y));
	w1 = texture(ravu_3x_lut3, vec2(0.5192307692307693, coord_y));
	res0 += outerProduct(inp[local_pos + 13], w0) + outerProduct(inp[local_pos + 39], w1.wzyx);
	res1 += outerProduct(inp[local_pos + 13], w1) + outerProduct(inp[local_pos + 39], w0.wzyx);
	w0 = texture(ravu_3x_lut3, vec2(0.5576923076923077, coord_y));
	w1 = texture(ravu_3x_lut3, vec2(0.5961538461538461, coord_y));
	res0 += outerProduct(inp[local_pos + 14], w0) + outerProduct(inp[local_pos + 38], w1.wzyx);
	res1 += outerProduct(inp[local_pos + 14], w1) + outerProduct(inp[local_pos + 38], w0.wzyx);
	w0 = texture(ravu_3x_lut3, vec2(0.6346153846153846, coord_y));
	w1 = texture(ravu_3x_lut3, vec2(0.6730769230769231, coord_y));
	res0 += outerProduct(inp[local_pos + 15], w0) + outerProduct(inp[local_pos + 37], w1.wzyx);
	res1 += outerProduct(inp[local_pos + 15], w1) + outerProduct(inp[local_pos + 37], w0.wzyx);
	w0 = texture(ravu_3x_lut3, vec2(0.7115384615384616, coord_y));
	w1 = texture(ravu_3x_lut3, vec2(0.75, coord_y));
	res0 += outerProduct(inp[local_pos + 16], w0) + outerProduct(inp[local_pos + 36], w1.wzyx);
	res1 += outerProduct(inp[local_pos + 16], w1) + outerProduct(inp[local_pos + 36], w0.wzyx);
	w0 = texture(ravu_3x_lut3, vec2(0.7884615384615384, coord_y));
	w1 = texture(ravu_3x_lut3, vec2(0.8269230769230769, coord_y));
	res0 += outerProduct(inp[local_pos + 24], w0) + outerProduct(inp[local_pos + 28], w1.wzyx);
	res1 += outerProduct(inp[local_pos + 24], w1) + outerProduct(inp[local_pos + 28], w0.wzyx);
	w0 = texture(ravu_3x_lut3, vec2(0.8653846153846154, coord_y));
	w1 = texture(ravu_3x_lut3, vec2(0.9038461538461539, coord_y));
	res0 += outerProduct(inp[local_pos + 25], w0) + outerProduct(inp[local_pos + 27], w1.wzyx);
	res1 += outerProduct(inp[local_pos + 25], w1) + outerProduct(inp[local_pos + 27], w0.wzyx);
	w0 = texture(ravu_3x_lut3, vec2(0.9423076923076923, coord_y));
	w1 = texture(ravu_3x_lut3, vec2(0.9807692307692307, coord_y));
	res0 += outerProduct(inp[local_pos + 26], w0);
	res1 += outerProduct(inp[local_pos + 26], w1);
	// Keep every interpolated colour inside [0, 1].
	res0[0] = clamp(res0[0], 0.0, 1.0);
	res0[1] = clamp(res0[1], 0.0, 1.0);
	res0[2] = clamp(res0[2], 0.0, 1.0);
	res0[3] = clamp(res0[3], 0.0, 1.0);
	res1[0] = clamp(res1[0], 0.0, 1.0);
	res1[1] = clamp(res1[1], 0.0, 1.0);
	res1[2] = clamp(res1[2], 0.0, 1.0);
	res1[3] = clamp(res1[3], 0.0, 1.0);

	// Output 3x3 block: res0 rows fill the first four positions, the centre
	// position (1, 1) is the unfiltered centre sample, res1 rows fill the rest.
	imageStore(out_image, ivec2(gl_GlobalInvocationID) * 3 + ivec2(0, 0), vec4(res0[0], 1.0));
	imageStore(out_image, ivec2(gl_GlobalInvocationID) * 3 + ivec2(0, 1), vec4(res0[1], 1.0));
	imageStore(out_image, ivec2(gl_GlobalInvocationID) * 3 + ivec2(0, 2), vec4(res0[2], 1.0));
	imageStore(out_image, ivec2(gl_GlobalInvocationID) * 3 + ivec2(1, 0), vec4(res0[3], 1.0));
	imageStore(out_image, ivec2(gl_GlobalInvocationID) * 3 + ivec2(1, 1), vec4(inp[local_pos + 26], 1.0));
	imageStore(out_image, ivec2(gl_GlobalInvocationID) * 3 + ivec2(1, 2), vec4(res1[0], 1.0));
	imageStore(out_image, ivec2(gl_GlobalInvocationID) * 3 + ivec2(2, 0), vec4(res1[1], 1.0));
	imageStore(out_image, ivec2(gl_GlobalInvocationID) * 3 + ivec2(2, 1), vec4(res1[2], 1.0));
	imageStore(out_image, ivec2(gl_GlobalInvocationID) * 3 + ivec2(2, 2), vec4(res1[3], 1.0));
}
