// RAVU-Lite Upscaling Shader (R2 Variant)
// Rapid and Accurate Video Upscaling - Lightweight version
// Based on prescaler algorithms

//!BGFX EFFECT
//!VERSION 1
//!NAME RAVU-Lite R2
//!CATEGORY Upscaling
//!DESCRIPTION Fast edge-adaptive upscaler with radius 2 kernel. Processes luminance channel only for better performance.

// Source frame that this effect upscales.
//!TEXTURE
Texture2D INPUT;

// Point-filtered sampler. It is not referenced by name in the code shown in this file;
// presumably the texture() helper from prescalers.hlsli picks it up (TODO confirm).
//!SAMPLER
//!FILTER POINT
SamplerState sam_INPUT;

// Destination image, declared at twice the input width and twice the input height.
//!TEXTURE
//!WIDTH  INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;

// Linear-filtered sampler; imageStoreOverride uses it to read INPUT when writing a pixel.
//!SAMPLER
//!FILTER LINEAR
SamplerState sam_INPUT_LINEAR;

// Kernel-weight lookup table loaded from ravu_lite_lut2_f16.dds with four 16-bit float
// channels. Pass1 samples it at x = 0.1, 0.3, 0.5, 0.7, 0.9 on a row chosen per pixel.
//!TEXTURE
//!SOURCE ravu_lite_lut2_f16.dds
//!FORMAT R16G16B16A16_FLOAT
Texture2D ravu_lite_lut2;

// Linear-filtered sampler for the lookup table. Not referenced by name in the code shown
// here; presumably consumed by the texture() helper (TODO confirm).
//!SAMPLER
//!FILTER LINEAR
SamplerState sam_ravu_lite_lut2;

//!COMMON
// Helper definitions shared by the passes. Identifiers used below that are not defined in
// this file (vec2/vec3/vec4, ivec2, mix, mod, texture, barrier, gl_* built-ins, rgb2y,
// rgb2uv, yuv2rgb, HOOKED_map, ...) are presumably provided by this header.
#include "prescalers.hlsli"

// Number of the final pass. Pass1 compares CURRENT_PASS against it to decide whether to
// compile in its output bounds check.
#define LAST_PASS 1

//!PASS 1
//!DESC RAVU-Lite Upscale (r2, compute)
//!IN INPUT, ravu_lite_lut2
//!OUT OUTPUT
//!BLOCK_SIZE 64, 16
//!NUM_THREADS 32, 8
// Tile of scalar input samples filled cooperatively by the thread group in Pass1.
// 340 = 34 * 10 entries addressed as x * 10 + y: the 32x8 thread group plus one extra
// texel on every side.
shared float inp[340];

// Number of this pass; compared against LAST_PASS inside Pass1.
#define CURRENT_PASS 1

// Reduces a colour sample to a single scalar by a dot product of its rgb part with rgb2y
// (defined outside this file; presumably the RGB-to-luma weights).
#define GET_SAMPLE(x) dot(x.rgb, rgb2y)
// Routes GLSL-style imageStore calls to imageStoreOverride: the image argument is ignored
// and only the first component of val is forwarded.
#define imageStore(out_image, pos, val) imageStoreOverride(pos, val.x)
// Writes one pixel of OUTPUT at `pos`.
// `value` becomes the first component of the three-component vector that is converted with
// yuv2rgb; the other two components are obtained by sampling INPUT (linear sampler, mip 0)
// at HOOKED_map(pos) and transforming the rgb result with rgb2uv. Alpha is set to 1.0.
void imageStoreOverride(uint2 pos, float value) {
	const float3 srcRgb = INPUT.SampleLevel(sam_INPUT_LINEAR, HOOKED_map(pos), 0).rgb;
	const float2 srcUv = mul(rgb2uv, srcRgb);
	const float3 yuv = float3(value, srcUv);
	OUTPUT[pos] = float4(mul(yuv2rgb, yuv), 1.0);
}

// Fetches INPUT at `pos` through texture() and reduces the result with GET_SAMPLE.
#define INPUT_tex(pos) GET_SAMPLE(vec4(texture(INPUT, pos)))
// Values reported by the host helpers GetInputSize() / GetInputPt(). Pass1 multiplies
// HOOKED_pt by texel coordinates to build sampling positions, so INPUT_pt is presumably
// the size of one input texel in texture coordinates (TODO confirm).
static const float2 INPUT_size = float2(GetInputSize());
static const float2 INPUT_pt = float2(GetInputPt());

// Lookup-table fetch helper. Not used by the code shown in this file; Pass1 calls
// texture(ravu_lite_lut2, ...) directly.
#define ravu_lite_lut2_tex(pos) (vec4(texture(ravu_lite_lut2, pos)))

// Bind the generic HOOKED_* names used by the pass body to the INPUT texture.
#define HOOKED_tex(pos) INPUT_tex(pos)
#define HOOKED_size INPUT_size
#define HOOKED_pt INPUT_pt

// Pass 1 entry point: produces a 2x2 block of output pixels per thread.
//
// Steps:
//   1. The thread group cooperatively fills the shared tile `inp` with scalar samples of
//      the input (34 x 10 entries, stored as x * 10 + y, starting one texel before the
//      group's base position).
//   2. Each thread builds a weighted 2x2 structure tensor from the 3x3 neighbourhood
//      inp[local_pos + {0,1,2,10,11,12,20,21,22}].
//   3. The tensor's eigenvalues are quantised into angle / strength / coherence bins that
//      select one of 288 rows of ravu_lite_lut2.
//   4. The nine neighbourhood samples are combined with the weights read from that row;
//      the four components of the result are stored as the four output pixels.
//
// blockStart  origin of the output block handled by this thread group; used here only for
//             the bounds check.
// threadId    thread index inside the group; used here only for the bounds check.
// The gl_* identifiers and the vec*/mix/mod/atan/texture/barrier helpers are not defined in
// this file; they are presumably GLSL-compatibility definitions from prescalers.hlsli.
void Pass1(uint2 blockStart, uint3 threadId) {
	ivec2 group_base = ivec2(gl_WorkGroupID) * ivec2(gl_WorkGroupSize);
	// Index of this thread's top-left neighbourhood sample inside the shared tile.
	int local_pos = int(gl_LocalInvocationID.x) * 10 + int(gl_LocalInvocationID.y);
#pragma warning(disable : 3557)
	// Cooperative load: every thread strides through the 340 tile entries by the group size.
	for (int id = int(gl_LocalInvocationIndex); id < 340; id += int(gl_WorkGroupSize.x * gl_WorkGroupSize.y)) {
		uint x = (uint)id / 10, y = (uint)id % 10;
		// The -0.5 offset shifts the tile origin relative to group_base + 0.5.
		inp[id] = HOOKED_tex(HOOKED_pt * vec2(float(group_base.x + x) + (-0.5), float(group_base.y + y) + (-0.5))).x;
	}
	// All tile entries must be written before any thread reads its neighbourhood.
	barrier();
#if CURRENT_PASS == LAST_PASS
	// Threads whose 2x2 block starts outside the output have nothing to write. The early
	// return is placed after the barrier so every thread still takes part in the load.
	uint2 destPos = blockStart + threadId.xy * 2;
	uint2 outputSize = GetOutputSize();
	if (destPos.x >= outputSize.x || destPos.y >= outputSize.y) {
		return;
	}
#endif

	// Structure tensor computation.
	// abd accumulates (gx*gx, gx*gy, gy*gy) over the 3x3 neighbourhood. Offsets of +10 step
	// along x and +1 along y. Outer rows/columns use one-sided differences, the middle
	// row/column uses a central difference (hence the / 2.0). The nine weights sum to 1
	// and are largest at the centre sample.
	vec3 abd = vec3(0.0, 0.0, 0.0);
	float gx, gy;
	gx = (inp[local_pos + 10] - inp[local_pos + 0]);
	gy = (inp[local_pos + 1] - inp[local_pos + 0]);
	abd += vec3(gx * gx, gx * gy, gy * gy) * 0.1018680644198163;
	gx = (inp[local_pos + 11] - inp[local_pos + 1]);
	gy = (inp[local_pos + 2] - inp[local_pos + 0]) / 2.0;
	abd += vec3(gx * gx, gx * gy, gy * gy) * 0.11543163961422666;
	gx = (inp[local_pos + 12] - inp[local_pos + 2]);
	gy = (inp[local_pos + 2] - inp[local_pos + 1]);
	abd += vec3(gx * gx, gx * gy, gy * gy) * 0.1018680644198163;
	gx = (inp[local_pos + 20] - inp[local_pos + 0]) / 2.0;
	gy = (inp[local_pos + 11] - inp[local_pos + 10]);
	abd += vec3(gx * gx, gx * gy, gy * gy) * 0.11543163961422666;
	gx = (inp[local_pos + 21] - inp[local_pos + 1]) / 2.0;
	gy = (inp[local_pos + 12] - inp[local_pos + 10]) / 2.0;
	abd += vec3(gx * gx, gx * gy, gy * gy) * 0.13080118386382833;
	gx = (inp[local_pos + 22] - inp[local_pos + 2]) / 2.0;
	gy = (inp[local_pos + 12] - inp[local_pos + 11]);
	abd += vec3(gx * gx, gx * gy, gy * gy) * 0.11543163961422666;
	gx = (inp[local_pos + 20] - inp[local_pos + 10]);
	gy = (inp[local_pos + 21] - inp[local_pos + 20]);
	abd += vec3(gx * gx, gx * gy, gy * gy) * 0.1018680644198163;
	gx = (inp[local_pos + 21] - inp[local_pos + 11]);
	gy = (inp[local_pos + 22] - inp[local_pos + 20]) / 2.0;
	abd += vec3(gx * gx, gx * gy, gy * gy) * 0.11543163961422666;
	gx = (inp[local_pos + 22] - inp[local_pos + 12]);
	gy = (inp[local_pos + 22] - inp[local_pos + 21]);
	abd += vec3(gx * gx, gx * gy, gy * gy) * 0.1018680644198163;

	// Eigenvalue decomposition of the symmetric matrix [[a, b], [b, d]].
	// T is its trace and D its determinant; L1 >= L2 are the eigenvalues.
	float a = abd.x, b = abd.y, d = abd.z;
	float T = a + d, D = a * d - b * b;
	float delta = sqrt(max(T * T / 4.0 - D, 0.0));
	float L1 = T / 2.0 + delta, L2 = T / 2.0 - delta;
	// L2 is mathematically non-negative, but when the tensor is (nearly) singular rounding
	// can push it just below zero. An unclamped sqrt would then give NaN, which makes both
	// `mu >= ...` tests below fail and selects the lowest coherence bin for what is
	// actually the most coherent case. Clamp it the same way delta is clamped above.
	float sqrtL1 = sqrt(L1), sqrtL2 = sqrt(max(L2, 0.0));
	// Orientation folded into [0, pi); forced to 0 when the off-diagonal term vanishes.
	// (mix with a boolean selector is assumed to return the second argument when true.)
	float theta = mix(mod(atan(L1 - a, b) + 3.141592653589793, 3.141592653589793), 0.0, abs(b) < 1.192092896e-7);
	float lambda = sqrtL1;
	// Relative eigenvalue spread in [0, 1]; forced to 0 where both eigenvalues vanish.
	float mu = mix((sqrtL1 - sqrtL2) / (sqrtL1 + sqrtL2), 0.0, sqrtL1 + sqrtL2 < 1.192092896e-7);

	// LUT coordinate calculation.
	// 24 angle bins x 4 strength bins x 3 coherence bins = 288 rows; + 0.5 addresses the
	// centre of the selected row.
	float angle = floor(theta * 24.0 / 3.141592653589793);
	float strength = mix(mix(0.0, 1.0, lambda >= 0.004), mix(2.0, 3.0, lambda >= 0.05), lambda >= 0.016);
	float coherence = mix(mix(0.0, 1.0, mu >= 0.25), 2.0, mu >= 0.5);
	float coord_y = ((angle * 4.0 + strength) * 3.0 + coherence + 0.5) / 288.0;

	// Weighted sample accumulation.
	// Each LUT fetch yields four weights, one per output pixel. Samples at opposite
	// positions of the 3x3 neighbourhood share a fetch, with the second sample using the
	// weights in reversed component order (w.wzyx); the centre sample has its own fetch.
	vec4 res = vec4(0.0, 0.0, 0.0, 0.0), w;
	w = texture(ravu_lite_lut2, vec2(0.1, coord_y));
	res += inp[local_pos + 0] * w + inp[local_pos + 22] * w.wzyx;
	w = texture(ravu_lite_lut2, vec2(0.3, coord_y));
	res += inp[local_pos + 1] * w + inp[local_pos + 21] * w.wzyx;
	w = texture(ravu_lite_lut2, vec2(0.5, coord_y));
	res += inp[local_pos + 2] * w + inp[local_pos + 20] * w.wzyx;
	w = texture(ravu_lite_lut2, vec2(0.7, coord_y));
	res += inp[local_pos + 10] * w + inp[local_pos + 12] * w.wzyx;
	w = texture(ravu_lite_lut2, vec2(0.9, coord_y));
	res += inp[local_pos + 11] * w;
	res = clamp(res, 0.0, 1.0);

	// Output results: res[0..3] go to offsets (0,0), (0,1), (1,0), (1,1) of the 2x2 block.
	// imageStore is the macro defined above this function, so only the first component of
	// each vec4 is used and `out_image` is never evaluated.
	imageStore(out_image, ivec2(gl_GlobalInvocationID) * 2 + ivec2(0, 0), vec4(res[0], 0.0, 0.0, 0.0));
	imageStore(out_image, ivec2(gl_GlobalInvocationID) * 2 + ivec2(0, 1), vec4(res[1], 0.0, 0.0, 0.0));
	imageStore(out_image, ivec2(gl_GlobalInvocationID) * 2 + ivec2(1, 0), vec4(res[2], 0.0, 0.0, 0.0));
	imageStore(out_image, ivec2(gl_GlobalInvocationID) * 2 + ivec2(1, 1), vec4(res[3], 0.0, 0.0, 0.0));
}
