// LCAS - Light Contrast Adaptive Sharpening
// A combination of linear interpolation and a lightweight CAS implementation

//!BGFX EFFECT
//!VERSION 1
//!NAME Light CAS
//!CATEGORY Sharpening
//!DESCRIPTION Lightweight Contrast Adaptive Sharpening that combines linear interpolation with CAS principles. Good balance between performance and quality.

//!PARAMETER
//!LABEL Sharpness
//!DESC Controls the intensity of the sharpening effect. Higher values = sharper image.
//!DEFAULT 0.5
//!MIN 0
//!MAX 1
//!STEP 0.01
float sharpness;

//!TEXTURE
Texture2D INPUT;

//!TEXTURE
Texture2D OUTPUT;

//!SAMPLER
//!FILTER LINEAR
SamplerState sam;

//!PASS 1
//!IN INPUT
//!OUT OUTPUT
//!BLOCK_SIZE 16
//!NUM_THREADS 64

// Computes the sharpened color of a single output pixel.
//
// ip:   integer coordinates of the output pixel to produce.
// peak: scale applied to the contrast-adaptive weight. Pass1 derives it from
//       `sharpness` as a value in [-1/9, 0]; negative values make the
//       neighbors subtract from the center, i.e. sharpen.
// Returns the filtered RGB color (alpha is added by the caller).
float3 LightCAS(uint2 ip, float peak) {

	// Center of the output pixel in texture coordinates.
	// NOTE(review): assumes GetOutputPt()/GetInputPt() return the size of one
	// output/input pixel in UV space - confirm against the effect framework.
	float2 pos = (ip + 0.5f) * GetOutputPt();
	float2 inputPt = GetInputPt();

	// Sample 3x3 neighborhood around the center pixel:
	//	a b c
	//	d(e)f
	//	g h i
	float3 a = INPUT.SampleLevel(sam, pos + float2(-inputPt.x, -inputPt.y), 0).rgb;
	float3 b = INPUT.SampleLevel(sam, pos + float2(0, -inputPt.y), 0).rgb;
	float3 c = INPUT.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0).rgb;
	float3 d = INPUT.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0).rgb;
	float3 f = INPUT.SampleLevel(sam, pos + float2(inputPt.x, 0), 0).rgb;
	float3 g = INPUT.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0).rgb;
	float3 h = INPUT.SampleLevel(sam, pos + float2(0, inputPt.y), 0).rgb;
	float3 i = INPUT.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0).rgb;

	// Pre-compute sums for efficiency
	float3 x = a + c + g + i;  // Corner sum
	float3 y = b + d + f + h;  // Cross sum

	// The center value is the average of four taps placed a quarter/half input
	// texel away from `pos`. The sampler is declared with linear filtering, so
	// each tap is itself an interpolated value.
	float3 e = INPUT.SampleLevel(sam, pos + float2(inputPt.x * 0.25, inputPt.y * 0.5), 0).rgb;
	e += INPUT.SampleLevel(sam, pos + float2(-inputPt.x * 0.25, -inputPt.y * 0.5), 0).rgb;
	e += INPUT.SampleLevel(sam, pos + float2(inputPt.x * 0.5, -inputPt.y * 0.25), 0).rgb;
	e += INPUT.SampleLevel(sam, pos + float2(-inputPt.x * 0.5, inputPt.y * 0.25), 0).rgb;
	e /= 4;

	// Per-channel min/max over the eight neighbors and the averaged center
	float3 mnRGB = min(min(min(min(d, e), min(f, b)), h), min(min(a, i), min(c, g)));
	float3 mxRGB = max(max(max(max(d, e), max(f, b)), h), max(max(a, i), max(c, g)));

	// Adaptive sharpening weight based on local contrast.
	// saturate() keeps the ratio in [0, 1] before the square root. For input
	// within [0, 1] the ratio already lies in that range, so the result is
	// unchanged. It matters when mxRGB is 0 (the division is 0/0) or when a
	// channel exceeds 1 (the numerator turns negative): sqrt() would otherwise
	// yield NaN and poison the output. saturate() maps NaN to 0, which simply
	// disables the adaptive term for that channel.
	float3 wRGB = sqrt(saturate(min(mnRGB, 1.0 - mxRGB) / mxRGB)) * peak;

	// Apply filter with weights:
	//  w w w
	//  w 1 w
	//  w w w
	// The center term additionally subtracts a fixed low-pass of the
	// neighborhood ((x + 2y + 4e) / 4) from 5e before normalization.
	float3 color = ((x + y) * wRGB + (e * 5.0 - (x + y * 2.0 + e * 4.0) / 4.0)) / (1.0 + 8.0 * wRGB);

	// Blend 1 part raw result with 4 parts of the result clamped to the local
	// min/max, which limits overshoot (ringing) around edges.
	return (color + clamp(color, mnRGB, mxRGB) * 4.0) / 5.0;
}

// Compute-shader entry point of pass 1: filters INPUT with LightCAS and
// writes the result to OUTPUT.
//
// blockStart: top-left output pixel of the block handled by this thread group.
// threadId:   thread index inside the group; only .x is used.
//
// The pass is declared with a 16x16 block and 64 threads, so every thread
// produces four pixels: its base pixel plus the ones offset by 8 in x, in y,
// and in both.
void Pass1(uint2 blockStart, uint3 threadId) {
	// NOTE(review): TileSwizzle8x8 presumably maps the linear thread index to
	// a position inside an 8x8 tile - confirm against the effect framework.
	uint2 gxy = blockStart + TileSwizzle8x8(threadId.x);

	// The base pixel has the smallest coordinates of the four; if it is
	// outside the output, all of them are.
	const uint2 outputSize = GetOutputSize();
	if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
		return;
	}

	// Convert sharpness parameter to peak value (0 .. -1/9)
	const float peak = lerp(0, -0.1111111111111111, sharpness);

	// Blocks on the right/bottom edge may only partially overlap the output.
	// Guard each extra pixel so no texture samples are spent on pixels that
	// do not exist and no write lands outside the output area.
	const bool rightInside = gxy.x + 8u < outputSize.x;
	const bool bottomInside = gxy.y + 8u < outputSize.y;

	// Base pixel
	OUTPUT[gxy] = float4(LightCAS(gxy, peak), 1);

	// Right neighbor (+8, 0)
	gxy.x += 8u;
	if (rightInside) {
		OUTPUT[gxy] = float4(LightCAS(gxy, peak), 1);
	}

	// Bottom-right neighbor (+8, +8)
	gxy.y += 8u;
	if (rightInside && bottomInside) {
		OUTPUT[gxy] = float4(LightCAS(gxy, peak), 1);
	}

	// Bottom neighbor (0, +8)
	gxy.x -= 8u;
	if (bottomInside) {
		OUTPUT[gxy] = float4(LightCAS(gxy, peak), 1);
	}
}
