// FineSharp
// Based on: https://forum.doom9.org/showthread.php?t=171346
//
// FineSharp avisynth script by Didee converted to shader format.
// Originally designed for madVR (requires 16-bit accuracy in shader chain).
//
// Note: This sharpener does not filter noise or source artifacts and will
// sharpen those as well. Clean your source first if necessary.

//!BGFX EFFECT
//!VERSION 1
//!NAME FineSharp
//!CATEGORY Sharpening
//!DESCRIPTION Multi-pass sharpening algorithm with equalisation and repair. Provides fine detail enhancement with minimal artifacts when properly tuned.


// sstr is read by ComputeSharpDiff (Pass 3): it scales the sharpening
// difference and, through ldmp = sstr + 0.1, its low-damping term.
//!PARAMETER
//!LABEL Sharpening Strength
//!DESC Primary sharpening intensity. Effective range is 0.0 to 8.0. Adjust cstr when changing this value.
//!DEFAULT 2.0
//!MIN 0
//!MAX 8
//!STEP 0.01
float sstr;

// cstr is read by Pass 3: it multiplies the blurred sharpening difference
// that is subtracted from the sharpened luma.
//!PARAMETER
//!LABEL Equalisation Strength
//!DESC Controls sharpening equalisation. Suggested values based on sstr: 0->0, 0.5->0.1, 1.0->0.6, 2.0->0.9, 3.0->1.09, 4.0->1.19
//!DEFAULT 0.9
//!MIN 0
//!MAX 2
//!STEP 0.01
float cstr;

// xstr is read by Pass 5: it is the base weight of the lerp from the
// Pass 3 luma (alpha channel) towards the Pass 4 luma (red channel).
//!PARAMETER
//!LABEL XSharpen Strength
//!DESC Final XSharpen-style sharpening pass. Keep below 0.25 for best results.
//!DEFAULT 0.19
//!MIN 0
//!MAX 1
//!STEP 0.01
float xstr;

// xrep is read by Pass 5: it multiplies the 4-neighbour edge measure; the
// saturated product reduces the xstr blend weight.
//!PARAMETER
//!LABEL Repair Strength
//!DESC Reduces artifacts from final sharpening pass. Higher values = more repair.
//!DEFAULT 0.25
//!MIN 0
//!MAX 1
//!STEP 0.01
float xrep;

// Source image; read only by Pass 1.
//!TEXTURE
Texture2D INPUT;

// Final image, same dimensions as INPUT; written only by Pass 5.
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
Texture2D OUTPUT;

// tex1 and tex2 are 16-bit float intermediates used alternately:
//   Pass 1 -> tex1 -> Pass 2 -> tex2 -> Pass 3 -> tex1 -> Pass 4 -> tex2 -> Pass 5.
// Channels g/b carry chroma throughout; r and a carry two luma variants whose
// meaning changes per pass (see each pass).
//!TEXTURE
//!WIDTH OUTPUT_WIDTH
//!HEIGHT OUTPUT_HEIGHT
//!FORMAT R16G16B16A16_FLOAT
Texture2D tex1;

//!TEXTURE
//!WIDTH OUTPUT_WIDTH
//!HEIGHT OUTPUT_HEIGHT
//!FORMAT R16G16B16A16_FLOAT
Texture2D tex2;

// Point sampler used by every Gather* call in this file.
//!SAMPLER
//!FILTER POINT
SamplerState sam;


//!PASS 1
//!DESC RGB to YUV conversion with Gaussian blur
//!IN INPUT
//!OUT tex1
//!BLOCK_SIZE 16
//!NUM_THREADS 64


// Color space conversion matrices.
// RGBtoYUV_Matrix(Kb, Kr) builds the RGB -> YUV matrix for luma coefficients
// Kb (blue) and Kr (red); the green coefficient is 1 - Kr - Kb.
//   row 0: Y = Kr*R + (1 - Kr - Kb)*G + Kb*B
//   row 1: U = (B - Y) / (2 * (1 - Kb))
//   row 2: V = (R - Y) / (2 * (1 - Kr))
// Inputs of at most 576 lines use Kb/Kr = 0.114/0.299 (the BT.601 values),
// taller inputs use 0.0722/0.2126 (the BT.709 values).
#define RGBtoYUV_Matrix(Kb,Kr) float3x3(float3(Kr, 1 - Kr - Kb, Kb), float3(-Kr, Kr + Kb - 1, 1 - Kb) / (2*(1 - Kb)), float3(1 - Kr, Kr + Kb - 1, -Kb) / (2*(1 - Kr)))
static const float3x3 RGBtoYUV = GetInputSize().y <= 576 ? RGBtoYUV_Matrix(0.114, 0.299) : RGBtoYUV_Matrix(0.0722, 0.2126);

// Pass 1: converts INPUT to YUV and blurs the luma with a 3x3 kernel.
// Every thread produces one 2x2 pixel quad. Layout written to tex1:
//   x = blurred luma, y/z = chroma (biased by +0.5), w = unblurred luma.
void Pass1(uint2 blockStart, uint3 threadId) {
	// Top-left pixel of this thread's quad; the swizzled thread position is
	// doubled because each thread covers two pixels per axis.
	const uint2 quadOrigin = (TileSwizzle8x8(threadId.x) << 1) + blockStart;
	const uint2 srcSize = GetInputSize();
	if (quadOrigin.x >= srcSize.x || quadOrigin.y >= srcSize.y) {
		return;
	}

	const float2 texel = GetInputPt();
	const float3 chromaBias = float3(0, 0.5, 0.5);
	uint cx, cy;

	// 4x4 YUV neighbourhood indexed [x][y]; entry [1][1] is the quad origin.
	// Four gathers, each returning a 2x2 footprint: w = left/top,
	// x = left/bottom, z = right/top, y = right/bottom.
	float3 yuv[4][4];
	[unroll]
	for (cx = 0; cx <= 2; cx += 2) {
		[unroll]
		for (cy = 0; cy <= 2; cy += 2) {
			const float2 uv = (quadOrigin + uint2(cx, cy)) * texel;
			const float4 reds = INPUT.GatherRed(sam, uv);
			const float4 greens = INPUT.GatherGreen(sam, uv);
			const float4 blues = INPUT.GatherBlue(sam, uv);

			yuv[cx][cy] = mul(RGBtoYUV, float3(reds.w, greens.w, blues.w)) + chromaBias;
			yuv[cx][cy + 1] = mul(RGBtoYUV, float3(reds.x, greens.x, blues.x)) + chromaBias;
			yuv[cx + 1][cy] = mul(RGBtoYUV, float3(reds.z, greens.z, blues.z)) + chromaBias;
			yuv[cx + 1][cy + 1] = mul(RGBtoYUV, float3(reds.y, greens.y, blues.y)) + chromaBias;
		}
	}

	// RemoveGrain11-style blur of the luma for the four quad pixels:
	// (4*centre + 2*edge neighbours + 1*corner neighbours) / 16.
	[unroll]
	for (cx = 1; cx < 3; ++cx) {
		[unroll]
		for (cy = 1; cy < 3; ++cy) {
			const float3 centre = yuv[cx][cy];
			const float edgeSum = yuv[cx][cy - 1].x + yuv[cx - 1][cy].x + yuv[cx + 1][cy].x + yuv[cx][cy + 1].x;
			const float cornerSum = yuv[cx - 1][cy - 1].x + yuv[cx + 1][cy - 1].x + yuv[cx - 1][cy + 1].x + yuv[cx + 1][cy + 1].x;

			float blurred = centre.x + centre.x;
			blurred += edgeSum;
			blurred += blurred;
			blurred += cornerSum;
			blurred *= 0.0625f;

			// The unblurred luma is kept in the alpha channel for later passes.
			tex1[quadOrigin + uint2(cx - 1, cy - 1)] = float4(blurred, centre.y, centre.z, centre.x);
		}
	}
}


//!PASS 2
//!DESC Median filter pass (RemoveGrain4)
//!IN tex1
//!OUT tex2
//!BLOCK_SIZE 16
//!NUM_THREADS 64

// Sorting macros for median calculation.
// All of them reorder their arguments in place (pass scratch copies) and need
// a temporary named `t` of the same type in scope where they are expanded.
// sort(a1,a2): compare-exchange; afterwards a1 holds the smaller value and a2
// the larger one.
// median3 / median5 / median9: expressions that evaluate to the median of
// their 3 / 5 / 9 arguments. median9 first discards two low and two high
// values among a1..a8, then reduces the remainder plus a9 through median5.
#define sort(a1,a2)                         (t=min(a1,a2),a2=max(a1,a2),a1=t)
#define median3(a1,a2,a3)                   (sort(a2,a3),sort(a1,a2),min(a2,a3))
#define median5(a1,a2,a3,a4,a5)             (sort(a1,a2),sort(a3,a4),sort(a1,a3),sort(a2,a4),median3(a2,a3,a5))
#define median9(a1,a2,a3,a4,a5,a6,a7,a8,a9) (sort(a1,a2),sort(a3,a4),sort(a5,a6),sort(a7,a8),\
											 sort(a1,a3),sort(a5,a7),sort(a1,a5),sort(a3,a5),sort(a3,a7),\
											 sort(a2,a4),sort(a6,a8),sort(a4,a8),sort(a4,a6),sort(a2,a6),median5(a2,a4,a5,a7,a9))

// Pass 2: blurs the Pass 1 luma once more, then replaces it by a 3x3 median.
// Reads tex1 (r = blurred luma, g/b = chroma, a = unblurred luma) and writes
// tex2 with only the r channel changed. Each thread handles one 2x2 quad.
void Pass2(uint2 blockStart, uint3 threadId) {
	// Top-left pixel of this thread's 2x2 quad.
	uint2 gxy = (TileSwizzle8x8(threadId.x) << 1) + blockStart;
	uint2 inputSize = GetInputSize();
	if (gxy.x >= inputSize.x || gxy.y >= inputSize.y) {
		return;
	}

	float2 inputPt = GetInputPt();
	uint i, j;

	// Load the 4x4 neighbourhood of the r channel with four gathers.
	// src is indexed [x][y]; src[1][1] is the pixel at gxy.
	float4 src[4][4];
	[unroll]
	for (i = 0; i < 3; i += 2) {
		[unroll]
		for (j = 0; j < 3; j += 2) {
			float2 tpos = (gxy + uint2(i, j)) * inputPt;
			const float4 sr = tex1.GatherRed(sam, tpos);

			src[i][j].r = sr.w;
			src[i][j + 1].r = sr.x;
			src[i + 1][j].r = sr.z;
			src[i + 1][j + 1].r = sr.y;
		}
	}

	// Load the remaining channels (g, b = chroma, a = unblurred luma) for the
	// four quad pixels only; they are passed through unchanged.
	float2 tpos = (gxy + 1) * inputPt;
	const float4 sg = tex1.GatherGreen(sam, tpos);
	const float4 sb = tex1.GatherBlue(sam, tpos);
	const float4 sa = tex1.GatherAlpha(sam, tpos);
	src[1][1].gba = float3(sg.w, sb.w, sa.w);
	src[1][2].gba = float3(sg.x, sb.x, sa.x);
	src[2][1].gba = float3(sg.z, sb.z, sa.z);
	src[2][2].gba = float3(sg.y, sb.y, sa.y);

	// Apply blur then median filter
	[unroll]
	for (i = 1; i <= 2; ++i) {
		[unroll]
		for (j = 1; j <= 2; ++j) {
			float4 o = src[i][j];

			// Weighted 3x3 blur of the r channel:
			// (4*center + 2*edge neighbours + 1*corner neighbours) / 16
			o.x += o.x;
			o.x += src[i][j - 1].x + src[i - 1][j].x + src[i + 1][j].x + src[i][j + 1].x;
			o.x += o.x;
			o.x += src[i - 1][j - 1].x + src[i + 1][j - 1].x + src[i - 1][j + 1].x + src[i + 1][j + 1].x;
			o.x *= 0.0625f;

			// Median of the 3x3 neighbourhood. The center sample (t5) is the
			// value blurred just above, while the eight neighbours are taken
			// straight from tex1. `t` is the scratch variable required by the
			// sort macros, which reorder t1..t9 in place.
			float t;
			float t1 = src[i - 1][j - 1].x;
			float t2 = src[i][j - 1].x;
			float t3 = src[i + 1][j - 1].x;
			float t4 = src[i - 1][j].x;
			float t5 = o.x;
			float t6 = src[i + 1][j].x;
			float t7 = src[i - 1][j + 1].x;
			float t8 = src[i][j + 1].x;
			float t9 = src[i + 1][j + 1].x;
			o.x = median9(t1, t2, t3, t4, t5, t6, t7, t8, t9);

			tex2[gxy + uint2(i - 1, j - 1)] = o;
		}
	}
}

//!PASS 3
//!DESC Sharpening calculation (Part A)
//!IN tex2
//!OUT tex1
//!BLOCK_SIZE 16
//!NUM_THREADS 64

// Non-linear sharpening parameters
#define lstr 1.49  // Modifier for non-linear sharpening
#define pstr 1.272 // Exponent for non-linear sharpening
#define ldmp (sstr+0.1f) // Low damp - prevents over-enhancement of small differences

// Returns the signed sharpening amount for one pixel.
// c.a and c.x are the two luma values being compared (as written to tex2 by
// Pass 2: a = unblurred luma, x = blurred/median luma); g and b are ignored.
// The magnitude is a power curve of |a - x| (in 8-bit units, scaled by lstr)
// multiplied by a damping factor that tends to 0 for very small differences.
float ComputeSharpDiff(float4 c) {
	const float diff = c.a - c.x;
	const float diffSq = diff * diff;

	const float gain = sstr / 255.0f;
	const float curve = pow(abs(diff) / (lstr / 255.0f), 1.0f / pstr);
	const float damping = diffSq / (diffSq + ldmp / (255.0f * 255.0f));

	return sign(diff) * gain * curve * damping;
}

// Pass 3 (FineSharp part A): adds the non-linear sharpening difference to the
// luma and subtracts a blurred copy of that difference scaled by cstr.
// Reads tex2 (r = filtered luma, g/b = chroma, a = unblurred luma) and writes
// tex1 with r and a both set to the sharpened luma. One 2x2 quad per thread.
void Pass3(uint2 blockStart, uint3 threadId) {
	const uint2 quadOrigin = (TileSwizzle8x8(threadId.x) << 1) + blockStart;
	const uint2 srcSize = GetInputSize();
	if (quadOrigin.x >= srcSize.x || quadOrigin.y >= srcSize.y) {
		return;
	}

	const float2 texel = GetInputPt();
	uint cx, cy;

	// 4x4 neighbourhood indexed [x][y]; entry [1][1] is the quad origin.
	float4 px[4][4];
	float diff[4][4];

	// Fetch the r and a channels and evaluate the sharpening difference for
	// all sixteen pixels (the equalisation blur below needs the neighbours).
	[unroll]
	for (cx = 0; cx <= 2; cx += 2) {
		[unroll]
		for (cy = 0; cy <= 2; cy += 2) {
			const float2 uv = (quadOrigin + uint2(cx, cy)) * texel;
			const float4 filtered = tex2.GatherRed(sam, uv);
			const float4 original = tex2.GatherAlpha(sam, uv);

			px[cx][cy].ra = float2(filtered.w, original.w);
			px[cx][cy + 1].ra = float2(filtered.x, original.x);
			px[cx + 1][cy].ra = float2(filtered.z, original.z);
			px[cx + 1][cy + 1].ra = float2(filtered.y, original.y);

			diff[cx][cy] = ComputeSharpDiff(px[cx][cy]);
			diff[cx][cy + 1] = ComputeSharpDiff(px[cx][cy + 1]);
			diff[cx + 1][cy] = ComputeSharpDiff(px[cx + 1][cy]);
			diff[cx + 1][cy + 1] = ComputeSharpDiff(px[cx + 1][cy + 1]);
		}
	}

	// Chroma is only needed for the four pixels that get written.
	const float2 quadUv = (quadOrigin + 1) * texel;
	const float4 greens = tex2.GatherGreen(sam, quadUv);
	const float4 blues = tex2.GatherBlue(sam, quadUv);
	px[1][1].gb = float2(greens.w, blues.w);
	px[1][2].gb = float2(greens.x, blues.x);
	px[2][1].gb = float2(greens.z, blues.z);
	px[2][2].gb = float2(greens.y, blues.y);

	[unroll]
	for (cx = 1; cx < 3; ++cx) {
		[unroll]
		for (cy = 1; cy < 3; ++cy) {
			const float own = diff[cx][cy];
			const float edgeSum = diff[cx][cy - 1] + diff[cx - 1][cy] + diff[cx + 1][cy] + diff[cx][cy + 1];
			const float cornerSum = diff[cx - 1][cy - 1] + diff[cx + 1][cy - 1] + diff[cx - 1][cy + 1] + diff[cx + 1][cy + 1];

			// Equalisation term: (4*own + 2*edges + 1*corners) / 16.
			float equalised = own + own;
			equalised += edgeSum;
			equalised += equalised;
			equalised += cornerSum;
			equalised *= 0.0625f;

			float4 result = px[cx][cy];
			result.x = result.a + own;
			result.x -= cstr * equalised;
			// Pass 4 reads the sharpened luma from the alpha channel.
			result.a = result.x;

			tex1[quadOrigin + uint2(cx - 1, cy - 1)] = result;
		}
	}
}


//!PASS 4
//!DESC Edge-aware repair (Part B)
//!IN tex1
//!OUT tex2
//!BLOCK_SIZE 16
//!NUM_THREADS 64

// Sorting macros for min/max calculation.
// All of them reorder their arguments in place and need a temporary named `t`
// of the same type in scope where they are expanded.
// sort(a1,a2): compare-exchange; afterwards a1 <= a2.
// sort_min_maxN(a1..aN): moves the minimum of the N values into a1 and the
// maximum into aN; the order of the values in between is unspecified.
// sort9_partial2(a1..a9): additionally moves the second-smallest value into a2
// and the second-largest into a8.
#define sort(a1,a2)                               (t=min(a1,a2),a2=max(a1,a2),a1=t)
#define sort_min_max3(a1,a2,a3)                   (sort(a1,a2),sort(a1,a3),sort(a2,a3))
#define sort_min_max5(a1,a2,a3,a4,a5)             (sort(a1,a2),sort(a3,a4),sort(a1,a3),sort(a2,a4),sort(a1,a5),sort(a4,a5))
// After the pair sorts the candidates for the maximum are a2, a4 and a6; both
// a2 and a4 must be folded into a6 before the final compare against a7.
#define sort_min_max7(a1,a2,a3,a4,a5,a6,a7)       (sort(a1,a2),sort(a3,a4),sort(a5,a6),sort(a1,a3),sort(a1,a5),sort(a2,a6),sort(a4,a6),sort(a1,a7),sort(a6,a7))
// After the pair sorts the candidates for the maximum are a2, a4, a6 and a8;
// they are reduced pairwise into a8 before the final compare against a9.
#define sort_min_max9(a1,a2,a3,a4,a5,a6,a7,a8,a9) (sort(a1,a2),sort(a3,a4),sort(a5,a6),sort(a7,a8),sort(a1,a3),sort(a5,a7),sort(a1,a5),sort(a2,a4),sort(a6,a8),sort(a4,a8),sort(a1,a9),sort(a8,a9))
#define sort9_partial2(a1,a2,a3,a4,a5,a6,a7,a8,a9) (sort_min_max9(a1,a2,a3,a4,a5,a6,a7,a8,a9),sort_min_max7(a2,a3,a4,a5,a6,a7,a8))


// Pass 4 (FineSharp part B): pushes the luma away from its 3x3 mean and then
// limits the result using order statistics of the same neighbourhood.
// Reads tex1 as written by Pass 3 (r == a == sharpened luma, g/b = chroma) and
// writes tex2 with r = limited value and a = unchanged Pass 3 luma.
// Each thread handles one 2x2 quad.
void Pass4(uint2 blockStart, uint3 threadId) {
	// Top-left pixel of this thread's 2x2 quad.
	uint2 gxy = (TileSwizzle8x8(threadId.x) << 1) + blockStart;
	uint2 inputSize = GetInputSize();
	if (gxy.x >= inputSize.x || gxy.y >= inputSize.y) {
		return;
	}

	float2 inputPt = GetInputPt();
	uint i, j;

	// Load the 4x4 neighbourhood of the alpha channel (Pass 3 luma) with four
	// gathers. src is indexed [x][y]; src[1][1] is the pixel at gxy.
	float4 src[4][4];
	[unroll]
	for (i = 0; i < 3; i += 2) {
		[unroll]
		for (j = 0; j < 3; j += 2) {
			float2 tpos = (gxy + uint2(i, j)) * inputPt;
			const float4 sa = tex1.GatherAlpha(sam, tpos);

			src[i][j].a = sa.w;
			src[i][j + 1].a = sa.x;
			src[i + 1][j].a = sa.z;
			src[i + 1][j + 1].a = sa.y;
		}
	}

	// Load the r/g/b channels (luma and chroma, not RGB colour) for the four
	// quad pixels only.
	float2 tpos = (gxy + 1) * inputPt;
	const float4 sr = tex1.GatherRed(sam, tpos);
	const float4 sg = tex1.GatherGreen(sam, tpos);
	const float4 sb = tex1.GatherBlue(sam, tpos);
	src[1][1].rgb = float3(sr.w, sg.w, sb.w);
	src[1][2].rgb = float3(sr.x, sg.x, sb.x);
	src[2][1].rgb = float3(sr.z, sg.z, sb.z);
	src[2][2].rgb = float3(sr.y, sg.y, sb.y);

	// Apply edge enhancement with clamping
	[unroll]
	for (i = 1; i <= 2; ++i) {
		[unroll]
		for (j = 1; j <= 2; ++j) {
			float4 o = src[i][j];

			// Scratch copies of the 3x3 neighbourhood; `t` is the temporary
			// required by the sort macros, which reorder t1..t9 in place.
			float t;
			float t1 = src[i - 1][j - 1].a;
			float t2 = src[i][j - 1].a;
			float t3 = src[i + 1][j - 1].a;
			float t4 = src[i - 1][j].a;
			float t5 = o.a;
			float t6 = src[i + 1][j].a;
			float t7 = src[i - 1][j + 1].a;
			float t8 = src[i][j + 1].a;
			float t9 = src[i + 1][j + 1].a;

			// 3x3 mean (center o.x plus the eight neighbours, before they are
			// sorted), then extrapolate: value + 9.9 * (value - mean).
			o.x += t1 + t2 + t3 + t4 + t6 + t7 + t8 + t9;
			o.x /= 9.0f;
			o.x = o.a + 9.9f * (o.a - o.x);

			// sort9_partial2 is meant to leave the second-smallest value in t2
			// and the second-largest in t8. The result is limited to
			// [min(t2, o.a), max(t8, o.a)], so it never crosses the original
			// value o.a.
			sort9_partial2(t1, t2, t3, t4, t5, t6, t7, t8, t9);
			o.x = max(o.x, min(t2, o.a));
			o.x = min(o.x, max(t8, o.a));

			tex2[gxy + uint2(i - 1, j - 1)] = o;
		}
	}
}


//!PASS 5
//!DESC Final XSharpen and YUV to RGB (Part C)
//!IN tex2
//!OUT OUTPUT
//!BLOCK_SIZE 16
//!NUM_THREADS 64


// YUVtoRGB_Matrix(Kb, Kr) builds the YUV -> RGB matrix for luma coefficients
// Kb (blue) and Kr (red); it is the inverse of the RGB -> YUV matrix of Pass 1.
//   row 0: R = Y + 2*(1 - Kr)*V
//   row 1: G = Y - (2*(1 - Kb)*Kb*U + 2*Kr*(1 - Kr)*V) / (1 - Kr - Kb)
//   row 2: B = Y + 2*(1 - Kb)*U
// The coefficient pair is chosen with the same 576-line threshold as Pass 1.
#define YUVtoRGB_Matrix(Kb,Kr) float3x3(float3(1, 0, 2*(1 - Kr)), float3(Kb + Kr - 1, 2*(1 - Kb)*Kb, 2*Kr*(1 - Kr)) / (Kb + Kr - 1), float3(1, 2*(1 - Kb),0))
static const float3x3 YUVtoRGB = GetInputSize().y <= 576 ? YUVtoRGB_Matrix(0.114, 0.299) : YUVtoRGB_Matrix(0.0722, 0.2126);


// Pass 5 (FineSharp part C): blends the Pass 3 luma (alpha channel of tex2)
// towards the Pass 4 luma (red channel) and converts the result back to RGB.
// The blend weight is xstr, reduced where the 4-neighbour edge measure of the
// red channel is large. One 2x2 quad per thread.
void Pass5(uint2 blockStart, uint3 threadId) {
	const uint2 quadOrigin = (TileSwizzle8x8(threadId.x) << 1) + blockStart;

	const uint2 dstSize = GetOutputSize();
	if (quadOrigin.x >= dstSize.x || quadOrigin.y >= dstSize.y) {
		return;
	}

	const float2 texel = GetInputPt();
	uint cx, cy;

	// 4x4 neighbourhood of the red channel, indexed [x][y]; entry [1][1] is
	// the quad origin.
	float4 px[4][4];
	[unroll]
	for (cx = 0; cx <= 2; cx += 2) {
		[unroll]
		for (cy = 0; cy <= 2; cy += 2) {
			const float2 uv = (quadOrigin + uint2(cx, cy)) * texel;
			const float4 reds = tex2.GatherRed(sam, uv);

			px[cx][cy].r = reds.w;
			px[cx][cy + 1].r = reds.x;
			px[cx + 1][cy].r = reds.z;
			px[cx + 1][cy + 1].r = reds.y;
		}
	}

	// Remaining channels (chroma in g/b, Pass 3 luma in a) for the quad itself.
	const float2 quadUv = (quadOrigin + 1) * texel;
	const float4 greens = tex2.GatherGreen(sam, quadUv);
	const float4 blues = tex2.GatherBlue(sam, quadUv);
	const float4 alphas = tex2.GatherAlpha(sam, quadUv);
	px[1][1].gba = float3(greens.w, blues.w, alphas.w);
	px[1][2].gba = float3(greens.x, blues.x, alphas.x);
	px[2][1].gba = float3(greens.z, blues.z, alphas.z);
	px[2][2].gba = float3(greens.y, blues.y, alphas.y);

	[unroll]
	for (cx = 1; cx < 3; ++cx) {
		[unroll]
		for (cy = 1; cy < 3; ++cy) {
			const float4 centre = px[cx][cy];

			// Absolute 4-neighbour Laplacian of the red channel.
			const float edgeSum = px[cx][cy - 1].x + px[cx - 1][cy].x + px[cx + 1][cy].x + px[cx][cy + 1].x;
			const float edge = abs(edgeSum - 4 * centre.x);

			// Stronger edges reduce how far the blend moves towards centre.x.
			const float weight = xstr * (1 - saturate(edge * xrep));
			const float3 yuv = float3(lerp(centre.a, centre.x, weight), centre.y, centre.z);

			// Remove the 0.5 chroma bias added in Pass 1 and convert to RGB.
			OUTPUT[quadOrigin + uint2(cx - 1, cy - 1)] = float4(mul(YUVtoRGB, yuv - float3(0.0, 0.5, 0.5)), 1);
		}
	}
}
