// Anime4K High Quality Line Thinning
// Based on Anime4K project by bloc97

//!BGFX EFFECT
//!VERSION 1
//!NAME Anime4K Thin HQ
//!CATEGORY Anime4K
//!DESCRIPTION High quality line thinning effect. Reduces line thickness for sharper anime artwork.


// Warp step size. Pass 5 multiplies this by (input height / 1080) and by the
// input texel size to get the displacement applied in each warp iteration.
//!PARAMETER
//!LABEL Strength
//!DESC Warping strength per iteration. Lower values with more iterations improve quality.
//!DEFAULT 0.6
//!MIN 0.1
//!MAX 10
//!STEP 0.1
float strength;

// Trip count of the warp loop in Pass 5; every iteration costs one extra
// lookup into tex1 per output pixel.
//!PARAMETER
//!LABEL Iterations
//!DESC Number of solver iterations. More iterations improve quality at cost of performance.
//!DEFAULT 1
//!MIN 1
//!MAX 10
//!STEP 1
int iterations;

// Source image. Read by Pass 1 (luma for edge detection) and by Pass 5 (final resample).
//!TEXTURE
Texture2D INPUT;

// Final result, declared with the same dimensions as INPUT. Written by Pass 5.
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
Texture2D OUTPUT;

// Two-channel intermediate used alternately with tex2:
// written by Pass 2 (X blur) and Pass 4 (warp vectors), read by Pass 3 and Pass 5.
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R16G16_FLOAT
Texture2D tex1;

// Two-channel intermediate used alternately with tex1:
// written by Pass 1 (edge strength) and Pass 3 (Y blur), read by Pass 2 and Pass 4.
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R16G16_FLOAT
Texture2D tex2;

// Point sampler: used for the Gather fetches in Pass 1 / Pass 4 and for the
// blur taps in Pass 2 / Pass 3.
//!SAMPLER
//!FILTER POINT
SamplerState sam;

// Linear sampler: used in Pass 5 for both the tex1 lookup and the final INPUT fetch.
//!SAMPLER
//!FILTER LINEAR
SamplerState sam1;

//!COMMON

#ifdef BG_FP16
#pragma warning(disable: 3557)
#endif


//!PASS 1
//!DESC Sobel Edge Detection
//!IN INPUT
//!OUT tex2
//!BLOCK_SIZE 16
//!NUM_THREADS 64

// Collapses an RGB colour to a single luma value as a weighted sum of its
// channels, using the 0.299 / 0.587 / 0.114 weights (the ITU-R BT.601 luma
// coefficients).
float ComputeLuma(float3 rgb) {
	const float3 lumaWeights = float3(0.299, 0.587, 0.114);
	return dot(lumaWeights, rgb);
}

// Pass 1 entry point: Sobel edge detection on the luma of INPUT, written to tex2.
//
// Each thread processes the 2x2 pixel tile whose top-left pixel is gxy. gxy is
// derived from blockStart and threadId.x through TileSwizzle8x8, which is
// defined outside this file (presumably it lays the 64 threads of a group out
// as an 8x8 grid so that one group covers the 16x16 block -- confirm against
// the framework).
//
// For every pixel of the tile that lies inside the texture, tex2 receives
// pow(gradient magnitude, 0.7) in .x and 0 in .y.
void Pass1(uint2 blockStart, uint3 threadId) {
	uint2 gxy = (TileSwizzle8x8(threadId.x) << 1) + blockStart;
	uint2 inputSize = GetInputSize();
	if (gxy.x >= inputSize.x || gxy.y >= inputSize.y) {
		return;
	}
	float2 inputPt = GetInputPt();

	uint i, j;

	// Gather luminance values.
	// src is the 4x4 luma neighbourhood of the tile: first index is the x
	// offset, second index the y offset, and src[1][1] is pixel gxy. It is
	// filled by four gathers placed on the texel corners gxy + (0|2, 0|2).
	float src[4][4];
	[unroll]
	for (i = 0; i <= 2; i += 2) {
		[unroll]
		for (j = 0; j <= 2; j += 2) {
			float2 tpos = (gxy + uint2(i, j)) * inputPt;
			const float4 sr = INPUT.GatherRed(sam, tpos);
			const float4 sg = INPUT.GatherGreen(sam, tpos);
			const float4 sb = INPUT.GatherBlue(sam, tpos);

			// Gather component layout (per the HLSL Gather documentation):
			//   w z
			//   x y
			src[i][j] = ComputeLuma(float3(sr.w, sg.w, sb.w));
			src[i][j + 1] = ComputeLuma(float3(sr.x, sg.x, sb.x));
			src[i + 1][j] = ComputeLuma(float3(sr.z, sg.z, sb.z));
			src[i + 1][j + 1] = ComputeLuma(float3(sr.y, sg.y, sb.y));
		}
	}

	// Compute Sobel gradients
	[unroll]
	for (i = 1; i <= 2; ++i) {
		[unroll]
		for (j = 1; j <= 2; ++j) {
			uint2 destPos = gxy + uint2(i - 1, j - 1);

			// The tile origin was validated above; the other three pixels can
			// fall outside the texture when its width or height is odd, so
			// skip them instead of issuing an out-of-bounds store (same guard
			// as Pass 2 / Pass 3).
			if (i != 1 || j != 1) {
				if (destPos.x >= inputSize.x || destPos.y >= inputSize.y) {
					continue;
				}
			}

			// 3x3 Sobel with 1-2-1 weights: xgrad is right column minus left
			// column, ygrad is bottom row minus top row, both scaled by 1/8.
			float xgrad = (-src[i - 1][j - 1] + src[i + 1][j - 1] - src[i - 1][j] + src[i + 1][j] - src[i - 1][j] + src[i + 1][j] - src[i - 1][j + 1] + src[i + 1][j + 1]) / 8.0f;
			float ygrad = (-src[i - 1][j - 1] - src[i][j - 1] - src[i][j - 1] - src[i + 1][j - 1] + src[i - 1][j + 1] + src[i][j + 1] + src[i][j + 1] + src[i + 1][j + 1]) / 8.0f;

			// Store gradient magnitude with power curve
			float norm = sqrt(xgrad * xgrad + ygrad * ygrad);
			tex2[destPos] = float2(pow(norm, 0.7), 0);
		}
	}
}


//!PASS 2
//!DESC Gaussian Blur X
//!IN tex2
//!OUT tex1
//!BLOCK_SIZE 16
//!NUM_THREADS 64

// Unnormalised Gaussian weight exp(-0.5 * ((x - m) / s)^2).
//   x: position being weighted
//   s: standard deviation of the bell curve
//   m: centre (mean) of the bell curve
// The caller is expected to normalise by the sum of the weights it uses.
float GaussianWeight(float x, float s, float m) {
	const float z = (x - m) / s;
	const float exponent = -0.5 * z * z;
	return exp(exponent);
}

// Pass 2 entry point: horizontal Gaussian blur of tex2.x, written to tex1.x
// (tex1.y is set to 0).
//
// Each thread handles the 2x2 pixel tile starting at tileOrigin. The blur's
// sigma is 2 texels for a 1080-row input and scales linearly with the input
// height; the kernel extends ceil(2 * sigma) taps (at least 1) to each side.
void Pass2(uint2 blockStart, uint3 threadId) {
	const uint2 tileOrigin = (TileSwizzle8x8(threadId.x) << 1) + blockStart;
	const uint2 texSize = GetInputSize();
	if (tileOrigin.x >= texSize.x || tileOrigin.y >= texSize.y) {
		return;
	}
	const float2 texelSize = GetInputPt();

	// Resolution-adaptive blur width and the matching one-sided tap count
	const float sigma = 2.0f * texSize.y / 1080.0f;
	const int radius = max(int(ceil(sigma * 2.0)), 1);

	[unroll]
	for (uint dx = 0; dx <= 1; ++dx) {
		[unroll]
		for (uint dy = 0; dy <= 1; ++dy) {
			const uint2 destPos = tileOrigin + uint2(dx, dy);

			// The tile origin is already known to be inside the texture; only
			// the remaining three pixels need a bounds test.
			const bool isTileOrigin = (dx == 0 && dy == 0);
			if (!isTileOrigin && (destPos.x >= texSize.x || destPos.y >= texSize.y)) {
				continue;
			}

			// Texel centre of the destination pixel in normalised coordinates
			const float2 centre = (destPos + 0.5f) * texelSize;

			float weightedSum = 0.0;
			float weightTotal = 0.0;

			// Walk the taps from the leftmost to the rightmost one
			for (int tap = -radius; tap <= radius; ++tap) {
				const float w = GaussianWeight(tap, sigma, 0.0);
				const float2 uv = centre + float2(tap * texelSize.x, 0.0);
				weightedSum = weightedSum + tex2.SampleLevel(sam, uv, 0).x * w;
				weightTotal = weightTotal + w;
			}

			// Normalise by the accumulated weight
			tex1[destPos] = float2(weightedSum / weightTotal, 0);
		}
	}
}


//!PASS 3
//!DESC Gaussian Blur Y
//!IN tex1
//!OUT tex2
//!BLOCK_SIZE 16
//!NUM_THREADS 64

// Unnormalised Gaussian weight exp(-0.5 * ((x - m) / s)^2).
//   x: position being weighted
//   s: standard deviation of the bell curve
//   m: centre (mean) of the bell curve
// The caller is expected to normalise by the sum of the weights it uses.
float GaussianWeight(float x, float s, float m) {
	const float z = (x - m) / s;
	const float exponent = -0.5 * z * z;
	return exp(exponent);
}

// Pass 3 entry point: vertical Gaussian blur of tex1.x, written to tex2.x
// (tex2.y is set to 0).
//
// Each thread handles the 2x2 pixel tile starting at tileOrigin. The blur's
// sigma is 2 texels for a 1080-row input and scales linearly with the input
// height; the kernel extends ceil(2 * sigma) taps (at least 1) to each side.
void Pass3(uint2 blockStart, uint3 threadId) {
	const uint2 tileOrigin = (TileSwizzle8x8(threadId.x) << 1) + blockStart;
	const uint2 texSize = GetInputSize();
	if (tileOrigin.x >= texSize.x || tileOrigin.y >= texSize.y) {
		return;
	}
	const float2 texelSize = GetInputPt();

	// Resolution-adaptive blur width and the matching one-sided tap count
	const float sigma = 2.0f * texSize.y / 1080.0f;
	const int radius = max(int(ceil(sigma * 2.0)), 1);

	[unroll]
	for (uint dx = 0; dx <= 1; ++dx) {
		[unroll]
		for (uint dy = 0; dy <= 1; ++dy) {
			const uint2 destPos = tileOrigin + uint2(dx, dy);

			// The tile origin is already known to be inside the texture; only
			// the remaining three pixels need a bounds test.
			const bool isTileOrigin = (dx == 0 && dy == 0);
			if (!isTileOrigin && (destPos.x >= texSize.x || destPos.y >= texSize.y)) {
				continue;
			}

			// Texel centre of the destination pixel in normalised coordinates
			const float2 centre = (destPos + 0.5f) * texelSize;

			float weightedSum = 0.0;
			float weightTotal = 0.0;

			// Walk the taps from the topmost to the bottommost one
			for (int tap = -radius; tap <= radius; ++tap) {
				const float w = GaussianWeight(tap, sigma, 0.0);
				const float2 uv = centre + float2(0, tap * texelSize.y);
				weightedSum = weightedSum + tex1.SampleLevel(sam, uv, 0).x * w;
				weightTotal = weightTotal + w;
			}

			// Normalise by the accumulated weight
			tex2[destPos] = float2(weightedSum / weightTotal, 0);
		}
	}
}


//!PASS 4
//!DESC Compute Warp Kernel
//!IN tex2
//!OUT tex1
//!BLOCK_SIZE 16
//!NUM_THREADS 64

// Pass 4 entry point: takes the Sobel gradient of tex2.x and stores the
// resulting 2D vector in tex1. Pass 5 reads tex1 as its warp direction field.
//
// Each thread processes the 2x2 pixel tile whose top-left pixel is gxy. For
// every pixel of the tile that lies inside the texture, tex1 receives
// float2(xgrad, ygrad) / 8.
void Pass4(uint2 blockStart, uint3 threadId) {
	const uint2 gxy = (TileSwizzle8x8(threadId.x) << 1) + blockStart;
	const uint2 inputSize = GetInputSize();
	if (gxy.x >= inputSize.x || gxy.y >= inputSize.y) {
		return;
	}
	const float2 inputPt = GetInputPt();

	uint i, j;

	// Gather gradient values.
	// src is the 4x4 neighbourhood of the tile: first index is the x offset,
	// second index the y offset, and src[1][1] is pixel gxy. It is filled by
	// four gathers placed on the texel corners gxy + (0|2, 0|2).
	float src[4][4];
	[unroll]
	for (i = 0; i <= 2; i += 2) {
		[unroll]
		for (j = 0; j <= 2; j += 2) {
			float2 tpos = (gxy + uint2(i, j)) * inputPt;
			const float4 sr = tex2.GatherRed(sam, tpos);

			// Gather component layout (per the HLSL Gather documentation):
			//   w z
			//   x y
			src[i][j] = sr.w;
			src[i][j + 1] = sr.x;
			src[i + 1][j] = sr.z;
			src[i + 1][j + 1] = sr.y;
		}
	}

	// Compute gradient direction
	[unroll]
	for (i = 1; i <= 2; ++i) {
		[unroll]
		for (j = 1; j <= 2; ++j) {
			uint2 destPos = gxy + uint2(i - 1, j - 1);

			// The tile origin was validated above; the other three pixels can
			// fall outside the texture when its width or height is odd, so
			// skip them instead of issuing an out-of-bounds store (same guard
			// as Pass 2 / Pass 3).
			if (i != 1 || j != 1) {
				if (destPos.x >= inputSize.x || destPos.y >= inputSize.y) {
					continue;
				}
			}

			// 3x3 Sobel with 1-2-1 weights: xgrad is right column minus left
			// column, ygrad is bottom row minus top row.
			float xgrad = -src[i - 1][j - 1] + src[i + 1][j - 1] - src[i - 1][j] + src[i + 1][j] - src[i - 1][j] + src[i + 1][j] - src[i - 1][j + 1] + src[i + 1][j + 1];
			float ygrad = -src[i - 1][j - 1] - src[i][j - 1] - src[i][j - 1] - src[i + 1][j - 1] + src[i - 1][j + 1] + src[i][j + 1] + src[i][j + 1] + src[i + 1][j + 1];

			tex1[destPos] = float2(xgrad, ygrad) / 8.0f;
		}
	}
}


//!PASS 5
//!DESC Apply Line Thinning Warp
//!IN tex1, INPUT
//!OUT OUTPUT
//!BLOCK_SIZE 16
//!NUM_THREADS 64

// Pass 5 entry point: resamples INPUT along the vector field stored in tex1
// and writes the result to OUTPUT.
//
// Each thread processes the 2x2 pixel tile whose top-left pixel is gxy. For
// every pixel of the tile that lies inside OUTPUT, the sample position starts
// at the pixel centre and is moved `iterations` times against the (quasi-
// normalised) tex1 vector found at the current position; INPUT is then
// sampled with the linear sampler at the final position.
//
// OUTPUT is declared with INPUT's dimensions, which is why positions are
// built from the input texel size while bounds are tested against the output
// size.
void Pass5(uint2 blockStart, uint3 threadId) {
	const uint2 gxy = (TileSwizzle8x8(threadId.x) << 1) + blockStart;

	const uint2 outputSize = GetOutputSize();
	if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
		return;
	}

	const float2 inputPt = GetInputPt();
	// Step length in texels: `strength` at 1080 rows, scaled linearly with
	// the input height.
	const float relstr = GetInputSize().y / 1080.0f * strength;

	[unroll]
	for (uint i = 0; i <= 1; ++i) {
		[unroll]
		for (uint j = 0; j <= 1; ++j) {
			const uint2 destPos = gxy + uint2(i, j);

			// The tile origin was validated above; the other three pixels can
			// fall outside OUTPUT when its width or height is odd. Skipping
			// them avoids an out-of-bounds store and the wasted warp loop
			// (same guard as Pass 2 / Pass 3).
			if (i != 0 || j != 0) {
				if (destPos.x >= outputSize.x || destPos.y >= outputSize.y) {
					continue;
				}
			}

			float2 pos = (destPos + 0.5f) * inputPt;

			// Iteratively warp towards thinner lines
			for (int iter = 0; iter < iterations; ++iter) {
				float2 dn = tex1.SampleLevel(sam1, pos, 0).xy;
				// Quasi-normalization to handle large vectors and avoid divide by zero
				float2 dd = (dn / (length(dn) + 0.01f)) * inputPt * relstr;
				pos -= dd;
			}

			OUTPUT[destPos] = INPUT.SampleLevel(sam1, pos, 0);
		}
	}
}
