// Anime4K Bilateral Mean Denoiser
// Based on Anime4K project by bloc97

//!BGFX EFFECT
//!VERSION 1
//!NAME Anime4K Denoise Bilateral Mean
//!CATEGORY Anime4K
//!DESCRIPTION Bilateral mean filter for noise reduction. Uses intensity and spatial weighting for edge-preserving smoothing.


// User-tunable intensity (range) sigma of the bilateral filter.
// Pass 1 reads it through the INTENSITY_SIGMA macro.
//!PARAMETER
//!LABEL Intensity Sigma
//!DESC Controls denoising strength. Higher values produce stronger smoothing but may blur details.
//!DEFAULT 0.1
//!MIN 0.01
//!MAX 5
//!STEP 0.01
float intensitySigma;

// Source image; declared as the input of Pass 1.
//!TEXTURE
Texture2D INPUT;

// Filtered result, sized INPUT_WIDTH x INPUT_HEIGHT; declared as the output of Pass 1.
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
Texture2D OUTPUT;

// Point-filtered sampler that Pass 1 passes to its GatherRed/Green/Blue fetches.
//!SAMPLER
//!FILTER POINT
SamplerState sam;


//!PASS 1
//!IN INPUT
//!OUT OUTPUT
//!BLOCK_SIZE 16
//!NUM_THREADS 64

// Intensity (range) sigma of the bilateral filter, bound to the user-tunable
// intensitySigma parameter. Higher values give stronger denoising.
#define INTENSITY_SIGMA intensitySigma
// Spatial sigma, measured in pixel offsets. It controls the blur radius and
// determines the kernel size below.
#define SPATIAL_SIGMA 1.0
// Exponent applied to the center color when scaling the intensity sigma
// (0 = same sigma at every brightness, higher = sigma adapts to brightness).
#define INTENSITY_POWER_CURVE 1.0

// Kernel width/height in pixels: 2 * max(ceil(2 * SPATIAL_SIGMA), 1) + 1.
// Always odd; evaluates to 5 for SPATIAL_SIGMA = 1.0.
#define KERNELSIZE (max(uint(ceil(SPATIAL_SIGMA * 2.0)), 1) * 2 + 1)
// Kernel radius (KERNELSIZE / 2, truncated). NOTE: this is unsigned, so convert
// it to int before subtracting it from a possibly smaller value.
#define KERNELHALFSIZE (uint(KERNELSIZE/2))
// Total number of taps in the kernel (KERNELSIZE squared).
#define KERNELLEN (KERNELSIZE * KERNELSIZE)


// Per-channel unnormalized Gaussian: exp(-0.5 * ((x - m) * rcpS)^2).
//   x    - sample values
//   rcpS - reciprocal of the standard deviation, per channel
//   m    - mean (center) values
float3 GaussianVec(float3 x, float3 rcpS, float3 m) {
	const float3 z = (x - m) * rcpS;
	const float3 halfSq = 0.5 * z * z;
	return exp(-halfSq);
}

// Scalar unnormalized Gaussian: exp(-0.5 * ((x - m) * rcpS)^2).
//   x    - sample value
//   rcpS - reciprocal of the standard deviation
//   m    - mean (center) value
float GaussianScalar(float x, float rcpS, float m) {
	const float z = (x - m) * rcpS;
	const float halfSq = 0.5 * z * z;
	return exp(-halfSq);
}


// Pass 1: bilateral mean filter.
// Each thread produces a 2x2 quad of output pixels. It gathers the
// (KERNELSIZE + 1)^2 neighbourhood shared by the quad into a local array and,
// for every output pixel, averages its KERNELSIZE x KERNELSIZE window with
// weights = intensity Gaussian (per channel, around the center color) times
// spatial Gaussian (distance from the window center).
//   blockStart - pixel origin of the thread block (supplied by the framework)
//   threadId   - thread index inside the block; only .x is used
void Pass1(uint2 blockStart, uint3 threadId) {
	// Top-left pixel of this thread's 2x2 output quad.
	uint2 gxy = (TileSwizzle8x8(threadId.x) << 1) + blockStart;

	const uint2 outputSize = GetOutputSize();
	if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
		return;
	}

	// NOTE(review): presumably the size of one input texel in UV space - confirm.
	float2 inputPt = GetInputPt();
	uint i, j;

	// KERNELHALFSIZE is unsigned. All offset arithmetic below is done in int so
	// that positions left of / above the center become small negative numbers
	// instead of wrapping around to ~4.29e9.
	const int halfSize = int(KERNELHALFSIZE);

	// Gather position of the top-left 2x2 footprint. It may be negative for quads
	// on the first row/column; the sampler's address mode then resolves it.
	const int2 gatherOrigin = int2(gxy) - halfSize + 1;

	// Gather source pixels into local array: src[x][y], where src[0][0] is the
	// pixel at gxy - halfSize.
	float3 src[KERNELSIZE + 1][KERNELSIZE + 1];
	[unroll]
	for (i = 0; i <= KERNELSIZE - 1; i += 2) {
		[unroll]
		for (j = 0; j <= KERNELSIZE - 1; j += 2) {
			float2 tpos = (gatherOrigin + int2(i, j)) * inputPt;
			const float4 sr = INPUT.GatherRed(sam, tpos);
			const float4 sg = INPUT.GatherGreen(sam, tpos);
			const float4 sb = INPUT.GatherBlue(sam, tpos);

			// Gather returns: w z
			//                 x y
			src[i][j] = float3(sr.w, sg.w, sb.w);
			src[i][j + 1] = float3(sr.x, sg.x, sb.x);
			src[i + 1][j] = float3(sr.z, sg.z, sb.z);
			src[i + 1][j + 1] = float3(sr.y, sg.y, sb.y);
		}
	}

	// Precompute the spatial weights once; they depend only on the tap offset
	// and are identical for all four output pixels.
	const float rcpSs = rcp(SPATIAL_SIGMA);
	float spatialWeight[KERNELSIZE][KERNELSIZE];
	[unroll]
	for (i = 0; i < KERNELSIZE; ++i) {
		[unroll]
		for (j = 0; j < KERNELSIZE; ++j) {
			// Signed subtraction: taps with i or j below halfSize must yield
			// negative offsets, not wrapped unsigned values.
			const float dist = length(float2(int(i) - halfSize, int(j) - halfSize));
			spatialWeight[i][j] = GaussianScalar(dist, rcpSs, 0);
		}
	}

	// Process 2x2 output block
	[unroll]
	for (i = 0; i <= 1; ++i) {
		[unroll]
		for (j = 0; j <= 1; ++j) {
			uint2 destPos = gxy + uint2(i, j);

			float3 sum = 0;
			float3 n = 0;

			// Center pixel color of this output pixel's window.
			float3 vc = src[KERNELHALFSIZE + i][KERNELHALFSIZE + j].rgb;

			// Reciprocal intensity sigma, scaled per channel by the center color
			// raised to INTENSITY_POWER_CURVE (the 0.0001 bias avoids a zero sigma).
			float3 rcpIs = rcp(pow(vc + 0.0001, INTENSITY_POWER_CURVE) * INTENSITY_SIGMA);

			// Apply bilateral filter kernel. The center tap always has weight 1,
			// so n is never zero.
			[unroll]
			for (uint k = 0; k < KERNELSIZE; ++k) {
				[unroll]
				for (uint m = 0; m < KERNELSIZE; ++m) {
					float3 v = src[k + i][m + j];
					float3 d = GaussianVec(v, rcpIs, vc) * spatialWeight[k][m];
					sum += d * v;
					n += d;
				}
			}

			OUTPUT[destPos] = float4(sum / n, 1);
		}
	}
}
