// CRT Lottes
// Public Domain CRT Styled Scan-line Shader by Timothy Lottes
// Ported from https://github.com/libretro/common-shaders/blob/master/crt/shaders/crt-lottes.cg
// CGA arcade monitor style with RGB inputs

//!BGFX EFFECT
//!VERSION 1
//!NAME CRT Lottes
//!CATEGORY CRT
//!DESCRIPTION Timothy Lottes' public domain CRT shader simulating a high-quality CGA arcade monitor. Features multiple shadow mask styles, bloom effects, and barrel distortion. Excellent for pixel art games.

//!PARAMETER
//!LABEL Scanline Hardness
//!DESC Controls sharpness of scanline edges (more negative = harder)
//!DEFAULT -8
//!MIN -20
//!MAX 0
//!STEP 1
int hardScan;

//!PARAMETER
//!LABEL Pixel Hardness
//!DESC Controls sharpness of pixel edges (more negative = harder)
//!DEFAULT -3
//!MIN -20
//!MAX 0
//!STEP 1
int hardPix;

//!PARAMETER
//!LABEL Horizontal Display Warp
//!DESC Horizontal barrel distortion amount
//!DEFAULT 0.031
//!MIN 0
//!MAX 0.125
//!STEP 0.001
float warpX;

//!PARAMETER
//!LABEL Vertical Display Warp
//!DESC Vertical barrel distortion amount
//!DEFAULT 0.041
//!MIN 0
//!MAX 0.125
//!STEP 0.001
float warpY;

//!PARAMETER
//!LABEL Mask Dark
//!DESC Brightness of dark mask pixels
//!DEFAULT 0.5
//!MIN 0
//!MAX 2
//!STEP 0.01
float maskDark;

//!PARAMETER
//!LABEL Mask Light
//!DESC Brightness of light mask pixels
//!DEFAULT 1.5
//!MIN 0
//!MAX 2
//!STEP 0.01
float maskLight;

//!PARAMETER
//!LABEL Shadow Mask
//!DESC Mask type: 0=none, 1=compressed TV, 2=aperture grille, 3=stretched VGA, 4=VGA
//!DEFAULT 3
//!MIN 0
//!MAX 4
//!STEP 1
int shadowMask;

//!PARAMETER
//!LABEL Brightness Boost
//!DESC Overall brightness multiplier
//!DEFAULT 1
//!MIN 0
//!MAX 2
//!STEP 0.01
float brightBoost;

//!PARAMETER
//!LABEL Bloom-X Soft
//!DESC Horizontal bloom softness (closer to zero = softer/wider, more negative = tighter)
//!DEFAULT -1.5
//!MIN -2
//!MAX -0.5
//!STEP 0.01
float hardBloomPix;

//!PARAMETER
//!LABEL Bloom-Y Soft
//!DESC Vertical bloom softness (closer to zero = softer/wider, more negative = tighter)
//!DEFAULT -2
//!MIN -4
//!MAX -1
//!STEP 0.01
float hardBloomScan;

//!PARAMETER
//!LABEL Bloom Amount
//!DESC Intensity of the bloom/glow effect
//!DEFAULT 0.15
//!MIN 0
//!MAX 1
//!STEP 0.01
float bloomAmount;

//!PARAMETER
//!LABEL Filter Kernel Shape
//!DESC Shape of the gaussian filter kernel
//!DEFAULT 2
//!MIN 0
//!MAX 10
//!STEP 0.01
float shape;


//!TEXTURE
Texture2D INPUT;

//!TEXTURE
Texture2D OUTPUT;

//!SAMPLER
//!FILTER POINT
SamplerState sam;


//!PASS 1
//!IN INPUT
//!OUT OUTPUT
//!BLOCK_SIZE 8
//!NUM_THREADS 64

// Silence FXC warning X3571 ("pow(f, e) will not work for negative f").
// The pow() calls in this pass are applied to colour values.
#pragma warning(disable: 3571)

// Bloom is compiled in whenever DO_BLOOM is *defined* (Pass1 tests it with
// #ifdef), so delete this line to disable bloom; setting it to 0 has no effect.
#define DO_BLOOM 1
// Per-axis barrel distortion strength, assembled from the warpX/warpY parameters.
#define warp float2(warpX, warpY)


// Fetch the nearest emulated source texel and convert it to linear light.
//   pos          - sample position in normalized source coordinates
//   off          - offset in source texels, applied before snapping to a texel
//   texture_size - source size in texels
// Returns brightBoost * texel^2.2.
// Positions that snap to a texel outside the source return black. Without
// this, barrel-warped corners and filter taps past the border would read
// whatever the sampler's address mode yields (edge smearing when clamped).
float3 Fetch(float2 pos, float2 off, float2 texture_size) {
	// Snap to the centre of the texel containing (pos + off).
	pos = (floor(pos * texture_size.xy + off) + float2(0.5, 0.5)) / texture_size.xy;

	// Outside the emulated screen: no phosphor, no light.
	if (any(abs(pos - 0.5) > 0.5)) {
		return float3(0.0, 0.0, 0.0);
	}

	return brightBoost * pow(INPUT.SampleLevel(sam, pos, 0).rgb, 2.2f);
}

// Signed offset, measured in emulated (source) pixels, from the sample
// position to the centre of the texel that contains it.
// Each component lies in (-0.5, 0.5].
float2 Dist(float2 pos, float2 texture_size) {
	const float2 texelPos = pos * texture_size.xy;
	const float2 insideTexel = texelPos - floor(texelPos);
	return float2(0.5, 0.5) - insideTexel;
}

// Generalised 1D Gaussian weight: 2^(scale * |pos|^shape).
//   pos   - distance from the kernel centre
//   scale - hardness; callers pass negative values so the weight decays
// The exponent applied to the distance comes from the `shape` parameter.
float Gaus(float pos, float scale) {
	const float falloff = pow(abs(pos), shape);
	return exp2(falloff * scale);
}

// Horizontal 3-tap filter across the emulated row `off` texels away
// vertically. Taps are weighted by Gaus() with hardPix as hardness and the
// result is normalised by the weight sum.
float3 Horz3(float2 pos, float off, float2 texture_size) {
	// The weights depend only on the sub-texel x offset.
	const float dx = Dist(pos, texture_size).x;
	const float hardness = hardPix;
	const float wLeft = Gaus(dx - 1.0, hardness);
	const float wMid = Gaus(dx, hardness);
	const float wRight = Gaus(dx + 1.0, hardness);

	const float3 left = Fetch(pos, float2(-1.0, off), texture_size);
	const float3 mid = Fetch(pos, float2(0.0, off), texture_size);
	const float3 right = Fetch(pos, float2(1.0, off), texture_size);

	const float3 weighted = left * wLeft + mid * wMid + right * wRight;
	const float total = wLeft + wMid + wRight;
	return weighted / total;
}

// Horizontal 5-tap filter across the emulated row `off` texels away
// vertically. Taps at x offsets -2..+2 are weighted by Gaus() with hardPix as
// hardness; the result is normalised by the weight sum.
float3 Horz5(float2 pos, float off, float2 texture_size) {
	// The weights depend only on the sub-texel x offset.
	const float dx = Dist(pos, texture_size).x;
	const float hardness = hardPix;
	const float wM2 = Gaus(dx - 2.0, hardness);
	const float wM1 = Gaus(dx - 1.0, hardness);
	const float w0 = Gaus(dx, hardness);
	const float wP1 = Gaus(dx + 1.0, hardness);
	const float wP2 = Gaus(dx + 2.0, hardness);

	const float3 tM2 = Fetch(pos, float2(-2.0, off), texture_size);
	const float3 tM1 = Fetch(pos, float2(-1.0, off), texture_size);
	const float3 t0 = Fetch(pos, float2(0.0, off), texture_size);
	const float3 tP1 = Fetch(pos, float2(1.0, off), texture_size);
	const float3 tP2 = Fetch(pos, float2(2.0, off), texture_size);

	const float3 weighted = tM2 * wM2 + tM1 * wM1 + t0 * w0 + tP1 * wP1 + tP2 * wP2;
	const float total = wM2 + wM1 + w0 + wP1 + wP2;
	return weighted / total;
}

// Horizontal 7-tap filter used by the bloom path. Taps at x offsets -3..+3 on
// the row `off` texels away vertically are weighted by Gaus() with
// hardBloomPix as hardness; the result is normalised by the weight sum.
float3 Horz7(float2 pos, float off, float2 texture_size) {
	// The weights depend only on the sub-texel x offset.
	const float dx = Dist(pos, texture_size).x;
	const float hardness = hardBloomPix;
	const float wM3 = Gaus(dx - 3.0, hardness);
	const float wM2 = Gaus(dx - 2.0, hardness);
	const float wM1 = Gaus(dx - 1.0, hardness);
	const float w0 = Gaus(dx, hardness);
	const float wP1 = Gaus(dx + 1.0, hardness);
	const float wP2 = Gaus(dx + 2.0, hardness);
	const float wP3 = Gaus(dx + 3.0, hardness);

	const float3 tM3 = Fetch(pos, float2(-3.0, off), texture_size);
	const float3 tM2 = Fetch(pos, float2(-2.0, off), texture_size);
	const float3 tM1 = Fetch(pos, float2(-1.0, off), texture_size);
	const float3 t0 = Fetch(pos, float2(0.0, off), texture_size);
	const float3 tP1 = Fetch(pos, float2(1.0, off), texture_size);
	const float3 tP2 = Fetch(pos, float2(2.0, off), texture_size);
	const float3 tP3 = Fetch(pos, float2(3.0, off), texture_size);

	const float3 weighted = tM3 * wM3 + tM2 * wM2 + tM1 * wM1 + t0 * w0 + tP1 * wP1 + tP2 * wP2 + tP3 * wP3;
	const float total = wM3 + wM2 + wM1 + w0 + wP1 + wP2 + wP3;
	return weighted / total;
}

// Vertical weight of the scanline `off` rows away from the sample, using
// hardScan as the Gaussian hardness.
float Scan(float2 pos, float off, float2 texture_size) {
	const float2 texelOffset = Dist(pos, texture_size);
	return Gaus(off + texelOffset.y, hardScan);
}

// Vertical weight of the scanline `off` rows away from the sample for the
// bloom path, using hardBloomScan as the Gaussian hardness.
float BloomScan(float2 pos, float off, float2 texture_size) {
	const float2 texelOffset = Dist(pos, texture_size);
	return Gaus(off + texelOffset.y, hardBloomScan);
}

// Main image reconstruction: blends the three nearest emulated scanlines.
// The centre row uses the 5-tap horizontal filter, the rows above and below
// use the 3-tap one, and each row is scaled by its Scan() weight.
// The vertical weights are not normalised.
float3 Tri(float2 pos, float2 texture_size) {
	const float wAbove = Scan(pos, -1.0, texture_size);
	const float wCentre = Scan(pos, 0.0, texture_size);
	const float wBelow = Scan(pos, 1.0, texture_size);

	const float3 rowAbove = Horz3(pos, -1.0, texture_size);
	const float3 rowCentre = Horz5(pos, 0.0, texture_size);
	const float3 rowBelow = Horz3(pos, 1.0, texture_size);

	return rowAbove * wAbove + rowCentre * wCentre + rowBelow * wBelow;
}

// Bloom term: a wider blur over five emulated scanlines. The three inner rows
// use the 7-tap horizontal filter, the two outer rows use the 5-tap one, and
// each row is scaled by its BloomScan() weight. The vertical weights are not
// normalised; the caller scales the result by bloomAmount.
float3 Bloom(float2 pos, float2 texture_size) {
	const float wM2 = BloomScan(pos, -2.0, texture_size);
	const float wM1 = BloomScan(pos, -1.0, texture_size);
	const float w0 = BloomScan(pos, 0.0, texture_size);
	const float wP1 = BloomScan(pos, 1.0, texture_size);
	const float wP2 = BloomScan(pos, 2.0, texture_size);

	const float3 rowM2 = Horz5(pos, -2.0, texture_size);
	const float3 rowM1 = Horz7(pos, -1.0, texture_size);
	const float3 row0 = Horz7(pos, 0.0, texture_size);
	const float3 rowP1 = Horz7(pos, 1.0, texture_size);
	const float3 rowP2 = Horz5(pos, 2.0, texture_size);

	return rowM2 * wM2 + rowM1 * wM1 + row0 * w0 + rowP1 * wP1 + rowP2 * wP2;
}

// Barrel distortion of a normalized [0,1] position.
// The position is re-centred to [-1,1]. Each axis is then stretched by
// 1 + (other axis)^2 * warp, and the result is mapped back to [0,1] space.
// Outputs may fall outside [0,1] near the corners.
float2 Warp(float2 pos) {
	float2 centred = pos * 2.0 - 1.0;
	const float2 otherAxisSq = centred.yx * centred.yx;
	centred *= float2(1.0, 1.0) + otherAxisSq * warp;
	return centred * 0.5 + 0.5;
}

// Per-pixel shadow mask multiplier for an output pixel position.
// Every channel starts at maskDark; one channel per pixel is raised to
// maskLight depending on where the pixel falls in the RGB triad pattern
// selected by shadowMask:
//   1 - compressed TV: 3-pixel triads, plus alternating rows scaled by
//       maskDark/maskLight with the row parity flipped every 3 pixels
//   2 - aperture grille: plain vertical 3-pixel triads
//   3 - stretched VGA: 6-pixel triads shifted by 3 pixels on each row
//   4 - VGA: as 3, but with rows paired up (y halved and floored)
// Any other value returns the all-dark mask.
float3 Mask(float2 pos) {
	const float3 allDark = float3(maskDark, maskDark, maskDark);
	if (shadowMask < 1 || shadowMask > 4) {
		return allDark;
	}

	// Row gain is only used by type 1; multiplying by 1.0 is a no-op elsewhere.
	float rowGain = 1.0;
	// Position inside the repeating triad, in [0,1).
	float phase;

	if (shadowMask == 1) {
		const float rowShift = (frac(pos.x / 6.0) < 0.5) ? 1.0 : 0.0;
		rowGain = (frac((pos.y + rowShift) / 2.0) < 0.5) ? maskDark : maskLight;
		phase = frac(pos.x / 3.0);
	} else if (shadowMask == 2) {
		phase = frac(pos.x / 3.0);
	} else {
		if (shadowMask == 4) {
			pos.xy = floor(pos.xy * float2(1.0, 0.5));
		}
		pos.x += pos.y * 3.0;
		phase = frac(pos.x / 6.0);
	}

	// First third lights red, second third green, the remainder blue.
	float3 mask = allDark;
	if (phase < 0.333) {
		mask.r = maskLight;
	} else if (phase < 0.666) {
		mask.g = maskLight;
	} else {
		mask.b = maskLight;
	}

	return mask * rowGain;
}

// Pass entry point: shades one output pixel per thread.
// The output pixel is mapped through the barrel warp into source space,
// reconstructed with the scanline filter (plus bloom when DO_BLOOM is
// defined), multiplied by the shadow mask and written back gamma-encoded.
void Pass1(uint2 blockStart, uint3 threadId) {
	const uint2 gxy = blockStart + TileSwizzle8x8(threadId.x);

	// Threads of partially covered tiles have nothing to write.
	const uint2 outputSize = GetOutputSize();
	if (any(gxy >= outputSize)) {
		return;
	}

	const float2 pixelCentre = gxy + 0.5f;
	const float2 srcSize = GetInputSize();
	const float2 srcPos = Warp(pixelCentre * GetOutputPt());

	float3 colour = Tri(srcPos, srcSize);

#ifdef DO_BLOOM
	// Glow contribution from the wider bloom kernel.
	colour += Bloom(srcPos, srcSize) * bloomAmount;
#endif

	// The mask is laid out in output pixels, not source pixels.
	if (shadowMask != 0) {
		colour *= Mask(pixelCentre);
	}

	// Back from linear light to display gamma.
	OUTPUT[gxy] = float4(pow(colour, 1.0f / 2.2f), 1);
}
