Skip to content

Commit

Permalink
fp16 optimizations (#985)
Browse files Browse the repository at this point in the history
  • Loading branch information
turanszkij authored Dec 8, 2024
1 parent 765fd54 commit 842e2a0
Show file tree
Hide file tree
Showing 28 changed files with 340 additions and 320 deletions.
36 changes: 10 additions & 26 deletions WickedEngine/shaders/ColorSpaceUtility.hlsli
Original file line number Diff line number Diff line change
Expand Up @@ -34,41 +34,25 @@
// are--the sRGB curve needs to be removed before involving the colors in linear mathematics such
// as physically based lighting.

float3 ApplySRGBCurve( float3 x )
{
	// Piecewise sRGB curve: scaled linear segment for small values,
	// 1/2.4 power segment otherwise. Approximately pow(x, 1.0 / 2.2) overall.
	const float3 linear_part = 12.92 * x;
	const float3 gamma_part = 1.055 * pow(x, 1.0 / 2.4) - 0.055;
	return select(x < 0.0031308, linear_part, gamma_part);
}
// Note: modified for Wicked Engine to use macros, for better half precision mapping

float3 RemoveSRGBCurve( float3 x )
{
	// Inverse of the piecewise sRGB curve: divide by 12.92 for small values,
	// 2.4 power segment otherwise. Approximately pow(x, 2.2) overall.
	const float3 linear_part = x / 12.92;
	const float3 gamma_part = pow((x + 0.055) / 1.055, 2.4);
	return select(x < 0.04045, linear_part, gamma_part);
}
// Applies the piecewise sRGB curve: 12.92 * x below 0.0031308, 1/2.4 power segment otherwise.
// Approximately pow(x, 1.0 / 2.2)
// Every use of x is parenthesized so compound arguments (e.g. a + b) expand as a single value.
// x is expanded more than once, so do not pass expressions with side effects.
#define ApplySRGBCurve( x ) select((x) < 0.0031308, 12.92 * (x), 1.055 * pow((x), 1.0 / 2.4) - 0.055)

// Removes the piecewise sRGB curve: x / 12.92 below 0.04045, 2.4 power segment otherwise.
// Approximately pow(x, 2.2)
// Every use of x is parenthesized so compound arguments (e.g. a + b) expand as a single value.
// x is expanded more than once, so do not pass expressions with side effects.
#define RemoveSRGBCurve( x ) select((x) < 0.04045, (x) / 12.92, pow(((x) + 0.055) / 1.055, 2.4))

// These functions avoid pow() to efficiently approximate sRGB with an error < 0.4%.
float3 ApplySRGBCurve_Fast( float3 x )
{
	// Same linear segment as the exact curve; the upper segment uses a
	// sqrt-based fit in place of pow().
	const float3 linear_part = 12.92 * x;
	const float3 fitted_part = 1.13005 * sqrt(x - 0.00228) - 0.13448 * x + 0.005719;
	return select(x < 0.0031308, linear_part, fitted_part);
}
// pow()-free sRGB curve approximation: 12.92 * x below 0.0031308, sqrt-based fit otherwise.
// Every use of x is parenthesized so compound arguments (e.g. a + b) expand as a single value.
// x is expanded more than once, so do not pass expressions with side effects.
#define ApplySRGBCurve_Fast( x ) select((x) < 0.0031308, 12.92 * (x), 1.13005 * sqrt((x) - 0.00228) - 0.13448 * (x) + 0.005719)

float3 RemoveSRGBCurve_Fast( float3 x )
{
	// Same linear segment as the exact inverse curve; the upper segment uses a
	// sqrt-based fit in place of pow().
	const float3 linear_part = x / 12.92;
	const float3 fitted_part = -7.43605 * x - 31.24297 * sqrt(-0.53792 * x + 1.279924) + 35.34864;
	return select(x < 0.04045, linear_part, fitted_part);
}
// pow()-free inverse sRGB curve approximation: x / 12.92 below 0.04045, sqrt-based fit otherwise.
// Every use of x is parenthesized so compound arguments (e.g. a + b) expand as a single value.
// x is expanded more than once, so do not pass expressions with side effects.
#define RemoveSRGBCurve_Fast( x ) select((x) < 0.04045, (x) / 12.92, -7.43605 * (x) - 31.24297 * sqrt(-0.53792 * (x) + 1.279924) + 35.34864)

// The OETF recommended for content shown on HDTVs. This "gamma ramp" may increase contrast as
// appropriate for viewing in a dark environment. Always use this curve with Limited RGB as it is
// used in conjunction with HDTVs.
float3 ApplyREC709Curve( float3 x )
{
	// Piecewise REC709 curve: 4.5x linear segment for small values,
	// 0.45 power segment otherwise.
	const float3 linear_part = 4.5 * x;
	const float3 gamma_part = 1.0993 * pow(x, 0.45) - 0.0993;
	return select(x < 0.0181, linear_part, gamma_part);
}
// Applies the piecewise REC709 curve: 4.5 * x below 0.0181, 0.45 power segment otherwise.
// Every use of x is parenthesized so compound arguments (e.g. a + b) expand as a single value.
// x is expanded more than once, so do not pass expressions with side effects.
#define ApplyREC709Curve( x ) select((x) < 0.0181, 4.5 * (x), 1.0993 * pow((x), 0.45) - 0.0993)

float3 RemoveREC709Curve( float3 x )
{
	// Inverse of the piecewise REC709 curve: divide by 4.5 for small values,
	// 1/0.45 power segment otherwise.
	const float3 linear_part = x / 4.5;
	const float3 gamma_part = pow((x + 0.0993) / 1.0993, 1.0 / 0.45);
	return select(x < 0.08145, linear_part, gamma_part);
}
// Removes the piecewise REC709 curve: x / 4.5 below 0.08145, 1/0.45 power segment otherwise.
// Every use of x is parenthesized so compound arguments (e.g. a + b) expand as a single value.
// x is expanded more than once, so do not pass expressions with side effects.
#define RemoveREC709Curve( x ) select((x) < 0.08145, (x) / 4.5, pow(((x) + 0.0993) / 1.0993, 1.0 / 0.45))

// This is the new HDR transfer function, also called "PQ" for perceptual quantizer. Note that REC2084
// does not also refer to a color space. REC2084 is typically used with the REC2020 color space.
Expand Down
4 changes: 2 additions & 2 deletions WickedEngine/shaders/ShaderInterop_DDGI.h
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ half3 ddgi_sample_irradiance(float3 P, half3 N)
half dist_to_probe = length(probe_to_point);

//float2 temp = textureLod(depth_texture, tex_coord, 0.0f).rg;
half2 temp = bindless_textures[GetScene().ddgi.depth_texture].SampleLevel(sampler_linear_clamp, tex_coord, 0).xy;
half2 temp = bindless_textures_half4[GetScene().ddgi.depth_texture].SampleLevel(sampler_linear_clamp, tex_coord, 0).xy;
half mean = temp.x;
half variance = abs(sqr(temp.x) - temp.y);

Expand All @@ -254,7 +254,7 @@ half3 ddgi_sample_irradiance(float3 P, half3 N)
float2 tex_coord = ddgi_probe_color_uv(probe_grid_coord, irradiance_dir);

//float3 probe_irradiance = textureLod(irradiance_texture, tex_coord, 0.0f).rgb;
half3 probe_irradiance = bindless_textures[GetScene().ddgi.color_texture].SampleLevel(sampler_linear_clamp, tex_coord, 0).rgb;
half3 probe_irradiance = bindless_textures_half4[GetScene().ddgi.color_texture].SampleLevel(sampler_linear_clamp, tex_coord, 0).rgb;

// A tiny bit of light is really visible due to log perception, so
// crush tiny weights but keep the curve continuous. This must be done
Expand Down
8 changes: 2 additions & 6 deletions WickedEngine/shaders/ShaderInterop_Font.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,8 @@ struct FontConstants
int texture_index;
int padding0;

float4 color;

float softness;
float bolden;
uint flags;
float hdr_scaling;
uint2 color; // packed half4
uint2 softness_bolden_hdrscaling; // packed half3 | uint16 flags

float4x4 transform;
};
Expand Down
30 changes: 15 additions & 15 deletions WickedEngine/shaders/ShaderInterop_Renderer.h
Original file line number Diff line number Diff line change
Expand Up @@ -195,12 +195,12 @@ struct alignas(16) ShaderTextureSlot
{
return f16tof32((uvset_aniso_lodclamp >> 16u) & 0xFFFF);
}
Texture2D GetTexture()
Texture2D<half4> GetTexture()
{
return bindless_textures[UniformTextureSlot(texture_descriptor)];
return bindless_textures_half4[UniformTextureSlot(texture_descriptor)];
}
float4 SampleVirtual(
in Texture2D tex,
half4 SampleVirtual(
in Texture2D<half4> tex,
in SamplerState sam,
in float2 uv,
in Texture2D<uint4> residency_map,
Expand Down Expand Up @@ -235,7 +235,7 @@ struct alignas(16) ShaderTextureSlot
const float clamped_lod = virtual_lod < max_nonpacked_lod ? max(virtual_lod, residency.z) : virtual_lod;

// Mip - more detailed:
float4 value0;
half4 value0;
{
uint lod0 = uint(clamped_lod);
const uint packed_mip_idx = packed_mips ? uint(virtual_lod - max_nonpacked_lod - 1) : 0;
Expand All @@ -249,7 +249,7 @@ struct alignas(16) ShaderTextureSlot
}

// Mip - less detailed:
float4 value1;
half4 value1;
{
uint lod1 = uint(clamped_lod + 1);
packed_mips = uint(lod1) > max_nonpacked_lod;
Expand All @@ -265,11 +265,11 @@ struct alignas(16) ShaderTextureSlot
value1 = tex.SampleLevel(sam, atlas_uv, 0);
}

return lerp(value0, value1, frac(clamped_lod)); // custom trilinear filtering
return lerp(value0, value1, (half)frac(clamped_lod)); // custom trilinear filtering
}
float4 Sample(in SamplerState sam, in float4 uvsets)
half4 Sample(in SamplerState sam, in float4 uvsets)
{
Texture2D tex = GetTexture();
Texture2D<half4> tex = GetTexture();
float2 uv = GetUVSet() == 0 ? uvsets.xy : uvsets.zw;

#ifndef DISABLE_SVT
Expand All @@ -288,9 +288,9 @@ struct alignas(16) ShaderTextureSlot
return tex.Sample(sam, uv);
}

float4 SampleLevel(in SamplerState sam, in float4 uvsets, in float lod)
half4 SampleLevel(in SamplerState sam, in float4 uvsets, in float lod)
{
Texture2D tex = GetTexture();
Texture2D<half4> tex = GetTexture();
float2 uv = GetUVSet() == 0 ? uvsets.xy : uvsets.zw;

#ifndef DISABLE_SVT
Expand All @@ -308,9 +308,9 @@ struct alignas(16) ShaderTextureSlot
return tex.SampleLevel(sam, uv, lod);
}

float4 SampleBias(in SamplerState sam, in float4 uvsets, in float bias)
half4 SampleBias(in SamplerState sam, in float4 uvsets, in float bias)
{
Texture2D tex = GetTexture();
Texture2D<half4> tex = GetTexture();
float2 uv = GetUVSet() == 0 ? uvsets.xy : uvsets.zw;

#ifndef DISABLE_SVT
Expand All @@ -330,9 +330,9 @@ struct alignas(16) ShaderTextureSlot
return tex.SampleBias(sam, uv, bias);
}

float4 SampleGrad(in SamplerState sam, in float4 uvsets, in float4 uvsets_dx, in float4 uvsets_dy)
half4 SampleGrad(in SamplerState sam, in float4 uvsets, in float4 uvsets_dx, in float4 uvsets_dy)
{
Texture2D tex = GetTexture();
Texture2D<half4> tex = GetTexture();
float2 uv = GetUVSet() == 0 ? uvsets.xy : uvsets.zw;
float2 uv_dx = GetUVSet() == 0 ? uvsets_dx.xy : uvsets_dx.zw;
float2 uv_dy = GetUVSet() == 0 ? uvsets_dy.xy : uvsets_dy.zw;
Expand Down
26 changes: 16 additions & 10 deletions WickedEngine/shaders/fontPS.hlsl
Original file line number Diff line number Diff line change
Expand Up @@ -8,21 +8,27 @@ struct VertextoPixel
float2 bary : TEXCOORD1;
};

float4 main(VertextoPixel input) : SV_TARGET
half4 main(VertextoPixel input) : SV_TARGET
{
Texture2D tex = bindless_textures[font.texture_index];
Texture2D<half4> tex = bindless_textures_half4[font.texture_index];
half value = tex.SampleLevel(sampler_linear_clamp, input.uv, 0).r;
half4 color = font.color;
half4 color = unpack_half4(font.color);

const half3 softness_bolden_hdrscaling = unpack_half3(font.softness_bolden_hdrscaling);
const half softness = softness_bolden_hdrscaling.x;
const half bolden = softness_bolden_hdrscaling.y;
const half hdr_scaling = softness_bolden_hdrscaling.z;
const min16uint flags = font.softness_bolden_hdrscaling.y >> 16u;

[branch]
if (font.flags & FONT_FLAG_SDF_RENDERING)
if (flags & FONT_FLAG_SDF_RENDERING)
{
float2 bary_fw = fwidth(input.bary);
float w = max(bary_fw.x, bary_fw.y); // screen coverage dependency
half w = max(bary_fw.x, bary_fw.y); // screen coverage dependency
w = max(w, 1.0 / 255.0); // min softness to avoid pixelated hard edge in magnification
w += font.softness;
w += softness;
w = saturate(w);
half mid = lerp(SDF::onedge_value_unorm, 0, font.bolden);
half mid = lerp((half)SDF::onedge_value_unorm, 0, bolden);
color.a *= smoothstep(saturate(mid - w), saturate(mid + w), value);
}
else
Expand All @@ -31,7 +37,7 @@ float4 main(VertextoPixel input) : SV_TARGET
}

[branch]
if (font.flags & FONT_FLAG_OUTPUT_COLOR_SPACE_HDR10_ST2084)
if (flags & FONT_FLAG_OUTPUT_COLOR_SPACE_HDR10_ST2084)
{
// https://github.com/microsoft/DirectX-Graphics-Samples/blob/master/Samples/Desktop/D3D12HDR/src/presentPS.hlsl
const half referenceWhiteNits = 80.0;
Expand All @@ -42,10 +48,10 @@ float4 main(VertextoPixel input) : SV_TARGET
// Apply the ST.2084 curve to the result.
color.rgb = ApplyREC2084Curve(color.rgb * hdrScalar);
}
else if (font.flags & FONT_FLAG_OUTPUT_COLOR_SPACE_LINEAR)
else if (flags & FONT_FLAG_OUTPUT_COLOR_SPACE_LINEAR)
{
color.rgb = RemoveSRGBCurve_Fast(color.rgb);
color.rgb *= font.hdr_scaling;
color.rgb *= hdr_scaling;
}

return color;
Expand Down
Loading

0 comments on commit 842e2a0

Please sign in to comment.