From 8611727ffc4f7e115e5da77a2ede894cca561261 Mon Sep 17 00:00:00 2001
From: papadanku <pauldang2@gmail.com>
Date: Mon, 2 Sep 2024 02:42:22 -0700
Subject: [PATCH] cImageSharpen: Implement FSR's Robust Contrast Adaptive
 Sharpening (RCAS)

---
 shaders/cImageSharpen.fx           |  50 +-------
 shaders/shared/fidelityfx/cCas.fxh | 188 ++++++++++++++---------------
 2 files changed, 95 insertions(+), 143 deletions(-)

diff --git a/shaders/cImageSharpen.fx b/shaders/cImageSharpen.fx
index f4d6a0a..ae9c681 100644
--- a/shaders/cImageSharpen.fx
+++ b/shaders/cImageSharpen.fx
@@ -1,59 +1,17 @@
 
-/*
-    Bilinear modification of AMD's CAS algorithm.
-
-    Source: https://github.com/GPUOpen-LibrariesAndSDKs/FidelityFX-SDK/blob/main/sdk/include/FidelityFX/gpu/cas/ffx_cas.h
-
-    This file is part of the FidelityFX SDK.
-
-    Copyright (C) 2024 Advanced Micro Devices, Inc.
-
-    Permission is hereby granted, free of charge, to any person obtaining a copy
-    of this software and associated documentation files(the "Software"), to deal
-    in the Software without restriction, including without limitation the rights
-    to use, copy, modify, merge, publish, distribute, sublicense, and /or sell
-    copies of the Software, and to permit persons to whom the Software is
-    furnished to do so, subject to the following conditions:
-
-    The above copyright notice and this permission notice shall be included in
-    all copies or substantial portions of the Software.
-
-    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-    THE SOFTWARE.
-*/
-
 uniform int _RenderMode <
     ui_label = "Render Mode";
     ui_type = "combo";
     ui_items = "Image\0Mask\0";
 > = 0;
 
-uniform int _Detection <
-    ui_category = "Sharpening";
-    ui_label = "Detection Mode";
-    ui_type = "combo";
-    ui_items = "Multi-Channel\0Single-Channel (Average)\0Single-Channel (Max)\0";
-> = 0;
-
-uniform int _Kernel <
-    ui_category = "Sharpening";
-    ui_label = "Kernel Shape";
-    ui_type = "combo";
-    ui_items = "CAS: Box\0CAS: Diamond\0CShade: Bilinear Diamond\0";
-> = 1;
-
-uniform float _Contrast <
+uniform float _Sharpening <
     ui_category = "Sharpening";
     ui_label = "Contrast";
     ui_type = "slider";
     ui_min = 0.0;
     ui_max = 1.0;
-> = 0.0;
+> = 1.0;
 
 #include "shared/fidelityfx/cCas.fxh"
 
@@ -69,9 +27,7 @@ float4 PS_CasFilterNoScaling(CShade_VS2PS_Quad Input): SV_TARGET0
         OutputMask,
         Input.Tex0,
         fwidth(Input.Tex0.xy),
-        _Detection,
-        _Kernel,
-        _Contrast
+        _Sharpening
     );
 
     if (_RenderMode == 1)
diff --git a/shaders/shared/fidelityfx/cCas.fxh b/shaders/shared/fidelityfx/cCas.fxh
index 942c20f..1c82bbb 100644
--- a/shaders/shared/fidelityfx/cCas.fxh
+++ b/shaders/shared/fidelityfx/cCas.fxh
@@ -28,6 +28,44 @@
     THE SOFTWARE.
 */
 
+/*
+    https://github.com/GPUOpen-LibrariesAndSDKs/FidelityFX-SDK/blob/main/sdk/include/FidelityFX/gpu/fsr1/ffx_fsr1.h
+
+    FSR - [RCAS] ROBUST CONTRAST ADAPTIVE SHARPENING
+
+    CAS uses a simplified mechanism to convert local contrast into a variable amount of sharpness.
+    RCAS uses a more exact mechanism, solving for the maximum local sharpness possible before clipping.
+    RCAS also has a built in process to limit sharpening of what it detects as possible noise.
+    The RCAS sharpener does not support scaling, as it should be applied after EASU scaling.
+    Pass EASU output straight into RCAS, no color conversions necessary.
+
+    RCAS is based on the following logic.
+    RCAS uses a 5 tap filter in a cross pattern (same as CAS),
+        w                n
+      w 1 w  for taps  w m e
+        w                s
+
+    Where 'w' is the negative lobe weight.
+      output = (w*(n+e+w+s)+m)/(4*w+1)
+
+    RCAS solves for 'w' by seeing where the signal might clip out of the {0 to 1} input range,
+      0 == (w*(n+e+w+s)+m)/(4*w+1) -> w = -m/(n+e+w+s)
+      1 == (w*(n+e+w+s)+m)/(4*w+1) -> w = (1-m)/(n+e+w+s-4*1)
+
+    Then chooses the 'w' which results in no clipping, limits 'w', and multiplies by the 'sharp' amount.
+    This solution above has issues with MSAA input as the steps along the gradient cause edge detection issues.
+    So RCAS uses 4x the maximum and 4x the minimum (depending on equation) in place of the individual taps.
+    As well as switching from 'm' to either the minimum or maximum (depending on side), to help in energy conservation.
+    This stabilizes RCAS.
+
+    RCAS does a simple highpass which is normalized against the local contrast then shaped,
+           0.25
+      0.25  -1  0.25
+           0.25
+    This is used as a noise detection filter, to reduce the effect of RCAS on grain, and focus on real edges.
+*/
+#define FSR_RCAS_LIMIT (0.25 - (1.0 / 16.0))
+
 #if !defined(INCLIDE_FFX_CAS)
     #define INCLIDE_FFX_CAS
 
@@ -36,105 +74,63 @@
         inout float4 FilterMask,
         in float2 Tex,
         in float2 Delta,
-        in int Detection,
-        in int Kernel,
-        in float Contrast
+        in float Sharpening
     )
     {
-        /*
-            Load a collection of samples in a 3x3 neighorhood, where e is the current pixel.
-            5 3 6 |   3   | 1 3
-            1 0 2 | 1 0 2 |  0
-            7 4 8 |   4   | 2 4
-        */
-
-        // Select kernel sample
-        float4 TexArray[3];
-        float4 Sample[9];
-        switch (Kernel)
-        {
-            case 0:
-                TexArray[0] = Tex.xyxy + (Delta.xyxy * float4(-1.0, 0.0, 1.0, 0.0));
-                TexArray[1] = Tex.xyxy + (Delta.xyxy * float4(0.0, -1.0, 0.0, 1.0));
-                TexArray[2] = Tex.xyxy + (Delta.xyxy * float4(-1.0, -1.0, 1.0, 1.0));
-                Sample[0] = tex2D(CShade_SampleColorTex, Tex);
-                Sample[1] = tex2D(CShade_SampleColorTex, TexArray[0].xy);
-                Sample[2] = tex2D(CShade_SampleColorTex, TexArray[0].zw);
-                Sample[3] = tex2D(CShade_SampleColorTex, TexArray[1].xy);
-                Sample[4] = tex2D(CShade_SampleColorTex, TexArray[1].zw);
-                Sample[5] = tex2D(CShade_SampleColorTex, TexArray[2].xw);
-                Sample[6] = tex2D(CShade_SampleColorTex, TexArray[2].zw);
-                Sample[7] = tex2D(CShade_SampleColorTex, TexArray[2].xy);
-                Sample[8] = tex2D(CShade_SampleColorTex, TexArray[2].zy);
-                break;
-            case 1:
-                TexArray[0] = Tex.xyxy + (Delta.xyxy * float4(-1.0, 0.0, 1.0, 0.0));
-                TexArray[1] = Tex.xyxy + (Delta.xyxy * float4(0.0, -1.0, 0.0, 1.0));
-                Sample[0] = tex2D(CShade_SampleColorTex, Tex);
-                Sample[1] = tex2D(CShade_SampleColorTex, TexArray[0].xy);
-                Sample[2] = tex2D(CShade_SampleColorTex, TexArray[0].zw);
-                Sample[3] = tex2D(CShade_SampleColorTex, TexArray[1].xy);
-                Sample[4] = tex2D(CShade_SampleColorTex, TexArray[1].zw);
-                break;
-            case 2:
-                TexArray[0] = Tex.xyxy + (Delta.xyxy * float4(-0.5, -0.5, 0.5, 0.5));
-                Sample[0] = tex2D(CShade_SampleColorTex, Tex);
-                Sample[1] = tex2D(CShade_SampleColorTex, TexArray[0].xw);
-                Sample[2] = tex2D(CShade_SampleColorTex, TexArray[0].zw);
-                Sample[3] = tex2D(CShade_SampleColorTex, TexArray[0].xy);
-                Sample[4] = tex2D(CShade_SampleColorTex, TexArray[0].zy);
-                break;
-            default:
-                break;
-        }
-
-        // Get polar min/max
-        float4 MinRGB = min(Sample[0], min(min(Sample[1], Sample[2]), min(Sample[3], Sample[4])));
-        float4 MaxRGB = max(Sample[0], max(max(Sample[1], Sample[2]), max(Sample[3], Sample[4])));
-
-        if (Kernel == 0)
-        {
-            MinRGB = min(MinRGB, min(min(Sample[5], Sample[6]), min(Sample[7], Sample[8])));
-            MaxRGB = max(MaxRGB, max(max(Sample[5], Sample[6]), max(Sample[7], Sample[8])));
-        }
-
-        // Get needed reciprocal
-        float4 ReciprocalMaxRGB = 1.0 / MaxRGB;
-
-        // Amplify
-        float4 AmplifyRGB = saturate(min(MinRGB, 2.0 - MaxRGB) * ReciprocalMaxRGB);
-
-        // Optional grayscale
-        switch (Detection)
-        {
-            case 1:
-                AmplifyRGB = CColor_GetLuma(AmplifyRGB.rgb, 0);
-                break;
-            case 2:
-                AmplifyRGB = CColor_GetLuma(AmplifyRGB.rgb, 3);
-                break;
-        }
-
-        // Shaping amount of sharpening.
-        AmplifyRGB *= rsqrt(AmplifyRGB);
-
-        /* Filter shape.
-              w   |   w   | w w
-            w 1 w | w 1 w |  1
-              w   |   w   | w w
-        */
-        float4 Peak = -(1.0 / lerp(8.0, 5.0, Contrast));
-        float4 Weight = AmplifyRGB * Peak;
-        float4 ReciprocalWeight = 1.0 / (1.0 + (4.0 * Weight));
-
+        float4 TexArray[2];
+        TexArray[0] = Tex.xyxy + (Delta.xyxy * float4(-1.0, 0.0, 1.0, 0.0));
+        TexArray[1] = Tex.xyxy + (Delta.xyxy * float4(0.0, -1.0, 0.0, 1.0));
+
+        float4 Sample[5];
+        Sample[0] = tex2D(CShade_SampleColorTex, Tex);
+        Sample[1] = tex2D(CShade_SampleColorTex, TexArray[0].xy);
+        Sample[2] = tex2D(CShade_SampleColorTex, TexArray[0].zw);
+        Sample[3] = tex2D(CShade_SampleColorTex, TexArray[1].xy);
+        Sample[4] = tex2D(CShade_SampleColorTex, TexArray[1].zw);
+
+        // Luma times 2, computed as (0.5 * R) + G + (0.5 * B) like FSR's RCAS: green carries the largest weight.
+        float Luma[5];
+        Luma[0] = dot(Sample[0].rgb, float3(0.5, 1.0, 0.5));
+        Luma[1] = dot(Sample[1].rgb, float3(0.5, 1.0, 0.5));
+        Luma[2] = dot(Sample[2].rgb, float3(0.5, 1.0, 0.5));
+        Luma[3] = dot(Sample[3].rgb, float3(0.5, 1.0, 0.5));
+        Luma[4] = dot(Sample[4].rgb, float3(0.5, 1.0, 0.5));
+
+        // Noise detection: highpass of the 4-tap ring against the center, normalized by the local luma range.
+        float Noise = ((Luma[1] + Luma[2] + Luma[3] + Luma[4]) * 0.25) - Luma[0];
+        float MaxLuma = max(Luma[0], max(max(Luma[1], Luma[2]), max(Luma[3], Luma[4])));
+        float MinLuma = min(Luma[0], min(min(Luma[1], Luma[2]), min(Luma[3], Luma[4])));
+        float RangeLuma = MaxLuma - MinLuma;
+        Noise = (RangeLuma > 0.0) ? saturate(abs(Noise) / RangeLuma) : 0.0; // Flat neighborhood: avoid 0/0 (NaN)
+        Noise = (-0.5 * Noise) + 1.0; // Maps [0, 1] to [1, 0.5]; later used to scale the lobe down
+
+        // Min and max of ring.
+        float4 MaxRGB = max(max(Sample[1], Sample[2]), max(Sample[3], Sample[4]));
+        float4 MinRGB = min(min(Sample[1], Sample[2]), min(Sample[3], Sample[4]));
+
+        // Immediate constants for peak range.
+        float2 PeakC = float2(1.0, -1.0 * 4.0);
+
+        // Limiters, these need to be high precision RCPs.
+        float4 HitMinRGB = MinRGB / (4.0 * MaxRGB);
+        float4 HitMaxRGB = (PeakC.x - MaxRGB) / ((4.0 * MinRGB) + PeakC.y);
+        float4 LobeRGB = max(-HitMinRGB, HitMaxRGB);
+        float MaxLobe = max(max(LobeRGB.r, LobeRGB.g), LobeRGB.b);
+
+        Sharpening = 1.0 - Sharpening; // Invert the amount so that 0.0 yields the strongest lobe (exp2(0) == 1)
+        float4 Lobe = max(-FSR_RCAS_LIMIT, min(MaxLobe, 0.0)) * exp2(-Sharpening); // Keep the float scale; int() truncated it to 0
+
+        // Apply noise removal
+        Lobe *= Noise;
+
+        // Resolve, which needs the medium precision rcp approximation to avoid visible tonality changes.
+        float4 RcpL = 1.0 / ((4.0 * Lobe) + 1.0);
         FilterShape = Sample[0];
-        FilterShape += Sample[1] * Weight;
-        FilterShape += Sample[2] * Weight;
-        FilterShape += Sample[3] * Weight;
-        FilterShape += Sample[4] * Weight;
-        FilterShape = saturate(FilterShape * ReciprocalWeight);
-
-        FilterMask = AmplifyRGB;
+        FilterShape += (Lobe * Sample[1]);
+        FilterShape += (Lobe * Sample[2]);
+        FilterShape += (Lobe * Sample[3]);
+        FilterShape += (Lobe * Sample[4]);
+        FilterShape *= RcpL;
     }
 
 #endif
\ No newline at end of file