Skip to content

Commit

Permalink
ssgi optimization and improvement
Browse files Browse the repository at this point in the history
  • Loading branch information
turanszkij committed Apr 7, 2024
1 parent d470ea9 commit 23b2547
Show file tree
Hide file tree
Showing 6 changed files with 151 additions and 41 deletions.
5 changes: 4 additions & 1 deletion WickedEngine/offlineshadercompiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -489,7 +489,10 @@ int main(int argc, char* argv[])

// permutations for ssgiCS:
shaders.push_back({ "ssgiCS", wi::graphics::ShaderStage::CS });
shaders.back().permutations.emplace_back().defines = {"WIDE"};
shaders.back().permutations.emplace_back().defines = { "WIDE" };
// permutations for ssgi_upsampleCS:
shaders.push_back({ "ssgi_upsampleCS", wi::graphics::ShaderStage::CS });
shaders.back().permutations.emplace_back().defines = { "WIDE" };

wi::jobsystem::Initialize();
wi::jobsystem::context ctx;
Expand Down
61 changes: 42 additions & 19 deletions WickedEngine/shaders/ssgiCS.hlsl
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ float3 compute_diffuse(
x += x_incr;
y += y_incr;

const int2 loc = int2(round(x), round(y));
const int2 loc = int2(x, y);
const uint tt = coord_to_cache(loc);

const float dt = float(i) / float(step);
Expand Down Expand Up @@ -104,18 +104,46 @@ void main(uint3 DTid : SV_DispatchThreadID, uint3 Gid : SV_GroupID, uint2 GTid :
GroupMemoryBarrierWithGroupSync();

const int2 tile_upperleft = Gid.xy * THREADCOUNT - TILE_BORDER;
for(uint t = groupIndex; t < TILE_SIZE * TILE_SIZE; t += THREADCOUNT * THREADCOUNT)
for(uint x = GTid.x * 2; x < TILE_SIZE; x += THREADCOUNT * 2)
for(uint y = GTid.y * 2; y < TILE_SIZE; y += THREADCOUNT * 2)
{
const int2 pixel = tile_upperleft + unflatten2D(t, TILE_SIZE);
const float depth = input_depth[uint3(pixel, layer)];
const float2 uv = (pixel + 0.5f) * postprocess.resolution_rcp;
const float3 P = reconstruct_position(uv, depth, GetCamera().inverse_projection);
const float3 color = input_color[uint3(pixel, layer)];
const uint pkcolor = Pack_R11G11B10_FLOAT(color.rgb);
cache_xy[t] = pack_half2(P.xy);
cache_z[t] = P.z;
cache_rgb[t] = pkcolor;
if(pkcolor)
const int2 pixel = tile_upperleft + int2(x, y);
const float3 uvw = float3((pixel + 0.5f) * postprocess.resolution_rcp, layer);
const float4 depths = input_depth.GatherRed(sampler_linear_clamp, uvw);
const float4 reds = input_color.GatherRed(sampler_linear_clamp, uvw);
const float4 greens = input_color.GatherGreen(sampler_linear_clamp, uvw);
const float4 blues = input_color.GatherBlue(sampler_linear_clamp, uvw);
const float2 uv0 = (pixel + 0.5 + int2(0, 0)) * postprocess.resolution_rcp;
const float2 uv1 = (pixel + 0.5 + int2(1, 0)) * postprocess.resolution_rcp;
const float2 uv2 = (pixel + 0.5 + int2(0, 1)) * postprocess.resolution_rcp;
const float2 uv3 = (pixel + 0.5 + int2(1, 1)) * postprocess.resolution_rcp;
const float3 P0 = reconstruct_position(uv0, depths.w, GetCamera().inverse_projection);
const float3 P1 = reconstruct_position(uv1, depths.z, GetCamera().inverse_projection);
const float3 P2 = reconstruct_position(uv2, depths.x, GetCamera().inverse_projection);
const float3 P3 = reconstruct_position(uv3, depths.y, GetCamera().inverse_projection);
const uint C0 = Pack_R11G11B10_FLOAT(float3(reds.w, greens.w, blues.w));
const uint C1 = Pack_R11G11B10_FLOAT(float3(reds.z, greens.z, blues.z));
const uint C2 = Pack_R11G11B10_FLOAT(float3(reds.x, greens.x, blues.x));
const uint C3 = Pack_R11G11B10_FLOAT(float3(reds.y, greens.y, blues.y));

const uint t = coord_to_cache(int2(x, y));
cache_xy[t] = pack_half2(P0.xy);
cache_z[t] = P0.z;
cache_rgb[t] = C0;

cache_xy[t + 1] = pack_half2(P1.xy);
cache_z[t + 1] = P1.z;
cache_rgb[t + 1] = C1;

cache_xy[t + TILE_SIZE] = pack_half2(P2.xy);
cache_z[t + TILE_SIZE] = P2.z;
cache_rgb[t + TILE_SIZE] = C2;

cache_xy[t + TILE_SIZE + 1] = pack_half2(P3.xy);
cache_z[t + TILE_SIZE + 1] = P3.z;
cache_rgb[t + TILE_SIZE + 1] = C3;

if(C0 || C1 || C2 || C3)
InterlockedOr(group_valid, 1u);
}
GroupMemoryBarrierWithGroupSync();
Expand Down Expand Up @@ -143,17 +171,12 @@ void main(uint3 DTid : SV_DispatchThreadID, uint3 Gid : SV_GroupID, uint2 GTid :
const float spread = postprocess.params0.y /*+ dither(DTid.xy)*/;
const float rangespread_rcp2 = postprocess.params0.z;

const int2 pixel_base = Gid.xy * THREADCOUNT + GTid;
for(int x = -range; x <= range; ++x)
{
for(int y = -range; y <= range; ++y)
{
const int2 pixel = pixel_base + int2(x, y);
if(any(pixel < 0) || any(pixel >= postprocess.resolution) || (x == 0 && y == 0))
continue; // to not lose energy when sampling outside of textures, we skip those offsets
const float2 foffset = float2(x, y) * spread;
const int2 offset = round(foffset);
const float weight = saturate(1 - abs(foffset.x) * abs(foffset.y) * rangespread_rcp2);
const int2 offset = int2(x, y) * spread;
const float weight = saturate(1 - abs(offset.x) * abs(offset.y) * rangespread_rcp2);
diffuse += compute_diffuse(P, N, originLoc, originLoc + offset) * weight;
sum += weight;
}
Expand Down
90 changes: 76 additions & 14 deletions WickedEngine/shaders/ssgi_upsampleCS.hlsl
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,75 @@ Texture2D<float2> input_normal_high : register(t4);

RWTexture2D<float4> output : register(u0);

// Tile configuration for the groupshared cache used by this shader.
// The WIDE permutation only changes the border width (6 instead of 2);
// THREADCOUNT is POSTPROCESS_BLOCKSIZE in both permutations.
#ifdef WIDE
static const uint THREADCOUNT = POSTPROCESS_BLOCKSIZE;
static const int TILE_BORDER = 6;
#else
static const uint THREADCOUNT = POSTPROCESS_BLOCKSIZE;
static const int TILE_BORDER = 2;
#endif // WIDE
// Edge length of the cached tile: THREADCOUNT plus a TILE_BORDER on each side.
static const int TILE_SIZE = TILE_BORDER + THREADCOUNT + TILE_BORDER;
// Per-tile caches, one entry per tile texel, indexed via coord_to_cache():
// cache_z   - result of compute_lineardepth() on the gathered low-res depth
// cache_rgb - gathered low-res diffuse color packed with Pack_R11G11B10_FLOAT()
// cache_oct - red/green of the gathered low-res normal texture packed with pack_half2()
//             (read back through unpack_half2() + decode_oct() in the filter loop)
groupshared float cache_z[TILE_SIZE * TILE_SIZE];
groupshared uint cache_rgb[TILE_SIZE * TILE_SIZE];
groupshared uint cache_oct[TILE_SIZE * TILE_SIZE];

// Maps a 2D tile-local coordinate to an index into the groupshared cache arrays.
// Each component of coord is clamped to [0, TILE_SIZE - 1] before flattening, so
// coordinates outside the tile resolve to the nearest edge entry instead of
// indexing out of range.
// NOTE(review): flatten2D is defined elsewhere; presumably row-major with TILE_SIZE
// as the row width (the tile loader writes neighbours at t + 1 and t + TILE_SIZE) - confirm.
inline uint coord_to_cache(int2 coord)
{
return flatten2D(clamp(coord, 0, TILE_SIZE - 1), TILE_SIZE);
}

// Scale applied to the absolute linear-depth difference between a cached sample and
// the current pixel when computing the bilateral depth weight (1 - saturate(diff * depthThreshold)).
static const float depthThreshold = 0.1;
// Assigned to normalPow in the visible code path, where it is the exponent applied to
// saturate(dot(sampleN, N)) for the bilateral normal weight.
// NOTE(review): the alternative #if branch that may override this is not visible here - confirm.
static const float normalThreshold = 64;

[numthreads(POSTPROCESS_BLOCKSIZE, POSTPROCESS_BLOCKSIZE, 1)]
void main(uint2 Gid : SV_GroupID, uint groupIndex : SV_GroupIndex)
{
uint2 GTid = remap_lane_8x8(groupIndex);
const uint2 GTid = remap_lane_8x8(groupIndex);

const int2 tile_upperleft = Gid.xy * THREADCOUNT / 2 - TILE_BORDER;
for(uint x = GTid.x * 2; x < TILE_SIZE; x += THREADCOUNT * 2)
for(uint y = GTid.y * 2; y < TILE_SIZE; y += THREADCOUNT * 2)
{
const int2 pixel = tile_upperleft + int2(x, y);
const float2 uv = (pixel + 0.5f) * postprocess.params1.zw;
const float4 depths = input_depth_low.GatherRed(sampler_linear_clamp, uv);
const float4 reds = input_diffuse_low.GatherRed(sampler_linear_clamp, uv);
const float4 greens = input_diffuse_low.GatherGreen(sampler_linear_clamp, uv);
const float4 blues = input_diffuse_low.GatherBlue(sampler_linear_clamp, uv);
const float4 xxxx = input_normal_low.GatherRed(sampler_linear_clamp, uv);
const float4 yyyy = input_normal_low.GatherGreen(sampler_linear_clamp, uv);
const float Z0 = compute_lineardepth(depths.w);
const float Z1 = compute_lineardepth(depths.z);
const float Z2 = compute_lineardepth(depths.x);
const float Z3 = compute_lineardepth(depths.y);
const uint C0 = Pack_R11G11B10_FLOAT(float3(reds.w, greens.w, blues.w));
const uint C1 = Pack_R11G11B10_FLOAT(float3(reds.z, greens.z, blues.z));
const uint C2 = Pack_R11G11B10_FLOAT(float3(reds.x, greens.x, blues.x));
const uint C3 = Pack_R11G11B10_FLOAT(float3(reds.y, greens.y, blues.y));
const uint OCT0 = pack_half2(float2(xxxx.w, yyyy.w));
const uint OCT1 = pack_half2(float2(xxxx.z, yyyy.z));
const uint OCT2 = pack_half2(float2(xxxx.x, yyyy.x));
const uint OCT3 = pack_half2(float2(xxxx.y, yyyy.y));

const uint t = coord_to_cache(int2(x, y));
cache_z[t] = Z0;
cache_rgb[t] = C0;
cache_oct[t] = OCT0;

cache_z[t + 1] = Z1;
cache_rgb[t + 1] = C1;
cache_oct[t + 1] = OCT1;

cache_z[t + TILE_SIZE] = Z2;
cache_rgb[t + TILE_SIZE] = C2;
cache_oct[t + TILE_SIZE] = OCT2;

cache_z[t + TILE_SIZE + 1] = Z3;
cache_rgb[t + TILE_SIZE + 1] = C3;
cache_oct[t + TILE_SIZE + 1] = OCT3;
}
GroupMemoryBarrierWithGroupSync();

uint2 pixel = Gid * POSTPROCESS_BLOCKSIZE + GTid;
const float2 uv = (pixel + 0.5) * postprocess.resolution_rcp;

Expand All @@ -28,10 +90,10 @@ void main(uint2 Gid : SV_GroupID, uint groupIndex : SV_GroupIndex)

#if 1
const int range = int(postprocess.params0.x);
float spread = postprocess.params0.y;
int spread = int(postprocess.params0.y);
#else
const int range = 2;
float spread = 2;
int spread = 2;
#endif

#if 0
Expand All @@ -45,24 +107,24 @@ void main(uint2 Gid : SV_GroupID, uint groupIndex : SV_GroupIndex)
const float normalPow = normalThreshold;
#endif

const int2 coord_base = GTid.xy / 2 + TILE_BORDER;

float3 result = 0;
float sum = 0;
for(int x = -range; x <= range; ++x)
{
for(int y = -range; y <= range; ++y)
{
const float2 offset = float2(x, y) * spread * postprocess.resolution_rcp;
const float2 sample_uv = uv + offset;

const float3 sampleDiffuse = input_diffuse_low.SampleLevel(sampler_linear_clamp, sample_uv, 0).rgb;

const float sampleDepth = input_depth_low.SampleLevel(sampler_point_clamp, sample_uv, 0);
const float sampleLinearDepth = compute_lineardepth(sampleDepth);
float bilateralDepthWeight = 1 - saturate(abs(sampleLinearDepth - linearDepth) * depthThreshold);
const int2 coord = coord_base + int2(x, y) * spread;
const uint t = coord_to_cache(coord);

const float3 sampleN = decode_oct(input_normal_low.SampleLevel(sampler_linear_clamp, sample_uv, 0));
float normalError = pow(saturate(dot(sampleN, N)), normalPow) + 0.001;
float bilateralNormalWeight = normalError;
const float3 sampleDiffuse = Unpack_R11G11B10_FLOAT(cache_rgb[t]);
const float sampleLinearDepth = cache_z[t];
const float3 sampleN = decode_oct(unpack_half2(cache_oct[t]));

float bilateralDepthWeight = 1 - saturate(abs(sampleLinearDepth - linearDepth) * depthThreshold);

float bilateralNormalWeight = pow(saturate(dot(sampleN, N)), normalPow) + 0.001;

float weight = bilateralDepthWeight * bilateralNormalWeight;

Expand Down
1 change: 1 addition & 0 deletions WickedEngine/wiEnums.h
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,7 @@ namespace wi::enums
CSTYPE_POSTPROCESS_SSGI,
CSTYPE_POSTPROCESS_SSGI_WIDE,
CSTYPE_POSTPROCESS_SSGI_UPSAMPLE,
CSTYPE_POSTPROCESS_SSGI_UPSAMPLE_WIDE,
CSTYPE_POSTPROCESS_RTDIFFUSE,
CSTYPE_POSTPROCESS_RTDIFFUSE_SPATIAL,
CSTYPE_POSTPROCESS_RTDIFFUSE_TEMPORAL,
Expand Down
33 changes: 27 additions & 6 deletions WickedEngine/wiRenderer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1044,6 +1044,7 @@ void LoadShaders()
wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_SSGI], "ssgiCS.cso"); });
wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_SSGI_WIDE], "ssgiCS.cso", wi::graphics::ShaderModel::SM_5_0, { "WIDE" }); });
wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_SSGI_UPSAMPLE], "ssgi_upsampleCS.cso"); });
wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_SSGI_UPSAMPLE_WIDE], "ssgi_upsampleCS.cso", wi::graphics::ShaderModel::SM_5_0, { "WIDE" }); });
wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_RTDIFFUSE_SPATIAL], "rtdiffuse_spatialCS.cso"); });
wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_RTDIFFUSE_TEMPORAL], "rtdiffuse_temporalCS.cso"); });
wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_RTDIFFUSE_UPSAMPLE], "rtdiffuse_upsampleCS.cso"); });
Expand Down Expand Up @@ -12720,7 +12721,8 @@ void Postprocess_SSGI(

{
device->EventBegin("SSGI - upsample", cmd);
device->BindComputeShader(&shaders[CSTYPE_POSTPROCESS_SSGI_UPSAMPLE], cmd);

device->BindComputeShader(&shaders[CSTYPE_POSTPROCESS_SSGI_UPSAMPLE_WIDE], cmd);

// 16x -> 8x
{
Expand All @@ -12736,8 +12738,12 @@ void Postprocess_SSGI(
postprocess.resolution.y = desc.height >> 2;
postprocess.resolution_rcp.x = 1.0f / postprocess.resolution.x;
postprocess.resolution_rcp.y = 1.0f / postprocess.resolution.y;
postprocess.params0.x = 2; // range
postprocess.params0.y = 4; // spread
postprocess.params0.x = 3; // range
postprocess.params0.y = 2; // spread
postprocess.params1.x = float(desc.width >> 3);
postprocess.params1.y = float(desc.height >> 3);
postprocess.params1.z = 1.0f / postprocess.params1.x;
postprocess.params1.w = 1.0f / postprocess.params1.y;
device->PushConstants(&postprocess, sizeof(postprocess), cmd);

device->Dispatch(
Expand Down Expand Up @@ -12770,7 +12776,11 @@ void Postprocess_SSGI(
postprocess.resolution_rcp.x = 1.0f / postprocess.resolution.x;
postprocess.resolution_rcp.y = 1.0f / postprocess.resolution.y;
postprocess.params0.x = 2; // range
postprocess.params0.y = 6; // spread
postprocess.params0.y = 3; // spread
postprocess.params1.x = float(desc.width >> 2);
postprocess.params1.y = float(desc.height >> 2);
postprocess.params1.z = 1.0f / postprocess.params1.x;
postprocess.params1.w = 1.0f / postprocess.params1.y;
device->PushConstants(&postprocess, sizeof(postprocess), cmd);

device->Dispatch(
Expand All @@ -12788,6 +12798,8 @@ void Postprocess_SSGI(
}
}

device->BindComputeShader(&shaders[CSTYPE_POSTPROCESS_SSGI_UPSAMPLE], cmd);

// 4x -> 2x
{
device->BindResource(&res.texture_depth_mips, 0, cmd, 1);
Expand All @@ -12803,7 +12815,11 @@ void Postprocess_SSGI(
postprocess.resolution_rcp.x = 1.0f / postprocess.resolution.x;
postprocess.resolution_rcp.y = 1.0f / postprocess.resolution.y;
postprocess.params0.x = 1; // range
postprocess.params0.y = 4; // spread
postprocess.params0.y = 2; // spread
postprocess.params1.x = float(desc.width >> 1);
postprocess.params1.y = float(desc.height >> 1);
postprocess.params1.z = 1.0f / postprocess.params1.x;
postprocess.params1.w = 1.0f / postprocess.params1.y;
device->PushConstants(&postprocess, sizeof(postprocess), cmd);

device->Dispatch(
Expand Down Expand Up @@ -12836,7 +12852,12 @@ void Postprocess_SSGI(
postprocess.resolution_rcp.x = 1.0f / postprocess.resolution.x;
postprocess.resolution_rcp.y = 1.0f / postprocess.resolution.y;
postprocess.params0.x = 1; // range
postprocess.params0.y = 2; // spread
postprocess.params0.y = 1; // spread
const TextureDesc& desc2 = res.texture_diffuse_mips.desc;
postprocess.params1.x = float(desc2.width);
postprocess.params1.y = float(desc2.height);
postprocess.params1.z = 1.0f / postprocess.params1.x;
postprocess.params1.w = 1.0f / postprocess.params1.y;
device->PushConstants(&postprocess, sizeof(postprocess), cmd);

device->Dispatch(
Expand Down
2 changes: 1 addition & 1 deletion WickedEngine/wiVersion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ namespace wi::version
// minor features, major updates, breaking compatibility changes
const int minor = 71;
// minor bug fixes, alterations, refactors, updates
const int revision = 424;
const int revision = 425;

const std::string version_string = std::to_string(major) + "." + std::to_string(minor) + "." + std::to_string(revision);

Expand Down

0 comments on commit 23b2547

Please sign in to comment.