diff --git a/WickedEngine/offlineshadercompiler.cpp b/WickedEngine/offlineshadercompiler.cpp index a61d44cd80..8e8abcd098 100644 --- a/WickedEngine/offlineshadercompiler.cpp +++ b/WickedEngine/offlineshadercompiler.cpp @@ -489,7 +489,10 @@ int main(int argc, char* argv[]) // permutations for ssgiCS: shaders.push_back({ "ssgiCS", wi::graphics::ShaderStage::CS }); - shaders.back().permutations.emplace_back().defines = {"WIDE"}; + shaders.back().permutations.emplace_back().defines = { "WIDE" }; + // permutations for ssgi_upsampleCS: + shaders.push_back({ "ssgi_upsampleCS", wi::graphics::ShaderStage::CS }); + shaders.back().permutations.emplace_back().defines = { "WIDE" }; wi::jobsystem::Initialize(); wi::jobsystem::context ctx; diff --git a/WickedEngine/shaders/ssgiCS.hlsl b/WickedEngine/shaders/ssgiCS.hlsl index fa0e492555..d4aaa398c1 100644 --- a/WickedEngine/shaders/ssgiCS.hlsl +++ b/WickedEngine/shaders/ssgiCS.hlsl @@ -73,7 +73,7 @@ float3 compute_diffuse( x += x_incr; y += y_incr; - const int2 loc = int2(round(x), round(y)); + const int2 loc = int2(x, y); const uint tt = coord_to_cache(loc); const float dt = float(i) / float(step); @@ -104,18 +104,46 @@ void main(uint3 DTid : SV_DispatchThreadID, uint3 Gid : SV_GroupID, uint2 GTid : GroupMemoryBarrierWithGroupSync(); const int2 tile_upperleft = Gid.xy * THREADCOUNT - TILE_BORDER; - for(uint t = groupIndex; t < TILE_SIZE * TILE_SIZE; t += THREADCOUNT * THREADCOUNT) + for(uint x = GTid.x * 2; x < TILE_SIZE; x += THREADCOUNT * 2) + for(uint y = GTid.y * 2; y < TILE_SIZE; y += THREADCOUNT * 2) { - const int2 pixel = tile_upperleft + unflatten2D(t, TILE_SIZE); - const float depth = input_depth[uint3(pixel, layer)]; - const float2 uv = (pixel + 0.5f) * postprocess.resolution_rcp; - const float3 P = reconstruct_position(uv, depth, GetCamera().inverse_projection); - const float3 color = input_color[uint3(pixel, layer)]; - const uint pkcolor = Pack_R11G11B10_FLOAT(color.rgb); - cache_xy[t] = pack_half2(P.xy); - cache_z[t] = P.z; - cache_rgb[t] = pkcolor; - if(pkcolor) + const int2 pixel = tile_upperleft + int2(x, y); + const float3 uvw = float3((pixel + 0.5f) * postprocess.resolution_rcp, layer); + const float4 depths = input_depth.GatherRed(sampler_linear_clamp, uvw); + const float4 reds = input_color.GatherRed(sampler_linear_clamp, uvw); + const float4 greens = input_color.GatherGreen(sampler_linear_clamp, uvw); + const float4 blues = input_color.GatherBlue(sampler_linear_clamp, uvw); + const float2 uv0 = (pixel + 0.5 + int2(0, 0)) * postprocess.resolution_rcp; + const float2 uv1 = (pixel + 0.5 + int2(1, 0)) * postprocess.resolution_rcp; + const float2 uv2 = (pixel + 0.5 + int2(0, 1)) * postprocess.resolution_rcp; + const float2 uv3 = (pixel + 0.5 + int2(1, 1)) * postprocess.resolution_rcp; + const float3 P0 = reconstruct_position(uv0, depths.w, GetCamera().inverse_projection); + const float3 P1 = reconstruct_position(uv1, depths.z, GetCamera().inverse_projection); + const float3 P2 = reconstruct_position(uv2, depths.x, GetCamera().inverse_projection); + const float3 P3 = reconstruct_position(uv3, depths.y, GetCamera().inverse_projection); + const uint C0 = Pack_R11G11B10_FLOAT(float3(reds.w, greens.w, blues.w)); + const uint C1 = Pack_R11G11B10_FLOAT(float3(reds.z, greens.z, blues.z)); + const uint C2 = Pack_R11G11B10_FLOAT(float3(reds.x, greens.x, blues.x)); + const uint C3 = Pack_R11G11B10_FLOAT(float3(reds.y, greens.y, blues.y)); + + const uint t = coord_to_cache(int2(x, y)); + cache_xy[t] = pack_half2(P0.xy); + cache_z[t] = P0.z; + cache_rgb[t] = C0; + + cache_xy[t + 1] = pack_half2(P1.xy); + cache_z[t + 1] = P1.z; + cache_rgb[t + 1] = C1; + + cache_xy[t + TILE_SIZE] = pack_half2(P2.xy); + cache_z[t + TILE_SIZE] = P2.z; + cache_rgb[t + TILE_SIZE] = C2; + + cache_xy[t + TILE_SIZE + 1] = pack_half2(P3.xy); + cache_z[t + TILE_SIZE + 1] = P3.z; + cache_rgb[t + TILE_SIZE + 1] = C3; + + if(C0 || C1 || C2 || C3) InterlockedOr(group_valid, 1u); } GroupMemoryBarrierWithGroupSync(); @@ -143,17 +171,12 @@ void main(uint3 DTid : SV_DispatchThreadID, uint3 Gid : SV_GroupID, uint2 GTid : const float spread = postprocess.params0.y /*+ dither(DTid.xy)*/; const float rangespread_rcp2 = postprocess.params0.z; - const int2 pixel_base = Gid.xy * THREADCOUNT + GTid; for(int x = -range; x <= range; ++x) { for(int y = -range; y <= range; ++y) { - const int2 pixel = pixel_base + int2(x, y); - if(any(pixel < 0) || any(pixel >= postprocess.resolution) || (x == 0 && y == 0)) - continue; // to not lose energy when sampling outside of textures, we skip those offsets - const float2 foffset = float2(x, y) * spread; - const int2 offset = round(foffset); - const float weight = saturate(1 - abs(foffset.x) * abs(foffset.y) * rangespread_rcp2); + const int2 offset = int2(x, y) * spread; + const float weight = saturate(1 - abs(offset.x) * abs(offset.y) * rangespread_rcp2); diffuse += compute_diffuse(P, N, originLoc, originLoc + offset) * weight; sum += weight; } diff --git a/WickedEngine/shaders/ssgi_upsampleCS.hlsl b/WickedEngine/shaders/ssgi_upsampleCS.hlsl index 104f4d35b7..bfb551cc76 100644 --- a/WickedEngine/shaders/ssgi_upsampleCS.hlsl +++ b/WickedEngine/shaders/ssgi_upsampleCS.hlsl @@ -12,13 +12,75 @@ Texture2D input_normal_high : register(t4); RWTexture2D output : register(u0); +#ifdef WIDE +static const uint THREADCOUNT = POSTPROCESS_BLOCKSIZE; +static const int TILE_BORDER = 6; +#else +static const uint THREADCOUNT = POSTPROCESS_BLOCKSIZE; +static const int TILE_BORDER = 2; +#endif // WIDE +static const int TILE_SIZE = TILE_BORDER + THREADCOUNT + TILE_BORDER; +groupshared float cache_z[TILE_SIZE * TILE_SIZE]; +groupshared uint cache_rgb[TILE_SIZE * TILE_SIZE]; +groupshared uint cache_oct[TILE_SIZE * TILE_SIZE]; + +inline uint coord_to_cache(int2 coord) +{ + return flatten2D(clamp(coord, 0, TILE_SIZE - 1), TILE_SIZE); +} + static const float depthThreshold = 0.1; static const float normalThreshold = 64; [numthreads(POSTPROCESS_BLOCKSIZE, POSTPROCESS_BLOCKSIZE, 1)] void main(uint2 Gid : SV_GroupID, uint groupIndex : SV_GroupIndex) { - uint2 GTid = remap_lane_8x8(groupIndex); + const uint2 GTid = remap_lane_8x8(groupIndex); + + const int2 tile_upperleft = Gid.xy * THREADCOUNT / 2 - TILE_BORDER; + for(uint x = GTid.x * 2; x < TILE_SIZE; x += THREADCOUNT * 2) + for(uint y = GTid.y * 2; y < TILE_SIZE; y += THREADCOUNT * 2) + { + const int2 pixel = tile_upperleft + int2(x, y); + const float2 uv = (pixel + 0.5f) * postprocess.params1.zw; + const float4 depths = input_depth_low.GatherRed(sampler_linear_clamp, uv); + const float4 reds = input_diffuse_low.GatherRed(sampler_linear_clamp, uv); + const float4 greens = input_diffuse_low.GatherGreen(sampler_linear_clamp, uv); + const float4 blues = input_diffuse_low.GatherBlue(sampler_linear_clamp, uv); + const float4 xxxx = input_normal_low.GatherRed(sampler_linear_clamp, uv); + const float4 yyyy = input_normal_low.GatherGreen(sampler_linear_clamp, uv); + const float Z0 = compute_lineardepth(depths.w); + const float Z1 = compute_lineardepth(depths.z); + const float Z2 = compute_lineardepth(depths.x); + const float Z3 = compute_lineardepth(depths.y); + const uint C0 = Pack_R11G11B10_FLOAT(float3(reds.w, greens.w, blues.w)); + const uint C1 = Pack_R11G11B10_FLOAT(float3(reds.z, greens.z, blues.z)); + const uint C2 = Pack_R11G11B10_FLOAT(float3(reds.x, greens.x, blues.x)); + const uint C3 = Pack_R11G11B10_FLOAT(float3(reds.y, greens.y, blues.y)); + const uint OCT0 = pack_half2(float2(xxxx.w, yyyy.w)); + const uint OCT1 = pack_half2(float2(xxxx.z, yyyy.z)); + const uint OCT2 = pack_half2(float2(xxxx.x, yyyy.x)); + const uint OCT3 = pack_half2(float2(xxxx.y, yyyy.y)); + + const uint t = coord_to_cache(int2(x, y)); + cache_z[t] = Z0; + cache_rgb[t] = C0; + cache_oct[t] = OCT0; + + cache_z[t + 1] = Z1; + cache_rgb[t + 1] = C1; + cache_oct[t + 1] = OCT1; + + cache_z[t + TILE_SIZE] = Z2; + cache_rgb[t + TILE_SIZE] = C2; + cache_oct[t + TILE_SIZE] = OCT2; + + cache_z[t + TILE_SIZE + 1] = Z3; + cache_rgb[t + TILE_SIZE + 1] = C3; + cache_oct[t + TILE_SIZE + 1] = OCT3; + } + GroupMemoryBarrierWithGroupSync(); + uint2 pixel = Gid * POSTPROCESS_BLOCKSIZE + GTid; const float2 uv = (pixel + 0.5) * postprocess.resolution_rcp; @@ -28,10 +90,10 @@ void main(uint2 Gid : SV_GroupID, uint groupIndex : SV_GroupIndex) #if 1 const int range = int(postprocess.params0.x); - float spread = postprocess.params0.y; + int spread = int(postprocess.params0.y); #else const int range = 2; - float spread = 2; + int spread = 2; #endif #if 0 @@ -45,24 +107,24 @@ void main(uint2 Gid : SV_GroupID, uint groupIndex : SV_GroupIndex) const float normalPow = normalThreshold; #endif + const int2 coord_base = GTid.xy / 2 + TILE_BORDER; + float3 result = 0; float sum = 0; for(int x = -range; x <= range; ++x) { for(int y = -range; y <= range; ++y) { - const float2 offset = float2(x, y) * spread * postprocess.resolution_rcp; - const float2 sample_uv = uv + offset; - - const float3 sampleDiffuse = input_diffuse_low.SampleLevel(sampler_linear_clamp, sample_uv, 0).rgb; - - const float sampleDepth = input_depth_low.SampleLevel(sampler_point_clamp, sample_uv, 0); - const float sampleLinearDepth = compute_lineardepth(sampleDepth); - float bilateralDepthWeight = 1 - saturate(abs(sampleLinearDepth - linearDepth) * depthThreshold); + const int2 coord = coord_base + int2(x, y) * spread; + const uint t = coord_to_cache(coord); - const float3 sampleN = decode_oct(input_normal_low.SampleLevel(sampler_linear_clamp, sample_uv, 0)); - float normalError = pow(saturate(dot(sampleN, N)), normalPow) + 0.001; - float bilateralNormalWeight = normalError; + const float3 sampleDiffuse = Unpack_R11G11B10_FLOAT(cache_rgb[t]); + const float sampleLinearDepth = cache_z[t]; + const float3 sampleN = decode_oct(unpack_half2(cache_oct[t])); + + float bilateralDepthWeight = 1 - saturate(abs(sampleLinearDepth - linearDepth) * depthThreshold); + + float bilateralNormalWeight = pow(saturate(dot(sampleN, N)), normalPow) + 0.001; float weight = bilateralDepthWeight * bilateralNormalWeight; diff --git a/WickedEngine/wiEnums.h b/WickedEngine/wiEnums.h index 6e74bd700d..e88eb65c4c 100644 --- a/WickedEngine/wiEnums.h +++ b/WickedEngine/wiEnums.h @@ -300,6 +300,7 @@ namespace wi::enums CSTYPE_POSTPROCESS_SSGI, CSTYPE_POSTPROCESS_SSGI_WIDE, CSTYPE_POSTPROCESS_SSGI_UPSAMPLE, + CSTYPE_POSTPROCESS_SSGI_UPSAMPLE_WIDE, CSTYPE_POSTPROCESS_RTDIFFUSE, CSTYPE_POSTPROCESS_RTDIFFUSE_SPATIAL, CSTYPE_POSTPROCESS_RTDIFFUSE_TEMPORAL, diff --git a/WickedEngine/wiRenderer.cpp b/WickedEngine/wiRenderer.cpp index eec4a6bbbe..ab9883b302 100644 --- a/WickedEngine/wiRenderer.cpp +++ b/WickedEngine/wiRenderer.cpp @@ -1044,6 +1044,7 @@ void LoadShaders() wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_SSGI], "ssgiCS.cso"); }); wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_SSGI_WIDE], "ssgiCS.cso", wi::graphics::ShaderModel::SM_5_0, { "WIDE" }); }); wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_SSGI_UPSAMPLE], "ssgi_upsampleCS.cso"); }); + wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_SSGI_UPSAMPLE_WIDE], "ssgi_upsampleCS.cso", wi::graphics::ShaderModel::SM_5_0, { "WIDE" }); }); wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_RTDIFFUSE_SPATIAL], "rtdiffuse_spatialCS.cso"); }); wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_RTDIFFUSE_TEMPORAL], "rtdiffuse_temporalCS.cso"); }); wi::jobsystem::Execute(ctx, [](wi::jobsystem::JobArgs args) { LoadShader(ShaderStage::CS, shaders[CSTYPE_POSTPROCESS_RTDIFFUSE_UPSAMPLE], "rtdiffuse_upsampleCS.cso"); }); @@ -12720,7 +12721,8 @@ void Postprocess_SSGI( { device->EventBegin("SSGI - upsample", cmd); - device->BindComputeShader(&shaders[CSTYPE_POSTPROCESS_SSGI_UPSAMPLE], cmd); + + device->BindComputeShader(&shaders[CSTYPE_POSTPROCESS_SSGI_UPSAMPLE_WIDE], cmd); // 16x -> 8x { @@ -12736,8 +12738,12 @@ void Postprocess_SSGI( postprocess.resolution.y = desc.height >> 2; postprocess.resolution_rcp.x = 1.0f / postprocess.resolution.x; postprocess.resolution_rcp.y = 1.0f / postprocess.resolution.y; - postprocess.params0.x = 2; // range - postprocess.params0.y = 4; // spread + postprocess.params0.x = 3; // range + postprocess.params0.y = 2; // spread + postprocess.params1.x = float(desc.width >> 3); + postprocess.params1.y = float(desc.height >> 3); + postprocess.params1.z = 1.0f / postprocess.params1.x; + postprocess.params1.w = 1.0f / postprocess.params1.y; device->PushConstants(&postprocess, sizeof(postprocess), cmd); device->Dispatch( @@ -12770,7 +12776,11 @@ void Postprocess_SSGI( postprocess.resolution_rcp.x = 1.0f / postprocess.resolution.x; postprocess.resolution_rcp.y = 1.0f / postprocess.resolution.y; postprocess.params0.x = 2; // range - postprocess.params0.y = 6; // spread + postprocess.params0.y = 3; // spread + postprocess.params1.x = float(desc.width >> 2); + postprocess.params1.y = float(desc.height >> 2); + postprocess.params1.z = 1.0f / postprocess.params1.x; + postprocess.params1.w = 1.0f / postprocess.params1.y; device->PushConstants(&postprocess, sizeof(postprocess), cmd); device->Dispatch( @@ -12788,6 +12798,8 @@ void Postprocess_SSGI( } } + device->BindComputeShader(&shaders[CSTYPE_POSTPROCESS_SSGI_UPSAMPLE], cmd); + // 4x -> 2x { device->BindResource(&res.texture_depth_mips, 0, cmd, 1); @@ -12803,7 +12815,11 @@ void Postprocess_SSGI( postprocess.resolution_rcp.x = 1.0f / postprocess.resolution.x; postprocess.resolution_rcp.y = 1.0f / postprocess.resolution.y; postprocess.params0.x = 1; // range - postprocess.params0.y = 4; // spread + postprocess.params0.y = 2; // spread + postprocess.params1.x = float(desc.width >> 1); + postprocess.params1.y = float(desc.height >> 1); + postprocess.params1.z = 1.0f / postprocess.params1.x; + postprocess.params1.w = 1.0f / postprocess.params1.y; device->PushConstants(&postprocess, sizeof(postprocess), cmd); device->Dispatch( @@ -12836,7 +12852,12 @@ void Postprocess_SSGI( postprocess.resolution_rcp.x = 1.0f / postprocess.resolution.x; postprocess.resolution_rcp.y = 1.0f / postprocess.resolution.y; postprocess.params0.x = 1; // range - postprocess.params0.y = 2; // spread + postprocess.params0.y = 1; // spread + const TextureDesc& desc2 = res.texture_diffuse_mips.desc; + postprocess.params1.x = float(desc2.width); + postprocess.params1.y = float(desc2.height); + postprocess.params1.z = 1.0f / postprocess.params1.x; + postprocess.params1.w = 1.0f / postprocess.params1.y; device->PushConstants(&postprocess, sizeof(postprocess), cmd); device->Dispatch( diff --git a/WickedEngine/wiVersion.cpp b/WickedEngine/wiVersion.cpp index 5ff08a105d..8777a107ea 100644 --- a/WickedEngine/wiVersion.cpp +++ b/WickedEngine/wiVersion.cpp @@ -9,7 +9,7 @@ namespace wi::version // minor features, major updates, breaking compatibility changes const int minor = 71; // minor bug fixes, alterations, refactors, updates - const int revision = 424; + const int revision = 425; const std::string version_string = std::to_string(major) + "." + std::to_string(minor) + "." + std::to_string(revision);