Suspiciously slow compute shader... Memory write bottleneck?

Hey Guys,

I'm here to borrow your insightful eyes again to help me find what I did wrong...

I have a compute shader that takes 5 input Texture2Ds (all 512x424), does a bunch of computation, and outputs 28 floats for each input pixel position. Here is the shader:

#include "FastICP.inl"
#include "CalibData.inl"
// Input textures, one texel per depth-camera pixel. The trailing comment on
// each line is the resource format the author binds to that slot.
Texture2D<uint> tex_srvKinectDepth : register(t0);    //R16_UINT
Texture2D<uint> tex_srvTSDFDepth : register(t1);      //R16_UINT
Texture2D<float4> tex_srvKinectNormal : register(t2); //R10G10B10A2_UNORM
Texture2D<float4> tex_srvTSDFNormal : register(t3);   //R10G10B10A2_UNORM
Texture2D<float> tex_srvWeight : register(t4);        //R8_UNORM

// Output buffers: 7 x float4 = 28 floats per pixel, all indexed by the same
// linear pixel index (x + y * u2AlignedReso.x, see main).
// Notation used in the per-buffer comments, matching the math in main:
//   C   = cross(p, n)   (p: reprojected Kinect point, n: TSDF normal)
//   N   = n
//   PQN = dot(p - q, n) (q: reprojected TSDF point)
//   Ctr = counter: 1 for a pixel that passed all tests, 0 for a rejected one
// NOTE(review): these look like the unique terms of a symmetric ICP
// normal-equation system, to be summed in a later pass -- confirm with caller.
RWStructuredBuffer<float4> buf_uavData0 : register(u0);//CxCx,CxCy,CxCz,Ctr
RWStructuredBuffer<float4> buf_uavData1 : register(u1);//CxNx,CxNy,CxNz,CyCy
RWStructuredBuffer<float4> buf_uavData2 : register(u2);//CyNx,CyNy,CyNz,CyCz
RWStructuredBuffer<float4> buf_uavData3 : register(u3);//CzNx,CzNy,CzNz,CzCz
RWStructuredBuffer<float4> buf_uavData4 : register(u4);//NxNx,NxNy,NxNz,CxPQN
RWStructuredBuffer<float4> buf_uavData5 : register(u5);//NyNy,NyNz,NzNz,CyPQN
RWStructuredBuffer<float4> buf_uavData6 : register(u6);//NxPQN,NyPQN,NzPQN,CzPQN

// Writes an all-zero float4 into slot uIdx of each of the seven output
// buffers. Used for pixels that are rejected, so they contribute nothing
// (including a zero counter in buf_uavData0.w).
void AllZero(uint uIdx)
{
    const float4 f4Zero = float4(0.f, 0.f, 0.f, 0.f);

    buf_uavData0[uIdx] = f4Zero;
    buf_uavData1[uIdx] = f4Zero;
    buf_uavData2[uIdx] = f4Zero;
    buf_uavData3[uIdx] = f4Zero;
    buf_uavData4[uIdx] = f4Zero;
    buf_uavData5[uIdx] = f4Zero;
    buf_uavData6[uIdx] = f4Zero;
}

// Turns a pixel coordinate plus a depth value into a 3D point:
//   xy = (pixel - DEPTH_C) * depth / DEPTH_F,  z = depth.
// DEPTH_C and DEPTH_F are declared outside this block.
// NOTE(review): presumably the depth camera's principal point and focal
// length from CalibData.inl -- confirm.
float3 ReprojectPt(uint2 u2xy, float fDepth)
{
    const float2 f2FromCenter = float2(u2xy - DEPTH_C);
    // Same evaluation order as before: multiply by depth, then divide by DEPTH_F.
    const float2 f2XY = f2FromCenter * fDepth / DEPTH_F;
    return float3(f2XY, fDepth);
}

// Returns the average of the 2x2 depth texels at DTid.xy, (+0,+1), (+1,+0)
// and (+1,+1), scaled by -0.001 (so the result is negative for non-zero
// input). DTid is passed straight to Texture2D::Load, which uses .xy as the
// texel location and .z as the mip level.
// NOTE(review): the name suggests this footprint matches the one used to
// derive the normal for this pixel; the -0.001 factor presumably converts
// millimetres to metres with a flipped Z axis -- confirm.
float GetNormalMatchedDepth(Texture2D<uint> tex_srvDepth, uint3 DTid)
{
    const uint uDepthSum =
        tex_srvDepth.Load(DTid) +
        tex_srvDepth.Load(DTid, uint2(0, 1)) +
        tex_srvDepth.Load(DTid, uint2(1, 0)) +
        tex_srvDepth.Load(DTid, uint2(1, 1));

    // Scale first, then divide by the sample count (same order as before).
    return uDepthSum * -0.001f / 4.f;
}

// One thread per pixel. For each pixel this either writes zeros to all seven
// output buffers (pixel rejected) or writes the 28 per-pixel product terms
// described on the buffer declarations.
//
// A pixel is rejected when any of the following holds:
//   - its weight is below 0.05
//   - the Kinect normal's w (after the *2-1 remap) is below 0.05
//   - the TSDF normal's w (after the *2-1 remap) is below 0.05
//   - dot(TSDF normal, Kinect normal) is below fNormalDiffThreshold
//
// u2AlignedReso and fNormalDiffThreshold are declared outside this block.
[numthreads(8, 8, 1)]
void main(uint3 DTid : SV_DispatchThreadID)
{
    // Linear output slot for this pixel (row pitch is u2AlignedReso.x).
    const uint uOutIdx = DTid.y * u2AlignedReso.x + DTid.x;

    // Low-confidence pixel
    if (tex_srvWeight.Load(DTid) < 0.05f) {
        AllZero(uOutIdx);
        return;
    }

    // Normals are stored as UNORM; remap [0,1] -> [-1,1]
    const float4 f4NrmKinect = tex_srvKinectNormal.Load(DTid) * 2.f - 1.f;
    // No valid Kinect normal
    if (f4NrmKinect.w < 0.05f) {
        AllZero(uOutIdx);
        return;
    }

    const float4 f4NrmTSDF = tex_srvTSDFNormal.Load(DTid) * 2.f - 1.f;
    // No valid TSDF normal, or the two normals are too different
    const bool bTSDFInvalid = f4NrmTSDF.w < 0.05f;
    const bool bNormalMismatch =
        dot(f4NrmTSDF.xyz, f4NrmKinect.xyz) < fNormalDiffThreshold;
    if (bTSDFInvalid || bNormalMismatch) {
        AllZero(uOutIdx);
        return;
    }

    // p is Kinect point, q is TSDF point, n is TSDF normal
    // c = p x n
    const float3 p = ReprojectPt(
        DTid.xy, GetNormalMatchedDepth(tex_srvKinectDepth, DTid));
    const float3 n = f4NrmTSDF.xyz;
    const float3 c = cross(p, n);

    // CxCx, CxCy, CxCz | counter
    buf_uavData0[uOutIdx] = float4(c.x * c, 1.f);
    // CxNx, CxNy, CxNz | CyCy
    buf_uavData1[uOutIdx] = float4(c.x * n, c.y * c.y);
    // CyNx, CyNy, CyNz | CyCz
    buf_uavData2[uOutIdx] = float4(c.y * n, c.y * c.z);
    // CzNx, CzNy, CzNz | CzCz
    buf_uavData3[uOutIdx] = float4(c.z * n, c.z * c.z);

    const float3 q = ReprojectPt(
        DTid.xy, GetNormalMatchedDepth(tex_srvTSDFDepth, DTid));
    const float fPQN = dot(p - q, n);   // (p - q) . n
    const float3 f3CPQN = c * fPQN;     // cx(p-q)n, cy(p-q)n, cz(p-q)n

    // NxNx, NxNy, NxNz | cx(p-q)n
    buf_uavData4[uOutIdx] = float4(n.x * n, f3CPQN.x);
    // NyNy, NyNz, NzNz | cy(p-q)n
    buf_uavData5[uOutIdx] = float4(n.yyz * n.yzz, f3CPQN.y);
    // nx(p-q)n, ny(p-q)n, nz(p-q)n | cz(p-q)n
    buf_uavData6[uOutIdx] = float4(n * fPQN, f3CPQN.z);
}

I know this is memory intensive and will be somewhat slow, but with a 512x424 input resolution (28 floats per pixel, i.e. about 24 MB of output per dispatch), taking 10 ms on a GTX 680M doesn't seem right. Nvidia Nsight doesn't support the 680M, so I can't get detailed perf data about where the bottleneck is (I assume it must be the memory writes, but I don't think they would cause 10 ms of GPU time... or am I wrong?)

I suspect that changing all the output UAV buffers (currently float4 structured buffers) to 64-bit typed buffers would help, but that means I would lose precision... So I think it's better to discuss it with you guys first before I try the typed version.

I was also wondering whether this pass would be better as a pixel shader, since I don't use LDS at all and a PS could use compressed writes to render targets (correct me if I'm wrong about that...), which may help with the memory writes. However, my output data size may exceed the limit on the number of render targets, so I would end up with multiple passes...

So please let me know if you see me doing something silly in the code, or if you have any suggestions.

As always, big thanks in advance

Latest Images

Trending Articles

Latest Images