Hey Guys,
I'm here to borrow your insightful eyes again to help me find what I did wrong...
I have a compute shader that takes 5 input Texture2Ds (all 512x424), does a bunch of computation, and outputs 28 floats for each input pixel position. Here is the shader:
#include "FastICP.inl"
#include "CalibData.inl"

// ---------------------------------------------------------------------------
// Per-pixel term generation for a 6-DoF point-to-plane style linear system.
//
// For every pixel this pass writes 7 float4 values (28 floats): the products
// of c = p x n and n with each other and with the residual dot(p - q, n),
// plus a counter that is 1 for contributing pixels and 0 for rejected ones.
//
// NOTE(review): DEPTH_C, DEPTH_F, u2AlignedReso and fNormalDiffThreshold are
// not declared here; presumably they come from the two includes above.
// ---------------------------------------------------------------------------

// Input textures.
Texture2D<uint>   tex_srvKinectDepth  : register(t0); // R16_UINT
Texture2D<uint>   tex_srvTSDFDepth    : register(t1); // R16_UINT
Texture2D<float4> tex_srvKinectNormal : register(t2); // R10G10B10A2_UNORM
Texture2D<float4> tex_srvTSDFNormal   : register(t3); // R10G10B10A2_UNORM
Texture2D<float>  tex_srvWeight       : register(t4); // R8_UNORM

// Output buffers, one float4 per pixel each (PQN = dot(p - q, n)).
RWStructuredBuffer<float4> buf_uavData0 : register(u0); // CxCx,  CxCy,  CxCz,  Ctr
RWStructuredBuffer<float4> buf_uavData1 : register(u1); // CxNx,  CxNy,  CxNz,  CyCy
RWStructuredBuffer<float4> buf_uavData2 : register(u2); // CyNx,  CyNy,  CyNz,  CyCz
RWStructuredBuffer<float4> buf_uavData3 : register(u3); // CzNx,  CzNy,  CzNz,  CzCz
RWStructuredBuffer<float4> buf_uavData4 : register(u4); // NxNx,  NxNy,  NxNz,  CxPQN
RWStructuredBuffer<float4> buf_uavData5 : register(u5); // NyNy,  NyNz,  NzNz,  CyPQN
RWStructuredBuffer<float4> buf_uavData6 : register(u6); // NxPQN, NyPQN, NzPQN, CzPQN

// Writes zero to element uIdx of all seven output buffers. Used for rejected
// pixels so that they add nothing (including to the counter in buf_uavData0.w).
void AllZero(uint uIdx)
{
    const float4 f4Zero = float4(0.f, 0.f, 0.f, 0.f);
    buf_uavData0[uIdx] = f4Zero;
    buf_uavData1[uIdx] = f4Zero;
    buf_uavData2[uIdx] = f4Zero;
    buf_uavData3[uIdx] = f4Zero;
    buf_uavData4[uIdx] = f4Zero;
    buf_uavData5[uIdx] = f4Zero;
    buf_uavData6[uIdx] = f4Zero;
}

// Back-projects pixel u2xy at depth fDepth: xy = (u2xy - DEPTH_C) * fDepth /
// DEPTH_F, z = fDepth.
// NOTE(review): DEPTH_C / DEPTH_F look like the depth camera's principal point
// and focal length - confirm against CalibData.inl.
float3 ReprojectPt(uint2 u2xy, float fDepth)
{
    const float2 f2FromCenter = float2(u2xy - DEPTH_C);
    const float2 f2PlaneXY = f2FromCenter * fDepth / DEPTH_F;
    return float3(f2PlaneXY, fDepth);
}

// Returns the mean of the 2x2 texel block whose top-left corner is DTid.xy,
// multiplied by -0.001.
// NOTE(review): the scale presumably converts millimetres to metres and flips
// the sign of the depth axis; the 2x2 footprint presumably mirrors how the
// normal maps were generated (hence the name) - confirm with the producer pass.
float GetNormalMatchedDepth(Texture2D<uint> tex_srvDepth, uint3 DTid)
{
    const uint uBlockSum = tex_srvDepth.Load(DTid)
                         + tex_srvDepth.Load(DTid, uint2(0, 1))
                         + tex_srvDepth.Load(DTid, uint2(1, 0))
                         + tex_srvDepth.Load(DTid, uint2(1, 1));
    return uBlockSum * -0.001f / 4.f;
}

// One thread per pixel. A pixel is rejected (all outputs zeroed) when its
// weight is too low, when either normal is flagged invalid through its w
// channel, or when the two normals disagree by more than fNormalDiffThreshold.
[numthreads(8, 8, 1)]
void main(uint3 DTid : SV_DispatchThreadID)
{
    // Shared cut-off for the weight and for the (remapped) normal w channel.
    const float fMinValid = 0.05f;

    // Linear output slot for this pixel; the row pitch is u2AlignedReso.x.
    const uint uIdx = DTid.y * u2AlignedReso.x + DTid.x;

    if (tex_srvWeight.Load(DTid) < fMinValid)
    {
        AllZero(uIdx);
        return;
    }

    // Normals are stored as UNORM; remap every channel from [0, 1] to [-1, 1].
    const float4 f4KinectNormal = tex_srvKinectNormal.Load(DTid) * 2.f - 1.f;
    if (f4KinectNormal.w < fMinValid) // no valid Kinect normal
    {
        AllZero(uIdx);
        return;
    }

    const float4 f4TSDFNormal = tex_srvTSDFNormal.Load(DTid) * 2.f - 1.f;
    if (f4TSDFNormal.w < fMinValid) // no valid TSDF normal
    {
        AllZero(uIdx);
        return;
    }

    // Normals are too different.
    if (dot(f4TSDFNormal.xyz, f4KinectNormal.xyz) < fNormalDiffThreshold)
    {
        AllZero(uIdx);
        return;
    }

    // p is the Kinect point, q is the TSDF point, n is the TSDF normal.
    const float3 p = ReprojectPt(DTid.xy, GetNormalMatchedDepth(tex_srvKinectDepth, DTid));
    const float3 q = ReprojectPt(DTid.xy, GetNormalMatchedDepth(tex_srvTSDFDepth, DTid));
    const float3 n = f4TSDFNormal.xyz;

    const float3 c = cross(p, n);      // c = p x n
    const float pqn = dot(p - q, n);   // (p - q) . n

    // Emit the 28 terms; the packing follows the layout notes on the buffers.
    buf_uavData0[uIdx] = float4(c.x * c, 1.f);                // last element is the counter
    buf_uavData1[uIdx] = float4(c.x * n, c.y * c.y);
    buf_uavData2[uIdx] = float4(c.y * n, c.y * c.z);
    buf_uavData3[uIdx] = float4(c.z * n, c.z * c.z);
    buf_uavData4[uIdx] = float4(n.x * n, c.x * pqn);
    buf_uavData5[uIdx] = float4(n.yyz * n.yzz, c.y * pqn);
    buf_uavData6[uIdx] = float4(n * pqn, c.z * pqn);
}
I know this is memory intensive and will be a little slow, but with a 512x424 input resolution, taking 10ms on a GTX 680M doesn't seem right. Nvidia Nsight doesn't support the 680M, so I can't get detailed perf data showing where the bottleneck is (I know it must be the memory writes, but I don't think they should cost 10ms of GPU time... or am I wrong?)
I can see that changing all the output UAV raw buffers to 64-bit typed buffers should help, but that means I lose precision... So I think it's better to discuss it with you guys first before I try the typed version.
I was also wondering whether this pass would be better as a pixel shader, since I don't use LDS at all and a PS could use compressed writes to RTs (correct me if I'm wrong about that...), which may help with the memory writes. However, my output data size may exceed the limit on the number of RTs, so I would end up with multiple passes...
So please let me know if you see me doing something silly in the code, or if you have any suggestions.
As always, big thanks in advance