Hello,
I have the following functioning kernel that performs the FDK (Feldkamp, Davis and, Kress) algorithm, but it runs much slower than expected.
Upon profiling the code, I find that none of the loads are coalesced… and neither are any of the stores. External Image
__global__
void kernel_fdk_regs_tex (float *dev_vol, float *dev_img, float *matrix, int2 img_dim, float2 ic, float3 nrm, float sad, float scale, float3 vol_offset, int3 vol_dim, float3 vol_pix_spacing, unsigned int Blocks_Y, float invBlocks_Y)
{
unsigned int blockIdx_z = __float2uint_rd(blockIdx.y * invBlocks_Y);
unsigned int blockIdx_y = blockIdx.y - __umul24(blockIdx_z, Blocks_Y);
unsigned int i = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
unsigned int j = __umul24(blockIdx_y, blockDim.y) + threadIdx.y;
unsigned int k = __umul24(blockIdx_z, blockDim.z) + threadIdx.z;
if( i >= vol_dim.x || j >= vol_dim.y || k >= vol_dim.z )
return;
// Index row major into the volume
long int vol_idx = i + ( j*(vol_dim.x) ) + ( k*(vol_dim.x)*(vol_dim.y) );
float3 vp;
float3 ip;
float s;
float voxel_data;
// offset volume coords
vp.x = vol_offset.x + i * vol_pix_spacing.x; // Compiler should combine into 1 FMAD.
vp.y = vol_offset.y + j * vol_pix_spacing.y; // Compiler should combine into 1 FMAD.
vp.z = vol_offset.z + k * vol_pix_spacing.z; // Compiler should combine into 1 FMAD.
// matrix multiply
ip.x = matrix[0]*vp.x + matrix[1]*vp.y + matrix[2]*vp.z + matrix[3];
ip.y = matrix[4]*vp.x + matrix[5]*vp.y + matrix[6]*vp.z + matrix[7];
ip.z = matrix[8]*vp.x + matrix[9]*vp.y + matrix[10]*vp.z + matrix[11];
// Change coordinate systems
ip.x = ic.x + ip.x / ip.z;
ip.y = ic.y + ip.y / ip.z;
// Get pixel from 2D image
ip.x = __float2int_rd(ip.x);
ip.y = __float2int_rd(ip.y);
voxel_data = tex1Dfetch(tex_img, ip.x*img_dim.x + ip.y);
// Dot product
s = nrm.x*vp.x + nrm.y*vp.y + nrm.z*vp.z;
// Conebeam weighting factor
s = sad - s;
s = (sad * sad) / (s * s);
// Place it into the volume
dev_vol[vol_idx] += scale * s * voxel_data;
}
The profiler output looks like this:
# CUDA_PROFILE_LOG_VERSION 1.3
method,gputime,cputime,occupancy,gld_coherent,gld_incoherent
,gst_coherent,gst_incoherent
method=[ memcopy ] gputime=[ 1400.000 ] cputime=[ 2423.492 ]
method=[ __globfunc__Z19kernel_fdk_regs_texPfS_S_4int26float26float3f
fS2_4int3S2_jf ] gputime=[ 414052.594 ] cputime=[ 414109.031 ] occupancy=[ 0.750 ] gld_coherent=[ 0 ] gld_incoherent=[ 24804208 ] gst_coherent=[ 0 ] gst_incoherent=[ 3816032 ]
(I am certainly concerned as to why the CPU time is nearly double the GPU time. Any insight would be very much appreciated. I am hoping this will go away once reads and writes become coalesced…)
And the kernel is invoked thusly:
.
.
.
cudaMemcpy( dev_img, cbi->img, cbi->dim[0]*cbi->dim[1]*sizeof(float), cudaMemcpyHostToDevice );
cudaBindTexture( 0, tex_img, dev_img, cbi->dim[0]*cbi->dim[1]*sizeof(float) );
cudaMemcpy( dev_matrix, kargs->matrix, sizeof(kargs->matrix), cudaMemcpyHostToDevice );
kernel_fdk_regs_tex<<< dimGrid, dimBlock >>>(dev_vol,
dev_img,
dev_matrix,
kargs->img_dim,
kargs->ic,
kargs->nrm,
kargs->sad,
kargs->scale,
kargs->vol_offset,
kargs->vol_dim,
kargs->vol_pix_spacing,
blocksInY,
1.0f/(float)blocksInY);
checkCUDAError("Kernel Panic!");
.
.
.
I have read through the programmers manual and understand the idea behind coalesced data access, but I am having trouble seeing why my kernel is not accesses data in a coalesced fashion. To me (at least), it would seem that data write to dev_vol would at least be coalesced due to how vol_idx is constructed.
Any insight at all would be greatly appreciated.
Best Regards,
tshack