Can someone tell me what cutilBankChecker does, what kind of memory it operates on, and what I can do to use it to speed up my algorithm.
I am doing a 3D gaussian filter on a texture volume and I’m not happy at all with my speedup.
I already tried using shared memory to store intermediate results and do all 3 passes on one block execution, but I go over the register limit (too many resources requested for launch) when I do so.
//host
cudaPitchedPtr d_result, d_temp;
const uint w = vol3d->get_width(), h = vol3d->get_height(), d = vol3d->get_depth();
cudaExtent extent = make_cudaExtent(w, h, d);
cutilSafeCall(cudaMallocPitch((void **)&d_result.ptr, &d_result.pitch, w*sizeof(uchar), h*d));
d_result.xsize = d_result.ysize = w;
cutilSafeCall(cudaMallocPitch((void **)&d_temp.ptr, &d_temp.pitch, w*sizeof(uchar), h*d));
d_temp.xsize = d_temp.ysize = w;
//controlls 2D or 3D gaussian
const int max_pass = 3;
//CUDA block and grid dimensions
const dim3 gaus_bs(8, 8, 8);
const dim3 gaus_gs(w / gaus_bs.x, h / gaus_bs.y, 1);
//events used to mark time
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
for(uint pass = 0; pass < max_pass; pass++)
{
for(uint z_off = 0; z_off < d; z_off += gaus_bs.z)
{
//call gaussian kernel (streams?)
d_gaussian3d<<<gaus_gs, gaus_bs>>>(d_result, d_temp, d, z_off, pass);
cutilCheckMsg("kernel failed");
}
}
//device
__constant__ __device__ uint gauss_fact[2] = { 1, 1 },
gauss_sum = 2,
gauss_width = 2;
//based on algorithm from
//http://www.gamedev.net/reference/programming/features/imageproc/page2.asp
__global__ void d_gaussian3d(cudaPitchedPtr d_output, cudaPitchedPtr d_temp, uint depth, uint z_off, uint pass)
{
//find 3D thread's position within cuda grid
uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;
uint z = __umul24(blockIdx.z, blockDim.z) + threadIdx.z + z_off;
uchar *tp = (uchar *)d_temp.ptr;
size_t pitch = d_temp.pitch;
size_t slice_pitch = pitch * image_dim;
uchar *op = (uchar *)d_output.ptr;
size_t o_pitch = d_output.pitch;
size_t o_slice_pitch = o_pitch * image_dim;
//don't compute off the edge of the image volume space
if(x < image_dim && y < image_dim && z < depth)
{
//FIRST PASS//////////////////////////////////////////////
float s, t, r;
int si, ti, ri, sum = 0;
uint voxel;
if(pass == 0)
{
for(int k = 0; k < gauss_width; k++)
{
//non-normalized coords
//clamping takes care of edge conditions
s = x-((gauss_width-1)>>1)+k;
t = y;
r = z;
//read voxel from 3D texture
voxel = tex3D(tex, s, t, r)*255;
sum += __umul24(voxel, gauss_fact[k]);
}
//store normalized sum result back into memory on intermediate buffer
uchar *temp_slice = (tp + __umul24(z, slice_pitch));
uchar *temp_row = (temp_slice + __umul24(y, pitch));
temp_row[x] = sum / gauss_sum;
}
//SECOND PASS/////////////////////////////////////////////
else if(pass == 1)
{
sum = 0;
for(int k = 0; k < gauss_width; k++)
{
si = x;
ti = y-((gauss_width-1)>>1)+k;
ri = z;
voxel = 0;
if(ti >= 0 && ti < image_dim)
{
//read data from intermediate buffer
uchar *temp_slice = (tp + __umul24(ri, slice_pitch));
uchar *temp_row = (temp_slice + __umul24(ti, pitch));
voxel = temp_row[si];
}
sum += __umul24(voxel, gauss_fact[k]);
}
//store normalized sum result into output buffer
uchar *o_slice = (op + __umul24(z, o_slice_pitch));
uchar *output_row = (o_slice + __umul24(y, o_pitch));
output_row[x] = sum / gauss_sum;
}
//THIRD PASS//////////////////////////////////////////////
else if(pass == 2)
{
sum = 0;
for(int k = 0; k < gauss_width; k++)
{
si = x;
ti = y;
ri = z-((gauss_width-1)>>1)+k;
voxel = 0;
if(ri >= 0 && ri < depth)
{
//read data from output buffer
uchar *o_slice = (op + __umul24(ri, o_slice_pitch));
uchar *output_row = (o_slice + __umul24(ti, o_pitch));
voxel = output_row[si];
}
sum += __umul24(voxel, gauss_fact[k]);
}
//store normalized sum result into buffer
uchar *temp_slice = (tp + __umul24(z, slice_pitch));
uchar *temp_row = (temp_slice + __umul24(y, pitch));
temp_row[x] = sum / gauss_sum;
}
}
}