Hi everyone,
CUDA version: 2.0
Platform: OS X 10.5
The following CUDA kernel fails to launch giving the error:
too many resources requested for launch
I've restructured most of the code and reduced the register count from about 48 to 20, but I still cannot reach the limit of 16 suggested by the occupancy calculator. I'm using a lot of texture fetches, interleaved with the same number of calls to the fminf function (which gives a register count of 23). Surprisingly, using a MIN macro instead reduces the register count further, down to 20. Any hints on how to proceed from here would be highly appreciated. The kernel code follows. The texture fetches read the 26 neighbours of a voxel in a volume.
Thread block size: 8x8x8
grid size: 4096x1
.cubin:
smem = 48
reg = 20
// Smaller of two values. Arguments and the whole expansion are parenthesized so
// the macro composes correctly inside larger expressions (e.g. 2 * MIN(a, b)).
// Note: the selected argument is still evaluated twice, so avoid side effects.
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
// Edge length, in threads, of the cubic thread block used by the kernel.
#define BLOCK_DIM 8
// Edge length, in voxels, of the cubic sub-volume the kernel bounds-checks against.
#define SUBVOL_DIM 128
// File-scope texture reference sampled by d_kernel through tex3D(); values are
// returned as raw floats (cudaReadModeElementType). The host-side binding and the
// filter/addressing modes are not shown here.
texture<float, 3, cudaReadModeElementType> tex; //3D texture
// Fetch one voxel from the file-scope 3D texture and multiply it by `sign`.
static __device__ inline float signedTexel(float sign, float x, float y, float z)
{
    return sign * tex3D(tex, x, y, z);
}

// Same selection rule as the MIN macro, (a < b) ? a : b, but as a function so
// that each operand (in particular a texture fetch) is evaluated exactly once.
static __device__ inline float selectMin(float a, float b)
{
    return (a < b) ? a : b;
}

// Examines the 26 neighbours of each voxel that lies outside the distance band
// and, if a neighbour passes the threshold test, writes a new value for the voxel.
//
// Launch layout: a 1D grid of blocks (blockIdx.x only) that is unflattened into
// a logicalGridSize.width x logicalGridSize.height x depth arrangement, with
// BLOCK_DIM^3 threads per block indexed by threadIdx.{x,y,z}.
//
// Parameters:
//   d_volPPtr       - pitched output volume; voxel (x, y, z) is element x of the
//                     row at byte offset (z * ysize + y) * pitch.
//   logicalGridSize - width/height used to unflatten blockIdx.x.
//   d               - band distance; voxels with |value| <= d - 1 are skipped.
//
// Voxels on the outer one-voxel shell of the SUBVOL_DIM^3 sub-volume are skipped
// so that every neighbour fetch stays inside the sub-volume.
//
// NOTE(review): with 8x8x8 = 512 threads per block the register budget per
// thread is tight on early devices (the accompanying post reports 20 registers
// against a limit of 16, presumably an 8192-register file). If the launch fails
// with "too many resources requested for launch", use a smaller block or cap
// register usage with nvcc's -maxrregcount option.
__global__ void d_kernel(cudaPitchedPtr d_volPPtr, cudaExtent logicalGridSize, float d)
{
    unsigned int vx, vy, vz;

    // Unflatten the 1D block index into 3D block coordinates. The float
    // division + floorf is kept from the original code; it presumably relies on
    // block indices being small enough to be exactly representable in float.
    const unsigned int slice = logicalGridSize.width*logicalGridSize.height;
    vz = (unsigned int)floorf((float)blockIdx.x/(float)slice);
    vy = (unsigned int)floorf((float)(blockIdx.x - __umul24(slice, vz))/(float)logicalGridSize.width);
    vx = blockIdx.x - __umul24(slice, vz) - __umul24(logicalGridSize.width, vy);

    // Coordinates local to the sub-volume.
    vx = __umul24(BLOCK_DIM, vx) + threadIdx.x;
    vy = __umul24(BLOCK_DIM, vy) + threadIdx.y;
    vz = __umul24(BLOCK_DIM, vz) + threadIdx.z;

    // Skip the outer shell: its neighbours would fall outside the sub-volume.
    if (vx < 1 || vx > (SUBVOL_DIM-2) ||
        vy < 1 || vy > (SUBVOL_DIM-2) ||
        vz < 1 || vz > (SUBVOL_DIM-2))
        return;

    const float x = (float)vx;
    const float y = (float)vy;
    const float z = (float)vz;

    const float val = tex3D(tex, x, y, z);
    if (fabsf(val) <= (d - 1.0f)) return; // this voxel is inside the distance band

    // Output voxel is row[vx].
    float *row = (float*)((char*)d_volPPtr.ptr + (vz*d_volPPtr.ysize + vy)*d_volPPtr.pitch);

    const float region_sign = (val > 0.0f) ? 1.0f : -1.0f;

    // NOTE(review): neighbour values are multiplied by region_sign before the
    // comparison, and so is the threshold. For region_sign == -1 this compares
    // sign-flipped values against -d, which looks unintended (plain `d` may have
    // been meant). Behaviour is kept as in the original; confirm the intent.
    const float threshold = region_sign*d;

    float texval;

    // 6 face neighbours, at distance 1 from the central voxel.
    texval = signedTexel(region_sign, x-1.0f, y, z);
    texval = selectMin(texval, signedTexel(region_sign, x+1.0f, y, z));
    texval = selectMin(texval, signedTexel(region_sign, x, y-1.0f, z));
    texval = selectMin(texval, signedTexel(region_sign, x, y+1.0f, z));
    texval = selectMin(texval, signedTexel(region_sign, x, y, z-1.0f));
    texval = selectMin(texval, signedTexel(region_sign, x, y, z+1.0f));
    if (texval < threshold)
    {
        row[vx] = region_sign*d;
        return;
    }

    // The 6-neighbours had Inf value, so read the 12 planar (edge) diagonals.
    texval = signedTexel(region_sign, x-1.0f, y-1.0f, z);
    texval = selectMin(texval, signedTexel(region_sign, x-1.0f, y+1.0f, z));
    texval = selectMin(texval, signedTexel(region_sign, x+1.0f, y-1.0f, z));
    texval = selectMin(texval, signedTexel(region_sign, x+1.0f, y+1.0f, z));
    texval = selectMin(texval, signedTexel(region_sign, x-1.0f, y, z-1.0f));
    texval = selectMin(texval, signedTexel(region_sign, x+1.0f, y, z-1.0f));
    texval = selectMin(texval, signedTexel(region_sign, x, y-1.0f, z-1.0f));
    texval = selectMin(texval, signedTexel(region_sign, x, y+1.0f, z-1.0f));
    texval = selectMin(texval, signedTexel(region_sign, x-1.0f, y, z+1.0f));
    texval = selectMin(texval, signedTexel(region_sign, x+1.0f, y, z+1.0f));
    texval = selectMin(texval, signedTexel(region_sign, x, y-1.0f, z+1.0f));
    texval = selectMin(texval, signedTexel(region_sign, x, y+1.0f, z+1.0f));
    if (texval < threshold)
    {
        row[vx] = region_sign*(d + sqrtf(2.0f));
        return;
    }

    // The 12-neighbours had Inf value, so lastly check the 8 cubic (corner) diagonals.
    texval = signedTexel(region_sign, x-1.0f, y-1.0f, z+1.0f);
    texval = selectMin(texval, signedTexel(region_sign, x+1.0f, y-1.0f, z+1.0f));
    texval = selectMin(texval, signedTexel(region_sign, x-1.0f, y+1.0f, z+1.0f));
    texval = selectMin(texval, signedTexel(region_sign, x+1.0f, y+1.0f, z+1.0f));
    texval = selectMin(texval, signedTexel(region_sign, x-1.0f, y-1.0f, z-1.0f));
    texval = selectMin(texval, signedTexel(region_sign, x+1.0f, y-1.0f, z-1.0f));
    texval = selectMin(texval, signedTexel(region_sign, x-1.0f, y+1.0f, z-1.0f));
    texval = selectMin(texval, signedTexel(region_sign, x+1.0f, y+1.0f, z-1.0f));
    if (texval < threshold)
        row[vx] = region_sign*(d + sqrtf(3.0f));
    // If all the neighbours had Inf value, we reach here without assigning any
    // value to row[vx] and simply return.
}
Many thanks,
Ojaswa