I am getting hung up on a simple problem. With depth = 3, the code runs, but for higher depths I get a runtime error saying “Cuda error: Kernel execution failed in file ‘cppIntegration.cu’ in line 120 : invalid configuration argument” after the first kernel call.
__global__ void count_zero_kernel(int* count, int res, int cbs)
{
const int x = blockIdx.x*cbs + threadIdx.x;
const int y = blockIdx.y*cbs + threadIdx.y;
const int z = blockIdx.z*cbs + threadIdx.z;
const unsigned int i = ((z*res) + y)*res + x;
count[i] = 0;
}
__global__ void count_kernel(float3* pos, int* count, int res, int num)
{
const unsigned int i = blockIdx.x*pts_block_size + threadIdx.x;
if (i >= num)
return;
int x = pos[i].x * res;
int y = pos[i].y * res;
int z = pos[i].z * res;
atomicAdd(count + ((z*res + y)*res + x), 1);//count[(z*res + y)*res + x]++;
}
extern "C" void count_pts(int depth_)
{
depth = depth_;
int res = 1 << depth;
int count_num = res*res*res;
// alloc and zero on device
cudaMalloc((void**)&d_count, count_num*sizeof(int));
int cbs = min(8, res);
dim3 cz_blocks(res / cbs, res / cbs, res / cbs);
dim3 cz_threads(cbs, cbs, cbs);
count_zero_kernel<<<cz_blocks, cz_threads>>>(count, res, cbs);
CUT_CHECK_ERROR("Kernel execution failed");
// count the points
count_kernel<<<pts_grid_size, pts_block_size>>>(d_pts_pos, d_count, res, pts.num);
CUT_CHECK_ERROR("Kernel execution failed");
}
Sorry, pts.num depends on the input data, and can be on the order of 10k to 100k or more. Elsewhere those are declared as:
#define pts_block_size 256
int pts_grid_size = ceil(pts.num / (float)pts_block_size);
I should have trimmed down the code a bit more to isolate the problem, because the first kernel is failing, before the second kernel is reached. Trimmed down, the code that is failing looks like:
__global__ void count_zero_kernel(int* count, int res, int cbs)
{
const int x = blockIdx.x*cbs + threadIdx.x;
const int y = blockIdx.y*cbs + threadIdx.y;
const int z = blockIdx.z*cbs + threadIdx.z;
const unsigned int i = ((z*res) + y)*res + x;
count[i] = 0;
}
extern "C" void count_pts(int depth_)
{
depth = depth_;
int res = 1 << depth;
int count_num = res*res*res;
// alloc and zero on device
cudaMalloc((void**)&d_count, count_num*sizeof(int));
int cbs = min(8, res);
dim3 cz_blocks(res / cbs, res / cbs, res / cbs);
dim3 cz_threads(cbs, cbs, cbs);
count_zero_kernel<<<cz_blocks, cz_threads>>>(count, res, cbs);
CUT_CHECK_ERROR("Kernel execution failed");
}
That was the problem. 3D grids don’t work. Thanks a lot for the help! It would be nice if 3D grids would work though, to process volumetric data. Will that capability appear in some future version of CUDA?