I am getting hung up on a simple problem. With depth = 3, the code runs, but for higher depths I get a runtime error saying “Cuda error: Kernel execution failed in file ‘cppIntegration.cu’ in line 120 : invalid configuration argument” after the first kernel call.

```
__global__ void count_zero_kernel(int* count, int res, int cbs)
{
const int x = blockIdx.x*cbs + threadIdx.x;
const int y = blockIdx.y*cbs + threadIdx.y;
const int z = blockIdx.z*cbs + threadIdx.z;
const unsigned int i = ((z*res) + y)*res + x;
count[i] = 0;
}
__global__ void count_kernel(float3* pos, int* count, int res, int num)
{
const unsigned int i = blockIdx.x*pts_block_size + threadIdx.x;
if (i >= num)
return;
int x = pos[i].x * res;
int y = pos[i].y * res;
int z = pos[i].z * res;
atomicAdd(count + ((z*res + y)*res + x), 1);//count[(z*res + y)*res + x]++;
}
extern "C" void count_pts(int depth_)
{
depth = depth_;
int res = 1 << depth;
int count_num = res*res*res;
// alloc and zero on device
cudaMalloc((void**)&d_count, count_num*sizeof(int));
int cbs = min(8, res);
dim3 cz_blocks(res / cbs, res / cbs, res / cbs);
dim3 cz_threads(cbs, cbs, cbs);
count_zero_kernel<<<cz_blocks, cz_threads>>>(count, res, cbs);
CUT_CHECK_ERROR("Kernel execution failed");
// count the points
count_kernel<<<pts_grid_size, pts_block_size>>>(d_pts_pos, d_count, res, pts.num);
CUT_CHECK_ERROR("Kernel execution failed");
}
```