Below is a really simple kernel I wrote to fill a 3d array in global device memory, and a program to test that kernel. For some reason, the kernel isn’t being executed. There’s don’t seem to be any errors or anything like that; according to the Visual Profiler, the kernel call never happened! What’s going on here?
#include <stdio.h>
#include <assert.h>
__global__ void set_array(float *array, unsigned int Nx, unsigned int Ny, unsigned int Nz)
{
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
unsigned int z = blockIdx.z * blockDim.z + threadIdx.z;
float rx = (float)x / Nx;
float ry = (float)y / Ny;
float rz = (float)z / Nz;
array[z*Nx*Ny + y*Nx + x] = 64.0 * rx * ry * rz * (1-rx) * (1-ry) * (1-rz);
}
int main(int argc, char *argv[])
{
int Nx=128, Ny=128, Nz=128, N=Nx*Ny*Nz;
float *d_array;
cudaMalloc( (void**) &d_array, N*sizeof(float));
cudaMemset(d_array, 0, N*sizeof(float));
dim3 dimBlock(16, 4, 4);
dim3 dimGrid(Nx/dimBlock.x, Ny/dimBlock.y, Nz/dimBlock.z);
set_array <<< dimGrid, dimBlock >>> (d_array, Nx, Ny, Nz);
assert(cudaThreadSynchronize() == cudaSuccess);
float *array = (float *)malloc(N*sizeof(float));
cudaMemcpy(array, d_array, N*sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(d_array);
int i;
for (i=0; i<N; i++) {
if (array[i] > 0.0) {
printf("nonzero value found at index %d (good!)\n", i);
break;
}
}
if(i==N)
printf("no nonzero values found (bad!)\n");
free(array);
return 0;
}