I have a program that iterates a simple “add-one” function 100 times, using shared memory with
a total grid size of 1024*1024 variables. When I run the program, it crashes CUDA. Code is posted
in full; any ideas what’s going on?
global void FILENAME(int *d_out, int d_in, int nIterations)
{
extern shared int s_data[];
int in = blockDim.xblockIdx.x + threadIdx.x;
for (int k=0; k<nIterations; ++k)
{
s_data[in] = d_in[in];
__syncthreads();
d_out[in] = s_data[in] + 1;
int *temp = d_out;
d_out = d_in;
d_in = temp;
}
}
int main(int argc, char** argv)
{
int HOST_Array;
int dimA = 10241024;
int nIterations = 10;
int i;
int *device_OUT, *device_IN;
int numThreadsPerBlock = 256;
int numBlocks = dimA / numThreadsPerBlock;
int sharedMemSize = numThreadsPerBlock * sizeof(int);
size_t memSize = numBlocks * numThreadsPerBlock * sizeof(int);
HOST_Array = (int *) malloc(memSize);
cudaMalloc((void **) &device_IN, memSize);
cudaMalloc((void **) &device_OUT, memSize);
for (i=0; i<dimA; ++i)
{
HOST_Array[i] = float(i) + 1;
}
for (i=0; i<16; ++i)
{
printf("Host Array = %d\n", HOST_Array[i]);
}
clock_t CUDA = clock();
cudaMemcpy(device_IN, HOST_Array, memSize, cudaMemcpyHostToDevice);
dim3 dimGrid(numBlocks);
dim3 dimBlock(numThreadsPerBlock);
FILENAME <<<dimGrid, dimBlock, sharedMemSize>>> (device_OUT, device_IN, nIterations);
cudaThreadSynchronize();
cudaMemcpy(HOST_Array, device_OUT, memSize, cudaMemcpyDeviceToHost);
checkCUDAError("memcpy");
printf("CUDA Time = %f sec\n", ((double)clock() - CUDA)/CLOCKS_PER_SEC);
printf("\n\n");
for (i=0; i<16; ++i)
{
printf(“Host Array = %d\n”, HOST_Array[i]);
}
cudaFree(device_IN);
cudaFree(device_OUT);
free(HOST_Array);
}