I have a program that iterates a simple “add-one” function 100 times, using shared memory with
a total grid size of 1024*1024 variables. When I run the program, it crashes CUDA. Code is posted
in full, in case that helps; any ideas what’s going on?
As always, thanks in advance for any ideas.
[codebox]global void FILENAME(int *d_out, int *d_in, int nIterations)
{
extern __shared__ int s_data[];
int in = blockDim.x*blockIdx.x + threadIdx.x;
for (int k=0; k<nIterations; ++k)
{
s_data[in] = d_in[in];
__syncthreads();
d_out[in] = s_data[in] + 1;
int *temp = d_out;
d_out = d_in;
d_in = temp;
}
}
int main(int argc, char** argv)
{
int *HOST_Array;
int dimA = 1024*1024;
int nIterations = 100;
int i;
int *device_OUT, *device_IN;
int numThreadsPerBlock = 256;
int numBlocks = dimA / numThreadsPerBlock;
int sharedMemSize = numThreadsPerBlock * sizeof(int);
size_t memSize = numBlocks * numThreadsPerBlock * sizeof(int);
HOST_Array = (int *) malloc(memSize);
cudaMalloc((void **) &device_IN, memSize);
cudaMalloc((void **) &device_OUT, memSize);
cudaMemcpy(device_IN, HOST_Array, memSize, cudaMemcpyHostToDevice);
dim3 dimGrid(numBlocks);
dim3 dimBlock(numThreadsPerBlock);
FILENAME <<<dimGrid, dimBlock, sharedMemSize>>> (device_OUT, device_IN, nIterations);
cudaThreadSynchronize();
cudaMemcpy(HOST_Array, device_OUT, memSize, cudaMemcpyDeviceToHost);
cudaFree(device_IN);
cudaFree(device_OUT);
free(HOST_Array);
}
[/codebox]