I am not sure whether this code should work in CUDA; I cannot find any documentation that tells me whether copied memory remains available on the device throughout the iterations of a loop, i.e. (I know some things may not be correct):
// Answer to the question in the surrounding text: yes — memory allocated
// with cudaMalloc lives on the device until cudaFree is called, so data
// copied to it before the loop remains valid across every kernel launch
// inside the loop. No re-upload is needed unless the host changes the data.

// Allocate the two device grids (current generation and next generation).
// NOTE(review): `size` / `size2` are assumed to be >= sizeof(float)*N — the
// later copies use sizeof(float)*N; confirm they agree.
cudaMalloc((void **) &a_d, size);
cudaMalloc((void **) &a_d2, size2);

// One-time upload of the initial state; these buffers persist on the
// device for the whole simulation.
cudaMemcpy(a_d, CAGrid, sizeof(float)*N, cudaMemcpyHostToDevice);
cudaMemcpy(a_d2, newCAGrid, sizeof(float)*N, cudaMemcpyHostToDevice);

// The execution configuration is loop-invariant — compute it once.
dim3 kernelBlockGrid(80);
dim3 threadBlock(80, 1);

while (iterationNo < 2500) {
    // Promote last iteration's output (a_d2) to this iteration's input
    // (a_d) entirely on the device. The original code re-uploaded
    // newCAGrid from the host here, which was just downloaded from a_d2
    // at the bottom of the loop — a device-to-device copy is equivalent
    // and avoids a PCIe round trip.
    cudaMemcpy(a_d, a_d2, sizeof(float)*N, cudaMemcpyDeviceToDevice);

    // Run one generation of the automaton on the device.
    testFunction <<<kernelBlockGrid, threadBlock, sharedMemSize>>> (a_d, a_d2, N, radius, arraySize, slotSize);

    // Kernel launches do not return cudaError_t — launch-configuration
    // errors are only visible through cudaGetLastError().
    cudaError_t launchErr = cudaGetLastError();
    if (launchErr != cudaSuccess)
        fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(launchErr));

    // Block until the device has completed. cudaThreadSynchronize() is
    // deprecated; cudaDeviceSynchronize() is the current API.
    cudaDeviceSynchronize();

    // Retrieve the new grid so the host can draw it.
    cudaMemcpy(newCAGrid, a_d2, sizeof(float)*N, cudaMemcpyDeviceToHost);
    drawGrid(dst, rect, cellWidth, padding); // drawGrid reads newCAGrid

    iterationNo++;
}
printf("End");