How can I copy memory to the device only once when launching a kernel in a loop?

I am not sure whether this code should work in CUDA. I cannot find any documentation that tells me whether memory copied to the device remains available there throughout the execution of a loop, i.e. (I know some things may not be correct):

    // Allocate the two device buffers once, before the loop. Nothing in this
// fragment frees or reallocates them, so the same allocations (and whatever
// was last written to them) are reused by every iteration below.
//
// Every CUDA call's status is checked; the first failure stops further work
// and is reported just before "End".
cudaError_t status = cudaMalloc((void **) &a_d, size);
if (status == cudaSuccess)
	status = cudaMalloc((void **) &a_d2, size2);

// NOTE(review): the buffers are allocated with `size` / `size2`, but every
// copy below moves sizeof(float)*N bytes -- confirm both allocations are at
// least that large.
const size_t gridBytes = sizeof(float) * N;

// Seed both device buffers from the host exactly once.
if (status == cudaSuccess)
	status = cudaMemcpy(a_d, CAGrid, gridBytes, cudaMemcpyHostToDevice);
if (status == cudaSuccess)
	status = cudaMemcpy(a_d2, newCAGrid, gridBytes, cudaMemcpyHostToDevice);

// The execution configuration is the same for every launch, so build it once.
const dim3 kernelBlockGrid(80);
const dim3 threadBlock(80, 1);

while (status == cudaSuccess && iterationNo < 2500) {

	// Refresh the kernel's input (a_d) from the previous output (a_d2) with a
	// device-to-device copy instead of re-uploading newCAGrid from the host.
	// a_d2 holds the same bytes as host newCAGrid here: it was uploaded from
	// newCAGrid before the loop and newCAGrid is downloaded from it at the end
	// of each iteration.
	// NOTE(review): this assumes drawGrid() does not modify newCAGrid -- confirm.
	// If the kernel only reads a_d and fully overwrites a_d2, swapping the two
	// pointers would avoid even this copy.
	status = cudaMemcpy(a_d, a_d2, gridBytes, cudaMemcpyDeviceToDevice);
	if (status != cudaSuccess)
		break;

	// Do the calculation on the device.
	testFunction <<<kernelBlockGrid, threadBlock, sharedMemSize>>> (a_d, a_d2, N, radius, arraySize, slotSize);

	// A kernel launch returns nothing; launch-configuration errors are only
	// visible through cudaGetLastError().
	status = cudaGetLastError();
	if (status != cudaSuccess)
		break;

	// Block until the device has completed (cudaThreadSynchronize is
	// deprecated); this also surfaces errors raised while the kernel ran.
	status = cudaDeviceSynchronize();
	if (status != cudaSuccess)
		break;

	// Retrieve the new grid for drawing.
	status = cudaMemcpy(newCAGrid, a_d2, gridBytes, cudaMemcpyDeviceToHost);
	if (status != cudaSuccess)
		break;

	drawGrid(dst, rect, cellWidth, padding); // The drawGrid function makes use of the newCAGrid

	iterationNo++;
}

if (status != cudaSuccess)
	printf("CUDA error: %s\n", cudaGetErrorString(status));
printf("End");