I would like to declare a global array at some location in the device global memory that I can use in two different kernel calls.
It would go something like this:

__device__ float* global_data;

int main()
{
    cudaMalloc(&global_data, size);    // allocate the global array
    kernel1<<<grid, block>>>(blah);    // access global_data
    kernel2<<<grid, block>>>(blah);    // access the same global_data again
}
Is this possible?
Yes. When I have done this I have used cudaMalloc to create the array in device memory, or a fixed-size shared array, e.g.

__shared__ short shPosns[512];
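(Bear in mind that a __shared__ array only exists per thread block and only for the duration of a single kernel launch, so it cannot carry data between two kernel calls; only global memory can do that.) As a minimal sketch of how such a shared array is typically used inside one kernel, with a placeholder kernel name and a block size of 512 assumed rather than taken from any real code:

__global__ void reverseWithinBlock(const short* d_in, short* d_out)
{
    __shared__ short shPosns[512];                  // one copy per thread block, gone when the kernel ends

    int tid = threadIdx.x;
    int gid = blockIdx.x * blockDim.x + tid;

    shPosns[tid] = d_in[gid];                       // stage global data in fast shared memory
    __syncthreads();                                // wait until the whole block has written

    d_out[gid] = shPosns[blockDim.x - 1 - tid];     // read back in reversed order within the block
}

This has to be launched with 512 threads per block, e.g. reverseWithinBlock<<<numBlocks, 512>>>(d_in, d_out);, so that the shared array matches the block size.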
You do need to put data into the array, either by copying an array from the host to the device or by writing to it from a kernel.
e.g.
cudaMemcpy( d_Counts, h_Counts, sizeT_Counts, cudaMemcpyHostToDevice );
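To make the overall pattern from the question concrete, here is a minimal, self-contained sketch: allocate the array once with cudaMalloc, then pass the same device pointer to two different kernel launches. The kernel names, sizes, and the trivial kernel bodies are placeholders for illustration, not code from either post:

#include <cuda_runtime.h>
#include <stdio.h>

__global__ void kernel1(float* data, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) data[i] *= 2.0f;               // first kernel writes the global array
}

__global__ void kernel2(float* data, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) data[i] += 1.0f;               // second kernel sees kernel1's results
}

int main()
{
    const int n = 1024;
    float h_data[1024];
    for (int i = 0; i < n; ++i) h_data[i] = (float)i;

    float* d_data = 0;                         // ordinary host-side pointer to device memory
    cudaMalloc((void**)&d_data, n * sizeof(float));
    cudaMemcpy(d_data, h_data, n * sizeof(float), cudaMemcpyHostToDevice);

    kernel1<<<n / 256, 256>>>(d_data, n);      // both launches receive the same pointer,
    kernel2<<<n / 256, 256>>>(d_data, n);      // so they operate on the same device array

    cudaMemcpy(h_data, d_data, n * sizeof(float), cudaMemcpyDeviceToHost);
    cudaFree(d_data);
    printf("h_data[10] = %f\n", h_data[10]);   // expect 10*2 + 1 = 21
    return 0;
}

The key point is that memory allocated with cudaMalloc persists across kernel launches (until cudaFree or the application exits), so kernel2 sees exactly what kernel1 wrote.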
The two kernels in the following code share several device arrays and sit inside a loop that executes hundreds of times, so each kernel is launched hundreds of times.
cudaMemcpy( d_line, h_line, count, cudaMemcpyHostToDevice ); // copy host array h_line to device array d_line
checkCUDAError("cudaMemcpy 1");
EndFieldKernel<<< dimGrid , dimBlock >>> ( d_line, (int)count, d_Posns, d_Counts, d_unProcd);
checkCUDAError("kernel 1 execution");
cudaThreadSynchronize();
checkCUDAError("Synch");
toFloatKernel<<< dimGrid , dimBlock >>> ( d_line, d_Data, count, d_Posns, d_Counts, d_unProcd, inArrayOffset, d_resultArrayElementsDone);
checkCUDAError("kernel 2 execution");
cudaThreadSynchronize();
checkCUDAError("Synch");
cudaMemcpy( h_resultArrayElementsDone, d_resultArrayElementsDone, sizeT_resultArrayElementsDone, cudaMemcpyDeviceToHost ); // copy device array d_resultArrayElementsDone to host array h_resultArrayElementsDone
checkCUDAError("cudaMemcpy offset back");