Thanks for all the comments. I've attached the kernel invocation code below.
My original understanding (perhaps mistaken?) was that the GPU processes its work sequentially, and that multiple CPU threads calling this routine could therefore safely coexist while sharing the persistent GPU memory.
As per the comments above, my current understanding is that, to achieve maximum throughput, I have the following options:
- Avoid the overhead of creating/freeing memory on the GPU, and protect the persistent GPU memory with a mutex on the CPU side (first sketch below)
- Create/free transient memory on the GPU on every call (second sketch, after my code)
Or is there a flaw in the code?
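To make option 1 concrete, here is a minimal sketch of what I mean, assuming a C++11 std::mutex (the names g_gpu_mutex and ModeFilterLocked are my own, and error checking is omitted for brevity):

[code]
#include <mutex>
#include <cuda_runtime.h>

// Hypothetical guard for the persistent device buffers: only one CPU
// thread at a time may run the upload -> kernel -> download sequence.
static std::mutex g_gpu_mutex;
static unsigned char *d_img = nullptr, *d_dest = nullptr;

void ModeFilterLocked(const unsigned char *h_img, unsigned char *h_dest,
                      int width, int height)
{
    std::lock_guard<std::mutex> lock(g_gpu_mutex); // serialize CPU threads

    if (d_img == nullptr) { // allocate once, sized by the first caller
        cudaMalloc(&d_img, width * height);
        cudaMalloc(&d_dest, width * height);
    }

    cudaMemcpy(d_img, h_img, width * height, cudaMemcpyHostToDevice);
    // ... launch the filter kernel here ...
    cudaMemcpy(h_dest, d_dest, width * height, cudaMemcpyDeviceToHost);
} // mutex released here, once the result is safely back on the host
[/code]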
CODE:
[code]
unsigned char *ModeFilter(unsigned char *h_img, unsigned char *h_dest, int width, int height, int radius, int bitshift)
{
    int stride = 4;
    int widthadj = width / stride + 2;    // grid dimensions adjusted for the 4-pixel stride
    int heightadj = height / stride + 2;
    static unsigned char *d_img, *d_dest; // persistent device buffers, allocated once
    int Block_width = 16;
    int Block_height = 16;
    const dim3 grid(iDivUp(widthadj, Block_width), iDivUp(heightadj, Block_height), 1);
    const dim3 block(Block_width, Block_height, 1);
    cudaSetDevice(0);
    static bool START = true;             // one-time allocation flag
    if (START) {
        START = false;
        // Sized from the first call's width/height and never freed
        checkCudaErrors(cudaMalloc((void **)&d_img, width * height * sizeof(unsigned char)));
        checkCudaErrors(cudaMalloc((void **)&d_dest, width * height * sizeof(unsigned char)));
    }
    checkCudaErrors(cudaDeviceSynchronize());
    // Load data
    checkCudaErrors(cudaMemcpy(d_img, h_img, sizeof(unsigned char) * width * height, cudaMemcpyHostToDevice));
    ModeFilter_Kernel_Function<<<grid, block>>>(d_img, d_dest, width, height, radius, bitshift, stride);
    checkCudaErrors(cudaDeviceSynchronize());
    if (h_dest != NULL)
        checkCudaErrors(cudaMemcpy(h_dest, d_dest, sizeof(unsigned char) * width * height, cudaMemcpyDeviceToHost));
    return d_dest;
}
[/code]
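And a corresponding sketch of option 2, with transient buffers created and freed on every call (again my own illustration, error checking omitted; on CUDA 11.2+ I believe cudaMallocAsync/cudaFreeAsync on a stream could reduce the allocation overhead, though I have not measured it):

[code]
void ModeFilterTransient(const unsigned char *h_img, unsigned char *h_dest,
                         int width, int height)
{
    unsigned char *d_img = nullptr, *d_dest = nullptr;
    size_t bytes = (size_t)width * height; // one byte per pixel

    cudaMalloc(&d_img, bytes);   // fresh buffers on every call, so there is
    cudaMalloc(&d_dest, bytes);  // no state shared between CPU threads

    cudaMemcpy(d_img, h_img, bytes, cudaMemcpyHostToDevice);
    // ... launch the filter kernel here ...
    cudaMemcpy(h_dest, d_dest, bytes, cudaMemcpyDeviceToHost);

    cudaFree(d_img);             // pay the alloc/free cost on every call
    cudaFree(d_dest);
}
[/code]

Note that this variant has to copy the result back into h_dest before returning; it cannot hand back the device pointer the way my code above does, because the buffer is freed at the end of the call.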