In Nvidia’s “GPU Computing SDK”, I’m seeing oclMedianFilter allocate a pinned memory buffer and then pass it to an input function which replaces it with a malloc()ed buffer. This would mean both that twice as much memory is used (for that single buffer) and that the application is using slower pageable memory for host->device transfers. Am I mistaken?
Below are two relevant portions, the oclMedianFilter portion and the shrUtils portion:
[codebox] // Allocate pinned input and output host image buffers: mem copy operations to/from pinned memory is much faster than paged memory
// Both buffers are created with the same byte size: one unsigned int per pixel of the image.
szBuffBytes = uiImageWidth * uiImageHeight * sizeof (unsigned int);
<u>cmPinnedBufIn</u> = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
cmPinnedBufOut = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
shrLog("\nclCreateBuffer (Input and Output Pinned Host buffers)...\n");
// Get mapped pointers for writing to pinned input and output host image pointers
// The input buffer is mapped with CL_MAP_WRITE and the output buffer with CL_MAP_READ, both on cqCommandQueue[0].
<u>uiInput</u> = (cl_uint*)clEnqueueMapBuffer(cqCommandQueue[0], <u>cmPinnedBufIn</u>, CL_TRUE, CL_MAP_WRITE, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
uiOutput = (cl_uint*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedBufOut, CL_TRUE, CL_MAP_READ, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
shrLog("clEnqueueMapBuffer (Pointer to Input and Output pinned host buffers)...\n");
// Load image data from file to pinned input host buffer
// NOTE(review): the address of uiInput is passed here, and the shrLoadPPM4ub listing
// below stores a newly malloc()ed pointer through that argument. After this call
// uiInput therefore no longer holds the pointer returned by clEnqueueMapBuffer above,
// and the image data is not in the mapped buffer.
// The call may also update uiImageWidth / uiImageHeight, which were already used
// to compute szBuffBytes -- TODO confirm they cannot change here.
ciErrNum = <b>shrLoadPPM4ub</b>(cPathAndName, <u>(unsigned char **)&uiInput</u>, &uiImageWidth, &uiImageHeight);
oclCheckErrorEX(ciErrNum, shrTRUE, pCleanup);
shrLog("Load Input Image to Input pinned host buffer...\n"); [/codebox]
[codebox]/*
 * Loads a PPM image through loadPPM() and expands it to 4 bytes per pixel,
 * writing 0 into every fourth byte.
 *
 * file : path handed unchanged to loadPPM().
 * data : out-parameter. On success *data is overwritten with a buffer obtained
 *        from malloc() that holds 4 * (*w) * (*h) bytes. Whatever pointer the
 *        caller stored in *data beforehand is neither written to nor freed.
 *        On failure *data is left untouched.
 * w, h : forwarded to loadPPM() and read back afterwards as the image size.
 *
 * Returns shrTRUE on success; shrFALSE when loadPPM() fails, when the padded
 * size does not fit in size_t, or when the output allocation fails.
 */
shrBOOL shrLoadPPM4ub( const char* file, unsigned char** data,
unsigned int *w,unsigned int *h)
{
    unsigned char *idata = 0;
    // NOTE(review): channels is filled in by loadPPM() but never checked; the copy
    // loop below assumes 3 source bytes per pixel -- confirm loadPPM guarantees this.
    unsigned int channels;

    if (!loadPPM( file, &idata, w, h, &channels)) {
        free(idata);
        return shrFALSE;
    }

    // Do the size arithmetic in size_t and reject dimensions whose padded byte
    // count would wrap, instead of multiplying into an int.
    const size_t width = *w;
    const size_t height = *h;
    if (height != 0 && width > ((size_t)-1 / 4) / height) {
        free(idata);
        return shrFALSE;
    }
    const size_t pixels = width * height;

    // Allocate into a local first so *data is only replaced once the copy can succeed.
    unsigned char *padded = (unsigned char*) malloc(pixels * 4);
    if (padded == NULL && pixels != 0) {
        free(idata);
        return shrFALSE;
    }

    // pad 4th component
    const unsigned char *src = idata;
    unsigned char *dst = padded;
    for (size_t i = 0; i < pixels; i++)
    {
        *dst++ = *src++;
        *dst++ = *src++;
        *dst++ = *src++;
        *dst++ = 0;
    }

    // The buffer returned by loadPPM() is no longer needed once it has been expanded.
    free(idata);
    <u>*data</u> = padded;
    return shrTRUE;
}[/codebox]