SDK Bug: oclMedianFilter Program creates pinned memory but then uses pageable memory instead.

In NVIDIA’s “GPU Computing SDK”, I’m seeing oclMedianFilter allocate a pinned memory buffer, map it, and then pass the address of the mapped pointer (uiInput) to an image-loading function (shrLoadPPM4ub), which overwrites that pointer with a malloc()ed buffer. This would mean both that twice as much memory is used (for that single buffer) and that the application is using slower pageable memory for host->device transfers. Am I mistaken?

Below are two relevant portions, the oclMedianFilter portion and the shrUtils portion:

[codebox] // Allocate pinned input and output host image buffers: mem copy operations to/from pinned memory is much faster than paged memory

szBuffBytes = uiImageWidth * uiImageHeight * sizeof (unsigned int);

<u>cmPinnedBufIn</u> = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum);

oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

cmPinnedBufOut = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum);

oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

shrLog("\nclCreateBuffer (Input and Output Pinned Host buffers)...\n"); 

// Get mapped pointers for writing to pinned input and output host image pointers

<u>uiInput</u> = (cl_uint*)clEnqueueMapBuffer(cqCommandQueue[0], <u>cmPinnedBufIn</u>, CL_TRUE, CL_MAP_WRITE, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum);

oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

uiOutput = (cl_uint*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedBufOut, CL_TRUE, CL_MAP_READ, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum);

oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

shrLog("clEnqueueMapBuffer (Pointer to Input and Output pinned host buffers)...\n"); 

// Load image data from file to pinned input host buffer

ciErrNum = <b>shrLoadPPM4ub</b>(cPathAndName, <u>(unsigned char **)&uiInput</u>, &uiImageWidth, &uiImageHeight);

oclCheckErrorEX(ciErrNum, shrTRUE, pCleanup);

shrLog("Load Input Image to Input pinned host buffer...\n"); [/codebox]

[codebox]// Loads an image through loadPPM() and repacks it at 4 bytes per pixel:
// the first three bytes of every pixel are copied and the 4th byte is set to 0.
//
// file  path handed unchanged to loadPPM()
// data  out: receives a buffer of (*w * *h * 4) bytes allocated here with malloc().
//       NOTE: *data is ALWAYS overwritten with that new allocation; whatever
//       pointer the caller stored there beforehand is neither written to nor
//       freed by this function.
// w, h  out: image dimensions, presumably filled in by loadPPM() (read back here
//       to size the output buffer)
//
// Returns shrTRUE on success. Returns shrFALSE if loadPPM() fails, or if the
// output buffer size would overflow or cannot be allocated; in the latter two
// cases *data is set to NULL.
//
// NOTE(review): the copy loop reads 3 bytes per pixel but 'channels' is never
// checked -- assumes loadPPM() produced 3-channel data; confirm.
shrBOOL shrLoadPPM4ub( const char* file, unsigned char** data,
            unsigned int *w,unsigned int *h)
{
    unsigned char *idata = 0;
    unsigned int channels;

    if (!loadPPM( file, &idata, w, h, &channels))
    {
        free(idata);
        return shrFALSE;
    }

    // keep the original pointer: idata is advanced by the copy loop below,
    // and the temporary buffer must be freed through its start address
    unsigned char* idata_orig = idata;

    // Reject dimensions whose 4-byte-per-pixel size does not fit in size_t
    // (the previous 'int' product could wrap or go negative).
    const size_t max_pixels = (size_t)-1 / 4;
    if (*h != 0 && *w > max_pixels / *h)
    {
        *data = NULL;
        free(idata_orig);
        return shrFALSE;
    }
    size_t size = (size_t)*w * (size_t)*h;

    <u>*data</u> = (unsigned char*) malloc( sizeof(unsigned char) * size * 4);

    // malloc(0) may legitimately return NULL, so only a non-empty image
    // with a NULL result counts as an allocation failure
    if (*data == NULL && size != 0)
    {
        free(idata_orig);
        return shrFALSE;
    }

    // pad 4th component
    unsigned char *ptr = *data;
    for(size_t i=0; i<size; i++)
    {
        *ptr++ = *idata++;
        *ptr++ = *idata++;
        *ptr++ = *idata++;
        *ptr++ = 0;
    }

    free( idata_orig);
    return shrTRUE;
}[/codebox]