Edit: Solved with a customized, stripped down, 10.04 installation. We still aren’t sure what the problem was, but it seems to be solved.
Whenever write-combining is enabled, the cuMemHostAlloc() function takes roughly 600 times longer to execute than the same pinned allocation without it. Here is the output from the test program posted below:
[codebox]#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda.h>
// Forward declarations; both functions are defined after main().
// error_string: maps a CUresult code to a static, human-readable string.
static const char *error_string(CUresult cur);
// cuErr: prints the error text and exits the process when cerr is not CUDA_SUCCESS.
void cuErr(CUresult cerr);
// Device handle, filled in by cuDeviceGet() in main().
CUdevice cuDevice;
// Context handle, created by cuCtxCreate() in main().
CUcontext cuContext;
// NOTE(review): cuModule is not referenced anywhere in the visible code.
CUmodule cuModule;
int
main( int argc, char** argv)
{
// initialize CUDA
CUresult err;
cuInit(0);
if(cuDeviceGet(&cuDevice, 0) != CUDA_SUCCESS) {
printf("Could not get device\n");
exit(-1);
}
CUresult status = cuCtxCreate( &cuContext, 0, cuDevice );
if ( CUDA_SUCCESS != status )
exit(-1);
unsigned char *p;
int size = 96*1024*1024;
struct timespec start, end;
// write-combined, pinned memory
printf(“Allocating %d bytes of write-combined, pinned memory…\n”, size);
clock_gettime(CLOCK_REALTIME, &start);
cuErr(cuMemHostAlloc((void**)&p, size, CU_MEMHOSTALLOC_WRITECOMBINED | CU_MEMHOSTALLOC_PORTABLE));
clock_gettime(CLOCK_REALTIME, &end);
float elapsedTime = (float)(end.tv_sec - start.tv_sec) + (float)(end.tv_nsec - start.tv_nsec)/1000000000;
printf("Allocation time: %f s\n\n", elapsedTime);
printf(“Freeing %d bytes of write-combined, pinned memory…\n”, size);
clock_gettime(CLOCK_REALTIME, &start);
cuMemFreeHost(p);
clock_gettime(CLOCK_REALTIME, &end);
elapsedTime = (float)(end.tv_sec - start.tv_sec) + (float)(end.tv_nsec - start.tv_nsec)/1000000000;
printf("Free time: %f s\n\n", elapsedTime);
// pinned memory
printf(“Allocating %d bytes of pinned memory…\n”, size);
clock_gettime(CLOCK_REALTIME, &start);
cuErr(cuMemHostAlloc((void**)&p, size, CU_MEMHOSTALLOC_PORTABLE));
clock_gettime(CLOCK_REALTIME, &end);
elapsedTime = (float)(end.tv_sec - start.tv_sec) + (float)(end.tv_nsec - start.tv_nsec)/1000000000;
printf("Allocation time: %f s\n\n", elapsedTime);
printf(“Freeing %d bytes of pinned memory…\n”, size);
clock_gettime(CLOCK_REALTIME, &start);
cuMemFreeHost(p);
clock_gettime(CLOCK_REALTIME, &end);
elapsedTime = (float)(end.tv_sec - start.tv_sec) + (float)(end.tv_nsec - start.tv_nsec)/1000000000;
printf("Free time: %f s\n\n", elapsedTime);
cuCtxDetach(cuContext);
}
/*
 * Translates a CUDA driver API result code into a short description.
 *
 * cur - result code returned by a cu*() call
 *
 * Returns a pointer to a string literal (never NULL, must not be freed).
 * Codes not listed below yield "Unknown error 2", which keeps them
 * distinguishable from the driver's own CUDA_ERROR_UNKNOWN ("Unknown error").
 */
static const char *error_string(CUresult cur)
{
    switch (cur)
    {
    case CUDA_SUCCESS:                              return "No errors";
    case CUDA_ERROR_INVALID_VALUE:                  return "Invalid value";
    case CUDA_ERROR_OUT_OF_MEMORY:                  return "Out of memory";
    case CUDA_ERROR_NOT_INITIALIZED:                return "Driver not initialized";
    case CUDA_ERROR_DEINITIALIZED:                  return "Driver deinitialized";
    case CUDA_ERROR_NO_DEVICE:                      return "No CUDA-capable device available";
    case CUDA_ERROR_INVALID_DEVICE:                 return "Invalid device";
    case CUDA_ERROR_INVALID_IMAGE:                  return "Invalid kernel image";
    case CUDA_ERROR_INVALID_CONTEXT:                return "Invalid context";
    case CUDA_ERROR_CONTEXT_ALREADY_CURRENT:        return "Context already current";
    case CUDA_ERROR_MAP_FAILED:                     return "Map failed";
    case CUDA_ERROR_UNMAP_FAILED:                   return "Unmap failed";
    case CUDA_ERROR_ARRAY_IS_MAPPED:                return "Array is mapped";
    case CUDA_ERROR_ALREADY_MAPPED:                 return "Already mapped";
    case CUDA_ERROR_NO_BINARY_FOR_GPU:              return "No binary for GPU";
    case CUDA_ERROR_ALREADY_ACQUIRED:               return "Already acquired";
    case CUDA_ERROR_NOT_MAPPED:                     return "Not mapped";
    case CUDA_ERROR_INVALID_SOURCE:                 return "Invalid source";
    case CUDA_ERROR_FILE_NOT_FOUND:                 return "File not found";
    case CUDA_ERROR_INVALID_HANDLE:                 return "Invalid handle";
    case CUDA_ERROR_NOT_FOUND:                      return "Not found";
    case CUDA_ERROR_NOT_READY:                      return "CUDA not ready";
    case CUDA_ERROR_LAUNCH_FAILED:                  return "Launch failed";
    case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES:        return "Launch exceeded resources";
    case CUDA_ERROR_LAUNCH_TIMEOUT:                 return "Launch exceeded timeout";
    case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING:  return "Launch with incompatible texturing";
    case CUDA_ERROR_UNKNOWN:                        return "Unknown error";
    // The original literal used typographic quotes here, which does not compile.
    default:                                        return "Unknown error 2";
    }
}
/*
 * Aborts the program if a CUDA driver API call failed.
 *
 * cerr - result code returned by a cu*() call
 *
 * On CUDA_SUCCESS this returns without doing anything. Otherwise it writes
 * the numeric code and its description to stderr and terminates the process
 * with a failure status; it does not return in that case.
 */
void cuErr(CUresult cerr)
{
    if (cerr == CUDA_SUCCESS)
        return;

    // Diagnostics go to stderr so they are not mixed into the timing output.
    // The numeric code is included because error_string() cannot name every code.
    fprintf(stderr, "CUDA driver error %d: %s\n", (int)cerr, error_string(cerr));
    exit(EXIT_FAILURE);
}
[/codebox]