Very Slow CU_MEMHOSTALLOC_WRITECOMBINED Allocation

Edit: Solved by moving to a customized, stripped-down Ubuntu 10.04 installation. We still aren’t sure what the problem was, but it seems to be solved.

Whenever write-combining is enabled, the cuMemHostAlloc() function takes roughly 600 times longer to execute. Here is the output from the test program posted below:

[codebox]#include <stdio.h>

#include <stdlib.h>

#include <time.h>

#include <cuda.h>

/* Maps a CUresult code to a human-readable message (defined below main). */
static const char *error_string(CUresult cur);

/* Prints the message for cerr and exits if cerr is not CUDA_SUCCESS. */
void cuErr(CUresult cerr);

/* Device handle; filled in by cuDeviceGet() in main. */
CUdevice cuDevice;

/* Driver context; created by cuCtxCreate() in main. */
CUcontext cuContext;

/* NOTE(review): not referenced anywhere in the visible source. */
CUmodule cuModule;

int

main( int argc, char** argv)

{

// initialize CUDA

CUresult err;

cuInit(0);

if(cuDeviceGet(&cuDevice, 0) != CUDA_SUCCESS) {

    printf("Could not get device\n");

    exit(-1);

}

CUresult status = cuCtxCreate( &cuContext, 0, cuDevice );

if ( CUDA_SUCCESS != status )

    exit(-1);

unsigned char *p;

int size = 96*1024*1024;

struct timespec start, end;

// write-combined, pinned memory

printf(“Allocating %d bytes of write-combined, pinned memory…\n”, size);

clock_gettime(CLOCK_REALTIME, &start);

cuErr(cuMemHostAlloc((void**)&p, size, CU_MEMHOSTALLOC_WRITECOMBINED | CU_MEMHOSTALLOC_PORTABLE));

clock_gettime(CLOCK_REALTIME, &end);

float elapsedTime = (float)(end.tv_sec - start.tv_sec) + (float)(end.tv_nsec - start.tv_nsec)/1000000000;

printf("Allocation time: %f s\n\n", elapsedTime);

printf(“Freeing %d bytes of write-combined, pinned memory…\n”, size);

clock_gettime(CLOCK_REALTIME, &start);

cuMemFreeHost(p);

clock_gettime(CLOCK_REALTIME, &end);

elapsedTime = (float)(end.tv_sec - start.tv_sec) + (float)(end.tv_nsec - start.tv_nsec)/1000000000;

printf("Free time: %f s\n\n", elapsedTime);

// pinned memory

printf(“Allocating %d bytes of pinned memory…\n”, size);

clock_gettime(CLOCK_REALTIME, &start);

cuErr(cuMemHostAlloc((void**)&p, size, CU_MEMHOSTALLOC_PORTABLE));

clock_gettime(CLOCK_REALTIME, &end);

elapsedTime = (float)(end.tv_sec - start.tv_sec) + (float)(end.tv_nsec - start.tv_nsec)/1000000000;

printf("Allocation time: %f s\n\n", elapsedTime);

printf(“Freeing %d bytes of pinned memory…\n”, size);

clock_gettime(CLOCK_REALTIME, &start);

cuMemFreeHost(p);

clock_gettime(CLOCK_REALTIME, &end);

elapsedTime = (float)(end.tv_sec - start.tv_sec) + (float)(end.tv_nsec - start.tv_nsec)/1000000000;

printf("Free time: %f s\n\n", elapsedTime);

cuCtxDetach(cuContext);

}

/*
 * Translates a CUDA driver API result code into a short, static,
 * human-readable message.
 *
 * cur: result code returned by a cu*() call.
 *
 * Returns a pointer to a string literal; the caller must not free it.
 * Codes not listed below yield "Unknown error 2".
 */
static const char *error_string(CUresult cur)
{
    switch (cur)
    {
    case CUDA_SUCCESS:                             return "No errors";
    case CUDA_ERROR_INVALID_VALUE:                 return "Invalid value";
    case CUDA_ERROR_OUT_OF_MEMORY:                 return "Out of memory";
    case CUDA_ERROR_NOT_INITIALIZED:               return "Driver not initialized";
    case CUDA_ERROR_DEINITIALIZED:                 return "Driver deinitialized";
    case CUDA_ERROR_NO_DEVICE:                     return "No CUDA-capable device available";
    case CUDA_ERROR_INVALID_DEVICE:                return "Invalid device";
    case CUDA_ERROR_INVALID_IMAGE:                 return "Invalid kernel image";
    case CUDA_ERROR_INVALID_CONTEXT:               return "Invalid context";
    case CUDA_ERROR_CONTEXT_ALREADY_CURRENT:       return "Context already current";
    case CUDA_ERROR_MAP_FAILED:                    return "Map failed";
    case CUDA_ERROR_UNMAP_FAILED:                  return "Unmap failed";
    case CUDA_ERROR_ARRAY_IS_MAPPED:               return "Array is mapped";
    case CUDA_ERROR_ALREADY_MAPPED:                return "Already mapped";
    case CUDA_ERROR_NO_BINARY_FOR_GPU:             return "No binary for GPU";
    case CUDA_ERROR_ALREADY_ACQUIRED:              return "Already acquired";
    case CUDA_ERROR_NOT_MAPPED:                    return "Not mapped";
    case CUDA_ERROR_INVALID_SOURCE:                return "Invalid source";
    case CUDA_ERROR_FILE_NOT_FOUND:                return "File not found";
    case CUDA_ERROR_INVALID_HANDLE:                return "Invalid handle";
    case CUDA_ERROR_NOT_FOUND:                     return "Not found";
    case CUDA_ERROR_NOT_READY:                     return "CUDA not ready";
    case CUDA_ERROR_LAUNCH_FAILED:                 return "Launch failed";
    case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES:       return "Launch exceeded resources";
    case CUDA_ERROR_LAUNCH_TIMEOUT:                return "Launch exceeded timeout";
    case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: return "Launch with incompatible texturing";
    case CUDA_ERROR_UNKNOWN:                       return "Unknown error";
    /* Distinct text so an unlisted code is distinguishable from CUDA_ERROR_UNKNOWN. */
    default:                                       return "Unknown error 2";
    }
}

/*
 * Checks the result of a CUDA driver call.
 *
 * cerr: result code to check.
 *
 * Returns normally when cerr is CUDA_SUCCESS. Otherwise prints the
 * message from error_string() on its own line to stdout and terminates
 * the process via exit(-1).
 */
void cuErr(CUresult cerr)
{
    if (cerr == CUDA_SUCCESS)
        return;

    const char *message = error_string(cerr);
    printf("%s\n", message);
    exit(-1);
}

[/codebox]

Anyone have any ideas?

Anyone have any ideas?

Could someone (preferably with a GTX 460 or Ubuntu) at least try and run the test program and post their results?

I doubt it’s supposed to take nearly 14 seconds to allocate 96MB of memory.

Could someone (preferably with a GTX 460 or Ubuntu) at least try and run the test program and post their results?

I doubt it’s supposed to take nearly 14 seconds to allocate 96MB of memory.

Compiled using 3.2rc with the 260.24 driver on 64 bit Ubuntu 9.04, I get this:

avidday@cuda:~$ nvcc -arch=sm_20 cumemhost.cu -lcuda

cumemhost.cu(16): warning: variable "err" was declared but never referenced

cumemhost.cu(16): warning: variable "err" was declared but never referenced

avidday@cuda:~$ ./a.out 

Allocating 100663296 bytes of write-combined, pinned memory...

Allocation time: 0.299165 s

Freeing 100663296 bytes of write-combined, pinned memory...

Free time: 0.231343 s

Allocating 100663296 bytes of pinned memory...

Allocation time: 0.038438 s

Freeing 100663296 bytes of pinned memory...

Free time: 0.026334 s

I would guess the problem lies with either your specific hardware or installation.

Compiled using 3.2rc with the 260.24 driver on 64 bit Ubuntu 9.04, I get this:

avidday@cuda:~$ nvcc -arch=sm_20 cumemhost.cu -lcuda

cumemhost.cu(16): warning: variable "err" was declared but never referenced

cumemhost.cu(16): warning: variable "err" was declared but never referenced

avidday@cuda:~$ ./a.out 

Allocating 100663296 bytes of write-combined, pinned memory...

Allocation time: 0.299165 s

Freeing 100663296 bytes of write-combined, pinned memory...

Free time: 0.231343 s

Allocating 100663296 bytes of pinned memory...

Allocation time: 0.038438 s

Freeing 100663296 bytes of pinned memory...

Free time: 0.026334 s

I would guess the problem lies with either your specific hardware or installation.

Thanks for testing it out. I’ve tested out on a couple different servers now and it looks like the problem could be related to hyperthreaded i7s.

I tested on a dual X5550 system (8 cores, 16 threads) and the allocation was faster, but still a very slow 2.5 s for 96MB.

On both the X5650 and X5550 systems, with hyperthreading disabled, the allocation time was cut exactly in half.

Testing on a dual E5430 system (8 cores, 8 threads), the results were on par with yours.

Edit2: Moving to Ubuntu 10.04 with custom kernel options resolved the issue. The X5650 is still slower than the X5550 which is slower than the E5430, but I’ve been able to allocate up to 1024 MB in 3 ms.

Thanks for testing it out. I’ve tested out on a couple different servers now and it looks like the problem could be related to hyperthreaded i7s.

I tested on a dual X5550 system (8 cores, 16 threads) and the allocation was faster, but still a very slow 2.5 s for 96MB.

On both the X5650 and X5550 systems, with hyperthreading disabled, the allocation time was cut exactly in half.

Testing on a dual E5430 system (8 cores, 8 threads), the results were on par with yours.

Edit2: Moving to Ubuntu 10.04 with custom kernel options resolved the issue. The X5650 is still slower than the X5550 which is slower than the E5430, but I’ve been able to allocate up to 1024 MB in 3 ms.