Does cudaMallocManaged put each allocation into separate memory pages?

Hi all

I’ve been using a custom allocator for std::vector objects that performs its allocations with cudaMallocManaged. This lets me access the buffers from both the CPU and the GPU, and it has worked nicely in the past when the buffers were fairly large.
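For reference, here is a minimal sketch of the kind of allocator I mean (the name managed_allocator and the details are just illustrative; error handling is reduced to throwing std::bad_alloc):

#include <cuda_runtime.h>
#include <cstddef>
#include <new>
#include <vector>

// Minimal sketch of an allocator that backs std::vector with managed memory.
// The class name and structure are illustrative, not from any library.
template <class T>
struct managed_allocator {
    using value_type = T;

    managed_allocator() = default;
    template <class U> managed_allocator(const managed_allocator<U>&) {}

    T* allocate(std::size_t n) {
        void* ptr = nullptr;
        if (cudaMallocManaged(&ptr, n * sizeof(T)) != cudaSuccess)
            throw std::bad_alloc();
        return static_cast<T*>(ptr);
    }

    void deallocate(T* ptr, std::size_t) noexcept {
        cudaFree(ptr);
    }
};

template <class T, class U>
bool operator==(const managed_allocator<T>&, const managed_allocator<U>&) { return true; }
template <class T, class U>
bool operator!=(const managed_allocator<T>&, const managed_allocator<U>&) { return false; }

// Usage: std::vector<float, managed_allocator<float>> v(1024);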

However, in my latest project I am allocating a large number of these vector objects, each fairly small, and linking to the buffers through pointer lists. Performance is now quite poor, and I suspect on-demand paging is the cause.

I was wondering whether each pointer returned by cudaMallocManaged resides in a separate memory page. If that is the case, I would probably have to introduce some memory pooling on top of this.
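Something along these lines is what I have in mind: a rough, untested sketch of a bump-pointer pool carved out of a single managed allocation (the class name ManagedPool is made up, error checking omitted):

#include <cuda_runtime.h>
#include <cstddef>

// Rough sketch: hand out sub-buffers from one large managed allocation.
// No per-buffer freeing; the whole pool is released at once.
class ManagedPool {
public:
    explicit ManagedPool(std::size_t capacity) : capacity_(capacity) {
        cudaMallocManaged(&base_, capacity_);  // error checking omitted
    }
    ~ManagedPool() { cudaFree(base_); }

    // alignment must be a power of two
    void* allocate(std::size_t bytes, std::size_t alignment = 256) {
        std::size_t aligned = (offset_ + alignment - 1) & ~(alignment - 1);
        if (aligned + bytes > capacity_) return nullptr;  // pool exhausted
        offset_ = aligned + bytes;
        return static_cast<char*>(base_) + aligned;
    }

private:
    void* base_ = nullptr;
    std::size_t capacity_ = 0;
    std::size_t offset_ = 0;
};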

Does anyone have some more information on this?

Judging from a simple test case, managed allocations can share the same memory page. However, the programming guide does not specify this.

#include <cuda_runtime.h>
#include <iostream>

// Allocate a managed buffer and log the request (error checking omitted for brevity).
char* alloc(size_t bytes){
    std::cerr << "alloc " << bytes << "\n";
    char* ptr;
    cudaMallocManaged(&ptr, bytes);
    return ptr;
}

void dealloc(char* ptr, size_t bytes){
    std::cerr << "dealloc " << bytes << "\n";
    cudaFree(ptr);
}

int main(){
    cudaFree(0); // establish the CUDA context up front so it does not show up in the trace

    char* pointers[10];
    size_t bytes = 1;

    // Request ten tiny (1-byte) managed allocations.
    for(int i = 0; i < 10; i++){
        pointers[i] = alloc(bytes);
    }

    // Print the returned addresses to see how far apart they are.
    for(int i = 0; i < 10; i++){
        std::cerr << "pointer " << i << ": " << (void*)pointers[i] << "\n";
    }

    for(int i = 0; i < 10; i++){
        dealloc(pointers[i], bytes);
    }
}

On my machine, this prints

pointer 0: 0x7f187a000000
pointer 1: 0x7f187a000200
pointer 2: 0x7f187a000400
pointer 3: 0x7f187a000600
pointer 4: 0x7f187a000800
pointer 5: 0x7f187a000a00
pointer 6: 0x7f187a000c00
pointer 7: 0x7f187a000e00
pointer 8: 0x7f187a001000
pointer 9: 0x7f187a001200

Each pointer is 512-byte aligned (the programming guide only guarantees at least 256-byte alignment), and the first eight pointers fall within the same 4 KiB host page.
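One way to double-check this from the host side is to compare each pointer's page index against the CPU page size reported by sysconf. A small sketch (note this only reflects the host page size, not whatever granularity the driver actually migrates at):

#include <unistd.h>
#include <cstdint>
#include <iostream>

// Print which host-sized page each pointer falls into.
void print_page_indices(char* const* pointers, int count) {
    const std::uintptr_t page = static_cast<std::uintptr_t>(sysconf(_SC_PAGESIZE));
    for (int i = 0; i < count; i++) {
        std::uintptr_t addr = reinterpret_cast<std::uintptr_t>(pointers[i]);
        std::cerr << "pointer " << i << " is in page " << (addr / page) << "\n";
    }
}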

Running the code under strace shows that the first call to cudaMallocManaged maps a 2 MiB memory range, which then backs the subsequent allocations.

write(2, "alloc ", 6alloc )                   = 6
write(2, "1", 11)                        = 1
write(2, "\n", 1
)                       = 1
futex(0x7f28b45f9888, FUTEX_WAKE_PRIVATE, 2147483647) = 0
mmap(NULL, 67104768, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f288c001000
munmap(0x7f288c001000, 33550336)        = 0
mmap(0x7f288e000000, 2097152, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_FIXED, 4, 0x7f288e000000) = 0x7f288e000000
ioctl(4, _IOC(0, 0, 0x48, 0), 0x7ffcbe731840) = 0
ioctl(4, _IOC(0, 0, 0x2d, 0), 0x7ffcbe731980) = 0
write(2, "alloc ", 6alloc )                   = 6
write(2, "1", 11)                        = 1
write(2, "\n", 1
)                       = 1
write(2, "alloc ", 6alloc )                   = 6
write(2, "1", 11)                        = 1
write(2, "\n", 1
)                       = 1
write(2, "alloc ", 6alloc )                   = 6
write(2, "1", 11)                        = 1
write(2, "\n", 1
)                       = 1
write(2, "alloc ", 6alloc )                   = 6
write(2, "1", 11)                        = 1
write(2, "\n", 1
)                       = 1
write(2, "alloc ", 6alloc )                   = 6
write(2, "1", 11)                        = 1
write(2, "\n", 1
)                       = 1
write(2, "alloc ", 6alloc )                   = 6
write(2, "1", 11)                        = 1
write(2, "\n", 1
)                       = 1
write(2, "alloc ", 6alloc )                   = 6
write(2, "1", 11)                        = 1
write(2, "\n", 1
)                       = 1
write(2, "alloc ", 6alloc )                   = 6
write(2, "1", 11)                        = 1
write(2, "\n", 1
)                       = 1
write(2, "alloc ", 6alloc )                   = 6
write(2, "1", 11)                        = 1
write(2, "\n", 1
)                       = 1
write(2, "pointer ", 8pointer )                 = 8
write(2, "0", 10)                        = 1
write(2, ": ", 2: )                       = 2
write(2, "0x7f288e000000", 140x7f288e000000)          = 14
write(2, "\n", 1
)                       = 1
write(2, "pointer ", 8pointer )                 = 8
write(2, "1", 11)                        = 1
write(2, ": ", 2: )                       = 2
write(2, "0x7f288e000200", 140x7f288e000200)          = 14
write(2, "\n", 1
)      

Thanks for the response, this was helpful.

It appears my CUDA code was producing so much output data that the PCIe transfer speed for getting the results back to the host became the limiting factor.

That’s an unusual situation. You may be able to improve performance by copying the data back in the largest chunks possible: since PCIe transfers are subject to various fixed per-transfer overheads, throughput typically isn’t maximized until the chunk size reaches several megabytes.
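As a rough illustration (the sizes and layout below are made up for the example), packing the results contiguously in device memory and copying them back with a single large cudaMemcpy is generally much faster than issuing one small copy per result:

#include <cuda_runtime.h>
#include <vector>

int main() {
    const size_t num_results  = 10000;
    const size_t result_bytes = 1024;                 // 1 KiB per result (illustrative)
    const size_t total_bytes  = num_results * result_bytes;

    char* d_results = nullptr;
    cudaMalloc(&d_results, total_bytes);              // results packed contiguously on the device

    std::vector<char> h_results(total_bytes);

    // Preferred: one large copy amortizes the per-transfer overhead.
    cudaMemcpy(h_results.data(), d_results, total_bytes, cudaMemcpyDeviceToHost);

    // Slower alternative: many small copies, each paying the fixed overhead.
    // for (size_t i = 0; i < num_results; i++)
    //     cudaMemcpy(h_results.data() + i * result_bytes,
    //                d_results + i * result_bytes,
    //                result_bytes, cudaMemcpyDeviceToHost);

    cudaFree(d_results);
    return 0;
}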