Judging from a simple test case, managed allocations can share the same memory page. However the programming guide does not specify this.
#include <iostream>
char* alloc(size_t bytes){
std::cerr << "alloc " << bytes << "\n";
char* ptr;
cudaMallocManaged(&ptr, bytes);
return ptr;
}
void dealloc(char* ptr, size_t bytes){
std::cerr << "dealloc " << bytes << "\n";
cudaFree(ptr);
}
int main(){
cudaFree(0);
char* pointers[10];
size_t bytes = 1;
for(int i = 0; i < 10; i++){
pointers[i] = alloc(bytes);
}
for(int i = 0; i < 10; i++){
std::cerr << "pointer " << i << ": " << (void*)pointers[i] << "\n";
}
for(int i = 0; i < 10; i++){
dealloc(pointers[i], bytes);
}
}
On my machine, this prints
pointer 0: 0x7f187a000000
pointer 1: 0x7f187a000200
pointer 2: 0x7f187a000400
pointer 3: 0x7f187a000600
pointer 4: 0x7f187a000800
pointer 5: 0x7f187a000a00
pointer 6: 0x7f187a000c00
pointer 7: 0x7f187a000e00
pointer 8: 0x7f187a001000
pointer 9: 0x7f187a001200
Each pointer is 512 byte aligned, (it must be at least 256-byte aligned according to the programming guide). The first 8 pointers point to the same page.
If this code is run with strace
, one can see that the first call to cudaMallocManage registers a memory range of 2MB which is then used to back the successive allocations.
write(2, "alloc ", 6alloc ) = 6
write(2, "1", 11) = 1
write(2, "\n", 1
) = 1
futex(0x7f28b45f9888, FUTEX_WAKE_PRIVATE, 2147483647) = 0
mmap(NULL, 67104768, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f288c001000
munmap(0x7f288c001000, 33550336) = 0
mmap(0x7f288e000000, 2097152, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_FIXED, 4, 0x7f288e000000) = 0x7f288e000000
ioctl(4, _IOC(0, 0, 0x48, 0), 0x7ffcbe731840) = 0
ioctl(4, _IOC(0, 0, 0x2d, 0), 0x7ffcbe731980) = 0
write(2, "alloc ", 6alloc ) = 6
write(2, "1", 11) = 1
write(2, "\n", 1
) = 1
write(2, "alloc ", 6alloc ) = 6
write(2, "1", 11) = 1
write(2, "\n", 1
) = 1
write(2, "alloc ", 6alloc ) = 6
write(2, "1", 11) = 1
write(2, "\n", 1
) = 1
write(2, "alloc ", 6alloc ) = 6
write(2, "1", 11) = 1
write(2, "\n", 1
) = 1
write(2, "alloc ", 6alloc ) = 6
write(2, "1", 11) = 1
write(2, "\n", 1
) = 1
write(2, "alloc ", 6alloc ) = 6
write(2, "1", 11) = 1
write(2, "\n", 1
) = 1
write(2, "alloc ", 6alloc ) = 6
write(2, "1", 11) = 1
write(2, "\n", 1
) = 1
write(2, "alloc ", 6alloc ) = 6
write(2, "1", 11) = 1
write(2, "\n", 1
) = 1
write(2, "alloc ", 6alloc ) = 6
write(2, "1", 11) = 1
write(2, "\n", 1
) = 1
write(2, "pointer ", 8pointer ) = 8
write(2, "0", 10) = 1
write(2, ": ", 2: ) = 2
write(2, "0x7f288e000000", 140x7f288e000000) = 14
write(2, "\n", 1
) = 1
write(2, "pointer ", 8pointer ) = 8
write(2, "1", 11) = 1
write(2, ": ", 2: ) = 2
write(2, "0x7f288e000200", 140x7f288e000200) = 14
write(2, "\n", 1
)