According to the CUDA Runtime API documentation (CUDA Runtime API :: CUDA Toolkit Documentation):
If ptr has memory type cudaMemoryTypeDevice then this identifies the device on which the memory referred to by ptr physically resides.
However, when I use peer-to-peer mapping to map memory on GPU 0 into GPU 1, I find that cudaPointerGetAttributes reports device 0 for the VMM-based API, while it reports device 1 for the normal cudaMalloc-based API.
The reproducible examples are:
#include <cuda_runtime.h>
#include <cuda.h>
#include <iostream>
#include <cstring>
#include <unistd.h>
#include <sys/wait.h>
// Check a CUDA Runtime API call: on any failure, print the source location and
// the human-readable error string, then terminate the process with exit(1).
// Wrapped in do { } while(0) so the macro behaves as a single statement.
#define CHECK_CUDA(call) \
do { \
cudaError_t err = call; \
if (err != cudaSuccess) { \
std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__ << " - " \
<< cudaGetErrorString(err) << std::endl; \
exit(1); \
} \
} while(0)
// Repro #1 — legacy CUDA IPC path:
// The parent (on device 0) allocates 2 GiB with cudaMalloc and exports it via
// cudaIpcGetMemHandle; the child (on device 1) imports it with
// cudaIpcOpenMemHandle and queries cudaPointerGetAttributes on the imported
// pointer. Observed behavior (output below): attr.device == 1, i.e. the
// importing device, even though the physical allocation was made on device 0.
int main(int argc, char** argv) {
// Create pipe for IPC handle communication (parent writes, child reads)
int pipe_fd[2];
if (pipe(pipe_fd) == -1) {
std::cerr << "Failed to create pipe" << std::endl;
return 1;
}
// Fork to create child process
pid_t pid = fork();
if (pid == -1) {
std::cerr << "Failed to fork" << std::endl;
return 1;
}
if (pid > 0) {
// Parent process (Process 0)
close(pipe_fd[0]); // Close read end
// Set device 0
int device = 0;
CHECK_CUDA(cudaSetDevice(device));
std::cout << "Process 0 (parent) PID: " << getpid() << " using CUDA device " << device << std::endl;
// Allocate memory and create IPC handle
void* d_ptr;
const unsigned long long size_bytes = 2ull * 1024 * 1024 * 1024; // 2 GiB
CHECK_CUDA(cudaMalloc(&d_ptr, size_bytes));
std::cout << "Process 0: Allocated " << size_bytes << " bytes at " << d_ptr << std::endl;
// Create IPC memory handle for the allocation so another process can map it
cudaIpcMemHandle_t ipc_handle;
CHECK_CUDA(cudaIpcGetMemHandle(&ipc_handle, d_ptr));
std::cout << "Process 0: Created IPC handle" << std::endl;
// Send IPC handle to child process through pipe.
// NOTE(review): the handle (64 bytes) is well under PIPE_BUF, so a short
// write is not expected in practice; the exact-size check guards it anyway.
ssize_t written = write(pipe_fd[1], &ipc_handle, sizeof(cudaIpcMemHandle_t));
if (written != sizeof(cudaIpcMemHandle_t)) {
std::cerr << "Failed to write IPC handle to pipe" << std::endl;
exit(1);
}
std::cout << "Process 0: Sent IPC handle to child process" << std::endl;
// Wait for the child to finish BEFORE freeing: the child's imported
// mapping must be closed before the exporting allocation is released.
int status;
waitpid(pid, &status, 0);
// Clean up
CHECK_CUDA(cudaFree(d_ptr));
std::cout << "Process 0: Freed memory and exiting" << std::endl;
close(pipe_fd[1]); // Close write end
} else {
// Child process (Process 1)
close(pipe_fd[1]); // Close write end
// Set device 1 — a different device than the one the memory lives on
int device = 1;
CHECK_CUDA(cudaSetDevice(device));
std::cout << "Process 1 (child) PID: " << getpid() << " using CUDA device " << device << std::endl;
// Receive IPC handle from parent process through pipe
cudaIpcMemHandle_t ipc_handle;
ssize_t read_bytes = read(pipe_fd[0], &ipc_handle, sizeof(cudaIpcMemHandle_t));
if (read_bytes != sizeof(cudaIpcMemHandle_t)) {
std::cerr << "Failed to read IPC handle from pipe" << std::endl;
exit(1);
}
std::cout << "Process 1: Received IPC handle from parent process" << std::endl;
// Import the memory using the IPC handle; the lazy flag requests that
// peer access from the current device (1) be enabled on demand.
void* d_ptr;
CHECK_CUDA(cudaIpcOpenMemHandle(&d_ptr, ipc_handle, cudaIpcMemLazyEnablePeerAccess));
std::cout << "Process 1: Imported memory at " << d_ptr << " with lazy P2P access" << std::endl;
// Query pointer attributes to see which device the runtime reports.
// This is the crux of the question: here attr.device comes back as 1.
cudaPointerAttributes attr;
CHECK_CUDA(cudaPointerGetAttributes(&attr, d_ptr));
std::cout << "Process 1: Pointer attributes:" << std::endl;
std::cout << " Device: " << attr.device << std::endl;
std::cout << " Type: " << (attr.type == cudaMemoryTypeDevice ? "Device" : "Host") << std::endl;
std::cout << " Device pointer: " << attr.devicePointer << std::endl;
std::cout << " Host pointer: " << attr.hostPointer << std::endl;
// Close the IPC memory handle (unmaps the imported allocation)
CHECK_CUDA(cudaIpcCloseMemHandle(d_ptr));
std::cout << "Process 1: Closed IPC memory handle" << std::endl;
// Wait for Enter key before exiting (keeps the process alive for inspection,
// e.g. with nvidia-smi; the parent blocks in waitpid until this returns)
std::cout << "Process 1: Press Enter to exit..." << std::endl;
std::cin.get();
close(pipe_fd[0]); // Close read end
}
return 0;
}
Compile it with just nvcc and run it; we can see:
Process 0 (parent) PID: 503164 using CUDA device 0
Process 1 (child) PID: 503165 using CUDA device 1
Process 0: Allocated 2147483648 bytes at 0x7f8e00000000
Process 0: Created IPC handle
Process 0: Sent IPC handle to child process
Process 1: Received IPC handle from parent process
Process 1: Imported memory at 0x7f8e00000000 with lazy P2P access
Process 1: Pointer attributes:
Device: 1
Type: Device
Device pointer: 0x7f8e00000000
Host pointer: 0
Process 1: Closed IPC memory handle
Process 1: Press Enter to exit...
Process 0: Freed memory and exiting
When process 1 gets the P2P-shared IPC memory from process 0, cudaPointerGetAttributes reports device 1, while the physical memory actually lives on process 0's device 0.
When I switch to the vmm API:
#include <cuda_runtime.h>
#include <cuda.h>
#include <iostream>
#include <cstring>
#include <unistd.h>
#include <sys/wait.h>
#include <sys/syscall.h>
#include <fcntl.h>
#include <errno.h>
// Check a CUDA Runtime API call: on any failure, print the source location and
// the human-readable error string, then terminate the process with exit(1).
#define CHECK_CUDA(call) \
do { \
cudaError_t err = call; \
if (err != cudaSuccess) { \
std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__ << " - " \
<< cudaGetErrorString(err) << std::endl; \
exit(1); \
} \
} while(0)
// Driver-API counterpart of CHECK_CUDA: checks a CUresult, resolves it to a
// string with cuGetErrorString, prints the source location, and exits.
#define CHECK_CU(call) \
do { \
CUresult err = call; \
if (err != CUDA_SUCCESS) { \
const char* err_str; \
cuGetErrorString(err, &err_str); \
std::cerr << "CUDA Driver error at " << __FILE__ << ":" << __LINE__ << " - " \
<< err_str << std::endl; \
exit(1); \
} \
} while(0)
// Define pidfd_open syscall if not available
#ifndef SYS_pidfd_open
#define SYS_pidfd_open 434
#endif
// Define pidfd_getfd syscall if not available
#ifndef SYS_pidfd_getfd
#define SYS_pidfd_getfd 438
#endif
// Wrapper function for pidfd_getfd syscall
// Thin wrapper for the pidfd_getfd(2) syscall (Linux 5.6+): duplicates file
// descriptor `targetfd` from the process referred to by `pidfd` into the
// calling process. Returns the new local fd, or -1 with errno set on failure.
static int pidfd_getfd(int pidfd, int targetfd, unsigned int flags) {
    long rc = syscall(SYS_pidfd_getfd, pidfd, targetfd, flags);
    return static_cast<int>(rc);
}
// Repro #2 — CUDA virtual memory management (VMM) path:
// The parent (device 0) creates a 2 GiB physical allocation with cuMemCreate,
// maps it, and exports it as a POSIX file descriptor via
// cuMemExportToShareableHandle. The child (device 1) steals that fd with
// pidfd_getfd, imports it with cuMemImportFromShareableHandle, maps it into
// its own VA range, and queries cudaPointerGetAttributes. Observed behavior
// (output below): attr.device == 0 — the device owning the physical memory —
// unlike the cudaIpcOpenMemHandle path in repro #1, which reported device 1.
int main(int argc, char** argv) {
// Create pipe for communication (parent sends {pid, fd, size} to child)
int pipe_fd[2];
if (pipe(pipe_fd) == -1) {
std::cerr << "Failed to create pipe" << std::endl;
return 1;
}
// Fork to create child process
pid_t pid = fork();
if (pid == -1) {
std::cerr << "Failed to fork" << std::endl;
close(pipe_fd[0]);
close(pipe_fd[1]);
return 1;
}
if (pid > 0) {
// Parent process (Process 0)
close(pipe_fd[0]); // Close read end
std::cout << "Process 0 (parent) PID: " << getpid() << std::endl;
// Set device 0 using runtime API (also initializes the primary context
// that the subsequent driver-API calls run in)
int device = 0;
CHECK_CUDA(cudaSetDevice(device));
std::cout << "Process 0: Using CUDA device " << device << std::endl;
// Reserve a virtual address range (no physical backing yet)
const unsigned long long size_bytes = 2ull * 1024 * 1024 * 1024; // 2 GiB
CUdeviceptr vmm_ptr;
CHECK_CU(cuMemAddressReserve(&vmm_ptr, size_bytes, 0, 0, 0));
std::cout << "Process 0: Reserved virtual memory at " << (void*)vmm_ptr << " (" << size_bytes << " bytes)" << std::endl;
// Describe the physical allocation: pinned device memory on device 0,
// exportable as a POSIX file descriptor
CUmemAllocationProp prop = {};
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
prop.location.id = device;
prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
prop.win32HandleMetaData = NULL;
// Create the physical memory allocation
CUmemGenericAllocationHandle mem_handle;
CHECK_CU(cuMemCreate(&mem_handle, size_bytes, &prop, 0));
std::cout << "Process 0: Created memory allocation handle" << std::endl;
// Map the physical allocation into the reserved virtual address range
CHECK_CU(cuMemMap(vmm_ptr, size_bytes, 0, mem_handle, 0));
std::cout << "Process 0: Mapped memory allocation to virtual address" << std::endl;
// Grant device 0 read/write access to the mapped range (mappings are
// inaccessible until access is explicitly set)
CUmemAccessDesc access_desc;
access_desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
access_desc.location.id = device;
access_desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
CHECK_CU(cuMemSetAccess(vmm_ptr, size_bytes, &access_desc, 1));
std::cout << "Process 0: Set access permissions for device 0" << std::endl;
// Export the allocation to an OS-shareable handle (a plain file descriptor)
int shareable_handle;
CHECK_CU(cuMemExportToShareableHandle(&shareable_handle, mem_handle,
CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, 0));
std::cout << "Process 0: Exported memory allocation to shareable handle " << shareable_handle << std::endl;
// Send our PID and the fd NUMBER (not the fd itself — the child clones it
// with pidfd_getfd) plus the size to the child through the pipe
struct {
pid_t pid;
int fd;
uint64_t size;
} info = {getpid(), shareable_handle, size_bytes};
if (write(pipe_fd[1], &info, sizeof(info)) != sizeof(info)) {
std::cerr << "Failed to send info to child process" << std::endl;
exit(1);
}
std::cout << "Process 0: Sent PID " << info.pid << ", FD " << info.fd << ", and size to child process" << std::endl;
// Wait for the child to finish BEFORE tearing down; the exported fd must
// stay open until the child has cloned and imported it
int status;
waitpid(pid, &status, 0);
// Clean up: unmap, release the physical allocation, free the VA range
CHECK_CU(cuMemUnmap(vmm_ptr, size_bytes));
CHECK_CU(cuMemRelease(mem_handle));
CHECK_CU(cuMemAddressFree(vmm_ptr, size_bytes));
std::cout << "Process 0: Freed virtual memory and exiting" << std::endl;
close(shareable_handle);
close(pipe_fd[1]);
} else {
// Child process (Process 1)
close(pipe_fd[1]); // Close write end
std::cout << "Process 1 (child) PID: " << getpid() << std::endl;
// Set device 1 using runtime API — a different device than the exporter's
int device = 1;
CHECK_CUDA(cudaSetDevice(device));
std::cout << "Process 1: Using CUDA device " << device << std::endl;
// Receive PID, file descriptor number, and size from parent process.
// NOTE(review): struct layout must match the parent's — it does here
// because both sides are compiled from this same source.
struct {
pid_t pid;
int fd;
uint64_t size;
} info;
ssize_t nread = read(pipe_fd[0], &info, sizeof(info));
if (nread != sizeof(info)) {
std::cerr << "Failed to receive info from parent process" << std::endl;
exit(1);
}
std::cout << "Process 1: Received PID " << info.pid << ", FD " << info.fd << ", size " << info.size << " bytes" << std::endl;
// Open a pidfd referring to the parent process (Linux 5.3+)
int pidfd = syscall(SYS_pidfd_open, info.pid, 0);
if (pidfd == -1) {
std::cerr << "Failed to open pidfd for parent process: " << strerror(errno) << std::endl;
exit(1);
}
std::cout << "Process 1: Opened pidfd " << pidfd << " for parent process" << std::endl;
// Clone the parent's exported fd into this process (Linux 5.6+; requires
// ptrace-level permission over the parent, e.g. same user)
int received_handle = pidfd_getfd(pidfd, info.fd, 0);
if (received_handle == -1) {
std::cerr << "Failed to get file descriptor from parent process: " << strerror(errno) << std::endl;
close(pidfd);
exit(1);
}
std::cout << "Process 1: Obtained file descriptor " << received_handle << " from parent process" << std::endl;
close(pidfd); // Close the pidfd as it's no longer needed
// Import the memory allocation from the shareable handle (the fd is
// passed by value, cast through void* as the API requires)
CUmemGenericAllocationHandle mem_handle;
CHECK_CU(cuMemImportFromShareableHandle(&mem_handle,
(void*)(uintptr_t)received_handle,
CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR));
std::cout << "Process 1: Imported memory allocation from shareable handle" << std::endl;
// Reserve a local virtual address range to map the imported memory into
CUdeviceptr vmm_ptr;
CHECK_CU(cuMemAddressReserve(&vmm_ptr, info.size, 0, 0, 0));
std::cout << "Process 1: Reserved virtual memory at " << (void*)vmm_ptr << " (" << info.size << " bytes)" << std::endl;
// Map the imported (device-0-resident) allocation into our VA range
CHECK_CU(cuMemMap(vmm_ptr, info.size, 0, mem_handle, 0));
std::cout << "Process 1: Mapped memory allocation to virtual address" << std::endl;
// Grant device 1 read/write access to the mapping (this is the P2P
// access: device 1 accessing physical memory that resides on device 0)
CUmemAccessDesc access_desc;
access_desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
access_desc.location.id = device;
access_desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
CHECK_CU(cuMemSetAccess(vmm_ptr, info.size, &access_desc, 1));
std::cout << "Process 1: Set access permissions for device 1" << std::endl;
// Query pointer attributes to see which device the runtime reports.
// This is the crux of the question: here attr.device comes back as 0.
cudaPointerAttributes attr;
CHECK_CUDA(cudaPointerGetAttributes(&attr, (void*)vmm_ptr));
std::cout << "Process 1: Pointer attributes:" << std::endl;
std::cout << " Device: " << attr.device << std::endl;
std::cout << " Type: " << (attr.type == cudaMemoryTypeDevice ? "Device" : "Host") << std::endl;
std::cout << " Device pointer: " << attr.devicePointer << std::endl;
std::cout << " Host pointer: " << attr.hostPointer << std::endl;
// Wait for Enter key before exiting (keeps the mapping alive for
// inspection; the parent blocks in waitpid until this returns)
std::cout << "Process 1: Press Enter to exit..." << std::endl;
std::cin.get();
// Unmap and free the virtual memory, release the imported handle
CHECK_CU(cuMemUnmap(vmm_ptr, info.size));
CHECK_CU(cuMemRelease(mem_handle));
CHECK_CU(cuMemAddressFree(vmm_ptr, info.size));
std::cout << "Process 1: Freed virtual memory" << std::endl;
close(received_handle);
close(pipe_fd[0]);
}
return 0;
}
Compile it with nvcc and run it; we get:
Process 0 (parent) PID: 506481
Process 1 (child) PID: 506482
Process 0: Using CUDA device 0
Process 1: Using CUDA device 1
Process 0: Reserved virtual memory at 0xa20000000 (2147483648 bytes)
Process 0: Created memory allocation handle
Process 0: Mapped memory allocation to virtual address
Process 0: Set access permissions for device 0
Process 0: Exported memory allocation to shareable handle 69
Process 0: Sent PID 506481, FD 69, and size to child process
Process 1: Received PID 506481, FD 69, size 2147483648 bytes
Process 1: Opened pidfd 69 for parent process
Process 1: Obtained file descriptor 70 from parent process
Process 1: Imported memory allocation from shareable handle
Process 1: Reserved virtual memory at 0xa20000000 (2147483648 bytes)
Process 1: Mapped memory allocation to virtual address
Process 1: Set access permissions for device 1
Process 1: Pointer attributes:
Device: 0
Type: Device
Device pointer: 0xa20000000
Host pointer: 0
Process 1: Press Enter to exit...
Process 1: Freed virtual memory
Process 0: Freed virtual memory and exiting
When process 1 gets the P2P-shared memory from process 0 using the VMM API, cudaPointerGetAttributes reports device 0, which is where the physical memory actually lives.
Does this mean there is a bug in cudaPointerGetAttributes for P2P-shared IPC memory opened via cudaIpcOpenMemHandle?