Inconsistent behavior of cudaPointerGetAttributes between cudaMalloc-based IPC and VMM-based IPC

According to the documentation CUDA Runtime API :: CUDA Toolkit Documentation :

If ptr has memory type cudaMemoryTypeDevice then this identifies the device on which the memory referred to by ptr physically resides.

However, when I use peer-to-peer mapping to map memory on GPU 0 into GPU 1, I find that cudaPointerGetAttributes reports device 0 for the VMM-based API, but device 1 for the normal cudaMalloc-based API.

The reproducible examples are:

#include <cuda_runtime.h>
#include <cuda.h>
#include <iostream>
#include <cstring>
#include <unistd.h>
#include <sys/wait.h>

// Abort the process with a file:line diagnostic and the runtime error
// string whenever a CUDA runtime API call does not return cudaSuccess.
#define CHECK_CUDA(call) \
    do { \
        cudaError_t err = call; \
        if (err != cudaSuccess) { \
            std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__ << " - " \
                      << cudaGetErrorString(err) << std::endl; \
            exit(1); \
        } \
    } while(0)

// Reproducer (cudaMalloc/IPC variant): the parent allocates 2 GiB on
// device 0 with cudaMalloc and exports a cudaIpcMemHandle_t; the child
// imports it on device 1 and queries cudaPointerGetAttributes, which
// reports the importing device (1) rather than the device that owns the
// physical memory (0).
int main(int argc, char** argv) {
    // Pipe carries the binary IPC handle from parent to child.
    int pipe_fd[2];
    if (pipe(pipe_fd) == -1) {
        std::cerr << "Failed to create pipe" << std::endl;
        return 1;
    }
    
    // Fork to create child process
    pid_t pid = fork();
    
    if (pid == -1) {
        std::cerr << "Failed to fork" << std::endl;
        // Fix: close both pipe ends on this error path so the descriptors
        // do not leak (consistent with the VMM example's fork handling).
        close(pipe_fd[0]);
        close(pipe_fd[1]);
        return 1;
    }
    
    if (pid > 0) {
        // Parent process (Process 0)
        close(pipe_fd[0]); // Close read end
        
        // Set device 0
        int device = 0;
        CHECK_CUDA(cudaSetDevice(device));
        
        std::cout << "Process 0 (parent) PID: " << getpid() << " using CUDA device " << device << std::endl;
        
        // Allocate memory and create IPC handle
        void* d_ptr;
        const unsigned long long size_bytes = 2ull * 1024 * 1024 * 1024; // 2 GiB
        
        CHECK_CUDA(cudaMalloc(&d_ptr, size_bytes));
        std::cout << "Process 0: Allocated " << size_bytes << " bytes at " << d_ptr << std::endl;
        
        // Create IPC memory handle
        cudaIpcMemHandle_t ipc_handle;
        CHECK_CUDA(cudaIpcGetMemHandle(&ipc_handle, d_ptr));
        std::cout << "Process 0: Created IPC handle" << std::endl;
        
        // Send the handle through the pipe. sizeof(cudaIpcMemHandle_t) is
        // far below PIPE_BUF, so a single write/read pair is sufficient.
        ssize_t written = write(pipe_fd[1], &ipc_handle, sizeof(cudaIpcMemHandle_t));
        if (written != sizeof(cudaIpcMemHandle_t)) {
            std::cerr << "Failed to write IPC handle to pipe" << std::endl;
            exit(1);
        }
        std::cout << "Process 0: Sent IPC handle to child process" << std::endl;
        
        // Wait for the child before freeing: the allocation must stay
        // alive while the child still has the IPC mapping open.
        int status;
        waitpid(pid, &status, 0);
        
        // Clean up
        CHECK_CUDA(cudaFree(d_ptr));
        std::cout << "Process 0: Freed memory and exiting" << std::endl;
        
        close(pipe_fd[1]); // Close write end
        
    } else {
        // Child process (Process 1)
        close(pipe_fd[1]); // Close write end
        
        // Set device 1
        int device = 1;
        CHECK_CUDA(cudaSetDevice(device));
        
        std::cout << "Process 1 (child) PID: " << getpid() << " using CUDA device " << device << std::endl;
        
        // Receive IPC handle from parent process through pipe
        cudaIpcMemHandle_t ipc_handle;
        ssize_t read_bytes = read(pipe_fd[0], &ipc_handle, sizeof(cudaIpcMemHandle_t));
        if (read_bytes != sizeof(cudaIpcMemHandle_t)) {
            std::cerr << "Failed to read IPC handle from pipe" << std::endl;
            exit(1);
        }
        std::cout << "Process 1: Received IPC handle from parent process" << std::endl;
        
        // Import the parent's allocation into this process; peer access
        // from device 1 to the device-0 memory is enabled lazily.
        void* d_ptr;
        CHECK_CUDA(cudaIpcOpenMemHandle(&d_ptr, ipc_handle, cudaIpcMemLazyEnablePeerAccess));
        std::cout << "Process 1: Imported memory at " << d_ptr << " with lazy P2P access" << std::endl;
        
        // Query pointer attributes to see which device it lives on.
        // This is the point of the report: attr.device comes back as 1
        // even though the physical pages reside on device 0.
        cudaPointerAttributes attr;
        CHECK_CUDA(cudaPointerGetAttributes(&attr, d_ptr));
        
        std::cout << "Process 1: Pointer attributes:" << std::endl;
        std::cout << "  Device: " << attr.device << std::endl;
        std::cout << "  Type: " << (attr.type == cudaMemoryTypeDevice ? "Device" : "Host") << std::endl;
        std::cout << "  Device pointer: " << attr.devicePointer << std::endl;
        std::cout << "  Host pointer: " << attr.hostPointer << std::endl;
        
        // Close the IPC memory handle
        CHECK_CUDA(cudaIpcCloseMemHandle(d_ptr));
        std::cout << "Process 1: Closed IPC memory handle" << std::endl;
        
        // Pause so nvidia-smi can be inspected in another terminal to
        // confirm where the 2 GiB physically lives.
        std::cout << "Process 1: Press Enter to exit..." << std::endl;
        std::cin.get();
        
        close(pipe_fd[0]); // Close read end
    }
    
    return 0;
}

Compile with just nvcc, and run it, we can see:

Process 0 (parent) PID: 503164 using CUDA device 0
Process 1 (child) PID: 503165 using CUDA device 1
Process 0: Allocated 2147483648 bytes at 0x7f8e00000000
Process 0: Created IPC handle
Process 0: Sent IPC handle to child process
Process 1: Received IPC handle from parent process
Process 1: Imported memory at 0x7f8e00000000 with lazy P2P access
Process 1: Pointer attributes:
  Device: 1
  Type: Device
  Device pointer: 0x7f8e00000000
  Host pointer: 0
Process 1: Closed IPC memory handle
Process 1: Press Enter to exit...

Process 0: Freed memory and exiting

When process 1 gets the P2P-shared IPC memory from process 0, cudaPointerGetAttributes reports device 1, even though the physical memory actually resides on process 0’s device 0.

When I switch to the vmm API:

#include <cuda_runtime.h>
#include <cuda.h>
#include <iostream>
#include <cstring>
#include <unistd.h>
#include <sys/wait.h>
#include <sys/syscall.h>
#include <fcntl.h>
#include <errno.h>

// Abort the process with a file:line diagnostic and the runtime error
// string whenever a CUDA runtime API call does not return cudaSuccess.
#define CHECK_CUDA(call) \
    do { \
        cudaError_t err = call; \
        if (err != cudaSuccess) { \
            std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__ << " - " \
                      << cudaGetErrorString(err) << std::endl; \
            exit(1); \
        } \
    } while(0)

// Abort the process with a file:line diagnostic and the driver error
// string whenever a CUDA driver API call does not return CUDA_SUCCESS.
#define CHECK_CU(call) \
    do { \
        CUresult err = call; \
        if (err != CUDA_SUCCESS) { \
            const char* err_str; \
            cuGetErrorString(err, &err_str); \
            std::cerr << "CUDA Driver error at " << __FILE__ << ":" << __LINE__ << " - " \
                      << err_str << std::endl; \
            exit(1); \
        } \
    } while(0)

// Fallback syscall numbers for toolchains whose headers predate
// pidfd_open (Linux 5.3) and pidfd_getfd (Linux 5.9).
#ifndef SYS_pidfd_open
#define SYS_pidfd_open 434
#endif

#ifndef SYS_pidfd_getfd
#define SYS_pidfd_getfd 438
#endif

// Thin wrapper over the raw pidfd_getfd(2) syscall: duplicates file
// descriptor `remote_fd` of the process referred to by `pidfd` into the
// calling process. Returns the new local fd, or -1 with errno set.
static int pidfd_getfd(int pidfd, int remote_fd, unsigned int flags) {
    long rc = syscall(SYS_pidfd_getfd, pidfd, remote_fd, flags);
    return static_cast<int>(rc);
}

// Reproducer (VMM variant): the parent creates physical memory on
// device 0 with cuMemCreate and exports it as a POSIX file descriptor;
// the child grabs that fd via pidfd_getfd, maps it, and queries
// cudaPointerGetAttributes, which here reports the owning device (0) —
// unlike the cudaIpcOpenMemHandle variant.
int main(int argc, char** argv) {
    // Pipe carries {pid, fd number, size}. Only the parent-side fd
    // *number* is sent; the child later materializes a local copy of the
    // descriptor with pidfd_getfd.
    int pipe_fd[2];
    if (pipe(pipe_fd) == -1) {
        std::cerr << "Failed to create pipe" << std::endl;
        return 1;
    }
    
    // Fork to create child process
    pid_t pid = fork();
    
    if (pid == -1) {
        std::cerr << "Failed to fork" << std::endl;
        close(pipe_fd[0]);
        close(pipe_fd[1]);
        return 1;
    }
    
    if (pid > 0) {
        // Parent process (Process 0)
        close(pipe_fd[0]); // Close read end
        
        std::cout << "Process 0 (parent) PID: " << getpid() << std::endl;
        
        // Set device 0 using runtime API
        int device = 0;
        CHECK_CUDA(cudaSetDevice(device));
        std::cout << "Process 0: Using CUDA device " << device << std::endl;
        
        // Reserve a virtual address range only — no physical backing yet.
        // NOTE(review): size should be a multiple of the value returned by
        // cuMemGetAllocationGranularity; 2 GiB presumably satisfies this
        // on the reporter's system — confirm before reusing elsewhere.
        const unsigned long long size_bytes = 2ull * 1024 * 1024 * 1024; // 2 GiB
        CUdeviceptr vmm_ptr;
        
        CHECK_CU(cuMemAddressReserve(&vmm_ptr, size_bytes, 0, 0, 0));
        std::cout << "Process 0: Reserved virtual memory at " << (void*)vmm_ptr << " (" << size_bytes << " bytes)" << std::endl;
        
        // Physical allocation pinned on device 0, exportable as a POSIX fd.
        CUmemAllocationProp prop = {};
        prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
        prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
        prop.location.id = device;
        prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
        prop.win32HandleMetaData = NULL;
        
        // Create memory allocation
        CUmemGenericAllocationHandle mem_handle;
        CHECK_CU(cuMemCreate(&mem_handle, size_bytes, &prop, 0));
        std::cout << "Process 0: Created memory allocation handle" << std::endl;
        
        // Map the memory allocation to virtual address
        CHECK_CU(cuMemMap(vmm_ptr, size_bytes, 0, mem_handle, 0));
        std::cout << "Process 0: Mapped memory allocation to virtual address" << std::endl;
        
        // A mapping is inaccessible until access flags are set explicitly.
        CUmemAccessDesc access_desc;
        access_desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
        access_desc.location.id = device;
        access_desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
        
        CHECK_CU(cuMemSetAccess(vmm_ptr, size_bytes, &access_desc, 1));
        std::cout << "Process 0: Set access permissions for device 0" << std::endl;
        
        // Export to an fd. The fd is only meaningful inside this process;
        // it must stay open until the child has duplicated it.
        int shareable_handle;
        CHECK_CU(cuMemExportToShareableHandle(&shareable_handle, mem_handle, 
                                             CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, 0));
        std::cout << "Process 0: Exported memory allocation to shareable handle " << shareable_handle << std::endl;
        
        // Send PID and file descriptor number to child process through pipe
        struct {
            pid_t pid;
            int fd;
            uint64_t size;
        } info = {getpid(), shareable_handle, size_bytes};
        
        if (write(pipe_fd[1], &info, sizeof(info)) != sizeof(info)) {
            std::cerr << "Failed to send info to child process" << std::endl;
            exit(1);
        }
        std::cout << "Process 0: Sent PID " << info.pid << ", FD " << info.fd << ", and size to child process" << std::endl;

        // Wait for child process to finish
        int status;
        waitpid(pid, &status, 0);
        
        // Teardown in reverse order: unmap, release physical, free VA range.
        CHECK_CU(cuMemUnmap(vmm_ptr, size_bytes));
        CHECK_CU(cuMemRelease(mem_handle));
        CHECK_CU(cuMemAddressFree(vmm_ptr, size_bytes));
        std::cout << "Process 0: Freed virtual memory and exiting" << std::endl;
        
        close(shareable_handle);
        close(pipe_fd[1]);
        
    } else {
        // Child process (Process 1)
        close(pipe_fd[1]); // Close write end
        
        std::cout << "Process 1 (child) PID: " << getpid() << std::endl;
        
        // Set device 1 using runtime API
        int device = 1;
        CHECK_CUDA(cudaSetDevice(device));
        std::cout << "Process 1: Using CUDA device " << device << std::endl;
        
        // Receive PID, file descriptor number, and size from parent process through pipe
        struct {
            pid_t pid;
            int fd;
            uint64_t size;
        } info;
        
        ssize_t nread = read(pipe_fd[0], &info, sizeof(info));
        if (nread != sizeof(info)) {
            std::cerr << "Failed to receive info from parent process" << std::endl;
            exit(1);
        }
        std::cout << "Process 1: Received PID " << info.pid << ", FD " << info.fd << ", size " << info.size << " bytes" << std::endl;
        
        // Open a pidfd referring to the parent; this is the anchor that
        // lets us duplicate one of the parent's descriptors.
        int pidfd = syscall(SYS_pidfd_open, info.pid, 0);
        if (pidfd == -1) {
            std::cerr << "Failed to open pidfd for parent process: " << strerror(errno) << std::endl;
            exit(1);
        }
        std::cout << "Process 1: Opened pidfd " << pidfd << " for parent process" << std::endl;
        
        // Duplicate the parent's exported fd into this process (requires
        // ptrace-level permission over the parent, e.g. same user).
        int received_handle = pidfd_getfd(pidfd, info.fd, 0);
        if (received_handle == -1) {
            std::cerr << "Failed to get file descriptor from parent process: " << strerror(errno) << std::endl;
            close(pidfd);
            exit(1);
        }
        std::cout << "Process 1: Obtained file descriptor " << received_handle << " from parent process" << std::endl;
        
        close(pidfd); // Close the pidfd as it's no longer needed
        
        // Import the allocation. The fd is passed cast to void* because the
        // API takes an opaque OS handle; it is closed explicitly below.
        CUmemGenericAllocationHandle mem_handle;
        CHECK_CU(cuMemImportFromShareableHandle(&mem_handle, 
                                               (void*)(uintptr_t)received_handle,
                                               CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR));
        std::cout << "Process 1: Imported memory allocation from shareable handle" << std::endl;
        
        // Reserve virtual memory
        CUdeviceptr vmm_ptr;
        CHECK_CU(cuMemAddressReserve(&vmm_ptr, info.size, 0, 0, 0));
        std::cout << "Process 1: Reserved virtual memory at " << (void*)vmm_ptr << " (" << info.size << " bytes)" << std::endl;
        
        // Map the memory allocation to virtual address
        CHECK_CU(cuMemMap(vmm_ptr, info.size, 0, mem_handle, 0));
        std::cout << "Process 1: Mapped memory allocation to virtual address" << std::endl;
        
        // Grant device 1 read/write access to the imported mapping.
        CUmemAccessDesc access_desc;
        access_desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
        access_desc.location.id = device;
        access_desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
        
        CHECK_CU(cuMemSetAccess(vmm_ptr, info.size, &access_desc, 1));
        std::cout << "Process 1: Set access permissions for device 1" << std::endl;
        
        // This is the point of the report: attr.device comes back as 0
        // (the device owning the physical pages), unlike the cudaMalloc
        // IPC variant which reports the importing device.
        cudaPointerAttributes attr;
        CHECK_CUDA(cudaPointerGetAttributes(&attr, (void*)vmm_ptr));
        
        std::cout << "Process 1: Pointer attributes:" << std::endl;
        std::cout << "  Device: " << attr.device << std::endl;
        std::cout << "  Type: " << (attr.type == cudaMemoryTypeDevice ? "Device" : "Host") << std::endl;
        std::cout << "  Device pointer: " << attr.devicePointer << std::endl;
        std::cout << "  Host pointer: " << attr.hostPointer << std::endl;
        
        // Pause so nvidia-smi can be inspected in another terminal to
        // confirm where the 2 GiB physically lives.
        std::cout << "Process 1: Press Enter to exit..." << std::endl;
        std::cin.get();

        // Teardown in reverse order: unmap, release physical, free VA range.
        CHECK_CU(cuMemUnmap(vmm_ptr, info.size));
        CHECK_CU(cuMemRelease(mem_handle));
        CHECK_CU(cuMemAddressFree(vmm_ptr, info.size));
        std::cout << "Process 1: Freed virtual memory" << std::endl;
        
        close(received_handle);
        close(pipe_fd[0]);
    }
    
    return 0;
}

Compile with nvcc, and run it, we get:

Process 0 (parent) PID: 506481
Process 1 (child) PID: 506482
Process 0: Using CUDA device 0
Process 1: Using CUDA device 1
Process 0: Reserved virtual memory at 0xa20000000 (2147483648 bytes)
Process 0: Created memory allocation handle
Process 0: Mapped memory allocation to virtual address
Process 0: Set access permissions for device 0
Process 0: Exported memory allocation to shareable handle 69
Process 0: Sent PID 506481, FD 69, and size to child process
Process 1: Received PID 506481, FD 69, size 2147483648 bytes
Process 1: Opened pidfd 69 for parent process
Process 1: Obtained file descriptor 70 from parent process
Process 1: Imported memory allocation from shareable handle
Process 1: Reserved virtual memory at 0xa20000000 (2147483648 bytes)
Process 1: Mapped memory allocation to virtual address
Process 1: Set access permissions for device 1
Process 1: Pointer attributes:
  Device: 0
  Type: Device
  Device pointer: 0xa20000000
  Host pointer: 0
Process 1: Press Enter to exit...

Process 1: Freed virtual memory
Process 0: Freed virtual memory and exiting

When process 1 gets the P2P-shared IPC memory from process 0 using the VMM API, cudaPointerGetAttributes reports device 0, which is where the physical memory actually resides.

Does this mean there is a bug in cudaPointerGetAttributes for P2P-shared IPC memory opened via cudaIpcOpenMemHandle?

My driver version: 570.133.20
OS: Ubuntu 22.04.5 LTS
Machine: DGX H100 node

In both code snippets, I put this std::cin.get(); explicitly to wait for user input, so that we can switch to another terminal to see the output of nvidia-smi, and to confirm that the physical memory (2 GiB) indeed lives in device 0.

Thanks for reporting this issue, this would be a bug in the CUDA driver. We will address this in the next release.

Hi @vramesh1, thanks for confirming it! Did you manage to reproduce it? Is it only the 570 driver, or does it affect many driver versions? I’m going to submit another patch to PyTorch, because this bug affects some PyTorch usage, and I’d like to know more about the coverage and significance.

We see this in the internal driver as well. So it is not just 570. I will try to post an update here once we know which version of the driver will have the fix.

1 Like

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.