How to correctly export and import CUDA memory pool?

Hi there,

I am trying to share some memory pool between processes, but I always get double free error when I try to import. I have not allocated any memory, let alone free it. So I am confused.

I have two programs. One of my programs calls the following function to create and export a pool, and then send the exported file descriptor to the other program.

void send_pool(int src_device, int dst_device, int sock) {
    // I have 4 GPUs; enable P2P access between GPUs
    cudaSetDevice(dst_device);
    cudaDeviceEnablePeerAccess(src_device, 0);
    
    cudaMemPool_t pool;

    cudaMemPoolProps poolProps = {};
    poolProps.allocType = cudaMemAllocationTypePinned;
    poolProps.handleTypes = cudaMemHandleTypePosixFileDescriptor;
    poolProps.location.type = cudaMemLocationTypeDevice;
    poolProps.location.id = dst_device;
    cudaMemPoolCreate(&pool, &poolProps);

    // Export pool to file descriptor
    int fd;
    cudaMemAllocationHandleType handleType = cudaMemHandleTypePosixFileDescriptor;
    cudaMemPoolExportToShareableHandle(&fd, pool, handleType, 0);

    // send the file descriptor via socket
    send_fd(sock, fd);
}

The other program executes the following function to wait until receiving the descriptor, and then it will import it as a pool.

void receive_pool(int dst_device, int sock) {
    cudaSetDevice(dst_device);

    int fd;
    // receive the file descriptor via socket
    receive_fd(sock, &fd);

    // import the pool
    cudaMemPool_t importPool;
    cudaMemPoolImportFromShareableHandle(&importPool, &fd, cudaMemHandleTypePosixFileDescriptor, 0);
}

But when I run these two programs, I always encounter the following error in the receiver process.

free(): double free detected in tcache 2

I believe this error happens immediately after I invoke the cudaMemPoolImportFromShareableHandle API. Am I using the APIs in a wrong way?

P.S. My environment has 4 Tesla V100 GPUs, and I am using CUDA 11.8 in Ubuntu 20.04.

Many thanks.

I don’t have a solution to your problem, but as a first step I would suggest adding proper CUDA API error checking.

Well, I did have error checks in my actual code. I just removed them to make my post look more concise.

Here is my actual code:

void send_pool(int src_device, int dst_device, int sock) {
    cudaError_t err = cudaSetDevice(dst_device);
    if (err != cudaSuccess) {
        std::cout << "CUDA SetDevice Error: " << cudaGetErrorString(err)<<std::endl;
    }
    err = cudaDeviceEnablePeerAccess(src_device, 0);
    if (err != cudaSuccess) {
        std::cout << "CUDA PeerAccess Error: " << cudaGetErrorString(err)<<std::endl;
    }

    cudaMemPool_t pool;

    cudaMemPoolProps poolProps = {};
    poolProps.allocType = cudaMemAllocationTypePinned;
    poolProps.handleTypes = cudaMemHandleTypePosixFileDescriptor;
    poolProps.location.type = cudaMemLocationTypeDevice;
    poolProps.location.id = dst_device;
    err = cudaMemPoolCreate(&pool, &poolProps);
    if (err != cudaSuccess) {
        std::cout << "CUDA PoolCreate Error: " << cudaGetErrorString(err) << std::endl;
    }

    int fd;
    cudaMemAllocationHandleType handleType = cudaMemHandleTypePosixFileDescriptor;
    err = cudaMemPoolExportToShareableHandle(&fd, pool, handleType, 0);
    if (err != cudaSuccess) {
        std::cout << "CUDA PoolExport Error: " << cudaGetErrorString(err) << std::endl;
    }

    send_fd(sock, fd);
}

void receive_pool(int sock) {
    int fd;
    receive_fd(sock, &fd);

    cudaMemPool_t importPool;
    cudaError_t err = cudaMemPoolImportFromShareableHandle(&importPool, &fd, cudaMemHandleTypePosixFileDescriptor, 0);
    if (err != cudaSuccess) {
        std::cout << "CUDA PoolImport Error: " << cudaGetErrorString(err) << std::endl;
    }
}

void send_fd(int socket, int fd) {
    struct msghdr msg = { nullptr };
    char buf[CMSG_SPACE(sizeof(fd))];
    memset(buf, '\0', sizeof(buf));
    struct iovec io = { .iov_base = (void *) "", .iov_len = 1 };
    msg.msg_iov = &io;
    msg.msg_iovlen = 1;
    msg.msg_control = buf;
    msg.msg_controllen = sizeof(buf);
    struct cmsghdr * cmsg = CMSG_FIRSTHDR(&msg);
    cmsg->cmsg_level = SOL_SOCKET;
    cmsg->cmsg_type = SCM_RIGHTS;
    cmsg->cmsg_len = CMSG_LEN(sizeof(fd));
    memmove(CMSG_DATA(cmsg), &fd, sizeof(fd));
    msg.msg_controllen = CMSG_SPACE(sizeof(fd));
    if (sendmsg(socket, &msg, 0) < 0)
        printf("Failed to send message\n");
}

void receive_fd(int socket, int *fd) {
    struct msghdr msg = {nullptr};
    char m_buffer[1];
    struct iovec io = { .iov_base = m_buffer, .iov_len = sizeof(m_buffer) };
    msg.msg_iov = &io;
    msg.msg_iovlen = 1;
    char c_buffer[256];
    msg.msg_control = c_buffer;
    msg.msg_controllen = sizeof(c_buffer);
    if (recvmsg(socket, &msg, 0) < 0)
        printf("Failed to receive message\n");
    struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
    memmove(fd, CMSG_DATA(cmsg), sizeof(*fd));
}

// Sender main
int main() {
    int sock = socket(AF_UNIX, SOCK_DGRAM, 0);
    struct sockaddr_un un{ .sun_family = AF_UNIX };
    strcpy(un.sun_path, "my_process");

    if (connect(sock, (struct sockaddr*)&un, sizeof(un)) < 0) {
        printf("connect failed\n");
        return 1;
    }
    send_pool(SRC_DEVICE, DST_DEVICE, sock);
}

// Receiver main
int main() {
    cudaSetDevice(DST_DEVICE);
    // Create socket
    int sock = socket(AF_UNIX, SOCK_DGRAM, 0);
    struct sockaddr_un un{ .sun_family = AF_UNIX };
    unlink("my_process");
    strcpy(un.sun_path, "my_process");
    if (bind(sock, (struct sockaddr*)&un, sizeof(un)) < 0) {
        printf("bind failed\n");
        return 1;
    }

    receive_pool(sock);
}

During execution, I did not encounter any errors other than the double free mentioned before. So I believe the issue is exactly about memory pool export/import.

Thanks for your reply anyway.

You could compile with debug symbols and use a tool like valgrind to detect host memory errors.

Does it work when you prevent the sender from exiting? The way your code is currently written, the sender could terminate, invalidating the pool before the reciever is able to import it.