cudaHostRegister on mmapped memory, or how to get physically contiguous GPU memory

We are working on an application that will run on an NVIDIA Jetson board. Input data will come from a custom PCIe device with an FPGA. For better performance, we are planning on the PCIe device pushing data via DMA whenever it is ready, and raising an interrupt when the transfer is done. Transfers can be around 64 MB in size.
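
For reference, the host side of that handshake would look roughly like the sketch below: the FPGA DMAs a chunk into the buffer and raises an MSI, and the driver only has to count the completion and wake a reader. This is just an illustration; the structure and names (fpga_dev, done_count, fpga_dma_done_irq) are made up, not from our real driver.

#include <linux/atomic.h>
#include <linux/interrupt.h>
#include <linux/pci.h>
#include <linux/wait.h>

struct fpga_dev {
    struct pci_dev *pdev;
    wait_queue_head_t rd_wait;   /* readers sleep here until data arrives */
    atomic_t done_count;         /* transfers completed by the FPGA so far */
};

/* MSI handler: the FPGA has finished DMAing one ~64 MB chunk. */
static irqreturn_t fpga_dma_done_irq(int irq, void *data)
{
    struct fpga_dev *fd = data;

    atomic_inc(&fd->done_count);
    wake_up_interruptible(&fd->rd_wait);
    return IRQ_HANDLED;
}

static int fpga_setup_irq(struct fpga_dev *fd)
{
    int ret;

    init_waitqueue_head(&fd->rd_wait);
    atomic_set(&fd->done_count, 0);

    ret = pci_alloc_irq_vectors(fd->pdev, 1, 1, PCI_IRQ_MSI);
    if (ret < 0)
        return ret;
    return request_irq(pci_irq_vector(fd->pdev, 0), fpga_dma_done_irq, 0,
                       "fpga_dma", fd);
}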

My initial plan was to allocate a big 4 GB physically contiguous DMA buffer in the kernel, mmap this buffer into userspace, and have the PCIe device use it as a circular buffer (since the memory is contiguous, the device would only need the start address and size). I would then use cudaHostRegister so CUDA treats it as pinned memory, and launch my kernels on it. However, after testing, this does not seem to work: cudaHostRegister returns invalid argument with whichever flags I use. According to this forum post, this could be expected.
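
(For reference, whether cudaHostRegister can work at all can be queried through device attributes; the names below are from the CUDA runtime API, and cudaDevAttrHostRegisterSupported was only added around CUDA 9.2, so this needs a reasonably recent JetPack.)

// Quick probe: does device 0 support cudaHostRegister, and can a registered
// host pointer be passed directly to a kernel?
#include <stdio.h>
#include <cuda_runtime.h>

int main(void)
{
    int hostRegister = 0, samePointer = 0;

    cudaDeviceGetAttribute(&hostRegister, cudaDevAttrHostRegisterSupported, 0);
    cudaDeviceGetAttribute(&samePointer,
                           cudaDevAttrCanUseHostPointerForRegisteredMem, 0);
    printf("cudaHostRegister supported: %d\n", hostRegister);
    printf("registered host pointer usable on the GPU: %d\n", samePointer);
    return 0;
}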

This is the userspace application I used to test both the mmap and whether the mapping would be accessible to the GPU:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <stdint.h>
#include <sys/mman.h>
#include <unistd.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include <cuda_device_runtime_api.h>
#include <cuda_runtime_api.h>

#define FILE_SIZE (64 * 1024 * 1024) // 64 MB
#define PAGE_SIZE 4096

// Trivial test kernel: multiply each word of the mapped buffer by 3 on the GPU.
__global__ void test(uint32_t* in){
	in[threadIdx.x + blockIdx.x * blockDim.x] *= 3;
}

int main(int argc, char *argv[]) {
    if (argc < 3 || argc > 5) {
        fprintf(stderr, "Usage: %s <file> <r or w> <offset> [word]\n", argv[0]);
        exit(EXIT_FAILURE);
    }

    char *filename = argv[1];
    char mode = argv[2][0];
    off_t offset = atol(argv[3]);
    int word = 0;

    if (mode == 'w' && argc == 5) {
        word = atoi(argv[4]);
    }

    int fd = open(filename, O_RDWR);
    if (fd == -1) {
        perror("Error opening file");
        exit(EXIT_FAILURE);
    }

    if (offset < 0 || offset >= FILE_SIZE) {
        fprintf(stderr, "Invalid offset\n");
        close(fd);
        exit(EXIT_FAILURE);
    }

    void *addr = mmap(NULL, FILE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_LOCKED, fd, 0);
    if (addr == MAP_FAILED) {
        perror("Error mmapping the file");
        close(fd);
        exit(EXIT_FAILURE);
    }

    if (mode == 'r') {
        int value = *(int *)((char *)addr + offset);
        printf("Value at offset %ld: %d\n", offset, value);
    } else if (mode == 'w') {
        *(int *)((char *)addr + offset) = word;
        printf("Wrote %d to offset %ld\n", word, offset);
    } else {
        fprintf(stderr, "Invalid mode\n");
    }

    CUcontext localContext;
    CUdevice device;
    // cuInit() is required before any other driver API call; without it,
    // cuDeviceGet/cuCtxCreate fail with CUDA_ERROR_NOT_INITIALIZED.
    cuInit(0);
    cuDeviceGet(&device, 0);
    cuCtxCreate(&localContext, CU_CTX_MAP_HOST, device);

    cudaError_t error = cudaHostRegister(addr, FILE_SIZE, cudaHostRegisterMapped); // I also tried cudaHostRegisterIoMemory (returns not supported) and others
    if (error != cudaSuccess) {
        fprintf(stderr, "Error %d: %s: %s\n", error,
                cudaGetErrorName(error), cudaGetErrorString(error));
    }

    test<<<4, 32>>>((uint32_t *)addr);
    error = cudaDeviceSynchronize(); // surfaces any error from the kernel run
    if (error != cudaSuccess) {
        fprintf(stderr, "Kernel error: %s\n", cudaGetErrorString(error));
    }

    munmap(addr, FILE_SIZE);
    close(fd);

    return 0;
}

And this is the kernel module used; it really just allocates a buffer and exposes it through mmap:

#include <linux/module.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/cdev.h>
#include <linux/uaccess.h>
#include <linux/io.h> /* virt_to_phys() */

#define DEVICE_NAME "mmap_dev"
#define BUFFER_SIZE ((64 * 1024 * 1024) / 4096) /* 16 KB -- note this is much smaller than the 64 MB FILE_SIZE mapped by the userspace test */

static dev_t dev = 0;

static void *buffer;

static struct cdev mycdev;

int mymap_open(struct inode *inode, struct file *filp){
    return 0;
}

static int device_mmap(struct file *filp, struct vm_area_struct *vma)
{
    /* Map the physically contiguous kernel buffer into the caller's address
     * space. Note: this maps whatever length the caller asked for, so it
     * trusts userspace not to map beyond the actual allocation. */
    return remap_pfn_range(vma, vma->vm_start,
                           virt_to_phys(buffer) >> PAGE_SHIFT,
                           vma->vm_end - vma->vm_start,
                           vma->vm_page_prot);
}

static const struct file_operations fops = {
    .owner = THIS_MODULE,
    .open = mymap_open,
    .mmap = device_mmap,
};

static int __init mymmap_init(void)
{
    int result;
    int i;
    u32 *tmp;

    result = alloc_chrdev_region(&dev, 0, 256, "arthur_module");
    if (result) {
        printk("Error getting device numbers\n");
        return result;
    }
    printk("Major returned: %d, Minor returned: %d\n", MAJOR(dev), MINOR(dev));

    /* One physically contiguous allocation. Note that __get_free_pages is
     * capped by MAX_ORDER (a few MB), so this approach cannot scale to the
     * buffer sizes we ultimately need. */
    buffer = (void *)__get_free_pages(GFP_KERNEL, get_order(BUFFER_SIZE));
    if (!buffer) {
        pr_err("unable to allocate buffer\n");
        unregister_chrdev_region(dev, 256);
        return -ENOMEM;
    }

    /* Seed the start of the buffer so the userspace test can verify the mapping. */
    tmp = buffer;
    for (i = 0; i < 100; i++)
        tmp[i] = i * 2 + 3;

    cdev_init(&mycdev, &fops);
    mycdev.owner = THIS_MODULE;

    result = cdev_add(&mycdev, dev, 1);
    if (result) {
        free_pages((unsigned long)buffer, get_order(BUFFER_SIZE));
        unregister_chrdev_region(dev, 256);
        return result;
    }

    return 0;
}

static void __exit mmap_exit(void)
{
    cdev_del(&mycdev);
    free_pages((unsigned long)buffer, get_order(BUFFER_SIZE));
    unregister_chrdev_region(dev, 256);
}

module_init(mymmap_init);
module_exit(mmap_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Your Name");
MODULE_DESCRIPTION("A simple mmap kernel module");
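
A side note on the allocation above: __get_free_pages is capped by MAX_ORDER (a few MB per allocation), so it can never hand out the hundreds of megabytes we actually need. My assumption is that a buffer that large would instead come from a CMA-backed dma_alloc_coherent against the PCIe device, roughly like this (sketch only; it assumes a sufficiently large CMA region has been reserved via the device tree or the cma= kernel parameter):

#include <linux/dma-mapping.h>
#include <linux/gfp.h>
#include <linux/pci.h>

#define DMA_BUF_SIZE (256UL * 1024 * 1024)  /* 256 MB, our stated minimum */

static void *cpu_addr;                      /* kernel virtual address */
static dma_addr_t bus_addr;                 /* address the FPGA would DMA to */

static int alloc_big_buffer(struct pci_dev *pdev)
{
    int ret = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));

    if (ret)
        return ret;

    cpu_addr = dma_alloc_coherent(&pdev->dev, DMA_BUF_SIZE, &bus_addr,
                                  GFP_KERNEL);
    if (!cpu_addr)
        return -ENOMEM;

    /* bus_addr and DMA_BUF_SIZE are all the FPGA needs for a circular buffer. */
    return 0;
}

Userspace mapping of such a buffer would then presumably go through dma_mmap_coherent() rather than remap_pfn_range().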

As suggested there, I should use GPUDirect RDMA, but the FPGA logic for sending the data would be much simpler if I could get physically contiguous memory when allocating CUDA memory. Otherwise, when calling nvidia_p2p_get_pages for a 4 GB buffer with 16 KB pages, I expect to get a list of more than 200k pages (4 GB / 16 KB ≈ 262k entries), which might require considerable FPGA resources to map. Even the minimum size we need, about 256 MB, would need 16 thousand pages, which could still be cumbersome.
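
For illustration, this is roughly how I understand the pinning step would look on the kernel side, using the Jetson flavour of the GPUDirect RDMA API. The prototypes below are from my reading of nv-p2p.h on L4T, so please double-check them against your JetPack (the desktop-GPU API takes additional token/VA-space arguments):

#include <linux/kernel.h>
#include <nv-p2p.h>                 /* GPUDirect RDMA API shipped with L4T */

static struct nvidia_p2p_page_table *page_table;

/* Called by the driver if the GPU allocation is freed while still pinned. */
static void buffer_free_cb(void *data)
{
    nvidia_p2p_free_page_table(page_table);
}

/* gpu_vaddr would be a cudaMalloc'd address handed to us via an ioctl. */
static int pin_gpu_buffer(u64 gpu_vaddr, u64 size)
{
    int ret = nvidia_p2p_get_pages(gpu_vaddr, size, &page_table,
                                   buffer_free_cb, NULL);
    if (ret)
        return ret;

    /* This is the descriptor list the FPGA would have to walk:
     * ~262k entries for 4 GB of 16 KB pages, ~16k entries for 256 MB. */
    pr_info("pinned GPU buffer in %u pages\n", page_table->entries);
    return 0;
}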

Is there any alternative? Can I get cudaHostRegister to work on Jetson somehow? Can I get physically contiguous GPU memory? Any help is highly appreciated.