cudaHostRegister(..., ..., cudaHostRegisterIoMemory) for PCIe device BAR0 return code=801(cudaErrorNotSupported) on Jetson Xavier

Hi,
I have a problem registering memory that is mmapped into userspace with CUDA. The framework reports that cudaHostRegister(…, …, cudaHostRegisterIoMemory) is unsupported on Xavier, but the cuda-for-tegra-appnote states that the cudaHostRegister routine is supported.

Detailed explanation:
I’m experimenting with GPU access to an XTRX SDR PCIe board.
I want to register BAR0, mmapped into userspace, with CUDA so that I can read from it inside a CUDA kernel.
Driver xtrx.c:

static int xtrx_probe(struct pci_dev *pdev,
const struct pci_device_id *id){ //init device

err = pci_request_regions(pdev, DRV_NAME);

bar0_addr = pci_iomap(pdev, 0, 1 << 12);// corresponding virtual address in kernelspace is valid and usable

}

static int xtrxfd_mmap(struct file *filp, struct vm_area_struct *vma){ //mmap handler

vma->vm_flags |= (VM_IO | VM_DONTEXPAND | VM_DONTDUMP);
vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
pfn = pci_resource_start(xtrxdev->pdev, bar) >> PAGE_SHIFT;
if (io_remap_pfn_range(vma, vma->vm_start,
pfn,
vma->vm_end - vma->vm_start,
vma->vm_page_prot)
) return -EAGAIN;

OR (same result)
vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
res = vm_iomap_memory(vma, pci_resource_start(xtrxdev->pdev, bar), vma->vm_end - vma->vm_start);

}

Test program:


int main(int argc, char** argv){

device = findCudaDevice(argc, (const char**)argv);
checkCudaErrors(cudaSetDeviceFlags(cudaDeviceMapHost));

unsigned int * a_UA;
unsigned int * a;
const int GP_PORT_RD_HWCFG = 26;
void* mem; unsigned int * vmem,*pmem;
char * ldev="/dev/xtrx0";
enum xtrxll_mmaps {
XTRXLL_MMAP_CONFREGS_OFF = 0,
XTRXLL_MMAP_CONFREGS_LEN = 4096,
};
int fd = open(ldev, O_RDWR); // open device
//Test cudaHostRegisterMapped flag -
a_UA = (unsigned int *) malloc(XTRXLL_MMAP_CONFREGS_LEN + MEMORY_ALIGNMENT);
a = (unsigned int *) ALIGN_UP(a_UA, MEMORY_ALIGNMENT);
checkCudaErrors(cudaHostRegister(a, XTRXLL_MMAP_CONFREGS_LEN, cudaHostRegisterMapped)); // OK. cudaHostRegisterMapped supported

//Test cudaHostRegisterIoMemory
mem = mmap(0, XTRXLL_MMAP_CONFREGS_LEN, PROT_READ | PROT_WRITE,
MAP_PRIVATE, fd, XTRXLL_MMAP_CONFREGS_OFF);
//vmem = (unsigned int ) ALIGN_UP((unsigned int)mem, MEMORY_ALIGNMENT); // no impact on error
vmem = (unsigned int *) mem;
printf(“XTRX GP_PORT_RD_HWCFG: 0x%x\n”,vmem[GP_PORT_RD_HWCFG]); // OK Prints correct register value. CPU Virtual address in userspace mmapped successfuly

checkCudaErrors(cudaHostRegister(vmem, XTRXLL_MMAP_CONFREGS_LEN, cudaHostRegisterIoMemory)); // FAIL Return cudaErrorNotSupported

}

Is cudaHostRegisterIoMemory flag supported on Xavier?

Hi,

Xavier supports the default cudaHostRegisterDefault type host memory.
We are checking the PCIe buffer’s details with our internal team.

Will share more information with you later.
Thanks.

Thanks,
I have tried an example with an mmapped file, which works with the GPU on x86. It also fails, but with a different result: code=2(cudaErrorMemoryAllocation)

test:

char * filename="/home/user/testmmap.txt";
int data_file = open(filename, O_RDWR | O_CREAT);
void* mem; unsigned int * vmem,*pmem;
mem = mmap(0, XTRXLL_MMAP_CONFREGS_LEN, PROT_READ | PROT_WRITE,
MAP_PRIVATE , data_file, XTRXLL_MMAP_CONFREGS_OFF);
if (mem == MAP_FAILED) { … }
vmem = (unsigned int ) ALIGN_UP((unsigned int)mem, MEMORY_ALIGNMENT);
// OR vmem = (unsigned int *) mem;
checkCudaErrors(cudaHostRegister(vmem, XTRXLL_MMAP_CONFREGS_LEN, cudaHostRegisterDefault/cudaHostRegisterMapped/));
checkCudaErrors(cudaHostGetDevicePointer((void **)&pmem, (void *)vmem, 0));

close(data_file);

Result with list of flags:
1 CUDA error at … code=2(cudaErrorMemoryAllocation) “cudaHostRegister(vmem, XTRXLL_MMAP_CONFREGS_LEN, cudaHostRegisterMapped)”
2 CUDA error at… code=2(cudaErrorMemoryAllocation) “cudaHostRegister(vmem, XTRXLL_MMAP_CONFREGS_LEN, cudaHostRegisterDefault )”
3 CUDA error at… code=801(cudaErrorNotSupported) “cudaHostRegister(vmem, XTRXLL_MMAP_CONFREGS_LEN, cudaHostRegisterIoMemory)”

I think this link is related to my question too.

It appears that cudaHostRegister on Xavier can currently only be used with malloc'ed memory, i.e. in cases where it could be replaced by cudaHostAlloc.

Hi,

We have just confirmed this issue with our internal team.
Although we have cudaHostRegister(.) on Jetson, importing IOMemory into CUDA’s address space is not supported.

An alternative is to use GPUdirect for solving this.
Instead of trying to import externally allocated memory into CUDA, you can allocate CUDA-memory and then make it accessible to PCIe devices.

You can find a working sample below:

Thanks

Hi, AastaLLL,
Thanks for the answer.

It’s disappointing that IOMemory on an embedded system can’t be imported into the CUDA address space, while it’s supported even on modern gaming GPUs :(.

Unfortunately, GPUdirect can’t be used in my case. In GPUdirect, a virtual address (in the CUDA-managed virtual address space of the application) points to a list of allocated and pinned memory pages, from which a scatter-gather list is created in the driver to be used in DMA calls.

I wanted to create a virtual address in the CUDA-managed virtual address space (Unified Virtual Addressing) that points to the physical memory the OS assigned to the PCIe device, not to a DMA buffer.