Error when trying to write data to GPU DMA memory (using GPU Direct RDMA)

I’m using a Jetson AGX Xavier on custom hardware with an FPGA-based frame grabber.
The physical address of the DMA memory is known to the FPGA, and the FPGA writes data to that memory.

I want the FPGA to DMA directly into GPU memory, without a CPU->GPU memcpy, using GPUDirect RDMA.

So I wrote a custom driver for the FPGA and tested it, but when the FPGA tries to write data to the DMA memory, the following error occurs:

arm-smmu 12000000.iommu: Unhandled context error: fsr=0x80000402, iova=0xffc00000, fsynr=0x13, cbfrsynra=0x145b, cb=7

I wrote the code by referring to NVIDIA/jetson-rdma-picoevb (a minimal HW-based demo of GPUDirect RDMA on NVIDIA Jetson AGX Xavier running L4T).
How do I use GPUDirect RDMA correctly?
Which part do I need to fix?

This is the code I wrote.

<User Program>
#define		DMA_ADDR_L	0x20000	
#define		DMA_ADDR_H	0x20004	

typedef struct _PHYS_DMA_MEM_INFO {
	unsigned long		memNum;
	unsigned long		memSize;
	unsigned long long	physAddr[MAX_PHYS_DMA_MEM_NUM];
	unsigned long long	virtAddr[MAX_PHYS_DMA_MEM_NUM];
	unsigned long long	userAddr[MAX_PHYS_DMA_MEM_NUM];
} PHYS_DMA_MEM_INFO;

bool DrvWrite32(int handle, unsigned int addr, unsigned int data)
{
    DRV_IO_INFO Info;    /* register-access descriptor passed to the driver */
    int length = sizeof(unsigned int);

    Info.offset = addr;
    Info.size = length;
    memcpy((void *)&, (void *)&data, length);

    if (ioctl(handle, IOCTL_DRV_WRITE, &Info) < 0)
        return false;

    return true;
}

int main()
{
    uint8_t *pRdma = NULL;
    int nRdmaSize = 1 * 1024 * 1024;
    unsigned int flag = 1;

    cudaError_t ce = cudaHostAlloc((void **)&pRdma, nRdmaSize * sizeof(uint8_t), cudaHostAllocDefault);
    if (ce != cudaSuccess) {
        qDebug() << "Allocation of pRdma failed: " << ce;
        return -1;
    }

    CUresult cr = cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, (CUdeviceptr)pRdma);
    if (cr != CUDA_SUCCESS) {
        qDebug() << "cuPointerSetAttribute(pRdma) failed: " << cr;
        return -1;
    }

    PHYS_DMA_MEM_INFO DevDmaMemInfo;

    DevDmaMemInfo.memNum = 1;
    DevDmaMemInfo.memSize = nRdmaSize;
    DevDmaMemInfo.userAddr[0] = (unsigned long long)pRdma;

    char strFileName[255];
    sprintf(strFileName, "/dev/RDMATEST0");

    int handle = open(strFileName, O_RDWR);
    if (handle < 0)
        return -1;

    /* Ask the driver to pin the CUDA buffer and return its DMA address. */
    if (ioctl(handle, IOCTL_DRV_GET_CUDA_DMA_MEM_INFO0, &DevDmaMemInfo) < 0)
        return -1;

    /* Program the FPGA's DMA address registers with the returned address. */
    for (int index = 0; index < DevDmaMemInfo.memNum; index++) {
        uint32_t Addr_h = (DevDmaMemInfo.physAddr[index] >> 32) & 0xffffffff;
        uint32_t Addr_l = (DevDmaMemInfo.physAddr[index] >> 0) & 0xffffffff;

        DrvWrite32(handle, DMA_ADDR_H + 0x8 * index, Addr_h);
        DrvWrite32(handle, DMA_ADDR_L + 0x8 * index, Addr_l);
    }
    return 0;
}

<Driver Program>
#define GPU_PAGE_SHIFT		12
#define GPU_PAGE_SIZE		(((u64)1) << GPU_PAGE_SHIFT)
#define GPU_PAGE_OFFSET		(GPU_PAGE_SIZE - 1)
#define GPU_PAGE_MASK		(~GPU_PAGE_OFFSET)

#define VENDOR_ID		0x1172
#define INTERFACE_ID		0xe00c

struct InterfaceInfo_t {
	/* the kernel PCI device data structure */
	struct pci_dev		*pcidev;

	/* length of memory region, used for error checking */
	unsigned long		barlengths;

	/* kernel virtual address for the mapped BAR */
	void __iomem		*bar;

	/* character device */
	dev_t			cdevNum;
	struct cdev		cdev;
	struct class		*ifClass;
	int			ifNum;
	int			major;
	int			minor;

	/* interrupt */
	unsigned int		ifIrqNum;

	/* bus number */
	unsigned int		busNum;
};

struct rdmaTest_cuda_surface {
	u64				virtAddr;
	u64				offset;
	u64				len;
	struct nvidia_p2p_page_table	*page_table;
};

struct rdmaTest_userbuf_dma {
	dma_addr_t	addr;
	u64		len;
};

struct rdmaTest_userbuf {
	bool cuda;

	int n_dmas;
	struct rdmaTest_userbuf_dma *dmas;

	union {
		struct {
			int to_dev;
			int pagecount;
			struct page **pages;
			struct sg_table *sgt;
			int map_ret;
		} pages;
		struct {
			struct rdmaTest_cuda_surface *cusurf;
			struct nvidia_p2p_dma_mapping *map;
		} cuda;
	} priv;
};

struct InterfaceInfo_t*				InterfaceInfo;

static void rdmaTest_put_userbuf_cuda(struct rdmaTest_userbuf *ubuf)
{
	if (ubuf->
		nvidia_p2p_dma_unmap_pages(ubuf->;	/* Tegra nv-p2p unmap */
}

static void rdmaTest_put_userbuf(struct rdmaTest_userbuf *ubuf)
{
	if (ubuf->cuda)
		rdmaTest_put_userbuf_cuda(ubuf);

	kfree(ubuf->dmas);
}

static void rdmaTest_userbuf_add_dma_chunk(struct rdmaTest_userbuf *ubuf, dma_addr_t addr, u64 len)
{
	struct rdmaTest_userbuf_dma *dma;
	dma_addr_t end;

	printk(KERN_INFO "ubuf->n_dmas : %d\n", ubuf->n_dmas);
	if (ubuf->n_dmas) {
		/* Merge with the previous chunk if the new one is contiguous. */
		dma = &ubuf->dmas[ubuf->n_dmas - 1];
		end = dma->addr + dma->len;
		if (addr == end) {
			dma->len += len;
			return;
		}
	}

	dma = &ubuf->dmas[ubuf->n_dmas];
	dma->addr = addr;
	dma->len = len;
	ubuf->n_dmas++;
}

static int rdmaTest_get_userbuf_cuda(struct rdmaTest_userbuf *ubuf, struct rdmaTest_cuda_surface *cusurf, __u64 len, int to_dev)
{
	int ret, i;
	u64 offset, len_left;

	ubuf->cuda = true;
	ubuf->priv.cuda.cusurf = cusurf;

	if (len > cusurf->len)
		return -EINVAL;

	ret = nvidia_p2p_dma_map_pages(&InterfaceInfo->pcidev->dev, cusurf->page_table, &ubuf->,
				       to_dev ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
	if (ret < 0)
		return ret;

	ubuf->dmas = kmalloc_array(ubuf->>entries, sizeof(*ubuf->dmas), GFP_KERNEL);
	if (!ubuf->dmas)
		return -ENOMEM;

	offset = cusurf->offset;
	len_left = cusurf->len;
	printk(KERN_INFO "ubuf->>entries : %u\n", ubuf->>entries);
	for (i = 0; i < ubuf->>entries; i++) {
		dma_addr_t dma_this = ubuf->>hw_address[i];
		u64 len_this = ubuf->>hw_len[i];

		printk(KERN_INFO "[%d] dma_addr : %llx, len : %llu\n", i, (unsigned long long)dma_this, len_this);

		dma_this += offset;
		rdmaTest_userbuf_add_dma_chunk(ubuf, dma_this, len_this);

		if (len_this >= len_left)
			break;
		len_left -= len_this;
		offset = 0;
	}

	return 0;
}

static void rdmaTest_p2p_free_callback(void *data)
{
	struct rdmaTest_cuda_surface *cusurf = data;

	nvidia_p2p_free_page_table(cusurf->page_table);
	kfree(cusurf);
}

static int rdmaTest_ioctl_GetCudaDMAMemInfo(unsigned long arg)
{
	void __user *argp = (void __user *)arg;
	struct rdmaTest_cuda_surface *cusurf;
	struct rdmaTest_userbuf ubuf = { 0 };
	PHYS_DMA_MEM_INFO cudaMemInfo;
	u64 aligned_len;
	int ret;

	if (copy_from_user(&cudaMemInfo, argp, sizeof(PHYS_DMA_MEM_INFO)))
		return -EFAULT;

	cusurf = kzalloc(sizeof(*cusurf), GFP_KERNEL);
	if (!cusurf)
		return -ENOMEM;

	cusurf->virtAddr = cudaMemInfo.userAddr[0] & GPU_PAGE_MASK;
	cusurf->offset = cudaMemInfo.userAddr[0] & GPU_PAGE_OFFSET;
	cusurf->len = cudaMemInfo.memSize;
	aligned_len = (cusurf->offset + cusurf->len + GPU_PAGE_SIZE - 1) & GPU_PAGE_MASK;

	ret = nvidia_p2p_get_pages(cusurf->virtAddr, aligned_len, &cusurf->page_table,
				   rdmaTest_p2p_free_callback, cusurf);
	if (ret < 0) {
		kfree(cusurf);
		return ret;
	}

	ret = rdmaTest_get_userbuf_cuda(&ubuf, cusurf, cudaMemInfo.memSize, 1);
	if (ret)
		goto put_userbuf_dst;

	/* Return the first mapped DMA address to user space; the FPGA targets this. */
	cudaMemInfo.physAddr[0] =>hw_address[0];
	if (copy_to_user(argp, &cudaMemInfo, sizeof(PHYS_DMA_MEM_INFO)))
		ret = -EFAULT;
	return ret;

put_userbuf_dst:
	rdmaTest_put_userbuf(&ubuf);
	return ret;
}
long drv_ioctl(struct file *filePtr, unsigned int cmd, unsigned long arg)
{
	void *baseAddrPtr = (void *)InterfaceInfo->bar;	/* used by the register read/write ioctls (omitted here) */

	switch (cmd) {
	case IOCTL_DRV_GET_CUDA_DMA_MEM_INFO0:
		return rdmaTest_ioctl_GetCudaDMAMemInfo(arg);
	/* other commands (IOCTL_DRV_WRITE, ...) omitted */
	}

	return 0;
}

int pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	InterfaceInfo = kzalloc(sizeof(struct InterfaceInfo_t), GFP_KERNEL);
	if (!InterfaceInfo)
		return -ENOMEM;

	InterfaceInfo->pcidev = pdev;
	InterfaceInfo->busNum = pdev->bus->number;

	if (pci_enable_device(pdev))	/* enable the device before touching its BARs */
		return -1;

	if (pci_request_regions(pdev, INTERFACE_NAME))
		return -1;

	unsigned long barStart  = pci_resource_start(pdev, 0);	/* BAR0 (base address register) start */
	unsigned long barEnd    = pci_resource_end(pdev, 0);
	unsigned long barLength = barEnd - barStart + 1;	/* == pci_resource_len(pdev, 0) */
	InterfaceInfo->barlengths = barLength;

	if (!barStart || !barEnd)
		barLength = 0;

	InterfaceInfo->bar = pci_iomap(InterfaceInfo->pcidev, 0, barLength);
	if (!InterfaceInfo->bar)
		return -1;

	InterfaceInfo->ifIrqNum = pdev->irq;

	unsigned long ifId = (unsigned long)INTERFACE_ID;
	if (request_irq(InterfaceInfo->ifIrqNum, drv_interrupt, IRQF_SHARED, INTERFACE_NAME, (void *)ifId) < 0)
		return -1;

	int minor = 0;
	int major = 0;
	int ifNum = -1;

	int nRet = alloc_chrdev_region(&InterfaceInfo->cdevNum, minor, 1, INTERFACE_NAME);
	if (nRet < 0)
		return nRet;

	major = MAJOR(InterfaceInfo->cdevNum);
	minor = MINOR(InterfaceInfo->cdevNum);
	ifNum = MKDEV(major, minor);

	InterfaceInfo->ifClass = class_create(THIS_MODULE, INTERFACE_NAME);
	cdev_init(&InterfaceInfo->cdev, &fileOps);
	InterfaceInfo->cdev.owner = THIS_MODULE;
	InterfaceInfo->cdev.ops = &fileOps;

	nRet = cdev_add(&InterfaceInfo->cdev, ifNum, 1);
	if (nRet)
		return -1;

	device_create(InterfaceInfo->ifClass, NULL, ifNum, NULL, "%s%u", INTERFACE_NAME, minor);

	InterfaceInfo->ifNum = ifNum;
	InterfaceInfo->major = major;
	InterfaceInfo->minor = minor;

	return 0;
}


Just want to confirm first:
Is there any issue when writing to the DMA memory with the GPU?


I’ve never used DMA with a GPU before.

I thought GPUDirect RDMA was the way to do it, so I looked at the GPUDirect RDMA example and wrote the code.

                        uint8_t *gpuBuf;
   user                 cudaHostAlloc(&gpuBuf, ...)
                        // input: gpuBuf virtual address, output: gpuBuf page table
   kernel               nvidia_p2p_get_pages(gpuBuf_virtAddr, &gpu_page_table)
   (driver)             // input: gpu_page_table (output of nvidia_p2p_get_pages), output: gpu_dma_mapping
                        nvidia_p2p_dma_map_pages(gpu_page_table, &gpu_dma_mapping)

I think ‘gpu_dma_mapping->hw_address’ here is the physical address of the GPU DMA memory.
So I gave this address to the FPGA.
However, when the FPGA tries to write data to this address, the error above occurs.

And I have an additional question.,/%20Create%20descriptor%20/,desc%2D%3Enxt_adr_hi%20%3D%200%3B,-return%20pevb_dma(pevb%3B-,/%20Create%20descriptor%20/,desc%2D%3Enxt_adr_hi%20%3D%200%3B,-return%20pevb_dma(pevb)
(Line 709 ~ Line 721)

I don’t understand this part.
Why is the GPU DMA address written into DMA memory allocated with dma_alloc_coherent, and then the address of that memory written to the hardware register?
Can’t the GPU DMA address be written directly to a hardware register?


The sample shows that an FPGA (which has its own DMA controller) accesses iGPU/system memory over PCIe.
The opposite direction is not supported.

Here is an RDMA document for your reference:


For an FPGA (which has its own DMA controller) to access the iGPU/system memory via PCIe:

  1. Allocate DMA memory with dma_alloc_coherent.
  2. Get the GPU DMA physical address with nvidia_p2p_dma_map_pages.
  3. Write the GPU DMA physical address into the DMA memory allocated in 1).
  4. Inform the FPGA of the physical address of the DMA memory allocated in 1).

Should I do this?
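If it helps, here is how I picture steps 1) ~ 4) in code. This is only a sketch: the descriptor layout and the REG_DESC_ADDR_L/H register offsets are hypothetical (they depend on the FPGA design), and ‘map’ stands for the nvidia_p2p_dma_mapping returned by nvidia_p2p_dma_map_pages.

	/* Hypothetical descriptor layout -- the real layout is defined by the FPGA. */
	struct dma_desc {
		u32 dst_adr_lo;		/* GPU DMA address, low 32 bits */
		u32 dst_adr_hi;		/* GPU DMA address, high 32 bits */
		u32 len;		/* transfer length in bytes */
		u32 nxt_adr_lo;		/* next descriptor address, 0 = end of chain */
	};

	struct dma_desc *desc;
	dma_addr_t desc_bus;

	/* 1) Allocate DMA memory the FPGA can fetch the descriptor from. */
	desc = dma_alloc_coherent(&pdev->dev, sizeof(*desc), &desc_bus, GFP_KERNEL);
	if (!desc)
		return -ENOMEM;

	/* 2)-3) Write the GPU DMA address from nvidia_p2p_dma_map_pages() into it. */
	desc->dst_adr_lo = lower_32_bits(map->hw_address[0]);
	desc->dst_adr_hi = upper_32_bits(map->hw_address[0]);
	desc->len        = map->hw_len[0];
	desc->nxt_adr_lo = 0;

	/* 4) Inform the FPGA of the descriptor's bus address (hypothetical register offsets). */
	iowrite32(lower_32_bits(desc_bus), InterfaceInfo->bar + REG_DESC_ADDR_L);
	iowrite32(upper_32_bits(desc_bus), InterfaceInfo->bar + REG_DESC_ADDR_H);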

But I have a question here.
In the example code, we tell the FPGA the physical address of the DMA memory allocated by dma_alloc_coherent. Does the FPGA then read the GPU DMA physical address written into that memory and use it to write the data to the GPU DMA memory?


You can find the sample below:

The iGPU/system memory is allocated with cudaHostAlloc/malloc.


Can an FPGA access iGPU/system memory via PCIe in this way?

  1. Allocate memory with cudaHostAlloc.
  2. Set CU_POINTER_ATTRIBUTE_SYNC_MEMOPS with cuPointerSetAttribute.
  3. Pin the pages with nvidia_p2p_get_pages.
  4. Map them with nvidia_p2p_dma_map_pages.
  5. Write the GPU DMA physical address obtained from nvidia_p2p_dma_map_pages to the FPGA.
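On the driver side, I imagine steps 3 to 5 looking roughly like this (a sketch only, using the same Tegra nv-p2p calls as my driver above; vaddr, aligned_len, free_cb, and ctx are placeholders):

	#include <linux/dma-direction.h>
	#include <nv-p2p.h>

	struct nvidia_p2p_page_table *pt;
	struct nvidia_p2p_dma_mapping *map;
	int ret;

	/* 3. Pin the page-aligned user VA of the cudaHostAlloc buffer. */
	ret = nvidia_p2p_get_pages(vaddr, aligned_len, &pt, free_cb, ctx);
	if (ret < 0)
		return ret;

	/* 4. Map the pinned pages for DMA by this PCI device. */
	ret = nvidia_p2p_dma_map_pages(&pdev->dev, pt, &map, DMA_FROM_DEVICE);
	if (ret < 0)
		return ret;

	/* 5. map->hw_address[0] is the DMA address (an IOVA when the SMMU is enabled)
	 * that the FPGA must target. */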


Sorry for the late update.

We have both cudaMalloc and malloc samples, so cudaHostAlloc should work.

