Error when trying to write data to GPU DMA memory (using GPU Direct RDMA)

I’m using a Jetson AGX Xavier on custom hardware with an FPGA-based frame grabber.
The physical address of the DMA memory is known to the FPGA, and the FPGA writes data to that memory.

I want the FPGA to DMA directly into GPU memory, without a CPU->GPU memcpy, using GPUDirect RDMA.

So I wrote a custom driver for the FPGA and tested it, but when the FPGA tries to write data to the DMA memory, the following error occurs:

arm-smmu 12000000.iommu: Unhandled context error: fsr=0x80000402, iova=0xffc00000, fsynr=0x13, cbfrsynra=0x145b, cb=7

I wrote the code by referring to NVIDIA/jetson-rdma-picoevb (a minimal HW-based demo of GPUDirect RDMA on NVIDIA Jetson AGX Xavier running L4T).
How do I use GPUDirect RDMA correctly?
Which part do I need to fix?

This is the code I wrote.

<User Program>
#define		DMA_ADDR_L	0x20000	
#define		DMA_ADDR_H	0x20004	

typedef struct _PHYS_DMA_MEM_INFO {
	unsigned long		memNum;
	unsigned long		memSize;
	unsigned long long	physAddr[MAX_PHYS_DMA_MEM_NUM];
	unsigned long long	virtAddr[MAX_PHYS_DMA_MEM_NUM];
	unsigned long long	userAddr[MAX_PHYS_DMA_MEM_NUM];
} PHYS_DMA_MEM_INFO;

bool DrvWrite32(int handle, unsigned int addr, unsigned int data)
{
    DRV_IO_INFO Info;    /* register-access descriptor passed to the driver */
    int length = sizeof(unsigned int);

    Info.offset = addr;
    Info.size = length;
    memcpy((void *)&, (void *)&data, length);

    if (ioctl(handle, IOCTL_DRV_WRITE, &Info) < 0)
        return false;

    return true;
}

int main()
{
    uint8_t *pRdma = NULL;
    int nRdmaSize = 1 * 1024 * 1024;
    unsigned int flag = 1;

    cudaError_t ce = cudaHostAlloc((void **)&pRdma, nRdmaSize * sizeof(uint8_t), cudaHostAllocDefault);
    if (ce != cudaSuccess) {
        qDebug() << "Allocation of pRdma failed: " << ce;
        return -1;
    }

    CUresult cr = cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, (CUdeviceptr)pRdma);
    if (cr != CUDA_SUCCESS) {
        qDebug() << "cuPointerSetAttribute(pRdma) failed: " << cr;
        return -1;
    }

    PHYS_DMA_MEM_INFO DevDmaMemInfo;

    DevDmaMemInfo.memNum = 1;
    DevDmaMemInfo.memSize = nRdmaSize;
    DevDmaMemInfo.userAddr[0] = (unsigned long long)pRdma;

    char strFileName[255];
    sprintf(strFileName, "/dev/RDMATEST0");

    int handle = open(strFileName, O_RDWR);
    if (handle < 0)
        return -1;

    /* Ask the driver to pin the CUDA buffer and return its DMA address. */
    if (ioctl(handle, IOCTL_DRV_GET_CUDA_DMA_MEM_INFO0, &DevDmaMemInfo) < 0)
        return -1;

    /* Program the FPGA's DMA address registers with the returned address. */
    for (int index = 0; index < DevDmaMemInfo.memNum; index++) {
        uint32_t Addr_h = (DevDmaMemInfo.physAddr[index] >> 32) & 0xffffffff;
        uint32_t Addr_l = (DevDmaMemInfo.physAddr[index] >> 0) & 0xffffffff;

        DrvWrite32(handle, DMA_ADDR_H + 0x8 * index, Addr_h);
        DrvWrite32(handle, DMA_ADDR_L + 0x8 * index, Addr_l);
    }
    return 0;
}

<Driver Program>
#define GPU_PAGE_SHIFT		12
#define GPU_PAGE_SIZE		(((u64)1) << GPU_PAGE_SHIFT)
#define GPU_PAGE_OFFSET		(GPU_PAGE_SIZE - 1)
#define GPU_PAGE_MASK		(~GPU_PAGE_OFFSET)

#define VENDOR_ID		0x1172
#define INTERFACE_ID		0xe00c

struct InterfaceInfo_t {
	/* the kernel PCI device data structure */
	struct pci_dev		*pcidev;

	/* length of memory region, used for error checking */
	unsigned long		barlengths;

	/* kernel virtual address for the mapped BAR */
	void __iomem		*bar;

	/* character device */
	dev_t			cdevNum;
	struct cdev		cdev;
	struct class		*ifClass;
	int			ifNum;
	int			major;
	int			minor;

	/* interrupt */
	unsigned int		ifIrqNum;

	/* bus number */
	unsigned int		busNum;
};

struct rdmaTest_cuda_surface {
	u64				virtAddr;
	u64				offset;
	u64				len;
	struct nvidia_p2p_page_table	*page_table;
};

struct rdmaTest_userbuf_dma {
	dma_addr_t	addr;
	u64		len;
};

struct rdmaTest_userbuf {
	bool cuda;

	int n_dmas;
	struct rdmaTest_userbuf_dma *dmas;

	union {
		struct {
			int to_dev;
			int pagecount;
			struct page **pages;
			struct sg_table *sgt;
			int map_ret;
		} pages;
		struct {
			struct rdmaTest_cuda_surface *cusurf;
			struct nvidia_p2p_dma_mapping *map;
		} cuda;
	} priv;
};

struct InterfaceInfo_t*				InterfaceInfo;

static void rdmaTest_put_userbuf_cuda(struct rdmaTest_userbuf *ubuf)
{
	if (ubuf->
		nvidia_p2p_dma_unmap_pages(ubuf->;	/* Tegra nv-p2p unmap */
}

static void rdmaTest_put_userbuf(struct rdmaTest_userbuf *ubuf)
{
	if (ubuf->cuda)
		rdmaTest_put_userbuf_cuda(ubuf);

	kfree(ubuf->dmas);
}

static void rdmaTest_userbuf_add_dma_chunk(struct rdmaTest_userbuf *ubuf, dma_addr_t addr, u64 len)
{
	struct rdmaTest_userbuf_dma *dma;
	dma_addr_t end;

	printk(KERN_INFO "ubuf->n_dmas : %d\n", ubuf->n_dmas);
	if (ubuf->n_dmas) {
		/* Merge with the previous chunk if the new one is contiguous. */
		dma = &ubuf->dmas[ubuf->n_dmas - 1];
		end = dma->addr + dma->len;
		if (addr == end) {
			dma->len += len;
			return;
		}
	}

	dma = &ubuf->dmas[ubuf->n_dmas];
	dma->addr = addr;
	dma->len = len;
	ubuf->n_dmas++;
}

static int rdmaTest_get_userbuf_cuda(struct rdmaTest_userbuf *ubuf, struct rdmaTest_cuda_surface *cusurf, __u64 len, int to_dev)
{
	int ret, i;
	u64 offset, len_left;

	ubuf->cuda = true;
	ubuf->priv.cuda.cusurf = cusurf;

	if (len > cusurf->len)
		return -EINVAL;

	ret = nvidia_p2p_dma_map_pages(&InterfaceInfo->pcidev->dev, cusurf->page_table, &ubuf->,
				       to_dev ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
	if (ret < 0)
		return ret;

	ubuf->dmas = kmalloc_array(ubuf->>entries, sizeof(*ubuf->dmas), GFP_KERNEL);
	if (!ubuf->dmas)
		return -ENOMEM;

	offset = cusurf->offset;
	len_left = cusurf->len;
	printk(KERN_INFO "ubuf->>entries : %u\n", ubuf->>entries);
	for (i = 0; i < ubuf->>entries; i++) {
		dma_addr_t dma_this = ubuf->>hw_address[i];
		u64 len_this = ubuf->>hw_len[i];

		printk(KERN_INFO "[%d] dma_addr : %llx, len : %llu\n", i, (unsigned long long)dma_this, len_this);

		dma_this += offset;
		rdmaTest_userbuf_add_dma_chunk(ubuf, dma_this, len_this);

		if (len_this >= len_left)
			break;
		len_left -= len_this;
		offset = 0;
	}

	return 0;
}

static void rdmaTest_p2p_free_callback(void *data)
{
	struct rdmaTest_cuda_surface *cusurf = data;

	nvidia_p2p_free_page_table(cusurf->page_table);
	kfree(cusurf);
}

static int rdmaTest_ioctl_GetCudaDMAMemInfo(unsigned long arg)
{
	void __user *argp = (void __user *)arg;
	struct rdmaTest_cuda_surface *cusurf;
	struct rdmaTest_userbuf ubuf = { 0 };
	PHYS_DMA_MEM_INFO cudaMemInfo;
	u64 aligned_len;
	int ret;

	if (copy_from_user(&cudaMemInfo, argp, sizeof(PHYS_DMA_MEM_INFO)))
		return -EFAULT;

	cusurf = kzalloc(sizeof(*cusurf), GFP_KERNEL);
	if (!cusurf)
		return -ENOMEM;

	cusurf->virtAddr = cudaMemInfo.userAddr[0] & GPU_PAGE_MASK;
	cusurf->offset = cudaMemInfo.userAddr[0] & GPU_PAGE_OFFSET;
	cusurf->len = cudaMemInfo.memSize;
	aligned_len = (cusurf->offset + cusurf->len + GPU_PAGE_SIZE - 1) & GPU_PAGE_MASK;

	ret = nvidia_p2p_get_pages(cusurf->virtAddr, aligned_len, &cusurf->page_table,
				   rdmaTest_p2p_free_callback, cusurf);
	if (ret < 0) {
		kfree(cusurf);
		return ret;
	}

	ret = rdmaTest_get_userbuf_cuda(&ubuf, cusurf, cudaMemInfo.memSize, 1);
	if (ret)
		goto put_userbuf_dst;

	/* Return the first mapped DMA address to user space; the FPGA targets this. */
	cudaMemInfo.physAddr[0] =>hw_address[0];
	if (copy_to_user(argp, &cudaMemInfo, sizeof(PHYS_DMA_MEM_INFO)))
		ret = -EFAULT;
	return ret;

put_userbuf_dst:
	rdmaTest_put_userbuf(&ubuf);
	return ret;
}
long drv_ioctl(struct file *filePtr, unsigned int cmd, unsigned long arg)
{
	void *baseAddrPtr = (void *)InterfaceInfo->bar;	/* used by the register read/write ioctls (omitted here) */

	switch (cmd) {
	case IOCTL_DRV_GET_CUDA_DMA_MEM_INFO0:
		return rdmaTest_ioctl_GetCudaDMAMemInfo(arg);
	/* other commands (IOCTL_DRV_WRITE, ...) omitted */
	}

	return 0;
}

int pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	InterfaceInfo = kzalloc(sizeof(struct InterfaceInfo_t), GFP_KERNEL);
	if (!InterfaceInfo)
		return -ENOMEM;

	InterfaceInfo->pcidev = pdev;
	InterfaceInfo->busNum = pdev->bus->number;

	if (pci_enable_device(pdev))	/* enable the device before touching its BARs */
		return -1;

	if (pci_request_regions(pdev, INTERFACE_NAME))
		return -1;

	unsigned long barStart  = pci_resource_start(pdev, 0);	/* BAR0 (base address register) start */
	unsigned long barEnd    = pci_resource_end(pdev, 0);
	unsigned long barLength = barEnd - barStart + 1;	/* == pci_resource_len(pdev, 0) */
	InterfaceInfo->barlengths = barLength;

	if (!barStart || !barEnd)
		barLength = 0;

	InterfaceInfo->bar = pci_iomap(InterfaceInfo->pcidev, 0, barLength);
	if (!InterfaceInfo->bar)
		return -1;

	InterfaceInfo->ifIrqNum = pdev->irq;

	unsigned long ifId = (unsigned long)INTERFACE_ID;
	if (request_irq(InterfaceInfo->ifIrqNum, drv_interrupt, IRQF_SHARED, INTERFACE_NAME, (void *)ifId) < 0)
		return -1;

	int minor = 0;
	int major = 0;
	int ifNum = -1;

	int nRet = alloc_chrdev_region(&InterfaceInfo->cdevNum, minor, 1, INTERFACE_NAME);
	if (nRet < 0)
		return nRet;

	major = MAJOR(InterfaceInfo->cdevNum);
	minor = MINOR(InterfaceInfo->cdevNum);
	ifNum = MKDEV(major, minor);

	InterfaceInfo->ifClass = class_create(THIS_MODULE, INTERFACE_NAME);
	cdev_init(&InterfaceInfo->cdev, &fileOps);
	InterfaceInfo->cdev.owner = THIS_MODULE;
	InterfaceInfo->cdev.ops = &fileOps;

	nRet = cdev_add(&InterfaceInfo->cdev, ifNum, 1);
	if (nRet)
		return -1;

	device_create(InterfaceInfo->ifClass, NULL, ifNum, NULL, "%s%u", INTERFACE_NAME, minor);

	InterfaceInfo->ifNum = ifNum;
	InterfaceInfo->major = major;
	InterfaceInfo->minor = minor;

	return 0;
}


Just want to confirm first:
Is there any issue when writing to the DMA memory with the GPU?


I’ve never used DMA with a GPU before.

I thought GPUDirect RDMA was the way to do it, so I looked at the GPUDirect RDMA example and wrote the code.

                        uint8_t *gpuBuf;
   user                 cudaHostAlloc(&gpuBuf, ...)
                        // input: gpuBuf virtual address, output: gpuBuf page table
   kernel               nvidia_p2p_get_pages(gpuBuf_virtAddr, &gpu_page_table)
   (driver)             // input: gpu_page_table (output of nvidia_p2p_get_pages), output: gpu_dma_mapping
                        nvidia_p2p_dma_map_pages(gpu_page_table, &gpu_dma_mapping)

I think ‘gpu_dma_mapping->hw_address’ here is the physical address of the GPU DMA memory.
So I gave this address to the FPGA.
However, when the FPGA tries to write data to this address, the error above occurs.

And I have an additional question.,/%20Create%20descriptor%20/,desc%2D%3Enxt_adr_hi%20%3D%200%3B,-return%20pevb_dma(pevb%3B-,/%20Create%20descriptor%20/,desc%2D%3Enxt_adr_hi%20%3D%200%3B,-return%20pevb_dma(pevb)
(Line 709 ~ Line 721)

I don’t understand this part.
Why is the GPU DMA address written into DMA memory allocated with dma_alloc_coherent, and then the address of that memory written to the hardware register?
Can’t the GPU DMA address be written directly to a hardware register?


The sample shows that an FPGA (which has its own DMA controller) accesses iGPU/system memory over PCIe.
The opposite direction is not supported.

Here is an RDMA document for your reference:


For an FPGA (which has its own DMA controller) to access the iGPU/system memory via PCIe:

  1. Allocate DMA memory with dma_alloc_coherent.
  2. Get the GPU DMA physical address with nvidia_p2p_dma_map_pages.
  3. Write the GPU DMA physical address into the DMA memory allocated in 1).
  4. Inform the FPGA of the physical address of the DMA memory allocated in 1).

Should I do this?
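If it helps, here is how I picture steps 1) ~ 4) in code. This is only a sketch: the descriptor layout and the REG_DESC_ADDR_L/H register offsets are hypothetical (they depend on the FPGA design), and ‘map’ stands for the nvidia_p2p_dma_mapping returned by nvidia_p2p_dma_map_pages.

	/* Hypothetical descriptor layout -- the real layout is defined by the FPGA. */
	struct dma_desc {
		u32 dst_adr_lo;		/* GPU DMA address, low 32 bits */
		u32 dst_adr_hi;		/* GPU DMA address, high 32 bits */
		u32 len;		/* transfer length in bytes */
		u32 nxt_adr_lo;		/* next descriptor address, 0 = end of chain */
	};

	struct dma_desc *desc;
	dma_addr_t desc_bus;

	/* 1) Allocate DMA memory the FPGA can fetch the descriptor from. */
	desc = dma_alloc_coherent(&pdev->dev, sizeof(*desc), &desc_bus, GFP_KERNEL);
	if (!desc)
		return -ENOMEM;

	/* 2)-3) Write the GPU DMA address from nvidia_p2p_dma_map_pages() into it. */
	desc->dst_adr_lo = lower_32_bits(map->hw_address[0]);
	desc->dst_adr_hi = upper_32_bits(map->hw_address[0]);
	desc->len        = map->hw_len[0];
	desc->nxt_adr_lo = 0;

	/* 4) Inform the FPGA of the descriptor's bus address (hypothetical register offsets). */
	iowrite32(lower_32_bits(desc_bus), InterfaceInfo->bar + REG_DESC_ADDR_L);
	iowrite32(upper_32_bits(desc_bus), InterfaceInfo->bar + REG_DESC_ADDR_H);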

But I have a question here.
In the example code, we tell the FPGA the physical address of the DMA memory allocated by dma_alloc_coherent. Does the FPGA then read the GPU DMA physical address written into that memory and use it to write the data to the GPU DMA memory?


You can find the sample below:

The iGPU/system memory is allocated with cudaHostAlloc/malloc.


Can an FPGA access iGPU/system memory via PCIe in this way?

  1. Allocate memory with cudaHostAlloc.
  2. Set CU_POINTER_ATTRIBUTE_SYNC_MEMOPS with cuPointerSetAttribute.
  3. Pin the pages with nvidia_p2p_get_pages.
  4. Map them with nvidia_p2p_dma_map_pages.
  5. Write the GPU DMA physical address obtained from nvidia_p2p_dma_map_pages to the FPGA.
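On the driver side, I imagine steps 3 to 5 looking roughly like this (a sketch only, using the same Tegra nv-p2p calls as my driver above; vaddr, aligned_len, free_cb, and ctx are placeholders):

	#include <linux/dma-direction.h>
	#include <nv-p2p.h>

	struct nvidia_p2p_page_table *pt;
	struct nvidia_p2p_dma_mapping *map;
	int ret;

	/* 3. Pin the page-aligned user VA of the cudaHostAlloc buffer. */
	ret = nvidia_p2p_get_pages(vaddr, aligned_len, &pt, free_cb, ctx);
	if (ret < 0)
		return ret;

	/* 4. Map the pinned pages for DMA by this PCI device. */
	ret = nvidia_p2p_dma_map_pages(&pdev->dev, pt, &map, DMA_FROM_DEVICE);
	if (ret < 0)
		return ret;

	/* 5. map->hw_address[0] is the DMA address (an IOVA when the SMMU is enabled)
	 * that the FPGA must target. */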


Sorry for the late update.

We have both cudaMalloc and malloc samples, so cudaHostAlloc should work.

