The bandwidth of virtual ethernet over PCIe between two Xaviers is low

I am checking with internal team to see if we could disable virtual eth driver and use DMA test.

Wayne,

Do you have any update?

Thanks.

Sorry, still no update here. I will let you know once we finish the work.

I see. When will the function with DMA be released? Or can I get the development branch of it?

Thanks.

Wayne,

Do you have any update for my question?

Sorry for late reply. Please follow the steps to start EP mode DMA.

Only the C5 controller can be used, as it owns the open slot

Two Xavier devkit boards are required with one configured for C5’s RP operation and other configured for C5’s EP operation

Need finger-to-finger card to connect two devkits using their PCIe x8/x16 open slot

Please flash one devkit with default ODM data to operate C5 in Root port mode and other with bit-12 set to ‘1’ to operate C5 in Endpoint mode

patch 1

Subject: [PATCH] DNI: arm64: configs: Enable PCIe DMA test framework

Enables test framework for embedded DMA engine of PCIe IP to perform
read and write operations

---

diff --git a/arch/arm64/configs/tegra_defconfig b/arch/arm64/configs/tegra_defconfig
index 53f712a..ec693c6 100644
--- a/arch/arm64/configs/tegra_defconfig
+++ b/arch/arm64/configs/tegra_defconfig
@@ -51,6 +51,7 @@
 CONFIG_PCI_STUB=m
 CONFIG_PCI_IOV=y
 CONFIG_PCIE_TEGRA=y
+CONFIG_PCIE_TEGRA_DW_DMA_TEST=y
 CONFIG_PCIE_TEGRA_EP=y
 CONFIG_PCI_TEGRA=y
 CONFIG_PCI_ENDPOINT=y

patch 2

Subject: [PATCH] PCI: tegra: Fix DMA test framework build issue

Fix DMA test framework build issue by defining 'struct dw_pcie *'
pointer correctly

---

diff --git a/drivers/pci/dwc/pcie-tegra.c b/drivers/pci/dwc/pcie-tegra.c
index 6baa78f..ed9d1ef 100644
--- a/drivers/pci/dwc/pcie-tegra.c
+++ b/drivers/pci/dwc/pcie-tegra.c
@@ -1038,6 +1038,7 @@
 #ifdef CONFIG_PCIE_TEGRA_DW_DMA_TEST
 static int dma_write(struct tegra_pcie_dw *pcie, struct dma_tx *tx)
 {
+	struct dw_pcie *pci = &pcie->pci;
 	struct device *dev = pcie->dev;
 	u32 val = 0, bit = 0;
 	int ret = 0;
@@ -1164,6 +1165,7 @@
 
 static int dma_read(struct tegra_pcie_dw *pcie, struct dma_tx *tx)
 {
+	struct dw_pcie *pci = &pcie->pci;
 	struct device *dev = pcie->dev;
 	u32 val = 0, bit = 0;
 	int ret = 0;

patch 3

Subject: [PATCH] DNI: PCI: tegra: Setup DMA test framework for perf measurement

---

diff --git a/drivers/misc/tegra-pcie-ep-mem.c b/drivers/misc/tegra-pcie-ep-mem.c
index 1f04f9f..821ea1e 100644
--- a/drivers/misc/tegra-pcie-ep-mem.c
+++ b/drivers/misc/tegra-pcie-ep-mem.c
@@ -90,7 +90,7 @@
 #define DMA_LLP_LOW_OFF_RDCH		(0x1C + 0x100)
 #define DMA_LLP_HIGH_OFF_RDCH		(0x20 + 0x100)
 
-static unsigned long alloc_size = 0xA00000;
+static unsigned long alloc_size = 0x20000000;
 
 module_param(alloc_size, ulong, 0660);
 MODULE_PARM_DESC(alloc_size, "Allocation Size");
@@ -457,10 +457,12 @@
 	}
 
 	/* compare copied data */
+#if 0
 	if (!memcmp(__io_virt(bar_mem), phys_to_virt(ep->dst), ep->size))
 		dev_info(&ep->pdev->dev, "DMA-Write test PASSED\n");
 	else
 		dev_info(&ep->pdev->dev, "DMA-Write test FAILED\n");
+#endif
 err_out:
 	iounmap(bar_mem);
 err_remap:
@@ -596,11 +598,12 @@
 	}
 
 	/* compare copied data */
+#if 0
 	if (!memcmp(__io_virt(bar_mem), phys_to_virt(ep->src), ep->size))
 		dev_info(&ep->pdev->dev, "DMA-Read test PASSED\n");
 	else
 		dev_info(&ep->pdev->dev, "DMA-Read test FAILED\n");
-
+#endif
 err_out:
 	iounmap(bar_mem);
 err_remap:
diff --git a/drivers/pci/dwc/pcie-tegra.c b/drivers/pci/dwc/pcie-tegra.c
index ed9d1ef..2a3672b 100644
--- a/drivers/pci/dwc/pcie-tegra.c
+++ b/drivers/pci/dwc/pcie-tegra.c
@@ -1317,11 +1317,12 @@
 	}
 
 	/* compare copied data */
+#if 0
 	if (!memcmp(pcie->cpu_virt_addr, dst_cpu_virt, pcie->size))
 		dev_info(pcie->dev, "DMA-Write test PASSED\n");
 	else
 		dev_info(pcie->dev, "DMA-Write test FAILED\n");
-
+#endif
 err_out:
 	iounmap(dst_cpu_virt);
 	return ret;
@@ -1432,11 +1433,12 @@
 	}
 
 	/* compare copied data */
+#if 0
 	if (!memcmp(dst_cpu_virt, pcie->cpu_virt_addr, pcie->size))
 		dev_info(pcie->dev, "DMA-Read test PASSED\n");
 	else
 		dev_info(pcie->dev, "DMA-Read test FAILED\n");
-
+#endif
 err_out:
 	iounmap(dst_cpu_virt);
 	return ret;
diff --git a/drivers/pci/endpoint/functions/pci-epf-nv-test.c b/drivers/pci/endpoint/functions/pci-epf-nv-test.c
index 8b2a1dc..f4132a7 100644
--- a/drivers/pci/endpoint/functions/pci-epf-nv-test.c
+++ b/drivers/pci/endpoint/functions/pci-epf-nv-test.c
@@ -16,7 +16,7 @@
 #include <linux/pci-epc.h>
 #include <linux/pci-epf.h>
 
-#define BAR0_SIZE SZ_64K
+#define BAR0_SIZE SZ_512M
 
 struct pci_epf_nv_test {
 	struct pci_epf_header header;
@@ -30,14 +30,11 @@
 	struct pci_epf_nv_test *epfnv = epf_get_drvdata(epf);
 	struct pci_epc *epc = epf->epc;
 	struct device *cdev = epc->dev.parent;
-	struct iommu_domain *domain = iommu_get_domain_for_dev(cdev);
 
 	pci_epc_stop(epc);
 	pci_epc_clear_bar(epc, BAR_0);
-	vunmap(epfnv->bar0_ram_map);
-	iommu_unmap(domain, epfnv->bar0_iova, PAGE_SIZE);
-	iommu_dma_free_iova(cdev, epfnv->bar0_iova, BAR0_SIZE);
-	__free_pages(epfnv->bar0_ram_page, 1);
+	dma_free_coherent(cdev, BAR0_SIZE, epfnv->bar0_ram_map,
+			  epfnv->bar0_iova);
 }
 
 static int pci_epf_nv_test_bind(struct pci_epf *epf)
@@ -47,7 +44,6 @@
 	struct pci_epf_header *header = epf->header;
 	struct device *fdev = &epf->dev;
 	struct device *cdev = epc->dev.parent;
-	struct iommu_domain *domain = iommu_get_domain_for_dev(cdev);
 	int ret;
 
 	ret = pci_epc_write_header(epc, header);
@@ -56,60 +52,29 @@
 		return ret;
 	}
 
-	epfnv->bar0_ram_page = alloc_pages(GFP_KERNEL, 1);
-	if (!epfnv->bar0_ram_page) {
-		dev_err(fdev, "alloc_pages() failed\n");
-		ret = -ENOMEM;
-		goto fail;
-	}
-	dev_info(fdev, "BAR0 RAM phys: 0x%llx\n",
-		 page_to_phys(epfnv->bar0_ram_page));
-
-	epfnv->bar0_iova = iommu_dma_alloc_iova(cdev, BAR0_SIZE,
-						cdev->coherent_dma_mask);
-	if (!epfnv->bar0_iova) {
-		dev_err(fdev, "iommu_dma_alloc_iova() failed\n");
-		ret = -ENOMEM;
-		goto fail_free_pages;
-	}
-
-	dev_info(fdev, "BAR0 RAM IOVA: 0x%08llx\n", epfnv->bar0_iova);
-
-	ret = iommu_map(domain, epfnv->bar0_iova,
-			page_to_phys(epfnv->bar0_ram_page),
-			PAGE_SIZE, IOMMU_READ | IOMMU_WRITE);
-	if (ret) {
-		dev_err(fdev, "iommu_map(RAM) failed: %d\n", ret);
-		goto fail_free_iova;
-	}
-	epfnv->bar0_ram_map = vmap(&epfnv->bar0_ram_page, 1, VM_MAP,
-				   PAGE_KERNEL);
+	epfnv->bar0_ram_map = dma_alloc_coherent(cdev, BAR0_SIZE,
+						 &epfnv->bar0_iova, GFP_KERNEL);
 	if (!epfnv->bar0_ram_map) {
-		dev_err(fdev, "vmap() failed\n");
+		dev_err(fdev, "dma_alloc_coherent() failed\n");
 		ret = -ENOMEM;
-		goto fail_unmap_ram_iova;
+		return ret;
 	}
-	dev_info(fdev, "BAR0 RAM virt: 0x%p\n", epfnv->bar0_ram_map);
+	dev_info(fdev, "BAR0 RAM IOVA: 0x%08llx\n", epfnv->bar0_iova);
 
 	ret = pci_epc_set_bar(epc, BAR_0, epfnv->bar0_iova, BAR0_SIZE,
 			      PCI_BASE_ADDRESS_SPACE_MEMORY |
 			      PCI_BASE_ADDRESS_MEM_TYPE_32);
 	if (ret) {
 		dev_err(fdev, "pci_epc_set_bar() failed: %d\n", ret);
-		goto fail_unmap_ram_virt;
+		goto fail_set_bar;
+		return ret;
 	}
 
 	return 0;
 
-fail_unmap_ram_virt:
-	vunmap(epfnv->bar0_ram_map);
-fail_unmap_ram_iova:
-	iommu_unmap(domain, epfnv->bar0_iova, PAGE_SIZE);
-fail_free_iova:
-	iommu_dma_free_iova(cdev, epfnv->bar0_iova, BAR0_SIZE);
-fail_free_pages:
-	__free_pages(epfnv->bar0_ram_page, 1);
-fail:
+fail_set_bar:
+	dma_free_coherent(cdev, BAR0_SIZE, epfnv->bar0_ram_map,
+			  epfnv->bar0_iova);
 	return ret;
 }

patch 4

Subject: [PATCH] DNI: dts: t19x: Enable DMA polling

Enable polling mechanism for DMA read/write operations instead of
interrupt mechanism for accurate perf measurements

---

diff --git a/kernel-dts/tegra194-soc/tegra194-soc-pcie.dtsi b/kernel-dts/tegra194-soc/tegra194-soc-pcie.dtsi
index d052446..00ca791 100644
--- a/kernel-dts/tegra194-soc/tegra194-soc-pcie.dtsi
+++ b/kernel-dts/tegra194-soc/tegra194-soc-pcie.dtsi
@@ -880,6 +880,7 @@
 							204000000 408000000 666000000  1066000000
 							408000000 666000000 1066000000 2133000000 >;
 
+		nvidia,dma-poll;
 		nvidia,max-speed = <4>;
 		nvidia,disable-aspm-states = <0xf>;
 		nvidia,controller-id = <&bpmp 0x5>;

And use below steps to start a test.

Go to the console of the system where PCIe IP is operating as endpoint and execute the following commands

cd /sys/kernel/config/pci_ep/
mkdir functions/pci_epf_nv_test/func1
echo 0x1AD5 > functions/pci_epf_nv_test/func1/deviceid
echo 16 > functions/pci_epf_nv_test/func1/msi_interrupts
ln -s functions/pci_epf_nv_test/func1 controllers/141a0000.pcie_ep/
echo 1 > controllers/141a0000.pcie_ep/start

NOTE:- Boot the host system only after executing all the above commands.

As soon as system boots, execute following commands to prepare the system for perf checkout

Execute following command to enable perf mode w.r.t ASPM

echo "performance" > /sys/module/pcie_aspm/parameters/policy

start the test

RP Mode DMA
In the below procedure, x being the number of the root port controller whose DMA is being used for perf checkout

Write

Go to the debugfs directory of the root port controller
     cd /sys/kernel/debug/pcie-x/
Set channel number (set it to one of 0,1,2,3)
     echo 1 > channel
Set size to 1GB
     echo 0x20000000 > size
Set source address for DMA.
          For this, grep for the string “---> Allocated memory for DMA” in dmesg log and use whatever address comes up in the grep output

     dmesg | grep " \-\-\-> Allocated memory for DMA"
          example output would be something like

     [    7.102149] tegra-pcie-dw 141a0000.pcie: ---> Allocated memory for DMA @ 0xA0000000
          So, use 0xA0000000  as the source address

     echo 0xA0000000 > src
          Note: - don’t forget to replace 0xA0000000 with your grep output value. In case it is not found in grep output, save full kernel boot log and search in it



Set destination address for DMA
          For this, execute the following command

     lspci -vv | grep -i "region 0"
          an example output would be something like

     Region 0: Memory at 1f40000000 (32-bit, non-prefetchable) 
          So, use 1f40000000  as destination address

     echo 0x1f40000000 > dst
          Note: - don’t forget to replace 0x1f40000000 with your grep output value. In case it is not found in grep output, save full kernel boot log and search in it

Execute write test
    cat write
          It prints the output in the following format

    tegra-pcie-dw 14100000.pcie_c1_rp: DMA write. Size: 536870912 bytes, Time diff: 316519776 ns
          Perf calculation:

     Perf = (Size * 8 * 1000000000)/(Time diff * 1024 * 1024 * 1024) Gbps
Read test can be performed by interchanging 'src' and 'dst' and executing 'echo read' command.

EP Mode DMA

Write
Go to the debugfs directory of the end point client driver

     cd /sys/kernel/debug/tegra_pcie_ep/
Set channel number (set it to one of 0,1,2,3)
     echo 1 > channel
Set size to 512 MB
     echo 0x20000000 > size
Set source address for EP’s DMA.
          For this, grep for the string "BAR0 RAM IOVA” in dmesg log of endpoint system console and use whatever address comes up in the grep output

     dmesg | grep "BAR0 RAM IOVA"
          an example output would be something like

     pci_epf_nv_test pci_epf_nv_test.0: BAR0 RAM IOVA: 0xe0000000
          So, use 0xe0000000  as source address

     echo 0xe0000000 > src
          Note: - don’t forget to replace 0xe0000000 with your grep output value. In case it is not found in grep output, save full kernel boot log and search in it

Set destination address for DMA
          For this, grep for the string “Allocated memory for DMA operation” in dmesg log of host system console (i.e. current system) and use whatever address comes up in the grep output

     dmesg | grep " Allocated memory for DMA operation"
          an example output would be something like

     tegra_ep_mem 0005:01:00.0: Allocated memory for DMA operation @ 0xC0000000, size=0x20000000
          So, use 0xC0000000 as the destination address

     echo 0xC0000000 > dst
          Note: - don’t forget to replace 0xC0000000 with your grep output value. In case it is not found in grep output, save full kernel boot log and search in it

Execute write test
     cat write
          It prints the output in the following format

     tegra_ep_mem 0000:01:00.0: DMA write: Size: 536870912 bytes, Time diff: 296565536 ns
          Perf calculation:

     Perf = (Size * 8 * 1000000000)/(Time diff * 1024 * 1024 * 1024) Gbps
Read test can be performed by interchanging 'src' and 'dst' and executing 'echo read' command.

Wayne,

Thank you very much. I will follow this instruction to do the test in these two days.
But I have one question here, this change you gave me is using pci_epf_nv_test, not “ethernet over PCIe”. Then when will DMA function ready for “ethernet over PCIe” to increase the bandwidth?

Thanks.

Currently not. The DMA of virtual ethernet interface may be fixed in future release.

Why do I open CONFIG_PCIE_TEGRA_DW_DMA_TEST? Does it mean the driver in 32.1 isn’t using DMA ?
Another question: If I change the code according to your instruction in pcie-tegra.c, is “Ethernet over PCIe” still working?

Thanks.

Wayne,

I tried your instruction above. In RP xavier, I run the command “lspci -vvv -s 0005:01:00.0” and can’t find information about “Region 0” whose size is 512M.

Control: I/O- Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr- Stepping- SERR- FastB2B- Dis+
        Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast >TAbort- <TAbort- <MAbort- >SERR- <PERR- I-
        Latency: 0
        Interrupt: pin A routed to IRQ 595
        Region 2: Memory at 1c00000000 (64-bit, prefetchable) 
        Region 4: Memory at <unassigned> (64-bit, non-prefetchable)
        Capabilities: <access denied>
        Kernel driver in use: tegra_ep_mem

If I change BAR0_SIZE in pci-epf-nv-test.c back to SZ_64K, I can find the Region 0. Is there something WRONG about the SZ_512M?

Please share the result of lspci -vv.

lspci -vv result in RP side:

0001:00:00.0 PCI bridge: NVIDIA Corporation Device 1ad2 (rev a1) (prog-if 00 [Normal decode])
        Control: I/O+ Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr- Stepping- SERR+ FastB2B- Dis-
        Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast >TAbort- <TAbort- <MAbort- >SERR- <PERR- I-
        Latency: 0
        Interrupt: pin A routed to IRQ 34
        Bus: primary=00, secondary=01, subordinate=ff, sec-latency=0
        I/O behind bridge: 00000000-00000fff
        Memory behind bridge: 30200000-302fffff
        Secondary status: 66MHz- FastB2B- ParErr- DEVSEL=fast >TAbort- <TAbort- <MAbort- <SERR- <PERR-
        BridgeCtl: Parity- SERR- NoISA- VGA- MAbort- >Reset- FastB2B-
                PriDiscTmr- SecDiscTmr- DiscTmrStat- DiscTmrSERREn-
        Capabilities: <access denied>
        Kernel driver in use: pcieport

0001:01:00.0 SATA controller: Marvell Technology Group Ltd. Device 9171 (rev 13) (prog-if 01 [AHCI 1.0])
        Subsystem: Marvell Technology Group Ltd. Device 9171
        Control: I/O+ Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr- Stepping- SERR- FastB2B- Dis+
        Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast >TAbort- <TAbort- <MAbort- >SERR- <PERR- I-
        Latency: 0
        Interrupt: pin A routed to IRQ 563
        Region 0: I/O ports at 100010 
        Region 1: I/O ports at 100020 
        Region 2: I/O ports at 100018 
        Region 3: I/O ports at 100024 
        Region 4: I/O ports at 100000 
        Region 5: Memory at 30210000 (32-bit, non-prefetchable) 
        Expansion ROM at 30200000 [disabled] 
        Capabilities: <access denied>
        Kernel driver in use: ahci

0005:00:00.0 PCI bridge: NVIDIA Corporation Device 1ad0 (rev a1) (prog-if 00 [Normal decode])
        Control: I/O+ Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr- Stepping- SERR+ FastB2B- Dis-
        Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast >TAbort- <TAbort- <MAbort- >SERR- <PERR- I-
        Latency: 0
        Interrupt: pin A routed to IRQ 38
        Bus: primary=00, secondary=01, subordinate=ff, sec-latency=0
        Prefetchable memory behind bridge: 0000001c00000000-0000001c000fffff
        Secondary status: 66MHz- FastB2B- ParErr- DEVSEL=fast >TAbort- <TAbort- <MAbort- <SERR- <PERR-
        BridgeCtl: Parity- SERR- NoISA- VGA- MAbort- >Reset- FastB2B-
                PriDiscTmr- SecDiscTmr- DiscTmrStat- DiscTmrSERREn-
        Capabilities: <access denied>
        Kernel driver in use: pcieport

0005:01:00.0 RAM memory: NVIDIA Corporation Device 1ad5
        Control: I/O- Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr- Stepping- SERR- FastB2B- Dis+
        Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast >TAbort- <TAbort- <MAbort- >SERR- <PERR- I-
        Latency: 0
        Interrupt: pin A routed to IRQ 595
        Region 2: Memory at 1c00000000 (64-bit, prefetchable) 
        Region 4: Memory at <unassigned> (64-bit, non-prefetchable)
        Capabilities: <access denied>
        Kernel driver in use: tegra_ep_mem

The dmesg in EP side:

BAR0 RAM IOVA: 0xfc000000

Thanks.

Hi zhuce_cgf,

That (region 0) is just an example. Is there any problem using region 2 with size=128?
Please note that these patches are just for experimental test. It is not an official solution.

I haven’t tried with size 128. But in my opinion, we should have a region whose size is 512M, because there is a code change for it.
As I told you before, if I change BAR0_SIZE back to SZ_64K, we can find region 0 whose size is 64K. So I think there must be some problem after I change BAR0_SIZE from 64K to 512M.

Thanks.

Wayne,

Sorry to bother you again. Do you have any update for my problem?

Thanks.

For the codebase you are using, please apply the following patch as well (along with all the aforementioned patches)

diff --git a/kernel-dts/tegra194-soc/tegra194-soc-pcie.dtsi b/kernel-dts/tegra194-soc/tegra194-soc-pcie.dtsi
index 96b46cf8b1bd..d052446d110a 100644
--- a/kernel-dts/tegra194-soc/tegra194-soc-pcie.dtsi
+++ b/kernel-dts/tegra194-soc/tegra194-soc-pcie.dtsi
@@ -542,8 +542,8 @@
 
 		bus-range = <0x0 0xff>;
 		ranges = <0x81000000 0x0 0x38100000 0x0 0x38100000 0x0 0x00100000      /* downstream I/O (1MB) */
-			  0x82000000 0x0 0x38200000 0x0 0x38200000 0x0 0x01E00000      /* non-prefetchable memory (30MB) */
-			  0xc2000000 0x18 0x00000000 0x18 0x00000000 0x4 0x00000000>;  /* prefetchable memory (16GB) */
+			  0x82000000 0x0 0x40000000 0x1B 0x40000000 0x0 0xC0000000     /* non-prefetchable memory (3GB) */
+			  0xc2000000 0x18 0x00000000 0x18 0x00000000 0x3 0x40000000>;  /* prefetchable memory (13GB) */
 
 		nvidia,cfg-link-cap-l1sub = <0x1c4>;
 		nvidia,cap-pl16g-status = <0x174>;
@@ -612,8 +612,8 @@
 
 		bus-range = <0x0 0xff>;
 		ranges = <0x81000000 0x0 0x30100000 0x0 0x30100000 0x0 0x00100000      /* downstream I/O (1MB) */
-			  0x82000000 0x0 0x30200000 0x0 0x30200000 0x0 0x01E00000      /* non-prefetchable memory (30MB) */
-			  0xc2000000 0x12 0x00000000 0x12 0x00000000 0x0 0x40000000>;  /* prefetchable memory (1GB) */
+			  0x82000000 0x0 0x40000000 0x12 0x30000000 0x0 0x10000000     /* non-prefetchable memory (256MB) */
+			  0xc2000000 0x12 0x00000000 0x12 0x00000000 0x0 0x30000000>;  /* prefetchable memory (768MB) */
 
 		nvidia,cfg-link-cap-l1sub = <0x194>;
 		nvidia,cap-pl16g-status = <0x164>;
@@ -681,8 +681,8 @@
 
 		bus-range = <0x0 0xff>;
 		ranges = <0x81000000 0x0 0x32100000 0x0 0x32100000 0x0 0x00100000      /* downstream I/O (1MB) */
-			  0x82000000 0x0 0x32200000 0x0 0x32200000 0x0 0x01E00000      /* non-prefetchable memory (30MB) */
-			  0xc2000000 0x12 0x40000000 0x12 0x40000000 0x0 0x40000000>;  /* prefetchable memory (1GB) */
+			  0x82000000 0x0 0x40000000 0x12 0x70000000 0x0 0x10000000     /* non-prefetchable memory (256MB) */
+			  0xc2000000 0x12 0x40000000 0x12 0x40000000 0x0 0x30000000>;  /* prefetchable memory (768MB) */
 
 		nvidia,cfg-link-cap-l1sub = <0x194>;
 		nvidia,cap-pl16g-status = <0x164>;
@@ -750,8 +750,8 @@
 
 		bus-range = <0x0 0xff>;
 		ranges = <0x81000000 0x0 0x34100000 0x0 0x34100000 0x0 0x00100000      /* downstream I/O (1MB) */
-			  0x82000000 0x0 0x34200000 0x0 0x34200000 0x0 0x01E00000      /* non-prefetchable memory (30MB) */
-			  0xc2000000 0x12 0x80000000 0x12 0x80000000 0x0 0x40000000>;  /* prefetchable memory (1GB) */
+			  0x82000000 0x0 0x40000000 0x12 0xB0000000 0x0 0x10000000     /* non-prefetchable memory (256MB) */
+			  0xc2000000 0x12 0x80000000 0x12 0x80000000 0x0 0x30000000>;  /* prefetchable memory (768MB) */
 
 		nvidia,cfg-link-cap-l1sub = <0x194>;
 		nvidia,cap-pl16g-status = <0x164>;
@@ -819,8 +819,8 @@
 
 		bus-range = <0x0 0xff>;
 		ranges = <0x81000000 0x0 0x36100000 0x0 0x36100000 0x0 0x00100000      /* downstream I/O (1MB) */
-			  0x82000000 0x0 0x36200000 0x0 0x36200000 0x0 0x01E00000      /* non-prefetchable memory (30MB) */
-			  0xc2000000 0x14 0x00000000 0x14 0x00000000 0x4 0x00000000>;  /* prefetchable memory (16GB) */
+			  0x82000000 0x0 0x40000000 0x17 0x40000000 0x0 0xC0000000      /* non-prefetchable memory (3GB) */
+			  0xc2000000 0x14 0x00000000 0x14 0x00000000 0x3 0x40000000>;  /* prefetchable memory (13GB) */
 
 		nvidia,cfg-link-cap-l1sub = <0x1b0>;
 		nvidia,cap-pl16g-status = <0x174>;
@@ -893,8 +893,8 @@
 
 		bus-range = <0x0 0xff>;
 		ranges = <0x81000000 0x0 0x3a100000 0x0 0x3a100000 0x0 0x00100000      /* downstream I/O (1MB) */
-			  0x82000000 0x0 0x3a200000 0x0 0x3a200000 0x0 0x01E00000      /* non-prefetchable memory (30MB) */
-			  0xc2000000 0x1c 0x00000000 0x1c 0x00000000 0x4 0x00000000>;  /* prefetchable memory (16GB) */
+			  0x82000000 0x0 0x40000000 0x1f 0x40000000 0x0 0xC0000000     /* non-prefetchable memory (3GB) */
+			  0xc2000000 0x1c 0x00000000 0x1c 0x00000000 0x3 0x40000000>;  /* prefetchable memory (13GB) */
 
 		nvidia,cfg-link-cap-l1sub = <0x1c4>;
 		nvidia,cap-pl16g-status = <0x174>;

Sorry for the late response.

I changed the device tree as you guys told me. And I can find the region 0 which is 512M. The dmesg is:

[  548.061426] tegra-pcie-dw 141a0000.pcie: DMA write. Size: 536870912 bytes, Time diff: 294291527 ns
[  591.932858] tegra-pcie-dw 141a0000.pcie: DMA write. Size: 536870912 bytes, Time diff: 294288508 ns

Q1: Then the bandwidth is 13.6 Gbps (per the perf formula above). But actually we have 8 lanes on 141a0000, so the bandwidth should be 5*8=40 Gbps. What’s the problem here?

I also tested the read process, but “echo read” only printed “read” in the terminal. Then I used “cat read” and the result was:

[ 1273.527652] tegra-pcie-dw 141a0000.pcie: DMA read. Size: 536870912 bytes, Time diff: 611755436 ns

Q2: The bandwidth is 6.8 Gbps, which is lower than the write bandwidth. What’s the problem here?

Q3: The last problem is that “/sys/kernel/debug/tegra_pcie_ep/” doesn’t exist when I test the EP side.

Thanks.

WayneWWW and vidyas,

I have updated my problem, please help to answer it.
Thanks.

Can you please check what is the link speed here? If you see it as Gen-1 speed, you may have to add “nvidia,max-speed = <4>;” to the controller which is operating in endpoint mode, which in this case is “pcie_ep@141a0000” node.
Regarding the read speed being lower: it is expected with a Tegra<->Tegra back-to-back connection. It’s a design limitation, and in the Tegra<->Tegra case only DMA write should be used from both sides to transfer data in both directions (instead of DMA read and write from one side).
Regarding the ‘tegra_pcie_ep’ folder not being present: please check whether the client device driver is bound to the endpoint device on the host or not (“sudo lspci -vv” should give this information). If you see that it is not bound, you may have to investigate why it is not bound (ideally it should be).

Vidyas,

How can I check what is the link speed? From device tree?

What is the client device driver you mentioned in “check whether the client device driver is bound to the endpoint device on the host or not”? Is it tegra_ep_mem in the “lspci -vv” output below?

And the result of “lspci -vv” on RC-AGX is:

0005:00:00.0 PCI bridge: NVIDIA Corporation Device 1ad0 (rev a1) (prog-if 00 [Normal decode])
	Control: I/O+ Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr- Stepping- SERR+ FastB2B- DisINTx-
	Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast >TAbort- <TAbort- <MAbort- >SERR- <PERR- INTx-
	Latency: 0
	Interrupt: pin A routed to IRQ 38
	Bus: primary=00, secondary=01, subordinate=ff, sec-latency=0
	Memory behind bridge: 40000000-6fffffff
	Prefetchable memory behind bridge: 0000001c00000000-0000001c000fffff
	Secondary status: 66MHz- FastB2B- ParErr- DEVSEL=fast >TAbort- <TAbort- <MAbort- <SERR- <PERR-
	BridgeCtl: Parity- SERR- NoISA- VGA- MAbort- >Reset- FastB2B-
		PriDiscTmr- SecDiscTmr- DiscTmrStat- DiscTmrSERREn-
	Capabilities: <access denied>
	Kernel driver in use: pcieport

0005:01:00.0 RAM memory: NVIDIA Corporation Device 1ad5
	Control: I/O- Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr- Stepping- SERR- FastB2B- DisINTx+
	Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast >TAbort- <TAbort- <MAbort- >SERR- <PERR- INTx-
	Latency: 0
	Interrupt: pin A routed to IRQ 595
	Region 0: Memory at 1f40000000 (32-bit, non-prefetchable) 
	Region 2: Memory at 1c00000000 (64-bit, prefetchable) 
	Region 4: Memory at 1f60000000 (64-bit, non-prefetchable) 
	Capabilities: <access denied>
	Kernel driver in use: tegra_ep_mem

Thanks.