pci_alloc_consistent memory leak on TX2?

Hi,

I have a custom miniPCIe card that I’m trying to get working on the Jetson TX2. This device has previously worked on a Jetson TX1. We use a proprietary DMA kernel module to stream data between the Host CPU and miniPCIe device. The kernel module uses pci_alloc_consistent and pci_free_consistent. I’ve found that in at least two BSP releases for the Jetson TX2 (4.4.15 and 4.4.38), the coherent memory seems to leak.

I’m using the kernel headers from Linux_for_Tegra_tx2 (kernel/kernel_headers.tbz2) to build my kernel module.

What I’m observing is that, after repeated insmod / rmmod cycles, pci_alloc_consistent eventually returns NULL.

I created a simplified version of the kernel module that just allocates and frees coherent PCI memory, and I see the same behavior. I verified that this simple kernel module does not fail on an x86-64 based laptop.

Is there some configuration or modification that needs to be done on the Jetson TX2’s kernel or my kernel module to make this work?

Here’s the kernel log.

nvidia@tegra-ubuntu:~$ sudo insmod ./test_pci_alloc.ko 
[sudo] password for nvidia: 
[   76.392702] test_pci_alloc: module license 'unspecified' taints kernel.
[   76.399373] Disabling lock debugging due to kernel taint
[   76.405153] test_pci_alloc 0000:01:00.0: enabling device (0000 -> 0002)
[   76.411874] pci_alloc_consistent gave p=ffffff8000e81000, handle=80000000
nvidia@tegra-ubuntu:~$ sudo rmmod test_pci_alloc 
nvidia@tegra-ubuntu:~$ sudo insmod ./test_pci_alloc.ko 
[  111.251786] pci_alloc_consistent gave p=ffffff8000ea1000, handle=80000000
nvidia@tegra-ubuntu:~$ sudo rmmod test_pci_alloc 
nvidia@tegra-ubuntu:~$ sudo insmod ./test_pci_alloc.ko 
[  114.240785] pci_alloc_consistent gave p=ffffff8000ec1000, handle=80000000
nvidia@tegra-ubuntu:~$ sudo rmmod test_pci_alloc 
nvidia@tegra-ubuntu:~$ sudo insmod ./test_pci_alloc.ko 
[  116.304506] pci_alloc_consistent gave p=ffffff8000ee1000, handle=80000000
nvidia@tegra-ubuntu:~$ sudo rmmod test_pci_alloc 
nvidia@tegra-ubuntu:~$ sudo insmod ./test_pci_alloc.ko 
[  118.143853] pci_alloc_consistent gave p=ffffff8000f01000, handle=80000000
nvidia@tegra-ubuntu:~$ sudo rmmod test_pci_alloc 
nvidia@tegra-ubuntu:~$ sudo insmod ./test_pci_alloc.ko 
[  119.760816] pci_alloc_consistent gave p=ffffff8000f21000, handle=80000000
nvidia@tegra-ubuntu:~$ sudo rmmod test_pci_alloc 
nvidia@tegra-ubuntu:~$ sudo insmod ./test_pci_alloc.ko 
[  121.478813] pci_alloc_consistent gave p=ffffff8000f41000, handle=80000000
nvidia@tegra-ubuntu:~$ sudo rmmod test_pci_alloc 
nvidia@tegra-ubuntu:~$ sudo insmod ./test_pci_alloc.ko 
[  122.973540] pci_alloc_consistent gave p=ffffff8000f61000, handle=80000000
nvidia@tegra-ubuntu:~$ sudo rmmod test_pci_alloc 
nvidia@tegra-ubuntu:~$ sudo insmod ./test_pci_alloc.ko 
[  124.684679] pci_alloc_consistent gave p=          (null), handle=ffffffffffffffff

Here’s the example kernel module source:

#include <linux/init.h>
#include <linux/device.h>
#include <linux/module.h>
#include <linux/pci.h>
#include <linux/pci_regs.h>

#define DRV_NAME                "test_pci_alloc"
#define MEM_SIZE                (128 << 10)

/*
 * PCI IDs this driver binds to: vendor 0x19AA, devices 0xE004 and 0x7021,
 * with any subsystem vendor/device.  The all-zero entry ends the table.
 */
static struct pci_device_id test_pci_alloc_tbl[] = {
    {
        .vendor    = 0x19AA,
        .device    = 0xE004,
        .subvendor = PCI_ANY_ID,
        .subdevice = PCI_ANY_ID,
    },
    {
        .vendor    = 0x19AA,
        .device    = 0x7021,
        .subvendor = PCI_ANY_ID,
        .subdevice = PCI_ANY_ID,
    },
    { 0 }
};
MODULE_DEVICE_TABLE(pci, test_pci_alloc_tbl);

/*
 * Driver-wide state for the single coherent buffer.
 * NOTE(review): these are shared by every device that matches the ID
 * table, so a second matching device would overwrite them.
 */
/* CPU address returned by pci_alloc_consistent(); NULL when no buffer is held. */
static void *p;
/* DMA handle filled in by pci_alloc_consistent() for that same buffer. */
static dma_addr_t handle;

static int test_pci_alloc_init_one ( struct pci_dev *pdev,
                                     const struct pci_device_id *ent )
{
    int i, err;

    i = pci_enable_device (pdev);
    if (i)
        return i;

    err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
    if (!err)
        err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
    if (err)
    {
        printk(KERN_WARNING DRV_NAME ": No suitable DMA available.\n");
        goto out_pci_disable_device;
    }

    pci_set_master(pdev);

    p = pci_alloc_consistent(pdev, MEM_SIZE, &handle);
    printk(KERN_INFO "pci_alloc_consistent gave p=%p, handle=%llx\n", p, handle);

    return 0;

out_pci_disable_device:
    pci_disable_device(pdev);

    return err;
}

/*
 * Remove callback: release the coherent buffer recorded in p/handle (if
 * one is held), clear p, then disable the PCI device.
 */
static void test_pci_alloc_remove_one ( struct pci_dev *pdev )
{
    if (p) {
        pci_free_consistent(pdev, MEM_SIZE, p, handle);
        /* Mark the buffer as gone so it cannot be freed twice. */
        p = NULL;
    }

    pci_disable_device(pdev);
}

/* Driver descriptor registered with the PCI core by test_pci_alloc_init(). */
static struct pci_driver test_pci_alloc_driver = {
    .name     = DRV_NAME,
    .id_table = test_pci_alloc_tbl,
    .probe    = test_pci_alloc_init_one,
    .remove   = test_pci_alloc_remove_one,
};

/* Module entry point: register the PCI driver and return the result. */
static int __init test_pci_alloc_init(void)
{
    int rc = pci_register_driver(&test_pci_alloc_driver);

    return rc;
}
/* Module exit point: unregister the PCI driver. */
static void __exit test_pci_alloc_cleanup(void)
{
    pci_unregister_driver(&test_pci_alloc_driver);
}

/* Declare the license so loading the module does not taint the kernel. */
MODULE_LICENSE("GPL");

/* Hook the load/unload handlers into the module loader. */
module_init(test_pci_alloc_init);
module_exit(test_pci_alloc_cleanup);

Any help would be appreciated.

Jeremy

We acknowledge the issue. Please try the following patch and let us know the result.

diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index 3d8cdd1..9401296 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -2729,12 +2729,11 @@
 	return NULL;
 }
 
-static void __iommu_free_atomic(struct device *dev, struct page **pages,
+static void __iommu_free_atomic(struct device *dev, void *cpu_addr,
 				dma_addr_t handle, size_t size, struct dma_attrs *attrs)
 {
-	trace_dmadebug_free_attrs(dev, handle, size, pages[0]);
 	__iommu_remove_mapping(dev, handle, size, attrs);
-	__free_from_pool(page_address(pages[0]), size);
+	__free_from_pool(cpu_addr, size);
 	dev_dbg(dev, "%s() %16llx(%zx)\n", __func__, handle, size);
 }
 
@@ -2833,7 +2832,8 @@
 	}
 
 	if (__in_atomic_pool(cpu_addr, size)) {
-		__iommu_free_atomic(dev, pages, handle, size, attrs);
+		trace_dmadebug_free_attrs(dev, handle, size, pages[0]);
+		__iommu_free_atomic(dev, cpu_addr, handle, size, attrs);
 		return;
 	}

I used the patch and it took care of the issue. Thank you for that.

Can I expect this patch to be applied in an upcoming release?

Jeremy

Yes. We will make sure that it is included in the next release.

Great! Thanks for the feedback.

Jeremy