Memory leak on kernel 6.11.0 when using cudaMallocHost

Using the code below:

#include <stdio.h>
#include <stdlib.h>

#include <cuda_runtime.h>

// Command-line and error-check helpers from the CUDA samples
// (checkCmdLineFlag, getCmdLineArgumentInt, checkCudaErrors).
#include <helper_functions.h>
#include <helper_cuda.h>

/**
 * Program main
 */
int main(int argc, char **argv) {
  unsigned long int sizeAllocate = 0;
  printf("[Matrix Multiply Using CUDA] - Starting...\n");

  if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
      checkCmdLineFlag(argc, (const char **)argv, "?")) {
    printf("Usage -device=n (n >= 0 for deviceID)\n");
    printf("      -size=k (k in KB)\n");

    exit(EXIT_SUCCESS);
  }

  if (checkCmdLineFlag(argc, (const char **)argv, "size")) {
    sizeAllocate = getCmdLineArgumentInt(argc, (const char **)argv, "size");
  }

  printf(" Allocating %lu bytes\n", sizeAllocate * 1024);

  void *array;

  // Pinned (page-locked) host allocation; this is the call that fails on
  // 6.11.x and leaves the memory accounted as Active(anon).
  checkCudaErrors(cudaMallocHost(&array, sizeAllocate * 1024));

  exit(0);
}

on kernel 6.11.[0-1] with -size=30000000 results in:

CUDA error at testMem.cu:88 code=1(cudaErrorInvalidValue) "cudaMallocHost(&array, sizeAllocate*1024)" 
[Matrix Multiply Using CUDA] - Starting...
 Allocating 30720000000 bytes
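
For anyone who wants to reproduce this without the CUDA samples helpers, here is a minimal standalone sketch of the same allocation (the ~30 GB request is hard-coded as an assumption matching -size=30000000 above):

#include <stdio.h>
#include <cuda_runtime.h>

int main(void) {
  // Same request as -size=30000000: 30000000 KB = 30720000000 bytes.
  const size_t bytes = 30000000ULL * 1024ULL;
  void *array = NULL;

  cudaError_t err = cudaMallocHost(&array, bytes);
  printf("cudaMallocHost(%zu bytes) -> %d (%s)\n",
         bytes, (int)err, cudaGetErrorString(err));

  if (err == cudaSuccess)
    cudaFreeHost(array);  // only reached if the allocation succeeded

  return 0;
}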

The memory usage (/proc/meminfo) before (1) and after (2) running the program:

6.11.1-gentoo/meminfo-1.txt:Active:         31042880 kB
6.11.1-gentoo/meminfo-1.txt:Active(anon):   30254840 kB
6.11.1-gentoo/meminfo-1.txt:Active(file):     788040 kB
6.11.1-gentoo/meminfo-2.txt:Active:         61042504 kB
6.11.1-gentoo/meminfo-2.txt:Active(anon):   60254392 kB
6.11.1-gentoo/meminfo-2.txt:Active(file):     788112 kB

On kernel 6.10.12 the program works with no errors and no extra memory use; Active(anon) stays constant.
Running Gentoo on an AMD 5900X with 64 GB RAM and an RTX 4060 Ti 16 GB video card. CUDA is 12.6.2 and the NVIDIA driver is 560.35.03.
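
If it helps to script the before/after comparison, this small C sketch (just an assumption about how one might automate the check, not part of the original test) prints the Active(anon) counter from /proc/meminfo; run it once before and once after the reproducer to see the ~30 GB delta:

#include <stdio.h>

int main(void) {
  FILE *f = fopen("/proc/meminfo", "r");
  char line[256];
  long kb;

  if (!f) {
    perror("/proc/meminfo");
    return 1;
  }
  while (fgets(line, sizeof(line), f)) {
    // Matches e.g. "Active(anon):   30254840 kB"
    if (sscanf(line, "Active(anon): %ld kB", &kb) == 1) {
      printf("Active(anon): %ld kB\n", kb);
      break;
    }
  }
  fclose(f);
  return 0;
}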

It seems to be related to folios, since bisecting the kernel identifies this patch as the cause:

commit 53ba78de064b6a45f5925947b3b45e9e833c2f8a
Author: Vivek Kasireddy <vivek.kasireddy@intel.com>
Date:   Sun Jun 23 23:36:10 2024 -0700

    mm/gup: introduce check_and_migrate_movable_folios()
    
    This helper is the folio equivalent of check_and_migrate_movable_pages().
    Therefore, all the rules that apply to check_and_migrate_movable_pages()
    also apply to this one as well.  Currently, this helper is only used by
    memfd_pin_folios().
    
    This patch also includes changes to rename and convert the internal
    functions collect_longterm_unpinnable_pages() and
    migrate_longterm_unpinnable_pages() to work on folios.  As a result,
    check_and_migrate_movable_pages() is now a wrapper around
    check_and_migrate_movable_folios().
    
    Link: https://lkml.kernel.org/r/20240624063952.1572359-3-vivek.kasireddy@intel.com
    Signed-off-by: Vivek Kasireddy <vivek.kasireddy@intel.com>
    Suggested-by: David Hildenbrand <david@redhat.com>
    Acked-by: David Hildenbrand <david@redhat.com>
    Acked-by: Dave Airlie <airlied@redhat.com>
    Acked-by: Gerd Hoffmann <kraxel@redhat.com>
    Cc: Matthew Wilcox <willy@infradead.org>
    Cc: Christoph Hellwig <hch@infradead.org>
    Cc: Jason Gunthorpe <jgg@nvidia.com>
    Cc: Peter Xu <peterx@redhat.com>
    Cc: Arnd Bergmann <arnd@arndb.de>
    Cc: Christoph Hellwig <hch@lst.de>
    Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
    Cc: Dongwon Kim <dongwon.kim@intel.com>
    Cc: Hugh Dickins <hughd@google.com>
    Cc: Junxiao Chang <junxiao.chang@intel.com>
    Cc: Mike Kravetz <mike.kravetz@oracle.com>
    Cc: Oscar Salvador <osalvador@suse.de>
    Cc: Shuah Khan <shuah@kernel.org>
    Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

diff --git a/mm/gup.c b/mm/gup.c
index fd6a5b52a8f7..d98bb199241e 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2441,19 +2441,19 @@ struct page *get_dump_page(unsigned long addr)
 
 #ifdef CONFIG_MIGRATION
 /*
- * Returns the number of collected pages. Return value is always >= 0.
+ * Returns the number of collected folios. Return value is always >= 0.
  */
-static unsigned long collect_longterm_unpinnable_pages(
-                                       struct list_head *movable_page_list,
-                                       unsigned long nr_pages,
-                                       struct page **pages)
+static unsigned long collect_longterm_unpinnable_folios(
+                                       struct list_head *movable_folio_list,
+                                       unsigned long nr_folios,
+                                       struct folio **folios)
 {
        unsigned long i, collected = 0;
        struct folio *prev_folio = NULL;
        bool drain_allow = true;
 
-       for (i = 0; i < nr_pages; i++) {
-               struct folio *folio = page_folio(pages[i]);
+       for (i = 0; i < nr_folios; i++) {
+               struct folio *folio = folios[i];
 
                if (folio == prev_folio)
                        continue;
@@ -2468,7 +2468,7 @@ static unsigned long collect_longterm_unpinnable_pages(
                        continue;
 
                if (folio_test_hugetlb(folio)) {
-                       isolate_hugetlb(folio, movable_page_list);
+                       isolate_hugetlb(folio, movable_folio_list);
                        continue;
                }
 
@@ -2480,7 +2480,7 @@ static unsigned long collect_longterm_unpinnable_pages(
                if (!folio_isolate_lru(folio))
                        continue;
 
-               list_add_tail(&folio->lru, movable_page_list);
+               list_add_tail(&folio->lru, movable_folio_list);
                node_stat_mod_folio(folio,
                                    NR_ISOLATED_ANON + folio_is_file_lru(folio),
                                    folio_nr_pages(folio));
@@ -2490,27 +2490,28 @@ static unsigned long collect_longterm_unpinnable_pages(
 }
 
 /*
- * Unpins all pages and migrates device coherent pages and movable_page_list.
- * Returns -EAGAIN if all pages were successfully migrated or -errno for failure
- * (or partial success).
+ * Unpins all folios and migrates device coherent folios and movable_folio_list.
+ * Returns -EAGAIN if all folios were successfully migrated or -errno for
+ * failure (or partial success).
  */
-static int migrate_longterm_unpinnable_pages(
-                                       struct list_head *movable_page_list,
-                                       unsigned long nr_pages,
-                                       struct page **pages)
+static int migrate_longterm_unpinnable_folios(
+                                       struct list_head *movable_folio_list,
+                                       unsigned long nr_folios,
+                                       struct folio **folios)
 {
        int ret;
        unsigned long i;
 
-       for (i = 0; i < nr_pages; i++) {
-               struct folio *folio = page_folio(pages[i]);
+       for (i = 0; i < nr_folios; i++) {
+               struct folio *folio = folios[i];
 
                if (folio_is_device_coherent(folio)) {
                        /*
-                        * Migration will fail if the page is pinned, so convert
-                        * the pin on the source page to a normal reference.
+                        * Migration will fail if the folio is pinned, so
+                        * convert the pin on the source folio to a normal
+                        * reference.
                         */
-                       pages[i] = NULL;
+                       folios[i] = NULL;
                        folio_get(folio);
                        gup_put_folio(folio, 1, FOLL_PIN);
 
@@ -2523,24 +2524,24 @@ static int migrate_longterm_unpinnable_pages(
                }
 
                /*
-                * We can't migrate pages with unexpected references, so drop
+                * We can't migrate folios with unexpected references, so drop
                 * the reference obtained by __get_user_pages_locked().
-                * Migrating pages have been added to movable_page_list after
+                * Migrating folios have been added to movable_folio_list after
                 * calling folio_isolate_lru() which takes a reference so the
-                * page won't be freed if it's migrating.
+                * folio won't be freed if it's migrating.
                 */
-               unpin_user_page(pages[i]);
-               pages[i] = NULL;
+               unpin_folio(folios[i]);
+               folios[i] = NULL;
        }
 
-       if (!list_empty(movable_page_list)) {
+       if (!list_empty(movable_folio_list)) {
                struct migration_target_control mtc = {
                        .nid = NUMA_NO_NODE,
                        .gfp_mask = GFP_USER | __GFP_NOWARN,
                        .reason = MR_LONGTERM_PIN,
                };
 
-               if (migrate_pages(movable_page_list, alloc_migration_target,
+               if (migrate_pages(movable_folio_list, alloc_migration_target,
                                  NULL, (unsigned long)&mtc, MIGRATE_SYNC,
                                  MR_LONGTERM_PIN, NULL)) {
                        ret = -ENOMEM;
@@ -2548,48 +2549,71 @@ static int migrate_longterm_unpinnable_pages(
                }
        }
 
-       putback_movable_pages(movable_page_list);
+       putback_movable_pages(movable_folio_list);
 
        return -EAGAIN;
 
 err:
-       for (i = 0; i < nr_pages; i++)
-               if (pages[i])
-                       unpin_user_page(pages[i]);
-       putback_movable_pages(movable_page_list);
+       unpin_folios(folios, nr_folios);
+       putback_movable_pages(movable_folio_list);
 
        return ret;
 }
 
 /*
- * Check whether all pages are *allowed* to be pinned. Rather confusingly, all
- * pages in the range are required to be pinned via FOLL_PIN, before calling
- * this routine.
+ * Check whether all folios are *allowed* to be pinned indefinitely (longterm).
+ * Rather confusingly, all folios in the range are required to be pinned via
+ * FOLL_PIN, before calling this routine.
  *
- * If any pages in the range are not allowed to be pinned, then this routine
- * will migrate those pages away, unpin all the pages in the range and return
+ * If any folios in the range are not allowed to be pinned, then this routine
+ * will migrate those folios away, unpin all the folios in the range and return
  * -EAGAIN. The caller should re-pin the entire range with FOLL_PIN and then
  * call this routine again.
  *
  * If an error other than -EAGAIN occurs, this indicates a migration failure.
  * The caller should give up, and propagate the error back up the call stack.
  *
- * If everything is OK and all pages in the range are allowed to be pinned, then
- * this routine leaves all pages pinned and returns zero for success.
+ * If everything is OK and all folios in the range are allowed to be pinned,
+ * then this routine leaves all folios pinned and returns zero for success.
  */
-static long check_and_migrate_movable_pages(unsigned long nr_pages,
-                                           struct page **pages)
+static long check_and_migrate_movable_folios(unsigned long nr_folios,
+                                            struct folio **folios)
 {
        unsigned long collected;
-       LIST_HEAD(movable_page_list);
+       LIST_HEAD(movable_folio_list);
 
-       collected = collect_longterm_unpinnable_pages(&movable_page_list,
-                                               nr_pages, pages);
+       collected = collect_longterm_unpinnable_folios(&movable_folio_list,
+                                                      nr_folios, folios);
        if (!collected)
                return 0;
 
-       return migrate_longterm_unpinnable_pages(&movable_page_list, nr_pages,
-                                               pages);
+       return migrate_longterm_unpinnable_folios(&movable_folio_list,
+                                                 nr_folios, folios);
+}
+
+/*
+ * This routine just converts all the pages in the @pages array to folios and
+ * calls check_and_migrate_movable_folios() to do the heavy lifting.
+ *
+ * Please see the check_and_migrate_movable_folios() documentation for details.
+ */
+static long check_and_migrate_movable_pages(unsigned long nr_pages,
+                                           struct page **pages)
+{
+       struct folio **folios;
+       long i, ret;
+
+       folios = kmalloc_array(nr_pages, sizeof(*folios), GFP_KERNEL);
+       if (!folios)
+               return -ENOMEM;
+
+       for (i = 0; i < nr_pages; i++)
+               folios[i] = page_folio(pages[i]);
+
+       ret = check_and_migrate_movable_folios(nr_pages, folios);
+
+       kfree(folios);
+       return ret;
 }
 #else
 static long check_and_migrate_movable_pages(unsigned long nr_pages,
@@ -2597,6 +2621,12 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages,
 {
        return 0;
 }
+
+static long check_and_migrate_movable_folios(unsigned long nr_folios,
+                                            struct folio **folios)
+{
+       return 0;
+}
 #endif /* CONFIG_MIGRATION */
 
 /*

I suggest filing a bug.

This issue is mapped to internal ticket ID 4892957.
The issue is resolved by a Linux kernel patch, which is now in the mainline kernel.
