Using the CUDA virtual memory API for host allocation

How should I use the virtual memory management API to allocate memory on the host? I found the CU_MEM_LOCATION_TYPE_HOST flag, but when I use it, cuMemCreate returns an invalid argument error.
The following is my code; it works fine when the flag is CU_MEM_LOCATION_TYPE_DEVICE.

#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <vector>
#include <numeric> // For std::iota

#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        CUresult err = call;                                              \
        if (err != CUDA_SUCCESS) {                                        \
            const char *err_str;                                          \
            cuGetErrorString(err, &err_str);                             \
            std::cerr << "CUDA Error in " << __FILE__ << ":" << __LINE__ \
                      << " (" << #call << "): " << err_str << std::endl; \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)

// Helper function to get allocation properties
static CUmemAllocationProp getAllocationProp() {
    CUmemAllocationProp prop = {};
    prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
    prop.location.type = CU_MEM_LOCATION_TYPE_HOST; // works when set to CU_MEM_LOCATION_TYPE_DEVICE
    prop.location.id = 0; 
    return prop;
}

// Helper function to get access descriptor
static CUmemAccessDesc getAccessDesc() {
    CUmemAccessDesc accessDesc = {};
    accessDesc.location.type = CU_MEM_LOCATION_TYPE_HOST; // works when set to CU_MEM_LOCATION_TYPE_DEVICE
    accessDesc.location.id = 0; // device 0 when the location type is DEVICE
    accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    return accessDesc;
}

__global__ void write(char *ptr, int len) {
    auto thread_id = blockIdx.x * blockDim.x + threadIdx.x;
    if (thread_id < len) {
        ptr[thread_id] = thread_id % 256; // Write some data
    }
}

__global__ void read(char *ptr, int len) {
    if (blockIdx.x == 0 && threadIdx.x < 32) {
        printf("Data at index %d: %d\n", threadIdx.x, ptr[threadIdx.x]);
    }
}


int main() {
    CUDA_CHECK(cuInit(0));

    size_t pageSize = 0;
    CUmemAllocationProp prop = getAllocationProp();
    CUDA_CHECK(cuMemGetAllocationGranularity(&pageSize, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
    std::cout << "Minimum allocation granularity (pageSize): " << pageSize << " bytes" << std::endl;

    // 1. Allocate physical memory handles
    size_t size1 = pageSize * 4; // 4 pages
    size_t size2 = pageSize * 8; // 8 pages
    CUmemGenericAllocationHandle physHandle1, physHandle2;

    std::cout << "Allocating physical memory chunk 1 of size: " << size1 << std::endl;
    CUDA_CHECK(cuMemCreate(&physHandle1, size1, &prop, 0));
    std::cout << "Allocating physical memory chunk 2 of size: " << size2 << std::endl;
    CUDA_CHECK(cuMemCreate(&physHandle2, size2, &prop, 0));

    // 2. Reserve virtual address ranges
    CUdeviceptr va1;
    size_t va1_size = size1 + size2; // Combined size for va1

    std::cout << "Reserving VA range 1 of size: " << va1_size << std::endl;
    CUDA_CHECK(cuMemAddressReserve(&va1, va1_size, pageSize, 0, 0));


    // 3. Map physical memory to virtual addresses
    CUmemAccessDesc accessDesc = getAccessDesc();

    std::cout << "Mapping physHandle1 to va1" << std::endl;
    CUDA_CHECK(cuMemMap(va1, size1, 0, physHandle1, 0));
    CUDA_CHECK(cuMemSetAccess(va1, size1, &accessDesc, 1));

    std::cout << "Mapping physHandle2 to va1 + size1" << std::endl;
    CUDA_CHECK(cuMemMap(va1 + size1, size2, 0, physHandle2, 0));
    CUDA_CHECK(cuMemSetAccess(va1 + size1, size2, &accessDesc, 1));

    // now we try to use va1 as a single contiguous VA
    cudaMemset((void*)va1, 0, size1 + size2); // Initialize the entire VA range to zero
    std::cout << "Initialized VA range 1 to zero" << std::endl;

    auto grid_size = (size1 + size2 + 1023) / 1024; // Calculate grid size for kernel launch
    write<<<grid_size, 1024>>>(reinterpret_cast<char *>(va1), size1 + size2);
    read<<<1, 32>>>(reinterpret_cast<char *>(va1), size1 + size2);

    cudaDeviceSynchronize();


    return 0;
}

The VMM API is for allocating device memory, similar to what is created/produced by cudaMalloc. It is not intended for allocating host memory.

The documentation for cuMemCreate states:

This creates a memory allocation on the target device

Yes, but it also says:

To create a CPU allocation targeting a specific host NUMA node, applications must set CUmemAllocationProp::CUmemLocation::type to CU_MEM_LOCATION_TYPE_HOST_NUMA and CUmemAllocationProp::CUmemLocation::id must specify the NUMA ID of the CPU. On systems where NUMA is not available CUmemAllocationProp::CUmemLocation::id must be set to 0.
So I think host allocation should be OK?

I see. Right after that it says:

Specifying CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT or CU_MEM_LOCATION_TYPE_HOST as the CUmemLocation::type will result in CUDA_ERROR_INVALID_VALUE.

Your excerpt says:

applications must set CUmemAllocationProp::CUmemLocation::type to CU_MEM_LOCATION_TYPE_HOST_NUMA and CUmemAllocationProp::CUmemLocation::id must specify the NUMA ID of the CPU. On systems where NUMA is not available CUmemAllocationProp::CUmemLocation::id must be set to 0.

Did you try that instead? (Right now you are specifying CU_MEM_LOCATION_TYPE_HOST and the docs explicitly state that will result in CUDA_ERROR_INVALID_VALUE)
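
Something like this is what I mean (an untested sketch that only changes your getAllocationProp() helper; id = 0 follows the doc excerpt for a system without NUMA, otherwise pass the NUMA node ID of your CPU):

static CUmemAllocationProp getAllocationProp() {
    CUmemAllocationProp prop = {};
    prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
    // The docs ask for CU_MEM_LOCATION_TYPE_HOST_NUMA for CPU allocations;
    // CU_MEM_LOCATION_TYPE_HOST is explicitly rejected by cuMemCreate.
    prop.location.type = CU_MEM_LOCATION_TYPE_HOST_NUMA;
    prop.location.id = 0; // NUMA node ID of the CPU (0 when NUMA is not available)
    return prop;
}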

No, I hadn't tried that before, but I just did as you suggested and set it to CU_MEM_LOCATION_TYPE_HOST_NUMA with the appropriate NUMA ID, and it worked successfully. Thank you for your advice! I had overlooked that only that flag is allowed in this case.

Now I have another question: is the memory allocated with this method pinned, given that the allocation type is CU_MEM_ALLOCATION_TYPE_PINNED? Is it similar to memory allocated with cudaMallocHost, in that it is pinned and does not need a redundant staging copy when doing async transfers to the GPU?
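
To make the question concrete, this is roughly how I hope to use it (just a sketch, reusing va1, size1 and size2 from the code above; devBuf and stream are illustrative names, not part of my current program):

// Hoped-for usage: treat the HOST_NUMA-backed va1 like a cudaMallocHost
// buffer, i.e. get a truly asynchronous H2D copy with no extra staging copy.
void *devBuf = nullptr;
cudaMalloc(&devBuf, size1 + size2);

cudaStream_t stream;
cudaStreamCreate(&stream);

cudaMemcpyAsync(devBuf, reinterpret_cast<void *>(va1), size1 + size2,
                cudaMemcpyHostToDevice, stream);
cudaStreamSynchronize(stream);

cudaStreamDestroy(stream);
cudaFree(devBuf);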

Thanks again for your help!