Time has passed, and since CUDA 12.2 it has been possible to create pinned host allocations and to specify host access to allocations using the virtual memory management API. The driver shipped with the recent CUDA 12.4 release contains a fix for this feature.
I am no longer sure what my intention was back then, but with the current API and the driver fix it is now possible to have a single contiguous device memory address range that is backed partly by device memory and partly by pinned host memory.
Below is an example of a 2 GB allocation with 1 GB residing on the device and 1 GB on the host.
//nvcc -O3 -std=c++17 main.cu -lcuda -o main
#include <cuda.h>
#include <cassert>
#include <cstring>   //memset
#include <algorithm> //std::max
#include <iostream>
#include <vector>
#include <thrust/fill.h>
#include <thrust/execution_policy.h>
int main(){
//allocate a contiguous 2 GB buffer where 1 GB resides on GPU 0 and 1 GB resides on the host
//API requires CUDA 12.2; the driver bug is fixed with CUDA 12.4 / driver 550.54.14 (Linux)
constexpr size_t GB = 1 << 30;
cudaSetDevice(0); //initialize cuda context
CUresult status = CUDA_SUCCESS;
CUmemAllocationProp prop;
memset(&prop, 0, sizeof(CUmemAllocationProp));
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
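//query the minimum allocation granularity for both the device and the host location;
//allocation sizes and mapping offsets must be multiples of the larger of the two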
size_t granularityDevice = 0;
size_t granularityHost = 0;
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
prop.location.id = 0;
status = cuMemGetAllocationGranularity(&granularityDevice, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
assert(status == CUDA_SUCCESS);
prop.location.type = CU_MEM_LOCATION_TYPE_HOST;
prop.location.id = 0;
status = cuMemGetAllocationGranularity(&granularityHost, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
assert(status == CUDA_SUCCESS);
size_t granularity = std::max(granularityDevice, granularityHost);
const size_t allocationSize = 2*GB;
assert(GB % granularity == 0);
assert(allocationSize % granularity == 0);
CUdeviceptr deviceptr = 0;
CUmemGenericAllocationHandle allocationHandle;
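//reserve a contiguous 2 GB virtual address range; no physical memory is allocated yet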
status = cuMemAddressReserve(&deviceptr, allocationSize, 0, 0, 0);
assert(status == CUDA_SUCCESS);
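//create 1 GB of physical device memory on GPU 0 and map it into the first half of the range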
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
prop.location.id = 0;
status = cuMemCreate(&allocationHandle, GB, &prop, 0);
assert(status == CUDA_SUCCESS);
status = cuMemMap(deviceptr, GB, 0, allocationHandle, 0);
assert(status == CUDA_SUCCESS);
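//the handle can be released right away; the mapping keeps the physical allocation alive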
status = cuMemRelease(allocationHandle);
assert(status == CUDA_SUCCESS);
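//create 1 GB of pinned host memory on NUMA node 0 and map it into the second half of the range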
prop.location.type = CU_MEM_LOCATION_TYPE_HOST_NUMA;
prop.location.id = 0;
status = cuMemCreate(&allocationHandle, GB, &prop, 0);
assert(status == CUDA_SUCCESS);
status = cuMemMap(deviceptr + GB, GB, 0, allocationHandle, 0);
assert(status == CUDA_SUCCESS);
status = cuMemRelease(allocationHandle);
assert(status == CUDA_SUCCESS);
//set access control such that both chunks, the device-backed and the host-backed one,
//are accessible from device 0 only (no direct CPU access is granted here)
std::vector<CUmemAccessDesc> accessDescriptors(1);
accessDescriptors[0].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
accessDescriptors[0].location.id = 0;
accessDescriptors[0].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
status = cuMemSetAccess(deviceptr, GB, accessDescriptors.data(), 1);
assert(status == CUDA_SUCCESS);
status = cuMemSetAccess(deviceptr + GB, GB, accessDescriptors.data(), 1);
assert(status == CUDA_SUCCESS);
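//from the device's point of view, the whole 2 GB range now behaves like one contiguous allocation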
char* d_data = (char*)deviceptr;
char* h_data = nullptr;
cudaMallocHost(&h_data, 2*GB); //ordinary pinned staging buffer used by the benchmarks below
//drivers older than 550.54.14 may erroneously report errors for the following calls
cudaError_t rtstatus = cudaSuccess;
rtstatus = cudaMemset(d_data, 0, GB);
std::cout << "cudaMemset device chunk: " << cudaGetErrorString(rtstatus) << "\n";
cudaGetLastError();
rtstatus = cudaMemset(d_data + GB, 0, GB);
std::cout << "cudaMemset host chunk: " << cudaGetErrorString(rtstatus) << "\n";
cudaGetLastError();
rtstatus = cudaMemset(d_data, 0, 2*GB);
std::cout << "cudaMemset full allocation: " << cudaGetErrorString(rtstatus) << "\n";
cudaGetLastError();
rtstatus = cudaMemcpy(d_data, h_data, GB, cudaMemcpyHostToDevice);
std::cout << "cudaMemcpy to device chunk: " << cudaGetErrorString(rtstatus) << "\n";
cudaGetLastError();
rtstatus = cudaMemcpy(d_data, h_data, 2*GB, cudaMemcpyHostToDevice);
std::cout << "cudaMemcpy to full allocation: " << cudaGetErrorString(rtstatus) << "\n";
cudaGetLastError();
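//time each operation; cudaEventElapsedTime reports milliseconds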
cudaEvent_t eventA, eventB;
cudaEventCreate(&eventA);
cudaEventCreate(&eventB);
float elapsed;
cudaEventRecord(eventA);
cudaMemcpy(d_data, h_data, GB, cudaMemcpyHostToDevice);
cudaEventRecord(eventB);
cudaEventSynchronize(eventB);
cudaEventElapsedTime(&elapsed, eventA, eventB);
std::cout << "cudaMemcpy to device chunk: " << elapsed << " ms\n";
cudaEventRecord(eventA);
cudaMemcpy(d_data + GB, h_data, GB, cudaMemcpyHostToDevice);
cudaEventRecord(eventB);
cudaEventSynchronize(eventB);
cudaEventElapsedTime(&elapsed, eventA, eventB);
std::cout << "cudaMemcpy to host chunk: " << elapsed << " ms\n";
cudaEventRecord(eventA);
cudaMemcpy(d_data, h_data, 2*GB, cudaMemcpyHostToDevice);
cudaEventRecord(eventB);
cudaEventSynchronize(eventB);
cudaEventElapsedTime(&elapsed, eventA, eventB);
std::cout << "cudaMemcpy to both: " << elapsed << " ms\n";
cudaEventRecord(eventA);
cudaMemset(d_data, 0, GB);
cudaEventRecord(eventB);
cudaEventSynchronize(eventB);
cudaEventElapsedTime(&elapsed, eventA, eventB);
std::cout << "cudaMemset device chunk: " << elapsed << " ms\n";
cudaEventRecord(eventA);
cudaMemset(d_data + GB, 0, GB);
cudaEventRecord(eventB);
cudaEventSynchronize(eventB);
cudaEventElapsedTime(&elapsed, eventA, eventB);
std::cout << "cudaMemset host chunk: " << elapsed << " ms\n";
cudaEventRecord(eventA);
cudaMemset(d_data, 0, 2*GB);
cudaEventRecord(eventB);
cudaEventSynchronize(eventB);
cudaEventElapsedTime(&elapsed, eventA, eventB);
std::cout << "cudaMemset both: " << elapsed << " ms\n";
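//kernels work across the chunk boundary as well; thrust::fill launches a fill kernel over the given range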
{
int* begin = (int*)d_data;
int* mid = (int*)(d_data + GB);
int* end = (int*)(d_data + 2*GB);
cudaEventRecord(eventA);
thrust::fill(thrust::cuda::par_nosync, begin, mid, 0);
cudaEventRecord(eventB);
cudaEventSynchronize(eventB);
cudaEventElapsedTime(&elapsed, eventA, eventB);
std::cout << "thrust::fill device chunk: " << elapsed << " ms\n";
cudaEventRecord(eventA);
thrust::fill(thrust::cuda::par_nosync, mid, end, 0);
cudaEventRecord(eventB);
cudaEventSynchronize(eventB);
cudaEventElapsedTime(&elapsed, eventA, eventB);
std::cout << "thrust::fill host chunk: " << elapsed << " ms\n";
cudaEventRecord(eventA);
thrust::fill(thrust::cuda::par_nosync, begin, end, 0);
cudaEventRecord(eventB);
cudaEventSynchronize(eventB);
cudaEventElapsedTime(&elapsed, eventA, eventB);
std::cout << "thrust::fill both: " << elapsed << " ms\n";
}
cudaFreeHost(h_data);
cudaEventDestroy(eventA);
cudaEventDestroy(eventB);
status = cuMemUnmap(deviceptr, allocationSize);
assert(status == CUDA_SUCCESS);
status = cuMemAddressFree(deviceptr, allocationSize);
assert(status == CUDA_SUCCESS);
}
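As an aside: since host access is also possible through this API, the access descriptors could presumably be extended so that the host-backed chunk is readable and writable from the CPU as well. The following is an untested sketch, not something I have verified: it assumes cuMemSetAccess accepts CU_MEM_LOCATION_TYPE_HOST_NUMA in an access descriptor (as documented for CUDA 12.2 and newer), and the hostAccess array name is mine. It would replace the second cuMemSetAccess call in the example above.
//hedged sketch (untested): grant both GPU 0 and the CPU access to the host-backed chunk
CUmemAccessDesc hostAccess[2];
memset(hostAccess, 0, sizeof(hostAccess));
hostAccess[0].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
hostAccess[0].location.id = 0;
hostAccess[0].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
hostAccess[1].location.type = CU_MEM_LOCATION_TYPE_HOST_NUMA; //assumption: valid in an access descriptor
hostAccess[1].location.id = 0; //NUMA node 0
hostAccess[1].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
status = cuMemSetAccess(deviceptr + GB, GB, hostAccess, 2);
assert(status == CUDA_SUCCESS);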