Driver API: Is it possible to create a memory mapping which can be accessed by the host?

Hi,
I would like to use the virtual memory management API to create a contiguous virtual address range that is also accessible from the host. However, I am not sure whether this is possible.

I am already able to create a device memory range which can be used successfully in kernel calls.
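For context, this is roughly the working device-only path (a minimal sketch: device_ holds the GPU ordinal, CUDADRV_SAFE_CALL is the same error-checking macro as in the snippets below, and size is assumed to already be padded to a multiple of the reported granularity):

        CUmemAllocationProp prop = {};
        prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
        prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
        prop.location.id = device_;

        //size must be a multiple of this granularity
        size_t granularity = 0;
        CUDADRV_SAFE_CALL( cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM) );

        //reserve a virtual address range, create physical memory, and map it
        CUdeviceptr ptr = 0;
        CUmemGenericAllocationHandle handle;
        CUDADRV_SAFE_CALL( cuMemAddressReserve(&ptr, size, 0, 0, 0) );
        CUDADRV_SAFE_CALL( cuMemCreate(&handle, size, &prop, 0) );
        CUDADRV_SAFE_CALL( cuMemMap(ptr, size, 0, handle, 0) );

        //grant the device read/write access to the mapped range
        CUmemAccessDesc desc = {};
        desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
        desc.location.id = device_;
        desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
        CUDADRV_SAFE_CALL( cuMemSetAccess(ptr, size, &desc, 1) );

To get a host mapping, I tried the following approaches, using the device id CU_DEVICE_CPU: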

Option 1: Create host memory → CUDA_ERROR_INVALID_DEVICE

        CUmemAllocationProp prop = {};
        prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
        prop.location.id = CU_DEVICE_CPU;
        prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
        prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
        prop.win32HandleMetaData = nullptr;

        CUDADRV_SAFE_CALL( cuMemCreate(&handle, size, &prop, 0) );

Option 2: Allow both host and device to access device memory → CUDA_ERROR_INVALID_VALUE

        CUmemAccessDesc desc[2];
        desc[0].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
        desc[0].location.id = device_;
        desc[0].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
        desc[1].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
        desc[1].location.id = CU_DEVICE_CPU;
        desc[1].location.type = CU_MEM_LOCATION_TYPE_DEVICE;

        CUDADRV_SAFE_CALL( cuMemSetAccess(ptr, size, &desc[0], 2) );

Are there any other options? Is it possible at all to get this working?

Time has passed, and since CUDA 12.2 (I believe) it is possible to create pinned host allocations and to specify host access to allocations with the virtual memory management API. The driver shipped with the recent CUDA 12.4 release contains a fix for this feature.
I am no longer sure what my intention was back then, but with the current API and the driver fix it is now possible to have a single contiguous virtual address range that is backed by both device memory and pinned host memory.
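For reference, a minimal sketch of the CUDA 12.2+ host-side additions as I understand the updated API (error handling omitted; handle, ptr, and size as in the full example below; using NUMA node 0 is an assumption for a single-socket machine):

    //create pinned host memory through the VMM API (CUDA >= 12.2)
    CUmemAllocationProp prop = {};
    prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
    prop.location.type = CU_MEM_LOCATION_TYPE_HOST_NUMA;
    prop.location.id = 0; //NUMA node id, assumed 0 here
    cuMemCreate(&handle, size, &prop, 0);

    //grant the host itself access to an already mapped range
    CUmemAccessDesc desc = {};
    desc.location.type = CU_MEM_LOCATION_TYPE_HOST_NUMA;
    desc.location.id = 0;
    desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    cuMemSetAccess(ptr, size, &desc, 1);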

Below is an example of a 2 GB allocation, with 1 GB residing on the device and 1 GB on the host.

//nvcc -O3 -std=c++17 main.cu -lcuda -o main
#include <cuda.h>

#include <algorithm>
#include <cassert>
#include <cstring>
#include <iostream>
#include <vector>

#include <thrust/fill.h>
#include <thrust/execution_policy.h>

int main(){
    //allocate a contiguous 2 GB buffer where 1 GB resides on GPU 0 and 1 GB resides on the host
    //the API requires CUDA 12.2; the driver bug is fixed with CUDA 12.4 / driver 550.54.14 (Linux)

    constexpr size_t GB = 1 << 30;

    cudaSetDevice(0); //initialize cuda context

    CUresult status = CUDA_SUCCESS;
    CUmemAllocationProp prop;
    memset(&prop, 0, sizeof(CUmemAllocationProp));
    prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;

    size_t granularityDevice = 0;
    size_t granularityHost = 0;
    prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    prop.location.id = 0;
    status = cuMemGetAllocationGranularity(&granularityDevice, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
    assert(status == CUDA_SUCCESS);

    prop.location.type = CU_MEM_LOCATION_TYPE_HOST;
    prop.location.id = 0;
    status = cuMemGetAllocationGranularity(&granularityHost, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
    assert(status == CUDA_SUCCESS);

    size_t granularity = std::max(granularityDevice, granularityHost);

    const size_t allocationSize = 2*GB;
    assert(GB % granularity == 0);
    assert(allocationSize % granularity == 0);

    CUdeviceptr deviceptr = 0;
    CUmemGenericAllocationHandle allocationHandle;

    status = cuMemAddressReserve(&deviceptr, allocationSize, 0, 0, 0);
    assert(status == CUDA_SUCCESS);

    prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    prop.location.id = 0;
    status = cuMemCreate(&allocationHandle, GB, &prop, 0);
    assert(status == CUDA_SUCCESS);
    status = cuMemMap(deviceptr, GB, 0, allocationHandle, 0);
    assert(status == CUDA_SUCCESS);
    status = cuMemRelease(allocationHandle);
    assert(status == CUDA_SUCCESS);

    prop.location.type = CU_MEM_LOCATION_TYPE_HOST_NUMA;
    prop.location.id = 0;
    status = cuMemCreate(&allocationHandle, GB, &prop, 0);
    assert(status == CUDA_SUCCESS);
    status = cuMemMap(deviceptr + GB, GB, 0, allocationHandle, 0);
    assert(status == CUDA_SUCCESS);
    status = cuMemRelease(allocationHandle);
    assert(status == CUDA_SUCCESS);


    //set access control such that both the device chunk and the host chunk
    //are accessible only from the device
    std::vector<CUmemAccessDesc> accessDescriptors(1);
    accessDescriptors[0].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    accessDescriptors[0].location.id = 0;
    accessDescriptors[0].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    status = cuMemSetAccess(deviceptr, GB, accessDescriptors.data(), 1);
    assert(status == CUDA_SUCCESS);
    status = cuMemSetAccess(deviceptr + GB, GB, accessDescriptors.data(), 1);
    assert(status == CUDA_SUCCESS);


    char* d_data = (char*)deviceptr;
    char* h_data; cudaMallocHost(&h_data, sizeof(char) * 2*GB);

    //drivers older than 550.54.14 may report errors for the following operations
    cudaError_t rtstatus = cudaSuccess;
    rtstatus = cudaMemset(d_data, 0, GB);
    std::cout << "cudaMemset device chunk: " << cudaGetErrorString(rtstatus) << "\n";
    cudaGetLastError();

    rtstatus = cudaMemset(d_data + GB, 0, GB);
    std::cout << "cudaMemset host chunk: " << cudaGetErrorString(rtstatus) << "\n";
    cudaGetLastError();

    rtstatus = cudaMemset(d_data, 0, 2*GB);
    std::cout << "cudaMemset full allocation: " << cudaGetErrorString(rtstatus) << "\n";
    cudaGetLastError();

    rtstatus = cudaMemcpy(d_data, h_data, GB, cudaMemcpyHostToDevice);
    std::cout << "cudaMemcpy to device chunk: " << cudaGetErrorString(rtstatus) << "\n";
    cudaGetLastError();

    rtstatus = cudaMemcpy(d_data, h_data, 2*GB, cudaMemcpyHostToDevice);
    std::cout << "cudaMemcpy to full allocation: " << cudaGetErrorString(rtstatus) << "\n";
    cudaGetLastError();

    cudaEvent_t eventA, eventB;
    cudaEventCreate(&eventA);
    cudaEventCreate(&eventB);
    float elapsed;

    cudaEventRecord(eventA);
    cudaMemcpy(d_data, h_data, GB, cudaMemcpyHostToDevice);
    cudaEventRecord(eventB);
    cudaEventSynchronize(eventB);
    cudaEventElapsedTime(&elapsed, eventA, eventB);
    std::cout << "cudaMemcpy to device chunk: " << elapsed << "\n";

    cudaEventRecord(eventA);
    cudaMemcpy(d_data + GB, h_data, GB, cudaMemcpyHostToDevice);
    cudaEventRecord(eventB);
    cudaEventSynchronize(eventB);
    cudaEventElapsedTime(&elapsed, eventA, eventB);
    std::cout << "cudaMemcpy to host chunk: " << elapsed << "\n";

    cudaEventRecord(eventA);
    cudaMemcpy(d_data, h_data, 2*GB, cudaMemcpyHostToDevice);
    cudaEventRecord(eventB);
    cudaEventSynchronize(eventB);
    cudaEventElapsedTime(&elapsed, eventA, eventB);
    std::cout << "cudaMemcpy to both: " << elapsed << "\n";

    cudaEventRecord(eventA);
    cudaMemset(d_data, 0, GB);
    cudaEventRecord(eventB);
    cudaEventSynchronize(eventB);
    cudaEventElapsedTime(&elapsed, eventA, eventB);
    std::cout << "cudaMemset device chunk: " << elapsed << "\n";

    cudaEventRecord(eventA);
    cudaMemset(d_data + GB, 0, GB);
    cudaEventRecord(eventB);
    cudaEventSynchronize(eventB);
    cudaEventElapsedTime(&elapsed, eventA, eventB);
    std::cout << "cudaMemset host chunk: " << elapsed << "\n";

    cudaEventRecord(eventA);
    cudaMemset(d_data, 0, 2*GB);
    cudaEventRecord(eventB);
    cudaEventSynchronize(eventB);
    cudaEventElapsedTime(&elapsed, eventA, eventB);
    std::cout << "cudaMemset both: " << elapsed << "\n";

    {
        int* begin = (int*)d_data;
        int* mid = (int*)(d_data + GB);
        int* end = (int*)(d_data + 2*GB);

        cudaEventRecord(eventA);
        thrust::fill(thrust::cuda::par_nosync, begin, mid, 0);
        cudaEventRecord(eventB);
        cudaEventSynchronize(eventB);
        cudaEventElapsedTime(&elapsed, eventA, eventB);
        std::cout << "thrust::fill device chunk: " << elapsed << "\n";

        cudaEventRecord(eventA);
        thrust::fill(thrust::cuda::par_nosync, mid, end, 0);
        cudaEventRecord(eventB);
        cudaEventSynchronize(eventB);
        cudaEventElapsedTime(&elapsed, eventA, eventB);
        std::cout << "thrust::fill host chunk: " << elapsed << "\n";

        cudaEventRecord(eventA);
        thrust::fill(thrust::cuda::par_nosync, begin, end, 0);
        cudaEventRecord(eventB);
        cudaEventSynchronize(eventB);
        cudaEventElapsedTime(&elapsed, eventA, eventB);
        std::cout << "thrust::fill both: " << elapsed << "\n";
    }

    //clean up events and the pinned staging buffer, then tear down the mapping
    cudaEventDestroy(eventA);
    cudaEventDestroy(eventB);
    cudaFreeHost(h_data);

    status = cuMemUnmap(deviceptr, allocationSize);
    assert(status == CUDA_SUCCESS);
    status = cuMemAddressFree(deviceptr, allocationSize);
    assert(status == CUDA_SUCCESS);
}
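Since the feature only works reliably from CUDA 12.4 / driver 550.54.14 onward, it may be worth guarding it at runtime. A small sketch using cuDriverGetVersion (treating 12040 as the minimum supported version is my assumption based on the versions above):

    int driverVersion = 0;
    cuDriverGetVersion(&driverVersion); //e.g. 12040 for a CUDA 12.4 driver
    if(driverVersion < 12040){
        std::cerr << "Driver too old for host-backed VMM mappings, use a fallback path\n";
    }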