Time has passed, and since CUDA 12.2 it has been possible to create pinned host allocations and to specify host access to allocations using the virtual memory management API. The driver shipped with the recent CUDA 12.4 release contains a fix for this feature.
I am no longer sure what my intention was back then, but with the current API and the driver fix it is now possible to have a single contiguous device memory address range that is backed partly by device memory and partly by pinned host memory.
Below is an example of a 2 GB allocation with 1 GB residing on the device and 1 GB on the host.
//nvcc -O3 -std=c++17 main.cu -lcuda -o main
#include <cuda.h>
#include <cassert>
#include <cstring>   //memset
#include <algorithm> //std::max
#include <iostream>
#include <vector>
#include <thrust/fill.h>
#include <thrust/execution_policy.h>
int main(){
//allocate a contiguous 2 GB buffer where 1 GB resides on GPU 0 and 1 GB resides on the host
//API requires CUDA 12.2; the driver bug is fixed with CUDA 12.4 / driver 550.54.14 (Linux)
constexpr size_t GB = 1 << 30;
cudaSetDevice(0); //initialize cuda context
CUresult status = CUDA_SUCCESS;
CUmemAllocationProp prop;
memset(&prop, 0, sizeof(CUmemAllocationProp));
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
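//query the minimum allocation granularity for both the device and the host location;
//allocation sizes and mapping offsets must be multiples of the larger of the two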
size_t granularityDevice = 0;
size_t granularityHost = 0;
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
prop.location.id = 0;
status = cuMemGetAllocationGranularity(&granularityDevice, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
assert(status == CUDA_SUCCESS);
prop.location.type = CU_MEM_LOCATION_TYPE_HOST;
prop.location.id = 0;
status = cuMemGetAllocationGranularity(&granularityHost, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
assert(status == CUDA_SUCCESS);
size_t granularity = std::max(granularityDevice, granularityHost);
const size_t allocationSize = 2*GB;
assert(GB % granularity == 0);
assert(allocationSize % granularity == 0);
CUdeviceptr deviceptr = 0;
CUmemGenericAllocationHandle allocationHandle;
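//reserve a contiguous 2 GB virtual address range; no physical memory is allocated yet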
status = cuMemAddressReserve(&deviceptr, allocationSize, 0, 0, 0);
assert(status == CUDA_SUCCESS);
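//create 1 GB of physical device memory on GPU 0 and map it into the first half of the range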
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
prop.location.id = 0;
status = cuMemCreate(&allocationHandle, GB, &prop, 0);
assert(status == CUDA_SUCCESS);
status = cuMemMap(deviceptr, GB, 0, allocationHandle, 0);
assert(status == CUDA_SUCCESS);
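//the handle can be released right away; the mapping keeps the physical allocation alive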
status = cuMemRelease(allocationHandle);
assert(status == CUDA_SUCCESS);
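//create 1 GB of pinned host memory on NUMA node 0 and map it into the second half of the range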
prop.location.type = CU_MEM_LOCATION_TYPE_HOST_NUMA;
prop.location.id = 0;
status = cuMemCreate(&allocationHandle, GB, &prop, 0);
assert(status == CUDA_SUCCESS);
status = cuMemMap(deviceptr + GB, GB, 0, allocationHandle, 0);
assert(status == CUDA_SUCCESS);
status = cuMemRelease(allocationHandle);
assert(status == CUDA_SUCCESS);
//set access control such that both chunks, the device-backed and the host-backed one,
//are accessible from device 0 only (no direct CPU access is granted here)
std::vector<CUmemAccessDesc> accessDescriptors(1);
accessDescriptors[0].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
accessDescriptors[0].location.id = 0;
accessDescriptors[0].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
status = cuMemSetAccess(deviceptr, GB, accessDescriptors.data(), 1);
assert(status == CUDA_SUCCESS);
status = cuMemSetAccess(deviceptr + GB, GB, accessDescriptors.data(), 1);
assert(status == CUDA_SUCCESS);
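//from the device's point of view, the whole 2 GB range now behaves like one contiguous allocation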
char* d_data = (char*)deviceptr;
char* h_data = nullptr;
cudaMallocHost(&h_data, 2*GB); //ordinary pinned staging buffer used by the benchmarks below
//drivers older than 550.54.14 may erroneously report errors for the following calls
cudaError_t rtstatus = cudaSuccess;
rtstatus = cudaMemset(d_data, 0, GB);
std::cout << "cudaMemset device chunk: " << cudaGetErrorString(rtstatus) << "\n";
cudaGetLastError();
rtstatus = cudaMemset(d_data + GB, 0, GB);
std::cout << "cudaMemset host chunk: " << cudaGetErrorString(rtstatus) << "\n";
cudaGetLastError();
rtstatus = cudaMemset(d_data, 0, 2*GB);
std::cout << "cudaMemset full allocation: " << cudaGetErrorString(rtstatus) << "\n";
cudaGetLastError();
rtstatus = cudaMemcpy(d_data, h_data, GB, cudaMemcpyHostToDevice);
std::cout << "cudaMemcpy to device chunk: " << cudaGetErrorString(rtstatus) << "\n";
cudaGetLastError();
rtstatus = cudaMemcpy(d_data, h_data, 2*GB, cudaMemcpyHostToDevice);
std::cout << "cudaMemcpy to full allocation: " << cudaGetErrorString(rtstatus) << "\n";
cudaGetLastError();
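//time each operation; cudaEventElapsedTime reports milliseconds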
cudaEvent_t eventA, eventB;
cudaEventCreate(&eventA);
cudaEventCreate(&eventB);
float elapsed;
cudaEventRecord(eventA);
cudaMemcpy(d_data, h_data, GB, cudaMemcpyHostToDevice);
cudaEventRecord(eventB);
cudaEventSynchronize(eventB);
cudaEventElapsedTime(&elapsed, eventA, eventB);
std::cout << "cudaMemcpy to device chunk: " << elapsed << " ms\n";
cudaEventRecord(eventA);
cudaMemcpy(d_data + GB, h_data, GB, cudaMemcpyHostToDevice);
cudaEventRecord(eventB);
cudaEventSynchronize(eventB);
cudaEventElapsedTime(&elapsed, eventA, eventB);
std::cout << "cudaMemcpy to host chunk: " << elapsed << " ms\n";
cudaEventRecord(eventA);
cudaMemcpy(d_data, h_data, 2*GB, cudaMemcpyHostToDevice);
cudaEventRecord(eventB);
cudaEventSynchronize(eventB);
cudaEventElapsedTime(&elapsed, eventA, eventB);
std::cout << "cudaMemcpy to both: " << elapsed << " ms\n";
cudaEventRecord(eventA);
cudaMemset(d_data, 0, GB);
cudaEventRecord(eventB);
cudaEventSynchronize(eventB);
cudaEventElapsedTime(&elapsed, eventA, eventB);
std::cout << "cudaMemset device chunk: " << elapsed << " ms\n";
cudaEventRecord(eventA);
cudaMemset(d_data + GB, 0, GB);
cudaEventRecord(eventB);
cudaEventSynchronize(eventB);
cudaEventElapsedTime(&elapsed, eventA, eventB);
std::cout << "cudaMemset host chunk: " << elapsed << " ms\n";
cudaEventRecord(eventA);
cudaMemset(d_data, 0, 2*GB);
cudaEventRecord(eventB);
cudaEventSynchronize(eventB);
cudaEventElapsedTime(&elapsed, eventA, eventB);
std::cout << "cudaMemset both: " << elapsed << " ms\n";
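//kernels work across the chunk boundary as well; thrust::fill launches a fill kernel over the given range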
{
int* begin = (int*)d_data;
int* mid = (int*)(d_data + GB);
int* end = (int*)(d_data + 2*GB);
cudaEventRecord(eventA);
thrust::fill(thrust::cuda::par_nosync, begin, mid, 0);
cudaEventRecord(eventB);
cudaEventSynchronize(eventB);
cudaEventElapsedTime(&elapsed, eventA, eventB);
std::cout << "thrust::fill device chunk: " << elapsed << " ms\n";
cudaEventRecord(eventA);
thrust::fill(thrust::cuda::par_nosync, mid, end, 0);
cudaEventRecord(eventB);
cudaEventSynchronize(eventB);
cudaEventElapsedTime(&elapsed, eventA, eventB);
std::cout << "thrust::fill host chunk: " << elapsed << " ms\n";
cudaEventRecord(eventA);
thrust::fill(thrust::cuda::par_nosync, begin, end, 0);
cudaEventRecord(eventB);
cudaEventSynchronize(eventB);
cudaEventElapsedTime(&elapsed, eventA, eventB);
std::cout << "thrust::fill both: " << elapsed << " ms\n";
}
cudaFreeHost(h_data);
cudaEventDestroy(eventA);
cudaEventDestroy(eventB);
status = cuMemUnmap(deviceptr, allocationSize);
assert(status == CUDA_SUCCESS);
status = cuMemAddressFree(deviceptr, allocationSize);
assert(status == CUDA_SUCCESS);
}
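As an aside: since host access is also possible through this API, the access descriptors could presumably be extended so that the host-backed chunk is readable and writable from the CPU as well. The following is an untested sketch, not something I have verified: it assumes cuMemSetAccess accepts CU_MEM_LOCATION_TYPE_HOST_NUMA in an access descriptor (as documented for CUDA 12.2 and newer), and the hostAccess array name is mine. It would replace the second cuMemSetAccess call in the example above.
//hedged sketch (untested): grant both GPU 0 and the CPU access to the host-backed chunk
CUmemAccessDesc hostAccess[2];
memset(hostAccess, 0, sizeof(hostAccess));
hostAccess[0].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
hostAccess[0].location.id = 0;
hostAccess[0].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
hostAccess[1].location.type = CU_MEM_LOCATION_TYPE_HOST_NUMA; //assumption: valid in an access descriptor
hostAccess[1].location.id = 0; //NUMA node 0
hostAccess[1].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
status = cuMemSetAccess(deviceptr + GB, GB, hostAccess, 2);
assert(status == CUDA_SUCCESS);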