How should I use the virtual memory management API to allocate memory on the host? I found the CU_MEM_LOCATION_TYPE_HOST flag, but when I use it, cuMemCreate returns an invalid-argument error. The following is my code; it works fine when the flag is CU_MEM_LOCATION_TYPE_DEVICE.
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        CUresult err = call;                                              \
        if (err != CUDA_SUCCESS) {                                        \
            const char *err_str;                                          \
            cuGetErrorString(err, &err_str);                              \
            std::cerr << "CUDA Error in " << __FILE__ << ":" << __LINE__  \
                      << " (" << #call << "): " << err_str << std::endl;  \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)
// Helper function to build the allocation properties
static CUmemAllocationProp getAllocationProp() {
    CUmemAllocationProp prop = {};
    prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
    prop.location.type = CU_MEM_LOCATION_TYPE_HOST; // fails; CU_MEM_LOCATION_TYPE_DEVICE works
    prop.location.id = 0;
    return prop;
}
// Helper function to build the access descriptor
static CUmemAccessDesc getAccessDesc() {
    CUmemAccessDesc accessDesc = {};
    accessDesc.location.type = CU_MEM_LOCATION_TYPE_HOST; // CU_MEM_LOCATION_TYPE_DEVICE works
    accessDesc.location.id = 0; // Assuming device 0
    accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    return accessDesc;
}
__global__ void write(char *ptr, size_t len) {
    size_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
    if (thread_id < len) {
        ptr[thread_id] = static_cast<char>(thread_id % 256); // Write some data
    }
}

__global__ void read(char *ptr, size_t len) {
    // Print the first 32 bytes from a single block
    if (blockIdx.x == 0 && threadIdx.x < 32 && threadIdx.x < len) {
        printf("Data at index %u: %d\n", threadIdx.x, ptr[threadIdx.x]);
    }
}
int main() {
    CUDA_CHECK(cuInit(0));
    size_t pageSize = 0;
    CUmemAllocationProp prop = getAllocationProp();
    CUDA_CHECK(cuMemGetAllocationGranularity(&pageSize, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
    std::cout << "Minimum allocation granularity (pageSize): " << pageSize << " bytes" << std::endl;
    // 1. Allocate physical memory handles
    size_t size1 = pageSize * 4; // 4 pages
    size_t size2 = pageSize * 8; // 8 pages
    CUmemGenericAllocationHandle physHandle1, physHandle2;
    std::cout << "Allocating physical memory chunk 1 of size: " << size1 << std::endl;
    CUDA_CHECK(cuMemCreate(&physHandle1, size1, &prop, 0));
    std::cout << "Allocating physical memory chunk 2 of size: " << size2 << std::endl;
    CUDA_CHECK(cuMemCreate(&physHandle2, size2, &prop, 0));
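    // With CU_MEM_LOCATION_TYPE_HOST, the first cuMemCreate above is the call
    // that returns the invalid-argument error.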
    // 2. Reserve a virtual address range
    CUdeviceptr va1;
    size_t va1_size = size1 + size2; // Combined size of both physical chunks
    std::cout << "Reserving VA range 1 of size: " << va1_size << std::endl;
    CUDA_CHECK(cuMemAddressReserve(&va1, va1_size, pageSize, 0, 0));
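    // pageSize is the requested alignment of the returned address; the final
    // 0, 0 (fixed address, flags) let the driver choose where to place it.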
    // 3. Map physical memory into the virtual address range
    CUmemAccessDesc accessDesc = getAccessDesc();
    std::cout << "Mapping physHandle1 to va1" << std::endl;
    CUDA_CHECK(cuMemMap(va1, size1, 0, physHandle1, 0));
    CUDA_CHECK(cuMemSetAccess(va1, size1, &accessDesc, 1));
    std::cout << "Mapping physHandle2 to va1 + size1" << std::endl;
    CUDA_CHECK(cuMemMap(va1 + size1, size2, 0, physHandle2, 0));
    CUDA_CHECK(cuMemSetAccess(va1 + size1, size2, &accessDesc, 1));
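    // The two physical allocations now back one contiguous range
    // [va1, va1 + size1 + size2).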
    // Now use va1 as a single contiguous VA range
    cudaMemset((void *)va1, 0, size1 + size2); // Initialize the entire range to zero
    std::cout << "Initialized VA range 1 to zero" << std::endl;
    size_t grid_size = (size1 + size2 + 1023) / 1024; // One thread per byte, 1024 threads per block
    write<<<grid_size, 1024>>>(reinterpret_cast<char *>(va1), size1 + size2);
    read<<<1, 32>>>(reinterpret_cast<char *>(va1), size1 + size2);
    cudaError_t sync_err = cudaDeviceSynchronize();
    if (sync_err != cudaSuccess) {
        std::cerr << "Kernel execution failed: " << cudaGetErrorString(sync_err) << std::endl;
    }
    // Clean up: unmap the range, release the handles, free the reservation
    CUDA_CHECK(cuMemUnmap(va1, va1_size));
    CUDA_CHECK(cuMemRelease(physHandle1));
    CUDA_CHECK(cuMemRelease(physHandle2));
    CUDA_CHECK(cuMemAddressFree(va1, va1_size));
    return 0;
}
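For what it's worth, newer CUDA headers (12.2+, if I remember correctly) also define CU_MEM_LOCATION_TYPE_HOST_NUMA and CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT. My unconfirmed reading of the docs is that cuMemCreate expects CU_MEM_LOCATION_TYPE_HOST_NUMA with a host NUMA node id in location.id, and that plain CU_MEM_LOCATION_TYPE_HOST may not be accepted by cuMemCreate at all. Below is an untested sketch of that variant; the NUMA node id 0 and the device-side access descriptor are assumptions on my part.

// Untested sketch: host physical memory via HOST_NUMA (CUDA 12.2+ assumed).
static CUmemAllocationProp getHostNumaProp() {
    CUmemAllocationProp prop = {};
    prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
    prop.location.type = CU_MEM_LOCATION_TYPE_HOST_NUMA;
    prop.location.id = 0; // host NUMA node id, not a device ordinal (assumption)
    return prop;
}

// Kernels presumably still need access granted for the device itself:
static CUmemAccessDesc getDeviceAccessDesc() {
    CUmemAccessDesc accessDesc = {};
    accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    accessDesc.location.id = 0; // device ordinal 0
    accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    return accessDesc;
}

Either variant links against the driver API, e.g. nvcc vmm_host.cu -o vmm_host -lcuda (the file name is arbitrary).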