I got a problem when using cudaHostAlloc(). When I use cudaHostAlloc() in main function, and access the allocated memory in main function, that OK. But if I place cudaHostAlloc()in a child function (e.g. mycudaHostAlloc), I cannot access the memory in main function, i.e. when the program proceeds to L1, system gives a segmentation fault.
Any solution? Thanks in advance.
#include <stdio.h>
#define SIZE 10
#include <cuda.h>
// Kernel definition, see also section 4.2.3 of Nvidia Cuda Programming Guide
global void vecAdd(float* A, float* B, float* C) {
// threadIdx.x is a built-in variable provided by CUDA at runtime
int i = threadIdx.x;
// A[i] = 0;
// B[i] = i;
C[i] = A[i] + B[i];
printf(“A[%d]=%f, B[%d]=%f, C[%d]=%f\n”, i, A[i], i, B[i], i, C[i]);
}
void mycudaHostAlloc(float * A, float * B, float * C, int memsize){
cudaHostAlloc((void **) &A, memsize, cudaHostAllocMapped);
cudaHostAlloc((void **) &B, memsize, cudaHostAllocMapped);
cudaHostAlloc((void **) &C, memsize, cudaHostAllocMapped);
}
void mycudaFreeHost(float * A, float * B, float * C){
cudaFreeHost(A);
cudaFreeHost(B);
cudaFreeHost(C);
}
int main() {
int N = SIZE;
int memsize = SIZE * sizeof(float);
cudaDeviceProp deviceProp;
// Get properties and verify device 0 supports mapped memory
cudaGetDeviceProperties(&deviceProp, 0);
if (!deviceProp.canMapHostMemory) {
fprintf(stderr, "Device %d cannot map host memory!\n", 0);
exit(EXIT_FAILURE);
}
// set the device flags for mapping host memory
cudaSetDeviceFlags(cudaDeviceMapHost);
float * A, *B, *C;
float *devPtrA, *devPtrB, *devPtrC;
mycudaHostAlloc(A, B, C, memsize);
L1: for (int i = 0; i < SIZE; i++) {
A[i] = B[i] = i;
}
cudaHostGetDevicePointer((void **) &devPtrA, (void *) A, 0);
cudaHostGetDevicePointer((void **) &devPtrB, (void *) B, 0);
cudaHostGetDevicePointer((void **) &devPtrC, (void *) C, 0);
vecAdd<<<1, N>>>(devPtrA, devPtrB, devPtrC);
cudaDeviceSynchronize();
for (int i = 0; i < SIZE; i++)
printf("C[%d]=%f\n", i, C[i]);
mycudaFreeHost(A, B, C);
}