Memory allocated by cudaHostAlloc() in a child function cannot be accessed by the parent function — pinned memory, mapped host memory, no-zer

I have a problem when using cudaHostAlloc(). When I call cudaHostAlloc() in the main function and access the allocated memory in the main function, everything works. But if I place the cudaHostAlloc() calls in a child function (e.g. mycudaHostAlloc), I cannot access the memory in the main function, i.e. when the program reaches the loop labelled L1, the system reports a segmentation fault.

Any solution? Thanks in advance.

#include <stdio.h>
#define SIZE 10
#include <cuda.h>
// Kernel definition, see also section 4.2.3 of Nvidia Cuda Programming Guide.
//
// Element-wise vector sum: each thread computes C[i] = A[i] + B[i] for its
// own index and prints the three values.
//
// Launch requirements: the index is taken from threadIdx.x alone and there is
// no bounds check, so the kernel must be launched with a single block whose
// thread count does not exceed the number of elements in A, B and C.
__global__ void vecAdd(float* A, float* B, float* C) {
    // threadIdx.x is a built-in variable provided by CUDA at runtime
    int i = threadIdx.x;
    // A[i] = 0;
    // B[i] = i;
    C[i] = A[i] + B[i];
    // Device-side printf is serialized and slow; it is a debugging aid only.
    printf("A[%d]=%f, B[%d]=%f, C[%d]=%f\n", i, A[i], i, B[i], i, C[i]);
}

// Attempts to allocate three mapped host buffers of memsize bytes each.
//
// NOTE(review): A, B and C are received by value, so &A, &B and &C below are
// the addresses of this function's local copies. cudaHostAlloc() stores the
// new buffer addresses into those copies; the caller's pointers are never
// updated, and the allocations become unreachable once the function returns.
// The return codes of cudaHostAlloc() are not checked.
void mycudaHostAlloc(float * A, float * B, float * C, int memsize){
cudaHostAlloc((void **) &A, memsize, cudaHostAllocMapped);
cudaHostAlloc((void **) &B, memsize, cudaHostAllocMapped);
cudaHostAlloc((void **) &C, memsize, cudaHostAllocMapped);
}
// Releases the three host buffers A, B and C with cudaFreeHost().
//
// The pointers are taken by value, which is sufficient here: cudaFreeHost()
// only needs the addresses. The caller's own pointer variables are not
// modified and must not be dereferenced after this call.
void mycudaFreeHost(float * A, float * B, float * C){
    cudaFreeHost(A);
    cudaFreeHost(B);
    cudaFreeHost(C);
}

// Demo driver: checks that device 0 can map host memory, requests three
// mapped host buffers, fills A and B on the host, launches vecAdd on the
// corresponding device pointers and prints C.
// NOTE(review): none of the CUDA API return codes in this function are checked.
int main() {
int N = SIZE;
int memsize = SIZE * sizeof(float);

cudaDeviceProp deviceProp;
// Get properties and verify device 0 supports mapped memory
cudaGetDeviceProperties(&deviceProp, 0);
if (!deviceProp.canMapHostMemory) {
	fprintf(stderr, "Device %d cannot map host memory!\n", 0);
	exit(EXIT_FAILURE);
}
// set the device flags for mapping host memory
cudaSetDeviceFlags(cudaDeviceMapHost);

// Host pointers (A, B, C) and the device pointers obtained for them below.
// NOTE(review): A, B and C are declared without an initial value.
float * A, *B, *C;
float *devPtrA,	 *devPtrB,  *devPtrC;

// NOTE(review): the pointers are passed by value, so with mycudaHostAlloc()
// as defined in this listing they are still uninitialized after the call.
mycudaHostAlloc(A, B, C, memsize);

// Host-side initialization of the inputs through A and B; this is the point
// at which the segmentation fault is reported.
L1: for (int i = 0; i < SIZE; i++) {
A[i] = B[i] = i;
}

// Obtain the device pointer for each host buffer.
cudaHostGetDevicePointer((void **) &devPtrA, (void *) A, 0);
cudaHostGetDevicePointer((void **) &devPtrB, (void *) B, 0);
cudaHostGetDevicePointer((void **) &devPtrC, (void *) C, 0);

// One block of N threads: vecAdd indexes by threadIdx.x, one element per thread.
vecAdd<<<1, N>>>(devPtrA, devPtrB, devPtrC);
// Wait for the kernel to finish before C is read on the host.
cudaDeviceSynchronize();

for (int i = 0; i < SIZE; i++)
	printf("C[%d]=%f\n", i, C[i]);

mycudaFreeHost(A, B, C);

}

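C, and thus CUDA, has “call by value” semantics, so any changes to function arguments stay local to the function and do not influence the caller. In your mycudaHostAlloc(), cudaHostAlloc() writes the new addresses into the function's local copies of A, B and C, so the pointers in main() are never set.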

If you want “call by reference”, pass pointers to the variable you want to change:

// Allocates three mapped host buffers of memsize bytes each.
//
// A, B and C are the addresses of the caller's float* variables; each
// cudaHostAlloc() call writes the new buffer address through them, so the
// caller's pointers hold the allocations after the function returns.
void mycudaHostAlloc(float ** A, float ** B, float ** C, size_t memsize)
{
    const unsigned int allocFlags = cudaHostAllocMapped;

    cudaHostAlloc(reinterpret_cast<void **>(A), memsize, allocFlags);
    cudaHostAlloc(reinterpret_cast<void **>(B), memsize, allocFlags);
    cudaHostAlloc(reinterpret_cast<void **>(C), memsize, allocFlags);
}

[...]

// Host pointers that will receive the allocated buffer addresses.
float * A, *B, *C;

float *devPtrA,	 *devPtrB, *devPtrC;

// Pass the address of each pointer so that mycudaHostAlloc() can store the
// allocated addresses into A, B and C themselves rather than into copies.
mycudaHostAlloc(&A, &B, &C, memsize);

It’s the same reason why cudaHostAlloc() itself takes a pointer to a pointer.

I see. Thanks a lot for the reminder.