Problems with cudaHostAlloc and cudaMemcpyAsync

I have some problems with pinned memory. I want to use function cudaMemcpyAsync instead of cudaMemcpy, so I have to use pinned memory.

I tried to use the function cudaMallocHost or cudaHostAlloc instead of cudaMalloc, for the device memory allocation, and I also used the flag cudaHostAllocPortable in the function cudaHostAlloc, but I didn’t solve my problems.

Here there’s a test code for the addiction of arrays that works only without using the functions cudaHostAlloc and cudaMemcpyAsync.

#include <stdio.h>

#include <stdlib.h>

#include <cuda.h>

#include <cuda_runtime.h>

#define NO	   0

#define YES	  1

// Device code

__global__ void VecAdd(float* A, float* B, float* C, int N) {

	int i = blockDim.x * blockIdx.x + threadIdx.x;

	if (i < N)

		C[i] = A[i] + B[i];

}

// Host code

int main() {

	int N;

	size_t size;

	float *h_A;

	float *h_B;

	float *h_C;

	float *d_A;

	float *d_B;

	float *d_C;

	int threadsPerBlock;

	int blocksPerGrid;

	cudaError error;

	int usePinnedMemory = YES;

	N = 2000000;

	size = N * sizeof (float);

	threadsPerBlock = 240;

	blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

	h_A = (float *) malloc(size);

	h_B = (float *) malloc(size);

	h_C = (float *) malloc(size);

	if (usePinnedMemory == YES) {

		error=cudaHostAlloc((void**) & d_A, size,cudaHostAllocPortable);

		if(error!=cudaSuccess) fprintf(stderr,"1) %s\n",cudaGetErrorString(error));

		error=cudaHostAlloc((void**) & d_B, size,cudaHostAllocPortable);

		if(error!=cudaSuccess) fprintf(stderr,"2) %s\n",cudaGetErrorString(error));

		error=cudaHostAlloc((void**) & d_C, size,cudaHostAllocPortable);

		if(error!=cudaSuccess) fprintf(stderr,"3) %s\n",cudaGetErrorString(error));

		error=cudaMemcpyAsync(d_A, h_A, size, cudaMemcpyHostToDevice,0);

		if(error!=cudaSuccess) fprintf(stderr,"4) %s\n",cudaGetErrorString(error));

		error=cudaMemcpyAsync(d_B, h_B, size, cudaMemcpyHostToDevice,0);

		if(error!=cudaSuccess) fprintf(stderr,"5) %s\n",cudaGetErrorString(error));

		VecAdd <<< blocksPerGrid, threadsPerBlock >>>(d_A, d_B, d_C, N);

		error=cudaMemcpyAsync(h_C, d_C, size, cudaMemcpyDeviceToHost,0);

		if(error!=cudaSuccess) fprintf(stderr,"6) %s\n",cudaGetErrorString(error));

		error=cudaFreeHost(d_A);

		if(error!=cudaSuccess) fprintf(stderr,"7) %s\n",cudaGetErrorString(error));

		error=cudaFreeHost(d_B);

		if(error!=cudaSuccess) fprintf(stderr,"8) %s\n",cudaGetErrorString(error));

		error=cudaFreeHost(d_C);

		if(error!=cudaSuccess) fprintf(stderr,"9) %s\n",cudaGetErrorString(error));

	} else {

		error=cudaMalloc((void**) & d_A, size);

		if(error!=cudaSuccess) fprintf(stderr,"1) %s\n",cudaGetErrorString(error));

		error=cudaMalloc((void**) & d_B, size);

		if(error!=cudaSuccess) fprintf(stderr,"2) %s\n",cudaGetErrorString(error));

		error=cudaMalloc((void**) & d_C, size);

		if(error!=cudaSuccess) fprintf(stderr,"3) %s\n",cudaGetErrorString(error));

		error=cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);

		if(error!=cudaSuccess) fprintf(stderr,"4) %s\n",cudaGetErrorString(error));

		error=cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

		if(error!=cudaSuccess) fprintf(stderr,"5) %s\n",cudaGetErrorString(error));

		VecAdd <<< blocksPerGrid, threadsPerBlock >>>(d_A, d_B, d_C, N);

		error=cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

		if(error!=cudaSuccess) fprintf(stderr,"6) %s\n",cudaGetErrorString(error));

		error=cudaFree(d_A);

		if(error!=cudaSuccess) fprintf(stderr,"7) %s\n",cudaGetErrorString(error));

		error=cudaFree(d_B);

		if(error!=cudaSuccess) fprintf(stderr,"8) %s\n",cudaGetErrorString(error));

		error=cudaFree(d_C);

		if(error!=cudaSuccess) fprintf(stderr,"9) %s\n",cudaGetErrorString(error));

	}

	free(h_A);

	free(h_B);

	free(h_C);

	fprintf(stderr,"CUDA STATUS: %s\n",cudaGetErrorString(cudaGetLastError()));

	return 0;

}

if usePinnedMemory is set to YES, the output is:

4) invalid device pointer

5) invalid device pointer

6) invalid device pointer

7) unspecified launch failure

8) unspecified launch failure

9) unspecified launch failure

CUDA STATUS: unspecified launch failure

if usePinnedMemory is set to NO, the output is:

CUDA STATUS: no error

Hi,

You are doing something wrong in your code: pinned memory is on the host, not on the device.

    error=cudaHostAlloc((void**) & d_A, size,cudaHostAllocPortable);
    if(error!=cudaSuccess) fprintf(stderr,"1) %s\n",cudaGetErrorString(error));

That allocates some memory in main memory, and does something special in the driver, typically that should be the code for your h_A variable instead.

Cédric

Hi,
thank you for your help.

So, if I understood, the pinned memory required by cudaMemcpyAsync function is just for the host memory and, also in this case, I can use cudaMalloc function for device memory allocation.
Is that right?

Now I try to rewrite the code.

That’s it. To understand what happens, you have to understand what memory pinning is: say that you have a buffer in main memory and that you are trying to copy it onto the GPU. To do so, the CUDA driver will take all the virtual adresses and translate them into physical adresses. Those physical adresses are given to the DMA controller, which starts to copy those data from the memory to the device. Now imagine this situation:

You mapped a file A into a buffer. You tell CUDA to copy the buffer asynchronously For some reason, the system decides to swap the buffer, and puts something else at the same physical adress.

What does the DMA controller copy onto the GPU ? We don’t know, because the DMA controller has no way to notice that the physical adress are not valid anymore, so it just takes the physical adresses and consider them as valid. To solve that, you can tell the OS “please don’t move that piece of data away from physical memory” (we often call this mechanism memory pinning). As you see, the problems comes from the OS that could attempt to tamper with the physical memory layout of the process, so this is the host memory that should be protected.

On the other hand, the memory on the device is exclusively managed by the CUDA driver, and there is no reason why the physical memory mapping could be modified while there is a transfer on-going. So you can allocate your memory as usual with cudaMalloc on the device.

Hope that helps,

Cédric

Thank you for your explanation. You were very clear.
I think I understood the meaning of pinned memory and why I have to use it.

Glad it helped!

Cédric