I have some problems with pinned memory. I want to use function cudaMemcpyAsync instead of cudaMemcpy, so I have to use pinned memory.
I tried to use the function cudaMallocHost or cudaHostAlloc instead of cudaMalloc, for the device memory allocation, and I also used the flag cudaHostAllocPortable in the function cudaHostAlloc, but I didn’t solve my problems.
Here there’s a test code for the addiction of arrays that works only without using the functions cudaHostAlloc and cudaMemcpyAsync.
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
#define NO 0
#define YES 1
// Device code
__global__ void VecAdd(float* A, float* B, float* C, int N) {
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < N)
C[i] = A[i] + B[i];
}
// Host code
int main() {
int N;
size_t size;
float *h_A;
float *h_B;
float *h_C;
float *d_A;
float *d_B;
float *d_C;
int threadsPerBlock;
int blocksPerGrid;
cudaError error;
int usePinnedMemory = YES;
N = 2000000;
size = N * sizeof (float);
threadsPerBlock = 240;
blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
h_A = (float *) malloc(size);
h_B = (float *) malloc(size);
h_C = (float *) malloc(size);
if (usePinnedMemory == YES) {
error=cudaHostAlloc((void**) & d_A, size,cudaHostAllocPortable);
if(error!=cudaSuccess) fprintf(stderr,"1) %s\n",cudaGetErrorString(error));
error=cudaHostAlloc((void**) & d_B, size,cudaHostAllocPortable);
if(error!=cudaSuccess) fprintf(stderr,"2) %s\n",cudaGetErrorString(error));
error=cudaHostAlloc((void**) & d_C, size,cudaHostAllocPortable);
if(error!=cudaSuccess) fprintf(stderr,"3) %s\n",cudaGetErrorString(error));
error=cudaMemcpyAsync(d_A, h_A, size, cudaMemcpyHostToDevice,0);
if(error!=cudaSuccess) fprintf(stderr,"4) %s\n",cudaGetErrorString(error));
error=cudaMemcpyAsync(d_B, h_B, size, cudaMemcpyHostToDevice,0);
if(error!=cudaSuccess) fprintf(stderr,"5) %s\n",cudaGetErrorString(error));
VecAdd <<< blocksPerGrid, threadsPerBlock >>>(d_A, d_B, d_C, N);
error=cudaMemcpyAsync(h_C, d_C, size, cudaMemcpyDeviceToHost,0);
if(error!=cudaSuccess) fprintf(stderr,"6) %s\n",cudaGetErrorString(error));
error=cudaFreeHost(d_A);
if(error!=cudaSuccess) fprintf(stderr,"7) %s\n",cudaGetErrorString(error));
error=cudaFreeHost(d_B);
if(error!=cudaSuccess) fprintf(stderr,"8) %s\n",cudaGetErrorString(error));
error=cudaFreeHost(d_C);
if(error!=cudaSuccess) fprintf(stderr,"9) %s\n",cudaGetErrorString(error));
} else {
error=cudaMalloc((void**) & d_A, size);
if(error!=cudaSuccess) fprintf(stderr,"1) %s\n",cudaGetErrorString(error));
error=cudaMalloc((void**) & d_B, size);
if(error!=cudaSuccess) fprintf(stderr,"2) %s\n",cudaGetErrorString(error));
error=cudaMalloc((void**) & d_C, size);
if(error!=cudaSuccess) fprintf(stderr,"3) %s\n",cudaGetErrorString(error));
error=cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
if(error!=cudaSuccess) fprintf(stderr,"4) %s\n",cudaGetErrorString(error));
error=cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
if(error!=cudaSuccess) fprintf(stderr,"5) %s\n",cudaGetErrorString(error));
VecAdd <<< blocksPerGrid, threadsPerBlock >>>(d_A, d_B, d_C, N);
error=cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
if(error!=cudaSuccess) fprintf(stderr,"6) %s\n",cudaGetErrorString(error));
error=cudaFree(d_A);
if(error!=cudaSuccess) fprintf(stderr,"7) %s\n",cudaGetErrorString(error));
error=cudaFree(d_B);
if(error!=cudaSuccess) fprintf(stderr,"8) %s\n",cudaGetErrorString(error));
error=cudaFree(d_C);
if(error!=cudaSuccess) fprintf(stderr,"9) %s\n",cudaGetErrorString(error));
}
free(h_A);
free(h_B);
free(h_C);
fprintf(stderr,"CUDA STATUS: %s\n",cudaGetErrorString(cudaGetLastError()));
return 0;
}
if usePinnedMemory is set to YES, the output is:
4) invalid device pointer
5) invalid device pointer
6) invalid device pointer
7) unspecified launch failure
8) unspecified launch failure
9) unspecified launch failure
CUDA STATUS: unspecified launch failure
if usePinnedMemory is set to NO, the output is:
CUDA STATUS: no error