I seem to have run into a problem with performing asynchronous memory copies from pinned host memory to paged host memory. I’ve tried this on Mac with CUDA 3.2 and also on a Linux box with CUDA 3.1 installed on it.
The intention is to be able to overlap device execution with host/device memory transfers using asynchronous memory copies on streams. However, because the data originates in paged memory, I first have to allocate pinned memory with cudaHostAlloc. I then have to stage data to this pinned area before passing it on to the device and vice versa when reading the data back.
Doing this in the most straightforward way gives erroneous results. The following short program demonstrates the problem:
#include <stdlib.h>
#include <stdio.h>
#include <cuda.h>
#define ARRAY_SIZE 1024
#define THREADS_PER_BLOCK 128
#define NUM_BLOCKS (ARRAY_SIZE/THREADS_PER_BLOCK)
#define CALL_CUDA(func_call) \
{ \
cudaError_t _call_cuda_err = (func_call); \
if (_call_cuda_err != cudaSuccess) \
{ \
printf("%s in %s at line %d\n", \
cudaGetErrorString(_call_cuda_err), \
__FILE__, \
__LINE__); \
exit(EXIT_FAILURE); \
} \
}
__global__ void kernel(const int *inArray, int *outArray)
{
int index = threadIdx.x + (blockIdx.x * blockDim.x);
if (index < ARRAY_SIZE)
{
outArray[index] = inArray[index];
}
}
int main(int argc, char **argv)
{
int inArray[ARRAY_SIZE];
int outArray[ARRAY_SIZE];
for (int i = 0; i < ARRAY_SIZE; i++)
{
inArray[i] = i;
outArray[i] = 123456;
}
int *pinnedInArray;
CALL_CUDA(cudaHostAlloc(&pinnedInArray,
sizeof(int) * ARRAY_SIZE,
cudaHostAllocWriteCombined));
int *pinnedOutArray;
CALL_CUDA(cudaHostAlloc(&pinnedOutArray,
sizeof(int) * ARRAY_SIZE,
cudaHostAllocDefault));
int *deviceInArray;
CALL_CUDA(cudaMalloc(&deviceInArray, sizeof(int)*ARRAY_SIZE));
int *deviceOutArray;
CALL_CUDA(cudaMalloc(&deviceOutArray, sizeof(int)*ARRAY_SIZE));
cudaStream_t stream;
CALL_CUDA(cudaStreamCreate(&stream));
/* This is the interesting part. Stage the data to pinned memory,
* asynchronously transfer the data to the device, run the kernel (which
* just copies the data), and then asynchronously transfer back through
* pinned data the same way. */
CALL_CUDA(cudaMemcpyAsync(pinnedInArray,
(const int *)inArray,
sizeof(int) * ARRAY_SIZE,
cudaMemcpyHostToHost,
stream));
CALL_CUDA(cudaMemcpyAsync(deviceInArray,
(const int *)pinnedInArray,
sizeof(int) * ARRAY_SIZE,
cudaMemcpyHostToDevice,
stream));
kernel<<<NUM_BLOCKS, THREADS_PER_BLOCK, 0, stream>>>
((const int *)deviceInArray, deviceOutArray);
CALL_CUDA(cudaMemcpyAsync(pinnedOutArray,
(const int *)deviceOutArray,
sizeof(int) * ARRAY_SIZE,
cudaMemcpyDeviceToHost,
stream));
/* If you uncomment this line, everything works, which is why I suspect that
the host-to-host copy is not properly synchronizing on the stream. */
/* CALL_CUDA(cudaStreamSynchronize(stream)); */
CALL_CUDA(cudaMemcpyAsync(outArray,
(const int *)pinnedOutArray,
sizeof(int) * ARRAY_SIZE,
cudaMemcpyHostToHost,
stream));
CALL_CUDA(cudaStreamSynchronize(stream));
/* Check to make sure we got back the correct data. */
for (int i = 0; i < ARRAY_SIZE; i++)
{
if (inArray[i] != outArray[i])
{
printf("For index %d: Expected %d, got %d\n", i, inArray[i], outArray[i]);
return EXIT_FAILURE;
}
}
CALL_CUDA(cudaStreamDestroy(stream));
CALL_CUDA(cudaFreeHost(pinnedInArray));
CALL_CUDA(cudaFreeHost(pinnedOutArray));
CALL_CUDA(cudaFree(deviceInArray));
CALL_CUDA(cudaFree(deviceOutArray));
return EXIT_SUCCESS;
}
The problem appears to be that when using cudaMemcpyAsync to perform the final copy of the results from pinned memory to paged memory, the copy does not wait for the pinned memory to be filled with data from the device. If you insert a cudaStreamSynchronize between these two cudaMemcpyAsync, it works fine.