cudaMemcpyAsync code problem

In the below code, I am getting an error in the call to cudaMemcpyAsync, if I replace with cudaMemcpy, with the same argument the code works, can anyone
look and give me a pointer regarding this ? I have bolded underlined the relevant code

////////////////////////////////////////////////////////////////////////////////
// GPU thread
////////////////////////////////////////////////////////////////////////////////
typedef struct {
//Device id
int device;

//Host-side input data
int dataN;
float *h_Data;

//Partial sum for this GPU
float *h_Sum;

} TGPUplan;

static CUT_THREADPROC solverThread(TGPUplan *plan){
const int BLOCK_N = 32;
const int THREAD_N = 256;
const int ACCUM_N = BLOCK_N * THREAD_N;

float
    *d_Data,
    *d_Sum;

float
    *h_Sum;

float sum;

int i;
//Set device
CUDA_SAFE_CALL( cudaSetDevice(plan->device) );


//Allocate memory
CUDA_SAFE_CALL( cudaMalloc((void**)&d_Data, plan->dataN * sizeof(float)) );
CUDA_SAFE_CALL( cudaMalloc((void**)&d_Sum, ACCUM_N * sizeof(float)) );
CUT_SAFE_MALLOC( h_Sum = (float *)malloc(ACCUM_N * sizeof(float)) );

//Copy input data from CPU
//YL CUDA_SAFE_CALL( cudaMemcpy(d_Data, plan->h_Data, plan->dataN * sizeof(float), cudaMemcpyHostToDevice) );
CUDA_SAFE_CALL( [b]cudaMemcpyAsync(d_Data, plan->h_Data, plan->dataN * sizeof(float), cudaMemcpyHostToDevice,plan->device) );[/b]

//Perform GPU computations
reduceKernel<<<BLOCK_N, THREAD_N>>>(d_Sum, d_Data, plan->dataN);
CUT_CHECK_ERROR("reduceKernel() execution failed.\n");

//Read back GPU results
//YL CUDA_SAFE_CALL( cudaMemcpy(h_Sum, d_Sum, ACCUM_N * sizeof(float), cudaMemcpyDeviceToHost) );
CUDA_SAFE_CALL( cudaMemcpyAsync(h_Sum, d_Sum, ACCUM_N * sizeof(float), cudaMemcpyDeviceToHost,plan->device));
//Finalize GPU reduction for current subvector
sum = 0;
for(i = 0; i < ACCUM_N; i++)
    sum += h_Sum[i];
*(plan->h_Sum) = (float)sum;

//Shut down this GPU
free(h_Sum);
CUDA_SAFE_CALL( cudaFree(d_Sum) );
CUDA_SAFE_CALL( cudaFree(d_Data) );
CUT_THREADEND;

}

This is wrong. The last argument of cudaMemcpyAsync is a cudaStream_t obtained through cudaStreamCreate. See the Cuda Reference Manual, page 34 and 13.

Additionally to using a cudaStream_t as the last argument in the above two calls, you may have to ensure that plan->h_Data and h_Sum both pointed to page-locked memory. This can be achieved with cudaMallocHost() (see page 30 of the CudaReferenceManual_2.0.pdf and the description of cudaMemcpyAsync() on page 34).

Thanks for the very quick response,

I am looking at the code example called simpleMultiGpu, what I am trying to do is an async copy instead of the copy, I added code as you suggested to create the cudaStream_t in plan->device I thought CudaStream_T is int, cudaMallocHost is used for host memory, anbd cudaMalloc for GPU memory, seems correct to me but still the last parameter (as you have pointed … is incorrect), can you show me the correct line ?

static CUT_THREADPROC solverThread(TGPUplan *plan){
const int BLOCK_N = 32;
const int THREAD_N = 256;
const int ACCUM_N = BLOCK_N * THREAD_N;

float
    *d_Data,
    *d_Sum;

float
    *h_Sum;

float sum;

int i;
//Set device
CUDA_SAFE_CALL( cudaSetDevice(plan->device) );
CUDA_SAFE_CALL( cudaStreamCreate(&(plan->device)) );


//Allocate memory
CUDA_SAFE_CALL( cudaMalloc((void**)&d_Data, plan->dataN * sizeof(float)) );
CUDA_SAFE_CALL( cudaMalloc((void**)&d_Sum, ACCUM_N * sizeof(float)) );
CUT_SAFE_MALLOC( h_Sum = (float *)malloc(ACCUM_N * sizeof(float)) );

//Copy input data from CPU
//CUDA_SAFE_CALL( cudaMemcpy(d_Data, plan->h_Data, plan->dataN * sizeof(float), cudaMemcpyHostToDevice) );
CUDA_SAFE_CALL( cudaMemcpyAsync((void *) d_Data, 
	            (void *) plan->h_Data, 
				(size_t) plan->dataN * sizeof(float), 
				cudaMemcpyHostToDevice,
				plan->device) );

//Perform GPU computations
reduceKernel<<<BLOCK_N, THREAD_N>>>(d_Sum, d_Data, plan->dataN);
CUT_CHECK_ERROR("reduceKernel() execution failed.\n");

//Read back GPU results
//CUDA_SAFE_CALL( cudaMemcpy(h_Sum, d_Sum, ACCUM_N * sizeof(float), cudaMemcpyDeviceToHost) );
CUDA_SAFE_CALL( cudaMemcpyAsync(h_Sum, 
	            d_Sum, ACCUM_N * sizeof(float), 
				cudaMemcpyDeviceToHost,
				plan->device));
//Finalize GPU reduction for current subvector
sum = 0;
for(i = 0; i < ACCUM_N; i++)
    sum += h_Sum[i];
*(plan->h_Sum) = (float)sum;

//Shut down this GPU
free(h_Sum);
CUDA_SAFE_CALL( cudaFree(d_Sum) );
CUDA_SAFE_CALL( cudaFree(d_Data) );
CUT_THREADEND;

}