strange behavior of stream, event, and MemcpyAsync

Hello everyone,

I have a small program, which was modified from “asyncAPI” example in the SDK.

The kernel first writes the input matrix, g_data, and then sets a flag, flag, to indicate completion. The host uses cudaMemcpyAsync() and events to keep track of the progress of the kernel, in particular the value of the flag.

However, the program hangs forever because the copy always returns a flag value of 0xff. This is weird, because that is not even the initial value of the flag, which is 0.

Can anyone help? Thank you very much.

#include <stdio.h>

#include <cutil_inline.h>

/*
 * Fills g_data with inc_value and then raises a completion flag.
 *
 * Each thread stores inc_value at its own global index
 * (blockIdx.x * blockDim.x + threadIdx.x). There is no bounds guard, so the
 * launch must supply exactly as many threads as g_data has elements.
 *
 * g_data    : device buffer written by every thread
 * flag      : device int set to 1 by the thread whose global index is 0
 * inc_value : value stored into every element of g_data
 *
 * The __threadfence() only orders the calling thread's own g_data store
 * before its later stores. The flag therefore says nothing about whether the
 * OTHER threads have finished; a real global barrier would be needed for
 * that and is deliberately left out here.
 */
__global__ void increment_kernel(int *g_data, int * flag, int inc_value)
{
	int idx = blockIdx.x * blockDim.x + threadIdx.x;

	g_data[idx] = inc_value;

	/* make this thread's data store visible before the flag store below */
	__threadfence();

	/* a global barrier should be used here, however, it is ignored for now */
	if (idx == 0)
		*flag = 1;
}

/*
 * Checks that the first n entries of data are all equal to x.
 *
 * Returns 1 when every inspected entry matches (including the case where
 * nothing is inspected because n <= 0), and 0 at the first mismatch.
 */
int correct_output(int *data, const int n, const int x)
{
	int pos = 0;

	/* advance while we are still in range and the entry matches */
	while (pos < n && data[pos] == x)
		++pos;

	/* reaching the end means no mismatch was encountered */
	return (pos >= n) ? 1 : 0;
}

/*
 * Host driver: launches increment_kernel on stream 0 and polls a device-side
 * completion flag with cudaMemcpyAsync() + events, then copies the result
 * back and verifies it.
 *
 * The host destination of every cudaMemcpyAsync() here is page-locked memory
 * obtained from cudaMallocHost(). Copying the flag into an ordinary stack
 * variable was the cause of the stale 0xff value and the endless polling
 * loop in the earlier version of this program.
 */
int main(int argc, char *argv[])
{
	if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )
		cutilDeviceInit(argc, argv);
	else
		cutilSafeCall( cudaSetDevice( cutGetMaxGflopsDeviceId() ) );

	int n = 512;
	int nbytes = n * sizeof(int);
	int value = 26;

	// allocate pinned host memory for the data array
	int *a = 0;
	cutilSafeCall( cudaMallocHost((void**)&a, nbytes) );
	memset(a, 0, nbytes);

	// pinned host copy of the completion flag; 0xff is a sentinel that is
	// neither the device-side initial value (0) nor the "done" value (1)
	int *h_flag = 0;
	cutilSafeCall( cudaMallocHost((void**)&h_flag, sizeof(int)) );
	*h_flag = 0xff;

	// allocate device memory
	int *d_a = 0;
	cutilSafeCall( cudaMalloc((void**)&d_a, nbytes) );
	cutilSafeCall( cudaMemset(d_a, 255, nbytes) );

	int *d_b = 0;
	cutilSafeCall( cudaMalloc((void**)&d_b, sizeof(int)) );
	cutilSafeCall( cudaMemset(d_b, 0, sizeof(int)) );

	// set kernel launch configuration: one block covering all n elements
	dim3 threads = dim3(512, 1);
	dim3 blocks  = dim3(1, 1);

	// create cuda event handles
	cudaEvent_t start, stop1, stop;
	cutilSafeCall( cudaEventCreate(&start) );
	cutilSafeCall( cudaEventCreate(&stop1) );
	cutilSafeCall( cudaEventCreate(&stop)  );

	unsigned int timer;
	cutilCheckError(  cutCreateTimer(&timer)  );
	cutilCheckError(  cutResetTimer(timer)    );

	cutilSafeCall( cudaThreadSynchronize() );
	float gpu_time = 0.0f;

	// asynchronously issue work to the GPU (all to stream 0)
	cutilCheckError( cutStartTimer(timer) );
	cutilSafeCall( cudaEventRecord(start, 0) );
	cutilSafeCall( cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, 0) );
	increment_kernel<<<blocks, threads, 0, 0>>>(d_a, d_b, value);
	// a bad launch configuration is only reported through the last-error state
	cutilSafeCall( cudaGetLastError() );

	// poll the device flag until the kernel reports completion
	cudaError_t status;
	bool done = false;
	do {
		cutilSafeCall( cudaMemcpyAsync(h_flag, d_b, sizeof(int), cudaMemcpyDeviceToHost, 0) );
		cutilSafeCall( cudaEventRecord(stop1, 0) );

		// spin until the copy above has finished
		while ((status = cudaEventQuery(stop1)) == cudaErrorNotReady)
			;
		// anything other than success means the flag will never arrive;
		// abort instead of polling forever
		cutilSafeCall( status );

		printf("b = %i\n", *h_flag);
		if (*h_flag == 1)
			done = true;
	} while (!done);

	cutilSafeCall( cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, 0) );
	cutilSafeCall( cudaEventRecord(stop, 0) );
	cutilCheckError( cutStopTimer(timer) );

	// wait for the final copy before reading a[] or the event timings
	while ((status = cudaEventQuery(stop)) == cudaErrorNotReady)
		;
	cutilSafeCall( status );

	cutilSafeCall( cudaEventElapsedTime(&gpu_time, start, stop) );

	// print the cpu and gpu times
	printf("time spent executing by the GPU: %.2f\n", gpu_time);
	printf("time spent by CPU in CUDA calls: %.2f\n", cutGetTimerValue(timer) );

	// check the output for correctness
	printf("--------------------------------------------------------------\n");
	if( correct_output(a, n, value) )
		printf("Test PASSED\n");
	else
		printf("Test FAILED\n");

	// release resources
	cutilSafeCall( cudaEventDestroy(start) );
	cutilSafeCall( cudaEventDestroy(stop1) );
	cutilSafeCall( cudaEventDestroy(stop) );
	cutilCheckError( cutDeleteTimer(timer) );
	cutilSafeCall( cudaFreeHost(a) );
	cutilSafeCall( cudaFreeHost(h_flag) );
	cutilSafeCall( cudaFree(d_a) );
	cutilSafeCall( cudaFree(d_b) );

	cudaThreadExit();
	cutilExit(argc, argv);

	return 0;
}

Crap, I know why. The host buffer passed to cudaMemcpyAsync() should be allocated with cudaMallocHost() (page-locked memory) instead of being an automatic variable. Now it works fine.