strange behavior of stream, event, and MemcpyAsync

boricworld · June 24, 2009, 4:40pm

Hello everyone,

I have a small program, which was modified from “asyncAPI” example in the SDK.

The kernel first writes the input matrix, g_data, then sets a flag, flag, to indicate the completion. The host uses MemcpyAsync() and events to keep track the progress of the kernel, especially the flag.

However, the program will hang forever cause Memcpy always return a flag with 0xff. This is weird, cause this is not even the initial value of the flag, 0.

Can anyone help? Thank you very much.

#include <stdio.h>

#include <cutil_inline.h>

__global__ void increment_kernel(int *g_data, int * flag, int inc_value)

{ 

   int i;

   int idx = blockIdx.x * blockDim.x + threadIdx.x;

g_data[idx] = inc_value;

__threadfence();

  /* a global barrier should be used here, however, I ignore this at this moment */

if (idx == 0)

	* flag = 1;

}

int correct_output(int *data, const int n, const int x)

{

	for(int i = 0; i < n; i++)

		if(data[i] != x)

			return 0;

	return 1;

}

int main(int argc, char *argv[])

{

	if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )

		cutilDeviceInit(argc, argv);

	else

		cudaSetDevice( cutGetMaxGflopsDeviceId());

	int n = 512;

	int nbytes = n * sizeof(int);

	int value = 26;

	

	// allocate host memory

	int *a = 0;

	cutilSafeCall( cudaMallocHost((void**)&a, nbytes) );

	memset(a, 0, nbytes);

	int b = 0xff;

	// allocate device memory

	int *d_a=0;

	cutilSafeCall( cudaMalloc((void**)&d_a, nbytes) );

	cutilSafeCall( cudaMemset(d_a, 255, nbytes) );

	int *d_b=0;

	cutilSafeCall( cudaMalloc((void**)&d_b, sizeof(int)) );

	cutilSafeCall( cudaMemset(d_b, 0, sizeof(int)) );

	bool done = false;

	// set kernel launch configuration

	dim3 threads = dim3(512, 1);

	dim3 blocks  = dim3(1, 1);

	// create cuda event handles

	cudaEvent_t start, stop1, stop;

	cutilSafeCall( cudaEventCreate(&start) );

	cutilSafeCall( cudaEventCreate(&stop1)  );

	cutilSafeCall( cudaEventCreate(&stop)  );

	

	unsigned int timer;

	cutilCheckError(  cutCreateTimer(&timer)  );

	cutilCheckError(  cutResetTimer(timer)	);

	cutilSafeCall( cudaThreadSynchronize() );

	float gpu_time = 0.0f;

	// asynchronously issue work to the GPU (all to stream 0)

	cutilCheckError( cutStartTimer(timer) );

		cudaEventRecord(start, 0);

	cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, 0);

	increment_kernel<<<blocks, threads, 0, 0>>>(d_a, d_b, value);

	do {

	  cudaMemcpyAsync((void *)&b, d_b, sizeof(int), cudaMemcpyDeviceToHost, 0);

	  cudaEventRecord(stop1, 0);

	  while (cudaEventQuery(stop1) == cudaErrorNotReady);

	  printf("b = %i\n", b);

	  if (b == 1) 

		done = true;

	} while (!done);

	cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, 0);

	cudaEventRecord(stop, 0);

	cutilCheckError( cutStopTimer(timer) );

	unsigned long int counter=0;

	while( cudaEventQuery(stop) == cudaErrorNotReady )

	{

		counter++;

	}

	cutilSafeCall( cudaEventElapsedTime(&gpu_time, start, stop) );

	// print the cpu and gpu times

	printf("time spent executing by the GPU: %.2f\n", gpu_time);

	printf("time spent by CPU in CUDA calls: %.2f\n", cutGetTimerValue(timer) );

	// check the output for correctness

	printf("--------------------------------------------------------------\n");

	if( correct_output(a, n, value) )

		printf("Test PASSED\n");

	else

		printf("Test FAILED\n");

	// release resources

	cutilSafeCall( cudaEventDestroy(start) );

	cutilSafeCall( cudaEventDestroy(stop) );

	cutilSafeCall( cudaFreeHost(a) );

	cutilSafeCall( cudaFree(d_a) );

	cudaThreadExit();

	cutilExit(argc, argv);

	return 0;

}

boricworld · June 24, 2009, 5:39pm

Crap, I know why. I should use cudaMallocHost() to allocate a memory block for MemcpyAsync(), instead of using an automatic variable. Now it works fine.

Topic		Replies	Views
How to solve the problem "0: copyout MemcpyAsync (host=0x485360af8, dev=0x986780af8, size=32000 CUDA Programming and Performance	3	939	March 31, 2016
cudaMemcpyAsync not giving any answers using cudaMemcpyAsync function CUDA Programming and Performance	1	859	September 5, 2011
"cudaMemcpyAsync" does not work in my program CUDA Programming and Performance	3	1296	April 23, 2010
cudaMemcpyAsync CUDA Programming and Performance	10	22165	October 16, 2015
cudaMemcpyAsync waiting for another unrelated cudaMemcpyAsync CUDA Programming and Performance cuda	10	248	December 10, 2024
CPU blocked MUCH longer than expected calling a cudaMemcpy after a cuda graph launch CUDA Programming and Performance	6	753	October 5, 2023
cudaMemcpyAsync not behaving asynchronously CUDA Programming and Performance	5	2545	July 4, 2008
Async Memcpy calls blocking main thread CUDA Programming and Performance	3	2518	November 19, 2011
Async memory problems CUDA Programming and Performance	7	7365	February 11, 2011
Problem with asynchronous host to host memcpy CUDA Programming and Performance	1	4764	January 4, 2011

strange behavior of stream, event, and MemcpyAsync

Related topics