CUDA 4-way overlap and cudaMemcpyAsync Mistake nvidia 4-way overlap technique

Hi,

Recently, I am doing some experiment related to cudaMemcpyAsync and overlap. I tried the similar method in:

Nvidia Overlap Technique

Overlapping GPU<–>CPU transfer and CPU/GPU computation by using buffer, stream and cudaMemcpyAsync in order to improve performance.

Here is my code:

#include <stdio.h>

#include <stdlib.h>

#include <cuda_runtime.h>

#define N   (1024*1024)

#define FULL_DATA_SIZE   (N*20)

static void HandleError(cudaError_t err, const char* file, int line){

	if(err!=cudaSuccess){

		printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);

	exit(EXIT_FAILURE);

	}

}

#define HANDLE_ERROR( err ) (HandleError(err, __FILE__, __LINE__))

__global__ void partialcompute( int *in, int* out ) {

    int idx = threadIdx.x + blockIdx.x * blockDim.x;

    if (idx < N) {

    	out[idx]=in[idx];

//        int idx1 = (idx + 1) % 256;

//        int idx2 = (idx + 2) % 256;

//        a[idx]  = (a[idx] + a[idx1] + a[idx2]) / 3.0f;

        //float   bs = (b[idx] + b[idx1] + b[idx2]) / 3.0f;

        //c[idx] = (as + bs) / 2;

    }

    __syncthreads();

}

__global__ void totalcompute (int *din, int* dout){

    int idx = threadIdx.x + blockIdx.x * blockDim.x;

    if (idx < FULL_DATA_SIZE) {

	dout[idx] = din[idx];

	}

//	if(idx==0){

//    		printf("dev_a[0] is %f, dev_o[0] is %f\n", din[0],dout[0]);

//	}

    __syncthreads();

}

bool compare_data(int* a, int* b){

	int count = 0;

	for(int t = 0; t<FULL_DATA_SIZE;t++){

	if(a[t]!=b[t]){

		//printf("a[%d] and b[%d] are: %d,  %d\n",t,t,a[t],b[t]);

		//return false;

		count++;

		}

	}

	printf("\ndiff # is %d\n",count);

	return true;

}

int main( void ) {

//setup check

    cudaDeviceProp  prop;

    int whichDevice;

    HANDLE_ERROR( cudaGetDevice( &whichDevice ) );

    HANDLE_ERROR( cudaGetDeviceProperties( &prop, whichDevice ) );

    if (!prop.deviceOverlap) {

        printf( "Device will not handle overlaps, so no speed up from streams\n" );

        return 0;

    }

//start

    cudaEvent_t     start, stop;

    float           elapsedTime0, elapsedTime;

cudaStream_t    uploadStream, downloadStream, computeStream;

int *host_a, *host_b, *host_c, *host_out;

      int *dev_a, *dev_o, *part_dev, *part_devout;

// start the timers

    HANDLE_ERROR( cudaEventCreate( &start ) );

    HANDLE_ERROR( cudaEventCreate( &stop ) );

// initialize the streams

    HANDLE_ERROR( cudaStreamCreate( &uploadStream ) );

    HANDLE_ERROR( cudaStreamCreate( &downloadStream ) );

    HANDLE_ERROR( cudaStreamCreate( &computeStream ) );

HANDLE_ERROR(cudaHostAlloc((void**)&host_a, 

    	FULL_DATA_SIZE*sizeof(int),cudaHostAllocDefault));

HANDLE_ERROR(cudaHostAlloc((void**)&host_c, 

    	FULL_DATA_SIZE*sizeof(int),cudaHostAllocDefault));

HANDLE_ERROR(cudaMalloc((void**)&part_dev,N*sizeof(int)));

    HANDLE_ERROR(cudaMalloc((void**)&part_devout,N*sizeof(int)));

HANDLE_ERROR( cudaMalloc( (void**)&dev_a, FULL_DATA_SIZE * sizeof(int) ) );

    HANDLE_ERROR( cudaMalloc( (void**)&dev_o, FULL_DATA_SIZE * sizeof(int) ) );

host_b = (int*)malloc(FULL_DATA_SIZE*sizeof(int));

        host_out = (int*)malloc(FULL_DATA_SIZE*sizeof(int));

	float MB = (float)100*N*sizeof(int)/1024/1024;

	HANDLE_ERROR( cudaEventRecord( start, 0 ) );

	for(int m =0;m<FULL_DATA_SIZE;m++){

			host_a[m] = 1000+m%10000;

			host_b[m] = 1000+m%10000;

	}	

	

	HANDLE_ERROR(cudaMemcpy(dev_a,host_b,

	FULL_DATA_SIZE*sizeof(int),cudaMemcpyHostToDevice));

	totalcompute<<<FULL_DATA_SIZE/512,512,0>>>(dev_a,dev_o);

	HANDLE_ERROR(cudaMemcpy(host_out,dev_o,

	FULL_DATA_SIZE*sizeof(int),cudaMemcpyDeviceToHost));

	

	printf("host_out[10999] is: %d\n", host_out[10999]);

	cudaThreadSynchronize();

HANDLE_ERROR( cudaEventRecord(stop, 0));

    HANDLE_ERROR( cudaEventSynchronize( stop ) );

    HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime0,

                                        start, stop ) );

printf( "Time taken:  %3.1f ms\n", elapsedTime0 );

    printf("speed: %3.1f MB/s \n", MB/(elapsedTime0/1000));

HANDLE_ERROR( cudaEventRecord( start, 0 ) );

    // now loop over full data, in bite-sized chunks

    for(int j=0; j<FULL_DATA_SIZE; j+=N){

	HANDLE_ERROR(cudaMemcpyAsync(part_dev,host_a+j,

					N*sizeof(int),

					cudaMemcpyHostToDevice,

					uploadStream));

	

	partialcompute<<<N/512,512,0,computeStream>>>(part_dev,part_devout);

	HANDLE_ERROR(cudaMemcpyAsync(host_c+j,part_devout,

					N*sizeof(int),

					cudaMemcpyDeviceToHost,

					downloadStream));

//	cudaThreadSynchronize();

	};

    HANDLE_ERROR( cudaStreamSynchronize( uploadStream) );

    HANDLE_ERROR( cudaStreamSynchronize( downloadStream) );

    HANDLE_ERROR( cudaStreamSynchronize( computeStream) );

    HANDLE_ERROR( cudaEventRecord(stop, 0));

    HANDLE_ERROR( cudaEventSynchronize( stop ) );

    HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime,

                                        start, stop ) );

    printf( "Opt Time taken:  %3.1f ms\n", elapsedTime );

    printf("Opt speed: %3.1f MB/s \n", MB/(elapsedTime/1000));

	

    if(compare_data(host_out,host_a))

    	printf("success!\n");

    if(compare_data(host_c,host_a))

    	printf("success!\n");

    // cleanup the streams and memory

    free(host_out);

    free(host_b);

    HANDLE_ERROR(cudaFree(dev_a));

    HANDLE_ERROR(cudaFree(dev_o));

    HANDLE_ERROR(cudaFreeHost(host_a));

    HANDLE_ERROR(cudaFreeHost(host_c));

    HANDLE_ERROR( cudaFree(part_dev));

    HANDLE_ERROR( cudaFree(part_devout));

    HANDLE_ERROR( cudaStreamDestroy( uploadStream ) );

    HANDLE_ERROR( cudaStreamDestroy( downloadStream ) );

    HANDLE_ERROR( cudaStreamDestroy( computeStream ) );

    return 0;

}

Output:

-bash-3.2$ nvcc -o 4way 4wayOverlap.cu -O

-bash-3.2$ ./4way

host_out[10999] is: 1999

Time taken:  445.0 ms

speed: 898.9 MB/s

Opt Time taken:  71.7 ms

Opt speed: 5580.7 MB/s

diff # is 0

success!

diff # is 19922944 (this should be 0)

success!

It seems that the performance improved a lot, but the computation might run into some errors due to asynchronized operations. How to change the code in order to fix the problem?

Thanks.