Hi,
Recently, I am doing some experiment related to cudaMemcpyAsync and overlap. I tried the similar method in:
Overlapping GPU<–>CPU transfer and CPU/GPU computation by using buffer, stream and cudaMemcpyAsync in order to improve performance.
Here is my code:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#define N (1024*1024)
#define FULL_DATA_SIZE (N*20)
static void HandleError(cudaError_t err, const char* file, int line){
if(err!=cudaSuccess){
printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
exit(EXIT_FAILURE);
}
}
#define HANDLE_ERROR( err ) (HandleError(err, __FILE__, __LINE__))
__global__ void partialcompute( int *in, int* out ) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < N) {
out[idx]=in[idx];
// int idx1 = (idx + 1) % 256;
// int idx2 = (idx + 2) % 256;
// a[idx] = (a[idx] + a[idx1] + a[idx2]) / 3.0f;
//float bs = (b[idx] + b[idx1] + b[idx2]) / 3.0f;
//c[idx] = (as + bs) / 2;
}
__syncthreads();
}
__global__ void totalcompute (int *din, int* dout){
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < FULL_DATA_SIZE) {
dout[idx] = din[idx];
}
// if(idx==0){
// printf("dev_a[0] is %f, dev_o[0] is %f\n", din[0],dout[0]);
// }
__syncthreads();
}
bool compare_data(int* a, int* b){
int count = 0;
for(int t = 0; t<FULL_DATA_SIZE;t++){
if(a[t]!=b[t]){
//printf("a[%d] and b[%d] are: %d, %d\n",t,t,a[t],b[t]);
//return false;
count++;
}
}
printf("\ndiff # is %d\n",count);
return true;
}
int main( void ) {
//setup check
cudaDeviceProp prop;
int whichDevice;
HANDLE_ERROR( cudaGetDevice( &whichDevice ) );
HANDLE_ERROR( cudaGetDeviceProperties( &prop, whichDevice ) );
if (!prop.deviceOverlap) {
printf( "Device will not handle overlaps, so no speed up from streams\n" );
return 0;
}
//start
cudaEvent_t start, stop;
float elapsedTime0, elapsedTime;
cudaStream_t uploadStream, downloadStream, computeStream;
int *host_a, *host_b, *host_c, *host_out;
int *dev_a, *dev_o, *part_dev, *part_devout;
// start the timers
HANDLE_ERROR( cudaEventCreate( &start ) );
HANDLE_ERROR( cudaEventCreate( &stop ) );
// initialize the streams
HANDLE_ERROR( cudaStreamCreate( &uploadStream ) );
HANDLE_ERROR( cudaStreamCreate( &downloadStream ) );
HANDLE_ERROR( cudaStreamCreate( &computeStream ) );
HANDLE_ERROR(cudaHostAlloc((void**)&host_a,
FULL_DATA_SIZE*sizeof(int),cudaHostAllocDefault));
HANDLE_ERROR(cudaHostAlloc((void**)&host_c,
FULL_DATA_SIZE*sizeof(int),cudaHostAllocDefault));
HANDLE_ERROR(cudaMalloc((void**)&part_dev,N*sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**)&part_devout,N*sizeof(int)));
HANDLE_ERROR( cudaMalloc( (void**)&dev_a, FULL_DATA_SIZE * sizeof(int) ) );
HANDLE_ERROR( cudaMalloc( (void**)&dev_o, FULL_DATA_SIZE * sizeof(int) ) );
host_b = (int*)malloc(FULL_DATA_SIZE*sizeof(int));
host_out = (int*)malloc(FULL_DATA_SIZE*sizeof(int));
float MB = (float)100*N*sizeof(int)/1024/1024;
HANDLE_ERROR( cudaEventRecord( start, 0 ) );
for(int m =0;m<FULL_DATA_SIZE;m++){
host_a[m] = 1000+m%10000;
host_b[m] = 1000+m%10000;
}
HANDLE_ERROR(cudaMemcpy(dev_a,host_b,
FULL_DATA_SIZE*sizeof(int),cudaMemcpyHostToDevice));
totalcompute<<<FULL_DATA_SIZE/512,512,0>>>(dev_a,dev_o);
HANDLE_ERROR(cudaMemcpy(host_out,dev_o,
FULL_DATA_SIZE*sizeof(int),cudaMemcpyDeviceToHost));
printf("host_out[10999] is: %d\n", host_out[10999]);
cudaThreadSynchronize();
HANDLE_ERROR( cudaEventRecord(stop, 0));
HANDLE_ERROR( cudaEventSynchronize( stop ) );
HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime0,
start, stop ) );
printf( "Time taken: %3.1f ms\n", elapsedTime0 );
printf("speed: %3.1f MB/s \n", MB/(elapsedTime0/1000));
HANDLE_ERROR( cudaEventRecord( start, 0 ) );
// now loop over full data, in bite-sized chunks
for(int j=0; j<FULL_DATA_SIZE; j+=N){
HANDLE_ERROR(cudaMemcpyAsync(part_dev,host_a+j,
N*sizeof(int),
cudaMemcpyHostToDevice,
uploadStream));
partialcompute<<<N/512,512,0,computeStream>>>(part_dev,part_devout);
HANDLE_ERROR(cudaMemcpyAsync(host_c+j,part_devout,
N*sizeof(int),
cudaMemcpyDeviceToHost,
downloadStream));
// cudaThreadSynchronize();
};
HANDLE_ERROR( cudaStreamSynchronize( uploadStream) );
HANDLE_ERROR( cudaStreamSynchronize( downloadStream) );
HANDLE_ERROR( cudaStreamSynchronize( computeStream) );
HANDLE_ERROR( cudaEventRecord(stop, 0));
HANDLE_ERROR( cudaEventSynchronize( stop ) );
HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime,
start, stop ) );
printf( "Opt Time taken: %3.1f ms\n", elapsedTime );
printf("Opt speed: %3.1f MB/s \n", MB/(elapsedTime/1000));
if(compare_data(host_out,host_a))
printf("success!\n");
if(compare_data(host_c,host_a))
printf("success!\n");
// cleanup the streams and memory
free(host_out);
free(host_b);
HANDLE_ERROR(cudaFree(dev_a));
HANDLE_ERROR(cudaFree(dev_o));
HANDLE_ERROR(cudaFreeHost(host_a));
HANDLE_ERROR(cudaFreeHost(host_c));
HANDLE_ERROR( cudaFree(part_dev));
HANDLE_ERROR( cudaFree(part_devout));
HANDLE_ERROR( cudaStreamDestroy( uploadStream ) );
HANDLE_ERROR( cudaStreamDestroy( downloadStream ) );
HANDLE_ERROR( cudaStreamDestroy( computeStream ) );
return 0;
}
Output:
-bash-3.2$ nvcc -o 4way 4wayOverlap.cu -O
-bash-3.2$ ./4way
host_out[10999] is: 1999
Time taken: 445.0 ms
speed: 898.9 MB/s
Opt Time taken: 71.7 ms
Opt speed: 5580.7 MB/s
diff # is 0
success!
diff # is 19922944 (this should be 0)
success!
It seems that the performance improved a lot, but the computation might run into some errors due to asynchronized operations. How to change the code in order to fix the problem?
Thanks.