Here’s some of my code.
// Element-wise vector add: sum[tid] = a[tid] + b[tid] for tid < N.
// The R-iteration loop re-computes the same value on purpose: it only
// lengthens kernel execution time so that copy/compute overlap becomes
// visible in a profiler timeline. Expects a 1-D launch configuration.
//   a, b  : device input vectors (at least N elements each)
//   sum   : device output vector (at least N elements)
//   N     : number of elements in this partition
//   R     : artificial repeat count (work multiplier)
__global__ void vecAddGpu(int *a, int *b, int *sum, const int N, const int R)
{
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    if (tid < N) {
        for (int i = 0; i < R; i++) {
            sum[tid] = a[tid] + b[tid];
        }
    }
}
// Host driver: splits the vectors into NSTREAM partitions and issues
// H2D copy / kernel / D2H copy per partition on separate streams, so
// transfers in opposite directions and kernel execution can overlap
// on devices with dedicated copy engines.
int main()
{
    int nsize  = 512 * (1 << 5);        // total number of elements
    int dimx   = 128;                   // threads per block
    int repeat = 256;                   // kernel work multiplier
    int nbyte  = sizeof(int) * nsize;   // total bytes
    int isize  = nsize / NSTREAM;       // elements per stream partition
    int ibyte  = sizeof(int) * isize;   // bytes per stream partition

    dim3 Db(dimx, 1);
    // BUG FIX: grid size must be derived from the ELEMENT count (isize),
    // not the BYTE count (ibyte) — the original launched sizeof(int)x
    // too many blocks (the excess blocks were only masked by the tid < N
    // guard, wasting launch/occupancy resources).
    dim3 Dg((isize + Db.x - 1) / Db.x, 1);

    int *h_a, *h_b, *gpuRef;            // pinned host buffers
    int *d_a, *d_b, *d_sum;             // device buffers

    // Pinned host memory is required for cudaMemcpyAsync to actually
    // run asynchronously (pageable memory forces a staged, sync copy).
    CHECK( cudaHostAlloc((void**)&h_a,    nbyte, cudaHostAllocDefault) );
    CHECK( cudaHostAlloc((void**)&h_b,    nbyte, cudaHostAllocDefault) );
    CHECK( cudaHostAlloc((void**)&gpuRef, nbyte, cudaHostAllocDefault) );
    // NOTE(review): h_a/h_b are never initialized here — fine for a pure
    // overlap experiment, but fill them before validating results.

    CHECK( cudaMalloc((void**)&d_a,   nbyte) );
    CHECK( cudaMalloc((void**)&d_b,   nbyte) );
    CHECK( cudaMalloc((void**)&d_sum, nbyte) );

    cudaStream_t streams[NSTREAM];
    for (int i = 0; i < NSTREAM; i++)
        CHECK( cudaStreamCreate(&streams[i]) );   // was unchecked before

    for (int i = 0; i < NSTREAM; i++) {
        int ioffset = i * isize;
        CHECK( cudaMemcpyAsync(&d_a[ioffset], &h_a[ioffset], ibyte,
                               cudaMemcpyHostToDevice, streams[i]) );
        CHECK( cudaMemcpyAsync(&d_b[ioffset], &h_b[ioffset], ibyte,
                               cudaMemcpyHostToDevice, streams[i]) );
        vecAddGpu<<<Dg, Db, 0, streams[i]>>>(&d_a[ioffset], &d_b[ioffset],
                                             &d_sum[ioffset], isize, repeat);
        // Kernel launches return no status directly; catch bad launch
        // configurations explicitly.
        CHECK( cudaGetLastError() );
        CHECK( cudaMemcpyAsync(&gpuRef[ioffset], &d_sum[ioffset], ibyte,
                               cudaMemcpyDeviceToHost, streams[i]) );
    }

    // All of the above is asynchronous: wait before reading gpuRef or
    // tearing anything down.
    CHECK( cudaDeviceSynchronize() );

    // Cleanup (was missing: leaked streams and both device and pinned
    // host allocations).
    for (int i = 0; i < NSTREAM; i++)
        CHECK( cudaStreamDestroy(streams[i]) );
    CHECK( cudaFree(d_a) );
    CHECK( cudaFree(d_b) );
    CHECK( cudaFree(d_sum) );
    CHECK( cudaFreeHost(h_a) );
    CHECK( cudaFreeHost(h_b) );
    CHECK( cudaFreeHost(gpuRef) );

    return 0;
}
I think the code itself is fine; it was simply my mistake to expect overlapping of data transfers going in the same direction, which was foolish of me.
But, initially I had no overlapping at all when I had relatively large amount of data to process.
I was suspicious about it as a possible cause, and reduced it a lot. Then, I started seeing overlapping.
How can I determine or check the appropriate amount of data needed to observe overlapping on a given GPU?
It would be great if there were an easy way to check this, since GPU resources are the limiting factor for overlapping, as far as I know.
Thank you.