Hi,
I might be wrong, but I guess that the line
copyTime += elapsed_time(&thetime);
means that you actually print the time taken for the cumulated copy time. What you would rather look at corresponds to
copyTime = elapsed_time(&thetime);
Then, I should display an about solid 5200MB/s bandwidth, up to 128MB, and then some very suspicious times probably corresponding to a failure in the data transfer…
You’re right, damn bug. The issue doesn’t actually get demonstrated in this example.
Thanks anyway.
I can post up part of the code which has this problem.
int unpackDigitisedDataToGPU(int nchan, int ninp, int windowBlocks, int nbatch, int bits_per_samp, unsigned char *digitised_data, float *cuda_inp_buf, int debug, int wordtype)
{
int i;
static int init = 0, ntoread = 0;
static unsigned char *cudaBuffer;
//static unsigned char *buffer;
static int numThreads = 128;
struct timeval starttime;
gettimeofday( &starttime, NULL );
if( init == 0 )
{
ntoread = ninp * nchan * 2 * nbatch * bits_per_samp / 8;
init = 1;
cudaMalloc( (void **)&cudaBuffer, ntoread );
//buffer = (unsigned char *)malloc(ntoread);
if( debug )
fprintf( stderr, "size of read buffer: %d bytes\n", ntoread );
}
struct timeval thetime;
float cudaCopyTime=0, totalTime=0, unpackTime=0;
//memcpy( buffer, digitised_data, ntoread );
gettimeofday( &thetime, NULL );
cudaMemcpy( cudaBuffer, digitised_data, ntoread, cudaMemcpyHostToDevice );
cudaCopyTime += elapsed_time(&thetime);
/* Copy the last (windowBlocks-1) chunks to the beginning for each stream.
*
*/
for( i = 0; i < ninp; i++ )
cudaMemcpy( &cuda_inp_buf[i * (nbatch+windowBlocks-1) * nchan * 2],
&cuda_inp_buf[i*(nbatch+windowBlocks-1)*nchan*2 + (nbatch)*nchan*2],
(windowBlocks-1) * nchan * 2 * sizeof(float), cudaMemcpyDeviceToDevice );
/* Thread number should be multiple of 32 for best efficiency */
/* Assume nchan to be power of 2 and larger than numThreads */
dim3 threads( numThreads, 1, 1 );
dim3 blocks( nchan * 2 / numThreads, nbatch, ninp );
/* cuda_inp_buf needs to be offset by (windowBlocks-1) chunks due to the algorithm design */
gettimeofday( &thetime, NULL );
if( wordtype == 0 )
unpackUnsignedData_kernel<<< blocks, threads >>>(cudaBuffer, &cuda_inp_buf[(windowBlocks-1) * nchan * 2]);
/* FIXME: Not sure about the correctness of signed data unpacking */
else if( wordtype == 1 )
unpackSignedData_kernel<<< blocks, threads >>>(cudaBuffer, &cuda_inp_buf[(windowBlocks-1) * nchan * 2]);
cudaThreadSynchronize();
unpackTime += elapsed_time(&thetime);
totalTime += elapsed_time(&starttime);
fprintf( stderr, "cudaMemcpy time: %g, size: %d MB\n", cudaCopyTime, ntoread / 1024 / 1024 );
return 0;
}
And the output looks like:
cudaMemcpy time: 0.787, size: 4 MB
cudaMemcpy time: 22.609, size: 4 MB
cudaMemcpy time: 22.852, size: 4 MB
cudaMemcpy time: 22.861, size: 4 MB
cudaMemcpy time: 22.854, size: 4 MB
cudaMemcpy time: 22.838, size: 4 MB
cudaMemcpy time: 22.832, size: 4 MB
cudaMemcpy time: 22.8329, size: 4 MB
cudaMemcpy time: 22.841, size: 4 MB
cudaMemcpy time: 22.849, size: 4 MB
cudaMemcpy time: 22.857, size: 4 MB
cudaMemcpy time: 22.833, size: 4 MB
cudaMemcpy time: 22.841, size: 4 MB
cudaMemcpy time: 22.826, size: 4 MB
cudaMemcpy time: 22.848, size: 4 MB
cudaMemcpy time: 22.839, size: 4 MB
cudaMemcpy time: 22.848, size: 4 MB
cudaMemcpy time: 22.85, size: 4 MB
cudaMemcpy time: 22.829, size: 4 MB
cudaMemcpy time: 22.843, size: 4 MB
cudaMemcpy time: 22.838, size: 4 MB
cudaMemcpy time: 22.853, size: 4 MB
cudaMemcpy time: 22.833, size: 4 MB
cudaMemcpy time: 22.827, size: 4 MB
cudaMemcpy time: 22.838, size: 4 MB
cudaMemcpy time: 22.83, size: 4 MB
cudaMemcpy time: 22.828, size: 4 MB
cudaMemcpy time: 22.828, size: 4 MB
cudaMemcpy time: 22.846, size: 4 MB
cudaMemcpy time: 22.834, size: 4 MB
cudaMemcpy time: 22.798, size: 4 MB
cudaMemcpy time: 22.784, size: 4 MB
cudaMemcpy time: 22.76, size: 4 MB
cudaMemcpy time: 22.77, size: 4 MB
cudaMemcpy time: 22.779, size: 4 MB
cudaMemcpy time: 22.78, size: 4 MB
cudaMemcpy time: 22.775, size: 4 MB
cudaMemcpy time: 22.778, size: 4 MB
cudaMemcpy time: 22.783, size: 4 MB
cudaMemcpy time: 22.823, size: 4 MB
cudaMemcpy time: 22.805, size: 4 MB
cudaMemcpy time: 22.786, size: 4 MB
cudaMemcpy time: 22.788, size: 4 MB
cudaMemcpy time: 22.787, size: 4 MB
cudaMemcpy time: 22.78, size: 4 MB
cudaMemcpy time: 22.791, size: 4 MB
cudaMemcpy time: 22.779, size: 4 MB
cudaMemcpy time: 22.766, size: 4 MB
cudaMemcpy time: 22.775, size: 4 MB
cudaMemcpy time: 22.833, size: 4 MB
cudaMemcpy time: 22.786, size: 4 MB
cudaMemcpy time: 22.779, size: 4 MB
cudaMemcpy time: 22.779, size: 4 MB
cudaMemcpy time: 22.776, size: 4 MB
cudaMemcpy time: 22.788, size: 4 MB
cudaMemcpy time: 22.767, size: 4 MB
cudaMemcpy time: 22.772, size: 4 MB
cudaMemcpy time: 22.762, size: 4 MB
cudaMemcpy time: 22.771, size: 4 MB
cudaMemcpy time: 22.83, size: 4 MB
cudaMemcpy time: 22.785, size: 4 MB
cudaMemcpy time: 22.768, size: 4 MB
cudaMemcpy time: 22.781, size: 4 MB
cudaMemcpy time: 22.761, size: 4 MB
cudaMemcpy time: 22.772, size: 4 MB
cudaMemcpy time: 22.764, size: 4 MB
cudaMemcpy time: 22.822, size: 4 MB
cudaMemcpy time: 22.778, size: 4 MB
cudaMemcpy time: 22.79, size: 4 MB
cudaMemcpy time: 22.814, size: 4 MB
cudaMemcpy time: 22.795, size: 4 MB
cudaMemcpy time: 22.768, size: 4 MB
cudaMemcpy time: 22.78, size: 4 MB
cudaMemcpy time: 22.776, size: 4 MB
cudaMemcpy time: 22.771, size: 4 MB
cudaMemcpy time: 22.776, size: 4 MB
cudaMemcpy time: 22.803, size: 4 MB
cudaMemcpy time: 22.783, size: 4 MB
cudaMemcpy time: 22.769, size: 4 MB
cudaMemcpy time: 22.801, size: 4 MB
cudaMemcpy time: 22.78, size: 4 MB
cudaMemcpy time: 22.7659, size: 4 MB
cudaMemcpy time: 22.812, size: 4 MB
cudaMemcpy time: 22.827, size: 4 MB
cudaMemcpy time: 22.794, size: 4 MB
cudaMemcpy time: 22.82, size: 4 MB
cudaMemcpy time: 22.826, size: 4 MB
cudaMemcpy time: 22.791, size: 4 MB
cudaMemcpy time: 22.811, size: 4 MB
cudaMemcpy time: 22.842, size: 4 MB
cudaMemcpy time: 22.796, size: 4 MB
cudaMemcpy time: 22.798, size: 4 MB
cudaMemcpy time: 22.787, size: 4 MB
cudaMemcpy time: 22.789, size: 4 MB
cudaMemcpy time: 22.81, size: 4 MB
cudaMemcpy time: 22.773, size: 4 MB
cudaMemcpy time: 22.822, size: 4 MB
cudaMemcpy time: 22.786, size: 4 MB
cudaMemcpy time: 22.81, size: 4 MB
cudaMemcpy time: 22.834, size: 4 MB
cudaMemcpy time: 22.796, size: 4 MB
cudaMemcpy time: 22.776, size: 4 MB
cudaMemcpy time: 22.776, size: 4 MB
cudaMemcpy time: 22.801, size: 4 MB
cudaMemcpy time: 22.783, size: 4 MB
And no, I didn’t use the
+=
operator for the cuda memcpy time.