I have recently ported an application to CUDA. Since most of the application's execution time is spent transferring data, I thought I would use streams to overlap the memory transfers with each other (and, less importantly, to overlap kernel execution with the transfers). I took the simpleMultiCopy example from the SDK and modified it to work with my data. The modifications include performing two input memcpys per cycle, reusing one of the input buffers as the output buffer on the GPU, and making sure that the kernel never runs on non-existent data and that only data which has actually been processed is copied back to the host. I've included a sketch of the code below to show the order in which I issue commands:
// Read first input
memcpy(pinned_data_in0[current_stream], pagable_data_in0, stream_size);
memcpy(pinned_data_in1[current_stream], pagable_data_in1, stream_size);

// Upload first frame
cudaMemcpyAsync(d_data_in_out[current_stream], pinned_data_in0[current_stream],
                stream_size, cudaMemcpyHostToDevice, stream[current_stream]);
cudaMemcpyAsync(d_data_in[current_stream], pinned_data_in1[current_stream],
                stream_size, cudaMemcpyHostToDevice, stream[current_stream]);

for (unsigned int i = 0; i < kernel_calls; i++) {
    next_stream = (current_stream + 1) % streams_used;

    if (i >= streams_used) {
        // Store the result, but only when the stream we are trying to
        // retrieve the result from has actually run once.
        memcpy(&pagable_data_out[(i - streams_used) *
                   (stream_size / sizeof(pagable_data_out[0]))],
               pinned_data_out[current_stream], stream_size);
    }

    if (i + 1 < kernel_calls) {
        // Read new input, but only if the next stream still has data to
        // process.
        memcpy(pinned_data_in0[next_stream],
               &pagable_data_in0[(i + 1) *
                   (stream_size / sizeof(pagable_data_in0[0]))],
               stream_size);
        memcpy(pinned_data_in1[next_stream],
               &pagable_data_in1[(i + 1) *
                   (stream_size / sizeof(pagable_data_in1[0]))],
               stream_size);
    }

    // Ensure that processing and copying of the last cycle has finished.
    cudaEventSynchronize(cycleDone[next_stream]);

    // Process current frame
    myKernel<<<grid, block, 0, stream[current_stream]>>>(
        d_data_const, d_data_in[current_stream],
        d_data_in_out[current_stream], stream_size);

    if (i + 1 < kernel_calls) {
        // Upload next frame, but only if there is remaining data to
        // process.
        cudaMemcpyAsync(d_data_in_out[next_stream], pinned_data_in0[next_stream],
                        stream_size, cudaMemcpyHostToDevice, stream[next_stream]);
        cudaMemcpyAsync(d_data_in[next_stream], pinned_data_in1[next_stream],
                        stream_size, cudaMemcpyHostToDevice, stream[next_stream]);
    }

    // Download current frame
    cudaMemcpyAsync(pinned_data_out[current_stream], d_data_in_out[current_stream],
                    stream_size, cudaMemcpyDeviceToHost, stream[current_stream]);
    cudaEventRecord(cycleDone[current_stream], stream[current_stream]);

    current_stream = next_stream;
}

// Flush the rest of the results from the pinned memory to the pageable memory.
for (unsigned int i = kernel_calls; i < kernel_calls + streams_used; i++) {
    next_stream = (current_stream + 1) % streams_used;

    if (i >= streams_used) {
        // Ensure that processing and copying of the cycle has finished.
        cudaEventSynchronize(cycleDone[current_stream]);
        // Store the result, but only when the stream we are trying to
        // retrieve the result from has actually run once.
        memcpy(&pagable_data_out[(i - streams_used) *
                   (stream_size / sizeof(pagable_data_out[0]))],
               pinned_data_out[current_stream], stream_size);
    }

    current_stream = next_stream;
}
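For completeness, the buffers, streams and events the sketch refers to are created roughly as follows. This is a simplified sketch: the element type (float), the buffer sizes and the allocation flags are placeholders, and error checking is omitted, but the names match the code above.

const unsigned int streams_used = 4;          // placeholder value
const size_t stream_size = 8 * 1024 * 1024;   // 8 MB per kernel call

cudaStream_t stream[streams_used];
cudaEvent_t  cycleDone[streams_used];
float *pinned_data_in0[streams_used], *pinned_data_in1[streams_used];
float *pinned_data_out[streams_used];
float *d_data_in[streams_used], *d_data_in_out[streams_used];

for (unsigned int s = 0; s < streams_used; s++) {
    cudaStreamCreate(&stream[s]);
    cudaEventCreate(&cycleDone[s]);
    // cudaMemcpyAsync only overlaps with other work when the host side
    // of the transfer is page-locked, hence cudaHostAlloc rather than malloc.
    cudaHostAlloc((void **)&pinned_data_in0[s], stream_size, cudaHostAllocDefault);
    cudaHostAlloc((void **)&pinned_data_in1[s], stream_size, cudaHostAllocDefault);
    cudaHostAlloc((void **)&pinned_data_out[s], stream_size, cudaHostAllocDefault);
    cudaMalloc((void **)&d_data_in[s], stream_size);
    cudaMalloc((void **)&d_data_in_out[s], stream_size);
}

Note that cycleDone[next_stream] has not been recorded yet during the first few cycles, but as far as I understand cudaEventSynchronize returns immediately for an event that has never been recorded, so the first iterations don't block.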
The problem I'm having is that the streaming version gives lower throughput than a non-streaming implementation that uses synchronous memory transfers, even when I make sure the kernels in both implementations launch the same number of threads. For example, when using 4 streams to process 128 MB of data (each kernel call processing 8 MB), the throughput is around 290 MB/s, whereas when I prep and launch a single kernel to process 8 MB I get around 330 MB/s. Is there something inefficient about the order in which I issue commands to the streams?
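For reference, the non-streaming implementation I'm comparing against is essentially the following (again only a sketch, using the same placeholder names; it walks over the data in 8 MB chunks, but synchronously and with a single set of device buffers):

for (unsigned int i = 0; i < kernel_calls; i++) {
    size_t offset = i * (stream_size / sizeof(pagable_data_in0[0]));
    // Synchronous copies block the host until each transfer completes,
    // so no pinned staging buffers or events are involved here.
    cudaMemcpy(d_data_in_out, &pagable_data_in0[offset], stream_size,
               cudaMemcpyHostToDevice);
    cudaMemcpy(d_data_in, &pagable_data_in1[offset], stream_size,
               cudaMemcpyHostToDevice);
    myKernel<<<grid, block>>>(d_data_const, d_data_in, d_data_in_out, stream_size);
    cudaMemcpy(&pagable_data_out[offset], d_data_in_out, stream_size,
               cudaMemcpyDeviceToHost);
}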
I'm using the CUDA 4.0 toolkit on a Tesla C2070 under Windows 7 64-bit, and I compile the code as a Win32/Release build.