Hi! I am trying to find out how to accomplish concurrent H2D + D2H transfers. This is with CUDA 7.5 (and CUDA 7.0) on a Xeon E5-2667 v3 based server with an Intel C610 chipset, with two TITAN X cards and one Tesla K40m installed. The value of cudaDeviceProp.asyncEngineCount indicates these cards have two DMA engines. Advertising material suggests that two asynchronous engines should allow simultaneous read and write transfers across the PCI Express interface, i.e. full-duplex, concurrent bidirectional PCIe transfers. For example, the CUDA bandwidthTest sample reports separate speeds of ~10 GB/s for reads and ~10 GB/s for writes, so I'd expect bidirectional read+write to reach around ~20 GB/s in aggregate, or somewhat less due to host memory bandwidth.
According to a previous thread, H2D + D2H transfers become concurrent when they are initiated from separate streams (https://devtalk.nvidia.com/default/topic/813603/cuda-combining-h2d-and-d2h-memory-transfer-operations/).
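For reference, the basic pattern I understand that thread to describe is roughly the following (a minimal sketch with assumed names: h_in/h_out are pinned host buffers from cudaHostAlloc, d_in/d_out are device buffers of nbyte bytes each, error checking omitted):

cudaStream_t s_up, s_down;
cudaStreamCreate(&s_up);
cudaStreamCreate(&s_down);
// Issue the H2D and D2H copies from two different non-default streams so that
// the two copy engines can, in principle, execute them at the same time.
cudaMemcpyAsync(d_in,  h_in,  nbyte, cudaMemcpyHostToDevice, s_up);
cudaMemcpyAsync(h_out, d_out, nbyte, cudaMemcpyDeviceToHost, s_down);
cudaDeviceSynchronize();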
I wrote a small test program, given below. The results, also given below, were a bit surprising. My questions are:
- Any ideas why the K40m does not overlap the transfers? What is necessary in a CUDA 7.x program to make the K40m carry out the transfers concurrently?
- Concurrent transfers on the TITAN X apparently work, but each runs at only roughly half speed, so in total they are not much faster than serial transfers. What might be causing this? Is it a TITAN X limitation, or does it have something to do with the PCIe bus of the host?
Results for H2D+D2H (near-)concurrent transfer on a TITAN X card:
==108594== NVPROF is profiling process 108594, command: ./membw2 0
CUDA Device #0 : GeForce GTX TITAN X, Compute Capability 5.2, data transfer concurrency, 2 DMA engines
4 x 390.6 MiB : 0.1320 s GPU : 0.1321s CPU : 11.556 GB/s
==108594== Profiling result:
Start Duration Grid Size Block Size Regs* SSMem* DSMem* Size Throughput Device Context Stream Name
3.37848s 63.582ms - - - - - 390.63MB 5.9996GB/s GeForce GTX TIT 1 13 [CUDA memcpy HtoD]
3.38164s 65.698ms - - - - - 390.63MB 5.8064GB/s GeForce GTX TIT 1 14 [CUDA memcpy HtoD]
3.44207s 65.696ms - - - - - 390.63MB 5.8066GB/s GeForce GTX TIT 1 13 [CUDA memcpy HtoD]
3.44734s 63.147ms - - - - - 390.63MB 6.0410GB/s GeForce GTX TIT 1 14 [CUDA memcpy HtoD]
When the same transfers are forced to run serially (this particular run was on the K40m, device 2):
==108516== NVPROF is profiling process 108516, command: ./membw2 2 1
CUDA Device #2 : Tesla K40m, Compute Capability 3.5, data transfer concurrency, 2 DMA engines
Changed Stream IDs to 0 to force running H2D, D2H xfers serially.
4 x 390.6 MiB : 0.1553 s GPU : 0.1553s CPU : 9.828 GB/s
==108516== Profiling result:
Start Duration Grid Size Block Size Regs* SSMem* DSMem* Size Throughput Device Context Stream Name
3.37517s 38.834ms - - - - - 390.63MB 9.8232GB/s Tesla K40m (2) 1 7 [CUDA memcpy HtoD]
3.41400s 38.779ms - - - - - 390.63MB 9.8369GB/s Tesla K40m (2) 1 7 [CUDA memcpy HtoD]
3.45279s 38.834ms - - - - - 390.63MB 9.8231GB/s Tesla K40m (2) 1 7 [CUDA memcpy HtoD]
3.49162s 38.779ms - - - - - 390.63MB 9.8369GB/s Tesla K40m (2) 1 7 [CUDA memcpy HtoD]
Concurrent transfers on the K40m do not actually get carried out concurrently, at least judging from the nvprof timestamps:
==109753== NVPROF is profiling process 109753, command: ./membw2 2
CUDA Device #2 : Tesla K40m, Compute Capability 3.5, data transfer concurrency, 2 DMA engines
4 x 390.6 MiB : 0.1553 s GPU : 0.1553s CPU : 9.827 GB/s
==109753== Profiling result:
Start Duration Grid Size Block Size Regs* SSMem* DSMem* Size Throughput Device Context Stream Name
3.38293s 38.837ms - - - - - 390.63MB 9.8223GB/s Tesla K40m (2) 1 13 [CUDA memcpy HtoD]
3.42177s 38.782ms - - - - - 390.63MB 9.8362GB/s Tesla K40m (2) 1 14 [CUDA memcpy HtoD]
3.46055s 38.837ms - - - - - 390.63MB 9.8224GB/s Tesla K40m (2) 1 14 [CUDA memcpy HtoD]
3.49939s 38.782ms - - - - - 390.63MB 9.8362GB/s Tesla K40m (2) 1 13 [CUDA memcpy HtoD]
(Btw, yes, nvprof reports HtoD four times, even though the source code has two cudaMemcpyDeviceToHost and two cudaMemcpyHostToDevice calls. Is that some bug in my program?)
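For reference while reading the listing: the signature is cudaMemcpyAsync(dst, src, count, kind, stream), so a D2H copy into the pinned host buffer would normally be written like this one-liner (sketch only, reusing my variable names):

cudaMemcpyAsync(h_odata, d_odata, nbyte, cudaMemcpyDeviceToHost, s[1]);   // dst = host, src = device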
// File: membw2.cu
// Attempts concurrent D2H+H2D memory transfer.
// $ nvprof --profile-api-trace none --print-gpu-trace ./membw2
#include <cuda.h>
#include <helper_cuda.h>
#include <stdio.h>
#include <math.h>
#include <stdint.h>
#include <inttypes.h>
#include <sys/time.h>
int main(int argc, char** argv)
{
    int device = 0, run_serially = 0;
    cudaDeviceProp cuDev;
    size_t nbyte = 409600 * 1000;   // 409,600,000 bytes = ~390.6 MiB per buffer
    cudaEvent_t tstart, tstop;
    struct timeval tv_start, tv_stop;
    float dt_msec, dt_cpu;
    float *d_idata, *h_idata;
    float *d_odata, *h_odata;
    cudaStream_t s[2];

    if (argc > 1) {
        device = atoi(argv[1]);
    }
    if (argc > 2) {
        run_serially = atoi(argv[2]);
    }

    checkCudaErrors( cudaSetDevice(device) );
    checkCudaErrors( cudaEventCreate( &tstart ) );
    checkCudaErrors( cudaEventCreate( &tstop ) );
    checkCudaErrors( cudaGetDeviceProperties(&cuDev, device) );
    printf("CUDA Device #%d : %s, Compute Capability %d.%d, %s, %d DMA engines\n",
        device, cuDev.name, cuDev.major, cuDev.minor,
        cuDev.deviceOverlap ? "data transfer concurrency" : "no data transfer concurrency",
        cuDev.deviceOverlap ? cuDev.asyncEngineCount : 1
    );

    checkCudaErrors( cudaStreamCreate(&s[0]) );
    checkCudaErrors( cudaStreamCreate(&s[1]) );
    checkCudaErrors( cudaMalloc( (void **)&d_idata, nbyte ) );
    checkCudaErrors( cudaMalloc( (void **)&d_odata, nbyte ) );
    checkCudaErrors( cudaHostAlloc( (void **)&h_idata, nbyte, cudaHostAllocDefault ) );
    checkCudaErrors( cudaHostAlloc( (void **)&h_odata, nbyte, cudaHostAllocDefault ) );

    if (run_serially) {
        // Point both handles at the legacy default stream so the copies run back to back.
        s[0] = 0; s[1] = 0;
        printf("Changed Stream IDs to 0 to force running H2D, D2H xfers serially.\n");
    }

    gettimeofday(&tv_start, NULL);
    checkCudaErrors( cudaEventRecord(tstart) );
    // H2D+D2H
    checkCudaErrors( cudaMemcpyAsync(d_idata, h_idata, nbyte, cudaMemcpyHostToDevice, s[0]) );
    checkCudaErrors( cudaMemcpyAsync(d_odata, h_odata, nbyte, cudaMemcpyDeviceToHost, s[1]) );
    // H2D+D2H once more
    checkCudaErrors( cudaMemcpyAsync(d_idata, h_idata, nbyte, cudaMemcpyHostToDevice, s[0]) );
    checkCudaErrors( cudaMemcpyAsync(d_odata, h_odata, nbyte, cudaMemcpyDeviceToHost, s[1]) );
    checkCudaErrors( cudaEventRecord(tstop) );
    checkCudaErrors( cudaEventSynchronize(tstop) );
    checkCudaErrors( cudaEventElapsedTime(&dt_msec, tstart, tstop) );
    gettimeofday(&tv_stop, NULL);

    dt_cpu = (tv_stop.tv_sec - tv_start.tv_sec) + 1e-6*(tv_stop.tv_usec - tv_start.tv_usec);
    double L_mib = nbyte/1048576.0;
    double R_total = 4 * nbyte/(dt_msec*1e-3*1024.0*1048576.0); // 4 x nbyte were copied
    printf(" 4 x %.1f MiB : %.4f s GPU : %.4fs CPU : %.3f GB/s\n",
        L_mib, dt_msec*1e-3, dt_cpu, R_total
    );

    return 0;
}
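For completeness, a typical build and profile invocation (helper_cuda.h comes from the CUDA samples; the include path below is an assumption, adjust it to your installation):

$ nvcc -O2 -I/usr/local/cuda/samples/common/inc membw2.cu -o membw2
$ nvprof --profile-api-trace none --print-gpu-trace ./membw2 0      # device 0 (TITAN X), separate streams
$ nvprof --profile-api-trace none --print-gpu-trace ./membw2 2 1    # device 2 (K40m), copies forced serial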