I am running CUDA 4.0 on Linux using CUDA’s SDK as development framework.
I am running on 4 devices split between 2 GTX 590s.
I wanted to try to improve the bandwidth of a Host to Device broadcast using CUDA 4.0. For the following code I get a measured bandwidth of 1.45GB/s and no verification errors (it works fine!):
...
cutilSafeCall(cudaMallocHost(&h0, buf_size)); // Automatically portable with UVA
printf("Synchronous Broadcast:");
__QC_START_TIMER //timer macro
for(int i=0; i<trials; i++){
//copy to g0 on device0
cutilSafeCall(cudaMemcpy(g0, h0, buf_size, cudaMemcpyDefault));
//copy to g1 on device1
cutilSafeCall(cudaMemcpy(g1, h0, buf_size, cudaMemcpyDefault));
//copy to g2 on device2
cutilSafeCall(cudaMemcpy(g2, h0, buf_size, cudaMemcpyDefault));
//copy to g3 on device3
cutilSafeCall(cudaMemcpy(g3, h0, buf_size, cudaMemcpyDefault));
}
__QC_STOP_TIMER //timer macro
printf(" %.2fGB/s\n",(1.0f / (__QC_ELAPSED_TIME / 1000.0f)) * (float(trials * buf_size)) / 1000.0f / 1000.0f / 1000.0f);
VERIFY_DATA(); //verification macro
...
However, I thought I could speed up the bandwidth a bit by using cudaMemcpyAsync(); I get 4.32GB/s bandwidth, but the verification fails on g1, g2 and g3 for this code:
...
//streams were created on their respective devices
printf("Asynchronous Broadcast:");
trials = 1;
__QC_START_TIMER
for(int i=0; i<trials; i++){
cutilSafeCall(cudaSetDevice(gpuid_tesla[0]));
cutilSafeCall(cudaMemcpyAsync(g0, h0, buf_size, cudaMemcpyDefault, stream[0]));
cutilSafeCall(cudaSetDevice(gpuid_tesla[1]));
cutilSafeCall(cudaMemcpyAsync(g1, h0, buf_size, cudaMemcpyDefault, stream[1]));
cutilSafeCall(cudaSetDevice(gpuid_tesla[2]));
cutilSafeCall(cudaMemcpyAsync(g2, h0, buf_size, cudaMemcpyDefault, stream[2]));
cutilSafeCall(cudaSetDevice(gpuid_tesla[3]));
cutilSafeCall(cudaMemcpyAsync(g3, h0, buf_size, cudaMemcpyDefault, stream[3]));
cutilSafeCall(cudaStreamSynchronize(stream[0]));
cutilSafeCall(cudaStreamSynchronize(stream[1]));
cutilSafeCall(cudaStreamSynchronize(stream[2]));
cutilSafeCall(cudaStreamSynchronize(stream[3]));
cutilSafeCall(cudaDeviceSynchronize());
}
__QC_STOP_TIMER
printf(" %.2fGB/s\n",(1.0f / (__QC_ELAPSED_TIME / 1000.0f)) * (float(trials * buf_size)) / 1000.0f / 1000.0f / 1000.0f);
VERIFY_DATA();
...
I seems like I am only recording the time it takes for the first copy to complete (from h0 to g0).
Am I missing something? Do the other copies fail because they are reading from the same pinned host memory (h0)?