I wrote the following code to test the memory transfer bandwidth of our machine, which has six C2050 devices. Since the C2050 supports bi-directional async memcpy, I used 8 streams to transfer memory H2D and D2H at the same time. But I found that the aggregate bandwidth of the bi-directional memcpy is actually lower than H2D alone.
Here are the results:
[root@A124 test]# ./a.out 0
testing uni-directional bandwidth:
time elapsed:0.355392
bandwidth at block size 67108864: 5.62759GB/s
[root@A124 test]# ./a.out 1
testing bi-directional bandwidth:
time elapsed:0.422472
bandwidth at block size 67108864: 4.73404GB/s
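(Sanity check on the numbers: each run moves 64 MiB × 8 streams × 4 iterations = 2 GiB in total, so the reported bandwidth is just 2 GiB over the elapsed time: 2 / 0.355392 ≈ 5.63 GB/s and 2 / 0.422472 ≈ 4.73 GB/s.)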
Any ideas? Is it possible that bi-directional transfer is somehow disabled on our machine?
OS: RHEL 5.3 x86_64
driver: 195.36.20
NVCC: release 3.0, V0.2.1221
deviceQuery Result:
Device 0: "Tesla C2050"
CUDA Driver Version: 3.0
CUDA Runtime Version: 3.0
CUDA Capability Major revision number: 2
CUDA Capability Minor revision number: 0
Total amount of global memory: 2817982464 bytes
Number of multiprocessors: 14
Number of cores: 448
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 49152 bytes
Total number of registers available per block: 32768
Warp size: 32
Maximum number of threads per block: 1024
Maximum sizes of each dimension of a block: 1024 x 1024 x 64
Maximum sizes of each dimension of a grid: 65535 x 65535 x 1
Maximum memory pitch: 2147483647 bytes
Texture alignment: 512 bytes
Clock rate: 1.15 GHz
Concurrent copy and execution: Yes
Run time limit on kernels: No
Integrated: No
Support host page-locked memory mapping: Yes
Compute mode: Default (multiple host threads can use this device simultaneously)
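To rule out a capability problem, one can also query the device properties programmatically instead of relying on deviceQuery. Here is a minimal sketch (my addition, not part of the test below) that prints the deviceOverlap flag for device 0; toolkits newer than the 3.0 we run also expose an asyncEngineCount field that reports the number of copy engines, but that field does not exist in the 3.0 runtime.

#include <cstdio>
#include <cuda_runtime.h>

// Sketch: check whether device 0 reports support for overlapping
// copies with execution. Under CUDA 3.0 the relevant field is
// deviceOverlap; CUDA 4.0+ also exposes asyncEngineCount.
int main() {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    printf("Device 0: %s\n", prop.name);
    printf("deviceOverlap: %d\n", prop.deviceOverlap);
    return 0;
}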
Here is the code I used:
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <sched.h>
#include <sys/time.h>
#include <cuda_runtime.h>
using namespace std;
#define CUDA_SAFE_CALL_NO_SYNC(call) {                                   \
    cudaError err = call;                                                 \
    if (cudaSuccess != err) {                                             \
        fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",     \
                __FILE__, __LINE__, cudaGetErrorString(err));             \
        exit(EXIT_FAILURE);                                               \
    } }
#define CUDA_SAFE_CALL(call) CUDA_SAFE_CALL_NO_SYNC(call)
const int STREAMS=8;
// Returns wall-clock time in seconds, after draining all pending GPU work.
double get_time() {
    cudaThreadSynchronize();
    timeval t;
    gettimeofday(&t, 0);
    return (double)t.tv_sec + (double)t.tv_usec / 1000000;
}
void bw_test(int bidirect) {
    CUDA_SAFE_CALL( cudaSetDevice(0) );

    // Pin this thread to CPU 0 so the pinned buffers stay on one NUMA node.
    // Note: the size argument must be the size of the cpu set in bytes.
    cpu_set_t cpu_set;
    CPU_ZERO(&cpu_set);
    CPU_SET(0, &cpu_set);
    sched_setaffinity(0, sizeof(cpu_set), &cpu_set);

    cudaStream_t streams[STREAMS];
    for (int i = 0; i < STREAMS; i++)
        CUDA_SAFE_CALL( cudaStreamCreate(&streams[i]) );

    void *h_mem[STREAMS];
    void *d_mem[STREAMS];
    unsigned block_size = 64 * 1024 * 1024;
    unsigned iter = 4;

    // Allocate pinned host memory and device memory for each stream.
    for (int i = 0; i < STREAMS; i++) {
        CUDA_SAFE_CALL( cudaMallocHost(&h_mem[i], block_size) );
        memset(h_mem[i], 1, block_size);
        CUDA_SAFE_CALL( cudaMalloc((void**)&d_mem[i], block_size) );
    }

    // Test bandwidth: even streams copy H2D; odd streams copy D2H in
    // bi-directional mode, H2D otherwise.
    double start = get_time();
    for (unsigned i = 0; i < iter; i++) {
        for (int j = 0; j < STREAMS; j++) {
            if (j % 2 == 0) {
                CUDA_SAFE_CALL( cudaMemcpyAsync(d_mem[j], h_mem[j], block_size,
                                                cudaMemcpyHostToDevice, streams[j]) );
            }
            else if (bidirect) {
                CUDA_SAFE_CALL( cudaMemcpyAsync(h_mem[j], d_mem[j], block_size,
                                                cudaMemcpyDeviceToHost, streams[j]) );
            }
            else {
                CUDA_SAFE_CALL( cudaMemcpyAsync(d_mem[j], h_mem[j], block_size,
                                                cudaMemcpyHostToDevice, streams[j]) );
            }
        }
    }
    for (int i = 0; i < STREAMS; i++) {
        CUDA_SAFE_CALL( cudaStreamSynchronize(streams[i]) );
    }
    double end = get_time();

    // Output: total bytes moved divided by elapsed time.
    double bw = (double)block_size * STREAMS * iter / (1024 * 1024 * 1024) / (end - start);
    cout << "time elapsed:" << end - start << endl;
    cout << "bandwidth at block size " << block_size << ": " << bw << "GB/s" << endl;
}
int main(int argc, char *argv[]) {
    if (argc != 2) {
        cout << "usage: " << argv[0] << " 1/0" << endl;
        return 1;
    }
    int bi_directional = atoi(argv[1]);
    if (bi_directional) {
        cout << "testing bi-directional bandwidth:" << endl;
    }
    else {
        cout << "testing uni-directional bandwidth:" << endl;
    }
    bw_test(bi_directional);
    return 0;
}