My problem is that - no matter how many devices I use The time to increase, as if I call cublasddot sequently
Why ?
I create in device context cublasHandle and stream, after this I set context and call cublasddot with Handle but time increase :(
P S Sory for my english
#include <mpi.h>
#include <cublas.h>
#include <cublas_v2.h>
#include <cuda.h>
#define kol 1000
#define kol_device 4
#define useMultiGPU
#ifdef useMultiGPU
#define cuSetDevice(kol_device)
CALL(cudaSetDevice(kol_device));
#else
#define cuSetDevice(kol_device)
#endif
#define iterations 1000
// Variables
Double *h_A, *h_B, *h_C[kol_device];
double* d_A[kol_device], *d_B[kol_device], *d_C[kol_device];
double seconds, sum, ddot=0.0;
int main(int argc, char** argv)
{
cudaStream_t streams = (cudaStream_t) malloc(kol_device * sizeof(cudaStream_t));
int N=kol;
//////////////////////////!!!/////////////////
long size = N * sizeof(double);
long size2 = sizeof(double);
//////////////////////////!!!/////////////////
int flag=0;
int n;
double start_mpi = 0,
end_mpi = 0,
sum_seconds = 0.0,
my_gpu_allsum = 0.0,
sum=0.0,
my_gpu_sum[kol_device];
long n_part, size_part;
cublasHandle_t hndl[kol_device];
n_part = N/kol_device+1;
size_part = n_part * sizeof(double);
szdouble = sizeof(double);
// Allocate input vectors h_A and h_B in host memory
h_A = (double*)malloc(size);
h_B = (double*)malloc(size);
for (int number_device=0; number_device<kol_device; number_device++)
{
cuSetDevice(number_device);
cudaMallocHost(&h_C[number_device], size_part);
// Allocate vectors in device memory
cudaMalloc(&d_A[number_device], size_part);
cudaMalloc(&d_B[number_device], size_part);
cudaMalloc(&d_C[number_device], size_part);
// Copy vectors from host memory to device memory
cudaMemcpy(d_A[number_device], h_A+number_device*n_part, size_part, cudaMemcpyHostToDevice) ;
cudaMemcpy(d_B[number_device], h_B+number_device*n_part, size_part, cudaMemcpyHostToDevice) ;
cudaStreamCreate(&(streams[number_device])) ;
cublasCreate(&hndl[number_device]);
}
MPI_Init(&argc, &argv);
for ( int III=0; III < iterations; III++)
{
n=n_part;
start_mpi = 0, end_mpi = 0;
// ******** Start timer ***************
cudaThreadSynchronize( ) ;
start_mpi = MPI_Wtime();
// ******** Eof Start timer ***********
for (int number_device=0; number_device<kol_device; number_device++)
{
cuSetDevice(number_device);
cublasSetStream(hndl[number_device], streams[number_device]) ;
cublasDdot(hndl[number_device], n_part, d_A[number_device], 1, d_B[number_device],1, & ddot);
sum+=ddot;
}
// ******** Stop timer *******************
cudaDeviceSynchronize();
end_mpi = MPI_Wtime();
sum_seconds+=end_mpi - start_mpi;
// ******** Eof Stop timer ***************
}
//printf(“result ddot = %f\n”, sum);
for (int number_device=0; number_device<kol_device; number_device++)
{
cudaFree(d_A[number_device]);
cudaFree(d_B[number_device]);
cudaFree(d_C[number_device]);
cudaFreeHost(h_C[number_device]);
cudaStreamDestroy(streams[number_device]);
}
free(h_A);
free(h_B);
MPI_Finalize();
}