I’m using OpenMP to parallelise my 8-GPU code, and it’s actually resulting in a very big slowdown:
af_multi7.cpp(300):Thread : 0 / 8, Gpu : 0 - GeForce GTX TITAN, time taken : 0.066377
af_multi7.cpp(300):Thread : 7 / 8, Gpu : 7 - GeForce GTX TITAN, time taken : 34.620190
af_multi7.cpp(300):Thread : 5 / 8, Gpu : 5 - GeForce GTX TITAN, time taken : 35.356863
af_multi7.cpp(300):Thread : 4 / 8, Gpu : 4 - GeForce GTX TITAN, time taken : 37.317620
af_multi7.cpp(300):Thread : 2 / 8, Gpu : 2 - GeForce GTX TITAN, time taken : 39.163119
af_multi7.cpp(300):Thread : 6 / 8, Gpu : 6 - Tesla K20Xm, time taken : 40.834065
af_multi7.cpp(300):Thread : 3 / 8, Gpu : 3 - GeForce GTX TITAN, time taken : 42.025256
af_multi7.cpp(300):Thread : 1 / 8, Gpu : 1 - GeForce GTX TITAN, time taken : 43.021797
Elapsed time is 84.573489 seconds.
The timings obviously all overlap, since the threads run in parallel, but a single serial run typically takes about 3 seconds. This is taking about 40 seconds, or roughly a 2× slowdown. Every GPU is performing an equivalent calculation (same size, same values).
Code being benchmarked is as below:
Code to create a sparse matrix:
// Time a single sparse darray construction on the calling thread.
// timer::start/timer::stop are project helpers not shown here; presumably
// stop() returns the elapsed time in seconds since start() — confirm.
timer::start();
// Presumably invokes darray(rowind, colind, val, nnz, r, c), so len is the
// row count and par the column count — verify against the class declaration.
darray out(h_row, h_col, h_val, nnz, len, par);
// Log the OpenMP thread id, the GPU count, the device index/name and the
// elapsed time. ngpu, device and prop are defined outside this snippet.
MSG("Thread : %d / %d, Gpu : %d - %s, time taken : %6f",omp_get_thread_num(),ngpu,device,prop.name,timer::stop());
Relevant class constructor:
/**
 * Construct a sparse darray from coordinate (COO) triplets and store it in
 * compressed-sparse-row (CSR) form on the GPU.
 *
 * The column indices and values are copied to the device unchanged; the COO
 * row indices are copied into a temporary device buffer and compressed into
 * the rowPtr member with cusparseXcoo2csr (zero-based indexing).
 *
 * @param rowind  COO row indices, nnz entries. Presumably sorted by row, as
 *                cusparseXcoo2csr expects — TODO confirm with callers.
 * @param colind  COO column indices, nnz entries.
 * @param val     Non-zero values, nnz entries.
 * @param nnz     Number of non-zero entries.
 * @param r       Number of rows (stored in dims0).
 * @param c       Number of columns (stored in dims1).
 *
 * Errors are reported through the project's CUDA / CUSPARSE check macros.
 *
 * NOTE(review): a cuSPARSE handle is created and destroyed on every
 * construction; if construction time matters, consider reusing one handle
 * per device/thread instead.
 */
darray::darray(int *rowind, int *colind, float *val, int nnz, int r, int c) : sparse(true), nnz(nnz), dims0(r), dims1(c), data(NULL)
{
    // Temporary device copy of the COO row indices; only needed until the
    // CSR row pointer array has been built.
    int *cooRows = NULL;

    cusparseHandle_t handle;
    CUSPARSE(cusparseCreate(&handle));

    // CSR storage: r + 1 row offsets, nnz column indices, nnz values.
    CUDA(cudaMalloc(&rowPtr, (r + 1) * sizeof(int)));
    CUDA(cudaMalloc(&colInd, nnz * sizeof(int)));
    CUDA(cudaMalloc(&Val, nnz * sizeof(float)));
    CUDA(cudaMalloc(&cooRows, nnz * sizeof(int)));

    // cudaMemcpyDefault lets the runtime infer the copy direction from the
    // pointer values.
    CUDA(cudaMemcpyAsync(cooRows, rowind, nnz * sizeof(int), cudaMemcpyDefault));
    CUDA(cudaMemcpyAsync(colInd, colind, nnz * sizeof(int), cudaMemcpyDefault));
    CUDA(cudaMemcpyAsync(Val, val, nnz * sizeof(float), cudaMemcpyDefault));

    // Wait for the copies and surface any asynchronous failure here; the
    // return code was previously ignored, which hid copy errors.
    CUDA(cudaDeviceSynchronize());

    // COO -> CSR: compress the row indices into rowPtr.
    CUSPARSE(cusparseXcoo2csr(handle, cooRows, nnz, dims0, rowPtr, CUSPARSE_INDEX_BASE_ZERO));

    CUSPARSE(cusparseDestroy(handle));
    CUDA(cudaFree(cooRows));
}