cublasDger() is performing slower than our sequential code.
Below is the code for the CUDA version:
static void outer(const real_t *aBegin, const real_t *aEnd, real_t *M, const real_t *b, const real_t *bEnd)
{
#ifdef OP_TRACKING
const real_t* mStart = M;
#endif
const int s_a = distance(aBegin, aEnd);
const int s_b = distance(b, bEnd);
cudaDeviceProp deviceProp;
cudaError_t error;
real_t *d_m, *d_a, *d_b;
int size = sizeof(real_t);
error = cudaMalloc((void **) &d_m, size*s_a*s_b);
error = cudaMalloc((void **) &d_a, size*s_a);
error = cudaMalloc((void **) &d_b, size*s_b);
cublasSetMatrix(s_a,s_b, size, M, s_a, d_m, s_a);
cublasSetVector(s_a, size, aBegin, 1, d_a, 1);
cublasSetVector(s_b, size, b, 1, d_b, 1);
cublasHandle_t handle;
cublasStatus_t ret;
const double alf = 1.0;
const double bet = 1.0;
const double *alpha=&alf;
const double *beta=&bet;
ret = cublasCreate(&handle);
ret = cublasDger(handle, s_a, s_b, alpha, d_a, 1, d_b, 1, d_m, s_a);
cublasGetMatrix(s_a,s_b, size, d_m, s_a, M, s_a);
cudaFree(d_m);
cudaFree(d_a);
cudaFree(d_b);
cublasDestroy(handle);
#ifdef OP_TRACKING
matrixOps += M - mStart;
#endif
}
We are not able to understand why the code is running slower than the sequential version. The function is called many times in our code. Is there any way to find out which part of the code is causing the problem?
Thanks,