Hi all,
I am using the cublasDgemm function from Cublas Library and I have tested my code on Nvidia 1080 ti and Tesla V100. I have the following matrix dimensions (to compute A’A):
[243 x 3614] x [3614 x 243]
The execution time of this call is just under 1 ms in most cases; however, it sometimes rises to around 13 to 15 ms. It is worth mentioning that cudaStreamSynchronize is called right before and after the cublasDgemm call. The results of this matrix multiplication have been extensively tested, and there seems to be no other issue. Furthermore, there is a similar issue with cv::cuda::gemm from the OpenCV library, which uses cublasDgemm in its implementation; my code is in fact based on the OpenCV implementation. It should be mentioned that no other kernels are launched on the associated CUDA device during this call, and all previously launched work on other streams is synchronized using cudaStreamSynchronize.
I would appreciate it if anyone could help me with possible reasons for this high variation in the execution time of cublasDgemm.
The main piece of code that calls cublasDgemm is based on the OpenCV implementation, as shown below (step 4 sometimes takes an unusually long time):
// Constructor: creates the cuBLAS handle and pre-allocates a 32 MiB device
// workspace buffer. The buffer itself is handed to cuBLAS later (see
// setStream); allocating it once up front avoids cuBLAS doing a lazy
// device allocation inside the first gemm call.
MyArithm::MyArithm(){
cublasSafeCall( cublasCreate_v2(&_cublas_v2H) );
// NOTE(review): presumably create() takes a byte count (32 MiB here) —
// confirm against the declaration of cublas_wspace.
cublas_wspace.create(32*1024*1024);
}
// Destructor: releases the cuBLAS handle if one was created. The handle is
// nulled after destruction so a repeated destroy cannot occur.
MyArithm::~MyArithm()
{
if(_cublas_v2H){
cublasSafeCall( cublasDestroy_v2(_cublas_v2H) );
_cublas_v2H = nullptr;
}
}
// Binds a caller-provided CUDA stream to this object and to the cuBLAS
// handle, and registers the pre-allocated device workspace with cuBLAS so
// it does not allocate scratch memory lazily on first use.
// A null stream is ignored (nothing is changed).
void MyArithm::setStream(cudaStream_t c_stream)
{
    if (!c_stream)
        return;

    // Keep an OpenCV view of the same raw stream for GpuMat operations.
    stream = cv::cuda::StreamAccessor::wrapStream(c_stream);

    // All subsequent cuBLAS work is issued on this stream and may use the
    // user-supplied workspace instead of an internally allocated one.
    cublasSafeCall( cublasSetStream_v2(_cublas_v2H, c_stream) );
    cublasSafeCall( cublasSetWorkspace(_cublas_v2H, (void*)cublas_wspace.data(), cublas_wspace.size()) );
}
void MyArithm::gemm(const GpuMat &src1, const GpuMat &src2, double alpha, const GpuMat &src3, double beta, GpuMat &dst, int flags) const
{
QElapsedTimer local_timer;
local_timer.start();
// CUBLAS works with column-major matrices
assert( src1.type() == CV_32FC1 || src1.type() == CV_32FC2 || src1.type() == CV_64FC1 || src1.type() == CV_64FC2 );
assert( src2.type() == src1.type() && (src3.empty() || src3.type() == src1.type()) );
if (src1.depth() == CV_64F)
{
if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
}
bool tr1 = (flags & GEMM_1_T) != 0;
bool tr2 = (flags & GEMM_2_T) != 0;
bool tr3 = (flags & GEMM_3_T) != 0;
if (src1.type() == CV_64FC2)
{
if (tr1 || tr2 || tr3)
CV_Error(cv::Error::StsNotImplemented, "transpose operation doesn't implemented for CV_64FC2 type");
}
Size src1Size = tr1 ? Size(src1.rows, src1.cols) : src1.size();
Size src2Size = tr2 ? Size(src2.rows, src2.cols) : src2.size();
Size src3Size = tr3 ? Size(src3.rows, src3.cols) : src3.size();
Size dstSize(src2Size.width, src1Size.height);
assert( src1Size.width == src2Size.height );
assert( src3.empty() || src3Size == dstSize );
if( dst.size() != dstSize || dst.type()!=src1.type() )
cv::cuda::createContinuous(dstSize, src1.type(), dst);
if (beta != 0)
{
if (src3.empty())
{
dst.setTo(Scalar::all(0), stream);
}
else
{
if (tr3)
{
cuda::transpose(src3, dst, stream);
}
else
{
src3.copyTo(dst, stream);
}
}
}
stream.waitForCompletion(); // equivalent to cudaStreamSynchronize
std::cout << " ** Time of gemm [step 1] = " << local_timer.nsecsElapsed()/1000000.0 << " ms" << std::endl;
local_timer.start();
cublasSafeCall( cublasSetPointerMode_v2(_cublas_v2H, CUBLAS_POINTER_MODE_HOST) );
stream.waitForCompletion();
std::cout << " ** Time of gemm [step 2] = " << local_timer.nsecsElapsed()/1000000.0 << " ms" << std::endl;
local_timer.start();
const float alphaf = static_cast<float>(alpha);
const float betaf = static_cast<float>(beta);
const cuComplex alphacf = make_cuComplex(alphaf, 0);
const cuComplex betacf = make_cuComplex(betaf, 0);
const cuDoubleComplex alphac = make_cuDoubleComplex(alpha, 0);
const cuDoubleComplex betac = make_cuDoubleComplex(beta, 0);
cublasOperation_t transa = tr2 ? CUBLAS_OP_T : CUBLAS_OP_N;
cublasOperation_t transb = tr1 ? CUBLAS_OP_T : CUBLAS_OP_N;
stream.waitForCompletion();
std::cout << " ** Time of gemm [step 3] = " << local_timer.nsecsElapsed()/1000000.0 << " ms" << std::endl;
local_timer.start();
switch (src1.type())
{
case CV_32FC1:
cublasSafeCall( cublasSgemm_v2(_cublas_v2H, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows,
&alphaf,
src2.ptr<float>(), static_cast<int>(src2.step / sizeof(float)),
src1.ptr<float>(), static_cast<int>(src1.step / sizeof(float)),
&betaf,
dst.ptr<float>(), static_cast<int>(dst.step / sizeof(float))) );
break;
case CV_64FC1:
cublasSafeCall( cublasDgemm_v2(_cublas_v2H, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows,
&alpha,
src2.ptr<double>(), static_cast<int>(src2.step / sizeof(double)),
src1.ptr<double>(), static_cast<int>(src1.step / sizeof(double)),
&beta,
dst.ptr<double>(), static_cast<int>(dst.step / sizeof(double))) );
break;
case CV_32FC2:
cublasSafeCall( cublasCgemm_v2(_cublas_v2H, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows,
&alphacf,
src2.ptr<cuComplex>(), static_cast<int>(src2.step / sizeof(cuComplex)),
src1.ptr<cuComplex>(), static_cast<int>(src1.step / sizeof(cuComplex)),
&betacf,
dst.ptr<cuComplex>(), static_cast<int>(dst.step / sizeof(cuComplex))) );
break;
case CV_64FC2:
cublasSafeCall( cublasZgemm_v2(_cublas_v2H, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows,
&alphac,
src2.ptr<cuDoubleComplex>(), static_cast<int>(src2.step / sizeof(cuDoubleComplex)),
src1.ptr<cuDoubleComplex>(), static_cast<int>(src1.step / sizeof(cuDoubleComplex)),
&betac,
dst.ptr<cuDoubleComplex>(), static_cast<int>(dst.step / sizeof(cuDoubleComplex))) );
break;
}
stream.waitForCompletion();
std::cout << " ** Time of gemm [step 4] = " << local_timer.nsecsElapsed()/1000000.0 << " ms" << std::endl;
std::cout << " gemm size = " << src1.size() << " - " << src2.size() << std::endl;
local_timer.start();
// stream.waitForCompletion();
}