// Compute C = B*A, where A is already on device, B is on host and the result should be
// uploaded to host data at C. B is m x k, A is k x n, and C is m x n.
// Matrices use the cuBLAS column-major convention; leading dimensions are m for
// B and C, and k for A. T must be float (4 bytes -> SGEMM) or double (-> DGEMM).
// Returns true on success. On any failure, every device buffer allocated inside
// this function is freed before returning false (d_A is owned by the caller and
// is never freed here).
template<class T>
bool multMatrixOnDeviceRight(T* d_A, T* h_B, T* h_C, int m, int n, int k)
{
//Stopwatch::global.start_print();
	// Allocate device storage for B (m x k elements).
	// NOTE(review): m*k is an int product and can overflow for large matrices —
	// callers should keep dimensions within int range.
	T* d_B = NULL;
	cublasStatus blasStat = cublasAlloc(m*k, sizeof(T), (void**)&d_B);
	if ( blasStat != CUBLAS_STATUS_SUCCESS ) {
		cout << "GPU device memory allocation failed!" << endl;
		return false;
	}
	// Download h_B to the device.
	// BUG FIX: this status was previously checked only after blasStat had been
	// overwritten by the next cublasAlloc, so transfer failures were missed.
	blasStat = cublasSetMatrix(m, k, sizeof(T), h_B, m, d_B, m);
	if ( blasStat != CUBLAS_STATUS_SUCCESS ) {
		cout << "GPU data download failed!" << endl;
		cublasFree(d_B);
		return false;
	}
	// Allocate device storage for the result C (m x n elements).
	T* d_C = NULL;
	blasStat = cublasAlloc(m*n, sizeof(T), (void**)&d_C);
	if ( blasStat != CUBLAS_STATUS_SUCCESS ) {
		cout << "GPU device memory allocation failed!" << endl;
		cublasFree(d_B); // BUG FIX: d_B was leaked on this path.
		return false;
	}
	// C = B*A on the GPU ('N','N' = no transpose on either operand).
	// Dispatch on element size: 4 bytes -> single precision, else double.
	if ( sizeof(T) == 4 )
		cublasSgemm('N', 'N', m, n, k, 1.0f, (float*)d_B, m, (float*)d_A, k, 0.0f, (float*)d_C, m);
	else
		cublasDgemm('N', 'N', m, n, k, 1.0, (double*)d_B, m, (double*)d_A, k, 0.0, (double*)d_C, m);
	// Upload the result back to the host.
	blasStat = cublasGetMatrix(m, n, sizeof(T), d_C, m, h_C, m);
	// Free both device buffers unconditionally.
	// BUG FIX: d_B was leaked when the result upload failed.
	cublasFree(d_B);
	d_B = NULL;
	cublasFree(d_C);
	d_C = NULL;
	if ( blasStat != CUBLAS_STATUS_SUCCESS ) {
		cout << "Data upload from GPU to CPU failed!" << endl;
		return false;
	}
//Stopwatch::global.print();
	return true;
}