I’m trying to generalize a call for matrix-matrix multiplication using cuBLAS, and everything works fine until I try to transpose one of the matrices.
I get the following error:
transpose(B)*A: ** On entry to DGEMM parameter number 8 had an illegal value
*** ERROR *** cublasSgemm returned error code 7, line 688
The implementation works flawlessly if I use a BLAS implementation running on the CPU, so I guess this may be a bug, but I wanted to ask anyway.
My implementation, which I modified from one of the examples, looks like this:
/**
 * Single-precision overload of the type-dispatching GEMM wrapper.
 *
 * Forwards every argument to cublasSgemm, which computes
 *     C = alpha * op(A) * op(B) + beta * C
 * on column-major device matrices, where op(X) is X or its transpose
 * depending on transa / transb.
 *
 * @param handle  cuBLAS context to run on.
 * @param transa  operation applied to A.
 * @param transb  operation applied to B.
 * @param m       rows of op(A) and of C.
 * @param n       columns of op(B) and of C.
 * @param k       columns of op(A) / rows of op(B).
 * @param alpha   scalar multiplying op(A) * op(B).
 * @param A       device pointer to A.
 * @param lda     leading dimension of A as stored (not of op(A)).
 * @param B       device pointer to B.
 * @param ldb     leading dimension of B as stored (not of op(B)).
 * @param beta    scalar multiplying the incoming C.
 * @param C       device pointer to C (input and output).
 * @param ldc     leading dimension of C.
 * @return the status reported by cublasSgemm.
 */
static cublasStatus_t cublasXgemm(cublasHandle_t& handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k,
                                  float alpha, const float *A, int lda,
                                  const float *B, int ldb, float beta, float *C,
                                  int ldc) {
    // Pass the caller's leading dimensions through unchanged. Hard-coding
    // m / k / m here is only valid for non-transposed, tightly packed
    // operands: with transa != CUBLAS_OP_N the stored A has k rows, so an
    // lda of m (< k) is rejected as an illegal value for parameter 8.
    // alpha and beta are taken by address; this assumes the handle is in
    // host pointer mode -- confirm if the pointer mode is changed elsewhere.
    return cublasSgemm(handle, transa, transb, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
}
/**
 * Double-precision overload of the type-dispatching GEMM wrapper.
 *
 * Forwards every argument to cublasDgemm, which computes
 *     C = alpha * op(A) * op(B) + beta * C
 * on column-major device matrices, where op(X) is X or its transpose
 * depending on transa / transb.
 *
 * @param handle  cuBLAS context to run on.
 * @param transa  operation applied to A.
 * @param transb  operation applied to B.
 * @param m       rows of op(A) and of C.
 * @param n       columns of op(B) and of C.
 * @param k       columns of op(A) / rows of op(B).
 * @param alpha   scalar multiplying op(A) * op(B).
 * @param A       device pointer to A.
 * @param lda     leading dimension of A as stored (not of op(A)).
 * @param B       device pointer to B.
 * @param ldb     leading dimension of B as stored (not of op(B)).
 * @param beta    scalar multiplying the incoming C.
 * @param C       device pointer to C (input and output).
 * @param ldc     leading dimension of C.
 * @return the status reported by cublasDgemm.
 */
static cublasStatus_t cublasXgemm(cublasHandle_t& handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k,
                                  double alpha, const double *A, int lda,
                                  const double *B, int ldb, double beta, double *C,
                                  int ldc) {
    // Pass the caller's leading dimensions through unchanged. Hard-coding
    // m / k / m here is only valid for non-transposed, tightly packed
    // operands: with transa != CUBLAS_OP_N the stored A has k rows, so an
    // lda of m (< k) is rejected as an illegal value for parameter 8.
    // alpha and beta are taken by address; this assumes the handle is in
    // host pointer mode -- confirm if the pointer mode is changed elsewhere.
    return cublasDgemm(handle, transa, transb, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
}
template
void cblas_gemm(cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k,
T alpha, const T *A, int lda, const T *B, int ldb, T beta, T *C, int ldc) {
cudaDeviceProp deviceProp;
cudaError_t error;
// make sure CUDA is initialized
if (!CUDA::getInstance().initialized()) {
cout<<“*** ERROR *** cuda not initialized”<<endl;
cout<<" Call array::CUDA::getInstance().initialize(argc, argv);"<<endl;
exit(EXIT_FAILURE);
}
int devID = CUDA::getInstance().devID();
error = cudaGetDeviceProperties(&deviceProp, devID);
if (error != cudaSuccess) {
cout<<"*** ERROR *** cudaGetDeviceProperties returned error code “<<error<<”, line "<<LINE<<endl;
exit(EXIT_FAILURE);
}
// use a larger block size for Fermi and above
int block_size = (deviceProp.major < 2) ? 16 : 32;
// allocate device memory
T *d_A, *d_B, *d_C;
unsigned int mem_size_A = sizeof(T)mk;
error = cudaMalloc((void ) &d_A, mem_size_A);
if (error != cudaSuccess) {
cout<<"* ERROR *** cudaMalloc d_A returned error code “<<error<<”, line "<<LINE<<endl;
exit(EXIT_FAILURE);
}
unsigned int mem_size_B = sizeof(T)ldbn;
error = cudaMalloc((void ) &d_B, mem_size_B);
if (error != cudaSuccess) {
cout<<"* ERROR *** cudaMalloc d_B returned error code “<<error<<”, line "<<LINE<<endl;
exit(EXIT_FAILURE);
}
// copy host memory to device
error = cudaMemcpy(d_A, A, mem_size_A, cudaMemcpyHostToDevice);
if (error != cudaSuccess) {
cout<<"*** ERROR *** cudaMemcpy d_A A returned error code “<<error<<”, line "<<LINE<<endl;
exit(EXIT_FAILURE);
}
error = cudaMemcpy(d_B, B, mem_size_B, cudaMemcpyHostToDevice);
if (error != cudaSuccess) {
cout<<"*** ERROR *** cudaMemcpy d_B B returned error code “<<error<<”, line "<<LINE<<endl;
exit(EXIT_FAILURE);
}
unsigned int mem_size_C = sizeof(T)mn;
error = cudaMalloc((void ) &d_C, mem_size_C);
if (error != cudaSuccess) {
cout<<"* ERROR *** cudaMalloc d_C returned error code “<<error<<”, line "<<LINE<<endl;
exit(EXIT_FAILURE);
}
error = cudaMemcpy(d_C, C, mem_size_C, cudaMemcpyHostToDevice);
if (error != cudaSuccess) {
cout<<"*** ERROR *** cudaMemcpy d_B B returned error code “<<error<<”, line "<<LINE<<endl;
exit(EXIT_FAILURE);
}
// setup execution parameters
dim3 threads(block_size, block_size);
dim3 grid(n / threads.x, m / threads.y);
// CUBLAS version 2.0
{
cublasHandle_t handle;
cublasStatus_t ret;
ret = cublasCreate(&handle);
if (ret != CUBLAS_STATUS_SUCCESS) {
cout<<"*** ERROR *** cublasCreate returned error code "<<ret<<", line "<<__LINE__<<endl;
exit(EXIT_FAILURE);
}
ret = cublasXgemm(handle, transa, transb, m, n, k, alpha, d_A, m, d_B, k, beta, d_C, m);
if (ret != CUBLAS_STATUS_SUCCESS) {
cout<<"*** ERROR *** cublasSgemm returned error code "<<ret<<", line "<<__LINE__<<endl;
exit(EXIT_FAILURE);
}
// copy result from device to host
error = cudaMemcpy(C, d_C, mem_size_C, cudaMemcpyDeviceToHost);
if (error != cudaSuccess) {
cout<<"*** ERROR *** cudaMemcpy h_CUBLAS d_C returned error code "<<error<<", line "<<__LINE__<<endl;
exit(EXIT_FAILURE);
}
}
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
}
Thank you.
aa