DGEMM parameter number 8 had an illegal value

I’m trying to generalize a call for matrix-matrix multiplication using cublas, and everything works fine until I try to transpose one of the matrices.
I get the following error:

transpose(B)*A: ** On entry to DGEMM parameter number 8 had an illegal value
*** ERROR *** cublasSgemm returned error code 7, line 688

The implementation works flawlessly if I use a BLAS implementation running on the CPU, so I suspect this may be a bug, but I wanted to ask anyway.

My implementation, which I modified from one of the examples, looks like this:

static cublasStatus_t cublasXgemm(cublasHandle_t& handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k,
float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C,
int ldc) {
return cublasSgemm(handle, transa, transb, m, n, k, &alpha, A, m, B, k, &beta, C, m);
}

static cublasStatus_t cublasXgemm(cublasHandle_t& handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k,
double alpha, const double *A, int lda,
const double *B, int ldb, double beta, double *C,
int ldc) {
return cublasDgemm(handle, transa, transb, m, n, k, &alpha, A, m, B, k, &beta, C, m);
}

template <typename T>
void cblas_gemm(cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k,
T alpha, const T *A, int lda, const T *B, int ldb, T beta, T *C, int ldc) {

cudaDeviceProp deviceProp;
cudaError_t error;

// make sure CUDA is initialized
if (!CUDA::getInstance().initialized()) {
cout<<"*** ERROR *** cuda not initialized"<<endl;
cout<<" Call array::CUDA::getInstance().initialize(argc, argv);"<<endl;
exit(EXIT_FAILURE);
}

int devID = CUDA::getInstance().devID();

error = cudaGetDeviceProperties(&deviceProp, devID);

if (error != cudaSuccess) {
cout<<"*** ERROR *** cudaGetDeviceProperties returned error code “<<error<<”, line "<<LINE<<endl;
exit(EXIT_FAILURE);
}

// use a larger block size for Fermi and above
int block_size = (deviceProp.major < 2) ? 16 : 32;

// allocate device memory
T *d_A, *d_B, *d_C;

unsigned int mem_size_A = sizeof(T)*m*k;
error = cudaMalloc((void **) &d_A, mem_size_A);
if (error != cudaSuccess) {
cout<<"*** ERROR *** cudaMalloc d_A returned error code "<<error<<", line "<<__LINE__<<endl;
exit(EXIT_FAILURE);
}

unsigned int mem_size_B = sizeof(T)*ldb*n;
error = cudaMalloc((void **) &d_B, mem_size_B);
if (error != cudaSuccess) {
cout<<"*** ERROR *** cudaMalloc d_B returned error code "<<error<<", line "<<__LINE__<<endl;
exit(EXIT_FAILURE);
}

// copy host memory to device
error = cudaMemcpy(d_A, A, mem_size_A, cudaMemcpyHostToDevice);
if (error != cudaSuccess) {
cout<<"*** ERROR *** cudaMemcpy d_A A returned error code “<<error<<”, line "<<LINE<<endl;
exit(EXIT_FAILURE);
}

error = cudaMemcpy(d_B, B, mem_size_B, cudaMemcpyHostToDevice);
if (error != cudaSuccess) {
cout<<"*** ERROR *** cudaMemcpy d_B B returned error code “<<error<<”, line "<<LINE<<endl;
exit(EXIT_FAILURE);
}

unsigned int mem_size_C = sizeof(T)*m*n;
error = cudaMalloc((void **) &d_C, mem_size_C);
if (error != cudaSuccess) {
cout<<"*** ERROR *** cudaMalloc d_C returned error code "<<error<<", line "<<__LINE__<<endl;
exit(EXIT_FAILURE);
}

error = cudaMemcpy(d_C, C, mem_size_C, cudaMemcpyHostToDevice);
if (error != cudaSuccess) {
cout<<"*** ERROR *** cudaMemcpy d_B B returned error code “<<error<<”, line "<<LINE<<endl;
exit(EXIT_FAILURE);
}

// setup execution parameters
dim3 threads(block_size, block_size);
dim3 grid(n / threads.x, m / threads.y);

// CUBLAS version 2.0
{
cublasHandle_t handle;

cublasStatus_t ret;

ret = cublasCreate(&handle);

if (ret != CUBLAS_STATUS_SUCCESS) {
  cout<<"*** ERROR *** cublasCreate returned error code "<<ret<<", line "<<__LINE__<<endl;
  exit(EXIT_FAILURE);
}    
  
ret = cublasXgemm(handle, transa, transb, m, n, k, alpha, d_A, m, d_B, k, beta, d_C, m);    
if (ret != CUBLAS_STATUS_SUCCESS) {
  cout<<"*** ERROR *** cublasSgemm returned error code "<<ret<<", line "<<__LINE__<<endl;
  exit(EXIT_FAILURE);
}

// copy result from device to host
error = cudaMemcpy(C, d_C, mem_size_C, cudaMemcpyDeviceToHost);

if (error != cudaSuccess) {
  cout<<"*** ERROR *** cudaMemcpy h_CUBLAS d_C returned error code "<<error<<", line "<<__LINE__<<endl;
  exit(EXIT_FAILURE);
}

}

cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
}

Thank you.
aa

There is probably no bug; rather, you did not adjust the parameters to account for the transpose. Remember that *blas uses column-major format.

Post a specific single example using code blocks which results in that error.
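To make the column-major point concrete, here is a minimal sketch (the shapes are illustrative, not taken from your code): the leading dimension you pass to cublasSgemm is always the row count of the array as it is physically stored, no matter which op() you request.

// A is stored column-major as k rows x m columns, B as k x n, C as m x n.
// To compute C = A^T * B, the leading dimensions stay at the *stored* row counts:
float alpha = 1.0f, beta = 0.0f;
cublasStatus_t ret = cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N,
                                 m, n, k,   // dimensions of op(A), op(B) and C
                                 &alpha,
                                 d_A, k,    // lda = k: A is physically k x m
                                 d_B, k,    // ldb = k: B is physically k x n
                                 &beta,
                                 d_C, m);   // ldc = m: C is physically m x n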

Reworked the example:

void cublasTgemm(cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k,
float alpha, const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc) {

cudaDeviceProp deviceProp;
cudaError_t error;

int devID = CUDA::getInstance().devID();

error = cudaGetDeviceProperties(&deviceProp, devID);

if (error != cudaSuccess) {
cout<<"*** ERROR *** cudaGetDeviceProperties returned error code “<<error<<”, line "<<LINE<<endl;
exit(EXIT_FAILURE);
}

// use a larger block size for Fermi and above
int block_size = (deviceProp.major < 2) ? 16 : 32;

// allocate device memory
float *d_A, *d_B, *d_C;

unsigned int mem_size_A = sizeof(float)*m*k;
error = cudaMalloc((void **) &d_A, mem_size_A);
if (error != cudaSuccess) {
cout<<"*** ERROR *** cudaMalloc d_A returned error code "<<error<<", line "<<__LINE__<<endl;
exit(EXIT_FAILURE);
}

unsigned int mem_size_B = sizeof(float)*ldb*n;
error = cudaMalloc((void **) &d_B, mem_size_B);
if (error != cudaSuccess) {
cout<<"*** ERROR *** cudaMalloc d_B returned error code "<<error<<", line "<<__LINE__<<endl;
exit(EXIT_FAILURE);
}

// copy host memory to device
error = cudaMemcpy(d_A, A, mem_size_A, cudaMemcpyHostToDevice);
if (error != cudaSuccess) {
cout<<"*** ERROR *** cudaMemcpy d_A A returned error code “<<error<<”, line "<<LINE<<endl;
exit(EXIT_FAILURE);
}

error = cudaMemcpy(d_B, B, mem_size_B, cudaMemcpyHostToDevice);
if (error != cudaSuccess) {
cout<<"*** ERROR *** cudaMemcpy d_B B returned error code “<<error<<”, line "<<LINE<<endl;
exit(EXIT_FAILURE);
}

unsigned int mem_size_C = sizeof(float)*m*n;
error = cudaMalloc((void **) &d_C, mem_size_C);
if (error != cudaSuccess) {
cout<<"*** ERROR *** cudaMalloc d_C returned error code "<<error<<", line "<<__LINE__<<endl;
exit(EXIT_FAILURE);
}

error = cudaMemcpy(d_C, C, mem_size_C, cudaMemcpyHostToDevice);
if (error != cudaSuccess) {
cout<<"*** ERROR *** cudaMemcpy d_B B returned error code “<<error<<”, line "<<LINE<<endl;
exit(EXIT_FAILURE);
}

// setup execution parameters
dim3 threads(block_size, block_size);
dim3 grid(n / threads.x, m / threads.y);

// CUBLAS version 2.0
{
cublasHandle_t handle;

cublasStatus_t ret;

ret = cublasCreate(&handle);

if (ret != CUBLAS_STATUS_SUCCESS) {
  cout<<"*** ERROR *** cublasCreate returned error code "<<ret<<", line "<<__LINE__<<endl;
  exit(EXIT_FAILURE);
}

ret = cublasSgemm(handle, transa, transb, m, n, k, &alpha, d_A, m, d_B, k, &beta, d_C, m);
if (ret != CUBLAS_STATUS_SUCCESS) {
  cout<<"*** ERROR *** cublasSgemm returned error code "<<ret<<", line "<<__LINE__<<endl;
  exit(EXIT_FAILURE);
}

// copy result from device to host
error = cudaMemcpy(C, d_C, mem_size_C, cudaMemcpyDeviceToHost);

if (error != cudaSuccess) {
  cout<<"*** ERROR *** cudaMemcpy h_CUBLAS d_C returned error code "<<error<<", line "<<__LINE__<<endl;
  exit(EXIT_FAILURE);
}

}

cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
}

int main(int argc, char **argv) {

array::CUDA::getInstance().initialize(argc, argv);

constexpr int m=8,k=4,n=6;

float *A = new float[k*m];
float *B = new float[k*n];

// initialization
int kk=0, ll=-1;
for (int j=0; j<m; ++j)
for (int i=0; i<k; ++i)
A[i+k*j] = ++kk;

for (int i=0; i<k; ++i)
for (int j=0; j<n; ++j)
B[i+k*j] = --ll;

// print matrices
cout<<"A: ("<<k<<","<<m<<")"<<endl;
for (int i=0; i<k; ++i) {
for (int j=0; j<m; ++j)
cout<<" "<<A[i+k*j];
cout<<endl;
}

cout<<"\nB: ("<<k<<","<<n<<")"<<endl;
for (int i=0; i<k; ++i) {
for (int j=0; j<n; ++j)
cout<<" "<<B[i+k*j];
cout<<endl;
}

float *C = new float[m*n];

cublasTgemm(CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, 1, A, k, B, k, 0, C, m);

cout<<"\nC: ("<<m<<","<<n<<")"<<endl;
for (int i=0; i<m; ++i) {
for (int j=0; j<n; ++j)
cout<<" "<<C[i+k*j];
cout<<endl;
}

return 0;

}

The initialize function above is contained in the singleton:

class CUDA {
  
  cudaDeviceProp deviceProp;
  int devID_;
  bool init_;
  
public:
  
  static CUDA& getInstance() {
    static CUDA instance;
    return instance;
  }
  
  int devID() const
  { return devID_; }
  
private:
  
  CUDA() : devID_(), init_() {}
  CUDA(CUDA const&) = delete;
  void operator=(CUDA const&) = delete;
  ~CUDA() { cudaDeviceReset(); }
  
public:
  
  bool initialized()
  { return init_; }
  
  void initialize(int argc, char **argv)
  {
    
    if (init_)
      return;
    
    // By default, we use device 0, otherwise we override the device ID based on what is provided at the command line
    cudaError_t error;
    
    if (checkCmdLineFlag(argc, (const char **)argv, "device")) {
      devID_ = getCmdLineArgumentInt(argc, (const char **)argv, "device");
      error = cudaSetDevice(devID_);
      
      if (error != cudaSuccess) {
        cout<<"cudaSetDevice returned error code "<<error<<", line "<<__LINE__<<endl;
        exit(EXIT_FAILURE);
      }
    }
    
    // get number of SMs on this GPU
    error = cudaGetDevice(&devID_);
    if (error != cudaSuccess) {
      cout<<"cudaGetDevice returned error code "<<error<<", line(%d)\n"<<__LINE__<<endl;
      exit(EXIT_FAILURE);
    }
    
    cudaDeviceProp deviceProp;
    error = cudaGetDeviceProperties(&deviceProp, devID_);
    if (error != cudaSuccess) {
      cout<<"cudaGetDeviceProperties returned error code "<<error<<" , line "<<__LINE__<<endl;
      exit(EXIT_FAILURE);
    }
    
    cout<<"GPU Device "<<devID_<<": \""<<deviceProp.name<<"\" with compute capability "<<deviceProp.major<<"."<<deviceProp.minor<<endl;
    
    init_ = true;
  }
  
  int memory() {
    cudaDeviceProp props;
    cudaGetDeviceProperties(&props, devID_);
    return props.totalGlobalMem;
  }
  
};

You are making this much more complicated than necessary with that mass of boilerplate code.
I asked you to post a simple single example in code blocks which was resulting in the error, not more of the same.

First off, read the cuBLAS documentation, which has examples:

http://docs.nvidia.com/cuda/cublas/index.html

Assume you have a matrix D_A and a matrix D_AT which are equal and ‘skinny’ (Arows >= Acols).

If you have a matrix D_A in device memory and D_AT which will act as the transpose, then this is the correct way to multiply D_AT x D_A:

cur=cublasSgemm_v2(handle,CUBLAS_OP_N,CUBLAS_OP_T,Acols,Acols,Arows,&t_alphA,D_A,Acols,D_AT,Acols,&_rho,tmpM2,Acols);

if(cur != CUBLAS_STATUS_SUCCESS){
printf("error code %d, line(%d)\n", cur, __LINE__);
exit(EXIT_FAILURE);}

Notice the use of code blocks…

And if you want to multiply D_A times D_AT where (Acols > Arows):

cur=cublasSgemm_v2(handle,CUBLAS_OP_T,CUBLAS_OP_N,Arows,Arows,Acols,&_InvRho,D_A,Acols,D_AT,Acols,&t_alphA,tmpM2,Arows);//tmpM2=1/rho*(A*AT)+eye(Arows)
if(cur != CUBLAS_STATUS_SUCCESS){
printf("error code %d, line(%d)\n", cur, __LINE__);
exit(EXIT_FAILURE);}

I thought you were asking me to post the code needed to reproduce the error I obtained.

I am not following what you did in those code excerpts. If you multiply D_AT x D_A, shouldn’t it be

cur=cublasSgemm_v2(handle,CUBLAS_OP_T,CUBLAS_OP_N,...

instead of

cur=cublasSgemm_v2(handle,CUBLAS_OP_N,CUBLAS_OP_T,...

I’m also using an implementation of the BLAS interface that runs on CPUs, and the code works. Why is it different in cuBLAS? The only difference should be the transfer of memory from host to device and back, shouldn’t it?

Column Major… read the documentation. Did you even try the code sample?

That sample is right, and you can even use D_A as both input parameters if you want to save memory; just make sure you get the op(A) and op(B) right. Here is a sample of the output from that code:

Matrix A(6x9):
0.0267077 0.343151 0.201016 0.276351 0.281531 0.50619 0.527489 0.0743201 0.513611 
0.0818331 0.393344 0.154334 0.443666 0.116665 0.187052 0.0449307 2.30646 0.0927682 
0.0942574 10.4862 0.541304 0.00539066 0.531868 0.203519 0.293706 0.0942255 0.292996 
0.315142 0.975442 0.0965368 0.464347 0.550861 0.0257292 0.449738 0.317111 0.0427645 
0.326991 1.85643 2.74352 0.20032 0.107482 0.130313 0.165632 24.7844 1.28518 
16.671 1.47094 0.18882 0.0867829 0.05194 1.65484 0.239544 8.80184 0.130771 

Matrix A^t(9x6):
0.0267077 0.0818331 0.0942574 0.315142 0.326991 16.671 
0.343151 0.393344 10.4862 0.975442 1.85643 1.47094 
0.201016 0.154334 0.541304 0.0965368 2.74352 0.18882 
0.276351 0.443666 0.00539066 0.464347 0.20032 0.0867829 
0.281531 0.116665 0.531868 0.550861 0.107482 0.05194 
0.50619 0.187052 0.203519 0.0257292 0.130313 1.65484 
0.527489 0.0449307 0.293706 0.449738 0.165632 0.239544 
0.0743201 2.30646 0.0942255 0.317111 24.7844 8.80184 
0.513611 0.0927682 0.292996 0.0427645 1.28518 0.130771 


cuBLAS AT x A =
278.146 26.4662 4.14438 1.70279 1.14184 27.6865 4.23478 155.14 2.66273 
26.4662 116.795 11.271 1.27835 6.53305 5.08259 4.37708 61.1875 5.90507 
4.14438 11.271 7.92913 0.737736 0.720364 0.913252 0.815013 70.1111 3.83089 
1.70279 1.27835 0.737736 0.536516 0.414258 0.405635 0.430092 6.92025 0.473327 
1.14184 6.53305 0.720364 0.414258 0.693452 0.386707 0.587947 3.63584 0.479738 
27.6865 5.08259 0.913252 0.405635 0.386707 3.08876 0.764749 18.2917 0.721947 
4.23478 4.37708 0.815013 0.430092 0.587947 0.764749 0.653607 6.52663 0.624571 
155.14 61.1875 70.1111 6.92025 3.63584 18.2917 6.52663 697.174 33.2967 
2.66273 5.90507 3.83089 0.473327 0.479738 0.721947 0.624571 33.2967 2.02886 

cuBLAS A x AT =
1.1183 0.661086 4.27635 0.941741 3.93827 2.7119 
0.661086 5.76106 4.57616 1.45504 58.5971 22.6501 
4.27635 4.57616 110.768 10.7859 23.8281 18.4011 
0.941741 1.45504 10.7859 1.8845 10.3232 9.72275 
3.93827 58.5971 23.8281 10.3232 627.095 227.295 
2.7119 22.6501 18.4011 9.72275 227.295 360.418

I am also new to all this, but you should experiment with the parameters on your own…
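For what it’s worth, here is a minimal sketch of the “same pointer for both operands” idea, reusing the names from the snippet above (handle, D_A, Arows, Acols, tmpM2) and assuming the same storage layout; alpha and beta are just placeholder scalars:

float alpha = 1.0f, beta = 0.0f;
// The same buffer D_A is read for both operands; only tmpM2 (Acols x Acols) is written.
cublasStatus_t cur = cublasSgemm_v2(handle, CUBLAS_OP_N, CUBLAS_OP_T,
                                    Acols, Acols, Arows,
                                    &alpha, D_A, Acols,
                                    D_A, Acols,
                                    &beta, tmpM2, Acols);
if (cur != CUBLAS_STATUS_SUCCESS) {
  printf("error code %d, line(%d)\n", cur, __LINE__);
  exit(EXIT_FAILURE);
}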

Hi there!

The parameters you’re passing to your own cublasTgemm are correct, but the parameters passed to the cuBLAS call are not.

For the call in main, you use k as the LD parameter, which is correct here since you’re transposing A:

cublasTgemm(CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, 1, A, k, B, k, 0, C, m);

But in your call to cuBLAS, you use m as the LDA parameter, which is correct UNLESS you’re transposing A!

ret = cublasSgemm(handle, transa, transb, m, n, k, &alpha, d_A, m, d_B, k, &beta, d_C, m);

You should use individual explicit variables for the m/n/k and LDA/LDB parameters in your call to cuBLAS; I think you know that but lost track of it in your code.

Please tell us if this solved your problem!
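In other words, a minimal sketch of that fix (illustrative; only the float overload is shown, the double one changes the same way): forward the caller’s lda/ldb/ldc instead of hardcoding m, k and m inside the wrapper.

static cublasStatus_t cublasXgemm(cublasHandle_t& handle,
                                  cublasOperation_t transa, cublasOperation_t transb,
                                  int m, int n, int k,
                                  float alpha, const float *A, int lda,
                                  const float *B, int ldb,
                                  float beta, float *C, int ldc) {
  // lda must be the leading dimension of A as it is stored on the device:
  // >= m when transa == CUBLAS_OP_N, but >= k when transa == CUBLAS_OP_T.
  return cublasSgemm(handle, transa, transb, m, n, k,
                     &alpha, A, lda, B, ldb, &beta, C, ldc);
}

With the leading dimensions forwarded from the caller, the transposed cases receive the lda/ldb they actually need instead of the hardcoded values.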