I get the correct output when I'm using cublas_dgemm with matrices initialized in column-major order.
I then tried initializing the matrices in row-major order and only changing the cublas_dgemm options to transpose the input matrices. A row-major matrix has the same memory layout as the column-major storage of its transpose, so I thought I should get the right values if I passed row-major matrices into cublas_dgemm and just changed the transpose options while keeping the other parameters the same. However, I'm getting this error in my output:
`** On entry to DGEMM parameter number 8 had an illegal value`
and my result matrix is all zeros.
What am I doing wrong when using row-major matrices in cublas_dgemm?
Here is the code I'm using to call cublas_dgemm:
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <iostream>
#include <cstdlib>
#include <iomanip>
// Element counts: A is 3x4, B is 4x5, and the product C = A*B is 3x5.
#define n1 12
#define n2 20
#define n3 15
// Buffer sizes in bytes, derived from the element counts above.
#define size1 (sizeof(double) * n1)
#define size2 (sizeof(double) * n2)
#define sizer (sizeof(double) * n3)
using namespace std;
int main() {
cudaSetDevice(0);
double * cuda_mat1;
double * cuda_mat2;
double * cuda_matr;
double * temp1;
double * temp2;
double * temp3;
temp1 = (double *)malloc(size1);
temp2 = (double *)malloc(size2);
temp3 = (double *)malloc(sizer);
cudaMalloc((void**)&cuda_mat1, size1);
cudaMalloc((void**)&cuda_mat2, size2);
cudaMalloc((void**)&cuda_matr, sizer);
for (int i = 0; i < n1; i++) {
double add = ((double)i) / 10;
//column major insertion
temp1[i/4 + (i%4*3)] = add;
//row major insertion
//temp1[i] = add;
}
cudaMemcpy(cuda_mat1, temp1, size1, cudaMemcpyHostToDevice);
for (int i = 0; i < n2; i++) {
double add = (((double)(i)) / 10) + i;
//column major insertion
temp2[i/5 + (i%5*4)] = add;
//row major insertion
//temp2[i] = add;
}
cudaMemcpy(cuda_mat2, temp2, size2, cudaMemcpyHostToDevice);
cublasHandle_t handle;
cublasCreate(&handle);
const double coeff = 1;
//column major matrix multiplication
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, 3, 5, 4, &coeff, cuda_mat1, 3, cuda_mat2, 4, &coeff, cuda_matr, 3);
//row major matrix multiplication
//cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_T, 3, 5, 4, &coeff, cuda_mat1, 3, cuda_mat2, 4, &coeff, cuda_matr, 3);
cout << "cublas result:" << endl;
cout << "matrix A:" << endl;
for (int i = 0; i < n1; i++) {
cout << fixed << setprecision(3);
cout << temp1[i/4 + (i%4 *3)] << " ";
if ((i + 1) % 4 == 0) {
cout << endl;
}
}
cout << endl << "matrix B:" << endl;
for (int i = 0; i < n2; i++) {
cout << temp2[i/5 + (i%5*4)] << " ";
if ((i + 1) % 5 == 0) {
cout << endl;
}
}
cout << endl << "result: " << endl;
cudaMemcpy(temp3, cuda_matr, sizer, cudaMemcpyDeviceToHost);
for (int i = 0; i < n3; i++) {
cout << temp3[i/5 + (i%5 *3)] << " ";
if ((i + 1) % 5 == 0) {
cout << endl;
}
}
cout << endl;
system("pause");
cudaFree(cuda_mat1);
cudaFree(cuda_mat2);
cudaFree(cuda_matr);
cublasDestroy(handle);
free(temp1);
free(temp2);
free(temp3);
return 0;
}