Hi, I just started coding and I need some help with cublasGemmEx(). I don't really understand how the leading dimension works. Assume I have two arrays, A[r][c] and B[c][k], and I wish to multiply them as A×B.

What would lda, ldb and ldc be in this case?

Also, here is the code that I was trying:

```
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <cublas_v2.h>
#include <curand.h>
#include <stdlib.h>
#include <assert.h>
#include <time.h>
#include <stdio.h>
#include <math.h>
#include <iostream>
#include <cstdlib>
using namespace std;
// Print a rowCount x colCount matrix stored in column-major order (the layout
// cuBLAS uses), one matrix row per output line with tab-separated values.
//
//   rowCount - number of rows; also the stride between consecutive columns
//   colCount - number of columns
//   matrix   - host pointer to at least rowCount * colCount elements
template<typename T> void printMatrix(int rowCount, int colCount, const T* matrix) {
    for (int i = 0; i < rowCount; i++) {
        for (int j = 0; j < colCount; j++) {
            // Element (i, j) of a column-major matrix lives at j * rowCount + i.
            // Using colCount as the stride reads the wrong elements and runs
            // past the end of the buffer whenever colCount > rowCount.
            std::cout << matrix[j * rowCount + i] << "\t";
        }
        std::cout << std::endl;
    }
}
int main() {
// Problem size
int r = 4;
int c = 3;
// Declare pointers to matrices on device and host
float* h_a, * h_b, * h_c;
float* d_a, * d_b, * d_c;
size_t bytesa = r * c * sizeof(float);
size_t bytesb = c * r * sizeof(float);
size_t bytesc = r * r * sizeof(float);
// Allocate memory
h_a = (float*)malloc(bytesa);
h_b = (float*)malloc(bytesb);
h_c = (float*)malloc(bytesc);
cudaMalloc(&d_a, bytesa);
cudaMalloc(&d_b, bytesb);
cudaMalloc(&d_c, bytesc);
// Pseudo random number generator
curandGenerator_t prng;
curandCreateGenerator(&prng, CURAND_RNG_PSEUDO_DEFAULT);
// Set the seed
curandSetPseudoRandomGeneratorSeed(prng, (unsigned long long)clock());
// Fill the matrix with random numbers on the device
curandGenerateUniform(prng, d_a, r * c);
curandGenerateUniform(prng, d_b, c * r);
// cuBLAS handle
cublasHandle_t handle;
cublasCreate(&handle);
// Scalaing factors
float alpha = 1.0f;
float beta = 0.0f;
cublasStatus_t status;
status = cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, r, c, r, &alpha, d_a,CUDA_R_16F, r, d_b,
CUDA_R_16F, c, &beta, d_c,CUDA_R_16F, r, CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
// Copy back the three matrices
cudaMemcpy(h_a, d_a, bytesa, cudaMemcpyDeviceToHost);
cudaMemcpy(h_b, d_b, bytesb, cudaMemcpyDeviceToHost);
cudaMemcpy(h_c, d_c, bytesc, cudaMemcpyDeviceToHost);
printMatrix(r, c, h_a);
printMatrix(c, r, h_b);
printMatrix(r, r, h_c);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf(stderr, "!!!! kernel execution error.\n");
return EXIT_FAILURE;
}
// Verify solution
//verify_solution(h_a, h_b, h_c, n);
printf("COMPLETED SUCCESSFULLY\n");
return 0;
}
```

But I get some errors:

```
** On entry to GEMM_EX parameter number 12 had an illegal value
0.45563 0.444989 0.831396
0.796939 0.67204 0.182175
0.178881 0.830081 0.939539
0.444989 0.831396 0.703919
0.0115277 0.983714 0.824535 -4.22017e+37
0.241401 0.36433 0.539053 0
0.0816302 0.27666 0.749799 -3.77434e-28
0 0 0 0
0 0 0 0
0 0 0 0
0 0 0 0
!!!! kernel execution error.
```

I would greatly appreciate any help.

.