Thank you so much, I am very glad to post my code here and let someone help me find the error. I am new in CUDA so please help me.

Here is the mex file for cublasCgemm function, which computes multiplication of two complex matrixes.

```
#include <stdio.h>
#include <stdlib.h>
#include "mex.h"
#include "cublas.h"
void pack_r2c(cuComplex *output_float,
float *input_re,
int Ntot)
{
int i;
for (i = 0; i < Ntot; i++)
{
output_float[i].x = input_re[i];
output_float[i].y = 0.0f;
}
}
void pack_c2c(cuComplex *output_float,
float *input_re,
float *input_im,
int Ntot)
{
int i;
for (i = 0; i < Ntot; i++)
{
output_float[i].x = input_re[i];
output_float[i].y = input_im[i];
}
}
void unpack_c2c(cuComplex *input_float,
float *output_re,
float *output_im,
int Ntot)
{
int i;
for (i = 0; i < Ntot; i++)
{
output_re[i] = input_float[i].x;
output_im[i] = input_float[i].y;
}
}
void mexFunction(int nlhs, mxArray *plhs[],int nrhs,const mxArray *prhs[])
{
cuComplex* h_A; // h_A is in host memory
cuComplex* h_B;
cuComplex* h_C;
cuComplex* d_A = 0; // d_A is in device memory
cuComplex* d_B = 0;
cuComplex* d_C = 0;
int RowsA,RowsB,RowsC,ColsA,ColsB,ColsC;
cuComplex alpha = {1,0};
cuComplex beta = {0,0};
char transa, transb;
int lda,ldb,ldc;
float *ar,*ai;
//check the numbers of input and output arguments
if (nrhs!=2){
mexErrMsgTxt("FimageMul_CUDA requires 2 input arguments: matrix A, B ");
}else if (nlhs!=1){
mexErrMsgTxt("FimagMul_CUDA requires 1 output arguments");
}
transa = 'n';
transb = 'n';
RowsA = mxGetM(prhs[0]);
ColsA = mxGetN(prhs[0]);
RowsB = mxGetM(prhs[1]);
ColsB = mxGetN(prhs[1]);
if (ColsA != RowsB){
mexErrMsgTxt("the second image's rows(height) should= the first image's columns(width)");
return;
}
RowsC = RowsA;
ColsC = ColsB;
// get the input matrix A
/* Allocating working array on host */
h_A = (cuComplex*) mxCalloc(ColsA*RowsA,sizeof(cuComplex));
/* Pointer for the real part of the input matrix A*/
ar = (float *) mxGetData(prhs[0]);
if(mxIsComplex(prhs[0]))
{
/* The input array is complex */
ai = (float *) mxGetImagData(prhs[0]);
pack_c2c(h_A, ar, ai, ColsA*RowsA);
}
else
{
/* The input array is real */
pack_r2c(h_A, ar, ColsA*RowsA);
}
// get the input Array B
/* Allocating working array on host */
h_B = (cuComplex*) mxCalloc(ColsB*RowsB,sizeof(cuComplex));
/* Pointer for the real part of the input Array B*/
ar = (float *) mxGetData(prhs[1]);
if(mxIsComplex(prhs[1]))
{
/* The input array is complex */
ai = (float *) mxGetImagData(prhs[1]);
pack_c2c(h_B, ar, ai, ColsB*RowsB);
}
else
{
/* The input array is real */
pack_r2c(h_B, ar, ColsB*RowsB);
}
lda = max(1, RowsA); /* leading dimension of A*/
ldb = max(1, RowsB); /* leading dimension of B */
ldc = max(1, RowsA); /*leading dimension of C*/
int mA_lda = RowsA;
int mA_ldb = RowsA;
int mB_lda = RowsB;
int mB_ldb = RowsB;
int mC_lda = RowsC;
int mC_ldb = RowsC;
/***** Host memory allocated for result*****/
h_C = (cuComplex*) mxCalloc(ColsC*RowsC,sizeof(cuComplex));
/* Initialize CUBLAS */
cublasInit();
/* Allocate device memory for the matrices */
cublasAlloc(RowsA * ColsA, sizeof(cuComplex), (void**)&d_A);
cublasAlloc(RowsB * ColsB, sizeof(cuComplex), (void**)&d_B);
cublasAlloc(RowsC * ColsC, sizeof(cuComplex), (void**)&d_C);
/* Initialize the device matrices with the host matrices */
cublasStatus cpA = cublasSetMatrix(RowsA, ColsA, sizeof(cuComplex), h_A, mA_lda, d_A, mA_ldb);
cublasStatus cpB = cublasSetMatrix(RowsB, ColsB, sizeof(cuComplex), h_B, mB_lda, d_B, mB_ldb);
cublasStatus cpC = cublasSetMatrix(RowsC, ColsC, sizeof(cuComplex), h_C, mC_lda, d_C, mC_ldb);
cublasCgemm(transa,transb,RowsA,ColsB,ColsA,alpha,d_A,lda,d_B,ldb,beta,d_C,ldc);
cublasStatus cgRes = cublasGetError();
if (cgRes ==CUBLAS_STATUS_SUCCESS)
mexPrintf("cublasCgemm succeeds\n");
else
mexPrintf("!!!! cublasCgemm failed! (status = %x)", cgRes);
//get the result matrix ; copy from GPU tp CPU
cublasStatus cgm = cublasGetMatrix(RowsC, ColsC, sizeof(cuComplex), d_C, mC_lda, h_C, mC_lda);
// output setting up
const mwSize dims[]={RowsC,ColsC};
plhs[0] = mxCreateNumericArray(2,dims,mxSINGLE_CLASS,mxCOMPLEX);
ar = (float *)mxGetPr(plhs[0]);
ai = (float *)mxGetPi(plhs[0]);
unpack_c2c(h_C, ar, ai, RowsC*ColsC);
/* Memory clean up */
mxFree(h_A);
mxFree(h_B);
mxFree(h_C);
cublasFree(d_A);
cublasFree(d_B);
cublasFree(d_C);
/* Shutdown */
cublasShutdown();
return ;
}
```

Test code is

```
N = 32;
i = sqrt(-1);
A= rand(N,N) + i*rand(N,N);
B = rand(N,N) + i*rand(N,N);
A = single(A);
B = single(B);
disp('running on GPU');
tic
C = FimageMul_CUDA_float(A,B);
toc
disp('running on CPU');
tic
C_mat = A*B;
toc
```

One of the running result is

running on GPU

cublasCgemm succeeds

Elapsed time is 0.000751 seconds.

running on CPU

Elapsed time is 0.000024 seconds.

Am I wrong in the code?