I am calling cuBLAS's batched GEMM (the code below uses cublasZgemmBatched, the double-complex variant of cublasSgemmBatched) to multiply a 181 * 35 * 3 array by a 35 * 1 * 3 array as three independent batches, that is, three products of a 181 * 35 sub-matrix with a 35 * 1 sub-matrix. However, I do not understand the parameters very well (in particular the leading dimensions and the pointer arrays), and the code I wrote never runs correctly.
The code is as follows:
int BF_num=181;
int M_Array=35;
int NF=1;
cuDoubleComplex* a = (cuDoubleComplex*)malloc(BF_num * M_Array * 3 * sizeof(cuDoubleComplex));
cuDoubleComplex* b = (cuDoubleComplex*)malloc(M_Array * NF * 3 * sizeof(cuDoubleComplex));
cuDoubleComplex* c = (cuDoubleComplex*)malloc(BF_num * NF * 3 * sizeof(cuDoubleComplex));
int m = BF_num, n = NF, k = M_Array;
for(int i = 0; i < BF_num * M_Array * 3; i++)
a[i].x=i,a[i].y=i;
for(int j = 0; j < 1 * M_Array * 3; j++)
b[j].x=j,b[j].x=j;
for(int j = 0; j < 1 * BF_num * 3; j++)
c[j].x=j,c[j].y=j;;
cublasHandle_t handle_p;
cublasCreate(&handle_p);
cuDoubleComplex alpha = {1.0,0};//cublas
cuDoubleComplex beta = {0,0};//cublas
int batch=3;
int Ida = batch, Idb = batch, Idc = batch;
cuDoubleComplex *a_array[batch];
cuDoubleComplex *b_array[batch];
cuDoubleComplex *c_array[batch];
for (int i = 0; i < 3; i++) {
a_array[i] = a + i * BF_num ;
b_array[i] = b + i * M_Array;
c_array[i] = c + i * BF_num;
}
#pragma acc enter data copyin(a[0:BF_num * M_Array * batch],b[0:M_Array * NF * batch],c[0:BF_num * NF * batch],a_array[0:batch],b_array[0:batch],c_array[0:batch])
#pragma acc host_data use_device(a,b,c,a_array,b_array,c_array)
cublasZgemmBatched(handle_p,CUBLAS_OP_N,CUBLAS_OP_N,n,m,k,&alpha,b_array,batch,a_array,batch,&beta,c_array,batch,batch);
#pragma acc exit data copyout(c[0:BF_num * 1 * 3])
for(int j = 0; j < BF_num / 3; j++)
printf("c[%d]=%f\n",j,c[j]);