Hi, here’s the function that uses cuBLAS:
// Element type of every host and device matrix in this file.
// GPU_MultiStridedBatch calls the single-precision cublasSgemmStridedBatched,
// so this alias has to stay `float` unless that call is changed as well.
typedef float mytype;
void GPU_MultiStridedBatch(mytype *M, mytype *N, mytype *P, size_t pr, size_t pc, size_t mc, mytype alpha, mytype beta, int num_mat, int niter)
{
mytype *devM, *devN, *devP;
size_t p_size =sizeof(mytype) *pr*pc;
size_t m_size =sizeof(mytype) *pr*mc;
size_t n_size =sizeof(mytype) *mc*pc;
cudaMalloc((void**)&devM, m_size*num_mat );
cudaMalloc((void**)&devN, n_size*num_mat );
cudaMalloc((void**)&devP, p_size*num_mat );
cudaMemcpy(devM, M, m_size*num_mat , cudaMemcpyHostToDevice);
cudaMemcpy(devN, N, n_size*num_mat , cudaMemcpyHostToDevice);
cublasHandle_t myhandle;
cublasStatus_t cublas_result;
cublas_result = cublasCreate(&myhandle);
assert(cublas_result == CUBLAS_STATUS_SUCCESS);
for (int i=0; i<niter; i++){
cublas_result = cublasSgemmStridedBatched(myhandle, CUBLAS_OP_N, CUBLAS_OP_N
, pr, pc, mc
, &alpha, devM, pr, pr*mc, devN, mc, mc*pc
, &beta, devP, pr, pr*pc
, num_mat);
}
assert(cublas_result == CUBLAS_STATUS_SUCCESS);
cudaMemcpy(P, devP, p_size*num_mat, cudaMemcpyDeviceToHost);
cudaFree(devM);
cudaFree(devN);
cudaFree(devP);
cublasDestroy(myhandle);
}
And here’s the CPU version:
// One set of fixed-size matrices: the two operands and the product buffer
// that CPU_multi fills with p = m * n.
struct element {
    mytype m[ROWM][COLM];  // left operand,  ROWM x COLM
    mytype n[COLM][COLN];  // right operand, COLM x COLN
    mytype p[ROWM][COLN];  // result,        ROWM x COLN
};
int CPU_multi(int num_mat, int niter)
{
struct element* elms;
elms =new element[num_mat];
for(int i=0; i<num_mat; i++){
for(int j=0; j<ROWM; j++)
for(int k=0; k<COLM; k++)
elms[i].m[j][k] = 3.0f;
for(int j=0; j<COLM; j++)
for(int k=0; k<COLN; k++)
elms[i].n[j][k] = 2.0f;
for(int j=0; j<ROWM; j++)
for(int k=0; k<COLN; k++)
elms[i].p[j][k] = 0.0f;
}
double t1 = omp_get_wtime();
for(int it = 0; it<niter; it++)
for(int k =0; k<num_mat; k++)
for(int i=0; i< ROWM; i++){
for(int j=0; j<COLN; j++){
elms[k].p[i][j] = 0.0f;
for(int m=0; m<COLM; m++){
elms[k].p[i][j] += elms[k].m[i][m] * elms[k].n[m][j];
}
}
}
double t2 = omp_get_wtime();
printf("CPU serial time : %e seconds \n", t2-t1);
delete elms;
return 0;
}
Thanks!