Hi. I've computed a matrix multiplication with cuBLAS on a GT 330M.
As far as I know, the GT 330M is rated at around 180 GFLOPS, but I can't get my laptop anywhere near that figure.
(The performance I measure is around 61~62 GFLOPS.)
What's wrong with my code? Or is the spec of the GT 330M different from what I've read?
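Just to make the numbers clear: I compute GFLOPS by dividing the usual SGEMM operation count, m*n*(2k-1), by the measured time, the same formula as in the code below. Here is that arithmetic pulled out as a standalone sanity check (the 4 ms elapsed time is only an example value that reproduces my ~62 GFLOPS):

#include <stdio.h>

int main(void)
{
    const double m = 500.0, n = 500.0, k = 500.0;
    double flops = m * n * (2.0 * k - 1.0);     /* ~2.4975e8 floating-point operations per SGEMM */
    double elapsed_ms = 4.0;                    /* example value: ~4 ms reproduces my ~62 GFLOPS */
    double gflops = (flops / elapsed_ms) / 1e6; /* flops per millisecond -> GFLOPS */
    printf("%.2f GFLOPS\n", gflops);            /* prints 62.44 with these numbers */
    return 0;
}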
Anyway, here is my code (Win7, 32-bit, VS2010):
#include <cuda_runtime.h>
#include <cublas.h>
#include <malloc.h>
#include <stdio.h>
void cudaBlasErr(const char *msg){
cublasStatus err = cublasGetError();
if( err !=CUBLAS_STATUS_SUCCESS) {
printf("cuBLAS error %d: %s\n", err, msg);
}
}
void cudaErr(const char *msg){
cudaError_t err = cudaGetLastError();
if( err !=cudaSuccess) {
printf("%d %s %s \n", err, msg, cudaGetErrorString(err));
}
}
int main(int argc, char *argv[]) {
cublasStatus stat;
float *ah,*bh,*ch;
float *ad,*bd,*cd;
float alpha = 1.0f;
float beta = 0.0f;
const int m = 500;
const int n = 500;
const int k = 500;
int n1 = m*k; // number of elements of A (m x k)
int n2 = k*n; // number of elements of B (k x n)
int n3 = m*n; // number of elements of C (m x n)
int N1bytes, N2bytes,N3bytes;
N1bytes = n1 * sizeof(float);
N2bytes = n2 * sizeof(float);
N3bytes = n3 * sizeof(float);
ah = (float*) malloc(N1bytes); cudaErr("Allocation of ah on the host");
// matrix A (n1 elements) - m x k
bh = (float*) malloc(N2bytes); cudaErr("Allocation of bh on the host");
// matrix B (n2 elements) - k x n
ch = (float*) malloc(N3bytes); cudaErr("Allocation of ch on the host");
// matrix C (n3 elements, output) - m x n
for(int i = 0;i < n1; i++)
{
ah[i] = (float)i;
}
for(int i = 0;i < n2; i++)
{
bh[i] = (float)i;
}
cublasInit();
cublasAlloc(n1,sizeof(float),(void**)&ad); cudaErr("Allocation of n1 on the device 0");
cublasAlloc(n2,sizeof(float),(void**)&bd); cudaErr("Allocation of n2 on the device 0");
cublasAlloc(n3,sizeof(float),(void**)&cd); cudaErr("Allocation of n3 on the device 0"); // device memory allocated through cuBLAS
cublasSetVector(n1,sizeof(float),ah,1,ad,1); cudaBlasErr("Set Vector n1");
cublasSetVector(n2,sizeof(float),bh,1,bd,1); cudaBlasErr("Set Vector n2");
cublasSetVector(n3,sizeof(float),ch,1,cd,1); cudaBlasErr("Set Vector n3"); // copy host -> device (ch is still uninitialized here, which is harmless because beta = 0 overwrites C)
cudaEvent_t start,stop;
float time1;
cudaEventCreate(&start); cudaEventCreate(&stop); // start & creation of cudaEvent
cudaEventRecord( start, 0 );
cublasSgemm('n','n',m,n,k,alpha,ad,m,bd,k,beta,cd,m); cudaBlasErr("SGEMM C = A*B"); // 'n','n' = no transpose; dims m,n,k; alpha and beta as in C = alpha*A*B + beta*C; lda = m, ldb = k, ldc = m
cudaEventRecord( stop, 0 ); cudaEventSynchronize( stop );
cudaEventElapsedTime( &time1, start, stop ); // Estimation of ElapsedTime
cudaEventDestroy( start ); cudaEventDestroy( stop );
cublasGetVector(n3,sizeof(float),cd,1,ch,1); cudaBlasErr("Get Vector n3");
printf("%f\n",time1); // unit is ms.
float (*output)[n] = (float (*)[n]) malloc(N3bytes); // heap allocation; a 500x500 float array (~1 MB) does not fit safely in the default 1 MB stack
int i = 0;
int j = 0;
for(int l = 0; l < n3; l++)
{
output[i][j] = ch[l];
i++;
if((l+1) % m == 0)
{
j++;
i = 0;
}
}
printf("%.2f\n",output[0][0]);
double gflops = ((double)m * n * ((2. * k) - 1.) / time1) / 1e6; // flops per ms -> GFLOPS; on my laptop this comes out around 62 GFLOPS
printf("%.6f\n",gflops);
free(ah); free(bh); free(ch); free(output);
cublasFree(ad);
cublasFree(bd);
cublasFree(cd);
cublasShutdown();
return 0;
}
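By the way, to be sure the measured time really corresponds to a full 500x500x500 multiply, output[0][0] could be checked against a quick CPU reference. This is only a sketch and not part of the program above (cpuC00 is a made-up helper name); it assumes the same column-major layout that cublasSgemm uses, and with single-precision rounding the two values will only match approximately:

/* CPU reference for element C(0,0) of C = A*B, column-major like cuBLAS */
float cpuC00(const float *a, const float *b, int m, int k)
{
    float acc = 0.0f;
    for (int l = 0; l < k; l++)
        acc += a[l * m] * b[l]; /* A(0,l) = a[l*m] with lda = m; B(l,0) = b[l] with ldb = k */
    return acc;
}
/* usage, after filling ah and bh as in the program: printf("%.2f\n", cpuC00(ah, bh, m, k)); */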