Something looks wrong - GFLOPS of GT 330M (mobile CUDA)

Hi. I've computed a matrix multiplication using CUBLAS on a GT 330M.
As far as I know, the performance of the GT 330M is around 180 GFLOPS, but my laptop doesn't get anywhere near that figure.
(The measured performance is around 61~62 GFLOPS.)
What's wrong with my code? Or is the spec of the GT 330M different from what I thought?
Anyway, here's my code (Win7, 32-bit, VS2010):

#include <cuda_runtime.h>
#include <cublas.h>
#include <malloc.h>
#include <stdio.h>

void cudaBlasErr(const char *msg){
	// Report the status of the most recent CUBLAS call (legacy API).
	cublasStatus err = cublasGetError();
	if( err != CUBLAS_STATUS_SUCCESS ) {
		printf("Due to error %d in CUBLAS: %s\n", err, msg);
	}
}
void cudaErr(const char *msg){
	// Report the most recent CUDA runtime error, if any.
	cudaError_t err = cudaGetLastError();
	if( err != cudaSuccess ) {
		printf("%d %s %s\n", err, msg, cudaGetErrorString(err));
	}
}

int main (int argc, char *argv[]) {

cublasStatus stat;
float *ah,*bh,*ch;
float *ad,*bd,*cd;
float alpha = 1.0f;
float beta = 0.0f;
const int m = 500;
const int n = 500;
const int k = 500;
int n1 = m*k; // elements of A (m x k)
int n2 = k*n; // elements of B (k x n)
int n3 = m*n; // elements of C (m x n)
int N1bytes, N2bytes,N3bytes;



N1bytes = n1 * sizeof(float);
N2bytes = n2 * sizeof(float);
N3bytes = n3 * sizeof(float);

ah = (float*) malloc(N1bytes); // host matrix A, m x k
bh = (float*) malloc(N2bytes); // host matrix B, k x n
ch = (float*) malloc(N3bytes); // host matrix C (output), m x n
if( ah == NULL || bh == NULL || ch == NULL ) { printf("Host allocation failed\n"); return 1; }



for(int i = 0;i < n1; i++)
{
	ah[i] = (float)i;
}
for(int i = 0;i < n2; i++)
{
	bh[i] = (float)i;
}


cublasInit();


cublasAlloc(n1,sizeof(float),(void**)&ad); cudaErr("Allocation of ad on the device");
cublasAlloc(n2,sizeof(float),(void**)&bd); cudaErr("Allocation of bd on the device");
cublasAlloc(n3,sizeof(float),(void**)&cd); cudaErr("Allocation of cd on the device"); // device memory for A, B and C

cublasSetVector(n1,sizeof(float),ah,1,ad,1);	cudaBlasErr("Set Vector n1"); 
cublasSetVector(n2,sizeof(float),bh,1,bd,1);	cudaBlasErr("Set Vector n2");
cublasSetVector(n3,sizeof(float),ch,1,cd,1);	cudaBlasErr("Set Vector n3");// setVector host to device
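// Note: with beta = 0.0f, SGEMM never reads the initial contents of C,
// so copying the (uninitialized) ch buffer to the device is not strictly needed.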


cudaEvent_t start,stop; 
float time1;
cudaEventCreate(&start); cudaEventCreate(&stop); // start & creation of cudaEvent
cudaEventRecord( start, 0 );

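// CUBLAS uses column-major (Fortran-style) storage and computes C = alpha*A*B + beta*C;
// 'n','n' means neither A nor B is transposed.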
cublasSgemm('n','n',m,n,k,alpha,ad,m,bd,k,beta,cd,m); cudaBlasErr("SGEMM C = A*B"); // args: op(A), op(B), m, n, k, alpha, A and lda, B and ldb, beta, C and ldc

cudaEventRecord( stop, 0 ); cudaEventSynchronize( stop );
cudaEventElapsedTime( &time1, start, stop );  // Estimation of ElapsedTime
cudaEventDestroy( start ); cudaEventDestroy( stop );

cublasGetVector(n3,sizeof(float),cd,1,ch,1);  cudaBlasErr("Get Vector n3");



printf("%f\n",time1); // unit is ms.
	
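// CUBLAS returns C in column-major order; unpack it into a row-major 2D array for checking.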
static float output[m][n]; // static storage: a ~1 MB local array could overflow the default stack
int i = 0;
int j = 0;

for(int l = 0; l < n3; l++)
{
	output[i][j] = ch[l];
	i++;
	if((l+1) % m == 0)
	{
		j++;
		i = 0;
	}
}

printf("%.2f\n",output[0][0]);
double gflops = ((double)m * n * ((2. * k) - 1.)) / time1 / 1e6; // time1 is in ms, so (FLOPs per ms) / 1e6 = GFLOPS. On my laptop this comes out around 62 GFLOPS.
printf("%.6f\n",gflops);

free(ah); free(bh); free(ch);
cublasFree(ad); 
cublasFree(bd); 
cublasFree(cd);

cublasShutdown();

return 0;

}
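
If anyone wants to build this outside Visual Studio, a minimal command line would be something like the following (assuming the source file is saved as sgemm_test.cu; the file name is just an example):

nvcc -o sgemm_test sgemm_test.cu -lcublas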

The theoretical peak for matrix multiplication is only around 120 GFLOP/s, since SGEMM has a multiplication:addition ratio of 1:1, while the 180 GFLOP/s figure assumes a ratio of 2:1. You might also achieve slightly higher throughput with m, n, k multiples of 32.
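
Just to make that arithmetic concrete, here is a small stand-alone sketch (plain C, no CUDA required). The 48 shader cores and 1265 MHz shader clock are the commonly listed GT 330M specs and are an assumption on my part (check yours with the deviceQuery sample); the 62 GFLOP/s is the figure reported above.

#include <stdio.h>

int main(void) {
    // Assumed GT 330M configuration (verify with deviceQuery).
    const double cores = 48.0;
    const double shader_ghz = 1.265;

    // MAD-only peak, which is all SGEMM can use: 2 flops per core per clock.
    double peak_mad = cores * shader_ghz * 2.0;   // ~121 GFLOP/s
    // Marketing peak counting the extra MUL: 3 flops per core per clock.
    double peak_full = cores * shader_ghz * 3.0;  // ~182 GFLOP/s

    // FLOP count of the 500x500x500 SGEMM above and the time implied by ~62 GFLOP/s.
    double flops = 500.0 * 500.0 * (2.0 * 500.0 - 1.0);  // ~2.5e8 FLOPs
    double measured_gflops = 62.0;
    double implied_ms = flops / (measured_gflops * 1e9) * 1e3;

    printf("MAD-only peak : %.1f GFLOP/s\n", peak_mad);
    printf("MAD+MUL peak  : %.1f GFLOP/s\n", peak_full);
    printf("500^3 SGEMM at %.0f GFLOP/s takes about %.2f ms\n", measured_gflops, implied_ms);
    return 0;
}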