Characterization of Tesla C2050 GPU for matrix-vector multiplication

Hi, I am trying to characterize my Tesla C2050 GPU for matrix-vector multiplication by block decomposition. I have taken a 24000 x 24000 matrix generated randomly, and the corresponding vector is also generated randomly. I have used gettimeofday() to measure the time taken for the matrix-vector multiplication. The matrix has been divided into different blocks: a single 24000 x 24000 block, four 12000 x 12000 blocks, three 7000 x 7000 blocks (with the rest of the elements in 3000 x 7000 and 7000 x 3000 blocks), 16 blocks of 6000 x 6000, 25 blocks of 4800 x 4800, and 36 blocks of 4000 x 4000. I am using CUBLAS functions for copying the matrix to GPU memory, performing the multiplication, and copying the result back to host memory. For each block size, the code was run 600 times in one trial and 1000 times in the next trial (the system was restarted before performing the trials), and I got the average times specified in the attachment…

[attachment=22185:timing.jpg]

In both cases the convergence of the average has been checked, and no other process was running in parallel except the OS's own processes. In this scenario, how can I characterize the GPU with respect to matrix-vector multiplication? Please suggest. [External Image]
System specifications:
OS: CentOS 5
CUDA 4
Four Tesla C2050 GPUs (out of which only one has been used)
Processor: two Intel Xeon quad-core CPUs.

                               /*****************************************                    CODE                 ***********************************************/

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>

#include <cuda.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>

/* Number of timed repetitions of the complete blocked matrix-vector product. */
#define MAX_COUNT 1000

/* Abort with a diagnostic if a CUDA runtime call did not succeed. */
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/* Abort with a diagnostic if a cuBLAS call did not succeed. */
static void checkCublas(cublasStatus_t status, const char *what)
{
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "cuBLAS error in %s (status %d)\n", what, (int)status);
        exit(EXIT_FAILURE);
    }
}

/*
 * Wall-clock interval between two gettimeofday() samples, in microseconds.
 * The subtraction is done per field before scaling so that no intermediate
 * value overflows on platforms where time_t/long is 32 bits wide.
 */
static double elapsedMicroseconds(const struct timeval *start,
                                  const struct timeval *end)
{
    return (double)(end->tv_sec - start->tv_sec) * 1.0e6
         + (double)(end->tv_usec - start->tv_usec);
}

/*
 * Compute B = A * x on the device, one block of A at a time.
 *
 * A is an m x n matrix cut into blocks of at most a x a elements; the last
 * block row / block column holds the remainder when a does not divide m / n.
 * The host copy of A is read block after block (block rows outermost), each
 * block being contiguous and column-major with a leading dimension equal to
 * its own row count, as passed to cublasSetMatrix below.
 *
 *   handle         cuBLAS handle used for the GEMV calls
 *   A, x           host matrix (m*n floats) and host input vector (n floats)
 *   B              host output vector (m floats)
 *   m, n, a        matrix dimensions and nominal block edge length
 *   A_d, x_d, B_d  device scratch buffers holding at least a*a, a and a floats
 *
 * Exits the process through checkCublas() if any cuBLAS call fails.
 */
static void blockedGemv(cublasHandle_t handle,
                        const float *A, const float *x, float *B,
                        int m, int n, int a,
                        float *A_d, float *x_d, float *B_d)
{
    const float alpha = 1.0f;
    const int blockRows = (m + a - 1) / a;   /* ceil(m / a) */
    const int blockCols = (n + a - 1) / a;   /* ceil(n / a) */
    const float *blockPtr = A;
    int r, c;

    for (r = 0; r < blockRows; r++) {
        /* Only the last block row can be shorter than a. */
        const int rows = (r == blockRows - 1) ? (m - r * a) : a;

        for (c = 0; c < blockCols; c++) {
            /* Only the last block column can be narrower than a. */
            const int cols = (c == blockCols - 1) ? (n - c * a) : a;
            /* The first block of a row overwrites B_d, later ones accumulate. */
            const float beta = (c == 0) ? 0.0f : 1.0f;

            checkCublas(cublasSetMatrix(rows, cols, sizeof(float),
                                        blockPtr, rows, A_d, rows),
                        "cublasSetMatrix(A block)");
            checkCublas(cublasSetMatrix(cols, 1, sizeof(float),
                                        x + (size_t)c * a, cols, x_d, cols),
                        "cublasSetMatrix(x block)");
            checkCublas(cublasSgemv(handle, CUBLAS_OP_N, rows, cols, &alpha,
                                    A_d, rows, x_d, 1, &beta, B_d, 1),
                        "cublasSgemv");

            blockPtr += (size_t)rows * cols;
        }

        /* One slice of the result is complete once a whole block row is done. */
        checkCublas(cublasGetMatrix(rows, 1, sizeof(float),
                                    B_d, rows, B + (size_t)r * a, rows),
                    "cublasGetMatrix(B block)");
    }
}

/*
 * Benchmark driver: times MAX_COUNT blocked matrix-vector products of a
 * random 24000 x 24000 matrix with block edge 4000, writes every sample and
 * the final average (in microseconds) to block_4000.txt.
 *
 * NOTE(review): as in the original measurement, the timed region contains the
 * device allocations, cublasCreate/cublasDestroy, all host<->device copies
 * and the GEMV calls. This is kept so results stay comparable with earlier
 * runs; hoist the allocation and handle creation out of the loop if only
 * transfer + compute time is of interest.
 */
int main(void)
{
    int m, n, a, count;
    float *A, *x, *B, *A_d, *x_d, *B_d, *s;
    double diff, avg, temp = 0;
    size_t elems_A, k;
    size_t size_A, size_x, size_B, size_block, size_x_block;
    cublasHandle_t handle;
    struct timeval startTime;
    struct timeval endTime;
    FILE *fp;

    /*printf("Enter the number of rows of the matrix A:\n");
    scanf("%d", &m);
    printf("Enter the number of columns of the matrix A:\n");
    scanf("%d", &n);
    */
    m = 24000;
    n = 24000;
    a = 4000;

    /* Allocation of memory in host (all sizes computed in size_t). */
    elems_A = (size_t)m * (size_t)n;
    size_A = elems_A * sizeof(float);
    size_x = (size_t)n * sizeof(float);
    size_B = (size_t)m * sizeof(float);          /* B has one entry per row of A */
    size_block = (size_t)a * (size_t)a * sizeof(float);
    size_x_block = (size_t)a * sizeof(float);

    x = (float*)malloc(size_x);
    A = (float*)malloc(size_A);
    B = (float*)malloc(size_B);
    if (x == NULL || A == NULL || B == NULL) {
        fprintf(stderr, "Host memory allocation failed\n");
        exit(EXIT_FAILURE);
    }

    fp = fopen("block_4000.txt", "w");
    if (fp == NULL) {
        printf("Error opening file");
        exit(EXIT_FAILURE);
    }
    fflush(stdout);

    /* Throw-away allocation before timing starts; presumably here to force
       CUDA context creation outside the timed region -- TODO confirm. */
    checkCuda(cudaMalloc((void**)&s, sizeof(float)), "cudaMalloc(warm-up)");
    checkCuda(cudaFree(s), "cudaFree(warm-up)");

    for (count = 0; count < MAX_COUNT; count++) {
        /* Filling up x and then A with random real numbers in [0, 999]. */
        for (k = 0; k < (size_t)n; k++) {
            x[k] = (float)(rand() % 1000);
        }
        for (k = 0; k < elems_A; k++) {
            A[k] = (float)(rand() % 1000);
        }

        gettimeofday(&startTime, NULL);

        /* Allocation of memory in device */
        checkCuda(cudaMalloc((void**)&A_d, size_block), "cudaMalloc(A_d)");
        checkCuda(cudaMalloc((void**)&x_d, size_x_block), "cudaMalloc(x_d)");
        checkCuda(cudaMalloc((void**)&B_d, size_x_block), "cudaMalloc(B_d)");

        checkCublas(cublasCreate(&handle), "cublasCreate");
        blockedGemv(handle, A, x, B, m, n, a, A_d, x_d, B_d);
        checkCublas(cublasDestroy(handle), "cublasDestroy");

        checkCuda(cudaFree(A_d), "cudaFree(A_d)");
        checkCuda(cudaFree(x_d), "cudaFree(x_d)");
        checkCuda(cudaFree(B_d), "cudaFree(B_d)");

        gettimeofday(&endTime, NULL);

        diff = elapsedMicroseconds(&startTime, &endTime);
        temp += diff;
        fprintf(fp, "%lf\t", diff);
        fprintf(fp, "\n\n");
    }

    avg = temp / MAX_COUNT;
    fprintf(fp, "\nAvg. wall time taken %lf microseconds\n", avg);
    fclose(fp);
    free(A);
    free(B);
    free(x);
    return 0;
}