Hi,

I was comparing the computation time (allocation, transfer, computation) of a Matlab matrix-vector and a CUDA matrix-vector multiplication using the mex options in Matlab, as suggested by NVIDIA. In contrast to the matrix-matrix multiplication cublasSgemm, the function cublasSgemv has poor results regarding the computation time: up to 6x speed-up for cublasSgemm, but at least a 2.5x slow-down for cublasSgemv!

Both programs are written in the same way - the only difference is the use of different set and get methods (cublasGet/SetVector/Matrix) and the computation functions (cublasSgemm/Sgemv).

See the results :

SGEMM

SGEMV

Where is the problem ?

In order to see where the bottleneck is, I wrote the following test program. I get weird results when the matrix dim is 1000x1000: 97% of the overall computation time is spent transferring data?! Is there a problem with cublasSetVector/cublasGetVector, or with the timing function?

```
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA: time allocation, host<->device transfer, and a
//! single cublasSgemv (y := 1*A*x + 1*y, A is m x n, column-major, no transpose).
//! Matrix/vector dimensions are read interactively from stdin.
////////////////////////////////////////////////////////////////////////////////
void
runTest( int argc, char** argv)
{
    unsigned int timer_allocation = 0;
    unsigned int timer_transfer = 0;
    unsigned int timer_computation = 0;
    float total_time = 0;
    float* device_A = NULL;
    float* device_x = NULL;
    float* device_y = NULL;
    float* host_A = NULL;
    float* host_x = NULL;
    float* host_y = NULL;
    int m = 0, n = 0;
    int mem_size_matrix, mem_size_x, mem_size_y;

    printf("Enter matrix dimensions!\n");
    printf("Enter number of rows: ");
    if (scanf("%d", &m) != 1 || m <= 0) {
        fprintf(stderr, "Invalid number of rows.\n");
        return;
    }
    printf("Enter number of cols: ");
    if (scanf("%d", &n) != 1 || n <= 0) {
        fprintf(stderr, "Invalid number of cols.\n");
        return;
    }

    mem_size_matrix = sizeof(float) * m * n;
    // For the non-transposed Sgemv, x has n elements but y has m elements.
    // The original code sized y with n, which overflows the device buffer
    // (and the final cublasGetVector) whenever m > n.
    mem_size_x = sizeof(float) * n;
    mem_size_y = sizeof(float) * m;

    // allocate host memory
    host_A = (float*) malloc(mem_size_matrix);
    host_x = (float*) malloc(mem_size_x);
    host_y = (float*) malloc(mem_size_y);
    if (host_A == NULL || host_x == NULL || host_y == NULL) {
        fprintf(stderr, "Host allocation failed.\n");
        free(host_A); free(host_x); free(host_y);
        return;
    }

    // initialize host data
    randomInit(host_A, m * n);
    randomInit(host_x, n);
    randomInit(host_y, m);          // y is length m, not n

    if (cublasInit() != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "cublasInit failed.\n");
        free(host_A); free(host_x); free(host_y);
        return;
    }

    cutCreateTimer( &timer_allocation);
    cutCreateTimer( &timer_transfer);
    cutCreateTimer( &timer_computation);

    // allocate device memory
    cutStartTimer( timer_allocation);
    cudaMalloc( (void**) &device_A, mem_size_matrix);
    cudaMalloc( (void**) &device_x, mem_size_x);
    cudaMalloc( (void**) &device_y, mem_size_y);
    cutStopTimer( timer_allocation);

    // copy host memory to device
    cutStartTimer( timer_transfer);
    cublasSetMatrix(m, n, sizeof(float), host_A, m, device_A, m);
    cublasSetVector(n, sizeof(float), host_x, 1, device_x, 1);
    cublasSetVector(m, sizeof(float), host_y, 1, device_y, 1);
    cutStopTimer( timer_transfer);

    // computation
    cutStartTimer( timer_computation);
    cublasSgemv('n', m, n, 1.0f, device_A, m, device_x, 1, 1.0f, device_y, 1);
    // cublasSgemv only ENQUEUES the kernel and returns immediately. Without
    // this synchronization the "computation" timer measures only the launch
    // overhead, and the kernel's runtime is silently charged to the next
    // blocking call (the cublasGetVector below) — which is why the transfer
    // phase appeared to take ~97% of the total time.
    cudaThreadSynchronize();
    cutStopTimer( timer_computation);

    // copy result from device to host
    cutStartTimer( timer_transfer);
    cublasGetVector(m, sizeof(float), device_y, 1, host_y, 1);
    cutStopTimer( timer_transfer);

    total_time = cutGetTimerValue(timer_allocation) + cutGetTimerValue(timer_transfer) + cutGetTimerValue(timer_computation);
    printf( "Allocation : %f (ms) : %3.2f %% \n", cutGetTimerValue(timer_allocation), 100 * cutGetTimerValue(timer_allocation) / total_time);
    printf( "Transfer   : %f (ms) : %3.2f %% \n", cutGetTimerValue(timer_transfer), 100 * cutGetTimerValue(timer_transfer) / total_time);
    printf( "Computation: %f (ms) : %3.2f %% \n", cutGetTimerValue(timer_computation), 100 * cutGetTimerValue(timer_computation) / total_time);
    printf( "Overall    : %f (ms)\n", total_time);

    cutDeleteTimer( timer_allocation);
    cutDeleteTimer( timer_transfer);
    cutDeleteTimer( timer_computation);

    // cleanup memory
    free( host_A);
    free( host_x);
    free( host_y);
    cublasFree(device_A);
    cublasFree(device_y);
    cublasFree(device_x);
    cublasShutdown();
}
// Fills `data` with `size` pseudo-random floats uniformly drawn from [0, 1].
// Uses rand(), so the generated sequence is reproducible via srand().
void randomInit(float* data, int size)
{
    float* cursor = data;
    float* end = data + size;
    while (cursor != end) {
        *cursor++ = rand() / (float)RAND_MAX;
    }
}
```

Thanks for help. Cem