Matrix Multiplication by cublasSgemm

I have a simple matrix multiplication program using cublasSgemm function:

[codebox]#include <stdio.h>

#include <stdlib.h>

#include <malloc.h>

#include <sys/time.h>

#include <cublas.h>

int main ( int argc, char** argv ) {

int i;

cublasStatus status;

struct timeval timev1,timev2;

float time_seconds;

int N = atoi ( argv [ 1 ] );

printf ( “N = %d\n”, N );

float A , B, *C;

float *d_A, *d_B, *d_C;

A = (float*) malloc ( N * N * sizeof ( float ) );

B = (float*) malloc ( N * N * sizeof ( float ) );

C = (float*) malloc ( N * N * sizeof ( float ) );

for ( i = 0; i < N * N; i++ ) {

A [ i ] = 0.1;

B [ i ] = 0.2;

C [ i ] = 0.0;

}

cublasInit();

status = cublasAlloc ( N * N, sizeof(float), (void**)&d_A);

status = cublasAlloc ( N * N, sizeof(float), (void**)&d_B);

status = cublasAlloc ( N * N, sizeof(float), (void**)&d_C);

cublasSetMatrix ( N, N, sizeof(float), (void *) A, N, (void *) d_A, N);

cublasSetMatrix ( N, N, sizeof(float), (void *) B, N, (void *) d_B, N);

gettimeofday(&timev1,NULL);

cublasSgemm ( ‘n’, ‘n’, N, N, N, 1.0f, d_A, N, d_B, N, 1.0f, d_C, N );

cublasGetMatrix ( N, N, sizeof(float), (void *) d_C, N, (void *) C, N );

gettimeofday(&timev2,NULL);

time_seconds=timev2.tv_sec-timev1.tv_sec+0.000001*(timev2.tv_usec-timev1.tv_usec);

cublasFree (d_A);

cublasFree (d_B);

cublasFree (d_C);

cublasShutdown();

printf(“Elapsed time = %.4f\n”,time_seconds);

printf ( “C[0,0]=%f\n”, C[0] );

}[/codebox]

Then I compile it by ordinary way:

bash-3.2$ make

g++ -O3 -I/usr/local/cuda/include -c main.cpp

g++ -L/usr/local/cuda/lib64 -lcublas -o MatMul main.o

But the repeated runs of program give the different results,

in fact, each run adds value ~20 - a correct result of multiplication - to result:

bash-3.2$ make

g++ -O3 -I/usr/local/cuda/include -c main.cpp

g++ -L/usr/local/cuda/lib64 -lcublas -o MatMul main.o

bash-3.2$ ./MatMul 1024

N = 1024

Elapsed time = 0.0081

C[0,0]=61.626236

bash-3.2$ ./MatMul 1024

N = 1024

Elapsed time = 0.0082

C[0,0]=82.106514

bash-3.2$ ./MatMul 1024

N = 1024

Elapsed time = 0.0079

C[0,0]=102.586792

make-3.2$

g++ -O3 -I/usr/local/cuda/include -c main.cpp

g++ -L/usr/local/cuda/lib64 -lcublas -o MatMul main.o

bash-3.2$ ./MatMul 1024

N = 1024

Elapsed time = 0.0081

C[0,0]=123.067070

bash-3.2$ ./MatMul 1024

N = 1024

Elapsed time = 0.0078

C[0,0]=143.547348

bash-3.2$ ./MatMul 1024

N = 1024

Elapsed time = 0.0079

C[0,0]=164.027618

What is wrong in my program ?

Thanks.

The problem has been solved - in the program was missed

cublasSetMatrix ( N, N, sizeof(float), (void *) C, N, (void *) d_C, N);