I have a simple matrix multiplication program that uses the cublasSgemm function:
[codebox]#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <sys/time.h>
#include <cublas.h>
int main ( int argc, char** argv ) {
int i;
cublasStatus status;
struct timeval timev1,timev2;
float time_seconds;
int N = atoi ( argv [ 1 ] );
printf ( “N = %d\n”, N );
float A , B, *C;
float *d_A, *d_B, *d_C;
A = (float*) malloc ( N * N * sizeof ( float ) );
B = (float*) malloc ( N * N * sizeof ( float ) );
C = (float*) malloc ( N * N * sizeof ( float ) );
for ( i = 0; i < N * N; i++ ) {
A [ i ] = 0.1;
B [ i ] = 0.2;
C [ i ] = 0.0;
}
cublasInit();
status = cublasAlloc ( N * N, sizeof(float), (void**)&d_A);
status = cublasAlloc ( N * N, sizeof(float), (void**)&d_B);
status = cublasAlloc ( N * N, sizeof(float), (void**)&d_C);
cublasSetMatrix ( N, N, sizeof(float), (void *) A, N, (void *) d_A, N);
cublasSetMatrix ( N, N, sizeof(float), (void *) B, N, (void *) d_B, N);
gettimeofday(&timev1,NULL);
cublasSgemm ( ‘n’, ‘n’, N, N, N, 1.0f, d_A, N, d_B, N, 1.0f, d_C, N );
cublasGetMatrix ( N, N, sizeof(float), (void *) d_C, N, (void *) C, N );
gettimeofday(&timev2,NULL);
time_seconds=timev2.tv_sec-timev1.tv_sec+0.000001*(timev2.tv_usec-timev1.tv_usec);
cublasFree (d_A);
cublasFree (d_B);
cublasFree (d_C);
cublasShutdown();
printf(“Elapsed time = %.4f\n”,time_seconds);
printf ( “C[0,0]=%f\n”, C[0] );
}[/codebox]
Then I compile it in the usual way:
bash-3.2$ make
g++ -O3 -I/usr/local/cuda/include -c main.cpp
g++ -L/usr/local/cuda/lib64 -lcublas -o MatMul main.o
But repeated runs of the program give different results.
In fact, each run adds ~20 (the correct result of the multiplication) to the previous result:
bash-3.2$ make
g++ -O3 -I/usr/local/cuda/include -c main.cpp
g++ -L/usr/local/cuda/lib64 -lcublas -o MatMul main.o
bash-3.2$ ./MatMul 1024
N = 1024
Elapsed time = 0.0081
C[0,0]=61.626236
bash-3.2$ ./MatMul 1024
N = 1024
Elapsed time = 0.0082
C[0,0]=82.106514
bash-3.2$ ./MatMul 1024
N = 1024
Elapsed time = 0.0079
C[0,0]=102.586792
bash-3.2$ make
g++ -O3 -I/usr/local/cuda/include -c main.cpp
g++ -L/usr/local/cuda/lib64 -lcublas -o MatMul main.o
bash-3.2$ ./MatMul 1024
N = 1024
Elapsed time = 0.0081
C[0,0]=123.067070
bash-3.2$ ./MatMul 1024
N = 1024
Elapsed time = 0.0078
C[0,0]=143.547348
bash-3.2$ ./MatMul 1024
N = 1024
Elapsed time = 0.0079
C[0,0]=164.027618
What is wrong with my program?
Thanks.