Hi, all
I found that after calling “cublasSgemv”, calling function “cublasSdot” or “cublasSnrm2” will be extremely slow !!!
my test function is very simple, after setting values for matrix dev_A and vector dev_x, firstly compute
dev_v= dev_A * dev_x;
then compute dev_v dot product dev_v;
my result is
GPU sgemv time:0.038000
GPU sdot time:11.852000
here are my codes:
#include <cuda.h>
#include <stdio.h>
#include <cutil.h>
#include <cublas.h>
#include <cuda_runtime_api.h>
#define size 10000
// Global variables
float A;
float x;
// random generator
void matrixGen(float* A, int m, int n)
{
for (int i=0; i<m; i++){
for (int j=0; j<n; j++)
{
*(A+i*n+j)=(float)rand()/RAND_MAX;
}
}
}
main(){
float * dev_A;
float * dev_x;
float * dev_v;
float beta;
int N=size;
cublasStatus stat0;
matrixGen(A, N, N);
matrixGen(x, N, 1);
cublasInit();
// allocate memory and set variable
stat0=cublasAlloc(N*N, sizeof(float), (void**)&dev_A);
stat0=cublasAlloc(N, sizeof(float), (void**)&dev_v);
stat0=cublasAlloc(N, sizeof(float), (void**)&dev_x);
stat0=cublasSetMatrix(N, N, sizeof(float), A, N, dev_A, N);
stat0=cublasSetVector(N, sizeof(float), x, 1, dev_x, 1);
unsigned int timer=0;
CUT_SAFE_CALL(cutCreateTimer(&timer));
CUT_SAFE_CALL(cutStartTimer(timer));
cublasSgemv('T', N, N, 1.0, dev_A, N, dev_x, 1, 0, dev_v, 1);
CUT_SAFE_CALL(cutStopTimer(timer));
printf("\n\n GPU sgemv time:%f\n", cutGetTimerValue(timer));
CUT_SAFE_CALL(cutDeleteTimer(timer));
timer=0;
CUT_SAFE_CALL(cutCreateTimer(&timer));
CUT_SAFE_CALL(cutStartTimer(timer));
beta=cublasSdot(N, dev_v, 1, dev_v, 1);
CUT_SAFE_CALL(cutStopTimer(timer));
printf("\n\n GPU sdot time:%f\n", cutGetTimerValue(timer));
CUT_SAFE_CALL(cutDeleteTimer(timer));
}
does any expert have any idea why is that?
many thanks in advance!