CUBLAS problem: some BLAS level-1 functions are extremely slow!

Hi, all

I found that after calling “cublasSgemv”, a call to “cublasSdot” or “cublasSnrm2” is extremely slow!

My test function is very simple: after setting values for the matrix dev_A and the vector dev_x, it first computes

dev_v= dev_A * dev_x;

and then computes the dot product of dev_v with itself.

My results are:

GPU sgemv time:0.038000

GPU sdot time:11.852000

Here is my code:

#include <cuda.h>

#include <stdio.h>

#include <cutil.h>

#include <cublas.h>

#include <cuda_runtime_api.h>

#define size 10000

// Global variables (host-side input data)

  // size x size matrix kept as one flat array of floats, so it can be passed
  // wherever a float* is expected (matrixGen, cublasSetMatrix).
  float A[size*size];

  // Input vector with size elements.
  float x[size];

// random generator

  // Fills the m*n floats starting at A with pseudo-random values.
  //   A : destination buffer, must hold at least m*n floats
  //   m : number of rows
  //   n : number of columns
  // Each element is rand() scaled by RAND_MAX. Elements are written one after
  // another, row by row, so element (i, j) ends up at A[i*n + j].
  void matrixGen(float* A, int m, int n)
  {
    float* out = A;
    for (int row = 0; row < m; ++row) {
      for (int col = 0; col < n; ++col) {
        *out++ = (float)rand() / RAND_MAX;
      }
    }
  }

// Test driver: fills the host data A and x with random values, uploads them
// through the CUBLAS helper functions, then wraps one cublasSgemv call and one
// cublasSdot call in separate cutil timers and prints both timer values.
main(){

float * dev_A;

  float * dev_x;

  float * dev_v;

  float beta;

  int N=size;

  // stat0 receives the status of every CUBLAS allocation/copy call below,
  // but its value is never inspected.
  cublasStatus stat0;

// Generate an N x N matrix in A and an N x 1 vector in x on the host.
matrixGen(A, N, N);

  matrixGen(x, N, 1);

  cublasInit();

// Allocate the device buffers (matrix, result vector, input vector) and copy
// the host matrix and vector into them.

stat0=cublasAlloc(N*N, sizeof(float), (void**)&dev_A);

  stat0=cublasAlloc(N, sizeof(float), (void**)&dev_v);

  stat0=cublasAlloc(N, sizeof(float), (void**)&dev_x);

  stat0=cublasSetMatrix(N, N, sizeof(float), A, N, dev_A, N);

  stat0=cublasSetVector(N, sizeof(float), x, 1, dev_x, 1);

// Timed region 1: the timer brackets only the cublasSgemv call itself; there
// is no synchronization call between cublasSgemv and cutStopTimer.
unsigned int  timer=0;

 CUT_SAFE_CALL(cutCreateTimer(&timer));

 CUT_SAFE_CALL(cutStartTimer(timer));

 // NOTE(review): 'T' selects the transposed operation in the BLAS sgemv
 // convention, while the accompanying description says dev_A * dev_x —
 // confirm which one is intended.
 cublasSgemv('T', N, N, 1.0, dev_A, N, dev_x, 1, 0, dev_v, 1);

CUT_SAFE_CALL(cutStopTimer(timer));

 printf("\n\n GPU sgemv time:%f\n", cutGetTimerValue(timer));

 CUT_SAFE_CALL(cutDeleteTimer(timer));

// Timed region 2: a fresh timer around the dot product of dev_v with itself.
timer=0;

 CUT_SAFE_CALL(cutCreateTimer(&timer));

 CUT_SAFE_CALL(cutStartTimer(timer));

// The result is returned to the host in beta; beta is not used afterwards.
beta=cublasSdot(N, dev_v, 1, dev_v, 1);

CUT_SAFE_CALL(cutStopTimer(timer));

 printf("\n\n GPU sdot time:%f\n", cutGetTimerValue(timer));

 CUT_SAFE_CALL(cutDeleteTimer(timer));

// NOTE(review): dev_A, dev_x and dev_v are never released and no CUBLAS
// shutdown call is made before main ends.
}

Does anyone have an idea why that is?

Many thanks in advance!

Your timing is wrong. CUBLAS runs asynchronously with respect to the host, just like regular CUDA kernels. To time your SGEMV call correctly, do this:

CUT_SAFE_CALL(cutCreateTimer(&timer));

CUT_SAFE_CALL(cutStartTimer(timer));

cublasSgemv('T', N, N, 1.0, dev_A, N, dev_x, 1, 0, dev_v, 1);

// Block the host until the GPU has finished the SGEMV, so that the timer is
// stopped only after the work is actually done rather than right after the
// asynchronous call returns.
cudaThreadSynchronize();

CUT_SAFE_CALL(cutStopTimer(timer));

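What you measured as the running time of SDOT is mostly the running time of SGEMV: cublasSdot returns its result to the host, so it has to wait for the still-running SGEMV to finish first.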

Oh, I see. Many thanks for pointing that out!