Hi,

I ran the following code and noticed that the execution time changes: sometimes it is 0 and sometimes it is 0.001 (too much for that code!). Does anyone know why this happens?

Do you think I could improve the performance of the code by storing the scalar variables on the device as well?

```
// All the matrices and vectors are stored on the device.
// All the scalars are stored on the host.
.
.
// Time one pass of the vector recurrence below: v_old/v/v_hat are rotated,
// alpha and beta are recomputed, and the elapsed time is printed.
// NOTE(review): the update pattern looks like a single Lanczos iteration — confirm.
// NOTE(review): clock() has a coarse, platform-dependent tick; an elapsed value
// that flips between 0 and 0.001 is consistent with a ~1 ms tick rather than a
// real change in run time — confirm with a higher-resolution timer or by
// averaging over many iterations.
clock_t start = clock();
cublasScopy(N, v, 1, v_old, 1); // v_old = v
// v = v_hat / beta, done as a copy followed by an in-place scale.
cublasScopy (N, v_hat, 1, v, 1); // v = v_hat
cublasSscal(N,(1/beta),v,1); // v = (1/beta) * v
// alpha = v' * A * v, computed as dot(A*v, v).
cublasSsbmv('U', N, K, 1, Ab, (2*K+1), v, 1, 0, sup0, 1); // sup0 = A*v (scale factors 1 and 0 are passed)
alpha= cublasSdot(N, sup0, 1,v, 1); // alpha = dot(sup0, v) = (A*v)' * v; result is returned to the host
// v_hat = A*v - alpha*v - beta*v_old, accumulated in sup0.
cublasSaxpy(N, -alpha, v, 1, sup0, 1); // sup0 = sup0 - alpha*v (sup0 held A*v)
cublasSaxpy(N, -beta, v_old, 1, sup0, 1); // sup0 = sup0 - beta*v_old
// NOTE(review): within this fragment sup0 is only scratch; writing the
// cublasSsbmv result straight into v_hat would seem to make this copy
// unnecessary — confirm nothing outside the fragment reads sup0.
cublasScopy(N, sup0, 1, v_hat, 1); // v_hat = sup0
beta_old = beta; // keep the previous beta before it is overwritten
beta = cublasSnrm2(N,v_hat,1); // beta = norm(v_hat); result is returned to the host
// Elapsed time in seconds since `start`, as measured by clock().
printf("\n Time elapsed : %f \n", ((double)clock() - start) / CLOCKS_PER_SEC);
.
.
.
```