Hi,

I implemented a MINRES linear solver using CUBLAS library, but the performance that I achieved are better than CPU_code only for really large matricies.

Can someone check my code and tell me if there are some bottleneck?

Thanks.

```
void GPU_BandMinRes(float *Ab, int N, int K,float *b,float *x0, float *xMR, float tol, int n_max){
// Ab, b, x0 xMR are device variable
// Ab is a compact BAND MATRIX
cublasStatus status;
float *sup0;
status = cublasAlloc(N*1, sizeof(float), (void**)&sup0); checkCublasStatus(status);
float *v, *v_hat, *v_old, *w, *w_old, *w_oold;
status = cublasAlloc(N*1, sizeof(float), (void**)&v); checkCublasStatus(status);
status = cublasAlloc(N*1, sizeof(float), (void**)&v_hat); checkCublasStatus(status);
status = cublasAlloc(N*1, sizeof(float), (void**)&v_old); checkCublasStatus(status);
status = cublasAlloc(N*1, sizeof(float), (void**)&w); checkCublasStatus(status);
status = cublasAlloc(N*1, sizeof(float), (void**)&w_old); checkCublasStatus(status);
status = cublasAlloc(N*1, sizeof(float), (void**)&w_oold); checkCublasStatus(status);
float beta, c, c_old, s, s_old, eta, norm_r0, norm_rMR, alpha;
float beta_old, c_oold, s_oold, r1_hat, r1, r2, r3;
int n = 1;
cublasScopy(N, b, 1, v_hat, 1); // v_hat=b;
cublasSgbmv('N', N, N, K, K, -1 , Ab, (2*K+1), x0, 1, 1, v_hat, 1); // v_hat= -A*x0 + v_hat;
beta = cublasSnrm2(N,v_hat,1); // beta = norm(v_hat);
c = 1;
c_old = 1;
s = 0;
s_old = 0;
cublasScopy (N, b, 1, w_old, 1); // w_old = b;
eta = beta;
cublasScopy (N, x0, 1, xMR, 1); //xMR = x0;
norm_rMR = beta;
norm_r0 = beta;
while ( (n<n_max+1) && ( (norm_rMR/norm_r0) > tol) ){
n++;
//-Calculate Lanczos Vectors
cublasScopy(N, v, 1, v_old, 1); //v_old = v;
//v = v_hat/beta;
cublasScopy (N, v_hat, 1, v, 1); // v = v_hat;
cublasSscal(N,(1/beta),v,1);
// alpha = v'*A*v;
cublasSgbmv('N', N, N, K, K, 1 , Ab, (2*K+1), v, 1, 0, sup0, 1); // sup0= A*v;
alpha= cublasSdot(N, sup0, 1,v, 1); // alpha = dot(sup0,sup0)=sup0'*sup0
//v_hat = A*v - alpha*v - beta*v_old;
cublasSaxpy(N, -alpha, v, 1, sup0, 1); // sup0 = -alpha*v +sup0 ----- sup0 was A*v
cublasSaxpy(N, -beta, v_old, 1, sup0, 1); // sup0 = -beta*v_old +sup0
cublasScopy(N, sup0, 1, v_hat, 1); //v_hat = sup0;
beta_old = beta; // beta_old = beta;
beta = cublasSnrm2(N,v_hat,1); // beta = norm(v_hat);
//-Calculate QR Factors
c_oold = c_old;
c_old = c;
s_oold = s_old;
s_old = s;
r1_hat = c_old*alpha - c_oold*s_old*beta_old;
r1 = sqrt( r1_hat*r1_hat + beta*beta );
r2 = s_old*alpha + c_oold*c_old*beta_old;
r3 = s_oold*beta_old;
//-Calculate New Givens Rotations
c = r1_hat/r1;
s = beta/r1;
//-Update Solution
cublasScopy (N, w_old, 1, w_oold, 1); // w_oold = w_old;
cublasScopy (N, w, 1, w_old, 1); // w_old = w;
// w= (v - r3*w_oold - r2*w_old)/r1;
cublasSscal(N, 0, w, 1); // w = 0;
cublasSaxpy(N, (1/r1), v, 1, w, 1); // w = (1/r1)*v +w
cublasSaxpy(N, -(r3/r1), w_oold, 1, w, 1);// w = -(r3/r1)*w_oold + w
cublasSaxpy(N, -(r2/r1), w_old, 1, w, 1); // w = -(r2/r1)*w_old + w
//xMR = xMR + c*eta*w;
cublasSaxpy(N, c*eta, w, 1, xMR, 1); // xMR = xMR + c*eta*w;
norm_rMR = norm_rMR*abs(s);
eta = -s*eta;
}
cublasFree(sup0);
cublasFree(v);
cublasFree(v_hat);
cublasFree(v_old);
cublasFree(w);
cublasFree(w_old);
cublasFree(w_oold);
printf( " \n GPUBandMinRes Done!..." );
}
```