This is the BLAS code…
[codebox]#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include “MATgiulio.h”
#include <cblas.h>
void BLAS_BandMinRes(float *ACD, int N, int HBand,float *b,float *x0, float *xMR, float tol, int n_max){
// ALL DEVIDE VARIABLES ALLOWED!!!
// ACD is a compact BAND MATRIX...USE cublasSgbmv() TO USE IT!!
float *sup0;//, *sup1, *sup2;
sup0 =(float *) calloc( N,sizeof(float) ); MATcheckMallocStatus(sup0);
enum CBLAS_ORDER order;
enum CBLAS_UPLO uplo;
order = CblasRowMajor;
uplo = CblasLower;
float *v, *v_hat, *v_old, *w, *w_old, *w_oold;
v = (float *) calloc( N*1,sizeof(float) ); MATcheckMallocStatus(v);
v_hat = (float *) calloc( N*1,sizeof(float) ); MATcheckMallocStatus(v_hat);
v_old = (float *) calloc( N*1,sizeof(float) ); MATcheckMallocStatus(v_old);
w = (float *) calloc( N*1,sizeof(float) ); MATcheckMallocStatus(w);
w_old = (float *) calloc( N*1,sizeof(float) ); MATcheckMallocStatus(w_old);
w_oold = (float *) calloc( N*1,sizeof(float) ); MATcheckMallocStatus(w_oold);
float beta, c, c_old, s, s_old, eta, norm_r0, norm_rMR, alpha;
float beta_old, c_oold, s_oold, r1_hat, r1, r2, r3;
int n = 1;
// v = zeros(N,1);
cblas_sscal (N, 0, v, 1);
cblas_sscal (N, 0, v_old, 1);
cblas_sscal (N, 0, w, 1);
cblas_sscal (N, 0, w_old, 1);
cblas_scopy (N, b, 1, v_hat, 1); // v_hat=b;
cblas_ssbmv(order, uplo, N, HBand-1, -1, ACD, HBand, x0, 1, 1, v_hat, 1);
beta = cblas_snrm2(N,v_hat,1); // beta = norm(v_hat);
c = 1;
c_old = 1;
s = 0;
s_old = 0;
// w = zeros(N,1);
cblas_scopy (N, b, 1, w_old, 1); // w_old = b;
eta = beta;
cblas_scopy (N, x0, 1, xMR, 1); //xMR = x0;
norm_rMR = beta;
norm_r0 = beta;
while ( (n<n_max+1) && ( (norm_rMR/norm_r0) > tol) ){
n++;
//-Calculate Lanczos Vectors
cblas_scopy(N, v, 1, v_old, 1); //v_old = v;
//v = v_hat/beta;
cblas_scopy (N, v_hat, 1, v, 1); // v = v_hat;
cblas_sscal(N,(1/beta),v,1);
printf("\n v is: \n");
MATprint(v,1,N);
// alpha = v'*A*v;
cblas_ssbmv(order, uplo, N, HBand-1, 1, ACD, HBand, v, 1, 0, sup0, 1);
alpha= cblas_sdot(N, sup0, 1,v, 1); // alpha = dot(sup0,sup0)=sup0'*sup0
//v_hat = A*v - alpha*v - beta*v_old;
cblas_saxpy(N, -alpha, v, 1, sup0, 1); // sup0 = -alpha*v +sup0 ----- sup0 was A*v
cblas_saxpy(N, -beta, v_old, 1, sup0, 1); // sup0 = -beta*v_old +sup0
cblas_scopy(N, sup0, 1, v_hat, 1); //v_hat = sup0;
beta_old = beta; // beta_old = beta;
beta = cblas_snrm2(N,v_hat,1); // beta = norm(v_hat);
//-Calculate QR Factors
c_oold = c_old;
c_old = c;
s_oold = s_old;
s_old = s;
r1_hat = c_old*alpha - c_oold*s_old*beta_old;
r1 = sqrt( r1_hat*r1_hat + beta*beta );
r2 = s_old*alpha + c_oold*c_old*beta_old;
r3 = s_oold*beta_old;
//-Calculate New Givens Rotations
c = r1_hat/r1;
s = beta/r1;
//-Update Solution
cblas_scopy (N, w_old, 1, w_oold, 1); // w_oold = w_old;
cblas_scopy (N, w, 1, w_old, 1); // w_old = w;
// w= (v - r3*w_oold - r2*w_old)/r1 ;
cblas_sscal(N, 0, w, 1); // w = 0;
cblas_saxpy(N, (1/r1), v, 1, w, 1); // w = (1/r1)*v +w
cblas_saxpy(N, -(r3/r1), w_oold, 1, w, 1);// w = -(r3/r1)*w_oold + w
cblas_saxpy(N, -(r2/r1), w_old, 1, w, 1); // w = -(r2/r1)*w_old + w
//xMR = xMR + c*eta*w;
cblas_saxpy(N, c*eta, w, 1, xMR, 1); // xMR = xMR + c*eta*w;
//printf("\n abs(s) is: %g \n \n", abs(s) );
//ABS DOESNT WORK
//norm_rMR = norm_rMR*abs(s);
if (s>0)
norm_rMR = norm_rMR*s;
else
norm_rMR = norm_rMR*(-s);
eta = -s*eta;
//printf("\n norm_rMR is: %g \n \n", norm_rMR );
//printf("\n norm_rMR/norm_r0 is: %g \n \n", norm_rMR/norm_r0 );
}
free(sup0);
free(v);
free(v_hat);
free(v_old);
free(w);
free(w_old);
free(w_oold);
}
[/codebox]
and this on the CUBLAS
[codebox]#include <stdio.h>
#include “MATgiulio.cuh”
#include <cublas.h>
void checkCublasStatus(cublasStatus stat){
if (stat != CUBLAS_STATUS_SUCCESS) {
printf("CUBLAS : operation completed without successfully!!");
scanf("ERROR!");
cublasShutdown();
//EXIT_FAILURE);
}
}
void CUBLAS_BandMinRes(float *ACD, int N, int HBand,float *b,float *x0, float *xMR, float tol, int n_max){
// ALL DEVIDE VARIABLES ALLOWED!!!
// ACD is a compact BAND MATRIX...USE cublasSgbmv() TO USE IT!!
cublasStatus status;
float *sup0;
status = cublasAlloc(N*1, sizeof(float), (void**)&sup0); checkCublasStatus(status);
float *v, *v_hat, *v_old, *w, *w_old, *w_oold;
status = cublasAlloc(N*1, sizeof(float), (void**)&v); checkCublasStatus(status);
status = cublasAlloc(N*1, sizeof(float), (void**)&v_hat); checkCublasStatus(status);
status = cublasAlloc(N*1, sizeof(float), (void**)&v_old); checkCublasStatus(status);
status = cublasAlloc(N*1, sizeof(float), (void**)&w); checkCublasStatus(status);
status = cublasAlloc(N*1, sizeof(float), (void**)&w_old); checkCublasStatus(status);
status = cublasAlloc(N*1, sizeof(float), (void**)&w_oold); checkCublasStatus(status);
float beta, c, c_old, s, s_old, eta, norm_r0, norm_rMR, alpha;
float beta_old, c_oold, s_oold, r1_hat, r1, r2, r3;
int n = 1;
// v = zeros(N,1);
cublasSscal (N, 0, v, 1);
cublasSscal (N, 0, v_old, 1);
cublasSscal (N, 0, w, 1);
cublasSscal (N, 0, w_old, 1);
cublasScopy (N, b, 1, v_hat, 1); // v_hat=b;
cublasSsbmv('U', N, HBand-1, -1, ACD, HBand, x0, 1, 1, v_hat, 1); // v_hat= -A*x0 + v_hat;
beta = cublasSnrm2(N,v_hat,1); // beta = norm(v_hat);
c = 1;
c_old = 1;
s = 0;
s_old = 0;
// w = zeros(N,1);
cublasScopy (N, b, 1, w_old, 1); // w_old = b;
eta = beta;
cublasScopy (N, x0, 1, xMR, 1); //xMR = x0;
norm_rMR = beta;
norm_r0 = beta;
while ( (n<n_max+1) && ( (norm_rMR/norm_r0) > tol) ){
n++;
//-Calculate Lanczos Vectors
cublasScopy(N, v, 1, v_old, 1); //v_old = v;
//v = v_hat/beta;
cublasScopy (N, v_hat, 1, v, 1); // v = v_hat;
cublasSscal(N,(1/beta),v,1);
// alpha = v'*A*v;
cublasSsbmv('U', N, HBand-1, 1, ACD, HBand, v, 1, 0, sup0, 1); // sup0= A*v;
alpha= cublasSdot(N, sup0, 1,v, 1); // alpha = dot(sup0,sup0)=sup0'*sup0
//printf("\n alpha is: %g \n \n", alpha);
//v_hat = A*v - alpha*v - beta*v_old;
cublasSaxpy(N, -alpha, v, 1, sup0, 1); // sup0 = -alpha*v +sup0 ----- sup0 was A*v
cublasSaxpy(N, -beta, v_old, 1, sup0, 1); // sup0 = -beta*v_old +sup0
cublasScopy(N, sup0, 1, v_hat, 1); //v_hat = sup0;
beta_old = beta; // beta_old = beta;
beta = cublasSnrm2(N,v_hat,1); // beta = norm(v_hat);
//printf("\n beta is: %g \n \n", beta);
//printf("\n beta_old is: %g \n \n", beta_old);
//-Calculate QR Factors
c_oold = c_old;
c_old = c;
s_oold = s_old;
s_old = s;
r1_hat = c_old*alpha - c_oold*s_old*beta_old;
r1 = sqrt( r1_hat*r1_hat + beta*beta );
r2 = s_old*alpha + c_oold*c_old*beta_old;
r3 = s_oold*beta_old;
//-Calculate New Givens Rotations
c = r1_hat/r1;
s = beta/r1;
//-Update Solution
cublasScopy (N, w_old, 1, w_oold, 1); // w_oold = w_old;
cublasScopy (N, w, 1, w_old, 1); // w_old = w;
// w= (v - r3*w_oold - r2*w_old)/r1 ;
cublasSscal(N, 0, w, 1); // w = 0;
cublasSaxpy(N, (1/r1), v, 1, w, 1); // w = (1/r1)*v +w
cublasSaxpy(N, -(r3/r1), w_oold, 1, w, 1);// w = -(r3/r1)*w_oold + w
cublasSaxpy(N, -(r2/r1), w_old, 1, w, 1); // w = -(r2/r1)*w_old + w
//xMR = xMR + c*eta*w;
cublasSaxpy(N, c*eta, w, 1, xMR, 1); // xMR = xMR + c*eta*w;
norm_rMR = norm_rMR*abs(s);
eta = -s*eta;
}
cublasFree(sup0);
cublasFree(v);
cublasFree(v_hat);
cublasFree(v_old);
cublasFree(w);
cublasFree(w_old);
cublasFree(w_oold);
}
[/codebox]
what do u think about the code? Is it reasonable?
Thank