Hello to everyone,
I wrote the below code and it seems to be slow…Can you find some bottleneck? Thanks!
[codebox]while ( (n<n_max+1) && ( (norm_rMR/norm_r0) > tol) ){
n++;
//-Calculate Lanczos Vectors
// alpha = v'*A*v; v=v_hat/beta
cublasSsbmv('U', N, K, 1/(beta), Ab, (K+1), v_hat, 1, 0, sup0, 1); // sup0= A*v;
alpha= cublasSdot(N, v_hat, 1,sup0, 1)*(1/beta); // alpha = dot(v,sup0)=v*sup0
beta_old = beta; // beta_old = beta;
lanczos<<<numBlocks, threadsPerBlock>>>(sup0, v_old, v_hat, v, alpha, beta, N);
beta = cublasSnrm2(N,v_hat,1); // beta = norm(v_hat);
//-Calculate QR Factors
c_oold = c_old;
c_old = c;
s_oold = s_old;
s_old = s;
r1_hat = c_old*alpha - c_oold*s_old*beta_old;
r1 = sqrt( r1_hat*r1_hat + beta*beta );
r2 = s_old*alpha + c_oold*c_old*beta_old;
r3 = s_oold*beta_old;
//-Calculate New Givens Rotations
c = r1_hat/r1;
s = beta/r1;
//-Update Solution
update<<<numBlocks, threadsPerBlock>>>(w_oold, w_old, w, v, xMR, r1,r2, r3, c, eta,N);
eta = -s*eta;
}
global void update(float* w_oold, float* w_old, float* w, float* v, float* xMR, float r1, float r2, float r3, float c, float eta,int N) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
//int i = threadIdx.x;
if (i<N){
w_oold[i] = w_old[i];
w_old[i] = w[i];
w[i] = (v[i] - r3*w_oold[i] -r2*w_old[i])/r1;
xMR[i] = xMR[i] + c*eta*w[i];
}
}
global void lanczos(float* Av, float* v_old, float* v_hat, float* v, float alpha, float beta, int N) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
//int i = threadIdx.x;
if (i<N){
v_old[i] = v[i];
v[i] = v_hat[i]/beta;
v_hat[i] = Av[i] - alpha*v[i] - beta*v_old[i];
}
}[/codebox]