CUBLAS vs GOTOBLAS2

Hello to everyone,

I have implemented a linear solver with gotoblas2 library and now I am moving to CUBLAS(just changing single blas call to CUBLAS call) for very large system like 7000 of dimension.
I was surprised that gotoblas2 is still much faster than CUBLAS…Is it reasonable?

Thanks

Depends on which BLAS call you are using. If it is SGEMM or a similar BLAS3 call, I would be very surprised. If it is SAXPY or another BLAS1 call, not so much.

Depends on which BLAS call you are using. If it is SGEMM or a similar BLAS3 call, I would be very surprised. If it is SAXPY or another BLAS1 call, not so much.

This is the BLAS code…

[codebox]#include <stdio.h>

#include <stdlib.h>

#include <math.h>

#include “MATgiulio.h”

#include <cblas.h>

void BLAS_BandMinRes(float *ACD, int N, int HBand,float *b,float *x0, float *xMR, float tol, int n_max){

// ALL DEVIDE VARIABLES ALLOWED!!!	

// ACD is a compact BAND MATRIX...USE cublasSgbmv() TO USE IT!!

float *sup0;//, *sup1, *sup2;

sup0 =(float *) calloc( N,sizeof(float) ); MATcheckMallocStatus(sup0);	



enum CBLAS_ORDER order;

enum CBLAS_UPLO uplo; 		

order = CblasRowMajor;

uplo = CblasLower;

float *v, *v_hat, *v_old, *w, *w_old, *w_oold;

v      = (float *) calloc( N*1,sizeof(float) ); MATcheckMallocStatus(v);	

v_hat  = (float *) calloc( N*1,sizeof(float) ); MATcheckMallocStatus(v_hat);	

v_old  = (float *) calloc( N*1,sizeof(float) ); MATcheckMallocStatus(v_old);	

w      = (float *) calloc( N*1,sizeof(float) ); MATcheckMallocStatus(w);	

w_old  = (float *) calloc( N*1,sizeof(float) ); MATcheckMallocStatus(w_old);	

w_oold = (float *) calloc( N*1,sizeof(float) ); MATcheckMallocStatus(w_oold);	

float beta, c, c_old, s, s_old, eta, norm_r0, norm_rMR, alpha;

float beta_old, c_oold, s_oold, r1_hat, r1, r2, r3;



int n = 1;

// v = zeros(N,1);

cblas_sscal (N, 0, v, 1);

cblas_sscal (N, 0, v_old, 1);

cblas_sscal (N, 0, w, 1);

cblas_sscal (N, 0, w_old, 1);

cblas_scopy (N, b, 1, v_hat, 1); // v_hat=b;

cblas_ssbmv(order, uplo, N, HBand-1, -1, ACD, HBand, x0, 1, 1, v_hat, 1);	



beta = cblas_snrm2(N,v_hat,1); // beta = norm(v_hat);

c	  = 1;

c_old = 1;

s	  = 0;

s_old = 0;

// w = zeros(N,1);

cblas_scopy (N, b, 1, w_old, 1); // w_old = b;

eta = beta;

cblas_scopy (N, x0, 1, xMR, 1);  //xMR = x0;

norm_rMR = beta;

norm_r0  = beta;



while ( (n<n_max+1) && ( (norm_rMR/norm_r0) > tol) ){

	n++;

//-Calculate Lanczos Vectors

	cblas_scopy(N, v, 1, v_old, 1); //v_old = v;		

	//v = v_hat/beta; 

	cblas_scopy (N, v_hat, 1, v, 1); // v = v_hat;

	cblas_sscal(N,(1/beta),v,1);

	printf("\n v is: \n");

	MATprint(v,1,N);

	// alpha = v'*A*v;

	cblas_ssbmv(order, uplo, N, HBand-1, 1, ACD, HBand, v, 1, 0, sup0, 1);	

	alpha= cblas_sdot(N, sup0, 1,v, 1); // alpha = dot(sup0,sup0)=sup0'*sup0

	//v_hat = A*v - alpha*v - beta*v_old;

	cblas_saxpy(N, -alpha, v, 1, sup0, 1); // sup0 = -alpha*v +sup0 ----- sup0 was A*v		

	cblas_saxpy(N, -beta, v_old, 1, sup0, 1); // sup0 = -beta*v_old +sup0

	cblas_scopy(N, sup0, 1, v_hat, 1); //v_hat = sup0;

	beta_old = beta;				// beta_old = beta;

	beta = cblas_snrm2(N,v_hat,1);  // beta = norm(v_hat);

//-Calculate QR Factors

	c_oold = c_old;

	c_old  = c;

	s_oold = s_old;

	s_old  = s;

	r1_hat = c_old*alpha - c_oold*s_old*beta_old;

	r1     = sqrt( r1_hat*r1_hat + beta*beta );

	r2	   = s_old*alpha + c_oold*c_old*beta_old;

	r3     = s_oold*beta_old;

//-Calculate New Givens Rotations

	c = r1_hat/r1;

	s = beta/r1;

//-Update Solution

	cblas_scopy (N, w_old, 1, w_oold, 1); // w_oold = w_old;

	cblas_scopy (N, w, 1, w_old, 1);	  // w_old  = w;



	// w= (v - r3*w_oold - r2*w_old)/r1 ;

	cblas_sscal(N, 0, w, 1);                  // w = 0;

	cblas_saxpy(N, (1/r1), v, 1, w, 1);       // w = (1/r1)*v +w

	cblas_saxpy(N, -(r3/r1), w_oold, 1, w, 1);// w = -(r3/r1)*w_oold + w

	cblas_saxpy(N, -(r2/r1), w_old, 1, w, 1); // w = -(r2/r1)*w_old + w

    //xMR = xMR + c*eta*w;

	cblas_saxpy(N, c*eta, w, 1, xMR, 1); // xMR = xMR + c*eta*w;

	//printf("\n abs(s) is: %g \n \n", abs(s) );

	

	//ABS DOESNT WORK

	//norm_rMR = norm_rMR*abs(s);

	if (s>0)

			norm_rMR = norm_rMR*s;

	else

			norm_rMR = norm_rMR*(-s);

	

			

	eta = -s*eta;

	//printf("\n norm_rMR is: %g \n \n", norm_rMR );

	//printf("\n norm_rMR/norm_r0 is: %g \n \n", norm_rMR/norm_r0 );

}



free(sup0);

free(v);

free(v_hat);

free(v_old);

free(w);

free(w_old);

free(w_oold);

}

[/codebox]

and this on the CUBLAS

[codebox]#include <stdio.h>

#include “MATgiulio.cuh”

#include <cublas.h>

void checkCublasStatus(cublasStatus stat){

if (stat != CUBLAS_STATUS_SUCCESS) {

	printf("CUBLAS : operation completed without successfully!!");

	scanf("ERROR!");

	cublasShutdown();

	//EXIT_FAILURE);

}

}

void CUBLAS_BandMinRes(float *ACD, int N, int HBand,float *b,float *x0, float *xMR, float tol, int n_max){

// ALL DEVIDE VARIABLES ALLOWED!!!	

// ACD is a compact BAND MATRIX...USE cublasSgbmv() TO USE IT!!

cublasStatus status;

float *sup0;

status = cublasAlloc(N*1, sizeof(float), (void**)&sup0); checkCublasStatus(status);



float *v, *v_hat, *v_old, *w, *w_old, *w_oold;

status = cublasAlloc(N*1, sizeof(float), (void**)&v);	   checkCublasStatus(status);

status = cublasAlloc(N*1, sizeof(float), (void**)&v_hat);  checkCublasStatus(status);

status = cublasAlloc(N*1, sizeof(float), (void**)&v_old);   checkCublasStatus(status);

status = cublasAlloc(N*1, sizeof(float), (void**)&w);   checkCublasStatus(status);

status = cublasAlloc(N*1, sizeof(float), (void**)&w_old);   checkCublasStatus(status);

status = cublasAlloc(N*1, sizeof(float), (void**)&w_oold);   checkCublasStatus(status);



float beta, c, c_old, s, s_old, eta, norm_r0, norm_rMR, alpha;

float beta_old, c_oold, s_oold, r1_hat, r1, r2, r3;



int n = 1;

// v = zeros(N,1);

cublasSscal (N, 0, v, 1);

cublasSscal (N, 0, v_old, 1);

cublasSscal (N, 0, w, 1);

cublasSscal (N, 0, w_old, 1);

cublasScopy (N, b, 1, v_hat, 1); // v_hat=b;

cublasSsbmv('U', N, HBand-1, -1, ACD, HBand, x0, 1, 1, v_hat, 1); // v_hat= -A*x0 + v_hat;

beta = cublasSnrm2(N,v_hat,1); // beta = norm(v_hat);

c	  = 1;

c_old = 1;

s	  = 0;

s_old = 0;

// w = zeros(N,1);

cublasScopy (N, b, 1, w_old, 1); // w_old = b;

eta = beta;

cublasScopy (N, x0, 1, xMR, 1);  //xMR = x0;

norm_rMR = beta;

norm_r0  = beta;





while ( (n<n_max+1) && ( (norm_rMR/norm_r0) > tol) ){

	n++;

//-Calculate Lanczos Vectors

	cublasScopy(N, v, 1, v_old, 1); //v_old = v;		

	//v = v_hat/beta; 

	cublasScopy (N, v_hat, 1, v, 1); // v = v_hat;

	cublasSscal(N,(1/beta),v,1);

	

	// alpha = v'*A*v;

	cublasSsbmv('U', N, HBand-1, 1, ACD, HBand, v, 1, 0, sup0, 1); // sup0= A*v;	 	

	alpha= cublasSdot(N, sup0, 1,v, 1); // alpha = dot(sup0,sup0)=sup0'*sup0

	//printf("\n alpha is: %g \n \n", alpha);

	//v_hat = A*v - alpha*v - beta*v_old;

	cublasSaxpy(N, -alpha, v, 1, sup0, 1); // sup0 = -alpha*v +sup0 ----- sup0 was A*v		

	cublasSaxpy(N, -beta, v_old, 1, sup0, 1); // sup0 = -beta*v_old +sup0

	cublasScopy(N, sup0, 1, v_hat, 1); //v_hat = sup0;



	beta_old = beta;				// beta_old = beta;

    	

	beta = cublasSnrm2(N,v_hat,1);  // beta = norm(v_hat);

//printf("\n beta is: %g \n \n", beta);

//printf("\n beta_old is: %g \n \n", beta_old);

//-Calculate QR Factors

	c_oold = c_old;

	c_old  = c;

	s_oold = s_old;

	s_old  = s;

	r1_hat = c_old*alpha - c_oold*s_old*beta_old;

	r1     = sqrt( r1_hat*r1_hat + beta*beta );

	r2	   = s_old*alpha + c_oold*c_old*beta_old;

	r3     = s_oold*beta_old;

//-Calculate New Givens Rotations

	c = r1_hat/r1;

	s = beta/r1;

//-Update Solution

	cublasScopy (N, w_old, 1, w_oold, 1); // w_oold = w_old;

	cublasScopy (N, w, 1, w_old, 1);	  // w_old  = w;



	// w= (v - r3*w_oold - r2*w_old)/r1 ;

	cublasSscal(N, 0, w, 1);                  // w = 0;

	cublasSaxpy(N, (1/r1), v, 1, w, 1);       // w = (1/r1)*v +w

	cublasSaxpy(N, -(r3/r1), w_oold, 1, w, 1);// w = -(r3/r1)*w_oold + w

	cublasSaxpy(N, -(r2/r1), w_old, 1, w, 1); // w = -(r2/r1)*w_old + w

    //xMR = xMR + c*eta*w;

	cublasSaxpy(N, c*eta, w, 1, xMR, 1); // xMR = xMR + c*eta*w;

	norm_rMR = norm_rMR*abs(s);

	eta = -s*eta;

}



cublasFree(sup0);

cublasFree(v);

cublasFree(v_hat);

cublasFree(v_old);

cublasFree(w);

cublasFree(w_old);

cublasFree(w_oold);

}

[/codebox]

what do u think about the code? Is it reasonable?

Thank

This is the BLAS code…

[codebox]#include <stdio.h>

#include <stdlib.h>

#include <math.h>

#include “MATgiulio.h”

#include <cblas.h>

void BLAS_BandMinRes(float *ACD, int N, int HBand,float *b,float *x0, float *xMR, float tol, int n_max){

// ALL DEVIDE VARIABLES ALLOWED!!!	

// ACD is a compact BAND MATRIX...USE cublasSgbmv() TO USE IT!!

float *sup0;//, *sup1, *sup2;

sup0 =(float *) calloc( N,sizeof(float) ); MATcheckMallocStatus(sup0);	



enum CBLAS_ORDER order;

enum CBLAS_UPLO uplo; 		

order = CblasRowMajor;

uplo = CblasLower;

float *v, *v_hat, *v_old, *w, *w_old, *w_oold;

v      = (float *) calloc( N*1,sizeof(float) ); MATcheckMallocStatus(v);	

v_hat  = (float *) calloc( N*1,sizeof(float) ); MATcheckMallocStatus(v_hat);	

v_old  = (float *) calloc( N*1,sizeof(float) ); MATcheckMallocStatus(v_old);	

w      = (float *) calloc( N*1,sizeof(float) ); MATcheckMallocStatus(w);	

w_old  = (float *) calloc( N*1,sizeof(float) ); MATcheckMallocStatus(w_old);	

w_oold = (float *) calloc( N*1,sizeof(float) ); MATcheckMallocStatus(w_oold);	

float beta, c, c_old, s, s_old, eta, norm_r0, norm_rMR, alpha;

float beta_old, c_oold, s_oold, r1_hat, r1, r2, r3;



int n = 1;

// v = zeros(N,1);

cblas_sscal (N, 0, v, 1);

cblas_sscal (N, 0, v_old, 1);

cblas_sscal (N, 0, w, 1);

cblas_sscal (N, 0, w_old, 1);

cblas_scopy (N, b, 1, v_hat, 1); // v_hat=b;

cblas_ssbmv(order, uplo, N, HBand-1, -1, ACD, HBand, x0, 1, 1, v_hat, 1);	



beta = cblas_snrm2(N,v_hat,1); // beta = norm(v_hat);

c	  = 1;

c_old = 1;

s	  = 0;

s_old = 0;

// w = zeros(N,1);

cblas_scopy (N, b, 1, w_old, 1); // w_old = b;

eta = beta;

cblas_scopy (N, x0, 1, xMR, 1);  //xMR = x0;

norm_rMR = beta;

norm_r0  = beta;



while ( (n<n_max+1) && ( (norm_rMR/norm_r0) > tol) ){

	n++;

//-Calculate Lanczos Vectors

	cblas_scopy(N, v, 1, v_old, 1); //v_old = v;		

	//v = v_hat/beta; 

	cblas_scopy (N, v_hat, 1, v, 1); // v = v_hat;

	cblas_sscal(N,(1/beta),v,1);

	printf("\n v is: \n");

	MATprint(v,1,N);

	// alpha = v'*A*v;

	cblas_ssbmv(order, uplo, N, HBand-1, 1, ACD, HBand, v, 1, 0, sup0, 1);	

	alpha= cblas_sdot(N, sup0, 1,v, 1); // alpha = dot(sup0,sup0)=sup0'*sup0

	//v_hat = A*v - alpha*v - beta*v_old;

	cblas_saxpy(N, -alpha, v, 1, sup0, 1); // sup0 = -alpha*v +sup0 ----- sup0 was A*v		

	cblas_saxpy(N, -beta, v_old, 1, sup0, 1); // sup0 = -beta*v_old +sup0

	cblas_scopy(N, sup0, 1, v_hat, 1); //v_hat = sup0;

	beta_old = beta;				// beta_old = beta;

	beta = cblas_snrm2(N,v_hat,1);  // beta = norm(v_hat);

//-Calculate QR Factors

	c_oold = c_old;

	c_old  = c;

	s_oold = s_old;

	s_old  = s;

	r1_hat = c_old*alpha - c_oold*s_old*beta_old;

	r1     = sqrt( r1_hat*r1_hat + beta*beta );

	r2	   = s_old*alpha + c_oold*c_old*beta_old;

	r3     = s_oold*beta_old;

//-Calculate New Givens Rotations

	c = r1_hat/r1;

	s = beta/r1;

//-Update Solution

	cblas_scopy (N, w_old, 1, w_oold, 1); // w_oold = w_old;

	cblas_scopy (N, w, 1, w_old, 1);	  // w_old  = w;



	// w= (v - r3*w_oold - r2*w_old)/r1 ;

	cblas_sscal(N, 0, w, 1);                  // w = 0;

	cblas_saxpy(N, (1/r1), v, 1, w, 1);       // w = (1/r1)*v +w

	cblas_saxpy(N, -(r3/r1), w_oold, 1, w, 1);// w = -(r3/r1)*w_oold + w

	cblas_saxpy(N, -(r2/r1), w_old, 1, w, 1); // w = -(r2/r1)*w_old + w

    //xMR = xMR + c*eta*w;

	cblas_saxpy(N, c*eta, w, 1, xMR, 1); // xMR = xMR + c*eta*w;

	//printf("\n abs(s) is: %g \n \n", abs(s) );

	

	//ABS DOESNT WORK

	//norm_rMR = norm_rMR*abs(s);

	if (s>0)

			norm_rMR = norm_rMR*s;

	else

			norm_rMR = norm_rMR*(-s);

	

			

	eta = -s*eta;

	//printf("\n norm_rMR is: %g \n \n", norm_rMR );

	//printf("\n norm_rMR/norm_r0 is: %g \n \n", norm_rMR/norm_r0 );

}



free(sup0);

free(v);

free(v_hat);

free(v_old);

free(w);

free(w_old);

free(w_oold);

}

[/codebox]

and this on the CUBLAS

[codebox]#include <stdio.h>

#include “MATgiulio.cuh”

#include <cublas.h>

void checkCublasStatus(cublasStatus stat){

if (stat != CUBLAS_STATUS_SUCCESS) {

	printf("CUBLAS : operation completed without successfully!!");

	scanf("ERROR!");

	cublasShutdown();

	//EXIT_FAILURE);

}

}

void CUBLAS_BandMinRes(float *ACD, int N, int HBand,float *b,float *x0, float *xMR, float tol, int n_max){

// ALL DEVIDE VARIABLES ALLOWED!!!	

// ACD is a compact BAND MATRIX...USE cublasSgbmv() TO USE IT!!

cublasStatus status;

float *sup0;

status = cublasAlloc(N*1, sizeof(float), (void**)&sup0); checkCublasStatus(status);



float *v, *v_hat, *v_old, *w, *w_old, *w_oold;

status = cublasAlloc(N*1, sizeof(float), (void**)&v);	   checkCublasStatus(status);

status = cublasAlloc(N*1, sizeof(float), (void**)&v_hat);  checkCublasStatus(status);

status = cublasAlloc(N*1, sizeof(float), (void**)&v_old);   checkCublasStatus(status);

status = cublasAlloc(N*1, sizeof(float), (void**)&w);   checkCublasStatus(status);

status = cublasAlloc(N*1, sizeof(float), (void**)&w_old);   checkCublasStatus(status);

status = cublasAlloc(N*1, sizeof(float), (void**)&w_oold);   checkCublasStatus(status);



float beta, c, c_old, s, s_old, eta, norm_r0, norm_rMR, alpha;

float beta_old, c_oold, s_oold, r1_hat, r1, r2, r3;



int n = 1;

// v = zeros(N,1);

cublasSscal (N, 0, v, 1);

cublasSscal (N, 0, v_old, 1);

cublasSscal (N, 0, w, 1);

cublasSscal (N, 0, w_old, 1);

cublasScopy (N, b, 1, v_hat, 1); // v_hat=b;

cublasSsbmv('U', N, HBand-1, -1, ACD, HBand, x0, 1, 1, v_hat, 1); // v_hat= -A*x0 + v_hat;

beta = cublasSnrm2(N,v_hat,1); // beta = norm(v_hat);

c	  = 1;

c_old = 1;

s	  = 0;

s_old = 0;

// w = zeros(N,1);

cublasScopy (N, b, 1, w_old, 1); // w_old = b;

eta = beta;

cublasScopy (N, x0, 1, xMR, 1);  //xMR = x0;

norm_rMR = beta;

norm_r0  = beta;





while ( (n<n_max+1) && ( (norm_rMR/norm_r0) > tol) ){

	n++;

//-Calculate Lanczos Vectors

	cublasScopy(N, v, 1, v_old, 1); //v_old = v;		

	//v = v_hat/beta; 

	cublasScopy (N, v_hat, 1, v, 1); // v = v_hat;

	cublasSscal(N,(1/beta),v,1);

	

	// alpha = v'*A*v;

	cublasSsbmv('U', N, HBand-1, 1, ACD, HBand, v, 1, 0, sup0, 1); // sup0= A*v;	 	

	alpha= cublasSdot(N, sup0, 1,v, 1); // alpha = dot(sup0,sup0)=sup0'*sup0

	//printf("\n alpha is: %g \n \n", alpha);

	//v_hat = A*v - alpha*v - beta*v_old;

	cublasSaxpy(N, -alpha, v, 1, sup0, 1); // sup0 = -alpha*v +sup0 ----- sup0 was A*v		

	cublasSaxpy(N, -beta, v_old, 1, sup0, 1); // sup0 = -beta*v_old +sup0

	cublasScopy(N, sup0, 1, v_hat, 1); //v_hat = sup0;



	beta_old = beta;				// beta_old = beta;

    	

	beta = cublasSnrm2(N,v_hat,1);  // beta = norm(v_hat);

//printf("\n beta is: %g \n \n", beta);

//printf("\n beta_old is: %g \n \n", beta_old);

//-Calculate QR Factors

	c_oold = c_old;

	c_old  = c;

	s_oold = s_old;

	s_old  = s;

	r1_hat = c_old*alpha - c_oold*s_old*beta_old;

	r1     = sqrt( r1_hat*r1_hat + beta*beta );

	r2	   = s_old*alpha + c_oold*c_old*beta_old;

	r3     = s_oold*beta_old;

//-Calculate New Givens Rotations

	c = r1_hat/r1;

	s = beta/r1;

//-Update Solution

	cublasScopy (N, w_old, 1, w_oold, 1); // w_oold = w_old;

	cublasScopy (N, w, 1, w_old, 1);	  // w_old  = w;



	// w= (v - r3*w_oold - r2*w_old)/r1 ;

	cublasSscal(N, 0, w, 1);                  // w = 0;

	cublasSaxpy(N, (1/r1), v, 1, w, 1);       // w = (1/r1)*v +w

	cublasSaxpy(N, -(r3/r1), w_oold, 1, w, 1);// w = -(r3/r1)*w_oold + w

	cublasSaxpy(N, -(r2/r1), w_old, 1, w, 1); // w = -(r2/r1)*w_old + w

    //xMR = xMR + c*eta*w;

	cublasSaxpy(N, c*eta, w, 1, xMR, 1); // xMR = xMR + c*eta*w;

	norm_rMR = norm_rMR*abs(s);

	eta = -s*eta;

}



cublasFree(sup0);

cublasFree(v);

cublasFree(v_hat);

cublasFree(v_old);

cublasFree(w);

cublasFree(w_old);

cublasFree(w_oold);

}

[/codebox]

what do u think about the code? Is it reasonable?

Thank

cublasSnrm2 is normally slower than any CPU version because it has to do a device to host copy to return the result. This incurs a lot of overhead, making it slower than the CPU. In general, all the level 1 BLAS functions don’t benefit much from CUDA because they don’t have many floating point operations to each memory access.

cublasSnrm2 is normally slower than any CPU version because it has to do a device to host copy to return the result. This incurs a lot of overhead, making it slower than the CPU. In general, all the level 1 BLAS functions don’t benefit much from CUDA because they don’t have many floating point operations to each memory access.

This means that I can just replace the matrix-vector multiplication “cblas_ssbmv” with the cublas one…but I am working with vector of dimension 7000!

The norm computation is just a parallel reduction…why should it be so slow?

This means that I can just replace the matrix-vector multiplication “cblas_ssbmv” with the cublas one…but I am working with vector of dimension 7000!

The norm computation is just a parallel reduction…why should it be so slow?

A reduction on 7000 entries is really a couple of orders of magnitude too small to get much speed up on the GPU, the total execution time becomes dominated by fixed latencies like the PCI-e bus transfer. The most computationally expensive call in your code is probably the ssbmv call, but even that really resolves to a few saxpy operations on the diagonals.

Your best hope is to fuse some of the operations together to reduce a lot of the fixed costs. Doing this

cublasSaxpy(N, -alpha, v, 1, sup0, 1); // sup0 = -alpha*v +sup0 ----- sup0 was A*v

cublasSaxpy(N, -beta, v_old, 1, sup0, 1); // sup0 = -beta*v_old +sup0

cublasScopy(N, sup0, 1, v_hat, 1); //v_hat = sup0;

and this

cublasScopy (N, w_old, 1, w_oold, 1); // w_oold = w_old;

cublasScopy (N, w, 1, w_old, 1); // w_old = w;

cublasSscal(N, 0, w, 1); // w = 0;

cublasSaxpy(N, (1/r1), v, 1, w, 1); // w = (1/r1)*v +w

cublasSaxpy(N, -(r3/r1), w_oold, 1, w, 1);// w = -(r3/r1)*w_oold + w

cublasSaxpy(N, -(r2/r1), w_old, 1, w, 1); // w = -(r2/r1)*w_old + w

//xMR = xMR + c*eta*w;

cublasSaxpy(N, c*eta, w, 1, xMR, 1); // xMR = xMR + c*eta*w;

is very inefficient compared to using a single kernel launch for each which fuses the computation and copy/assignment operations. In both cases only requires a two minimal kernels with a handful of lines of code each. Krylov subspace methods can be done efficiently with good speed up on GPUs if the problem is large, you have the matrix in an efficient sparse format, and the “housekeeping” calculations can be done efficiently with minimal overhead. I think this code probably fails on all three counts.

A reduction on 7000 entries is really a couple of orders of magnitude too small to get much speed up on the GPU, the total execution time becomes dominated by fixed latencies like the PCI-e bus transfer. The most computationally expensive call in your code is probably the ssbmv call, but even that really resolves to a few saxpy operations on the diagonals.

Your best hope is to fuse some of the operations together to reduce a lot of the fixed costs. Doing this

cublasSaxpy(N, -alpha, v, 1, sup0, 1); // sup0 = -alpha*v +sup0 ----- sup0 was A*v

cublasSaxpy(N, -beta, v_old, 1, sup0, 1); // sup0 = -beta*v_old +sup0

cublasScopy(N, sup0, 1, v_hat, 1); //v_hat = sup0;

and this

cublasScopy (N, w_old, 1, w_oold, 1); // w_oold = w_old;

cublasScopy (N, w, 1, w_old, 1); // w_old = w;

cublasSscal(N, 0, w, 1); // w = 0;

cublasSaxpy(N, (1/r1), v, 1, w, 1); // w = (1/r1)*v +w

cublasSaxpy(N, -(r3/r1), w_oold, 1, w, 1);// w = -(r3/r1)*w_oold + w

cublasSaxpy(N, -(r2/r1), w_old, 1, w, 1); // w = -(r2/r1)*w_old + w

//xMR = xMR + c*eta*w;

cublasSaxpy(N, c*eta, w, 1, xMR, 1); // xMR = xMR + c*eta*w;

is very inefficient compared to using a single kernel launch for each which fuses the computation and copy/assignment operations. In both cases only requires a two minimal kernels with a handful of lines of code each. Krylov subspace methods can be done efficiently with good speed up on GPUs if the problem is large, you have the matrix in an efficient sparse format, and the “housekeeping” calculations can be done efficiently with minimal overhead. I think this code probably fails on all three counts.

[quote name=‘avidday’ post=‘1117553’ date=‘Sep 15 2010, 07:24 AM’]

A reduction on 7000 entries is really a couple of orders of magnitude too small to get much speed up on the GPU, the total execution time becomes dominated by fixed latencies like the PCI-e bus transfer. The most computationally expensive call in your code is probably the ssbmv call, but even that really resolves to a few saxpy operations on the diagonals.

Your best hope is to fuse some of the operations together to reduce a lot of the fixed costs. Doing this

[codebox]/*GPU_BandMinRes(float *ACD, int N, int HBand,float *b,float *xMR, float tol, int n_max)
	ACD = Given a Band Symmetric Matrix As with dimension N and half band HBand, the equivalent 

		  compact band matrix ACD in CUBLAS format has dimention (2*HBand+1)xN as described in

		  CUBLAS reference guide Function cublasDgbmv();

example:

        As-> N=6 K=2

	    	

		| 1   2   3   0   0   0 | 

                | 4   5   6   7   0   0 |

		| 8   9  10  11  12   0 |

        As = | 0  13  14  15  16  17 |

		| 0   0  18  19  20  21 |

		| 0   0   0  22  23  24 |

		A-> 5x6 matrix

        

		| 0   0   3   7  12  17 |

		| 0   2   6  11  16  21 | <- first superdiagonal

  ACD    = | 1   5  10  15  20  24 | <- leading diagonal 			

		| 4   9  14  19  23   0 | <- first subdiagonal

		| 8  13  18  22   0   0 |

        

		the matrix in CUBLAS format must be ordered by column

   A  = {0,0,1,4,8,  0,2,5,9,13, ..., 17,21,24,0,0};

   b     = 

   x0    = initial guess;  and result;

   tol   = tollerance;

   n_max = num. max iteration;

*/

#include <cublas.h>

#include <math.h>

#include <cutil_inline.h>

#include “MATgiulio.cuh”

#define INITIALIZE_THREAD_PER_BLOCK 128

#define INITIALIZE_NUM_BLOCKS ceil( (N/(float)INITIALIZE_THREAD_PER_BLOCK) )

#define LANCZOS_THREAD_PER_BLOCK 128

#define LANCZOS_NUM_BLOCKS ceil((N/(float)LANCZOS_THREAD_PER_BLOCK))

#define UPDATE_THREAD_PER_BLOCK 128

#define UPDATE_NUM_BLOCKS ceil((N/(float)UPDATE_THREAD_PER_BLOCK))

global void initialize(float* sup0, float* v_hat, float* v, float* w, float* w_old, float* b, int N);

global void lanczos_norm(float* Av, float* v_hat, float* v, float alpha, float beta, float beta_old, int N);

global void update(float* w_old, float* w, float* v, float* xMR, float r1, float r2, float r3, float c, float eta,int N);

// ONLY DEVICE VARIABLES ALLOWED!!!

// ACD is a compact BAND SYMMETRIC MATRIX…USE cublasSsbmv() TO USE IT!!

device float *sup0;

device float *v, *v_hat, *v_old, *w, *w_old;// *w_oold;

device float *square_norm;

void GPU_BandMinRes(float *ACD, int N, int HBand,float *b,float *xMR, float tol, int n_max){

cublasStatus status;

if(cublasInit() == CUBLAS_STATUS_NOT_INITIALIZED) {

    printf("CUBLAS init error.\n");

}

cutilSafeCall( cudaMalloc( (void**)&v,     N*1*sizeof(float)) );

cutilSafeCall( cudaMalloc( (void**)&v_hat, N*1*sizeof(float)) );

cutilSafeCall( cudaMalloc( (void**)&v_old, N*1*sizeof(float)) );

    cutilSafeCall( cudaMalloc( (void**)&w,     N*1*sizeof(float)) );

cutilSafeCall( cudaMalloc( (void**)&w_old, N*1*sizeof(float)) );

cutilSafeCall( cudaMalloc( (void**)&sup0,  N*1*sizeof(float)) );



float     beta,      c,  c_old,      s, s_old, eta, norm_r0, norm_rMR, alpha;

float beta_old, c_oold, s_oold, r1_hat,    r1,  r2,      r3;

float *beta_n;

beta_n =(float *) calloc( max(max(LANCZOS_NUM_BLOCKS, DOT_NUM_BLOCKS),max(UPDATE_NUM_BLOCKS, INITIALIZE_NUM_BLOCKS)),sizeof(float) ); 



int n = 1;





// Initialization variables	

// v = zeros(N,1);

//----------------------------------------------------------------------------------

cublasSsbmv('U', N, HBand-1, 1, ACD, HBand, xMR, 1, 0, v_hat, 1); // v_hat= A*x0;

status = cublasGetError();

if (status != CUBLAS_STATUS_SUCCESS) fprintf (stderr, "!!!! device access error (write A)\n");



initialize<<<INITIALIZE_NUM_BLOCKS ,INITIALIZE_THREAD_PER_BLOCK>>>(sup0, v_hat, v, w, w_old, b, N);//(float* sup0, float* v_hat, float* v, float* w, float* w_old, float* b, int N)

cutilCheckMsg("initialize kernal failed\n");

//compute norm after reduction

beta=0;

cutilSafeCall( cudaMemcpy(beta_n,sup0,INITIALIZE_NUM_BLOCKS*sizeof(float),c

udaMemcpyDeviceToHost) ); // read partial sum from device memory

for (int i=0; i<INITIALIZE_NUM_BLOCKS; i++){

	beta += beta_n[i];          

}                               

beta = sqrt(beta);  

//----------------------------------------------------------------------------------

c	  = 1;

c_old = 1;

s	  = 0;

s_old = 0;

eta = beta;



norm_rMR = beta;

norm_r0  = beta;

	

while ( (n<n_max+1) && ( (norm_rMR/norm_r0) > tol) ){

	n++;

	

// Calculate Lanczos Vectors

	//----------------------------------------------------------------------------------------

	beta_old = beta;				// beta_old = beta;

	// Alpha = v'*A*v; v=v_hat/beta

	cublasSsbmv('U', N, HBand-1, (1/beta), ACD, HBand, v_hat, 1, 0, sup0, 1); // sup0= A*(v_hat)*(1/beta);



	status = cublasGetError();

	if (status != CUBLAS_STATUS_SUCCESS) fprintf (stderr, "!!!! device access error (write A)\n");

	

	alpha= cublasSdot(N, v_hat, 1,sup0, 1)*(1/beta); // alpha = dot(v_hat,sup0)/beta^2 

	lanczos_norm<<<LANCZOS_NUM_BLOCKS ,LANCZOS_THREAD_PER_BLOCK>>>(sup0, v_hat, v, alpha, beta, beta_old, N);

	cutilCheckMsg("lanczos_norm kernal failed\n");

	//compute norm after reduction

	beta=0;

	cutilSafeCall( cudaMemcpy(beta_n,sup0,LANCZOS_NUM_BLOCKS*sizeof(float),cuda

MemcpyDeviceToHost) ); // read partial sum from device memory

	for (int i=0; i<LANCZOS_NUM_BLOCKS; i++){

		beta += beta_n[i];          

	}                               

	beta = sqrt(beta);               

	//----------------------------------------------------------------------------------------

    

//-Calculate QR Factors

	//----------------------------------------------------------------------------------------

	c_oold = c_old;

	c_old  = c;

	s_oold = s_old;

	s_old  = s;

	r1_hat = c_old*alpha - c_oold*s_old*beta_old;

	r1     = sqrt( r1_hat*r1_hat + beta*beta );

	r2     = s_old*alpha + c_oold*c_old*beta_old;

	r3     = s_oold*beta_old;

	//----------------------------------------------------------------------------------------

//-Calculate New Givens Rotations

	//----------------------------------------------------------------------------------------

	c = r1_hat/r1;

	s = beta/r1;

	//----------------------------------------------------------------------------------------



//-Update Solution

	//----------------------------------------------------------------------------------------

	update<<<UPDATE_NUM_BLOCKS ,UPDATE_THREAD_PER_BLOCK>>>(w_old, w, v, xMR, r1,r2, r3, c, eta,N);

	cutilCheckMsg("update kernal failed\n");

	norm_rMR = norm_rMR*abs(s);

	eta = -s*eta;

	//----------------------------------------------------------------------------------------

}

//printf("The number of iteraction are %d",n);

cudaFree(sup0);

cudaFree(v);

cudaFree(v_hat);

cudaFree(v_old);

cudaFree(w);

cudaFree(w_old);

}

device volatile int lock=0;

device volatile float square_n=0;

global void initialize(float* sup0, float* v_hat, float* v, float* w, float* w_old, float* b, int N) {

__shared__ float svhat[INITIALIZE_THREAD_PER_BLOCK];

unsigned int tid = threadIdx.x;

unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

if (i<N){

	v[i] = 0;

	w[i] = 0;

	w_old[i] = 0; 

	svhat[tid] = v_hat[i];

	svhat[tid] = b[i] - svhat[tid];

	v_hat[i] = svhat[tid];

	svhat[tid] = svhat[tid] *svhat[tid] ;

}

else svhat[tid] = 0;

__syncthreads();

if (INITIALIZE_THREAD_PER_BLOCK >= 512) {

	if (tid < 256) { svhat[tid] += svhat[tid + 256]; } 

__syncthreads();

}

if (INITIALIZE_THREAD_PER_BLOCK >= 256) {

	if (tid < 128) { svhat[tid] += svhat[tid + 128]; } 

__syncthreads();

}

if (INITIALIZE_THREAD_PER_BLOCK >= 128) {

	if (tid < 64) { svhat[tid] += svhat[tid + 64]; } 

__syncthreads();

}

if (tid < 32) {

	if (INITIALIZE_THREAD_PER_BLOCK >= 64) svhat[tid] += svhat[tid + 32];

	if (INITIALIZE_THREAD_PER_BLOCK >= 32) svhat[tid] += svhat[tid + 16];

	if (INITIALIZE_THREAD_PER_BLOCK >= 16) svhat[tid] += svhat[tid + 8];

	if (INITIALIZE_THREAD_PER_BLOCK >= 8) svhat[tid] += svhat[tid + 4];

	if (INITIALIZE_THREAD_PER_BLOCK >= 4) svhat[tid] += svhat[tid + 2];

	if (INITIALIZE_THREAD_PER_BLOCK >= 2) svhat[tid] += svhat[tid + 1];

}

// write result for this block to global mem

if (tid == 0) sup0[blockIdx.x] = svhat[0];



// TO PERFORM LAST SUM INSIDE A KERNEL

//atomicCAS doesnt work check topic <a target='_blank' rel='noopener noreferrer' href='"http://forums.nvidia.com/index.php?showtopic=98444"'>http://forums.nvidia.com/index.php?showtopic=98444</a>

//if (tid==0) {

//	do {} while(atomicCAS(&lock,0,1)); // set lock

//	int i=1;

//	square_n += svhat[0];

//	__threadfence(); // wait for write completion

//	lock = 0; // free lock

//}	

}

global void lanczos_norm(float* Av, float* v_hat, float* v, float alpha, float beta, float beta_old, int N) {

__shared__ float svhat[LANCZOS_THREAD_PER_BLOCK];

__shared__ float svold[LANCZOS_THREAD_PER_BLOCK];



unsigned int tid = threadIdx.x;

unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; 



if (i<N){

	

	svhat[tid] = v_hat[i];

	svold[tid] = v[i];

	v[i] = svhat[tid]/beta;

	svhat[tid] = Av[i] - alpha*v[i] - beta*svold[tid];

	v_hat[i]=svhat[tid];

	svhat[tid] = svhat[tid]*svhat[tid];

}

else svhat[tid] = 0;



__syncthreads();

if (LANCZOS_THREAD_PER_BLOCK >= 512) {

	if (tid < 256) { svhat[tid] += svhat[tid + 256]; } 

__syncthreads();

}

if (LANCZOS_THREAD_PER_BLOCK >= 256) {

	if (tid < 128) { svhat[tid] += svhat[tid + 128]; } 

__syncthreads();

}

if (LANCZOS_THREAD_PER_BLOCK >= 128) {

	if (tid < 64) { svhat[tid] += svhat[tid + 64]; } 

__syncthreads();

}

if (tid < 32) {

	if (LANCZOS_THREAD_PER_BLOCK >= 64) svhat[tid] += svhat[tid + 32];

	if (LANCZOS_THREAD_PER_BLOCK >= 32) svhat[tid] += svhat[tid + 16];

	if (LANCZOS_THREAD_PER_BLOCK >= 16) svhat[tid] += svhat[tid + 8];

	if (LANCZOS_THREAD_PER_BLOCK >= 8) svhat[tid] += svhat[tid + 4];

	if (LANCZOS_THREAD_PER_BLOCK >= 4) svhat[tid] += svhat[tid + 2];

	if (LANCZOS_THREAD_PER_BLOCK >= 2) svhat[tid] += svhat[tid + 1];

}

// write result for this block to global mem

if (tid == 0) Av[blockIdx.x] = svhat[0];

//if (tid==0) {

//	do {} while(atomicCAS(&lock,0,1)); // set lock

//	Av[0] += svhat[0];

//	__threadfence(); // wait for write completion

//	lock = 0; // free lock

//}

}

global void update(float* w_old, float* w, float* v, float* xMR, float r1, float r2, float r3, float c, float eta,int N) {

__shared__ float swoold[UPDATE_THREAD_PER_BLOCK];



unsigned int tid = threadIdx.x;

unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; 



if (i<N){

	swoold[tid] = w_old[i];

	w_old[i] = w[i];

	w[i] = (v[i] - r3*swoold[tid] -r2*w_old[i])/r1;

	xMR[i] += c*eta*w[i];

}

}[/codebox]

please let me know if you find any bottleneck in the code. Thanks!

[quote name=‘avidday’ post=‘1117553’ date=‘Sep 15 2010, 07:24 AM’]

A reduction on 7000 entries is really a couple of orders of magnitude too small to get much speed up on the GPU, the total execution time becomes dominated by fixed latencies like the PCI-e bus transfer. The most computationally expensive call in your code is probably the ssbmv call, but even that really resolves to a few saxpy operations on the diagonals.

Your best hope is to fuse some of the operations together to reduce a lot of the fixed costs. Doing this

[codebox]/*GPU_BandMinRes(float *ACD, int N, int HBand,float *b,float *xMR, float tol, int n_max)
	ACD = Given a Band Symmetric Matrix As with dimension N and half band HBand, the equivalent 

		  compact band matrix ACD in CUBLAS format has dimention (2*HBand+1)xN as described in

		  CUBLAS reference guide Function cublasDgbmv();

example:

        As-> N=6 K=2

	    	

		| 1   2   3   0   0   0 | 

                | 4   5   6   7   0   0 |

		| 8   9  10  11  12   0 |

        As = | 0  13  14  15  16  17 |

		| 0   0  18  19  20  21 |

		| 0   0   0  22  23  24 |

		A-> 5x6 matrix

        

		| 0   0   3   7  12  17 |

		| 0   2   6  11  16  21 | <- first superdiagonal

  ACD    = | 1   5  10  15  20  24 | <- leading diagonal 			

		| 4   9  14  19  23   0 | <- first subdiagonal

		| 8  13  18  22   0   0 |

        

		the matrix in CUBLAS format must be ordered by column

   A  = {0,0,1,4,8,  0,2,5,9,13, ..., 17,21,24,0,0};

   b     = 

   x0    = initial guess;  and result;

   tol   = tollerance;

   n_max = num. max iteration;

*/

#include <cublas.h>

#include <math.h>

#include <cutil_inline.h>

#include “MATgiulio.cuh”

#define INITIALIZE_THREAD_PER_BLOCK 128

#define INITIALIZE_NUM_BLOCKS ceil( (N/(float)INITIALIZE_THREAD_PER_BLOCK) )

#define LANCZOS_THREAD_PER_BLOCK 128

#define LANCZOS_NUM_BLOCKS ceil((N/(float)LANCZOS_THREAD_PER_BLOCK))

#define UPDATE_THREAD_PER_BLOCK 128

#define UPDATE_NUM_BLOCKS ceil((N/(float)UPDATE_THREAD_PER_BLOCK))

global void initialize(float* sup0, float* v_hat, float* v, float* w, float* w_old, float* b, int N);

global void lanczos_norm(float* Av, float* v_hat, float* v, float alpha, float beta, float beta_old, int N);

global void update(float* w_old, float* w, float* v, float* xMR, float r1, float r2, float r3, float c, float eta,int N);

// ONLY DEVICE VARIABLES ALLOWED!!!

// ACD is a compact BAND SYMMETRIC MATRIX…USE cublasSsbmv() TO USE IT!!

device float *sup0;

device float *v, *v_hat, *v_old, *w, *w_old;// *w_oold;

device float *square_norm;

void GPU_BandMinRes(float *ACD, int N, int HBand,float *b,float *xMR, float tol, int n_max){

cublasStatus status;

if(cublasInit() == CUBLAS_STATUS_NOT_INITIALIZED) {

    printf("CUBLAS init error.\n");

}

cutilSafeCall( cudaMalloc( (void**)&v,     N*1*sizeof(float)) );

cutilSafeCall( cudaMalloc( (void**)&v_hat, N*1*sizeof(float)) );

cutilSafeCall( cudaMalloc( (void**)&v_old, N*1*sizeof(float)) );

    cutilSafeCall( cudaMalloc( (void**)&w,     N*1*sizeof(float)) );

cutilSafeCall( cudaMalloc( (void**)&w_old, N*1*sizeof(float)) );

cutilSafeCall( cudaMalloc( (void**)&sup0,  N*1*sizeof(float)) );



float     beta,      c,  c_old,      s, s_old, eta, norm_r0, norm_rMR, alpha;

float beta_old, c_oold, s_oold, r1_hat,    r1,  r2,      r3;

float *beta_n;

beta_n =(float *) calloc( max(max(LANCZOS_NUM_BLOCKS, DOT_NUM_BLOCKS),max(UPDATE_NUM_BLOCKS, INITIALIZE_NUM_BLOCKS)),sizeof(float) ); 



int n = 1;





// Initialization variables	

// v = zeros(N,1);

//----------------------------------------------------------------------------------

cublasSsbmv('U', N, HBand-1, 1, ACD, HBand, xMR, 1, 0, v_hat, 1); // v_hat= A*x0;

status = cublasGetError();

if (status != CUBLAS_STATUS_SUCCESS) fprintf (stderr, "!!!! device access error (write A)\n");



initialize<<<INITIALIZE_NUM_BLOCKS ,INITIALIZE_THREAD_PER_BLOCK>>>(sup0, v_hat, v, w, w_old, b, N);//(float* sup0, float* v_hat, float* v, float* w, float* w_old, float* b, int N)

cutilCheckMsg("initialize kernal failed\n");

//compute norm after reduction

beta=0;

cutilSafeCall( cudaMemcpy(beta_n,sup0,INITIALIZE_NUM_BLOCKS*sizeof(float),c

udaMemcpyDeviceToHost) ); // read partial sum from device memory

for (int i=0; i<INITIALIZE_NUM_BLOCKS; i++){

	beta += beta_n[i];          

}                               

beta = sqrt(beta);  

//----------------------------------------------------------------------------------

c	  = 1;

c_old = 1;

s	  = 0;

s_old = 0;

eta = beta;



norm_rMR = beta;

norm_r0  = beta;

	

while ( (n<n_max+1) && ( (norm_rMR/norm_r0) > tol) ){

	n++;

	

// Calculate Lanczos Vectors

	//----------------------------------------------------------------------------------------

	beta_old = beta;				// beta_old = beta;

	// Alpha = v'*A*v; v=v_hat/beta

	cublasSsbmv('U', N, HBand-1, (1/beta), ACD, HBand, v_hat, 1, 0, sup0, 1); // sup0= A*(v_hat)*(1/beta);



	status = cublasGetError();

	if (status != CUBLAS_STATUS_SUCCESS) fprintf (stderr, "!!!! device access error (write A)\n");

	

	alpha= cublasSdot(N, v_hat, 1,sup0, 1)*(1/beta); // alpha = dot(v_hat,sup0)/beta^2 

	lanczos_norm<<<LANCZOS_NUM_BLOCKS ,LANCZOS_THREAD_PER_BLOCK>>>(sup0, v_hat, v, alpha, beta, beta_old, N);

	cutilCheckMsg("lanczos_norm kernal failed\n");

	//compute norm after reduction

	beta=0;

	cutilSafeCall( cudaMemcpy(beta_n,sup0,LANCZOS_NUM_BLOCKS*sizeof(float),cuda

MemcpyDeviceToHost) ); // read partial sum from device memory

	for (int i=0; i<LANCZOS_NUM_BLOCKS; i++){

		beta += beta_n[i];          

	}                               

	beta = sqrt(beta);               

	//----------------------------------------------------------------------------------------

    

//-Calculate QR Factors

	//----------------------------------------------------------------------------------------

	c_oold = c_old;

	c_old  = c;

	s_oold = s_old;

	s_old  = s;

	r1_hat = c_old*alpha - c_oold*s_old*beta_old;

	r1     = sqrt( r1_hat*r1_hat + beta*beta );

	r2     = s_old*alpha + c_oold*c_old*beta_old;

	r3     = s_oold*beta_old;

	//----------------------------------------------------------------------------------------

//-Calculate New Givens Rotations

	//----------------------------------------------------------------------------------------

	c = r1_hat/r1;

	s = beta/r1;

	//----------------------------------------------------------------------------------------



//-Update Solution

	//----------------------------------------------------------------------------------------

	update<<<UPDATE_NUM_BLOCKS ,UPDATE_THREAD_PER_BLOCK>>>(w_old, w, v, xMR, r1,r2, r3, c, eta,N);

	cutilCheckMsg("update kernal failed\n");

	norm_rMR = norm_rMR*abs(s);

	eta = -s*eta;

	//----------------------------------------------------------------------------------------

}

//printf("The number of iteraction are %d",n);

cudaFree(sup0);

cudaFree(v);

cudaFree(v_hat);

cudaFree(v_old);

cudaFree(w);

cudaFree(w_old);

}

device volatile int lock=0;

device volatile float square_n=0;

global void initialize(float* sup0, float* v_hat, float* v, float* w, float* w_old, float* b, int N) {

__shared__ float svhat[INITIALIZE_THREAD_PER_BLOCK];

unsigned int tid = threadIdx.x;

unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

if (i<N){

	v[i] = 0;

	w[i] = 0;

	w_old[i] = 0; 

	svhat[tid] = v_hat[i];

	svhat[tid] = b[i] - svhat[tid];

	v_hat[i] = svhat[tid];

	svhat[tid] = svhat[tid] *svhat[tid] ;

}

else svhat[tid] = 0;

__syncthreads();

if (INITIALIZE_THREAD_PER_BLOCK >= 512) {

	if (tid < 256) { svhat[tid] += svhat[tid + 256]; } 

__syncthreads();

}

if (INITIALIZE_THREAD_PER_BLOCK >= 256) {

	if (tid < 128) { svhat[tid] += svhat[tid + 128]; } 

__syncthreads();

}

if (INITIALIZE_THREAD_PER_BLOCK >= 128) {

	if (tid < 64) { svhat[tid] += svhat[tid + 64]; } 

__syncthreads();

}

if (tid < 32) {

	if (INITIALIZE_THREAD_PER_BLOCK >= 64) svhat[tid] += svhat[tid + 32];

	if (INITIALIZE_THREAD_PER_BLOCK >= 32) svhat[tid] += svhat[tid + 16];

	if (INITIALIZE_THREAD_PER_BLOCK >= 16) svhat[tid] += svhat[tid + 8];

	if (INITIALIZE_THREAD_PER_BLOCK >= 8) svhat[tid] += svhat[tid + 4];

	if (INITIALIZE_THREAD_PER_BLOCK >= 4) svhat[tid] += svhat[tid + 2];

	if (INITIALIZE_THREAD_PER_BLOCK >= 2) svhat[tid] += svhat[tid + 1];

}

// write result for this block to global mem

if (tid == 0) sup0[blockIdx.x] = svhat[0];



// TO PERFORM LAST SUM INSIDE A KERNEL

//atomicCAS doesnt work check topic <a target='_blank' rel='noopener noreferrer' href='"http://forums.nvidia.com/index.php?showtopic=98444"'>http://forums.nvidia.com/index.php?showtopic=98444</a>

//if (tid==0) {

//	do {} while(atomicCAS(&lock,0,1)); // set lock

//	int i=1;

//	square_n += svhat[0];

//	__threadfence(); // wait for write completion

//	lock = 0; // free lock

//}	

}

global void lanczos_norm(float* Av, float* v_hat, float* v, float alpha, float beta, float beta_old, int N) {

__shared__ float svhat[LANCZOS_THREAD_PER_BLOCK];

__shared__ float svold[LANCZOS_THREAD_PER_BLOCK];



unsigned int tid = threadIdx.x;

unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; 



if (i<N){

	

	svhat[tid] = v_hat[i];

	svold[tid] = v[i];

	v[i] = svhat[tid]/beta;

	svhat[tid] = Av[i] - alpha*v[i] - beta*svold[tid];

	v_hat[i]=svhat[tid];

	svhat[tid] = svhat[tid]*svhat[tid];

}

else svhat[tid] = 0;



__syncthreads();

if (LANCZOS_THREAD_PER_BLOCK >= 512) {

	if (tid < 256) { svhat[tid] += svhat[tid + 256]; } 

__syncthreads();

}

if (LANCZOS_THREAD_PER_BLOCK >= 256) {

	if (tid < 128) { svhat[tid] += svhat[tid + 128]; } 

__syncthreads();

}

if (LANCZOS_THREAD_PER_BLOCK >= 128) {

	if (tid < 64) { svhat[tid] += svhat[tid + 64]; } 

__syncthreads();

}

if (tid < 32) {

	if (LANCZOS_THREAD_PER_BLOCK >= 64) svhat[tid] += svhat[tid + 32];

	if (LANCZOS_THREAD_PER_BLOCK >= 32) svhat[tid] += svhat[tid + 16];

	if (LANCZOS_THREAD_PER_BLOCK >= 16) svhat[tid] += svhat[tid + 8];

	if (LANCZOS_THREAD_PER_BLOCK >= 8) svhat[tid] += svhat[tid + 4];

	if (LANCZOS_THREAD_PER_BLOCK >= 4) svhat[tid] += svhat[tid + 2];

	if (LANCZOS_THREAD_PER_BLOCK >= 2) svhat[tid] += svhat[tid + 1];

}

// write result for this block to global mem

if (tid == 0) Av[blockIdx.x] = svhat[0];

//if (tid==0) {

//	do {} while(atomicCAS(&lock,0,1)); // set lock

//	Av[0] += svhat[0];

//	__threadfence(); // wait for write completion

//	lock = 0; // free lock

//}

}

global void update(float* w_old, float* w, float* v, float* xMR, float r1, float r2, float r3, float c, float eta,int N) {

__shared__ float swoold[UPDATE_THREAD_PER_BLOCK];



unsigned int tid = threadIdx.x;

unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; 



if (i<N){

	swoold[tid] = w_old[i];

	w_old[i] = w[i];

	w[i] = (v[i] - r3*swoold[tid] -r2*w_old[i])/r1;

	xMR[i] += c*eta*w[i];

}

}[/codebox]

please let me know if you find any bottleneck in the code. Thanks!

The only way to be sure is to profile it and see where it is slow. That said, all of your own kernels look like they could be improved in a number of way. Whether it is worth doing depends on where you code is spending most of its time.

Looking at the Lanzcos update kernel as an example:

__global__ void lanczos_norm(float* Av, float* v_hat, float* v, float alpha, float beta, float beta_old, int N) {

	__shared__ float svhat[LANCZOS_THREAD_PER_BLOCK];

	__shared__ float svold[LANCZOS_THREAD_PER_BLOCK];

	unsigned int tid = threadIdx.x;

	unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

	if (i<N){

		svhat[tid] = v_hat[i];

		svold[tid] = v[i];

		v[i] = svhat[tid]/beta;

		svhat[tid] = Av[i] - alpha*v[i] - beta*svold[tid];

		v_hat[i]=svhat[tid];

		svhat[tid] = svhat[tid]*svhat[tid];

	}

	else svhat[tid] = 0;

	__syncthreads();

	if (LANCZOS_THREAD_PER_BLOCK >= 512) {

		if (tid < 256) { svhat[tid] += svhat[tid + 256]; }

		__syncthreads();

	}

	if (LANCZOS_THREAD_PER_BLOCK >= 256) {

		if (tid < 128) { svhat[tid] += svhat[tid + 128]; }

		__syncthreads();

	}

	if (LANCZOS_THREAD_PER_BLOCK >= 128) {

		if (tid < 64) { svhat[tid] += svhat[tid + 64]; }

		__syncthreads();

	}

	if (tid < 32) {

		if (LANCZOS_THREAD_PER_BLOCK >= 64) svhat[tid] += svhat[tid + 32];

		if (LANCZOS_THREAD_PER_BLOCK >= 32) svhat[tid] += svhat[tid + 16];

		if (LANCZOS_THREAD_PER_BLOCK >= 16) svhat[tid] += svhat[tid + 8];

		if (LANCZOS_THREAD_PER_BLOCK >= 8) svhat[tid] += svhat[tid + 4];

		if (LANCZOS_THREAD_PER_BLOCK >= 4) svhat[tid] += svhat[tid + 2];

		if (LANCZOS_THREAD_PER_BLOCK >= 2) svhat[tid] += svhat[tid + 1];

	}

	// write result for this block to global mem

	if (tid == 0) Av[blockIdx.x] = svhat[0];

}

This code computes one norm value per thread and then performs a block wise sum reduction on it. The arithmetic intensity of the kernel is very low and the “reduction ratio” is rather poor. Why not have every thread compute multiple norm values and sum them, then reduce them. By doing so you will need fewer blocks (so fewer total shared memory reductions, which is faster), need to copy less partial sums back from device (which will be faster), and need fewer operations on the host to complete the norm calculation (which is also faster).

The norm calculation itself doesn’t need to be done in shared memory, it only needs to be loaded into shared memory for the reduction. Doing the calculations using shared memory is slower than doing them using registers. svold is a completely unnecessary use of shared memory which will hurt occupancy. The reduction code itself could probably be improved a bit as well (if you are running on a Fermi card it may not always work correctly). My current code for doing something like this (an Linfty norm in this case) look something like:

typedef float ValueType;

__global__ void linfty_norm(unsigned int num_rows, const ValueType * x0, ValueType * norms)

{

	extern __shared__ ValueType vbuffer[];

	volatile int tidx = threadIdx.x + blockIdx.x * blockDim.x;

	volatile int stride = gridDim.x * blockDim.x;

	// Loop through vector to find local maximum

	ValueType lumax = 0;

	for( unsigned int i = tidx; i < num_rows; i+=stride ) {

		lumax = fmax( lumax, fabs(x0[i]) );

	}

	// Load into shared memory

	vbuffer[threadIdx.x] = lumax; __syncthreads();

	// First warp performs shared memory reduction

	// The volatile in register makes the code "Fermi safe"

	if (threadIdx.x < warpSize) {

		volatile ValueType *a = &vbuffer[threadIdx.x];

		#pragma unroll

		for(int i=warpSize; i<blockDim.x; i+=warpSize) {

			*a = fmax( *a, *(a+i));

		}

		*a = fmax( *a, *(a+16));

		*a = fmax( *a, *(a+8));

		*a = fmax( *a, *(a+4));

		*a = fmax( *a, *(a+2));

		// Finalise and write out the results to global memory

		if (threadIdx.x == 0)  {

			norms[blockIdx.x] = fmax(*a, *(a+1));

		}

	}

}

Just some random thoughts.

The only way to be sure is to profile it and see where it is slow. That said, all of your own kernels look like they could be improved in a number of way. Whether it is worth doing depends on where you code is spending most of its time.

Looking at the Lanzcos update kernel as an example:

__global__ void lanczos_norm(float* Av, float* v_hat, float* v, float alpha, float beta, float beta_old, int N) {

	__shared__ float svhat[LANCZOS_THREAD_PER_BLOCK];

	__shared__ float svold[LANCZOS_THREAD_PER_BLOCK];

	unsigned int tid = threadIdx.x;

	unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

	if (i<N){

		svhat[tid] = v_hat[i];

		svold[tid] = v[i];

		v[i] = svhat[tid]/beta;

		svhat[tid] = Av[i] - alpha*v[i] - beta*svold[tid];

		v_hat[i]=svhat[tid];

		svhat[tid] = svhat[tid]*svhat[tid];

	}

	else svhat[tid] = 0;

	__syncthreads();

	if (LANCZOS_THREAD_PER_BLOCK >= 512) {

		if (tid < 256) { svhat[tid] += svhat[tid + 256]; }

		__syncthreads();

	}

	if (LANCZOS_THREAD_PER_BLOCK >= 256) {

		if (tid < 128) { svhat[tid] += svhat[tid + 128]; }

		__syncthreads();

	}

	if (LANCZOS_THREAD_PER_BLOCK >= 128) {

		if (tid < 64) { svhat[tid] += svhat[tid + 64]; }

		__syncthreads();

	}

	if (tid < 32) {

		if (LANCZOS_THREAD_PER_BLOCK >= 64) svhat[tid] += svhat[tid + 32];

		if (LANCZOS_THREAD_PER_BLOCK >= 32) svhat[tid] += svhat[tid + 16];

		if (LANCZOS_THREAD_PER_BLOCK >= 16) svhat[tid] += svhat[tid + 8];

		if (LANCZOS_THREAD_PER_BLOCK >= 8) svhat[tid] += svhat[tid + 4];

		if (LANCZOS_THREAD_PER_BLOCK >= 4) svhat[tid] += svhat[tid + 2];

		if (LANCZOS_THREAD_PER_BLOCK >= 2) svhat[tid] += svhat[tid + 1];

	}

	// write result for this block to global mem

	if (tid == 0) Av[blockIdx.x] = svhat[0];

}

This code computes one norm value per thread and then performs a block wise sum reduction on it. The arithmetic intensity of the kernel is very low and the “reduction ratio” is rather poor. Why not have every thread compute multiple norm values and sum them, then reduce them. By doing so you will need fewer blocks (so fewer total shared memory reductions, which is faster), need to copy less partial sums back from device (which will be faster), and need fewer operations on the host to complete the norm calculation (which is also faster).

The norm calculation itself doesn’t need to be done in shared memory, it only needs to be loaded into shared memory for the reduction. Doing the calculations using shared memory is slower than doing them using registers. svold is a completely unnecessary use of shared memory which will hurt occupancy. The reduction code itself could probably be improved a bit as well (if you are running on a Fermi card it may not always work correctly). My current code for doing something like this (an Linfty norm in this case) look something like:

typedef float ValueType;

__global__ void linfty_norm(unsigned int num_rows, const ValueType * x0, ValueType * norms)

{

	extern __shared__ ValueType vbuffer[];

	volatile int tidx = threadIdx.x + blockIdx.x * blockDim.x;

	volatile int stride = gridDim.x * blockDim.x;

	// Loop through vector to find local maximum

	ValueType lumax = 0;

	for( unsigned int i = tidx; i < num_rows; i+=stride ) {

		lumax = fmax( lumax, fabs(x0[i]) );

	}

	// Load into shared memory

	vbuffer[threadIdx.x] = lumax; __syncthreads();

	// First warp performs shared memory reduction

	// The volatile in register makes the code "Fermi safe"

	if (threadIdx.x < warpSize) {

		volatile ValueType *a = &vbuffer[threadIdx.x];

		#pragma unroll

		for(int i=warpSize; i<blockDim.x; i+=warpSize) {

			*a = fmax( *a, *(a+i));

		}

		*a = fmax( *a, *(a+16));

		*a = fmax( *a, *(a+8));

		*a = fmax( *a, *(a+4));

		*a = fmax( *a, *(a+2));

		// Finalise and write out the results to global memory

		if (threadIdx.x == 0)  {

			norms[blockIdx.x] = fmax(*a, *(a+1));

		}

	}

}

Just some random thoughts.

[quote name=‘avidday’ post=‘1117707’ date=‘Sep 15 2010, 02:12 PM’]

The only way to be sure is to profile it and see where it is slow. That said, all of your own kernels look like they could be improved in a number of way. Whether it is worth doing depends on where you code is spending most of its time.

Looking at the Lanzcos update kernel as an example:

[codebox] Method #call GPU usec %GPU time ins. throughput

ssbmvu_main 3596 391299 74.63 0.330816

sdot_gld_main 3595 44165.4 8.42 0.560313

memcpyDtoH 7191 32451.2 6.18

lanczos_norm 3595 30190.1 5.75 0.165724

update 3595 26139.4 4.98 0.0709608

memcpyHtoD 3 33.664 0

initialize 1 8.672 0 0.211152[/codebox]

The execution time depends mostly from the ssbmvu_main(matrix vector moltiplication) that is very slow…I can not understand why!

About improving my function…I have followed your suggestions…but I am not sure how to compute multiple norm values without using shared memory.

Moreover my “lanczos_norm kernel” compute not only the norm, but also vector “v_hat” that will be used later, this means that the number of blocks depend of the vector length.

This is the new code:

[codebox]global void lanczos_norm(float* Av, float* v_hat, float* v, float alpha, float beta, float beta_old, int N) {

shared float svhat[LANCZOS_THREAD_PER_BLOCK];

float rvhat=0;  //register	

float svold=0;  //register

    volatile int tid = threadIdx.x;

    volatile int i = blockIdx.x * blockDim.x + threadIdx.x; 

if (i<N){

       rvhat = v_hat[i];

       svold = v[i];

   v[i] = rvhat/beta;

       rvhat = Av[i] - alpha*v[i] - beta* svold;

v_hat[i]=rvhat; //COMPUTE V_HAT

       svhat[tid] = rvhat*rvhat; // COMPUTE NORM ELEMENT....HOW CAN I COMPUTE MORE NORM ELEMENTS WITHOUT SHARED MEM?

}

else svhat[tid] = 0;

__syncthreads();

   if (LANCZOS_THREAD_PER_BLOCK >= 512) {

	if (tid < 256) { svhat[tid] += svhat[tid + 256]; } 

__syncthreads();

}

if (LANCZOS_THREAD_PER_BLOCK >= 256) {

	if (tid < 128) { svhat[tid] += svhat[tid + 128]; } 

__syncthreads();

}

if (LANCZOS_THREAD_PER_BLOCK >= 128) {

	if (tid < 64) { svhat[tid] += svhat[tid + 64]; } 

__syncthreads();

}

if (tid < 32) {

	if (LANCZOS_THREAD_PER_BLOCK >= 64) svhat[tid] += svhat[tid + 32];

	if (LANCZOS_THREAD_PER_BLOCK >= 32) svhat[tid] += svhat[tid + 16];

	if (LANCZOS_THREAD_PER_BLOCK >= 16) svhat[tid] += svhat[tid + 8];

	if (LANCZOS_THREAD_PER_BLOCK >= 8) svhat[tid] += svhat[tid + 4];

	if (LANCZOS_THREAD_PER_BLOCK >= 4) svhat[tid] += svhat[tid + 2];

	if (LANCZOS_THREAD_PER_BLOCK >= 2) svhat[tid] += svhat[tid + 1];

}

// write result for this block to global mem 

if (tid == 0) Av[blockIdx.x] = svhat[0];

}[/codebox]

Finally, I have tried to allocate dynamically the shared memory using “extern shared float svhat;” but it doesnt work…Have you any idea why?

Thank you a lot !

[quote name=‘avidday’ post=‘1117707’ date=‘Sep 15 2010, 02:12 PM’]

The only way to be sure is to profile it and see where it is slow. That said, all of your own kernels look like they could be improved in a number of way. Whether it is worth doing depends on where you code is spending most of its time.

Looking at the Lanzcos update kernel as an example:

[codebox] Method #call GPU usec %GPU time ins. throughput

ssbmvu_main 3596 391299 74.63 0.330816

sdot_gld_main 3595 44165.4 8.42 0.560313

memcpyDtoH 7191 32451.2 6.18

lanczos_norm 3595 30190.1 5.75 0.165724

update 3595 26139.4 4.98 0.0709608

memcpyHtoD 3 33.664 0

initialize 1 8.672 0 0.211152[/codebox]

The execution time depends mostly from the ssbmvu_main(matrix vector moltiplication) that is very slow…I can not understand why!

About improving my function…I have followed your suggestions…but I am not sure how to compute multiple norm values without using shared memory.

Moreover my “lanczos_norm kernel” compute not only the norm, but also vector “v_hat” that will be used later, this means that the number of blocks depend of the vector length.

This is the new code:

[codebox]global void lanczos_norm(float* Av, float* v_hat, float* v, float alpha, float beta, float beta_old, int N) {

shared float svhat[LANCZOS_THREAD_PER_BLOCK];

float rvhat=0;  //register	

float svold=0;  //register

    volatile int tid = threadIdx.x;

    volatile int i = blockIdx.x * blockDim.x + threadIdx.x; 

if (i<N){

       rvhat = v_hat[i];

       svold = v[i];

   v[i] = rvhat/beta;

       rvhat = Av[i] - alpha*v[i] - beta* svold;

v_hat[i]=rvhat; //COMPUTE V_HAT

       svhat[tid] = rvhat*rvhat; // COMPUTE NORM ELEMENT....HOW CAN I COMPUTE MORE NORM ELEMENTS WITHOUT SHARED MEM?

}

else svhat[tid] = 0;

__syncthreads();

   if (LANCZOS_THREAD_PER_BLOCK >= 512) {

	if (tid < 256) { svhat[tid] += svhat[tid + 256]; } 

__syncthreads();

}

if (LANCZOS_THREAD_PER_BLOCK >= 256) {

	if (tid < 128) { svhat[tid] += svhat[tid + 128]; } 

__syncthreads();

}

if (LANCZOS_THREAD_PER_BLOCK >= 128) {

	if (tid < 64) { svhat[tid] += svhat[tid + 64]; } 

__syncthreads();

}

if (tid < 32) {

	if (LANCZOS_THREAD_PER_BLOCK >= 64) svhat[tid] += svhat[tid + 32];

	if (LANCZOS_THREAD_PER_BLOCK >= 32) svhat[tid] += svhat[tid + 16];

	if (LANCZOS_THREAD_PER_BLOCK >= 16) svhat[tid] += svhat[tid + 8];

	if (LANCZOS_THREAD_PER_BLOCK >= 8) svhat[tid] += svhat[tid + 4];

	if (LANCZOS_THREAD_PER_BLOCK >= 4) svhat[tid] += svhat[tid + 2];

	if (LANCZOS_THREAD_PER_BLOCK >= 2) svhat[tid] += svhat[tid + 1];

}

// write result for this block to global mem 

if (tid == 0) Av[blockIdx.x] = svhat[0];

}[/codebox]

Finally, I have tried to allocate dynamically the shared memory using “extern shared float svhat;” but it doesnt work…Have you any idea why?

Thank you a lot !

In a typical sparse/banded implementation on the GPU, you would launch 1 thread per row to compute the dot product, which is complete memory bandwidth bound. That probably means 700 threads for your 700x700 case, which is very small. The sort of sparse systems I solve using Krylov subspace methods on GPUs have several million rows. I suspect your problems are just far too small for the GPU to perform well.

Do it something like this:

volatile unsigned int tidx = threadIdx.x + threadIdx.y*blockDim.x;

volatile unsigned int gridsize = gridDim.x * blockDim.x;

float sum = 0.;

for (int i=tidx; i<N; i+=gridsize){

	rvhat = v_hat[i];

	svold = v[i];

	v[i] = rvhat/beta;

	rvhat = Av[i] - alpha*v[i] - beta* svold;

	v_hat[i]=rvhat; //COMPUTE V_HAT

	sum += rvhat *rvhat;

}

svhat[tid] = sum; __syncthreads();

// Reduction follows.....

You need to add the shared memory size to the kernel launch statement. So a kernel launch looks something like this (using the Linfty norm example again), where shmsize is the dynamic shared memory per block:

template <typename ValueType> ValueType linfty_norm(const thrust::device_vector<ValueType> & d_x)

{

	dim3 nblocks(112); // GTX 470 has 14 MP

	dim3 blocksize(96); // 3 warps per block

	size_t shmsize = blocksize.x * sizeof(ValueType);

	thrust::device_vector<ValueType> d_pnorms(nblocks.x);

	linfty_norm_kernel<ValueType> <<< nblocks, blocksize, shmsize>>> 

				 ( (unsigned int)d_x.size(), thrust::raw_pointer_cast(&d_x[0]), thrust::raw_pointer_cast(&d_pnorms[0]) ); 

	thrust::host_vector<ValueType> pnorms = d_pnorms;

	ValueType norm = pnorms[0];

	for(int i=1; i<blocksize.x; i++) { norm = fmax( pnorms[i], norm ); }

	return norm;

}