 # Optimize parallel cyclic reduction (PCR) code

I have written code that solves a tridiagonal system using the parallel cyclic reduction (PCR) technique. Can it be optimized further?

``````
// this kernel solves the tridiagonal system in parallel
__global__ void pcr(double* A, double* d1, int k)// k = ceil(log_2(n))
{
// i is the equation number

// allocating shared memory for the lower diagonal, main diagonal, upper diagonal and right-hand-side vector
__shared__ double a[n];
__shared__ double b[n];
__shared__ double c[n];
__shared__ double d[n];

// initialize the coefficient arrays from the globally defined tridiagonal matrix
a[i] = A[3*i];
b[i] = A[3*i+1];
c[i] = A[3*i+2];
d[i] = d1[i];

// waiting for every thread to finish above initialization

// executing all PCR steps by for loop
double alfa, beta, a1, c1, d2, a2, c2, d3;
for(int j = 0; j<k ; j++) // k = ceil(log_2(n))
{

// calculating the upper (p) and lower (q) equation numbers used by the current equation at this step
int p = i - powf(2,j);
int q = i + powf(2, j);

// making one new equation from three equations and calculating new coefficients for new equation
if(p>=0)
{
alfa = -a[i]/b[p];
a1  =  alfa * a[p];
c1  =  alfa * c[p];
d2  =  alfa * d[p];
}
else
{
a1  = 0;
c1  = 0;
d2  = 0;
}

if(q<=n-1)
{
beta = -c[i]/b[q];
a2   = beta * a[q];
c2   = beta * c[q];
d3   = beta * d[q];
}
else
{
a2 = 0;
c2 = 0;
d3 = 0;
}

// waiting for each thread to finish the making of new equation

// writing down the new coefficients in place of the coefficients of current equation
a[i] = a1;
b[i] = b[i] + c1 + a2;
c[i] = c2;
d[i] = d[i] + d3 + d2;

// waiting for each thread to finish writing down the new coefficients before starting the new step