I have written a CUDA kernel that solves a tridiagonal system using the parallel cyclic reduction (PCR) technique. Can it be optimized further?

```
// Solves one tridiagonal system in parallel with parallel cyclic reduction (PCR).
//
// Parameters:
//   A  - coefficients, three consecutive values per equation i:
//        A[3*i] = lower diagonal, A[3*i+1] = main diagonal, A[3*i+2] = upper diagonal.
//   d1 - right-hand side on entry; overwritten with the solution on exit.
//   k  - number of reduction steps, k = ceil(log_2(n)).
//
// Preconditions:
//   * `n` (the number of equations) is a compile-time constant defined outside
//     this kernel; it sizes the four static shared arrays (4 * n doubles per block).
//   * Equations are indexed by threadIdx.x only, so launch a single block with
//     blockDim.x >= n. Threads with threadIdx.x >= n do no work but still take
//     part in every barrier.
//   * No pivoting is performed: a zero main-diagonal entry at any step divides by zero.
__global__ void pcr(double* A, double* d1, int k)// k = ceil(log_2(n))
{
    // i is the equation number handled by this thread
    const int i = threadIdx.x;
    // surplus threads must neither touch memory nor skip a barrier
    const bool active = (i < n);

    // shared copies of lower diag, diag, upper diag and right hand vector
    __shared__ double a[n];
    __shared__ double b[n];
    __shared__ double c[n];
    __shared__ double d[n];

    // load the coefficients of this equation from global memory
    if (active)
    {
        a[i] = A[3 * i];
        b[i] = A[3 * i + 1];
        c[i] = A[3 * i + 2];
        d[i] = d1[i];
    }
    // every equation must be loaded before any neighbour is read
    __syncthreads();

    // execute all PCR steps; k is the same for every thread, so the barriers
    // inside the loop are reached uniformly by the whole block
    for (int j = 0; j < k; j++)
    {
        // distance to the neighbouring equations doubles each step; an integer
        // shift is exact, unlike powf(2, j) truncated to int
        const int stride = 1 << j;

        double a1 = 0.0, c1 = 0.0, d2 = 0.0;   // contribution of the upper equation p
        double a2 = 0.0, c2 = 0.0, d3 = 0.0;   // contribution of the lower equation q
        double bi = 0.0, di = 0.0;             // this equation's diag and rhs

        if (active)
        {
            const int p = i - stride;
            const int q = i + stride;

            // cache this equation's own row in registers
            const double ai = a[i];
            const double ci = c[i];
            bi = b[i];
            di = d[i];

            // eliminate the lower-diagonal coupling using equation p
            if (p >= 0)
            {
                const double alfa = -ai / b[p];
                a1 = alfa * a[p];
                c1 = alfa * c[p];
                d2 = alfa * d[p];
            }
            // eliminate the upper-diagonal coupling using equation q
            if (q <= n - 1)
            {
                const double beta = -ci / b[q];
                a2 = beta * a[q];
                c2 = beta * c[q];
                d3 = beta * d[q];
            }
        }
        // all reads of the old coefficients must finish before anyone overwrites them
        __syncthreads();

        // write the new coefficients in place of the current equation
        if (active)
        {
            a[i] = a1;
            b[i] = bi + c1 + a2;
            c[i] = c2;
            d[i] = di + d3 + d2;
        }
        // all writes must be visible before the next step reads neighbours
        __syncthreads();
    }

    // after k steps each equation is decoupled: b[i] * x[i] = d[i]
    if (active)
    {
        d1[i] = d[i] / b[i];
    }
}
```