Optimize parallel cyclic reduction (PCR) code

I have written code to solve a tridiagonal system using the parallel cyclic reduction (PCR) technique. Can it be optimized further?

// Solves a single tridiagonal system of n equations with parallel cyclic
// reduction (PCR), one thread per equation.
//
// Launch contract:
//   * Exactly one block; only threadIdx.x is used to select the equation,
//     and d1 is both read and overwritten, so extra blocks would race on it.
//   * blockDim.x >= n. Threads with threadIdx.x >= n stay idle but still take
//     part in every barrier.
//   * n is a compile-time constant defined elsewhere in this file; the kernel
//     uses 4 * n doubles of static shared memory.
//
// Parameters:
//   A  - coefficients, three consecutive doubles per equation i:
//        A[3*i] lower diagonal, A[3*i+1] diagonal, A[3*i+2] upper diagonal.
//   d1 - on entry the right-hand side, on exit the solution (overwritten).
//   k  - number of PCR steps to perform; ceil(log2(n)) steps fully decouple
//        the equations.
//
// No pivoting is done: a zero diagonal entry met during elimination yields
// inf/NaN in the result.
__global__ void pcr(double* A, double* d1, int k)
{
    // Equation handled by this thread.
    const int i = threadIdx.x;

    // Threads past the last equation must not touch memory, but they must
    // still reach every __syncthreads(), so they are masked rather than
    // returned early.
    const bool active = (i < n);

    // Working copies of lower diag, diag, upper diag and right-hand side.
    __shared__ double a[n];
    __shared__ double b[n];
    __shared__ double c[n];
    __shared__ double d[n];

    if (active) {
        a[i] = A[3 * i];
        b[i] = A[3 * i + 1];
        c[i] = A[3 * i + 2];
        d[i] = d1[i];
    }

    // All coefficients must be in shared memory before any neighbour reads.
    __syncthreads();

    // The stride doubles every step (1, 2, 4, ...). It is kept as an exact
    // integer; the floating-point powf() result could truncate to the wrong
    // value. Once stride >= n no equation has a partner left, so b and d
    // would no longer change and the remaining steps are skipped. Both loop
    // conditions are uniform across the block, so the barriers stay safe.
    for (int j = 0, stride = 1; j < k && stride < n; ++j, stride <<= 1) {
        // Contributions from the upper (p) and lower (q) partner equations;
        // they stay zero when the partner falls outside the system.
        double a1 = 0.0, c1 = 0.0, d2 = 0.0;
        double a2 = 0.0, c2 = 0.0, d3 = 0.0;

        if (active) {
            const int p = i - stride;
            const int q = i + stride;

            if (p >= 0) {
                // Eliminate the coupling to equation p.
                const double alfa = -a[i] / b[p];
                a1 = alfa * a[p];
                c1 = alfa * c[p];
                d2 = alfa * d[p];
            }

            if (q < n) {
                // Eliminate the coupling to equation q.
                const double beta = -c[i] / b[q];
                a2 = beta * a[q];
                c2 = beta * c[q];
                d3 = beta * d[q];
            }
        }

        // Every thread must finish reading its partners' old coefficients
        // before anyone overwrites them.
        __syncthreads();

        if (active) {
            a[i] = a1;
            b[i] = b[i] + c1 + a2;
            c[i] = c2;
            d[i] = d[i] + d3 + d2;
        }

        // New coefficients must be visible before the next step reads them.
        __syncthreads();
    }

    // After the reduction each equation is decoupled: b[i] * x[i] = d[i].
    if (active) {
        d1[i] = d[i] / b[i];
    }
}