Loop Problem

I want to change FuntionA into FuntionB, which is implemented in CUDA. d_in (the input data) is a 2-D array of size width*height, and d_out (the output data) is a 2-D array of size 512*height.

  In FuntionB, the following loop has a large computational cost. Can it be changed into a parallel loop implemented in CUDA?
      for (int i=tid+bid*blockDim.x; i<size; i+=blockDim.x*gridDim.x)
      {
                if (i<16 || i>(size-16))
                {
                          tmpv = d_in[i];
                }
                else
                {
                          tmpv = 0;
                          for (int k=0; k<16; k++)
                          {
                                    tmpv += (d_in[i+k] + d_in[i-(k+1)]) * tempLowpass[k];

                                    __syncthreads();
                          }
                }
      }
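
One idea for this: the outer loop is already parallel over i (a grid-stride loop), so the cost is the 16-tap inner loop. Since no thread reads another thread's tmpv, the __syncthreads() inside it does no useful work, and calling __syncthreads() from a divergent branch is undefined behavior anyway. A minimal untested sketch of the same loop with the barrier dropped and the taps unrolled (it assumes the surrounding FuntionB context, i.e. tid, bid, size, and tempLowpass):

      for (int i=tid+bid*blockDim.x; i<size; i+=blockDim.x*gridDim.x)
      {
                if (i<16 || i>(size-16))
                {
                          tmpv = d_in[i];        // boundary samples pass through unfiltered
                }
                else
                {
                          tmpv = 0;
                          #pragma unroll         // only 16 taps, kept serial per thread
                          for (int k=0; k<16; k++)
                                    tmpv += (d_in[i+k] + d_in[i-(k+1)]) * tempLowpass[k];
                          tmpv = tmpv/128;
                }
      }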

void FuntionA(int *d_in, char *d_out, int width, int height, char *lowpass, int gene)
{
      int *tmp_out1;
      int *tmp_out2;
      char *tmp_out3;
      tmp_out1 = (int *)malloc((width+1024)*sizeof(int));
      tmp_out2 = (int *)malloc((width+1024)*sizeof(int));
      tmp_out3 = (char *)malloc((width+1024)*sizeof(char));

      int tmpv = 0;

      for(int i=0; i<height; i++)
      {
                for(int j=0; j<width; j++)
                {
                          tmpv = 0;
                          tmp_out1[j] = (int)abs(d_in[i*width+j]);

                          if(j<16 || j>(width-16))
                          {
                                    tmp_out2[j] = tmp_out1[j];
                          }
                          else	
                          {
                                    for(int k=0; k<16; k++)
                                    {
                                              tmpv += (tmp_out1[j+k] + tmp_out1[j-(k+1)]) * lowpass[k];
                                    }
                                    tmpv = tmpv/128;
                                    tmp_out2[j] = tmpv;			

                          }

                          if(GetLog(tmp_out2[j]) > 255)
                                    tmp_out3[j] = char(255);
                          else
                                    tmp_out3[j] = char(GetLog(tmp_out2[j]));

                }

                for(int j=0; j<512; j++)
                {
                          d_out[512*i+j] = tmp_out3[j*gene];
                }
      }

      free(tmp_out1);
      free(tmp_out2);
      free(tmp_out3);

}
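
One detail that matters for the port: in FuntionA the boundary test and the filter window are per row (j stays inside one scan line), while FuntionB below tests the flat index i over the whole width*height buffer, which mixes rows at the seams. A kernel working on a flat index would need to recover the column first; a minimal sketch of that mapping:

      int row = i / width;                  // which scan line
      int col = i % width;                  // position within the line
      if (col < 16 || col > (width-16))     // per-row boundary, as in FuntionA
      ...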

//--------------------------------------------------------------------------------------------------------------------------//

__global__ void FuntionB(int *d_in, int *d_Temp, char *d_out, int width, int height, char *lowpass, int gene)
{
      __shared__ char tempLowpass[16];

      for (int k=0; k<16; k++)
      {
                tempLowpass[k] = lowpass[k];

                __syncthreads();
      }
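
      // Note: every thread writes the same 16 taps above. A guarded load,
      // e.g. if (threadIdx.x < 16) tempLowpass[threadIdx.x] = lowpass[threadIdx.x];
      // followed by a single __syncthreads(), would avoid the redundant writes.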

      int tid = threadIdx.x;
      int bid = blockIdx.x;
      int size = width * height;

      float tmpv;

      for (int i=tid+bid*blockDim.x; i<size; i+=blockDim.x*gridDim.x)
      {

                d_in[i] = (int)abs(d_in[i]);

                __syncthreads();

      }	


      for (int i=tid+bid*blockDim.x; i<size; i+=blockDim.x*gridDim.x)
      {
                if ( i<16 || i>(size-16))  
                {
                          tmpv = d_in[i];
                } 			
                else	
                {
                          tmpv = 0;
                          for(int k=0; k<16; k++)
                          {			
                                    tmpv += (d_in[i+k] + d_in[i-(k+1)])* tempLowpass[k];

                                    __syncthreads();

                          }
                          tmpv = tmpv/128;
                }

                if(GetLog(tmpv) > 255)
                          d_Temp[i] = char(255);
                else
                          d_Temp[i] = char(GetLog(tmpv));
	
                __syncthreads();


      }

      for (int i=tid+bid*blockDim.x; i<512*height; i+=blockDim.x*gridDim.x)
      {
                d_out[i] = d_Temp[i*30];

                __syncthreads();

      }

}
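
A broader point on correctness: __syncthreads() only synchronizes the threads of one block, so nothing guarantees that every block has finished the abs pass over d_in before another block reads neighboring elements in the filter pass. The only grid-wide barrier available here is a kernel launch boundary. Below is a minimal untested sketch of the same work split into three launches, with the per-row boundary test restored from FuntionA; the kernel names are hypothetical and GetLog is assumed to be a __device__ function as in FuntionB:

__global__ void absKernel(int *d_in, int size)
{
      // grid-stride pass: take the absolute value in place
      for (int i=threadIdx.x+blockIdx.x*blockDim.x; i<size; i+=blockDim.x*gridDim.x)
                d_in[i] = abs(d_in[i]);
}

__global__ void filterKernel(const int *d_in, int *d_Temp, int width, int height, const char *lowpass)
{
      __shared__ char tempLowpass[16];
      if (threadIdx.x < 16)
                tempLowpass[threadIdx.x] = lowpass[threadIdx.x];
      __syncthreads();   // reached by every thread exactly once, before any tap is read

      int size = width * height;
      for (int i=threadIdx.x+blockIdx.x*blockDim.x; i<size; i+=blockDim.x*gridDim.x)
      {
                int col = i % width;   // boundary per row, as in FuntionA
                int tmpv;
                if (col < 16 || col > (width-16))
                {
                          tmpv = d_in[i];
                }
                else
                {
                          tmpv = 0;
                          for (int k=0; k<16; k++)
                                    tmpv += (d_in[i+k] + d_in[i-(k+1)]) * tempLowpass[k];
                          tmpv = tmpv/128;
                }
                d_Temp[i] = (GetLog(tmpv) > 255) ? 255 : GetLog(tmpv);
      }
}

__global__ void downsampleKernel(const int *d_Temp, char *d_out, int width, int height, int gene)
{
      // matches d_out[512*i+j] = tmp_out3[j*gene] in FuntionA
      for (int i=threadIdx.x+blockIdx.x*blockDim.x; i<512*height; i+=blockDim.x*gridDim.x)
      {
                int row = i / 512;
                int j   = i % 512;
                d_out[i] = (char)d_Temp[row*width + j*gene];
      }
}

Launched back to back on the same stream, the three kernels execute in order, so the launch boundaries provide the missing grid-wide synchronization.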