Loop Problem

I want to port FuntionA to FuntionB, which is implemented in CUDA. d_in (the input data) is a 2-D array of size width × height, and d_out (the output data) is also a 2-D array, of size 512 × height.

  In FuntionB, the following loop has a high computational cost. Can it be turned into a parallel loop implemented in CUDA?
  // Excerpt of the filtering loop from FuntionB. tid, bid, size, tmpv, d_in and
  // tempLowpass are declared in the enclosing kernel. Braces and the `else`
  // are restored: without them the border assignment is overwritten by
  // `tmpv = 0` and the 16-tap window also runs on border indices, reading
  // before the start / past the end of d_in.
  for (int i=tid+bid*blockDim.x; i<size; i+=blockDim.x*gridDim.x)
  {
       if ( i<16 || i>(size-16))
       {
                // Not enough neighbours for the full window: pass the sample through.
                tmpv = d_in[i];
       }
       else
       {
                // tempLowpass[k] weights the pair d_in[i+k] and d_in[i-(k+1)].
                // NOTE(review): the border test uses the flattened index i, so if
                // d_in stores an image row by row, windows near a row end also
                // pull samples from the neighbouring row -- confirm intended.
                tmpv = 0;
                for(int k=0; k<16; k++)
                          tmpv += (d_in[i+k] + d_in[i-(k+1)])* tempLowpass[k];
       }
  }




FuntionA(int *d_in, char *d_out, int width, int height, char *lowpass, int gene )
int *tmp_out1;
int *tmp_out2;
char *tmp_out3;
tmp_out1 = (int *)malloc((width+1024)*sizeof(int));
tmp_out2 = (int *)malloc((width+1024)*sizeof(int));
tmp_out3 = (char *)malloc((width+1024)*sizeof(char));

      int tmpv = 0;

      for(int i=0; i<height; i++)
                for(int j=0; j<width; j++)
                          tmpv = 0;
                          tmp_out1[j] = (int)abs(d_in[i*width+j]);

                          if(j<16 || j>(width-16))
                                    tmp_out2[j] = tmp_out1[j];
                                    for(int k=0; k<16; k++)
                                              tmpv += (tmp_out1[j+k] + tmp_out1[j-(k+1)]) * lowpass[k];
                                    tmpv = tmpv/128;
                                    tmp_out2[j] = tmpv;			


                          if(GetLog(tmp_out2[j]) > 255)
                                    tmp_out3[j] = char(255);
                                    tmp_out3[j] = char(GetLog(tmp_out2[j]));


                for(int j=0; j<512; j++)
                          d_out[512*i+j] = tmp_out3[j*gene];




// FuntionB: CUDA version of the row-wise |x| -> low-pass -> GetLog -> decimate
// pipeline.
//
// Launch layout: 1-D grid of 1-D blocks; only threadIdx.x / blockIdx.x are
// used. The loops are grid-stride, so any grid and block size is valid.
// Shared memory: 16 bytes, static.
// GetLog must be callable from device code (it is defined elsewhere).
//
// Parameters (all pointers are device pointers):
//   d_in    - width*height ints, row after row; read only (the absolute value
//             is taken on the fly, the buffer is not modified)
//   d_Temp  - width*height ints; receives the 8-bit result of every pixel
//   d_out   - 512*height chars; output column j of a row is the result of
//             input column j*gene of the same row
//   width   - samples per input row
//   height  - number of rows
//   lowpass - 16 filter weights
//   gene    - column step of the decimation, must be >= 1. Output columns
//             whose source column j*gene is outside the row are set to 0.
//
// Every output element is computed by exactly one thread directly from d_in,
// so no thread reads data produced by another block and no grid-wide barrier
// is needed.
__global__ void FuntionB(int *d_in, int *d_Temp, char *d_out, int width, int height, char *lowpass, int gene)
{
      __shared__ char tempLowpass[16];

      // Stage the filter weights once per block; the strided loop also covers
      // blocks with fewer than 16 threads.
      for (int k=threadIdx.x; k<16; k+=blockDim.x)
                tempLowpass[k] = lowpass[k];
      __syncthreads();   // weights must be visible to every thread before use

      const int size   = width * height;
      const int first  = threadIdx.x + blockIdx.x*blockDim.x;
      const int stride = blockDim.x*gridDim.x;

      for (int i=first; i<size; i+=stride)
      {
                const int col = i % width;   // column inside the current row
                int tmpv;                    // integer arithmetic, as in the CPU reference

                if (col<16 || col>(width-16))
                {
                          // Border column: the 16-sample window would leave the row.
                          tmpv = abs(d_in[i]);
                }
                else
                {
                          // tempLowpass[k] weights the pair x[i+k] and x[i-(k+1)].
                          tmpv = 0;
                          for (int k=0; k<16; k++)
                                    tmpv += (abs(d_in[i+k]) + abs(d_in[i-(k+1)])) * tempLowpass[k];
                          tmpv = tmpv/128;
                }

                // Evaluate GetLog once, then saturate to the 8-bit range.
                const int  logv = GetLog(tmpv);
                const char pix  = (logv > 255) ? char(255) : char(logv);

                d_Temp[i] = pix;

                // This pixel feeds the output only if its column is a multiple of
                // gene and the resulting output column is one of the 512 kept.
                if (gene > 0 && (col % gene) == 0 && (col / gene) < 512)
                          d_out[(i / width)*512 + col/gene] = pix;
      }

      // Output columns with no source pixel inside the row: write 0 so d_out is
      // fully defined. These elements are disjoint from those written above.
      for (int o=first; o<512*height; o+=stride)
      {
                const long long src = (long long)(o % 512) * gene;
                if (gene <= 0 || src >= width)
                          d_out[o] = 0;
      }
}


