Not all blocks working properly in a computation

Hi,

I am writing a simple image-processing code with two kernels, each of which operates on every pixel. The first computes a kind of average over the neighboring pixels; the second simply copies the new matrix into the old one, so that the previous state is available for the next iteration:

// Device-side counterparts of the host arrays edge, old_b and new_b.
float *d_edge;
float *d_old_b;
float *d_new_b;

// Reserve the three device buffers (size bytes each) ...
cudaMalloc((void **) &d_edge, size);
cudaMalloc((void **) &d_old_b, size);
cudaMalloc((void **) &d_new_b, size);

// ... then upload the current host contents into them.
cudaMemcpy(d_edge, edge, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_old_b, old_b, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_new_b, new_b, size, cudaMemcpyHostToDevice);

// 16x16 threads per block; enough blocks to span n+2 threads along x and
// m+2 threads along y (rounded up to whole blocks).
dim3 dimBlock(16, 16, 1);
dim3 dimGrid(ceil((n+2)/16.0), ceil((m+2)/16.0), 1);

// Iterate N times: compute new_b from old_b, then copy new_b back to old_b.
// Both launches use the same (default) stream, so they are issued in order.
for (int it = 0; it < N; ++it) {
    average_kernel<<<dimGrid, dimBlock>>>(d_edge, d_old_b, d_new_b, m, n);
    backup_kernel<<<dimGrid, dimBlock>>>(d_old_b, d_new_b, m, n);
}

// Bring the final state back to the host.
cudaMemcpy(old_b, d_old_b, size, cudaMemcpyDeviceToHost);

// Release the device buffers.
cudaFree(d_edge);
cudaFree(d_old_b);
cudaFree(d_new_b);

The kernels are:

// One update step over the interior of a row-major image with row length n+2.
//
// For every interior pixel (1 <= i <= m, 1 <= j <= n):
//   new_b[i][j] = (old_b[i][j-1] + old_b[i][j+1] +
//                  old_b[i-1][j] + old_b[i+1][j] - edge[i][j]) / 4
//
// Launch layout: 2D grid of 2D blocks; the x index maps to the column j and
// the y index to the row i. The grid must span at least n+1 threads in x and
// m+1 threads in y for every interior pixel to be visited.
//
// Preconditions: edge, old_b and new_b each hold at least (m+2)*(n+2) floats,
// because rows 0..m+1 and columns 0..n+1 of old_b are read. The halo
// (row 0, row m+1, column 0, column n+1) of new_b is never written.
__global__ void average_kernel(float *edge, float *old_b, float *new_b,
                               int m, int n) {
    int i = blockIdx.y * blockDim.y + threadIdx.y;  // row
    int j = blockIdx.x * blockDim.x + threadIdx.x;  // column

    // Only interior pixels are updated; threads on the halo or beyond the
    // image do nothing.
    if ((i > 0 && i < m+1) && (j > 0 && j < n+1)) {
        //printf("%d %d\n",i, j);
        int row_len = n + 2;              // elements per image row, halo included
        int centre  = i * row_len + j;    // flat index of pixel (i, j)

        // Four-neighbour stencil: left, right, up, down. The explicit '*' in
        // the row offsets is required: (i-1)(n+2) is not a multiplication.
        float left  = old_b[i * row_len + (j - 1)];
        float right = old_b[i * row_len + (j + 1)];
        float up    = old_b[(i - 1) * row_len + j];
        float down  = old_b[(i + 1) * row_len + j];

        // Get the new value (do not update the halo).
        new_b[centre] = (left + right + up + down - edge[centre]) / 4.0f;
    }
}

// Copies the interior of new_b into old_b so that old_b holds the latest
// state for the next iteration.
//
// Both arrays are row-major with row length n+2. Only pixels with
// 1 <= i <= m and 1 <= j <= n are copied; the halo of old_b is left untouched.
//
// Launch layout: 2D grid of 2D blocks; the x index maps to the column j and
// the y index to the row i. The grid must span at least n+1 threads in x and
// m+1 threads in y for every interior pixel to be visited.
__global__ void backup_kernel(float *old_b, float *new_b,
                              int m, int n) {
    int i = blockIdx.y * blockDim.y + threadIdx.y;  // row
    int j = blockIdx.x * blockDim.x + threadIdx.x;  // column

    if ((i > 0 && i < m+1) && (j > 0 && j < n+1)) {
        // Back up the new array to the old one (without the halo).
        int idx = i * (n + 2) + j;
        old_b[idx] = new_b[idx];
    }
}

The problem is that when I run the code the operation seems to work for the first few rows of blocks, but not for the rest. Any help would be very much appreciated.
edge192x128_re_gpu