What i’m trying to do is simple non-maximal suppression

It takes a 1-D array and treat it as 2-D, and compares each entries 8-neighbour

if any neighbour have greater value it put back a zero

```
__global__ static void kernel_nonmaximal(float* pInOut, int w, int h)
{
const unsigned int i = blockIdx.x*BLOCK_SIZE+threadIdx.x;
const unsigned short x = i % w;
const unsigned short y = i / w;
__shared__ float smem[BLOCK_SIZE];
if(i < w*h){
smem[threadIdx.x] = pInOut[i];
}
__syncthreads();
if(i < w*h){
for(int j = -1; j < 2; j++){
for(int k = -1; k < 2; k++){
if(x+k >= 0 && x+k < w && y+j >= 0 && y+j < h
&& (j != 0 || k != 0)
&& pInOut[(y+j)*w+(x+k)] >= smem[threadIdx.x]){
smem[threadIdx.x] = 0.0f;
}
}
}
}
__syncthreads();
if(i < w*h){
pInOut[i] = smem[threadIdx.x];
}
}
void nonmaximal(float* pInOut, int w, int h)
{
dim3 grid( ceil(w*h/(float)BLOCK_SIZE), 1, 1 );
dim3 block( BLOCK_SIZE, 1, 1 );
kernel_nonmaximal<<< grid, block >>>(pInOut, w, h);
cudaThreadSynchronize();
return;
}
```

I dunno wt I’m doing wrong here. Can u guyz help me plx?