I have a Quadro FX 1700 with Toolkit 2.1.1635 and SDK 2.10 whichs runs on Ubuntu 8.04 64 bit

for which i made an erossion filter. This filter uses one thread per vertical line and works just fine. But it prodcues incorrect results if introduce a ciculair buffer with modulo. In my test case(white image, black border) it results in black dots in the output image where the whole image should be completly white.

```
__global__ void cudaErodeLineLocal(unsigned char* input, unsigned char* output, unsigned int width, unsigned int height)
{
unsigned int x = IMUL(blockIdx.x, (blockDim.x - 2)) + threadIdx.x;
//unsigned int y = IMUL(blockIdx.y, blockDim.y) + threadIdx.y;
__shared__ unsigned char localMem[162*3];
//prefetch
localMem[blockDim.x*0+threadIdx.x] = input[0 * width + x];
localMem[blockDim.x*1+threadIdx.x] = input[1 * width + x];
for(unsigned int y = 1; y < height-1; y++ )
{
localMem[blockDim.x*((y+1)%3)+threadIdx.x] = input[(y+1)*width + x]; //next line
if ((threadIdx.x == 0) || (threadIdx.x == 161))
{
if ( (x == 0) )
{
output[y*width + x] = 0;
}
}
else
{
output[y*width + x] = ((
localMem[blockDim.x * ((y-1)%3) + threadIdx.x-1] + localMem[blockDim.x * ((y-1)%3) + threadIdx.x] + localMem[blockDim.x * ((y-1)%3) + threadIdx.x+1]
+ localMem[blockDim.x * ((y )%3) + threadIdx.x-1] + localMem[blockDim.x * ((y )%3) + threadIdx.x] + localMem[blockDim.x * ((y )%3) + threadIdx.x+1]
+ localMem[blockDim.x * ((y+1)%3) + threadIdx.x-1] + localMem[blockDim.x * ((y+1)%3) + threadIdx.x] + localMem[blockDim.x * ((y+1)%3) + threadIdx.x+1]) == 2295 )? 255 : 0;
}
}
}
```

placeing the localMem array in global memory doesn’t solve the problem.

The output varies if the modulo operator is changed, i’ve inlcuded some examples of %3 %32 %64

if the modulo operator is bigger as the image than is the output correct aswell