 # Simple 2d Convolution Low Pass filter like blur filter

Being a newbie to CUDA programming, I need to write a low-pass filter, which requires a 2D convolution.
Quite honestly, I was not able to understand the CUDA SDK separable convolution implementation.
At the moment speed is not a big issue: first I need to get it working at a reasonable speed, and I will improve it later.
I tried different approaches (using shared memory, global memory, etc.) but still got no reasonable result.
Can anyone help me out with a simple 2D convolution algorithm using shared memory?
Any help would be much appreciated.
This is one of the versions I wrote; I was trying to pad the 16x16 blocks with a border so that the blur filter can be applied:

// offset(Y, X): linear index of the element at row Y, column X in a row-major
// buffer whose row length is blockDim.x * gridDim.x (the full width of the
// launch grid in threads).
// NOTE(review): this presumes the image width equals blockDim.x * gridDim.x;
// if the image is narrower or pitched, index with the real row stride instead.
// Both arguments and the whole expansion are parenthesized so that expression
// arguments such as offset(y - r, x) expand with the intended precedence.
#define offset(Y,X) ((X) + (Y) * blockDim.x * gridDim.x)
global
void Blur_Kernel (float* d_Data,float* d_Result,int imgW,int imgH)
{
//map from threadIdx/blockIdx to pixel position
int x= threadIdx.x+ blockIdx.x * blockDim.x ;
int y= threadIdx.y+ blockIdx.y * blockDim.y ;

``````int bw = blockDim.x;
int bh = blockDim.y;
__shared__ float d_DataS[16+2*r][16+2*r];

d_DataS[iy+r][ix+r]=d_Data[offset(y,x)];

``````

//boundary conditions
if (ix < r) {
if (x-r>=0 && x+bw<imgW){
// left
d_DataS[iy+r][ix]=d_Data[offset(y,x-r)];

``````    // right
d_DataS[iy+r][ix+r+bw]=d_Data[offset(x+bw,y)];
}
else {
d_DataS[iy+r][ix]=0;
d_DataS[iy+r][ix+r+bw]=0;
}
}
if (iy < r) {
if (y-r>=0 && y+bh<imgH ){
// top
d_DataS[iy][ix+r]=d_Data[offset(y-r,x)];

// bottom
d_DataS[iy+r+bh][ix+r]=d_Data[offset(y+bh,x)];
}
else {
d_DataS[iy][ix+r]=0;
d_DataS[iy+r+bh][ix+r]=0;
}
}

if ((ix < r) && (iy < r)) {

// top left
if (y-r>=0 && x-r>=0){

d_DataS[iy][ix]=d_Data[offset(y-r,x-r)];
}
else {
d_DataS[iy][ix]=0;
}
// bottom left
if (y+bh<imgH && x-r>=0){
d_DataS[iy+r+bh][ix]=d_Data[offset(y+bh,x-r)];
}
else {
d_DataS[iy+r+bh][ix]=0;
}
// top right
if (y-r>=0 && x+bh<imgW){

d_DataS[iy][ix+r+bw]=d_Data[offset(y-r,x+bh)];
}
else {
d_DataS[iy][ix+r+bw]=0;
}
// bottom right
if (y+bh<imgH && x+bw< imgW ){
d_DataS[iy+r+bh][ix+r+bh]=d_Data[offset(y+bh,x+bw)];
}
else{

d_DataS[iy+r+bh][ix+r+bh]=0;
}
}
``````

//convolution
for (int i=-r;i<r+1;i++){
for (int j=-r;j<r+1;j++)
d_DataS[iy+r][ix+r]+=d_DataS[iy+r+j][ix+r+i];