Dear all,

I am trying to port a convolution function I have to CUDA.

The following is the definition of the convolution function:

```
// H=61 LHHALF=30
// convolve_cwp_1( LH, -LHHALF, h, n, 0, x, n, 0, y);
/*
 * Convolve sequence x with sequence y and store the result in z.
 *
 * Each sequence is described by (length, index of first sample, samples):
 *   lx, ifx, x  - input sequence x, logical indices ifx .. ifx+lx-1
 *   ly, ify, y  - input sequence y, logical indices ify .. ify+ly-1
 *   lz, ifz, z  - output sequence z, logical indices ifz .. ifz+lz-1
 *
 * For every output index i the function computes
 *   z[i] = sum over j of x[j] * y[i-j]
 * where j is limited to the values for which both x[j] and y[i-j] lie
 * inside their sequences. An empty j range yields z[i] = 0.
 */
void convolve_cwp_1(int lx, int ifx, float *x, int ly, int ify, float *y, int lz, int ifz, float *z)
{
    /* Logical index of the last sample of each sequence. */
    const int last_x = ifx + lx - 1;
    const int last_y = ify + ly - 1;
    const int last_z = ifz + lz - 1;

    /* Rebased views so the arrays can be addressed by logical index. */
    float *xs = x - ifx;
    float *ys = y - ify;
    float *zs = z - ifz;

    int out = ifz;

    while (out <= last_z) {
        /* Clamp the summation range so xs[k] and ys[out - k] both exist. */
        const int k_first = (out - last_y < ifx) ? ifx : out - last_y;
        const int k_last = (out - ify > last_x) ? last_x : out - ify;
        float acc = 0.0f;
        int k;

        for (k = k_first; k <= k_last; ++k)
            acc += xs[k] * ys[out - k];

        zs[out] = acc;
        ++out;
    }
}
```

I have converted this function to the following CUDA kernel:

```
// CUDA kernel computing the same convolution as convolve_cwp_1:
//   z[i] = sum over j of x[j] * y[i-j]   for i in [ifz, ilz]
//
// Preconditions (the caller's responsibility):
//   - x, y and z are pointers the kernel can dereference and have ALREADY
//     been shifted by the caller (x -= ifx; y -= ify; z -= ifz;), so they are
//     addressed here by logical sample index.
//   - ilx = ifx + lx - 1, ily = ify + ly - 1, ilz = ifz + lz - 1
//     (index of the last sample of each sequence).
//   - lx, ly and lz are kept only for signature parity with the host
//     function; the kernel does not read them.
//
// Launch layout: 1-D grid of 1-D blocks, no shared memory. Each thread
// produces one or more output samples via a grid-stride loop, so any grid
// size gives a correct result; (lz + threads - 1) / threads blocks gives one
// output per thread.
//
// NOTE(review): if x is a short filter (e.g. 61 taps, as in the sample call),
// keeping it in __constant__ or shared memory may reduce global-memory
// traffic; profile before changing.
__global__ void cuda_convoution(int lx, int ifx, float *x, int ly, int ify, float *y, int lz, int ifz, float *z, int ilx, int ily, int ilz)
{
    // 64-bit arithmetic so the index/stride math cannot overflow int.
    const long long stride = (long long)gridDim.x * blockDim.x;
    const long long tid = (long long)blockIdx.x * blockDim.x + threadIdx.x;

    // Thread 0 maps to the first output index ifz (not 0), matching the
    // host loop "for (i = ifz; i <= ilz; ++i)".
    for (long long idx = (long long)ifz + tid; idx <= ilz; idx += stride) {
        const int i = (int)idx;

        // Clamp j so that both x[j] and y[i-j] are inside their sequences.
        int jlow = i - ily;
        if (jlow < ifx) jlow = ifx;
        int jhigh = i - ify;
        if (jhigh > ilx) jhigh = ilx;

        float sum = 0.0f;  // float literal: avoid double promotion
        for (int j = jlow; j <= jhigh; ++j)
            sum += x[j] * y[i - j];

        z[i] = sum;
    }
}
```

I am new to CUDA. Please let me know whether this is correct, and suggest any further optimizations I can make.