Hi everybody. I’m trying to use CUDA to filter a digital signal. Here is the formula: $y[n] = \sum_{i=0}^{N-1} b_i \, x[n-i]$, where $N$ is the number of filter coefficients,
x[n] is the input signal,
y[n] is the output signal, and
$b_i$ are the filter coefficients.
The filter has 512 coefficients, and the input signal has 1048576 samples.
Here is the code in C++:
// Reference direct-form FIR filter:
//   outp[i] = sum over j in [0, taps) of inf[i-j] * koef[j]
// where taps is n_koef_ch, reduced near the start of the signal so that
// inf[i-j] never goes below index 0.
for (int i = n_inf - 1; i >= 0; i--)
{
    // Only i+1 past samples exist for output i. The clamp is kept in a local
    // so n_koef_ch itself is left untouched and the loop can be run again
    // (the earlier version overwrote n_koef_ch, leaving it at 1 afterwards).
    const int taps = (i + 1 < n_koef_ch) ? (i + 1) : n_koef_ch;

    // With zero taps the inner loop does not run and the output stays 0.
    outp[i] = 0;
    for (int j = 0; j < taps; j++)
    {
        outp[i] += inf[i - j] * koef[j];
    }
}
The code in CUDA (all the coefficients are in constant memory, in the dc_koef array):
...
// 1-D launch: the block size equals the number of filter coefficients.
dim3 threads(n_koef, 1, 1);
// Integer division: only whole blocks are launched, so any remainder of
// n_inf that does not fill a complete block gets no threads.
dim3 blocks(n_inf / threads.x, 1, 1);
mult<<<blocks, threads>>>(d_inf, d_outp, n_koef, n_inf);
...
// FIR kernel: every thread produces one output sample from a tile of the
// input that the block first stages in shared memory.
//
// Launch layout: 1-D grid of 1-D blocks.
// Preconditions:
//   * blockDim.x + n_koef - 1 <= 1024 (capacity of the static shared tile);
//   * dc_koef (constant-memory array declared elsewhere in the file) holds
//     at least n_koef coefficients.
// Result for every idx < n_inf:
//   d_outp[n_inf-1-idx] = sum over i in [0, n_koef) of d_inf[idx+i] * dc_koef[i]
// where input positions at or beyond n_inf are treated as 0.
__global__ void mult(float* d_inf, float* d_outp, int n_koef, int n_inf)
{
    __shared__ float tile[1024];

    const int block_start = blockIdx.x * blockDim.x;
    const int idx = block_start + threadIdx.x;

    // Thread t of the block reads tile[t .. t+n_koef-1], so the block as a
    // whole needs blockDim.x + n_koef - 1 consecutive input samples. The
    // strided loop fills the tile for any block size, and the bounds check
    // replaces the out-of-range reads the last block used to perform.
    const int tile_len = blockDim.x + n_koef - 1;
    for (int t = threadIdx.x; t < tile_len; t += blockDim.x)
    {
        const int src = block_start + t;
        tile[t] = (src < n_inf) ? d_inf[src] : 0.0f;
    }

    // Every thread of the block reaches this barrier; the early exit below
    // is deliberately placed after it.
    __syncthreads();

    if (idx >= n_inf)
    {
        return;
    }

    // Per-thread accumulator (it was previously used without a declaration).
    float sum = 0.0f;
    for (int i = 0; i < n_koef; i++)
    {
        sum += tile[threadIdx.x + i] * dc_koef[i];
    }

    // The output is stored in reversed order relative to idx.
    d_outp[n_inf - 1 - idx] = sum;
}
And here is the version using the IPP signal processing library:
// Filters n_inf samples of inf with the n_koef coefficients in koef using the
// Intel IPP FIR routines and writes the result to outp.
//
// The FIR state is created and released inside this call. If the state
// cannot be created the function returns without calling the filter.
__host__ void ippfilt(int n_inf, int n_koef, float* inf, float* koef, float* outp)
{
    IppsFIRState_32f *fctx = NULL;

    // NULL is passed as the last argument, as in the original call.
    IppStatus status = ippsFIRInitAlloc_32f( &fctx, koef, n_koef, NULL );
    if (status != ippStsNoErr)
    {
        return;
    }

    status = ippsFIR_32f(inf, outp, n_inf, fctx);
    (void)status; // the void interface gives no way to report a filter error

    // Release the state allocated above; it was previously leaked on every call.
    ippsFIRFree_32f(fctx);
}
Hardware: Intel Core™2 Duo (E4500) 2.2 GHz; GeForce 9600GT Palit Sonic
The CUDA function runs 43 times faster than the C++ code, but about 6 times slower than the version using the IPP library.
Please tell me, what am I doing wrong?