Hi,
I tried to implement a FIR of 13 coefficients. I took some test vector from a CPU version, so I know what I suppose obtained. The length of my sample is 64992. But I got a strange result just for small boundary(between 32769 and 32780) of data in the output array. All others result are good except for this small range. The parameters to launch my kernel are <<<254,256>>>. I tried to modify these parameters for another to verify if the problem are coming from a memory interference, but I got the same result.
The code
__global__ void GPU_Lite_filter(double *shaped,double *shapedOut)
{
volatile __shared__ double buff[267];
//Track the position
short tid = threadIdx.x;
short pos = blockIdx.x*blockDim.x+threadIdx.x;
double data = shaped[pos];
//loop unrolling
//coeff#1
buff[tid] = GPUcoeff.coefficient[0]*data;
__syncthreads();
//coeff#2
if(tid == ZLENGHTBLOCK)
buff[tid+1] = GPUcoeff.coefficient[1]*data;
else
buff[tid+1] += GPUcoeff.coefficient[1]*data;
__syncthreads();
//coeff#3
if(tid == ZLENGHTBLOCK)
buff[tid+2] = GPUcoeff.coefficient[2]*data;
else
buff[tid+2] += GPUcoeff.coefficient[2]*data;
__syncthreads();
//coeff#4
if(tid == ZLENGHTBLOCK)
buff[tid+3] = GPUcoeff.coefficient[3]*data;
else
buff[tid+3] += GPUcoeff.coefficient[3]*data;
__syncthreads();
//coeff#5
if(tid == ZLENGHTBLOCK)
buff[tid+4] = GPUcoeff.coefficient[4]*data;
else
buff[tid+4] += GPUcoeff.coefficient[4]*data;
__syncthreads();
//coeff#6
if(tid == ZLENGHTBLOCK)
buff[tid+5] = GPUcoeff.coefficient[5]*data;
else
buff[tid+5] += GPUcoeff.coefficient[5]*data;
__syncthreads();
//coeff#7
if(tid == ZLENGHTBLOCK)
buff[tid+6] = GPUcoeff.coefficient[6]*data;
else
buff[tid+6] += GPUcoeff.coefficient[6]*data;
__syncthreads();
//coeff#8
if(tid == ZLENGHTBLOCK)
buff[tid+7] = GPUcoeff.coefficient[7]*data;
else
buff[tid+7] += GPUcoeff.coefficient[7]*data;
__syncthreads();
//coeff#9
if(tid == ZLENGHTBLOCK)
buff[tid+8] = GPUcoeff.coefficient[8]*data;
else
buff[tid+8] += GPUcoeff.coefficient[8]*data;
__syncthreads();
//coeff#10
if(tid == ZLENGHTBLOCK)
buff[tid+9] = GPUcoeff.coefficient[9]*data;
else
buff[tid+9] += GPUcoeff.coefficient[9]*data;
__syncthreads();
//coeff#11
if(tid == ZLENGHTBLOCK)
buff[tid+10] = GPUcoeff.coefficient[10]*data;
else
buff[tid+10] += GPUcoeff.coefficient[10]*data;
__syncthreads();
//coeff#12
if(tid == ZLENGHTBLOCK)
buff[tid+11] = GPUcoeff.coefficient[11]*data;
else
buff[tid+11] += GPUcoeff.coefficient[11]*data;
__syncthreads();
//coeff#13
if(tid == ZLENGHTBLOCK)
buff[tid+12] = GPUcoeff.coefficient[12]*data;
else
buff[tid+12] += GPUcoeff.coefficient[12]*data;
__syncthreads();
atomicAddDouble(&shapedOut[pos], buff[tid]);
if(tid<12)
{
atomicAddDouble(&shapedOut[pos+LENGHTBLOCK], buff[LENGHTBLOCK+tid]);
}
}
Did you know where this problem coming?