strange kernel

I have two kernels. I suppose they are doing the same task, but i get different results. Why?
(Sorry my browser does not insert CODEBOX)

#define FFT_SIZE 16
extern “C” global void CDC1(float2 *signal, int *out, int NumSweeps, int NumSamples)
{
int sample = (blockIdx.x * blockDim.x + threadIdx.x);
float2 a[FFT_SIZE];
for (int sweep = 0; sweep < NumSweeps; sweep++)
{
for (int k = 0; k < FFT_SIZE; k++) a[k] = signal[(sweep + k) * NumSamples + sample];
FFT16(a);
float max = -1;
for (int t = 0; t < FFT_SIZE; t++)
{
float ampl = (a[t].x * a[t].x + a[t].y * a[t].y);
if (ampl > max) max = ampl;
}
out[sweep * NumSamples + sample] = (int)max;
}
}
extern “C” global void CDC2(float2 *signal, int *out, int NumSweeps, int NumSamples)
{
int sample = (blockIdx.x * blockDim.x + threadIdx.x);
float2 a[FFT_SIZE];
for (int k = 1; k < FFT_SIZE; k++) a[k] = signal[(k - 1) * NumSamples + sample];
for (int sweep = 0; sweep < NumSweeps; sweep++)
{
a[0] = a[1];
a[1] = a[2];
a[2] = a[3];
a[3] = a[4];
a[4] = a[5];
a[5] = a[6];
a[6] = a[7];
a[7] = a[8];
a[8] = a[9];
a[9] = a[10];
a[10] = a[11];
a[11] = a[12];
a[12] = a[13];
a[13] = a[14];
a[14] = a[15];
a[15] = signal[(sweep + FFT_SIZE) * NumSamples + sample];
FFT16(a);
float max = -1;
for (int t = 0; t < FFT_SIZE; t++)
{
float ampl = (a[t].x * a[t].x + a[t].y * a[t].y);
if (ampl > max) max = ampl;
}
out[sweep * NumSamples + sample] = (int)max;
}
}