Here is some of my code：

```
__int64 i = threadIdx.x + blockDim.x * blockIdx.x;
if (i < kSize)
{
da_reverse[i] = d_a[kSize - 1 - i];
}
I tried to output da_reverse，and the result turns out not right: they are all zeros.
the following is my entire code, can you give me a hand?
__global__ void firls(__int64 freqSize, double* d_freq, __int64 amplitudeSize, double* d_amplitude, __int64 weightSize, double* d_weight, __int64 kSize, double* d_k, double* d_b, double* d_a, double* d_coefficients, double* da_reverse)
{
__int64 i = threadIdx.x + blockDim.x * blockIdx.x;
if (i < weightSize) // weightSize = 2;
{
d_weight[i] = 1;
}
if (i < freqSize) // freqSize = 4;
{
d_freq[i] = d_freq[i] * 0.5;
}
__int64 filterLength = (kSize - 1) * 2 + 1;
__int64 Nodd = filterLength % 2;
double b0 = 0;
if (Nodd == 0)
{
if (i < kSize)
{
d_k[i] = i + 0.5;
d_b[i] = 0;
}
}
else
{
if (i < kSize)
{
d_k[i] = i;
d_b[i] = 0;
}
}
for (int j = 0; j < freqSize; j += 2)
{
double slope = (d_amplitude[j + 1] - d_amplitude[j]) / (d_freq[j + 1] - d_freq[j]);
double b1 = d_amplitude[j] - slope * d_freq[j];
if (Nodd == 1)
{
b0 += (b1 * (d_freq[j + 1] - d_freq[j])) +
slope / 2.0 * (d_freq[j + 1] * d_freq[j + 1] - d_freq[j] * d_freq[j]) *
abs(d_weight[(j + 1) / 2] * d_weight[(j + 1) / 2]); // wt = abs(sqrt(complex(weight)));
}
if (i < kSize)
{
d_b[i] += (slope / (4 * PI * PI) *
(cos(2 * PI * d_k[i] * d_freq[j + 1]) - cos(2 * PI * d_k[i] * d_freq[j])) / (d_k[i] * d_k[i])) *
abs(d_weight[(j + 1) / 2] * d_weight[(j + 1) / 2]);
d_b[i] += (d_freq[j + 1] * (slope * d_freq[j + 1] + b1) * sinc(2 * d_k[i] * d_freq[j + 1]) -
d_freq[j] * (slope * d_freq[j] + b1) * sinc(2 * d_k[i] * d_freq[j])) *
abs(d_weight[(j + 1) / 2] * d_weight[(j + 1) / 2]);
}
}
if (Nodd == 1)
{
d_b[0] = b0;
}
if (i < kSize)
{
d_a[i] = d_weight[0] * d_weight[0] * 4 * d_b[i]; // until here the result of d_a is still right
}
if (i < kSize)
{
da_reverse[i] = d_a[kSize - 1 - i];
d_coefficients[i] = da_reverse[i]; // d_coefficients is the result i send back to host.
}
}
```

here is the code in main function:

```
__int64 length = 2 * n * maxFactor + 1;
dim3 ThreadPerBlock(512);
dim3 GridSize((length + ThreadPerBlock.x - 1) / ThreadPerBlock.x );
double firlsFreq = 1.0 / 2.0 / (double)maxFactor;
double firlsFreqs[] = { 0.0, 2.0 * firlsFreq, 2.0 * firlsFreq, 1.0 };
double* firlsFreqsV = (double*)malloc(4 * sizeof(double));
for (__int64 i = 0; i < 4; ++i)
firlsFreqsV[i] = firlsFreqs[i];
double firlsAmplitude[] = { 1.0, 1.0, 0.0, 0.0 };
double* firlsAmplitudeV = (double*)malloc(4 * sizeof(double));
for (__int64 i = 0; i < 4; ++i)
firlsAmplitudeV[i] = firlsAmplitude[i];
double* coefficients = (double*)malloc(length * sizeof(double));
double* d_weight;
double* d_k;
double* d_b;
double* d_a;
double* d_freq;
double* d_amplitude;
double* d_coefficients;
double* da_reverse;
__int64 freqSize = 4;
__int64 amplitudeSize = 4;
__int64 weightSize = 2;
__int64 kSize = length / 2 + 1;
cudaMalloc((void**)&d_weight, weightSize * sizeof(double));
cudaMalloc((void**)&d_k, kSize * sizeof(double));
cudaMalloc((void**)&d_b, kSize * sizeof(double));
cudaMalloc((void**)&d_a, kSize * sizeof(double));
cudaMalloc((void**)&d_freq, freqSize * sizeof(double));
cudaMalloc((void**)&d_amplitude, amplitudeSize * sizeof(double));
cudaMalloc((void**)&d_coefficients, length * sizeof(double));
cudaMalloc((void**)&da_reverse, kSize * sizeof(double));
cudaMemcpy(d_freq, firlsFreqsV, freqSize * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_amplitude, firlsAmplitudeV, amplitudeSize * sizeof(double), cudaMemcpyHostToDevice);
firls << <GridSize, ThreadPerBlock >> > (freqSize, d_freq, amplitudeSize, d_amplitude, weightSize, d_weight, kSize, d_k, d_b, d_a, d_coefficients,da_reverse);
cudaMemcpy(coefficients, d_coefficients, length * sizeof(double), cudaMemcpyDeviceToHost);
for (int i = 0; i < 1000; i++)
{
cout <<setprecision(20)<< coefficients[i] << "\t" << i << endl;
}
```

it turns out the output is all zero.