The kernel calculates the mean square using a window that moves along each column (the signal can be treated as a matrix of NumSamples (width) × NumSweeps (height)).

/**
 * Sliding-window sum of squared magnitudes along each column of `signal`.
 *
 * `signal` is indexed as signal[sweep * NumSamples + sample], i.e. a row-major
 * matrix with NumSamples columns and NumSweeps rows. Each thread owns one
 * column (`sample`) and walks down it, so threads with consecutive indices
 * touch consecutive elements of every row.
 *
 * For output row r (0 <= r <= NumSweeps - window_size) the kernel stores
 *     di_signal[r * NumSamples + sample] = sum over sweep in [r, r + window_size)
 *                                          of (x^2 + y^2)
 * so `di_signal` receives NumSweeps - window_size + 1 rows.
 *
 * @param signal      input samples, NumSweeps x NumSamples float2 values
 * @param di_signal   output, (NumSweeps - window_size + 1) x NumSamples floats
 * @param window_size number of sweeps in the window; must be in [1, NumSweeps]
 * @param NumSamples  number of columns (row stride of both buffers)
 * @param NumSweeps   number of rows in `signal`
 *
 * NOTE(review): the window sum is updated incrementally (add newest, subtract
 * oldest), so float rounding error can accumulate over long columns.
 */
extern "C" __global__ void HeapCalc(float2 *signal, float *di_signal, int window_size, int NumSamples, int NumSweeps)
{
    // NumSamples is split into blocks, so each thread handles a separate column.
    int sample = blockIdx.x * blockDim.x + threadIdx.x;

    // The grid may be rounded up past the last column; those threads must not
    // touch memory.
    if (sample >= NumSamples)
        return;

    // A window that is empty or longer than the column would index outside
    // both buffers, so there is nothing valid to compute.
    if (window_size < 1 || window_size > NumSweeps)
        return;

    // Index in size_t so sweep * NumSamples cannot overflow int on large inputs.
    const size_t stride = static_cast<size_t>(NumSamples);
    const size_t column = static_cast<size_t>(sample);

    // Sum of squares over the first window_size sweeps.
    float Di = 0;
    for (int sweep = 0; sweep < window_size; sweep++)
    {
        // Load the element once instead of indexing global memory four times.
        const float2 v = signal[sweep * stride + column];
        Di += (v.x * v.x + v.y * v.y);
    }
    di_signal[column] = Di;

    // Main cycle: add the element entering the window, remove the one leaving
    // it, and store the result for the window's new starting row.
    for (int sweep = window_size; sweep < NumSweeps; sweep++)
    {
        const float2 incoming = signal[sweep * stride + column];
        const float2 outgoing = signal[(sweep - window_size) * stride + column];
        Di += (incoming.x * incoming.x + incoming.y * incoming.y);
        Di -= (outgoing.x * outgoing.x + outgoing.y * outgoing.y);
        di_signal[(sweep - window_size + 1) * stride + column] = Di;
    }
}

Is this kernel optimal, or are there other tricks for calculating window functions?