Hi All!

This is my first time here, so please excuse any inconsistencies in this post.

I have to calculate the quadrature of the signal in the first array around the peak positions that are contained in the second array.

```
             (float *)volts                       (int *)peaks
s   [0..........................8159]   [0..........................215]
e   [0..........................8159]   [0..........................215]
g   [0..........................8159]   [0..........................215]
m   [0..........................8159]   [0..........................215]
e
n     *    *    *    *    *    *          *    *    *    *    *    *
t
_   [0..........................8159]   [0..........................215]
c                                  ^                                  ^
n                     segment_size-1                        peaks_cnt-1
t
```

The function “quadratures” runs with the maximum number of threads per block allowed by the Nvidia card and blocks = segment_cnt / max_threads + 1.

The function “quadratures_v2” is running with blocks=segment_cnt and threads=peak_cnt.

Why is the second variant slower than the first? The segment count is >= 1e5.

Both functions are below. Thanks in advance!

```
// Integrates the signal of each segment in a fixed-width window around every
// peak position and stores the scaled window sums.
//
// Launch layout: 1D grid of 1D blocks, one thread per segment. The intended
// configuration has gridDim.x * blockDim.x >= segment_cnt; smaller launches
// are also valid because each thread strides over the segments by the total
// thread count. No shared memory is used.
//
// Parameters:
//   volts                 input; segment_cnt consecutive segments of
//                         segment_size floats each
//   peaks                 peaks_cnt sample indices, relative to the start of
//                         a segment; the same list is used for every segment
//   peaks_cnt             number of entries in peaks
//   quads                 output; segment_cnt consecutive rows of peaks_cnt
//                         floats, quads[segment * peaks_cnt + i] for peak i
//   segment_cnt           number of segments
//   segment_size          number of samples per segment
//   scaler                factor applied to every window sum; the product is
//                         formed in double and then narrowed to float
//   integration_interval  window width in samples; the window covers
//                         [peaks[i] - integration_interval / 2,
//                          peaks[i] - integration_interval / 2 + integration_interval)
//
// Samples of a window that fall outside [0, segment_size) are skipped, so a
// peak close to a segment edge never reads a neighbouring segment or memory
// outside the volts buffer.
__global__ void quadratures(float *volts, int *peaks, int peaks_cnt, volatile float *quads,
                            int segment_cnt, int segment_size, double scaler, int integration_interval) {
    // Loop-invariant: distance from a peak back to the first sample of its window.
    const int half_window = integration_interval / 2;

    // Widen before multiplying so the flat index cannot wrap in 32 bits.
    const long long stride = (long long)gridDim.x * blockDim.x;
    for (long long segment = (long long)blockIdx.x * blockDim.x + threadIdx.x;
         segment < segment_cnt; segment += stride) {
        const float *segment_volts = volts + (size_t)segment * segment_size;
        // Keep the volatile qualifier of quads on the output pointer.
        volatile float *segment_quads = quads + (size_t)segment * peaks_cnt;

        // Integration for each pulse in the segment.
        for (int i = 0; i < peaks_cnt; ++i) {
            const int window_begin = peaks[i] - half_window;
            const int window_end = window_begin + integration_interval;
            // Clamp the window to the samples that belong to this segment.
            const int first = window_begin > 0 ? window_begin : 0;
            const int last = window_end < segment_size ? window_end : segment_size;

            float voltage_integral = 0.0f;
            for (int pos = first; pos < last; ++pos) {
                voltage_integral += segment_volts[pos];
            }
            segment_quads[i] = (float)(voltage_integral * scaler);
        }
    }
}
// Integrates the signal in a fixed-width window around a peak position, one
// (segment, peak) pair per thread, and stores the scaled window sums.
//
// Launch layout: 1D grid of 1D blocks. The intended configuration is
// blocks = segment count and threads = peaks count, which maps block b to
// segment b and thread t to peak t. Smaller grids or blocks are also valid:
// blocks stride over the segments by gridDim.x and threads stride over the
// peaks by blockDim.x, so peaks_cnt may exceed the per-block thread limit.
// No shared memory is used.
//
// Parameters:
//   volts                 input; segment_cnt consecutive segments of
//                         segment_size floats each
//   peaks                 peaks_cnt sample indices, relative to the start of
//                         a segment; the same list is used for every segment
//   peaks_cnt             number of entries in peaks
//   quads                 output; segment_cnt consecutive rows of peaks_cnt
//                         floats, quads[segment * peaks_cnt + i] for peak i
//   segment_cnt           number of segments
//   segment_size          number of samples per segment
//   scaler                factor applied to every window sum; the product is
//                         formed in double and then narrowed to float
//   integration_interval  window width in samples; the window covers
//                         [peaks[i] - integration_interval / 2,
//                          peaks[i] - integration_interval / 2 + integration_interval)
//
// Samples of a window that fall outside [0, segment_size) are skipped, so a
// peak close to a segment edge never reads a neighbouring segment or memory
// outside the volts buffer.
__global__ void quadratures_v2(float *volts, int *peaks, int peaks_cnt, volatile float *quads,
                               int segment_cnt, int segment_size, double scaler, int integration_interval) {
    // Loop-invariant: distance from a peak back to the first sample of its window.
    const int half_window = integration_interval / 2;

    // Signed 64-bit loop variables: no unsigned/signed comparison against the
    // int counts and no wrap-around when the stride is added.
    for (long long segment = blockIdx.x; segment < segment_cnt; segment += gridDim.x) {
        const float *segment_volts = volts + (size_t)segment * segment_size;
        // Keep the volatile qualifier of quads on the output pointer.
        volatile float *segment_quads = quads + (size_t)segment * peaks_cnt;

        for (long long i = threadIdx.x; i < peaks_cnt; i += blockDim.x) {
            const int window_begin = peaks[i] - half_window;
            const int window_end = window_begin + integration_interval;
            // Clamp the window to the samples that belong to this segment.
            const int first = window_begin > 0 ? window_begin : 0;
            const int last = window_end < segment_size ? window_end : segment_size;

            // Integration around the pulse position of the signal.
            float voltage_integral = 0.0f;
            for (int pos = first; pos < last; ++pos) {
                voltage_integral += segment_volts[pos];
            }
            segment_quads[i] = (float)(voltage_integral * scaler);
        }
    }
}
```