I wrote a new version of the program that directly measures the time between the end of the previous launch and the start of the current launch and accumulates statistics.
As in my previous tests, the statistics apply to the final 50 of 200 launches.
Typical results:
[HAGS disabled:] k_direct() stats (us): N = 50, Min = 1.024000, Max = 15.872000, Mean = 2.723840, Sigma = 2.100299
[HAGS enabled: ] k_direct() stats (us): N = 50, Min = 20.736000, Max = 43.008000, Mean = 28.748800, Sigma = 4.693744
A particularly bad case:
[HAGS enabled: ] k_direct() stats (us): N = 50, Min = 17.408000, Max = 77.824000, Mean = 33.582080, Sigma = 14.642904
Partial code listing:
// Device-side state that persists across successive k_direct() launches.

// Statistics accumulator; allocated with device-side `new` by launch I == 0.
__device__ TStats* StatsPtr { 0 };

// NS_Clock() reading taken at the very end of the previous launch.
// Stored as int64_t -- the same type k_direct() keeps NS_Clock() readings in --
// rather than double: a double has a 53-bit mantissa, so a nanosecond count
// above 2^53 (~9.0e15) cannot be stored exactly and the launch-to-launch gap
// would be quantised to the double's rounding step.
// NOTE(review): NS_Clock() is not visible here; if it returns an epoch-based
// nanosecond count (~1e18) that step is 256 ns -- confirm.
__device__ int64_t TNS_EndPrev { 0 };

// Kernel that busy-waits for NS_Kernel ns and accumulates statistics on the gap
// between the end of the previous launch and the start of this one.
//
//   I           index of the current launch
//   I_TimeFrst  first launch index to be timed (0 is treated as 1, since
//               launch 0 has no previous launch to measure against)
//   I_TimeLast  last launch index to be timed; statistics are computed and
//               printed during this launch
//   NS_Kernel   minimum time, in NS_Clock() units, that the launch spins for
//
// "Timed" means: measure the time since the end of the previous launch.
// Only the thread with threadIdx.x == 0 does any bookkeeping.
// NOTE(review): there is no blockIdx check, so this presumably expects a
// single-block launch -- verify against the launch site.
__global__ void k_direct( int64_t I, int64_t I_TimeFrst, int64_t I_TimeLast, int64_t NS_Kernel ) {
    // Read the clock before anything else so the sample is as close as
    // possible to the actual start of the launch.
    int64_t Beg = NS_Clock();
    int64_t End = 0ll;

    if ( threadIdx.x == 0 ) {
        // Launch 0 cannot be timed: TNS_EndPrev has not been written yet.
        if ( I_TimeFrst == 0 ) I_TimeFrst = 1;

        if ( I == 0 ) StatsPtr = new TStats();
        if ( StatsPtr == 0 ) return; // fail silently
        TStats& Stats = *StatsPtr;

        // Subtract in exact 64-bit integer arithmetic; only the (small)
        // difference is converted to double for the statistics.
        if ( ( I >= I_TimeFrst )
          && ( I <= I_TimeLast ) ) Stats.Submit( double( Beg - TNS_EndPrev ) );

        do { End = NS_Clock(); } while ( ( End - Beg ) < NS_Kernel ); // wait

        if ( I == I_TimeLast ) { // if last launch to be timed:
            Stats.Compute();
            Stats.Print( "k_direct() stats (us): ", 0.001 ); // print stats scaled by 0.001
        }

        // Sampled last so that the next launch measures from the true end
        // of this one.
        TNS_EndPrev = NS_Clock();
    }
}