Increased time to synchronize streams via event record/wait when HAGS is enabled

On Windows 10 with HAGS disabled, I measure inter-kernel gaps of ~2 microseconds when the kernels are in different streams and are synchronized via event record/wait.

When I enable HAGS, I measure 20 to 30 microseconds between the end of one kernel and the start of the next when the kernels are in different streams and the streams are synchronized via events. This seems excessive for something that could be done entirely on the device.
Is this to be expected?

My host program alternates between two streams:

Stream A:  L0 R0    W1 L2 R2 ...
Stream B:     W0 L1 R1 ...

Li = kernel launch i, Ri = record event i, Wi = wait on event i. I do not reuse any of the events.
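
In outline (the complete listing is below), iteration i of the host loop does:

  cudaStream_t  Stream  =  ( i & 1 )  ?  StreamB  :  StreamA;                     // alternate streams
  if ( i > 0 )  cudaStreamWaitEvent( Stream, Events[ i - 1 ] );                   // W(i-1)
  k_delay <<< 1, 256, 0, Stream >>> ( i, I_TimeFrst, I_TimeLast, NS_Kernel );     // Li
  cudaEventRecord( Events[ i ], Stream );                                         // Ri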

I create the streams with cudaStreamNonBlocking and do not use stream 0.
I create all events in advance using cudaEventDisableTiming.

The kernel runs for 100 us, so the CPU should have no trouble staying ahead of the device, and I get similar results when I increase the kernel run time.

If I change the program to send everything to one stream, including the event records/waits, the inter-kernel time is ~2 us.

The Visual Profiler confirms the 100 us kernel run time and the 20-30 us gap between kernels (or the ~2 us gap when everything goes to one stream).

I am using a GTX 1070 Ti and Windows 10, with HAGS enabled.

Complete code:


#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <cuda/std/chrono>

#include <stdio.h>
#include <iostream>
#include <iomanip>
#include <stdint.h>

#include "cuda_profiler_api.h"


// return device-reported wall time in ns
__device__    inline  int64_t  NS_Clock() {
  auto                          TimeSinceEpoch_ns  =  cuda::std::chrono::duration_cast<cuda::std::chrono::nanoseconds>( cuda::std::chrono::system_clock::now().time_since_epoch() );
  return  static_cast<int64_t>( TimeSinceEpoch_ns.count() );
}


int  RunTest_Events( cudaStream_t StreamA, cudaStream_t StreamB );


int  main() {

  cudaError_t   ce;

  ce = cudaSetDevice( 0 );
  if ( ce != cudaSuccess ) {
    std::cout  << "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?";    return 1;
  }


  cudaStream_t  StreamA;  int PriorityA  =   0;
  cudaStream_t  StreamB;  int PriorityB  =   0;

  ce  =  cudaStreamCreateWithPriority( & StreamA, cudaStreamNonBlocking, PriorityA );
//ce  =  cudaStreamCreateWithPriority( & StreamA, cudaStreamDefault,     PriorityA );
  if ( ce != cudaSuccess ) {
    std::cout  << "cudaStreamCreateWithPriority [A] failed!";    return 1;
  }

  ce  =  cudaStreamCreateWithPriority( & StreamB, cudaStreamNonBlocking, PriorityB );
//ce  =  cudaStreamCreateWithPriority( & StreamB, cudaStreamDefault,     PriorityB );
  if ( ce != cudaSuccess ) {
    std::cout  << "cudaStreamCreateWithPriority [B] failed!";    return 1;
  }



// ****************** The Tests ****************** //
  int    Result  =  0;
         Result  =  RunTest_Events( StreamA, StreamB );
    if ( Result )  return  Result;

  std::cout << std::endl;
// *********************************************** //



  // cudaDeviceReset must be called before exiting in order for profiling and
  // tracing tools such as Nsight and Visual Profiler to show complete traces.
  ce = cudaDeviceReset();
  if ( ce != cudaSuccess ) {
    std::cout  << "cudaDeviceReset failed!";
    return 1;
  }


  return 0;
}

__device__  int64_t   TNS_Beg { 0 };
__device__  int64_t   TNS_End { 0 };



// kernel that waits for NS_Kernel ns and does little else
// I... are launch indexes: current, first/last to be timed
__global__  void  k_delay(  int64_t I, int64_t I_TimeFrst, int64_t I_TimeLast, int64_t NS_Kernel ) {
  int64_t  Beg  =  NS_Clock();
  int64_t  End  =         0ll;
  if ( threadIdx.x  ==  0 ) {
    if ( I    ==  I_TimeFrst )   TNS_Beg  =  Beg;         // if first launch to be timed: save the beg time to global memory
    do { End   =  NS_Clock(); }  while ( ( End - Beg )  <  NS_Kernel );           // wait
    if ( I    ==  I_TimeLast  ) {                         // if last  launch to be timed:
//    TNS_End  =  End;                                    //   save the end time to global memory (optional)
      int64_t  N  =  I_TimeLast - I_TimeFrst + 1ll;       // the number of launches that were timed
      double   US_RecWaitAvg  =  ( ( End - TNS_Beg ) - N * NS_Kernel ) * 0.001 / N;                // mean inter-kernel gap in us: (elapsed - time spent inside kernels) / N
      printf( "\n\n%lld k_delay() launches  -->  %f us between launches\n\n", N, US_RecWaitAvg );
    }
  }
}



int  RunTest_Events( cudaStream_t StreamA, cudaStream_t StreamB ) {
  const  uint32_t    N_ThrPerBlk     =   256;
  const  uint32_t    N_BlkPerLaunch  =     1;

  const  int64_t     NS_Kernel       =   100'000ll;         // the execution time of the kernel, in ns
  const  int64_t     N_Events        =   200;               // this is the same as the number of launches
  const  int64_t     I_TimeFrst      =   N_Events -  50;    // the first of the timed launches
  const  int64_t     I_TimeLast      =   N_Events -   1;    // the last  of the timed launches
  

  cudaError_t        ce;
  
  cudaEvent_t        Events[   N_Events ];
  for ( uint32_t  i  =  0; i < N_Events; ++ i ) {
//       ce  =  cudaEventCreateWithFlags( Events + i, cudaEventDefault       );   // this seems to have little effect on the results
         ce  =  cudaEventCreateWithFlags( Events + i, cudaEventDisableTiming );
    if ( ce != cudaSuccess ) {
      std::cout  << "\nRunTest_Events() could not create event: " << cudaGetErrorString(ce);    return 1;
    }
  }


  cudaEvent_t*  EventPrev  =  0;
  for ( int64_t  i = 0;  i < N_Events;  ++ i ) {

      if ( i == I_TimeFrst ) { cudaProfilerStart();  }
      if ( i == I_TimeLast ) { cudaProfilerStop();   }

    cudaStream_t&  Stream         =       ( i  &  0x01 )  ?  StreamB  :  StreamA;       // alternate streams
//  cudaStream_t&  Stream         =       ( i  &  0x01 )  ?  StreamA  :  StreamA;       // send everything to one stream
    if ( EventPrev ) {
           ce  = cudaStreamWaitEvent( Stream, *EventPrev );                             // wait on event recorded in the other stream, if any
      if ( ce != cudaSuccess ) {
        std::cout  << "\ncudaStreamWaitEvent() failed: " << cudaGetErrorString(ce);    return 1;
      }
    }

    k_delay <<< N_BlkPerLaunch, N_ThrPerBlk, 0, Stream  >>> (  i, I_TimeFrst, I_TimeLast, NS_Kernel );
         ce  = cudaGetLastError();
    if ( ce != cudaSuccess ) {
      std::cout  << "\nk_delay() launch failed: " << cudaGetErrorString(ce);    return 1;
    }

    cudaEvent_t&                EventRecorded  =  Events[  i ];                         // event to be recorded this loop, after the kernel launch
         ce  = cudaEventRecord( EventRecorded, Stream );                                // record the event in the same stream as the kernel launch
    if ( ce != cudaSuccess ) {
      std::cout  << "\ncudaEventRecord() failed: " << cudaGetErrorString(ce);    return 1;
    }

    EventPrev  =  & EventRecorded;
  }
 
  
       ce  = cudaDeviceSynchronize();          // synchronize before reusing any events
  if ( ce != cudaSuccess ) {
    std::cout  << "\ncudaDeviceSynchronize() returned error code after launching kernels!";
    std::cout  << "\nError string: " << cudaGetErrorString(ce);    return 1;
  }


  std::cout  << "total launches:    " <<   N_Events              << " launches"        << std::endl;
  std::cout  << "blk per launch:    " <<   N_BlkPerLaunch        << " blk/launch"      << std::endl;
  std::cout  << "thr per block:     " <<   N_ThrPerBlk           << " thr/blk"         << std::endl;
// timing results are now computed and printed by the kernel itself

  return  0;
}
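
Nothing special is needed to build this; a plain nvcc invocation on a recent CUDA toolkit works (the code needs C++14 and libcu++ for cuda::std::chrono; the file/binary name here is arbitrary, matching the t132 in the reply below):

  nvcc -o t132 t132.cu
  ./t132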

It wouldn't surprise me if WDDM were involved. There are numerous questions on various forums about the inefficiencies associated with WDDM work issuance.

When I run on an L4 GPU on Linux I get:

# ./t132


50 k_delay() launches  -->  1.355520 us between launches

total launches:    200 launches
blk per launch:    1 blk/launch
thr per block:     256 thr/blk

Is the “1.355520 us between launches” the figure of merit here? I haven't studied your code, and you haven't shown an actual printout of a run to compare against.

Yes. Here is my output:

50 k_delay() launches  -->  28.040960 us between launches

total launches:    200 launches
blk per launch:    1 blk/launch
thr per block:     256 thr/blk

I have noticed that this result tends to vary more than other things I have measured; that shows up in the Visual Profiler as well, e.g. gaps of 20 us to 50 us.

P.S. If it matters, the GPU is not used for display.
Regarding WDDM, I had performance issues in the past that seemed WDDM-related, but they went away once I enabled HAGS.

I wrote a new version of the program that directly measures the time between the end of the previous launch and the start of the current launch and accumulates statistics.

As in my previous tests, the statistics apply to the final 50 of 200 launches.

typical results:
[HAGS disabled:] k_direct() stats (us): N = 50, Min = 1.024000, Max = 15.872000, Mean = 2.723840, Sigma = 2.100299

[HAGS enabled:] k_direct() stats (us): N = 50, Min = 20.736000, Max = 43.008000, Mean = 28.748800, Sigma = 4.693744

a particularly bad case:
[HAGS enabled:] k_direct() stats (us): N = 50, Min = 17.408000, Max = 77.824000, Mean = 33.582080, Sigma = 14.642904

Partial code listing:

__device__  TStats*  StatsPtr    { 0 };      // device-side stats accumulator, allocated by launch 0
__device__  int64_t  TNS_EndPrev { 0 };      // end time of the previous launch, in ns (int64_t because a double cannot represent ns-since-epoch exactly)


// kernel that waits for NS_Kernel ns and accumulates statistics
// I... are launch indexes: current, first/last to be timed, where "timed" means measure the time since the end of the previous launch
__global__  void  k_direct(  int64_t I, int64_t I_TimeFrst, int64_t I_TimeLast, int64_t NS_Kernel ) {
  int64_t  Beg  =  NS_Clock();
  int64_t  End  =         0ll;
  if ( threadIdx.x  ==  0 ) {
    if (   I_TimeFrst ==  0 )  I_TimeFrst  =  1;      // never time launch 0: TNS_EndPrev is not valid until a launch has completed
    if (        I     ==  0            )  StatsPtr  =  new  TStats();
    if ( StatsPtr     ==  0            )  return;   // fail silently
    TStats&  Stats     =  *StatsPtr;
    if (      ( I     >=  I_TimeFrst )
          &&  ( I     <=  I_TimeLast ) )  Stats.Submit( Beg - TNS_EndPrev );
    do { End   =  NS_Clock(); }  while ( ( End - Beg )  <  NS_Kernel );           // wait
    if ( I    ==  I_TimeLast  ) {                         // if last  launch to be timed:
      Stats.Compute();
      Stats.Print( "k_direct() stats (us):  ", 0.001 );   // print stats scaled by 0.001
    }
    TNS_EndPrev  =  NS_Clock();
  }
}
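}

TStats itself is not part of the partial listing (presumably the host side mirrors RunTest_Events, with k_direct in place of k_delay). For anyone who wants to reproduce this, a minimal device-side stand-in matching the calls above (Submit/Compute/Print) could look like the following; the buffer size and the population-sigma formula are my assumptions, and it would have to be defined before the StatsPtr declaration:

// Hypothetical stand-in for TStats; buffer size and sigma formula are assumptions.
struct  TStats {
  static const int64_t  MaxN  =  1024;                 // enough for the 50 timed launches
  double   Samples[ MaxN ];
  int64_t  N     { 0 };
  double   Min   { 0 },  Max { 0 },  Mean { 0 },  Sigma { 0 };

  __device__  void  Submit( double X )  {  if ( N < MaxN )  Samples[ N ++ ]  =  X;  }

  __device__  void  Compute() {
    if ( N == 0 )  return;
    double  Sum  =  0.0,  Sum2  =  0.0;
    Min  =  Max  =  Samples[ 0 ];
    for ( int64_t  i = 0;  i < N;  ++ i ) {
      double  X  =  Samples[ i ];
      Sum  +=  X;   Sum2  +=  X * X;
      if ( X < Min )  Min  =  X;
      if ( X > Max )  Max  =  X;
    }
    Mean   =  Sum / N;
    Sigma  =  sqrt( Sum2 / N  -  Mean * Mean );        // population sigma
  }

  __device__  void  Print( const char* Title, double Scale ) {
    printf( "\n\n%s N = %lld, Min = %f, Max = %f, Mean = %f, Sigma = %f\n\n",
            Title, N, Min * Scale, Max * Scale, Mean * Scale, Sigma * Scale );
  }
};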