With Hardware-Accelerated GPU Scheduling (HAGS) disabled on Windows 10, I measure inter-kernel gaps of ~2 us when the kernels are in different streams synchronized via event record/wait.
With HAGS enabled, the gap between the end of one kernel and the start of the next grows to 20-30 us. That seems excessive for synchronization that could be done entirely on the device.
Is this to be expected?
My host program alternates between two streams:
Stream A: L0 R0 W1 L2 R2 ...
Stream B: W0 L1 R1 ...
Li = kernel launch i, Ri = record event i, Wi = wait on event i. I do not reuse any of the events.
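Condensed, the launch loop looks like this (a sketch of the loop from the full program below; error checking stripped):

cudaEvent_t* EventPrev = nullptr;
for ( int64_t i = 0; i < N_Events; ++i ) {
    cudaStream_t& Stream = ( i & 0x01 ) ? StreamB : StreamA;    // alternate streams
    if ( EventPrev ) cudaStreamWaitEvent( Stream, *EventPrev ); // W(i-1): wait on the event recorded last iteration
    k_delay <<< 1, 256, 0, Stream >>> ( i, I_TimeFrst, I_TimeLast, NS_Kernel ); // Li: launch kernel i
    cudaEventRecord( Events[ i ], Stream );                     // Ri: record event i in the same stream
    EventPrev = & Events[ i ];
}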
I create the streams with cudaStreamNonBlocking and do not use stream 0.
I create all events in advance using cudaEventDisableTiming.
The kernel runs for 100 us, so the CPU should have no trouble staying ahead of the device, and I get similar results when I increase the kernel run time.
If I change the program to send everything to one stream, including the event records/waits, the inter-kernel time drops to ~2 us.
The Visual Profiler confirms the 100 us kernel run time and the 20-30 us gap between kernels (or the ~2 us gap when only one stream is used).
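For reference, the gap the kernel prints is derived as follows: N timed launches serialized by the events take elapsed = N * NS_Kernel + (N - 1) * gap, so gap = (elapsed - N * NS_Kernel) / (N - 1).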
I am using a GTX 1070 Ti and Windows 10, with HAGS enabled.
Complete code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda/std/chrono>
#include <stdio.h>
#include <iostream>
#include <iomanip>
#include <stdint.h>
#include "cuda_profiler_api.h"
// return device-reported wall time in ns
__device__ inline int64_t NS_Clock() {
    auto TimeSinceEpoch_ns = cuda::std::chrono::duration_cast<cuda::std::chrono::nanoseconds>(
        cuda::std::chrono::system_clock::now().time_since_epoch() );
    return static_cast<int64_t>( TimeSinceEpoch_ns.count() );
}
int RunTest_Events( cudaStream_t StreamA , cudaStream_t StreamB );
int main() {
    cudaError_t ce;
    ce = cudaSetDevice( 0 );
    if ( ce != cudaSuccess ) {
        std::cout << "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?"; return 1;
    }
    cudaStream_t StreamA; int PriorityA = 0;
    cudaStream_t StreamB; int PriorityB = 0;
    ce = cudaStreamCreateWithPriority( & StreamA, cudaStreamNonBlocking, PriorityA );
    //ce = cudaStreamCreateWithPriority( & StreamA, cudaStreamDefault, PriorityA );
    if ( ce != cudaSuccess ) {
        std::cout << "cudaStreamCreateWithPriority [A] failed!"; return 1;
    }
    ce = cudaStreamCreateWithPriority( & StreamB, cudaStreamNonBlocking, PriorityB );
    //ce = cudaStreamCreateWithPriority( & StreamB, cudaStreamDefault, PriorityB );
    if ( ce != cudaSuccess ) {
        std::cout << "cudaStreamCreateWithPriority [B] failed!"; return 1;
    }
    // ****************** The Tests ****************** //
    int Result = 0;
    Result = RunTest_Events( StreamA, StreamB );
    if ( Result ) return Result;
    std::cout << std::endl;
    // *********************************************** //
    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    ce = cudaDeviceReset();
    if ( ce != cudaSuccess ) {
        std::cout << "cudaDeviceReset failed!";
        return 1;
    }
    return 0;
}
__device__ int64_t TNS_Beg { 0 };
__device__ int64_t TNS_End { 0 };
// kernel that busy-waits for NS_Kernel ns and does little else
// I is the current launch index; I_TimeFrst / I_TimeLast bound the launches to be timed
__global__ void k_delay( int64_t I, int64_t I_TimeFrst, int64_t I_TimeLast, int64_t NS_Kernel ) {
    int64_t Beg = NS_Clock();
    int64_t End = 0ll;
    if ( threadIdx.x == 0 ) {
        if ( I == I_TimeFrst ) TNS_Beg = Beg; // if first launch to be timed: save the beg time to global memory
        do { End = NS_Clock(); } while ( ( End - Beg ) < NS_Kernel ); // busy-wait
        if ( I == I_TimeLast ) { // if last launch to be timed:
            // TNS_End = End; // save the end time to global memory (optional)
            int64_t N = I_TimeLast - I_TimeFrst + 1ll; // the number of launches that were timed
            // N serialized launches have N - 1 gaps between them
            double US_RecWaitAvg = ( ( End - TNS_Beg ) - N * NS_Kernel ) * 0.001 / ( N - 1 ); // inter-kernel time in microseconds
            printf( "\n\n%lld k_delay() launches --> %f us between launches\n\n", N, US_RecWaitAvg );
        }
    }
}
int RunTest_Events( cudaStream_t StreamA, cudaStream_t StreamB ) {
    const uint32_t N_ThrPerBlk = 256;
    const uint32_t N_BlkPerLaunch = 1;
    const int64_t NS_Kernel = 100'000ll; // the execution time of the kernel, in ns
    const int64_t N_Events = 200; // this is also the number of launches
    const int64_t I_TimeFrst = N_Events - 50; // the first of the timed launches
    const int64_t I_TimeLast = N_Events - 1; // the last of the timed launches
    cudaError_t ce;
    cudaEvent_t Events[ N_Events ];
    for ( int64_t i = 0; i < N_Events; ++i ) {
        // ce = cudaEventCreateWithFlags( Events + i, cudaEventDefault ); // this seems to have little effect on the results
        ce = cudaEventCreateWithFlags( Events + i, cudaEventDisableTiming );
        if ( ce != cudaSuccess ) {
            std::cout << "\nRunTest_Events() could not create event: " << cudaGetErrorString(ce); return 1;
        }
    }
    cudaEvent_t* EventPrev = nullptr;
    for ( int64_t i = 0; i < N_Events; ++i ) {
        if ( i == I_TimeFrst ) { cudaProfilerStart(); }
        if ( i == I_TimeLast ) { cudaProfilerStop(); }
        cudaStream_t& Stream = ( i & 0x01 ) ? StreamB : StreamA; // alternate streams
        // cudaStream_t& Stream = ( i & 0x01 ) ? StreamA : StreamA; // send everything to one stream
        if ( EventPrev ) {
            ce = cudaStreamWaitEvent( Stream, *EventPrev ); // wait on the event recorded in the other stream, if any
            if ( ce != cudaSuccess ) {
                std::cout << "\ncudaStreamWaitEvent() failed: " << cudaGetErrorString(ce); return 1;
            }
        }
        k_delay <<< N_BlkPerLaunch, N_ThrPerBlk, 0, Stream >>> ( i, I_TimeFrst, I_TimeLast, NS_Kernel );
        ce = cudaGetLastError();
        if ( ce != cudaSuccess ) {
            std::cout << "\nk_delay() launch failed: " << cudaGetErrorString(ce); return 1;
        }
        cudaEvent_t& EventRecorded = Events[ i ]; // event to be recorded this loop, after the kernel launch
        ce = cudaEventRecord( EventRecorded, Stream ); // record the event in the same stream as the kernel launch
        if ( ce != cudaSuccess ) {
            std::cout << "\ncudaEventRecord() failed: " << cudaGetErrorString(ce); return 1;
        }
        EventPrev = & EventRecorded;
    }
    ce = cudaDeviceSynchronize(); // wait for all work to finish (this also flushes the kernel's printf output)
    if ( ce != cudaSuccess ) {
        std::cout << "\ncudaDeviceSynchronize() returned error code after launching kernels!";
        std::cout << "\nError string: " << cudaGetErrorString(ce); return 1;
    }
    for ( int64_t i = 0; i < N_Events; ++i ) cudaEventDestroy( Events[ i ] ); // clean up the events
    std::cout << "total launches: " << N_Events << " launches" << std::endl;
    std::cout << "blk per launch: " << N_BlkPerLaunch << " blk/launch" << std::endl;
    std::cout << "thr per block:  " << N_ThrPerBlk << " thr/blk" << std::endl;
    // the timing result is computed and printed by the kernel itself
    return 0;
}
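In case anyone wants to reproduce this, a build line along these lines should work (file name assumed; sm_61 matches the GTX 1070 Ti, and cuda/std/chrono needs a reasonably recent toolkit):

nvcc -O2 -arch=sm_61 -o event_gap_test event_gap_test.cu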