CUcontext ctx; //!< [in] if NULL, the current CUcontext is used
size_t counterDataImageSize; //!< [in] size calculated from cuptiProfilerCounterDataImageCalculateSize
uint8_t* pCounterDataImage; //!< [in] address of CounterDataImage
size_t counterDataScratchBufferSize; //!< [in] size calculated from cuptiProfilerCounterDataImageInitializeScratchBuffer
uint8_t* pCounterDataScratchBuffer; //!< [in] address of CounterDataImage scratch buffer
uint8_t bDumpCounterDataInFile; //!< [in] [optional]
const char* pCounterDataFilePath; //!< [in] [optional]
CUpti_ProfilerRange range; //!< [in] CUpti_ProfilerRange
CUpti_ProfilerReplayMode replayMode; //!< [in] CUpti_ProfilerReplayMode
/* Replay options, required when replay is done by cupti user */
size_t maxRangesPerPass; //!< [in] Maximum number of ranges that can be recorded in a single pass.
size_t maxLaunchesPerPass; //!< [in] Maximum number of kernel launches that can be recorded in a single pass; must be >= maxRangesPerPass.
} CUpti_Profiler_BeginSession_Params;
What scenarios are maxRangesPerPass and maxLaunchesPerPass used for?
What is the difference between maxRangesPerPass and maxNumRanges in struct CUpti_Profiler_CounterDataImageOptions? I think they should be the same value.
maxRangesPerPass → Collecting a set of metrics may need multiple passes, with a different set of counters collected in each pass. (Metrics are basically collections of counters.) For this, one may need to replay the CUDA workload multiple times. This parameter says how many ranges will be replayed in a pass block (a BeginPass/EndPass API block).
For Auto range mode this will be the number of kernels launched, and for User range mode it's the number of times the Push/Pop Range API is called to define a range. Note: For kernel replay mode, CUPTI doesn't have a pass concept, so this parameter should be set to 1.
maxLaunchesPerPass → This field is not used, and one needs to set the value to 1.
To understand the difference between maxRangesPerPass and maxNumRanges in struct CUpti_Profiler_CounterDataImageOptions, let's consider the code snippet below.
// Illustrative example: two pass blocks within a single profiling session.
// BeginSession and SetConfig API calls
// ...

// 1st Pass Block
// The loop repeats the enclosed launches until EndPass reports that all
// passes have been submitted.
do {
    cuptiProfilerBeginPass(&beginPassParams);
    {
        cuptiProfilerEnableProfiling(&enableProfilingParams);
        KernelA<<<>>>(); // range index 0
        KernelB<<<>>>(); // range index 1
        cuptiProfilerDisableProfiling(&disableProfilingParams);
    }
    KernelC<<<>>>(); // range index 2 (will not be profiled as it's outside enable/disable block)
    {
        cuptiProfilerEnableProfiling(&enableProfilingParams);
        KernelD<<<>>>(); // range index 3
        cuptiProfilerDisableProfiling(&disableProfilingParams);
    }
    cuptiProfilerEndPass(&endPassParams);
} while (!endPassParams.allPassesSubmitted);

KernelE<<<>>>(); // Will not be profiled as it is outside the Pass block

// 2nd Pass Block
// Same replay loop as above, for a second group of launches.
do {
    cuptiProfilerBeginPass(&beginPassParams);
    {
        cuptiProfilerEnableProfiling(&enableProfilingParams);
        KernelF<<<>>>(); // range index 4
        KernelG<<<>>>(); // range index 5
        cuptiProfilerDisableProfiling(&disableProfilingParams);
    }
    cuptiProfilerEndPass(&endPassParams);
} while (!endPassParams.allPassesSubmitted);

// Flush all the profiling data to counter data image
cuptiProfilerFlushCounterData(&flushCounterDataParams);
// ...
// UnsetConfig and EndSession API calls

// Note: for the code snippet, we have skipped the save-restore
// part which can be done using cupti checkpoint API.
In the code snippet, we have 2 pass blocks. In the first one, we launch 4 kernels, but KernelC is not profiled as it's outside the Enable/Disable block, so 3 kernels are profiled. In the 2nd block we have 2 kernel launches and both are profiled. So here maxRangesPerPass will be equal to 3, i.e. the maximum number of kernels/ranges profiled inside a single pass block.
Now, maxNumRanges in struct CUpti_Profiler_CounterDataImageOptions specifies how many ranges' data can be stored in a counter data image. So in the above case, to get profiling data for all the kernels, maxNumRanges needs to be set to 5.