Hi, I ran into a problem while implementing a custom profiler using the Range Profiling API (User Range + User Replay). Any metric related to time collected in user range mode comes out much higher than the same metric collected with auto range profiling on the same kernel.
I tested with range_profiling.cu (under the range_profiling directory) from the official CUPTI samples in CUDA Toolkit 13.0 and modified it slightly to profile only a single “vectorAdd” kernel. Here are the results from auto range and user range:
Auto Range
Range Name: 0
-----------------------------------------------------------------------------------
l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum 12500.000
sm__throughput.avg.pct_of_peak_sustained_elapsed 1.979
sm__pipe_shared_cycles_active.max 32.000
gpu__time_duration.max 3488.000
gpc__cycles_elapsed.avg.per_second 1352883355.177
gpc__cycles_elapsed.max 4723.000
sm__cycles_active.max 1740.000
sm__pipe_alu_cycles_active.max 96.000
dram__throughput.avg.pct_of_peak_sustained_elapsed 1.164
gpu__time_duration.sum 3488.000
sm__pipe_tensor_cycles_active.max 32.000
l1tex__t_requests_pipe_lsu_mem_global_op_st.sum 1563.000
smsp__inst_executed.sum 25038.000
smsp__sass_inst_executed_op_global_ld.sum 3126.000
l1tex__t_sectors_pipe_lsu_mem_global_op_st.sum 6250.000
smsp__sass_inst_executed_op_global_st.sum 1563.000
l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum 3126.000
sm__pipe_fma_cycles_active.max 128.000
-----------------------------------------------------------------------------------
User Range
Range Name: VectorAdd
-----------------------------------------------------------------------------------
l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum 12500.000
sm__throughput.avg.pct_of_peak_sustained_elapsed 0.259
sm__pipe_shared_cycles_active.max 32.000
gpu__time_duration.max 1799744.000
gpc__cycles_elapsed.avg.per_second 1200339048.220
gpc__cycles_elapsed.max 2165576.000
sm__cycles_active.max 2224.000
sm__pipe_alu_cycles_active.max 96.000
dram__throughput.avg.pct_of_peak_sustained_elapsed 0.018
gpu__time_duration.sum 1799744.000
sm__pipe_tensor_cycles_active.max 32.000
l1tex__t_requests_pipe_lsu_mem_global_op_st.sum 1563.000
smsp__inst_executed.sum 25038.000
smsp__sass_inst_executed_op_global_ld.sum 3126.000
l1tex__t_sectors_pipe_lsu_mem_global_op_st.sum 6250.000
smsp__sass_inst_executed_op_global_st.sum 1563.000
l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum 3126.000
sm__pipe_fma_cycles_active.max 128.000
-----------------------------------------------------------------------------------
Since both runs profile the same kernel, several metrics match (or nearly match), for example:
- l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum
- smsp__sass_inst_executed_op_global_st.sum
- sm__pipe_fma_cycles_active.max
- sm__cycles_active.max (close but not identical: 1740 vs. 2224)
But for metrics that are directly or indirectly related to time, there is a huge difference:
- gpu__time_duration.max
- sm__throughput.avg.pct_of_peak_sustained_elapsed
- dram__throughput.avg.pct_of_peak_sustained_elapsed
You can see that gpu__time_duration.max jumped from 3488 to 1799744 (roughly 500x), even though the SM active cycle counts are similar, which is interesting.
Could anyone explain why this difference exists? Is time measured differently in auto range mode versus user range mode?
Here is the code I am running. The only things I changed from the official sample are the checkpoint save/restore, the nesting level settings, the synchronization after the kernel call, and the wall-clock time measurement:
int main(int argc, char *argv[])
{
    ParsedArgs args = parseArgs(argc, argv);
    DRIVER_API_CALL(cuInit(0));
    printf("Starting Range Profiling\n");

    // Get the current ctx for the device
    CUdevice cuDevice;
    DRIVER_API_CALL(cuDeviceGet(&cuDevice, args.deviceIndex));
    ProfilingDeviceSupportStatus(cuDevice);

    int computeCapabilityMajor = 0, computeCapabilityMinor = 0;
    DRIVER_API_CALL(cuDeviceGetAttribute(&computeCapabilityMajor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
    DRIVER_API_CALL(cuDeviceGetAttribute(&computeCapabilityMinor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
    printf("Compute Capability of Device: %d.%d\n", computeCapabilityMajor, computeCapabilityMinor);
    if (computeCapabilityMajor < 7 || (computeCapabilityMajor == 7 && computeCapabilityMinor < 5))
    {
        std::cerr << "Range Profiling is supported only on devices with compute capability 7.5 and above" << std::endl;
        exit(EXIT_FAILURE);
    }

    RangeProfilerConfig config;
    config.maxNumOfRanges = args.maxRange;
    config.minNestingLevel = 1;
    config.numOfNestingLevel = 1;

    CuptiProfilerHostPtr pCuptiProfilerHost = std::make_shared<CuptiProfilerHost>();

    // Create a context
    CUcontext cuContext;
    DRIVER_API_CALL(cuCtxCreate(&cuContext, 0, cuDevice));
    RangeProfilerTargetPtr pRangeProfilerTarget = std::make_shared<RangeProfilerTarget>(cuContext, config);

    // Get chip name
    std::string chipName;
    CUPTI_API_CALL(RangeProfilerTarget::GetChipName(cuDevice, chipName));

    // Get Counter availability image
    std::vector<uint8_t> counterAvailabilityImage;
    CUPTI_API_CALL(RangeProfilerTarget::GetCounterAvailabilityImage(cuContext, counterAvailabilityImage));

    // Create config image
    std::vector<uint8_t> configImage;
    pCuptiProfilerHost->SetUp(chipName, counterAvailabilityImage);
    CUPTI_API_CALL(pCuptiProfilerHost->CreateConfigImage(args.metrics, configImage));

    // Set up the workload
    VectorLaunchWorkLoad vectorLaunchWorkLoad;
    vectorLaunchWorkLoad.SetUp();

    // Enable Range profiler
    CUPTI_API_CALL(pRangeProfilerTarget->EnableRangeProfiler());

    // Create CounterData Image
    std::vector<uint8_t> counterDataImage;
    CUPTI_API_CALL(pRangeProfilerTarget->CreateCounterDataImage(args.metrics, counterDataImage));

    // Set range profiler configuration
    printf("Range Mode: %s\n", args.rangeMode.c_str());
    printf("Replay Mode: %s\n", args.replayMode.c_str());
    CUPTI_API_CALL(pRangeProfilerTarget->SetConfig(
        args.rangeMode == "auto" ? CUPTI_AutoRange : CUPTI_UserRange,
        args.replayMode == "kernel" ? CUPTI_KernelReplay : CUPTI_UserReplay,
        configImage,
        counterDataImage
    ));

    using NV::Cupti::Checkpoint::CUpti_Checkpoint;
    CUpti_Checkpoint handle{CUpti_Checkpoint_STRUCT_SIZE};
    handle.ctx = cuContext;
    handle.optimizations = 0;

    int passes = 0;
    do
    {
        if (passes == 0) {
            CUPTI_API_CALL(cuptiCheckpointSave(&handle));
        } else {
            CUPTI_API_CALL(cuptiCheckpointRestore(&handle));
        }
        std::cout << "Starting Pass: " << passes << std::endl;

        // Start Range Profiling
        CUPTI_API_CALL(pRangeProfilerTarget->StartRangeProfiler());
        {
            auto start = std::chrono::high_resolution_clock::now();

            // Push Range (Level 1)
            CUPTI_API_CALL(pRangeProfilerTarget->PushRange("VectorAdd"));
            vectorLaunchWorkLoad.LaunchKernel();
            cudaDeviceSynchronize();
            CUPTI_API_CALL(pRangeProfilerTarget->PopRange());

            auto end = std::chrono::high_resolution_clock::now();
            std::chrono::duration<double, std::nano> duration = end - start;
            std::cout << "Host workload duration: " << duration.count() << " nanoseconds\n";
        }

        // Stop Range Profiling
        CUPTI_API_CALL(pRangeProfilerTarget->StopRangeProfiler());
        passes++;
    }
    while (!pRangeProfilerTarget->IsAllPassSubmitted());

    // Get Profiler Data
    CUPTI_API_CALL(pRangeProfilerTarget->DecodeCounterData());

    // Evaluate the results
    size_t numRanges = 0;
    CUPTI_API_CALL(pCuptiProfilerHost->GetNumOfRanges(counterDataImage, numRanges));
    for (size_t rangeIndex = 0; rangeIndex < numRanges; ++rangeIndex)
    {
        CUPTI_API_CALL(pCuptiProfilerHost->EvaluateCounterData(rangeIndex, args.metrics, counterDataImage));
    }
    pCuptiProfilerHost->PrintProfilerRanges();

    // Clean up
    CUPTI_API_CALL(pRangeProfilerTarget->DisableRangeProfiler());
    pCuptiProfilerHost->TearDown();
    vectorLaunchWorkLoad.TearDown();
    DRIVER_API_CALL(cuCtxDestroy(cuContext));
    return 0;
}
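For reference, here is a minimal standalone sketch (not part of the CUPTI sample; the vectorAdd kernel and sizes are hypothetical placeholders) of how the kernel's wall-clock duration could be cross-checked with CUDA events, independent of the range profiler, to see which of the two gpu__time_duration values it is closer to:

// Standalone cross-check (hypothetical kernel and placeholder sizes), independent of CUPTI:
// time just the vectorAdd kernel with CUDA events.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void vectorAdd(const float* a, const float* b, float* c, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        c[i] = a[i] + b[i];
}

int main()
{
    const int n = 50000;  // placeholder element count
    float *a, *b, *c;
    cudaMalloc(&a, n * sizeof(float));
    cudaMalloc(&b, n * sizeof(float));
    cudaMalloc(&c, n * sizeof(float));

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // Record events immediately around the launch so only the kernel is timed.
    cudaEventRecord(start);
    vectorAdd<<<(n + 255) / 256, 256>>>(a, b, c, n);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    printf("vectorAdd kernel time: %.3f us\n", ms * 1000.0f);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(a);
    cudaFree(b);
    cudaFree(c);
    return 0;
}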