When I time cudaMemcpyToSymbol with cudaEventRecord and cudaEventElapsedTime, the measured duration is longer when the program runs under Nsight Systems than when it runs on its own.
// Measures the average time of one cudaMemcpyToSymbol call by bracketing
// `times` back-to-back copies with two CUDA events recorded on stream 0 and
// dividing the elapsed time by `times`.
//
// NOTE(review): the interval covers everything that happens in stream 0
// between the two records. If cudaMemcpyToSymbol blocks the host here
// (presumably it does when h_a is ordinary pageable memory -- confirm), the
// per-call host-side API overhead is part of the measurement, so a tool that
// intercepts every CUDA API call can inflate the result slightly.
cudaEvent_t start_event = nullptr, stop_event = nullptr;
float single_time = 0.0f;

// Reports a failed CUDA runtime call on stderr; yields true on success.
auto cuda_ok = [](cudaError_t err, const char *what) {
    if (err != cudaSuccess) {
        std::cerr << what << " failed: " << cudaGetErrorString(err) << std::endl;
    }
    return err == cudaSuccess;
};

// Averaging over zero (or fewer) iterations is meaningless and would divide
// by zero below, so refuse to time in that case.
bool timing_ok = times > 0;
if (!timing_ok) {
    std::cerr << "times must be positive" << std::endl;
}

timing_ok = timing_ok && cuda_ok(cudaEventCreate(&start_event), "cudaEventCreate(start_event)");
timing_ok = timing_ok && cuda_ok(cudaEventCreate(&stop_event), "cudaEventCreate(stop_event)");
timing_ok = timing_ok && cuda_ok(cudaEventRecord(start_event, 0), "cudaEventRecord(start_event)");

// Stop at the first failing copy: timing the remaining iterations would only
// produce a misleading average.
for (int i = 0; timing_ok && i < times; ++i) {
    timing_ok = cuda_ok(cudaMemcpyToSymbol(d_global_array, h_a, nbytes, 0, cudaMemcpyHostToDevice),
                        "cudaMemcpyToSymbol");
}

timing_ok = timing_ok && cuda_ok(cudaEventRecord(stop_event, 0), "cudaEventRecord(stop_event)");
// The stop event must have completed before the elapsed time can be queried.
timing_ok = timing_ok && cuda_ok(cudaEventSynchronize(stop_event), "cudaEventSynchronize(stop_event)");
timing_ok = timing_ok && cuda_ok(cudaEventElapsedTime(&single_time, start_event, stop_event),
                                 "cudaEventElapsedTime");

if (timing_ok) {
    single_time = single_time / times;  // total milliseconds -> milliseconds per copy
    std::cout << "time:" << single_time << " ms" << std::endl;
}

// Release the events whether or not the measurement succeeded.
if (start_event != nullptr) {
    cuda_ok(cudaEventDestroy(start_event), "cudaEventDestroy(start_event)");
}
if (stop_event != nullptr) {
    cuda_ok(cudaEventDestroy(stop_event), "cudaEventDestroy(stop_event)");
}
The results, first without and then with Nsight Systems, are:
user@cdserver:~/nvidia/cuda-samples/Samples/0_Introduction/simpleStreams$ ./cudamemcpytosymbol
time:0.0034125 ms
user@cdserver:~/nvidia/cuda-samples/Samples/0_Introduction/simpleStreams$ nsys -v
NVIDIA Nsight Systems version 2025.2.1.130-252135690618v0
user@cdserver:~/nvidia/cuda-samples/Samples/0_Introduction/simpleStreams$ nsys profile -t cuda ./cudamemcpytosymbol
WARNING: CPU IP/backtrace sampling not supported, disabling.
Try the 'nsys status --environment' command to learn more.
WARNING: CPU context switch tracing not supported, disabling.
Try the 'nsys status --environment' command to learn more.
Collecting data...
time:0.0039857 ms
Generating '/tmp/nsys-report-d885.qdstrm'
[1/1] [========================100%] report1.nsys-rep
Generated:
/home/user/nvidia/cuda-samples/Samples/0_Introduction/simpleStreams/report3.nsys-rep
It’s just an interesting little experiment. I wonder whether it would be more accurate to record timestamps on the CPU instead, or whether Nsight Systems itself adds this overhead to the measurement. How can I eliminate this difference?