Different default-stream per-thread behaviour when profiling with nvprof

I am testing how I can overlap cudaMemcpy calls with kernel execution without having to use cudaMallocHost, i.e. pinned memory (for contrast, the usual pinned-memory pattern is sketched at the end of this post). This is the example I have come up with:

#include <atomic>
#include <chrono>   // std::chrono timing used in main
#include <cstdint>  // uint32_t
#include <cstdio>
#include <iostream>
#include <thread>
#include <vector>

#define gpuErrchk(ans) \
  { gpuAssert((ans), __FILE__, __LINE__); }
__host__ inline void gpuAssert(cudaError_t code, const char* file, int line,
                               bool abort = true) {
  if (code != cudaSuccess) {
    printf("GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort) exit(EXIT_FAILURE);
  }
}

#define gpuErrchkInternal(ans, file, line) \
  { gpuAssert((ans), file, line); }

#define kernelStreamCheck(stream) \
  kernelStreamCheckFunc(stream, __FILE__, __LINE__)
__host__ inline void kernelStreamCheckFunc(cudaStream_t stream,
                                           const char* file, int line) {
  gpuErrchkInternal(cudaStreamSynchronize(stream), file, line);
  gpuErrchkInternal(cudaPeekAtLastError(), file, line);
}

// Kernel that busy-waits for roughly sleep_time device clock cycles.
// clock64() avoids the signed 32-bit wraparound of clock().
__global__ void sleepKernel(size_t sleep_time) {
  long long start = clock64();
  long long clock_offset = 0;
  while (clock_offset < static_cast<long long>(sleep_time)) {
    clock_offset = clock64() - start;
  }
}

__host__ int main() {
  using ItemType = int;
  cudaDeviceProp prop;
  gpuErrchk(cudaGetDeviceProperties(&prop, 0));
  // Size the buffers to roughly a tenth of device memory
  size_t const num_items = prop.totalGlobalMem / (10 * sizeof(ItemType));
  
  std::vector<ItemType> vec(num_items, 0);

  // Divide indices into chunks
  uint32_t const num_chunks = 10;
  size_t const chunk_size = num_items / num_chunks;
  size_t const last_chunk_size = chunk_size + num_items % num_chunks;

  ItemType* d_items;
  gpuErrchk(cudaMalloc(&d_items, num_items * sizeof(ItemType)));

  ItemType* d_results;
  gpuErrchk(cudaMalloc(&d_results, num_items * sizeof(ItemType)));

  std::atomic<uint32_t> chunk_index(0);

  std::vector<ItemType> results(num_items);

  // Copy all chunks serially from a separate host thread.
  // vec is captured by reference so the lambda does not copy the whole vector.
  std::thread t_HtoD(
      [=, &vec](std::atomic<uint32_t>& chunk_index) {
        for (uint32_t i = 0; i < num_chunks; ++i) {
          size_t const current_chunk_size =
              i == (num_chunks - 1) ? last_chunk_size : chunk_size;
          gpuErrchk(cudaMemcpy(
              d_items + chunk_size * i, vec.data() + chunk_size * i,
              current_chunk_size * sizeof(ItemType), cudaMemcpyHostToDevice));
          std::cout << "Chunk " << i << " of " << num_chunks << " copied"
                    << std::endl;
          chunk_index++;
        }
      },
      std::ref(chunk_index));

  for (uint32_t i = 1; i <= num_chunks; ++i) {
    while (chunk_index < i) {
      std::this_thread::yield();
    }
    std::cout << "Chunk " << i << " of " << num_chunks << " started"
              << std::endl;
    auto const start = std::chrono::high_resolution_clock::now();
    sleepKernel<<<1, 1>>>(10000000);
    kernelStreamCheck(0);
    auto const end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> const time_span = end - start;
    std::cout << "Chunk " << i << " of " << num_chunks << " slept"
              << " in " << time_span.count() << " seconds" << std::endl;
  }
  t_HtoD.join();

  // copy results back to host
  gpuErrchk(cudaMemcpy(results.data(), d_results, num_items * sizeof(ItemType),
                       cudaMemcpyDeviceToHost));
  gpuErrchk(cudaFree(d_items));

  gpuErrchk(cudaFree(d_results));

  return 0;
}

The code is compiled with nvcc -o example -arch=sm_75 --default-stream per-thread test.cu under CUDA 12.4, so that each host thread gets its own default stream, and run on an RTX 2080 Ti on Linux. When run by itself, I get output similar to this:

Chunk 0 of 10 copied
Chunk 1 of 10 started
Chunk 1 of 10 copied
Chunk 2 of 10 copied
Chunk 1 of 10 slept in 0.0680279 seconds
Chunk 2 of 10 started
Chunk 2 of 10 slept in 0.00734371 seconds
Chunk 3 of 10 started
Chunk 3 of 10 slept in 0.00734193 seconds
Chunk 3 of 10 copied
Chunk 4 of 10 started
Chunk 4 of 10 slept in 0.00734278 seconds
Chunk 4 of 10 copied
Chunk 5 of 10 started
Chunk 5 of 10 slept in 0.00734305 seconds
Chunk 5 of 10 copied
Chunk 6 of 10 started
Chunk 6 of 10 slept in 0.0054466 seconds
Chunk 6 of 10 copied
Chunk 7 of 10 started
Chunk 7 of 10 slept in 0.00544596 seconds
Chunk 7 of 10 copied
Chunk 8 of 10 started
Chunk 8 of 10 slept in 0.00544566 seconds
Chunk 8 of 10 copied
Chunk 9 of 10 started
Chunk 9 of 10 slept in 0.00544588 seconds
Chunk 9 of 10 copied
Chunk 10 of 10 started
Chunk 10 of 10 slept in 0.00543664 seconds

This is what I would expect. But when profiling with nvprof --export-profile "./test2.nvvp" -f --trace api,gpu ./example, the output tends to be much less interleaved, for example:

Chunk 0 of 10 copied
Chunk 1 of 10 started
Chunk 1 of 10 copied
Chunk 2 of 10 copied
Chunk 3 of 10 copied
Chunk 4 of 10 copied
Chunk 5 of 10 copied
Chunk 6 of 10 copied
Chunk 7 of 10 copied
Chunk 8 of 10 copied
Chunk 9 of 10 copied
Chunk 1 of 10 slept in 0.275931 seconds
Chunk 2 of 10 started
Chunk 2 of 10 slept in 0.00544324 seconds
Chunk 3 of 10 started
Chunk 3 of 10 slept in 0.00544005 seconds
Chunk 4 of 10 started
Chunk 4 of 10 slept in 0.00544078 seconds
Chunk 5 of 10 started
Chunk 5 of 10 slept in 0.00543852 seconds
Chunk 6 of 10 started
Chunk 6 of 10 slept in 0.00543788 seconds
Chunk 7 of 10 started
Chunk 7 of 10 slept in 0.00543785 seconds
Chunk 8 of 10 started
Chunk 8 of 10 slept in 0.00543824 seconds
Chunk 9 of 10 started
Chunk 9 of 10 slept in 0.00543945 seconds
Chunk 10 of 10 started
Chunk 10 of 10 slept in 0.00544015 seconds

This seems to defeat the purpose of profiling in this case, since from the profile alone I would have concluded that the program was not overlapping the copies and kernels correctly. Is this expected behaviour from nvprof?
Thank you for the help!
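
For reference, the conventional pattern I am trying to avoid is pinned host memory from cudaMallocHost plus cudaMemcpyAsync and kernels on explicit streams. A minimal sketch of that pattern (chunk count, buffer size and the dummy kernel are placeholders, error checking omitted; this is not the code I am actually running):

#include <cuda_runtime.h>
#include <cstddef>

__global__ void dummyKernel(int* data, size_t n) {
  size_t const i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] += 1;
}

int main() {
  size_t const num_items = 1 << 24;
  int const num_chunks = 4;
  size_t const chunk_size = num_items / num_chunks;

  int* h_items;  // pinned host buffer, required for truly asynchronous copies
  cudaMallocHost((void**)&h_items, num_items * sizeof(int));
  int* d_items;
  cudaMalloc((void**)&d_items, num_items * sizeof(int));

  cudaStream_t streams[num_chunks];
  for (int i = 0; i < num_chunks; ++i) cudaStreamCreate(&streams[i]);

  for (int i = 0; i < num_chunks; ++i) {
    size_t const offset = i * chunk_size;
    // Copy and kernel for chunk i share a stream, so the kernel waits for its
    // own data, while chunk i+1's copy can overlap with chunk i's kernel.
    cudaMemcpyAsync(d_items + offset, h_items + offset,
                    chunk_size * sizeof(int), cudaMemcpyHostToDevice,
                    streams[i]);
    dummyKernel<<<(chunk_size + 255) / 256, 256, 0, streams[i]>>>(
        d_items + offset, chunk_size);
  }
  cudaDeviceSynchronize();

  for (int i = 0; i < num_chunks; ++i) cudaStreamDestroy(streams[i]);
  cudaFree(d_items);
  cudaFreeHost(h_items);
  return 0;
}

The per-thread default stream version above is my attempt to get similar copy/kernel overlap without the pinned allocation.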

I wouldn’t recommend using nvprof with any GPU for which the newer tools (Nsight Systems and Nsight Compute) are supported. Profiler behavior questions should probably be directed to the relevant profiler forum; for nvprof it is here.

For future readers: with Nsight Systems, the problem does not occur.
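(A typical invocation for this case would be something along the lines of nsys profile --trace=cuda -o test ./example; the exact flags may differ between nsys versions, so check nsys profile --help.)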
