Time discrepancy between manual timing and profiler on CUB DeviceHistogram kernel

When I run the following program, compiled with nvcc -o example -arch=sm_75 cub_timing_test.cu:

#include <algorithm>
#include <cassert>
#include <chrono>
#include <cstdio>
#include <cub/device/device_histogram.cuh>
#include <iostream>
#include <limits>
#include <numeric>
#include <random>
#include <vector>

typedef unsigned long long cu_size_t;

#define gpuErrchk(ans) \
  { gpuAssert((ans), __FILE__, __LINE__); }
__host__ __device__ inline void gpuAssert(cudaError_t code, const char* file,
                                          int line, bool abort = true) {
  if (code != cudaSuccess) {
    printf("GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort) assert(0);
  }
}

template <typename T>
std::vector<T> generateRandomData(std::vector<T> const& alphabet,
                                  size_t const data_size) {
  std::random_device rd;
  std::mt19937 gen(rd());
  std::uniform_int_distribution<size_t> dis(0, alphabet.size() - 1);

  std::vector<T> data(data_size);
  std::generate(data.begin(), data.end(), [&]() { return alphabet[dis(gen)]; });

  return data;
}

template <typename T>
void HistComputationCUB(size_t const data_size, size_t const alphabet_size) {
  auto alphabet = std::vector<T>(alphabet_size);
  std::iota(alphabet.begin(), alphabet.end(), 0);
  auto data = generateRandomData<T>(alphabet, data_size);

  T* d_data;
  T* d_alphabet;
  size_t* d_histogram;
  gpuErrchk(cudaMalloc(&d_data, data_size * sizeof(T)));
  gpuErrchk(cudaMemcpy(d_data, data.data(), data_size * sizeof(T),
                       cudaMemcpyHostToDevice));
  gpuErrchk(cudaMalloc(&d_alphabet, alphabet_size * sizeof(T)));
  gpuErrchk(cudaMemcpy(d_alphabet, alphabet.data(), alphabet_size * sizeof(T),
                       cudaMemcpyHostToDevice));
  gpuErrchk(cudaMalloc(&d_histogram, alphabet_size * sizeof(size_t)));

  void* d_temp_storage = nullptr;
  size_t temp_storage_bytes = 0;
  // First call with d_temp_storage == nullptr only queries the required
  // temporary storage size.
  gpuErrchk(cub::DeviceHistogram::HistogramEven(
      d_temp_storage, temp_storage_bytes, d_data, (cu_size_t*)d_histogram,
      alphabet_size + 1, T(0), T(alphabet_size), data_size));
  gpuErrchk(cudaMalloc(&d_temp_storage, temp_storage_bytes));

  // Time 10 histogram iterations with host-side clocks.
  auto start = std::chrono::high_resolution_clock::now();
  for (int i = 0; i < 10; ++i) {
    gpuErrchk(cudaMemset(d_histogram, 0, alphabet_size * sizeof(size_t)));
    gpuErrchk(cub::DeviceHistogram::HistogramEven(
        d_temp_storage, temp_storage_bytes, d_data, (cu_size_t*)d_histogram,
        alphabet_size + 1, T(0), T(alphabet_size), data_size));
  }
  auto end = std::chrono::high_resolution_clock::now();
  auto time = std::chrono::duration_cast<std::chrono::microseconds>(end - start)
                  .count();
  std::cout << "Time per iter: " << time / 10 << " mus" << std::endl;
  gpuErrchk(cudaFree(d_data));
  gpuErrchk(cudaFree(d_alphabet));
  gpuErrchk(cudaFree(d_histogram));
  gpuErrchk(cudaFree(d_temp_storage));
}

int main(int argc, char** argv) {
  auto const data_size = argc > 1 ? std::stoul(argv[1]) : 1'000'000'000;
  auto const alphabet_size = argc > 2 ? std::stoul(argv[2]) : 256;

  if (alphabet_size <= std::numeric_limits<uint8_t>::max()) {
    HistComputationCUB<uint8_t>(data_size, alphabet_size);
  } else if (alphabet_size <= std::numeric_limits<uint16_t>::max()) {
    HistComputationCUB<uint16_t>(data_size, alphabet_size);
  } else if (alphabet_size <= std::numeric_limits<uint32_t>::max()) {
    HistComputationCUB<uint32_t>(data_size, alphabet_size);
  }
}

with the command ./example 1000000000 4, I get an average time of 10 microseconds per iteration (on an NVIDIA GeForce RTX 2070 SUPER). When I instead profile it with ncu --export "./CUB_hist_1G_4" --force-overwrite --launch-count 10 ./example 1000000000 4, the average time is 190 milliseconds. This huge discrepancy between the two timings does not happen with my own kernels. Why could this be the case?

CUDA kernel launches, and activity that translates into kernel launches (such as CUB calls), are asynchronous: the CPU thread does not wait for the device work to complete before continuing. Without a synchronization, your host-side timer therefore measures roughly the launch overhead of the 10 calls, not the time the histogram kernels actually take to run.
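
To make that concrete, here is a minimal sketch (a generic kernel, not taken from this thread) of what the host clock actually observes around an unsynchronized launch:

__global__ void busy_kernel() { /* long-running device work */ }

auto t0 = std::chrono::high_resolution_clock::now();
busy_kernel<<<1024, 256>>>();                         // returns to the host immediately
auto t1 = std::chrono::high_resolution_clock::now();  // t1 - t0: launch overhead only (a few us)
gpuErrchk(cudaDeviceSynchronize());                   // block until the kernel has finished
auto t2 = std::chrono::high_resolution_clock::now();  // t2 - t0: includes the kernel's runtime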

So if you are using host-based timing, it's usually a good idea to put a synchronization at the end of the timing region, perhaps like this:

gpuErrchk(cudaDeviceSynchronize());  // add this line
auto end = std::chrono::high_resolution_clock::now();

This topic is frequently encountered by CUDA programmers, so you can find many web postings discussing it.
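
Another option is to time with CUDA events instead of host clocks. Events are recorded in stream order on the device, so they bracket the actual GPU work regardless of what the host thread is doing. A sketch, reusing the variable names from your loop (not code from this thread):

cudaEvent_t ev_start, ev_stop;
gpuErrchk(cudaEventCreate(&ev_start));
gpuErrchk(cudaEventCreate(&ev_stop));

gpuErrchk(cudaEventRecord(ev_start));
for (int i = 0; i < 10; ++i) {
  gpuErrchk(cudaMemset(d_histogram, 0, alphabet_size * sizeof(size_t)));
  gpuErrchk(cub::DeviceHistogram::HistogramEven(
      d_temp_storage, temp_storage_bytes, d_data, (cu_size_t*)d_histogram,
      alphabet_size + 1, T(0), T(alphabet_size), data_size));
}
gpuErrchk(cudaEventRecord(ev_stop));
gpuErrchk(cudaEventSynchronize(ev_stop));  // wait until ev_stop has actually been reached

float ms = 0.0f;
gpuErrchk(cudaEventElapsedTime(&ms, ev_start, ev_stop));  // elapsed time in milliseconds
std::cout << "Time per iter: " << ms * 1000.0f / 10 << " us" << std::endl;

gpuErrchk(cudaEventDestroy(ev_start));
gpuErrchk(cudaEventDestroy(ev_stop));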

(cudaMemset the way you are using it - targeting device memory - can also end up being asynchronous with respect to the host thread.)
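
If you want that asynchrony to be explicit, one possibility (again a sketch, not code from this thread) is to issue the memsets with cudaMemsetAsync on a named stream and synchronize that stream before reading the host clock:

cudaStream_t stream;
gpuErrchk(cudaStreamCreate(&stream));
gpuErrchk(cudaMemsetAsync(d_histogram, 0, alphabet_size * sizeof(size_t), stream));
// ... enqueue the histogram work ...
gpuErrchk(cudaStreamSynchronize(stream));  // wait for everything queued on this stream
gpuErrchk(cudaStreamDestroy(stream));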
