When I run the following program, compiled with nvcc -o example -arch=sm_75 cub_timing_test.cu:
#include <algorithm>
#include <cassert>
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <iostream>
#include <limits>
#include <numeric>
#include <random>
#include <vector>

#include <cub/device/device_histogram.cuh>
typedef unsigned long long cu_size_t;

#define gpuErrchk(ans) \
  { gpuAssert((ans), __FILE__, __LINE__); }

__host__ __device__ inline void gpuAssert(cudaError_t code, const char* file,
                                          int line, bool abort = true) {
  if (code != cudaSuccess) {
    printf("GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort) assert(0);
  }
}

template <typename T>
std::vector<T> generateRandomData(std::vector<T> const& alphabet,
                                  size_t const data_size) {
  std::random_device rd;
  std::mt19937 gen(rd());
  std::uniform_int_distribution<size_t> dis(0, alphabet.size() - 1);
  std::vector<T> data(data_size);
  std::generate(data.begin(), data.end(), [&]() { return alphabet[dis(gen)]; });
  return data;
}

template <typename T>
void HistComputationCUB(size_t const data_size, size_t const alphabet_size) {
  auto alphabet = std::vector<T>(alphabet_size);
  std::iota(alphabet.begin(), alphabet.end(), 0);
  auto data = generateRandomData<T>(alphabet, data_size);

  T* d_data;
  T* d_alphabet;
  size_t* d_histogram;
  gpuErrchk(cudaMalloc(&d_data, data_size * sizeof(T)));
  gpuErrchk(cudaMemcpy(d_data, data.data(), data_size * sizeof(T),
                       cudaMemcpyHostToDevice));
  gpuErrchk(cudaMalloc(&d_alphabet, alphabet_size * sizeof(T)));
  gpuErrchk(cudaMemcpy(d_alphabet, alphabet.data(), alphabet_size * sizeof(T),
                       cudaMemcpyHostToDevice));
  gpuErrchk(cudaMalloc(&d_histogram, alphabet_size * sizeof(size_t)));

  // First call only queries the required temporary storage size.
  void* d_temp_storage = nullptr;
  size_t temp_storage_bytes = 0;
  gpuErrchk(cub::DeviceHistogram::HistogramEven(
      d_temp_storage, temp_storage_bytes, d_data, (cu_size_t*)d_histogram,
      alphabet_size + 1, T(0), T(alphabet_size), data_size));
  gpuErrchk(cudaMalloc(&d_temp_storage, temp_storage_bytes));

  // Time 10 histogram computations with std::chrono.
  auto start = std::chrono::high_resolution_clock::now();
  for (int i = 0; i < 10; ++i) {
    gpuErrchk(cudaMemset(d_histogram, 0, alphabet_size * sizeof(size_t)));
    gpuErrchk(cub::DeviceHistogram::HistogramEven(
        d_temp_storage, temp_storage_bytes, d_data, (cu_size_t*)d_histogram,
        alphabet_size + 1, T(0), T(alphabet_size), data_size));
  }
  auto end = std::chrono::high_resolution_clock::now();
  auto time = std::chrono::duration_cast<std::chrono::microseconds>(end - start)
                  .count();
  std::cout << "Time per iter: " << time / 10 << " mus" << std::endl;

  gpuErrchk(cudaFree(d_data));
  gpuErrchk(cudaFree(d_alphabet));
  gpuErrchk(cudaFree(d_histogram));
  gpuErrchk(cudaFree(d_temp_storage));
}

int main(int argc, char** argv) {
  auto const data_size = argc > 1 ? std::stoul(argv[1]) : 1'000'000'000;
  auto const alphabet_size = argc > 2 ? std::stoul(argv[2]) : 256;
  if (alphabet_size <= std::numeric_limits<uint8_t>::max()) {
    HistComputationCUB<uint8_t>(data_size, alphabet_size);
  } else if (alphabet_size <= std::numeric_limits<uint16_t>::max()) {
    HistComputationCUB<uint16_t>(data_size, alphabet_size);
  } else if (alphabet_size <= std::numeric_limits<uint32_t>::max()) {
    HistComputationCUB<uint32_t>(data_size, alphabet_size);
  }
}
with the command ./example 1000000000 4, I get an average time of about 10 microseconds per iteration (on an NVIDIA GeForce RTX 2070 SUPER). When I profile the same run with ncu --export "./CUB_hist_1G_4" --force-overwrite --launch-count 10 ./example 1000000000 4, the average time reported is about 190 milliseconds. This huge discrepancy between the two timings does not happen with my own kernels. Why could this be the case?
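For reference, in case the measurement method itself matters, here is a sketch of how the timed loop inside HistComputationCUB could instead be measured with CUDA events rather than std::chrono. This is untested as part of the program above and reuses the function's existing variables (d_histogram, d_temp_storage, etc.); the event variable names are mine, and cudaEventElapsedTime reports milliseconds.

// Drop-in replacement for the std::chrono timing block above (sketch).
cudaEvent_t ev_start, ev_stop;
gpuErrchk(cudaEventCreate(&ev_start));
gpuErrchk(cudaEventCreate(&ev_stop));
gpuErrchk(cudaEventRecord(ev_start));
for (int i = 0; i < 10; ++i) {
  gpuErrchk(cudaMemset(d_histogram, 0, alphabet_size * sizeof(size_t)));
  gpuErrchk(cub::DeviceHistogram::HistogramEven(
      d_temp_storage, temp_storage_bytes, d_data, (cu_size_t*)d_histogram,
      alphabet_size + 1, T(0), T(alphabet_size), data_size));
}
gpuErrchk(cudaEventRecord(ev_stop));
// Wait for the stop event before reading the elapsed time.
gpuErrchk(cudaEventSynchronize(ev_stop));
float elapsed_ms = 0.0f;
gpuErrchk(cudaEventElapsedTime(&elapsed_ms, ev_start, ev_stop));
std::cout << "Time per iter (events): " << elapsed_ms / 10 << " ms" << std::endl;
gpuErrchk(cudaEventDestroy(ev_start));
gpuErrchk(cudaEventDestroy(ev_stop));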