Unable to time functions correctly

This is my code:

#include <iostream>
#include <chrono>
#include <cstdlib>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/functional.h>
#include <thrust/scan.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"

template <typename T>
__global__
void sum(T* output, const T* start, const T* stop, const int n) {
  int thid = threadIdx.x + blockIdx.x * blockDim.x;
  if (thid < n) {
    output[thid] = stop[thid] - start[thid];
  }
}

template <typename T>
void prefix_sum1(const T* arr, const T* arr2, const int size) {
  T* output = (T*)malloc(size * sizeof(T));
  int block, thread;
  if (size > 1024) {
    block = (size / 1024) + 1;
    thread = 1024;
  }
  else {
    thread = size;
    block = 1;
  }
  T* d_output, * d_arr, * d_arr2;
  cudaMalloc((void**)&d_output, size * sizeof(T));
  cudaMalloc((void**)&d_arr, size * sizeof(T));
  cudaMemcpy(d_arr, arr, size * sizeof(T), cudaMemcpyHostToDevice);
  cudaMalloc((void**)&d_arr2, size * sizeof(T));
  cudaMemcpy(d_arr2, arr2, size * sizeof(T), cudaMemcpyHostToDevice);
  sum<T><<<block, thread>>>(d_output, d_arr, d_arr2, size);
  cudaDeviceSynchronize();
  cudaMemcpy(output, d_output, size * sizeof(T), cudaMemcpyDeviceToHost);
  thrust::device_vector<T> data(output, output+size);
  thrust::device_vector<T> temp(data.size() + 1);
  thrust::exclusive_scan(data.begin(), data.end(), temp.begin());
  temp[data.size()] = data.back() + temp[data.size() - 1]; // append the overall total so temp holds all size+1 scan values
  //for (const auto& i : temp)
  //  std::cout << i << '\n';
  cudaFree(d_output);
  cudaFree(d_arr);
  cudaFree(d_arr2);
  free(output);
}

template <typename T>
void prefix_sum2(const T* arr, const T* arr2, const int size) {
  thrust::device_vector<T> d_arr(arr, arr + size);
  thrust::device_vector<T> d_arr2(arr2, arr2 + size);
  thrust::device_vector<T> data(size);
  thrust::transform(d_arr2.begin(), d_arr2.end(), d_arr.begin(), data.begin(), thrust::minus<T>());
  thrust::device_vector<T> temp(data.size() + 1);
  thrust::exclusive_scan(data.begin(), data.end(), temp.begin());
  temp[data.size()] = data.back() + temp[data.size() - 1];
  //for (const auto& i : temp)
  //  std::cout << i << '\n';
}

int main() {
  int const size = 100000;
  int* starter = new int[size];
  int* stopper = new int[size];
  for (int i = 0; i < size; i++) {
    starter[i] = i;
    stopper[i] = i + 1;
  }
  auto start2 = std::chrono::high_resolution_clock::now();
  prefix_sum2<int>(starter, stopper, size);
  auto stop2 = std::chrono::high_resolution_clock::now();
  auto time2 = std::chrono::duration_cast<std::chrono::microseconds>(stop2 - start2);
  std::cout << "Time taken for thrust = " << time2.count() << "\n";
  auto start1 = std::chrono::high_resolution_clock::now();
  prefix_sum1<int>(starter, stopper, size);
  auto stop1 = std::chrono::high_resolution_clock::now();
  auto time1 = std::chrono::duration_cast<std::chrono::microseconds>(stop1 - start1);
  std::cout << "Time taken for kernel = " << time1.count() << "\n";
  delete[] starter;
  delete[] stopper;
}

I am trying to test whether vector subtraction is faster with thrust::transform or with a kernel I write myself (for my particular use case, followed by an exclusive scan). But whichever function is called second, be it my own kernel or the one using thrust::transform, is always a lot faster.

What is going on here? How do I accurately time my function calls?

  1. To time GPU functions, it’s best to use CUDA event timing (a sketch follows after this list). https://devblogs.nvidia.com/how-implement-performance-metrics-cuda-cc/

  2. I don’t think you want to be timing cudaFree.

  3. You are only running each function once, so the first call pays the GPU warm-up cost. That’s why the second one is always fastest. https://devtalk.nvidia.com/default/topic/999803/why-warm-up-/

  4. You might want to profile your code with Nsight Systems to get a better idea of what’s going on under the hood. https://devblogs.nvidia.com/nsight-systems-exposes-gpu-optimization/
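
For item 1, here is a minimal sketch of event timing, reusing the names from your code (block, thread, d_output, d_arr, d_arr2, size); only the GPU work enqueued between the two events is measured:

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);
sum<int><<<block, thread>>>(d_output, d_arr, d_arr2, size);  // work to be timed
cudaEventRecord(stop);

cudaEventSynchronize(stop);              // wait until the stop event has completed
float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);  // elapsed time in milliseconds
std::cout << "kernel time: " << ms << " ms\n";

cudaEventDestroy(start);
cudaEventDestroy(stop);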

I think Thrust does all of its allocations and freeing internally, so should I also measure the cudaMalloc and cudaFree times for an accurate comparison?

Also, I want to measure the time including the CPU overheads and all of the boilerplate involved, so I think I shouldn’t use CUDA event timings and should use CPU time instead? But I do want to measure only the two function calls and nothing else.
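
Concretely, something like this is what I have in mind (a rough sketch; the single untimed warm-up call and the repetition count of 10 are arbitrary choices on my part):

const int reps = 10;
prefix_sum2<int>(starter, stopper, size);  // untimed warm-up call

auto start2 = std::chrono::high_resolution_clock::now();
for (int r = 0; r < reps; r++)
  prefix_sum2<int>(starter, stopper, size);
auto stop2 = std::chrono::high_resolution_clock::now();
auto time2 = std::chrono::duration_cast<std::chrono::microseconds>(stop2 - start2);
std::cout << "Average time for thrust = " << time2.count() / reps << "\n";

auto start1 = std::chrono::high_resolution_clock::now();
for (int r = 0; r < reps; r++)
  prefix_sum1<int>(starter, stopper, size);
auto stop1 = std::chrono::high_resolution_clock::now();
auto time1 = std::chrono::duration_cast<std::chrono::microseconds>(stop1 - start1);
std::cout << "Average time for kernel = " << time1.count() / reps << "\n";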

Thanks!
Just a follow-up: although it is trivial to do this manually, are there any Thrust or CUDA functions that warm up the GPU?

#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/functional.h>  // thrust::placeholders

void warmUpFunction( ) {

    using namespace thrust::placeholders;

    int                        N = 1 << 20;
    thrust::device_vector<int> d_x( N, 2 );  // allocate device memory and fill
    thrust::device_vector<int> d_y( N, 4 );

    // Perform SAXPY on 1M elements: d_y = 2 * d_x + d_y
    thrust::transform(
        d_x.begin( ), d_x.end( ), d_y.begin( ), d_y.begin( ), 2.0f * _1 + _2 );
}

But that is a manual kernel and not a ready-made Thrust or CUDA function, right?

warmUpFunction is just a host function; there is no hand-written CUDA kernel inside it. You can use anything to warm up a GPU. In this case, I’m performing SAXPY with a Thrust transform. That’s about as simple as it gets. There is no such thing as a dedicated warm-up function…
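
If you want something even more minimal before the timed region, a sketch of what that can look like in main (cudaFree(0) here is just the usual trick to force lazy context creation; warmUpFunction is the host function above):

cudaFree(0);               // forces lazy CUDA context creation
warmUpFunction();          // or run any real kernel / Thrust call once
cudaDeviceSynchronize();   // make sure the warm-up work has finished

// ...then time prefix_sum1 / prefix_sum2 as usual.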