This is my code:
#include <iostream>
#include <chrono>
#include <vector>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/functional.h>
#include <thrust/scan.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
// Element-wise difference: output[i] = stop[i] - start[i].
template <typename T>
__global__ void sum(T* output, const T* start, const T* stop, const int n) {
    int thid = threadIdx.x + blockIdx.x * blockDim.x;
    if (thid < n) {
        output[thid] = stop[thid] - start[thid];
    }
}
template <typename T>
void prefix_sum1(const T* arr, const T* arr2, const int size) {
    T* output = (T*)malloc(size * sizeof(T));
    // One thread per element; round the block count up so every element is covered.
    int thread = (size > 1024) ? 1024 : size;
    int block = (size + thread - 1) / thread;
    T* d_output, * d_arr, * d_arr2;
    cudaMalloc((void**)&d_output, size * sizeof(T));
    cudaMalloc((void**)&d_arr, size * sizeof(T));
    cudaMemcpy(d_arr, arr, size * sizeof(T), cudaMemcpyHostToDevice);
    cudaMalloc((void**)&d_arr2, size * sizeof(T));
    cudaMemcpy(d_arr2, arr2, size * sizeof(T), cudaMemcpyHostToDevice);
    sum<T><<<block, thread>>>(d_output, d_arr, d_arr2, size);
    cudaDeviceSynchronize();
    cudaMemcpy(output, d_output, size * sizeof(T), cudaMemcpyDeviceToHost);
    // Note: the differences round-trip through the host here; constructing the
    // device_vector from the host pointer uploads them to the device again.
    thrust::device_vector<T> data(output, output + size);
    thrust::device_vector<T> temp(data.size() + 1);
    thrust::exclusive_scan(data.begin(), data.end(), temp.begin());
    // Append the grand total so temp also carries the inclusive end value.
    temp[data.size()] = data.back() + temp[data.size() - 1];
    //for (const auto& i : temp)
    //    std::cout << i << '\n';
    cudaFree(d_output);
    cudaFree(d_arr);
    cudaFree(d_arr2);
    free(output);
}
template <typename T>
void prefix_sum2(const T* arr, const T* arr2, const int size) {
    thrust::device_vector<T> d_arr(arr, arr + size);
    thrust::device_vector<T> d_arr2(arr2, arr2 + size);
    thrust::device_vector<T> data(size);
    // data[i] = arr2[i] - arr[i], computed entirely on the device.
    thrust::transform(d_arr2.begin(), d_arr2.end(), d_arr.begin(), data.begin(), thrust::minus<T>());
    thrust::device_vector<T> temp(data.size() + 1);
    thrust::exclusive_scan(data.begin(), data.end(), temp.begin());
    // Append the grand total so temp also carries the inclusive end value.
    temp[data.size()] = data.back() + temp[data.size() - 1];
    //for (const auto& i : temp)
    //    std::cout << i << '\n';
}
int main() {
    const int size = 100000;
    // Heap-backed arrays: size ints each is too large to keep on the stack.
    std::vector<int> starter(size), stopper(size);
    for (int i = 0; i < size; i++) {
        starter[i] = i;
        stopper[i] = i + 1;
    }
    auto start2 = std::chrono::high_resolution_clock::now();
    prefix_sum2<int>(starter.data(), stopper.data(), size);
    auto stop2 = std::chrono::high_resolution_clock::now();
    auto time2 = std::chrono::duration_cast<std::chrono::microseconds>(stop2 - start2);
    std::cout << "Time taken for thrust = " << time2.count() << "\n";
    auto start1 = std::chrono::high_resolution_clock::now();
    prefix_sum1<int>(starter.data(), stopper.data(), size);
    auto stop1 = std::chrono::high_resolution_clock::now();
    auto time1 = std::chrono::duration_cast<std::chrono::microseconds>(stop1 - start1);
    std::cout << "Time taken for kernel = " << time1.count() << "\n";
}
I am trying to test whether vector subtraction is faster with thrust::transform or with my own kernel (for my particular use case, feeding the result into an exclusive scan). But whichever function is called second, be it my own kernel or the one using thrust::transform, is always much faster: swapping the call order swaps which one appears to win.
What is going on here? How do I accurately time my function calls?
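For reference, here is a minimal sketch of the kind of timing harness I imagine might be fairer, assuming the asymmetry comes from one-time CUDA context/runtime initialization being charged to whichever call runs first. The untimed warm-up calls and the extra cudaDeviceSynchronize() before each clock read are my additions, not part of my original code; is this the right approach?

#include <chrono>
#include <iostream>
#include <vector>
#include <cuda_runtime.h>

// Assumes prefix_sum1 and prefix_sum2 from the listing above are visible here.
int main() {
    const int size = 100000;
    std::vector<int> starter(size), stopper(size);
    for (int i = 0; i < size; i++) {
        starter[i] = i;
        stopper[i] = i + 1;
    }

    // Untimed warm-up: run both paths once so the one-time CUDA context
    // and runtime setup is not absorbed by the first timed call.
    prefix_sum2<int>(starter.data(), stopper.data(), size);
    prefix_sum1<int>(starter.data(), stopper.data(), size);
    cudaDeviceSynchronize();

    auto t0 = std::chrono::high_resolution_clock::now();
    prefix_sum2<int>(starter.data(), stopper.data(), size);
    cudaDeviceSynchronize();  // ensure all GPU work finished before reading the clock
    auto t1 = std::chrono::high_resolution_clock::now();
    prefix_sum1<int>(starter.data(), stopper.data(), size);
    cudaDeviceSynchronize();
    auto t2 = std::chrono::high_resolution_clock::now();

    using us = std::chrono::microseconds;
    std::cout << "Time taken for thrust = "
              << std::chrono::duration_cast<us>(t1 - t0).count() << "\n";
    std::cout << "Time taken for kernel = "
              << std::chrono::duration_cast<us>(t2 - t1).count() << "\n";
}

With the warm-up in place I would expect the two timed calls to become comparable regardless of order; I assume averaging over several repetitions would further reduce the noise.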