You are casting correctly, but in line 19 you are trying to print values through the device pointer on the host. I guess what you are actually trying to do is copy the data back to the host?
#include <chrono>
#include <iostream>
#include <iterator>

#include <cuda_runtime.h>
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/scan.h>
#include <thrust/transform.h>

#include "device_launch_parameters.h"
/// Computes the running sum of the element-wise differences arr2[i] - arr[i]
/// on the device and writes the size + 1 results into a host buffer.
///
/// output[0] is 0 and output[k] is the sum of the first k differences, so
/// output[size] holds the total. Both result sequences (the host copy and the
/// device vector) are also printed to std::cout, one per line.
///
/// @param output Host buffer with room for at least size + 1 elements.
/// @param arr    Host array of size values (subtracted from arr2).
/// @param arr2   Host array of size values.
/// @param size   Number of elements in arr and arr2; must be non-negative.
template <typename T>
void prefix_sum(T* output, const T* arr, const T* arr2, const int size) {
    // Upload both inputs to the device.
    thrust::device_vector<T> d_arr(arr, arr + size);
    thrust::device_vector<T> d_arr2(arr2, arr2 + size);

    // data[i] = arr2[i] - arr[i]
    thrust::device_vector<T> data(size);
    thrust::transform(d_arr2.begin(), d_arr2.end(), d_arr.begin(), data.begin(), thrust::minus<T>());

    // An inclusive scan shifted by one slot yields the exclusive scan followed
    // by the grand total in a single pass. This avoids the single-element
    // device reads/writes needed to patch the last entry by hand and stays
    // well-defined when size == 0 (temp then holds just the leading zero).
    thrust::device_vector<T> temp(data.size() + 1, T{});
    thrust::inclusive_scan(data.begin(), data.end(), temp.begin() + 1);

    // One bulk device-to-host transfer; indexing temp[i] from host code would
    // issue a separate transfer for every element.
    thrust::copy(temp.begin(), temp.end(), output);

    for (int i = 0; i < size + 1; i++) {
        std::cout << output[i] << " ";
    }
    std::cout << std::endl;

    // Print the device-side result as well, again via a bulk copy.
    thrust::copy(temp.begin(), temp.end(), std::ostream_iterator<T>(std::cout, " "));
    std::cout << std::endl;
}
/// Builds two host arrays, runs prefix_sum on them and reports the elapsed
/// wall-clock time in microseconds.
int main() {
    int const size = 100;
    // The inputs hold size elements each; the result needs one extra slot
    // because prefix_sum writes size + 1 values.
    int starter[size];
    int stopper[size];
    int output[size + 1];
    for (int i = 0; i < size; i++) {
        starter[i] = i;
        stopper[i] = i + 1;
    }
    // The timed region covers the whole prefix_sum call, including the
    // device_vector construction and printing done inside it.
    auto start = std::chrono::high_resolution_clock::now();
    prefix_sum<int>(output, starter, stopper, size);
    auto stop = std::chrono::high_resolution_clock::now();
    auto time = std::chrono::duration_cast<std::chrono::microseconds>(stop - start);
    std::cout << "Time taken for thrust = " << time.count() << "\n";
}
Also, if you want, you can just use a thrust::host_vector for the host-side data instead of raw C arrays.
#include <iostream>
#include <chrono>
#include <thrust/device_vector.h>
#include <thrust/scan.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
/// Computes the running sum of the element-wise differences arr2[i] - arr[i]
/// on the device and stores the size + 1 results in output.
///
/// output[0] is 0 and output[k] is the sum of the first k differences. Both
/// the host result and the device vector are printed to std::cout, one per
/// line.
///
/// @tparam T     Host container type exposing value_type, convertible to a
///               thrust::device_vector and assignable from one (for example
///               thrust::host_vector<int>).
/// @param output Receives the results. Taken by reference: assigning to a
///               by-value parameter would only change a local copy and the
///               caller's container would never be filled.
/// @param arr    Values subtracted from arr2.
/// @param arr2   Values the elements of arr are subtracted from.
/// @param size   Number of elements to process; expected to equal arr.size()
///               and arr2.size().
template <typename T>
void prefix_sum(T& output, const T& arr, const T& arr2, const int size) {
    using data_type = typename T::value_type;

    // Upload both inputs to the device.
    thrust::device_vector<data_type> d_arr(arr);
    thrust::device_vector<data_type> d_arr2(arr2);

    // data[i] = arr2[i] - arr[i]
    thrust::device_vector<data_type> data(size);
    thrust::transform(d_arr2.begin(), d_arr2.end(), d_arr.begin(), data.begin(), thrust::minus<data_type>());

    // An inclusive scan shifted by one slot yields the exclusive scan followed
    // by the grand total in a single pass. This avoids patching the last entry
    // through single-element device accesses and stays well-defined for
    // size == 0.
    thrust::device_vector<data_type> temp(data.size() + 1, data_type{});
    thrust::inclusive_scan(data.begin(), data.end(), temp.begin() + 1);

    // Container assignment moves the whole result to the host in one go.
    output = temp;

    for (const auto& i : output)
        std::cout << i << ' ';
    std::cout << std::endl;
    // Debug print of the device copy; note that iterating a device_vector
    // from host code fetches one element at a time.
    for (const auto& i : temp)
        std::cout << i << ' ';
    std::cout << std::endl;
}
/// Fills two host vectors, runs prefix_sum on them and prints how long the
/// call took in microseconds.
int main() {
    const int size = 100;
    using clock_type = std::chrono::high_resolution_clock;

    thrust::host_vector<int> starter(size);
    thrust::host_vector<int> stopper(size);
    thrust::host_vector<int> output(size + 1);  // one extra slot for the total

    // starter = 0, 1, 2, ...  and  stopper = 1, 2, 3, ...
    for (int idx = 0; idx != size; ++idx) {
        starter[idx] = idx;
        stopper[idx] = idx + 1;
    }

    const auto t_begin = clock_type::now();
    // The template argument can be deduced; spelling it out as
    // prefix_sum<thrust::host_vector<int>>(output, starter, stopper, size);
    // should work as well.
    prefix_sum(output, starter, stopper, size);
    const auto t_end = clock_type::now();

    const auto elapsed = std::chrono::duration_cast<std::chrono::microseconds>(t_end - t_begin);
    std::cout << "Time taken for thrust = " << elapsed.count() << "\n";
}
// Snippet under discussion: copies the size + 1 results from temp into output
// one element per loop iteration.
for (int i = 0; i < size + 1; i++) {
output[i] = temp[i];
}
I don't want to copy each value from the device_vector to the host C array one at a time using a for-loop; that is far too slow. Is there a way to do this in parallel on the GPU? (I think I cannot write my own kernel to do this, because I cannot pass a thrust device_vector to a GPU kernel.)
I want to avoid any thrust or cuda code in the main function.