How to correctly cast thrust device_vector to a C array pointer?

I am casting my device vector like this -

thrust::device_vector<T> temp(data.size() + 1);
// temp is being filled
// raw_pointer_cast only unwraps the device_ptr returned by temp.data();
// the resulting T* still refers to temp's storage in device memory.
T* output = thrust::raw_pointer_cast(temp.data());
// Host-side loop that reads size + 1 elements through that raw pointer.
for (int i = 0; i < size + 1; i++) {
    std::cout << output[i] << "\n";
}

But the array is not displayed properly.

This is my full code for context-

#include <iostream>
#include <chrono>
#include <thrust/device_vector.h>
#include <thrust/scan.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"

template <typename T>  // prefix_sum: running sum of (arr2[i] - arr[i]) computed with Thrust, then printed
void prefix_sum(T* output, const T* arr, const T* arr2, const int size) {  // output is a by-value pointer: reassigning it below is not visible to the caller
  thrust::device_vector<T> d_arr(arr, arr + size);  // copy the host input ranges into device vectors
  thrust::device_vector<T> d_arr2(arr2, arr2 + size);
  thrust::device_vector<T> data(size);
  thrust::transform(d_arr2.begin(), d_arr2.end(), d_arr.begin(), data.begin(), thrust::minus<T>());  // data[i] = arr2[i] - arr[i]
  thrust::device_vector<T> temp(data.size() + 1);
  thrust::exclusive_scan(data.begin(), data.end(), temp.begin());  // temp[i] = data[0] + ... + data[i-1]
  temp[data.size()] = data.back() + temp[data.size() - 1];  // last slot = sum of all differences
  output = thrust::raw_pointer_cast(temp.data());  // raw pointer into temp's device storage
  for (int i = 0; i < size + 1; i++) {
    std::cout << output[i] << "\n";  // host code dereferencing the device pointer
  }
  //for (const auto& i : temp)
  //  std::cout << i << '\n';
}

// Builds two host arrays, runs prefix_sum on them and reports the elapsed
// wall-clock time in microseconds.
int main() {
  int const size = 100;
  // The buffers must be arrays: they are indexed below and passed to
  // prefix_sum as T* arguments. prefix_sum works with size + 1 result
  // values, so output gets one extra slot.
  int starter[size];
  int stopper[size];
  int output[size + 1];
  for (int i = 0; i < size; i++) {
    starter[i] = i;
    stopper[i] = i + 1;
  }
  auto start = std::chrono::high_resolution_clock::now();
  prefix_sum<int>(output, starter, stopper, size);
  auto stop = std::chrono::high_resolution_clock::now();
  auto time = std::chrono::duration_cast<std::chrono::microseconds>(stop - start);
  std::cout << "Time taken for thrust = " << time.count() << "\n";
}

This code does not work.

========= CUDA-MEMCHECK
========= Error: process didn't terminate successfully
========= No CUDA-MEMCHECK results found

And just executing it gives no results -

PS C:\Users\reikd\source\repos\testgsoc\testgsoc> .\a.exe
PS C:\Users\reikd\source\repos\testgsoc\testgsoc>

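You are casting correctly, but in line 19 you are trying to print values from the device pointer on the host. The pointer returned by raw_pointer_cast still refers to device memory, so host code cannot dereference it. I guess what you’re trying to do is copy the data back?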

#include <iostream>
#include <chrono>
#include <thrust/device_vector.h>
#include <thrust/scan.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"

// Computes the running sum of the differences arr2[i] - arr[i] with Thrust,
// copies the size + 1 results into the host array `output`, and prints them.
//
// output: host array; the loops below write and read size + 1 elements.
// arr, arr2: host arrays; `size` elements are read from each.
// size: number of input elements.
template <typename T>
void prefix_sum(T* output, const T* arr, const T* arr2, const int size) {
    // Copy both host input ranges into device vectors.
    thrust::device_vector<T> d_arr(arr, arr + size);
    thrust::device_vector<T> d_arr2(arr2, arr2 + size);
    thrust::device_vector<T> data(size);
    // data[i] = arr2[i] - arr[i]
    thrust::transform(d_arr2.begin(), d_arr2.end(), d_arr.begin(), data.begin(), thrust::minus<T>());
    thrust::device_vector<T> temp(data.size() + 1);
    // temp[i] = data[0] + ... + data[i-1] for i < size
    thrust::exclusive_scan(data.begin(), data.end(), temp.begin());
    // The extra last slot receives the sum of all differences.
    temp[data.size()] = data.back() + temp[data.size() - 1];
    
//     output = thrust::raw_pointer_cast(temp.data());
    
    // Copy the results from the device vector into the host array,
    // one element per iteration.
    for (int i = 0; i < size + 1; i++) {
        output[i] = temp[i];
    }
    
      // Print the host copy.
      for (int i = 0; i < size + 1; i++) {
        std::cout << output[i] << " ";
      }
      std::cout << std::endl;
      
    // Print the same values by iterating over the device vector.
    for (const auto& i : temp)
        std::cout << i << ' ';
    std::cout << std::endl;
}

// Builds two host arrays, runs prefix_sum on them and reports the elapsed
// wall-clock time in microseconds.
int main() {
    int const size = 100;
    // The buffers must be arrays: they are indexed below and passed to
    // prefix_sum as T* arguments. prefix_sum writes size + 1 results,
    // so output gets one extra slot.
    int starter[size];
    int stopper[size];
    int output[size + 1];
    for (int i = 0; i < size; i++) {
        starter[i] = i;
        stopper[i] = i + 1;
    }
    auto start = std::chrono::high_resolution_clock::now();
    prefix_sum<int>(output, starter, stopper, size);
    auto stop = std::chrono::high_resolution_clock::now();
    auto time = std::chrono::duration_cast<std::chrono::microseconds>(stop - start);
    std::cout << "Time taken for thrust = " << time.count() << "\n";
}

Also, if you want you can just use host_vector.

#include <iostream>
#include <chrono>
#include <thrust/device_vector.h>
#include <thrust/scan.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"

template <typename T>
void prefix_sum(T output, const T arr, const T arr2, const int size) {
    
    using data_type = typename T::value_type;
    
    thrust::device_vector<data_type> d_arr(arr);
    thrust::device_vector<data_type> d_arr2(arr2);
    
    thrust::device_vector<data_type> data(size);
    
    thrust::transform(d_arr2.begin(), d_arr2.end(), d_arr.begin(), data.begin(), thrust::minus<data_type>());
    
    thrust::device_vector<data_type> temp(data.size() + 1);
    
    thrust::exclusive_scan(data.begin(), data.end(), temp.begin());
    
    temp[data.size()] = data.back() + temp[data.size() - 1];
    
    output = temp;
    
    for (const auto& i : output)
        std::cout << i << ' ';
    std::cout << std::endl;
    
    for (const auto& i : temp)
        std::cout << i << ' ';
    std::cout << std::endl;
}

// Driver for the host_vector variant: fills the inputs, runs prefix_sum once
// and prints how long the call took in microseconds.
int main() {
    int const size = 100;

    thrust::host_vector<int> starter(size);
    thrust::host_vector<int> stopper(size);
    thrust::host_vector<int> output(size + 1);

    // starter = 0, 1, 2, ...   stopper = 1, 2, 3, ...
    for (int idx = 0; idx != size; ++idx) {
        starter[idx] = idx;
        stopper[idx] = idx + 1;
    }

    using timer = std::chrono::high_resolution_clock;

    const auto begin = timer::now();
    // The template argument is deduced from the vectors; spelling it out as
    // prefix_sum<thrust::host_vector<int>>(...) is equivalent.
    prefix_sum(output, starter, stopper, size);
    const auto end = timer::now();

    const auto elapsed = std::chrono::duration_cast<std::chrono::microseconds>(end - begin);
    std::cout << "Time taken for thrust = " << elapsed.count() << "\n";
}
// Quoted from the first reply's prefix_sum: the element-by-element copy loop.
for (int i = 0; i < size + 1; i++) {
        output[i] = temp[i];
    }

I don’t want to copy each value from the device_vector to the host C array one at a time using a for-loop. That is way too slow. Is there a way to do this in parallel on the GPU? (I think I cannot write my own kernel to do this because I cannot pass a thrust device_vector to a GPU kernel.)

I want to avoid any thrust or cuda code in the main function.

#include <iostream>
#include <chrono>
#include <thrust/device_vector.h>
#include <thrust/scan.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"

template <typename T>
void prefix_sum(T* output, const T* arr, const T* arr2, const int size) {
    thrust::device_vector<T> d_arr(arr, arr + size);
    thrust::device_vector<T> d_arr2(arr2, arr2 + size);
    thrust::device_vector<T> data(size);
    thrust::transform(d_arr2.begin(), d_arr2.end(), d_arr.begin(), data.begin(), thrust::minus<T>());
    thrust::device_vector<T> temp(data.size() + 1);
    thrust::exclusive_scan(data.begin(), data.end(), temp.begin());
    temp[data.size()] = data.back() + temp[data.size() - 1];
    
    //     output = thrust::raw_pointer_cast(temp.data());
    
//     for (int i = 0; i < size + 1; i++) {
//         output[i] = temp[i];
//     }
    
    thrust::copy(temp.begin(), temp.end(), output);
    
    for (int i = 0; i < size + 1; i++) {
        std::cout << output[i] << " ";
    }
    std::cout << std::endl;
    
    for (const auto& i : temp)
        std::cout << i << ' ';
    std::cout << std::endl;
}

// Builds two host arrays, runs prefix_sum on them and reports the elapsed
// wall-clock time in microseconds. No Thrust or CUDA types appear here.
int main() {
    int const size = 100;
    // The buffers must be arrays: they are indexed below and passed to
    // prefix_sum as T* arguments. prefix_sum copies size + 1 results into
    // output, so it gets one extra slot.
    int starter[size];
    int stopper[size];
    int output[size + 1];
    for (int i = 0; i < size; i++) {
        starter[i] = i;
        stopper[i] = i + 1;
    }
    auto start = std::chrono::high_resolution_clock::now();
    prefix_sum<int>(output, starter, stopper, size);
    auto stop = std::chrono::high_resolution_clock::now();
    auto time = std::chrono::duration_cast<std::chrono::microseconds>(stop - start);
    std::cout << "Time taken for thrust = " << time.count() << "\n";
}

Thanks!