#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/scan.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
template <typename T>
void prefix_sum(T* output, const T* arr, const T* arr2, const int size) {
thrust::device_vector<T> d_arr(arr, arr + size);
thrust::device_vector<T> d_arr2(arr2, arr2 + size);
thrust::device_vector<T> data(size);
thrust::transform(d_arr2.begin(), d_arr2.end(), d_arr.begin(), data.begin(), thrust::minus<T>());
thrust::device_vector<T> temp(data.size() + 1);
thrust::exclusive_scan(data.begin(), data.end(), temp.begin());
temp[data.size()] = data.back() + temp[data.size() - 1];
thrust::copy(temp.begin(), temp.end(), output);
}
int main() {
int const size = 80000;
int starter, stopper, output;
for (int i = 0; i < size; i++) {
starter[i] = i;
stopper[i] = i + 1;
}
prefix_sum<int>(output, starter, stopper, size);
for (int i = 0; i < size + 1; i++) {
std::cout << output[i] << "\n";
}
}
The code works for size = 70000. For sizes larger than that, it crashes.
========= CUDA-MEMCHECK
========= Error: process didn't terminate successfully
========= No CUDA-MEMCHECK results found
Is there any reason why?
What do you mean it crashes?
I have no issue running the code above… What are hardware are you on?
...
79991
79992
79993
79994
79995
79996
79997
79998
79999
80000
There is no output.
PS C:\Users\reikd\source\repos\testgsoc\testgsoc> .\a.exe
PS C:\Users\reikd\source\repos\testgsoc\testgsoc>
But I get the proper output with size lesser than 70000.
I am using an Nvidia GeForce GTX 1070.
mnicely
February 6, 2020, 11:39pm
4
Try profiling your code with Nsight Systems to make sure work is being done on the GPU and you are getting a cudaMemcpy back to the host.
Yes I can see work is being done on the GPU and DtoH memcpy.
Unfortunately, I’m not able to reproduce your issue.
mnicely
February 11, 2020, 2:26pm
7
What error are you actually getting?? What are the specs on your computer? I was able to break your code by making size 700000. But fixed it by using std::vector, which allocates on the heap. https://stackoverflow.com/questions/1598397/creating-array-of-objects-on-the-stack-and-heap
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/scan.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <vector>
template <typename T>
void prefix_sum(T* output, const T* arr, const T* arr2, const int size) {
thrust::device_vector<T> d_arr(arr, arr + size);
thrust::device_vector<T> d_arr2(arr2, arr2 + size);
thrust::device_vector<T> data(size);
thrust::transform(d_arr2.begin(), d_arr2.end(), d_arr.begin(), data.begin(), thrust::minus<T>());
thrust::device_vector<T> temp(data.size() + 1);
thrust::exclusive_scan(data.begin(), data.end(), temp.begin());
temp[data.size()] = data.back() + temp[data.size() - 1];
thrust::copy(temp.begin(), temp.end(), output);
}
int main() {
int const size = 700000;
// int starter, stopper, output;
std::vector<int> starter (size);
std::vector<int> stopper (size);
std::vector<int> output (size + 1);
for (int i = 0; i < size; i++) {
starter[i] = i;
stopper[i] = i + 1;
}
prefix_sum<int>(output.data(), starter.data(), stopper.data(), size);
for (int i = 0; i < size + 1; i++) {
std::cout << output[i] << "\n";
}
}
reikdas
February 11, 2020, 4:47pm
8
To elaborate:
GPU - Nvidia GeForce GTX 1070 Max Q (6 GB VRAM)
CPU - i7-8750H
RAM - 16 GB
I switched to running the code on Linux over Windows and it works (breaks for 1000000 when allocating my arrays on the stack). I am not sure why. I am compiling on both platforms in Release Mode with sm61.
mnicely
February 12, 2020, 12:26am
9
Interesting. I think the issue is more related to the stack size and host code… Not sure :(