Why doesn't my code run for arrays with size larger than 70000?

#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/scan.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"

// Computes the running totals of the element-wise differences arr2[i] - arr[i].
//
// output : host buffer with room for size + 1 elements. On return
//          output[0] == 0 and output[k] == sum of (arr2[i] - arr[i]) for i < k,
//          so output[size] holds the grand total.
// arr    : host pointer to `size` values that are subtracted.
// arr2   : host pointer to `size` values that are subtracted from.
// size   : number of input elements; 0 is allowed and yields output[0] == 0.
template <typename T>
void prefix_sum(T* output, const T* arr, const T* arr2, const int size) {
  // Copy both inputs into device-side vectors.
  thrust::device_vector<T> d_arr(arr, arr + size);
  thrust::device_vector<T> d_arr2(arr2, arr2 + size);

  // data[i] = arr2[i] - arr[i]
  thrust::device_vector<T> data(size);
  thrust::transform(d_arr2.begin(), d_arr2.end(), d_arr.begin(), data.begin(), thrust::minus<T>());

  // temp[0] stays 0 and the inclusive scan fills temp[1..size]. This is the
  // exclusive scan with the total appended, but it needs no element-wise
  // fix-up of the last slot (each such access is a separate transfer) and it
  // never touches data.back(), which does not exist when size == 0.
  thrust::device_vector<T> temp(data.size() + 1, T(0));
  thrust::inclusive_scan(data.begin(), data.end(), temp.begin() + 1);

  // Copy the size + 1 results back to the caller's buffer.
  thrust::copy(temp.begin(), temp.end(), output);
}

// Fills two ranges so that stopper[i] - starter[i] == 1 for every i, runs
// prefix_sum on them, and prints the size + 1 resulting values one per line.
int main() {
  int const size = 80000;
  // These are fixed-size arrays with automatic storage (declaring them as
  // plain ints would not compile, since they are indexed below). With a
  // 4-byte int the three arrays need about 940 KB at size = 80000, which is
  // close to the 1 MB default stack of a Windows executable; larger sizes
  // need heap storage (e.g. std::vector) or a bigger stack.
  int starter[size], stopper[size], output[size + 1];
  for (int i = 0; i < size; i++) {
    starter[i] = i;
    stopper[i] = i + 1;
  }
  prefix_sum<int>(output, starter, stopper, size);
  // output holds one more element than the inputs: the final total.
  for (int i = 0; i < size + 1; i++) {
    std::cout << output[i] << "\n";
  }
}

The code works for size = 70000. For sizes larger than that, it crashes.

========= CUDA-MEMCHECK
========= Error: process didn't terminate successfully
========= No CUDA-MEMCHECK results found

Is there any reason why?

What do you mean it crashes?

I have no issue running the code above… What hardware are you on?

...
79991
79992
79993
79994
79995
79996
79997
79998
79999
80000

There is no output.

PS C:\Users\reikd\source\repos\testgsoc\testgsoc> .\a.exe
PS C:\Users\reikd\source\repos\testgsoc\testgsoc>

But I get the proper output with sizes smaller than 70000.

I am using an Nvidia GeForce GTX 1070.

Try profiling your code with Nsight Systems to make sure work is being done on the GPU and you are getting a cudaMemcpy back to the host.

Yes I can see work is being done on the GPU and DtoH memcpy.

Unfortunately, I’m not able to reproduce your issue.

What error are you actually getting? What are the specs of your computer? I was able to break your code by making size 700000, but fixed it by using std::vector, which allocates its elements on the heap instead of the stack. https://stackoverflow.com/questions/1598397/creating-array-of-objects-on-the-stack-and-heap

#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/scan.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <vector>

// Computes the running totals of the element-wise differences arr2[i] - arr[i].
//
// output : host buffer with room for size + 1 elements. On return
//          output[0] == 0 and output[k] == sum of (arr2[i] - arr[i]) for i < k,
//          so output[size] holds the grand total.
// arr    : host pointer to `size` values that are subtracted.
// arr2   : host pointer to `size` values that are subtracted from.
// size   : number of input elements; 0 is allowed and yields output[0] == 0.
template <typename T>
void prefix_sum(T* output, const T* arr, const T* arr2, const int size) {
  // Copy both inputs into device-side vectors.
  thrust::device_vector<T> d_arr(arr, arr + size);
  thrust::device_vector<T> d_arr2(arr2, arr2 + size);

  // data[i] = arr2[i] - arr[i]
  thrust::device_vector<T> data(size);
  thrust::transform(d_arr2.begin(), d_arr2.end(), d_arr.begin(), data.begin(), thrust::minus<T>());

  // temp[0] stays 0 and the inclusive scan fills temp[1..size]. This is the
  // exclusive scan with the total appended, but it needs no element-wise
  // fix-up of the last slot (each such access is a separate transfer) and it
  // never touches data.back(), which does not exist when size == 0.
  thrust::device_vector<T> temp(data.size() + 1, T(0));
  thrust::inclusive_scan(data.begin(), data.end(), temp.begin() + 1);

  // Copy the size + 1 results back to the caller's buffer.
  thrust::copy(temp.begin(), temp.end(), output);
}

// Builds two host ranges whose element-wise difference is 1 everywhere, runs
// prefix_sum on them, and prints each of the size + 1 results on its own line.
int main() {
  const int size = 700000;

  // std::vector is used for all three host buffers; the result buffer has
  // one extra slot for the final total.
  std::vector<int> starter(size);
  std::vector<int> stopper(size);
  std::vector<int> output(size + 1);

  int idx = 0;
  while (idx < size) {
    starter[idx] = idx;
    stopper[idx] = idx + 1;
    ++idx;
  }

  prefix_sum<int>(output.data(), starter.data(), stopper.data(), size);

  for (int k = 0; k <= size; ++k) {
    std::cout << output[k] << "\n";
  }
}

To elaborate:
GPU - Nvidia GeForce GTX 1070 Max Q (6 GB VRAM)
CPU - i7-8750H
RAM - 16 GB

I switched to running the code on Linux over Windows and it works (breaks for 1000000 when allocating my arrays on the stack). I am not sure why. I am compiling on both platforms in Release Mode with sm61.

Interesting. I think the issue is more related to the stack size and host code… Not sure :(