Ah, sorry. Here is the code that produced the previous output:
#include <cassert>
#include <cstdio>
#include <cuda_runtime.h>
#include <type_traits>
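// Error-check helper: prints the CUDA error string and location; asserts only when abort is requested.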
__device__ __host__ inline void gpuAssert(cudaError_t code, const char *file,
int line, bool abort = false)
{
if (code != cudaSuccess)
{
printf("GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort)
{
printf("GPU error code: %d\n", code);
assert(0 && code);
}
}
}
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
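// Minimal device/host replacements for std::move and std::vector so they can be used inside kernels.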
namespace cu
{
template< class T >
__device__ __host__ constexpr std::remove_reference_t<T>&& move(T&& t) noexcept
{
return static_cast<std::remove_reference_t<T>&&>(t);
}
template <typename T>
class vector
{
public:
using value_type = T;
__device__ __host__ vector()
{
}
__device__ __host__ vector(size_t count, T const& value = T())
{
if (count)
{
data_ = new T[count];
if (!data_)
{
printf("Failure at %s %d\n", __FILE__, __LINE__);
return;
}
size_ = count;
capacity_ = count;
for (size_t i{0}; i < size_; ++i)
this->operator[](i) = value;
}
}
__device__ __host__ vector(vector const& other) : vector(other.size_)
{
for (size_t i{0}; i < size_; ++i)
data_[i] = other.data_[i];
}
__device__ __host__ vector(vector&& other)
{
data_ = cu::move(other.data_);
size_ = cu::move(other.size_);
capacity_ = cu::move(other.capacity_);
other.data_ = nullptr;
other.size_ = 0;
other.capacity_ = 0;
}
__device__ __host__ ~vector()
{
if (data_)
delete[] data_;
data_ = nullptr;
size_ = 0;
capacity_ = 0;
}
__device__ __host__ vector& operator=(vector const& other)
{
resize(other.size_);
for (size_t i{0}; i < size_; ++i)
data_[i] = other.data_[i];
return *this;
}
__device__ __host__ vector& operator=(vector&& other)
{
delete[] data_;
data_ = other.data_;
size_ = cu::move(other.size_);
capacity_ = cu::move(other.capacity_);
other.data_ = nullptr;
other.size_ = 0;
other.capacity_ = 0;
return *this;
}
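// Reallocates storage with device-side new; on the device, new returns nullptr on heap exhaustion instead of throwing.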
__device__ __host__ void reserve(size_t capacity)
{
if (capacity <= capacity_)
return;
T* d = new T[capacity];
if (!d)
{
printf("Failure at %s %d\n", __FILE__, __LINE__);
return;
}
for (size_t i{0}; i < size_; ++i)
d[i] = data_[i];
delete[] data_;
capacity_ = capacity;
data_ = d;
}
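// Grows geometrically (doubling the capacity when possible) so push_back stays amortized O(1).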
__device__ __host__ void resize(size_t size)
{
if (size > capacity_)
{
if (size <= 2 * capacity_)
reserve(2 * capacity_);
else
reserve(size);
}
size_ = size;
}
__device__ __host__ T& operator[](size_t i)
{
assert(data_ && i < size_);
return data_[i];
}
__device__ __host__ void push_back(T const& value)
{
resize(size_ + 1);
this->operator[](size_ - 1) = value;
}
private:
T* data_{nullptr};
size_t size_{0};
size_t capacity_{0};
};
}
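// Each of the 128 * 128 threads default-constructs its own vector and pushes three elements onto the device heap.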
__global__ void kernel()
{
cu::vector<int> v;
v.push_back(0);
v.push_back(1);
v.push_back(2);
}
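// Raise the per-thread stack size and the device heap size before launching, since every thread allocates with device-side new.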
int main()
{
gpuErrchk(cudaDeviceSetLimit(cudaLimitStackSize, 256 * 92));
gpuErrchk(cudaDeviceSetLimit(cudaLimitMallocHeapSize, 1024ull * 1024 * 1024));
kernel<<<128, 128>>>();
gpuErrchk(cudaPeekAtLastError()); // also report launch errors, not just asynchronous ones
gpuErrchk(cudaDeviceSynchronize());
return 0;
}