CUDA thrust compilation error

Ah sorry. The code with the previous output is the following:

#include <cassert>
#include <cstdio>
#include <type_traits>

#include <cuda.h>

// Reports a non-success CUDA status together with its source location.
// Usable from both host and device code, hence printf (fprintf/stderr are
// not available in device code). When `abort` is true, the failing status
// additionally trips an assert, which works in device code too and is
// surfaced at the next synchronizing call.
__device__ __host__ inline void gpuAssert(cudaError_t code, const char *file,
                                          int line, bool abort = false)
{
    if (code != cudaSuccess)
    {
        printf("GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);

        if (abort)
        {
            printf("GPU error code: %d\n", code);
            // 0 && code is always false; mentioning `code` keeps the value
            // visible in the assertion message.
            assert(0 && code);
        }
    }
}

// Wrapped in do { } while (0) so the macro expands to exactly one statement
// and stays safe inside an unbraced if/else; the original bare { } block
// form produces a stray ';' that breaks `if (x) gpuErrchk(y); else ...`.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

namespace cu
{
    // Device/host-callable equivalent of std::move: unconditionally casts
    // its argument to an rvalue reference so move constructors/assignments
    // can be selected in device code. constexpr and noexcept, matching the
    // standard-library signature.
    template< class T >
    __device__ __host__ constexpr std::remove_reference_t<T>&& move(T&& t) noexcept
    {
        // Use the remove_reference_t alias consistently (the original mixed
        // it with the verbose `typename std::remove_reference<T>::type`).
        return static_cast<std::remove_reference_t<T>&&>(t);
    }

    // Minimal std::vector-like container usable in device code, where
    // operator new draws from the device heap (cudaLimitMallocHeapSize)
    // and returns nullptr on exhaustion instead of throwing.
    template <typename T>
    class vector
    {
        public:
            using value_type = T;

            // Default-constructed vector owns no storage.
            __device__ __host__ vector()
            {
            }

            // Constructs `count` elements, each copy-assigned from `value`.
            __device__ __host__ vector(size_t count, T const& value = T()) : size_{count}, capacity_{count}
            {
                if (count)
                {
                    data_ = new T[count];

                    if (!data_)
                    {
                        printf("Failure at %s %d\n", __FILE__, __LINE__);
                        // BUG FIX: the original left size_/capacity_ nonzero
                        // with data_ == nullptr, so any later operator[]
                        // asserted on a supposedly valid index.
                        size_ = 0;
                        capacity_ = 0;
                        return;
                    }

                    for (size_t i{0}; i < size_; ++i)
                        data_[i] = value;
                }
            }

            // Deep copy. BUG FIX: the original delegated to a non-existent
            // iterator-range constructor through begin()/end() members the
            // class did not have, which failed to compile; reuse the copy
            // assignment operator instead.
            __device__ __host__ vector(vector const& other)
            {
                *this = other;
            }

            // Move constructor: steals the source's buffer and leaves it empty.
            __device__ __host__ vector(vector&& other)
            {
                data_ = other.data_;
                size_ = other.size_;
                // BUG FIX: original read `cu::move(capacity_)` (self-assign),
                // leaving the new vector with capacity_ == 0 while owning
                // other's buffer.
                capacity_ = other.capacity_;

                other.data_ = nullptr;
                other.size_ = 0;
                other.capacity_ = 0;
            }

            __device__ __host__ ~vector()
            {
                // delete[] on nullptr is well-defined, so no guard needed.
                delete[] data_;

                data_ = nullptr;
                size_ = 0;
                capacity_ = 0;
            }

            // Copy assignment: grows (never shrinks capacity) then copies
            // element by element.
            __device__ __host__ vector& operator=(vector const& other)
            {
                if (this == &other)
                    return *this;

                resize(other.size());

                for (size_t i{0}; i < other.size(); ++i)
                    data_[i] = other[i];

                return *this;
            }

            // Move assignment: frees own buffer and steals the source's.
            __device__ __host__ vector& operator=(vector&& other)
            {
                // BUG FIX: guard against self-move; the original deleted
                // data_ and then read the now-dangling pointer back from
                // `other` when &other == this.
                if (this == &other)
                    return *this;

                delete[] data_;

                data_ = other.data_;
                size_ = other.size_;
                // BUG FIX: original self-assigned capacity_ here as well.
                capacity_ = other.capacity_;

                other.data_ = nullptr;
                other.size_ = 0;
                other.capacity_ = 0;

                return *this;
            }

            // Ensures storage for at least `capacity` elements; existing
            // elements are preserved. No-op if already large enough.
            __device__ __host__ void reserve(size_t capacity)
            {
                if (capacity <= capacity_)
                    return;

                T* d = new T[capacity];

                if (!d)
                {
                    // Device-side new signals exhaustion by nullptr; keep the
                    // old buffer intact and report.
                    printf("Failure at %s %d\n", __FILE__, __LINE__);
                    return;
                }

                for (size_t i{0}; i < size_; ++i)
                    d[i] = data_[i];

                delete[] data_;

                capacity_ = capacity;
                data_ = d;
            }

            // Sets the logical size, growing capacity geometrically (x2) to
            // keep repeated push_back amortized O(1). New elements are
            // default-initialized by new T[] and must be assigned before use.
            __device__ __host__ void resize(size_t size)
            {
                if (size > capacity_)
                {
                    if (size <= 2 * capacity_)
                        reserve(2 * capacity_);
                    else
                        reserve(size);
                }

                size_ = size;
            }

            __device__ __host__ size_t size() const
            {
                return size_;
            }

            // Pointer-based iterator access (also what the copy machinery
            // and range-for need).
            __device__ __host__ T* begin() { return data_; }
            __device__ __host__ T const* begin() const { return data_; }
            __device__ __host__ T* end() { return data_ + size_; }
            __device__ __host__ T const* end() const { return data_ + size_; }

            __device__ __host__ T& operator[](size_t i)
            {
                assert(data_ && i < size_);

                return data_[i];
            }

            // BUG FIX: const overload was missing, so copy assignment's
            // `other[i]` on a `vector const&` did not compile.
            __device__ __host__ T const& operator[](size_t i) const
            {
                assert(data_ && i < size_);

                return data_[i];
            }

            // Appends a copy of `value`, growing storage as needed.
            __device__ __host__ void push_back(T const& value)
            {
                resize(size_ + 1);

                data_[size_ - 1] = value;
            }

        private:
            T* data_{nullptr};      // owned heap buffer, nullptr when empty
            size_t size_{0};        // number of live elements
            size_t capacity_{0};    // allocated element count (>= size_)
    };
}

// Every thread constructs its own device-side vector and appends three
// values, exercising per-thread heap allocation (device operator new) and
// the container's geometric growth path.
__global__ void kernel()
{
    cu::vector<int> v;

    for (int value = 0; value < 3; ++value)
        v.push_back(value);
}

int main()
{
    // Device-side operator new draws from the runtime heap and each thread
    // needs stack for the container code, so raise both limits before the
    // launch (limits must be set prior to the first kernel launch).
    gpuErrchk(cudaDeviceSetLimit(cudaLimitStackSize, 256 * 92));

    gpuErrchk(cudaDeviceSetLimit(cudaLimitMallocHeapSize, 1024ull * 1024 * 1024));

    kernel<<<128, 128>>>();

    // A kernel launch returns no status directly: launch-configuration
    // errors surface via cudaGetLastError(), execution errors at the next
    // synchronizing call. The original checked only the latter.
    gpuErrchk(cudaGetLastError());

    gpuErrchk(cudaDeviceSynchronize());

    return 0;
}

So that means an allocation failed. I wouldn't have guessed that you are running out of heap space, but that would be the presumed proximate cause.

When I run that code on cc8.9/CUDA 12.2, whether <<<128,128>>> or <<<1024,1024>>> I get no errors of any kind from compute-sanitizer.

Puzzling. I will try updating my setup to CUDA 12.8 but that won’t happen till next week.

I tried with CUDA 12.8.1 on a cc7.5 and also a cc8.6 device, but did not see any issues reported by compute-sanitizer with either <<<128,128>>> or <<<1024,1024>>> launch configuration. I don’t have a GTX 1050 to test with.

Ok, thanks for this feedback. So it seems that the problem is linked to my card. Is there anything else that I can do or test on my side?

EDIT: I tested the code on Windows 10 and I have the same issue.

You can always move away from the original host-C++-style code.

  • Determine what is the maximum amount of memory you need, and allocate all memory beforehand. Alternatively, perform a dry-run of the algorithm which determines the exact amount of required memory. (Yes, this potentially doubles the execution time)
  • Dynamic parallelism the way you are using it is deprecated. I would suggest to just launch the kernels from the host.

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.