Preventing the copy of thrust device_vector to device

ep96302 · March 16, 2021, 8:38am

So I have a helper class (creatively named “BetterVector”) that is designed to be passed back and forth from host and device, with most of its functionality accessible from either side (a significant flaw of device_vector). However, kernels fail with a non-descriptive allocation error.
From the stack trace, it appears to trigger sometimes on the copy constructor, and sometimes on the deconstructor, and I’m not entirely sure why it changes. I figured it was the device_vector data member having a host-only constructor and deconstructor, which I used the following post to utilize a union to prevent the calling of these functions, but the issue still persists. If any of you have any suggestions, it would be greatly appreciated.

main.cu testing file:

#include <abstract/BetterVector.cuh>

struct thrust_functor {
    abstract::BetterVector<int> vector;

    explicit thrust_functor(const abstract::BetterVector<int> &vector) : vector(vector) {}

    __host__ void operator()(int i) {
        printf("Thrust functor index %d: %d\n", i, (int) vector[i]);
    }
};

__global__ void baseCudaPrint(abstract::BetterVector<int>* ptr) {
    const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    abstract::BetterVector<int> vector = *ptr;
    printf("Cuda kernel index %zu: %d\n", i, (int) vector[i]);
}


int main() {
    abstract::BetterVector<int> vector({1, 2, 3, 4});
    for (int i = 0; i < 4; i++) {
        printf("Host index %d: %d\n", i, (int) vector[i]);
    }
    printf("\n");

    abstract::BetterVector<int>* devVectorPtr;
    cudaMalloc(&devVectorPtr, sizeof(abstract::BetterVector<int>));
    cudaMemcpy(devVectorPtr, &vector, 1, cudaMemcpyHostToDevice);
    baseCudaPrint<<<1, vector.size()>>>(devVectorPtr);
    cudaDeviceSynchronize();
    cudaFree(devVectorPtr);
    printf("\n");

    thrust::counting_iterator<int> first(0);
    thrust::counting_iterator<int> last = first + vector.size();
    thrust::for_each(thrust::host, first, last, thrust_functor(vector));
    cudaDeviceSynchronize();
    printf("\n");
}

abstract/BetterVector.cuh:

#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/functional.h>

namespace abstract {
template<typename T>
    struct equal_to : public thrust::unary_function<T, bool> {
        T lhs;

        __device__ __host__ explicit equal_to(T lhs) : lhs(lhs) {}

        __device__ __host__ bool operator()(T rhs) {
            return lhs == rhs;
        }
    };
template<typename T, typename VecType = thrust::device_vector<T>>
class BetterVector {
protected:
    typename VecType::pointer raw;
    size_t cachedSize;
    union {
        VecType vector;
    };

public:

    __host__ BetterVector() : vector(), raw(vector.data()), cachedSize(0) {}

    __host__ explicit BetterVector(size_t size) : vector(size), raw(vector.data()), cachedSize(size) {}

    __host__ explicit BetterVector(VecType vec) : vector(vec), raw(vector.data()), cachedSize(vec.size()) {}

    __host__ explicit BetterVector(std::vector<T> vec) : vector(vec), raw(vector.data()), cachedSize(vec.size()) {}

    __host__ __device__ BetterVector(const BetterVector &otherVec) :
#ifndef __CUDA_ARCH__
            vector(otherVec.vector),
#endif
            cachedSize(otherVec.cachedSize), raw(otherVec.raw) {}


    __host__ __device__ virtual ~BetterVector() {
#ifndef __CUDA_ARCH__
        vector.~VecType();
#endif
    }

    __host__ __device__ typename VecType::const_reference operator[](size_t index) const {
#ifdef __CUDA_ARCH__
        return raw[index];
#else
        return vector[index];
#endif
    }

    __host__ __device__ size_t size() const {
#ifdef __CUDA_ARCH__
        return cachedSize;
#else
        return vector.size();
#endif
    }
}

Robert_Crovella · March 17, 2021, 4:56pm

Topic		Replies	Views
How to create vector of objects in the device? CUDA Programming and Performance cuda	1	904	February 2, 2023
Allocating an array of Thrust device_vector's GPU-Accelerated Libraries	4	4201	April 3, 2015
Vector Of Vecotrs Using Thrust GPU-Accelerated Libraries thrust	3	2121	February 27, 2023
Using thrust to handle vectors in Cuda classes CUDA Programming and Performance	0	1792	April 13, 2011
Using device pointer in thrust algorithm CUDA Programming and Performance	2	28706	December 17, 2021
Does thrust::device_vector::resize() cause reallocation when resizing to a smaller size? CUDA Programming and Performance	1	565	December 2, 2022
How to use thrust resize in cuda kernal CUDA Programming and Performance	0	1031	November 17, 2017
Using thrust::transform_output_iterator in real code CUDA Programming and Performance	3	882	October 19, 2019
"Selective usage" of __device__ in template class CUDA Programming and Performance	0	377	February 1, 2021
cuda and C++ CUDA Programming and Performance	4	2135	September 5, 2010

Preventing the copy of thrust device_vector to device

Related topics