Get memory usage of thrust::remove_if

In my program, I am using thrust::remove_if with the thrust::device execution policy. For some specific cases where the program is already using all the memory available on the GPU, I get a thrust::system::detail::bad_alloc error. To guard against this, I would like to know how much GPU memory the function will use for that input, so that I can move other things to unified memory. I haven’t found anywhere how I can figure out the memory usage of a Thrust function. Is it possible?

Thrust is open source, so in some way it should definitely be possible.
(Either by understanding the memory requirements from the source code or creating a function returning amount of needed memory.)

Perhaps someone else can suggest a simpler way?

Thrust is a convenient high-level library which abstracts away the memory handling. cub::DeviceSelect::If (cub::DeviceSelect — cub 2.5 documentation) might be better suited for you, with manual memory management.

That being said, you can implement custom thrust allocators which could, for example, fall back to managed memory.
Alternatively, one could use a custom allocator that throws a custom exception on allocation, where the exception contains the number of bytes to be allocated. Of course, this will only be useful if Thrust performs just a single allocation internally. However, the custom exception could also be used together with an allocator that actually allocates.

#include <thrust/device_vector.h>
#include <thrust/sequence.h>
#include <thrust/device_malloc_allocator.h>
#include <thrust/remove.h>
#include <thrust/execution_policy.h>

#include <iostream>

// Exception payload thrown by the custom allocators in this example.
// It is a plain aggregate (not derived from std::exception) whose only job is
// to carry the size of the allocation request back to the catch site.
struct CustomThrustOOMException
{
    // Size, in bytes, of the allocation request that raised the exception.
    size_t failedAllocationBytes{0};
};

// Unary predicate handed to thrust::remove_if: returns true for even integers.
struct IsEven{
    // Callable from host and device code. The call operator is const so the
    // predicate can also be invoked on a const functor or through a const
    // reference, which the non-const original did not allow.
    __host__ __device__
    bool operator()(int i) const { return i % 2 == 0; }
};

// Probe allocator: it never allocates anything. Every allocate() call throws a
// CustomThrustOOMException carrying the size of the request in bytes, so the
// catch site can read how much memory was asked for.
template<class T>
struct ThrowBytesAllocator : thrust::device_malloc_allocator<T> {
    using super_t = thrust::device_malloc_allocator<T>;
    using value_type = T;
    using size_type = typename super_t::size_type;
    using pointer = typename super_t::pointer;
    using reference = typename super_t::reference;
    using const_reference = typename super_t::const_reference;

    // Always throws. The argument is an element count, so the reported size
    // is elementCount * sizeof(T).
    pointer allocate(size_type elementCount){
        const auto requestedBytes = elementCount * sizeof(T);
        throw CustomThrustOOMException{requestedBytes};
    }

    // allocate() never hands out memory, so there is nothing to release.
    void deallocate(pointer, size_type){}
};

// Allocator that first tries cudaMalloc and, if that fails, retries with
// cudaMallocManaged. When both attempts fail it throws a
// CustomThrustOOMException carrying the size of the request in bytes.
template<class T>
struct FallbackAllocator : thrust::device_malloc_allocator<T> {
    using super_t = thrust::device_malloc_allocator<T>;
    using value_type = T;
    using size_type = typename super_t::size_type;
    using pointer = typename super_t::pointer;
    using reference = typename super_t::reference;
    using const_reference = typename super_t::const_reference;

    // Allocates room for elementCount objects of type T and returns it as a
    // thrust device pointer. Throws CustomThrustOOMException if neither
    // cudaMalloc nor cudaMallocManaged succeeds.
    pointer allocate(size_type elementCount){
        const auto requestedBytes = elementCount * sizeof(T);
        T* rawPtr = nullptr;

        if(cudaMalloc(&rawPtr, requestedBytes) != cudaSuccess){
            cudaGetLastError(); // reset the error left behind by the failed cudaMalloc

            // Plain device memory was not available; maybe managed memory will work.
            if(cudaMallocManaged(&rawPtr, requestedBytes) != cudaSuccess){
                cudaGetLastError(); // reset the error left behind by the failed cudaMallocManaged

                // Report the size of the request that could not be satisfied.
                throw CustomThrustOOMException{requestedBytes};
            }
        }

        return thrust::device_pointer_cast(rawPtr);
    }

    // Releases memory obtained from allocate(); the element count is not
    // needed and the result of cudaFree is not inspected.
    void deallocate(pointer ptr, size_type){
        cudaFree(ptr.get());
    }
};

int main(){
    size_t N = 1'000'000;
    thrust::device_vector<int> vec(N);
    thrust::sequence(vec.begin(), vec.end(), 0);

    size_t requiredBytes = 0;
    try{
        thrust::remove_if(
            thrust::device(ThrowBytesAllocator<char>{}), 
            vec.begin(), vec.end(), 
            IsEven{});
    }catch(CustomThrustOOMException e){
        requiredBytes = e.failedAllocationBytes;
    }

    std::cout << "Thrust would need " << requiredBytes << " bytes\n";

    try{
        FallbackAllocator<char> tempstorageAllocator;

        auto endIterator = thrust::remove_if(
            thrust::device(tempstorageAllocator), 
            vec.begin(), vec.end(), 
            IsEven{});
        std::cout << "vector size after copy_if: " << thrust::distance(vec.begin(), endIterator) << "\n";

    }catch(CustomThrustOOMException e){
        std::cout << "CustomThrustOOMException: failedAllocationBytes " << e.failedAllocationBytes << "\n";
    }

}

Thank you for the extensive reply, that helps a lot!

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.