Title: Using CUDA managed (unified) memory with Thrust containers and algorithms.
I have a 12GB dataset and since it can’t all fit in device memory, I want to use managed vectors with thrust. Is there a way to use managed memory with thrust? Particularly with thrust::sort, thrust::erase, and thrust::lower_bound.
Thanks.
Looks like this has been asked before.
https://devtalk.nvidia.com/default/topic/830260/using-thrust-to-sort-unified-memory-buffer-/
and here:
No, there is no managed_vector. But you can create a thrust::device_ptr over a managed allocation, and it seems to work mostly identically:
// Allocate managed memory for indices
cudaMallocManaged((void **)&A, sizeof(int)*N);
thrust::device_ptr<int> indices(A);
Correct — there is currently no managed vector in Thrust.
Perhaps I’m being pedantic, but I wouldn’t say the discussions you’ve linked show usage of thrust::device_vector with managed memory. They show usage of thrust::device_ptr with managed memory. They are not the same, however based on what you’ve mentioned so far, I’m assuming thrust::device_ptr should meet your needs.
In order to have a true thrust::device_vector (which is a true container, device_ptr is not) on top of a managed allocation, it would be necessary to replace the allocator used by thrust::device_vector
I think it’s as simple as this. I adapted Jared’s allocator from https://github.com/jaredhoberock/managed_allocator:
#pragma once

// A minimal allocator that backs a thrust::device_vector with CUDA managed
// (unified) memory, adapted from Jared Hoberock's managed_allocator.
// Use it as the second template argument of thrust::device_vector.
//
// The includes below are required: the snippet as originally posted only
// pulled in the system_error headers, so device_malloc_allocator,
// device_ptr, and the CUDA runtime API were never declared.
#include <thrust/device_malloc_allocator.h>
#include <thrust/device_ptr.h>
#include <thrust/system_error.h>
#include <thrust/system/cuda/error.h>
#include <cuda_runtime_api.h>

template<class T>
class managed_allocator : public thrust::device_malloc_allocator<T>
{
  public:
    using value_type = T;
    using pointer    = thrust::device_ptr<T>;

    /// Allocate space for n objects of type T in managed (unified) memory.
    /// @param n  number of elements (not bytes)
    /// @return   a thrust::device_ptr wrapping the managed allocation
    /// @throws   thrust::system_error if cudaMallocManaged fails
    inline pointer allocate(size_t n)
    {
      value_type* result = nullptr;

      // cudaMemAttachGlobal: the allocation is accessible from any stream
      // on any device, as well as from the host.
      cudaError_t error = cudaMallocManaged(&result, n * sizeof(T), cudaMemAttachGlobal);
      if (error != cudaSuccess)
      {
        throw thrust::system_error(error, thrust::cuda_category(), "managed_allocator::allocate(): cudaMallocManaged");
      }
      return thrust::device_pointer_cast(result);
    }

    /// Release an allocation previously obtained from allocate().
    /// The second parameter (element count) is unused: cudaFree frees the
    /// whole allocation.
    /// @throws thrust::system_error if cudaFree fails
    inline void deallocate(pointer ptr, size_t)
    {
      cudaError_t error = cudaFree(thrust::raw_pointer_cast(ptr));
      if (error != cudaSuccess)
      {
        throw thrust::system_error(error, thrust::cuda_category(), "managed_allocator::deallocate(): cudaFree");
      }
    }
};
Use it like:
// Convenience alias: a true thrust::device_vector (a real container, unlike
// thrust::device_ptr) whose storage comes from cudaMallocManaged via
// managed_allocator<T>. All device_vector operations work unchanged.
template<class T>
using managed_device_vector = thrust::device_vector<T, managed_allocator<T>>;
// use it as a normal device vector
managed_device_vector<int> myDeviceVector(first, last); // element type must be spelled out: alias-template deduction needs C++20
... etc ...
Thanks for this very helpful code! The link to Jared’s allocator is broken.
Also I want to be able to:
cudaMallocManaged(&result, n*sizeof(T), cudaMemAttachGlobal)
OR:
cudaMallocManaged(&result, n*sizeof(T), cudaMemAttachHost)
Is it possible to make the flag passed to cudaMallocManaged be a template:
// CMallocFlag can be either cudaMemAttachHost or cudaMemAttachGlobal?
template<class T, unsigned int CMallocFlag>
class managed_allocator : public thrust::device_malloc_allocator<T>
{
public:
using value_type = T;
typedef thrust::device_ptr<T> pointer;
inline pointer allocate(size_t n)
{
value_type* result = nullptr;
cudaError_t error = cudaMallocManaged(&result, n*sizeof(T), CMallocFlag);
if(error != cudaSuccess)
{
throw thrust::system_error(error, thrust::cuda_category(), "managed_allocator::allocate(): cudaMallocManaged");
}
return thrust::device_pointer_cast(result);
}
inline void deallocate(pointer ptr, size_t)
{
cudaError_t error = cudaFree(thrust::raw_pointer_cast(ptr));
if(error != cudaSuccess)
{
throw thrust::system_error(error, thrust::cuda_category(), "managed_allocator::deallocate(): cudaFree");
}
}
};
// Managed vector whose memory starts attached to the host (cudaMemAttachHost);
// attach it to a stream with cudaStreamAttachMemAsync before device use.
template<class T>
using managed_host_vector = thrust::device_vector<T, managed_allocator<T, cudaMemAttachHost>>;
// Managed vector accessible from any stream on any device (cudaMemAttachGlobal).
template<class T>
using managed_device_vector = thrust::device_vector<T, managed_allocator<T, cudaMemAttachGlobal>>;