I have the following piece of code:
for (size_t i = 0; i < count; ++i)
{
    // Make device i current before creating its resources.
    CUDA_ERRCHK(cudaSetDevice(static_cast<int>(i)));
    // Per-device RMM async pool resource, plus a thread-safe wrapper around it.
    m_impl->_streamed_memory_per_device[i] = new rmm::mr::cuda_async_memory_resource();
    m_impl->_thread_safe_streamed_memory_per_device[i] = new StreamedDeviceMemory(m_impl->_mutexes[i], m_impl->_streamed_memory_per_device[i]);
    // Reserve a raw heap on this device and record its initial head.
    CUDA_ERRCHK(cudaMalloc(&m_impl->_heap_per_device[i], heap_size_per_device));
    m_impl->_heap_head_per_device[i] = initialize_memory(m_impl->_heap_per_device[i], heap_size_per_device);
}
The call CUDA_ERRCHK(cudaMalloc(&m_impl->_heap_per_device[i], heap_size_per_device)) returns cudaErrorInvalidValue.
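To narrow this down, one check I could add (a minimal sketch, untested; it reuses the names from my loop above and assumes device i is already current) is to query cudaMemGetInfo immediately before the failing call, to see how much memory the pool has actually left free:

// Sketch: print free/total device memory right before the failing cudaMalloc.
// Requires <cstdio>; i, m_impl and heap_size_per_device come from the loop above.
size_t free_bytes = 0;
size_t total_bytes = 0;
CUDA_ERRCHK(cudaMemGetInfo(&free_bytes, &total_bytes));
std::printf("device %zu: free = %zu, total = %zu, requested = %zu\n",
            i, free_bytes, total_bytes, heap_size_per_device);
CUDA_ERRCHK(cudaMalloc(&m_impl->_heap_per_device[i], heap_size_per_device));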
As far as I know, RMM uses cudaMallocFromPoolAsync to allocate memory, with a stream-ordered memory pool behind this API. From my observation, the RMM memory pool occupies half of the device memory on each GPU.
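(For context, my mental model of the underlying CUDA calls is roughly the following; this is a hand-written sketch of the stream-ordered allocator API, not RMM's actual implementation, and it uses the device's default pool for brevity, whereas I believe RMM creates its own pool:)

// Sketch of stream-ordered allocation via a memory pool (CUDA 11.2+).
cudaMemPool_t pool = nullptr;
CUDA_ERRCHK(cudaDeviceGetDefaultMemPool(&pool, /*device=*/0));
void* p = nullptr;
CUDA_ERRCHK(cudaMallocFromPoolAsync(&p, 1024, pool, /*stream=*/0));
CUDA_ERRCHK(cudaFreeAsync(p, /*stream=*/0));
CUDA_ERRCHK(cudaStreamSynchronize(0));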
Also, heap_size_per_device is 536870912, i.e. 512 MiB (0.5 GiB). In my understanding, even after the stream-ordered pool has been created, there should still be enough room to satisfy this cudaMalloc. This raises a concern: does this memory pool conflict with cudaMalloc in general?
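For what it's worth, if the pool's default size turns out to be the culprit, I assume I could cap it at construction time; recent RMM versions accept an optional initial pool size and release threshold in the cuda_async_memory_resource constructor (sketch, untested; the exact signature depends on the RMM release, and the 1 GiB figure is an arbitrary placeholder):

// Sketch: construct the per-device async pool with an explicit size cap
// instead of the default, which appears to claim half of device memory.
// NOTE: argument types/order depend on the RMM version; treat as pseudocode.
constexpr std::size_t pool_bytes = std::size_t{1} << 30; // 1 GiB, arbitrary
m_impl->_streamed_memory_per_device[i] =
    new rmm::mr::cuda_async_memory_resource(pool_bytes /*initial_pool_size*/,
                                            pool_bytes /*release_threshold*/);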