VPI wrap existing managed memory very slow

hannes09 · June 27, 2021, 8:41am

Hi,

I have tested the performance of vpiImageCreateCUDAMemWrapper and it seems to be very slow compared to simple memcpy.

For 100000 bytes of managed memory vpiImageCreateCUDAMemWrapper takes 320 micro seconds to wrap. memcpy takes 13 micro seconds. (xavier , power mode 3)

Can somebody explain what is going on here?

Code for testing vpiImageCreateCUDAMemWrappert performance:

#include <string>
#include <chrono>
#include <vpi/Stream.h>
#include <vpi/Image.h>
#include <vpi/algo/ConvertImageFormat.h>
#include <vpi/CUDAInterop.h>
#include <iostream>
#include <vector>
#include <cstdint>
#include <cstring>
#include <cuda.h>
#include "cuda_runtime.h"

inline bool cudaAllocMapped( void** cpuPtr, void** gpuPtr, size_t size )
{
	if( !cpuPtr || !gpuPtr || size == 0 )
		return false;

	if( cudaHostAlloc(cpuPtr, size, cudaHostAllocMapped) != cudaSuccess )
		return false;

	if( cudaHostGetDevicePointer(gpuPtr, *cpuPtr, 0) != cudaSuccess )
		return false;

	memset(*cpuPtr, 0, size);
	return true;
}

inline bool cudaAllocMapped( void** ptr, size_t size )
{
	void* cpuPtr = NULL;
	void* gpuPtr = NULL;

	if( !ptr || size == 0 )
		return false;

	if( !cudaAllocMapped(&cpuPtr, &gpuPtr, size) )
		return false;

	if( cpuPtr != gpuPtr )
	{
		std::cout<<"CUDA: cudaAllocMapped() - addresses of CPU and GPU pointers don't match"<<std::endl;
		return false;
	}

	*ptr = gpuPtr;
	return true;
}

class VPITest {
    public:
        VPITest();
        ~VPITest();
        void wrapCUDAMemoryU8(void * data,VPIBackend backend,VPIImage * image,
    int32_t height,int32_t width);
        void runTests();
    private:
        VPIStream m_vpiStream;
        VPIImage m_currImageU8;
        int m_width, m_height;
};

VPITest::VPITest() :
    m_width(1000),
    m_height(100)
{
    vpiStreamCreate(0, &m_vpiStream);
	cudaSetDeviceFlags(cudaDeviceMapHost);
}

VPITest::~VPITest()
{
  vpiStreamDestroy(m_vpiStream);
}

void VPITest::wrapCUDAMemoryU8(void * data,VPIBackend backend,VPIImage * image,
    int32_t height,int32_t width) {

    VPIImageData imgData;
    memset(&imgData, 0, sizeof(imgData));
    imgData.format               = VPI_IMAGE_FORMAT_U8;
    imgData.numPlanes            = 1;
    imgData.planes[0].width      = width;
    imgData.planes[0].height     = height;
    imgData.planes[0].pitchBytes = width;
    imgData.planes[0].data       = data;

    if(vpiImageCreateCUDAMemWrapper(&imgData,backend,image)!=VPI_SUCCESS)
    {
        std::cout<<"VPITest::wrapCUDAMemoryU8: vpiImageCreateCUDAMemWrapper failed"<<std::endl;
    }

}

void VPITest::runTests(){

    uint8_t * data;
    std::vector<uint8_t> testVec;
    testVec.resize(m_height * m_width);

    auto startedAlloc = std::chrono::high_resolution_clock::now();

    if( !cudaAllocMapped((void**)&data,m_height * m_width) )
        return;

    auto doneAlloc = std::chrono::high_resolution_clock::now();

    wrapCUDAMemoryU8(data, VPI_BACKEND_CUDA,&m_currImageU8,m_height,m_width);

    auto doneWrap = std::chrono::high_resolution_clock::now();

    vpiStreamSync(m_vpiStream);

    auto doneSync = std::chrono::high_resolution_clock::now();

    std::memcpy(&data[0], &testVec[0], m_height * m_width); 

    auto doneCpy = std::chrono::high_resolution_clock::now();

    std::cout<<"Alloc time: "<<std::chrono::duration_cast<std::chrono::microseconds>(doneAlloc-startedAlloc).count()<<std::endl;
    std::cout<<"Wrap time: " <<std::chrono::duration_cast<std::chrono::microseconds>(doneWrap-doneAlloc).count()<<std::endl;
    std::cout<<"Sync time: "<<std::chrono::duration_cast<std::chrono::microseconds>(doneSync-doneWrap).count()<<std::endl;
    std::cout<<"Memcpy time: "<<std::chrono::duration_cast<std::chrono::microseconds>(doneCpy-doneSync).count()<<std::endl;

}

int main(int argc, char *argv[]) {

    VPITest vpiTest;

    vpiTest.runTests();

    return 0;

}

AastaLLL · June 28, 2021, 4:14am

Hi,

Thanks for reporting this to us.
For comparison, could you also share the example of cudaMalloc with us?

Thanks.

hannes09 · June 28, 2021, 5:53am

I have added a test with cudaMalloc.

Results in micro seconds:

Alloc time managed page locked mem : 894
Alloc time Device mem: 982
Wrap time managed page locked mem: 290
Wrap time Device mem: 59
Sync time: 9
Memcpy time: 18

Device memory is much faster to wrap than managed memory, but still slow compared to memcpy.

#include <string>
#include <chrono>
#include <vpi/Stream.h>
#include <vpi/Image.h>
#include <vpi/algo/ConvertImageFormat.h>
#include <vpi/CUDAInterop.h>
#include <iostream>
#include <vector>
#include <cstdint>
#include <cstring>
#include <cuda.h>
#include "cuda_runtime.h"

inline bool cudaAllocMapped( void** cpuPtr, void** gpuPtr, size_t size )
{
	if( !cpuPtr || !gpuPtr || size == 0 )
		return false;

	if( cudaHostAlloc(cpuPtr, size, cudaHostAllocMapped) != cudaSuccess )
		return false;

	if( cudaHostGetDevicePointer(gpuPtr, *cpuPtr, 0) != cudaSuccess )
		return false;

	memset(*cpuPtr, 0, size);
	return true;
}

inline bool cudaAllocMapped( void** ptr, size_t size )
{
	void* cpuPtr = NULL;
	void* gpuPtr = NULL;

	if( !ptr || size == 0 )
		return false;

	if( !cudaAllocMapped(&cpuPtr, &gpuPtr, size) )
		return false;

	if( cpuPtr != gpuPtr )
	{
		std::cout<<"CUDA: cudaAllocMapped() - addresses of CPU and GPU pointers don't match"<<std::endl;
		return false;
	}

	*ptr = gpuPtr;
	return true;
}


class VPITest {
    public:
        VPITest();
        ~VPITest();
        void wrapCUDAMemoryU8(void * data,VPIBackend backend,VPIImage * image,
    int32_t height,int32_t width);
        void runTests();
    private:
        VPIStream m_vpiStream;
        VPIImage m_currImageU8;
        int m_width, m_height;
};

VPITest::VPITest() :
    m_width(1000),
    m_height(100)
{
    vpiStreamCreate(0, &m_vpiStream);
	cudaSetDeviceFlags(cudaDeviceMapHost);
}

VPITest::~VPITest()
{
  vpiStreamDestroy(m_vpiStream);
}

void VPITest::wrapCUDAMemoryU8(void * data,VPIBackend backend,VPIImage * image,
    int32_t height,int32_t width) {

    VPIImageData imgData;
    memset(&imgData, 0, sizeof(imgData));
    imgData.format               = VPI_IMAGE_FORMAT_U8;
    imgData.numPlanes            = 1;
    imgData.planes[0].width      = width;
    imgData.planes[0].height     = height;
    imgData.planes[0].pitchBytes = width;
    imgData.planes[0].data       = data;

    if(vpiImageCreateCUDAMemWrapper(&imgData,backend,image)!=VPI_SUCCESS)
    {
        std::cout<<"VPITest::wrapCUDAMemoryU8: vpiImageCreateCUDAMemWrapper failed"<<std::endl;
    }

}

void VPITest::runTests(){

    uint8_t * data;
    std::vector<uint8_t> testVec;
    testVec.resize(m_height * m_width);

    auto startedAlloc = std::chrono::high_resolution_clock::now();

    if( !cudaAllocMapped((void**)&data,m_height * m_width) )
        return;

    auto doneAllocMapped = std::chrono::high_resolution_clock::now();

    uint8_t * devPtr;
    cudaMalloc(&devPtr,m_height * m_width);

    auto doneAllocDev = std::chrono::high_resolution_clock::now();

    wrapCUDAMemoryU8(data, VPI_BACKEND_CUDA,&m_currImageU8,m_height,m_width);

    auto doneWrapManagedMem = std::chrono::high_resolution_clock::now();

    wrapCUDAMemoryU8(devPtr, VPI_BACKEND_CUDA,&m_currImageU8,m_height,m_width);

    auto doneWrapDevMem = std::chrono::high_resolution_clock::now();

    vpiStreamSync(m_vpiStream);

    auto doneSync = std::chrono::high_resolution_clock::now();

    std::memcpy(&data[0], &testVec[0], m_height * m_width); 

    auto doneCpy = std::chrono::high_resolution_clock::now();

    std::cout<<"Alloc time managed page locked mem : "<<std::chrono::duration_cast<std::chrono::microseconds>(doneAllocMapped-startedAlloc).count()<<std::endl;
    std::cout<<"Alloc time Device mem: "<<std::chrono::duration_cast<std::chrono::microseconds>(doneAllocDev-doneAllocMapped).count()<<std::endl;
    std::cout<<"Wrap time managed page locked mem: " <<std::chrono::duration_cast<std::chrono::microseconds>(doneWrapManagedMem-doneAllocDev).count()<<std::endl;
    std::cout<<"Wrap time Device mem: " <<std::chrono::duration_cast<std::chrono::microseconds>(doneWrapDevMem-doneWrapManagedMem).count()<<std::endl;
    std::cout<<"Sync time: "<<std::chrono::duration_cast<std::chrono::microseconds>(doneSync-doneWrapDevMem).count()<<std::endl;
    std::cout<<"Memcpy time: "<<std::chrono::duration_cast<std::chrono::microseconds>(doneCpy-doneSync).count()<<std::endl;

}

int main(int argc, char *argv[]) {

    VPITest vpiTest;

    vpiTest.runTests();

    return 0;

}

AastaLLL · June 30, 2021, 6:55am

Hi,

Thanks for sharing the example.

Based on your source, the cudaAllocMapped allocates a pinned memory rather than managed memory.
Managed memory refers to the buffer from cudaMallocManaged(...).

Do you want to compare the pinned memory and device memory?
You can find the memory type introduction on the below page:

Thanks.

hannes09 · June 30, 2021, 9:40am

Hi,

thanks for the help. I have added managed memory results. Hopefully correct now.

Results in micro seconds:

Alloc time pinned page locked mem : 962
Alloc time device mem: 983
Alloc time managed mem: 301
Wrap time pinned page locked mem: 273
Wrap time device mem: 52
Wrap time managed mem: 34
Sync time: 8
Memcpy time to pinned location: 19
Memcpy time to managed location: 242

Managed memory can be wrapped fast compared to pinned memory, but still slower than I woul have estimated.

#include <string>
#include <chrono>
#include <vpi/Stream.h>
#include <vpi/Image.h>
#include <vpi/algo/ConvertImageFormat.h>
#include <vpi/CUDAInterop.h>
#include <iostream>
#include <vector>
#include <cstdint>
#include <cstring>
#include <cuda.h>
#include "cuda_runtime.h"

inline bool cudaAllocMapped( void** cpuPtr, void** gpuPtr, size_t size )
{
	if( !cpuPtr || !gpuPtr || size == 0 )
		return false;

	if( cudaHostAlloc(cpuPtr, size, cudaHostAllocMapped) != cudaSuccess )
		return false;

	if( cudaHostGetDevicePointer(gpuPtr, *cpuPtr, 0) != cudaSuccess )
		return false;

	memset(*cpuPtr, 0, size);
	return true;
}

inline bool cudaAllocMapped( void** ptr, size_t size )
{
	void* cpuPtr = NULL;
	void* gpuPtr = NULL;

	if( !ptr || size == 0 )
		return false;

	if( !cudaAllocMapped(&cpuPtr, &gpuPtr, size) )
		return false;

	if( cpuPtr != gpuPtr )
	{
		std::cout<<"CUDA: cudaAllocMapped() - addresses of CPU and GPU pointers don't match"<<std::endl;
		return false;
	}

	*ptr = gpuPtr;
	return true;
}


class VPITest {
    public:
        VPITest();
        ~VPITest();
        void wrapCUDAMemoryU8(void * data,VPIBackend backend,VPIImage * image,
    int32_t height,int32_t width);
        void runTests();
    private:
        VPIStream m_vpiStream;
        VPIImage m_currImageU8;
        int m_width, m_height;
};

VPITest::VPITest() :
    m_width(1000),
    m_height(100)
{
    vpiStreamCreate(0, &m_vpiStream);
	cudaSetDeviceFlags(cudaDeviceMapHost);
}

VPITest::~VPITest()
{
  vpiStreamDestroy(m_vpiStream);
}

void VPITest::wrapCUDAMemoryU8(void * data,VPIBackend backend,VPIImage * image,
    int32_t height,int32_t width) {

    VPIImageData imgData;
    memset(&imgData, 0, sizeof(imgData));
    imgData.format               = VPI_IMAGE_FORMAT_U8;
    imgData.numPlanes            = 1;
    imgData.planes[0].width      = width;
    imgData.planes[0].height     = height;
    imgData.planes[0].pitchBytes = width;
    imgData.planes[0].data       = data;

    if(vpiImageCreateCUDAMemWrapper(&imgData,backend,image)!=VPI_SUCCESS)
    {
        std::cout<<"VPITest::wrapCUDAMemoryU8: vpiImageCreateCUDAMemWrapper failed"<<std::endl;
    }

}

void VPITest::runTests(){

    uint8_t * pinnedData;
    uint8_t * managedData;
    std::vector<uint8_t> testVec;
    testVec.resize(m_height * m_width);

    auto startedAlloc = std::chrono::high_resolution_clock::now();

    if( !cudaAllocMapped((void**)&pinnedData,m_height * m_width) )
        return;
    auto doneAllocMapped = std::chrono::high_resolution_clock::now();

    uint8_t * devPtr;
    cudaMalloc(&devPtr,m_height * m_width);
    auto doneAllocPinned = std::chrono::high_resolution_clock::now();
    
    cudaMallocManaged (&managedData,m_height * m_width);
    auto doneAllocManaged = std::chrono::high_resolution_clock::now();

    wrapCUDAMemoryU8(pinnedData, VPI_BACKEND_CUDA,&m_currImageU8,m_height,m_width);
    auto doneWrapPinnedMem = std::chrono::high_resolution_clock::now();

    wrapCUDAMemoryU8(devPtr, VPI_BACKEND_CUDA,&m_currImageU8,m_height,m_width);
    auto doneWrapDevMem = std::chrono::high_resolution_clock::now();

    wrapCUDAMemoryU8(managedData, VPI_BACKEND_CUDA,&m_currImageU8,m_height,m_width);
    auto doneWrapManagedMem = std::chrono::high_resolution_clock::now();

    vpiStreamSync(m_vpiStream);

    auto doneSync = std::chrono::high_resolution_clock::now();

    std::memcpy(&pinnedData[0], &testVec[0], m_height * m_width); 
    auto doneCpyPinned = std::chrono::high_resolution_clock::now();

    std::memcpy(&managedData[0], &testVec[0], m_height * m_width); 
    auto doneCpyManaged = std::chrono::high_resolution_clock::now();

    std::cout<<"Alloc time pinned page locked mem : "<<std::chrono::duration_cast<std::chrono::microseconds>(doneAllocMapped-startedAlloc).count()<<std::endl;
    std::cout<<"Alloc time device mem: "<<std::chrono::duration_cast<std::chrono::microseconds>(doneAllocPinned-doneAllocMapped).count()<<std::endl;
    std::cout<<"Alloc time managed mem: "<<std::chrono::duration_cast<std::chrono::microseconds>(doneAllocManaged-doneAllocPinned).count()<<std::endl;
    std::cout<<"Wrap time pinned page locked mem: " <<std::chrono::duration_cast<std::chrono::microseconds>(doneWrapPinnedMem-doneAllocManaged).count()<<std::endl;
    std::cout<<"Wrap time device mem: " <<std::chrono::duration_cast<std::chrono::microseconds>(doneWrapDevMem-doneWrapPinnedMem).count()<<std::endl;
    std::cout<<"Wrap time managed mem: " <<std::chrono::duration_cast<std::chrono::microseconds>(doneWrapManagedMem-doneWrapDevMem).count()<<std::endl;
    std::cout<<"Sync time: "<<std::chrono::duration_cast<std::chrono::microseconds>(doneSync-doneWrapManagedMem).count()<<std::endl;
    std::cout<<"Memcpy time to pinned location: "<<std::chrono::duration_cast<std::chrono::microseconds>(doneCpyPinned-doneSync).count()<<std::endl;
    std::cout<<"Memcpy time to managed location: "<<std::chrono::duration_cast<std::chrono::microseconds>(doneCpyManaged-doneCpyPinned).count()<<std::endl;

}

int main(int argc, char *argv[]) {

    VPITest vpiTest;

    vpiTest.runTests();

    return 0;

}

AastaLLL · July 1, 2021, 5:12am

Hi,

Based on your experiment, the wrapping time for unified memory is faster than device memory:

Wrap time device mem: 52
Wrap time managed mem: 34

Would you mind sharing your use case and requirement with us first?

Thanks.

hannes09 · July 1, 2021, 12:05pm

Hi,
my use case is some image processing of a high speed camera with image warping and some custom cuda kernels.
Learing about diffent memory types already helped me a lot. Dataflow starts on the host side.
I have changed the application design to use a ring buffer instead. So the memory of the ring buffer elemenst have to be wrapped only once, and not during the image processing.

But still I would like to understand why the wrapping takes some time. Initilialy I assumed that due to the cache coherence with xavier these kind of operations should be very fast.

AastaLLL · July 13, 2021, 5:32am

Hi,

As you know, wrapping should be a one-time job for continuous input.
The wrapping includes some initialization and setup which needs some time.

Thanks.

system · September 19, 2021, 5:58am

This topic was automatically closed 60 days after the last reply. New replies are no longer allowed.