Hi,
I have tested the performance of vpiImageCreateCUDAMemWrapper, and it seems to be very slow compared to a simple memcpy.
For 100000 bytes of mapped (zero-copy) host memory, vpiImageCreateCUDAMemWrapper takes 320 microseconds to wrap the buffer, while memcpy takes 13 microseconds (Xavier, power mode 3).
Can somebody explain what is going on here?
Code for testing vpiImageCreateCUDAMemWrapper performance:
#include <string>
#include <chrono>
#include <vpi/Stream.h>
#include <vpi/Image.h>
#include <vpi/algo/ConvertImageFormat.h>
#include <vpi/CUDAInterop.h>
#include <iostream>
#include <vector>
#include <cstdint>
#include <cstring>
#include <cuda.h>
#include "cuda_runtime.h"
inline bool cudaAllocMapped( void** cpuPtr, void** gpuPtr, size_t size )
{
if( !cpuPtr || !gpuPtr || size == 0 )
return false;
if( cudaHostAlloc(cpuPtr, size, cudaHostAllocMapped) != cudaSuccess )
return false;
if( cudaHostGetDevicePointer(gpuPtr, *cpuPtr, 0) != cudaSuccess )
return false;
memset(*cpuPtr, 0, size);
return true;
}
inline bool cudaAllocMapped( void** ptr, size_t size )
{
void* cpuPtr = NULL;
void* gpuPtr = NULL;
if( !ptr || size == 0 )
return false;
if( !cudaAllocMapped(&cpuPtr, &gpuPtr, size) )
return false;
if( cpuPtr != gpuPtr )
{
std::cout<<"CUDA: cudaAllocMapped() - addresses of CPU and GPU pointers don't match"<<std::endl;
return false;
}
*ptr = gpuPtr;
return true;
}
/**
 * Small timing harness around vpiImageCreateCUDAMemWrapper.
 *
 * The constructor creates a VPI stream and the destructor destroys it, so the
 * object owns that handle. Copying is disabled: a copy would hold the same
 * stream handle and destroy it a second time.
 */
class VPITest {
public:
    VPITest();
    ~VPITest();

    // Owning type: forbid copies to avoid destroying the stream twice.
    VPITest(const VPITest&) = delete;
    VPITest& operator=(const VPITest&) = delete;

    /**
     * Wrap a single-plane U8 buffer of `width` x `height` pixels in a VPIImage.
     * A message is printed if the wrap call fails.
     */
    void wrapCUDAMemoryU8(void * data,VPIBackend backend,VPIImage * image,
                          int32_t height,int32_t width);

    /** Run the allocation / wrap / sync / memcpy timing sequence once. */
    void runTests();

private:
    // Handles start out null so they never hold an indeterminate value.
    VPIStream m_vpiStream = nullptr;    // stream created in the constructor
    VPIImage m_currImageU8 = nullptr;   // image produced by the wrap call in runTests()
    int m_width, m_height;              // test image dimensions in pixels
};
/**
 * Set up a 1000 x 100 test configuration, request mapped host memory support
 * from CUDA and create the VPI stream.
 *
 * Failures are reported on stdout; the stream handle is left null if the
 * stream could not be created.
 */
VPITest::VPITest() :
    m_vpiStream(nullptr),
    m_currImageU8(nullptr),
    m_width(1000),
    m_height(100)
{
    // Set the device flags before any other CUDA/VPI call, so the request is
    // made before anything else may have initialized the device.
    const cudaError_t flagStatus = cudaSetDeviceFlags(cudaDeviceMapHost);
    if( flagStatus != cudaSuccess )
    {
        std::cout<<"VPITest::VPITest: cudaSetDeviceFlags failed: "
                 <<cudaGetErrorString(flagStatus)<<std::endl;
    }

    if( vpiStreamCreate(0, &m_vpiStream) != VPI_SUCCESS )
    {
        std::cout<<"VPITest::VPITest: vpiStreamCreate failed"<<std::endl;
        m_vpiStream = nullptr;
    }
}
/** Tear down the harness by destroying the VPI stream held in m_vpiStream. */
VPITest::~VPITest()
{
    vpiStreamDestroy(this->m_vpiStream);
}
/**
 * Describe `data` as a one-plane U8 image and hand it to
 * vpiImageCreateCUDAMemWrapper.
 *
 * The row pitch is set equal to `width`, i.e. rows are described as tightly
 * packed. Nothing is returned; a failure is only reported on stdout.
 *
 * @param data     buffer to wrap
 * @param backend  value forwarded as the second argument of the wrap call
 * @param image    receives the created image handle
 * @param height   image height in pixels
 * @param width    image width in pixels (also used as the pitch in bytes)
 */
void VPITest::wrapCUDAMemoryU8(void * data,VPIBackend backend,VPIImage * image,
                               int32_t height,int32_t width) {
    VPIImageData imgData;
    memset(&imgData, 0, sizeof(imgData));

    imgData.format    = VPI_IMAGE_FORMAT_U8;
    imgData.numPlanes = 1;

    // Fill in the only plane through a reference instead of repeated indexing.
    auto &plane = imgData.planes[0];
    plane.width      = width;
    plane.height     = height;
    plane.pitchBytes = width;
    plane.data       = data;

    const auto wrapStatus = vpiImageCreateCUDAMemWrapper(&imgData,backend,image);
    if( wrapStatus == VPI_SUCCESS )
        return;

    std::cout<<"VPITest::wrapCUDAMemoryU8: vpiImageCreateCUDAMemWrapper failed"<<std::endl;
}
void VPITest::runTests(){
uint8_t * data;
std::vector<uint8_t> testVec;
testVec.resize(m_height * m_width);
auto startedAlloc = std::chrono::high_resolution_clock::now();
if( !cudaAllocMapped((void**)&data,m_height * m_width) )
return;
auto doneAlloc = std::chrono::high_resolution_clock::now();
wrapCUDAMemoryU8(data, VPI_BACKEND_CUDA,&m_currImageU8,m_height,m_width);
auto doneWrap = std::chrono::high_resolution_clock::now();
vpiStreamSync(m_vpiStream);
auto doneSync = std::chrono::high_resolution_clock::now();
std::memcpy(&data[0], &testVec[0], m_height * m_width);
auto doneCpy = std::chrono::high_resolution_clock::now();
std::cout<<"Alloc time: "<<std::chrono::duration_cast<std::chrono::microseconds>(doneAlloc-startedAlloc).count()<<std::endl;
std::cout<<"Wrap time: " <<std::chrono::duration_cast<std::chrono::microseconds>(doneWrap-doneAlloc).count()<<std::endl;
std::cout<<"Sync time: "<<std::chrono::duration_cast<std::chrono::microseconds>(doneSync-doneWrap).count()<<std::endl;
std::cout<<"Memcpy time: "<<std::chrono::duration_cast<std::chrono::microseconds>(doneCpy-doneSync).count()<<std::endl;
}
int main(int argc, char *argv[]) {
VPITest vpiTest;
vpiTest.runTests();
return 0;
}