Unified memory not working completely

Hi Folks,

I am trying to declare “Unified Memory” and copy a buffer into it. After doing so I am trying to display the image using cv::imshow available in OpenCV. I am successful in getting one copy of the image. However, the program crashes with Segmentation fault if I try to create 2 copies and try to display the second copy. The code snippet is as follows :

// Acquire a Frame of size 1920x1080

        UniqueObj<Frame> frame(iFrameConsumer->acquireFrame());
        IFrame *iFrame = interface_cast<IFrame>(frame);
        if (!iFrame)
            break;

        // Get the Frame's Image.
        Image *image = iFrame->getImage();
        EGLStream::NV::IImageNativeBuffer *iImageNativeBuffer
              = interface_cast<EGLStream::NV::IImageNativeBuffer>(image);
        TEST_ERROR_RETURN(!iImageNativeBuffer, "Failed to create an IImageNativeBuffer");

        int fd = iImageNativeBuffer->createNvBuffer(Argus::Size {m_framesize.width, m_framesize.height},
               NvBufferColorFormat_YUV420, NvBufferLayout_Pitch, &status);
        if (status != STATUS_OK)
               TEST_ERROR_RETURN(status != STATUS_OK, "Failed to create a native buffer");

 #if 1

	cudaSetDeviceFlags(cudaDeviceMapHost);

        NvBufferParams params;
        NvBufferGetParams(fd, &params);

 	char *data_mem = NULL;
	
	int size = m_framesize.width* m_framesize.height;
	
        int fsize = params.pitch[0] * m_framesize.height ;
        data_mem = (char*)mmap(NULL, fsize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, params.offset[0]);
	
	char *d_myimagen = NULL;
	
	char* h_cudaout = NULL;

	int read1 = cudaMallocManaged(&h_cudaout, ( m_framesize.height *  m_framesize.width)*sizeof(char));

	cout<<"read1 : " <<read1 <<endl;
	cout<<"h_cudaout :" <<h_cudaout <<endl;
 
	int read2 = cudaMallocManaged(&d_myimagen, ( m_framesize.height *  m_framesize.width)*sizeof(char));

	cout<<"read2 : " <<read2 <<endl;
	cout<<"d_myimagen :" <<d_myimagen <<endl;

	int copy1 = cudaMemcpy (h_cudaout,data_mem,m_framesize.width*m_framesize.height,cudaMemcpyHostToDevice) ;	
	cout<<"copy1 : " <<copy1 <<endl;
 
	int copy2 = cudaMemcpy (d_myimagen,data_mem,m_framesize.width*m_framesize.height,cudaMemcpyHostToDevice) ;	
	cout<<"copy2 : " <<copy2 <<endl;
 

	cudaDeviceSynchronize();

	cv::Mat CudaOUTimgbuf1 = cv::Mat(m_framesize.height, m_framesize.width, CV_8UC1, (void *) d_myimagen , params.pitch[0]);
	cv::imshow("CudaOUTimgbuf1", CudaOUTimgbuf1);
	cv::waitKey(1);

The code works fine if I try to display “h_cudaout” instead of “d_myimagen”. Please help me figure out where I am going wrong.

Thanks

Hi,

You can’t call cv::imshow with GPU memory pointer.
OpenCV expects CPU data for display.

Please copy the memory back to host before calling imshow().
Thanks.

Hi AastaLLL,

Thanks for the response.

From my understanding, memory allocation done using cudaMallocManaged allocates “Unified Memory”, which can be accessed through the returned pointer on both the CPU and the GPU. Please let me know if I am getting something wrong with that interpretation.

In the code snippet shared, I first map the frame buffer into host address space using mmap, passing the fd returned by createNvBuffer and the plane offset obtained via NvBufferGetParams.

data_mem = (char*)mmap(NULL, fsize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, params.offset[0]);

From my understanding mmap allocates memory on CPU (host). Please correct me if I am wrong.

Now I wish to use this frame buffer to perform some calculations. For doing so I try to create two copies of the buffer using “Unified Memory” allocation (cudaMallocManaged) and copy the frame buffer into both these allocated memory locations.

char *d_myimagen = NULL;

char* h_cudaout = NULL;

int read1 = cudaMallocManaged(&h_cudaout, ( m_framesize.height *  m_framesize.width)*sizeof(char));

cout<<"read1 : " <<read1 <<endl;
cout<<"h_cudaout :" <<h_cudaout <<endl;
 
int read2 = cudaMallocManaged(&d_myimagen, ( m_framesize.height *  m_framesize.width)*sizeof(char));

cout<<"read2 : " <<read2 <<endl;
cout<<"d_myimagen :" <<d_myimagen <<endl;

int copy1 = cudaMemcpy (h_cudaout,data_mem,m_framesize.width*m_framesize.height,cudaMemcpyHostToDevice) ;	
cout<<"copy1 : " <<copy1 <<endl;
 
int copy2 = cudaMemcpy (d_myimagen,data_mem,m_framesize.width*m_framesize.height,cudaMemcpyHostToDevice) ;	
cout<<"copy2 : " <<copy2 <<endl;

The values returned by the ‘cudaMallocManaged’ and ‘cudaMemcpy’ calls are all ‘0’, which corresponds to “cudaSuccess”.

I try to display the image using imshow to check whether the copy has happened. However, the program correctly displays the captured image only for the buffer allocated first (here, the memory pointed to by h_cudaout) and crashes when displaying the buffer allocated second.

Working code : Successfully displays the captured image

cv::Mat CudaOUTimgbuf1 = cv::Mat(m_framesize.height, m_framesize.width, CV_8UC1, (void *) h_cudaout , params.pitch[0]);
cv::imshow("CudaOUTimgbuf1", CudaOUTimgbuf1);
cv::waitKey(1);

Crash code : crashes with “Segmentation Fault”

cv::Mat CudaOUTimgbuf1 = cv::Mat(m_framesize.height, m_framesize.width, CV_8UC1, (void *) d_myimagen , params.pitch[0]);
cv::imshow("CudaOUTimgbuf1", CudaOUTimgbuf1);
cv::waitKey(1);

If I change the order of allocation — for example, using the code below — the program instead crashes when I try to access the memory pointed to by h_cudaout:

char *d_myimagen = NULL;
	
char* h_cudaout = NULL;

int read2 = cudaMallocManaged(&d_myimagen, ( m_framesize.height *  m_framesize.width)*sizeof(char));
cout<<"read2 : " <<read2 <<endl;
cout<<"d_myimagen :" <<d_myimagen <<endl;

int read1 = cudaMallocManaged(&h_cudaout, ( m_framesize.height *  m_framesize.width)*sizeof(char));
cout<<"read1 : " <<read1 <<endl;
cout<<"h_cudaout :" <<h_cudaout <<endl;
 
int copy1 = cudaMemcpy (h_cudaout,data_mem,m_framesize.width*m_framesize.height,cudaMemcpyHostToDevice) ;	
cout<<"copy1 : " <<copy1 <<endl;

int copy2 = cudaMemcpy (d_myimagen,data_mem,m_framesize.width*m_framesize.height,cudaMemcpyHostToDevice) ;	
cout<<"copy2 : " <<copy2 <<endl;

Working code : Successfully displays the captured image

cv::Mat CudaOUTimgbuf1 = cv::Mat(m_framesize.height, m_framesize.width, CV_8UC1, (void *) d_myimagen , params.pitch[0]);
cv::imshow("CudaOUTimgbuf1", CudaOUTimgbuf1);
cv::waitKey(1);

Crash code : crashes with “Segmentation Fault”

cv::Mat CudaOUTimgbuf1 = cv::Mat(m_framesize.height, m_framesize.width, CV_8UC1, (void *) h_cudaout , params.pitch[0]);
cv::imshow("CudaOUTimgbuf1", CudaOUTimgbuf1);
cv::waitKey(1);

Following are some of the queries:

  1. If there is something wrong with passing a pointer to “Unified Memory” to imshow, then why does it work with the pointer to whichever buffer was allocated first?

  2. Why do the error codes returned indicate “cudaSuccess” if there is something wrong with cudaMallocManaged or cudaMemcpy of second buffer ?

  3. Is there any other way to directly receive a GPU memory buffer, or to map the buffer into GPU memory, instead of allocating “Unified Memory” and copying the input buffer into it (which is computationally expensive)? If yes, could you please point me to existing samples that I can refer to?

Thanks.

Hi,

Guess that this is a similar issue to topic_1018809.
Could you check it first?
https://devtalk.nvidia.com/default/topic/1018809/jetson-tx1/issue-with-access-of-nvbuffer-frame/post/5188031/#5188031

Unified memory usage is pretty similar to mapped memory.
Thanks.