Unified memory not working completely

Hi Folks,

I am trying to declare “Unified Memory” and copy a buffer into it. After doing so I am trying to display the image using cv::imshow available in OpenCV. I am successful in getting one copy of the image. However, the program crashes with Segmentation fault if I try to create 2 copies and try to display the second copy. The code snippet is as follows :

// Acquire a Frame of size 1920x1080

        UniqueObj<Frame> frame(iFrameConsumer->acquireFrame());
        IFrame *iFrame = interface_cast<IFrame>(frame);
        if (!iFrame)
            break;

        // Get the Frame's Image.
        Image *image = iFrame->getImage();
        EGLStream::NV::IImageNativeBuffer *iImageNativeBuffer
              = interface_cast<EGLStream::NV::IImageNativeBuffer>(image);
        TEST_ERROR_RETURN(!iImageNativeBuffer, "Failed to create an IImageNativeBuffer");

        int fd = iImageNativeBuffer->createNvBuffer(Argus::Size {m_framesize.width, m_framesize.height},
               NvBufferColorFormat_YUV420, NvBufferLayout_Pitch, &status);
        if (status != STATUS_OK)
               TEST_ERROR_RETURN(status != STATUS_OK, "Failed to create a native buffer");

 #if 1

	cudaSetDeviceFlags(cudaDeviceMapHost);

        NvBufferParams params;
        NvBufferGetParams(fd, &params);

 	char *data_mem = NULL;
	
	int size = m_framesize.width* m_framesize.height;
	
        int fsize = params.pitch[0] * m_framesize.height ;
        data_mem = (char*)mmap(NULL, fsize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, params.offset[0]);
	
	char *d_myimagen = NULL;
	
	char* h_cudaout = NULL;

	int read1 = cudaMallocManaged(&h_cudaout, ( m_framesize.height *  m_framesize.width)*sizeof(char));

	cout<<"read1 : " <<read1 <<endl;
	cout<<"h_cudaout :" <<h_cudaout <<endl;
 
	int read2 = cudaMallocManaged(&d_myimagen, ( m_framesize.height *  m_framesize.width)*sizeof(char));

	cout<<"read2 : " <<read2 <<endl;
	cout<<"d_myimagen :" <<d_myimagen <<endl;

	int copy1 = cudaMemcpy (h_cudaout,data_mem,m_framesize.width*m_framesize.height,cudaMemcpyHostToDevice) ;	
	cout<<"copy1 : " <<copy1 <<endl;
 
	int copy2 = cudaMemcpy (d_myimagen,data_mem,m_framesize.width*m_framesize.height,cudaMemcpyHostToDevice) ;	
	cout<<"copy2 : " <<copy2 <<endl;
 

	cudaDeviceSynchronize();

	cv::Mat CudaOUTimgbuf1 = cv::Mat(m_framesize.height, m_framesize.width, CV_8UC1, (void *) d_myimagen , params.pitch[0]);
	cv::imshow("CudaOUTimgbuf1", CudaOUTimgbuf1);
	cv::waitKey(1);

The code works fine if I try to display “h_cudaout” instead of “d_myimagen”. Please help me figure out where I am going wrong.

Thanks

Hi,

You can’t call cv::imshow with GPU memory pointer.
OpenCV expects CPU data for display.

Please copy the memory back to host before calling imshow().
Thanks.

Hi AastaLLL,

Thanks for the response.

From my understanding, memory allocation done using cudaMallocManaged allocates “Unified Memory”, which can be accessed through the returned pointer on both the CPU and the GPU. Please let me know if I am getting something wrong with that interpretation.

In the code snippet shared, I first map the frame buffer into host address space using mmap, passing the fd returned by createNvBuffer and the plane offset obtained via NvBufferGetParams.

data_mem = (char*)mmap(NULL, fsize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, params.offset[0]);

From my understanding mmap allocates memory on CPU (host). Please correct me if I am wrong.

Now I wish to use this frame buffer to perform some calculations. For doing so I try to create two copies of the buffer using “Unified Memory” allocation (cudaMallocManaged) and copy the frame buffer into both these allocated memory locations.

char *d_myimagen = NULL;

char* h_cudaout = NULL;

int read1 = cudaMallocManaged(&h_cudaout, ( m_framesize.height *  m_framesize.width)*sizeof(char));

cout<<"read1 : " <<read1 <<endl;
cout<<"h_cudaout :" <<h_cudaout <<endl;
 
int read2 = cudaMallocManaged(&d_myimagen, ( m_framesize.height *  m_framesize.width)*sizeof(char));

cout<<"read2 : " <<read2 <<endl;
cout<<"d_myimagen :" <<d_myimagen <<endl;

int copy1 = cudaMemcpy (h_cudaout,data_mem,m_framesize.width*m_framesize.height,cudaMemcpyHostToDevice) ;	
cout<<"copy1 : " <<copy1 <<endl;
 
int copy2 = cudaMemcpy (d_myimagen,data_mem,m_framesize.width*m_framesize.height,cudaMemcpyHostToDevice) ;	
cout<<"copy2 : " <<copy2 <<endl;

The values returned by the ‘cudaMallocManaged’ and ‘cudaMemcpy’ calls are all ‘0’, which corresponds to “cudaSuccess”.

I try to display the image using imshow to check whether the copy has happened. However, the program correctly displays the captured image only for the buffer allocated first (here, the memory pointed to by h_cudaout) and crashes when displaying the buffer allocated second.

Working code : Successfully displays the captured image

cv::Mat CudaOUTimgbuf1 = cv::Mat(m_framesize.height, m_framesize.width, CV_8UC1, (void *) h_cudaout , params.pitch[0]);
cv::imshow("CudaOUTimgbuf1", CudaOUTimgbuf1);
cv::waitKey(1);

Crash code : crashes with “Segmentation Fault”

cv::Mat CudaOUTimgbuf1 = cv::Mat(m_framesize.height, m_framesize.width, CV_8UC1, (void *) d_myimagen , params.pitch[0]);
cv::imshow("CudaOUTimgbuf1", CudaOUTimgbuf1);
cv::waitKey(1);

If I change the order of allocation — for example, using the code below — the program instead crashes when I try to access the memory pointed to by h_cudaout:

char *d_myimagen = NULL;
	
char* h_cudaout = NULL;

int read2 = cudaMallocManaged(&d_myimagen, ( m_framesize.height *  m_framesize.width)*sizeof(char));
cout<<"read2 : " <<read2 <<endl;
cout<<"d_myimagen :" <<d_myimagen <<endl;

int read1 = cudaMallocManaged(&h_cudaout, ( m_framesize.height *  m_framesize.width)*sizeof(char));
cout<<"read1 : " <<read1 <<endl;
cout<<"h_cudaout :" <<h_cudaout <<endl;
 
int copy1 = cudaMemcpy (h_cudaout,data_mem,m_framesize.width*m_framesize.height,cudaMemcpyHostToDevice) ;	
cout<<"copy1 : " <<copy1 <<endl;

int copy2 = cudaMemcpy (d_myimagen,data_mem,m_framesize.width*m_framesize.height,cudaMemcpyHostToDevice) ;	
cout<<"copy2 : " <<copy2 <<endl;

Working code : Successfully displays the captured image

cv::Mat CudaOUTimgbuf1 = cv::Mat(m_framesize.height, m_framesize.width, CV_8UC1, (void *) d_myimagen , params.pitch[0]);
cv::imshow("CudaOUTimgbuf1", CudaOUTimgbuf1);
cv::waitKey(1);

Crash code : crashes with “Segmentation Fault”

cv::Mat CudaOUTimgbuf1 = cv::Mat(m_framesize.height, m_framesize.width, CV_8UC1, (void *) h_cudaout , params.pitch[0]);
cv::imshow("CudaOUTimgbuf1", CudaOUTimgbuf1);
cv::waitKey(1);

Following are some of the queries:

  1. If there is something wrong with passing a pointer to “Unified Memory” to imshow, then why does it work with the pointer to whichever buffer was allocated first?

  2. Why do the error codes returned indicate “cudaSuccess” if there is something wrong with cudaMallocManaged or cudaMemcpy of second buffer ?

  3. Is there any other way to directly receive a GPU memory buffer, or to map the buffer into GPU memory, instead of allocating “Unified Memory” and copying the input buffer into it (which is computationally expensive)? If yes, could you please point me to existing samples that I can refer to?

Thanks.

Hi,

Guess that this is a similar issue to topic_1018809.
Could you check it first?
https://devtalk.nvidia.com/default/topic/1018809/jetson-tx1/issue-with-access-of-nvbuffer-frame/post/5188031/#5188031

Unified memory usage is pretty similar to mapped memory.
Thanks.