Parallel processing twice stereo matching

Hi

I want to run the stereo matching process twice. The inputs are the same (the left and right rectified images), but the outputs are not (one pass computes the left-to-right disparity, the other the right-to-left disparity).
After looking into this forum, I found a way to do this: create two different graphs, each handling its own stereo matching. But after I finished my code, the result is the same as using one graph to do stereo matching twice. The code follows.

// Builds two independent OpenVX graphs so that the left-to-right and
// right-to-left disparity computations can, in principle, be scheduled
// concurrently:
//   main_graph_: (left, right) -> SGBM -> convert-depth -> left_disparity
//   r_graph_:    (left, right) -> h-flip both -> SGBM (roles swapped)
//                -> convert-depth -> h-flip -> right_disparity
// NOTE(review): on a single GPU both graphs still share the device, so
// running them concurrently does not necessarily halve wall-clock time.
//
// Fixes vs. previous revision:
//  - removed the vxQueryImage(FORMAT) call: `format` was never read;
//  - r_graph_ is now initialized in the initializer list, consistent with
//    main_graph_, instead of being assigned mid-body;
//  - calib_params is explicitly marked unused (kept for interface
//    compatibility).
SGBM::SGBM(vx_context context, const StereoMatchingParams& params,
           const StereoCalibParams& calib_params,
           vx_image left, vx_image right, vx_image left_disparity, vx_image right_disparity)
    : main_graph_(nullptr)
    , r_graph_(nullptr)
{
    (void)calib_params; // currently unused; kept so the signature matches callers

    // Both graphs size their intermediate images from the left input.
    vx_uint32 width = 0;
    vx_uint32 height = 0;
    NVXIO_SAFE_CALL( vxQueryImage(left, VX_IMAGE_ATTRIBUTE_WIDTH, &width, sizeof(width)) );
    NVXIO_SAFE_CALL( vxQueryImage(left, VX_IMAGE_ATTRIBUTE_HEIGHT, &height, sizeof(height)) );

    main_graph_ = vxCreateGraph(context);
    r_graph_ = vxCreateGraph(context);
    NVXIO_CHECK_REFERENCE(main_graph_);
    NVXIO_CHECK_REFERENCE(r_graph_);

    // ---------- left-to-right stereo (main_graph_) ----------
    // Raw SGBM output is S16 fixed point; a virtual image keeps it
    // graph-internal.
    vx_image disparity_short = vxCreateVirtualImage(main_graph_, width, height, VX_DF_IMAGE_S16);
    NVXIO_CHECK_REFERENCE(disparity_short);

    semi_global_matching_node_ = nvxSemiGlobalMatchingNode(
        main_graph_,
        left,
        right,
        disparity_short,
        params.min_disparity,
        params.max_disparity,
        params.P1,
        params.P2,
        params.sad,
        params.ct_win_size,
        params.hc_win_size,
        params.bt_clip_value,
        params.max_diff,
        params.uniqueness_ratio,
        params.scanlines_mask,
        params.flags);
    NVXIO_CHECK_REFERENCE(semi_global_matching_node_);

    // Convert the fixed-point disparity to grayscale. shift = 4 drops the
    // fractional bits (SGBM output is assumed to carry 4 fractional bits —
    // confirm against the VisionWorks SGBM documentation).
    vx_int32 shift = 4;
    vx_scalar s_shift = vxCreateScalar(context, VX_TYPE_INT32, &shift);
    NVXIO_CHECK_REFERENCE(s_shift);
    convert_depth_node_ = vxConvertDepthNode(main_graph_, disparity_short, left_disparity, VX_CONVERT_POLICY_SATURATE, s_shift);
    NVXIO_CHECK_REFERENCE(convert_depth_node_);

    // ---------- right-to-left stereo (r_graph_) ----------
    // SGBM natively matches left-to-right, so both inputs are flipped
    // horizontally, matched with the image roles swapped, and the resulting
    // disparity map is flipped back.
    vx_image r_disparity_short = vxCreateVirtualImage(r_graph_, width, height, VX_DF_IMAGE_S16);
    NVXIO_CHECK_REFERENCE(r_disparity_short);

    vx_image flip_left_gray = vxCreateVirtualImage(r_graph_, width, height, VX_DF_IMAGE_U8);
    NVXIO_CHECK_REFERENCE(flip_left_gray);

    vx_image flip_right_gray = vxCreateVirtualImage(r_graph_, width, height, VX_DF_IMAGE_U8);
    NVXIO_CHECK_REFERENCE(flip_right_gray);

    left_flip_node_ = nvxFlipImageNode(r_graph_, left, flip_left_gray, NVX_FLIP_HORIZONTAL);
    NVXIO_CHECK_REFERENCE(left_flip_node_);

    right_flip_node_ = nvxFlipImageNode(r_graph_, right, flip_right_gray, NVX_FLIP_HORIZONTAL);
    NVXIO_CHECK_REFERENCE(right_flip_node_);

    vx_image flip_disparity = vxCreateVirtualImage(r_graph_, width, height, VX_DF_IMAGE_U8);
    NVXIO_CHECK_REFERENCE(flip_disparity);

    r_semi_global_matching_node_ = nvxSemiGlobalMatchingNode(
        r_graph_,
        flip_right_gray,   // flipped right acts as the "left" image
        flip_left_gray,    // flipped left acts as the "right" image
        r_disparity_short,
        params.min_disparity,
        params.max_disparity,
        params.P1,
        params.P2,
        params.sad,
        params.ct_win_size,
        params.hc_win_size,
        params.bt_clip_value,
        params.max_diff,
        params.uniqueness_ratio,
        params.scanlines_mask,
        params.flags);
    NVXIO_CHECK_REFERENCE(r_semi_global_matching_node_);

    // Convert disparity from fixed point to grayscale (same shift as above).
    r_convert_depth_node_ = vxConvertDepthNode(r_graph_, r_disparity_short, flip_disparity, VX_CONVERT_POLICY_SATURATE, s_shift);
    NVXIO_CHECK_REFERENCE(r_convert_depth_node_);
    vxReleaseScalar(&s_shift); // both ConvertDepth nodes hold their own reference now

    // Flip the disparity back into the original (unflipped) orientation.
    flip_depth_node_ = nvxFlipImageNode(r_graph_, flip_disparity, right_disparity, NVX_FLIP_HORIZONTAL);
    NVXIO_CHECK_REFERENCE(flip_depth_node_);

    // Verify both graphs up front so errors surface here, not at run time.
    NVXIO_SAFE_CALL( vxVerifyGraph(main_graph_) );
    NVXIO_SAFE_CALL( vxVerifyGraph(r_graph_) );

    // The graphs keep their own references to the virtual images; release
    // the local handles.
    vxReleaseImage(&disparity_short);
    vxReleaseImage(&flip_disparity);
    vxReleaseImage(&flip_left_gray);
    vxReleaseImage(&flip_right_gray);
    vxReleaseImage(&r_disparity_short);
}
I have defined "r_graph_" in class SGBM:
// Two-graph SGBM stereo matcher: main_graph_ produces the left-to-right
// disparity, r_graph_ produces the right-to-left disparity via a
// flip -> SGBM -> flip pipeline over the same rectified input pair.
class SGBM : public StereoMatching
    {
    public:
        // Builds and verifies both graphs. left/right are the rectified
        // input images; left_disparity/right_disparity receive the results.
        SGBM(vx_context context, const StereoMatchingParams& params,
	     const StereoCalibParams& calibParams,
             vx_image left, vx_image right, vx_image left_disparity, vx_image right_disparity);
        ~SGBM();

        // NOTE(review): defined elsewhere — presumably processes the
        // graph(s); confirm it executes both main_graph_ and r_graph_.
        virtual void run();

        void printPerfs() const;

    private:
        vx_graph main_graph_;  // left -> right disparity pipeline
        vx_graph r_graph_;     // right -> left disparity pipeline (flipped inputs)
		vx_node left_flip_node_;   // horizontal flip of left input (r_graph_)
		vx_node right_flip_node_;  // horizontal flip of right input (r_graph_)
		vx_node flip_depth_node_;  // flips the r_graph_ disparity back

        // NOTE(review): the four nodes below are not initialized in the
        // constructor shown in this thread — possibly legacy/unused; confirm.
        vx_node left_cvt_color_node_;
        vx_node right_cvt_color_node_;
        vx_node left_remap_node_;
        vx_node right_remap_node_;
        vx_node semi_global_matching_node_;    // SGBM node of main_graph_
        vx_node r_semi_global_matching_node_;  // SGBM node of r_graph_
        vx_node convert_depth_node_;           // S16 -> U8 conversion (main_graph_)
        vx_node r_convert_depth_node_;         // S16 -> U8 conversion (r_graph_)
    };
The result is no different from using only one graph; in particular, the time consumed is much the same.
Maybe I missed something when coding — please help me figure this out.
Thanks.

BR.

Hi.

Is there anyone had this kind of experience?

BR

Hi,

You need two pipelines:

  1. (left, right) -> stereo -> (disparity)
  2. (left, right) -> flipping -> (f_left, f_right) -> stereo -> (f_disparity) -> flipping -> (disparity)

Please check that you have these two pipelines implemented and that they function correctly.

Thanks.

Hi:

 I am sure about this; I have tried these two pipelines, and both function well.
 So how can I make these two pipelines process in parallel?

BR

Hi,

For the concurrent call, you can check this section of VisionWorks document:

Programming Model Overview > Valid Concurrent Calls

Thanks.

Hi.

I think I had followed the document guide. As is in "Shared Graph Input" one. 
But after I test the code, the time is the same.
The code is in the first comment in this issue.

Thanks.

Hi,

You can write the two pipeline in the same graph.
When you process the graph, two pipelines will be executed, and you can get the results at the same time.

Hi,

I don’t know if I understand you in the right way. So I list my code in the following.
The following code includes two graph:

main_graph_ = vxCreateGraph(context);
        NVXIO_CHECK_REFERENCE(main_graph_);
        // evaluate stereo
        vx_image disparity_short = vxCreateVirtualImage(main_graph_, width, height, VX_DF_IMAGE_S16);
        NVXIO_CHECK_REFERENCE(disparity_short);
       
	  	semi_global_matching_node_ = nvxSemiGlobalMatchingNode(
		        main_graph_,
		        left,
		        right,
		        disparity_short,
		        params.min_disparity,
		        params.max_disparity,
		        params.P1,
		        params.P2,
		        params.sad,
		        params.ct_win_size,
		        params.hc_win_size,
		        params.bt_clip_value,
		        params.max_diff,
		        params.uniqueness_ratio,
		        params.scanlines_mask,
		        params.flags);
        NVXIO_CHECK_REFERENCE(semi_global_matching_node_);     
   
 		// convert disparity from fixed point to grayscale
        vx_int32 shift = 4;
        vx_scalar s_shift = vxCreateScalar(context, VX_TYPE_INT32, &shift);
        NVXIO_CHECK_REFERENCE(s_shift);
        convert_depth_node_ = vxConvertDepthNode(main_graph_, disparity_short, left_disparity, VX_CONVERT_POLICY_SATURATE, s_shift);
		NVXIO_CHECK_REFERENCE(convert_depth_node_);


		/*/\ parrallel right to left disparity */

		main_graph_parrallel = vxCreateGraph(context);
        NVXIO_CHECK_REFERENCE(main_graph_parrallel);

		vx_image r_disparity_short = vxCreateVirtualImage(main_graph_parrallel, width, height, VX_DF_IMAGE_S16);
        NVXIO_CHECK_REFERENCE(r_disparity_short);

		vx_image flip_left_gray = vxCreateVirtualImage(main_graph_parrallel, width, height, VX_DF_IMAGE_U8);
        NVXIO_CHECK_REFERENCE(flip_left_gray);

        vx_image flip_right_gray = vxCreateVirtualImage(main_graph_parrallel, width, height, VX_DF_IMAGE_U8);
        NVXIO_CHECK_REFERENCE(flip_right_gray);

		left_flip_node_ = nvxFlipImageNode(main_graph_parrallel, left, flip_left_gray, NVX_FLIP_HORIZONTAL);
        NVXIO_CHECK_REFERENCE(left_flip_node_);
		
		right_flip_node_ = nvxFlipImageNode(main_graph_parrallel, right, flip_right_gray, NVX_FLIP_HORIZONTAL);
        NVXIO_CHECK_REFERENCE(right_flip_node_);

		vx_image flip_disparity = vxCreateVirtualImage(main_graph_parrallel, width, height, VX_DF_IMAGE_U8);
        NVXIO_CHECK_REFERENCE(flip_disparity);

		r_semi_global_matching_node_ = nvxSemiGlobalMatchingNode(
            main_graph_parrallel,
            flip_right_gray,
            flip_left_gray,
            r_disparity_short,
            params.min_disparity,
            params.max_disparity,
            params.P1,
            params.P2,
            params.sad,
            params.ct_win_size,
            params.hc_win_size,
            params.bt_clip_value,
            params.max_diff,
            params.uniqueness_ratio,
            params.scanlines_mask,
            params.flags);
        NVXIO_CHECK_REFERENCE(r_semi_global_matching_node_);

        r_convert_depth_node_ = vxConvertDepthNode(main_graph_parrallel, r_disparity_short, flip_disparity, VX_CONVERT_POLICY_SATURATE, s_shift);
		NVXIO_CHECK_REFERENCE(r_convert_depth_node_);
        vxReleaseScalar(&s_shift);
        
		flip_depth_node_ = nvxFlipImageNode(main_graph_parrallel, flip_disparity, right_disparity, NVX_FLIP_HORIZONTAL);
        NVXIO_CHECK_REFERENCE(flip_depth_node_);

And the following code includes one graph:

main_graph_ = vxCreateGraph(context);

        NVXIO_CHECK_REFERENCE(main_graph_);


        // left stereo
        vx_image disparity_short = vxCreateVirtualImage(main_graph_, width, height, VX_DF_IMAGE_S16);
        NVXIO_CHECK_REFERENCE(disparity_short);
       
        // right stereo
        vx_image r_disparity_short = vxCreateVirtualImage(main_graph_, width, height, VX_DF_IMAGE_S16);
        NVXIO_CHECK_REFERENCE(r_disparity_short);

		vx_image flip_left_gray = vxCreateVirtualImage(main_graph_, width, height, VX_DF_IMAGE_U8);
        NVXIO_CHECK_REFERENCE(flip_left_gray);

        vx_image flip_right_gray = vxCreateVirtualImage(main_graph_, width, height, VX_DF_IMAGE_U8);
        NVXIO_CHECK_REFERENCE(flip_right_gray);

		left_flip_node_ = nvxFlipImageNode(main_graph_, left, flip_left_gray, NVX_FLIP_HORIZONTAL);
        NVXIO_CHECK_REFERENCE(left_flip_node_);
		
		right_flip_node_ = nvxFlipImageNode(main_graph_, right, flip_right_gray, NVX_FLIP_HORIZONTAL);
        NVXIO_CHECK_REFERENCE(right_flip_node_);

		vx_image flip_disparity = vxCreateVirtualImage(main_graph_, width, height, VX_DF_IMAGE_U8);
        NVXIO_CHECK_REFERENCE(flip_disparity);


        semi_global_matching_node_ = nvxSemiGlobalMatchingNode(
            main_graph_,
            left,
            right,
            disparity_short,
            params.min_disparity,
            params.max_disparity,
            params.P1,
            params.P2,
            params.sad,
            params.ct_win_size,
            params.hc_win_size,
            params.bt_clip_value,
            params.max_diff,
            params.uniqueness_ratio,
            params.scanlines_mask,
            params.flags);
        NVXIO_CHECK_REFERENCE(semi_global_matching_node_);

	    r_semi_global_matching_node_ = nvxSemiGlobalMatchingNode(
            main_graph_,
            flip_right_gray,
            flip_left_gray,
            r_disparity_short,
            params.min_disparity,
            params.max_disparity,
            params.P1,
            params.P2,
            params.sad,
            params.ct_win_size,
            params.hc_win_size,
            params.bt_clip_value,
            params.max_diff,
            params.uniqueness_ratio,
            params.scanlines_mask,
            params.flags);
        NVXIO_CHECK_REFERENCE(r_semi_global_matching_node_);

        // convert disparity from fixed point to grayscale
        vx_int32 shift = 4;
        vx_scalar s_shift = vxCreateScalar(context, VX_TYPE_INT32, &shift);
        NVXIO_CHECK_REFERENCE(s_shift);

        convert_depth_node_ = vxConvertDepthNode(main_graph_, disparity_short, left_disparity, VX_CONVERT_POLICY_SATURATE, s_shift);
		NVXIO_CHECK_REFERENCE(convert_depth_node_);

        r_convert_depth_node_ = vxConvertDepthNode(main_graph_, r_disparity_short, flip_disparity, VX_CONVERT_POLICY_SATURATE, s_shift);
		NVXIO_CHECK_REFERENCE(r_convert_depth_node_);
        vxReleaseScalar(&s_shift);
        
		flip_depth_node_ = nvxFlipImageNode(main_graph_, flip_disparity, right_disparity, NVX_FLIP_HORIZONTAL);
        NVXIO_CHECK_REFERENCE(flip_depth_node_);

These two versions have the same performance; in particular, the time cost is the same.
As you can see, the second version has only one graph, and the two SGBM nodes are written in order. I want to know whether these two nodes are executed at the same time.
And how can I design this program to make these two nodes process in order?

Thanks!

Hi,

The second one looks great.
Could you get the correct left/right disparity via left_disparity/right_disparity?

Thanks.

Hi,

Yes, I do. I could have correct left/right disparity.
Now these two part of code get the same result and the same performance(have the same time consuming).
And I am not sure I had process two sgbm nodes concurrently. Or how can I process two sgbm nodes concurrently. 

Thanks.

Hi,

It looks like you can get left_disparity and right_disparity at the same graph execution.
If the result of both disparities are correct, that should be what you want.

Hi.

One more question, how can I process these two nodes execute in order?

BR

Hi,

You can declare two graphs and call the graph execution in sequence.

Thanks.

Hi.

After trying the methods I mentioned before, I do get the correct disparity maps, but the two SGBM passes did not process concurrently: the time consumed by these methods is twice that of a single SGBM pass.

So could you tell me how to parallel process two sgbm?

I had tried the " VisionWorks document:> Programming Model Overview > Valid Concurrent Calls". And the code I had listed before.

Thanks.

Hi.

After trying the methods I mentioned before, I do get the correct disparity maps, but the two SGBM passes did not process concurrently: the time consumed by these methods is twice that of a single SGBM pass.

So could you tell me how to parallel process two sgbm?

I had tried the " VisionWorks document:> Programming Model Overview > Valid Concurrent Calls". And the code I had listed before.

Can you look into it, and tell me how to parallel process these two sgbm?

Thanks.

Hi,

Could you check the performance via nvprof first?
We want to know the twice time comes from full-occupied GPU or sequential scheduling.

Thanks.

Hi

Here are several command I tried, and the result that I got:

  1. "nvprof ./vision_node left1.jpg right1.jpg "
==2646== NVPROF is profiling process 2646, command: ./vision_node left1.jpg right1.jpg
==2646== Warning: Unified Memory Profiling is not supported on the underlying platform. System requirements for unified memory can be found at: http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#um-requirements
min  =  -9.779840, max = 756.867310min  =  -43.767956, max = 476.105225VisionWorks library info:
	 VisionWorks version : 1.6.0
	 OpenVX Standard version : 1.1.0

init done
success!!!!!
==2646== Profiling application: ./vision_node left1.jpg right1.jpg
==2646== Profiling result:
Time(%)      Time     Calls       Avg       Min       Max  Name
 41.32%  49.449ms        12  4.1208ms  3.5458ms  5.2072ms  void _GLOBAL__N__69_tmpxft_00004923_00000000_7_cuda_sgbm_aggregate_cost_scanlines_cpp1_ii_setup::scanlineDiagonalPow2x4<int=64>(cudev::GlobPtr<unsigned int>, _GLOBAL__N__69_tmpxft_00004923_00000000_7_cuda_sgbm_aggregate_cost_scanlines_cpp1_ii_setup::scanlineDiagonalPow2x4<int=64>::GlobPtrSz<unsigned __int64>, int, int, int, int, int, int, int)
 21.47%  25.696ms         2  12.848ms  3.0743ms  22.622ms  _GLOBAL__N__59_tmpxft_00004c34_00000000_7_cuda_sgbm_compute_costbt_cpp1_ii_bfe380cd::modifiedCostBTUnrolledMinMax(cudev::GlobPtr<unsigned int>, cudev::GlobPtr<unsigned int>, cudev::GlobPtr<unsigned int>, cudev::GlobPtr<unsigned int>, cudev::GlobPtrSz<unsigned char>, int, int, int, int, int)
 20.14%  24.096ms         4  6.0241ms  4.7773ms  7.2784ms  void _GLOBAL__N__69_tmpxft_00004923_00000000_7_cuda_sgbm_aggregate_cost_scanlines_cpp1_ii_setup::scanlineLeftRightPow2x2<unsigned char, unsigned short, int=64>(cudev::GlobPtr<unsigned short>, _GLOBAL__N__69_tmpxft_00004923_00000000_7_cuda_sgbm_aggregate_cost_scanlines_cpp1_ii_setup::scanlineLeftRightPow2x2<unsigned char, unsigned short, int=64>::GlobPtrSz<unsigned int>, int, int, int, int, int, int)
  6.53%  7.8151ms         2  3.9075ms  2.8396ms  4.9755ms  _GLOBAL__N__57_tmpxft_00004799_00000000_7_cuda_sgbm_convolvecost_cpp1_ii_8385c870::boxNorm3x3_U8Unrolled(cudev::GlobPtr<unsigned int>, cudev::GlobPtrSz<unsigned int>, int)
  5.94%  7.1041ms         2  3.5520ms  3.5491ms  3.5550ms  dispFromCostBW8(cudev::GlobPtr<short>, cudev::GlobPtrSz<short>, int, int, int, int, unsigned int)
  2.29%  2.7360ms         4  684.01us  164.01us  1.2021ms  _GLOBAL__N__59_tmpxft_00004c34_00000000_7_cuda_sgbm_compute_costbt_cpp1_ii_bfe380cd::sobelMinMax(cudev::GlobPtr<unsigned int>, cudev::GlobPtrSz<unsigned int>, int, int, int)
  1.70%  2.0284ms         2  1.0142ms  372.81us  1.6556ms  void _GLOBAL__N__59_tmpxft_00004c34_00000000_7_cuda_sgbm_compute_costbt_cpp1_ii_bfe380cd::invalidateBorderCost<unsigned char, uchar4>(cudev::GlobPtrSz<unsigned char>, int, int, int, int, int)
  0.26%  309.99us         2  155.00us  154.92us  155.08us  [CUDA memcpy HtoD]
  0.21%  252.71us         3  84.237us  28.321us  114.82us  void flipImage_horizontal_vload<unsigned char>(unsigned char const *, int, unsigned char*, int, int, int, int)
  0.06%  73.763us         2  36.881us  36.322us  37.441us  void cudev::grid_transform_detail::transformVectorized_warpwise<int=32, int=4, short, unsigned char, cudev::UnaryComposition<cudev::Binder2nd<cudev::bit_rshift<short>>, cudev::saturate_cast_func<short, unsigned char>>, cudev::WithOutMask>(cudev::GlobPtr<short>, cudev::grid_transform_detail::transformVectorized_warpwise<int=32, int=4, short, unsigned char, cudev::UnaryComposition<cudev::Binder2nd<cudev::bit_rshift<short, unsigned char>>, cudev::saturate_cast_func<short, unsigned char>>, cudev::WithOutMask>, short, cudev::bit_rshift<short>, int, int)
  0.05%  55.072us        45  1.2230us     320ns  2.6880us  [CUDA memset]
  0.04%  45.729us         2  22.864us  22.241us  23.488us  [CUDA memcpy DtoH]

==2646== API calls:
Time(%)      Time     Calls       Avg       Min       Max  Name
 78.46%  581.77ms        42  13.852ms  18.784us  574.33ms  cudaFree
 19.55%  144.93ms         1  144.93ms  144.93ms  144.93ms  cudaStreamSynchronize
  0.52%  3.8471ms        14  274.79us  17.312us  3.4828ms  cudaEventRecord
  0.36%  2.6456ms        39  67.836us  37.152us  304.80us  cudaMemset
  0.33%  2.4712ms        41  60.273us  14.176us  687.71us  cudaMalloc
  0.31%  2.2681ms         4  567.02us  301.86us  1.0245ms  cudaMemcpy2D
  0.22%  1.6198ms        33  49.086us  34.912us  182.53us  cudaLaunch
  0.07%  541.76us       137  3.9540us     992ns  32.640us  cudaGetDevice
  0.05%  357.28us         2  178.64us  164.10us  193.18us  cudaStreamCreate
  0.04%  318.59us         6  53.098us  36.160us  67.232us  cudaMemset2DAsync
  0.02%  136.29us       245     556ns     448ns  1.7920us  cudaSetupArgument
  0.02%  128.70us         2  64.352us  44.896us  83.808us  cudaStreamDestroy
  0.01%  77.568us        54  1.4360us     480ns  33.344us  cudaGetLastError
  0.01%  77.504us         7  11.072us  7.5520us  28.416us  cudaEventElapsedTime
  0.01%  76.928us        91     845ns     416ns  15.136us  cuDeviceGetAttribute
  0.01%  58.624us        14  4.1870us  2.4960us  14.432us  cudaEventCreate
  0.01%  38.848us        14  2.7740us  2.2400us  6.6560us  cudaEventDestroy
  0.00%  24.128us         1  24.128us  24.128us  24.128us  cudaGetDeviceProperties
  0.00%  22.112us        33     670ns     512ns  1.9200us  cudaConfigureCall
  0.00%  9.5360us         1  9.5360us  9.5360us  9.5360us  cudaSetDevice
  0.00%  4.8320us         3  1.6100us     704ns  2.5600us  cuDeviceGetCount
  0.00%  4.0640us         2  2.0320us     672ns  3.3920us  cudaCreateChannelDesc
  0.00%  3.2640us         1  3.2640us  3.2640us  3.2640us  cuDeviceTotalMem
  0.00%  2.2720us         3     757ns     704ns     832ns  cuDeviceGet
  0.00%  1.2800us         1  1.2800us  1.2800us  1.2800us  cudaGetDeviceCount
  0.00%     992ns         1     992ns     992ns     992ns  cuDeviceGetName
  0.00%     768ns         1     768ns     768ns     768ns  cudaDriverGetVersion
  0.00%     544ns         1     544ns     544ns     544ns  cudaRuntimeGetVersion
  1. “nvprof --metrics achieved_occupancy ./vision_node left1.jpg right1.jpg”
==2606== NVPROF is profiling process 2606, command: ./vision_node left1.jpg right1.jpg
min  =  -9.779840, max = 756.867310min  =  -43.767956, max = 476.105225VisionWorks library info:
	 VisionWorks version : 1.6.0
	 OpenVX Standard version : 1.1.0

init done
success!!!!!
==2606== Profiling application: ./vision_node left1.jpg right1.jpg
==2606== Profiling result:
==2606== Metric result:
Invocations                               Metric Name                        Metric Description         Min         Max         Avg
Device "GP10B (0)"
    Kernel: void _GLOBAL__N__59_tmpxft_00004c34_00000000_7_cuda_sgbm_compute_costbt_cpp1_ii_bfe380cd::invalidateBorderCost<unsigned char, uchar4>(cudev::GlobPtrSz<unsigned char>, int, int, int, int, int)
          2                        achieved_occupancy                        Achieved Occupancy    0.826666    0.881524    0.854095
    Kernel: _GLOBAL__N__59_tmpxft_00004c34_00000000_7_cuda_sgbm_compute_costbt_cpp1_ii_bfe380cd::sobelMinMax(cudev::GlobPtr<unsigned int>, cudev::GlobPtrSz<unsigned int>, int, int, int)
          4                        achieved_occupancy                        Achieved Occupancy    0.940704    0.944702    0.942707
    Kernel: void _GLOBAL__N__69_tmpxft_00004923_00000000_7_cuda_sgbm_aggregate_cost_scanlines_cpp1_ii_setup::scanlineDiagonalPow2x4<int=64>(cudev::GlobPtr<unsigned int>, _GLOBAL__N__69_tmpxft_00004923_00000000_7_cuda_sgbm_aggregate_cost_scanlines_cpp1_ii_setup::scanlineDiagonalPow2x4<int=64>::GlobPtrSz<unsigned __int64>, int, int, int, int, int, int, int)
         12                        achieved_occupancy                        Achieved Occupancy    0.943685    0.958177    0.953543
    Kernel: void cudev::grid_transform_detail::transformVectorized_warpwise<int=32, int=4, short, unsigned char, cudev::UnaryComposition<cudev::Binder2nd<cudev::bit_rshift<short>>, cudev::saturate_cast_func<short, unsigned char>>, cudev::WithOutMask>(cudev::GlobPtr<short>, cudev::grid_transform_detail::transformVectorized_warpwise<int=32, int=4, short, unsigned char, cudev::UnaryComposition<cudev::Binder2nd<cudev::bit_rshift<short, unsigned char>>, cudev::saturate_cast_func<short, unsigned char>>, cudev::WithOutMask>, short, cudev::bit_rshift<short>, int, int)
          2                        achieved_occupancy                        Achieved Occupancy    0.874224    0.895983    0.885104
    Kernel: _GLOBAL__N__57_tmpxft_00004799_00000000_7_cuda_sgbm_convolvecost_cpp1_ii_8385c870::boxNorm3x3_U8Unrolled(cudev::GlobPtr<unsigned int>, cudev::GlobPtrSz<unsigned int>, int)
          2                        achieved_occupancy                        Achieved Occupancy    0.925602    0.942271    0.933936
    Kernel: void flipImage_horizontal_vload<unsigned char>(unsigned char const *, int, unsigned char*, int, int, int, int)
          3                        achieved_occupancy                        Achieved Occupancy    0.870246    0.896894    0.885007
    Kernel: _GLOBAL__N__59_tmpxft_00004c34_00000000_7_cuda_sgbm_compute_costbt_cpp1_ii_bfe380cd::modifiedCostBTUnrolledMinMax(cudev::GlobPtr<unsigned int>, cudev::GlobPtr<unsigned int>, cudev::GlobPtr<unsigned int>, cudev::GlobPtr<unsigned int>, cudev::GlobPtrSz<unsigned char>, int, int, int, int, int)
          2                        achieved_occupancy                        Achieved Occupancy    0.485781    0.564048    0.524914
    Kernel: void _GLOBAL__N__69_tmpxft_00004923_00000000_7_cuda_sgbm_aggregate_cost_scanlines_cpp1_ii_setup::scanlineLeftRightPow2x2<unsigned char, unsigned short, int=64>(cudev::GlobPtr<unsigned short>, _GLOBAL__N__69_tmpxft_00004923_00000000_7_cuda_sgbm_aggregate_cost_scanlines_cpp1_ii_setup::scanlineLeftRightPow2x2<unsigned char, unsigned short, int=64>::GlobPtrSz<unsigned int>, int, int, int, int, int, int)
          4                        achieved_occupancy                        Achieved Occupancy    0.943042    0.943935    0.943595
    Kernel: dispFromCostBW8(cudev::GlobPtr<short>, cudev::GlobPtrSz<short>, int, int, int, int, unsigned int)
          2                        achieved_occupancy                        Achieved Occupancy    0.971381    0.972708    0.972044

Thanks.

Hi,

Sorry for the missing detail. We want the .nvprof file to check the GPU utilization across the timeline.
Could you run this command and attach the output file?

sudo ./nvprof -o test.nvprof [your app]

Thanks

Hi

You can check"https://drive.google.com/file/d/1wHA0oenHzQTD8G6Lk795_nfKFsbWlNYY/view?usp=sharing" to get a “temp.nvprof”.

Thanks

Hi,

From your profiling file, stream15 is almost entirely occupied.
There is not much room for acceleration.

Could you also share tegrastats data when executing SGMB algorithm?

sudo ./tegrastats

Thanks.