StereoBM CPU and StereoBM GPU versions on OpenCV give very different output.

I am working on a Jetson TX1 and I see a huge difference between the outputs of the CPU and GPU versions of StereoBM. I have built OpenCV 3.2 with CUDA support on this machine.

Below are the outputs of both the CPU and GPU versions, along with the GPU code.
Outputs can be seen at this link: http://answers.opencv.org/question/185481/stereobm-cpu-and-stereobm-gpu-versions-on-opencv-give-very-different-output/

#include "opencv2/opencv.hpp"
#include "opencv2/core.hpp"
#include "opencv2/highgui.hpp"
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudastereo.hpp"
#include <cstdlib>
#include <iostream>
#include <string>
#include <stdio.h>
#include <limits.h>

using namespace cv;

int main()
{
cv::Mat img1, img2;

img1 = imread( "~/left.png", cv::IMREAD_GRAYSCALE);
img2 = imread( "~/right.png", cv::IMREAD_GRAYSCALE);

cuda::GpuMat d_left, d_right;
Ptr<cuda::StereoBM> bm;
bm = cuda::createStereoBM(64, 11); //numDisparities=64, windowSize=11
Mat disp(img1.size(), CV_8U);

cuda::GpuMat d_disp(img1.size(), CV_8U);
cuda::GpuMat d_disp_color(img1.size(), CV_8U);

d_left.upload(img1);
d_right.upload(img2);
bm->compute(d_right, d_left, d_disp);
cuda::drawColorDisp(d_disp, d_disp_color, 64);
d_disp.download(disp);
imwrite("~/disparity.png", disp);

d_disp_color.download(disp);
imwrite("~/disparity_color.png", disp);

return 0;
}

The following are some details about the machine being used:

  1. output of ldd <lib_file>
linux-vdso.so.1 =>  (0x0000007fb3fc8000)
libopencv_cudastereo.so.3.2 => /mnt/newssd/OPENCV/opencv-3.2.0/build/lib/libopencv_cudastereo.so.3.2 (0x0000007fb3e20000)
libopencv_imgcodecs.so.3.2 => /mnt/newssd/OPENCV/opencv-3.2.0/build/lib/libopencv_imgcodecs.so.3.2 (0x0000007fb3cfa000)
libopencv_core.so.3.2 => /mnt/newssd/OPENCV/opencv-3.2.0/build/lib/libopencv_core.so.3.2 (0x0000007fb3903000)
libstdc++.so.6 => /usr/lib/aarch64-linux-gnu/libstdc++.so.6 (0x0000007fb3732000)
libgcc_s.so.1 => /lib/aarch64-linux-gnu/libgcc_s.so.1 (0x0000007fb3710000)
libc.so.6 => /lib/aarch64-linux-gnu/libc.so.6 (0x0000007fb35c9000)
/lib/ld-linux-aarch64.so.1 (0x000000555c9ee000)
libcudart.so.8.0 => /usr/local/cuda-8.0/lib64/libcudart.so.8.0 (0x0000007fb3566000)
libm.so.6 => /lib/aarch64-linux-gnu/libm.so.6 (0x0000007fb34b8000)
libopencv_imgproc.so.3.2 => /mnt/newssd/OPENCV/opencv-3.2.0/build/lib/libopencv_imgproc.so.3.2 (0x0000007fb313f000)
libpthread.so.0 => /lib/aarch64-linux-gnu/libpthread.so.0 (0x0000007fb3113000)
libjpeg.so.8 => /usr/lib/aarch64-linux-gnu/libjpeg.so.8 (0x0000007fb30cb000)
libwebp.so.5 => /usr/lib/aarch64-linux-gnu/libwebp.so.5 (0x0000007fb306f000)
libpng12.so.0 => /lib/aarch64-linux-gnu/libpng12.so.0 (0x0000007fb3040000)
libtiff.so.5 => /usr/lib/aarch64-linux-gnu/libtiff.so.5 (0x0000007fb2fc8000)
libjasper.so.1 => /usr/lib/aarch64-linux-gnu/libjasper.so.1 (0x0000007fb2f6c000)
libz.so.1 => /lib/aarch64-linux-gnu/libz.so.1 (0x0000007fb2f45000)
libdl.so.2 => /lib/aarch64-linux-gnu/libdl.so.2 (0x0000007fb2f31000)
libtbb.so.2 => /usr/lib/aarch64-linux-gnu/libtbb.so.2 (0x0000007fb2ef2000)
libopenblas.so.0 => /usr/lib/libopenblas.so.0 (0x0000007fb2893000)
librt.so.1 => /lib/aarch64-linux-gnu/librt.so.1 (0x0000007fb287b000)
liblzma.so.5 => /lib/aarch64-linux-gnu/liblzma.so.5 (0x0000007fb284e000)
libjbig.so.0 => /usr/lib/aarch64-linux-gnu/libjbig.so.0 (0x0000007fb2830000)
libgfortran.so.3 => /usr/lib/aarch64-linux-gnu/libgfortran.so.3 (0x0000007fb2742000)

Cuda version: 8.0
TX1 release 28.1
disp_cpu.png
disp_gpu.png