The solution result of cuSolver is different from cv::solve

To solve the overdetermined equation of Ax=B, the result obtained by using the cusolver library is different from the result obtained by using cv::solve in OpenCV. The specific code is as follows:

#include "opencv2/opencv.hpp"
#include "device_launch_parameters.h"
#include "cublas_v2.h"
#include "cusolverDn.h"
#include "helper_cusolver.h"
#include <stdio.h>
#include <stdlib.h>
#include "CheckCudaError.h"
/*  solve A*x = b by QR */
int linearSolverQR(
    cusolverDnHandle_t handle,
    int n,
    const float *Acopy,
    int lda,
    const float *b,
    float *x)
{
    cublasHandle_t cublasHandle = NULL; // used in residual evaluation
    int bufferSize = 0;
    int bufferSize_geqrf = 0;
    int bufferSize_ormqr = 0;
    int *info = NULL;
    float *buffer = NULL;
    float *A = NULL;
    float *tau = NULL;
    int h_info = 0;
    float start, stop;
    float time_solve;
    const float one = 1.0;

    Check_Cuda_Error(cublasCreate(&cublasHandle));

    Check_Cuda_Error(cusolverDnSgeqrf_bufferSize(handle, n, n, (float*)Acopy, lda, &bufferSize_geqrf));
    Check_Cuda_Error(cusolverDnSormqr_bufferSize(
	    handle,
	    CUBLAS_SIDE_LEFT,
	    CUBLAS_OP_T,
	    n,
	    1,
	    n,
	    A,
	    lda,
	    NULL,
	    x,
	    n,
	    &bufferSize_ormqr));

    //printf("buffer_geqrf = %d, buffer_ormqr = %d \n", bufferSize_geqrf, bufferSize_ormqr);

    bufferSize = (bufferSize_geqrf > bufferSize_ormqr) ? bufferSize_geqrf : bufferSize_ormqr;

    Check_Cuda_Error(cudaMalloc(&info, sizeof(int)));
    Check_Cuda_Error(cudaMalloc(&buffer, sizeof(float)*bufferSize));
    Check_Cuda_Error(cudaMalloc(&A, sizeof(float)*lda*n));
    Check_Cuda_Error(cudaMalloc((void**)&tau, sizeof(float)*n));

    // prepare a copy of A because getrf will overwrite A with L
    Check_Cuda_Error(cudaMemcpy(A, Acopy, sizeof(float)*lda*n, cudaMemcpyDeviceToDevice));

    Check_Cuda_Error(cudaMemset(info, 0, sizeof(int)));

    // compute QR factorization
    Check_Cuda_Error(cusolverDnSgeqrf(handle, n, n, A, lda, tau, buffer, bufferSize, info));
    Check_Cuda_Error(cudaMemcpy(&h_info, info, sizeof(int), cudaMemcpyDeviceToHost));

    if (0 != h_info) {
	    fprintf(stderr, "Error: LU factorization failed\n");
    }

    Check_Cuda_Error(cudaMemcpy(x, b, sizeof(float)*n, cudaMemcpyDeviceToDevice));

    // compute Q^T*b
    Check_Cuda_Error(cusolverDnSormqr(
	    handle,
	    CUBLAS_SIDE_LEFT,
	    CUBLAS_OP_T,
	    n,
	    1,
	    n,
	    A,
	    lda,
	    tau,
	   x,
	   n,
	   buffer,
	   bufferSize,
	   info));

   // x = R \ Q^T*b
   Check_Cuda_Error(cublasStrsm(
	    cublasHandle,
	    CUBLAS_SIDE_LEFT,
	    CUBLAS_FILL_MODE_UPPER,
	    CUBLAS_OP_N,
	    CUBLAS_DIAG_NON_UNIT,
	    n,
	    1,
	    &one,
	    A,
	    lda,
	    x,
	    n));
    Check_Cuda_Error(cudaDeviceSynchronize());

    time_solve = stop - start;

    if (cublasHandle) { Check_Cuda_Error(cublasDestroy(cublasHandle)); }
    if (info) { Check_Cuda_Error(cudaFree(info)); }
    if (buffer) { Check_Cuda_Error(cudaFree(buffer)); }
    if (A) { Check_Cuda_Error(cudaFree(A)); }
    if (tau) { Check_Cuda_Error(cudaFree(tau)); }

    return 0;
}

The following code compares the calculation results of cuSolver and cv::solve

int main()
{
    const int ciValidPointsNum = 30;
    cv::FileStorage fs("../TestData/MatAB.xml", cv::FileStorage::READ);

    cv::Mat matA = cv::Mat::zeros(4, ciValidPointsNum, CV_32FC1);
    cv::Mat matB1 = cv::Mat::zeros(ciValidPointsNum, 1, CV_32FC1);
    cv::Mat matB2 = cv::Mat::zeros(ciValidPointsNum, 1, CV_32FC1);
    cv::Mat matB3 = cv::Mat::zeros(ciValidPointsNum, 1, CV_32FC1);
    cv::Mat matB4 = cv::Mat::ones(ciValidPointsNum, 1, CV_32FC1);
    cv::Mat matX1 = cv::Mat::zeros(4, 1, CV_32FC1);
    cv::Mat matX2 = cv::Mat::zeros(4, 1, CV_32FC1);
    cv::Mat matX3 = cv::Mat::zeros(4, 1, CV_32FC1);
    cv::Mat matX4 = cv::Mat::zeros(4, 1, CV_32FC1);

    fs["MatA"] >> matA;
    fs["MatB1"] >> matB1;
    fs["MatB2"] >> matB2;
    fs["MatB3"] >> matB3;
    fs.release();
    matA.resize(ciValidPointsNum);
    matB1.resize(ciValidPointsNum);
    matB2.resize(ciValidPointsNum);
    matB3.resize(ciValidPointsNum);
    std::cout << "matA = "<<matA << std::endl << "matB1 = "<< matB1 << std::endl << "matB2 = " << matB2 << std::endl << "matB3 = " << matB3 << std::endl;
    std::cout << "matB4 always1" << std::endl;
    cv::Mat matACopy = matA.clone();
    cv::Mat matB1Copy = matB1.clone();
    cv::Mat matB2Copy = matB2.clone();
    cv::Mat matB3Copy = matB3.clone();
    cv::Mat matB4Copy = matB4.clone();

    cv::solve(matA, matB1, matX1, cv::DECOMP_QR);
    cv::solve(matA, matB2, matX2, cv::DECOMP_QR);
    cv::solve(matA, matB3, matX3, cv::DECOMP_QR);
    cv::solve(matA, matB4, matX4, cv::DECOMP_QR);

    cv::Mat TestTi_CPU = cv::Mat::zeros(4, 4, CV_32FC1);
    float * ptrTestTi_CPU = (float *)TestTi_CPU.data;
    for (int i = 0; i < 4; i++)
    {
	    ptrTestTi_CPU[0 * 4 + i] = matX1.at<float>(i, 0);
	    ptrTestTi_CPU[1 * 4 + i] = matX2.at<float>(i, 0);
	    ptrTestTi_CPU[2 * 4 + i] = matX3.at<float>(i, 0);
	    ptrTestTi_CPU[3 * 4 + i] = matX4.at<float>(i, 0);
    }
    std::cout << std::endl;
    std::cout << "OpenCV cv::solve Result: " <<std::endl;
    std::cout << TestTi_CPU << std::endl;
    std::cout << std::endl;

    cusolverDnHandle_t handle = NULL;
    cublasHandle_t cublasHandle = NULL;
    cudaStream_t stream = NULL;

    int rowsA = ciValidPointsNum; // number of rows of A
    int colsA = 4; // number of columns of A
    int lda = ciValidPointsNum; // leading dimension in dense matrix
    float *d_A = NULL; // A in GPU
    float *d_b1 = NULL; // b in GPU
    float *d_b2 = NULL; // b in GPU
    float *d_b3 = NULL; // b in GPU
    float *d_b4 = NULL; // b in GPU
    float *d_x1 = NULL; // x = A \ b in GPU
    float *d_x2 = NULL; // x = A \ b in GPU
    float *d_x3 = NULL; // x = A \ b in GPU
    float *d_x4 = NULL; // x = A \ b in GPU

    float* h_A = (float*)malloc(sizeof(float)*lda*colsA);
    float* h_b1 = (float*)malloc(sizeof(float)*rowsA);
    float* h_b2 = (float*)malloc(sizeof(float)*rowsA);
    float* h_b3 = (float*)malloc(sizeof(float)*rowsA);
    float* h_b4 = (float*)malloc(sizeof(float)*rowsA);
    float* h_x1 = (float*)malloc(sizeof(float)*colsA);
    float* h_x2 = (float*)malloc(sizeof(float)*colsA);
    float* h_x3 = (float*)malloc(sizeof(float)*colsA);
    float* h_x4 = (float*)malloc(sizeof(float)*colsA);
    memset(h_A, 0, sizeof(float)*lda*colsA);
    memset(h_b1, 0, sizeof(float)*rowsA);
    memset(h_b2, 0, sizeof(float)*rowsA);
    memset(h_b3, 0, sizeof(float)*rowsA);
    memset(h_b4, 0, sizeof(float)*rowsA);
    memset(h_x1, 0, sizeof(float)*colsA);
    memset(h_x2, 0, sizeof(float)*colsA);
    memset(h_x3, 0, sizeof(float)*colsA);
    memset(h_x4, 0, sizeof(float)*colsA);

    matACopy = matACopy.t();
    memcpy(h_A, matACopy.data, sizeof(float) * 4 * ciValidPointsNum);
    memcpy(h_b1, matB1Copy.data, sizeof(float) * 1 * ciValidPointsNum);
    memcpy(h_b2, matB2Copy.data, sizeof(float) * 1 * ciValidPointsNum);
    memcpy(h_b3, matB3Copy.data, sizeof(float) * 1 * ciValidPointsNum);
    memcpy(h_b4, matB4Copy.data, sizeof(float) * 1 * ciValidPointsNum);

    Check_Cuda_Error(cudaMalloc((void **)&d_A, sizeof(float)*lda*colsA));
    Check_Cuda_Error(cudaMalloc((void **)&d_b1, sizeof(float)*rowsA));
    Check_Cuda_Error(cudaMalloc((void **)&d_b2, sizeof(float)*rowsA));
    Check_Cuda_Error(cudaMalloc((void **)&d_b3, sizeof(float)*rowsA));
    Check_Cuda_Error(cudaMalloc((void **)&d_b4, sizeof(float)*rowsA));
    Check_Cuda_Error(cudaMalloc((void **)&d_x1, sizeof(float)*colsA));
    Check_Cuda_Error(cudaMalloc((void **)&d_x2, sizeof(float)*colsA));
    Check_Cuda_Error(cudaMalloc((void **)&d_x3, sizeof(float)*colsA));
    Check_Cuda_Error(cudaMalloc((void **)&d_x4, sizeof(float)*colsA));
    Check_Cuda_Error(cudaMemcpy(d_A, h_A, sizeof(float)*lda*colsA, cudaMemcpyHostToDevice));
    Check_Cuda_Error(cudaMemcpy(d_b1, h_b1, sizeof(float)*rowsA, cudaMemcpyHostToDevice));
    Check_Cuda_Error(cudaMemcpy(d_b2, h_b2, sizeof(float)*rowsA, cudaMemcpyHostToDevice));
    Check_Cuda_Error(cudaMemcpy(d_b3, h_b3, sizeof(float)*rowsA, cudaMemcpyHostToDevice));
    Check_Cuda_Error(cudaMemcpy(d_b4, h_b4, sizeof(float)*rowsA, cudaMemcpyHostToDevice));

    Check_Cuda_Error(cusolverDnCreate(&handle));
    Check_Cuda_Error(cublasCreate(&cublasHandle));
    Check_Cuda_Error(cudaStreamCreate(&stream));

    linearSolverQR(handle, colsA, d_A, lda, d_b1, d_x1);
    linearSolverQR(handle, colsA, d_A, lda, d_b2, d_x2);
    linearSolverQR(handle, colsA, d_A, lda, d_b3, d_x3);
    linearSolverQR(handle, colsA, d_A, lda, d_b4, d_x4);

    Check_Cuda_Error(cudaMemcpy(h_x1, d_x1, sizeof(float)*colsA, cudaMemcpyDeviceToHost));
    Check_Cuda_Error(cudaMemcpy(h_x2, d_x2, sizeof(float)*colsA, cudaMemcpyDeviceToHost));
    Check_Cuda_Error(cudaMemcpy(h_x3, d_x3, sizeof(float)*colsA, cudaMemcpyDeviceToHost));
    Check_Cuda_Error(cudaMemcpy(h_x4, d_x4, sizeof(float)*colsA, cudaMemcpyDeviceToHost));

    cv::Mat TestTi_GPU = cv::Mat::zeros(4, 4, CV_32FC1);
    float * ptrTestTi_GPU = (float *)TestTi_GPU.data;
    for (int i = 0; i < 4; i++)
    {
	    ptrTestTi_GPU[0 * 4 + i] = static_cast<float>(h_x1[i]);
	    ptrTestTi_GPU[1 * 4 + i] = static_cast<float>(h_x2[i]);
	    ptrTestTi_GPU[2 * 4 + i] = static_cast<float>(h_x3[i]);
	    ptrTestTi_GPU[3 * 4 + i] = static_cast<float>(h_x4[i]);
    }
    std::cout << "cusolver Result: " << std::endl;
    std::cout << TestTi_GPU << std::endl;

    //Release
    if (handle) { Check_Cuda_Error(cusolverDnDestroy(handle)); }
    if (cublasHandle) { Check_Cuda_Error(cublasDestroy(cublasHandle)); }
    if (stream) { Check_Cuda_Error(cudaStreamDestroy(stream)); }
    if (d_A) { Check_Cuda_Error(cudaFree(d_A)); }
    if (d_b1) { Check_Cuda_Error(cudaFree(d_b1)); }
    if (d_b2) { Check_Cuda_Error(cudaFree(d_b2)); }
    if (d_b3) { Check_Cuda_Error(cudaFree(d_b3)); }
    if (d_b4) { Check_Cuda_Error(cudaFree(d_b4)); }
    if (d_x1) { Check_Cuda_Error(cudaFree(d_x1)); }
    if (d_x2) { Check_Cuda_Error(cudaFree(d_x2)); }
    if (d_x3) { Check_Cuda_Error(cudaFree(d_x3)); }
    if (d_x4) { Check_Cuda_Error(cudaFree(d_x4)); }
    if (h_A) { free(h_A); }
    if (h_b1) { free(h_b1); }
    if (h_b2) { free(h_b2); }
    if (h_b3) { free(h_b3); }
    if (h_b4) { free(h_b4); }
    if (h_x1) { free(h_x1); }
    if (h_x2) { free(h_x2); }
    if (h_x3) { free(h_x3); }
    if (h_x4) { free(h_x4); }

    return 0;
}

Mat data and results:

matA = [-1.3221376, 20.898346, -0.050307311, 1;
 -1.3230242, 21.137701, -0.054719526, 1;
 -1.323216, 21.37715, -0.051985338, 1;
 -1.3232133, 21.616615, -0.047243416, 1;
 -1.3244345, 21.855961, -0.0551024, 1;
 -1.3242892, 22.09543, -0.048894461, 1;
 -1.3247974, 22.334848, -0.049411014, 1;
 -1.3259401, 22.574232, -0.056459066, 1;
 -1.325868, 22.813688, -0.051007271, 1;
 -1.3263346, 23.053118, -0.051098794, 1;
 -1.32684, 23.292547, -0.051581748, 1;
 -1.3267975, 23.531979, -0.046437319, 1;
 -1.3279346, 23.771416, -0.053422518, 1;
 -1.3279115, 24.010843, -0.048473347, 1;
 -1.3286631, 24.250292, -0.051495273, 1;
 -1.3286657, 24.489712, -0.046807643, 1;
 -1.3296765, 24.729183, -0.052499481, 1;
 -1.3292327, 24.968561, -0.043229543, 1;
 -1.3298577, 25.208012, -0.044943374, 1;
 -1.3309158, 25.447512, -0.051113199, 1;
 -1.083158, 20.898655, -0.055020623, 1;
 -1.0832742, 21.138119, -0.051480997, 1;
 -1.0839992, 21.377501, -0.054265928, 1;
 -1.0843329, 21.616936, -0.052988637, 1;
 -1.0854648, 21.856295, -0.060025107, 1;
 -1.0855874, 22.095751, -0.056541249, 1;
 -1.0846509, 22.335274, -0.04201439, 1;
 -1.0856496, 22.574656, -0.047668073, 1;
 -1.0865787, 22.81406, -0.052586116, 1;
 -1.0869819, 23.053495, -0.052026145, 1]
matB1 = [-1.3526516;
 -1.353945;
 -1.3545439;
 -1.3549484;
 -1.3565764;
 -1.3568382;
 -1.3577534;
 -1.3593031;
 -1.3596381;
 -1.3605117;
 -1.3614241;
 -1.3617886;
 -1.3633326;
 -1.3637166;
 -1.3648753;
 -1.3652849;
 -1.3667028;
 -1.366666;
 -1.367698;
 -1.3691633;
 -1.1136749;
 -1.1141982;
 -1.1153301;
 -1.1160709;
 -1.1176097;
 -1.1181393;
 -1.11761;
 -1.1190157;
 -1.1203518;
 -1.1211619]
matB2 = [20.900888;
 21.14024;
 21.379686;
 21.61915;
 21.85849;
 22.097958;
 22.337372;
 22.57675;
 22.816204;
 23.055632;
 23.295057;
 23.534485;
 23.773918;
 24.013344;
 24.25279;
 24.492207;
 24.731674;
 24.97105;
 25.210499;
 25.449993;
 20.901604;
 21.141064;
 21.380444;
 21.619877;
 21.85923;
 22.098682;
 22.338205;
 22.577581;
 22.816984;
 23.056416]
matB3 = [-0.045307312;
 -0.049719527;
 -0.046985339;
 -0.042243417;
 -0.050102402;
 -0.043894462;
 -0.044411015;
 -0.051459067;
 -0.046007272;
 -0.046098795;
 -0.046581749;
 -0.04143732;
 -0.048422519;
 -0.043473348;
 -0.046495274;
 -0.041807644;
 -0.047499482;
 -0.038229544;
 -0.039943375;
 -0.0461132;
 -0.050020624;
 -0.046480998;
 -0.049265929;
 -0.047988638;
 -0.055025108;
 -0.05154125;
 -0.037014391;
 -0.042668074;
 -0.047586117;
 -0.047026146]
matB4 都为1


Project_And_Data.zip (301.7 KB)

Hi,
Could you please go to OpenCV forum to get further suggestion? These are 3rdparty contribution and we don’t have muxch experience in using the function. Better to get OpenCV experts’ suggestion.