To solve the overdetermined equation of Ax=B, the result obtained by using the cusolver library is different from the result obtained by using cv::solve in OpenCV. The specific code is as follows:
#include "opencv2/opencv.hpp"
#include "device_launch_parameters.h"
#include "cublas_v2.h"
#include "cusolverDn.h"
#include "helper_cusolver.h"
#include <stdio.h>
#include <stdlib.h>
#include "CheckCudaError.h"
/* solve A*x = b by QR */
int linearSolverQR(
cusolverDnHandle_t handle,
int n,
const float *Acopy,
int lda,
const float *b,
float *x)
{
cublasHandle_t cublasHandle = NULL; // used in residual evaluation
int bufferSize = 0;
int bufferSize_geqrf = 0;
int bufferSize_ormqr = 0;
int *info = NULL;
float *buffer = NULL;
float *A = NULL;
float *tau = NULL;
int h_info = 0;
float start, stop;
float time_solve;
const float one = 1.0;
Check_Cuda_Error(cublasCreate(&cublasHandle));
Check_Cuda_Error(cusolverDnSgeqrf_bufferSize(handle, n, n, (float*)Acopy, lda, &bufferSize_geqrf));
Check_Cuda_Error(cusolverDnSormqr_bufferSize(
handle,
CUBLAS_SIDE_LEFT,
CUBLAS_OP_T,
n,
1,
n,
A,
lda,
NULL,
x,
n,
&bufferSize_ormqr));
//printf("buffer_geqrf = %d, buffer_ormqr = %d \n", bufferSize_geqrf, bufferSize_ormqr);
bufferSize = (bufferSize_geqrf > bufferSize_ormqr) ? bufferSize_geqrf : bufferSize_ormqr;
Check_Cuda_Error(cudaMalloc(&info, sizeof(int)));
Check_Cuda_Error(cudaMalloc(&buffer, sizeof(float)*bufferSize));
Check_Cuda_Error(cudaMalloc(&A, sizeof(float)*lda*n));
Check_Cuda_Error(cudaMalloc((void**)&tau, sizeof(float)*n));
// prepare a copy of A because getrf will overwrite A with L
Check_Cuda_Error(cudaMemcpy(A, Acopy, sizeof(float)*lda*n, cudaMemcpyDeviceToDevice));
Check_Cuda_Error(cudaMemset(info, 0, sizeof(int)));
// compute QR factorization
Check_Cuda_Error(cusolverDnSgeqrf(handle, n, n, A, lda, tau, buffer, bufferSize, info));
Check_Cuda_Error(cudaMemcpy(&h_info, info, sizeof(int), cudaMemcpyDeviceToHost));
if (0 != h_info) {
fprintf(stderr, "Error: LU factorization failed\n");
}
Check_Cuda_Error(cudaMemcpy(x, b, sizeof(float)*n, cudaMemcpyDeviceToDevice));
// compute Q^T*b
Check_Cuda_Error(cusolverDnSormqr(
handle,
CUBLAS_SIDE_LEFT,
CUBLAS_OP_T,
n,
1,
n,
A,
lda,
tau,
x,
n,
buffer,
bufferSize,
info));
// x = R \ Q^T*b
Check_Cuda_Error(cublasStrsm(
cublasHandle,
CUBLAS_SIDE_LEFT,
CUBLAS_FILL_MODE_UPPER,
CUBLAS_OP_N,
CUBLAS_DIAG_NON_UNIT,
n,
1,
&one,
A,
lda,
x,
n));
Check_Cuda_Error(cudaDeviceSynchronize());
time_solve = stop - start;
if (cublasHandle) { Check_Cuda_Error(cublasDestroy(cublasHandle)); }
if (info) { Check_Cuda_Error(cudaFree(info)); }
if (buffer) { Check_Cuda_Error(cudaFree(buffer)); }
if (A) { Check_Cuda_Error(cudaFree(A)); }
if (tau) { Check_Cuda_Error(cudaFree(tau)); }
return 0;
}
The following code compares the calculation results of cuSolver and cv::solve
int main()
{
const int ciValidPointsNum = 30;
cv::FileStorage fs("../TestData/MatAB.xml", cv::FileStorage::READ);
cv::Mat matA = cv::Mat::zeros(4, ciValidPointsNum, CV_32FC1);
cv::Mat matB1 = cv::Mat::zeros(ciValidPointsNum, 1, CV_32FC1);
cv::Mat matB2 = cv::Mat::zeros(ciValidPointsNum, 1, CV_32FC1);
cv::Mat matB3 = cv::Mat::zeros(ciValidPointsNum, 1, CV_32FC1);
cv::Mat matB4 = cv::Mat::ones(ciValidPointsNum, 1, CV_32FC1);
cv::Mat matX1 = cv::Mat::zeros(4, 1, CV_32FC1);
cv::Mat matX2 = cv::Mat::zeros(4, 1, CV_32FC1);
cv::Mat matX3 = cv::Mat::zeros(4, 1, CV_32FC1);
cv::Mat matX4 = cv::Mat::zeros(4, 1, CV_32FC1);
fs["MatA"] >> matA;
fs["MatB1"] >> matB1;
fs["MatB2"] >> matB2;
fs["MatB3"] >> matB3;
fs.release();
matA.resize(ciValidPointsNum);
matB1.resize(ciValidPointsNum);
matB2.resize(ciValidPointsNum);
matB3.resize(ciValidPointsNum);
std::cout << "matA = "<<matA << std::endl << "matB1 = "<< matB1 << std::endl << "matB2 = " << matB2 << std::endl << "matB3 = " << matB3 << std::endl;
std::cout << "matB4 always1" << std::endl;
cv::Mat matACopy = matA.clone();
cv::Mat matB1Copy = matB1.clone();
cv::Mat matB2Copy = matB2.clone();
cv::Mat matB3Copy = matB3.clone();
cv::Mat matB4Copy = matB4.clone();
cv::solve(matA, matB1, matX1, cv::DECOMP_QR);
cv::solve(matA, matB2, matX2, cv::DECOMP_QR);
cv::solve(matA, matB3, matX3, cv::DECOMP_QR);
cv::solve(matA, matB4, matX4, cv::DECOMP_QR);
cv::Mat TestTi_CPU = cv::Mat::zeros(4, 4, CV_32FC1);
float * ptrTestTi_CPU = (float *)TestTi_CPU.data;
for (int i = 0; i < 4; i++)
{
ptrTestTi_CPU[0 * 4 + i] = matX1.at<float>(i, 0);
ptrTestTi_CPU[1 * 4 + i] = matX2.at<float>(i, 0);
ptrTestTi_CPU[2 * 4 + i] = matX3.at<float>(i, 0);
ptrTestTi_CPU[3 * 4 + i] = matX4.at<float>(i, 0);
}
std::cout << std::endl;
std::cout << "OpenCV cv::solve Result: " <<std::endl;
std::cout << TestTi_CPU << std::endl;
std::cout << std::endl;
cusolverDnHandle_t handle = NULL;
cublasHandle_t cublasHandle = NULL;
cudaStream_t stream = NULL;
int rowsA = ciValidPointsNum; // number of rows of A
int colsA = 4; // number of columns of A
int lda = ciValidPointsNum; // leading dimension in dense matrix
float *d_A = NULL; // A in GPU
float *d_b1 = NULL; // b in GPU
float *d_b2 = NULL; // b in GPU
float *d_b3 = NULL; // b in GPU
float *d_b4 = NULL; // b in GPU
float *d_x1 = NULL; // x = A \ b in GPU
float *d_x2 = NULL; // x = A \ b in GPU
float *d_x3 = NULL; // x = A \ b in GPU
float *d_x4 = NULL; // x = A \ b in GPU
float* h_A = (float*)malloc(sizeof(float)*lda*colsA);
float* h_b1 = (float*)malloc(sizeof(float)*rowsA);
float* h_b2 = (float*)malloc(sizeof(float)*rowsA);
float* h_b3 = (float*)malloc(sizeof(float)*rowsA);
float* h_b4 = (float*)malloc(sizeof(float)*rowsA);
float* h_x1 = (float*)malloc(sizeof(float)*colsA);
float* h_x2 = (float*)malloc(sizeof(float)*colsA);
float* h_x3 = (float*)malloc(sizeof(float)*colsA);
float* h_x4 = (float*)malloc(sizeof(float)*colsA);
memset(h_A, 0, sizeof(float)*lda*colsA);
memset(h_b1, 0, sizeof(float)*rowsA);
memset(h_b2, 0, sizeof(float)*rowsA);
memset(h_b3, 0, sizeof(float)*rowsA);
memset(h_b4, 0, sizeof(float)*rowsA);
memset(h_x1, 0, sizeof(float)*colsA);
memset(h_x2, 0, sizeof(float)*colsA);
memset(h_x3, 0, sizeof(float)*colsA);
memset(h_x4, 0, sizeof(float)*colsA);
matACopy = matACopy.t();
memcpy(h_A, matACopy.data, sizeof(float) * 4 * ciValidPointsNum);
memcpy(h_b1, matB1Copy.data, sizeof(float) * 1 * ciValidPointsNum);
memcpy(h_b2, matB2Copy.data, sizeof(float) * 1 * ciValidPointsNum);
memcpy(h_b3, matB3Copy.data, sizeof(float) * 1 * ciValidPointsNum);
memcpy(h_b4, matB4Copy.data, sizeof(float) * 1 * ciValidPointsNum);
Check_Cuda_Error(cudaMalloc((void **)&d_A, sizeof(float)*lda*colsA));
Check_Cuda_Error(cudaMalloc((void **)&d_b1, sizeof(float)*rowsA));
Check_Cuda_Error(cudaMalloc((void **)&d_b2, sizeof(float)*rowsA));
Check_Cuda_Error(cudaMalloc((void **)&d_b3, sizeof(float)*rowsA));
Check_Cuda_Error(cudaMalloc((void **)&d_b4, sizeof(float)*rowsA));
Check_Cuda_Error(cudaMalloc((void **)&d_x1, sizeof(float)*colsA));
Check_Cuda_Error(cudaMalloc((void **)&d_x2, sizeof(float)*colsA));
Check_Cuda_Error(cudaMalloc((void **)&d_x3, sizeof(float)*colsA));
Check_Cuda_Error(cudaMalloc((void **)&d_x4, sizeof(float)*colsA));
Check_Cuda_Error(cudaMemcpy(d_A, h_A, sizeof(float)*lda*colsA, cudaMemcpyHostToDevice));
Check_Cuda_Error(cudaMemcpy(d_b1, h_b1, sizeof(float)*rowsA, cudaMemcpyHostToDevice));
Check_Cuda_Error(cudaMemcpy(d_b2, h_b2, sizeof(float)*rowsA, cudaMemcpyHostToDevice));
Check_Cuda_Error(cudaMemcpy(d_b3, h_b3, sizeof(float)*rowsA, cudaMemcpyHostToDevice));
Check_Cuda_Error(cudaMemcpy(d_b4, h_b4, sizeof(float)*rowsA, cudaMemcpyHostToDevice));
Check_Cuda_Error(cusolverDnCreate(&handle));
Check_Cuda_Error(cublasCreate(&cublasHandle));
Check_Cuda_Error(cudaStreamCreate(&stream));
linearSolverQR(handle, colsA, d_A, lda, d_b1, d_x1);
linearSolverQR(handle, colsA, d_A, lda, d_b2, d_x2);
linearSolverQR(handle, colsA, d_A, lda, d_b3, d_x3);
linearSolverQR(handle, colsA, d_A, lda, d_b4, d_x4);
Check_Cuda_Error(cudaMemcpy(h_x1, d_x1, sizeof(float)*colsA, cudaMemcpyDeviceToHost));
Check_Cuda_Error(cudaMemcpy(h_x2, d_x2, sizeof(float)*colsA, cudaMemcpyDeviceToHost));
Check_Cuda_Error(cudaMemcpy(h_x3, d_x3, sizeof(float)*colsA, cudaMemcpyDeviceToHost));
Check_Cuda_Error(cudaMemcpy(h_x4, d_x4, sizeof(float)*colsA, cudaMemcpyDeviceToHost));
cv::Mat TestTi_GPU = cv::Mat::zeros(4, 4, CV_32FC1);
float * ptrTestTi_GPU = (float *)TestTi_GPU.data;
for (int i = 0; i < 4; i++)
{
ptrTestTi_GPU[0 * 4 + i] = static_cast<float>(h_x1[i]);
ptrTestTi_GPU[1 * 4 + i] = static_cast<float>(h_x2[i]);
ptrTestTi_GPU[2 * 4 + i] = static_cast<float>(h_x3[i]);
ptrTestTi_GPU[3 * 4 + i] = static_cast<float>(h_x4[i]);
}
std::cout << "cusolver Result: " << std::endl;
std::cout << TestTi_GPU << std::endl;
//Release
if (handle) { Check_Cuda_Error(cusolverDnDestroy(handle)); }
if (cublasHandle) { Check_Cuda_Error(cublasDestroy(cublasHandle)); }
if (stream) { Check_Cuda_Error(cudaStreamDestroy(stream)); }
if (d_A) { Check_Cuda_Error(cudaFree(d_A)); }
if (d_b1) { Check_Cuda_Error(cudaFree(d_b1)); }
if (d_b2) { Check_Cuda_Error(cudaFree(d_b2)); }
if (d_b3) { Check_Cuda_Error(cudaFree(d_b3)); }
if (d_b4) { Check_Cuda_Error(cudaFree(d_b4)); }
if (d_x1) { Check_Cuda_Error(cudaFree(d_x1)); }
if (d_x2) { Check_Cuda_Error(cudaFree(d_x2)); }
if (d_x3) { Check_Cuda_Error(cudaFree(d_x3)); }
if (d_x4) { Check_Cuda_Error(cudaFree(d_x4)); }
if (h_A) { free(h_A); }
if (h_b1) { free(h_b1); }
if (h_b2) { free(h_b2); }
if (h_b3) { free(h_b3); }
if (h_b4) { free(h_b4); }
if (h_x1) { free(h_x1); }
if (h_x2) { free(h_x2); }
if (h_x3) { free(h_x3); }
if (h_x4) { free(h_x4); }
return 0;
}
Mat data and results:
matA = [-1.3221376, 20.898346, -0.050307311, 1;
-1.3230242, 21.137701, -0.054719526, 1;
-1.323216, 21.37715, -0.051985338, 1;
-1.3232133, 21.616615, -0.047243416, 1;
-1.3244345, 21.855961, -0.0551024, 1;
-1.3242892, 22.09543, -0.048894461, 1;
-1.3247974, 22.334848, -0.049411014, 1;
-1.3259401, 22.574232, -0.056459066, 1;
-1.325868, 22.813688, -0.051007271, 1;
-1.3263346, 23.053118, -0.051098794, 1;
-1.32684, 23.292547, -0.051581748, 1;
-1.3267975, 23.531979, -0.046437319, 1;
-1.3279346, 23.771416, -0.053422518, 1;
-1.3279115, 24.010843, -0.048473347, 1;
-1.3286631, 24.250292, -0.051495273, 1;
-1.3286657, 24.489712, -0.046807643, 1;
-1.3296765, 24.729183, -0.052499481, 1;
-1.3292327, 24.968561, -0.043229543, 1;
-1.3298577, 25.208012, -0.044943374, 1;
-1.3309158, 25.447512, -0.051113199, 1;
-1.083158, 20.898655, -0.055020623, 1;
-1.0832742, 21.138119, -0.051480997, 1;
-1.0839992, 21.377501, -0.054265928, 1;
-1.0843329, 21.616936, -0.052988637, 1;
-1.0854648, 21.856295, -0.060025107, 1;
-1.0855874, 22.095751, -0.056541249, 1;
-1.0846509, 22.335274, -0.04201439, 1;
-1.0856496, 22.574656, -0.047668073, 1;
-1.0865787, 22.81406, -0.052586116, 1;
-1.0869819, 23.053495, -0.052026145, 1]
matB1 = [-1.3526516;
-1.353945;
-1.3545439;
-1.3549484;
-1.3565764;
-1.3568382;
-1.3577534;
-1.3593031;
-1.3596381;
-1.3605117;
-1.3614241;
-1.3617886;
-1.3633326;
-1.3637166;
-1.3648753;
-1.3652849;
-1.3667028;
-1.366666;
-1.367698;
-1.3691633;
-1.1136749;
-1.1141982;
-1.1153301;
-1.1160709;
-1.1176097;
-1.1181393;
-1.11761;
-1.1190157;
-1.1203518;
-1.1211619]
matB2 = [20.900888;
21.14024;
21.379686;
21.61915;
21.85849;
22.097958;
22.337372;
22.57675;
22.816204;
23.055632;
23.295057;
23.534485;
23.773918;
24.013344;
24.25279;
24.492207;
24.731674;
24.97105;
25.210499;
25.449993;
20.901604;
21.141064;
21.380444;
21.619877;
21.85923;
22.098682;
22.338205;
22.577581;
22.816984;
23.056416]
matB3 = [-0.045307312;
-0.049719527;
-0.046985339;
-0.042243417;
-0.050102402;
-0.043894462;
-0.044411015;
-0.051459067;
-0.046007272;
-0.046098795;
-0.046581749;
-0.04143732;
-0.048422519;
-0.043473348;
-0.046495274;
-0.041807644;
-0.047499482;
-0.038229544;
-0.039943375;
-0.0461132;
-0.050020624;
-0.046480998;
-0.049265929;
-0.047988638;
-0.055025108;
-0.05154125;
-0.037014391;
-0.042668074;
-0.047586117;
-0.047026146]
matB4 都为1
Project_And_Data.zip (301.7 KB)