Hello,

I am trying to write a function that takes a matrix A of size [m x n] and a vector b of size [m] and solves for a vector x of size [n].

Ax=b

I want to solve this using QR decomposition.

What i do in matlab is:

```
% create inputs
A = [...];
b = [...];
% create QR decomposition
[Q R] = qr(A);
% solve for x
x = R \ Q' * b;
```

This code works for both square and non-square matrices.

Given inputs A = [m x n] and b = [m] sized i should get a solution x = [n] sized.

Below are examples of the following cases:

Case (m > n) : A = [3 x 2], b = [3 x 1]

[Q R] = qr(A)

size(Q) = [3 x 3]

size® = [3 x 2]

Qtb = Q’ * b;

size(Qtb) = [3 x 1]

x = R \ Qtb;

size(x) = [2 x 1]

Case (m < n) : A = [2 x 3], b = [2 x 1]

[Q R] = qr(A)

size(Q) = [2 x 2]

size® = [2 x 3]

Qtb = Q’ * b;

size(Qtb) = [2 x 1]

x = R \ Qtb;

size(x) = [3 x 1]

Case (m == n) : A = [3 x 3], b = [3 x 1]

[Q R] = qr(A)

size(Q) = [3 x 3]

size® = [3 x 3]

Qtb = Q’ * b;

size(Qtb) = [3 x 1]

x = R \ Qtb;

size(x) = [3 x 1]

Now, i’m trying to port this functionality over to CUDA using cuSOLVER and cuBLAS.

I’ve run across the QR solver example in the cuSOLVER documentation http://docs.nvidia.com/cuda/cusolver/#ormqr-example1. This demonstrates solving for square matrices but doesn’t provide any examples of solving for non-square matrices.

I tried adapting the example but had no luck getting the right solution. I feel my problem (and i could be 100% wrong) is in the final step with the cuBLAS function cublasStrsm(). It doesn’t seem to allow passing in a vector B of one size and outputting a vector X of another size.

In the case of (m < n) the vector i’m passing in is Qtb of size [2 x 1] and the matrix i’m passing in is R of size [2 x 3], i expect to be able to get an output of size [3 x 1]. However, the result is written into the Qtb vector that is only [2 x 1] in size.

Below is my attempt at translating the code for non-square matrices. Currently it passes the (m == n) test, fails the (m < n) test, and throws an error after cusolverDnDormqr() in the (m > n) test.

```
/* * How to compile (assume cuda is installed at /usr/local/cuda/)
* nvcc -c -I/usr/local/cuda/include ormqr_example.cpp
* nvcc -o a.out ormqr_example.o -L/usr/local/cuda/lib64 -lcublas -lcusolver
*
*/
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cusolverDn.h>
void printMatrix(int m, int n, const double*A, int lda, const char* name)
{
for (int row = 0 ; row < m ; row++)
{
for(int col = 0 ; col < n ; col++)
{
double Areg = A[row + col*lda];
printf("%s(%d,%d) = %f\n", name, row+1, col+1, Areg);
}
}
}
void cudaQrSolve(double* pA,
int numRowsA,
int numColsA,
double* pB,
int numRowsB, // must equal numRowsA
double** ppX,
int* pNumRowsX) // will equal numColsA
{
/* | 1 2 3 |
* A = | 4 5 6 |
* | 2 1 1 |
*
* x = (1 1 1)'
* b = (6 15 4)'
*/
cusolverDnHandle_t cudenseH = NULL;
cublasHandle_t cublasH = NULL;
cublasStatus_t cublas_status = CUBLAS_STATUS_SUCCESS;
cusolverStatus_t cusolver_status = CUSOLVER_STATUS_SUCCESS;
cudaError_t cudaStat1 = cudaSuccess;
cudaError_t cudaStat2 = cudaSuccess;
cudaError_t cudaStat3 = cudaSuccess;
cudaError_t cudaStat4 = cudaSuccess;
const int m = numRowsA;
const int n = numColsA;
const int lda = m;
const int ldb = m;
const int nrhs = 1; // number of right hand side vectors
// A = [m x n]
// B = [m x 1]
// x = [n x 1]
/* | 1 2 3 |
* A = | 4 5 6 |
* | 2 1 1 |
*
* x = (1 1 1)'
* b = (6 15 4)'
*/
// solution matrix from GPU
double* XC = (double*)malloc(ldb*nrhs * sizeof(double));
double* d_A = NULL; // linear memory of GPU
double* d_tau = NULL; // linear memory of GPU
double* d_B = NULL;
int* devInfo = NULL; // info in gpu (device copy)
double* d_work = NULL;
int lwork = 0;
int info_gpu = 0;
const double one = 1;
printf("A = (matlab base-1)\n");
printMatrix(m, n, pA, lda, "A");
printf("=====\n");
printf("B = (matlab base-1)\n");
printMatrix(m, nrhs, pB, ldb, "B");
printf("=====\n");
// step 1: create cudense/cublas handle
cusolver_status = cusolverDnCreate(&cudenseH);
assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);
cublas_status = cublasCreate(&cublasH);
assert(CUBLAS_STATUS_SUCCESS == cublas_status);
// step 2: copy A and B to device
cudaStat1 = cudaMalloc ((void**)&d_A , sizeof(double) * m * n);
cudaStat2 = cudaMalloc ((void**)&d_tau, sizeof(double) * m);
cudaStat3 = cudaMalloc ((void**)&d_B , sizeof(double) * m * nrhs);
cudaStat4 = cudaMalloc ((void**)&devInfo, sizeof(int));
assert(cudaSuccess == cudaStat1);
assert(cudaSuccess == cudaStat2);
assert(cudaSuccess == cudaStat3);
assert(cudaSuccess == cudaStat4);
cudaStat1 = cudaMemcpy(d_A, pA, sizeof(double) * m * n , cudaMemcpyHostToDevice);
cudaStat2 = cudaMemcpy(d_B, pB, sizeof(double) * m * nrhs, cudaMemcpyHostToDevice);
assert(cudaSuccess == cudaStat1);
assert(cudaSuccess == cudaStat2);
// step 3: query working space of geqrf and ormqr
cusolver_status = cusolverDnDgeqrf_bufferSize(cudenseH,
m,
n,
d_A,
lda,
&lwork);
assert(cusolver_status == CUSOLVER_STATUS_SUCCESS);
cudaStat1 = cudaMalloc((void**)&d_work, sizeof(double)*lwork);
assert(cudaSuccess == cudaStat1);
// step 4: compute QR factorization
cusolver_status = cusolverDnDgeqrf(cudenseH,
m,
n,
d_A,
lda,
d_tau,
d_work,
lwork,
devInfo);
cudaStat1 = cudaDeviceSynchronize();
assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);
assert(cudaSuccess == cudaStat1);
// check if QR is good or not
cudaStat1 = cudaMemcpy(&info_gpu, devInfo, sizeof(int), cudaMemcpyDeviceToHost);
assert(cudaSuccess == cudaStat1);
printf("after geqrf: info_gpu = %d\n", info_gpu);
assert(0 == info_gpu);
// step 5: compute Q^T*B
cusolver_status = cusolverDnDormqr(cudenseH,
CUBLAS_SIDE_LEFT,
CUBLAS_OP_T,
m,
nrhs,
m,
d_A,
lda,
d_tau,
d_B,
ldb,
d_work,
lwork,
devInfo);
cudaStat1 = cudaDeviceSynchronize();
assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);
assert(cudaSuccess == cudaStat1);
// check if QR is good or not
cudaStat1 = cudaMemcpy(&info_gpu, devInfo, sizeof(int), cudaMemcpyDeviceToHost);
assert(cudaSuccess == cudaStat1);
printf("after ormqr: info_gpu = %d\n", info_gpu);
assert(0 == info_gpu);
// step 6: compute x = R \ Q^T*B
cublas_status = cublasDtrsm(cublasH,
CUBLAS_SIDE_LEFT,
CUBLAS_FILL_MODE_UPPER,
CUBLAS_OP_N,
CUBLAS_DIAG_NON_UNIT,
m,
nrhs,
&one,
d_A,
lda,
d_B,
ldb);
cudaStat1 = cudaDeviceSynchronize();
assert(CUBLAS_STATUS_SUCCESS == cublas_status);
assert(cudaSuccess == cudaStat1);
cudaStat1 = cudaMemcpy(XC, d_B, sizeof(double)*ldb*nrhs, cudaMemcpyDeviceToHost);
assert(cudaSuccess == cudaStat1);
printf("X = (matlab base-1)\n");
printMatrix(m, nrhs, XC, ldb, "X");
// free resources
if (d_A ) cudaFree(d_A);
if (d_tau ) cudaFree(d_tau);
if (d_B ) cudaFree(d_B);
if (devInfo) cudaFree(devInfo);
if (d_work ) cudaFree(d_work);
if (cublasH ) cublasDestroy(cublasH);
if (cudenseH) cusolverDnDestroy(cudenseH);
if (ppX != NULL)
{
*ppX = XC;
}
else if (XC != NULL)
{
free(XC);
}
if (pNumRowsX != NULL)
{
*pNumRowsX = numColsA;
}
}
int main(int argc, char*argv[])
{
// test square (m == n)
{
/* | 1 2 3 |
* A = | 4 5 6 |
* | 2 1 1 |
*
* b = (6 15 4)'
*
* x = (1 1 1)'
*/
const int num_rows = 3;
const int num_cols = 3;
// @note column major
double A[num_rows * num_cols] = { 1.0, 4.0, 2.0,
2.0, 5.0, 1.0,
3.0, 6.0, 1.0};
double B[num_rows] = { 6.0,
15.0,
4.0};
double X[num_cols] = { 1.0,
1.0,
1.0};
double* p_x = NULL;
int num_rows_x = 0;
cudaQrSolve(A,
num_rows,
num_cols,
B,
num_rows,
&p_x,
&num_rows_x);
assert(num_rows_x == num_cols);
for (int i = 0; i < num_rows_x; ++i)
{
// check if they are almost equal
double diff = X[i] - p_x[i];
if (abs(diff) > 1.0e-6)
{
printf("ERROR: test square - expected != actual...\n"
" expected = %f\n"
" actual = %f\n"
" index = %i\n"
" num rows = %i\n",
X[i],
p_x[i],
i,
num_rows_x);
}
} // end for loop check
if (p_x != NULL)
{
free(p_x);
}
} // end test square matrix
// test rectangle (m < n)
{
/* | 1 2 3 |
* A = | 4 5 6 |
*
* b = (6 15)'
*
* x = (1.5 0.0 1.5)'
*/
const int num_rows = 2;
const int num_cols = 3;
// @note column major
double A[num_rows * num_cols] = { 1.0, 4.0,
2.0, 5.0,
3.0, 6.0};
double B[num_rows] = { 6.0,
15.0};
double X[num_cols] = { 1.5,
0.0,
1.5};
double* p_x = NULL;
int num_rows_x = 0;
cudaQrSolve(A,
num_rows,
num_cols,
B,
num_rows,
&p_x,
&num_rows_x);
assert(num_rows_x == num_cols);
for (int i = 0; i < num_rows_x; ++i)
{
// check if they are almost equal
double diff = X[i] - p_x[i];
if (abs(diff) > 1.0e-6)
{
printf("ERROR: test m<n - expected != actual...\n"
" expected = %f\n"
" actual = %f\n"
" index = %i\n"
" num rows = %i\n",
X[i],
p_x[i],
i,
num_rows_x);
}
} // end for loop check
if (p_x != NULL)
{
free(p_x);
}
} // end test square matrix
// test rectangle (m > n)
{
/* | 1 2 |
* A = | 4 5 |
* | 2 1 |
*
* b = (6 15 4 2)'
*
* x = (1 1 1 1)'
*/
const int num_rows = 3;
const int num_cols = 2;
// @note column major
double A[num_rows * num_cols] = { 1.0, 4.0, 2.0,
2.0, 5.0, 1.0 };
double B[num_rows] = { 6.0,
15.0,
4.0};
double X[num_cols] = { 0.666666666666,
2.5};
double* p_x = NULL;
int num_rows_x = 0;
cudaQrSolve(A,
num_rows,
num_cols,
B,
num_rows,
&p_x,
&num_rows_x);
assert(num_rows_x == num_cols);
for (int i = 0; i < num_rows_x; ++i)
{
// check if they are almost equal
double diff = X[i] - p_x[i];
if (abs(diff) > 1.0e-6)
{
printf("ERROR: test m>n - expected != actual...\n"
" expected = %f\n"
" actual = %f\n"
" index = %i\n"
" num rows = %i\n",
X[i],
p_x[i],
i,
num_rows_x);
}
} // end for loop check
if (p_x != NULL)
{
free(p_x);
}
} // end test square matrix
cudaDeviceReset();
return 0;
}
```

Thanks in advance for any help you can provide.