I am trying to use cuSolver’s LU to solve a linear equation. My code is basically copied from the sample code in the manual.
When I run the same code WITHOUT pivot, it produces the correct solution to the linear equation. However, I want to be able to run with pivoting. When I run it WITH pivot , it produces a pivot list which does not make any sense.
Not sure if I am following forum etiquette here so if the question needs to be improved please tell me how. Code follows:
#define EQNS_TOTAL 2
CallMAC(cudaMalloc((void**)&d_Ipiv512, sizeof(int) * 4*EQNS_TOTAL));
CallMAC(cudaMalloc((void**)&p_RHS, sizeof(double) * 4 * EQNS_TOTAL));
CallMAC(cudaMalloc((void**)&d_info, sizeof(int) * 4 * EQNS_TOTAL));
CallMAC(cudaMalloc((void**)&p_eqns, sizeof(double) * 4 * EQNS_TOTAL * 4 * EQNS_TOTAL));
…
cusolverDnHandle_t cusolverH = NULL;
cudaStream_t stream = NULL;
cusolverStatus_t status = CUSOLVER_STATUS_SUCCESS;
cudaError_t cudaStat1 = cudaSuccess;
cudaError_t cudaStat2 = cudaSuccess;
cudaError_t cudaStat3 = cudaSuccess;
cudaError_t cudaStat4 = cudaSuccess;
const int m = 4 * EQNS_TOTAL;
const int lda = m;
const int ldb = m;
int lwork = 0; /* size of workspace */
const int pivot_on = 1;
// 4. Solve equations : cuSolver parallel solve…
//====================================================
// Spit out matrix eqn for purpose of query on forums:
cudaMemcpy(p_eqns_host, p_eqns, 4 * EQNS_TOTAL * 4 * EQNS_TOTAL * sizeof(f64),
cudaMemcpyDeviceToHost);
cudaMemcpy(p_temphost5, p_RHS, 4 * EQNS_TOTAL * sizeof(f64), cudaMemcpyDeviceToHost);
for (i = 0; i < m; i++)
{
for (j = 0; j < m; j++) printf("%1.3E ", p_eqns_host[i + j*m]);
printf(" | %1.4E\n", p_temphost5[i]);
};
printf("\n");
if (pivot_on) {
printf("pivot is on : compute P*A = L*U \n");
} else {
printf("pivot is off: compute A = L*U (not numerically stable)\n");
};
// step 1: create cusolver handle, bind a stream
status = cusolverDnCreate(&cusolverH);
if (CUSOLVER_STATUS_SUCCESS != status) { printf("cusolverDnCreate(&cusolverH) failed.\n"); getch(); }
cudaStat1 = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
if (cudaSuccess != cudaStat1) { printf("cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); failed.\n"); getch(); }
status = cusolverDnSetStream(cusolverH, stream);
if (CUSOLVER_STATUS_SUCCESS != status) {printf("cusolverDnSetStream(cusolverH, stream) failed.\n"); getch();}
// step 2: 'copy A to device'
// p_eqns is allocated on device, and so is p_RHS, d_Ipiv512, d_info
// step 3: query working space of getrf
// status = cusolverDnDgetrf_bufferSize(
// cusolverH, m, m, d_A,lda,&lwork);
status = cusolverDnDgetrf_bufferSize(
cusolverH,
4 * EQNS_TOTAL, 4 * EQNS_TOTAL,
p_eqns,
4 * EQNS_TOTAL, &lwork);
if (CUSOLVER_STATUS_SUCCESS != status) {
printf("cusolverDnDgetrf_bufferSize failed.\n"); getch();
} else {
printf("Success:cusolverDnDgetrf_bufferSize\n");
};
CallMAC(cudaMalloc((void**)&d_work, sizeof(double)*lwork));
// step 4: LU factorization
if (pivot_on) {
status = cusolverDnDgetrf(
cusolverH,
4 * EQNS_TOTAL, 4 * EQNS_TOTAL,
p_eqns, lda, d_work, d_Ipiv512, d_info);
// status = cusolverDnDgetrf(
// cusolverH, m, m, p_eqns, // 4N4N
// lda,d_work, d_Ipiv512, // 1284 = 512
// d_info);
} else {
status = cusolverDnDgetrf(
cusolverH,
4*EQNS_TOTAL, 4 * EQNS_TOTAL,
p_eqns, lda, d_work, NULL, d_info);
}
if (CUSOLVER_STATUS_SUCCESS != status) {
printf(“cusolverDnDgetrf failed.\n”); getch();
} else {
printf(“cusolverDnDgetrf : CUSOLVER_STATUS_SUCCESS == status\n”);
}
CallMAC(cudaDeviceSynchronize());
int info;
if (pivot_on) {
CallMAC(cudaMemcpy(Ipiv512, d_Ipiv512, sizeof(int)*m, cudaMemcpyDeviceToHost));
}
// cudaStat2 = cudaMemcpy(LU, p_eqns, sizeof(double)*4*4*EQNS_TOTAL, cudaMemcpyDeviceToHost);
CallMAC(cudaMemcpy(&info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
if (0 > info) {
printf("%d-th parameter is wrong \n", -info);
while (1) getch();
}
if ((pivot_on)) {
printf("pivoting sequence: (m = %d; lda = %d)\n", m, lda);
for (int j = 0; j < 4*EQNS_TOTAL; j++) {
printf("Ipiv[%d] = %d\n", j, Ipiv512[j]);
//if (j % 4 == 0) printf("");
};
}
while (1) getch();
The results of this code are as follows:
Solving equations.
8.572E+01 -3.108E+00 -8.500E-03 -3.278E-01 -2.540E+01 -2.573E+00 2.961E-03 1.070E-01 | 1.0897E-01
-4.214E+00 2.537E+00 3.783E-03 1.868E-02 -3.014E+00 2.921E-01 -6.221E-04 1.185E-02 | -8.3694E-03
-9.300E-03 -3.135E-03 1.000E+00 0.000E+00 1.882E-03 9.359E-04 -2.495E-07 0.000E+00 | 2.9530E-04
-9.054E-05 4.613E-07 0.000E+00 1.000E+00 3.362E-05 3.446E-06 0.000E+00 -1.279E-05 | -5.3642E-05
-2.496E+01 -1.681E+00 2.863E-03 1.051E-01 6.689E+01 -2.609E+00 -6.629E-03 -2.823E-01 | 5.7258E-02
-4.116E+00 6.478E-01 -4.760E-04 1.659E-02 -2.502E+00 4.028E+00 3.267E-03 1.449E-02 | -8.0247E-03
2.367E-03 7.722E-04 -3.022E-07 0.000E+00 -5.390E-03 -2.485E-03 1.000E+00 0.000E+00 | 4.3246E-05
3.516E-05 5.624E-06 0.000E+00 -1.336E-05 -9.226E-05 2.015E-06 0.000E+00 1.000E+00 | -4.3637E-04
pivot is on : compute PA = LU
Success:cusolverDnDgetrf_bufferSize
Success: cudaMalloc((void**)&d_work, sizeof(double)*lwork) |||
cusolverDnDgetrf : CUSOLVER_STATUS_SUCCESS == status
Success: cudaDeviceSynchronize() |||
pivoting sequence: (m = 8; lda = 8)
Ipiv[0] = 1
Ipiv[1] = 5
Ipiv[2] = 3
Ipiv[3] = 4
Ipiv[4] = 5
Ipiv[5] = 6
Ipiv[6] = 7
Ipiv[7] = 8
To me, that does not make any sense. The solution vector also comes out incorrect. Thanks for any suggestions.