the data link:
data.rar (7.4 MB)
int nnz = 3397636;
int N = 262144;
int* csrRow = new int[N + 1];
int* csrCol = new int[nnz];
float* csrVal = new float[nnz];
float* rhsXYZ[3];
rhsXYZ[0] = new float[N];
rhsXYZ[1] = new float[N];
rhsXYZ[2] = new float[N];
FILE* f1 = fopen("mtxF1.txt", "r");
for (int i = 0; i < nnz; i++)
fscanf(f1, "%d %f\n", &csrCol[i], &csrVal[i]);
fclose(f1);
FILE* f2 = fopen("mtxF2.txt", "r");
for (int i = 0; i < N + 1; i++)
fscanf(f2, "%d\n", &csrRow[i]);
fclose(f2);
FILE* f3 = fopen("bVecRHS.txt", "r");
for (int i = 0; i < N; i++)
fscanf(f3, "%f %f %f\n", &rhsXYZ[0][i], &rhsXYZ[1][i], &rhsXYZ[2][i]);
fclose(f3);
int* dCsrRow, * dCsrCol;
float* dCsrVal;
cudaMalloc((void**)&dCsrRow, sizeof(int) * (N + 1));
cudaMalloc((void**)&dCsrCol, sizeof(int) * (nnz));
cudaMalloc((void**)&dCsrVal, sizeof(float) * (nnz));
cudaMemcpy(dCsrRow, csrRow, sizeof(int) * (N + 1), cudaMemcpyHostToDevice);
cudaMemcpy(dCsrCol, csrCol, sizeof(int) * (nnz), cudaMemcpyHostToDevice);
cudaMemcpy(dCsrVal, csrVal, sizeof(float) * (nnz), cudaMemcpyHostToDevice);
float* dBVec, *dResVec;
cudaMalloc((void**)&dBVec, sizeof(float) * (N));
cudaMalloc((void**)&dResVec, sizeof(float) * (N));
cusolverStatus_t stat;
cusolverSpHandle_t cusolverSpH = NULL;
stat = cusolverSpCreate(&cusolverSpH);
cusparseMatDescr_t descrA = NULL;
cusparseCreateMatDescr(&descrA);
cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO);
csrcholInfo_t d_info = NULL;
cusolverSpCreateCsrcholInfo(&d_info);
printf("step 10: analyze chol(A) to know structure of L\n");
stat = cusolverSpXcsrcholAnalysis(
cusolverSpH, N, nnz,
descrA, dCsrRow, dCsrCol,
d_info);
printf("step 11: workspace for chol(A)\n");
size_t size_internal = 0;
size_t size_chol = 0; // size of working space for csrlu
stat = cusolverSpScsrcholBufferInfo(
cusolverSpH, N, nnz,
descrA, dCsrVal, dCsrRow, dCsrCol,
d_info,
&size_internal,
&size_chol);
void* buffer_gpu;
cudaMalloc(&buffer_gpu, sizeof(char) * size_chol);
printf("step 12: compute A = L*L^T \n");
stat = cusolverSpScsrcholFactor(
cusolverSpH, N, nnz,
descrA, dCsrVal, dCsrRow, dCsrCol,
d_info,
buffer_gpu);
printf("step 13: check if the matrix is singular \n");
int singularity = 0;
const float tol = 1.e-7;
stat = cusolverSpDcsrcholZeroPivot(
cusolverSpH, d_info, tol, &singularity);
if (0 <= singularity) {
fprintf(stderr, "Error: A is not invertible, singularity=%d\n", singularity);
// return 1;
}
float* resXYZ[3];
resXYZ[0] = new float[N];
resXYZ[1] = new float[N];
resXYZ[2] = new float[N];
printf("step 14: solve A*x = b \n");
for (int i = 0; i < 3; i++)
{
cudaMemcpy(dBVec, rhsXYZ[i], sizeof(float) * N, cudaMemcpyHostToDevice);
stat = cusolverSpScsrcholSolve(
cusolverSpH, N, dBVec, dResVec, d_info, buffer_gpu);
cudaMemcpy(resXYZ[i], dResVec, sizeof(float) * N, cudaMemcpyDeviceToHost);
}