Hello ,
I am trying to solve Ax=b using conjugate gradient method.
I checked the nvidia example and because my matrix A is not a sparse matrix , I used the cublasSgemv function.
The problem is that the residual first increase , then decreases and then increases again with the kast value being of 10^6.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <iostream>
#include <cublas_v2.h>
using namespace std;
int main() {
int Rows = 80 ,Cols = 64;
int Total = Rows * Cols;
//allocate host memory
float * A,
* b,
* x;
A = (float *) malloc( Total * Total * sizeof(float) );
b = (float *) malloc( Total * sizeof(float) );
x = (float *) malloc( Total * sizeof(float) );
//allocate device memory
float * d_A,
* d_x,
* d_r,
* d_p,
* d_Ax;
cudaMalloc( (void **)&d_A, Total * Total * sizeof(float) );
cudaMalloc( (void **)&d_Ax, Total * sizeof(float) );
cudaMalloc( (void **)&d_x, Total * sizeof(float) );
cudaMalloc( (void **)&d_r, Total * sizeof(float));
cudaMalloc( (void **)&d_p, Total * sizeof(float));
//read Input
FILE * fileA;
fileA = fopen( "Asymmetric", "rb" );
fread( A, sizeof(float), Total * Total , fileA );
fclose( fileA );
//copy A to device
cudaMemcpy(d_A, A, Total * Total * sizeof(float), cudaMemcpyHostToDevice);
FILE * fileB;
fileB = fopen( "B", "rb" );
fread( b, sizeof(float), Total ,fileB );
fclose( fileB );
//copy b to device
cudaMemcpy(d_r, b, Total * sizeof(float), cudaMemcpyHostToDevice);
const float tol = 1e-5f;
const int max_iter = 10000;
float a, bb, na, r0, r1,dot;
float alpha, beta, alpham1;
for (int i = 0; i < Total; i++)
x[i] = 0.0;
//copy to host
cudaMemcpy(d_x, x, Total * sizeof(float), cudaMemcpyHostToDevice);
/* Get handle to the CUBLAS context */
cublasHandle_t cublasHandle = 0;
cublasStatus_t cublasStatus;
cublasStatus = cublasCreate(&cublasHandle);
cublasStatus;
alpha = 1.0;
alpham1 = -1.0;
beta = 0.0;
r0 = 0.;
// Solve Ax = b
cublasSgemv(cublasHandle , CUBLAS_OP_T , Total , Total , &alpha , d_A , Total , d_x ,1 , &beta , d_Ax , 1 );
cublasSaxpy(cublasHandle, Total , &alpham1, d_Ax, 1, d_r, 1);
cublasStatus = cublasSdot(cublasHandle, Total , d_r, 1, d_r, 1, &r1);
cudaDeviceSynchronize();
int k = 1;
while (r1 > tol*tol && k <= max_iter)
{
if (k > 1)
{
bb = r1 / r0;
cublasStatus = cublasSscal(cublasHandle, Total, &bb, d_p, 1);
cublasStatus = cublasSaxpy(cublasHandle, Total , &alpha, d_r, 1, d_p, 1);
}
else
{
cublasStatus = cublasScopy(cublasHandle, Total , d_r, 1, d_p, 1);
}
cublasSgemv(cublasHandle , CUBLAS_OP_T , Total , Total , &alpha , d_A , Total , d_p , 1 , &beta , d_Ax , 1 );
cublasStatus = cublasSdot(cublasHandle, Total , d_p, 1, d_Ax, 1, &dot);
cudaDeviceSynchronize();
a = r1 / dot;
cublasStatus = cublasSaxpy(cublasHandle, Total , &a, d_p, 1, d_x, 1);
na = -a;
cublasStatus = cublasSaxpy(cublasHandle, Total , &na, d_Ax, 1, d_r, 1);
r0 = r1;
cublasStatus = cublasSdot(cublasHandle, Total , d_r, 1, d_r, 1, &r1);
cudaDeviceSynchronize();
cout << "\niteration = "<<k<<" , residual = "<<sqrt(r1)<<endl;
k++;
}
cudaMemcpy(x, d_x, Total * sizeof(float), cudaMemcpyDeviceToHost);
cublasDestroy(cublasHandle);
free( b );
free( A );
free( x );
cudaFree( d_A );
cudaFree( d_Ax );
cudaFree( d_x );
cudaFree( d_r );
cudaDeviceReset();
return 0;
}
Thanks!
PS: I can’t upload the A and B matrices , I don’t know how.