residual is increasing instead of decreasing

Hello ,
I am trying to solve Ax=b using conjugate gradient method.
I checked the nvidia example and because my matrix A is not a sparse matrix , I used the cublasSgemv function.

The problem is that the residual first increase , then decreases and then increases again with the kast value being of 10^6.

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <iostream>
#include <cublas_v2.h>

using namespace std;

int main() {
   
	int Rows = 80 ,Cols = 64;
	int Total = Rows * Cols;
	
	//allocate host memory
	float * A,
		  * b,
		  * x;
		 
	
	A = (float *) malloc( Total  * Total * sizeof(float) );

	b = (float *) malloc( Total * sizeof(float) );
	
	x = (float *) malloc( Total * sizeof(float) );
	
		
	//allocate device memory
	float * d_A,
	      * d_x,
	      * d_r,
	      * d_p,
	      * d_Ax;
	      
		  
	cudaMalloc( (void **)&d_A, Total * Total * sizeof(float) );
	cudaMalloc( (void **)&d_Ax, Total  * sizeof(float) );
	cudaMalloc( (void **)&d_x, Total  * sizeof(float) );
	cudaMalloc( (void **)&d_r, Total * sizeof(float));
	cudaMalloc( (void **)&d_p, Total * sizeof(float));
    
	
	//read Input
	FILE * fileA;
	fileA = fopen( "Asymmetric", "rb" );
	fread( A, sizeof(float), Total * Total , fileA );
	fclose( fileA );
	
	//copy A to device
	cudaMemcpy(d_A, A, Total *  Total * sizeof(float), cudaMemcpyHostToDevice);
	
	FILE * fileB;
	fileB = fopen( "B", "rb" );
	fread( b, sizeof(float), Total  ,fileB );
	fclose( fileB );
	
	//copy b to device
	cudaMemcpy(d_r, b, Total * sizeof(float), cudaMemcpyHostToDevice);
	
		
	const float tol = 1e-5f;
    const int max_iter = 10000;
    float a, bb, na, r0, r1,dot;
    float alpha, beta, alpham1;

for (int i = 0; i < Total; i++)
        x[i] = 0.0;

	//copy to host
    cudaMemcpy(d_x, x, Total * sizeof(float), cudaMemcpyHostToDevice);
   
	
	/* Get handle to the CUBLAS context */
    cublasHandle_t cublasHandle = 0;
    cublasStatus_t cublasStatus;
    cublasStatus = cublasCreate(&cublasHandle);
    
    cublasStatus;
    
    alpha = 1.0;
    alpham1 = -1.0;
    beta = 0.0;
    r0 = 0.;
    
    //  Solve Ax = b 
	cublasSgemv(cublasHandle , CUBLAS_OP_T , Total  , Total , &alpha , d_A ,  Total  ,  d_x ,1 , &beta , d_Ax , 1 );
		
	cublasSaxpy(cublasHandle,  Total  , &alpham1, d_Ax, 1, d_r, 1);
	cublasStatus = cublasSdot(cublasHandle, Total , d_r, 1, d_r, 1, &r1);
	cudaDeviceSynchronize();
		
    int k = 1;
	
   	while (r1 > tol*tol && k <= max_iter)
    		{
        if (k > 1)
       			{
           			bb = r1 / r0;
           			cublasStatus = cublasSscal(cublasHandle, Total, &bb, d_p, 1);
           			cublasStatus = cublasSaxpy(cublasHandle,  Total , &alpha, d_r, 1, d_p, 1);
           			 
        		}
       	else
       			{
            		cublasStatus = cublasScopy(cublasHandle,  Total , d_r, 1, d_p, 1);
       			}

       	cublasSgemv(cublasHandle , CUBLAS_OP_T , Total  , Total ,  &alpha , d_A ,  Total  ,  d_p , 1 , &beta , d_Ax , 1 );
       	    
        cublasStatus = cublasSdot(cublasHandle,  Total , d_p, 1, d_Ax, 1, &dot);
        cudaDeviceSynchronize();
             
        a = r1 / dot;
        
        cublasStatus = cublasSaxpy(cublasHandle,  Total , &a, d_p, 1, d_x, 1);
              
        na = -a;
        	
        cublasStatus = cublasSaxpy(cublasHandle,  Total , &na, d_Ax, 1, d_r, 1);

        r0 = r1;
        cublasStatus = cublasSdot(cublasHandle,  Total , d_r, 1, d_r, 1, &r1); 	
        cudaDeviceSynchronize();
        
        cout << "\niteration = "<<k<<" , residual = "<<sqrt(r1)<<endl;
        k++;
   		 }
   		 
		
	cudaMemcpy(x, d_x, Total * sizeof(float), cudaMemcpyDeviceToHost);
	
    cublasDestroy(cublasHandle);

	
	free( b );
	free( A );
	free( x );
	
	cudaFree( d_A );
	cudaFree( d_Ax );
	cudaFree( d_x );
	cudaFree( d_r );
 
	cudaDeviceReset();
	
	return 0;

}

Thanks!

PS: I can’t upload the A and B matrices , I don’t know how.

Is A a symmetric positive definite matrix?

Yes.

Do you have any ideas for that?

Thanks!

I am not sure about the alpham1 = -1 .
Why the example in nvida uses -1 and not 1?

Also, printing the “dot” values ,I can see that they have values in the order of ^34.
And the last value is nan and the x vector I am taking has nan values.

Hmm , it seems I have something wrong in the generation of the SPDM matrix.

I now used curand and cublas functions to produce it and the residual is decreasing!
Ok!