Hi,
in fact, i do a bloc by bloc loading form my file which contains my matrix A and my vector B because the matrix is too big to be saved in the CPU memory.
For each bloc Ai of the matrix A i do this :
WHILE !feof(matrix_file)
load(Ai, Bi)
AiTAi = transpose(Ai) . Ai
AiTBi = transpose(Ai) . Bi
ATA += AiTAi
ATB += AiTBi
END
At the end of this part, AtA contains a column-major symetrical matrix and AtB is a simple vector, so i have no transpose to care about.
Now, when i try to call cublasDtrsv() like this it returns wrong values.
cublasDtrsv('L', 'N', 'N', NC, dev_AtA, NC, dev_Atb, 1);
If i try with this matrix,
1 0 0 0 0
0 1 0 0 0
0 0 1 0 0
0 0 0 1 0
0 0 0 0 1
and this vector B,
1
2
3
4
5
the call returns the right values.
But if i try with that matrix and the same vector B, the results are wrong again …
1 0 0 0 0
0 1 0 0 0
0 0 1 0 0
0 0 0 1 0
1 0 0 0 1
Here is my code, with french comments, sorry about that.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cublas.h>
#include "magma.h"
#include "magma_lapack.h"
#define IDX2C(i,j,ld) (((j)*(ld))+(i))
#define IDX2F(i,j,ld) ((((j)-1)*(ld))+((i)-1))
void sum(double * A, double * Ai, int NL, int NC) {
int i;
for(i=0; i < NL*NC; ++i) {
*(A+i) += *(Ai+i);
}
}
int getMessage(cublasStatus c) {
switch(c) {
case CUBLAS_STATUS_SUCCESS : break;
case CUBLAS_STATUS_NOT_INITIALIZED :
printf("Librairie CUBLAS non initialisée.\n");
break;
case CUBLAS_STATUS_ALLOC_FAILED :
printf("Problème d'allocation mémoire.\n");
break;
case CUBLAS_STATUS_ARCH_MISMATCH :
printf("Le périphérique utilisé ne gère pas la double précision.\n");
break;
case CUBLAS_STATUS_EXECUTION_FAILED :
printf("Echec lors du l'exécution de la fonction sur le GPU.\n");
break;
default: printf("ERREUR NON REPERTORIEE\n"); break;
}
}
void read_dimensions(FILE * file_a, int * NL, int * NC) {
fscanf(file_a, "%d\n", NL);
fscanf(file_a, "%d\n", NC);
}
void read_bloc(FILE * file_a, FILE * file_b, double * Ai, double * bi, int NC, int BLOC, int * lines_read) {
int i = 0, j;
/* Lecture du fichier matrice */
while(!feof(file_a) && i < BLOC) {
for(j=0; j < NC; ++j) {
fscanf(file_a, "%lf ", (Ai+(i*NC)+j));
//fscanf(file_a, "%lf ", (Ai+(j*NC)+i));
}
i++;
}
/* Lecture du fichier vecteur */
i = 0;
while(!feof(file_b) && i < BLOC) {
fscanf(file_b, "%lf\n", (bi+i));
i++;
}
*lines_read += i;
}
int main(int argc, char * argv[]) {
if(argc != 4) {
printf("\n\tusage: ./cublas_cholesky [bloc] [file_a] [file_b]\n\n");
return 1;
}
/* Pointeurs GPU */
double * dev_Ai, * dev_AitAi, * dev_Aitbi, * dev_bi, * dev_AtA, * dev_Atb;
/* Pointeurs CPU */
double * Ai, * bi, * AitAi, * Aitbi, * AtA, * Atb;
int NL, NC, BLOC, i, j, lines_read = 0;
FILE * file_a, * file_b, * res;
/* Init */
BLOC = atoi(argv[1]);
file_a = fopen(argv[2], "r");
file_b = fopen(argv[3], "r");
res = fopen("res.txt", "w");
/* Ouverture du fichier de la matrice */
if(!file_a) {
printf("erreur: impossible d'ouvrir le fichier '%s'.\n", argv[2]);
return 1;
}
/* Ouverture du fichier du vecteur */
if(!file_b) {
printf("erreur: impossible d'ouvrir le fichier '%s'.\n", argv[3]);
return 1;
}
/* Chargement des dimensions */
read_dimensions(file_a, &NL, &NC);
/* bloc Ai */
if( !(Ai = (double*)malloc(NC*BLOC*sizeof(double)))) {
printf("fail!\n");
return 1;
}
/* bloc bi */
if( !(bi = (double*)malloc(BLOC*sizeof(double)))) {
printf("fail!\n");
return 1;
}
/* Ait.Ai */
if( !(AitAi = (double*)malloc(NC*NC*sizeof(double)))) {
printf("fail!\n");
return 1;
}
/* Ait.bi */
if( !(Aitbi = (double*)malloc(NC*sizeof(double)))) {
printf("fail!\n");
return 1;
}
/* At.A */
if( !(AtA = (double*)malloc(NC*NC*sizeof(double)))) {
printf("fail!\n");
return 1;
}
/* At.b */
if( !(Atb = (double*)malloc(NC*sizeof(double)))) {
printf("fail!\n");
return 1;
}
/* Initialisation de Cublas */
cublasInit();
/* Ai en GPU */
if(cublasAlloc(BLOC*NC, sizeof(double), (void **)&dev_Ai) != CUBLAS_STATUS_SUCCESS) {
printf("fail!\n");
cublasShutdown();
return 1;
}
/* vecteur b en GPU */
if(cublasAlloc(BLOC, sizeof(double), (void **)&dev_bi) != CUBLAS_STATUS_SUCCESS) {
printf("fail!\n");
cublasShutdown();
return 1;
}
/* AitAi en GPU */
if(cublasAlloc(NC*NC, sizeof(double), (void **)&dev_AitAi) != CUBLAS_STATUS_SUCCESS) {
printf("fail!\n");
cublasShutdown();
return 1;
}
/* Aitbi en GPU */
if(cublasAlloc(NC, sizeof(double), (void **)&dev_Aitbi) != CUBLAS_STATUS_SUCCESS) {
printf("fail!\n");
cublasShutdown();
return 1;
}
/* At.A en GPU */
if(cublasAlloc(NC*NC, sizeof(double), (void **)&dev_AtA) != CUBLAS_STATUS_SUCCESS) {
printf("fail!\n");
cublasShutdown();
return 1;
}
/* At.b en GPU */
if(cublasAlloc(NC, sizeof(double), (void **)&dev_Atb) != CUBLAS_STATUS_SUCCESS) {
printf("fail!\n");
cublasShutdown();
return 1;
}
/* Initialisation à 0 de AtA et Atb */
memset(AtA, 0, NC*NC*sizeof(double));
memset(Atb, 0, NC*sizeof(double));
while( (NL - lines_read) > 0 ) {
/* Chargement d'un bloc de la matrice en mémoire */
read_bloc(file_a, file_b, Ai, bi, NC, BLOC, &lines_read);
/* Mise en mémoire GPU de la matrice Ai */
if(cublasSetMatrix(BLOC, NC, sizeof(double), Ai, BLOC, dev_Ai, BLOC) != CUBLAS_STATUS_SUCCESS) {
printf("fail!\n");
cublasShutdown();
return 1;
}
/* Mise en mémoire GPU du vecteur b */
if(cublasSetVector(BLOC, sizeof(double), bi, 1, dev_bi, 1) != CUBLAS_STATUS_SUCCESS) {
printf("fail!\n");
cublasShutdown();
return 1;
}
/* Calcul de (Ait . Ai) */
for(i=0; i < NC; ++i) {
for(j=0; j < NC; ++j) {
*(AitAi+j*NC+i) = cublasDdot(BLOC, dev_Ai+i, NC, dev_Ai+j, NC);
}
}
if(cublasGetError() != CUBLAS_STATUS_SUCCESS) {
getMessage(cublasGetError());
cublasShutdown();
return 1;
}
/* Calcul de (Ait . bi) */
for(i=0; i < NC; ++i) {
*(Aitbi+i) = cublasDdot(BLOC, dev_Ai+i, NC, dev_bi, 1);
}
if(cublasGetError() != CUBLAS_STATUS_SUCCESS) {
getMessage(cublasGetError());
cublasShutdown();
return 1;
}
/* Somme de Ai et bi */
sum(AtA, AitAi, NC, NC);
sum(Atb, Aitbi, NC, 1);
}
/* Désallocation CPU et GPU des matrices inutiles */
cublasFree(dev_Ai);
cublasFree(dev_bi);
cublasFree(dev_AitAi);
cublasFree(dev_Aitbi);
free(Ai);
free(bi);
free(AitAi);
free(Aitbi);
close(file_a);
close(file_b);
/* Mise en mémoire GPU de la matrice At.A */
if(cublasSetMatrix(NC, NC, sizeof(double), AtA, NC, dev_AtA, NC) != CUBLAS_STATUS_SUCCESS) {
printf("fail!\n");
cublasShutdown();
return 1;
}
/* Mise en mémoire GPU de la matrice At.b */
if(cublasSetVector(NC, sizeof(double), Atb, 1, dev_Atb, 1) != CUBLAS_STATUS_SUCCESS) {
printf("fail!\n");
cublasShutdown();
return 1;
}
/* Résolution du système */
cublasDtrsv('L', 'N', 'N', NC, dev_AtA, NC, dev_Atb, 1);
if(cublasGetError() != CUBLAS_STATUS_SUCCESS) {
getMessage(cublasGetError());
cublasShutdown();
return 1;
}
memset(Atb, 0, NC*sizeof(double));
/* Récupération de la matrice */
if(cublasGetVector(NC, sizeof(double), dev_Atb, 1, Atb,1) != CUBLAS_STATUS_SUCCESS) {
printf("fail!\n");
getMessage(cublasGetError());
cublasShutdown();
return 1;
}
/* Enregistrement du vecteur X dans le fichier résultat */
for(i = 0; i < NC; ++i) {
printf("%f\n", *(Atb+i));
}
close(res);
/* Désallocation de la mémoire GPU */
cublasFree(dev_AtA);
cublasFree(dev_Atb);
/* Libération des ressources GPU de cublas */
cublasShutdown();
/* Désallocation de la mémoire CPU */
free(AtA);
free(Atb);
return 0;
}