cublasZgemm doesn't work ?

Hi,

I am trying to use cublasZgemm on a Quadro FX 5800, but I don’t get the expected results:

A : 3.000000 3.000000 3.000000 0.000000 9.000000 1.000000 5.000000 2.000000

B : 6.000000 1.000000 4.000000 8.000000 1.000000 1.000000 6.000000 1.000000

C : 9.000000 7.000000 6.000000 3.000000 6.000000 5.000000 9.000000 1.000000

C : -104.000000 312.000000 -52.000000 164.000000 66.000000 170.000000 38.000000

instead of :

C : -8.000000 116.000000 -60.000000 108.000000 70.000000 178.000000 -58.000000 318.000000

I searched the CUBLAS programming guide, but everything I’m doing seems OK (I compile the code with -arch sm_13). Is it possible that the library simply doesn’t give the correct answers?

Thanks for your answers

Best regards

#include <stdio.h>

#include <stdlib.h>

#include <sys/time.h>

#include <cublas.h>

/*
 * Return the number of microseconds between the timestamp stored in `last`
 * and the current wall-clock time reported by gettimeofday(), then store the
 * current timestamp (microseconds since the Epoch) back into `last`.
 *
 * Calling it once with `last == 0` simply primes `last`; the value returned
 * by that first call is the absolute timestamp and is normally discarded.
 */
unsigned long long time_log(unsigned long long & last){
	struct timeval now;
	gettimeofday(&now, NULL);

	/* Widen before multiplying: `tv_sec * 1000000L` would be evaluated in a
	 * signed type that overflows on targets where time_t and long are
	 * 32 bits wide. */
	unsigned long long dnow = (unsigned long long) now.tv_sec * 1000000ULL
	                        + (unsigned long long) now.tv_usec;
	unsigned long long diff = dnow - last;

	last = dnow;
	return diff;
}

int main(int argc, char * argv[]){

	unsigned long long timeref=0;

	unsigned long long diffCalc, diffInit, diffAlloc, diffEcriture, diffLecture, diffLiberation;

	

	int m = atoi(argv[1]), n = atoi(argv[2]), k = atoi(argv[3]);

	double * A = (double*) malloc(2*m*k*sizeof(double)), *B = (double*) malloc(2*k*n*sizeof(double)), *C = (double*) malloc(2*m*n*sizeof(double));

	cuDoubleComplex * Ad, * Bd, * Cd;

	cublasStatus retStatus;

	int sizeA = 2*m*k, sizeB = 2*k*n, sizeC = 2*m*n;

	

	srand(123);

	

	double * start = A, *end = start+sizeA;

	for(;start!=end;++start)

		*start = rand()%10;

		

	start = B;

	end = start+sizeB;

	for(;start!=end;++start)

		*start = rand()%10;

		

	start = C;

	end = start+sizeC;

	for(;start!=end;++start)

		*start = rand()%10;

		

	cuDoubleComplex alpha, beta;

	

	alpha = make_cuDoubleComplex(atof(argv[4]), atof(argv[4]));

	beta = make_cuDoubleComplex(atof(argv[5]), atof(argv[5]));

	

	

	printf("A : ");

	start = A;

	end = start+sizeA;

	for(;start!=end;++start)

		printf("%f ", *start);

	printf("\n");

	

	printf("B : ");

	start = B;

	end = start+sizeB;

	for(;start!=end;++start)

		printf("%f ", *start);

	printf("\n");

		

	printf("C : ");

	start = C;

	end = start+sizeC;

	for(;start!=end;++start)

		printf("%f ", *start);

	printf("\n");

	

	

	time_log(timeref);

	

	cublasInit();

	diffInit = time_log(timeref);

	

	cublasAlloc (sizeA/2, sizeof(cuDoubleComplex), (void**)&Ad);

	

	retStatus = cublasGetError ();

    if (retStatus != CUBLAS_STATUS_SUCCESS) {

    	printf("CUBLAS: an error occured in cublasAlloc\n");

    } 

	cublasAlloc (sizeB/2, sizeof(cuDoubleComplex), (void**)&Bd);

	

	retStatus = cublasGetError ();

    if (retStatus != CUBLAS_STATUS_SUCCESS) {

    	printf("CUBLAS: an error occured in cublasAlloc\n");

    } 

	cublasAlloc (sizeC/2, sizeof(cuDoubleComplex), (void**)&Cd);

	diffAlloc = time_log(timeref);

	

	retStatus = cublasGetError ();

    if (retStatus != CUBLAS_STATUS_SUCCESS) {

    	printf("CUBLAS: an error occured in cublasAlloc\n");

    } 

	cublasSetMatrix (m, k, sizeof(cuDoubleComplex), A, k, (void*)Ad, k);

	

	retStatus = cublasGetError ();

    if (retStatus != CUBLAS_STATUS_SUCCESS) {

    	printf("CUBLAS: an error occured in cublasSetMatrix\n");

    } 

	cublasSetMatrix (k, n, sizeof(cuDoubleComplex), B, n, (void*)Bd, n);

	

	retStatus = cublasGetError ();

    if (retStatus != CUBLAS_STATUS_SUCCESS) {

    	printf("CUBLAS: an error occured in cublasSetMatrix\n");

    } 

	cublasSetMatrix (m, n, sizeof(cuDoubleComplex), C, n, (void*)Cd, n);

	diffEcriture = time_log(timeref);

	

	retStatus = cublasGetError ();

    if (retStatus != CUBLAS_STATUS_SUCCESS) {

    	printf("CUBLAS: an error occured in cublasSetMatrix\n");

    } 

	cublasZgemm ('n', 'n', m, n, k, alpha, (cuDoubleComplex *) Ad, k, (cuDoubleComplex *) Bd, n, beta, (cuDoubleComplex *) Cd, n);

	diffCalc = time_log(timeref);

	

	retStatus = cublasGetError ();

    if (retStatus != CUBLAS_STATUS_SUCCESS) {

    	printf("CUBLAS: an error occured in cublasZgemm : ");

    	if(retStatus == CUBLAS_STATUS_NOT_INITIALIZED)

    		printf("CUBLAS_STATUS_NOT_INITIALIZED\n");

    	else if (retStatus == CUBLAS_STATUS_INVALID_VALUE)

    		printf("CUBLAS_STATUS_INVALID_VALUE\n");

    	else if (retStatus == CUBLAS_STATUS_ARCH_MISMATCH)

    		printf("CUBLAS_STATUS_ARCH_MISMATCH\n");

    	else if (retStatus == CUBLAS_STATUS_EXECUTION_FAILED)

    		printf("CUBLAS_STATUS_EXECUTION_FAILED");

    	else

    		printf("IN YOUR ASS\n");

    } 

cublasGetMatrix (m, k, sizeof(cuDoubleComplex), Ad, k, A, k);

    cublasGetMatrix (k, n, sizeof(cuDoubleComplex), Bd, n, B, n);

	cublasGetMatrix (m, n, sizeof(cuDoubleComplex), Cd, n, C, n);

	

	retStatus = cublasGetError ();

    if (retStatus != CUBLAS_STATUS_SUCCESS) {

    	printf("CUBLAS: an error occured in cublasGetMatrix\n");

    } 

	diffLecture = time_log(timeref);

	

	cublasFree(Ad);

	cublasFree(Bd);

	cublasFree(Cd);

	diffLiberation = time_log(timeref);

	

	cublasShutdown(); 

	printf("A : ");

	start = A;

	end = start+sizeA;

	for(;start!=end;++start)

		printf("%f ", *start);

	printf("\n");

	

	printf("B : ");

	start = B;

	end = start+sizeB;

	for(;start!=end;++start)

		printf("%f ", *start);

	printf("\n");

	printf("C : ");

	start = C;

	end = start+sizeC;

	for(;start!=end;++start)

		printf("%f ", *start);

	printf("\n");

	printf("%llu usec requises pour l'initialisation \n",diffInit);

	printf("%llu usec requises pour l'allocation des données \n",diffAlloc);

	printf("%llu usec requises pour l'écriture des données \n",diffEcriture);

	printf("%llu usec requises pour le calcul \n",diffCalc);

	printf("%llu usec requises pour la lecture des données \n",diffLecture);

	printf("%llu usec requises pour la libération des données \n",diffLiberation);

	

	free(A);

	free(B);

	free(C);

	

	return 0;

}

Tested with the call ./prog 2 2 2 2 2

Are you sure that you use column-major layout?

I just found out (I was working on something else) that this library only works with column-major storage (in C we are used to working in row-major, but OK). I used the transpose transformation and it worked! Thanks for your help.