Cublas, cublasSgemv Matrix vector operation size Limitation

I am trying to use CUBLAS to implement a simple matrix*vector multiplication.

I have written this code:

// Variables pour la gestion Cublas et 

 Â  Â cublasStatus status;

 Â  Â float *d_A, * d_B, Â *d_C;

  Â // Allocation de la mémoire

 Â  Â status = cublasAlloc(m1X*m1Y, sizeof(float), (void**)&d_A);

 Â  Â status = cublasAlloc(m2X, sizeof(float), (void**)&d_B);

 Â  Â status = cublasAlloc(m2X, sizeof(float), (void**)&d_C);

   Â // Affectation des données

 Â  Â cublasSetMatrix (m1Y, m1X, sizeof(float), fat1, m1X, d_A,m1X);

 Â  Â cublasSetVector(m1X,sizeof(float),fat2,1,d_B,1);

 Â  Â // Calcul A*y

 Â  Â cublasSgemv('t', m1X,m1Y,1.0,d_A,m1X,d_B,1,0.0,d_C,m1X);

   Â // Récupération des résultats

 Â  Â status = cublasGetVector(m1X,sizeof(float),d_C,m1X,fat3,1);

 Â  Â for (int i = 0; i < m1X; ++i) titi.mat[i] = (T)fat3[i];

 Â  Â 

 Â  Â cublasFree(d_A); cublasFree(d_B); cublasFree(d_C);

 Â  Â delete fat1; delete fat2; delete fat3;

I have no problem with this code for small matrix (and vector) sizes (< 1024 elements), but with bigger matrices the results are wrong, and no status error is reported.

I think my problem is related to the memory allocation, but I can’t find it…

Complements:

I use one square matrix size mxm -> (A)

and one vector : size m -> (x)

If m < 1023, the result is correct.

If m > 1023, the result is wrong: it is identical to the input vector x, and the screen “blinks”.

I use an NVIDIA Quadro FX570, driver version 6.14.11.7735, CUDA 2.0 and Windows XP.

Thanks for your help

++ Beleys

Wouldn’t that need to be m1X, too?

That is not the problem, because in my code m2X = m1X; it is not logical …

Finally, the following code works and computes the matrix-vector product:

template <class T>

Matrix<T> OpeGpu<T>::mvBlasV2(Matrix<T> & mat1, T * vec, int tY)

{

  float * h_A, *h_B, *h_C;

  float * d_A, *d_B, *d_C;

 int N = mat1._tx;

  int L = mat1._ty;

  int K =tY; // On travaille sur des matrices colonnes;

 Matrix<T> res(K,L,false);

 //A -> Matrice de taille N*L

  h_A = (float*)malloc(N*L*sizeof(float));

 //B -> Matrice de taille KxN

  h_B = (float*)malloc (K*N*sizeof(float));

 //C -> Matrice C = A*B de taille K*L

  h_C = (float*)malloc (K*L*sizeof(float));

 // Allocation des structures

  for (int i = 0; i < N*L; i++){

    h_A[i] = mat1.mat[i];

  }

  for (int i = 0; i < K*N; i++){

    h_B[i] = vec[i];

  }

  for (int i = 0; i < K*L; i++){

    h_C[i] = 0.0;

  }

 /* Initialize CUBLAS */

  cublasStatus status;

 status = cublasInit();

  if (status != CUBLAS_STATUS_SUCCESS) {

    fprintf (stderr, "!!!! CUBLAS initialization error\n");

    return EXIT_FAILURE;

  }

 malloc(N*L*sizeof(float));

  status = cublasAlloc(N*L, sizeof(d_A[0]), (void**)&d_A);

  status = cublasAlloc(K*N, sizeof(d_B[0]), (void**)&d_B);

  status = cublasAlloc(K*L, sizeof(d_C[0]), (void**)&d_C);

 status = cublasSetVector(N*L, sizeof(h_A[0]), h_A, 1, d_A, 1);

  status = cublasSetVector(K*N, sizeof(h_B[0]), h_B, 1, d_B, 1);

  status = cublasSetVector(K*L, sizeof(h_C[0]), h_C, 1, d_C, 1);

 cublasGetError();

  float alpha = 1.0;

  float beta = 0.0;

 cublasSgemm('t', 'n', L, K, N, alpha, d_A, N, d_B, N, beta, d_C, N);

 status = cublasGetError();

  if (status != CUBLAS_STATUS_SUCCESS) {

    fprintf (stderr, "!!!! kernel execution error.\n");

    return EXIT_FAILURE;

  }

 status = cublasGetVector(K*L, sizeof(h_C[0]), d_C, 1, h_C, 1);

 for (int i =0; i <  K*L; i++){

      res.mat[i] = h_C[i];

  }

  status = cublasFree(d_A);

  status = cublasFree(d_B);

  status = cublasFree(d_C);

 status = cublasShutdown();

 return res;

}

Thanks for your help

++ Beleys