Is it normal for GEMV operations to take twice as long depending on cublasOperation_t?

I’m writing some code which requires many multiplications by a matrix A and its transpose A^T, and I’ve found that the time required to multiply by A is often twice the time it takes to multiply by A^T. On top of that, the problem seems to be most pronounced when A is a tall matrix as opposed to a fat one. I wrote a test program to compare performance, averaged over 100 runs (the arguments to gemv_test are m and n, the dimensions of A):

$ ./gemv_test 100000 5000
Time (CUBLAS_OP_T, Tall-A, double): 4.684337e+01 ms
Time (CUBLAS_OP_N, Tall-A, double): 8.918843e+01 ms
Time (CUBLAS_OP_T, Fat-A, double): 4.606591e+01 ms
Time (CUBLAS_OP_N, Fat-A, double): 5.725338e+01 ms
$ ./gemv_test 200000 2000
Time (CUBLAS_OP_T, Tall-A, double): 3.953864e+01 ms
Time (CUBLAS_OP_N, Tall-A, double): 8.653539e+01 ms
Time (CUBLAS_OP_T, Fat-A, double): 3.543512e+01 ms
Time (CUBLAS_OP_N, Fat-A, double): 3.616319e+01 ms

As you can see, Ax for a tall A is roughly twice as slow as A^T x (for either shape of A) or Ax for a fat A. Is this to be expected? I’m a bit surprised that there is such a huge difference.

Here is the test code for reference:

#include <cassert>
#include <cstdlib>
#include <cstdio>
#include <cuda_runtime.h>
#include <cublas_v2.h>

// Timing code
struct event_pair {
  cudaEvent_t start;
  cudaEvent_t end;
};

inline void start_timer(event_pair * p) {
  cudaEventCreate(&p->start);
  cudaEventCreate(&p->end);
  cudaEventRecord(p->start, 0);
}

inline double stop_timer(event_pair * p) {
  cudaEventRecord(p->end, 0);
  cudaEventSynchronize(p->end);
  
  float elapsed_time;
  cudaEventElapsedTime(&elapsed_time, p->start, p->end);
  cudaEventDestroy(p->start);
  cudaEventDestroy(p->end);
  return elapsed_time;
}

// Wrappers for gemv
void cublasgemv(cublasHandle_t handle, cublasOperation_t trans, int m, int n,
           const float *alpha, const float *A, int lda, const float *x, int incx,
           const float *beta, float *y, int incy) {
  cublasSgemv(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
}

void cublasgemv(cublasHandle_t handle, cublasOperation_t trans, int m, int n,
           const double *alpha, const double *A, int lda, const double *x, int incx,
           const double *beta, double *y, int incy) {
  cublasDgemv(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
}

// Test code
template <typename T>
void gemv_test(int m, int n, const char *name) {
  // Allocate, initialize and copy
  T *A_h = new T[m * n];
  T *x_h = new T[n];
  T *y_h = new T[m];

  for (size_t i = 0; i < static_cast<size_t>(m) * n; ++i)
    A_h[i] = rand() / static_cast<T>(RAND_MAX);
  for (int i = 0; i < n; ++i)
    x_h[i] = rand() / static_cast<T>(RAND_MAX);
  for (int i = 0; i < m; ++i)
    y_h[i] = rand() / static_cast<T>(RAND_MAX);

  T *A_d, *x_d, *y_d;
  cudaMalloc(&A_d, m * n * sizeof(T));
  cudaMalloc(&x_d, n * sizeof(T));
  cudaMalloc(&y_d, m * sizeof(T));
  cudaMemcpy(A_d, A_h, m * n * sizeof(T), cudaMemcpyHostToDevice);
  cudaMemcpy(x_d, x_h, n * sizeof(T), cudaMemcpyHostToDevice);
  cudaMemcpy(y_d, y_h, m * sizeof(T), cudaMemcpyHostToDevice);

  // Other variables
  T alpha = 1, beta = 1;
  cublasHandle_t handle;
  cublasCreate(&handle);
  event_pair ep;

  // Compute
  unsigned int nrep = 100u;
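  // Tall A (m x n, column-major, lda = m), OP_T: x_d = alpha * A^T * y_d + beta * x_d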
  start_timer(&ep);
  for (unsigned int i = 0; i < nrep; ++i)
    cublasgemv(handle, CUBLAS_OP_T, m, n, &alpha, A_d, m, y_d, 1, &beta, x_d, 1);
  printf("Time (CUBLAS_OP_T, Tall-A, %s): %e ms\n", name, stop_timer(&ep) / nrep);

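  // Tall A (m x n, lda = m), OP_N: y_d = alpha * A * x_d + beta * y_d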
  start_timer(&ep);
  for (unsigned int i = 0; i < nrep; ++i)
    cublasgemv(handle, CUBLAS_OP_N, m, n, &alpha, A_d, m, x_d, 1, &beta, y_d, 1);
  printf("Time (CUBLAS_OP_N, Tall-A, %s): %e ms\n", name, stop_timer(&ep) / nrep);

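  // Fat A (same buffer treated as n x m, lda = n), OP_T: y_d = alpha * A^T * x_d + beta * y_d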
  start_timer(&ep);
  for (unsigned int i = 0; i < nrep; ++i)
    cublasgemv(handle, CUBLAS_OP_T, n, m, &alpha, A_d, n, x_d, 1, &beta, y_d, 1);
  printf("Time (CUBLAS_OP_T, Fat-A, %s): %e ms\n", name, stop_timer(&ep) / nrep);

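  // Fat A (n x m, lda = n), OP_N: x_d = alpha * A * y_d + beta * x_d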
  start_timer(&ep);
  for (unsigned int i = 0; i < nrep; ++i)
    cublasgemv(handle, CUBLAS_OP_N, n, m, &alpha, A_d, n, y_d, 1, &beta, x_d, 1);
  printf("Time (CUBLAS_OP_N, Fat-A, %s): %e ms\n", name, stop_timer(&ep) / nrep);

  cublasDestroy(handle);
  delete [] A_h;
  delete [] x_h;
  delete [] y_h;
  cudaFree(A_d);
  cudaFree(x_d);
  cudaFree(y_d);
}

int main(int argc, char *argv[]) {
  assert(argc == 3);
  int m = atoi(argv[1]);
  int n = atoi(argv[2]);
  assert(m >= n);
  gemv_test<double>(m, n, "double");
}
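
For reference, the file is compiled as a .cu with nvcc and linked against cuBLAS, with something along the lines of:

$ nvcc -O2 -o gemv_test gemv_test.cu -lcublas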

The graphics card is a Tesla C2070.