I’m writing some code which requires a many multiplications by a matrix A and it’s transpose A^T and I’ve found that the time required to multiply by A is often twice the time it takes to multiply by A^T. On top of that, the problem seems to be most pronounced when A is a tall matrix as opposed to a fat matrix. I wrote a script to compare performance over 100 runs (the arguments to gemv_test are (m, n) – the dimensions of A):
$ ./gemv_test 100000 5000
Time (CUBLAS_OP_T, Tall-A, double): 4.684337e+01 ms
Time (CUBLAS_OP_N, Tall-A, double): 8.918843e+01 ms
Time (CUBLAS_OP_T, Fat-A, double): 4.606591e+01 ms
Time (CUBLAS_OP_N, Fat-A, double): 5.725338e+01 ms
$ ./gemv_test 200000 2000
Time (CUBLAS_OP_T, Tall-A, double): 3.953864e+01 ms
Time (CUBLAS_OP_N, Tall-A, double): 8.653539e+01 ms
Time (CUBLAS_OP_T, Fat-A, double): 3.543512e+01 ms
Time (CUBLAS_OP_N, Fat-A, double): 3.616319e+01 ms
As you see Ax for tall A is twice as slow as A^Tx for all A or Ax for fat A matrices. Is this to be expected? I’m a bit surprised that there to be such a huge difference.
Here is the test code for reference:
#include <cassert>
#include <cstdlib>
#include <cstdio>
#include <cublas_v2.h>
// Timing code
struct event_pair {
cudaEvent_t start;
cudaEvent_t end;
};
inline void start_timer(event_pair * p) {
cudaEventCreate(&p->start);
cudaEventCreate(&p->end);
cudaEventRecord(p->start, 0);
}
inline double stop_timer(event_pair * p) {
cudaEventRecord(p->end, 0);
cudaEventSynchronize(p->end);
float elapsed_time;
cudaEventElapsedTime(&elapsed_time, p->start, p->end);
cudaEventDestroy(p->start);
cudaEventDestroy(p->end);
return elapsed_time;
}
// Wrappers for gemv
void cublasgemv(cublasHandle_t handle, cublasOperation_t trans, int m, int n,
const float *alpha, const float *A, int lda, const float *x, int incx,
const float *beta, float *y, int incy) {
cublasSgemv(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
}
void cublasgemv(cublasHandle_t handle, cublasOperation_t trans, int m, int n,
const double *alpha, const double *A, int lda, const double *x, int incx,
const double *beta, double *y, int incy) {
cublasDgemv(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
}
// Test code
template <typename T>
void gemv_test(int m, int n, char name[]) {
// Allocate, initialize and copy
T *A_h = new T[m * n];
T *x_h = new T[n];
T *y_h = new T[m];
for (unsigned int i = 0; i < m * n; ++i)
A_h[i] = rand() / static_cast<T>(RAND_MAX);
for (unsigned int i = 0; i < n; ++i)
x_h[i] = rand() / static_cast<T>(RAND_MAX);
for (unsigned int i = 0; i < m; ++i)
y_h[i] = rand() / static_cast<T>(RAND_MAX);
T *A_d, *x_d, *y_d;
cudaMalloc(&A_d, m * n * sizeof(T));
cudaMalloc(&x_d, n * sizeof(T));
cudaMalloc(&y_d, m * sizeof(T));
cudaMemcpy(A_d, A_h, m * n * sizeof(T), cudaMemcpyHostToDevice);
cudaMemcpy(x_d, x_h, n * sizeof(T), cudaMemcpyHostToDevice);
cudaMemcpy(y_d, y_h, m * sizeof(T), cudaMemcpyHostToDevice);
// Other variables
T alpha = 1, beta = 1;
cublasHandle_t handle;
cublasCreate(&handle);
event_pair ep;
// Compute
unsigned int nrep = 100u;
start_timer(&ep);
for (unsigned int i = 0; i < nrep; ++i)
cublasgemv(handle, CUBLAS_OP_T, m, n, &alpha, A_d, m, y_d, 1, &beta, x_d, 1);
printf("Time (CUBLAS_OP_T, Tall-A, %s): %e ms\n", name, stop_timer(&ep) / nrep);
start_timer(&ep);
for (unsigned int i = 0; i < nrep; ++i)
cublasgemv(handle, CUBLAS_OP_N, m, n, &alpha, A_d, m, x_d, 1, &beta, y_d, 1);
printf("Time (CUBLAS_OP_N, Tall-A, %s): %e ms\n", name, stop_timer(&ep) / nrep);
start_timer(&ep);
for (unsigned int i = 0; i < nrep; ++i)
cublasgemv(handle, CUBLAS_OP_T, n, m, &alpha, A_d, n, x_d, 1, &beta, y_d, 1);
printf("Time (CUBLAS_OP_T, Fat-A, %s): %e ms\n", name, stop_timer(&ep) / nrep);
start_timer(&ep);
for (unsigned int i = 0; i < nrep; ++i)
cublasgemv(handle, CUBLAS_OP_N, n, m, &alpha, A_d, n, y_d, 1, &beta, x_d, 1);
printf("Time (CUBLAS_OP_N, Fat-A, %s): %e ms\n", name, stop_timer(&ep) / nrep);
delete [] A_h;
delete [] x_h;
delete [] y_h;
cudaFree(A_d);
cudaFree(x_d);
cudaFree(y_d);
}
int main(int argc, char *argv[]) {
assert(argc == 3);
int m = atoi(argv[1]);
int n = atoi(argv[2]);
assert(m >= n);
gemv_test<double>(m, n, "double");
}
The graphics card is a Tesla C2070.