In my algorithm I’m making several calls to cublasStrmm (matrix multiply), all something like this:
cublasStrmm('r', 'u', 'n', 'n', 10000, 65, 1.0, A, 65, B, 10000);
So we have m=10000, n=65 and k=65.
I do 72 of these calls, and the total time taken is 645 ms, so that’s about 9 ms per call.
This seems a bit slow, even for the funny size matrices I’m using.
By my calculations (n(n+1)/2 × 2 × m ≈ 42.9 MFLOP per call), that works out to only about 4.8 GFLOPS.
I was expecting something much higher than this? Is anyone else seeing similar performance? Or have I got my calculations wrong? :lol:
Thanks,
Alex
Here’s some test code… I get this as output:
m=10000 k=65 n=65
execution time per call: 14.028479 ms
gflops = 3.058065
void main(int argc, char** argv) {
float* A;
float* B;
int m = 10000;
int k = 65;
int n = k;
int iters = 72;
cublasInit();
cublasStatus status = cublasAlloc(m * k, sizeof(float), (void**) &A);
status = cublasAlloc(k * n, sizeof(float), (void**) &B);
float* bigEmptySpace = new float[m*k];
memset(bigEmptySpace, 0, m*k*sizeof(float));
cublasSetMatrix(m, k, sizeof(float), bigEmptySpace, m, A, m);
cublasSetMatrix(k, n, sizeof(float), bigEmptySpace, k, B, k);
unsigned int timer;
CUT_SAFE_CALL(cutCreateTimer(&timer));
cutStartTimer(timer);
for (int i = 0; i < iters; i++) {
cublasStrmm('r', 'u', 'n', 'n', m, n, 1.0, A, n, B, m);
cudaSync(); // my C-wrapper for cudaThreadSynchronize() - needed here or it hangs!!
}
cutStopTimer(timer);
float timePerCall = cutGetTimerValue(timer) / iters;
printf("m=%d k=%d n=%d\n", m, k, n);
printf("execution time per call: %f ms\n", timePerCall);
float flopsPerCall = (n * (n+1) / 2) * 2 * m;
float seconds = timePerCall / 1000.0;
float flops = flopsPerCall / seconds;
float gflops = flops / 1000000000.0;
printf("gflops = %f\n", gflops);
cublasFree(A);
cublasFree(B);
}
figual
3
If you are still interested, maybe this post can help you:
http://forums.nvidia.com/index.php?showtopic=99094