 # Faster MatrixMult than CUBLAS!

Hi,

I have this code, basiclly the example NVIDIA provides for using shared memory. I am using this to compute multiplication of a vector by a matrix of size 1K * (1K*2K) and larger. The problem is that it’s faster than CUBLAS functions cublasSgemm or cublasSgemv. Shouldn’t CUBLAS be faster? Here is the time for the mentioned size:

GPU time: 60 us

GPU time (CUBLAS): 450 us

(the time is only for computation, not communication)

Kernel Code:

``````#ifndef _MATRIXMUL_KERNEL_H_

#define _MATRIXMUL_KERNEL_H_

#include <stdio.h>

#include "matrixMul.h"

#define CHECK_BANK_CONFLICTS 0

#if CHECK_BANK_CONFLICTS

#define AS(i) cutilBankChecker(((float*)&As), j)

#define BS(i, j) cutilBankChecker(((float*)&Bs), (BLOCK_SIZE * i + j))

#else

#define AS(i) As[i]

#define BS(i, j) Bs[i][j]

#endif

////////////////////////////////////////////////////////////////////////////////

//! Matrix multiplication on the device: C = A * B

//! wA is A's width and wB is B's width

////////////////////////////////////////////////////////////////////////////////

__global__ void

matrixMul( float* C, float* A, float* B, int wA, int wB)

{

// Block index

int bx = blockIdx.x;

int by = blockIdx.y;

// Index of the first sub-matrix of A processed by the block

int aBegin = wA * BLOCK_SIZE * by;

// Index of the last sub-matrix of A processed by the block

int aEnd   = aBegin + wA - 1;

// Step size used to iterate through the sub-matrices of A

int aStep  = BLOCK_SIZE;

// Index of the first sub-matrix of B processed by the block

int bBegin = BLOCK_SIZE * bx;

// Index of the last sub-matrix of A processed by the block

int bEnd   = bBegin + wB - 1;

// Step size used to iterate through the sub-matrices of B

int bStep  = BLOCK_SIZE * wB;

// Csub is used to store the element of the block sub-matrix

// that is computed by the thread

float Csub = 0;

// Loop over all the sub-matrices of A and B

// required to compute the block sub-matrix

for (int a = aBegin, b = bBegin;

a <= aEnd;

a += aStep, b += bStep) {

// Declaration of the shared memory array As used to

// store the sub-matrix of A

__shared__ float As[BLOCK_SIZE];

// Declaration of the shared memory array Bs used to

// store the sub-matrix of B

__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

// Load the matrices from device memory

// one element of each matrix

AS(tx) = A[a + tx];

BS(ty, tx) = B[b + wB * ty + tx];

// Synchronize to make sure the matrices are loaded

// Multiply the two matrices together;

// each thread computes one element

// of the block sub-matrix

for (int k = 0; k < BLOCK_SIZE; ++k)

Csub += AS(k) * BS(k, tx);

// Synchronize to make sure that the preceding

// sub-matrices of A and B in the next iteration

}

// Write the block sub-matrix to device memory;

// each thread writes one element

int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;

C[c + wB * ty + tx] = Csub;

}

#endif // #ifndef _MATRIXMUL_KERNEL_H_
``````

The way I call the kernel: (WA & WB are width of the vectore and width of the matrix respectively.)

``````timer = 0;

cutilCheckError(cutCreateTimer(&timer));

cutilCheckError(cutStartTimer(timer));

// setup execution parameters

// execute the kernel

matrixMul<<< grid, threads >>>(d_C, d_A, d_B, WA, WB);

// check if kernel execution generated and error

cutilCheckMsg("Kernel execution failed");

// stop and destroy timer

cutilCheckError(cutStopTimer(timer));

printf("Processing time: %f (ms) \n", cutGetTimerValue(timer));

cutilCheckError(cutDeleteTimer(timer));
``````

For CUBLAS, the code is similar to simpleCUBLAS example in SDK.

Thanks,

• eka

The results are being compared against CPU-computed results and they match. So no need for more synchronization, right?

I checked the CUBLAS with symmetrical matrices, and it runs faster than non-symmetric matrices. Does anyone have results supporting my observations?

Thanks,

• eka

``````//start the timer
cutStartTimer(timer);

// execute the kernel, the control returns immediately to the CPU
matrixMul<<< grid, threads >>>(d_C, d_A, d_B, WA, WB);

// Need a cudaThreadSynchronize for correct timing of the GPU kernel otherwise you are measuring launch overhead