Cuda matrix multiplication too slow


I’m quite new to CUDA programming. I took the CUDA matrix multiplication example (the version without shared memory) from the Programming Guide; the result is correct, but it is too slow. I multiply two 1024 x 1024 matrices using 16 x 16 thread blocks and get an execution time of 5.39 s (in both Debug and Release mode), whereas the plain C version takes 6.64 s in Debug mode and 4.09 s in Release mode. I use Visual Studio 2005. So in Release mode CUDA seems no faster than C, and I think I must have done something wrong somewhere.

Could you tell me what I did wrong, please?
Thank you for your help.

Try CUBLAS - the SDK code is not very efficient, and “(without using shared memory)” is not a good idea. You also do not say what kind of card you have. A GTX280 will perform at about 380 GFlops in single precision (SGEMM), while a 4-core Xeon running optimized code on all cores will reach about 80 GFlops.

Something else has got to be wrong here. The SDK matrix multiply example is about one third the speed of CUBLAS (I was timing this recently, for my own nefarious purposes), and copying 1024^2 matrices to the GPU and back isn’t that slow. What do these times include? Was the CUDA context already established before the timer began? Those times look suspiciously like ‘whole program’ times.

I forgot to say that I use a GeForce 8400 GS.

This is the program I run:

/* Programme Cuda pris dans le document NVIDIA CUDA: Programming Guide 2.3 (p.18 à 21) */

#include "stdafx.h"

#include <stdio.h>
#include <stdlib.h>

#include <cuda.h>
#include <cuda_runtime.h>

/* Dense matrix descriptor shared by host and device code.
   The same struct is used for host buffers and for device buffers; only the
   address stored in `elements` differs. */
typedef struct {
    int    width;     /* number of columns (length of one row) */
    int    height;    /* number of rows */
    float* elements;  /* pointer to the width * height float values */
} Matrix;

/* Taille des blocs de fils d’exécution */

/* Les dimensions de matrice sont supposées être des multiples de BLOCK_SIZE */

#define BLOCK_SIZE 16

#define MSIZE 1024

/* Forward declaration of the matrix-multiplication kernel that runs on the GPU.
   It must carry the __global__ qualifier so it can be launched with <<<...>>>. */
__global__ void MulMatKernel(const Matrix, const Matrix, Matrix);

/* Abort with a readable message when a CUDA runtime call fails. */
static void checkCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "Erreur CUDA (%s): %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/* Open `path` for writing, or report the failure and terminate so that a
   NULL FILE* is never handed to fprintf later on. */
static FILE* openOutput(const char* path)
{
    FILE* fp = fopen(path, "w");
    if (fp == NULL) {
        printf("Ouverture du fichier %s impossible\n", path);
        exit(EXIT_FAILURE);
    }
    return fp;
}

/* Fill `m` with pseudo-random values in [0, 1] and write one value per line
   to `fp`, preceded by a blank line. */
static void fillRandomAndDump(Matrix m, FILE* fp)
{
    fprintf(fp, "\n");
    for (int j = 0; j < m.height; j++) {
        for (int i = 0; i < m.width; i++) {
            /* rand() returns a value between 0 and RAND_MAX */
            m.elements[m.width * j + i] = (float)rand() / RAND_MAX;
            fprintf(fp, "%f\n", m.elements[m.width * j + i]);
        }
    }
}

/* Multiplies two MSIZE x MSIZE random matrices on the GPU.
   The inputs are written to a_h.txt / b_h.txt and the product to c_h.txt.
   The printed time is measured with CUDA events that bracket only the kernel
   launch, so file output and host<->device copies are not included in it.
   Returns 0 on success; terminates with EXIT_FAILURE on any I/O, allocation
   or CUDA error. */
int main(void)
{
    Matrix a_h, b_h, c_h;
    FILE *fp1, *fp2, *fp3;

    fp1 = openOutput("a_h.txt");
    fp2 = openOutput("b_h.txt");
    fp3 = openOutput("c_h.txt");

    a_h.width = MSIZE; a_h.height = MSIZE;
    b_h.width = MSIZE; b_h.height = MSIZE;
    c_h.width = MSIZE; c_h.height = MSIZE;

    size_t size = (size_t)MSIZE * MSIZE * sizeof(float);

    /* Host buffers */
    a_h.elements = (float*)malloc(size);
    b_h.elements = (float*)malloc(size);
    c_h.elements = (float*)malloc(size);
    if (a_h.elements == NULL || b_h.elements == NULL || c_h.elements == NULL) {
        fprintf(stderr, "Allocation memoire hote impossible\n");
        return EXIT_FAILURE;
    }

    /* Initialise the host matrices (A first, then B, so the rand() sequence
       is consumed in a fixed order) */
    fillRandomAndDump(a_h, fp1);
    fillRandomAndDump(b_h, fp2);

    for (int k = 0; k < c_h.width * c_h.height; k++)
        c_h.elements[k] = 0.0f;

    /* Device copies of the three matrices */
    Matrix d_A, d_B, d_C;
    d_A.width = a_h.width; d_A.height = a_h.height;
    d_B.width = b_h.width; d_B.height = b_h.height;
    d_C.width = c_h.width; d_C.height = c_h.height;

    checkCuda(cudaMalloc((void**)&d_A.elements, size), "cudaMalloc A");
    checkCuda(cudaMemcpy(d_A.elements, a_h.elements, size, cudaMemcpyHostToDevice),
              "cudaMemcpy A");
    checkCuda(cudaMalloc((void**)&d_B.elements, size), "cudaMalloc B");
    checkCuda(cudaMemcpy(d_B.elements, b_h.elements, size, cudaMemcpyHostToDevice),
              "cudaMemcpy B");
    checkCuda(cudaMalloc((void**)&d_C.elements, size), "cudaMalloc C");

    /* Launch configuration: one thread per element of C.
       Matrix dimensions are assumed to be multiples of BLOCK_SIZE. */
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid(b_h.width / dimBlock.x, a_h.height / dimBlock.y);

    /* Kernel timing. The events must be created before they are recorded. */
    cudaEvent_t start, stop;
    float elapsedMs = 0.0f;
    checkCuda(cudaEventCreate(&start), "cudaEventCreate start");
    checkCuda(cudaEventCreate(&stop), "cudaEventCreate stop");

    checkCuda(cudaEventRecord(start, 0), "cudaEventRecord start");
    MulMatKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
    /* A launch does not return a status; a bad configuration shows up here */
    checkCuda(cudaGetLastError(), "lancement MulMatKernel");
    checkCuda(cudaEventRecord(stop, 0), "cudaEventRecord stop");
    /* Errors raised while the kernel executes are reported by this wait */
    checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize");
    checkCuda(cudaEventElapsedTime(&elapsedMs, start, stop), "cudaEventElapsedTime");
    checkCuda(cudaEventDestroy(start), "cudaEventDestroy start");
    checkCuda(cudaEventDestroy(stop), "cudaEventDestroy stop");

    printf("Temps ecoule: %f ms\n", elapsedMs);

    /* Copy the result back to the host */
    checkCuda(cudaMemcpy(c_h.elements, d_C.elements, size, cudaMemcpyDeviceToHost),
              "cudaMemcpy C");

    /* Release device memory */
    checkCuda(cudaFree(d_A.elements), "cudaFree A");
    checkCuda(cudaFree(d_B.elements), "cudaFree B");
    checkCuda(cudaFree(d_C.elements), "cudaFree C");

    /* Validate and dump the result. The kernel stores C column by column, so
       logical element (i, j) is read at column offset j. The same index is
       used for the check and for the file so the reported (i, j) matches the
       value written. */
    const float upperBound = (float)a_h.width;  /* sum of a_h.width products in [0, 1] */
    fprintf(fp3, "\n");
    for (int i = 0; i < c_h.height; i++) {
        for (int j = 0; j < c_h.width; j++) {
            float value = c_h.elements[c_h.height * j + i];
            /* Negated form so that NaN is reported as well */
            if (!(value >= 0.0f && value <= upperBound))
                printf("erreur = %f i = %d, j = %d\n", value, i, j);
            fprintf(fp3, "%f\n", value);
        }
    }

    /* Cleanup */
    fclose(fp1);
    fclose(fp2);
    fclose(fp3);
    free(a_h.elements);
    free(b_h.elements);
    free(c_h.elements);

    return 0;
}





/* Matrix-multiplication kernel: C = A * B, without shared memory.
   Launch layout: a 2D grid of 2D blocks in which the x index selects a column
   of C and the y index selects a row; each thread computes one element.
   A and B are read as row-major (element (r, c) at r * width + c).
   C is written column by column (element (r, c) at c * height + r) so the
   buffer can be loaded directly by Matlab.
   NOTE(review): that store is strided across threads with consecutive
   threadIdx.x, which is likely uncoalesced - confirm with the profiler.
   Precondition: A.width == B.height, C.height == A.height, C.width == B.width. */
__global__ void MulMatKernel(Matrix A, Matrix B, Matrix C)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    /* Threads that fall outside C (grid larger than the matrix) do nothing */
    if (row >= A.height || col >= B.width)
        return;

    /* Accumulate the dot product of row `row` of A and column `col` of B */
    float Cvalue = 0.0f;
    for (int e = 0; e < A.width; ++e)
        Cvalue += A.elements[row * A.width + e] * B.elements[e * B.width + col];

    /* Column-major store: the stride between columns is the number of rows */
    C.elements[col * C.height + row] = Cvalue;
}


Thank you for your help.


These are the results I get from the profiler (see attached Excel document):
prof_mulmat3.xls (14 KB)


I took the CUDA matrix multiplication example that uses shared memory from the Programming Guide. I use two 1024 x 1024 matrices with 16 x 16 blocks. The kernel uses 8 registers per thread. I use a GeForce 8400 GS GPU with 8 stream processors (1400 MHz).

I get an execution time (for the kernel alone) of 387ms.

Please could you tell me whether it is a slow or a normal execution time?
Thank you for your help. :)