Matrix Multiplication: Shared vs Global Memory

Hi all,

I am starting using matrix multiplication example to learn CUDA. So I made two initial versions, just using global memory and using shared memory.

When I executed both versions, the shared memory version is slower than the global memory version. Why is it happening?

Example A*B=C

A 3200x1600

B 1600x1600

C 3200x1600

Global Memory version 1.44 secs

Shared Memory version 2.17 secs

If I use matrixes bigest the outcomes are the same, the time wasting on shared memory version is 44% highest than global memory.

I used the CUDA profiler and the unique difference is that I do not know why, but the dram reads are

Global 85 request

Shared 89 request

I thought that the shared memory version would make less requests to dram memory than the global version.

I leave my source code, if someone see anything wrong please help me. I do not know what is happening.

#include <stdio.h>

#include <stdlib.h>

#include <cuda.h>

#include <errno.h>

typedef float TIPO;

#define BLOCK_SIZE 16

//(T*)((char*)BaseAddress + Row * pitch) + Column;

#define Device_Element(m, i, j, p) (TIPO*)((char *)(m) + ((i) * (p)) + (SIZE_TIPO * (j)))

// Kernel GLOBAL MEMORY VERSION

__global__ void D_matrixMul(TIPO * mat1, TIPO * mat2, TIPO * matR, int ha, int wa, int pa, int hb, int wb, int pb, int pc)

{

   int i = blockIdx.y * blockDim.y + threadIdx.y;

   int j = blockIdx.x * blockDim.x + threadIdx.x;

if ((i < ha) && (j < wb)){

TIPO   acum = 0.0f;

for(int k = 0; k< wa ; k++)

         acum += *(Device_Element(mat1,i,k,pa)) * *(Device_Element(mat2,k,j,pb));

*(Device_Element(matR,i,j,pc)) = acum;

}

}

// KERNEL SHARED MEMORY VERSION

__global__ void D_matrixMulShared(TIPO * mat1, TIPO * mat2, TIPO * matR, int ha, int wa, int pa, int hb, int wb, int pb, int pc)

{

TIPO   auxC = 0.0f;

// Declarando memoria compartida

   __shared__ TIPO S1[BLOCK_SIZE][BLOCK_SIZE];

   __shared__ TIPO S2[BLOCK_SIZE][BLOCK_SIZE];

int Bby = BLOCK_SIZE * blockIdx.y;

   int Bbx =  blockIdx.x * BLOCK_SIZE;

   int tx = threadIdx.x;

   int ty = threadIdx.y;

   int i = 0, k = 0;

for(i = 0; i < (wa / BLOCK_SIZE) ; i++){

      S1[ty][tx] = *(Device_Element(mat1, ty + Bby, i * BLOCK_SIZE + tx, pa));

      S2[ty][tx] = *(Device_Element(mat2, i * BLOCK_SIZE + ty, Bbx + tx, pb));

__syncthreads();

for(k = 0; k < BLOCK_SIZE; k++)

         auxC += S1[ty][k] * S2[k][tx];

__syncthreads();

   }

*(Device_Element(matR,  Bby + ty, Bbx + tx, pc)) = auxC;

}

void Host_Inicializar(TIPO * m, int h, int w){

int  i = 0;

   int  j = 0;

for (i = 0; i < h; i++)

      for(j = 0; j < w; j++ )

         m[i*w+j] = i;

}

int ConversionStringToInt(const char * s){

errno = 0;

   char * endptr;

   int num = (int) strtol(s, &endptr, 10);

if ((errno == ERANGE && (num == LONG_MAX || num == LONG_MIN))

    || (errno != 0 && num == 0)) {

      perror("strtol");

      exit(EXIT_FAILURE);

   }

if (endptr == s) {

      fprintf(stderr, "No digits were found\n");

      exit(EXIT_FAILURE);

   }

return num;

}

int ComprobarPropiedades(int bx, int by, int tx, int ty){

#ifdef _SALIDA_

   fprintf(stdout, "Se ejecutan (%d,%d) bloques (%d,%d)=%d threads\n", bx, by, tx, ty, tx*ty);

   #endif

   cudaDeviceProp propiedades;

   int device = -1;

   fprintf(stderr, "Preguntanto por DEVICE: %s\n",cudaGetErrorString(cudaGetDevice(&device)));

   fprintf(stderr, "Comprobando propiedades de la Tarjeta: %s\n",cudaGetErrorString(cudaGetDeviceProperties (&propiedades, device)));

if (propiedades.kernelExecTimeoutEnabled)

      fprintf(stderr, "Temporizadores de la tarjeta activados\n");

   else

      fprintf(stderr,"Temporizadores de la tarjeta desactivados\n");

if ((propiedades.maxThreadsPerBlock <= (tx*ty)) || (propiedades.maxGridSize[0] <= bx) || (propiedades.maxGridSize[1] <= by)){

      fprintf(stderr, "Demasiados bloques o threads.\nMaximo numero de threads %d, maximo numero de bloques (%d,%d)\n",propiedades.maxThreadsPerBlock, propiedades.maxGridSize[0], propiedades.maxGridSize[1]);

      return 1;

   }

return 0;

}

int main(int argc, char** argv)

{

int ha = 0, wa = 0, hb = 0, wb = 0, hc = ha, wc = wb;

   //Bytes copaidos

   float br = 0.0;

   float bw = 0.0;

   float bandwidth = 0.0;

if (argc < 5){

      fprintf(stderr, "Modo de uso:\

              \nejecutable HA WA HB WB\n");

      return 0;

   }

   else{

      // Conversion

      ha = ConversionStringToInt(argv[1]);

      wa = ConversionStringToInt(argv[2]);

      hb = ConversionStringToInt(argv[3]);

      wb = ConversionStringToInt(argv[4]);

      hc = ha;

      wc = wb;

      if (wa != hb){

         fprintf(stderr, "El ancho de la primera matriz debe ser igual al alto de la segunda WA = HB\n\

                Los valores que has elegido son: (%d,%d)x(%d,%d)\n",ha,wa,hb,wb);

         return -1;

      }

   }

// Comprobando que la GPU tiene memoria suficiente

   size_t memtotal = ((ha*wa + hb*wb + hc*wc)*sizeof(TIPO));

   size_t mFree,total;

   fprintf(stderr, "Preguntando por memoria disponible: %s\n",cudaGetErrorString(cudaMemGetInfo(&mFree,&total)));

   if (mFree < memtotal){

      fprintf(stderr, "El dispositivo no tiene suficiente memoria para estas matrices\n");

      return 0;

   }

TIPO * H_A = (TIPO *) calloc(wa*ha,sizeof(TIPO));

   TIPO * H_B = (TIPO *) calloc(wb*hb,sizeof(TIPO));

   TIPO * H_C = (TIPO *) calloc(wc*hc,sizeof(TIPO));

Host_Inicializar(H_A,ha,wa);

   Host_Inicializar(H_B,hb,wb);

TIPO * D_A;

   TIPO * D_B;

   TIPO * D_C;

   size_t pitchA;

   size_t pitchB;

   size_t pitchC;

fprintf(stderr, "Reservando memoria matriz A en DEVICE: %s\n",cudaGetErrorString(cudaMallocPitch(&D_A, &pitchA, sizeof(TIPO)*wa, ha)));

   fprintf(stderr, "Reservando memoria matriz B en DEVICE: %s\n",cudaGetErrorString(cudaMallocPitch(&D_B, &pitchB, sizeof(TIPO)*wb, hb)));

   fprintf(stderr, "Reservando memoria matriz C en DEVICE: %s\n",cudaGetErrorString(cudaMallocPitch(&D_C, &pitchC, sizeof(TIPO)*wc, hc)));

fprintf(stderr, "Copiando mairiz A en DEVICE: %s\n",cudaGetErrorString(cudaMemcpy2D(D_A, pitchA, H_A, wa*sizeof(TIPO), wa*sizeof(TIPO), ha, cudaMemcpyHostToDevice)));

   fprintf(stderr, "Copiando mairiz B en DEVICE: %s\n",cudaGetErrorString(cudaMemcpy2D(D_B, pitchB, H_B, wb*sizeof(TIPO), wb*sizeof(TIPO), hb, cudaMemcpyHostToDevice)));

//KERNEL

   cudaEvent_t start;

   cudaEvent_t stop;

   float kernel_time;

fprintf(stderr, "Creando evento start: %s\n",cudaGetErrorString(cudaEventCreate(&start)));

   fprintf(stderr, "Creando evento stop : %s\n",cudaGetErrorString(cudaEventCreate(&stop)));

#ifdef _GLOBAL_VARIOS_BLOQUES_

   dim3 threads(BLOCK_SIZE, BLOCK_SIZE);

   dim3 bloques(DivisionEntera(wb, threads.x), DivisionEntera(ha, threads.y));

if (ComprobarPropiedades(bloques.x, bloques.y, threads.x, threads.y))

      return 0;

fprintf(stderr, "cudaEventRecord start: %s\n",cudaGetErrorString(cudaEventRecord(start,0)));	

   D_matrixMul<<<bloques,threads>>>(D_A, D_B, D_C, ha, wa, pitchA, hb, wb, pitchB, pitchC);

   fprintf(stderr, "Ejecutando el kernel: %s\n",cudaGetErrorString(cudaGetLastError()));

fprintf(stderr, "cudaEventRecord stop : %s\n",cudaGetErrorString(cudaEventRecord(stop,0)));

   fprintf(stderr, "cudaEventSynchronize : %s\n",cudaGetErrorString(cudaEventSynchronize(stop)));

   fprintf(stderr, "cudaEventElapsedTime : %s\n",cudaGetErrorString(cudaEventElapsedTime(&kernel_time,start,stop)));

// En total se ejecutan WC*HC threads

   // Cada thread hace 2*WA acessos de lectura

   br = (2.0*wa)*(wc*hc)*sizeof(TIPO);

   // Cada thread hace un sólo acceso de escritura

   bw = (float) (hc*wc)*sizeof(TIPO);

#endif

#ifdef _SHARED_

   dim3 threads(BLOCK_SIZE, BLOCK_SIZE);

   dim3 bloques(DivisionEntera(wb, threads.x), DivisionEntera(ha, threads.y));

if (ComprobarPropiedades(bloques.x, bloques.y, threads.x, threads.y))

      return 0;

fprintf(stderr, "cudaEventRecord start: %s\n",cudaGetErrorString(cudaEventRecord(start,0)));

   D_matrixMulShared<<<bloques,threads>>>(D_A, D_B, D_C, ha, wa, pitchA, hb, wb, pitchB, pitchC);

   fprintf(stderr, "Ejecutando el kernel: %s\n",cudaGetErrorString(cudaGetLastError()));

fprintf(stderr, "cudaEventRecord stop : %s\n",cudaGetErrorString(cudaEventRecord(stop,0)));

   fprintf(stderr, "cudaEventSynchronize : %s\n",cudaGetErrorString(cudaEventSynchronize(stop)));

   fprintf(stderr, "cudaEventElapsedTime : %s\n",cudaGetErrorString(cudaEventElapsedTime(&kernel_time,start,stop)));

// En total se ejecutan WC*HC threads

   // Cada thread hace 2*(WA/BLOCK_SIZE) accesos de escritura

   //br = ( DivisionEntera(wa,BLOCK_SIZE) * (2.0 + 2.0 * BLOCK_SIZE)) * (wc*hc) * sizeof(TIPO);

   br = (bloques.x * (2.0 + 2.0 * BLOCK_SIZE)) * (hc*wc) * sizeof(TIPO);

   // Cada thread hace un sólo acceso de escritura

   bw = ((2.0*bloques.x)+1)*(hc*wc)*sizeof(TIPO);

#endif

fprintf(stderr, "Copiando mairiz C en HOST: %s\n",cudaGetErrorString(cudaMemcpy2D(H_C, wc*sizeof(TIPO), D_C, pitchC, wc*sizeof(TIPO), hc, cudaMemcpyDeviceToHost)));

fprintf(stderr, "Liberando matriz A DEVICE: %s\n",cudaGetErrorString(cudaFree(D_A)));

   fprintf(stderr, "Liberando matriz B DEVICE: %s\n",cudaGetErrorString(cudaFree(D_B)));

   fprintf(stderr, "Liberando matriz C DEVICE: %s\n",cudaGetErrorString(cudaFree(D_C)));

#ifdef _ficheroBIN_

   Host_VolvadoMatrizBin(H_C,"resultadoBasico.bin",hc,wc);

   #endif

   #ifdef _ficheroTXT_

   Host_VolvadoMatrizTXT(H_C,"resultadoBasico.txt",hc,wc);

   #endif

free(H_A);

   free(H_B);

   free(H_C);

bandwidth = ((br + bw)/1000000000.0)/(kernel_time / 1000.0);

   fprintf(stdout, "Tiempo kernel por eventos %.4f segundos\n",kernel_time / 1000.0);

   fprintf(stdout, "Ancho de banda teorico: %.2f Gbyts/sec\n",(1800.0*1000000.0*(256/8)*2)/1000000000.0);

   fprintf(stdout, "Ancho de banda efectivo:((%.0f + %.0f)/10^9)/ %.4f = %.2f Gbyts/sec\n",br,bw,(kernel_time / 1000.0), bandwidth);

return 0;

}

Thanks!!

I have just found the error.
I used -G flag to compile it.