Hi all,
I am starting using matrix multiplication example to learn CUDA. So I made two initial versions, just using global memory and using shared memory.
When I executed both versions, the shared memory version is slower than the global memory version. Why is it happening?
Example A*B=C
A 3200x1600
B 1600x1600
C 3200x1600
Global Memory version 1.44 secs
Shared Memory version 2.17 secs
If I use matrixes bigest the outcomes are the same, the time wasting on shared memory version is 44% highest than global memory.
I used the CUDA profiler and the unique difference is that I do not know why, but the dram reads are
Global 85 request
Shared 89 request
I thought that the shared memory version would make less requests to dram memory than the global version.
I leave my source code, if someone see anything wrong please help me. I do not know what is happening.
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <errno.h>
typedef float TIPO;
#define BLOCK_SIZE 16
//(T*)((char*)BaseAddress + Row * pitch) + Column;
#define Device_Element(m, i, j, p) (TIPO*)((char *)(m) + ((i) * (p)) + (SIZE_TIPO * (j)))
// Kernel GLOBAL MEMORY VERSION
__global__ void D_matrixMul(TIPO * mat1, TIPO * mat2, TIPO * matR, int ha, int wa, int pa, int hb, int wb, int pb, int pc)
{
int i = blockIdx.y * blockDim.y + threadIdx.y;
int j = blockIdx.x * blockDim.x + threadIdx.x;
if ((i < ha) && (j < wb)){
TIPO acum = 0.0f;
for(int k = 0; k< wa ; k++)
acum += *(Device_Element(mat1,i,k,pa)) * *(Device_Element(mat2,k,j,pb));
*(Device_Element(matR,i,j,pc)) = acum;
}
}
// KERNEL SHARED MEMORY VERSION
__global__ void D_matrixMulShared(TIPO * mat1, TIPO * mat2, TIPO * matR, int ha, int wa, int pa, int hb, int wb, int pb, int pc)
{
TIPO auxC = 0.0f;
// Declarando memoria compartida
__shared__ TIPO S1[BLOCK_SIZE][BLOCK_SIZE];
__shared__ TIPO S2[BLOCK_SIZE][BLOCK_SIZE];
int Bby = BLOCK_SIZE * blockIdx.y;
int Bbx = blockIdx.x * BLOCK_SIZE;
int tx = threadIdx.x;
int ty = threadIdx.y;
int i = 0, k = 0;
for(i = 0; i < (wa / BLOCK_SIZE) ; i++){
S1[ty][tx] = *(Device_Element(mat1, ty + Bby, i * BLOCK_SIZE + tx, pa));
S2[ty][tx] = *(Device_Element(mat2, i * BLOCK_SIZE + ty, Bbx + tx, pb));
__syncthreads();
for(k = 0; k < BLOCK_SIZE; k++)
auxC += S1[ty][k] * S2[k][tx];
__syncthreads();
}
*(Device_Element(matR, Bby + ty, Bbx + tx, pc)) = auxC;
}
void Host_Inicializar(TIPO * m, int h, int w){
int i = 0;
int j = 0;
for (i = 0; i < h; i++)
for(j = 0; j < w; j++ )
m[i*w+j] = i;
}
int ConversionStringToInt(const char * s){
errno = 0;
char * endptr;
int num = (int) strtol(s, &endptr, 10);
if ((errno == ERANGE && (num == LONG_MAX || num == LONG_MIN))
|| (errno != 0 && num == 0)) {
perror("strtol");
exit(EXIT_FAILURE);
}
if (endptr == s) {
fprintf(stderr, "No digits were found\n");
exit(EXIT_FAILURE);
}
return num;
}
int ComprobarPropiedades(int bx, int by, int tx, int ty){
#ifdef _SALIDA_
fprintf(stdout, "Se ejecutan (%d,%d) bloques (%d,%d)=%d threads\n", bx, by, tx, ty, tx*ty);
#endif
cudaDeviceProp propiedades;
int device = -1;
fprintf(stderr, "Preguntanto por DEVICE: %s\n",cudaGetErrorString(cudaGetDevice(&device)));
fprintf(stderr, "Comprobando propiedades de la Tarjeta: %s\n",cudaGetErrorString(cudaGetDeviceProperties (&propiedades, device)));
if (propiedades.kernelExecTimeoutEnabled)
fprintf(stderr, "Temporizadores de la tarjeta activados\n");
else
fprintf(stderr,"Temporizadores de la tarjeta desactivados\n");
if ((propiedades.maxThreadsPerBlock <= (tx*ty)) || (propiedades.maxGridSize[0] <= bx) || (propiedades.maxGridSize[1] <= by)){
fprintf(stderr, "Demasiados bloques o threads.\nMaximo numero de threads %d, maximo numero de bloques (%d,%d)\n",propiedades.maxThreadsPerBlock, propiedades.maxGridSize[0], propiedades.maxGridSize[1]);
return 1;
}
return 0;
}
int main(int argc, char** argv)
{
int ha = 0, wa = 0, hb = 0, wb = 0, hc = ha, wc = wb;
//Bytes copaidos
float br = 0.0;
float bw = 0.0;
float bandwidth = 0.0;
if (argc < 5){
fprintf(stderr, "Modo de uso:\
\nejecutable HA WA HB WB\n");
return 0;
}
else{
// Conversion
ha = ConversionStringToInt(argv[1]);
wa = ConversionStringToInt(argv[2]);
hb = ConversionStringToInt(argv[3]);
wb = ConversionStringToInt(argv[4]);
hc = ha;
wc = wb;
if (wa != hb){
fprintf(stderr, "El ancho de la primera matriz debe ser igual al alto de la segunda WA = HB\n\
Los valores que has elegido son: (%d,%d)x(%d,%d)\n",ha,wa,hb,wb);
return -1;
}
}
// Comprobando que la GPU tiene memoria suficiente
size_t memtotal = ((ha*wa + hb*wb + hc*wc)*sizeof(TIPO));
size_t mFree,total;
fprintf(stderr, "Preguntando por memoria disponible: %s\n",cudaGetErrorString(cudaMemGetInfo(&mFree,&total)));
if (mFree < memtotal){
fprintf(stderr, "El dispositivo no tiene suficiente memoria para estas matrices\n");
return 0;
}
TIPO * H_A = (TIPO *) calloc(wa*ha,sizeof(TIPO));
TIPO * H_B = (TIPO *) calloc(wb*hb,sizeof(TIPO));
TIPO * H_C = (TIPO *) calloc(wc*hc,sizeof(TIPO));
Host_Inicializar(H_A,ha,wa);
Host_Inicializar(H_B,hb,wb);
TIPO * D_A;
TIPO * D_B;
TIPO * D_C;
size_t pitchA;
size_t pitchB;
size_t pitchC;
fprintf(stderr, "Reservando memoria matriz A en DEVICE: %s\n",cudaGetErrorString(cudaMallocPitch(&D_A, &pitchA, sizeof(TIPO)*wa, ha)));
fprintf(stderr, "Reservando memoria matriz B en DEVICE: %s\n",cudaGetErrorString(cudaMallocPitch(&D_B, &pitchB, sizeof(TIPO)*wb, hb)));
fprintf(stderr, "Reservando memoria matriz C en DEVICE: %s\n",cudaGetErrorString(cudaMallocPitch(&D_C, &pitchC, sizeof(TIPO)*wc, hc)));
fprintf(stderr, "Copiando mairiz A en DEVICE: %s\n",cudaGetErrorString(cudaMemcpy2D(D_A, pitchA, H_A, wa*sizeof(TIPO), wa*sizeof(TIPO), ha, cudaMemcpyHostToDevice)));
fprintf(stderr, "Copiando mairiz B en DEVICE: %s\n",cudaGetErrorString(cudaMemcpy2D(D_B, pitchB, H_B, wb*sizeof(TIPO), wb*sizeof(TIPO), hb, cudaMemcpyHostToDevice)));
//KERNEL
cudaEvent_t start;
cudaEvent_t stop;
float kernel_time;
fprintf(stderr, "Creando evento start: %s\n",cudaGetErrorString(cudaEventCreate(&start)));
fprintf(stderr, "Creando evento stop : %s\n",cudaGetErrorString(cudaEventCreate(&stop)));
#ifdef _GLOBAL_VARIOS_BLOQUES_
dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
dim3 bloques(DivisionEntera(wb, threads.x), DivisionEntera(ha, threads.y));
if (ComprobarPropiedades(bloques.x, bloques.y, threads.x, threads.y))
return 0;
fprintf(stderr, "cudaEventRecord start: %s\n",cudaGetErrorString(cudaEventRecord(start,0)));
D_matrixMul<<<bloques,threads>>>(D_A, D_B, D_C, ha, wa, pitchA, hb, wb, pitchB, pitchC);
fprintf(stderr, "Ejecutando el kernel: %s\n",cudaGetErrorString(cudaGetLastError()));
fprintf(stderr, "cudaEventRecord stop : %s\n",cudaGetErrorString(cudaEventRecord(stop,0)));
fprintf(stderr, "cudaEventSynchronize : %s\n",cudaGetErrorString(cudaEventSynchronize(stop)));
fprintf(stderr, "cudaEventElapsedTime : %s\n",cudaGetErrorString(cudaEventElapsedTime(&kernel_time,start,stop)));
// En total se ejecutan WC*HC threads
// Cada thread hace 2*WA acessos de lectura
br = (2.0*wa)*(wc*hc)*sizeof(TIPO);
// Cada thread hace un sólo acceso de escritura
bw = (float) (hc*wc)*sizeof(TIPO);
#endif
#ifdef _SHARED_
dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
dim3 bloques(DivisionEntera(wb, threads.x), DivisionEntera(ha, threads.y));
if (ComprobarPropiedades(bloques.x, bloques.y, threads.x, threads.y))
return 0;
fprintf(stderr, "cudaEventRecord start: %s\n",cudaGetErrorString(cudaEventRecord(start,0)));
D_matrixMulShared<<<bloques,threads>>>(D_A, D_B, D_C, ha, wa, pitchA, hb, wb, pitchB, pitchC);
fprintf(stderr, "Ejecutando el kernel: %s\n",cudaGetErrorString(cudaGetLastError()));
fprintf(stderr, "cudaEventRecord stop : %s\n",cudaGetErrorString(cudaEventRecord(stop,0)));
fprintf(stderr, "cudaEventSynchronize : %s\n",cudaGetErrorString(cudaEventSynchronize(stop)));
fprintf(stderr, "cudaEventElapsedTime : %s\n",cudaGetErrorString(cudaEventElapsedTime(&kernel_time,start,stop)));
// En total se ejecutan WC*HC threads
// Cada thread hace 2*(WA/BLOCK_SIZE) accesos de escritura
//br = ( DivisionEntera(wa,BLOCK_SIZE) * (2.0 + 2.0 * BLOCK_SIZE)) * (wc*hc) * sizeof(TIPO);
br = (bloques.x * (2.0 + 2.0 * BLOCK_SIZE)) * (hc*wc) * sizeof(TIPO);
// Cada thread hace un sólo acceso de escritura
bw = ((2.0*bloques.x)+1)*(hc*wc)*sizeof(TIPO);
#endif
fprintf(stderr, "Copiando mairiz C en HOST: %s\n",cudaGetErrorString(cudaMemcpy2D(H_C, wc*sizeof(TIPO), D_C, pitchC, wc*sizeof(TIPO), hc, cudaMemcpyDeviceToHost)));
fprintf(stderr, "Liberando matriz A DEVICE: %s\n",cudaGetErrorString(cudaFree(D_A)));
fprintf(stderr, "Liberando matriz B DEVICE: %s\n",cudaGetErrorString(cudaFree(D_B)));
fprintf(stderr, "Liberando matriz C DEVICE: %s\n",cudaGetErrorString(cudaFree(D_C)));
#ifdef _ficheroBIN_
Host_VolvadoMatrizBin(H_C,"resultadoBasico.bin",hc,wc);
#endif
#ifdef _ficheroTXT_
Host_VolvadoMatrizTXT(H_C,"resultadoBasico.txt",hc,wc);
#endif
free(H_A);
free(H_B);
free(H_C);
bandwidth = ((br + bw)/1000000000.0)/(kernel_time / 1000.0);
fprintf(stdout, "Tiempo kernel por eventos %.4f segundos\n",kernel_time / 1000.0);
fprintf(stdout, "Ancho de banda teorico: %.2f Gbyts/sec\n",(1800.0*1000000.0*(256/8)*2)/1000000000.0);
fprintf(stdout, "Ancho de banda efectivo:((%.0f + %.0f)/10^9)/ %.4f = %.2f Gbyts/sec\n",br,bw,(kernel_time / 1000.0), bandwidth);
return 0;
}
Thanks!!