Performance Float vs Double effective bandwidth and execution time

Hello,

I am wondering about the performance difference between float and double.

I use an example to explain my question.

The source code performs a multiplication: constant * matrix.

The source code:

// Multiplica un vector por un escalar

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <cuda.h>

#include "../../tools/src/GestionMatricesCuda.cu"
#include "../../tools/src/tipo.h"

#define BLOCK_SIZE 16 

// Kernel con memoria global

// Parametros:

// matrices

// Kernel (global memory): scales every element of the matrix in place by a
// compile-time constant (2.0). Expects a 2D grid of 2D blocks covering at
// least mat.W x mat.H threads; out-of-range threads exit early.
__global__ void D_matrixMulEscalar(MatrizCuda mat)//, const TIPO escalar)
{
   const int fila    = blockIdx.y * blockDim.y + threadIdx.y;
   const int columna = blockIdx.x * blockDim.x + threadIdx.x;

   // Guard: the grid may be larger than the matrix.
   if (fila >= mat.H || columna >= mat.W)
      return;

   // Pick a literal of the right width so no double math sneaks into the
   // float build.
   #ifdef _DOUBLE_
   const TIPO escalar = 2.0;
   #else
   const TIPO escalar = 2.0f;
   #endif

   TIPO * elemento = (TIPO *) Device_Element(mat, fila, columna);
   *elemento = escalar * (*elemento);
}

/*
 * Prints the matrix to stdout, one row per line, elements separated by
 * a space ("%f" format).
 */
void PintarMatriz(MatrizHost m){
   for (int fila = 0; fila < m.H; fila++){
      for (int columna = 0; columna < m.W; columna++){
         TIPO * e = (TIPO *) Host_Element(m, fila, columna);
         printf("%f ", *e);
      }
      printf("\n");
   }
}

/*
 * Initializes a matrix with random numbers in [0,1]      (a == 0)
 *                    or with the sequence 0,1,2,...      (a != 0)
 * RETURN 1 if everything OK, 0 on error
 */
int Host_Inicializar(MatrizHost m, int a){

   int  i = 0;
   int  j = 0;
   TIPO * e;

   if (a == 0)
      for (i = 0; i < m.H; i++)
         for(j = 0; j < m.W; j++ ){
            e  = (TIPO *) Host_Element(m,i,j);
            *e = (TIPO)   (rand() / (TIPO)RAND_MAX);
         }
   else
      for (i = 0; i < m.H; i++)
         for(j = 0; j < m.W; j++ ){
            e  = (TIPO *) Host_Element(m,i,j);
            *e = (TIPO) (j + i * m.W );
          }

   // Fixed: the contract says "1 on success" but the function always
   // returned 0 (the error code).
   return 1;
}

/*
 * Ceiling division: returns the number of blocks of size tam_bloque
 * needed to cover tam elements, i.e. ceil(tam / tam_bloque).
 */
__host__ int DivisionEntera(double tam, int tam_bloque){

   // Exact integer arithmetic replaces the old floating-point test
   // (division - (int)division > 0.0), which is fragile: for large
   // values the quotient may round to an integer and silently lose
   // the fractional part, under-counting blocks.
   long long n = (long long) tam;
   long long entera = n / tam_bloque;

   // Round up when the division is not exact, or when tam carried a
   // fractional part that the cast to long long truncated.
   if (n % tam_bloque != 0 || (double) n < tam)
      entera++;

   return (int) entera;
}

/*
 * Converts a decimal string to int. On any conversion error (out of
 * range, no digits, value does not fit in an int) the program exits
 * with EXIT_FAILURE.
 */
int ConversionStringToInt(const char * s){

   errno = 0;
   char * endptr;
   // Keep the full long result: the original cast to int BEFORE the
   // LONG_MAX/LONG_MIN comparison, so on LP64 systems the ERANGE check
   // could never fire.
   long val = strtol(s, &endptr, 10);

   if ((errno == ERANGE && (val == LONG_MAX || val == LONG_MIN))
       || (errno != 0 && val == 0)) {
      perror("strtol");
      exit(EXIT_FAILURE);
   }

   if (endptr == s) {
      fprintf(stderr, "No digits were found\n");
      exit(EXIT_FAILURE);
   }

   // Reject values that would be silently truncated by the cast.
   if (val > INT_MAX || val < INT_MIN) {
      fprintf(stderr, "Value out of int range\n");
      exit(EXIT_FAILURE);
   }

   return (int) val;
}

/*
 * Usage: executable H W
 * Allocates an HxW matrix, scales it by a constant on the GPU, and
 * reports kernel execution time and effective memory bandwidth.
 */
int main(int argc, char** argv)
{
   int h = 0, w = 0;

   // Bytes read / bytes written by the kernel. Stored as double: a
   // float cannot represent byte counts above ~16M exactly.
   double br = 0.0;
   double bw = 0.0;
   double bandwidth = 0.0;

   if (argc < 3){
      fprintf(stderr, "Modo de uso:\nejecutable H W\n");
      return 0;
   }
   else{
      // Command-line dimension conversion
      h = ConversionStringToInt(argv[1]);
      w = ConversionStringToInt(argv[2]);
   }

   // Widen BEFORE multiplying: (h*w) as int overflows for large matrices.
   long long int memtotal = (long long int) h * (long long int) w * (long long int) sizeof(TIPO);

   // Renamed from 'free'/'total' so the stdlib free() is not shadowed.
   size_t mem_libre, mem_total;
   fprintf(stderr, "Preguntanto Informacion DEVICE: %s\n",cudaGetErrorString(cudaMemGetInfo(&mem_libre,&mem_total)));
   if ((long long int) mem_libre < memtotal){
      fprintf(stderr, "El dispositivo no tiene suficiente memoria matriz\n");
      return 0;
   }

   MatrizHost H_M;

   fprintf(stdout, "Reservando en el host Matiz\n");
   Host_CrearMatriz(&H_M,h,w,sizeof(TIPO));

   // Fixed: divisor was 1204 instead of 1024.
   fprintf(stdout, "Mbytes a reservar en host %.2f\n",(double)((long long int)H_M.H * H_M.W * sizeof(TIPO))/1024/1024);

   fprintf(stdout, "Inicializando en el host Matiz\n");
   Host_Inicializar(H_M,2);

   // Device matrix
   MatrizCuda D_M;

   // Fixed: divisor was 1204 instead of 1024.
   fprintf(stdout, "Mbytes a reservar en device %.2f\n",(double)memtotal/1024/1024);

   fprintf(stdout, "Reservando en el device Matiz\n");
   Device_CrearMatrizAlineada(&D_M,h,w,sizeof(TIPO));

   cudaCopiaHostToDevice(H_M,D_M);

   cudaEvent_t start;
   cudaEvent_t stop;
   float kernel_time;   // cudaEventElapsedTime reports MILLISECONDS

   fprintf(stderr, "Creando evento start: %s\n",cudaGetErrorString(cudaEventCreate(&start)));
   fprintf(stderr, "Creando evento stop : %s\n",cudaGetErrorString(cudaEventCreate(&stop)));

   dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
   dim3 bloques(DivisionEntera(w, threads.x), DivisionEntera(h, threads.y));
   fprintf(stdout, "Se van a ejecutar (%d,%d) bloques (%d,%d) threads\n",bloques.x,bloques.y,threads.x,threads.y);

   cudaDeviceProp propiedades;
   int device = -1;
   fprintf(stderr, "Preguntanto por DEVICE: %s\n",cudaGetErrorString(cudaGetDevice(&device)));
   fprintf(stderr, "Comprobando propiedades de la Tarjeta: %s\n",cudaGetErrorString(cudaGetDeviceProperties (&propiedades, device)));

   // Refuse launch configurations the device cannot run.
   if ((propiedades.maxThreadsPerBlock < (threads.x*threads.y)) || (propiedades.maxGridSize[0] < bloques.x) || (propiedades.maxGridSize[1] < bloques.y)){
      fprintf(stderr, "Demasiados bloques o threads.\nMaximo numero de threads %d, maximo numero de bloques (%d,%d)\n",propiedades.maxThreadsPerBlock, propiedades.maxGridSize[0],propiedades.maxGridSize[1]);
      return 0;
   }

   fprintf(stderr, "cudaEventRecord start: %s\n",cudaGetErrorString(cudaEventRecord(start,0)));

   D_matrixMulEscalar<<<bloques,threads>>>(D_M);//, 23.0f);

   fprintf(stderr, "cudaEventRecord stop : %s\n",cudaGetErrorString(cudaEventRecord(stop,0)));
   fprintf(stderr, "cudaEventSynchronize : %s\n",cudaGetErrorString(cudaEventSynchronize(stop)));
   fprintf(stderr, "cudaEventElapsedTime : %s\n",cudaGetErrorString(cudaEventElapsedTime(&kernel_time,start,stop)));

   fprintf(stderr, "Ejecutando el kernel: %s\n",cudaGetErrorString(cudaGetLastError()));

   cudaCopiaDeviceToHost(H_M,D_M);

   // The kernel reads each element once and writes it once.
   br = (double) memtotal;
   bw = (double) memtotal;

   fprintf(stderr, "cudaEventDestroy: %s\n",cudaGetErrorString(cudaEventDestroy(start)));
   fprintf(stderr, "cudaEventDestroy: %s\n",cudaGetErrorString(cudaEventDestroy(stop)));

   Host_LiberaMatriz(H_M);
   Device_Liberear(D_M);

   // GB/s = total bytes / 10^9, divided by the time in seconds.
   bandwidth = ((br + bw)/1000000000.0)/(kernel_time / 1000.0);

   // Fixed: kernel_time is in milliseconds; convert before printing seconds.
   fprintf(stdout, "Tiempo ejecucion kernel %.6f segundos\n",kernel_time / 1000.0f);
   fprintf(stdout, "Ancho de banda teorico: %.2f Gbyts/sec\n",(1800.0*1000000.0*(256/8)*2)/1000000000.0);
   fprintf(stdout, "Ancho de banda efectivo:((%.0f + %.0f)/10^9)/ %.4f = %.2f Gbyts/sec\n",br,bw,kernel_time / 1000.0, bandwidth);

   return 0;
}

The compilation command:

nvcc -O3 -arch sm_21 -o bin/matrizEscalarFloat_MA src/matrizEscalar.cu

nvcc -O3 -arch sm_21 -D _DOUBLE_ -o bin/matrizEscalarDouble_MA src/matrizEscalar.cu

I have a GTX460 and the results are:

Using Float:

Matrix 15000x15000 => 858 Mbytes

Effective bandwidth: 62 Gb/s

Time : 29 ms

Matrix 10500x10500 => 420 Mbytes

Effective bandwidth: 61 Gb/s

Time : 14 ms

Using Double

Matrix 10500x10500 => 841 Mbytes

Effective bandwidth: 93 Gb/s

Time : 18 ms

My question is:

Why effective bandwidth is higher using Double?

Why float is slower than double with the same number of bytes and why the effective bandwidth is lower as well?

Thanks

float is not slower than double for the same size of the problem.

I mean, If you compare float 15000x15000 with double 10500x10500 => float is slower with the same size in bytes

but if you compare float 10500x10500 with double 10500x10500 => double is slower with the same size of matrix

I do not understand why the effective bandwidth is higher using double.

Thanks

How do blocks of 32x8 threads (or 32x16) score in comparison?

I use blocks of 16x16 threads or 32x16 threads. Both have the same statistics.

I have the same problem in other programs, where the effective bandwidth using double is higher than using float.

Is it normal?

Thanks

It is normal. You will also find that an int2 has higher effective bandwidth than an int or int4. A float2 should have ~ the same bandwidth as a double.

Thank you for the answer, but why is it normal?

I mean, why is it possible? Is it because each memory module reads two words?

Sorry about my ignorance

Does nobody know? Please, I need this to understand the GPU architecture.

Thanks

It means that memory transactions of twice the size are used, so that with the same number of outstanding requests the number of bytes in flight at any one time is doubled.