Performance Float vs Double effective bandwidth and execution time


I am wondering about the different performance between float and double.

I use a example to explain my question.

The source code perform a multiplication, constant * matrix.

The source code:

// Multiplica un vector por un escalar

#include <stdio.h>

#include <stdlib.h>

#include <errno.h>

#include <time.h>

#include <cuda.h>

#include "../../tools/src/"

#include "../../tools/src/tipo.h"

#define BLOCK_SIZE 16 

// Kernel con memoria global

// Parametros:

// matrices

__global__ void D_matrixMulEscalar(MatrizCuda mat)//, const TIPO escalar)


   int i = blockIdx.y * blockDim.y + threadIdx.y;

   int j = blockIdx.x * blockDim.x + threadIdx.x;

if ((i < mat.H) && (j < mat.W)){

      TIPO * p;

      #ifdef _DOUBLE_

      TIPO escalar = 2.0;


      TIPO escalar = 2.0f;


      p = (TIPO *) Device_Element(mat,i,j);

      (*p)  = (escalar)*(*p);




 * Pintar Matriz



void PintarMatriz(MatrizHost m){

   int i = 0;

   int j = 0;

   TIPO * e;

for(i=0; i < m.H; i++){

      for(j=0; j < m.W; j++){

         e = (TIPO *) Host_Element(m,i,j);

         printf("%f ", *e);







 * Inicializa una matriz con nº aleatorios          (a = 0)

 *                       con la secuencia 1,2,...   (a = X)

 * RETURN 1 si todo OK, 0 si error



int Host_Inicializar(MatrizHost m, int a){

int  i = 0;

   int  j = 0;

   TIPO * e;

if (a == 0)

      for (i = 0; i < m.H; i++)

         for(j = 0; j < m.W; j++ ){

            e  = (TIPO *) Host_Element(m,i,j);

            *e = (TIPO)   (rand() / (TIPO)RAND_MAX);



      for (i = 0; i < m.H; i++)

         for(j = 0; j < m.W; j++ ){

            e  = (TIPO *) Host_Element(m,i,j);

            *e = (TIPO) (j + i * m.W );


return 0;




 *Divide el numero de elementos entre el tamaño de bloque dando un numero de bloques suficioente



__host__ int DivisionEntera(double tam, int tam_bloque){

double division = tam/tam_bloque;

int entera = (int) division;

double pdecimal = division - entera;

if (pdecimal  >  0.0)


return entera;


int ConversionStringToInt(const char * s){

errno = 0;

   char * endptr;

   int num = (int) strtol(s, &endptr, 10);

if ((errno == ERANGE && (num == LONG_MAX || num == LONG_MIN))

    || (errno != 0 && num == 0)) {




if (endptr == s) {

      fprintf(stderr, "No digits were found\n");



return num;


int main(int argc, char** argv)


int h = 0, w = 0;

   //Bytes copaidos

   float br = 0.0;

   float bw = 0.0;

   float bandwidth = 0.0;

if (argc < 3){

      fprintf(stderr, "Modo de uso:\

              \nejecutable H W\n");

      return 0;



      // Conversion

      h = ConversionStringToInt(argv[1]);

      w = ConversionStringToInt(argv[2]);


long long int memtotal = (h*w)*sizeof(TIPO);

   size_t free,total;

   fprintf(stderr, "Preguntanto Informacion DEVICE: %s\n",cudaGetErrorString(cudaMemGetInfo(&free,&total)));

   if (free < memtotal){

      fprintf(stderr, "El dispositivo no tiene suficiente memoria matriz\n");

      return 0;


MatrizHost H_M;

fprintf(stdout, "Reservando en el host Matiz\n");


fprintf(stdout, "Mbytes a reservar en host %.2f\n",(double)((H_M.H * H_M.W)*sizeof(TIPO))/1204/1024);

fprintf(stdout, "Inicializando en el host Matiz\n");


// Matriz del DEVICE

   MatrizCuda D_M;

fprintf(stdout, "Mbytes a reservar en device %.2f\n",(double)((h*w)*sizeof(TIPO))/1204/1024);

fprintf(stdout, "Reservando en el device Matiz\n");



cudaEvent_t start;

   cudaEvent_t stop;

   float kernel_time;

fprintf(stderr, "Creando evento start: %s\n",cudaGetErrorString(cudaEventCreate(&start)));

   fprintf(stderr, "Creando evento stop : %s\n",cudaGetErrorString(cudaEventCreate(&stop)));

dim3 threads(BLOCK_SIZE, BLOCK_SIZE);

   dim3 bloques(DivisionEntera(w, threads.x), DivisionEntera(h, threads.y));

   fprintf(stdout, "Se van a ejecutar (%d,%d) bloques (%d,%d) threads\n",bloques.x,bloques.y,threads.x,threads.y);

cudaDeviceProp propiedades;

   int device = -1;

   fprintf(stderr, "Preguntanto por DEVICE: %s\n",cudaGetErrorString(cudaGetDevice(&device)));

   fprintf(stderr, "Comprobando propiedades de la Tarjeta: %s\n",cudaGetErrorString(cudaGetDeviceProperties (&propiedades, device)));

if ((propiedades.maxThreadsPerBlock < (threads.x*threads.y)) || (propiedades.maxGridSize[0] < bloques.x) || (propiedades.maxGridSize[1] < bloques.y)){

      fprintf(stderr, "Demasiados bloques o threads.\nMaximo numero de threads %d, maximo numero de bloques (%d,%d)\n",propiedades.maxThreadsPerBlock, propiedades.maxGridSize[0],propiedades.maxGridSize[1]);

      return 0;


fprintf(stderr, "cudaEventRecord start: %s\n",cudaGetErrorString(cudaEventRecord(start,0)));

D_matrixMulEscalar<<<bloques,threads>>>(D_M);//, 23.0f);

fprintf(stderr, "cudaEventRecord stop : %s\n",cudaGetErrorString(cudaEventRecord(stop,0)));

   fprintf(stderr, "cudaEventSynchronize : %s\n",cudaGetErrorString(cudaEventSynchronize(stop)));

   fprintf(stderr, "cudaEventElapsedTime : %s\n",cudaGetErrorString(cudaEventElapsedTime(&kernel_time,start,stop)));

fprintf(stderr, "Ejecutando el kernel: %s\n",cudaGetErrorString(cudaGetLastError()));


br = H_M.W*H_M.H*sizeof(TIPO);

   bw = H_M.W*H_M.H*sizeof(TIPO);

fprintf(stderr, "cudaEventDestroy: %s\n",cudaGetErrorString(cudaEventDestroy(start)));

   fprintf(stderr, "cudaEventDestroy: %s\n",cudaGetErrorString(cudaEventDestroy(stop)));



bandwidth = ((br + bw)/1000000000.0)/(kernel_time / 1000.0);

   fprintf(stdout, "Tiempo ejecucion kernel %.6f segundos\n",kernel_time );

   fprintf(stdout, "Ancho de banda teorico: %.2f Gbyts/sec\n",(1800.0*1000000.0*(256/8)*2)/1000000000.0);

   fprintf(stdout, "Ancho de banda efectivo:((%.0f + %.0f)/10^9)/ %.4f = %.2f Gbyts/sec\n",br,bw,kernel_time / 1000.0, bandwidth);

return 0;


The compilation command:

nvcc -O3 -arch sm_21 -o bin/matrizEscalarFloat_MA src/

nvcc -O3 -arch sm_21 -D _DOUBLE_ -o bin/matrizEscalarDouble_MA src/

I have a GTX460 and the results are:

Using Float:

Matrix 15000x15000 => 858 Mbytes

Effective bandwidth: 62 Gb/s

Time : 29 ms

Matrix 10500x10500 => 420 Mbytes

Effective bandwidth: 61 Gb/s

Time : 14 ms

Using Double

Matrix 10500x10500 => 841 Mbytes

Effective bandwidth: 93 Gb/s

Time : 18 ms

My question is:

Why effective bandwidth is higher using Double?

Why float is slower than double with the same number of bytes and why the effective bandwidth is lower as well?


float is not slower than double for the same size of the problem.

I mean, If you compare float 15000x15000 with double 10500x10500 => float is slower with the same size in bytes

but if you compare float 10500x10500 with double 10500x10500 => double is slower with the same size of matrix

I do not understand why the effective bandwidth is higher using double.


How do blocks of 32x8 threads (or 32x16) score in comparison?

I use blocks of 16x16 threads or 32x16 threads. Both have the same statistics.

I have the same problem in other programs, where the effective bandwidth using double is higher than using float.

Is it normal?


It is normal. You will also find that an int2 has higher effective bandwidth than an int or int4. A float2 should have ~ the same bandwidth as a double.

thank you for answer, but why it is normal?

I mean, why it is possible? it is because each memory modules read two words?

Sorry about my ignorance

Nobody know it, please I need it to understand the GPUs architecture.


It means that memory transactions of twice the size are used, so that with the same number of outstanding requests the number of bytes in flight at any one time is doubled.