Hello,
I am wondering about the different performance between float and double.
I use a example to explain my question.
The source code perform a multiplication, constant * matrix.
The source code:
// Multiplica un vector por un escalar
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <time.h>
#include <cuda.h>
#include "../../tools/src/GestionMatricesCuda.cu"
#include "../../tools/src/tipo.h"
#define BLOCK_SIZE 16
// Kernel con memoria global
// Parametros:
// matrices
__global__ void D_matrixMulEscalar(MatrizCuda mat)//, const TIPO escalar)
{
int i = blockIdx.y * blockDim.y + threadIdx.y;
int j = blockIdx.x * blockDim.x + threadIdx.x;
if ((i < mat.H) && (j < mat.W)){
TIPO * p;
#ifdef _DOUBLE_
TIPO escalar = 2.0;
#else
TIPO escalar = 2.0f;
#endif
p = (TIPO *) Device_Element(mat,i,j);
(*p) = (escalar)*(*p);
}
}
/*
* Pintar Matriz
*
*/
void PintarMatriz(MatrizHost m){
int i = 0;
int j = 0;
TIPO * e;
for(i=0; i < m.H; i++){
for(j=0; j < m.W; j++){
e = (TIPO *) Host_Element(m,i,j);
printf("%f ", *e);
}
printf("\n");
}
return;
}
/*
* Inicializa una matriz con nº aleatorios (a = 0)
* con la secuencia 1,2,... (a = X)
* RETURN 1 si todo OK, 0 si error
*
*/
int Host_Inicializar(MatrizHost m, int a){
int i = 0;
int j = 0;
TIPO * e;
if (a == 0)
for (i = 0; i < m.H; i++)
for(j = 0; j < m.W; j++ ){
e = (TIPO *) Host_Element(m,i,j);
*e = (TIPO) (rand() / (TIPO)RAND_MAX);
}
else
for (i = 0; i < m.H; i++)
for(j = 0; j < m.W; j++ ){
e = (TIPO *) Host_Element(m,i,j);
*e = (TIPO) (j + i * m.W );
}
return 0;
}
/*
*
*Divide el numero de elementos entre el tamaño de bloque dando un numero de bloques suficioente
*
*/
__host__ int DivisionEntera(double tam, int tam_bloque){
double division = tam/tam_bloque;
int entera = (int) division;
double pdecimal = division - entera;
if (pdecimal > 0.0)
entera++;
return entera;
}
int ConversionStringToInt(const char * s){
errno = 0;
char * endptr;
int num = (int) strtol(s, &endptr, 10);
if ((errno == ERANGE && (num == LONG_MAX || num == LONG_MIN))
|| (errno != 0 && num == 0)) {
perror("strtol");
exit(EXIT_FAILURE);
}
if (endptr == s) {
fprintf(stderr, "No digits were found\n");
exit(EXIT_FAILURE);
}
return num;
}
int main(int argc, char** argv)
{
int h = 0, w = 0;
//Bytes copaidos
float br = 0.0;
float bw = 0.0;
float bandwidth = 0.0;
if (argc < 3){
fprintf(stderr, "Modo de uso:\
\nejecutable H W\n");
return 0;
}
else{
// Conversion
h = ConversionStringToInt(argv[1]);
w = ConversionStringToInt(argv[2]);
}
long long int memtotal = (h*w)*sizeof(TIPO);
size_t free,total;
fprintf(stderr, "Preguntanto Informacion DEVICE: %s\n",cudaGetErrorString(cudaMemGetInfo(&free,&total)));
if (free < memtotal){
fprintf(stderr, "El dispositivo no tiene suficiente memoria matriz\n");
return 0;
}
MatrizHost H_M;
fprintf(stdout, "Reservando en el host Matiz\n");
Host_CrearMatriz(&H_M,h,w,sizeof(TIPO));
fprintf(stdout, "Mbytes a reservar en host %.2f\n",(double)((H_M.H * H_M.W)*sizeof(TIPO))/1204/1024);
fprintf(stdout, "Inicializando en el host Matiz\n");
Host_Inicializar(H_M,2);
// Matriz del DEVICE
MatrizCuda D_M;
fprintf(stdout, "Mbytes a reservar en device %.2f\n",(double)((h*w)*sizeof(TIPO))/1204/1024);
fprintf(stdout, "Reservando en el device Matiz\n");
Device_CrearMatrizAlineada(&D_M,h,w,sizeof(TIPO));
cudaCopiaHostToDevice(H_M,D_M);
cudaEvent_t start;
cudaEvent_t stop;
float kernel_time;
fprintf(stderr, "Creando evento start: %s\n",cudaGetErrorString(cudaEventCreate(&start)));
fprintf(stderr, "Creando evento stop : %s\n",cudaGetErrorString(cudaEventCreate(&stop)));
dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
dim3 bloques(DivisionEntera(w, threads.x), DivisionEntera(h, threads.y));
fprintf(stdout, "Se van a ejecutar (%d,%d) bloques (%d,%d) threads\n",bloques.x,bloques.y,threads.x,threads.y);
cudaDeviceProp propiedades;
int device = -1;
fprintf(stderr, "Preguntanto por DEVICE: %s\n",cudaGetErrorString(cudaGetDevice(&device)));
fprintf(stderr, "Comprobando propiedades de la Tarjeta: %s\n",cudaGetErrorString(cudaGetDeviceProperties (&propiedades, device)));
if ((propiedades.maxThreadsPerBlock < (threads.x*threads.y)) || (propiedades.maxGridSize[0] < bloques.x) || (propiedades.maxGridSize[1] < bloques.y)){
fprintf(stderr, "Demasiados bloques o threads.\nMaximo numero de threads %d, maximo numero de bloques (%d,%d)\n",propiedades.maxThreadsPerBlock, propiedades.maxGridSize[0],propiedades.maxGridSize[1]);
return 0;
}
fprintf(stderr, "cudaEventRecord start: %s\n",cudaGetErrorString(cudaEventRecord(start,0)));
D_matrixMulEscalar<<<bloques,threads>>>(D_M);//, 23.0f);
fprintf(stderr, "cudaEventRecord stop : %s\n",cudaGetErrorString(cudaEventRecord(stop,0)));
fprintf(stderr, "cudaEventSynchronize : %s\n",cudaGetErrorString(cudaEventSynchronize(stop)));
fprintf(stderr, "cudaEventElapsedTime : %s\n",cudaGetErrorString(cudaEventElapsedTime(&kernel_time,start,stop)));
fprintf(stderr, "Ejecutando el kernel: %s\n",cudaGetErrorString(cudaGetLastError()));
cudaCopiaDeviceToHost(H_M,D_M);
br = H_M.W*H_M.H*sizeof(TIPO);
bw = H_M.W*H_M.H*sizeof(TIPO);
fprintf(stderr, "cudaEventDestroy: %s\n",cudaGetErrorString(cudaEventDestroy(start)));
fprintf(stderr, "cudaEventDestroy: %s\n",cudaGetErrorString(cudaEventDestroy(stop)));
Host_LiberaMatriz(H_M);
Device_Liberear(D_M);
bandwidth = ((br + bw)/1000000000.0)/(kernel_time / 1000.0);
fprintf(stdout, "Tiempo ejecucion kernel %.6f segundos\n",kernel_time );
fprintf(stdout, "Ancho de banda teorico: %.2f Gbyts/sec\n",(1800.0*1000000.0*(256/8)*2)/1000000000.0);
fprintf(stdout, "Ancho de banda efectivo:((%.0f + %.0f)/10^9)/ %.4f = %.2f Gbyts/sec\n",br,bw,kernel_time / 1000.0, bandwidth);
return 0;
}
The compilation command:
nvcc -O3 -arch sm_21 -o bin/matrizEscalarFloat_MA src/matrizEscalar.cu
nvcc -O3 -arch sm_21 -D _DOUBLE_ -o bin/matrizEscalarDouble_MA src/matrizEscalar.cu
I have a GTX460 and the results are:
Using Float:
Matrix 15000x15000 => 858 Mbytes
Effective bandwidth: 62 Gb/s
Time : 29 ms
Matrix 10500x10500 => 420 Mbytes
Effective bandwidth: 61 Gb/s
Time : 14 ms
Using Double
Matrix 10500x10500 => 841 Mbytes
Effective bandwidth: 93 Gb/s
Time : 18 ms
My question is:
Why effective bandwidth is higher using Double?
Why float is slower than double with the same number of bytes and why the effective bandwidth is lower as well?
Thanks