Hey there,
I have an artificial kernel for demonstration purposes and I want to compare its performance for double, float and half. I extracted the example code below. I expected the half version to roughly halve the kernel's computation time compared to float, but I can't even get it to perform better than float. I usually work on a V100 with CUDA 10.2 and Ubuntu 18.04.3 LTS (GNU/Linux 4.15.0-74-generic x86_64), but I also reproduced the results on a GeForce RTX 2080 with CUDA 10.1 and Arch Linux.
#include <sys/stat.h>
#include "cuda_runtime_api.h"
#include "cuda.h"
#include "cuda_runtime.h"
#include "cuda_fp16.h"
#include <chrono>
#include <string>
#include <vector>
#include <iomanip>
#include <ctime>
#include <iostream>
#include <fstream>
#include <string.h>
//#include <dir>
#include <new>
template<typename T>
__device__ __forceinline__ T computation(T a, T b){
  return ((a * b + a * b + a * b) / (a * b + a * b + a * b));
}

template<typename T>
__global__ void add_comp_kernel(T* a, T* b, T* c, int N){
  int idx = blockIdx.x*blockDim.x+threadIdx.x;
  if(idx < N){
    T a_d = a[idx];
    T b_d = b[idx];
    T c_d = 0;
    for(int i = 0; i<1000; ++i){
      c_d += computation(a_d,b_d);
    }
    c[idx]=c_d;
  }
}
// events[0..4] mark the start of each measured phase, events[5..9] the matching stop
void writeTimeMeasurement_pinned(std::vector<cudaEvent_t> events, std::ofstream &outputFile, long int size);
void writeTimeMeasurement_pinned(std::vector<cudaEvent_t> events, std::ofstream &outputFile, long int size){
  for(int i = 0; i < events.size()/2; ++i){
    float millisecs = 0.0;
    cudaEventElapsedTime(&millisecs, events[i], events[i+5]);
    outputFile << millisecs << " \t";
  }
}
int main(int argc, char** argv){
  int samplesize = atoi(argv[1]);
  char* datatype = argv[2];
  char* outdir = argv[3];
  long int N[] = {10000000, 50000000, 100000000, 125000000, 250000000, 500000000, 1000000000};
  std::string outdirpath = "/home/mkoehler/sparse-matrix-gpu/cuda-test/";
  outdirpath += outdir;
  std::string outfilename = outdirpath;
  outfilename += std::string("/");
  int threads = 1024;
  std::vector<std::string> eventNames(6);
  eventNames[0] = "buffer_a";
  eventNames[1] = "buffer_b";
  eventNames[2] = "buffer_c_alloc";
  eventNames[3] = "kernel";
  eventNames[4] = "buffer_c_to_host";
  eventNames[5] = "wallclock_time";
  for(int n = 0; n<7; ++n){
    std::string size_string = std::to_string(N[n]);
    std::ofstream outputFile(outfilename+datatype+"_"+size_string);
    outputFile << "#" << eventNames[0]
               << " \t" << eventNames[1]
               << " \t" << eventNames[2]
               << " \t" << eventNames[3]
               << " \t" << eventNames[4]
               << " \t" << eventNames[5]
               << std::endl;
if(strcmp(datatype, "double") == 0){
for(int run = 0; run < samplesize; ++run){
std::clock_t c_start = std::clock();
double *host_a_d, *host_b_d, *host_c_d;
cudaMallocHost((void**)&host_a_d,sizeof(double)*N[n]);
cudaMallocHost((void**)&host_b_d,sizeof(double)*N[n]);
cudaMallocHost((void**)&host_c_d,sizeof(double)*N[n]);
double *d_a, *d_b, *d_c;
for(int i=0; i< N[n]; ++i){host_a_d[i] = 1; host_b_d[i]=1;}
std::vector<cudaEvent_t> events(10);
for(int i=0; i<events.size(); ++i) cudaEventCreate(&events[i]);
cudaDeviceSynchronize();
cudaEventRecord(events[0]);
cudaMalloc(&d_a, N[n]*sizeof(double));
cudaMemcpy(d_a, host_a_d, N[n]*sizeof(double), cudaMemcpyHostToDevice);
cudaEventRecord(events[5]);
cudaEventSynchronize(events[5]);
cudaEventRecord(events[1]);
cudaMalloc(&d_b, N[n]*sizeof(double));
cudaMemcpy(d_b, host_b_d, N[n]*sizeof(double), cudaMemcpyHostToDevice);
cudaEventRecord(events[6]);
cudaEventSynchronize(events[6]);
cudaEventRecord(events[2]);
cudaMalloc(&d_c, N[n]*sizeof(double));
cudaEventRecord(events[7]);
cudaEventSynchronize(events[7]);
cudaEventRecord(events[3]);
// Launch add() kernel on GPU
add_comp_kernel<<<(int)ceil(float(N[n])/threads),threads>>>(d_a, d_b, d_c, N[n]);
cudaEventRecord(events[8]);
cudaEventSynchronize(events[8]);
cudaEventRecord(events[4]);
// Copy result back to the host
cudaMemcpy(host_c_d, d_c, N[n]*sizeof(double), cudaMemcpyDeviceToHost);
cudaEventRecord(events[9]);
cudaEventSynchronize(events[9]);
cudaDeviceSynchronize();
std::clock_t c_end = std::clock();
writeTimeMeasurement_pinned(events, outputFile, N[n]);
outputFile << 1000.0*(c_end - c_start)/CLOCKS_PER_SEC << std::endl;
// Cleanup
for(auto e:events) cudaEventDestroy(e);
double average=0;
for(int i=0; i< N[n]; ++i){average += host_c_d[i];}
average = average/N[n];
std::cout << "sample run: " << run << std::endl;
std::cout << datatype << " c average: " << average << std::endl;
cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
cudaFreeHost(host_a_d); cudaFreeHost(host_b_d); cudaFreeHost(host_c_d);
cudaDeviceSynchronize();
}
}
else if(strcmp(datatype,"float") == 0){
for(int run = 0; run < samplesize; ++run){
std::clock_t c_start = std::clock();
float *host_a_d, *host_b_d, *host_c_d;
cudaMallocHost((void**)&host_a_d,sizeof(float)*N[n]);
cudaMallocHost((void**)&host_b_d,sizeof(float)*N[n]);
cudaMallocHost((void**)&host_c_d,sizeof(float)*N[n]);
float *d_a, *d_b, *d_c;
for(int i=0; i< N[n]; ++i){host_a_d[i] = 1; host_b_d[i]=1;}
std::vector<cudaEvent_t> events(10);
for(int i=0; i<events.size(); ++i) cudaEventCreate(&events[i]);
cudaDeviceSynchronize();
cudaEventRecord(events[0]);
cudaMalloc(&d_a, N[n]*sizeof(float));
cudaMemcpy(d_a, host_a_d, N[n]*sizeof(float), cudaMemcpyHostToDevice);
cudaEventRecord(events[5]);
cudaEventSynchronize(events[5]);
cudaEventRecord(events[1]);
cudaMalloc(&d_b, N[n]*sizeof(float));
cudaMemcpy(d_b, host_b_d, N[n]*sizeof(float), cudaMemcpyHostToDevice);
cudaEventRecord(events[6]);
cudaEventSynchronize(events[6]);
cudaEventRecord(events[2]);
cudaMalloc(&d_c, N[n]*sizeof(float));
cudaEventRecord(events[7]);
cudaEventSynchronize(events[7]);
cudaEventRecord(events[3]);
// Launch add() kernel on GPU
add_comp_kernel<<<(int)ceil(float(N[n])/threads),threads>>>(d_a, d_b, d_c, N[n]);
cudaEventRecord(events[8]);
cudaEventSynchronize(events[8]);
cudaEventRecord(events[4]);
// Copy result back to the host
cudaMemcpy(host_c_d, d_c, N[n]*sizeof(float), cudaMemcpyDeviceToHost);
cudaEventRecord(events[9]);
cudaEventSynchronize(events[9]);
cudaDeviceSynchronize();
std::clock_t c_end = std::clock();
writeTimeMeasurement_pinned(events, outputFile, N[n]);
outputFile << 1000.0*(c_end - c_start)/CLOCKS_PER_SEC << std::endl;
// Cleanup
for(auto e:events) cudaEventDestroy(e);
double average=0;
for(int i=0; i< N[n]; ++i){average += (double)host_c_d[i];}
average = average/N[n];
std::cout << "sample run: " << run << std::endl;
std::cout << datatype << " c average: " << average << std::endl;
cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
cudaFreeHost(host_a_d); cudaFreeHost(host_b_d); cudaFreeHost(host_c_d);
cudaDeviceSynchronize();
}
}
else if(strcmp(datatype,"half") == 0){
for(int run = 0; run < samplesize; ++run){
std::clock_t c_start = std::clock();
half *host_a_f, *host_b_f, *host_c_f;
cudaMallocHost((void**)&host_a_f,sizeof(half)*N[n]);
cudaMallocHost((void**)&host_b_f,sizeof(half)*N[n]);
cudaMallocHost((void**)&host_c_f,sizeof(half)*N[n]);
half *d_a, *d_b, *d_c;
for(int i=0; i< N[n]; ++i){host_a_f[i] = __float2half(1); host_b_f[i]=__float2half(1);}
std::vector<cudaEvent_t> events(10);
for(int i=0; i<events.size(); ++i) cudaEventCreate(&events[i]);
cudaDeviceSynchronize();
cudaEventRecord(events[0]);
cudaMalloc(&d_a, N[n]*sizeof(half));
cudaMemcpy(d_a, host_a_f, N[n]*sizeof(half), cudaMemcpyHostToDevice);
cudaEventRecord(events[5]);
cudaEventSynchronize(events[5]);
cudaEventRecord(events[1]);
cudaMalloc(&d_b, N[n]*sizeof(half));
cudaMemcpy(d_b, host_b_f, N[n]*sizeof(half), cudaMemcpyHostToDevice);
cudaEventRecord(events[6]);
cudaEventSynchronize(events[6]);
cudaEventRecord(events[2]);
cudaMalloc(&d_c, N[n]*sizeof(half));
cudaEventRecord(events[7]);
cudaEventSynchronize(events[7]);
cudaEventRecord(events[3]);
// Launch add() kernel on GPU
add_comp_kernel<<<(int)ceil(float(N[n])/threads),threads>>>(d_a, d_b, d_c, N[n]);
cudaEventRecord(events[8]);
cudaEventSynchronize(events[8]);
cudaEventRecord(events[4]);
// Copy result back to the host
cudaMemcpy(host_c_f, d_c, N[n]*sizeof(half), cudaMemcpyDeviceToHost);
cudaEventRecord(events[9]);
cudaEventSynchronize(events[9]);
cudaDeviceSynchronize();
std::clock_t c_end = std::clock();
writeTimeMeasurement_pinned(events, outputFile, N[n]);
outputFile << 1000.0*(c_end - c_start)/CLOCKS_PER_SEC << std::endl;
// Cleanup
for(auto e:events) cudaEventDestroy(e);
double average=0;
for(int i=0; i< N[n]; ++i){average += (double) host_c_f[i];}
average = average/N[n];
std::cout << "sample run: " << run << std::endl;
std::cout << datatype << " c average: " << average << std::endl;
cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
cudaFreeHost(host_a_f); cudaFreeHost(host_b_f); cudaFreeHost(host_c_f);
cudaDeviceSynchronize();
}
}
}
}
I thought for a moment that the arithmetic operators for half might not be properly overloaded (i.e. not mapped to __hadd, __hmul and so on), so I made a template specialization of computation for half, but this didn't change anything. I'd love to know if you can reproduce the poor half performance, and I'd also appreciate your help in case I'm doing anything wrong.
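For reference, a minimal sketch of such a specialization using the fp16 intrinsics from cuda_fp16.h could look like the following (not necessarily the exact code I used, but the same idea):

template<>
__device__ __forceinline__ half computation<half>(half a, half b){
  // same arithmetic as the generic template, spelled out with the half intrinsics
  half ab  = __hmul(a, b);
  half num = __hadd(__hadd(ab, ab), ab);
  half den = __hadd(__hadd(ab, ab), ab);
  return __hdiv(num, den);
}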
I compiled the code with
nvcc --gpu-architecture=sm_70 -O0 bugreport.cu -o bugtest
and then tested it with
./bugtest 10 half bugtest_results
which performs 10 runs for each vector size with datatype half and stores the results in the pre-existing directory ./bugtest_results.
If you want to test it, please adjust the hard-coded base path (outdirpath, near the top of main).
Thanks in advance,
Max