Poor half performance

Hey there,

I have an artificial kernel for demonstration purposes and I want to compare the performance between double, float and half. I extracted an example code below. I’m not able to approximately half the computation time of the kernel when using halfs (compared to floats). I can’t even manage to let the half version perform better than float. I’m usually using a V100 with Cuda 10.2 and Ubuntu 18.04.3 LTS (GNU/Linux 4.15.0-74-generic x86_64), but I also reproduced the results on a GeForce RTX 2080 with Cuda 10.1 and Arch Linux

#include <sys/stat.h>
#include "cuda_runtime_api.h"
#include "cuda.h"
#include "cuda_runtime.h"
#include "cuda_fp16.h" 
#include <chrono>
#include <string>
#include <vector>
#include <iomanip>
#include <ctime>
#include <iostream>
#include <fstream>
#include <string.h>
//#include <dir>
#include <new>

template<typename T>
__device__ __forceinline__ T computation(T a, T b){
	return ((a * b + a * b + a * b) / (a * b + a * b + a * b));
}

template<typename T>
__global__ void add_comp_kernel(T* a, T* b, T* c, int N){
	int idx = blockIdx.x*blockDim.x+threadIdx.x;
	if(idx < N){
		T a_d = a[idx];
		T b_d = b[idx];
		T c_d = 0;
		for(int i = 0; i<1000; ++i){
			c_d += computation(a_d,b_d);
		}
		c[idx]=c_d;
	}
}

void writeTimeMeasurement_pinned(std::vector<cudaEvent_t> events, std::ofstream &outputFile, long int size);

void writeTimeMeasurement_pinned(std::vector<cudaEvent_t> events, std::ofstream &outputFile, long int size){

	for(int i = 0; i < events.size()/2; ++i){
		float millisecs = 0.0;
		cudaEventElapsedTime(&millisecs ,events[i], events[i+5]);
		outputFile << millisecs << " \t" ;
	}

}

int main(int argc, char** argv){
	
	int samplesize = atoi(argv[1]);
	char* datatype = argv[2];
	char* outdir= argv[3];
	long int N[] = {10000000, 50000000, 100000000, 125000000, 250000000, 500000000, 1000000000};
	std::string outdirpath = "/home/mkoehler/sparse-matrix-gpu/cuda-test/";
	outdirpath += outdir;
	std::string outfilename = outdirpath;
	outfilename += std::string("/");
	int threads = 1024;

	std::vector<std::string> eventNames(6);
	eventNames[0] = "buffer_a";
	eventNames[1] = "buffer_b";
	eventNames[2] = "buffer_c_alloc";
	eventNames[3] = "kernel";
	eventNames[4] = "buffer_c_to_host";
	eventNames[5] = "wallclock_time";

	for(int n = 0; n<7; ++n){
		std::string size_string = std::to_string(N[n]);
		std::ofstream outputFile(outfilename+datatype+"_"+size_string);
	outputFile << "#" << eventNames[0]
		   << " \t" << eventNames[1]	
		   << " \t" << eventNames[2]	
		   << " \t" << eventNames[3]
		   << " \t" << eventNames[4]	
		   << " \t" << eventNames[5]
	   	   << std::endl;
	if(strcmp(datatype, "double")  == 0){

		for(int run = 0; run < samplesize; ++run){
			std::clock_t c_start = std::clock();
			double *host_a_d, *host_b_d, *host_c_d;
			cudaMallocHost((void**)&host_a_d,sizeof(double)*N[n]);
			cudaMallocHost((void**)&host_b_d,sizeof(double)*N[n]);
			cudaMallocHost((void**)&host_c_d,sizeof(double)*N[n]);
			double *d_a, *d_b, *d_c;
			for(int i=0; i< N[n]; ++i){host_a_d[i] = 1;  host_b_d[i]=1;}
			std::vector<cudaEvent_t> events(10);
			for(int i=0; i<events.size(); ++i) cudaEventCreate(&events[i]);
			cudaDeviceSynchronize();

			cudaEventRecord(events[0]);
			cudaMalloc(&d_a, N[n]*sizeof(double));
			cudaMemcpy(d_a, host_a_d, N[n]*sizeof(double), cudaMemcpyHostToDevice);
			cudaEventRecord(events[5]);
			cudaEventSynchronize(events[5]);

			cudaEventRecord(events[1]);
			cudaMalloc(&d_b, N[n]*sizeof(double));
			cudaMemcpy(d_b, host_b_d, N[n]*sizeof(double), cudaMemcpyHostToDevice);
			cudaEventRecord(events[6]);
			cudaEventSynchronize(events[6]);

			cudaEventRecord(events[2]);
			cudaMalloc(&d_c, N[n]*sizeof(double));		
			cudaEventRecord(events[7]);
			cudaEventSynchronize(events[7]);
			
			cudaEventRecord(events[3]);
			// Launch add() kernel on GPU
			add_comp_kernel<<<(int)ceil(float(N[n])/threads),threads>>>(d_a, d_b, d_c, N[n]);
			cudaEventRecord(events[8]);
			cudaEventSynchronize(events[8]);

			cudaEventRecord(events[4]);
			// Copy result back to the host
			cudaMemcpy(host_c_d, d_c, N[n]*sizeof(double), cudaMemcpyDeviceToHost);
			cudaEventRecord(events[9]);
			cudaEventSynchronize(events[9]);
			cudaDeviceSynchronize();

			std::clock_t c_end = std::clock();
			writeTimeMeasurement_pinned(events, outputFile, N[n]);
			outputFile << 1000.0*(c_end - c_start)/CLOCKS_PER_SEC << std::endl;
			// Cleanup
			for(auto e:events) cudaEventDestroy(e);
			double average=0;
			for(int i=0; i< N[n]; ++i){average += host_c_d[i];}
			average = average/N[n];
			std::cout << "sample run: " << run << std::endl;
			std::cout << datatype << " c average: " << average << std::endl; 
			cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
			cudaFreeHost(host_a_d); cudaFreeHost(host_b_d); cudaFreeHost(host_c_d);
			cudaDeviceSynchronize();	
		}
	}
	else if(strcmp(datatype,"float") == 0){

		for(int run = 0; run < samplesize; ++run){
			std::clock_t c_start = std::clock();
			float *host_a_d, *host_b_d, *host_c_d;
			cudaMallocHost((void**)&host_a_d,sizeof(float)*N[n]);
			cudaMallocHost((void**)&host_b_d,sizeof(float)*N[n]);
			cudaMallocHost((void**)&host_c_d,sizeof(float)*N[n]);
			float *d_a, *d_b, *d_c;
			for(int i=0; i< N[n]; ++i){host_a_d[i] = 1;  host_b_d[i]=1;}
			std::vector<cudaEvent_t> events(10);
			for(int i=0; i<events.size(); ++i) cudaEventCreate(&events[i]);
			cudaDeviceSynchronize();

			cudaEventRecord(events[0]);
			cudaMalloc(&d_a, N[n]*sizeof(float));
			cudaMemcpy(d_a, host_a_d, N[n]*sizeof(float), cudaMemcpyHostToDevice);
			cudaEventRecord(events[5]);
			cudaEventSynchronize(events[5]);

			cudaEventRecord(events[1]);
			cudaMalloc(&d_b, N[n]*sizeof(float));
			cudaMemcpy(d_b, host_b_d, N[n]*sizeof(float), cudaMemcpyHostToDevice);
			cudaEventRecord(events[6]);
			cudaEventSynchronize(events[6]);

			cudaEventRecord(events[2]);
			cudaMalloc(&d_c, N[n]*sizeof(float));		
			cudaEventRecord(events[7]);
			cudaEventSynchronize(events[7]);
			
			cudaEventRecord(events[3]);
			// Launch add() kernel on GPU
			add_comp_kernel<<<(int)ceil(float(N[n])/threads),threads>>>(d_a, d_b, d_c, N[n]);
			cudaEventRecord(events[8]);
			cudaEventSynchronize(events[8]);

			cudaEventRecord(events[4]);
			// Copy result back to the host
			cudaMemcpy(host_c_d, d_c, N[n]*sizeof(float), cudaMemcpyDeviceToHost);
			cudaEventRecord(events[9]);
			cudaEventSynchronize(events[9]);
			cudaDeviceSynchronize();

			std::clock_t c_end = std::clock();
			writeTimeMeasurement_pinned(events, outputFile, N[n]);
			outputFile << 1000.0*(c_end - c_start)/CLOCKS_PER_SEC << std::endl;
			// Cleanup
			for(auto e:events) cudaEventDestroy(e);
			double average=0;
			for(int i=0; i< N[n]; ++i){average += (double)host_c_d[i];}
			average = average/N[n];
			std::cout << "sample run: " << run << std::endl;
			std::cout << datatype << " c average: " << average << std::endl; 
			cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
			cudaFreeHost(host_a_d); cudaFreeHost(host_b_d); cudaFreeHost(host_c_d);
			cudaDeviceSynchronize();	
		}
	}
	else if(strcmp(datatype,"half") == 0){
		for(int run = 0; run < samplesize; ++run){
			std::clock_t c_start = std::clock();
			half *host_a_f, *host_b_f, *host_c_f;
			cudaMallocHost((void**)&host_a_f,sizeof(half)*N[n]);
			cudaMallocHost((void**)&host_b_f,sizeof(half)*N[n]);
			cudaMallocHost((void**)&host_c_f,sizeof(half)*N[n]);
			half *d_a, *d_b, *d_c;
			for(int i=0; i< N[n]; ++i){host_a_f[i] = __float2half(1);  host_b_f[i]=__float2half(1);}
			std::vector<cudaEvent_t> events(10);
			for(int i=0; i<events.size(); ++i) cudaEventCreate(&events[i]);
			cudaDeviceSynchronize();

			cudaEventRecord(events[0]);
			cudaMalloc(&d_a, N[n]*sizeof(half));
			cudaMemcpy(d_a, host_a_f, N[n]*sizeof(half), cudaMemcpyHostToDevice);
			cudaEventRecord(events[5]);
			cudaEventSynchronize(events[5]);

			cudaEventRecord(events[1]);
			cudaMalloc(&d_b, N[n]*sizeof(half));
			cudaMemcpy(d_b, host_b_f, N[n]*sizeof(half), cudaMemcpyHostToDevice);
			cudaEventRecord(events[6]);
			cudaEventSynchronize(events[6]);

			cudaEventRecord(events[2]);
			cudaMalloc(&d_c, N[n]*sizeof(half));		
			cudaEventRecord(events[7]);
			cudaEventSynchronize(events[7]);
			
			cudaEventRecord(events[3]);
			// Launch add() kernel on GPU
			add_comp_kernel<<<(int)ceil(float(N[n])/threads),threads>>>(d_a, d_b, d_c, N[n]);
			cudaEventRecord(events[8]);
			cudaEventSynchronize(events[8]);

			cudaEventRecord(events[4]);
			// Copy result back to the host
			cudaMemcpy(host_c_f, d_c, N[n]*sizeof(half), cudaMemcpyDeviceToHost);
			cudaEventRecord(events[9]);
			cudaEventSynchronize(events[9]);
			cudaDeviceSynchronize();

			std::clock_t c_end = std::clock();
			writeTimeMeasurement_pinned(events, outputFile, N[n]);
			outputFile << 1000.0*(c_end - c_start)/CLOCKS_PER_SEC << std::endl;
			// Cleanup
			for(auto e:events) cudaEventDestroy(e);
			double  average=0;
			for(int i=0; i< N[n]; ++i){average += (double) host_c_f[i];}
			average = average/N[n];
			std::cout << "sample run: " << run << std::endl;
			std::cout << datatype << " c average: " << average << std::endl; 
			cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
			cudaFreeHost(host_a_f); cudaFreeHost(host_b_f); cudaFreeHost(host_c_f);
			cudaDeviceSynchronize();	
		
		}	
	}
	}
}

I thought for a moment that for halfs the arithemtic operators aren’t properly overloaded with __hadd and so on, so I made a template specification for halfs, but this didn’t change anything. I’d love to know if you can reproduce the poor half performance. Furthermore, I want to ask for your help, if I’m doing anything wrong.

I compiled the code with

nvcc --gpu-architecture=sm_70 -O0 bugreport.cu -o bugtest

and then tested it with

./bugtest 10 half bugtest_results

which performs 10 runs of the same vector size with datatype half and stores it in the pre-existing dir ./bugtest_results

If you want to test it, please take care of line 56 which defines the base path.

Thanks in advance,
Max

There is no 16-bit register on the device. They are all 32-bits. To get double performance of a float, you need to use half2. Where you package two half precision variables in one register. And then use the appropriate intrinsic

https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF2__ARITHMETIC.html#group__CUDA__MATH____HALF2__ARITHMETIC

Thanks for the hint.
Everything works fine now.

Edit: only the first vector size worked fine. half and half2 performs still worse than float. half2 is only slightly better than half.

#include <sys/stat.h>
#include "cuda_runtime_api.h"
#include "cuda.h"
#include "cuda_runtime.h"
#include "cuda_fp16.h" 
#include <chrono>
#include <string>
#include <vector>
#include <iomanip>
#include <ctime>
#include <iostream>
#include <fstream>
#include <string.h>
#include <new>

template<typename T>
__device__ __forceinline__ T computation(T a, T b){
        return ((a * b + a * b + a * b + a * b) / (a * b + a * b + a * b + a * b));
}
template<>
__device__ __forceinline__ half computation(half a, half b){
        return __hdiv(__hadd(__hadd(__hadd(__hmul(a,b),__hmul(a,b)), __hmul(a,b)), __hmul(a,b)), __hadd(__hadd(__hadd(__hmul(a,b),__hmul(a,b)), __hmul(a,b)),__hmul(a,b)));
}
template<>
__device__ __forceinline__ half2 computation(half2 a, half2 b){
        return __h2div(__hadd2(__hmul2(a,b), __hmul2(a,b)) ,__hadd2(__hmul2(a,b),__hmul2(a,b)));
}

template<typename T>
__global__ void add_comp_kernel(T* a, T* b, T* c, long int N){
        int idx = blockIdx.x*blockDim.x+threadIdx.x;
        if(idx < N){
                T a_d = a[idx];
                T b_d = b[idx];
                T c_d = c[idx];
                for(int i = 0; i<2000; ++i){
                        c_d += computation(a_d,b_d);
                }
                c[idx]=c_d;
        }
}

void writeTimeMeasurement_pinned(std::vector<cudaEvent_t> events, std::ofstream &outputFile, long int size);

void writeTimeMeasurement_pinned(std::vector<cudaEvent_t> events, std::ofstream &outputFile, long int size){

    	for(int i = 0; i < events.size()/2; ++i){
    		float millisecs = 0.0;
    		cudaEventElapsedTime(&millisecs ,events[i], events[i+5]);
    		outputFile << millisecs << " \t" ;
    	}

    }

int main(int argc, char** argv){
    	
    	int samplesize = atoi(argv[1]);
    	char* datatype = argv[2];
    	char* outdir= argv[3];
    	long int N[] = {10000000, 50000000, 100000000, 125000000, 250000000, 500000000, 1000000000};
    	std::string outdirpath = "/home/mkoehler/sparse-matrix-gpu/cuda-test/";
    	outdirpath += outdir;
    	std::string outfilename = outdirpath;
    	outfilename += std::string("/");
    	int threads = 1024;

    	std::vector<std::string> eventNames(6);
    	eventNames[0] = "buffer_a";
    	eventNames[1] = "buffer_b";
    	eventNames[2] = "buffer_c_alloc";
    	eventNames[3] = "kernel";
    	eventNames[4] = "buffer_c_to_host";
    	eventNames[5] = "wallclock_time";

    	for(int n = 0; n<7; ++n){
    		std::string size_string = std::to_string(N[n]);
    		std::ofstream outputFile(outfilename+datatype+"_"+size_string);
    	outputFile << "#" << eventNames[0]
    		   << " \t" << eventNames[1]	
    		   << " \t" << eventNames[2]	
    		   << " \t" << eventNames[3]
    		   << " \t" << eventNames[4]	
    		   << " \t" << eventNames[5]
    	   	   << std::endl;
    	if(strcmp(datatype, "double")  == 0){

    		for(int run = 0; run < samplesize; ++run){
    			std::clock_t c_start = std::clock();
    			double *host_a_d, *host_b_d, *host_c_d;
    			cudaMallocHost((void**)&host_a_d,sizeof(double)*N[n]);
    			cudaMallocHost((void**)&host_b_d,sizeof(double)*N[n]);
    			cudaMallocHost((void**)&host_c_d,sizeof(double)*N[n]);
    			double *d_a, *d_b, *d_c;
    			for(int i=0; i< N[n]; ++i){host_a_d[i] = 1;  host_b_d[i]=1;}
    			std::vector<cudaEvent_t> events(10);
    			for(int i=0; i<events.size(); ++i) cudaEventCreate(&events[i]);
    			cudaDeviceSynchronize();

    			cudaEventRecord(events[0]);
    			cudaMalloc(&d_a, N[n]*sizeof(double));
    			cudaMemcpy(d_a, host_a_d, N[n]*sizeof(double), cudaMemcpyHostToDevice);
    			cudaEventRecord(events[5]);
    			cudaEventSynchronize(events[5]);

    			cudaEventRecord(events[1]);
    			cudaMalloc(&d_b, N[n]*sizeof(double));
    			cudaMemcpy(d_b, host_b_d, N[n]*sizeof(double), cudaMemcpyHostToDevice);
    			cudaEventRecord(events[6]);
    			cudaEventSynchronize(events[6]);

    			cudaEventRecord(events[2]);
    			cudaMalloc(&d_c, N[n]*sizeof(double));		
    			cudaEventRecord(events[7]);
    			cudaEventSynchronize(events[7]);
    			
    			cudaEventRecord(events[3]);
    			// Launch add() kernel on GPU
    			add_comp_kernel<<<(int)ceil(float(N[n])/threads),threads>>>(d_a, d_b, d_c, N[n]);
    			cudaEventRecord(events[8]);
    			cudaEventSynchronize(events[8]);

    			cudaEventRecord(events[4]);
    			// Copy result back to the host
    			cudaMemcpy(host_c_d, d_c, N[n]*sizeof(double), cudaMemcpyDeviceToHost);
    			cudaEventRecord(events[9]);
    			cudaEventSynchronize(events[9]);
    			cudaDeviceSynchronize();

    			std::clock_t c_end = std::clock();
    			writeTimeMeasurement_pinned(events, outputFile, N[n]);
    			outputFile << 1000.0*(c_end - c_start)/CLOCKS_PER_SEC << std::endl;
    			// Cleanup
    			for(auto e:events) cudaEventDestroy(e);
    			double average=0;
    			for(int i=0; i< N[n]; ++i){average += host_c_d[i];}
    			average = average/N[n];
    			std::cout << "sample run: " << run << std::endl;
    			std::cout << datatype << " c average: " << average << std::endl; 
    			cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    			cudaFreeHost(host_a_d); cudaFreeHost(host_b_d); cudaFreeHost(host_c_d);
    			cudaDeviceSynchronize();	
    		}
    	}
    	else if(strcmp(datatype,"float") == 0){

    		for(int run = 0; run < samplesize; ++run){
    			std::clock_t c_start = std::clock();
    			float *host_a_d, *host_b_d, *host_c_d;
    			cudaMallocHost((void**)&host_a_d,sizeof(float)*N[n]);
    			cudaMallocHost((void**)&host_b_d,sizeof(float)*N[n]);
    			cudaMallocHost((void**)&host_c_d,sizeof(float)*N[n]);
    			float *d_a, *d_b, *d_c;
    			for(int i=0; i< N[n]; ++i){host_a_d[i] = 1;  host_b_d[i]=1;}
    			std::vector<cudaEvent_t> events(10);
    			for(int i=0; i<events.size(); ++i) cudaEventCreate(&events[i]);
    			cudaDeviceSynchronize();

    			cudaEventRecord(events[0]);
    			cudaMalloc(&d_a, N[n]*sizeof(float));
    			cudaMemcpy(d_a, host_a_d, N[n]*sizeof(float), cudaMemcpyHostToDevice);
    			cudaEventRecord(events[5]);
    			cudaEventSynchronize(events[5]);

    			cudaEventRecord(events[1]);
    			cudaMalloc(&d_b, N[n]*sizeof(float));
    			cudaMemcpy(d_b, host_b_d, N[n]*sizeof(float), cudaMemcpyHostToDevice);
    			cudaEventRecord(events[6]);
    			cudaEventSynchronize(events[6]);

    			cudaEventRecord(events[2]);
    			cudaMalloc(&d_c, N[n]*sizeof(float));		
    			cudaEventRecord(events[7]);
    			cudaEventSynchronize(events[7]);
    			
    			cudaEventRecord(events[3]);
    			// Launch add() kernel on GPU
    			add_comp_kernel<<<(int)ceil(float(N[n])/threads),threads>>>(d_a, d_b, d_c, N[n]);
    			cudaEventRecord(events[8]);
    			cudaEventSynchronize(events[8]);

    			cudaEventRecord(events[4]);
    			// Copy result back to the host
    			cudaMemcpy(host_c_d, d_c, N[n]*sizeof(float), cudaMemcpyDeviceToHost);
    			cudaEventRecord(events[9]);
    			cudaEventSynchronize(events[9]);
    			cudaDeviceSynchronize();

    			std::clock_t c_end = std::clock();
    			writeTimeMeasurement_pinned(events, outputFile, N[n]);
    			outputFile << 1000.0*(c_end - c_start)/CLOCKS_PER_SEC << std::endl;
    			// Cleanup
    			for(auto e:events) cudaEventDestroy(e);
    			double average=0;
    			for(int i=0; i< N[n]; ++i){average += (double)host_c_d[i];}
    			average = average/N[n];
    			std::cout << "sample run: " << run << std::endl;
    			std::cout << datatype << " c average: " << average << std::endl; 
    			cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    			cudaFreeHost(host_a_d); cudaFreeHost(host_b_d); cudaFreeHost(host_c_d);
    			cudaDeviceSynchronize();	
    		}
    	}
    	else if(strcmp(datatype,"half") == 0){
    		for(int run = 0; run < samplesize; ++run){
    			std::clock_t c_start = std::clock();
    			half *host_a_f, *host_b_f, *host_c_f;
    			cudaMallocHost((void**)&host_a_f,sizeof(half)*N[n]);
    			cudaMallocHost((void**)&host_b_f,sizeof(half)*N[n]);
    			cudaMallocHost((void**)&host_c_f,sizeof(half)*N[n]);
    			half *d_a, *d_b, *d_c;
    			for(int i=0; i< N[n]; ++i){host_a_f[i] = __float2half(1);  host_b_f[i]=__float2half(1);}
    			std::vector<cudaEvent_t> events(10);
    			for(int i=0; i<events.size(); ++i) cudaEventCreate(&events[i]);
    			cudaDeviceSynchronize();

    			cudaEventRecord(events[0]);
    			cudaMalloc(&d_a, N[n]*sizeof(half));
    			cudaMemcpy(d_a, host_a_f, N[n]*sizeof(half), cudaMemcpyHostToDevice);
    			cudaEventRecord(events[5]);
    			cudaEventSynchronize(events[5]);

    			cudaEventRecord(events[1]);
    			cudaMalloc(&d_b, N[n]*sizeof(half));
    			cudaMemcpy(d_b, host_b_f, N[n]*sizeof(half), cudaMemcpyHostToDevice);
    			cudaEventRecord(events[6]);
    			cudaEventSynchronize(events[6]);

    			cudaEventRecord(events[2]);
    			cudaMalloc(&d_c, N[n]*sizeof(half));		
    			cudaEventRecord(events[7]);
    			cudaEventSynchronize(events[7]);
    			
    			cudaEventRecord(events[3]);
    			// Launch add() kernel on GPU
    			add_comp_kernel<<<(int)ceil(float(N[n])/threads),threads>>>(d_a, d_b, d_c, N[n]);
    			cudaEventRecord(events[8]);
    			cudaEventSynchronize(events[8]);

    			cudaEventRecord(events[4]);
    			// Copy result back to the host
    			cudaMemcpy(host_c_f, d_c, N[n]*sizeof(half), cudaMemcpyDeviceToHost);
    			cudaEventRecord(events[9]);
    			cudaEventSynchronize(events[9]);
    			cudaDeviceSynchronize();

    			std::clock_t c_end = std::clock();
    			writeTimeMeasurement_pinned(events, outputFile, N[n]);
    			outputFile << 1000.0*(c_end - c_start)/CLOCKS_PER_SEC << std::endl;
    			// Cleanup
    			for(auto e:events) cudaEventDestroy(e);
    			double  average=0;
    			for(int i=0; i< N[n]; ++i){average += (double) host_c_f[i];}
    			average = average/N[n];
    			std::cout << "sample run: " << run << std::endl;
    			std::cout << datatype << " c average: " << average << std::endl; 
    			cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    			cudaFreeHost(host_a_f); cudaFreeHost(host_b_f); cudaFreeHost(host_c_f);
    			cudaDeviceSynchronize();	
    		
    		}	
    	}
        else if(strcmp(datatype,"half2") == 0){
                for(int run = 0; run < samplesize; ++run){
                        std::clock_t c_start = std::clock();
                        half2 *host_a_f, *host_b_f, *host_c_f;
                        cudaMallocHost((void**)&host_a_f,sizeof(half2)*N[n]/2);
                        cudaMallocHost((void**)&host_b_f,sizeof(half2)*N[n]/2);
                        cudaMallocHost((void**)&host_c_f,sizeof(half2)*N[n]/2);
                        half2 *d_a, *d_b, *d_c;
                        float2 init = make_float2(1, 1);
                        for(int i=0; i< N[n]/2; ++i){host_a_f[i] = __float22half2_rn(init); host_b_f[i]=__float22half2_rn(init);}

                        std::vector<cudaEvent_t> events(10);
                        for(int i=0; i<events.size(); ++i) cudaEventCreate(&events[i]);

                        cudaEventRecord(events[0]);
                        cudaMalloc(&d_a, N[n]/2*sizeof(half2));
                        cudaMemcpy(d_a, host_a_f, N[n]/2*sizeof(half2), cudaMemcpyHostToDevice);
                        cudaEventRecord(events[5]);
                        cudaEventSynchronize(events[5]);

                        cudaEventRecord(events[1]);
                        cudaMalloc(&d_b, N[n]/2*sizeof(half2));
                        cudaMemcpy(d_b, host_b_f, N[n]/2*sizeof(half2), cudaMemcpyHostToDevice);
                        cudaEventRecord(events[6]);
                        cudaEventSynchronize(events[6]);

                        cudaEventRecord(events[2]);
                        cudaMalloc(&d_c, N[n]/2*sizeof(half2));
                        cudaEventRecord(events[7]);
                        cudaEventSynchronize(events[7]);

                        cudaEventRecord(events[3]);
                        // Launch add() kernel on GPU
                        add_comp_kernel<<<ceil((N[n])/threads),threads>>>(d_a, d_b, d_c, N[n]/2);
                        cudaEventRecord(events[8]);
                        cudaEventSynchronize(events[8]);

                        cudaEventRecord(events[4]);
                        // Copy result back to the host
                        cudaMemcpy(host_c_f, d_c, N[n]/2*sizeof(half2), cudaMemcpyDeviceToHost);
                        cudaEventRecord(events[9]);
                        cudaEventSynchronize(events[9]);
                        cudaDeviceSynchronize();

                        std::clock_t c_end = std::clock();
                        writeTimeMeasurement_pinned(events, outputFile, N[n]);
                        outputFile << 1000.0*(c_end - c_start)/CLOCKS_PER_SEC << std::endl;
                        // Cleanup
                        double average=0;
                        for(int i=0; i< N[n]/2; ++i){average += (double)__half2float(host_c_f[i].x) + (double)__half2float(host_c_f[i].y);}
                        average = average/ (N[n]);
                        std::cout << "sample run: " << run << std::endl;
                        std::cout << datatype << " c average: " << average << std::endl; 
                        cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
                        cudaFreeHost(host_a_f); cudaFreeHost(host_b_f); cudaFreeHost(host_c_f);
                        for(auto e:events) cudaEventDestroy(e);
                }
        }
    }
}

is the modified program

Result for N=500000000 looks like this:

double
#buffer_a       buffer_b        buffer_c_alloc  kernel  buffer_c_to_host        wallclock_time
325.299         325.022         4.44806         284.611         333.905         7133.86
325.442         325.09          4.42557         284.67          333.944         7136.12
325.285         324.992         4.42438         284.612         333.943         7135.01
325.285         325.011         4.42806         284.62          333.953         7139.27
325.381         325.009         4.4287          284.62          333.961         7133.92
325.434         325.121         4.43475         284.614         333.854         7129.58
325.379         325.022         4.4328          284.609         333.859         7150.98
325.386         325.139         4.42416         284.622         333.856         7129.82
325.394         325.081         4.43254         284.637         333.946         7117.46
325.393         325.016         4.42512         284.617         333.971         7109.87

float
#buffer_a       buffer_b        buffer_c_alloc  kernel  buffer_c_to_host        wallclock_time
162.753         162.634         2.24995         145.235         166.97          3571.63
162.756         162.557         2.24198         144.519         166.989         3572.84
162.852         162.55          2.23546         144.522         166.965         3569.91
162.777         162.598         2.23254         144.499         166.981         3572.64
162.775         162.538         2.25133         144.495         167.002         3571.75
162.764         162.551         2.23798         144.525         167.012         3570.74
162.7           162.553         2.24419         144.513         166.994         3562.9
162.821         162.587         2.24406         144.519         166.985         3562.99
162.741         162.624         2.236           144.522         166.992         3563.44
162.745         162.553         2.23485         144.52          167.008         3563.42

half
#buffer_a       buffer_b        buffer_c_alloc  kernel  buffer_c_to_host        wallclock_time
81.501          81.355          1.18051         611.824         83.4748         2306.3
81.5224         81.3616         1.18387         611.266         83.4755         2306.06
81.5144         81.359          1.17821         611.207         83.4964         2306.24
81.5417         81.3845         1.17546         611.166         83.471          2309.56
81.5311         82.9665         1.18115         611.299         83.4652         2307.28
81.5341         81.3475         1.18016         611.221         83.4752         2305.14
81.5137         81.3504         1.18416         611.306         83.4856         2305.29
81.5515         81.3461         1.17846         611.307         83.4827         2309.48
81.5275         81.3664         1.17862         611.282         83.4895         2309.68
81.5415         81.346          1.18144         611.304         83.4846         2309.82

half2
#buffer_a       buffer_b        buffer_c_alloc  kernel  buffer_c_to_host        wallclock_time
81.5397         81.4081         1.18294         455.23          83.4988         2326.09
81.5225         81.3829         1.18368         455.186         83.6331         2326.7
81.5467         81.3802         1.17638         455.145         83.498          2327.69
81.5592         81.3806         1.18381         460.978         83.4769         2333.55
81.5257         81.371          1.18307         455.195         83.5382         2325.97
81.5472         81.3951         1.17693         455.155         83.4998         2327.3
81.5314         81.3756         1.1777          455.207         83.4832         2325.84
81.5319         81.4044         1.17405         455.208         83.4858         2326.26
81.5494         81.3758         1.17594         455.187         83.4661         2326.97
81.5491         81.3708         1.18554         455.145         83.471          2326.56

Thanks again,
Max

After studying the SASS, it seems evident that the compiler is discovering that the result of computation is loop-invariant in the float and double cases, but not (not discovering that) in the half and half2 cases. Therefore, in the float/double case, it is calling your computation function once, then simply adding that value over and over again in an unrolled loop. In the half and half2 cases, it is calling the computation function 2000 times. The difference here may be due to the use of intrinsics in the computation functions, but I hesitate to speculate. It might arguably be also an example of less-than-optimal behavior by the compiler.

You can convince yourself of this assertion either by studying the SASS, or else by modifying your computation function so that it is not loop invariant. Be advised that this may not be a trivial thing to accomplish. It is generally necessary to study the SASS when you are trying to trick the compiler like this. Naive coding methods of running the exact same computation 2000 times in a loop are likely to run into trouble when used for performance benchmarking.

I’ll also mention that computationally, your half2 variant doesn’t seem to be doing the same work as your other variants, and you are launching twice the needed number of threads in the half2 case.

Here is a simpler test case demonstrating that it is possible for half2 calculations to proceed at approximately twice the throughput of float calculations:

$ cat t1663.cu
#include "cuda_fp16.h"
#define lc 1048576*2

__device__ __forceinline__ float computation(float a, float b, float c){
        return __fmaf_rn(a,b,c);
}

__device__ __forceinline__ half2 computation(half2 a, half2 b, half2 c){
        return __hfma2(a,b,c);
}

template<typename T>
__global__ void add_comp_kernel(T* a, T* b, T* c, long int N){
        int idx = blockIdx.x*blockDim.x+threadIdx.x;
        if(idx < N){
                T a_d = a[idx];
                T b_d = b[idx];
                T c_d = c[idx];
                for(int i = 0; i<lc; ++i){
                        c_d = computation(a_d,b_d, c_d);
                }
                c[idx]=c_d;
        }
}

int main(int argc, char** argv){

        long int N = 1048576ULL;
        int threads = 1024;

        {

                        float *d_a, *d_b, *d_c;

                        cudaMalloc(&d_a, N*sizeof(float));
                        cudaMalloc(&d_b, N*sizeof(float));
                        cudaMalloc(&d_c, N*sizeof(float));
                        add_comp_kernel<<<(int)ceil(float(N)/threads),threads>>>(d_a, d_b, d_c, N);
                        cudaDeviceSynchronize();

        }
        {
                        half2 *d_a, *d_b, *d_c;
                        cudaMalloc(&d_a, N/2*sizeof(half2));
                        cudaMalloc(&d_b, N/2*sizeof(half2));
                        cudaMalloc(&d_c, N/2*sizeof(half2));
                        add_comp_kernel<<<ceil(float(N/2)/threads),threads>>>(d_a, d_b, d_c, N/2);
                        cudaDeviceSynchronize();

        }
}
$ nvcc -o t1663 t1663.cu -arch=sm_70
$ nvprof ./t1663
==8243== NVPROF is profiling process 8243, command: ./t1663
==8243== Profiling application: ./t1663
==8243== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   68.13%  370.54ms         1  370.54ms  370.54ms  370.54ms  void add_comp_kernel<float>(float*, float*, float*, long)
                   31.87%  173.36ms         1  173.36ms  173.36ms  173.36ms  void add_comp_kernel<__half2>(__half2*, __half2*, __half2*, long)
      API calls:   62.06%  543.99ms         2  271.99ms  173.44ms  370.55ms  cudaDeviceSynchronize
                   36.67%  321.46ms         6  53.576ms  230.04us  320.25ms  cudaMalloc
                    0.62%  5.4191ms         4  1.3548ms  689.18us  3.3326ms  cuDeviceTotalMem
                    0.58%  5.0453ms       388  13.003us     388ns  530.55us  cuDeviceGetAttribute
                    0.06%  553.16us         4  138.29us  103.20us  234.79us  cuDeviceGetName
                    0.01%  96.266us         2  48.133us  24.891us  71.375us  cudaLaunchKernel
                    0.00%  27.172us         4  6.7930us  3.4800us  11.906us  cuDeviceGetPCIBusId
                    0.00%  8.2640us         8  1.0330us     404ns  1.6760us  cuDeviceGet
                    0.00%  3.6700us         3  1.2230us     400ns  1.7930us  cuDeviceGetCount
                    0.00%  2.8720us         4     718ns     567ns  1.0600us  cuDeviceGetUuid
$

Hey Robert,

Thanks again for your help. So, a reddit user stressed the fact that divisions are difficult on gpus and suggested to change the division in the kernel and see what happens. Indeed, when I substitute the division with an addition the kernel performance is as expected.

I’m also wondering about the compiler optimisation argument, I clearly see your point, but does the compiler ignore then the -O0 flag? If the compiler argument is valid, then why we don’t see the same behavior with the addition instead of division? The case is kinda solved anyway, but it would be still interesting to know why exactly this happens when using a division in the kernel

According to the documentation:

https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#options-for-altering-compiler-linker-behavior-optimize

“Specify optimization level for host code.”

This option has no effect on device code behavior, when specified as you have it directly to nvcc.

I did not use that option in my testing/analysis. I was able to reproduce your performance observations in the division case, and I already indicated the reason why the half2 performance was approximately 3x slower than the float performance.

As I said, I see your point, but why isn’t it valid when changing the arithmetic operation. All I did to obtain the plot above is to get rid of the division and substitute it by some other operation. Seems for me as if either the division implementation or the compiler optimisation is bad for halfs and vectorized halfs

It seems to me the answer is provided in its entirety in Robert Crovella’s post #4.

You need to take into account the possibility that compiler optimizations may be applied differently based on the floating-point type. To ascertain what compiler optimizations were applied, you need to study the generated SASS (machine code).

Orthogonal side-remark: Divisions are not “difficult” on GPUs. They are, however, emulated, as the simplified hardware of the GPU does not provide hardware instructions for division. It is possible that these emulation sequences are sub-optimal. If you can prove that to be the case (e.g. by providing a fully functional but faster emulation), feel free to file an enhancement request with NVIDIA.

Division, by itself, still seems to roughly obey the half2-can-be-2x-faster-than-float observation:

$ cat t1663.cu
#include "cuda_fp16.h"
#define lc 1048576*2

__device__ __forceinline__ float computation(float a, float b){
        return a/b;
}

__device__ __forceinline__ half2 computation(half2 a, half2 b){
        return __h2div(a,b);
}

template<typename T>
__global__ void add_comp_kernel(T* a, T* b, T* c, long int N){
        int idx = blockIdx.x*blockDim.x+threadIdx.x;
        if(idx < N){
                T a_d = a[idx];
                T b_d = b[idx];
                for(int i = 0; i<lc; ++i){
                        b_d = computation(a_d,b_d);
                }
                c[idx]=b_d;
        }
}

int main(int argc, char** argv){

        long int N = 1048576ULL;
        int threads = 1024;

        {

                        float *d_a, *d_b, *d_c;

                        cudaMalloc(&d_a, N*sizeof(float));
                        cudaMalloc(&d_b, N*sizeof(float));
                        cudaMalloc(&d_c, N*sizeof(float));
                        add_comp_kernel<<<(int)ceil(float(N)/threads),threads>>>(d_a, d_b, d_c, N);
                        cudaDeviceSynchronize();

        }
        {
                        half2 *d_a, *d_b, *d_c;
                        cudaMalloc(&d_a, N/2*sizeof(half2));
                        cudaMalloc(&d_b, N/2*sizeof(half2));
                        cudaMalloc(&d_c, N/2*sizeof(half2));
                        add_comp_kernel<<<ceil(float(N/2)/threads),threads>>>(d_a, d_b, d_c, N/2);
                        cudaDeviceSynchronize();

        }
}
$ nvcc -arch=sm_70 -o t1663 t1663.cu
$ nvprof ./t1663
==9721== NVPROF is profiling process 9721, command: ./t1663
==9721== Profiling application: ./t1663
==9721== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   60.01%  7.14270s         1  7.14270s  7.14270s  7.14270s  void add_comp_kernel<float>(float*, float*, float*, long)
                   39.99%  4.76070s         1  4.76070s  4.76070s  4.76070s  void add_comp_kernel<__half2>(__half2*, __half2*, __half2*, long)
      API calls:   97.34%  11.9035s         2  5.95173s  4.76071s  7.14274s  cudaDeviceSynchronize
                    2.56%  313.06ms         6  52.177ms  209.44us  311.82ms  cudaMalloc
                    0.05%  6.2274ms       388  16.049us     360ns  1.8219ms  cuDeviceGetAttribute
                    0.04%  5.3818ms         4  1.3455ms  631.84us  3.3347ms  cuDeviceTotalMem
                    0.00%  456.83us         4  114.21us  104.03us  134.55us  cuDeviceGetName
                    0.00%  136.55us         2  68.276us  64.121us  72.432us  cudaLaunchKernel
                    0.00%  39.149us         4  9.7870us  3.3690us  19.646us  cuDeviceGetPCIBusId
                    0.00%  8.3230us         8  1.0400us     526ns  1.5010us  cuDeviceGet
                    0.00%  5.5780us         3  1.8590us     342ns  3.4150us  cuDeviceGetCount
                    0.00%  3.1000us         4     775ns     646ns     860ns  cuDeviceGetUuid
$

Note that the kernel runtimes here are approximately 20x longer than the previous case, so I would expect the arithmetic performance to be dominated by the division operation, if it is surrounded by a small amount of other arithmetic.

When I craft a test case that attempts to heel closely to your original test case, but avoids the problems I have now identified, I witness a similar performance ratio of approximately 2x, even with division in computation:

$ cat t1663.cu
#include "cuda_fp16.h"
#define lc (1024*2)

__device__ __forceinline__ float computation(float a, float b, float c){
        return ((a * b + a * b + a * b + a * b) / (c * b + a * b + a * b + a * b));
}
__device__ __forceinline__ half2 computation(half2 a, half2 b, half2 c){
        half2 ab = __hmul2(a,b);
        half2 abpab = __hadd2(ab,ab);
        return __h2div(__hadd2(abpab, abpab) , __hadd2(__hadd2(__hmul2(c,b),ab),abpab));
}

template<typename T>
__global__ void add_comp_kernel(T* a, T* b, T* c, long int N){
        int idx = blockIdx.x*blockDim.x+threadIdx.x;
        if(idx < N){
                T a_d = a[idx];
                T b_d = b[idx];
                T c_d = c[idx];
                for(int i = 0; i<lc; ++i){
                        c_d = computation(a_d,b_d, c_d);
                }
                c[idx]=c_d;
        }
}

int main(int argc, char** argv){

        long int N = 1048576ULL;
        int threads = 1024;

        {

                        float *d_a, *d_b, *d_c;

                        cudaMalloc(&d_a, N*sizeof(float));
                        cudaMalloc(&d_b, N*sizeof(float));
                        cudaMalloc(&d_c, N*sizeof(float));
                        add_comp_kernel<<<(int)ceil(float(N)/threads),threads>>>(d_a, d_b, d_c, N);
                        cudaDeviceSynchronize();

        }
        {
                        half2 *d_a, *d_b, *d_c;
                        cudaMalloc(&d_a, N/2*sizeof(half2));
                        cudaMalloc(&d_b, N/2*sizeof(half2));
                        cudaMalloc(&d_c, N/2*sizeof(half2));
                        add_comp_kernel<<<ceil(float(N/2)/threads),threads>>>(d_a, d_b, d_c, N/2);
                        cudaDeviceSynchronize();

        }
}
$ nvcc -arch=sm_70 -o t1663 t1663.cu
$ nvprof ./t1663
==9963== NVPROF is profiling process 9963, command: ./t1663
==9963== Profiling application: ./t1663
==9963== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   64.49%  8.2775ms         1  8.2775ms  8.2775ms  8.2775ms  void add_comp_kernel<float>(float*, float*, float*, long)
                   35.51%  4.5580ms         1  4.5580ms  4.5580ms  4.5580ms  void add_comp_kernel<__half2>(__half2*, __half2*, __half2*, long)
      API calls:   92.50%  310.58ms         6  51.764ms  235.28us  309.40ms  cudaMalloc
                    3.83%  12.863ms         2  6.4317ms  4.5659ms  8.2976ms  cudaDeviceSynchronize
                    1.89%  6.3353ms       388  16.328us     335ns  1.9007ms  cuDeviceGetAttribute
                    1.59%  5.3517ms         4  1.3379ms  631.42us  3.3407ms  cuDeviceTotalMem
                    0.14%  463.98us         4  116.00us  97.013us  140.38us  cuDeviceGetName
                    0.03%  104.22us         2  52.110us  32.347us  71.873us  cudaLaunchKernel
                    0.01%  27.448us         4  6.8620us  3.4450us  12.443us  cuDeviceGetPCIBusId
                    0.00%  10.205us         8  1.2750us     464ns  3.1470us  cuDeviceGet
                    0.00%  5.8380us         3  1.9460us     388ns  3.6000us  cuDeviceGetCount
                    0.00%  2.6840us         4     671ns     575ns     846ns  cuDeviceGetUuid
$

I suspect, for your response in #6, that you have not modified your code to account for the first issue I mention (in the first paragraph of my response in #4). As I indicated there, the (float) code is not doing what your source code seems to desire (nor is it doing what the half2 code is doing), and the behavior is substantially different between float and half2 cases. It may be that by substituting the division with addition, that the behavior of the half2 case also falls into the same “trap” as the float code, and so becomes 2x faster than the float case. However, I claim that that example is not doing what you intended, for the reasons I’ve already stated, and it’s not something I would do performance analysis on; it is misleading in my opinion.