Why is FP16 much slower than FP32 on the Jetson TX2?
CUDA 10.0
I've tried the following, but it made no difference.
I need help.
sudo nvpmodel -m 0
sudo ./jetson_clocks.sh
The code looks like this:
#include <stdio.h>
#include <iostream>
#include "cuda.h"
#include "cuda_fp16.h"
// Reports a failed CUDA runtime call on stderr and hands the status back.
//
// result: status code returned by a CUDA runtime API call.
// Returns the same status unchanged, so the call can be wrapped inline.
// Note: a failure is only logged; execution is not stopped here.
inline
cudaError_t checkCuda(cudaError_t result)
{
    // Fast path: nothing to report.
    if (result == cudaSuccess) {
        return result;
    }

    fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
    return result;
}
// Iteration-count macro.
// NOTE(review): nothing in the visible code references ITER; the add kernels
// spell out a literal count of 100 in their loops — confirm whether they were
// meant to use this macro instead.
#define ITER 100
// Initialisation kernel: each thread writes the pair (1.5, 1.5) into one
// element of both buffers.
//
// floating: output buffer of float2 values.
// halving:  output buffer of half2 values (floats rounded to half precision).
//
// Launch layout: 1D grid of 1D blocks; thread i handles element i.
// Precondition: there is no bounds check, so gridDim.x * blockDim.x must not
// exceed the number of elements in either buffer.
__global__ void load(float2 *floating, half2 *halving)
{
    const int idx = blockDim.x*blockIdx.x+threadIdx.x;
    // make_float2 replaces the C99 compound literal, which is only a compiler
    // extension in C++ and is not accepted by every host compiler.
    floating[idx] = make_float2(1.5f, 1.5f);
    halving[idx] = __floats2half2_rn(1.5f, 1.5f);
}
// Half-precision workload: each thread adds its half2 element to itself
// 100 times, storing the sum back into the same element every iteration.
//
// halving: buffer of half2 values, updated in place.
//
// Launch layout: 1D grid of 1D blocks; thread i handles element i.
// Precondition: no bounds check — the launch must not cover more threads
// than the buffer has elements.
__global__ void addh(half2 *halving)
{
    const int element = blockDim.x*blockIdx.x+threadIdx.x;
    half2 &slot = halving[element];

    int remaining = 100;
    while (remaining-- > 0) {
        slot = __hadd2(slot, slot);
    }
}
// Single-precision workload: each thread adds both components of its float2
// element to themselves 100 times, storing back into the same element.
//
// floating: buffer of float2 values, updated in place.
//
// Launch layout: 1D grid of 1D blocks; thread i handles element i.
// Precondition: no bounds check — the launch must not cover more threads
// than the buffer has elements.
__global__ void addf(float2 *floating)
{
    const int element = blockDim.x*blockIdx.x+threadIdx.x;
    float2 &slot = floating[element];

    int remaining = 100;
    while (remaining-- > 0) {
        slot.x += slot.x;
        slot.y += slot.y;
    }
}
// Benchmark driver.
//
// Allocates one managed float2 buffer and one managed half2 buffer, fills
// them with the load kernel, then uses CUDA events to time 10 back-to-back
// launches of addh followed by 10 back-to-back launches of addf. The elapsed
// time of each batch (total for all 10 launches) is printed in milliseconds.
//
// Returns 0 when every CUDA call and kernel launch succeeded, 1 otherwise.
int main()
{
    const int blockSize = 512;
    // n is an exact multiple of blockSize; the kernels rely on this because
    // they perform no bounds check.
    const int n = 1024*100 * blockSize;
    const int gridSize = n/blockSize;
    const int launches = 10;
    bool failed = false;

    // Without valid buffers the kernels would write through bad pointers,
    // so allocation failures end the run immediately.
    float2 *floating = NULL;
    if (checkCuda(cudaMallocManaged(&floating, sizeof(float2)*n)) != cudaSuccess) {
        return 1;
    }
    half2 *halving = NULL;
    if (checkCuda(cudaMallocManaged(&halving, sizeof(half2)*n)) != cudaSuccess) {
        checkCuda(cudaFree(floating));
        return 1;
    }

    cudaEvent_t start, stop;
    if (checkCuda(cudaEventCreate(&start)) != cudaSuccess) {
        checkCuda(cudaFree(floating));
        checkCuda(cudaFree(halving));
        return 1;
    }
    if (checkCuda(cudaEventCreate(&stop)) != cudaSuccess) {
        checkCuda(cudaEventDestroy(start));
        checkCuda(cudaFree(floating));
        checkCuda(cudaFree(halving));
        return 1;
    }

    // Populate both buffers and wait, so initialisation is not part of the
    // timed regions below.
    load<<<gridSize, blockSize>>>(floating, halving);
    failed |= (checkCuda(cudaGetLastError()) != cudaSuccess);
    failed |= (checkCuda(cudaDeviceSynchronize()) != cudaSuccess);

    // Initialised so a failed elapsed-time query never prints garbage.
    float kernel_time = 0.0f;

    // ---- half2 batch ----
    failed |= (checkCuda(cudaEventRecord(start,0)) != cudaSuccess);
    for(int i=0; i<launches; i++)
    {
        addh<<<gridSize, blockSize>>>(halving);
        //addh<<<1, 512>>>(halving);
    }
    // Launch-configuration errors are only visible through cudaGetLastError.
    failed |= (checkCuda(cudaGetLastError()) != cudaSuccess);
    failed |= (checkCuda(cudaEventRecord(stop,0)) != cudaSuccess);
    failed |= (checkCuda(cudaEventSynchronize(stop)) != cudaSuccess);
    failed |= (checkCuda(cudaEventElapsedTime(&kernel_time, start, stop)) != cudaSuccess);
    std::cout << "Half : " << kernel_time << " ms" << std::endl;

    // ---- float2 batch ----
    kernel_time = 0.0f;
    failed |= (checkCuda(cudaEventRecord(start,0)) != cudaSuccess);
    for(int i=0; i<launches; i++)
    {
        addf<<<gridSize, blockSize>>>(floating);
        //addf<<<1, 512>>>(floating);
    }
    failed |= (checkCuda(cudaGetLastError()) != cudaSuccess);
    failed |= (checkCuda(cudaEventRecord(stop,0)) != cudaSuccess);
    failed |= (checkCuda(cudaEventSynchronize(stop)) != cudaSuccess);
    failed |= (checkCuda(cudaEventElapsedTime(&kernel_time, start, stop)) != cudaSuccess);
    std::cout << "Float : " << kernel_time << " ms" << std::endl;

    // Surface any asynchronous execution error before tearing down.
    failed |= (checkCuda(cudaDeviceSynchronize()) != cudaSuccess);

    // Release the events as well as the buffers.
    failed |= (checkCuda(cudaEventDestroy(start)) != cudaSuccess);
    failed |= (checkCuda(cudaEventDestroy(stop)) != cudaSuccess);
    failed |= (checkCuda(cudaFree(floating)) != cudaSuccess);
    failed |= (checkCuda(cudaFree(halving)) != cudaSuccess);

    return failed ? 1 : 0;
}