FP16 vs FP32

Why is FP16 much slower than FP32 on the Jetson TX2?
CUDA 10.0
I’ve tried the following, but nothing helps:

I need help

sudo nvpmodel -m 0
sudo ./jetson_clocks.sh

The code is like this:

#include <stdio.h>
#include <iostream>
#include "cuda.h"
#include "cuda_fp16.h"

// Checks the result of a CUDA runtime API call.
//
// On failure the error is reported on stderr and the process exits with a
// non-zero status: CUDA runtime errors are sticky, so continuing would make
// every subsequent API call — and any timing derived from it — meaningless.
//
// Returns the (successful) result so the helper can wrap calls inline:
//     checkCuda(cudaMallocManaged(&p, bytes));
inline
cudaError_t checkCuda(cudaError_t result)
{
    if (result != cudaSuccess) {
        fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
        exit(EXIT_FAILURE);  // don't limp on with a sticky error
    }
    return result;
}

#define ITER 100

// Initialisation kernel: fills both benchmark buffers with the constant 1.5
// (stored as a packed pair in each float2 / half2 element).
//
// Launch layout: 1-D grid, one thread per element. No tail guard — the host
// is expected to launch exactly n/blockSize blocks with n a multiple of
// blockSize, so the grid covers the buffers exactly.
__global__ void load(float2 *floating, half2 *halving)
{
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    floating[gid] = make_float2(1.5f, 1.5f);
    halving[gid]  = __float2half2_rn(1.5f);  // replicates 1.5 into both halves
}

// FP16 benchmark kernel: doubles each half2 element in place, 100 times.
//
// Every iteration performs a global read-modify-write around a packed
// __hadd2 (two FP16 adds per instruction), so the kernel exercises both
// memory traffic and FP16 arithmetic throughput.
__global__ void addh(half2 *halving)
{
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    int iter = 100;
    while (iter-- > 0)
        halving[gid] = __hadd2(halving[gid], halving[gid]);
}

// FP32 benchmark kernel: doubles each float2 element in place, 100 times,
// one .x/.y component at a time (two scalar FP32 adds per iteration) — the
// scalar counterpart of the packed half2 kernel above.
__global__ void addf(float2 *floating)
{
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    float2 &elem = floating[gid];  // alias the element in global memory
    for (int iter = 0; iter != 100; ++iter)
    {
        elem.x += elem.x;
        elem.y += elem.y;
    }
}

// Times `reps` back-to-back launches of the FP16 (half2) and FP32 (float2)
// doubling kernels over identically sized buffers and prints the average
// time per launch for each batch.
//
// NOTE: compile with optimizations enabled (no -G). Device-side debug
// builds disable optimization and can make FP16 look far slower than FP32.
int main()
{
    const int blockSize = 512;
    const int n = 1024 * 100 * blockSize;  // element count; multiple of blockSize
    const int reps = 10;                   // kernel launches per timed batch

    // Managed allocations keep the sample simple; on the TX2 the CPU and
    // integrated GPU share physical memory anyway.
    float2 *floating;
    checkCuda(cudaMallocManaged(&floating, sizeof(float2) * n));

    half2 *halving;
    checkCuda(cudaMallocManaged(&halving, sizeof(half2) * n));

    cudaEvent_t start, stop;
    float kernel_time;  // milliseconds, total over `reps` launches
    checkCuda(cudaEventCreate(&start));
    checkCuda(cudaEventCreate(&stop));

    // Initialise both buffers on the device; this also absorbs first-touch
    // page-migration cost before the timed regions.
    load<<<n / blockSize, blockSize>>>(floating, halving);
    checkCuda(cudaGetLastError());       // catch launch-configuration errors
    checkCuda(cudaDeviceSynchronize());  // catch execution errors before timing

    // --- FP16 batch ---
    checkCuda(cudaEventRecord(start, 0));
    for (int i = 0; i < reps; i++)
    {
        addh<<<n / blockSize, blockSize>>>(halving);
    }
    checkCuda(cudaGetLastError());
    checkCuda(cudaEventRecord(stop, 0));
    checkCuda(cudaEventSynchronize(stop));
    checkCuda(cudaEventElapsedTime(&kernel_time, start, stop));
    std::cout << "Half : " << kernel_time / reps << " ms" << std::endl;

    // --- FP32 batch ---
    checkCuda(cudaEventRecord(start, 0));
    for (int i = 0; i < reps; i++)
    {
        addf<<<n / blockSize, blockSize>>>(floating);
    }
    checkCuda(cudaGetLastError());
    checkCuda(cudaEventRecord(stop, 0));
    checkCuda(cudaEventSynchronize(stop));
    checkCuda(cudaEventElapsedTime(&kernel_time, start, stop));
    std::cout << "Float : " << kernel_time / reps << " ms" << std::endl;

    checkCuda(cudaDeviceSynchronize());

    // Release timing events and device memory.
    checkCuda(cudaEventDestroy(start));
    checkCuda(cudaEventDestroy(stop));
    checkCuda(cudaFree(floating));
    checkCuda(cudaFree(halving));
    return 0;
}

It is advantageous to ask questions about the Jetson TX2 in the sub-forum dedicated to it: [url]https://devtalk.nvidia.com/default/board/188/jetson-tx2/[/url]

From what I understand, most consumer Pascal-architecture GPUs (compute capability 6.1) have low throughput for FP16 operations, specifically 1/64 the FP32 throughput. In the Pascal family, the P100 (compute capability 6.0) and the Jetson TX2’s integrated GPU (compute capability 6.2) feature full FP16 throughput.

According to the CUDA Programming Guide section 5.4.1, the Pascal GPU in the Jetson TX2 with compute capability 6.2 should have full throughput for FP16 instructions (i.e same throughput as FP32 instructions, but able to handle two operations per instruction; 2-way SIMD).

[url=https://docs.nvidia.com/cuda/cuda-c-programming-guide/]Programming Guide :: CUDA Toolkit Documentation[/url]

How are you compiling the code? (What is your exact compile command line?)
How are you running the code? (What is your exact execution command line?)
What is the exact output generated when you run it on your TX2?
Which Jetpack is installed on your TX2?

I solved this problem — it turned out I had compiled with the device debug flag (-G), which disables device-code optimization and made FP16 appear much slower.