TX2 with FP16 Running Slower than FP32

I’m trying to do some computations in FP16 on a TX2 board, but they are running slower than in FP32. The values are already converted to half2 before they are passed to the kernel.

__global__ void compute(half2 *in, half2 *out)
{
const int idx = blockIdx.x*blockDim.x+threadIdx.x+blockIdx.y*N; //coalesced memory access

// option 1
out[idx].x = __hadd(in[idx].x, in[idx].x);
out[idx].y = __hadd(in[idx].y, in[idx].y);
// option 2
out[idx] = __hadd2(in[idx], in[idx]);
}
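
For reference, the upstream conversion to half2 uses __floats2half2_rn, roughly like this sketch (convert, src, and dst are placeholder names):

__global__ void convert(const float2 *src, half2 *dst)
{
const int idx = blockIdx.x*blockDim.x+threadIdx.x;
dst[idx] = __floats2half2_rn(src[idx].x, src[idx].y); // pack two floats into one half2
}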

Option 1 takes 10.7 ms for all of the input data, and option 2 takes 18.19 ms. Replacing the data with float2 and adding in the same per-component format as option 1 takes only 6.8 ms. There are about 2M data points in all, if that makes a difference. I used the Visual Profiler in Nsight for the kernel timing.

Any ideas why FP16 is taking longer than FP32? Multiplication operations also take longer.

I’m compiling with sm_60 in CUDA 9.0.

Hi gisselle.irad, the correct GPU arch for Jetson TX2 is sm_62. Does that help?
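
If you’re invoking nvcc directly, the flag would look like this (compute.cu and compute are just placeholder names here):

nvcc -gencode arch=compute_62,code=sm_62 -o compute compute.cu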

In Nsight 9.0, the settings only go up to 6.1 (which helped a bit, but it was still slower than expected); 6.2 is not listed in the Properties/Settings/CUDA tab. Nsight 9.1 only lists up to 6.0. I’ve been using 9.0 because that is what is available for the TX2 through JetPack, and the versions need to match so I can profile the application. Is there somewhere else I can find the 6.2 setting?

Hmm I’m not familiar with using Nsight to set the compiler’s GPU arch, can you try doing it through Makefile or CMake like so? https://github.com/dusty-nv/jetson-inference/blob/e168b746d540fd739a6fa6ba652b5a5e8c6ffa08/CMakeLists.txt#L37
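
The relevant part reduces to something like this sketch (targeting sm_62 only; the linked file covers other archs too):

set(
	CUDA_NVCC_FLAGS
	${CUDA_NVCC_FLAGS};
	-gencode arch=compute_62,code=sm_62
)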

I don’t have a makefile outside of Nsight, but I was able to get it to compile in Nsight with sm_62 (which required re-creating the project in the IDE from scratch - 6.2 was available in the initial options, but not in properties after I had chosen the max 6.0 the first time).

I reran the profiling after compiling with 6.2 and the compute time did not change (for fp16 or fp32).

OK. Looking over your kernel, you may want to start by simplifying the indexing and the block/grid dimensions to one dimension, to prove that the FP16 optimizations are having an impact for you.

// idx = blockIdx.x*blockDim.x+threadIdx.x+blockIdx.y*N; //coalesced memory access
const int idx = blockIdx.x*blockDim.x+threadIdx.x;  // 1D indexing for testing

What block and grid dimensions are you launching with?

Then I also recommend saving the global memory values to an intermediate register, like so:

const half2 input = in[idx];
half2 output;

// option 1
output.x = __hadd(input.x,input.x);
output.y = __hadd(input.y,input.y);
// option 2
output = __hadd2(input,input); 
out[idx] = output;

You may also be interested in trying the half-precision example given here: https://devblogs.nvidia.com/mixed-precision-programming-cuda-8/

Launching with 512 threads per block; the grid dimensions are [2,250].

I swapped out to this:

const int idx = blockIdx.x*blockDim.x+threadIdx.x;
const half2 in = d_in[idx];
half2 out;
out.x = __hadd(in.x, in.x);
out.y = __hadd(in.y, in.y);
d_out[idx] = out;

It now takes ~17.5ms (longer than the first time).

I have seen that example, but I haven’t run it yet. I’ll try that too.

Just a thought, what impact is there if you change the grid dimensions to [250,1] (i.e. one-dimensional)?
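
i.e. a launch roughly like this sketch (d_in/d_out stand in for your buffers):

dim3 grid(250, 1);  // one-dimensional grid
compute<<<grid, 512>>>(d_in, d_out);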

Changing the grid dimensions to [250,1] gives:
half: 95 us
float: 59 us

I wrote the following in a separate project to test just the half vs float add time (so it’s definitely independent of the rest of my program), and half still takes longer (134 vs 50 us).

#include <cuda_fp16.h>
#include <helper_cuda.h>  // checkCudaErrors, from the CUDA samples

#define SAMPS (1024)

// kernels are defined below main(), so declare them here
__global__ void load(float2 *floating, half2 *halving);
__global__ void addh(half2 *halving);
__global__ void addf(float2 *floating);

int main()
{
	float2 *floating;
	checkCudaErrors(cudaMallocManaged(&floating, sizeof(float2)*SAMPS));

	half2 *halving;
	checkCudaErrors(cudaMallocManaged(&halving, sizeof(half2)*SAMPS));

	load<<<1,1024>>>(floating, halving);

	checkCudaErrors(cudaDeviceSynchronize());

	addh<<<1,1024>>>(halving);

	addf<<<1,1024>>>(floating);

	checkCudaErrors(cudaDeviceSynchronize());

	checkCudaErrors(cudaFree(floating));
	checkCudaErrors(cudaFree(halving));
}

__global__ void load(float2 *floating, half2 *halving)
{
	const int idx = blockDim.x*blockIdx.x+threadIdx.x;
	floating[idx] = make_float2(1.5f, 1.5f);
	halving[idx] = __floats2half2_rn(1.5f, 1.5f);
}

__global__ void addh(half2 *halving)
{
	const int idx = blockDim.x*blockIdx.x+threadIdx.x;
	const half2 in = halving[idx];
	halving[idx].x = __hadd(in.x, in.x);
	halving[idx].y = __hadd(in.y, in.y);
}

__global__ void addf(float2 *floating)
{
	const int idx = blockDim.x*blockIdx.x+threadIdx.x;
	const float2 in = floating[idx];
	floating[idx].x = in.x + in.x;
	floating[idx].y = in.y + in.y;
}

Hi,

We have checked the performance with cudaEvent_t, averaged over 1000 executions.

...
cudaEvent_t start, stop;
float kernel_time;

cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start,0);
for(int i=0; i<ITER; i++)
{
    addh<<<1,1024>>>(halving);
}
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&kernel_time, start, stop);
std::cout << "Half : " << kernel_time/ITER << " ms" << std::endl;
...

Results for float and half in our environment are similar:
Half : 0.0121824 ms
Float: 0.0118529 ms

Could you also test this in your environment?
Thanks.

Half : 0.142135 ms
Float : 0.0567865 ms

Does this suggest there is probably something wrong in my environment?
What part of the environment would cause that kind of slowdown?

Hi,

Have you maximized the CPU/GPU clock?

#please apply these commands in order
sudo nvpmodel -m 0
sudo ./jetson_clocks.sh
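
You can confirm the active power mode afterward with:

sudo nvpmodel -q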

Please let us know the result.
Thanks.

That does fix the overall runtime, so now it is close to the numbers you reported.

Half : 0.015749 ms
Float : 0.0127058 ms

The original program now runs in a usable amount of time, so thanks!

The half precision still takes longer than the float though - why is that? I would have thought that reducing precision would decrease runtime.

Hi,

Have you also measured the packed operation?
ex.
__device__ __half2 __hadd2(const __half2 a, const __half2 b);
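
For example, a sketch of your addh kernel rewritten with the packed add (addh2 is a placeholder name):

__global__ void addh2(half2 *halving)
{
	const int idx = blockDim.x*blockIdx.x+threadIdx.x;
	const half2 in = halving[idx];
	halving[idx] = __hadd2(in, in);  // adds both halves in a single packed instruction
}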

Could you give it a try and share result with us?

Thanks.

I ran the same function with the two __hadd commands replaced with __hadd2, and it took 0.0139777 ms. So that is shorter than the separate __hadd lines, but still longer than adding floats.

Hi,

Not sure if this is related to unified memory.
Could you repeat your experiment with cudaMalloc memory?
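
i.e. something like this (a sketch; your load kernel already initializes the buffers on the device, so no cudaMemcpy is needed):

half2 *halving;
checkCudaErrors(cudaMalloc(&halving, sizeof(half2)*SAMPS));  // plain device memory instead of managed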

Thanks

I swapped both cudaMallocManaged statements for cudaMalloc, with no other changes:

With __hadd2, Half : 0.0139182 ms.
Float : 0.0123827 ms

Hi,

We want to discuss this issue with our internal CUDA team.
Could you share the complete source of your last experiment?

Thanks.

I found this thread interesting because I have recently accelerated my application on the TX2 using half floats. I observed a runtime reduction in my application close to the expected 2x speedup.

I ran the code posted earlier here and observed the same results as reported, with the half-precision runtime either the same as or slower than the single-precision one. However, if I use half2 and __hadd2(), launch more blocks, and give the kernel more work to do with an internal loop, I observe close to the expected 2x speedup with half floats.

Half : 26.3336 ms
Float : 40.4393 ms

Troy.

#include <stdio.h>
#include <iostream>
#include "cuda.h"
#include "cuda_fp16.h"

inline
cudaError_t checkCuda(cudaError_t result)
{
    if (result != cudaSuccess) {
        fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
    }
    return result;
}

#define ITER 100

__global__ void load(float2 *floating, half2 *halving)
{
    const int idx = blockDim.x*blockIdx.x+threadIdx.x;
    floating[idx] = make_float2(1.5f, 1.5f);
    halving[idx] = __floats2half2_rn(1.5f, 1.5f);
}

__global__ void addh(half2 *halving)
{
    const int idx = blockDim.x*blockIdx.x+threadIdx.x;
    for(int i=0; i<100; i++)
        halving[idx] = __hadd2(halving[idx], halving[idx]);
}

__global__ void addf(float2 *floating)
{
    const int idx = blockDim.x*blockIdx.x+threadIdx.x;
    for(int i=0; i<100; i++)
    {
        floating[idx].x = floating[idx].x + floating[idx].x;
        floating[idx].y = floating[idx].y + floating[idx].y;
    }
}

int main()
{
    const int blockSize = 512;
    const int n = 102400 * blockSize;
    
    float2 *floating;
    checkCuda(cudaMallocManaged(&floating, sizeof(float2)*n));

    half2 *halving;
    checkCuda(cudaMallocManaged(&halving, sizeof(half2)*n));

    cudaEvent_t start, stop;
    float kernel_time;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    load<<<n/blockSize, blockSize>>>(floating, halving);

    checkCuda(cudaDeviceSynchronize());

    cudaEventRecord(start,0);
    for(int i=0; i<ITER; i++)
    {
        addh<<<n/blockSize, blockSize>>>(halving);
    }
    cudaEventRecord(stop,0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&kernel_time, start, stop);
    std::cout << "Half : " << kernel_time/ITER << " ms" << std::endl;
    
    cudaEventRecord(start,0);
    for(int i=0; i<ITER; i++)
    {
        addf<<<n/blockSize, blockSize>>>(floating);
    }
    cudaEventRecord(stop,0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&kernel_time, start, stop);
    std::cout << "Float : " << kernel_time/ITER << " ms" << std::endl;

    checkCuda(cudaDeviceSynchronize());

    checkCuda(cudaFree(floating));
    checkCuda(cudaFree(halving));
}