TX2 with FP16 Running Slower than FP32

Here’s the full code that I’ve been running.

// main.cu:
//---------------------------------------------------
#include <iostream>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "helper_cuda.h"
#include "fxns.cuh"

#define SAMPS (1024)
#define ITER 1000

// Benchmarks the FP16 (half2) add kernel on the device:
//  1. allocate one float2 and one half2 buffer of SAMPS elements,
//  2. fill both on the device with load<<<1,1024>>>,
//  3. time ITER back-to-back launches of addh with CUDA events,
//  4. report the mean per-launch time (dividing by ITER amortises
//     the fixed kernel-launch overhead the forum thread discusses).
int main()
{
	float2 *floating;
	// checkCudaErrors(cudaMallocManaged(&floating, sizeof(float2)*SAMPS));
	checkCudaErrors(cudaMalloc(&floating, sizeof(float2) * SAMPS));

	half2 *halving;

	// checkCudaErrors(cudaMallocManaged(&halving, sizeof(half2)*SAMPS));
	checkCudaErrors(cudaMalloc(&halving, sizeof(half2) * SAMPS));

	// One thread per element: SAMPS == 1024 matches the <<<1,1024>>> launch.
	load<<<1, 1024>>>(floating, halving);
	// Kernel launches do not return errors; catch bad-config errors here.
	checkCudaErrors(cudaGetLastError());

	cudaEvent_t start, stop;
	float kernel_time;

	checkCudaErrors(cudaEventCreate(&start));
	checkCudaErrors(cudaEventCreate(&stop));

	checkCudaErrors(cudaEventRecord(start, 0));
	for (int i = 0; i < ITER; i++)
	{
		addh<<<1, 1024>>>(halving);

		// addf<<<1,1024>>>(floating);  // FP32 variant for comparison
	}
	checkCudaErrors(cudaEventRecord(stop, 0));
	// cudaEventSynchronize blocks until all ITER launches have finished.
	checkCudaErrors(cudaEventSynchronize(stop));
	checkCudaErrors(cudaEventElapsedTime(&kernel_time, start, stop));
	std::cout << "Half : " << kernel_time / ITER << " ms" << std::endl;

	checkCudaErrors(cudaDeviceSynchronize());

	// Release events and device memory before exit.
	checkCudaErrors(cudaEventDestroy(start));
	checkCudaErrors(cudaEventDestroy(stop));
	checkCudaErrors(cudaFree(floating));
	checkCudaErrors(cudaFree(halving));

	return 0;
}
//---------------------------------------------------

// fxns.cuh:
//---------------------------------------------------
/*
 * fxns.cuh
 *
 *  Created on: May 29, 2018
 *      Author: irad
 */

#ifndef FXNS_CUH_
#define FXNS_CUH_

#include <cuda_runtime.h>
#include <cuda_fp16.h>
// NOTE(review): the original post had two further #include lines whose header
// names were eaten by the forum's markdown renderer. Nothing in these
// declarations needs them; <iostream> (used by main.cu) is the likely
// candidate — confirm against the original project.

// All three kernels are launched <<<1,1024>>> (1-D) by main.cu; each thread
// handles exactly one element, so the launch must supply one thread per
// buffer element (no bounds check is performed in the kernels).
__global__ void load(float2 *floating, half2 *halving);
__global__ void addh(half2 *halving);
__global__ void addf(float2 *floating);

#endif /* FXNS_CUH_ */
//---------------------------------------------------

// fxns.cu:
//---------------------------------------------------
/*
 * fxns.cu
 *
 *  Created on: May 29, 2018
 *      Author: irad
 */

#include "fxns.cuh"

// Fills both device buffers with the constant 1.5 in every component.
// Expects a launch that supplies exactly one thread per element (main.cu
// uses <<<1,1024>>> with SAMPS == 1024); there is no bounds check.
__global__ void load(float2 *floating, half2 *halving)
{
	// Flat index. The blockDim.y * blockIdx.y term is a no-op for the 1-D
	// <<<1,1024>>> launch (blockIdx.y is always 0); for a real 2-D grid this
	// formula would be wrong — TODO confirm intent if the launch changes.
	const int idx = blockDim.x * blockIdx.x + threadIdx.x + blockDim.y * blockIdx.y;
	floating[idx] = (float2){1.5f, 1.5f};
	// Round-to-nearest conversion of the two floats into one packed half2.
	halving[idx] = __floats2half2_rn(1.5f, 1.5f);
}

// Doubles each half2 element in place (halving[idx] += halving[idx]),
// using the vectorised __hadd2 so both FP16 lanes are added in one
// instruction. One thread per element; no bounds check (see load).
__global__ void addh(half2 *halving)
{
	const int idx = blockDim.x * blockIdx.x + threadIdx.x + blockDim.y * blockIdx.y;
	const half2 in = halving[idx];
	// halving[idx].x = __hadd(in.x, in.x);   // scalar equivalent, two ops
	// halving[idx].y = __hadd(in.y, in.y);
	halving[idx] = __hadd2(in, in);
}

// FP32 reference version of addh: doubles each float2 element in place.
// One thread per element; no bounds check (see load).
__global__ void addf(float2 *floating)
{
	const int idx = blockDim.x * blockIdx.x + threadIdx.x + blockDim.y * blockIdx.y;
	const float2 in = floating[idx];
	floating[idx].x = in.x + in.x;
	floating[idx].y = in.y + in.y;
}
//---------------------------------------------------

Hi, gisselle.irad

We think the experiment of TroyK explains the situation. (Thanks TroyK!)

There is some fixed overhead for launching a CUDA kernel.
Using more blocks/threads, or more iterations, reduces the influence of that overhead on the measured per-launch time.

Thanks.