Here’s the full code that I’ve been running.
// main.cu:
//---------------------------------------------------
#include <iostream>

#include <cuda_runtime.h>
#include <cuda_fp16.h>

#include "helper_cuda.h"
#include "fxns.cuh"
#define SAMPS (1024)
#define ITER 1000
// Micro-benchmark driver: allocates a float2 and a half2 device buffer of
// SAMPS elements each, initializes them with load(), then uses CUDA events to
// time ITER back-to-back launches of addh() and prints the mean time per launch.
int main()
{
    // One thread per sample. The kernels do no bounds checking, so the launch
    // width is tied to the allocation size instead of repeating a literal.
    const dim3 grid(1);
    const dim3 block(SAMPS);

    float2 *floating = nullptr;
    // checkCudaErrors(cudaMallocManaged(&floating, sizeof(float2) * SAMPS));
    checkCudaErrors(cudaMalloc(&floating, sizeof(float2) * SAMPS));

    half2 *halving = nullptr;
    // checkCudaErrors(cudaMallocManaged(&halving, sizeof(half2) * SAMPS));
    checkCudaErrors(cudaMalloc(&halving, sizeof(half2) * SAMPS));

    load<<<grid, block>>>(floating, halving);
    // Kernel launches return nothing; a bad launch configuration only shows
    // up through cudaGetLastError().
    checkCudaErrors(cudaGetLastError());

    cudaEvent_t start, stop;
    float kernel_time = 0.0f;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    checkCudaErrors(cudaEventRecord(start, 0));
    for (int i = 0; i < ITER; i++)
    {
        // Each launch doubles the stored values, so the half2 data overflows
        // to +inf after a handful of iterations; only the timing is examined.
        addh<<<grid, block>>>(halving);
        // addf<<<grid, block>>>(floating);
    }
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaEventRecord(stop, 0));

    // Block the host until every launch recorded before `stop` has finished.
    checkCudaErrors(cudaEventSynchronize(stop));
    checkCudaErrors(cudaEventElapsedTime(&kernel_time, start, stop));

    std::cout << "Half : " << kernel_time / ITER << " ms" << std::endl;

    // Surfaces any asynchronous execution error before resources are released.
    checkCudaErrors(cudaDeviceSynchronize());
    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));
    checkCudaErrors(cudaFree(floating));
    checkCudaErrors(cudaFree(halving));
    return 0;
}
//---------------------------------------------------
// fxns.cuh:
//---------------------------------------------------
/*
 * fxns.cuh
 *
 *  Created on: May 29, 2018
 *      Author: irad
 */
#ifndef FXNS_CUH_
#define FXNS_CUH_
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include
#include
// Kernel declarations. None of these kernels bounds-check their index, so the
// launch must not cover more threads than the buffers have elements.

// Writes (1.5, 1.5) into one element of floating[] and of halving[] per thread.
__global__ void load(float2 *floating, half2 *halving);
// Doubles one half2 element of halving[] in place per thread.
__global__ void addh(half2 *halving);
// Doubles both components of one float2 element of floating[] in place per thread.
__global__ void addf(float2 *floating);
#endif /* FXNS_CUH_ */
//---------------------------------------------------
// fxns.cu:
//---------------------------------------------------
/*
 * fxns.cu
 *
 *  Created on: May 29, 2018
 *      Author: irad
 */
#include "fxns.cuh"
// Initializes one element of each buffer to (1.5, 1.5).
//
// floating: device array of float2, written at this thread's index.
// halving:  device array of half2, written at the same index.
//
// There is no bounds check: the caller must launch no more threads than the
// arrays have elements.
__global__ void load(float2 *floating, half2 *halving)
{
    // The blockDim.y * blockIdx.y term is zero for a 1D grid (blockIdx.y == 0).
    const int idx = blockDim.x * blockIdx.x + threadIdx.x + blockDim.y * blockIdx.y;
    // make_float2 replaces the C99 compound literal, which is not standard C++.
    floating[idx] = make_float2(1.5f, 1.5f);
    halving[idx] = __floats2half2_rn(1.5f, 1.5f);
}
// Doubles one half2 element in place using a single packed half2 add.
//
// halving: device array of half2; the element at this thread's index is read,
//          added to itself, and written back.
//
// There is no bounds check: the caller must launch no more threads than the
// array has elements. half2 arithmetic intrinsics such as __hadd2 need a device
// of compute capability 5.3 or higher.
__global__ void addh(half2 *halving)
{
    // The blockDim.y * blockIdx.y term is zero for a 1D grid (blockIdx.y == 0).
    const int idx = blockDim.x * blockIdx.x + threadIdx.x + blockDim.y * blockIdx.y;
    const half2 in = halving[idx];
    // Scalar alternative kept for comparison:
    // halving[idx].x = __hadd(in.x, in.x);
    // halving[idx].y = __hadd(in.y, in.y);
    halving[idx] = __hadd2(in, in);
}
// Doubles both components of one float2 element in place.
//
// floating: device array of float2; the element at this thread's index is
//           read, each component is added to itself, and the result is
//           written back.
//
// There is no bounds check: the caller must launch no more threads than the
// array has elements.
__global__ void addf(float2 *floating)
{
    // The blockDim.y * blockIdx.y term is zero for a 1D grid (blockIdx.y == 0).
    const int idx = blockDim.x * blockIdx.x + threadIdx.x + blockDim.y * blockIdx.y;
    const float2 in = floating[idx];
    // Write both components back as one float2 value.
    floating[idx] = make_float2(in.x + in.x, in.y + in.y);
}
//---------------------------------------------------