Hi,
I ran the benchmark below to check FP16 throughput on a GTX 1060.
Here is the code:
#include <iostream>
#include <stdio.h>
#include <time.h>
#include <sys/time.h>
#include <cuda.h>
#include "cuda_runtime.h"
#include "cuda_fp16.h"
// Benchmark configuration. Every expression macro is wrapped in parentheses so
// it expands safely inside a larger expression (e.g. `x / WG`, `SIZE + 1`).

// Microseconds per second.
#define USECPSEC 1000000ULL
// Number of additions each thread performs (2^30; still fits in an int).
#define ITER (1024*1024*1024)
// Threads per block.
#define WI 512
// Number of blocks in the grid.
// NOTE(review): presumably 6 blocks for each of 10 SMs - confirm for the target GPU.
#define WG (6*10)
// Size of each device buffer in bytes: WI elements of 2 bytes (one half) each.
#define SIZE (WI<<1)
// Returns the current gettimeofday() time in microseconds minus `start`.
// Call with 0 to obtain a timestamp; call again with that timestamp to obtain
// the number of microseconds elapsed since it was taken.
unsigned long long dtime_usec(unsigned long long start){
    struct timeval now;
    gettimeofday(&now, 0);
    // Whole seconds converted to microseconds, then the sub-second remainder.
    const unsigned long long seconds_as_usec = now.tv_sec * USECPSEC;
    const unsigned long long now_usec = seconds_as_usec + now.tv_usec;
    return now_usec - start;
}
// Benchmark kernel: every thread performs ITER half-precision additions, each
// one consuming the result of the previous one.
//
// Both buffers are indexed with threadIdx.x only, so each must hold at least
// blockDim.x elements, and all blocks of the grid read and write the same
// elements.
//
//   in1d  addend; read once per thread before the loop
//   outd  starting accumulator on entry; overwritten with the final value
__global__ void DoHAdd(const half *in1d, half* outd) {
    const int slot = threadIdx.x;
    const half addend = in1d[slot];
    half acc = outd[slot];
    // Count down instead of up; the loop body still runs exactly ITER times.
    int remaining = ITER;
    while (remaining-- > 0) {
        acc = __hadd(addend, acc);
    }
    outd[slot] = acc;
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char *what) {
    if (err == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Runs the DoHAdd benchmark once on the current CUDA device and prints the
// elapsed wall-clock time and the resulting half-precision add throughput.
// Returns 0 on success and 1 if any CUDA call fails.
int main() {
    half *in1d = NULL, *outd = NULL;

    // Query the device the kernel will actually run on. To benchmark another
    // GPU, call cudaSetDevice(<id>) before this point.
    int devID = 0;
    if (!cudaOk(cudaGetDevice(&devID), "cudaGetDevice")) {
        return 1;
    }
    cudaDeviceProp devProp;
    if (!cudaOk(cudaGetDeviceProperties(&devProp, devID), "cudaGetDeviceProperties")) {
        return 1;
    }
    std::cout << devProp.name << std::endl;

    // Allocate both buffers and zero them so the kernel never reads
    // uninitialized device memory.
    bool ok = cudaOk(cudaMalloc((void**)&in1d, SIZE), "cudaMalloc(in1d)")
           && cudaOk(cudaMalloc((void**)&outd, SIZE), "cudaMalloc(outd)")
           && cudaOk(cudaMemset(in1d, 0, SIZE), "cudaMemset(in1d)")
           && cudaOk(cudaMemset(outd, 0, SIZE), "cudaMemset(outd)");

    unsigned long long dt = 0;
    if (ok) {
        dt = dtime_usec(0);
        DoHAdd <<< dim3(WG), dim3(WI) >>> (in1d, outd);
        // A bad launch configuration is reported by cudaGetLastError(); a fault
        // during execution is reported by the synchronize call.
        ok = cudaOk(cudaGetLastError(), "DoHAdd launch")
          && cudaOk(cudaDeviceSynchronize(), "DoHAdd execution");
        dt = dtime_usec(dt);
    }

    if (ok) {
        // Total additions = iterations per thread * blocks * threads per block.
        unsigned long long ops = ITER;
        ops *= WG;
        ops *= WI;
        double et = dt / (double)USECPSEC;
        unsigned long long Mops = ops / 1000000;
        // Mops is a count of millions of additions, so label it as such.
        std::cout << et << "s for " << Mops << " MHAdds" << std::endl;
        // Mega-ops / 1e6 = Tera-ops; divided by seconds gives Tops/s.
        double tp = Mops / (et * 1000000.0);
        std::cout << "Throughput: " << tp << " Tops/s" << std::endl;
    }

    // cudaFree accepts a null pointer, so this is safe on every path.
    cudaFree(in1d);
    cudaFree(outd);
    return ok ? 0 : 1;
}
Compilation command:
nvcc -gencode arch=compute_61,code=sm_61 hadd.cu