// The following code should produce 3.00f at every position of the half* c array, but it wrongly produces 1.00f, so __half does not appear to be working correctly.
/* cuda program to sum 2 arrays of size W */
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <stdlib.h>
#include <cstdio>
#include <iostream>
using namespace std;
// Number of elements in each of the arrays that main() allocates and processes.
// The functions in this file take their own `W` parameter, which shadows this global.
int W = 49;
/*
 * Element-wise half-precision addition: c[i] = a[i] + b[i] for 0 <= i < W.
 *
 * Launch layout: 1-D grid of 1-D blocks of any size. Each thread starts at
 * its flat global index and advances by the total number of launched threads,
 * so every element below W is processed exactly once and no thread reads or
 * writes at or beyond index W, regardless of how many threads were launched.
 *
 * a, b : input arrays, dereferenced on the device, at least W elements each.
 * c    : output array, dereferenced on the device, at least W elements.
 * W    : number of elements to process.
 *
 * Thread 0 of block 0 prints the values it handles as a debugging aid.
 *
 * NOTE(review): __hadd has historically required building for, and running
 * on, compute capability 5.3 or newer -- confirm the -arch flag matches the
 * target GPU.
 */
__global__ void addition(half *a, half *b, half*c, const int W){
    const int first  = blockIdx.x * blockDim.x + threadIdx.x;
    const int stride = gridDim.x * blockDim.x;   // total threads in the launch
    const bool isDebugThread = (threadIdx.x == 0 && blockIdx.x == 0);

    // The loop condition doubles as the bounds check for surplus threads.
    for (int i = first; i < W; i += stride) {
        const half sum = __hadd(a[i], b[i]);
        c[i] = sum;
        if (isDebugThread) {
            printf(" vals: c:%f a:%f b:%f \n", __half2float(sum),
                   __half2float(a[i]), __half2float(b[i]));
            printf("temp %f \n", __half2float(sum));
        }
    }
}
/*
 * Write the first W elements of arr to stdout on a single line.
 * Each half value is converted to float and printed with the "%5.2f " format;
 * no trailing newline is emitted.
 */
void printMat(half* arr, int W){
    int idx = 0;
    while (idx < W) {
        const float value = __half2float(arr[idx]);
        printf("%5.2f ", value);
        ++idx;
    }
}
/*
 * Fill the first W elements of arr with the half-precision conversion of v.
 * Returns arr itself so the call can be used in an assignment.
 */
half* setMat(half *arr, int W, const float &v){
    half* cursor = arr;
    for (int remaining = W; remaining > 0; --remaining) {
        *cursor = __float2half(v);
        ++cursor;
    }
    return arr;
}
/* Report a failed CUDA runtime call on stderr and terminate the program. */
static void checkCuda(cudaError_t status, const char* what){
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Host driver: fills two W-element half arrays with 1.0 and 2.0, adds them on
 * the GPU with the `addition` kernel, and prints the inputs, the host copy of
 * the output before copy-back, and the result copied back from the device.
 *
 * Returns 0 on success; exits with EXIT_FAILURE when an allocation or any
 * CUDA call fails.
 */
int main(){
    if (W <= 0) {
        fprintf(stderr, "W must be positive\n");
        return EXIT_FAILURE;
    }
    // Host and device buffers hold the same element type, so one size serves both.
    const size_t bytes = W * sizeof(half);

    half *ha = (half*)malloc(bytes);
    half *hb = (half*)malloc(bytes);
    half *hc = (half*)malloc(bytes);
    if (ha == NULL || hb == NULL || hc == NULL) {
        fprintf(stderr, "host allocation failed\n");
        free(ha);
        free(hb);
        free(hc);
        return EXIT_FAILURE;
    }
    setMat(ha, W, 1.00f);
    setMat(hb, W, 2.00f);
    setMat(hc, W, 0.00f);

    half *da = NULL, *db = NULL, *dc = NULL;
    checkCuda(cudaMalloc((void **)&da, bytes), "cudaMalloc(da)");
    checkCuda(cudaMalloc((void **)&db, bytes), "cudaMalloc(db)");
    checkCuda(cudaMalloc((void **)&dc, bytes), "cudaMalloc(dc)");
    checkCuda(cudaMemcpy(da, ha, bytes, cudaMemcpyHostToDevice), "copy of a to device");
    checkCuda(cudaMemcpy(db, hb, bytes, cudaMemcpyHostToDevice), "copy of b to device");
    checkCuda(cudaMemcpy(dc, hc, bytes, cudaMemcpyHostToDevice), "copy of c to device");

    std::cout<<"A matrix:\n";
    printMat(ha, W);
    std::cout<<"B matrix:\n";
    printMat(hb, W);

    // Round the grid up so every element has a thread. The last block can hold
    // threads whose index is >= W, so `addition` must bounds-check its index.
    const int threads = 32;
    const int blocks = (W + threads - 1) / threads;
    addition<<<blocks, threads>>>(da, db, dc, W);
    checkCuda(cudaGetLastError(), "kernel launch");         // bad launch configuration
    checkCuda(cudaDeviceSynchronize(), "kernel execution"); // faults raised while running

    // Host copy of c before the device result is copied back.
    printf("\n C matrix :\n");
    printMat(hc, W);
    checkCuda(cudaMemcpy(hc, dc, bytes, cudaMemcpyDeviceToHost), "copy of c to host");
    printf("\n result:\n");
    printMat(hc, W);

    checkCuda(cudaFree(da), "cudaFree(da)");
    checkCuda(cudaFree(db), "cudaFree(db)");
    checkCuda(cudaFree(dc), "cudaFree(dc)");
    free(ha);
    free(hb);
    free(hc);
    return 0;
}