__hadd not working correctly

The following code should produce the value 3.00f at every position of the `half *c` array, but it wrongly produces 1.00f. `__half` does not seem to be working correctly.

/* cuda program to sum 2 arrays of size W */

#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
using namespace std;

// Number of elements in each array; main passes it as the W argument of
// setMat, printMat and the addition kernel.
int W = 49;

/*
 * Element-wise half-precision addition: c[i] = a[i] + b[i] for 0 <= i < W.
 *
 * a, b : device pointers to W input values each.
 * c    : device pointer to W output values.
 * W    : number of elements; W <= 0 makes the kernel a no-op.
 *
 * Expects a 1D grid of 1D blocks and uses no shared memory. The loop below is
 * a grid-stride loop: each thread starts at its global index and advances by
 * the total number of launched threads, so every element in [0, W) is handled
 * exactly once and no thread touches an index >= W, whatever the launch
 * configuration is.
 *
 * Thread 0 of block 0 prints every element it processes (debug output only).
 * Scalar __hadd requires compute capability 5.3 or newer.
 */
__global__ void addition(half *a, half *b, half*c, const int W){

    const int first  = blockIdx.x * blockDim.x + threadIdx.x;
    const int stride = gridDim.x * blockDim.x;
    const bool trace = (threadIdx.x == 0 && blockIdx.x == 0);

    for(int i = first; i < W; i += stride){
        // Compute the sum once and reuse it for the store and the debug print.
        const half sum = __hadd(a[i], b[i]);
        c[i] = sum;

        if(trace){
            printf(" vals: c:%f a:%f b:%f \n", __half2float(sum),
             __half2float(a[i]), __half2float(b[i]));
            printf("temp %f \n", __half2float(sum));
        }
    }

}

// Writes the W elements of arr to stdout, each converted to float and
// formatted with "%5.2f ". No trailing newline is emitted.
void printMat(half* arr, int W){
    int idx = 0;
    while(idx < W){
        const float value = __half2float(arr[idx]);
        printf("%5.2f ", value);
        ++idx;
    }
}

// Stores the half-precision conversion of v into each of the first W elements
// of arr and returns arr so the call can be used in an assignment.
half* setMat(half *arr, int W, const float &v){
    half *cursor = arr;
    for(int remaining = W; remaining > 0; --remaining){
        *cursor = __float2half(v);
        ++cursor;
    }
    return arr;
}

// Reports a failed CUDA call on stderr; returns true when status is cudaSuccess.
static bool cudaOk(cudaError_t status, const char *what){
    if(status == cudaSuccess){
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * Fills two host arrays of W half values with 1.0 and 2.0, adds them on the
 * device with the addition kernel, and prints the inputs, the output buffer
 * before the copy-back, and the final result.
 *
 * Returns 0 on success and 1 if a host allocation or any CUDA call fails.
 * All host and device buffers are released on every path.
 */
int main(){

    // Buffers hold half values, so size them with sizeof(half).
    const size_t bytes = W * sizeof(half);

    half *ha = (half*)malloc(bytes);
    half *hb = (half*)malloc(bytes);
    half *hc = (half*)malloc(bytes);
    half *da = NULL, *db = NULL, *dc = NULL;

    bool ok = (ha != NULL && hb != NULL && hc != NULL);
    if(!ok){
        fprintf(stderr, "host allocation failed\n");
    }

    if(ok){
        setMat(ha, W, 1.00f);
        setMat(hb, W, 2.00f);
        setMat(hc, W, 0.00f);

        // && short-circuits, so the first failing call stops the chain.
        ok = cudaOk(cudaMalloc((void **)&da, bytes), "cudaMalloc(da)")
          && cudaOk(cudaMalloc((void **)&db, bytes), "cudaMalloc(db)")
          && cudaOk(cudaMalloc((void **)&dc, bytes), "cudaMalloc(dc)")
          && cudaOk(cudaMemcpy(da, ha, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(da)")
          && cudaOk(cudaMemcpy(db, hb, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(db)")
          && cudaOk(cudaMemcpy(dc, hc, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(dc)");
    }

    if(ok){
        cout<<"A matrix:\n";
        printMat(ha, W);
        cout<<"B matrix:\n";
        printMat(hb, W);

        // Ceil-divide so the launched threads cover all W elements.
        const int threads = 32;
        const int blocks  = (W + threads - 1) / threads;

        addition<<<blocks, threads>>>(da, db, dc, W);

        // A launch does not return a status: cudaGetLastError reports a bad
        // configuration, cudaDeviceSynchronize reports faults during execution.
        ok = cudaOk(cudaGetLastError(), "addition launch")
          && cudaOk(cudaDeviceSynchronize(), "addition execution");
    }

    if(ok){
        // Host copy of C before the device result is fetched (still zeros).
        printf("\n C matrix :\n");
        printMat(hc, W);

        ok = cudaOk(cudaMemcpy(hc, dc, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(hc)");
    }

    if(ok){
        printf("\n result:\n");

        printMat(hc, W);
    }

    // cudaFree and free both accept null pointers, so this is safe on every path.
    cudaFree(da);
    cudaFree(db);
    cudaFree(dc);
    free(ha);
    free(hb);
    free(hc);

    return ok ? 0 : 1;
}

When I execute the code, I get the following output:

A matrix:
 1.00  1.00  1.00  1.00  1.00  1.00  1.00  1.00  1.00  1.00  1.00  1.00  1.00  1.00  1.00  1.00  1.00  1.00  1.00  1.00  1.00  1.00  1.00  1.00  1.00  1.00  1.00  1.00  1.00  1.00  1.00  1.00  1.00  1.00  1.00  1.00  1.00  1.00  1.00  1.00  1.00  1.00  1.00  1.00  1.00  1.00  1.00  1.00  1.00 B matrix:
 2.00  2.00  2.00  2.00  2.00  2.00  2.00  2.00  2.00  2.00  2.00  2.00  2.00  2.00  2.00  2.00  2.00  2.00  2.00  2.00  2.00  2.00  2.00  2.00  2.00  2.00  2.00  2.00  2.00  2.00  2.00  2.00  2.00  2.00  2.00  2.00  2.00  2.00  2.00  2.00  2.00  2.00  2.00  2.00  2.00  2.00  2.00  2.00  2.00  vals: c:3.000000 a:1.000000 b:2.000000 
temp 3.000000 

 C matrix :
 0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00 
 result:
 3.00  3.00  3.00  3.00  3.00  3.00  3.00  3.00  3.00  3.00  3.00  3.00  3.00  3.00  3.00  3.00  3.00  3.00  3.00  3.00  3.00  3.00  3.00  3.00  3.00  3.00  3.00  3.00  3.00  3.00  3.00  3.00  3.00  3.00  3.00  3.00  3.00  3.00  3.00  3.00  3.00  3.00  3.00  3.00  3.00  3.00  3.00  3.00  3.00

1 + 2 = 3 at each position. Which part of this result is unexpected?

What compilation command were you using?

nvcc -arch=sm_86 main.cu -o main

1 Like