Hi,
I have code which performs reduce sum operation. The input shape is (2,3,4) and axis is -1. The code works for float dtype. However, as I replaced float with half. The output is incorrect, Could someone helps me expalian why it is wrong and how to fix it?
The code is attached here:
extern "C" __global__ void fused_sum_3_kernel0( half* __restrict__ placeholder, half* __restrict__ placeholder_red) {
half placeholder_red_rf[1];
__shared__ half red_buf0[1024];
placeholder_red_rf[0] = __float2half_rn(0.000000e+00f);
if (((int)threadIdx.y) < 6) {
if (((int)threadIdx.x) < 4) {
placeholder_red_rf[0] = (placeholder_red_rf[0] + placeholder[((((int)threadIdx.y) * 4) + ((int)threadIdx.x))]);
}
}
__syncthreads();
((volatile __shared__ half*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))] = (((((((int)threadIdx.y) < 6) && (((int)threadIdx.y) < 6)) && (((int)threadIdx.y) < 6)) && (((int)threadIdx.y) < 6)) ? placeholder_red_rf[0] : __float2half_rn(0.000000e+00f));
__syncthreads();
if (((int)threadIdx.x) < 16) {
((volatile __shared__ half*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))] = (((volatile __shared__ half*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))] + ((volatile __shared__ half*)red_buf0)[(((((int)threadIdx.y) * 32) + ((int)threadIdx.x)) + 16)]);
((volatile __shared__ half*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))] = (((volatile __shared__ half*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))] + ((volatile __shared__ half*)red_buf0)[(((((int)threadIdx.y) * 32) + ((int)threadIdx.x)) + 8)]);
((volatile __shared__ half*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))] = (((volatile __shared__ half*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))] + ((volatile __shared__ half*)red_buf0)[(((((int)threadIdx.y) * 32) + ((int)threadIdx.x)) + 4)]);
((volatile __shared__ half*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))] = (((volatile __shared__ half*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))] + ((volatile __shared__ half*)red_buf0)[(((((int)threadIdx.y) * 32) + ((int)threadIdx.x)) + 2)]);
((volatile __shared__ half*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))] = (((volatile __shared__ half*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))] + ((volatile __shared__ half*)red_buf0)[(((((int)threadIdx.y) * 32) + ((int)threadIdx.x)) + 1)]);
}
__syncthreads();
if (((int)threadIdx.y) < 6) {
if (((int)threadIdx.x) == 0) {
placeholder_red[((int)threadIdx.y)] = ((volatile __shared__ half*)red_buf0)[(((int)threadIdx.y) * 32)];
}
}
}
As I put all 1 input, the results should be:
array([[[4.],
[4.],
[4.]],…
but I got:
x: array([[[3.],
[3.],
[3.]],…
My test of your code produces output of all 4:
$ cat t1538.cu
#include <cuda_fp16.h>
#include <iostream>
#ifndef USE_FLOAT
typedef half ft;
#else
typedef float ft;
#endif
__global__ void fused_sum_3_kernel0( ft* __restrict__ placeholder, ft* __restrict__ placeholder_red) {
ft placeholder_red_rf[1];
__shared__ ft red_buf0[1024];
#ifndef USE_FLOAT
const half my_zero = __float2half_rn(0.000000e+00f);
#else
const float my_zero = 0.0f;
#endif
placeholder_red_rf[0] = my_zero;
if (((int)threadIdx.y) < 6) {
if (((int)threadIdx.x) < 4) {
placeholder_red_rf[0] = (placeholder_red_rf[0] + placeholder[((((int)threadIdx.y) * 4) + ((int)threadIdx.x))]);
}
}
__syncthreads();
((volatile __shared__ ft*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))] = (((((((int)threadIdx.y) < 6) && (((int)threadIdx.y) < 6)) && (((int)threadIdx.y) < 6)) && (((int)threadIdx.y) < 6)) ? placeholder_red_rf[0] : my_zero);
__syncthreads();
if (((int)threadIdx.x) < 16) {
((volatile __shared__ ft*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))] = ((ft)(((volatile __shared__ ft*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))]) + (ft)(((volatile __shared__ ft*)red_buf0)[(((((int)threadIdx.y) * 32) + ((int)threadIdx.x)) + 16)]));
((volatile __shared__ ft*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))] = ((ft)(((volatile __shared__ ft*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))]) + (ft)(((volatile __shared__ ft*)red_buf0)[(((((int)threadIdx.y) * 32) + ((int)threadIdx.x)) + 8)]));
((volatile __shared__ ft*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))] = ((ft)(((volatile __shared__ ft*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))]) + (ft)(((volatile __shared__ ft*)red_buf0)[(((((int)threadIdx.y) * 32) + ((int)threadIdx.x)) + 4)]));
((volatile __shared__ ft*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))] = ((ft)(((volatile __shared__ ft*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))]) + (ft)(((volatile __shared__ ft*)red_buf0)[(((((int)threadIdx.y) * 32) + ((int)threadIdx.x)) + 2)]));
((volatile __shared__ ft*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))] = ((ft)(((volatile __shared__ ft*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))]) + (ft)(((volatile __shared__ ft*)red_buf0)[(((((int)threadIdx.y) * 32) + ((int)threadIdx.x)) + 1)]));
}
__syncthreads();
if (((int)threadIdx.y) < 6) {
if (((int)threadIdx.x) == 0) {
placeholder_red[((int)threadIdx.y)] = ((volatile __shared__ ft*)red_buf0)[(((int)threadIdx.y) * 32)];
}
}
}
int main(){
const int ds = 1024;
ft *h_i, *h_o, *d_i, *d_o;
h_i = (ft *)malloc(ds * sizeof(ft));
h_o = (ft *)malloc(ds * sizeof(ft));
cudaMalloc(&d_i, ds * sizeof(ft));
cudaMalloc(&d_o, ds * sizeof(ft));
for (int i = 0; i < ds; i++) h_i[i] = (ft)1.0f;
cudaMemset(d_o, 0, ds*sizeof(ft));
cudaMemcpy(d_i, h_i, ds*sizeof(ft), cudaMemcpyHostToDevice);
fused_sum_3_kernel0<<<1, dim3(32,32)>>>(d_i, d_o);
cudaMemcpy(h_o, d_o, ds*sizeof(ft), cudaMemcpyDeviceToHost);
for (int i = 0; i < 32; i++){
for (int j = 0; j < 32; j++) std::cout << (float)h_o[i*32+j] << " ";
std::cout << std::endl;}
return 0;
}
$ nvcc -o t1538 t1538.cu -arch=sm_70
t1538.cu(24): warning: attribute "__shared__" does not apply here
t1538.cu(27): warning: attribute "__shared__" does not apply here
t1538.cu(27): warning: attribute "__shared__" does not apply here
t1538.cu(27): warning: attribute "__shared__" does not apply here
t1538.cu(28): warning: attribute "__shared__" does not apply here
t1538.cu(28): warning: attribute "__shared__" does not apply here
t1538.cu(28): warning: attribute "__shared__" does not apply here
t1538.cu(29): warning: attribute "__shared__" does not apply here
t1538.cu(29): warning: attribute "__shared__" does not apply here
t1538.cu(29): warning: attribute "__shared__" does not apply here
t1538.cu(30): warning: attribute "__shared__" does not apply here
t1538.cu(30): warning: attribute "__shared__" does not apply here
t1538.cu(30): warning: attribute "__shared__" does not apply here
t1538.cu(31): warning: attribute "__shared__" does not apply here
t1538.cu(31): warning: attribute "__shared__" does not apply here
t1538.cu(31): warning: attribute "__shared__" does not apply here
t1538.cu(36): warning: attribute "__shared__" does not apply here
t1538.cu(24): warning: attribute "__shared__" does not apply here
t1538.cu(27): warning: attribute "__shared__" does not apply here
t1538.cu(27): warning: attribute "__shared__" does not apply here
t1538.cu(27): warning: attribute "__shared__" does not apply here
t1538.cu(28): warning: attribute "__shared__" does not apply here
t1538.cu(28): warning: attribute "__shared__" does not apply here
t1538.cu(28): warning: attribute "__shared__" does not apply here
t1538.cu(29): warning: attribute "__shared__" does not apply here
t1538.cu(29): warning: attribute "__shared__" does not apply here
t1538.cu(29): warning: attribute "__shared__" does not apply here
t1538.cu(30): warning: attribute "__shared__" does not apply here
t1538.cu(30): warning: attribute "__shared__" does not apply here
t1538.cu(30): warning: attribute "__shared__" does not apply here
t1538.cu(31): warning: attribute "__shared__" does not apply here
t1538.cu(31): warning: attribute "__shared__" does not apply here
t1538.cu(31): warning: attribute "__shared__" does not apply here
t1538.cu(36): warning: attribute "__shared__" does not apply here
$ ./t1538
4 4 4 4 4 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
$
I had to make some changes to get it to compile, but nothing that should affect functionality.
I’m guessing you are calling this maybe from cupy, just a guess. My guess is the problem lies elsewhere in your code, perhaps in the interface to this function. I also assume you are compiling for Pascal or newer architecture.