cudnnConvolutionForward returns wrong result when meets std::normal_distribution

I’m very new to cuda and cudnn, and I just wrote a simple cudnn convolution validation code, however, when the input is from std::normal_distribution, it returns wrong result.

void randn_cu(half *data, LL n, int seed)
{
    half *data_in_host = (half *)malloc(sizeof(half) * n);
    std::default_random_engine generator(seed);
    std::normal_distribution<float> distribution(0.0, 1.0);
#pragma omp parallel for
    for (LL i = 0; i < n; ++i)
    {
        data_in_host[i] = __float2half(distribution(generator));
    }
    cudaMemcpy(data, data_in_host, n * sizeof(half), cudaMemcpyHostToDevice);
    cudaDeviceSynchronize();
    free(data_in_host);
}

void print(float *data, int n, int c, int h, int w)
{
    std::vector<float> buffer(1 << 20);
    checkCUDA(cudaMemcpy(buffer.data(), data, n * c * h * w * sizeof(float), cudaMemcpyDeviceToHost));
    int a = 0;
    for (int i = 0; i < n; ++i)
    {
        for (int j = 0; j < c; ++j)
        {
            std::cout << "n=" << i << ", c=" << j << ":" << std::endl;
            for (int k = 0; k < h; ++k)
            {
                for (int l = 0; l < w; ++l)
                {
                    std::cout << std::setw(6) << std::right << std::fixed << std::setprecision(2) << buffer[a];
                    ++a;
                }
                std::cout << std::endl;
            }
        }
    }
    std::cout << std::endl;
}
int cudnn_conv2d_cu(float *signal, float *filter, float *output, int N, int C, int H, int W, int kernel_size, int out_channels, int algo)
{
    cudnnHandle_t cudnn;
    checkCUDNN(cudnnCreate(&cudnn));

    std::cout << "N: " << N << std::endl;
    std::cout << "C: " << C << std::endl;
    std::cout << "H: " << H << std::endl;
    std::cout << "W: " << W << std::endl;
    std::cout << std::endl;

    cudnnTensorDescriptor_t in_desc;
    checkCUDNN(cudnnCreateTensorDescriptor(&in_desc));
    checkCUDNN(cudnnSetTensor4dDescriptor(
        in_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT,
        N, C, H, W));

    // filter
    const int filt_k = out_channels;
    const int filt_c = C;
    const int filt_h = kernel_size;
    const int filt_w = kernel_size;
    std::cout << "filt_k: " << filt_k << std::endl;
    std::cout << "filt_c: " << filt_c << std::endl;
    std::cout << "filt_h: " << filt_h << std::endl;
    std::cout << "filt_w: " << filt_w << std::endl;
    std::cout << std::endl;

    cudnnFilterDescriptor_t filt_desc;
    checkCUDNN(cudnnCreateFilterDescriptor(&filt_desc));
    checkCUDNN(cudnnSetFilter4dDescriptor(
        filt_desc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW,
        filt_k, filt_c, filt_h, filt_w));

    // convolution
    const int pad_h = 0;
    const int pad_w = 0;
    const int str_h = 1;
    const int str_w = 1;
    const int dil_h = 1;
    const int dil_w = 1;
    std::cout << "pad_h: " << pad_h << std::endl;
    std::cout << "pad_w: " << pad_w << std::endl;
    std::cout << "str_h: " << str_h << std::endl;
    std::cout << "str_w: " << str_w << std::endl;
    std::cout << "dil_h: " << dil_h << std::endl;
    std::cout << "dil_w: " << dil_w << std::endl;
    std::cout << std::endl;

    cudnnConvolutionDescriptor_t conv_desc;
    checkCUDNN(cudnnCreateConvolutionDescriptor(&conv_desc));
    checkCUDNN(cudnnSetConvolution2dDescriptor(
        conv_desc,
        pad_h, pad_w, str_h, str_w, dil_h, dil_w,
        CUDNN_CONVOLUTION, CUDNN_DATA_FLOAT));

    // output
    int out_n;
    int out_c;
    int out_h;
    int out_w;

    checkCUDNN(cudnnGetConvolution2dForwardOutputDim(
        conv_desc, in_desc, filt_desc,
        &out_n, &out_c, &out_h, &out_w));

    std::cout << "out_n: " << out_n << std::endl;
    std::cout << "out_c: " << out_c << std::endl;
    std::cout << "out_h: " << out_h << std::endl;
    std::cout << "out_w: " << out_w << std::endl;
    std::cout << std::endl;

    cudnnTensorDescriptor_t out_desc;
    checkCUDNN(cudnnCreateTensorDescriptor(&out_desc));
    checkCUDNN(cudnnSetTensor4dDescriptor(
        out_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT,
        out_n, out_c, out_h, out_w));

    // algorithm
    cudnnConvolutionFwdAlgo_t algo_t;
    std::string algo_str;

    switch (algo)
    {
    case 0:
        algo_t = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
        algo_str = "CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM";
        break;
    case 1:
        algo_t = CUDNN_CONVOLUTION_FWD_ALGO_FFT;
        algo_str = "CUDNN_CONVOLUTION_FWD_ALGO_FFT";
        break;
    case 2:
        algo_t = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD;
        algo_str = "CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD";
        break;

    default:
        break;
    }

    std::cout << "Convolution algorithm: " << algo_str << std::endl;
    std::cout << std::endl;

    // workspace
    size_t ws_size;
    checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnn, in_desc, filt_desc, conv_desc, out_desc, algo_t, &ws_size));

    float *ws_data;
    checkCUDA(cudaMalloc(&ws_data, ws_size));

    std::cout << "Workspace size: " << ws_size << std::endl;
    std::cout << std::endl;

    // perform
    float alpha = 1.f;
    float beta = 0.f;

    print(signal, N, C, H, W);
    print(filter, filt_k, filt_c, filt_h, filt_w);

    checkCUDNN(cudnnConvolutionForward(
        cudnn,
        &alpha, in_desc, signal, filt_desc, filter,
        conv_desc, algo_t, ws_data, ws_size,
        &beta, out_desc, output));
        
    cudaDeviceSynchronize();
    print(output, out_n, out_c, out_h, out_w);

    // finalizing
    checkCUDA(cudaFree(ws_data));
    checkCUDNN(cudnnDestroyTensorDescriptor(out_desc));
    checkCUDNN(cudnnDestroyConvolutionDescriptor(conv_desc));
    checkCUDNN(cudnnDestroyFilterDescriptor(filt_desc));
    checkCUDNN(cudnnDestroyTensorDescriptor(in_desc));
    checkCUDNN(cudnnDestroy(cudnn));
    return 0;
}
void test_cudnn_conv2d()
{
    int N = 1;
    int C = 1;
    int H = 5;
    int W = 5;
    int out_channels = 1;
    int kernel_size = 2;

    int out_w = W - kernel_size + 1;
    int out_h = H - kernel_size + 1;

    /*Generate input data*/
    float *signal = (float *)malloc(sizeof(float) * N * C * H * W);
    float *filter = (float *)malloc(sizeof(float) * kernel_size * kernel_size * C * out_channels);
    float *signal_cu;
    float *filter_cu;
    cudaMalloc(&signal_cu, sizeof(float) * N * C * H * W);
    cudaMalloc(&filter_cu, sizeof(float) * kernel_size * kernel_size * C * out_channels);
    randn(signal, N * C * H * W);
    randn(filter, kernel_size * kernel_size * C * out_channels);
    cudaMemcpy(signal_cu, signal, sizeof(float) * N * C * H * W, cudaMemcpyHostToDevice);
    cudaMemcpy(filter_cu, filter, sizeof(float) * kernel_size * kernel_size * C * out_channels, cudaMemcpyHostToDevice);

    float *output_standard = (float *)malloc(sizeof(float) * N * out_channels * out_h * out_w);
    float *output_test = (float *)malloc(sizeof(float) * N * out_channels * out_h * out_w);
    float *output_test_cu;
    cudaMalloc(&output_test_cu, sizeof(float) * N * out_channels * out_h * out_w);

    /*Perform standard direct convolution on CPU*/
    conv2d_cpu(signal, filter, output_standard, H, W, kernel_size);

    /*Perform cudnn convolution on GPU*/
    cudnn_conv2d_cu(signal_cu, filter_cu, output_test_cu, N, C, H, W, kernel_size, out_channels, 0);
    cudaMemcpy(output_test, output_test_cu, sizeof(float) * N * out_channels * out_h * out_w, cudaMemcpyDeviceToHost);

    /*Get accuracy*/
    double acc = relative_error_2d(output_test, output_standard, out_h, out_w, N*out_channels);
    std::cout << "cudnn_conv2d_float VS float_conv"
              << " N " << N << " C " << C << " H " << H << " W " << W << " out_channels " << out_channels << " kernel_size " << kernel_size << " relative_error " << acc << std::endl;
}
n=0, c=0:
 -0.74  1.02 -0.11  0.01 -0.63
 -0.42  0.01 -0.25 -0.81 -0.32
  0.01  0.01 -1.59 -0.63 -0.12
 -0.81  0.44 -1.68  0.44  1.36
  0.21 -0.57 -0.57 -1.59  1.21

n=0, c=0:
 -0.90  0.90
 -1.20  0.90

n=0, c=0:
 -2.27  1.28  0.39  0.33
 -0.39  1.76 -0.12 -0.80
 -1.13  3.84 -2.58 -1.25
 -0.56  2.42 -1.11 -3.76

# this result is wrong

:( Anyone help

Is there any bugs in my code? idk :(