FP16 cudnnConvolutionForward

How do I get true FP16 computation? In my tests, FP16 runs at the same speed as FP32.

My code looks like this

Device: Jetson Xavier
CUDA: 10.0
JetPack: 4.2

// Describe the input activation tensor (N x C x H x W), element type
// `dataType` (CUDNN_DATA_HALF for FP16 runs).
// NOTE(review): for full Tensor Core speed on Volta-class GPUs, cuDNN
// wants NHWC layout with the channel counts a multiple of 8 — switching
// the format here also requires the caller's buffers to be NHWC, so
// confirm the surrounding data layout before changing it.
CHECK_CUDNN(cudnnCreateTensorDescriptor(&input_descriptor));
CHECK_CUDNN(cudnnSetTensor4dDescriptor(input_descriptor,
                               CUDNN_TENSOR_NCHW,
                               dataType,
                               b, in_channel, in_h, in_w));

// Filter descriptor: out_channels x in_channels x kH x kW.
CHECK_CUDNN(cudnnCreateFilterDescriptor(&kernel_descriptor));
CHECK_CUDNN(cudnnSetFilter4dDescriptor(kernel_descriptor,
                               dataType,
                               CUDNN_TENSOR_NCHW,
                               num_output, in_channel, kernel_h, kernel_w));

// Output activation tensor.
CHECK_CUDNN(cudnnCreateTensorDescriptor(&output_descriptor));
CHECK_CUDNN(cudnnSetTensor4dDescriptor(output_descriptor,
                               CUDNN_TENSOR_NCHW,
                               dataType,
                               b, out_channel, out_h, out_w));

// Bias: one value per output channel, broadcast over N, H, W.
CHECK_CUDNN(cudnnCreateTensorDescriptor(&bias_descriptor));
CHECK_CUDNN(cudnnSetTensor4dDescriptor(bias_descriptor,
                               CUDNN_TENSOR_NCHW,
                               dataType,
                               1, out_channel, 1, 1));

// Convolution descriptor.
// FIX 1: cudnnSetConvolution2dDescriptor takes (pad_h, pad_w,
// stride_h, stride_w, dilation_h, dilation_w) — the original passed each
// w/h pair swapped, silently mis-configuring any non-square setup.
// FIX 2: the compute type was CUDNN_DATA_FLOAT, which makes cuDNN run
// "pseudo half" (FP16 storage, FP32 arithmetic) — that is exactly why
// FP16 showed the same speed as FP32. CUDNN_DATA_HALF makes the math
// itself run in FP16 and is required for Tensor Core kernels.
CHECK_CUDNN(cudnnCreateConvolutionDescriptor(&conv_descriptor));
CHECK_CUDNN(cudnnSetConvolution2dDescriptor(conv_descriptor,
                                pad_h, pad_w,         // zero-padding
                                stride_h, stride_w,   // stride
                                dilation_h, dilation_w, // dilation
                                CUDNN_CROSS_CORRELATION, CUDNN_DATA_HALF));
// Opt in to Tensor Core kernels (effective on SM70+; ignored elsewhere).
CHECK_CUDNN(cudnnSetConvolutionMathType(conv_descriptor, CUDNN_TENSOR_OP_MATH));

// Algorithm selection.
// NOTE(review): cudnnGetConvolutionForwardAlgorithm is deprecated and
// removed in cuDNN 8; fine on JetPack 4.2 (cuDNN 7), but use
// cudnnGetConvolutionForwardAlgorithm_v7 / cudnnFindConvolutionForwardAlgorithm
// when upgrading.
CHECK_CUDNN(cudnnGetConvolutionForwardAlgorithm(handle,
                                    input_descriptor,
                                    kernel_descriptor,
                                    conv_descriptor,
                                    output_descriptor,
                                    CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
                                    0, // no workspace-size limit
                                    &algo));
CHECK_CUDNN(cudnnGetConvolutionForwardWorkspaceSize(handle,
                                        input_descriptor,
                                        kernel_descriptor,
                                        conv_descriptor,
                                        output_descriptor,
                                        algo,
                                        &workspace_size));

CHECK(cudaMalloc(&workspace, workspace_size));

On a Volta GPU, Tensor Cores are only used when the tensors are in NHWC layout and the channel counts are multiples of 8. Also, `cudnnSetConvolution2dDescriptor` should be given `CUDNN_DATA_HALF` instead of `CUDNN_DATA_FLOAT` as the compute type — with `CUDNN_DATA_FLOAT` the data is stored in FP16 but the arithmetic still runs in FP32, which is why you see no speedup.