How do you do real FP16 calculations? For me, FP16 runs at the same speed as FP32.
My code looks like this
device:jetson xavier
cuda: 10.0
jetpack:4.2
// cuDNN forward-convolution setup.
// dataType selects the tensor element type (CUDNN_DATA_HALF for FP16,
// CUDNN_DATA_FLOAT for FP32); all four descriptors use it consistently.
//
// Why FP16 was no faster than FP32: the convolution descriptor's COMPUTE type
// was hard-coded to CUDNN_DATA_FLOAT. With FP16 tensors that selects cuDNN's
// "pseudo half" mode — data is up-converted and the arithmetic runs in FP32 —
// so there is no speedup. True FP16 math requires the compute type to be
// CUDNN_DATA_HALF as well; passing `dataType` keeps FP32 behavior unchanged
// while enabling real FP16 when the tensors are half precision.

// input tensor: N x C x H x W
CHECK_CUDNN(cudnnCreateTensorDescriptor(&input_descriptor));
CHECK_CUDNN(cudnnSetTensor4dDescriptor(input_descriptor,
                                       CUDNN_TENSOR_NCHW,
                                       dataType,
                                       b, in_channel, in_h, in_w));

// filter: K x C x R x S
CHECK_CUDNN(cudnnCreateFilterDescriptor(&kernel_descriptor));
CHECK_CUDNN(cudnnSetFilter4dDescriptor(kernel_descriptor,
                                       dataType,
                                       CUDNN_TENSOR_NCHW,
                                       num_output, in_channel, kernel_h, kernel_w));

// output tensor: N x K x outH x outW
CHECK_CUDNN(cudnnCreateTensorDescriptor(&output_descriptor));
CHECK_CUDNN(cudnnSetTensor4dDescriptor(output_descriptor,
                                       CUDNN_TENSOR_NCHW,
                                       dataType,
                                       b, out_channel, out_h, out_w));

// bias: 1 x K x 1 x 1, broadcast over N/H/W
CHECK_CUDNN(cudnnCreateTensorDescriptor(&bias_descriptor));
CHECK_CUDNN(cudnnSetTensor4dDescriptor(bias_descriptor,
                                       CUDNN_TENSOR_NCHW,
                                       dataType,
                                       1, out_channel, 1, 1));

// convolution descriptor.
// NOTE: the cuDNN API order is (pad_h, pad_w, stride_h, stride_w,
// dilation_h, dilation_w) — the original code passed the *_w values first,
// which is wrong whenever height and width parameters differ.
CHECK_CUDNN(cudnnCreateConvolutionDescriptor(&conv_descriptor));
CHECK_CUDNN(cudnnSetConvolution2dDescriptor(conv_descriptor,
                                            pad_h, pad_w,           // zero-padding
                                            stride_h, stride_w,     // stride
                                            dilation_h, dilation_w, // dilation
                                            CUDNN_CROSS_CORRELATION,
                                            dataType));  // was CUDNN_DATA_FLOAT (pseudo-half)

// Allow tensor-core algorithms (Volta SM72 on Xavier). Tensor cores engage
// only with HALF data + HALF compute, and cuDNN prefers channel counts that
// are multiples of 8 — NOTE(review): NHWC layout is typically faster for
// tensor ops than NCHW; worth benchmarking if more speed is needed.
CHECK_CUDNN(cudnnSetConvolutionMathType(conv_descriptor, CUDNN_TENSOR_OP_MATH));

// algorithm selection.
// NOTE(review): on cuDNN 7 consider cudnnGetConvolutionForwardAlgorithm_v7
// (or cudnnFindConvolutionForwardAlgorithm) and verify the returned perf
// result actually reports CUDNN_TENSOR_OP_MATH — PREFER_FASTEST heuristics
// may silently fall back to a non-tensor-op algorithm.
CHECK_CUDNN(cudnnGetConvolutionForwardAlgorithm(handle,
                                                input_descriptor,
                                                kernel_descriptor,
                                                conv_descriptor,
                                                output_descriptor,
                                                CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
                                                0,  // no workspace-size limit
                                                &algo));
CHECK_CUDNN(cudnnGetConvolutionForwardWorkspaceSize(handle,
                                                    input_descriptor,
                                                    kernel_descriptor,
                                                    conv_descriptor,
                                                    output_descriptor,
                                                    algo,
                                                    &workspace_size));
CHECK(cudaMalloc(&workspace, workspace_size));