On Jetson Xavier, which is faster: pseudo FP16 or true FP16?

Jetson Xavier is based on volta architecure, so I thought True FP16 would be faster, but the experimental results show that Pseudo FP16 is faster. is this correct?

Input 2048 x 1024 x 8
Output 1024 x 512 x 16
Filter 3 x 3
Data Shape : HWC
Conv Math : CUDNN_TENSOR_OP_MATH

True FP16 : 1.925634
Pseudo FP16 : 0.900063

Hi,

Could you share your source and steps for testing?
We want to reproduce this internally first.

More, which JetPack version do you use?
Thanks.


int main(int argc, char **argv)
{

    cudnnHandle_t cudnnHandle;
    cudnnTensorDescriptor_t inTensorDesc, outTensorDesc, biasTensorDesc;
    cudnnFilterDescriptor_t filterDesc;
    cudnnConvolutionDescriptor_t convDesc;
    cudnnActivationDescriptor_t actDesc;


    cudaEvent_t start, stop;
    clock_t cpu_start, cpu_end;

    int in_channel = 8;
    int in_height = 1024;
    int in_width = 2048;
    int batch_count = 1;

    int filter_width = 3;
    int filter_height = 3;
    int out_channel = 16;

    int padding_w = 1;
    int padding_h = 1;

    int stride_horizontal = 2;
    int stride_vertical = 2;

    float alpha = 1.0f;
    float beta = 0.0f;

    

    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));


    checkCUDNN(cudnnCreate(&cudnnHandle));
    checkCUDNN(cudnnCreateTensorDescriptor(&inTensorDesc));
    checkCUDNN(cudnnCreateTensorDescriptor(&outTensorDesc));
    checkCUDNN(cudnnCreateTensorDescriptor(&biasTensorDesc));
    checkCUDNN(cudnnCreateFilterDescriptor(&filterDesc));
    checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc));
    checkCUDNN(cudnnCreateActivationDescriptor(&actDesc));

    checkCUDNN(cudnnSetTensor4dDescriptor(inTensorDesc, CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, batch_count, in_channel, in_height, in_width));
    checkCUDNN(cudnnSetFilter4dDescriptor(filterDesc, CUDNN_DATA_HALF, CUDNN_TENSOR_NHWC, out_channel, in_channel, filter_height, filter_width));
    //checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc, padding_h, padding_w, stride_vertical, stride_horizontal, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_HALF));
    // FOR PSEUDO FP16
    checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc, padding_h, padding_w, stride_vertical, stride_horizontal, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT)); 

    checkCUDNN(cudnnSetTensor4dDescriptor(biasTensorDesc, CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, 1, out_channel, 1, 1));
    checkCUDNN(cudnnSetActivationDescriptor(actDesc, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, 0));



    void *inData_d;
    void *outData_d;
    void *filterData_d;
    void* workSpace;


    checkCudaErrors(cudaMalloc((void**)&inData_d, sizeof(float)*in_channel*in_width*in_height*out_channel));
    checkCudaErrors(cudaMalloc((void**)&filterData_d, sizeof(float)*in_channel*filter_width*filter_height*out_channel));
    cudaMemset(inData_d,0x00,sizeof(float)*in_channel*in_width*in_height*out_channel  );
    cudaMemset(filterData_d,0x00,sizeof(float)*in_channel*filter_width*filter_height*out_channel);


    int out_n, out_c, out_h, out_w;
    
    checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc, inTensorDesc, filterDesc, &out_n, &out_c, &out_h, &out_w));
    printf("conv out shape (n x c x h x w) = (%d x %d x %d x %d)\n", out_n, out_c, out_h, out_w);
    checkCUDNN(cudnnSetTensor4dDescriptor(outTensorDesc, CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, out_n, out_c, out_h, out_w));
    checkCudaErrors(cudaMalloc((void**)&outData_d, sizeof(float)*out_c*out_w*out_h));

    checkCUDNN(cudnnSetConvolutionMathType(convDesc, CUDNN_TENSOR_OP_MATH));

    int RetCnt;
    cudnnConvolutionFwdAlgoPerf_t     fwd_algo_pref_[8];
    cudnnFindConvolutionForwardAlgorithm(cudnnHandle,
                                            inTensorDesc,
                                            filterDesc,
                                            convDesc,
                                            outTensorDesc,
                                            8,
                                            &RetCnt,
                                            fwd_algo_pref_);
    /*checkCUDNN(cudnnGetConvolutionForwardAlgorithm_v7(cudnnHandle,
                                            inTensorDesc,
                                            filterDesc,
                                            convDesc,
                                            outTensorDesc,
                                            4,
                                            &RetCnt,
                                            fwd_algo_pref_));  */                                      

    cout << "Fastest algorithm for conv = " << fwd_algo_pref_[0].algo << endl;

    size_t sizeInBytes = 0;
               
    checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle,
                                            inTensorDesc,
                                            filterDesc,
                                            convDesc,
                                            outTensorDesc,
                                            fwd_algo_pref_[0].algo,
                                            &sizeInBytes));

    cout << "sizeInBytes " << sizeInBytes << endl;


    if (sizeInBytes != 0) 
        checkCudaErrors(cudaMalloc(&workSpace, sizeInBytes));

    for(int i=0;i<300;i++)   // gpu warm-up
    {
        checkCUDNN(cudnnConvolutionForward(cudnnHandle,
                                            &alpha,
                                            inTensorDesc,
                                            inData_d,
                                            filterDesc,
                                            filterData_d,
                                            convDesc,
                                            fwd_algo_pref_[0].algo,
                                            workSpace,
                                            sizeInBytes,
                                            &beta,
                                            outTensorDesc,
                                            outData_d));
    }

    cudaEventRecord(start);
    for(int i=0;i<1000;i++)
    {
        checkCUDNN(cudnnConvolutionForward(cudnnHandle,
                                            &alpha,
                                            inTensorDesc,
                                            inData_d,
                                            filterDesc,
                                            filterData_d,
                                            convDesc,
                                            fwd_algo_pref_[0].algo,
                                            workSpace,
                                            sizeInBytes,
                                            &beta,
                                            outTensorDesc,
                                            outData_d));
    }
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    printf("gpu - %f \n", milliseconds / 1000);
   

    checkCudaErrors(cudaFree(inData_d));
    checkCudaErrors(cudaFree(outData_d));
    if(sizeInBytes)
        checkCudaErrors(cudaFree(workSpace));

    cudaEventDestroy(start);
    cudaEventDestroy(stop);



    checkCUDNN(cudnnDestroyTensorDescriptor(inTensorDesc));
    checkCUDNN(cudnnDestroyTensorDescriptor(outTensorDesc));
    checkCUDNN(cudnnDestroyTensorDescriptor(biasTensorDesc));
    checkCUDNN(cudnnDestroyConvolutionDescriptor(convDesc));
    checkCUDNN(cudnnDestroyFilterDescriptor(filterDesc));
    checkCUDNN(cudnnDestroyActivationDescriptor(actDesc));
    checkCUDNN(cudnnDestroy(cudnnHandle));


    return EXIT_SUCCESS;
}

Also, I am using JetPack 5.0

Hi,

We try to reproduce this issue internally but got some errors when testing the true fp16 path.

Do we need to modify other places for the true fp16 profiling?
We only change the following lines:

checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc, padding_h, padding_w, stride_vertical, stride_horizontal, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_HALF));
// FOR PSEUDO FP16
//checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc, padding_h, padding_w, stride_vertical, stride_horizontal, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT));

Thanks.

There is no update from you for a period, assuming this is not an issue any more.
Hence we are closing this topic. If need further support, please open a new one.
Thanks

Is this still an issue to support? Any information can be shared? Thanks