Depthwise convolution in cuDNN FP16 is slower than FP32

I tested depthwise convolution with cuDNN in FP16 on Xavier, and I found that FP16 is slower than FP32 for both of the depthwise configurations below. Is this expected? And does it mean I must use FP32 for inference whenever a network contains depthwise convolutions, as MobileNet-V2 does?
|in_channel|height|width|out_channel|kernel|stride|group|cuDNN FP32|cuDNN FP16|
|---|---|---|---|---|---|---|---|---|
|480|80|48|480|7|1|480|1.696 ms|11.145 ms|
|32|400|144|32|7|1|32|1.695 ms|5.428 ms|

The code is as follows (the error-checking macros and the perf_t alias are defined elsewhere in my file):
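For reference, checkCudaErr and checkCudnnErr are thin error-checking macros along these lines (a simplified sketch, not my exact code):

#define checkCudaErr(call)                                                    \
    do {                                                                      \
        cudaError_t err = (call);                                             \
        if (err != cudaSuccess)                                               \
            printf("CUDA error %d at %s:%d\n", (int)err, __FILE__, __LINE__); \
    } while (0)

#define checkCudnnErr(call)                                                     \
    do {                                                                        \
        cudnnStatus_t err = (call);                                             \
        if (err != CUDNN_STATUS_SUCCESS)                                        \
            printf("cuDNN error %d at %s:%d\n", (int)err, __FILE__, __LINE__);  \
    } while (0)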

int conv_op_process(int& batch, int& in_channel, int& height, int& width, int& kernel_size, int& num_filter, int& stride, int& group, int& iter, bool& half){

    int input_size = batch * in_channel * height * width;
    float* A = (float*)malloc(sizeof(float) * input_size);
    // filter element count (over-allocates for grouped conv, where the filter
    // actually holds num_filter * (in_channel/group) * k * k values)
    int kernel = kernel_size * kernel_size * in_channel * num_filter;
    float* W = (float*)malloc(sizeof(float) * kernel);
    // output spatial size assumes "same" padding
    int output_size = batch * num_filter * height * width / stride / stride;
    float* B = (float*)malloc(sizeof(float) * output_size);

    // fill with deterministic test data (i/10 and i/15 are integer divisions)
    for(int i = 0; i < input_size; i++){
        A[i] = i / 10;
    }
    for(int i = 0; i < kernel; i++){
        W[i] = i / 15;
    }
    for(int i = 0; i < output_size; i++){
        B[i] = 1;
    }

    // alloc cuda memory
    void* d_A;
    void* d_W;
    void* d_B;
    checkCudaErr(cudaMalloc(&d_A, sizeof(float) * input_size));
    checkCudaErr(cudaMalloc(&d_W, sizeof(float) * kernel));
    checkCudaErr(cudaMalloc(&d_B, sizeof(float) * output_size));

    // mem for warm up
    void* d_A_w;
    void* d_W_w;
    void* d_B_w;
    checkCudaErr(cudaMalloc(&d_A_w, sizeof(float) * input_size));
    checkCudaErr(cudaMalloc(&d_W_w, sizeof(float) * kernel));
    checkCudaErr(cudaMalloc(&d_B_w, sizeof(float) * output_size));

    // memory copy
    checkCudaErr(cudaMemcpy(d_A, (void*)A, sizeof(float) * input_size, cudaMemcpyHostToDevice));
    checkCudaErr(cudaMemcpy(d_W, (void*)W, sizeof(float) * kernel, cudaMemcpyHostToDevice));
    checkCudaErr(cudaMemcpy(d_B, (void*)B, sizeof(float) * output_size, cudaMemcpyHostToDevice));
    checkCudaErr(cudaDeviceSynchronize());    // wait for the copies to finish

    struct timeval start, handle_create, end;
    // cudnn function init
    cudnnHandle_t handle;
    cudnnTensorDescriptor_t input_desc;
    cudnnTensorDescriptor_t output_desc;
    cudnnFilterDescriptor_t filter_desc;
    cudnnConvolutionDescriptor_t conv_op_desc;
    // creating the handle is expensive, so it stays outside the timed region
    checkCudnnErr(cudnnCreate(&handle));
    gettimeofday(&start, NULL);
    checkCudnnErr(cudnnCreateTensorDescriptor(&input_desc));
    checkCudnnErr(cudnnCreateTensorDescriptor(&output_desc));
    checkCudnnErr(cudnnCreateFilterDescriptor(&filter_desc));
    checkCudnnErr(cudnnCreateConvolutionDescriptor(&conv_op_desc));

    checkCudnnErr(cudnnSetConvolutionGroupCount(conv_op_desc, group));

    cudnnTensorFormat_t infer_type = CUDNN_TENSOR_NHWC;

    int pad = (kernel_size - 1) / 2;    // "same" padding for odd kernel sizes
    if(half){
        printf("Using FP16\n");
        // link mem with desc
        checkCudnnErr(cudnnSetTensor4dDescriptor(input_desc, infer_type, CUDNN_DATA_HALF, batch, in_channel, height, width));
        checkCudnnErr(cudnnSetTensor4dDescriptor(output_desc, infer_type, CUDNN_DATA_HALF, batch, num_filter, int(height/stride), int(width/stride)));
        checkCudnnErr(cudnnSetFilter4dDescriptor(filter_desc, CUDNN_DATA_HALF, infer_type, num_filter, int(in_channel/group), kernel_size, kernel_size));
        if(group != 1){
            // grouped (depthwise) convolution: accumulate in FP32
            checkCudnnErr(cudnnSetConvolution2dDescriptor(conv_op_desc, pad, pad, stride, stride, 1, 1, CUDNN_CONVOLUTION, CUDNN_DATA_FLOAT));
        }
        else{
            checkCudnnErr(cudnnSetConvolution2dDescriptor(conv_op_desc, pad, pad, stride, stride, 1, 1, CUDNN_CONVOLUTION, CUDNN_DATA_HALF));
        }
        // allow Tensor Core kernels
        checkCudnnErr(cudnnSetConvolutionMathType(conv_op_desc, CUDNN_TENSOR_OP_MATH));
    }
    else{
        // link mem with desc
        checkCudnnErr(cudnnSetTensor4dDescriptor(input_desc, infer_type, CUDNN_DATA_FLOAT, batch, in_channel, height, width));
        checkCudnnErr(cudnnSetTensor4dDescriptor(output_desc, infer_type, CUDNN_DATA_FLOAT, batch, num_filter, int(height/stride), int(width/stride)));
        checkCudnnErr(cudnnSetFilter4dDescriptor(filter_desc, CUDNN_DATA_FLOAT, infer_type, num_filter, int(in_channel/group), kernel_size, kernel_size));
        checkCudnnErr(cudnnSetConvolution2dDescriptor(conv_op_desc, pad, pad, stride, stride, 1, 1, CUDNN_CONVOLUTION, CUDNN_DATA_FLOAT));
    }

    printf("conv info %d %d %d %d %d %d %d %d\n", in_channel, height, width, num_filter, kernel_size, pad, stride, group);
    // prepare the convolution algorithm
//    cudnnConvolutionFwdAlgoPerf_t algo_pref;
    std::unique_ptr<perf_t[]> perf_algos(new perf_t[8]);
    int num_algo = 8;
    int num_algo_get = 0;
    void* workspace_data_temp;
    size_t workspace_data_temp_size = 30 * 1024 * 1024;    // ~30 MB scratch for the algorithm search
    checkCudaErr(cudaMalloc(&workspace_data_temp, workspace_data_temp_size));
    cudnnConvolutionFwdAlgo_t algo;
    //checkCudnnErr(cudnnGetConvolutionForwardAlgorithm_v7(handle, input_desc, filter_desc, conv_op_desc, output_desc, num_algo, &num_algo_get, perf_algos.get()));
    checkCudnnErr(cudnnFindConvolutionForwardAlgorithmEx(handle, input_desc, d_A_w, filter_desc, d_W_w, conv_op_desc, output_desc,d_B_w, num_algo, &num_algo_get, perf_algos.get(), workspace_data_temp, workspace_data_temp_size));
    algo = perf_algos[0].algo;
    //if(half){
    //    algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
    //}
    printf("Algo: %d\n", algo);
    size_t workspace_size = 0;
    checkCudnnErr(cudnnGetConvolutionForwardWorkspaceSize(handle, input_desc, filter_desc, conv_op_desc, output_desc, algo, &workspace_size));
    void* workspace;
    checkCudaErr(cudaMalloc(&workspace, workspace_size));
//    printf("workspace_size %d\n", (int)workspace_size);

//#define ITER 2
    // run the convolution: B = alpha * conv(A, W) + beta * B
    float alpha = 1;
    float beta = 0;
    // warm up
    for(int i = 0; i < 10; i++){
    	checkCudnnErr(cudnnConvolutionForward(handle, (void*)&alpha, input_desc, d_A_w, filter_desc, d_W_w, conv_op_desc, algo, workspace, workspace_size, (void*)&beta, output_desc, d_B_w));
    	checkCudaErr(cudaDeviceSynchronize());
    }

    gettimeofday(&handle_create, NULL);
    for(int i = 0; i < iter; i++){
    	checkCudnnErr(cudnnConvolutionForward(handle, (void*)&alpha, input_desc, d_A, filter_desc, d_W, conv_op_desc, algo, workspace, workspace_size, (void*)&beta, output_desc, d_B));
    	checkCudaErr(cudaDeviceSynchronize());
    }
    gettimeofday(&end, NULL);

    int time = 0;
    printf("========== Time ==========\n");
    //time = 1000000 * (handle_create.tv_sec - start.tv_sec) + (handle_create.tv_usec - start.tv_usec);
    //printf("Handle Create: %d us\n", time);
    time = 1000000 * (end.tv_sec - handle_create.tv_sec) + (end.tv_usec - handle_create.tv_usec);
    printf("Conv Op Run: %d us\n", time / iter);
    // copy the results back to the host
    checkCudaErr(cudaMemcpy((void*)B, d_B, sizeof(float) * output_size, cudaMemcpyDeviceToHost));
    checkCudaErr(cudaMemcpy((void*)A, d_A, sizeof(float) * input_size, cudaMemcpyDeviceToHost));
    checkCudaErr(cudaDeviceSynchronize());

//    for(int i = 0; i < output_size; i++){
//        printf("%d -> %f\n", i, B[i]);
//    }

    return 0;
}
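As an aside on methodology, the timed loop could also use CUDA events instead of gettimeofday plus a cudaDeviceSynchronize per iteration; a sketch of that variant, reusing the handle, descriptors, and buffers from above:

    // time the loop on the GPU with CUDA events (avoids host-timer jitter)
    cudaEvent_t ev_start, ev_stop;
    checkCudaErr(cudaEventCreate(&ev_start));
    checkCudaErr(cudaEventCreate(&ev_stop));
    checkCudaErr(cudaEventRecord(ev_start));
    for(int i = 0; i < iter; i++){
        checkCudnnErr(cudnnConvolutionForward(handle, (void*)&alpha, input_desc, d_A, filter_desc, d_W, conv_op_desc, algo, workspace, workspace_size, (void*)&beta, output_desc, d_B));
    }
    checkCudaErr(cudaEventRecord(ev_stop));
    checkCudaErr(cudaEventSynchronize(ev_stop));
    float ms = 0.0f;
    checkCudaErr(cudaEventElapsedTime(&ms, ev_start, ev_stop));    // elapsed GPU time in milliseconds
    printf("Conv Op Run (events): %.3f ms per iteration\n", ms / iter);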

I also found that the convolution mode in the descriptor (CUDNN_CONVOLUTION vs. CUDNN_CROSS_CORRELATION) affects the run time, so I wonder what the difference between these two modes is.
cudnnSetConvolution2dDescriptor(conv_op_desc, pad, pad, stride, stride, 1, 1, CUDNN_CONVOLUTION, CUDNN_DATA_FLOAT)
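For context, the only mathematical difference between the two modes is whether the kernel is flipped before the sliding dot product: CUDNN_CONVOLUTION applies the textbook convolution (flipped kernel), while CUDNN_CROSS_CORRELATION slides the kernel as stored, which is what most deep learning frameworks actually compute and call "convolution". A small standalone illustration in plain C++ (independent of cuDNN):

    #include <cstdio>

    int main(){
        const int N = 5, K = 3;
        float x[N] = {1, 2, 3, 4, 5};
        float w[K] = {1, 0, -1};
        // "valid" output length is N - K + 1
        for(int i = 0; i <= N - K; i++){
            float xcorr = 0, conv = 0;
            for(int k = 0; k < K; k++){
                xcorr += w[k] * x[i + k];            // cross-correlation: kernel as stored
                conv  += w[K - 1 - k] * x[i + k];    // convolution: kernel flipped
            }
            printf("i=%d  cross-corr=%+.0f  conv=%+.0f\n", i, xcorr, conv);
        }
        return 0;
    }

For a kernel that is symmetric under a 180-degree flip the two modes give identical outputs; otherwise weights trained in one mode must be spatially flipped to be used in the other.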

Hi,

We will try to reproduce this issue in our environment and will update with more information later.

Thanks.

Hi,

We tried to reproduce this issue in our environment but found a missing type called perf_t.
To reproduce it more efficiently, would you mind sharing the complete source with us directly?

topic_145123.cpp:115:21: error: ‘perf_t’ was not declared in this scope
     std::unique_ptr<perf_t[]> perf_algos(new perf_t[8]);
                     ^~~~~~
topic_145123.cpp:115:21: note: suggested alternative: ‘perror’
     std::unique_ptr<perf_t[]> perf_algos(new perf_t[8]);
                     ^~~~~~
                     perror
topic_145123.cpp:115:29: error: template argument 1 is invalid
     std::unique_ptr<perf_t[]> perf_algos(new perf_t[8]);
                             ^
topic_145123.cpp:115:29: error: template argument 2 is invalid
topic_145123.cpp:115:46: error: ‘perf_t’ does not name a type; did you mean ‘perror’?
     std::unique_ptr<perf_t[]> perf_algos(new perf_t[8]);
                                              ^~~~~~
                                              perror
topic_145123.cpp:123:174: error: request for member ‘get’ in ‘perf_algos’, which is of non-class type ‘int’
     checkCudnnErr(cudnnFindConvolutionForwardAlgorithmEx(handle, input_desc, d_A_w, filter_desc, d_W_w, conv_op_desc, output_desc,d_B_w, num_algo, &num_algo_get, perf_algos.get(), workspace_data_temp, workspace_data_temp_size));
                                                                                                                                                                              ^
topic_145123.cpp:18:25: note: in definition of macro ‘checkCudnnErr’
     cudnnStatus_t err = call;                                 \
                         ^~~~
topic_145123.cpp:124:24: error: invalid types ‘int[int]’ for array subscript
     algo = perf_algos[0].algo;

Thanks.

I modified cudnnSetConvolution2dDescriptor to use CUDNN_CROSS_CORRELATION, and it now runs at the expected speed. But when I use CUDNN_CONVOLUTION, the FP16 run time is much longer.

conv_op.cpp (8.8 KB)

Hi

We tried your sample above with CUDNN_CONVOLUTION but were not able to reproduce the issue.
Did we miss anything?

diff --git a/topic_145123.cpp b/topic_145123.cpp
index eeb0e9a..c8b6a78 100644
--- a/topic_145123.cpp
+++ b/topic_145123.cpp
@@ -124,10 +124,10 @@ int conv_op_process(int& batch, int& in_channel, int& height, int& width, int& k
         checkCudnnErr(cudnnSetFilter4dDescriptor(filter_desc, CUDNN_DATA_HALF, infer_type, num_filter, int(in_channel/group), kernel_size, kernel_size));
         if(group != 1){
             // CUDNN_CROSS_CORRELATION is very importance in this config
-            checkCudnnErr(cudnnSetConvolution2dDescriptor(conv_op_desc, pad, pad, stride, stride, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT));
+            checkCudnnErr(cudnnSetConvolution2dDescriptor(conv_op_desc, pad, pad, stride, stride, 1, 1, CUDNN_CONVOLUTION, CUDNN_DATA_FLOAT));
         }
         else{
-            checkCudnnErr(cudnnSetConvolution2dDescriptor(conv_op_desc, pad, pad, stride, stride, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_HALF));
+            checkCudnnErr(cudnnSetConvolution2dDescriptor(conv_op_desc, pad, pad, stride, stride, 1, 1, CUDNN_CONVOLUTION, CUDNN_DATA_HALF));
         }
         // using tensor core
                checkCudnnErr( cudnnSetConvolutionMathType(conv_op_desc, CUDNN_TENSOR_OP_MATH));
@@ -138,7 +138,7 @@ int conv_op_process(int& batch, int& in_channel, int& height, int& width, int& k
         checkCudnnErr(cudnnSetTensor4dDescriptor(input_desc, infer_type, CUDNN_DATA_FLOAT, batch, in_channel, height, width));
         checkCudnnErr(cudnnSetTensor4dDescriptor(output_desc, infer_type, CUDNN_DATA_FLOAT, batch, num_filter, int(height/stride), int(width/stride)));
         checkCudnnErr(cudnnSetFilter4dDescriptor(filter_desc, CUDNN_DATA_FLOAT, infer_type, num_filter, int(in_channel/group), kernel_size, kernel_size));
-        checkCudnnErr(cudnnSetConvolution2dDescriptor(conv_op_desc, pad, pad, stride, stride, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT));
+        checkCudnnErr(cudnnSetConvolution2dDescriptor(conv_op_desc, pad, pad, stride, stride, 1, 1, CUDNN_CONVOLUTION, CUDNN_DATA_FLOAT));
     }
 
     printf("conv info %d %d %d %d %d %d %d %d\n", in_channel, height, width, num_filter, kernel_size, pad, stride, group);

[FP16]

$ ./test 480 80 48 480 7 1 480 1000 16
Using FP16
conv info 480 80 48 480 7 3 1 480
Algo: 1
========== Time ==========
Conv Op Run: 17499 us

[FP32]

$ ./test 480 80 48 480 7 1 480 1000 32
Using FP32
conv info 480 80 48 480 7 3 1 480
Algo: 1
========== Time ==========
Conv Op Run: 16479 us

By the way, have you maximized the device performance before benchmarking?

$ sudo nvpmodel -m 0
$ sudo jetson_clocks

Thanks