Problem using a convolution filter in NHWC layout format with cuDNN

Hi,

I would like to perform a 1D convolution with cudnnConvolutionForward(…) (with the height always equal to 1).
My data are laid out in the NHWC format.
I first ran a simple test to check the convolution results with the following dimensions:

batchsize = 1
input_channel = 1
output_channel = 3
input_height = 1
input_width = 8
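
(For reference, this is the linear index I assume for the NHWC layout, with the channel dimension innermost; the helper name is just for illustration. With batch = channels = height = 1 the 8 input values are simply stored in order.)

    // Assumed NHWC indexing: c is the fastest-varying dimension.
    inline int nhwc_index(int n, int h, int w, int c, int H, int W, int C)
    {
        return ((n * H + h) * W + w) * C + c;
    }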

The problem is:
cuDNN seems to always interpret my filter in the NCHW layout format, even when I use the CUDNN_TENSOR_NHWC flag.

My input is:

| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |

The convolution filters are:

In NCHW format:

          | w1 | w2 | w3 |
channel 1 |  1 |  1 |  1 |
channel 2 |  0 |  0 |  0 |
channel 3 |  0 |  0 |  0 |

In NHWC format:

   | channel 1 | channel 2 | channel 3 |
w1 |     1     |     1     |     1     |
w2 |     0     |     0     |     0     |
w3 |     0     |     0     |     0     |
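
(To make the two tables concrete, this is how I read them as flat memory; both describe the same nine floats, only the indexing differs. Sketch for illustration only:)

    // The nine filter weights as they would sit in memory (same bytes for both tables):
    const float flat[9] = {1, 1, 1, 0, 0, 0, 0, 0, 0};

    // Table 1 (NCHW reading):            weight(channel, w) = flat[channel * 3 + w]
    // Table 2 (NHWC reading, as I read it): weight(w, channel) = flat[w * 3 + channel]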

Whether I use CUDNN_TENSOR_NHWC or CUDNN_TENSOR_NCHW, I obtain these results:

channel 1 | 6 | 9 | 12 | 15 | 18 | 21 |
channel 2 | 0 | 0 |  0 |  0 |  0 |  0 |
channel 3 | 0 | 0 |  0 |  0 |  0 |  0 |

This is the result I would expect for NCHW. In my case (NHWC), I should obtain:

channel 1 | 1 | 2 | 3 | 4 | 5 | 6 |
channel 2 | 1 | 2 | 3 | 4 | 5 | 6 |
channel 3 | 1 | 2 | 3 | 4 | 5 | 6 |
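
(For checking, here is a minimal CPU cross-correlation sketch for these dimensions, stride 1 and no padding. With every output channel's weights set to {1, 0, 0} it reproduces the table above; with channel 1 = {1, 1, 1} and the other channels all zero it reproduces what cuDNN actually returns.)

    #include <cstdio>

    // Minimal CPU reference (sketch): 1 input channel, 3 output channels,
    // 1x3 kernel, stride 1, no padding -> output width = 8 - 3 + 1 = 6.
    int main()
    {
        const float input[8] = {1, 2, 3, 4, 5, 6, 7, 8};

        // Per-output-channel weights; {1, 0, 0} everywhere gives the table above.
        const float w[3][3] = {{1, 0, 0}, {1, 0, 0}, {1, 0, 0}};

        for (int k = 0; k < 3; ++k)           // output channel
        {
            std::printf("channel %d |", k + 1);
            for (int x = 0; x < 6; ++x)       // output position
            {
                float acc = 0.f;
                for (int s = 0; s < 3; ++s)   // kernel tap
                    acc += w[k][s] * input[x + s];
                std::printf(" %g |", acc);
            }
            std::printf("\n");
        }
        return 0;
    }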

Am I missing something?

My code:

    float data[8] = {1, 2, 3, 4, 5, 6, 7, 8};

    //                             c1 c2 c3
    const float h_kernel[3][3] = {{1, 0, 0},
                                  {0, 1, 0},
                                  {0, 0, 1}};
                            
    cudnnHandle_t cudnn;
    checkCUDNN(cudnnCreate(&cudnn));
    
    cudnnTensorDescriptor_t input_descriptor;
    checkCUDNN(cudnnCreateTensorDescriptor(&input_descriptor));
    checkCUDNN(cudnnSetTensor4dDescriptor(input_descriptor,
                                        /*format=*/CUDNN_TENSOR_NHWC,
                                        /*dataType=*/CUDNN_DATA_FLOAT,
                                        /*batch_size=*/1,
                                        /*channels=*/1,
                                        /*image_height=*/1,
                                        /*image_width=*/8));
    
    
    cudnnFilterDescriptor_t kernel_descriptor;
    checkCUDNN(cudnnCreateFilterDescriptor(&kernel_descriptor));
    checkCUDNN(cudnnSetFilter4dDescriptor(kernel_descriptor,
                                          /*dataType=*/CUDNN_DATA_FLOAT,
                                          /*format=*/CUDNN_TENSOR_NHWC,
                                          /*out_channels=*/3,
                                          /*in_channels=*/1,
                                          /*kernel_height=*/1,
                                          /*kernel_width=*/3));
    
    
    cudnnConvolutionDescriptor_t convolution_descriptor;
    checkCUDNN(cudnnCreateConvolutionDescriptor(&convolution_descriptor));
    checkCUDNN(cudnnSetConvolution2dDescriptor(convolution_descriptor,
                                               /*pad_height=*/0,
                                               /*pad_width=*/0,
                                               /*vertical_stride=*/1,
                                               /*horizontal_stride=*/1,
                                               /*dilation_height=*/1,
                                               /*dilation_width=*/1,
                                               /*mode=*/CUDNN_CROSS_CORRELATION,
                                               /*computeType=*/CUDNN_DATA_FLOAT));
    
    
    int output_batchsize = 0, output_channel = 0, output_height = 0, output_width = 0;
    checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convolution_descriptor,
                                                     input_descriptor,
                                                     kernel_descriptor,
                                                     &output_batchsize,
                                                     &output_channel,
                                                     &output_height,
                                                     &output_width));
    
    std::cout << "convolution output dimension : ( n = " << output_batchsize << " , c = " << output_channel << " , h = " << output_height << " , w = " << output_width << " )" << std::endl;
    
    cudnnTensorDescriptor_t output_descriptor;
    checkCUDNN(cudnnCreateTensorDescriptor(&output_descriptor));
    checkCUDNN(cudnnSetTensor4dDescriptor(output_descriptor,
                                          /*format=*/CUDNN_TENSOR_NHWC,
                                          /*dataType=*/CUDNN_DATA_FLOAT,
                                          /*batch_size=*/output_batchsize,
                                          /*channels=*/output_channel,
                                          /*image_height=*/output_height,
                                          /*image_width=*/output_width));
    
    
    size_t workspace_bytes = 0;
    checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnn,
                                                   input_descriptor,
                                                   kernel_descriptor,
                                                   convolution_descriptor,
                                                   output_descriptor,
                                                   CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM,
                                                   &workspace_bytes));
    
    
    void* d_workspace{nullptr};
    cudaMalloc(&d_workspace, workspace_bytes);
    
    float* d_input{nullptr};
    cudaMalloc(&d_input, sizeof(data));
    cudaMemcpy(d_input, data, sizeof(data), cudaMemcpyHostToDevice);
    
    float* d_kernel{nullptr};
    cudaMalloc(&d_kernel, sizeof(h_kernel));
    cudaMemcpy(d_kernel, h_kernel, sizeof(h_kernel), cudaMemcpyHostToDevice);
    
    int output_bytes = output_batchsize * output_channel * output_height * output_width * sizeof(float);
    float* d_output{nullptr};
    cudaMalloc(&d_output, output_bytes);
    cudaMemset(d_output, 0, output_bytes);
    
    // h_bias is not declared in this snippet (three zero floats assumed here);
    // d_bias is not actually passed to cudnnConvolutionForward below.
    const float h_bias[3] = {0, 0, 0};
    float* d_bias{nullptr};
    cudaMalloc(&d_bias, sizeof(h_bias));
    cudaMemcpy(d_bias, h_bias, sizeof(h_bias), cudaMemcpyHostToDevice);
    
    const float alpha = 1, beta = 0;
    checkCUDNN(cudnnConvolutionForward(cudnn,
                                       &alpha,
                                       input_descriptor,
                                       d_input,
                                       kernel_descriptor,
                                       d_kernel,
                                       convolution_descriptor,
                                       CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM,
                                       d_workspace,
                                       workspace_bytes,
                                       &beta,
                                       output_descriptor,
                                       d_output));
    
    
    float* h_output = new float[output_bytes / sizeof(float)];   // output_bytes is in bytes, not elements
    cudaMemcpy(h_output, d_output, output_bytes, cudaMemcpyDeviceToHost);
    
    std::cout << std::endl;
    std::cout << "output per channel: " << std::endl;
    for (int i = 0; i < output_width * output_channel; i++)
    {
        std::cout << "channel = " << i % output_channel << " , index = " << i / output_channel << " : " << h_output[i] << std::endl;
    }
    
    
    delete[] h_output;
    cudaFree(d_kernel);
    cudaFree(d_input);
    cudaFree(d_output);
    cudaFree(d_bias);
    cudaFree(d_workspace);
    
    cudnnDestroyTensorDescriptor(input_descriptor);
    cudnnDestroyTensorDescriptor(output_descriptor);
    cudnnDestroyFilterDescriptor(kernel_descriptor);
    cudnnDestroyConvolutionDescriptor(convolution_descriptor);

Thank you in advance,