Hi,
I would like to perform a 1D convolution with cudnnConvolutionForward(…) (with height always equal to 1).
My data are described with the NHWC layout format.
I first made a simple test to check the convolution results with the following dimensions:
batchsize = 1
input_channel = 1
output_channel = 3
input_height = 1
input_width = 8
The problem is :
cuDNN seems to always interpret my filter in the NCHW layout format even if I use the CUDNN_TENSOR_NHWC flag.
My input is :
| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |
The convolution filters are :
in NCHW format :
| w1 | w2 | w3 |
channel 1 | 1 | 1 | 1 |
channel 2 | 0 | 0 | 0 |
channel 3 | 0 | 0 | 0 |
in NHWC format :
| channel 1 | channel 2 | channel 3 |
w1 | 1 | 1 | 1 |
w2 | 0 | 0 | 0 |
w3 | 0 | 0 | 0 |
Whether it be CUDNN_TENSOR_NHWC or CUDNN_TENSOR_NCHW, I obtain these results :
channel 1 | 6 | 9 | 12 | 15 | 18 | 21 |
channel 2 | 0 | 0 | 0 | 0 | 0 | 0 |
channel 3 | 0 | 0 | 0 | 0 | 0 | 0 |
This is the result I would expect for NCHW. In my case, I should obtain :
channel 1 | 1 | 2 | 3 | 4 | 5 | 6 |
channel 2 | 1 | 2 | 3 | 4 | 5 | 6 |
channel 3 | 1 | 2 | 3 | 4 | 5 | 6 |
Am I missing something ?
My code :
float data[8] = {1, 2, 3, 4, 5, 6, 7, 8};
// c1 c2 c3 const float h_kernel[3][20] = {{1, 0, 0}, \ {0, 1, 0}, \ {0, 0, 1}}; cudnnHandle_t cudnn; checkCUDNN(cudnnCreate(&cudnn)); cudnnTensorDescriptor_t input_descriptor; checkCUDNN(cudnnCreateTensorDescriptor(&input_descriptor)); checkCUDNN(cudnnSetTensor4dDescriptor(input_descriptor, /*format=*/CUDNN_TENSOR_NHWC, /*dataType=*/CUDNN_DATA_FLOAT, /*batch_size=*/1, /*channels=*/1, /*image_height=*/1, /*image_width=*/8)); cudnnFilterDescriptor_t kernel_descriptor; checkCUDNN(cudnnCreateFilterDescriptor(&kernel_descriptor)); checkCUDNN(cudnnSetFilter4dDescriptor(kernel_descriptor, /*dataType=*/CUDNN_DATA_FLOAT, /*format=*/CUDNN_TENSOR_NHWC, /*out_channels=*/3, /*in_channels=*/1, /*kernel_height=*/1, /*kernel_width=*/3)); cudnnConvolutionDescriptor_t convolution_descriptor; checkCUDNN(cudnnCreateConvolutionDescriptor(&convolution_descriptor)); checkCUDNN(cudnnSetConvolution2dDescriptor(convolution_descriptor, /*pad_height=*/0, /*pad_width=*/0, /*vertical_stride=*/1, /*horizontal_stride=*/1, /*dilation_height=*/1, /*dilation_width=*/1, /*mode=*/CUDNN_CROSS_CORRELATION, /*computeType=*/CUDNN_DATA_FLOAT)); checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convolution_descriptor, input_descriptor, kernel_descriptor, &output_batchsize, &output_channel, &output_height, &output_width)); std::cout << "convolution output dimension : ( n = " << output_batchsize << " , c = " << output_channel << " , h = " << output_height << " , w = " << output_width << " )" << std::endl; cudnnTensorDescriptor_t output_descriptor; checkCUDNN(cudnnCreateTensorDescriptor(&output_descriptor)); checkCUDNN(cudnnSetTensor4dDescriptor(output_descriptor, /*format=*/CUDNN_TENSOR_NHWC, /*dataType=*/CUDNN_DATA_FLOAT, /*batch_size=*/output_batchsize, /*channels=*/output_channel, /*image_height=*/output_height, /*image_width=*/output_width)); size_t workspace_bytes = 0; checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnn, input_descriptor, kernel_descriptor, convolution_descriptor, output_descriptor, 
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM, &workspace_bytes)); void* d_workspace{nullptr}; cudaMalloc(&d_workspace, workspace_bytes); float* d_input{nullptr}; cudaMalloc(&d_input, sizeof(data)); cudaMemcpy(d_input, data, sizeof(data), cudaMemcpyHostToDevice); float* d_kernel{nullptr}; cudaMalloc(&d_kernel, sizeof(h_kernel)); cudaMemcpy(d_kernel, h_kernel, sizeof(h_kernel), cudaMemcpyHostToDevice); int output_bytes = output_batchsize * output_channel * output_height * output_width * sizeof(float); float* d_output{nullptr}; cudaMalloc(&d_output, output_bytes); cudaMemset(d_output, 0, output_bytes); float* d_bias{nullptr}; cudaMalloc(&d_bias, sizeof(h_bias)); cudaMemcpy(d_bias, h_bias, sizeof(h_bias), cudaMemcpyHostToDevice); const float alpha = 1, beta = 0; checkCUDNN(cudnnConvolutionForward(cudnn, &alpha, input_descriptor, d_input, kernel_descriptor, d_kernel, convolution_descriptor, CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM, d_workspace, workspace_bytes, &beta, output_descriptor, d_output)); float* h_output = new float[output_bytes]; cudaMemcpy(h_output, d_output, output_bytes, cudaMemcpyDeviceToHost); std::cout << std::endl; std::cout << "output per channel: " << std::endl; for(int i = 0; i < output_width*output_channel; i++) { std::cout << "channel = " << i%3 << " , indice = " << i/3 << " : " << h_output[i] << std::endl; } delete[] h_output; cudaFree(d_kernel); cudaFree(d_input); cudaFree(d_output); cudaFree(d_workspace); cudnnDestroyTensorDescriptor(input_descriptor); cudnnDestroyTensorDescriptor(output_descriptor); cudnnDestroyFilterDescriptor(kernel_descriptor); cudnnDestroyConvolutionDescriptor(convolution_descriptor);
Thank you in advance,