cudnnSoftmaxForward returns wrong results for small tensors after upgrading to 8.3.0

While updating cuDNN from 8.2.2 to 8.3.0, some regression tests in our software started failing.
I was able to pinpoint the failures to cudnnSoftmaxForward returning incorrect results for small tensors.
Here is a simple reproducer:

#include <stdlib.h>
#include <float.h>
#include <stdio.h>

#include <cuda.h>
#include <cudnn.h>

#include <algorithm>
#include <vector>

#define CHECK_CUDA(call_) \
  do { \
    cudaError_t returnCode_ = (call_); \
    if (returnCode_ != cudaSuccess) { \
      fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\": %s\n", __FILE__, __LINE__, static_cast<int>(returnCode_), cudaGetErrorName(returnCode_), #call_, cudaGetErrorString(returnCode_)); \
      exit(EXIT_FAILURE); \
    } \
  } while (0)

#define CHECK_CUDNN(call_) \
  do { \
    cudnnStatus_t returnCode_ = (call_); \
    if (returnCode_ != CUDNN_STATUS_SUCCESS) { \
      fprintf(stderr, "CUDNN error at %s:%d code=%d(%s) \"%s\"\n", __FILE__, __LINE__, static_cast<int>(returnCode_), cudnnGetErrorString(returnCode_), #call_); \
      exit(EXIT_FAILURE); \
    } \
  } while (0)

// Runs an accurate CUDNN_SOFTMAX_MODE_CHANNEL softmax on an all-zero
// n x c x h x w tensor; with identical inputs every output should be the
// same value (1/c), so the result must be uniform.
static bool runSoftMaxOne(cudnnHandle_t cudnnHandle, int32_t n, int32_t c, int32_t h, int32_t w)
{
  cudnnTensorDescriptor_t srcTensorDescriptor = nullptr;
  CHECK_CUDNN(cudnnCreateTensorDescriptor(&srcTensorDescriptor));
  CHECK_CUDNN(cudnnSetTensor4dDescriptor(srcTensorDescriptor, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w));
  cudnnTensorDescriptor_t dstTensorDescriptor = nullptr;
  CHECK_CUDNN(cudnnCreateTensorDescriptor(&dstTensorDescriptor));
  CHECK_CUDNN(cudnnSetTensor4dDescriptor(dstTensorDescriptor, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w));

  const size_t tensorDataSize = static_cast<size_t>(n) * c * h * w * sizeof(float);
  void* srcTensorData = nullptr;
  CHECK_CUDA(cudaMalloc(&srcTensorData, tensorDataSize));
  void* dstTensorData = nullptr;
  CHECK_CUDA(cudaMalloc(&dstTensorData, tensorDataSize));

  CHECK_CUDA(cudaMemset(srcTensorData, 0, tensorDataSize));
  const float alpha = 1.0F;
  const float beta = 0.0F;
  CHECK_CUDNN(cudnnSoftmaxForward(cudnnHandle, CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL, &alpha, srcTensorDescriptor, srcTensorData, &beta, dstTensorDescriptor, dstTensorData));
  std::vector<float> result(static_cast<size_t>(n) * c * h * w);
  CHECK_CUDA(cudaMemcpy(result.data(), dstTensorData, tensorDataSize, cudaMemcpyDeviceToHost));
  CHECK_CUDA(cudaFree(dstTensorData));
  CHECK_CUDA(cudaFree(srcTensorData));
  CHECK_CUDNN(cudnnDestroyTensorDescriptor(dstTensorDescriptor));
  CHECK_CUDNN(cudnnDestroyTensorDescriptor(srcTensorDescriptor));
  // All-zero input => uniform softmax output: every element should equal result[0].
  return std::all_of(result.begin(), result.end(), [&](float x) { return x == result[0]; });
}
// Sweeps every shape with each dimension in [1, 128) and reports failing shapes.
static void runSoftMax(cudnnHandle_t cudnnHandle)
{
  for (int32_t n = 1; n < 128; ++n) {
    for (int32_t c = 1; c < 128; ++c) {
      for (int32_t h = 1; h < 128; ++h) {
        for (int32_t w = 1; w < 128; ++w) {
          if (!runSoftMaxOne(cudnnHandle, n, c, h, w)) {
            fprintf(stdout, "FAIL FOR %d, %d, %d, %d\n", n, c, h, w);
          }
        }
      }
    }
  }
}

int main()
{
  int runtimeVersion;
  CHECK_CUDA(cudaRuntimeGetVersion(&runtimeVersion));
  fprintf(stdout, "cuda runtime %d.%d.%d\n",runtimeVersion / 1000, (runtimeVersion / 10) % 100, runtimeVersion % 10);
  size_t cudnnVersion = cudnnGetVersion();
  fprintf(stdout, "cudnn %d.%d.%d\n", static_cast<int>(cudnnVersion / 1000), static_cast<int>((cudnnVersion / 100) % 10), static_cast<int>(cudnnVersion % 10));
  struct cudaDeviceProp deviceProp;
  CHECK_CUDA(cudaGetDeviceProperties(&deviceProp, 0));
  bool const supportTrueFP16 = (deviceProp.major > 6) || ((deviceProp.major == 6) && ((deviceProp.minor == 0) || (deviceProp.minor == 2)));
  (void)supportTrueFP16; // unused in this reproducer

  cudnnHandle_t cudnnHandle = nullptr;
  CHECK_CUDNN(cudnnCreate(&cudnnHandle));
  runSoftMax(cudnnHandle);
  CHECK_CUDNN(cudnnDestroy(cudnnHandle));
  cudnnHandle = nullptr;

  return EXIT_SUCCESS;
}
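
As a side note on the verification: since the input is all zeros, an accurate channel-mode softmax should produce the uniform value 1/c everywhere, so the all-equal test above could be tightened into an expected-value comparison. Here is a minimal sketch (checkUniformSoftmax is a hypothetical helper added for illustration, not part of the original reproducer; it needs <cmath>):

// Hypothetical helper (illustration only): with an all-zero input, every
// channel-mode softmax output should equal 1/c, since exp(0) == 1 and the
// per-position sum over the c channels is c.
static bool checkUniformSoftmax(const std::vector<float>& result, int32_t c)
{
  const float expected = 1.0F / static_cast<float>(c);
  const float tolerance = 1e-6F; // small slack for rounding differences
  return std::all_of(result.begin(), result.end(), [&](float x) { return std::fabs(x - expected) <= tolerance; });
}

With this helper, runSoftMaxOne could end with return checkUniformSoftmax(result, c); instead, which also catches outputs that are all equal but uniformly wrong.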

Edit 1:
This runs with CUDA 11.1 on Windows 10 on a GTX 1050 Ti GPU.
The same happens on Linux on a GTX 1080 Ti with CUDA 11.1 and driver 470.63.01.
Edit 2:
This only seems to happen for batch size 1.
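To isolate the batch-size dependence, a narrower sweep can contrast n == 1 with n == 2 for identical shapes. This is a sketch reusing runSoftMaxOne from the reproducer above; compareBatchSizes is a name made up for illustration:

// Sketch: contrast n == 1 with n == 2 for the same c/h/w to confirm the
// failure depends only on the batch dimension (reuses runSoftMaxOne above).
static void compareBatchSizes(cudnnHandle_t cudnnHandle)
{
  for (int32_t c = 1; c < 128; ++c) {
    for (int32_t h = 1; h < 128; ++h) {
      for (int32_t w = 1; w < 128; ++w) {
        const bool okN1 = runSoftMaxOne(cudnnHandle, 1, c, h, w);
        const bool okN2 = runSoftMaxOne(cudnnHandle, 2, c, h, w);
        if (okN1 != okN2) {
          fprintf(stdout, "BATCH-DEPENDENT: c=%d, h=%d, w=%d (n=1 %s, n=2 %s)\n", c, h, w, okN1 ? "passes" : "fails", okN2 ? "passes" : "fails");
        }
      }
    }
  }
}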
Edit 3:
Filed bug https://developer.nvidia.com/nvidia_bug/3434175

Hi @matthieu.darbois,

Thank you for filing the bug. We will try to reproduce the issue following the provided steps and will keep you posted.

Thanks

FYI, we'll post the internal ticket's progress here to help other users following this thread.

The fix will be in the next cuDNN release, and we'll add a warning to the 8.3.0 documentation along the lines of:

Calling cudnnSoftmaxForward() with CUDNN_SOFTMAX_MODE_CHANNEL mode and N==1 in NCHW layout could result in incorrect results in cuDNN 8.3.0. This has been fixed in cuDNN 8.3.1.
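
Until 8.3.1 ships, one possible partial workaround, offered purely as a sketch (not suggested in this thread, and unverified against 8.3.0): for the common N == 1, H == 1, W == 1 classification case, CUDNN_SOFTMAX_MODE_INSTANCE normalizes over all C*H*W elements of each image, which is the same reduction as per-channel softmax when H == W == 1, and the note above only implicates CUDNN_SOFTMAX_MODE_CHANNEL. As a drop-in replacement for the cudnnSoftmaxForward call in the reproducer's runSoftMaxOne:

// Workaround sketch (unverified assumption): when H == W == 1, per-instance
// softmax reduces over C*H*W == C elements, i.e. the same set as per-channel
// softmax, and the known issue only mentions CUDNN_SOFTMAX_MODE_CHANNEL.
const cudnnSoftmaxMode_t mode = (n == 1 && h == 1 && w == 1)
    ? CUDNN_SOFTMAX_MODE_INSTANCE
    : CUDNN_SOFTMAX_MODE_CHANNEL;
CHECK_CUDNN(cudnnSoftmaxForward(cudnnHandle, CUDNN_SOFTMAX_ACCURATE, mode, &alpha, srcTensorDescriptor, srcTensorData, &beta, dstTensorDescriptor, dstTensorData));

Shapes with H*W > 1 are not covered by this substitution, so those still need the 8.3.1 fix.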