When updating cuDNN from 8.2.2 to 8.3.0, some of the non-regression tests in our software started failing.
I could pinpoint the failure to cudnnSoftmaxForward returning wrong results for small tensors.
Here is a simple reproducer:
#include <stdlib.h>
#include <float.h>
#include <stdio.h>
#include <cuda.h>
#include <cudnn.h>
#include <algorithm>
#include <vector>
// Aborts the process with a file:line diagnostic when a CUDA runtime call
// does not return cudaSuccess. Prints the error code, its symbolic name,
// the failing expression, and the human-readable error string.
// Wrapped in do { } while (0) so the macro expands to a single statement
// and is safe inside unbraced if/else bodies.
#define CHECK_CUDA(call_) \
do { \
    cudaError_t returnCode_ = (call_); \
    if (returnCode_ != cudaSuccess) { \
        fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\": %s\n", __FILE__, __LINE__, static_cast<int>(returnCode_), cudaGetErrorName(returnCode_), #call_, cudaGetErrorString(returnCode_)); \
        exit(EXIT_FAILURE); \
    } \
} while (0)
// Aborts the process with a file:line diagnostic when a cuDNN call does not
// return CUDNN_STATUS_SUCCESS. Prints the status code, its string form, and
// the failing expression.
// Wrapped in do { } while (0) so the macro expands to a single statement
// and is safe inside unbraced if/else bodies.
#define CHECK_CUDNN(call_) \
do { \
    cudnnStatus_t returnCode_ = (call_); \
    if (returnCode_ != CUDNN_STATUS_SUCCESS) { \
        fprintf(stderr, "CUDNN error at %s:%d code=%d(%s) \"%s\"\n", __FILE__, __LINE__, static_cast<int>(returnCode_), cudnnGetErrorString(returnCode_), #call_); \
        exit(EXIT_FAILURE); \
    } \
} while (0)
// Runs CUDNN_SOFTMAX_ACCURATE / CUDNN_SOFTMAX_MODE_CHANNEL on an NCHW float
// tensor of shape (n, c, h, w) whose elements are all zero.
//
// Softmax over a constant input must yield a constant output (1/c in every
// position), so a correct result has every element equal to every other.
// Returns true when all output elements are equal, false otherwise — a false
// return reproduces the reported cuDNN 8.3.0 regression.
//
// Preconditions: n, c, h, w >= 1 (the caller's sweep guarantees this; the
// all_of check reads result[0], so an empty tensor would be UB).
static bool runSoftMaxOne(cudnnHandle_t cudnnHandle, int32_t n, int32_t c, int32_t h, int32_t w)
{
    cudnnTensorDescriptor_t srcTensorDescriptor = nullptr;
    CHECK_CUDNN(cudnnCreateTensorDescriptor(&srcTensorDescriptor));
    CHECK_CUDNN(cudnnSetTensor4dDescriptor(srcTensorDescriptor, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w));
    cudnnTensorDescriptor_t dstTensorDescriptor = nullptr;
    CHECK_CUDNN(cudnnCreateTensorDescriptor(&dstTensorDescriptor));
    CHECK_CUDNN(cudnnSetTensor4dDescriptor(dstTensorDescriptor, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w));
    // Cast before multiplying so the product is computed in size_t rather than
    // int — avoids 32-bit overflow if the sweep is ever widened beyond 127.
    const size_t elementCount = static_cast<size_t>(n) * c * h * w;
    const size_t tensorDataSize = elementCount * sizeof(float);
    void* srcTensorData = nullptr;
    CHECK_CUDA(cudaMalloc(&srcTensorData, tensorDataSize));
    void* dstTensorData = nullptr;
    CHECK_CUDA(cudaMalloc(&dstTensorData, tensorDataSize));
    // All-zero input: every channel-softmax result should be exactly 1/c.
    CHECK_CUDA(cudaMemset(srcTensorData, 0, tensorDataSize));
    const float alpha = 1.0F;
    const float beta = 0.0F;
    CHECK_CUDNN(cudnnSoftmaxForward(cudnnHandle, CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL, &alpha, srcTensorDescriptor, srcTensorData, &beta, dstTensorDescriptor, dstTensorData));
    // Blocking device-to-host cudaMemcpy also synchronizes with the softmax
    // launched above, so no explicit cudaDeviceSynchronize is needed.
    std::vector<float> result(elementCount);
    CHECK_CUDA(cudaMemcpy(result.data(), dstTensorData, tensorDataSize, cudaMemcpyDeviceToHost));
    CHECK_CUDA(cudaFree(dstTensorData));
    CHECK_CUDA(cudaFree(srcTensorData));
    CHECK_CUDNN(cudnnDestroyTensorDescriptor(dstTensorDescriptor));
    CHECK_CUDNN(cudnnDestroyTensorDescriptor(srcTensorDescriptor));
    return std::all_of(result.begin(), result.end(), [&](float x) { return x == result[0]; });
}
// Exhaustively sweeps every NCHW shape with each dimension in [1, 127] and
// reports each shape for which the channel softmax of an all-zero tensor is
// not constant. Note: this is 127^4 (~2.6e8) cuDNN calls — intended as a
// brute-force reproducer, not a fast test.
static void runSoftMax(cudnnHandle_t cudnnHandle)
{
    const int32_t kDimLimit = 128; // exclusive upper bound for n, c, h, w
    for (int32_t n = 1; n < kDimLimit; ++n)
        for (int32_t c = 1; c < kDimLimit; ++c)
            for (int32_t h = 1; h < kDimLimit; ++h)
                for (int32_t w = 1; w < kDimLimit; ++w)
                    if (!runSoftMaxOne(cudnnHandle, n, c, h, w))
                        fprintf(stdout, "FAIL FOR %d, %d, %d, %d\n", n, c, h, w);
}
// Prints the CUDA runtime version, the cuDNN version, and the identity of
// device 0 (so the reproducer's output is self-contained in a bug report),
// then runs the exhaustive softmax shape sweep.
int main()
{
    int runtimeVersion;
    CHECK_CUDA(cudaRuntimeGetVersion(&runtimeVersion));
    // CUDA runtime version is encoded as major*1000 + minor*10 + patch.
    fprintf(stdout, "cuda runtime %d.%d.%d\n", runtimeVersion / 1000, (runtimeVersion / 10) % 100, runtimeVersion % 10);
    // cuDNN 8.x encodes its version as major*1000 + minor*100 + patch.
    // NOTE(review): this decoding is specific to cuDNN 8.x; later major
    // versions changed the encoding — confirm before reusing elsewhere.
    size_t cudnnVersion = cudnnGetVersion();
    fprintf(stdout, "cudnn %d.%d.%d\n", static_cast<int>(cudnnVersion / 1000), static_cast<int>((cudnnVersion / 100) % 10), static_cast<int>(cudnnVersion % 10));
    // Report which GPU the sweep runs on. (The original computed an unused
    // supportTrueFP16 flag from these properties; that dead code is removed.)
    struct cudaDeviceProp deviceProp;
    CHECK_CUDA(cudaGetDeviceProperties(&deviceProp, 0));
    fprintf(stdout, "device 0: %s (sm_%d%d)\n", deviceProp.name, deviceProp.major, deviceProp.minor);
    cudnnHandle_t cudnnHandle = nullptr;
    CHECK_CUDNN(cudnnCreate(&cudnnHandle));
    runSoftMax(cudnnHandle);
    CHECK_CUDNN(cudnnDestroy(cudnnHandle));
    cudnnHandle = nullptr;
    return EXIT_SUCCESS;
}
Edit 1:
This runs with cuda 11.1 on Windows 10 on a GTX 1050 Ti GPU
The same is happening on Linux on a GTX 1080 Ti, CUDA 11.1, drivers 470.63.01
Edit 2:
This only seems to happen for batch size 1
Edit 3:
Filed bug https://developer.nvidia.com/nvidia_bug/3434175