Convolution results differ between forward algorithms on RTX 30-series GPUs

#include <cuda_runtime.h>
#include <cudnn.h>
#include <stdio.h>
#include <stdlib.h>

// Reads `size` raw binary floats from the file `filename` into `pbuf`.
//
// Parameters:
//   pbuf     - destination buffer; must have room for at least `size` floats.
//   size     - number of float elements to read.
//   filename - path of the binary file to read.
//
// The call is a no-op when `pbuf` is NULL or `size` is not positive.
// If the file cannot be opened, or holds fewer than `size` floats, a message
// is written to stderr and every element that could not be read is set to
// 0.0f, so the caller never consumes uninitialized memory.
void teReadDataFromDisk(float* pbuf, int size, const char* filename)
{
    if (pbuf == NULL || size <= 0)
    {
        return;
    }

    const size_t wanted = (size_t)size;
    size_t got = 0;

    // Open read-only ("rb"): the data is only read, so write access is not needed.
    FILE* pFile = NULL;
    if (filename != NULL)
    {
#ifdef _MSC_VER
        if (fopen_s(&pFile, filename, "rb") != 0)
        {
            pFile = NULL;
        }
#else
        pFile = fopen(filename, "rb");
#endif
    }

    if (pFile == NULL)
    {
        fprintf(stderr, "teReadDataFromDisk: cannot open '%s'\n",
                filename != NULL ? filename : "(null)");
    }
    else
    {
        got = fread(pbuf, sizeof(float), wanted, pFile);
        if (got != wanted)
        {
            fprintf(stderr, "teReadDataFromDisk: '%s': read %lu of %lu floats\n",
                    filename, (unsigned long)got, (unsigned long)wanted);
        }
        fclose(pFile);
    }

    // Zero whatever was not filled from disk.
    for (size_t i = got; i < wanted; ++i)
    {
        pbuf[i] = 0.0f;
    }
}

void main()
{
const int BB = 1, CC = 128, WW = 16, HH = 48;
const int XSIZE = BB * CC * WW * HH;
const int WSIZE = CC * CC * 9;
cudaError_t cudaerr = cudaSetDevice(0);

cudnnHandle_t handle = nullptr;
cudnnStatus_t cudnnerr = cudnnCreate(&handle);

float fAlpha = 1.0f, fBeta = 0.0f;

cudnnTensorDescriptor_t xDesc, yDesc;
cudnnFilterDescriptor_t wDesc;
cudnnConvolutionDescriptor_t convDesc;

cudnnerr = cudnnCreateTensorDescriptor(&xDesc);
cudnnerr = cudnnCreateTensorDescriptor(&yDesc);
cudnnerr = cudnnCreateFilterDescriptor(&wDesc);
cudnnerr = cudnnCreateConvolutionDescriptor(&convDesc);

cudnnerr = cudnnSetTensor4dDescriptor(xDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, BB, CC, HH, WW);
cudnnerr = cudnnSetTensor4dDescriptor(yDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, BB, CC, HH, WW);
cudnnerr = cudnnSetFilter4dDescriptor(wDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, CC, CC, 3, 3);

cudnnerr = cudnnSetConvolutionGroupCount(convDesc, 1);
//cudnnSetConvolutionMathType(convDesc, CUDNN_TENSOR_OP_MATH);
cudnnerr = cudnnSetConvolution2dDescriptor(convDesc, 1, 1, 1, 1, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT);

//int b, c, w, h;
//cudnnerr = cudnnGetConvolution2dForwardOutputDim(convDesc, xDesc, wDesc, &b, &c, &h, &w);

cudnnConvolutionFwdAlgo_t algo = (cudnnConvolutionFwdAlgo_t)0;
size_t workspace_size = 0

cudnnerr = cudnnGetConvolutionForwardWorkspaceSize(handle, xDesc, wDesc, convDesc, yDesc,
	(cudnnConvolutionFwdAlgo_t)1, &workspace_size);//1769552

float *x_h = new float[XSIZE];
float *w_h = new float[WSIZE];
float *y_h1 = new float[XSIZE];
float *y_h2 = new float[XSIZE];
float *x_d, *w_d, *y_d, *workspace_d;

cudaerr = cudaMalloc(&x_d, XSIZE*4);
cudaerr = cudaMalloc(&y_d, XSIZE*4);
cudaerr = cudaMalloc(&w_d, WSIZE*4);
cudaerr = cudaMalloc(&workspace_d, workspace_size);

teReadDataFromDisk(x_h, XSIZE, "aaxx");
teReadDataFromDisk(w_h, WSIZE, "aaww");

cudaerr = cudaMemcpy(x_d, x_h, XSIZE * 4, cudaMemcpyHostToDevice);
cudaerr = cudaMemcpy(w_d, w_h, WSIZE * 4, cudaMemcpyHostToDevice);
 

cudnnerr = cudnnConvolutionForward(handle,
	&fAlpha, xDesc, x_d,
	wDesc, w_d,
	convDesc, algo, workspace_d, workspace_size,
	&fBeta,	yDesc, y_d);

cudaerr = cudaMemcpy(y_h1, y_d, XSIZE *4, cudaMemcpyDeviceToHost);

for (size_t ii = 0; ii < 20; ii++)
{
	printf("%f,", y_h1[ii]);
}
printf("\n\n");

algo = (cudnnConvolutionFwdAlgo_t)1;
cudnnerr = cudnnConvolutionForward(handle,
	&fAlpha, xDesc, x_d,
	wDesc, w_d,
	convDesc, algo, workspace_d, workspace_size,
	&fBeta, yDesc, y_d);

cudaerr = cudaMemcpy(y_h2, y_d, XSIZE *4, cudaMemcpyDeviceToHost);

for (size_t ii = 0; ii < 20; ii++)
{
	printf("%f,", y_h2[ii]);
}
printf("\n\n");

//destroy descriptors.
cudnnDestroy(handle);

system("Pause");

}
input&&weight.zip (885.7 KB)