I’m very new to cuda and cudnn, and I just wrote a simple cudnn convolution validation code, however, when the input is from std::normal_distribution, it returns wrong result.
void randn_cu(half *data, LL n, int seed)
{
half *data_in_host = (half *)malloc(sizeof(half) * n);
std::default_random_engine generator(seed);
std::normal_distribution<float> distribution(0.0, 1.0);
#pragma omp parallel for
for (LL i = 0; i < n; ++i)
{
data_in_host[i] = __float2half(distribution(generator));
}
cudaMemcpy(data, data_in_host, n * sizeof(half), cudaMemcpyHostToDevice);
cudaDeviceSynchronize();
free(data_in_host);
}
void print(float *data, int n, int c, int h, int w)
{
std::vector<float> buffer(1 << 20);
checkCUDA(cudaMemcpy(buffer.data(), data, n * c * h * w * sizeof(float), cudaMemcpyDeviceToHost));
int a = 0;
for (int i = 0; i < n; ++i)
{
for (int j = 0; j < c; ++j)
{
std::cout << "n=" << i << ", c=" << j << ":" << std::endl;
for (int k = 0; k < h; ++k)
{
for (int l = 0; l < w; ++l)
{
std::cout << std::setw(6) << std::right << std::fixed << std::setprecision(2) << buffer[a];
++a;
}
std::cout << std::endl;
}
}
}
std::cout << std::endl;
}
int cudnn_conv2d_cu(float *signal, float *filter, float *output, int N, int C, int H, int W, int kernel_size, int out_channels, int algo)
{
cudnnHandle_t cudnn;
checkCUDNN(cudnnCreate(&cudnn));
std::cout << "N: " << N << std::endl;
std::cout << "C: " << C << std::endl;
std::cout << "H: " << H << std::endl;
std::cout << "W: " << W << std::endl;
std::cout << std::endl;
cudnnTensorDescriptor_t in_desc;
checkCUDNN(cudnnCreateTensorDescriptor(&in_desc));
checkCUDNN(cudnnSetTensor4dDescriptor(
in_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT,
N, C, H, W));
// filter
const int filt_k = out_channels;
const int filt_c = C;
const int filt_h = kernel_size;
const int filt_w = kernel_size;
std::cout << "filt_k: " << filt_k << std::endl;
std::cout << "filt_c: " << filt_c << std::endl;
std::cout << "filt_h: " << filt_h << std::endl;
std::cout << "filt_w: " << filt_w << std::endl;
std::cout << std::endl;
cudnnFilterDescriptor_t filt_desc;
checkCUDNN(cudnnCreateFilterDescriptor(&filt_desc));
checkCUDNN(cudnnSetFilter4dDescriptor(
filt_desc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW,
filt_k, filt_c, filt_h, filt_w));
// convolution
const int pad_h = 0;
const int pad_w = 0;
const int str_h = 1;
const int str_w = 1;
const int dil_h = 1;
const int dil_w = 1;
std::cout << "pad_h: " << pad_h << std::endl;
std::cout << "pad_w: " << pad_w << std::endl;
std::cout << "str_h: " << str_h << std::endl;
std::cout << "str_w: " << str_w << std::endl;
std::cout << "dil_h: " << dil_h << std::endl;
std::cout << "dil_w: " << dil_w << std::endl;
std::cout << std::endl;
cudnnConvolutionDescriptor_t conv_desc;
checkCUDNN(cudnnCreateConvolutionDescriptor(&conv_desc));
checkCUDNN(cudnnSetConvolution2dDescriptor(
conv_desc,
pad_h, pad_w, str_h, str_w, dil_h, dil_w,
CUDNN_CONVOLUTION, CUDNN_DATA_FLOAT));
// output
int out_n;
int out_c;
int out_h;
int out_w;
checkCUDNN(cudnnGetConvolution2dForwardOutputDim(
conv_desc, in_desc, filt_desc,
&out_n, &out_c, &out_h, &out_w));
std::cout << "out_n: " << out_n << std::endl;
std::cout << "out_c: " << out_c << std::endl;
std::cout << "out_h: " << out_h << std::endl;
std::cout << "out_w: " << out_w << std::endl;
std::cout << std::endl;
cudnnTensorDescriptor_t out_desc;
checkCUDNN(cudnnCreateTensorDescriptor(&out_desc));
checkCUDNN(cudnnSetTensor4dDescriptor(
out_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT,
out_n, out_c, out_h, out_w));
// algorithm
cudnnConvolutionFwdAlgo_t algo_t;
std::string algo_str;
switch (algo)
{
case 0:
algo_t = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
algo_str = "CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM";
break;
case 1:
algo_t = CUDNN_CONVOLUTION_FWD_ALGO_FFT;
algo_str = "CUDNN_CONVOLUTION_FWD_ALGO_FFT";
break;
case 2:
algo_t = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD;
algo_str = "CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD";
break;
default:
break;
}
std::cout << "Convolution algorithm: " << algo_str << std::endl;
std::cout << std::endl;
// workspace
size_t ws_size;
checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnn, in_desc, filt_desc, conv_desc, out_desc, algo_t, &ws_size));
float *ws_data;
checkCUDA(cudaMalloc(&ws_data, ws_size));
std::cout << "Workspace size: " << ws_size << std::endl;
std::cout << std::endl;
// perform
float alpha = 1.f;
float beta = 0.f;
print(signal, N, C, H, W);
print(filter, filt_k, filt_c, filt_h, filt_w);
checkCUDNN(cudnnConvolutionForward(
cudnn,
&alpha, in_desc, signal, filt_desc, filter,
conv_desc, algo_t, ws_data, ws_size,
&beta, out_desc, output));
cudaDeviceSynchronize();
print(output, out_n, out_c, out_h, out_w);
// finalizing
checkCUDA(cudaFree(ws_data));
checkCUDNN(cudnnDestroyTensorDescriptor(out_desc));
checkCUDNN(cudnnDestroyConvolutionDescriptor(conv_desc));
checkCUDNN(cudnnDestroyFilterDescriptor(filt_desc));
checkCUDNN(cudnnDestroyTensorDescriptor(in_desc));
checkCUDNN(cudnnDestroy(cudnn));
return 0;
}
void test_cudnn_conv2d()
{
int N = 1;
int C = 1;
int H = 5;
int W = 5;
int out_channels = 1;
int kernel_size = 2;
int out_w = W - kernel_size + 1;
int out_h = H - kernel_size + 1;
/*Generate input data*/
float *signal = (float *)malloc(sizeof(float) * N * C * H * W);
float *filter = (float *)malloc(sizeof(float) * kernel_size * kernel_size * C * out_channels);
float *signal_cu;
float *filter_cu;
cudaMalloc(&signal_cu, sizeof(float) * N * C * H * W);
cudaMalloc(&filter_cu, sizeof(float) * kernel_size * kernel_size * C * out_channels);
randn(signal, N * C * H * W);
randn(filter, kernel_size * kernel_size * C * out_channels);
cudaMemcpy(signal_cu, signal, sizeof(float) * N * C * H * W, cudaMemcpyHostToDevice);
cudaMemcpy(filter_cu, filter, sizeof(float) * kernel_size * kernel_size * C * out_channels, cudaMemcpyHostToDevice);
float *output_standard = (float *)malloc(sizeof(float) * N * out_channels * out_h * out_w);
float *output_test = (float *)malloc(sizeof(float) * N * out_channels * out_h * out_w);
float *output_test_cu;
cudaMalloc(&output_test_cu, sizeof(float) * N * out_channels * out_h * out_w);
/*Perform standard direct convolution on CPU*/
conv2d_cpu(signal, filter, output_standard, H, W, kernel_size);
/*Perform cudnn convolution on GPU*/
cudnn_conv2d_cu(signal_cu, filter_cu, output_test_cu, N, C, H, W, kernel_size, out_channels, 0);
cudaMemcpy(output_test, output_test_cu, sizeof(float) * N * out_channels * out_h * out_w, cudaMemcpyDeviceToHost);
/*Get accuracy*/
double acc = relative_error_2d(output_test, output_standard, out_h, out_w, N*out_channels);
std::cout << "cudnn_conv2d_float VS float_conv"
<< " N " << N << " C " << C << " H " << H << " W " << W << " out_channels " << out_channels << " kernel_size " << kernel_size << " relative_error " << acc << std::endl;
}
n=0, c=0:
-0.74 1.02 -0.11 0.01 -0.63
-0.42 0.01 -0.25 -0.81 -0.32
0.01 0.01 -1.59 -0.63 -0.12
-0.81 0.44 -1.68 0.44 1.36
0.21 -0.57 -0.57 -1.59 1.21
n=0, c=0:
-0.90 0.90
-1.20 0.90
n=0, c=0:
-2.27 1.28 0.39 0.33
-0.39 1.76 -0.12 -0.80
-1.13 3.84 -2.58 -1.25
-0.56 2.42 -1.11 -3.76
# this result is wrong
:( Anyone help