Hi All,
I am trying to use cuDNN in my framework, but a call to cudnnConvolutionForward with a specific parameter combination leads to a CUDA failure (cudaErrorIllegalAddress).
The code:
void test_cudnn_err() {
cudnnTensorDescriptor_t input_desc = NULL, output_desc = NULL;
cudnnFilterDescriptor_t weight_desc = NULL;
cudnnConvolutionDescriptor_t conv_desc = NULL;
cudnnConvolutionFwdAlgo_t fwd_algo;
cudnnCreateTensorDescriptor(&input_desc) ;
cudnnCreateTensorDescriptor(&output_desc);
cudnnCreateFilterDescriptor(&weight_desc);
cudnnCreateConvolutionDescriptor(&conv_desc);
int mini_batch = 4;
int in_channels = 512;
int in_height = 26;
int in_width = 26;
int size = 3;
int padding = 1;
int stride = 2;
int out_channels = 1024;
int out_height = 0, out_width = 0;
void* workspace = NULL;
size_t workspace_bytes = 0;
float one = 1.0f, zero = 0.0f;
cudnnSetTensor4dDescriptor(input_desc,
CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, mini_batch, in_channels, in_height, in_width);
cudnnSetFilter4dDescriptor(weight_desc,
CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, out_channels, in_channels, size, size);
cudnnSetConvolution2dDescriptor(conv_desc,
padding, padding, stride, stride, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT);
int temp = 0;
cudnnGetConvolution2dForwardOutputDim(conv_desc,
input_desc, weight_desc, &temp, &out_channels, &out_height, &out_width);
cudnnSetTensor4dDescriptor(output_desc,
CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, mini_batch, out_channels, out_height, out_width);
cudnnGetConvolutionForwardAlgorithm(GetCUDNNHandle(),
input_desc, weight_desc, conv_desc, output_desc, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
/*memoryLimitInBytes=*/0, &fwd_algo);
size_t in_bytes = in_height * in_width * mini_batch * in_channels * sizeof(float);
size_t w_bytes = size * size * out_channels * sizeof(float);
size_t out_bytes = out_height * out_width * mini_batch * out_channels * sizeof(float);
float* input = NULL;
float* weights = NULL;
float* output = NULL;
cudaMalloc(&input, in_bytes);
cudaMalloc(&weights, w_bytes);
cudaMalloc(&output, out_bytes);
// BatchMatrix input(in_height, in_width, mini_batch * in_channels);
// BatchMatrix weights(size, size, out_channels);
// BatchMatrix output(out_height, out_width, mini_batch * out_channels);
cudnnGetConvolutionForwardWorkspaceSize(GetCUDNNHandle(),
input_desc, weight_desc, conv_desc, output_desc, fwd_algo, &workspace_bytes);
cudaMalloc(&workspace, workspace_bytes);
cudnnStatus_t status = cudnnConvolutionForward(GetCUDNNHandle(), &one,
input_desc, input, weight_desc, weights,
conv_desc, fwd_algo, workspace, workspace_bytes,
&zero, output_desc, output);
//BatchMatrix bm(out_height, out_width, mini_batch * out_channels);
//bool b = bm.Add(0.2); // failed
//b = bm.Add(-0.2);
cudaError_t e = cudaFree(input); // e == cudaErrorIllegalAddress(77)
e = cudaFree(weights);
e = cudaFree(output);
if (input_desc) cudnnDestroyTensorDescriptor(input_desc);
if (output_desc) cudnnDestroyTensorDescriptor(output_desc);
if (weight_desc) cudnnDestroyFilterDescriptor(weight_desc);
if (conv_desc) cudnnDestroyConvolutionDescriptor(conv_desc);
}