Critical Defect: call to cudnnConvolutionForward() leading to cudaDeviceSynchronize() ret error 77

Hi All,

I try to use cudnn in my framework, yet a call to cudnnConvolutionForward with specific combination leads to cuda failure

the code

void test_cudnn_err() {
	cudnnTensorDescriptor_t input_desc = NULL, output_desc = NULL;
	cudnnFilterDescriptor_t weight_desc = NULL;
	cudnnConvolutionDescriptor_t conv_desc = NULL;
	cudnnConvolutionFwdAlgo_t fwd_algo;
	cudnnCreateTensorDescriptor(&input_desc) ;
	cudnnCreateTensorDescriptor(&output_desc);
	cudnnCreateFilterDescriptor(&weight_desc);
	cudnnCreateConvolutionDescriptor(&conv_desc);

	int mini_batch = 4;
	int in_channels = 512;
	int in_height = 26;
	int in_width = 26;
	int size = 3;
	int padding = 1;
	int stride = 2;

	int out_channels = 1024;
	int out_height = 0, out_width = 0;

	void* workspace = NULL;
	size_t workspace_bytes = 0;
	float one = 1.0f, zero = 0.0f;

	cudnnSetTensor4dDescriptor(input_desc,
		CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, mini_batch, in_channels, in_height, in_width);

	cudnnSetFilter4dDescriptor(weight_desc,
		CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, out_channels, in_channels, size, size);

	cudnnSetConvolution2dDescriptor(conv_desc,
		padding, padding, stride, stride, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT);

	int temp = 0;
	cudnnGetConvolution2dForwardOutputDim(conv_desc,
			input_desc, weight_desc, &temp, &out_channels, &out_height, &out_width);
	cudnnSetTensor4dDescriptor(output_desc,
		CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, mini_batch, out_channels, out_height, out_width);
	cudnnGetConvolutionForwardAlgorithm(GetCUDNNHandle(),
		input_desc, weight_desc, conv_desc, output_desc, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
		/*memoryLimitInBytes=*/0, &fwd_algo);

	size_t in_bytes = in_height * in_width * mini_batch * in_channels * sizeof(float);
	size_t w_bytes = size * size * out_channels * sizeof(float);
	size_t out_bytes = out_height * out_width * mini_batch * out_channels * sizeof(float);

	float* input = NULL;
	float* weights = NULL;
	float* output = NULL;
	cudaMalloc(&input, in_bytes);
	cudaMalloc(&weights, w_bytes);
	cudaMalloc(&output, out_bytes);

//	BatchMatrix input(in_height, in_width, mini_batch * in_channels);
//	BatchMatrix weights(size, size, out_channels);
//	BatchMatrix output(out_height, out_width, mini_batch * out_channels);

	cudnnGetConvolutionForwardWorkspaceSize(GetCUDNNHandle(),
		input_desc, weight_desc, conv_desc, output_desc, fwd_algo, &workspace_bytes);
	
	cudaMalloc(&workspace, workspace_bytes);

	cudnnStatus_t status = cudnnConvolutionForward(GetCUDNNHandle(), &one,
		input_desc, input, weight_desc, weights,
		conv_desc, fwd_algo, workspace, workspace_bytes,
		&zero, output_desc, output);

	//BatchMatrix bm(out_height, out_width, mini_batch * out_channels);
	//bool b = bm.Add(0.2); // failed
	//b = bm.Add(-0.2);

	cudaError_t e = cudaFree(input); // e == cudaErrorIllegalAddress(77)
	e = cudaFree(weights);
	e = cudaFree(output);
 
	if (input_desc) cudnnDestroyTensorDescriptor(input_desc);
	if (output_desc) cudnnDestroyTensorDescriptor(output_desc);
	if (weight_desc) cudnnDestroyFilterDescriptor(weight_desc);
	if (conv_desc) cudnnDestroyConvolutionDescriptor(conv_desc);
}

latest version v7.4.2

cuda 9.2 + cuDNN 7.4.2

reduce out_channels to 256
or
change

size_t in_bytes = in_height * in_width * mini_batch * in_channels * sizeof(float) ;

to

size_t in_bytes = in_height * in_width * mini_batch * in_channels * sizeof(float) * 4;

problem solved.

however, why *4?

Sorry, Now I realized what mistake I’ve made.

size_t w_bytes = size * size * out_channels * in_channel * sizeof(float);