Which parameter is bad?

cudnnRNNForward is returning CUDNN_STATUS_BAD_PARAM. How do I find out which parameter is bad? As far as I can see, everything is configured correctly.

LSTMLayer::LSTMLayer(cudnnHandle_t cudnnHandle, cublasHandle_t cublasHandle, int batchSize, int seqLength, int inC, int outC, int numLayers, const char* layerName, bool train) : cudnnHandle_(cudnnHandle),
	cublasHandle_(cublasHandle), batchSize_(batchSize), seqLength_(seqLength), inC_(inC), outC_(outC), numLayers_(numLayers){
	layerName_ = layerName;
	train_ = train;
	checkCUDNN(cudnnCreateRNNDescriptor(&rnnDesc_));
	checkCUDNN(cudnnSetRNNDescriptor_v8( rnnDesc_, CUDNN_RNN_ALGO_STANDARD, CUDNN_LSTM, CUDNN_RNN_SINGLE_INP_BIAS, CUDNN_UNIDIRECTIONAL, CUDNN_LINEAR_INPUT, CUDNN_DATA_HALF, CUDNN_DATA_HALF, CUDNN_TENSOR_OP_MATH, inC_, outC_, outC_, numLayers_, nullptr, 0 ));
	checkCUDNN(cudnnCreateRNNDataDescriptor(&xDesc_));
	checkCUDNN(cudnnCreateRNNDataDescriptor(&yDesc_));
	constexpr cudnnRNNDataLayout_t layout = CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED;
	const auto seqLengthArray = new int[batchSize_];
	for(int i = 0; i<batchSize_; ++i){ seqLengthArray[i] = seqLength_; }
	cudaMalloc(&seqLengthArray_, batchSize_*sizeof(int));
	cudaMemcpy(seqLengthArray_, seqLengthArray, batchSize_*sizeof(int), cudaMemcpyHostToDevice);
	checkCUDNN(cudnnSetRNNDataDescriptor( xDesc_, CUDNN_DATA_HALF, layout, seqLength_, batchSize_, inC_, seqLengthArray, nullptr ));
	checkCUDNN(cudnnSetRNNDataDescriptor( yDesc_, CUDNN_DATA_HALF, layout, seqLength_, batchSize_, outC_, seqLengthArray, nullptr ));
	checkCUDNN(cudnnCreateTensorDescriptor(&hDesc_));
	checkCUDNN(cudnnCreateTensorDescriptor(&cDesc_));
	checkCUDNN(cudnnSetTensor4dDescriptor( hDesc_, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, numLayers_, batchSize_, outC_, 1 ));
	checkCUDNN(cudnnSetTensor4dDescriptor( cDesc_, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, numLayers_, batchSize_, outC_, 1 ));
	checkCUDNN(cudnnGetRNNWeightSpaceSize(cudnnHandle_, rnnDesc_, &weightSpaceSize_));
	checkCUDA(cudaMalloc(&weights_, weightSpaceSize_));
	checkCUDA(cudaMalloc(&gradWeights_, weightSpaceSize_));
	checkCUDA(cudaMalloc(&gradOut_, batchSize_*inC_*sizeof(__half)));
	checkCUDA(cudaMalloc(&hy_, numLayers_*batchSize_*outC_*sizeof(__half)));
	checkCUDA(cudaMalloc(&cy_, numLayers_*batchSize_*outC_*sizeof(__half)));
	checkCUDA(cudaMalloc(&outData_, batchSize_*outC_*sizeof(__half)));
	checkCUDA(cudaMemset(weights_, 0, weightSpaceSize_));
	checkCUDA(cudaMemset(gradWeights_, 0, weightSpaceSize_));
	checkCUDA(cudaMemset(gradOut_, 0, batchSize_*inC_*sizeof(__half)));
	checkCUDA(cudaMemset(hy_, 0, numLayers_*batchSize_*outC_*sizeof(__half)));
	checkCUDA(cudaMemset(cy_, 0, numLayers_*batchSize_*outC_*sizeof(__half)));
	checkCUDA(cudaMemset(outData_, 0, batchSize_*outC_*sizeof(__half)));
	HeInit(weights_, weightSpaceSize_/sizeof(__half), inC_);
	fwdMode_ = train ? CUDNN_FWD_MODE_TRAINING : CUDNN_FWD_MODE_INFERENCE;
	checkCUDNN(cudnnGetRNNTempSpaceSizes( cudnnHandle_, rnnDesc_, fwdMode_, xDesc_, &workspaceSize_, &reserveSpaceSize_ ));
	checkCUDA(cudaMalloc(&workspace_, workspaceSize_));
	checkCUDA(cudaMemset(workspace_, 0, workspaceSize_));
	checkCUDA(cudaMalloc(&reserveSpace_, reserveSpaceSize_));
	checkCUDA(cudaMemset(reserveSpace_, 0, reserveSpaceSize_));
	if(useAdamW_){
		checkCUDA(cudaMalloc(&m_Weights_, weightSpaceSize_*sizeof(float)));
		checkCUDA(cudaMalloc(&v_Weights_, weightSpaceSize_*sizeof(float)));
		checkCUDA(cudaMemset(m_Weights_, 0, weightSpaceSize_*sizeof(float)));
		checkCUDA(cudaMemset(v_Weights_, 0, weightSpaceSize_*sizeof(float)));
	}
	cudnnCreateTensorDescriptor(&outDesc_);
	cudnnSetTensor4dDescriptor(outDesc_, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, batchSize_, outC_, 1, 1);
}
LSTMLayer::~LSTMLayer(){
	cudaFree(weights_);
	cudaFree(hy_);
	cudaFree(cy_);
	cudaFree(outData_);
	cudaFree(workspace_);
	cudaFree(reserveSpace_);
	checkCUDNN(cudnnDestroyRNNDescriptor(rnnDesc_));
	checkCUDNN(cudnnDestroyTensorDescriptor(hDesc_));
	checkCUDNN(cudnnDestroyTensorDescriptor(cDesc_));
	if(train_){
		cudaFree(gradOut_);
		cudaFree(gradWeights_);
		checkCUDNN(cudnnDestroyFilterDescriptor(wDesc_));
	}
}
__half* LSTMLayer::Forward(__half* data, bool train){
	inData_ = data;
	fwdMode_ = train ? CUDNN_FWD_MODE_TRAINING : CUDNN_FWD_MODE_INFERENCE;
	checkCUDNN(cudnnRNNForward(cudnnHandle_, rnnDesc_, fwdMode_, seqLengthArray_, xDesc_, data, yDesc_, outData_, hDesc_, nullptr, hy_, cDesc_, nullptr, cy_, weightSpaceSize_, weights_, workspaceSize_, workspace_, reserveSpaceSize_, reserveSpace_));
	return outData_;
}
__half* LSTMLayer::Backward(__half* grad){
	checkCUDNN(cudnnRNNBackwardWeights_v8( cudnnHandle_, rnnDesc_, CUDNN_WGRAD_MODE_ADD, seqLengthArray_, xDesc_, inData_, hDesc_, nullptr, yDesc_, outData_, weightSpaceSize_, gradWeights_, workspaceSize_, workspace_, reserveSpaceSize_, reserveSpace_ ));
	checkCUDNN(cudnnRNNBackwardData_v8( cudnnHandle_, rnnDesc_, seqLengthArray_, yDesc_, outData_, grad, xDesc_, gradOut_, hDesc_, nullptr, nullptr, nullptr, cDesc_, nullptr, nullptr, nullptr, weightSpaceSize_, weights_, workspaceSize_, workspace_, reserveSpaceSize_, reserveSpace_ ));
	return gradWeights_;
}
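
In case it's relevant: the v8 API reference describes the hDesc/cDesc arguments of cudnnRNNForward as 3-D tensors of shape [numLayers, batchSize, hiddenSize], and I'm not certain whether the 4-D descriptors I create with cudnnSetTensor4dDescriptor above are accepted. For comparison, an untested sketch of the fully packed 3-D form using cudnnSetTensorNdDescriptor would be:

	// Describe the hidden/cell state as a fully packed 3-D tensor
	// [numLayers, batchSize, outC] instead of the 4-D NCHW form above.
	const int dims[3] = { numLayers_, batchSize_, outC_ };
	const int strides[3] = { batchSize_*outC_, outC_, 1 };
	checkCUDNN(cudnnSetTensorNdDescriptor(hDesc_, CUDNN_DATA_HALF, 3, dims, strides));
	checkCUDNN(cudnnSetTensorNdDescriptor(cDesc_, CUDNN_DATA_HALF, 3, dims, strides));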

Hi @CommanderLake,
Can you please share a log trace and repro steps with us?
Thanks

Thank you for returning to the forum. I don't know what logs you are referring to, but if you create an instance of the class and run the forward pass you can see the error. I don't know what else I can tell you.

@CommanderLake

Have a look here: there's an environment variable you can set that will make cuDNN print some more information when something goes wrong. Basically, set CUDNN_LOGLEVEL_DBG to 2. The errors can still be hard to understand, but they're better than nothing.

https://docs.nvidia.com/deeplearning/cudnn/latest/reference/troubleshooting.html#troubleshooting
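
If it helps, here's a minimal sketch of turning this on from code rather than the shell. It assumes the variables are read when cuDNN initialises on the first call (if they are only read at process start, set them in the shell or System Properties before launching instead), and that you're on a build that honours CUDNN_LOGLEVEL_DBG; older 8.x builds use CUDNN_LOGINFO_DBG, CUDNN_LOGWARN_DBG and CUDNN_LOGERR_DBG instead. You also want CUDNN_LOGDEST_DBG so the output goes somewhere visible:

	#include <cstdlib>
	#include <cudnn.h>

	int main(){
	#ifdef _WIN32
		// 2 = errors and warnings, 3 = everything; route the log to stdout.
		_putenv_s("CUDNN_LOGLEVEL_DBG", "2");
		_putenv_s("CUDNN_LOGDEST_DBG", "stdout");
	#else
		setenv("CUDNN_LOGLEVEL_DBG", "2", 1);
		setenv("CUDNN_LOGDEST_DBG", "stdout", 1);
	#endif
		cudnnHandle_t handle;
		cudnnCreate(&handle); // logging should be active from here on
		// ... construct the LSTMLayer and call Forward() to reproduce the error ...
		cudnnDestroy(handle);
		return 0;
	}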

Thanks, but that doesn't seem to do anything on Windows. I fixed most of my problems by going back to 7.6.5, but that would still be handy.
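
For anyone else stuck on Windows: something I haven't verified, but which the API reference describes as a programmatic alternative to the environment variables, is registering a logging callback with cudnnSetCallback before the failing call (worth double-checking the exact callback signature for your cuDNN version), roughly like this:

	#include <cstdio>
	#include <cudnn.h>

	// Report cuDNN errors/warnings through a user callback instead of the
	// environment-variable log; install it once before the failing call.
	static void CudnnLogCallback(cudnnSeverity_t sev, void* udata, const cudnnDebug_t* dbg, const char* msg){
		(void)udata; (void)dbg;
		std::fprintf(stderr, "[cuDNN severity %d] %s\n", static_cast<int>(sev), msg);
	}

	void EnableCudnnLogging(){
		cudnnSetCallback(CUDNN_SEV_ERROR_EN | CUDNN_SEV_WARNING_EN, nullptr, CudnnLogCallback);
	}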
