cudnnRNNForward is returning CUDNN_STATUS_BAD_PARAM. How do I find out which parameter is bad? As far as I can see, everything is configured correctly. (To see which argument check fails, enable cuDNN API logging with the environment variables CUDNN_LOGLEVEL_DBG=3 and CUDNN_LOGDEST_DBG=stderr before running.)
// Constructs a stacked LSTM using the cuDNN v8 RNN API (half precision,
// unidirectional, batch-major unpacked layout, all sequences full length).
// Allocates weight, state, output, gradient, workspace and reserve buffers.
LSTMLayer::LSTMLayer(cudnnHandle_t cudnnHandle, cublasHandle_t cublasHandle, int batchSize, int seqLength, int inC, int outC, int numLayers, const char* layerName, bool train) : cudnnHandle_(cudnnHandle),
cublasHandle_(cublasHandle), batchSize_(batchSize), seqLength_(seqLength), inC_(inC), outC_(outC), numLayers_(numLayers){
    layerName_ = layerName;
    train_ = train;
    checkCUDNN(cudnnCreateRNNDescriptor(&rnnDesc_));
    // CUDNN_RNN_PADDED_IO_ENABLED (auxFlags) is required when the RNN data uses an
    // *_UNPACKED layout; leaving auxFlags = 0 makes cudnnRNNForward return
    // CUDNN_STATUS_BAD_PARAM. projSize == hiddenSize (outC_) means no projection.
    checkCUDNN(cudnnSetRNNDescriptor_v8( rnnDesc_, CUDNN_RNN_ALGO_STANDARD, CUDNN_LSTM, CUDNN_RNN_SINGLE_INP_BIAS, CUDNN_UNIDIRECTIONAL, CUDNN_LINEAR_INPUT, CUDNN_DATA_HALF, CUDNN_DATA_HALF, CUDNN_TENSOR_OP_MATH, inC_, outC_, outC_, numLayers_, nullptr, CUDNN_RNN_PADDED_IO_ENABLED ));
    checkCUDNN(cudnnCreateRNNDataDescriptor(&xDesc_));
    checkCUDNN(cudnnCreateRNNDataDescriptor(&yDesc_));
    constexpr cudnnRNNDataLayout_t layout = CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED;
    // Every sequence runs the full seqLength_. cudnnSetRNNDataDescriptor takes the
    // HOST length array; cudnnRNNForward takes the DEVICE copy (seqLengthArray_).
    int* hostSeqLengths = new int[batchSize_];
    for(int i = 0; i < batchSize_; ++i){ hostSeqLengths[i] = seqLength_; }
    checkCUDA(cudaMalloc(&seqLengthArray_, batchSize_*sizeof(int)));
    checkCUDA(cudaMemcpy(seqLengthArray_, hostSeqLengths, batchSize_*sizeof(int), cudaMemcpyHostToDevice));
    checkCUDNN(cudnnSetRNNDataDescriptor( xDesc_, CUDNN_DATA_HALF, layout, seqLength_, batchSize_, inC_, hostSeqLengths, nullptr ));
    checkCUDNN(cudnnSetRNNDataDescriptor( yDesc_, CUDNN_DATA_HALF, layout, seqLength_, batchSize_, outC_, hostSeqLengths, nullptr ));
    delete[] hostSeqLengths; // was leaked before
    checkCUDNN(cudnnCreateTensorDescriptor(&hDesc_));
    checkCUDNN(cudnnCreateTensorDescriptor(&cDesc_));
    // cudnnRNNForward requires hDesc/cDesc to be 3D descriptors with dims
    // [numLayers * numDirections, batchSize, hiddenSize]; the previous 4D
    // (cudnnSetTensor4dDescriptor) descriptors also cause CUDNN_STATUS_BAD_PARAM.
    const int hcDims[3]    = { numLayers_, batchSize_, outC_ };
    const int hcStrides[3] = { batchSize_*outC_, outC_, 1 };
    checkCUDNN(cudnnSetTensorNdDescriptor( hDesc_, CUDNN_DATA_HALF, 3, hcDims, hcStrides ));
    checkCUDNN(cudnnSetTensorNdDescriptor( cDesc_, CUDNN_DATA_HALF, 3, hcDims, hcStrides ));
    checkCUDNN(cudnnGetRNNWeightSpaceSize(cudnnHandle_, rnnDesc_, &weightSpaceSize_));
    const size_t numParams = weightSpaceSize_/sizeof(__half);
    // y described by yDesc_ covers the WHOLE sequence; the old batch*outC allocation
    // was seqLength_ times too small and cudnnRNNForward wrote out of bounds.
    const size_t ySize     = (size_t)seqLength_*batchSize_*outC_*sizeof(__half);
    const size_t dxSize    = (size_t)seqLength_*batchSize_*inC_*sizeof(__half);
    const size_t stateSize = (size_t)numLayers_*batchSize_*outC_*sizeof(__half);
    checkCUDA(cudaMalloc(&weights_, weightSpaceSize_));
    checkCUDA(cudaMalloc(&gradWeights_, weightSpaceSize_));
    checkCUDA(cudaMalloc(&gradOut_, dxSize));  // dx spans the full sequence too
    checkCUDA(cudaMalloc(&hy_, stateSize));
    checkCUDA(cudaMalloc(&cy_, stateSize));
    checkCUDA(cudaMalloc(&outData_, ySize));
    checkCUDA(cudaMemset(weights_, 0, weightSpaceSize_));
    checkCUDA(cudaMemset(gradWeights_, 0, weightSpaceSize_));
    checkCUDA(cudaMemset(gradOut_, 0, dxSize));
    checkCUDA(cudaMemset(hy_, 0, stateSize));
    checkCUDA(cudaMemset(cy_, 0, stateSize));
    checkCUDA(cudaMemset(outData_, 0, ySize));
    HeInit(weights_, numParams, inC_);
    fwdMode_ = train ? CUDNN_FWD_MODE_TRAINING : CUDNN_FWD_MODE_INFERENCE;
    checkCUDNN(cudnnGetRNNTempSpaceSizes( cudnnHandle_, rnnDesc_, fwdMode_, xDesc_, &workspaceSize_, &reserveSpaceSize_ ));
    checkCUDA(cudaMalloc(&workspace_, workspaceSize_));
    checkCUDA(cudaMemset(workspace_, 0, workspaceSize_));
    checkCUDA(cudaMalloc(&reserveSpace_, reserveSpaceSize_));
    checkCUDA(cudaMemset(reserveSpace_, 0, reserveSpaceSize_));
    if(useAdamW_){
        // Adam moments are one float PER PARAMETER; weightSpaceSize_ is in bytes of
        // __half, so the old weightSpaceSize_*sizeof(float) over-allocated 2x.
        checkCUDA(cudaMalloc(&m_Weights_, numParams*sizeof(float)));
        checkCUDA(cudaMalloc(&v_Weights_, numParams*sizeof(float)));
        checkCUDA(cudaMemset(m_Weights_, 0, numParams*sizeof(float)));
        checkCUDA(cudaMemset(v_Weights_, 0, numParams*sizeof(float)));
    }
    // outDesc_ describes one timestep (batch x outC); NOTE(review): downstream layers
    // presumably consume only part of the full-sequence output — confirm against callers.
    checkCUDNN(cudnnCreateTensorDescriptor(&outDesc_));
    checkCUDNN(cudnnSetTensor4dDescriptor(outDesc_, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, batchSize_, outC_, 1, 1));
}
// Releases every buffer and descriptor the constructor created.
// Deliberately avoids checkCUDNN/checkCUDA here: error handlers that throw or
// abort inside a destructor are unsafe during stack unwinding.
LSTMLayer::~LSTMLayer(){
    // Device buffers — all are allocated unconditionally in the ctor, so free them
    // unconditionally too (gradOut_/gradWeights_ previously leaked when !train_).
    cudaFree(weights_);
    cudaFree(gradWeights_);
    cudaFree(gradOut_);
    cudaFree(hy_);
    cudaFree(cy_);
    cudaFree(outData_);
    cudaFree(workspace_);
    cudaFree(reserveSpace_);
    cudaFree(seqLengthArray_); // previously leaked
    if(useAdamW_){ // moment buffers exist only when AdamW is enabled
        cudaFree(m_Weights_);
        cudaFree(v_Weights_);
    }
    // Descriptors — the RNN-data and output descriptors were previously leaked.
    cudnnDestroyRNNDescriptor(rnnDesc_);
    cudnnDestroyRNNDataDescriptor(xDesc_);
    cudnnDestroyRNNDataDescriptor(yDesc_);
    cudnnDestroyTensorDescriptor(hDesc_);
    cudnnDestroyTensorDescriptor(cDesc_);
    cudnnDestroyTensorDescriptor(outDesc_);
    // NOTE(review): the old code destroyed wDesc_, a filter descriptor that is never
    // created anywhere in this file — destroying an uncreated descriptor is invalid,
    // so that call was removed. Confirm wDesc_ is not created elsewhere.
}
// Runs the LSTM forward pass over the full sequence described by xDesc_/yDesc_.
// data: device pointer, batch-major [batch, seq, inC] half values (per xDesc_).
// Returns the device output buffer (owned by this layer, per yDesc_).
__half* LSTMLayer::Forward(__half* data, bool train){
    inData_ = data; // kept for Backward (x input of cudnnRNNBackwardWeights_v8)
    const cudnnForwardMode_t mode = train ? CUDNN_FWD_MODE_TRAINING : CUDNN_FWD_MODE_INFERENCE;
    if(mode != fwdMode_){
        // Workspace/reserve sizes depend on the forward mode (inference typically
        // needs no reserve space). The ctor sized them for its own mode, so switching
        // modes here with stale sizes made cudnnRNNForward fail — re-query and grow.
        fwdMode_ = mode;
        size_t ws = 0, rs = 0;
        checkCUDNN(cudnnGetRNNTempSpaceSizes( cudnnHandle_, rnnDesc_, fwdMode_, xDesc_, &ws, &rs ));
        if(ws > workspaceSize_){
            cudaFree(workspace_);
            checkCUDA(cudaMalloc(&workspace_, ws));
            workspaceSize_ = ws;
        }
        if(rs > reserveSpaceSize_){
            cudaFree(reserveSpace_);
            checkCUDA(cudaMalloc(&reserveSpace_, rs));
            reserveSpaceSize_ = rs;
        }
    }
    // hx/cx passed as nullptr: cuDNN treats a null initial state as all zeros.
    checkCUDNN(cudnnRNNForward(cudnnHandle_, rnnDesc_, fwdMode_, seqLengthArray_, xDesc_, data, yDesc_, outData_, hDesc_, nullptr, hy_, cDesc_, nullptr, cy_, weightSpaceSize_, weights_, workspaceSize_, workspace_, reserveSpaceSize_, reserveSpace_));
    return outData_;
}
// Backward pass: grad is d(loss)/d(y) on the device (shape of yDesc_).
// Writes d(loss)/d(x) into gradOut_ and ACCUMULATES weight gradients into
// gradWeights_ (CUDNN_WGRAD_MODE_ADD — caller must zero gradWeights_ between steps).
// Returns the weight-gradient buffer.
__half* LSTMLayer::Backward(__half* grad){
    // cuDNN requires cudnnRNNBackwardData_v8 to run BEFORE
    // cudnnRNNBackwardWeights_v8 (backward-weights consumes intermediate results
    // that backward-data prepares); the previous order was inverted.
    // dhy/dcy passed as nullptr are treated as zero; dhx/dcx nullptr = not needed.
    checkCUDNN(cudnnRNNBackwardData_v8( cudnnHandle_, rnnDesc_, seqLengthArray_, yDesc_, outData_, grad, xDesc_, gradOut_, hDesc_, nullptr, nullptr, nullptr, cDesc_, nullptr, nullptr, nullptr, weightSpaceSize_, weights_, workspaceSize_, workspace_, reserveSpaceSize_, reserveSpace_ ));
    checkCUDNN(cudnnRNNBackwardWeights_v8( cudnnHandle_, rnnDesc_, CUDNN_WGRAD_MODE_ADD, seqLengthArray_, xDesc_, inData_, hDesc_, nullptr, yDesc_, outData_, weightSpaceSize_, gradWeights_, workspaceSize_, workspace_, reserveSpaceSize_, reserveSpace_ ));
    return gradWeights_;
}