Cuda error in file C:/source/rtExt/winograd/src/implicit_gemm.cu at line 648: invalid argument

Description

Good day!
I have a problem with a network composed with tensorRT layers. Engine builds without failures, inference pass as well and shows the result but it is not correct and an error appears “Cuda error in file C:/source/rtExt/winograd/src/implicit_gemm.cu at line 648: invalid argument”.

Code snippet:

// Builds one refinement block: conv(3x3, pad 1) -> ReLU -> conv(3x3, pad 1) -> ReLU.
// Weights/biases are fetched from the maps using the per-layer key prefix with
// suffixes ".0" (first conv) and ".2" (second conv), matching the PyTorch
// nn.Sequential indices of the exported model.
// Returns the final ReLU layer, or nullptr for an unknown `layer` value.
// NOTE: map lookups keep the original operator[] semantics — a missing key
// default-inserts an empty Weights{} (null values) rather than failing fast.
nvinfer1::IActivationLayer* Tensor::refineBlock(nvinfer1::INetworkDefinition *network, std::map<std::string, nvinfer1::Weights>& weightMap,
	std::map<std::string, nvinfer1::Weights>& biasMap, nvinfer1::ITensor& input, layers layer)
{
	// Per-block configuration: weight-map key prefix and the output channel
	// counts of the two stacked convolutions. Every block uses 3x3 kernels.
	const char* prefix = nullptr;
	int outch1 = 0, outch2 = 0;
	switch (layer)
	{
	case layers::h0: prefix = "hh0"; outch1 = 4;   outch2 = 4;  break;
	case layers::h1: prefix = "hh1"; outch1 = 16;  outch2 = 16; break;
	case layers::h2: prefix = "hh2"; outch1 = 32;  outch2 = 32; break;
	case layers::v0: prefix = "vv0"; outch1 = 16;  outch2 = 4;  break;
	case layers::v1: prefix = "vv1"; outch1 = 64;  outch2 = 16; break;
	case layers::v2: prefix = "vv2"; outch1 = 128; outch2 = 32; break;
	default:
		return nullptr;
	}
	const std::string key1 = std::string(prefix) + ".0";
	const std::string key2 = std::string(prefix) + ".2";
	const nvinfer1::DimsHW ksize{ 3, 3 };
	const nvinfer1::DimsHW pad{ 1, 1 };   // "same" padding for the 3x3 kernels

	nvinfer1::IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch1, ksize, weightMap[key1], biasMap[key1]);
	assert(conv1);
	conv1->setPaddingNd(pad);
	nvinfer1::IActivationLayer* relu1 = network->addActivation(*conv1->getOutput(0), nvinfer1::ActivationType::kRELU);
	assert(relu1);
	nvinfer1::IConvolutionLayer* conv2 = network->addConvolutionNd(*relu1->getOutput(0), outch2, ksize, weightMap[key2], biasMap[key2]);
	assert(conv2);
	conv2->setPaddingNd(pad);
	nvinfer1::IActivationLayer* relu2 = network->addActivation(*conv2->getOutput(0), nvinfer1::ActivationType::kRELU);
	assert(relu2);
	return relu2;
}

// Builds the refinement network (explicit-batch, dynamic batch dimension),
// creates the engine + execution context, serializes the engine to disk, and
// frees the host-side weight buffers. Returns false if the builder config
// could not be created; asserts on engine-build failure.
// Inputs: "premier" (Nx256x1x1), "p2" (Nx512x15x15), "p1" (Nx256x31x31),
// "p0" (Nx64x61x61); one optimization profile covers batch 1..10.
bool Tensor::create_network_from_weights(std::map<std::string, nvinfer1::Weights> wghts, std::map<std::string, nvinfer1::Weights> bias, uint32_t max_batch_size)
{
	nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(_logger);
	const auto explicitBatch = 1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
	nvinfer1::INetworkDefinition *network(builder->createNetworkV2(explicitBatch));

	// 1x1 latent -> 15x15 feature map via a 15x15 deconvolution (default stride 1).
	nvinfer1::ITensor *input = network->addInput("premier", nvinfer1::DataType::kFLOAT, nvinfer1::Dims4{ -1, 256, 1, 1 });
	nvinfer1::IDeconvolutionLayer* deconv = network->addDeconvolutionNd(*input, 32, nvinfer1::DimsHW{ 15, 15 }, wghts["decon"], bias["decon"]);

	// Stage 2: refine deconv output, fuse with skip input p2, upsample to 31x31.
	nvinfer1::IActivationLayer* block_h2 = refineBlock(network, wghts, bias, *deconv->getOutput(0), layers::h2);
	nvinfer1::ITensor *p2 = network->addInput("p2", nvinfer1::DataType::kFLOAT, nvinfer1::Dims4{ -1, 512, 15, 15 });
	nvinfer1::IActivationLayer* block_v2 = refineBlock(network, wghts, bias, *p2, layers::v2);
	nvinfer1::IElementWiseLayer* sum_h2_v2 = network->addElementWise(*block_h2->getOutput(0), *block_v2->getOutput(0), nvinfer1::ElementWiseOperation::kSUM);
	nvinfer1::IResizeLayer* interpolation2 = network->addResize(*sum_h2_v2->getOutput(0));
	interpolation2->setResizeMode(nvinfer1::ResizeMode::kLINEAR);
	// NOTE(review): fixed batch 1 in the resize output dims conflicts with the
	// dynamic (-1) batch of the inputs — suspect source of the runtime
	// "invalid argument" error; setScales() would keep the batch dynamic.
	interpolation2->setOutputDimensions(nvinfer1::Dims4{ 1, 32, 31, 31 });
	nvinfer1::IConvolutionLayer* post0 = network->addConvolutionNd(*interpolation2->getOutput(0), 16, nvinfer1::DimsHW{ 3,3 }, wghts["post0"], bias["post0"]);
	assert(post0);
	post0->setPaddingNd(nvinfer1::DimsHW{ 1,1 });
	// Stage 1: fuse with skip input p1, upsample to 61x61.
	nvinfer1::IActivationLayer* block_h1 = refineBlock(network, wghts, bias, *post0->getOutput(0), layers::h1);
	nvinfer1::ITensor *p1 = network->addInput("p1", nvinfer1::DataType::kFLOAT, nvinfer1::Dims4{ -1, 256, 31, 31 });
	nvinfer1::IActivationLayer* block_v1 = refineBlock(network, wghts, bias, *p1, layers::v1);
	nvinfer1::IElementWiseLayer* sum_h1_v1 = network->addElementWise(*block_h1->getOutput(0), *block_v1->getOutput(0), nvinfer1::ElementWiseOperation::kSUM);
	nvinfer1::IResizeLayer* interpolation1 = network->addResize(*sum_h1_v1->getOutput(0));
	interpolation1->setResizeMode(nvinfer1::ResizeMode::kLINEAR);
	interpolation1->setOutputDimensions(nvinfer1::Dims4{ 1, 16, 61, 61 });
	nvinfer1::IConvolutionLayer* post1 = network->addConvolutionNd(*interpolation1->getOutput(0), 4, nvinfer1::DimsHW{ 3,3 }, wghts["post1"], bias["post1"]);
	assert(post1);
	post1->setPaddingNd(nvinfer1::DimsHW{ 1,1 });
	// Stage 0: fuse with skip input p0, upsample to 127x127, single-channel sigmoid head.
	nvinfer1::IActivationLayer* block_h0 = refineBlock(network, wghts, bias, *post1->getOutput(0), layers::h0);
	nvinfer1::ITensor *p0 = network->addInput("p0", nvinfer1::DataType::kFLOAT, nvinfer1::Dims4{ -1, 64, 61, 61 });
	nvinfer1::IActivationLayer* block_v0 = refineBlock(network, wghts, bias, *p0, layers::v0);
	nvinfer1::IElementWiseLayer* sum_h0_v0 = network->addElementWise(*block_h0->getOutput(0), *block_v0->getOutput(0), nvinfer1::ElementWiseOperation::kSUM);
	nvinfer1::IResizeLayer* interpolation0 = network->addResize(*sum_h0_v0->getOutput(0));
	interpolation0->setResizeMode(nvinfer1::ResizeMode::kLINEAR);
	interpolation0->setOutputDimensions(nvinfer1::Dims4{ 1, 4, 127, 127 });
	nvinfer1::IConvolutionLayer* post2 = network->addConvolutionNd(*interpolation0->getOutput(0), 1, nvinfer1::DimsHW{ 3,3 }, wghts["post2"], bias["post2"]);
	assert(post2);
	post2->setPaddingNd(nvinfer1::DimsHW{ 1,1 });
	nvinfer1::IActivationLayer* sigmoid = network->addActivation(*post2->getOutput(0), nvinfer1::ActivationType::kSIGMOID);
	nvinfer1::ITensor* output = sigmoid->getOutput(0);


	auto preprocessorConfig = builder->createBuilderConfig();
	if (!preprocessorConfig)
		return false;

	// Dynamic-shape inputs need an optimization profile covering min/opt/max dims.
	auto profile = builder->createOptimizationProfile();


	profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kMIN, nvinfer1::Dims4{ 1, 256, 1, 1 });
	profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kOPT, nvinfer1::Dims4{ 1, 256, 1, 1 });
	profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kMAX, nvinfer1::Dims4{ 10, 256, 1, 1 });
	profile->setDimensions(p2->getName(), nvinfer1::OptProfileSelector::kMIN, nvinfer1::Dims4{ 1, 512, 15, 15 });
	profile->setDimensions(p2->getName(), nvinfer1::OptProfileSelector::kOPT, nvinfer1::Dims4{ 1, 512, 15, 15 });
	profile->setDimensions(p2->getName(), nvinfer1::OptProfileSelector::kMAX, nvinfer1::Dims4{ 10, 512, 15, 15 });
	profile->setDimensions(p1->getName(), nvinfer1::OptProfileSelector::kMIN, nvinfer1::Dims4{ 1, 256, 31, 31 });
	profile->setDimensions(p1->getName(), nvinfer1::OptProfileSelector::kOPT, nvinfer1::Dims4{ 1, 256, 31, 31 });
	profile->setDimensions(p1->getName(), nvinfer1::OptProfileSelector::kMAX, nvinfer1::Dims4{ 10, 256, 31, 31 });
	profile->setDimensions(p0->getName(), nvinfer1::OptProfileSelector::kMIN, nvinfer1::Dims4{ 1, 64, 61, 61 });
	profile->setDimensions(p0->getName(), nvinfer1::OptProfileSelector::kOPT, nvinfer1::Dims4{ 1, 64, 61, 61 });
	profile->setDimensions(p0->getName(), nvinfer1::OptProfileSelector::kMAX, nvinfer1::Dims4{ 10, 64, 61, 61 });
	preprocessorConfig->addOptimizationProfile(profile);
	
	cudaDeviceProp prop;
	cudaGetDeviceProperties(&prop, 0);

	// The config's workspace size is the one buildEngineWithConfig honours;
	// the builder-level setter is kept only for pre-config API compatibility.
	preprocessorConfig->setMaxWorkspaceSize(prop.totalGlobalMem / 4);
	builder->setMaxWorkspaceSize(prop.totalGlobalMem / 4);

	network->markOutput(*output);

	_engine = builder->buildEngineWithConfig(*network, *preprocessorConfig);
	assert(_engine);
	save_engine(const_cast<char*>("TEST_ENGINE_3_trt8"), _engine);
	int count = 0;
	for (int i = 0; i < _engine->getNbBindings(); i++) {
		if (!_engine->bindingIsInput(i))
			count++;
	}
	std::cout << "ONNX engine bindings: " << _engine->getNbBindings() << " outputs: " << count << std::endl;
	_context = _engine->createExecutionContext();
	device_buffers.resize(_engine->getNbBindings());
	device_buffers_sizes.resize(_engine->getNbBindings());

	// Free the host-side weight buffers now that the engine owns its own copies.
	// BUG FIX: the original `for (auto& mem : wghts, bias)` used the comma
	// operator, so only `bias` was iterated and every buffer in `wghts` leaked.
	for (auto& mem : wghts)
		free((void*)(mem.second.values));
	for (auto& mem : bias)
		free((void*)(mem.second.values));

	network->destroy();
	builder->destroy();
	preprocessorConfig->destroy();

	return true;
}


// Runs one inference pass on its own CUDA stream. `Buffers` must hold one
// device pointer per engine binding, in binding-index order.
void inference(std::vector<void*>& Buffers) 
{
	cudaStream_t stream;
	CUDA_SAFE_CALL(cudaStreamCreate(&stream));
	// A dynamic-shape engine requires the optimization profile and concrete
	// input dimensions to be set before enqueue.
	// NOTE(review): gating this on the optimized layer count (23) is fragile —
	// TensorRT may fuse layers differently across versions/GPUs and the setup
	// would be silently skipped; prefer checking getNbOptimizationProfiles().
	if (_engine->getNbLayers() == 23)
	{
		_context->setOptimizationProfile(0);
		// BUG FIX: look bindings up by the names assigned at network-definition
		// time instead of assuming fixed binding indices 0..3 — binding order
		// is not guaranteed to match the order inputs were added.
		_context->setBindingDimensions(_engine->getBindingIndex("premier"), nvinfer1::Dims4(1, 256, 1, 1));
		_context->setBindingDimensions(_engine->getBindingIndex("p2"), nvinfer1::Dims4(1, 512, 15, 15));
		_context->setBindingDimensions(_engine->getBindingIndex("p1"), nvinfer1::Dims4(1, 256, 31, 31));
		_context->setBindingDimensions(_engine->getBindingIndex("p0"), nvinfer1::Dims4(1, 64, 61, 61));
	}
	// BUG FIX: the enqueue result was silently discarded; report failure so a
	// bad launch (e.g. unset binding dims) is not mistaken for wrong output.
	if (!_context->enqueueV2(Buffers.data(), stream, nullptr))
		std::cerr << "enqueueV2 failed" << std::endl;
	cudaStreamSynchronize(stream);
	cudaStreamDestroy(stream);
}   

Environment

TensorRT Version: 7.2.3.4
GPU Type: GTX1060
Nvidia Driver Version: 456.71
CUDA Version: 10.2
CUDNN Version: 7.6.5
Operating System + Version: Win10
Python Version (if applicable): 3.7
TensorFlow Version (if applicable): -
PyTorch Version (if applicable): 1.0.0
Baremetal or Container (if container which image + tag):

Relevant Files

Please attach or include links to any models, data, files, or scripts necessary to reproduce your issue. (Github repo, Google Drive, Dropbox, etc.)

ModelBias.npy (1.8 KB)

Hi @freetown113,

We recommend you to try on latest TensorRT version 8.0. Please let us know if you still face this issue.

Thank you.