Cuda error in file C:/source/rtExt/winograd/src/implicit_gemm.cu at line 648: invalid argument

Description

Good day!
I have a problem with a network composed with tensorRT layers. Engine builds without failures, inference pass as well and shows the result but it is not correct and an error appears “Cuda error in file C:/source/rtExt/winograd/src/implicit_gemm.cu at line 648: invalid argument”.

Code snippet:

// Builds one refinement block: conv(3x3, pad 1) -> ReLU -> conv(3x3, pad 1) -> ReLU.
// Weights/biases are fetched from the maps using the per-layer key prefix with
// suffixes ".0" (first conv) and ".2" (second conv), matching the PyTorch
// nn.Sequential indices of the exported model.
// Returns the final ReLU layer, or nullptr for an unknown `layer` value.
// NOTE: map lookups keep the original operator[] semantics — a missing key
// default-inserts an empty Weights{} (null values) rather than failing fast.
nvinfer1::IActivationLayer* Tensor::refineBlock(nvinfer1::INetworkDefinition *network, std::map<std::string, nvinfer1::Weights>& weightMap,
	std::map<std::string, nvinfer1::Weights>& biasMap, nvinfer1::ITensor& input, layers layer)
{
	// Per-block configuration: weight-map key prefix and the output channel
	// counts of the two stacked convolutions. Every block uses 3x3 kernels.
	const char* prefix = nullptr;
	int outch1 = 0, outch2 = 0;
	switch (layer)
	{
	case layers::h0: prefix = "hh0"; outch1 = 4;   outch2 = 4;  break;
	case layers::h1: prefix = "hh1"; outch1 = 16;  outch2 = 16; break;
	case layers::h2: prefix = "hh2"; outch1 = 32;  outch2 = 32; break;
	case layers::v0: prefix = "vv0"; outch1 = 16;  outch2 = 4;  break;
	case layers::v1: prefix = "vv1"; outch1 = 64;  outch2 = 16; break;
	case layers::v2: prefix = "vv2"; outch1 = 128; outch2 = 32; break;
	default:
		return nullptr;
	}
	const std::string key1 = std::string(prefix) + ".0";
	const std::string key2 = std::string(prefix) + ".2";
	const nvinfer1::DimsHW ksize{ 3, 3 };
	const nvinfer1::DimsHW pad{ 1, 1 };   // "same" padding for the 3x3 kernels

	nvinfer1::IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch1, ksize, weightMap[key1], biasMap[key1]);
	assert(conv1);
	conv1->setPaddingNd(pad);
	nvinfer1::IActivationLayer* relu1 = network->addActivation(*conv1->getOutput(0), nvinfer1::ActivationType::kRELU);
	assert(relu1);
	nvinfer1::IConvolutionLayer* conv2 = network->addConvolutionNd(*relu1->getOutput(0), outch2, ksize, weightMap[key2], biasMap[key2]);
	assert(conv2);
	conv2->setPaddingNd(pad);
	nvinfer1::IActivationLayer* relu2 = network->addActivation(*conv2->getOutput(0), nvinfer1::ActivationType::kRELU);
	assert(relu2);
	return relu2;
}

// Builds the refinement network (explicit-batch, dynamic batch dimension),
// creates the engine + execution context, serializes the engine to disk, and
// frees the host-side weight buffers. Returns false if the builder config
// could not be created; asserts on engine-build failure.
// Inputs: "premier" (Nx256x1x1), "p2" (Nx512x15x15), "p1" (Nx256x31x31),
// "p0" (Nx64x61x61); one optimization profile covers batch 1..10.
bool Tensor::create_network_from_weights(std::map<std::string, nvinfer1::Weights> wghts, std::map<std::string, nvinfer1::Weights> bias, uint32_t max_batch_size)
{
	nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(_logger);
	const auto explicitBatch = 1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
	nvinfer1::INetworkDefinition *network(builder->createNetworkV2(explicitBatch));

	// 1x1 latent -> 15x15 feature map via a 15x15 deconvolution (default stride 1).
	nvinfer1::ITensor *input = network->addInput("premier", nvinfer1::DataType::kFLOAT, nvinfer1::Dims4{ -1, 256, 1, 1 });
	nvinfer1::IDeconvolutionLayer* deconv = network->addDeconvolutionNd(*input, 32, nvinfer1::DimsHW{ 15, 15 }, wghts["decon"], bias["decon"]);

	// Stage 2: refine deconv output, fuse with skip input p2, upsample to 31x31.
	nvinfer1::IActivationLayer* block_h2 = refineBlock(network, wghts, bias, *deconv->getOutput(0), layers::h2);
	nvinfer1::ITensor *p2 = network->addInput("p2", nvinfer1::DataType::kFLOAT, nvinfer1::Dims4{ -1, 512, 15, 15 });
	nvinfer1::IActivationLayer* block_v2 = refineBlock(network, wghts, bias, *p2, layers::v2);
	nvinfer1::IElementWiseLayer* sum_h2_v2 = network->addElementWise(*block_h2->getOutput(0), *block_v2->getOutput(0), nvinfer1::ElementWiseOperation::kSUM);
	nvinfer1::IResizeLayer* interpolation2 = network->addResize(*sum_h2_v2->getOutput(0));
	interpolation2->setResizeMode(nvinfer1::ResizeMode::kLINEAR);
	// NOTE(review): fixed batch 1 in the resize output dims conflicts with the
	// dynamic (-1) batch of the inputs — suspect source of the runtime
	// "invalid argument" error; setScales() would keep the batch dynamic.
	interpolation2->setOutputDimensions(nvinfer1::Dims4{ 1, 32, 31, 31 });
	nvinfer1::IConvolutionLayer* post0 = network->addConvolutionNd(*interpolation2->getOutput(0), 16, nvinfer1::DimsHW{ 3,3 }, wghts["post0"], bias["post0"]);
	assert(post0);
	post0->setPaddingNd(nvinfer1::DimsHW{ 1,1 });
	// Stage 1: fuse with skip input p1, upsample to 61x61.
	nvinfer1::IActivationLayer* block_h1 = refineBlock(network, wghts, bias, *post0->getOutput(0), layers::h1);
	nvinfer1::ITensor *p1 = network->addInput("p1", nvinfer1::DataType::kFLOAT, nvinfer1::Dims4{ -1, 256, 31, 31 });
	nvinfer1::IActivationLayer* block_v1 = refineBlock(network, wghts, bias, *p1, layers::v1);
	nvinfer1::IElementWiseLayer* sum_h1_v1 = network->addElementWise(*block_h1->getOutput(0), *block_v1->getOutput(0), nvinfer1::ElementWiseOperation::kSUM);
	nvinfer1::IResizeLayer* interpolation1 = network->addResize(*sum_h1_v1->getOutput(0));
	interpolation1->setResizeMode(nvinfer1::ResizeMode::kLINEAR);
	interpolation1->setOutputDimensions(nvinfer1::Dims4{ 1, 16, 61, 61 });
	nvinfer1::IConvolutionLayer* post1 = network->addConvolutionNd(*interpolation1->getOutput(0), 4, nvinfer1::DimsHW{ 3,3 }, wghts["post1"], bias["post1"]);
	assert(post1);
	post1->setPaddingNd(nvinfer1::DimsHW{ 1,1 });
	// Stage 0: fuse with skip input p0, upsample to 127x127, single-channel sigmoid head.
	nvinfer1::IActivationLayer* block_h0 = refineBlock(network, wghts, bias, *post1->getOutput(0), layers::h0);
	nvinfer1::ITensor *p0 = network->addInput("p0", nvinfer1::DataType::kFLOAT, nvinfer1::Dims4{ -1, 64, 61, 61 });
	nvinfer1::IActivationLayer* block_v0 = refineBlock(network, wghts, bias, *p0, layers::v0);
	nvinfer1::IElementWiseLayer* sum_h0_v0 = network->addElementWise(*block_h0->getOutput(0), *block_v0->getOutput(0), nvinfer1::ElementWiseOperation::kSUM);
	nvinfer1::IResizeLayer* interpolation0 = network->addResize(*sum_h0_v0->getOutput(0));
	interpolation0->setResizeMode(nvinfer1::ResizeMode::kLINEAR);
	interpolation0->setOutputDimensions(nvinfer1::Dims4{ 1, 4, 127, 127 });
	nvinfer1::IConvolutionLayer* post2 = network->addConvolutionNd(*interpolation0->getOutput(0), 1, nvinfer1::DimsHW{ 3,3 }, wghts["post2"], bias["post2"]);
	assert(post2);
	post2->setPaddingNd(nvinfer1::DimsHW{ 1,1 });
	nvinfer1::IActivationLayer* sigmoid = network->addActivation(*post2->getOutput(0), nvinfer1::ActivationType::kSIGMOID);
	nvinfer1::ITensor* output = sigmoid->getOutput(0);


	auto preprocessorConfig = builder->createBuilderConfig();
	if (!preprocessorConfig)
		return false;

	// Dynamic-shape inputs need an optimization profile covering min/opt/max dims.
	auto profile = builder->createOptimizationProfile();


	profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kMIN, nvinfer1::Dims4{ 1, 256, 1, 1 });
	profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kOPT, nvinfer1::Dims4{ 1, 256, 1, 1 });
	profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kMAX, nvinfer1::Dims4{ 10, 256, 1, 1 });
	profile->setDimensions(p2->getName(), nvinfer1::OptProfileSelector::kMIN, nvinfer1::Dims4{ 1, 512, 15, 15 });
	profile->setDimensions(p2->getName(), nvinfer1::OptProfileSelector::kOPT, nvinfer1::Dims4{ 1, 512, 15, 15 });
	profile->setDimensions(p2->getName(), nvinfer1::OptProfileSelector::kMAX, nvinfer1::Dims4{ 10, 512, 15, 15 });
	profile->setDimensions(p1->getName(), nvinfer1::OptProfileSelector::kMIN, nvinfer1::Dims4{ 1, 256, 31, 31 });
	profile->setDimensions(p1->getName(), nvinfer1::OptProfileSelector::kOPT, nvinfer1::Dims4{ 1, 256, 31, 31 });
	profile->setDimensions(p1->getName(), nvinfer1::OptProfileSelector::kMAX, nvinfer1::Dims4{ 10, 256, 31, 31 });
	profile->setDimensions(p0->getName(), nvinfer1::OptProfileSelector::kMIN, nvinfer1::Dims4{ 1, 64, 61, 61 });
	profile->setDimensions(p0->getName(), nvinfer1::OptProfileSelector::kOPT, nvinfer1::Dims4{ 1, 64, 61, 61 });
	profile->setDimensions(p0->getName(), nvinfer1::OptProfileSelector::kMAX, nvinfer1::Dims4{ 10, 64, 61, 61 });
	preprocessorConfig->addOptimizationProfile(profile);
	
	cudaDeviceProp prop;
	cudaGetDeviceProperties(&prop, 0);

	// The config's workspace size is the one buildEngineWithConfig honours;
	// the builder-level setter is kept only for pre-config API compatibility.
	preprocessorConfig->setMaxWorkspaceSize(prop.totalGlobalMem / 4);
	builder->setMaxWorkspaceSize(prop.totalGlobalMem / 4);

	network->markOutput(*output);

	_engine = builder->buildEngineWithConfig(*network, *preprocessorConfig);
	assert(_engine);
	save_engine(const_cast<char*>("TEST_ENGINE_3_trt8"), _engine);
	int count = 0;
	for (int i = 0; i < _engine->getNbBindings(); i++) {
		if (!_engine->bindingIsInput(i))
			count++;
	}
	std::cout << "ONNX engine bindings: " << _engine->getNbBindings() << " outputs: " << count << std::endl;
	_context = _engine->createExecutionContext();
	device_buffers.resize(_engine->getNbBindings());
	device_buffers_sizes.resize(_engine->getNbBindings());

	// Free the host-side weight buffers now that the engine owns its own copies.
	// BUG FIX: the original `for (auto& mem : wghts, bias)` used the comma
	// operator, so only `bias` was iterated and every buffer in `wghts` leaked.
	for (auto& mem : wghts)
		free((void*)(mem.second.values));
	for (auto& mem : bias)
		free((void*)(mem.second.values));

	network->destroy();
	builder->destroy();
	preprocessorConfig->destroy();

	return true;
}


// Runs one inference pass on its own CUDA stream. `Buffers` must hold one
// device pointer per engine binding, in binding-index order.
void inference(std::vector<void*>& Buffers) 
{
	cudaStream_t stream;
	CUDA_SAFE_CALL(cudaStreamCreate(&stream));
	// A dynamic-shape engine requires the optimization profile and concrete
	// input dimensions to be set before enqueue.
	// NOTE(review): gating this on the optimized layer count (23) is fragile —
	// TensorRT may fuse layers differently across versions/GPUs and the setup
	// would be silently skipped; prefer checking getNbOptimizationProfiles().
	if (_engine->getNbLayers() == 23)
	{
		_context->setOptimizationProfile(0);
		// BUG FIX: look bindings up by the names assigned at network-definition
		// time instead of assuming fixed binding indices 0..3 — binding order
		// is not guaranteed to match the order inputs were added.
		_context->setBindingDimensions(_engine->getBindingIndex("premier"), nvinfer1::Dims4(1, 256, 1, 1));
		_context->setBindingDimensions(_engine->getBindingIndex("p2"), nvinfer1::Dims4(1, 512, 15, 15));
		_context->setBindingDimensions(_engine->getBindingIndex("p1"), nvinfer1::Dims4(1, 256, 31, 31));
		_context->setBindingDimensions(_engine->getBindingIndex("p0"), nvinfer1::Dims4(1, 64, 61, 61));
	}
	// BUG FIX: the enqueue result was silently discarded; report failure so a
	// bad launch (e.g. unset binding dims) is not mistaken for wrong output.
	if (!_context->enqueueV2(Buffers.data(), stream, nullptr))
		std::cerr << "enqueueV2 failed" << std::endl;
	cudaStreamSynchronize(stream);
	cudaStreamDestroy(stream);
}   

Environment

TensorRT Version: 7.2.3.4
GPU Type: GTX1060
Nvidia Driver Version: 456.71
CUDA Version: 10.2
CUDNN Version: 7.6.5
Operating System + Version: Win10
Python Version (if applicable): 3.7
TensorFlow Version (if applicable): -
PyTorch Version (if applicable): 1.0.0
Baremetal or Container (if container which image + tag):

Relevant Files

Please attach or include links to any models, data, files, or scripts necessary to reproduce your issue. (Github repo, Google Drive, Dropbox, etc.)

ModelBias.npy (1.8 KB)

Hi @freetown113,

We recommend you to try on latest TensorRT version 8.0. Please let us know if you still face this issue.

Thank you.