Modified ONNX sample gives wrong output: where's the mistake?

I wanted to test concurrent execution with 2 execution contexts on 2 CUDA streams.
I wrote the code below, but it gives an obviously wrong result: a 10% probability for each class on both executions.
Code:

#include <NvInfer.h>
#include <NvOnnxParser.h>
#include <cuda_runtime_api.h>

#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <iomanip>
#include <algorithm>

//--------------------------------------------------------------------------------
#define CHECK(status)                                              \
		do                                                         \
		{                                                          \
			auto ret = (status);                                   \
			if (ret != 0)                                          \
			{                                                      \
				std::cerr << "CUDA error " << ret << ": "		   \
					<< cudaGetErrorString(ret) << std::endl;	   \
				abort();                                           \
			}                                                      \
		} while (0)

//--------------------------------------------------------------------------------
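// Minimal ILogger: prints messages at least as severe as the configured threshold.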
class gLogger : public nvinfer1::ILogger
{
private:
	Severity level;
public:
	void log(Severity severity, const char* msg) override
	{
		if (severity <= level)
			std::cout << msg << std::endl;
	}
	gLogger(Severity level) : level(level) {}
	~gLogger() {}
};

//--------------------------------------------------------------------------------
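// Deleter for use with smart pointers: TensorRT 7 objects are released via
// destroy() rather than operator delete.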
struct InferDeleter
{
	template <typename T>
	void operator()(T* obj) const
	{
		if (obj)
			obj->destroy();
	}
};

//--------------------------------------------------------------------------------
int main()
{
	gLogger logger(nvinfer1::ILogger::Severity::kINFO);
	nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(logger);
	nvinfer1::INetworkDefinition* network = builder->createNetwork();
	nvonnxparser::IParser* parser = nvonnxparser::createParser(*network, logger);
	bool parsed = parser->parseFromFile("mnist.onnx", static_cast<int>(nvinfer1::ILogger::Severity::kWARNING));

	std::cout << network->getNbInputs() << " inputs" << std::endl;
	auto s = network->getInput(0)->getDimensions();
	std::cout << s.nbDims << " input dimensions:" << std::endl;
	for (int i = 0; i < s.nbDims; i++)
		std::cout << "\t[" << i << "] " << s.d[i] << "" << std::endl;
	std::cout << network->getNbOutputs() << " outputs" << std::endl;
	s = network->getOutput(0)->getDimensions();
	std::cout << s.nbDims << " output dimensions:" << std::endl;
	for (int i = 0; i < s.nbDims; i++)
		std::cout << "\t[" << i << "] " << s.d[i] << "" << std::endl;

	builder->setMaxBatchSize(1);
	nvinfer1::IBuilderConfig* config = builder->createBuilderConfig();
	int maxWorkspaceSize{ 64 };
	config->setMaxWorkspaceSize(maxWorkspaceSize * (1 << 20));
	auto cudaEngine = std::shared_ptr<nvinfer1::ICudaEngine>(builder->buildEngineWithConfig(*network, *config), InferDeleter());
	auto inputDims = network->getInput(0)->getDimensions();
	auto outputDims = network->getOutput(0)->getDimensions();
	config->destroy();
	builder->destroy();
	parser->destroy();
	std::cout << "Engine created" << std::endl;

	size_t inputSize = inputDims.d[0] * inputDims.d[1] * inputDims.d[2];
	size_t outputSize = outputDims.d[0];

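	// One stream and one pair of device buffers per execution; for this engine,
	// binding 0 is the network input and binding 1 the output.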
	cudaStream_t stream1;
	cudaStreamCreateWithFlags(&stream1, cudaStreamNonBlocking);
	void* bindings1[2];
	CHECK(cudaMalloc(&bindings1[0], inputSize * sizeof(float)));
	CHECK(cudaMalloc(&bindings1[1], outputSize * sizeof(float)));
	std::cout << "Created stream1, bindings1" << std::endl;

	cudaStream_t stream2;
	cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking);
	void* bindings2[2];
	CHECK(cudaMalloc(&bindings2[0], inputSize * sizeof(float)));
	CHECK(cudaMalloc(&bindings2[1], outputSize * sizeof(float)));
	std::cout << "Created stream2, bindings2" << std::endl;

	auto context1 = cudaEngine->createExecutionContext();
	auto context2 = cudaEngine->createExecutionContext();
	std::cout << "Created contexts" << std::endl;

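	// Events that enqueue() signals once the input bindings may be reused.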
	cudaEvent_t inputConsumed1, inputConsumed2;
	cudaEventCreate(&inputConsumed1);
	cudaEventCreate(&inputConsumed2);
	std::cout << "Created CUDA events" << std::endl;

	std::vector<uint8_t> blob(inputSize);
	std::ifstream infile("4.pgm", std::ifstream::binary);
	std::string magic, w, h, max;
	infile >> magic >> w >> h >> max;
	infile.seekg(1, infile.cur);
	infile.read(reinterpret_cast<char*>(blob.data()), inputSize);
	infile.close();
	std::cout << "Input:" << std::endl;
	for (int i = 0; i < inputSize; i++)
		std::cout << (" .:-=+*#%@"[blob[i] / 26]) << (((i + 1) % inputDims.d[2]) ? "" : "\n");

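	// Normalize to [0, 1] and invert the grayscale, matching the preprocessing
	// in sampleOnnxMNIST.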
	std::vector<float> input1(inputSize);
	std::vector<float> input2(inputSize);
	for (int i = 0; i < inputSize; i++)
	{
		input1[i] = 1.0 - float(blob[i] / 255.0);
		input2[i] = 1.0 - float(blob[i] / 255.0);
	}
	std::cout << "Input prepared" << std::endl;

	CHECK(cudaMemcpy(bindings1[0], (void*)(input1.data()), inputSize * sizeof(float), cudaMemcpyHostToDevice));
	CHECK(cudaMemcpy(bindings2[0], (void*)(input2.data()), inputSize * sizeof(float), cudaMemcpyHostToDevice));
	std::cout << "cudaMemcpy(HtD)" << std::endl;

	context1->enqueue(1, bindings1, stream1, &inputConsumed1);
	context2->enqueue(1, bindings2, stream2, &inputConsumed2);
	std::cout << "Enqueue launched" << std::endl;
	
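	// Busy-wait until both streams have drained; cudaStreamSynchronize on each
	// stream would do the same without spinning.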
	while (!(cudaStreamQuery(stream1) == cudaError::cudaSuccess && cudaStreamQuery(stream2) == cudaError::cudaSuccess))
		;

	float* result1 = new float[outputSize];
	float* result2 = new float[outputSize];

	CHECK(cudaMemcpy(result1, bindings1[0], outputSize * sizeof(float), cudaMemcpyDeviceToHost));
	CHECK(cudaMemcpy(result2, bindings2[0], outputSize * sizeof(float), cudaMemcpyDeviceToHost));
	std::cout << "cudaMemcpy(DtH)" << std::endl;

	int idx = 0;
	float sum = 0.0f, val = 0.0f;

	// Softmax over the logits for execution 1
	for (int i = 0; i < outputSize; i++)
	{
		result1[i] = exp(result1[i]);
		sum += result1[i];
	}
	std::cout << "Output 1:" << std::endl;
	for (int i = 0; i < outputSize; i++)
	{
		result1[i] /= sum;
		val = std::max(val, result1[i]);
		if (val == result1[i])
			idx = i;
		std::cout << " Prob " << i << "  " << std::fixed << std::setw(5) << std::setprecision(4) << result1[i] << " "
			<< "Class " << i << ": " << std::string(int(std::floor(result1[i] * 10 + 0.5f)), '*') << std::endl;
	}
	std::cout << std::endl;

	// Softmax over the logits for execution 2
	idx = 0;
	sum = 0.0f;
	val = 0.0f;
	for (int i = 0; i < outputSize; i++)
	{
		result2[i] = exp(result2[i]);
		sum += result2[i];
	}
	std::cout << "Output 2:" << std::endl;
	for (int i = 0; i < outputSize; i++)
	{
		result2[i] /= sum;
		val = std::max(val, result2[i]);
		if (val == result2[i])
			idx = i;
		std::cout << " Prob " << i << "  " << std::fixed << std::setw(5) << std::setprecision(4) << result2[i] << " "
			<< "Class " << i << ": " << std::string(int(std::floor(result2[i] * 10 + 0.5f)), '*') << std::endl;
	}
	std::cout << std::endl;
}

Command line output with wrong result:

Output 1:
 Prob 0  0.1000 Class 0: *
 Prob 1  0.1000 Class 1: *
 Prob 2  0.1000 Class 2: *
 Prob 3  0.1000 Class 3: *
 Prob 4  0.1000 Class 4: *
 Prob 5  0.1000 Class 5: *
 Prob 6  0.1000 Class 6: *
 Prob 7  0.1000 Class 7: *
 Prob 8  0.1000 Class 8: *
 Prob 9  0.1000 Class 9: *

Output 2:
 Prob 0  0.1000 Class 0: *
 Prob 1  0.1000 Class 1: *
 Prob 2  0.1000 Class 2: *
 Prob 3  0.1000 Class 3: *
 Prob 4  0.1000 Class 4: *
 Prob 5  0.1000 Class 5: *
 Prob 6  0.1000 Class 6: *
 Prob 7  0.1000 Class 7: *
 Prob 8  0.1000 Class 8: *
 Prob 9  0.1000 Class 9: *

Correct result from sampleOnnxMNIST:

[05/22/2020-18:25:09] [I] Output:
[05/22/2020-18:25:09] [I]  Prob 0  0.0000 Class 0:
[05/22/2020-18:25:09] [I]  Prob 1  0.0000 Class 1:
[05/22/2020-18:25:09] [I]  Prob 2  0.0000 Class 2:
[05/22/2020-18:25:09] [I]  Prob 3  0.0000 Class 3:
[05/22/2020-18:25:09] [I]  Prob 4  1.0000 Class 4: **********
[05/22/2020-18:25:09] [I]  Prob 5  0.0000 Class 5:
[05/22/2020-18:25:09] [I]  Prob 6  0.0000 Class 6:
[05/22/2020-18:25:09] [I]  Prob 7  0.0000 Class 7:
[05/22/2020-18:25:09] [I]  Prob 8  0.0000 Class 8:
[05/22/2020-18:25:09] [I]  Prob 9  0.0000 Class 9:

Where did I make a mistake?

Hello @redoutracer,
Could you please share the following details:
o Linux distro and version
o GPU type
o Nvidia driver version
o CUDA version
o CUDNN version
o Python version [if using Python]
o TensorFlow and PyTorch version
o TensorRT version

Also, looking at the code, here is an observation that might be useful: the device-to-host copies read from the input buffers (bindings[0]) rather than the output buffers. They should be:

CHECK(cudaMemcpy(result1, bindings1[1], outputSize * sizeof(float), cudaMemcpyDeviceToHost));
CHECK(cudaMemcpy(result2, bindings2[1], outputSize * sizeof(float), cudaMemcpyDeviceToHost));
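
For the concurrency test you are after, you could also issue the copies asynchronously on each stream and then synchronize, instead of busy-waiting. A sketch with the same buffer layout (for truly asynchronous copies the host buffers would need to be pinned with cudaMallocHost):

// Queue each copy on its own stream, then wait for both streams to finish.
CHECK(cudaMemcpyAsync(result1, bindings1[1], outputSize * sizeof(float), cudaMemcpyDeviceToHost, stream1));
CHECK(cudaMemcpyAsync(result2, bindings2[1], outputSize * sizeof(float), cudaMemcpyDeviceToHost, stream2));
CHECK(cudaStreamSynchronize(stream1));
CHECK(cudaStreamSynchronize(stream2));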

We will work on reproducing the issue to share more details.
Thanks!

Please check whether you are copying from the correct output buffer.
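
One way to make this mistake harder is to query the engine for its binding indices instead of hardcoding 0 and 1. A sketch, assuming a single-input, single-output engine like this one:

// Resolve which binding is the input and which is the output.
int inputIdx = -1, outputIdx = -1;
for (int i = 0; i < cudaEngine->getNbBindings(); i++)
{
	if (cudaEngine->bindingIsInput(i))
		inputIdx = i;
	else
		outputIdx = i;
}
// ...
CHECK(cudaMemcpy(result1, bindings1[outputIdx], outputSize * sizeof(float), cudaMemcpyDeviceToHost));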

Yeah… that was it: the wrong bindings index.
Thanks.
It works fine now:

Output 1:
 Prob 0  0.0000 Class 0:
 Prob 1  0.0000 Class 1:
 Prob 2  0.0000 Class 2:
 Prob 3  0.0000 Class 3:
 Prob 4  1.0000 Class 4: **********
 Prob 5  0.0000 Class 5:
 Prob 6  0.0000 Class 6:
 Prob 7  0.0000 Class 7:
 Prob 8  0.0000 Class 8:
 Prob 9  0.0000 Class 9:

Output 2:
 Prob 0  0.0000 Class 0:
 Prob 1  0.0000 Class 1:
 Prob 2  0.0000 Class 2:
 Prob 3  0.0000 Class 3:
 Prob 4  1.0000 Class 4: **********
 Prob 5  0.0000 Class 5:
 Prob 6  0.0000 Class 6:
 Prob 7  0.0000 Class 7:
 Prob 8  0.0000 Class 8:
 Prob 9  0.0000 Class 9: