I wanted to test concurrent execution on 2 contexts with 2 CUDA streams.
I wrote this code, but it gives an obviously wrong result: a probability of 10% for each class in both executions.
Code:
#include <NvInfer.h>
#include <NvOnnxParser.h>
#include <cuda_runtime_api.h>

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <string>
#include <vector>
//--------------------------------------------------------------------------------
// Abort on any failed CUDA runtime call, reporting the failure location.
// Usage: CHECK(cudaMalloc(...)); — the argument is evaluated exactly once.
// Improvement over the original: uses cudaError_t/cudaSuccess explicitly and
// includes __FILE__:__LINE__ so the failing call site is identifiable.
#define CHECK(status) \
do \
{ \
    cudaError_t ret = (status); \
    if (ret != cudaSuccess) \
    { \
        std::cerr << "CUDA error " << ret << " at " << __FILE__ << ":" \
                  << __LINE__ << ": " << cudaGetErrorString(ret) << std::endl; \
        abort(); \
    } \
} while (0)
//--------------------------------------------------------------------------------
class gLogger : public nvinfer1::ILogger
{
private:
Severity level;
public:
void log(Severity severity, const char* msg) override
{
if (severity <= level)
std::cout << msg << std::endl;
};
gLogger(Severity level) { this->level = level; };
~gLogger() {};
};
//--------------------------------------------------------------------------------
// Deleter functor for smart pointers holding TensorRT objects.
// Pre-8.0 TensorRT objects are released via destroy(), not delete,
// so std::shared_ptr/unique_ptr need this custom deleter.
struct InferDeleter
{
    template <typename T>
    void operator()(T* obj) const
    {
        if (obj != nullptr)
        {
            obj->destroy();
        }
    }
};
//--------------------------------------------------------------------------------
int main()
{
gLogger logger(nvinfer1::ILogger::Severity::kINFO);;
nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(logger);
nvinfer1::INetworkDefinition* network = builder->createNetwork();
nvonnxparser::IParser* parser = nvonnxparser::createParser(*network, logger);
bool parsed = parser->parseFromFile("mnist.onnx", static_cast<int>(nvinfer1::ILogger::Severity::kWARNING));
std::cout << network->getNbOutputs() << " intputs" << std::endl;
auto s = network->getInput(0)->getDimensions();
std::cout << s.nbDims << " input dimensions:" << std::endl;
for (int i = 0; i < s.nbDims; i++)
std::cout << "\t[" << i << "] " << s.d[i] << "" << std::endl;
std::cout << network->getNbOutputs() << " outputs" << std::endl;
s = network->getOutput(0)->getDimensions();
std::cout << s.nbDims << " output dimensions:" << std::endl;
for (int i = 0; i < s.nbDims; i++)
std::cout << "\t[" << i << "] " << s.d[i] << "" << std::endl;
builder->setMaxBatchSize(1);
nvinfer1::IBuilderConfig* config = builder->createBuilderConfig();
int maxWorkspaceSize{ 64 };
config->setMaxWorkspaceSize(maxWorkspaceSize * (1 << 20));
auto cudaEngine = std::shared_ptr<nvinfer1::ICudaEngine>(builder->buildEngineWithConfig(*network, *config), InferDeleter());
auto inputDims = network->getInput(0)->getDimensions();
auto outputDims = network->getOutput(0)->getDimensions();
config->destroy();
builder->destroy();
parser->destroy();
std::cout << "Engine created" << std::endl;
size_t inputSize = inputDims.d[0] * inputDims.d[1] * inputDims.d[2];
size_t outputSize = outputDims.d[0];
cudaStream_t stream1;
cudaStreamCreateWithFlags(&stream1, cudaStreamNonBlocking);
void* bindings1[2];
CHECK(cudaMalloc(&bindings1[0], inputSize * sizeof(float)));
CHECK(cudaMalloc(&bindings1[1], outputSize * sizeof(float)));
std::cout << "Created stream1, bindings1" << std::endl;
cudaStream_t stream2;
cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking);
void* bindings2[2];
CHECK(cudaMalloc(&bindings2[0], inputSize * sizeof(float)));
CHECK(cudaMalloc(&bindings2[1], outputSize * sizeof(float)));
std::cout << "Created stream2, bindings2" << std::endl;
auto context1 = cudaEngine->createExecutionContext();
auto context2 = cudaEngine->createExecutionContext();
std::cout << "Created contexts" << std::endl;
cudaEvent_t inputConsumed1, inputConsumed2;
cudaEventCreate(&inputConsumed1);
cudaEventCreate(&inputConsumed2);
std::cout << "Created CUDA events" << std::endl;
std::vector<uint8_t> blob(inputSize);
std::ifstream infile("4.pgm", std::ifstream::binary);
std::string magic, h, w, max;
infile >> magic >> h >> w >> max;
infile.seekg(1, infile.cur);
infile.read(reinterpret_cast<char*>(blob.data()), inputSize);
infile.close();
std::cout << "Input:" << std::endl;
for (int i = 0; i < inputSize; i++)
std::cout << (" .:-=+*#%@"[blob[i] / 26]) << (((i + 1) % inputDims.d[2]) ? "" : "\n");
std::vector<float> input1(inputSize);
std::vector<float> input2(inputSize);
for (int i = 0; i < inputSize; i++)
{
input1[i] = 1.0 - float(blob[i] / 255.0);
input2[i] = 1.0 - float(blob[i] / 255.0);
}
std::cout << "Input prepared" << std::endl;
CHECK(cudaMemcpy(bindings1[0], (void*)(input1.data()), inputSize * sizeof(float), cudaMemcpyHostToDevice));
CHECK(cudaMemcpy(bindings2[0], (void*)(input2.data()), inputSize * sizeof(float), cudaMemcpyHostToDevice));
std::cout << "cudaMemcpy(HtD)" << std::endl;
context1->enqueue(1, bindings1, stream1, &inputConsumed1);
context2->enqueue(1, bindings2, stream2, &inputConsumed2);
std::cout << "Enqueue launched" << std::endl;
while (!(cudaStreamQuery(stream1) == cudaError::cudaSuccess && cudaStreamQuery(stream2) == cudaError::cudaSuccess))
;
float* result1 = new float[outputSize];
float* result2 = new float[outputSize];
CHECK(cudaMemcpy(result1, bindings1[0], outputSize * sizeof(float), cudaMemcpyDeviceToHost));
CHECK(cudaMemcpy(result2, bindings2[0], outputSize * sizeof(float), cudaMemcpyDeviceToHost));
std::cout << "cudaMemcpy(DtH)" << std::endl;
int idx = 0;
float sum = 0.0f, val = 0.0f;
// 1
for (int i = 0; i < outputSize; i++)
{
result1[i] = exp(result1[i]);
sum += result1[i];
}
std::cout << "Output 1:" << std::endl;
for (int i = 0; i < outputSize; i++)
{
result1[i] /= sum;
val = std::max(val, result1[i]);
if (val == result1[i])
idx = i;
std::cout << " Prob " << i << " " << std::fixed << std::setw(5) << std::setprecision(4) << result1[i] << " "
<< "Class " << i << ": " << std::string(int(std::floor(result1[i] * 10 + 0.5f)), '*') << std::endl;
}
std::cout << std::endl;
// 2
idx = 0;
sum = 0.0f;
val = 0.0f;
for (int i = 0; i < outputSize; i++)
{
result2[i] = exp(result2[i]);
sum += result2[i];
}
std::cout << "Output 2:" << std::endl;
for (int i = 0; i < outputSize; i++)
{
result2[i] /= sum;
val = std::max(val, result2[i]);
if (val == result2[i])
idx = i;
std::cout << " Prob " << i << " " << std::fixed << std::setw(5) << std::setprecision(4) << result2[i] << " "
<< "Class " << i << ": " << std::string(int(std::floor(result2[i] * 10 + 0.5f)), '*') << std::endl;
}
std::cout << std::endl;
}
Command line output with wrong result:
Output 1:
Prob 0 0.1000 Class 0: *
Prob 1 0.1000 Class 1: *
Prob 2 0.1000 Class 2: *
Prob 3 0.1000 Class 3: *
Prob 4 0.1000 Class 4: *
Prob 5 0.1000 Class 5: *
Prob 6 0.1000 Class 6: *
Prob 7 0.1000 Class 7: *
Prob 8 0.1000 Class 8: *
Prob 9 0.1000 Class 9: *
Output 2:
Prob 0 0.1000 Class 0: *
Prob 1 0.1000 Class 1: *
Prob 2 0.1000 Class 2: *
Prob 3 0.1000 Class 3: *
Prob 4 0.1000 Class 4: *
Prob 5 0.1000 Class 5: *
Prob 6 0.1000 Class 6: *
Prob 7 0.1000 Class 7: *
Prob 8 0.1000 Class 8: *
Prob 9 0.1000 Class 9: *
Correct result from sampleOnnxMNIST:
[05/22/2020-18:25:09] [I] Output:
[05/22/2020-18:25:09] [I] Prob 0 0.0000 Class 0:
[05/22/2020-18:25:09] [I] Prob 1 0.0000 Class 1:
[05/22/2020-18:25:09] [I] Prob 2 0.0000 Class 2:
[05/22/2020-18:25:09] [I] Prob 3 0.0000 Class 3:
[05/22/2020-18:25:09] [I] Prob 4 1.0000 Class 4: **********
[05/22/2020-18:25:09] [I] Prob 5 0.0000 Class 5:
[05/22/2020-18:25:09] [I] Prob 6 0.0000 Class 6:
[05/22/2020-18:25:09] [I] Prob 7 0.0000 Class 7:
[05/22/2020-18:25:09] [I] Prob 8 0.0000 Class 8:
[05/22/2020-18:25:09] [I] Prob 9 0.0000 Class 9:
Where did I make a mistake? (Answer: the device-to-host copies read from binding index 0 — the input buffer — instead of binding index 1, where the network writes its output.)