Description
I want to use multi-threaded inference and have assigned a separate execution context to each thread, but I encountered an error. It only works with a single thread and a single context. The relevant C++ code is below:
nvinfer1::Dims4 inputDims = {32, 3, img_size_, img_size_};
for (int i = 0; i < thread_nums_; ++i) {
void* deviceInput;
void* deviceOutput;
cudaStream_t stream = nullptr;
cudaStreamCreate(&stream);
cudaMalloc(&deviceInput, 32 * 3 * 224 * 224 * sizeof(float));
cudaMalloc(&deviceOutput, 32 * 1000 * sizeof(float));
void** bindings = new void*[2];
bindings[0] = deviceInput;
bindings[1] = deviceOutput;
auto m_context = m_engine->createExecutionContext();
m_context->setInputShape("input", inputDims);
m_context->setTensorAddress("input", deviceInput);
m_context->setTensorAddress("output", deviceOutput);
m_contexts.emplace_back(m_context);
streams_.emplace_back(stream);
deviceInputs_.emplace_back(deviceInput);
deviceOutputs_.emplace_back(deviceOutput);
}
cudaMemcpyAsync(sptr->deviceInputs_[i], inputHost, 32 * 3 * 224 * 224 * sizeof(float), cudaMemcpyHostToDevice, sptr->streams_[i]);
if (sptr->m_contexts[i]->enqueueV3(sptr->streams_[i])) {
cudaMemcpyAsync(outputHost, sptr->deviceOutputs_[i], 32 * 1000 * sizeof(float), cudaMemcpyDeviceToHost, sptr->streams_[i]);
std::cout << outputHost[0] << std::endl;
}
Environment
TensorRT Version: 8.6.1
GPU Type: RTX 3070
Nvidia Driver Version:
CUDA Version: cuda11.5
CUDNN Version:
Operating System + Version: ubuntu20.04
Python Version (if applicable):
TensorFlow Version (if applicable):
PyTorch Version (if applicable):
Baremetal or Container (if container which image + tag):
Relevant Files
Please attach or include links to any models, data, files, or scripts necessary to reproduce your issue. (Github repo, Google Drive, Dropbox, etc.)
Steps To Reproduce
Please include:
- Exact steps/commands to build your repro
- Exact steps/commands to run your repro
- Full traceback of errors encountered