Error when running two different execution contexts in parallel in TensorRT 7

Nobody has answered my post: https://devtalk.nvidia.com/default/topic/1070696/tensorrt/error-run-2-context-parallel-in-tensorrt7/post/5427317/#5427317

Error output after running for about 1 minute:

...
[F] [TRT] Assertion failed: *refCount > 0
../rtSafe/WeightsPtr.cpp:20
Aborting...

[F] [TRT] Assertion failed: *refCount > 0
../rtSafe/WeightsPtr.cpp:20
Aborting...

[E] [TRT] FAILED_EXECUTION: std::exception
error when run graph TensorRT
[E] [TRT] FAILED_EXECUTION: std::exception
error when run graph TensorRT
0
1
0
1
0
1
[F] [TRT] Assertion failed: *refCount > 0
../rtSafe/WeightsPtr.cpp:20
Aborting...

[E] [TRT] FAILED_EXECUTION: std::exception
error when run graph TensorRT
[F] [TRT] Assertion failed: *refCount > 0
../rtSafe/WeightsPtr.cpp:20
Aborting...

[E] [TRT] FAILED_EXECUTION: std::exception
error when run graph TensorRT
0
1
double free or corruption (!prev)
[F] [TRT] Assertion failed: *refCount > 0
../rtSafe/WeightsPtr.cpp:20
Aborting...

[E] Aborted (core dumped)

test code:

#include "NvInfer.h"
#include <iostream>
#include "NvUtils.h"
#include "NvOnnxParser.h"
using namespace nvinfer1;
#include <thread>

#include "common/logger.h"
#include "common/buffers.h"
// Path of the ONNX model file handed to the ONNX parser in main().
std::string model_path = "detection_model.onnx";

// Copies the first dims.nbDims extents of `dims` into `v`.
// `v` is resized to exactly dims.nbDims elements; previous contents are overwritten.
void convert_dims_to_vect(const nvinfer1::Dims& dims, std::vector<int>& v){
    const int rank = dims.nbDims;
    v.resize(rank);
    int axis = 0;
    for (int& extent : v)
        extent = dims.d[axis++];
}
// Fills `explicit_shapes` with the current binding dimensions of `context`,
// one entry per name in `tensorNames` (same order).
//
// For a context whose optimization profile index is non-zero, each name is
// looked up with the suffix " [profile N]" appended; for profile 0 the plain
// name is used. `explicit_shapes` is resized to tensorNames.size().
void make_explicit_shapes(IExecutionContext* context,const  std::vector<std::string>& tensorNames, std::vector<std::vector<int>>& explicit_shapes){
	const int count = tensorNames.size();
	explicit_shapes.resize(count);

	// Binding names of profiles other than 0 carry a profile suffix.
	const int profile = context->getOptimizationProfile();
	const std::string suffix = (profile != 0)
		? " [profile "+std::to_string(profile)+"]"
		: std::string();

	std::vector<int> shape;
	for (int t = 0; t < count; ++t) {
		const std::string bindingName = tensorNames[t] + suffix;
		const int bindingIndex = context->getEngine().getBindingIndex(bindingName.c_str());
		convert_dims_to_vect(context->getBindingDimensions(bindingIndex), shape);
		explicit_shapes[t] = shape;
	}
}

// Name of the network input whose dimensions are configured in main().
std::string input_name = "fts_input_images:0";
// Input shapes used in main() for the optimization profiles:
// dims1 -> kMIN, dims2 -> kOPT (also the shape set on each context), dims3 -> kMAX.
// NOTE(review): the (1, H, W, 3) ordering looks like NHWC — confirm against the model.
Dims4 dims1(1,10,10,3);
Dims4 dims2(1,80,80,3);
Dims4 dims3(1,500,500,3);
// Runs one inference on contexts[i] using freshly allocated scratch buffers.
//
// contexts         Execution contexts; only contexts[i] is enqueued.
// i                Index of the context to run. It is also used to place this
//                  context's buffers at offset i * explicit_shapes.size() in the
//                  bindings array (the slots before it are left null).
// explicit_shapes  One shape per buffer; each buffer is sized as
//                  product(shape) * 4 bytes.
//
// Failures are reported on stdout/stderr; the function returns without running
// when `i` does not name a usable context or a shape contains a negative extent.
void run(std::vector<IExecutionContext*> contexts, int i, std::vector<std::vector<int>> explicit_shapes){
    // Validate the index before contexts[i] is dereferenced below.
    if (i < 0 || static_cast<size_t>(i) >= contexts.size() || contexts[i] == nullptr){
        std::cerr<<"run: no valid execution context at index "<<i<<"\n";
        return;
    }

    // Same factor as the original `* 4`; assumes 4-byte elements — TODO confirm for non-float bindings.
    const size_t kBytesPerElement = 4;

    std::vector<samplesCommon::DeviceBuffer> deviceBuffers;
    std::vector<samplesCommon::HostBuffer> hostBuffers;
    for (const std::vector<int>& shape : explicit_shapes){
        // Accumulate in size_t so large shapes cannot overflow an int product.
        size_t elementCount = 1;
        for (const int extent : shape){
            if (extent < 0){
                // A negative extent would turn into an enormous allocation request.
                std::cerr<<"run: negative extent "<<extent<<" in explicit_shapes\n";
                return;
            }
            elementCount *= static_cast<size_t>(extent);
        }
        const size_t allocationSize = elementCount * kBytesPerElement;
        hostBuffers.emplace_back(allocationSize);
        deviceBuffers.emplace_back(allocationSize);
    }

    // Bindings of context i start at slot i * (buffers per context).
    // NOTE(review): confirm whether TensorRT also expects trailing slots for later profiles.
    std::vector<void*> mDeviceBindings(static_cast<size_t>(i)*deviceBuffers.size(), nullptr);
    for (auto& buffer : deviceBuffers){
        mDeviceBindings.emplace_back(buffer.data());
    }

    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));
    if (!contexts[i]->enqueueV2(mDeviceBindings.data(), stream, nullptr)){
        std::cout<<"error when run graph TensorRT\n";
    }

    // Errors from the asynchronous work are reported by the synchronize call,
    // so its status must not be dropped.
    const cudaError_t syncStatus = cudaStreamSynchronize(stream);
    if (syncStatus != cudaSuccess){
        std::cerr<<"run: cudaStreamSynchronize failed: "<<cudaGetErrorString(syncStatus)<<"\n";
    }
    const cudaError_t destroyStatus = cudaStreamDestroy(stream);
    if (destroyStatus != cudaSuccess){
        std::cerr<<"run: cudaStreamDestroy failed: "<<cudaGetErrorString(destroyStatus)<<"\n";
    }
}

int main(int argc, char** argv) {
  auto builder = createInferBuilder(gLogger);

  auto config = builder->createBuilderConfig();
  for (int i=0; i<2; ++i){
    auto profile = builder->createOptimizationProfile();
    profile->setDimensions(input_name.c_str(), OptProfileSelector::kMIN, dims1);
    profile->setDimensions(input_name.c_str(), OptProfileSelector::kOPT, dims2);
    profile->setDimensions(input_name.c_str(), OptProfileSelector::kMAX, dims3);
    config->addOptimizationProfile(profile);
  }

  auto network = builder->createNetworkV2(1U << static_cast<int>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH));
  auto parser = nvonnxparser::createParser(*network, gLogger);
  parser->parseFromFile(model_path.c_str(), 3);
  auto engine = builder->buildEngineWithConfig(*network,*config);

    std::vector<std::string> tensorNames;
    for (int i=0; i<engine->getNbBindings(); ++i){
        std::string name(engine->getBindingName(i));
        if (name.find("[profile")==-1){
            tensorNames.emplace_back(name);
        }
    }

  std::vector<IExecutionContext*> contexts;
  std::vector<std::vector<int>> explicit_shapes;
  for (int i=0; i<2; ++i){
    contexts.emplace_back(engine->createExecutionContext());
    auto context = contexts.back();
    context->setOptimizationProfile(i);
    std::cout<<"allInputDimensionsSpecified: "<<context->allInputDimensionsSpecified()<<"\n";
    int index;
    if (i==0)
        index = engine->getBindingIndex((input_name).c_str());
    else
        index = engine->getBindingIndex((input_name+" [profile "+std::to_string(i)+"]").c_str());
    context->setBindingDimensions(index, dims2);
    std::cout<<"allInputDimensionsSpecified must equal 1: "<<context->allInputDimensionsSpecified()<<"\n";

    explicit_shapes.clear();
    make_explicit_shapes(context, tensorNames, explicit_shapes);
  }

    for (;;){
        std::vector<std::thread> v_thread;
        for (int i=0;i<2;++i){
            v_thread.emplace_back(run, contexts, i, explicit_shapes);
            std::cout<<i<<"\n";
        }
        for (auto p=v_thread.begin(); p!=v_thread.end(); p++)
            p->join();
    }

}

model: https://1drv.ms/u/s!AhFk3ICqlZI2irl-5pxC5LawRSixew?e=XfVdJb
I use TensorRT 7 + driver 440.48.02 + GTX 1080 + Ubuntu 18.04.
I have tested with many versions of CUDA and cuDNN.

Hi,

Our engineering team is looking into this issue and will get back to you once we have an update.

Please follow the topic below for any updates:
https://devtalk.nvidia.com/default/topic/1070696/tensorrt/error-run-2-context-parallel-in-tensorrt7/post/5427317/

Thanks