Error when running two different execution contexts in parallel with TensorRT 7

The following error appears after running for about one minute:

...
[F] [TRT] Assertion failed: *refCount > 0
../rtSafe/WeightsPtr.cpp:20
Aborting...

[F] [TRT] Assertion failed: *refCount > 0
../rtSafe/WeightsPtr.cpp:20
Aborting...

[E] [TRT] FAILED_EXECUTION: std::exception
error when run graph TensorRT
[E] [TRT] FAILED_EXECUTION: std::exception
error when run graph TensorRT
0
1
0
1
0
1
[F] [TRT] Assertion failed: *refCount > 0
../rtSafe/WeightsPtr.cpp:20
Aborting...

[E] [TRT] FAILED_EXECUTION: std::exception
error when run graph TensorRT
[F] [TRT] Assertion failed: *refCount > 0
../rtSafe/WeightsPtr.cpp:20
Aborting...

[E] [TRT] FAILED_EXECUTION: std::exception
error when run graph TensorRT
0
1
double free or corruption (!prev)
[F] [TRT] Assertion failed: *refCount > 0
../rtSafe/WeightsPtr.cpp:20
Aborting...

[E] Aborted (core dumped)

test code:

#include "NvInfer.h"
#include <iostream>
#include "NvUtils.h"
#include "NvOnnxParser.h"
using namespace nvinfer1;
#include <thread>

#include "common/logger.h"
#include "common/buffers.h"
// Path to the ONNX model the engine is built from (relative to the working directory).
std::string model_path = "detection_model.onnx";

// Copies the first `nbDims` entries of a TensorRT Dims struct into `v`,
// replacing any previous contents.
void convert_dims_to_vect(const nvinfer1::Dims& dims, std::vector<int>& v){
    v.assign(dims.d, dims.d + dims.nbDims);
}
// Fills `explicit_shapes` with the resolved binding dimensions of each named
// tensor for the context's currently selected optimization profile.
// For profile N (N > 0) the engine exposes bindings under the name
// "<tensor> [profile N]", so that suffix is appended before the index lookup.
void make_explicit_shapes(IExecutionContext* context, const std::vector<std::string>& tensorNames, std::vector<std::vector<int>>& explicit_shapes){
	const int profile = context->getOptimizationProfile();
	std::string suffix;
	if (profile != 0)
		suffix = " [profile " + std::to_string(profile) + "]";
	const size_t count = tensorNames.size();
	explicit_shapes.resize(count);
	for (size_t k = 0; k < count; ++k){
		const int binding = context->getEngine().getBindingIndex((tensorNames[k] + suffix).c_str());
		convert_dims_to_vect(context->getBindingDimensions(binding), explicit_shapes[k]);
	}
}

// Name of the network input binding (profile-0 name; other profiles append
// a " [profile N]" suffix to this name).
std::string input_name = "fts_input_images:0";
// Dynamic-shape range for the input — presumably NHWC given the trailing 3
// channels; confirm against the model. min / opt / max profile dimensions:
Dims4 dims1(1,10,10,3);
Dims4 dims2(1,80,80,3);
Dims4 dims3(1,500,500,3);
// Runs one inference with contexts[i] on a private CUDA stream.
// `explicit_shapes` holds the per-tensor dimensions (one entry per binding of
// a single profile) used to size the host/device buffers.
//
// NOTE: enqueueV2 expects one pointer per *engine* binding across ALL
// optimization profiles (engine.getNbBindings() entries). The original code
// built a vector with only `i * deviceBuffers.size()` leading nulls plus this
// profile's pointers, which is too short for profile 0 and lets TensorRT read
// past the end of the array. The bindings array is now sized to the full
// binding count, with only this profile's slots filled in.
void run(std::vector<IExecutionContext*> contexts, int i, std::vector<std::vector<int>> explicit_shapes){
    std::vector<samplesCommon::DeviceBuffer> deviceBuffers;
    std::vector<samplesCommon::HostBuffer> hostBuffers;
    for (size_t j = 0; j < explicit_shapes.size(); ++j){
        // Element count times sizeof(float): tensors are assumed FP32 here
        // (TODO confirm against the model). Accumulate in size_t to avoid
        // int overflow on large shapes.
        size_t allocationSize = std::accumulate(explicit_shapes[j].begin(), explicit_shapes[j].end(),
                                                size_t{1}, std::multiplies<size_t>()) * sizeof(float);
        hostBuffers.emplace_back(allocationSize);
        deviceBuffers.emplace_back(allocationSize);
    }

    // Full-size bindings array: unused profiles' slots stay null; this
    // profile's bindings live at offset i * bindingsPerProfile.
    std::vector<void*> mDeviceBindings(contexts[i]->getEngine().getNbBindings(), nullptr);
    for (size_t j = 0; j < deviceBuffers.size(); ++j){
        mDeviceBindings[i * deviceBuffers.size() + j] = deviceBuffers[j].data();
    }

    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));
    if (!contexts[i]->enqueueV2(mDeviceBindings.data(), stream, nullptr)){
        std::cout<<"error when run graph TensorRT\n";
    }
    cudaStreamSynchronize(stream);
    cudaStreamDestroy(stream);
}

int main(int argc, char** argv) {
  auto builder = createInferBuilder(gLogger);

  auto config = builder->createBuilderConfig();
  // One optimization profile per execution context: contexts used
  // concurrently must each select a distinct profile.
  for (int i = 0; i < 2; ++i){
    auto profile = builder->createOptimizationProfile();
    profile->setDimensions(input_name.c_str(), OptProfileSelector::kMIN, dims1);
    profile->setDimensions(input_name.c_str(), OptProfileSelector::kOPT, dims2);
    profile->setDimensions(input_name.c_str(), OptProfileSelector::kMAX, dims3);
    config->addOptimizationProfile(profile);
  }

  auto network = builder->createNetworkV2(1U << static_cast<int>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH));
  auto parser = nvonnxparser::createParser(*network, gLogger);
  // Fail fast instead of building an engine from an empty network.
  if (!parser->parseFromFile(model_path.c_str(), 3)){
    std::cout << "failed to parse ONNX model: " << model_path << "\n";
    return 1;
  }
  auto engine = builder->buildEngineWithConfig(*network, *config);
  if (!engine){
    std::cout << "failed to build engine\n";
    return 1;
  }

  // Collect the profile-0 binding names; other profiles repeat the same
  // tensors under names with a " [profile N]" suffix, so skip those.
  std::vector<std::string> tensorNames;
  for (int i = 0; i < engine->getNbBindings(); ++i){
    std::string name(engine->getBindingName(i));
    // find() returns std::string::npos when absent; comparing to -1 only
    // worked by accidental unsigned conversion.
    if (name.find("[profile") == std::string::npos){
      tensorNames.emplace_back(name);
    }
  }

  std::vector<IExecutionContext*> contexts;
  std::vector<std::vector<int>> explicit_shapes;
  for (int i = 0; i < 2; ++i){
    contexts.emplace_back(engine->createExecutionContext());
    auto context = contexts.back();
    // Bind each context to its own profile before setting dynamic shapes.
    context->setOptimizationProfile(i);
    std::cout<<"allInputDimensionsSpecified: "<<context->allInputDimensionsSpecified()<<"\n";
    // Profile-N bindings (N > 0) carry the " [profile N]" name suffix.
    int index;
    if (i == 0)
        index = engine->getBindingIndex((input_name).c_str());
    else
        index = engine->getBindingIndex((input_name + " [profile " + std::to_string(i) + "]").c_str());
    context->setBindingDimensions(index, dims2);
    std::cout<<"allInputDimensionsSpecified must equal 1: "<<context->allInputDimensionsSpecified()<<"\n";

    // NOTE(review): explicit_shapes is overwritten each iteration, so only
    // the last profile's shapes survive; both profiles use dims2 here, so
    // the sizes happen to match — confirm if profiles ever diverge.
    explicit_shapes.clear();
    make_explicit_shapes(context, tensorNames, explicit_shapes);
  }

  // Repro loop: launch one inference thread per context, forever.
  for (;;){
    std::vector<std::thread> v_thread;
    for (int i = 0; i < 2; ++i){
      v_thread.emplace_back(run, contexts, i, explicit_shapes);
      std::cout << i << "\n";
    }
    for (auto& t : v_thread)
      t.join();
  }
}

model: https://1drv.ms/u/s!AhFk3ICqlZI2irl-5pxC5LawRSixew?e=XfVdJb
I am using TensorRT 7 + driver 440.48.02 + GTX 1080 + Ubuntu 18.04.
I tested with many versions of CUDA and cuDNN.

Hi,

Please refer below dev talk forum:
https://devtalk.nvidia.com/default/topic/1070696/tensorrt/error-run-2-context-parallel-in-tensorrt7

Thanks