TensorRT builtin Shuffle layer prevents kernels in other streams from executing in parallel

Hi, when I profile my inference project, I found that some kernels in different streams cannot execute in parallel on the GPU. If I remove the Shuffle layer, the kernels in different streams can execute in parallel.
For clarity, I reduced the problem to the test code below.
Main test code:

for (int i = 0; i < max_streams; ++i) {
  if (i == 1) {
    //cudaStreamWaitEvent(streams[i], event, 0);
  }

  if (i == 0) {
    rt.Infer(streams[0]);  // first TensorRT enqueue, on stream 0
  }

  for (int k = 0; k < 10; k++) {
    launch_kernel(data[i], streams[i], 2160, 1024, N);
  }

  if (i == 0) {
    rt2.Infer(streams[0]);  // second TensorRT enqueue, on stream 0
  }
}

Function T1Task::Proc() launches kernels in two streams, one with low priority and one with high priority.
RtDemo is a minimal TensorRT context whose network contains a single layer, implemented with the TensorRT builtin Shuffle layer.
If I add

if (i == 0) {
  rt2.Infer(streams[0]);
}

then the kernels in the two streams cannot execute in parallel.
Here rt.Infer() is called before the launch_kernel loop and rt2.Infer() is called after it, both on stream 0.
I think these two Infer() calls are what prevent the two streams from executing concurrently.
If I comment it out

if (i == 0) {
  //rt2.Infer(streams[0]);
}

then the kernels in the two streams execute in parallel.
The attachment is my profiler trace, and the full code is below.

Test code:

#include <pthread.h>
#include <stdio.h>
#include <glog/logging.h>
#include "t1_task.h"
#include "rt_demo.h"

const int N = (1 << 20);
void *launch_kernel(float *data, cudaStream_t stream, int blocks, int threads, int n);
void *launch_kernel_default(float *data, int blocks, int threads, int n);

bool T1Task::Preprocess() {
  cudaSetDevice(0);
  max_streams = 2;

  int leastPriority = 0;
  int greatestPriority = 0;
  cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
  LOG(INFO) << "leastPriority: " << leastPriority << ", greatestPriority: " << greatestPriority;

  priority_ = -1;
  for (int i = 0; i < max_streams; ++i) {
    if (i == 1) {
      LOG(INFO) << "this set priority##############:" << priority_;
      cudaStreamCreateWithPriority(&streams[i], cudaStreamNonBlocking, priority_);
    } else {
      cudaStreamCreateWithPriority(&streams[i], cudaStreamNonBlocking, 0);
    }

    cudaMalloc(&data[i], N * sizeof(float));
    cudaMalloc(&data[i + max_streams], N * sizeof(float));
  }
  return true;
}

bool T1Task::Proc() {
  RtDemo rt;
  RtDemo rt2;
  rt.Init();
  rt2.Init();

  int threads = threads_;
  for (int i = 0; i < max_streams; ++i) {
    if (i == 1) {
      //cudaStreamWaitEvent(streams[i], event, 0);
    }

    if (i == 0) {
      rt.Infer(streams[0]);  // first TensorRT enqueue, on stream 0
    }

    for (int k = 0; k < 10; k++) {
      launch_kernel(data[i], streams[i], 2160, 1024, N);
    }

    if (i == 0) {
      rt2.Infer(streams[0]);  // second TensorRT enqueue, on stream 0
    }
  }

  for (int i = 0; i < max_streams; ++i) {
    cudaStreamSynchronize(streams[i]);
  }
  return true;
}
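
For reference, launch_kernel is only declared above and its body is not shown. A minimal sketch of what such a helper could look like (the kernel body here is illustrative, not the exact one used; any compute-bound dummy kernel reproduces the same launch pattern):

// Illustrative sketch only -- the real kernel body is not part of this post.
// A simple compute-bound dummy kernel launched on the given stream.
__global__ void dummy_kernel(float *data, int n) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < n) {
    float v = data[idx];
    for (int i = 0; i < 200; ++i) {
      v = v * 0.9f + 0.1f;  // busy work to keep the SMs occupied
    }
    data[idx] = v;
  }
}

void *launch_kernel(float *data, cudaStream_t stream, int blocks, int threads, int n) {
  dummy_kernel<<<blocks, threads, 0, stream>>>(data, n);
  return nullptr;
}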

rt_demo.h

#include <glog/logging.h>

#include <assert.h>

#include <sys/stat.h>
#include <time.h>
#include <cuda_runtime_api.h>
#include <float.h>
#include <string>
#include <vector>
#include <unordered_map>
#include <cmath>
#include <fstream>
#include <sstream>
#include <iostream>
#include <algorithm>
#include <iterator>

#include <NvInfer.h>
#include <NvCaffeParser.h>
#include <NvInferPlugin.h>

namespace nvinfer1 {
class ICaffePoolOutputDimensionsFormula : public IOutputDimensionsFormula {
 public:
  virtual DimsHW compute(DimsHW input_dims,
                         DimsHW kernel_size,
                         DimsHW stride,
                         DimsHW padding,
                         DimsHW dilation,
                         const char *layerName)
#if NV_TENSORRT_MAJOR < 5
                         override {
#else
                         const override {
#endif
    const int kernel_extent_h = dilation.d[0] * (kernel_size.d[0] - 1) + 1;
    const int kernel_extent_w = dilation.d[1] * (kernel_size.d[1] - 1) + 1;
    auto &&h_temp = (input_dims.d[0] + 2 * padding.d[0] - kernel_extent_h) *
                    1.0 / stride.d[0];
    auto &&w_temp = (input_dims.d[1] + 2 * padding.d[1] - kernel_extent_w) *
                    1.0 / stride.d[1];

    std::string str_name(layerName);
    if (str_name.find("as_conv") == std::string::npos) {
      return DimsHW(ceil(h_temp) + 1, ceil(w_temp) + 1);
    } else {
      return DimsHW(floor(h_temp) + 1, floor(w_temp) + 1);
    }
  }

  ICaffePoolOutputDimensionsFormula() {}
  ~ICaffePoolOutputDimensionsFormula() {}
};
}  // namespace nvinfer1

class RtDemo {
 public:
  RtDemo() {}

  ~RtDemo() {}

  int Init();

  int Infer(cudaStream_t stream);

 private:
  nvinfer1::IExecutionContext *guard_context_ = nullptr;
  nvinfer1::IBuilder *guard_builder_ = nullptr;
  nvinfer1::INetworkDefinition *guard_network_ = nullptr;
  std::vector<void *> guard_buffers_;
};

rt_demo.cpp

#include <vector>
#include <map>
#include <set>
#include <iostream>
#include <fstream>
#include <sstream>

#include <NvInferPlugin.h>

#include "rt_demo.h"

class RTLogger : public nvinfer1::ILogger {
  void log(Severity severity, const char *msg) override {
    if (severity != Severity::kINFO) {
      std::cout << msg << std::endl;
    }
  }
} rt_gLogger;

int RtDemo::Init() {
  // 1. create
  guard_builder_ = nvinfer1::createInferBuilder(rt_gLogger);
  guard_network_ = guard_builder_->createNetwork();
  nvinfer1::ICaffePoolOutputDimensionsFormula poolFormula;
  guard_network_->setPoolingOutputDimensionsFormula(&poolFormula);

  // 2. add input
  std::string input_data = "data";
  //nvinfer1::DimsCHW input_dims{720, 1280, 3};
  nvinfer1::DimsCHW input_dims{600, 800, 3};
  auto data = guard_network_->addInput(input_data.c_str(),
                                   nvinfer1::DataType::kFLOAT,
                                   input_dims);
  data->setName(input_data.c_str());

  // add tensorrt builtin shuffle layer
  std::string permute_name = "data_perm";
  nvinfer1::Dims dims;
  dims.nbDims = 3;
  for (int i = 0; i < dims.nbDims; ++i) {
    dims.d[i] = 0;
    dims.type[i] = nvinfer1::DimensionType::kSPATIAL;
  }
  dims.type[0] = nvinfer1::DimensionType::kCHANNEL;

  // HWC -> CHW: [2, 0, 1], CHW -> HWC: [1, 2, 0]
  nvinfer1::Permutation perm;
  perm.order[0] = 2;
  perm.order[1] = 0;
  perm.order[2] = 1;

  for (int i = dims.nbDims; i < nvinfer1::Dims::MAX_DIMS; ++i) {
    perm.order[i] = 0;
  }

  nvinfer1::IShuffleLayer *shuffleLayer = guard_network_->addShuffle(*data);
  shuffleLayer->setFirstTranspose(perm);
  shuffleLayer->setName(permute_name.c_str());
  shuffleLayer->setReshapeDimensions(dims);
  shuffleLayer->getOutput(0)->setName(permute_name.c_str());

  guard_network_->markOutput(*shuffleLayer->getOutput(0));

  // 3. build engine
  guard_builder_->setMaxBatchSize(1);
  int work_space_size = 1 << 30;
  guard_builder_->setMaxWorkspaceSize(work_space_size);

  guard_builder_->setInt8Mode(false);
  guard_builder_->setInt8Calibrator(nullptr);

  guard_builder_->setDebugSync(true);

  std::cout << "Building cuda engine ...";

  nvinfer1::ICudaEngine *engine = nullptr;
  engine = guard_builder_->buildCudaEngine(*guard_network_);

  guard_context_ = engine->createExecutionContext();
  guard_buffers_.resize(2); // 1 input + 1 output

  void *gpu_ptr;
  cudaMalloc(&gpu_ptr, 16 << 20);
  guard_buffers_[0] = gpu_ptr;

  cudaMalloc(&gpu_ptr, 16 << 20);
  guard_buffers_[1] = gpu_ptr;
  if (guard_context_) {
    std::cout << "init guard success" << std::endl;
  }
  return true;
}

int RtDemo::Infer(cudaStream_t stream) {
  guard_context_->enqueue(1, &guard_buffers_[0], stream, nullptr);
  return 0;
}
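
For reference, the only work the Shuffle layer does in this network is an HWC -> CHW transpose (permutation [2, 0, 1]). In plain CUDA the equivalent operation is something like the sketch below; this only shows what the layer computes, not how TensorRT implements it:

// Sketch of the HWC -> CHW transpose that the Shuffle layer performs here.
// Illustration only; TensorRT's internal implementation is not shown in this post.
__global__ void hwc_to_chw(const float *in, float *out, int h, int w, int c) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < h * w * c) {
    int ci = idx % c;        // channel index in the HWC input
    int wi = (idx / c) % w;  // width index
    int hi = idx / (c * w);  // height index
    out[ci * h * w + hi * w + wi] = in[idx];  // write in CHW order
  }
}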

Does the TensorRT builtin Shuffle layer do something internally that prevents kernels in other streams from executing in parallel?
Can anyone explain why the kernels in the other stream cannot execute concurrently?
Thanks