Hi, when I profile my inference project, I found that kernels in different streams cannot execute in parallel on the GPU. If I remove the Shuffle layer, the kernels in the different streams do execute in parallel.
For clarity, I put together a reduced test case, shown below.
Main test code:
for (int i = 0; i < max_streams; ++i) {
  if (i == 1) {
    //cudaStreamWaitEvent(streams[i], event, 0);
  }
  if (i == 0) {
    rt.Infer(streams[0]);
  }
  for (int k = 0; k < 10; k++) {
    launch_kernel(data[i], streams[i], 2160, 1024, N);
  }
  if (i == 0) {
    rt2.Infer(streams[0]);
  }
}
Function T1Task::Proc() launches kernels into two streams, one with low priority and one with high priority. RtDemo is a minimal TensorRT context whose network contains a single layer, implemented with the TensorRT built-in Shuffle layer.
If I add
if (i == 0) {
  rt2.Infer(streams[0]);
}
the kernels in the two streams do not execute in parallel. In this configuration rt.Infer() is called before the launch_kernel loop and rt2.Infer() is called after it, and I suspect that these two Infer() calls are what prevent the two streams from running concurrently.
If I cut that second call, i.e.
if (i == 0) {
  //rt2.Infer(streams[0]);
}
the kernels in the two streams do execute in parallel.
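To quantify the overlap outside of the profiler, a rough check like the following could be added to Proc(). This is only a sketch and is not part of the attached code; the event variables and the std::chrono wall-clock timer are my additions. If the two streams really overlap, the host wall time should be close to the longer of the two per-stream times; if they serialize, it approaches their sum.

#include <chrono>
#include <cstdio>

cudaEvent_t start[2], stop[2];
for (int i = 0; i < 2; ++i) {
  cudaEventCreate(&start[i]);
  cudaEventCreate(&stop[i]);
}
auto host_t0 = std::chrono::steady_clock::now();

// stream 0: Infer + kernel loop + Infer (same order as in Proc)
cudaEventRecord(start[0], streams[0]);
rt.Infer(streams[0]);
for (int k = 0; k < 10; ++k) launch_kernel(data[0], streams[0], 2160, 1024, N);
rt2.Infer(streams[0]);
cudaEventRecord(stop[0], streams[0]);

// stream 1: only the kernel loop
cudaEventRecord(start[1], streams[1]);
for (int k = 0; k < 10; ++k) launch_kernel(data[1], streams[1], 2160, 1024, N);
cudaEventRecord(stop[1], streams[1]);

cudaEventSynchronize(stop[0]);
cudaEventSynchronize(stop[1]);
auto host_t1 = std::chrono::steady_clock::now();

float ms[2] = {0.f, 0.f};
cudaEventElapsedTime(&ms[0], start[0], stop[0]);
cudaEventElapsedTime(&ms[1], start[1], stop[1]);
double wall_ms = std::chrono::duration<double, std::milli>(host_t1 - host_t0).count();
printf("stream0 %.3f ms, stream1 %.3f ms, wall %.3f ms\n", ms[0], ms[1], wall_ms);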
The attachment contains my profiler trace; the full code is below.
test code:
#include <pthread.h>
#include <stdio.h>
#include <glog/logging.h>
#include "t1_task.h"
#include "rt_demo.h"
const int N = (1 << 20);
void *launch_kernel(float *data, cudaStream_t stream, int blocks, int threads, int n);
void *launch_kernel_default(float *data, int blocks, int threads, int n);
bool T1Task::Preprocess() {
  cudaSetDevice(0);
  max_streams = 2;
  int leastPriority = 0;
  int greatestPriority = 0;
  cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
  LOG(INFO) << "leastPriority: " << leastPriority << ", greatestPriority: " << greatestPriority;
  priority_ = -1;  // lower value = higher priority
  for (int i = 0; i < max_streams; ++i) {
    if (i == 1) {
      LOG(INFO) << "this set priority##############:" << priority_;
      cudaStreamCreateWithPriority(&streams[i], cudaStreamNonBlocking, priority_);
    } else {
      cudaStreamCreateWithPriority(&streams[i], cudaStreamNonBlocking, 0);
    }
    cudaMalloc(&data[i], N * sizeof(float));
    cudaMalloc(&data[i + max_streams], N * sizeof(float));
  }
  return true;
}
bool T1Task::Proc() {
  RtDemo rt;
  RtDemo rt2;
  rt.Init();
  rt2.Init();
  int threads = threads_;
  for (int i = 0; i < max_streams; ++i) {
    if (i == 1) {
      //cudaStreamWaitEvent(streams[i], event, 0);
    }
    if (i == 0) {
      rt.Infer(streams[0]);   // TensorRT enqueue before the kernel loop
    }
    for (int k = 0; k < 10; k++) {
      launch_kernel(data[i], streams[i], 2160, 1024, N);
    }
    if (i == 0) {
      rt2.Infer(streams[0]);  // removing this call restores the overlap (see above)
    }
  }
  for (int i = 0; i < max_streams; ++i) {
    cudaStreamSynchronize(streams[i]);
  }
  return true;
}
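launch_kernel is defined in a separate .cu file that is not pasted here; it only needs to keep the GPU busy on the given stream, so the exact kernel body does not matter for this question. A rough sketch of what it looks like (dummy_kernel and its inner loop count are placeholders of mine):

// launch_kernel.cu -- sketch of the launcher used above
__global__ void dummy_kernel(float *data, int n) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  for (int i = idx; i < n; i += blockDim.x * gridDim.x) {
    float x = data[i];
    for (int j = 0; j < 100; ++j) {
      x = x * 0.999f + 0.001f;  // repeated math just to stretch the kernel runtime
    }
    data[i] = x;
  }
}

void *launch_kernel(float *data, cudaStream_t stream, int blocks, int threads, int n) {
  dummy_kernel<<<blocks, threads, 0, stream>>>(data, n);
  return nullptr;
}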
rt_demo.h:
#include <glog/logging.h>
#include <assert.h>
#include <sys/stat.h>
#include <time.h>
#include <cuda_runtime_api.h>
#include <float.h>
#include <string>
#include <vector>
#include <unordered_map>
#include <cmath>
#include <fstream>
#include <sstream>
#include <iostream>
#include <algorithm>
#include <iterator>
#include <NvInfer.h>
#include <NvCaffeParser.h>
#include <NvInferPlugin.h>
namespace nvinfer1 {
class ICaffePoolOutputDimensionsFormula : public IOutputDimensionsFormula {
 public:
  virtual DimsHW compute(DimsHW input_dims,
                         DimsHW kernel_size,
                         DimsHW stride,
                         DimsHW padding,
                         DimsHW dilation,
                         const char *layerName)
#if NV_TENSORRT_MAJOR < 5
      override {
#else
      const override {
#endif
    const int kernel_extent_h = dilation.d[0] * (kernel_size.d[0] - 1) + 1;
    const int kernel_extent_w = dilation.d[1] * (kernel_size.d[1] - 1) + 1;
    auto &&h_temp = (input_dims.d[0] + 2 * padding.d[0] - kernel_extent_h) *
                    1.0 / stride.d[0];
    auto &&w_temp = (input_dims.d[1] + 2 * padding.d[1] - kernel_extent_w) *
                    1.0 / stride.d[1];
    std::string str_name(layerName);
    if (str_name.find("as_conv") == std::string::npos) {
      return DimsHW(ceil(h_temp) + 1, ceil(w_temp) + 1);
    } else {
      return DimsHW(floor(h_temp) + 1, floor(w_temp) + 1);
    }
  }
  ICaffePoolOutputDimensionsFormula() {}
  ~ICaffePoolOutputDimensionsFormula() {}
};
}  // namespace nvinfer1

class RtDemo {
 public:
  RtDemo() {}
  ~RtDemo() {}
  int Init();
  int Infer(cudaStream_t stream);

 private:
  nvinfer1::IExecutionContext *guard_context_ = nullptr;
  nvinfer1::IBuilder *guard_builder_ = nullptr;
  nvinfer1::INetworkDefinition *guard_network_ = nullptr;
  std::vector<void *> guard_buffers_;
};
rt_demo.cpp:
#include <vector>
#include <map>
#include <set>
#include <iostream>
#include <fstream>
#include <sstream>
#include <NvInferPlugin.h>
#include "rt_demo.h"
class RTLogger : public nvinfer1::ILogger {
  void log(Severity severity, const char *msg) override {
    if (severity != Severity::kINFO) {
      std::cout << msg << std::endl;
    }
  }
} rt_gLogger;
int RtDemo::Init() {
  // 1. create
  guard_builder_ = nvinfer1::createInferBuilder(rt_gLogger);
  guard_network_ = guard_builder_->createNetwork();
  nvinfer1::ICaffePoolOutputDimensionsFormula poolFormula;
  guard_network_->setPoolingOutputDimensionsFormula(&poolFormula);

  // 2. add input
  std::string input_data = "data";
  //nvinfer1::DimsCHW input_dims{720, 1280, 3};
  nvinfer1::DimsCHW input_dims{600, 800, 3};
  auto data = guard_network_->addInput(input_data.c_str(),
                                       nvinfer1::DataType::kFLOAT,
                                       input_dims);
  data->setName(input_data.c_str());

  // add tensorrt builtin shuffle layer
  std::string permute_name = "data_perm";
  nvinfer1::Dims dims;
  dims.nbDims = 3;
  for (int i = 0; i < dims.nbDims; ++i) {
    dims.d[i] = 0;
    dims.type[i] = nvinfer1::DimensionType::kSPATIAL;
  }
  dims.type[0] = nvinfer1::DimensionType::kCHANNEL;
  // HWC -> CHW: [2, 0, 1], CHW -> HWC: [1, 2, 0]
  nvinfer1::Permutation perm;
  perm.order[0] = 2;
  perm.order[1] = 0;
  perm.order[2] = 1;
  for (int i = dims.nbDims; i < nvinfer1::Dims::MAX_DIMS; ++i) {
    perm.order[i] = 0;
  }
  nvinfer1::IShuffleLayer *shuffleLayer = guard_network_->addShuffle(*data);
  shuffleLayer->setFirstTranspose(perm);
  shuffleLayer->setName(permute_name.c_str());
  shuffleLayer->setReshapeDimensions(dims);
  shuffleLayer->getOutput(0)->setName(permute_name.c_str());
  guard_network_->markOutput(*shuffleLayer->getOutput(0));

  // 3. build engine
  guard_builder_->setMaxBatchSize(1);
  int work_space_size = 1 << 30;
  guard_builder_->setMaxWorkspaceSize(work_space_size);
  guard_builder_->setInt8Mode(false);
  guard_builder_->setInt8Calibrator(nullptr);
  guard_builder_->setDebugSync(true);
  std::cout << "Building cuda engine ...";
  nvinfer1::ICudaEngine *engine = nullptr;
  engine = guard_builder_->buildCudaEngine(*guard_network_);
  guard_context_ = engine->createExecutionContext();

  guard_buffers_.resize(2);  // 1 input + 1 output
  void *gpu_ptr;
  cudaMalloc(&gpu_ptr, 16 << 20);
  guard_buffers_[0] = gpu_ptr;
  cudaMalloc(&gpu_ptr, 16 << 20);
  guard_buffers_[1] = gpu_ptr;
  if (guard_context_) {
    std::cout << "init guard success" << std::endl;
  }
  return true;
}

int RtDemo::Infer(cudaStream_t stream) {
  guard_context_->enqueue(1, &guard_buffers_[0], stream, nullptr);
  return 0;
}
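t1_task.h is not pasted either; for anyone who wants to rebuild the test, here is a rough sketch of the class based on how its members are used above (the member names follow the .cpp file, the default values are my reconstruction):

// t1_task.h -- sketch reconstructed from the usage in the test code
#pragma once
#include <cuda_runtime_api.h>

class T1Task {
 public:
  bool Preprocess();  // create the two priority streams and device buffers
  bool Proc();        // build the two RtDemo engines and launch the work

 private:
  int max_streams = 2;
  int priority_ = 0;
  int threads_ = 1024;
  cudaStream_t streams[2];
  float *data[4];  // two buffers per stream
};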
Does the TensorRT Shuffle layer do something that prevents kernels in other streams from executing in parallel? Can anyone explain why the kernels in the other stream cannot run concurrently?
Thanks