Hi,
I am trying to run my RFCN caffemodel on Xavier by tensorRT. It works well when using FP32 and FP16 mode, but doesn’t work when using INT8 mode. My caffeToTRTModel function is as follow:
void caffeToTRTModelINT8(const std::string& deployFile,
const std::string& modelFile,
const std::vector<std::string>& outputs,
unsigned int maxBatchSize,
nvcaffeparser1::IPluginFactoryExt* pluginFactory,
IHostMemory*& trtModelStream,
const std::string& imageSetFile,
const std::string& cachePath
)
{
IBuilder* builder = createInferBuilder(gLogger.getTRTLogger());
assert(builder != nullptr);
INetworkDefinition* network = builder->createNetwork();
ICaffeParser* parser = createCaffeParser();
parser->setPluginFactoryExt(pluginFactory);
// ----INT8----
const IBlobNameToTensor* blobNameToTensor = parser->parse(locateFile(deployFile, gArgs.dataDirs).c_str(),
locateFile(modelFile, gArgs.dataDirs).c_str(),
*network,
DataType::kFLOAT);
// specify which tensors are outputs
for (auto& s : outputs){
network->markOutput(*blobNameToTensor->find(s.c_str()));
}
// Build the engine
builder->setMaxBatchSize(maxBatchSize);
builder->setMaxWorkspaceSize(10 << 20);
// std::cout << "debug@tys: before setDummyInt8Scales" << std::endl;
// samplesCommon::setDummyInt8Scales(builder, network);
//std::cout << "debug@tys: before enableDLA" << std::endl;
//samplesCommon::enableDLA(builder, gArgs.useDLACore);
//builder->allowGPUFallback(true);
builder->setAverageFindIterations(1);
builder->setMinFindIterations(1);
builder->setDebugSync(true);
builder->setInt8Mode(true);
DimsCHW mDims(INPUT_C, INPUT_H, INPUT_W);
DataLoader dataLoader(imageSetFile,
BATCH_SIZE,mDims.w(),mDims.h(),
mDims.c());
Int8EntropyCalibrator calibrator(&dataLoader,
mDims.c(),mDims.h(),mDims.w(),
false,cachePath);
builder->setInt8Calibrator(&calibrator);
builder->setDefaultDeviceType(DeviceType::kGPU);
ICudaEngine* engine = builder->buildCudaEngine(*network);
assert(engine);
// we don't need the network any more, and we can destroy the parser
network->destroy();
parser->destroy();
// serialize the engine, then close everything down
// (*trtModelStream) = engine->serialize();
trtModelStream = engine->serialize();
std::ofstream ofs("/home/xxxx/tensorrt/samples/sampleRFCN_test/layerOut/serialData.txt", std::ios::out | std::ios::binary);
ofs.write((char*)(trtModelStream->data()), trtModelStream->size());
ofs.close();
engine->destroy();
builder->destroy();
shutdownProtobufLibrary();
}
And my DataLoader and Int8EntropyCalibrator classes are as follows:
// Streams calibration batches from disk for INT8 calibration.
// Reads up to CALIBRATE_MAX_NUM image paths from imageSetFile, then serves
// them batchSize at a time as mean-subtracted CHW float data, together with
// a matching im_info buffer ({height, width, scale} per image).
class DataLoader
{
public:
    DataLoader(const std::string& imageSetFile,
               int batchSize,
               int width,
               int height,
               int channels)
        : batchSize(batchSize), width(width), height(height), channels(channels)
    {
        index = 0;
        batchData.resize(static_cast<size_t>(batchSize) * width * height * channels);
        imInfo.resize(static_cast<size_t>(batchSize) * 3);
        for (int i = 0; i < batchSize; i++)
        {
            // im_info is 3 floats per image. NOTE: the stride here must be 3,
            // not batchSize — indexing with i*batchSize writes out of bounds
            // whenever batchSize != 3.
            imInfo[i * 3 + 0] = static_cast<float>(height);
            imInfo[i * 3 + 1] = static_cast<float>(width);
            imInfo[i * 3 + 2] = 1.0f; // image scale
        }
        // Read the image list from imageSetFile. Testing the extraction
        // (rather than eof()) avoids pushing a stale name after the last line.
        std::ifstream infile(imageSetFile);
        std::string imageName;
        int count = 0;
        while (count < CALIBRATE_MAX_NUM && (infile >> imageName))
        {
            count++;
            std::cout << "@debug: DataLoader imageName: " << imageName << std::endl;
            std::string rest;
            getline(infile, rest); // discard the remainder of the line
            imageList.push_back(imageName);
        }
    }

    // std::vector members make the implicit destructor/copy/move correct
    // (Rule of Zero); no manual delete[] needed.

    // Pointer to batchSize*channels*height*width floats, valid until the
    // next call to next() or destruction.
    float* getBatchData() { return batchData.data(); }
    // Pointer to batchSize*3 floats: {height, width, scale} per image.
    float* getImInfo() { return imInfo.data(); }

    // Fills the internal buffer with the next batch of preprocessed images.
    // Returns false when fewer than batchSize images remain.
    bool next()
    {
        std::cout << "Generate batch data for calibration: " << index + 1 << "/"
                  << imageList.size() << std::endl;
        // '>' (not '>='): with '>=' a list whose size is an exact multiple of
        // batchSize silently loses its final batch (and size == batchSize
        // yields no batches at all).
        if (index + batchSize > imageList.size())
            return false;
        const int im2d = height * width;
        const int im3d = im2d * channels;
        for (int i = 0; i < batchSize; i++)
        {
            const std::string& imageFile = imageList[index + i];
            cv::Mat im = cv::imread(imageFile);
            if (im.empty())
            {
                std::cout << "Can't open " << imageFile << std::endl;
                continue;
            }
            cv::resize(im, im, cv::Size(width, height));
            // HWC (OpenCV) -> CHW (network), subtracting the per-channel mean.
            for (int ch = 0; ch < channels; ch++)
            {
                for (int r = 0; r < height; r++)
                {
                    for (int c = 0; c < width; c++)
                    {
                        batchData[i * im3d + ch * im2d + r * width + c] =
                            static_cast<float>(im.at<cv::Vec3b>(r, c)[ch]) - pixelMean[ch];
                    }
                }
            }
        }
        index += batchSize;
        return true;
    }

private:
    unsigned int index;
    std::vector<std::string> imageList;
    std::vector<float> batchData; // CHW, mean-subtracted, batchSize images
    std::vector<float> imInfo;    // 3 floats per image
    int batchSize;
    int width;
    int height;
    int channels;
};
// Entropy calibrator feeding the two RFCN inputs ("data" and "im_info") to
// TensorRT during INT8 calibration. Host batches come from a DataLoader; the
// calibration table is cached under <cachePath>/CalibrationTable.
class Int8EntropyCalibrator : public IInt8EntropyCalibrator
{
public:
    // Does not take ownership of dataLoader; it must outlive the calibrator.
    Int8EntropyCalibrator(DataLoader* dataLoader,
                          int channel, int height, int width,
                          bool readCache, const std::string& cachePath)
        : dataLoader(dataLoader), mReadCache(readCache), gNetWorkName(cachePath)
    {
        mDims = nvinfer1::DimsNCHW{ BATCH_SIZE, channel, height, width };
        mInputCount = mDims.n() * mDims.c() * mDims.h() * mDims.w();
        // Device staging buffers: the full image batch, plus 3 floats of
        // im_info per image.
        CHECK(cudaMalloc(&mDeviceInput1, mInputCount * sizeof(float)));
        CHECK(cudaMalloc(&mDeviceInput2, 3 * mDims.n() * sizeof(float)));
    }

    virtual ~Int8EntropyCalibrator()
    {
        CHECK(cudaFree(mDeviceInput1));
        CHECK(cudaFree(mDeviceInput2));
    }

    int getBatchSize() const override { return mDims.n(); }

    // Copies the next host batch to the device and exposes both buffers via
    // `bindings`. Returning false tells TensorRT calibration is finished.
    bool getBatch(void* bindings[], const char* names[], int nbBindings) override
    {
        if (!dataLoader->next())
            return false;

        assert(!strcmp(names[0], INPUT_BLOB_NAME0));
        CHECK(cudaMemcpy(mDeviceInput1, dataLoader->getBatchData(),
                         mInputCount * sizeof(float), cudaMemcpyHostToDevice));
        bindings[0] = mDeviceInput1;

        assert(!strcmp(names[1], INPUT_BLOB_NAME1));
        CHECK(cudaMemcpy(mDeviceInput2, dataLoader->getImInfo(),
                         3 * mDims.n() * sizeof(float), cudaMemcpyHostToDevice));
        bindings[1] = mDeviceInput2;

        return true;
    }

    // Returns the cached calibration table when caching is enabled and the
    // file exists, letting TensorRT skip the calibration pass entirely.
    const void* readCalibrationCache(size_t& length) override
    {
        mCalibrationCache.clear();
        std::ifstream cacheFile(calibrationTableName(), std::ios::binary);
        cacheFile >> std::noskipws;
        if (mReadCache && cacheFile.good())
        {
            mCalibrationCache.assign(std::istream_iterator<char>(cacheFile),
                                     std::istream_iterator<char>());
            std::cout << "Read Calibration Cache from " << calibrationTableName() << std::endl;
        }
        length = mCalibrationCache.size();
        return mCalibrationCache.empty() ? nullptr : mCalibrationCache.data();
    }

    // Persists the calibration table produced by TensorRT.
    void writeCalibrationCache(const void* cache, size_t length) override
    {
        std::ofstream out(calibrationTableName(), std::ios::binary);
        out.write(reinterpret_cast<const char*>(cache), length);
        std::cout << "Write Calibration Cache to file " << calibrationTableName() << std::endl;
    }

private:
    std::string calibrationTableName()
    {
        return gNetWorkName + std::string("/CalibrationTable");
    }

    DataLoader* dataLoader;
    bool mReadCache{ true };
    std::string gNetWorkName;          // cache directory
    size_t mInputCount;                // floats in one image batch
    void* mDeviceInput1{ nullptr };    // device buffer for "data"
    void* mDeviceInput2{ nullptr };    // device buffer for "im_info"
    std::vector<char> mCalibrationCache;
    nvinfer1::DimsNCHW mDims;
};
I find it can generate batch data for calibration and do inference, but the rois and the rpn_cls_pre are not correct. Could anyone give me some advice?
By the way, I checked the output feature map of each layer in INT8 and FP16: the results were the same in the first 2 dense blocks, diverging from layer "conv2_3/x1/scale" onwards. The first 3 blocks are as follows:
name: "DENSENET_121_rcnn"
input: "data"
input_shape {
dim: 1
dim: 3
dim: 300
dim: 1000
}
input: "im_info"
input_shape {
dim: 1
dim: 1
dim: 1
dim: 3
}
layer {
name: "conv1"
type: "Convolution"
bottom: "data"
top: "conv1"
param {
lr_mult: 1
decay_mult: 1
}
convolution_param {
num_output: 64
bias_term: false
pad: 3
kernel_size: 7
stride: 2
}
}
layer {
name: "conv1/bn"
type: "BatchNorm"
bottom: "conv1"
top: "conv1/bn"
batch_norm_param {
eps: 1e-5
}
}
layer {
name: "conv1/scale"
type: "Scale"
bottom: "conv1/bn"
top: "conv1/bn"
scale_param {
bias_term: true
}
}
layer {
name: "relu1"
type: "ReLU"
bottom: "conv1/bn"
top: "conv1/bn"
}
layer {
name: "pool1"
type: "Pooling"
bottom: "conv1/bn"
top: "pool1"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
pad: 1
ceil_mode: false
}
}
layer {
name: "conv2_1/x1/bn"
type: "BatchNorm"
bottom: "pool1"
top: "conv2_1/x1/bn"
batch_norm_param {
eps: 1e-5
}
}
layer {
name: "conv2_1/x1/scale"
type: "Scale"
bottom: "conv2_1/x1/bn"
top: "conv2_1/x1/bn"
scale_param {
bias_term: true
}
}
layer {
name: "relu2_1/x1"
type: "ReLU"
bottom: "conv2_1/x1/bn"
top: "conv2_1/x1/bn"
}
layer {
name: "conv2_1/x1"
type: "Convolution"
bottom: "conv2_1/x1/bn"
top: "conv2_1/x1"
param {
lr_mult: 1
decay_mult: 1
}
convolution_param {
num_output: 128
bias_term: false
kernel_size: 1
}
}
layer {
name: "conv2_1/x2/bn"
type: "BatchNorm"
bottom: "conv2_1/x1"
top: "conv2_1/x2/bn"
batch_norm_param {
eps: 1e-5
}
}
layer {
name: "conv2_1/x2/scale"
type: "Scale"
bottom: "conv2_1/x2/bn"
top: "conv2_1/x2/bn"
scale_param {
bias_term: true
}
}
layer {
name: "relu2_1/x2"
type: "ReLU"
bottom: "conv2_1/x2/bn"
top: "conv2_1/x2/bn"
}
layer {
name: "conv2_1/x2"
type: "Convolution"
bottom: "conv2_1/x2/bn"
top: "conv2_1/x2"
param {
lr_mult: 1
decay_mult: 1
}
convolution_param {
num_output: 32
bias_term: false
pad: 1
kernel_size: 3
}
}
layer {
name: "concat_2_1"
type: "Concat"
bottom: "pool1"
bottom: "conv2_1/x2"
top: "concat_2_1"
}
layer {
name: "conv2_2/x1/bn"
type: "BatchNorm"
bottom: "concat_2_1"
top: "conv2_2/x1/bn"
batch_norm_param {
eps: 1e-5
}
}
layer {
name: "conv2_2/x1/scale"
type: "Scale"
bottom: "conv2_2/x1/bn"
top: "conv2_2/x1/bn"
scale_param {
bias_term: true
}
}
layer {
name: "relu2_2/x1"
type: "ReLU"
bottom: "conv2_2/x1/bn"
top: "conv2_2/x1/bn"
}
layer {
name: "conv2_2/x1"
type: "Convolution"
bottom: "conv2_2/x1/bn"
top: "conv2_2/x1"
param {
lr_mult: 1
decay_mult: 1
}
convolution_param {
num_output: 128
bias_term: false
kernel_size: 1
}
}
layer {
name: "conv2_2/x2/bn"
type: "BatchNorm"
bottom: "conv2_2/x1"
top: "conv2_2/x2/bn"
batch_norm_param {
eps: 1e-5
}
}
layer {
name: "conv2_2/x2/scale"
type: "Scale"
bottom: "conv2_2/x2/bn"
top: "conv2_2/x2/bn"
scale_param {
bias_term: true
}
}
layer {
name: "relu2_2/x2"
type: "ReLU"
bottom: "conv2_2/x2/bn"
top: "conv2_2/x2/bn"
}
layer {
name: "conv2_2/x2"
type: "Convolution"
bottom: "conv2_2/x2/bn"
top: "conv2_2/x2"
param {
lr_mult: 1
decay_mult: 1
}
convolution_param {
num_output: 32
bias_term: false
pad: 1
kernel_size: 3
}
}
layer {
name: "concat_2_2"
type: "Concat"
bottom: "concat_2_1"
bottom: "conv2_2/x2"
top: "concat_2_2"
}
layer {
name: "conv2_3/x1/bn"
type: "BatchNorm"
bottom: "concat_2_2"
top: "conv2_3/x1/bn"
batch_norm_param {
eps: 1e-5
}
}
layer {
name: "conv2_3/x1/scale"
type: "Scale"
bottom: "conv2_3/x1/bn"
top: "conv2_3/x1/bn"
scale_param {
bias_term: true
}
}
layer {
name: "relu2_3/x1"
type: "ReLU"
bottom: "conv2_3/x1/bn"
top: "conv2_3/x1/bn"
}
layer {
name: "conv2_3/x1"
type: "Convolution"
bottom: "conv2_3/x1/bn"
top: "conv2_3/x1"
param {
lr_mult: 1
decay_mult: 1
}
convolution_param {
num_output: 128
bias_term: false
kernel_size: 1
}
}
layer {
name: "conv2_3/x2/bn"
type: "BatchNorm"
bottom: "conv2_3/x1"
top: "conv2_3/x2/bn"
batch_norm_param {
eps: 1e-5
}
}
layer {
name: "conv2_3/x2/scale"
type: "Scale"
bottom: "conv2_3/x2/bn"
top: "conv2_3/x2/bn"
scale_param {
bias_term: true
}
}
layer {
name: "relu2_3/x2"
type: "ReLU"
bottom: "conv2_3/x2/bn"
top: "conv2_3/x2/bn"
}
layer {
name: "conv2_3/x2"
type: "Convolution"
bottom: "conv2_3/x2/bn"
top: "conv2_3/x2"
param {
lr_mult: 1
decay_mult: 1
}
convolution_param {
num_output: 32
bias_term: false
pad: 1
kernel_size: 3
}
}
layer {
name: "concat_2_3"
type: "Concat"
bottom: "concat_2_2"
bottom: "conv2_3/x2"
top: "concat_2_3"
}
SDK: JetPack 4.2.2
CUDA version: 10.0.326
Python version: 3.6.8
Tensorflow version: 1.14.0
TensorRT version: 5.1.6.1
Thanks!