TensorRT 2.1 INT8 Top-1 and Top-5 accuracy are too low

INT8 run: 400 batches of size 10 starting at 100
50
dims.n = 10

Top1: 0.00475, Top5: 0.0095
Processing 4000 images averaged 0.175353 ms/image and 1.75353 ms/batch.

FP32 run: 400 batches of size 10 starting at 100

Top1: 0.00475, Top5: 0.00975

A Top-1 of 0.00475 means only about 19 of the 4000 images are classified correctly. Before this, I wrote the code below to generate the calibration batches.
/***************

*****************/
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>

#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#include <cassert>
#include <cstdio>
#include <cstdlib>
// Read one image path per line from a text file
std::vector<std::string> readTxt(std::string file)
{
std::vector<std::string> file_vector;
std::ifstream infile;
infile.open(file.data());
assert(infile.is_open());

std::string s;
while(getline(infile,s))
{
file_vector.push_back(s);
}
infile.close();
return file_vector;
}

// Read one integer label per line from a text file
std::vector<int> readLabel(std::string file)
{
std::vector<int> label_vector;
std::ifstream infile;
infile.open(file.data());
assert(infile.is_open());

std::string s;
while(getline(infile,s))
{
label_vector.push_back(std::stoi(s));
}
infile.close();
return label_vector;
}

int main(int argc, char** argv)
{

// argv[1]: text file listing image paths, argv[2]: text file of integer labels, argv[3]: batch size N
std::string img_list_file = argv[1];
std::string label_list_file = argv[2];
int N_val = std::atoi(argv[3]);
int C_val = 3;
int H_val = 224;
int W_val = 224;

std::vector<std::string> file_vector;
file_vector = readTxt(img_list_file);
std::cout<<file_vector.size()<<std::endl;

std::vector<int> label_vector;
label_vector = readLabel(label_list_file);
std::cout<<label_vector.size()<<std::endl;

int sBatchId = 0;
// Per-channel mean subtracted from each pixel (images are loaded in OpenCV's default BGR order)
cv::Scalar channel_mean = cv::Scalar(111.437, 108.631, 106.659);

cv::Size input_geometry_;
input_geometry_ = cv::Size(H_val, W_val);

int maxBatchCount = file_vector.size()/N_val;

for(int mBatchCount = 0; mBatchCount < maxBatchCount; mBatchCount++)
{
char buffer[1000];
sprintf(buffer, "/home/software/TensorRT-2.1.2/data/resnet/batches/batch%d", sBatchId++);
FILE* file = fopen(buffer, "wb"); // binary mode; the file holds raw ints and floats
if(file==0)
{
abort();
}

// Batch file header: four ints giving N, C, H, W
int s[4] = { N_val, C_val, H_val, W_val};

fwrite(s, sizeof(int), 4, file);


for(int cnt = 0; cnt < N_val; cnt++)
{   
  //std::cout<<file_vector[mBatchCount*N_val + cnt]<<std::endl;
  cv::Mat img = cv::imread(file_vector[mBatchCount*N_val + cnt]);
  //cv::cvtColor(img, img, cv::COLOR_BGR2RGB);
  cv::Mat resized_img;
  cv::resize(img, resized_img, input_geometry_);

  cv::Mat sample_float;
  resized_img.convertTo(sample_float, CV_32FC3);
 
  cv::Mat sample_normalized;
  cv::subtract(sample_float, channel_mean, sample_normalized);
  //cv::cvtColor(sample_normalized, sample_normalized, cv::COLOR_BGR2RGB);
  //cv::cvtColor(sample_normalized, sample_normalized, cv::COLOR_RGB2BGR);
  //cv::Mat sample_normalized_int;
  //sample_normalized.convertTo(sample_normalized_int, CV_8UC3);


  //std::vector<cv::Mat> input_bgr;
  //cv::split(sample_normalized, input_bgr);

  //std::cout<<"input_bgr = "<<input_bgr[0].type()<<std::endl;

  // Write pixels one channel at a time so each image is stored in planar CHW order
  for (int c = 0; c < sample_normalized.channels(); c++)
  {
    for (int nrow = 0; nrow < sample_normalized.rows; nrow++)
    {
      //float* pixelPtr = input_bgr[c].ptr<float>(nrow); 
      //fwrite(pixelPtr, sizeof(float), input_bgr[c].cols, file);  
      for (int ncol = 0; ncol <sample_normalized.cols; ncol++)
      {
        float ele = sample_normalized.at<cv::Vec3f>(nrow, ncol)[c];
        //std::cout<<ele<<std::endl;
        fwrite(&ele, sizeof(float), 1, file);    
      }
    }
  }
}
// Labels: N ints appended after the image data
for(int cnt = 0; cnt < N_val; cnt++)
{
  //std::cout<<label_vector[mBatchCount*N_val + cnt]<<std::endl;
  int label_val = label_vector[mBatchCount*N_val + cnt];
  fwrite(&label_val, sizeof(int), 1, file);
}
fclose(file);

}
std::cout<<"batch generation is done! "<<std::endl;
return 0;
}
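
Below is a minimal sketch to sanity-check a generated batch file, assuming exactly the layout written above (a 4-int N, C, H, W header, then N*C*H*W floats in planar CHW order, then N int labels); the default batch path is just a placeholder.

#include <cstdio>
#include <iostream>
#include <vector>

int main(int argc, char** argv)
{
  // Path to a batch file produced by the generator above (placeholder)
  const char* path = (argc > 1) ? argv[1] : "batch0";

  FILE* file = fopen(path, "rb");
  if(file == 0)
  {
    std::cerr << "cannot open " << path << std::endl;
    return 1;
  }

  // Header: N, C, H, W as four ints, exactly as written by the generator
  int dims[4];
  if(fread(dims, sizeof(int), 4, file) != 4) { std::cerr << "short header" << std::endl; return 1; }
  int N = dims[0], C = dims[1], H = dims[2], W = dims[3];
  std::cout << "N=" << N << " C=" << C << " H=" << H << " W=" << W << std::endl;

  // Image data: N*C*H*W floats in planar CHW order
  std::vector<float> data((size_t)N * C * H * W);
  if(fread(data.data(), sizeof(float), data.size(), file) != data.size()) { std::cerr << "short image data" << std::endl; return 1; }
  std::cout << "first value of image 0, channel 0: " << data[0] << std::endl;

  // Labels: N ints appended after the image data
  std::vector<int> labels(N);
  if(fread(labels.data(), sizeof(int), labels.size(), file) != labels.size()) { std::cerr << "short labels" << std::endl; return 1; }
  for(int i = 0; i < N; i++)
  {
    std::cout << "label[" << i << "] = " << labels[i] << std::endl;
  }

  fclose(file);
  return 0;
}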

Can anyone help me?

Hello,

Are you saying the INT8 run shows no performance improvement compared to the FP32 run? I see you are running TensorRT 2.1. We have introduced many features, fixes, and performance improvements since TensorRT 2.1. Please update to the latest TensorRT and rerun your test.

If you are still seeing issues, please share a small repro, including the source, model, and dataset, that demonstrates the problem you are seeing.

regards,
NVIDIA Enterprise Support

Yes, there is no performance improvement compared to the FP32 run, and in addition, the performance of both the INT8 and FP32 runs is too slow.

Hello, to help us debug, can you please share a small repro, including the source, model, and dataset, that demonstrates the performance issues you are seeing?

regards,
NVIDIA Enterprise Support