I am running into weird issues when running inference on Keras pretrained models with TensorRT. I exported several Keras pretrained models to UFF files, parsed them, and serialized them into plan files. But inference with any model other than ResNet50 doesn't give correct results. My setup is as follows:
- TensorRT 4
- CUDA 9.0
- Ubuntu 16
- TensorFlow 1.12.0
First I export the UFF files. The export script looks like this:
import tensorflow as tf
import uff

tf.keras.backend.set_learning_phase(0)
model = tf.keras.applications.VGG16(include_top=True)
model.load_weights('<.h5 file>')

# Following the sample code: freeze the graph and convert it to a UFF file
def save(model, filename):
    output_names = model.output.op.name
    sess = tf.keras.backend.get_session()
    frozen_graph = tf.graph_util.convert_variables_to_constants(
        sess, sess.graph.as_graph_def(), [output_names])
    uff.from_tensorflow(graphdef=frozen_graph,
                        output_filename=filename,
                        output_nodes=[output_names],
                        text=True)

save(model, 'vgg16.uff')
The conversion output is as follows:
INFO:tensorflow:Froze 32 variables.
INFO:tensorflow:Converted 32 variables to const ops.
Using output node predictions/Softmax
Converting to UFF graph
DEBUG: convert reshape to flatten node
No. nodes: 88
UFF Output written to uff_models/vgg16.uff
UFF Text Output written to uff_models/vgg16.uff.pbtxt
Input: input_1
Output: predictions/Softmax
Then I parse the UFF file into an engine and serialize it for later use. The script looks like this:
ICudaEngine* UFFParser(const char* uff_file, int maxBatchSize, IUffParser* parser)
{
    IBuilder* builder = createInferBuilder(gLogger);
    INetworkDefinition* network = builder->createNetwork();
    if (!parser->parse(uff_file, *network, DataType::kFLOAT))
    {
        std::cout << "Fail to parse" << std::endl;
        exit(-1);
    }
    builder->setMaxBatchSize(maxBatchSize);
    builder->setMaxWorkspaceSize(8ULL << 30); // 8 GB
    ICudaEngine* engine = builder->buildCudaEngine(*network);
    if (!engine)
    {
        std::cout << "Unable to create engine" << std::endl;
        exit(-1);
    }
    return engine;
}
// ....
int main()
{
    auto parser = createUffParser();
    parser->registerInput(input_name, Dims3{3, 224, 224}, UffInputOrder::kNCHW);
    parser->registerOutput(output_name);
    ICudaEngine* engine = UFFParser(uff_file, 2, parser);
    // serialization
    return 0;
}
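The serialization step elided above is the standard engine->serialize() call; a minimal sketch of what it looks like (the helper name and the plan-file path are just examples):

#include <fstream>
#include "NvInfer.h"

// Serialize the built engine into a plan file for later reuse.
void serializeEngine(nvinfer1::ICudaEngine* engine, const char* plan_path)
{
    nvinfer1::IHostMemory* serialized = engine->serialize();
    std::ofstream plan_file(plan_path, std::ios::binary);
    plan_file.write(static_cast<const char*>(serialized->data()), serialized->size());
    serialized->destroy();
}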
Then, for the inference script, I've done the following:
static const float pixelMean[3]{103.939f, 116.779f, 123.68f}; // ImageNet mean, in BGR order

// Convert the RGB HWC image to BGR CHW and subtract the per-channel mean
void image_preprocess(PPM* img, float* data)
{
    for (int c = 0; c < INPUT_C; ++c)
    {
        for (int j = 0, volChl = INPUT_H * INPUT_W; j < volChl; ++j)
        {
            data[c * volChl + j] = float(img->buffer[j * INPUT_C + 2 - c]) - pixelMean[c];
        }
    }
}
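CHECK here is the usual CUDA error-checking macro from the TensorRT samples; a minimal equivalent, for completeness:

#include <cstdlib>
#include <iostream>
#include <cuda_runtime_api.h>

// Abort on any CUDA runtime error, in the style of the samples' CHECK macro.
#define CHECK(status)                                              \
    do {                                                           \
        cudaError_t err = (status);                                \
        if (err != cudaSuccess) {                                  \
            std::cerr << "CUDA error: " << cudaGetErrorString(err) \
                      << std::endl;                                \
            std::abort();                                          \
        }                                                          \
    } while (0)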
void doInference(nvinfer1::IExecutionContext& context, float* input, float* output, int batchSize)
{
    const nvinfer1::ICudaEngine& engine = context.getEngine();
    assert(engine.getNbBindings() == 2);
    void* buffers[2];
    int inputIndex, outputIndex;
    for (int b = 0; b < engine.getNbBindings(); ++b)
    {
        if (engine.bindingIsInput(b))
            inputIndex = b;
        else
            outputIndex = b;
    }
    // Create GPU buffers on device
    CHECK(cudaMalloc(&buffers[inputIndex], batchSize * INPUT_C * INPUT_H * INPUT_W * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));
    // Create stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));
    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * INPUT_C * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
    context.enqueue(batchSize, buffers, stream, nullptr);
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
    // Release stream and buffers
    cudaStreamDestroy(stream);
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}
int main()
{
    //....
    // PPM* input_img = read_img(...)
    image_preprocess(input_img, data);
    doInference(*context, data, prob, 1);
    //...
}
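For completeness, the elided setup deserializes the saved plan file back into an engine before creating the execution context, roughly like this (the helper name and the logger parameter are my own):

#include <fstream>
#include <iterator>
#include <vector>
#include "NvInfer.h"

// Read a plan file from disk and deserialize it back into an engine.
nvinfer1::ICudaEngine* loadEngine(const char* plan_path, nvinfer1::ILogger& logger)
{
    std::ifstream plan_file(plan_path, std::ios::binary);
    std::vector<char> plan((std::istreambuf_iterator<char>(plan_file)),
                           std::istreambuf_iterator<char>());
    nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(logger);
    return runtime->deserializeCudaEngine(plan.data(), plan.size(), nullptr);
}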
I read the .ppm image and do the preprocessing, which for ResNet50 means subtracting the ImageNet mean and converting to BGR order, then run inference. I ran the whole ImageNet validation set: only ResNet50 gives correct results, while the other models classify images completely wrong. So the problem doesn't seem to lie in how I run inference with TensorRT, but in the preprocessing steps. Yet when I checked the Keras pretrained-model scripts, the preprocessing they use also seems to be just subtracting the ImageNet mean and converting to BGR order. Why does only ResNet50 work? VGG16, VGG19, InceptionV3, and DenseNet121 all fail to give correct results.
Is there any problem in my implementation of TensorRT inference? Where could the problem be… It really confuses me.