Tiny YOLO v2 giving wrong output with TensorRT

Hi all:
I am trying to run Tiny YOLO v2 with TensorRT optimization. I feed the input image in BGR format with values in the range [0, 1]. I have approximated leaky ReLU with a ReLU + scale + Eltwise combination. I take the output at the second-to-last layer, a convolution layer whose output is a 12x12x125 tensor, and I have implemented the final detection layer separately in Python.
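(To spell out the approximation: each ReLU + Power + Eltwise group in the prototxt below computes relu(x) + 0.08*x, which is 1.08*x for x >= 0 and 0.08*x for x < 0.)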
Everything works fine in Caffe, but TensorRT does not give the correct output, or maybe I am interpreting its output wrongly.

Since I take the output at the second-to-last layer, TensorRT gives me a linearized NCHW array of size 1x125x12x12 = 18000.

I take this output from TensorRT, reshape it to 125x12x12, and send it to my Python detection layer. I do not get correct results, whereas the Caffe implementation gives correct results.
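To be explicit about how I index the linearized buffer: I assume element (c, h, w) of the 125x12x12 output sits at the flat offset below (NCHW, batch size 1); please correct me if this reading is wrong.

// NCHW, batch 1: channels first, then rows, then columns
int idx = c * 12 * 12 + h * 12 + w;   // 0 <= c < 125, 0 <= h, w < 12
float value = trt_output[idx];        // trt_output is the 18000-float buffer copied back from TensorRT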

The prototxt I am using is given below:

layer {
  name: "data"
  type: "Input"
  top: "data"
  input_param {
    shape {
      dim: 1
      dim: 3
      dim: 416
      dim: 416
    }
  }
}
layer {
  name: "conv1"
  type: "Convolution"
  bottom: "data"
  top: "conv1"
  convolution_param {
    num_output: 16
    bias_term: false
    pad: 1
    kernel_size: 3
    stride: 1
  }
}
layer {
  name: "conv1_bn"
  type: "BatchNorm"
  bottom: "conv1"
  top: "conv1_bn"
  batch_norm_param {
    use_global_stats: true
  }
}
layer {
  name: "conv1_scale"
  type: "Scale"
  bottom: "conv1_bn"
  top: "conv1_scale"
  scale_param {
    bias_term: true
  }
}
layer {
  name: "relu1"
  type: "ReLU"
  bottom: "conv1_scale"
  top: "relu1"
}
layer {
  name: "scale1"
  type: "Power"
  bottom: "conv1_scale"
  top: "scale1"
  power_param {
    scale: 0.08
  }
}
layer {
  name: "eltwise1"
  type: "Eltwise"
  bottom: "relu1"
  bottom: "scale1"
  top: "layer1"
  eltwise_param {
    operation: SUM
  }
}
layer {
  name: "pool1"
  type: "Pooling"
  bottom: "layer1"
  top: "pool1"
  pooling_param {
    pool: MAX
    kernel_size: 2
    stride: 2
  }
}
layer {
  name: "conv2"
  type: "Convolution"
  bottom: "pool1"
  top: "conv2"
  convolution_param {
    num_output: 32
    bias_term: false
    pad: 1
    kernel_size: 3
    stride: 1
  }
}
layer {
  name: "conv2_bn"
  type: "BatchNorm"
  bottom: "conv2"
  top: "conv2_bn"
  batch_norm_param {
    use_global_stats: true
  }
}
layer {
  name: "conv2_scale"
  type: "Scale"
  bottom: "conv2_bn"
  top: "conv2_scale"
  scale_param {
    bias_term: true
  }
}
layer {
  name: "relu2"
  type: "ReLU"
  bottom: "conv2_scale"
  top: "relu2"
}
layer {
  name: "scale2"
  type: "Power"
  bottom: "conv2_scale"
  top: "scale2"
  power_param {
    scale: 0.08
  }
}
layer {
  name: "eltwise2"
  type: "Eltwise"
  bottom: "relu2"
  bottom: "scale2"
  top: "layer2"
  eltwise_param {
    operation: SUM
  }
}
layer {
  name: "pool2"
  type: "Pooling"
  bottom: "layer2"
  top: "pool2"
  pooling_param {
    pool: MAX
    kernel_size: 2
    stride: 2
  }
}
layer {
  name: "conv3"
  type: "Convolution"
  bottom: "pool2"
  top: "conv3"
  convolution_param {
    num_output: 64
    bias_term: false
    pad: 1
    kernel_size: 3
    stride: 1
  }
}
layer {
  name: "conv3_bn"
  type: "BatchNorm"
  bottom: "conv3"
  top: "conv3_bn"
  batch_norm_param {
    use_global_stats: true
  }
}
layer {
  name: "conv3_scale"
  type: "Scale"
  bottom: "conv3_bn"
  top: "conv3_scale"
  scale_param {
    bias_term: true
  }
}
layer {
  name: "relu3"
  type: "ReLU"
  bottom: "conv3_scale"
  top: "relu3"
}
layer {
  name: "scale3"
  type: "Power"
  bottom: "conv3_scale"
  top: "scale3"
  power_param {
    scale: 0.08
  }
}
layer {
  name: "eltwise3"
  type: "Eltwise"
  bottom: "relu3"
  bottom: "scale3"
  top: "layer3"
  eltwise_param {
    operation: SUM
  }
}
layer {
  name: "pool3"
  type: "Pooling"
  bottom: "layer3"
  top: "pool3"
  pooling_param {
    pool: MAX
    kernel_size: 2
    stride: 2
  }
}
layer {
  name: "conv4"
  type: "Convolution"
  bottom: "pool3"
  top: "conv4"
  convolution_param {
    num_output: 128
    bias_term: false
    pad: 1
    kernel_size: 3
    stride: 1
  }
}
layer {
  name: "conv4_bn"
  type: "BatchNorm"
  bottom: "conv4"
  top: "conv4_bn"
  batch_norm_param {
    use_global_stats: true
  }
}
layer {
  name: "conv4_scale"
  type: "Scale"
  bottom: "conv4_bn"
  top: "conv4_scale"
  scale_param {
    bias_term: true
  }
}
layer {
  name: "relu4"
  type: "ReLU"
  bottom: "conv4_scale"
  top: "relu4"
}
layer {
  name: "scale4"
  type: "Power"
  bottom: "conv4_scale"
  top: "scale4"
  power_param {
    scale: 0.08
  }
}
layer {
  name: "eltwise4"
  type: "Eltwise"
  bottom: "relu4"
  bottom: "scale4"
  top: "layer4"
  eltwise_param {
    operation: SUM
  }
}


layer {
  name: "pool4"
  type: "Pooling"
  bottom: "layer4"
  top: "pool4"
  pooling_param {
    pool: MAX
    kernel_size: 2
    stride: 2
  }
}
layer {
  name: "conv5"
  type: "Convolution"
  bottom: "pool4"
  top: "conv5"
  convolution_param {
    num_output: 256
    bias_term: false
    pad: 1
    kernel_size: 3
    stride: 1
  }
}
layer {
  name: "conv5_bn"
  type: "BatchNorm"
  bottom: "conv5"
  top: "conv5_bn"
  batch_norm_param {
    use_global_stats: true
  }
}
layer {
  name: "conv5_scale"
  type: "Scale"
  bottom: "conv5_bn"
  top: "conv5_scale"
  scale_param {
    bias_term: true
  }
}
layer {
  name: "relu5"
  type: "ReLU"
  bottom: "conv5_scale"
  top: "relu5"
}
layer {
  name: "scale5"
  type: "Power"
  bottom: "conv5_scale"
  top: "scale5"
  power_param {
    scale: 0.08
  }
}
layer {
  name: "eltwise5"
  type: "Eltwise"
  bottom: "relu5"
  bottom: "scale5"
  top: "layer5"
  eltwise_param {
    operation: SUM
  }
}

layer {
  name: "pool5"
  type: "Pooling"
  bottom: "layer5"
  top: "pool5"
  pooling_param {
    pool: MAX
    kernel_size: 2
    stride: 2
  }
}
layer {
  name: "conv6"
  type: "Convolution"
  bottom: "pool5"
  top: "conv6"
  convolution_param {
    num_output: 512
    bias_term: false
    pad: 1
    kernel_size: 3
    stride: 1
  }
}
layer {
  name: "conv6_bn"
  type: "BatchNorm"
  bottom: "conv6"
  top: "conv6_bn"
  batch_norm_param {
    use_global_stats: true
  }
}
layer {
  name: "conv6_scale"
  type: "Scale"
  bottom: "conv6_bn"
  top: "conv6_scale"
  scale_param {
    bias_term: true
  }
}
layer {
  name: "relu6"
  type: "ReLU"
  bottom: "conv6_scale"
  top: "relu6"
}
layer {
  name: "scale6"
  type: "Power"
  bottom: "conv6_scale"
  top: "scale6"
  power_param {
    scale: 0.08
  }
}
layer {
  name: "eltwise6"
  type: "Eltwise"
  bottom: "relu6"
  bottom: "scale6"
  top: "layer6"
  eltwise_param {
    operation: SUM
  }
}
layer {
  name: "pool6"
  type: "Pooling"
  bottom: "layer6"
  top: "pool6"
  pooling_param {
    pool: MAX
    kernel_size: 2
    stride: 1
  }
}
layer {
  name: "conv7"
  type: "Convolution"
  bottom: "pool6"
  top: "conv7"
  convolution_param {
    num_output: 1024
    bias_term: false
    pad: 1
    kernel_size: 3
    stride: 1
  }
}
layer {
  name: "conv7_bn"
  type: "BatchNorm"
  bottom: "conv7"
  top: "conv7_bn"
  batch_norm_param {
    use_global_stats: true
  }
}
layer {
  name: "conv7_scale"
  type: "Scale"
  bottom: "conv7_bn"
  top: "conv7_scale"
  scale_param {
    bias_term: true
  }
}
layer {
  name: "relu7"
  type: "ReLU"
  bottom: "conv7_scale"
  top: "relu7"
}
layer {
  name: "scale7"
  type: "Power"
  bottom: "conv7_scale"
  top: "scale7"
  power_param {
    scale: 0.08
  }
}
layer {
  name: "eltwise7"
  type: "Eltwise"
  bottom: "relu7"
  bottom: "scale7"
  top: "layer7"
  eltwise_param {
    operation: SUM
  }
}
layer {
  name: "conv8"
  type: "Convolution"
  bottom: "layer7"
  top: "conv8"
  convolution_param {
    num_output: 1024
    bias_term: false
    pad: 1
    kernel_size: 3
    stride: 1
  }
}
layer {
  name: "conv8_bn"
  type: "BatchNorm"
  bottom: "conv8"
  top: "conv8_bn"
  batch_norm_param {
    use_global_stats: true
  }
}
layer {
  name: "conv8_scale"
  type: "Scale"
  bottom: "conv8_bn"
  top: "conv8_scale"
  scale_param {
    bias_term: true
  }
}
layer {
  name: "relu8"
  type: "ReLU"
  bottom: "conv8_scale"
  top: "relu8"
}
layer {
  name: "scale8"
  type: "Power"
  bottom: "conv8_scale"
  top: "scale8"
  power_param {
    scale: 0.08
  }
}
layer {
  name: "eltwise8"
  type: "Eltwise"
  bottom: "relu8"
  bottom: "scale8"
  top: "layer8"
  eltwise_param {
    operation: SUM
  }
}
layer {
  name: "conv9"
  type: "Convolution"
  bottom: "layer8"
  top: "result"
  convolution_param {
    num_output: 125
    pad: 0
    kernel_size: 1
    stride: 1
  }
}

The code I am using for TensorRT optimization is:

#include <opencv2/core.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/calib3d.hpp>
#include <new>
#include <assert.h>
#include <fstream>
#include <sstream>
#include <iostream>
#include <cmath>
#include <algorithm>
#include <sys/stat.h>
#include <time.h>


#include "common.h"
#include <cuda_runtime_api.h>
#include "NvInfer.h"
#include "NvCaffeParser.h"

using namespace nvinfer1;
using namespace nvcaffeparser1;
using namespace std;
using namespace cv;
static const int INPUT_C = 3;
std::stringstream gieFileStream;
static Logger gLogger;
static const int INPUT_H = 416;
static const int INPUT_W = 416;
static const int OUTPUT_SIZE = 3*18000;
    //const int batchsize=10;
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "result";

// stuff we know about the network and the caffe input/output blobs

void caffeToGIEModel( const char* deployFile,		// path to the caffe prototxt
					  const char* modelFile,		// path to the caffe model
					  const char* output,			// name of the network output blob to mark
					  uint16_t maxBatchSize			// batch size - NB must be at least as large as the batch we want to run with
					 )
{
	
	printf("%s \n",deployFile);
	printf("%s \n",modelFile);
	printf("%s \n",output);
	std::cout<<maxBatchSize<<"\n";
	
	IHostMemory *gieModelStream(nullptr);
	// create the builder
	IBuilder* builder = createInferBuilder(gLogger);
	// parse the caffe model to populate the network, then set the outputs
	INetworkDefinition* network = builder->createNetwork();
	ICaffeParser* parser = createCaffeParser();
	const IBlobNameToTensor* blobNameToTensor = parser->parse(deployFile,
															  modelFile,
															  *network,
															  nvinfer1::DataType::kFLOAT);
															  
			std::cout<<"fine till here"<<"\n";													  

	// specify which tensors are outputs
	//for (auto& s : outputs)
		network->markOutput(*blobNameToTensor->find(output));

	// Build the engine
	builder->setMaxBatchSize(maxBatchSize);
	builder->setMaxWorkspaceSize(maxBatchSize*16 << 20);
	builder->setMinFindIterations(10);
	builder->setAverageFindIterations(10);
	//builder->setHalf2Mode(true);
	builder->setDebugSync(true);
	ICudaEngine* engine = builder->buildCudaEngine(*network);
	assert(engine);





	// we don't need the network any more, and we can destroy the parser
	network->destroy();
	parser->destroy();

	// serialize the engine, then close everything down
	gieModelStream = engine->serialize();
	//.................Storing model to file.................//
	
	gieFileStream.seekg(0, gieFileStream.beg);
	gieFileStream.write((const char*)gieModelStream->data(), gieModelStream->size());
	std::cout << "printing size of bytes allocated \t" << (gieModelStream->size())<< std::endl;
	std::ofstream SaveFile("optimize",std::ios::out|std::ios::binary);
	SaveFile.seekp(0,std::ios::beg);
	SaveFile << gieFileStream.rdbuf();
	SaveFile.close();
	gieFileStream.str(std::string());
	
	
	gieModelStream->destroy();
	engine->destroy();
	builder->destroy();
	shutdownProtobufLibrary();
}





void* Load_engine()    
{   
	// deserialize the engine 
	clock_t Start = clock();
	std::ifstream file("optimize",std::ios::in|std::ios::binary);
	file.seekg(0);
    gieFileStream << file.rdbuf();
    gieFileStream.seekg(0, std::ios::end);
    const int modelSize = gieFileStream.tellg();
	gieFileStream.seekg(0, std::ios::beg);
	std::cout << "printing size of read file bytes \t" << (modelSize)<< std::endl;
	void* modelMem = malloc(modelSize);
	if( !modelMem )
	{
		printf("failed to allocate %i bytes to deserialize model\n", modelSize);
	}
    gieFileStream.read((char*)modelMem, modelSize);
    file.close();
    //printf("Time taken to load engine: %.9fs\n", (double)(clock() - Start)/CLOCKS_PER_SEC);
	IRuntime* runtime = createInferRuntime(gLogger);
	Start = clock();
	ICudaEngine* engine = runtime->deserializeCudaEngine(modelMem, modelSize, nullptr);
	//printf("Deserialize Time Time: %.2fs\n", (double)(clock() - Start)/CLOCKS_PER_SEC);
	gieFileStream.str(std::string());
    runtime->destroy();
    shutdownProtobufLibrary();
    free(modelMem);
	return engine;
}

void doInference( void* eng, float *data, size_t size1, float *prob, size_t size2, uint16_t batchsize)
{   
	ICudaEngine *engine = static_cast< ICudaEngine *>(eng);
	IExecutionContext* context = engine->createExecutionContext();
	size_t input_HW = size1/batchsize;	// element count per image; size_t because 3*416*416 does not fit in uint16_t
	// input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(),
	// of these, but in this case we know that there is exactly one input and one output.
	assert(engine->getNbBindings() == 2);
	void* buffers[2];
	// In order to bind the buffers, we need to know the names of the input and output tensors.
	// note that indices are guaranteed to be less than IEngine::getNbBindings()
	int inputIndex = engine->getBindingIndex(INPUT_BLOB_NAME), 
		outputIndex = engine->getBindingIndex(OUTPUT_BLOB_NAME);

	// create GPU buffers and a stream
	CHECK(cudaMalloc(&buffers[inputIndex], size1 * sizeof(float)));
	CHECK(cudaMalloc(&buffers[outputIndex], size2 * sizeof(float)));

	cudaStream_t stream;
	CHECK(cudaStreamCreate(&stream));
	// DMA the input to the GPU,  execute the batch asynchronously, and DMA it back:
	CHECK(cudaMemcpyAsync(buffers[inputIndex], data, size1 * sizeof(float), cudaMemcpyHostToDevice, stream));
	context->enqueue(batchsize, buffers, stream, nullptr);
	CHECK(cudaMemcpyAsync(prob, buffers[outputIndex], size2*sizeof(float), cudaMemcpyDeviceToHost, stream));
	cudaStreamSynchronize(stream);
// write code for reporting layer timing

	// release the stream and the buffers
	cudaStreamDestroy(stream);
	CHECK(cudaFree(buffers[inputIndex]));
	CHECK(cudaFree(buffers[outputIndex]));
	context->destroy();
	engine->destroy();
  
}
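For completeness, this is roughly how the pieces are wired together (a simplified sketch, not my exact code; the file names, the 416x416 test image, and the preprocessing loop are only illustrative):

int main()
{
	// build the engine once and serialize it to the "optimize" file
	caffeToGIEModel("tiny_yolo_deploy.prototxt", "tiny_yolo.caffemodel", OUTPUT_BLOB_NAME, 1);

	// deserialize the engine back from disk
	void* engine = Load_engine();

	// OpenCV loads images as 8-bit BGR; the image is assumed to already be 416x416
	cv::Mat img = cv::imread("test.jpg");

	// pack the HWC/BGR bytes into a CHW float buffer scaled to [0, 1]
	float* input = new float[INPUT_C * INPUT_H * INPUT_W];
	for (int c = 0; c < INPUT_C; ++c)
		for (int y = 0; y < INPUT_H; ++y)
			for (int x = 0; x < INPUT_W; ++x)
				input[c * INPUT_H * INPUT_W + y * INPUT_W + x] =
					img.at<cv::Vec3b>(y, x)[c] / 255.0f;

	float* prob = new float[OUTPUT_SIZE];
	doInference(engine, input, INPUT_C * INPUT_H * INPUT_W, prob, OUTPUT_SIZE, 1);

	// prob now holds the 125x12x12 conv9 output, which goes to the python detection layer
	delete[] input;
	delete[] prob;
	return 0;
}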

Please tell me what I am doing wrong: am I giving the input in the wrong format, or reading the output in the wrong format?

Thanks in advance…

Hi,

The flag ‘use_global_stats’ in the batch normalization layer is not supported, and that is what causes the difference.
This flag is normally used for training only, which is why it is not supported by TensorRT.

A suggestion for your use case is to turn off the flag and re-train the model with Caffe.
Once the flag is off, TensorRT should be able to give the same results as Caffe.

Thanks.

Thanks AastaLLL for your reply, I will try that.
Now I have another issue: if I try to get the output at the following layer, or at any other layer between the input and the output, I get a core dump error.

layer {
  name: "conv1"
  type: "Convolution"
  bottom: "data"
  top: "conv1"
  convolution_param {
    num_output: 16
    bias_term: false
    pad: 1
    kernel_size: 3
    stride: 1
  }
}

Why can I not get the output of intermediate layers by marking them as outputs? Kindly tell me what the issue is. If I run the same code and mark the final layer as the output, it works fine and gives me output.

layer {
  name: "conv9"
  type: "Convolution"
  bottom: "layer8"
  top: "result"
  convolution_param {
    num_output: 125
    pad: 0
    kernel_size: 1
    stride: 1
  }
}

But it gives a core dump error if I mark any other layer, such as “conv1”, as the output.

Hi,

markOutput() is required if you want to access a tensor’s data.
By default the buffer is located on the GPU and cannot be accessed from the host unless the tensor is marked as an output.
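In other words, you mark the intermediate blob as an additional output when building the engine and allocate a host buffer that matches that tensor’s size. A minimal sketch based on your prototxt and code (conv1 produces a 16x416x416 tensor for a 416x416 input; not tested):

// in caffeToGIEModel(): mark the intermediate blob as an extra output
network->markOutput(*blobNameToTensor->find("conv1"));

// at inference time the engine then exposes an extra binding for it;
// the host/device buffers must be sized for that tensor (16 * 416 * 416 floats for conv1)
int conv1Index = engine->getBindingIndex("conv1");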

Please follow our instructions to get the output of an intermediate buffer:

Thanks.