When the batch size is greater than 1, inference with TensorRT 2.1 is slower than inference with Caffe

When I use TensorRT to run forward inference of a simple network on a GTX 1070 with a batch size of 5, the performance is lower than when using Caffe directly. I wonder whether the GTX 1070 is unable to let TensorRT play its part?

# Network name reported by the model definition.
# NOTE(review): the name reads "VGG_CNN_M_1024", but the layers visible below end in a
# 128-output InnerProduct — presumably a trimmed variant; confirm the name is intended.
name: "VGG_CNN_M_1024"
# Input blob "data" with dims 5 x 3 x 96 x 96 (the first dim is the batch size of 5
# mentioned in the question above).
layer {
  name: "data"
  type: "Input"
  top: "data"
  input_param {
    shape { dim: 5 dim: 3 dim: 96 dim: 96 }
  }
}
# conv1: Convolution "data" -> "conv1"; num_output 96, kernel_size 7, stride 2.
layer {
  name: "conv1"
  type: "Convolution"
  bottom: "data"
  top: "conv1"
  # Two per-parameter multiplier blocks, kept in their original order.
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 96
    kernel_size: 7
    stride: 2
    weight_filler { type: "gaussian" std: 0.01 }
    bias_filler { type: "constant" value: 0.1 }
  }
}
# relu1: ReLU applied in place on "conv1" (bottom and top name the same blob).
layer {
  name: "relu1"
  type: "ReLU"
  bottom: "conv1"  top: "conv1"
}
# norm1: LRN "conv1" -> "norm1" with local_size 5.
layer {
  name: "norm1"
  type: "LRN"
  bottom: "conv1"
  top: "norm1"
  lrn_param { local_size: 5 alpha: 0.0005 beta: 0.75 k: 2 }
}
# pool1: MAX pooling "norm1" -> "pool1"; kernel_size 3, stride 2.
layer {
  name: "pool1"
  type: "Pooling"
  bottom: "norm1"
  top: "pool1"
  pooling_param { pool: MAX kernel_size: 3 stride: 2 }
}
# conv2: Convolution "pool1" -> "conv2"; num_output 256, kernel_size 5, stride 2, pad 1.
layer {
  name: "conv2"
  type: "Convolution"
  bottom: "pool1"
  top: "conv2"
  # Two per-parameter multiplier blocks, kept in their original order.
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 256
    pad: 1
    kernel_size: 5
    stride: 2
    weight_filler { type: "gaussian" std: 0.01 }
    bias_filler { type: "constant" value: 0.1 }
  }
}
# relu2: ReLU applied in place on "conv2" (bottom and top name the same blob).
layer {
  name: "relu2"
  type: "ReLU"
  bottom: "conv2"  top: "conv2"
}
# norm2: LRN "conv2" -> "norm2" with local_size 5 (same settings as norm1).
layer {
  name: "norm2"
  type: "LRN"
  bottom: "conv2"
  top: "norm2"
  lrn_param { local_size: 5 alpha: 0.0005 beta: 0.75 k: 2 }
}
# pool2: MAX pooling "norm2" -> "pool2"; kernel_size 3, stride 2.
layer {
  name: "pool2"
  type: "Pooling"
  bottom: "norm2"
  top: "pool2"
  pooling_param { pool: MAX kernel_size: 3 stride: 2 }
}
# conv3_: Convolution "pool2" -> "conv3_"; num_output 256, kernel_size 3, pad 1
# (no stride field is set in this layer).
layer {
  name: "conv3_"
  type: "Convolution"
  bottom: "pool2"
  top: "conv3_"
  # Two per-parameter multiplier blocks, kept in their original order.
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 256
    pad: 1
    kernel_size: 3
    weight_filler { type: "gaussian" std: 0.01 }
    bias_filler { type: "constant" value: 0.1 }
  }
}
# relu3: ReLU applied in place on "conv3_" (bottom and top name the same blob).
layer {
  name: "relu3"
  type: "ReLU"
  bottom: "conv3_"  top: "conv3_"
}
# conv4_: Convolution "conv3_" -> "conv4_"; num_output 256, kernel_size 3, pad 1
# (no stride field is set in this layer).
layer {
  name: "conv4_"
  type: "Convolution"
  bottom: "conv3_"
  top: "conv4_"
  # Two per-parameter multiplier blocks, kept in their original order.
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 256
    pad: 1
    kernel_size: 3
    weight_filler { type: "gaussian" std: 0.01 }
    bias_filler { type: "constant" value: 0.1 }
  }
}
# relu4: ReLU applied in place on "conv4_" (bottom and top name the same blob).
layer {
  name: "relu4"
  type: "ReLU"
  bottom: "conv4_"  top: "conv4_"
}
# conv5_: Convolution "conv4_" -> "conv5_"; num_output 256, kernel_size 3, pad 1
# (no stride field is set in this layer).
layer {
  name: "conv5_"
  type: "Convolution"
  bottom: "conv4_"
  top: "conv5_"
  # Two per-parameter multiplier blocks, kept in their original order.
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 256
    pad: 1
    kernel_size: 3
    weight_filler { type: "gaussian" std: 0.01 }
    bias_filler { type: "constant" value: 0.1 }
  }
}
# relu5: ReLU applied in place on "conv5_" (bottom and top name the same blob).
layer {
  name: "relu5"
  type: "ReLU"
  bottom: "conv5_"  top: "conv5_"
}
# pool5/classifier_: InnerProduct "conv5_" -> "pool5/classifier_"; num_output 128.
# Despite the "pool5" prefix in the name, the layer type is InnerProduct.
layer {
  name: "pool5/classifier_"
  type: "InnerProduct"
  bottom: "conv5_"
  top: "pool5/classifier_"
  # Two per-parameter multiplier blocks, kept in their original order.
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  inner_product_param {
    num_output: 128
    weight_filler { type: "gaussian" std: 0.01 }
    bias_filler { type: "constant" value: 0.1 }
  }
}
# pool5/relu: ReLU applied in place on "pool5/classifier_" (bottom and top name the same blob).
layer {
  name: "pool5/relu"
  type: "ReLU"
  bottom: "pool5/classifier_"  top: "pool5/classifier_"
}
# pool5/drop: Dropout applied in place on "pool5/classifier_" with dropout_ratio 0.4.
layer {
  name: "pool5/drop"
  type: "Dropout"
  bottom: "pool5/classifier_"  top: "pool5/classifier_"
  dropout_param { dropout_ratio: 0.4 }
}