Hi!
When I run ResNet-50 (downloaded from the internet, a standard classification net) with TensorRT on a PX2 and with Caffe on a PC, the result on the PX2 is different from the Caffe result on CPU/GPU (GTX 1080). The Caffe result on the PC is correct; I tested several different pictures.
1. On the PX2: I use sampleFasterRCNN from TensorRT 2.1.2.
I wrote testPlugin just to get the input data of "res5c"; the test code and part of the deploy prototxt are as follows:
int g_c = 0;int g_h = 0; int g_w = 0;
class testPlugin : public IPlugin
{
public:
testPlugin() {}
testPlugin(const void* buffer, size_t size)
{
assert(size == sizeof(mCopySize));
mCopySize = reinterpret_cast<const size_t>(buffer);
}
int getNbOutputs() const override
{
printf("testPlugin:getNbOutputs...\n");
return 1;
}
Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override
{
printf("testPlugin:getOutputDimensions...\n");
g_c = inputs[0].d[0];g_h = inputs[0].d[1];g_w = inputs[0].d[2];
printf("CHW:%d %d %d\n",inputs[0].d[0],inputs[0].d[1],inputs[0].d[2]);
return DimsCHW(inputs[0].d[0], inputs[0].d[1],inputs[0].d[2]);
}
int initialize() override
{
printf("testPlugin:initialize...\n");
return 0;
}
void terminate() override
{
printf("testPlugin:terminate...\n");
}
size_t getWorkspaceSize(int) const override
{
printf("testPlugin:getWorkspaceSize...\n");
return 0;
}
// currently it is not possible for a plugin to execute "in place". Therefore we memcpy the data from the input to the output buffer
int enqueue(int batchSize, const void*const *inputs, void** outputs, void*, cudaStream_t stream) override
{
printf("testPlugin:enqueue...\n");
//CHECK(cudaMemcpyAsync(outputs[0], inputs[0], mCopySize * batchSize, cudaMemcpyDeviceToDevice, stream));
testPlugin_forward_cpu(inputs, outputs,g_c,g_h,g_w);
return 0;
}
size_t getSerializationSize() override
{
printf("testPlugin:getSerializationSize...\n");
return sizeof(mCopySize);
}
void serialize(void* buffer) override
{
printf("testPlugin:serialize...\n");
*reinterpret_cast<size_t*>(buffer) = mCopySize;
}
void configure(const Dims*inputs, int nbInputs, const Dims* outputs, int nbOutputs, int) override
{
printf("testPlugin:configure...\n");
mCopySize = inputs[0].d[0] * inputs[0].d[1] * inputs[0].d[2] * sizeof(float);
}
protected:
size_t mCopySize;
//int c,h,w;
};
…
void testPlugin_forward_cpu(const void*const input, void* output,const int c,const int h,const int w)
{
printf(“begin testPlugin_forward_cpu…\n”);fflush(stdout);
////////gpu—>cpu/////
printf(“c:%d h:%d w:%d\n”,c,h,w);fflush(stdout);
float* data_buf = (float*)malloc(chwsizeof(float));
cudaMemcpy(data_buf, (const float)input[0], chw*sizeof(float), cudaMemcpyDeviceToHost);
//debug
FILE fp_in = fopen(“log_px2_testPlugin_0307.txt”, “a+”);
for(int i = 0; i < ch*w; i++) //
{
fprintf(fp_in,“%f\n”,data_buf[i]);
}
fclose(fp_in);
free(data_buf);
printf(“end testPlugin_forward_cpu…\n”);fflush(stdout);
}
name: "ResNet-50"
#input: "data"
#input_dim: 1
#input_dim: 3
#input_dim: 224
#input_dim: 224
layer {
name: "data"
type: "MemoryData"
top: "data"
top: "label"
memory_data_param {
batch_size: 1
channels: 3
height: 224
width: 224
}
}
…
layer {
bottom: "res5b"
bottom: "res5c_branch2c"
top: "res5c"
name: "res5c"
type: "Eltwise"
}
layer {
bottom: "res5c"
top: "res5c"
name: "res5c_relu"
type: "ReLU"
}
layer {
bottom: "res5c"
top: "pool5"
name: "pool5"
type: "Pooling"
pooling_param {
kernel_size: 7
stride: 1
pool: AVE
}
}
layer {
bottom: "pool5"
top: "fc1000"
name: "fc1000"
type: "InnerProduct"
inner_product_param {
num_output: 1000
}
}
layer {
bottom: "fc1000"
top: "prob"
name: "prob"
type: "Softmax"
}
2. On CPU/GPU (GTX 1080): I dump the same input (bottom 0 of "res5c") as on the PX2,
but the result is different!
The PX2 output (bottom 0 of "res5c"):
0.000000
0.000000
3.217169
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.901252
0.332424
0.000000
0.000000
0.000000
0.000000
0.000000
1.156103
0.837259
1.147727
0.676404
0.000000
0.000000
0.000000
0.000000
0.000000
0.001276
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
…
The Caffe output on the PC (bottom 0 of "res5c"):
3.310690
4.821366
4.264204
1.190538
1.798670
0.946362
1.013385
3.501086
1.795165
2.163740
0.860631
0.187800
0.000000
0.000000
1.204569
0.381836
1.398308
0.000000
0.000000
0.000000
0.353026
0.000000
0.000000
0.000000
0.488210
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
1.233509
0.000000
0.000000
0.000000
0.000000
0.779470
0.230680
1.254745
0.000000
0.000000
0.424548
0.000000
0.000000
0.000000
0.089821
0.000000
2.994178
3.356085
0.000000
…
So, I want to know what causes the different results.
Any support is greatly appreciated!
Thanks!