Hello.
I am using TensorRT 2.1 and want to implement a simple custom layer. To practice, I wanted to make an “Inc” layer (one that just adds 1.0 to the input tensor's values while keeping the dimensions the same).
I kept everything almost the same as the “class Reshape : public IPlugin” in sampleFasterRCNN.cpp, except “getOutputDimensions()”, which keeps the same dimensions. (This part seems fine.)
Where should I implement the “adding 1.0” part? I guess it should be in “enqueue()”. So, I tried
int enqueue(int batchSize, const void*const *inputs, void** outputs, void*, cudaStream_t stream) override
{
// the lines below are from the Reshape class; they copy input to output
CHECK(cudaMemcpyAsync(outputs[0], inputs[0], mCopySize * batchSize, cudaMemcpyDeviceToDevice, stream));
// add 1.0 to the first ten values
float* foutputs = (float*) outputs[0];
int i; for (i = 0; i < 10; i++) foutputs[i] += 1.0;
return 0;
}
However, this part results in a “segmentation fault” error.
My questions are:
1) where and how can I implement some calculation between input and output (addition in this case)?
2) Can you provide a simple example?
** Just in case, I post the full code of this example API here. (almost same with the Reshape)
// ---------------------------------------------------------------------------
// Device-side kernel for the Inc plugin: out[i] = in[i] + 1.0f.
//
// The segfault in the original version came from dereferencing outputs[0]
// on the host: the buffers handed to IPlugin::enqueue() are pointers into
// GPU global memory, so any element-wise computation must be done by a CUDA
// kernel launched on the given stream (or by copying the data to the host
// and back, which would be far slower).
//
// The __CUDACC__ guard keeps host-only compilers from tripping over CUDA
// syntax; this file is expected to be compiled as a .cu file with nvcc.
// ---------------------------------------------------------------------------
#ifdef __CUDACC__
__global__ void incKernel(const float* in, float* out, int count)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < count)
        out[idx] = in[idx] + 1.0f;
}
#endif

// "Inc" plugin: element-wise adds 1.0f to its single (float, CHW) input
// tensor. Output dimensions are identical to the input dimensions.
class Inc : public IPlugin
{
public:
    Inc() {}

    // Deserialization constructor: restores mCopySize written by serialize().
    Inc(const void* buffer, size_t size)
    {
        assert(size == sizeof(mCopySize));
        mCopySize = *reinterpret_cast<const size_t*>(buffer);
    }

    int getNbOutputs() const override
    {
        return 1;
    }

    // One 3-D (CHW) input; the single output has exactly the same shape.
    Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override
    {
        assert(nbInputDims == 1);
        assert(index == 0);
        assert(inputs[index].nbDims == 3);
        return DimsCHW(inputs[0].d[0], inputs[0].d[1], inputs[0].d[2]); // same dimension
    }

    int initialize() override
    {
        return 0;
    }

    void terminate() override
    {
    }

    size_t getWorkspaceSize(int) const override
    {
        return 0; // no scratch space needed
    }

    // Runs the layer. inputs[0] and outputs[0] are DEVICE pointers — never
    // dereference them on host code (that caused the original segfault).
    // The kernel reads the input and writes input + 1.0f straight into the
    // output buffer, so the device-to-device memcpy of the Reshape sample is
    // not needed. All work is queued asynchronously on `stream`.
    int enqueue(int batchSize, const void* const* inputs, void** outputs, void*, cudaStream_t stream) override
    {
        // mCopySize is the per-batch-item byte size (set in configure()),
        // so the total element count is bytes/sizeof(float) * batchSize.
        const int count = static_cast<int>(mCopySize / sizeof(float)) * batchSize;
#ifdef __CUDACC__
        const int threadsPerBlock = 256;
        const int blocks = (count + threadsPerBlock - 1) / threadsPerBlock;
        incKernel<<<blocks, threadsPerBlock, 0, stream>>>(
            static_cast<const float*>(inputs[0]),
            static_cast<float*>(outputs[0]),
            count);
#endif
        return 0;
    }

    size_t getSerializationSize() override
    {
        return sizeof(mCopySize);
    }

    void serialize(void* buffer) override
    {
        *reinterpret_cast<size_t*>(buffer) = mCopySize;
    }

    // Cache the per-batch-item byte size of the (float) input tensor;
    // assumes the input binding is FP32 — TODO confirm if other formats
    // are ever configured.
    void configure(const Dims* inputs, int nbInputs, const Dims* outputs, int nbOutputs, int) override
    {
        mCopySize = inputs[0].d[0] * inputs[0].d[1] * inputs[0].d[2] * sizeof(float);
    }

protected:
    size_t mCopySize; // bytes per batch item of the input/output tensor
};