I’ve had a quick look through your code, and you don’t seem to be doing any memcpy or malloc calls in your constructors. Don’t you need to copy the ELU alpha parameter to GPU memory?
I’ve included code for a PReLU plugin layer that I know works. I hope this helps you.
// Builds the PReLU plugin for `layerName` from the given weights.
//
// Each recognised layer name has its own unique_ptr member on the factory.
// The new PReLULayer is stored there and a non-owning pointer to it is
// returned, so the factory keeps ownership. Asserts that the name passes
// isPlugin() and that the matching slot is still empty. An unrecognised name
// hits assert(0); with asserts compiled out, nullptr is returned.
nvinfer1::IPlugin* PluginFactory::createPlugin(const char* layerName, const nvinfer1::Weights* weights, int nbWeights)
{
    assert(isPlugin(layerName));

    // "prelu" shares the slot used by "prelu1".
    if (strcmp(layerName, "prelu1") == 0 || strcmp(layerName, "prelu") == 0)
    {
        assert(mPReLULayer1.get() == nullptr);
        mPReLULayer1.reset(new PReLULayer(weights, nbWeights));
        return mPReLULayer1.get();
    }

    if (strcmp(layerName, "prelu2") == 0)
    {
        assert(mPReLULayer2.get() == nullptr);
        mPReLULayer2.reset(new PReLULayer(weights, nbWeights));
        return mPReLULayer2.get();
    }

    if (strcmp(layerName, "prelu3") == 0)
    {
        assert(mPReLULayer3.get() == nullptr);
        mPReLULayer3.reset(new PReLULayer(weights, nbWeights));
        return mPReLULayer3.get();
    }

    if (strcmp(layerName, "prelu4_1") == 0)
    {
        assert(mPReLULayer4_1.get() == nullptr);
        mPReLULayer4_1.reset(new PReLULayer(weights, nbWeights));
        return mPReLULayer4_1.get();
    }

    if (strcmp(layerName, "prelu4") == 0)
    {
        assert(mPReLULayer4.get() == nullptr);
        mPReLULayer4.reset(new PReLULayer(weights, nbWeights));
        return mPReLULayer4.get();
    }

    if (strcmp(layerName, "fc_prelu") == 0)
    {
        assert(mPReLULayer5.get() == nullptr);
        mPReLULayer5.reset(new PReLULayer(weights, nbWeights));
        return mPReLULayer5.get();
    }

    // isPlugin() accepted the name but no slot exists for it.
    assert(0);
    return nullptr;
}
// Rebuilds the PReLU plugin for `layerName` from serialized plugin data.
//
// Uses the same name-to-slot mapping as the weights overload, but constructs
// the layer with PReLULayer(serialData, serialLength). The factory keeps
// ownership in the matching unique_ptr member and returns a non-owning
// pointer. An unrecognised name hits assert(0); with asserts compiled out,
// nullptr is returned.
IPlugin* PluginFactory::createPlugin(const char* layerName, const void* serialData, size_t serialLength)
{
    assert(isPlugin(layerName));

    // "prelu" shares the slot used by "prelu1".
    if (strcmp(layerName, "prelu1") == 0 || strcmp(layerName, "prelu") == 0)
    {
        assert(mPReLULayer1.get() == nullptr);
        mPReLULayer1.reset(new PReLULayer(serialData, serialLength));
        return mPReLULayer1.get();
    }

    if (strcmp(layerName, "prelu2") == 0)
    {
        assert(mPReLULayer2.get() == nullptr);
        mPReLULayer2.reset(new PReLULayer(serialData, serialLength));
        return mPReLULayer2.get();
    }

    if (strcmp(layerName, "prelu3") == 0)
    {
        assert(mPReLULayer3.get() == nullptr);
        mPReLULayer3.reset(new PReLULayer(serialData, serialLength));
        return mPReLULayer3.get();
    }

    if (strcmp(layerName, "prelu4_1") == 0)
    {
        assert(mPReLULayer4_1.get() == nullptr);
        mPReLULayer4_1.reset(new PReLULayer(serialData, serialLength));
        return mPReLULayer4_1.get();
    }

    if (strcmp(layerName, "prelu4") == 0)
    {
        assert(mPReLULayer4.get() == nullptr);
        mPReLULayer4.reset(new PReLULayer(serialData, serialLength));
        return mPReLULayer4.get();
    }

    if (strcmp(layerName, "fc_prelu") == 0)
    {
        assert(mPReLULayer5.get() == nullptr);
        mPReLULayer5.reset(new PReLULayer(serialData, serialLength));
        return mPReLULayer5.get();
    }

    // isPlugin() accepted the name but no slot exists for it.
    assert(0);
    return nullptr;
}
// Reports whether `name` is handled by this factory: any name whose first
// five characters are "prelu", or exactly "fc_prelu".
bool PluginFactory::isPlugin(const char* name)
{
    const bool hasPReLUPrefix = strncmp(name, "prelu", 5) == 0;
    if (hasPReLUPrefix)
    {
        return true;
    }
    return strcmp(name, "fc_prelu") == 0;
}
// Destroys every PReLU plugin owned by this factory and empties all slots.
//
// The slots are std::unique_ptr members. unique_ptr::release() only gives up
// ownership and returns the raw pointer without deleting it, so calling
// release() and discarding the result leaks each PReLULayer. reset() deletes
// the owned object and leaves the slot null, which also makes the explicit
// `= nullptr` assignments unnecessary.
//
// NOTE(review): must only be called once nothing else still uses the plugin
// pointers handed out by createPlugin() - confirm against the caller.
void PluginFactory::destroyPlugin()
{
    mPReLULayer1.reset();
    mPReLULayer2.reset();
    mPReLULayer3.reset();
    mPReLULayer4.reset();
    mPReLULayer4_1.reset();
    mPReLULayer5.reset();
}
/******************************/
// PReLU Plugin Layer Start
/******************************/
// Constructs the layer from parsed weights.
//
// Expects exactly one Weights entry (the slope) of type kFLOAT or kHALF.
// The slope values are deep-copied into a malloc'ed host buffer so that
// mSlope.values no longer points into the caller's buffer.
PReLULayer::PReLULayer(const Weights *weights, int nbWeights)
{
    assert(nbWeights == 1);

    // Copy count/type first; the values pointer is replaced below.
    mSlope = weights[0];
    assert(mSlope.type == DataType::kFLOAT || mSlope.type == DataType::kHALF);

    const size_t slopeBytes = mSlope.count*type2size(mSlope.type);
    void* hostCopy = malloc(slopeBytes);
    mSlope.values = hostCopy;
    memcpy(hostCopy, weights[0].values, slopeBytes);
}
// Reconstructs the layer from a buffer produced by serialize().
//
// Buffer layout (same order serialize() writes it): slope count, data type,
// C, H, W, element count, then the slope values stored as mDataType.
//
// buffer: start of the serialized plugin data.
// size:   length of that data in bytes; asserted to match the bytes consumed.
//
// Fixes relative to the previous version:
//  * mDeviceSlopeCount and mDeviceW were each cudaMalloc'ed twice, leaking
//    the first allocation; every device scalar is now allocated once.
//  * The scalar uploads copied from `d` after read() had been called on it.
//    read() is assumed to advance `d` past the value it reads (mirroring
//    write() in serialize()), so those copies uploaded the *next* field's
//    bytes. They now copy the value that was actually read.
//  * `size` was ignored; it is now checked against the expected length.
PReLULayer::PReLULayer(const void* buffer, size_t size)
{
    const char* const start = static_cast<const char*>(buffer);
    const char* d = start;

    // Parse the fixed-size header first so every host field is known before
    // anything is uploaded.
    read(d, mSlope.count);
    read(d, mDataType);
    read(d, mC);
    read(d, mH);
    read(d, mW);
    read(d, mCount);
    mDim = mW*mH;

    // The serialized slope is stored as mDataType and lives only on the
    // device after this constructor, so no host copy is kept.
    mSlope.type = mDataType;
    mSlope.values = nullptr;

    // `d` now points at the slope payload, which must be all that remains.
    const size_t slopeBytes = mSlope.count*type2size(mDataType);
    assert(d + slopeBytes == start + size);

    // The device scalars are int-sized; stage each value in a host int so
    // the upload does not depend on the host member's exact type.
    const int hostSlopeCount = static_cast<int>(mSlope.count);
    const int hostC = static_cast<int>(mC);
    const int hostH = static_cast<int>(mH);
    const int hostW = static_cast<int>(mW);

    CHECK(cudaMalloc(&mDeviceSlopeCount, sizeof(int)));
    CHECK(cudaMemcpy(mDeviceSlopeCount, &hostSlopeCount, sizeof(int), cudaMemcpyHostToDevice));

    CHECK(cudaMalloc(&mDeviceC, sizeof(int)));
    CHECK(cudaMemcpy(mDeviceC, &hostC, sizeof(int), cudaMemcpyHostToDevice));

    CHECK(cudaMalloc(&mDeviceH, sizeof(int)));
    CHECK(cudaMemcpy(mDeviceH, &hostH, sizeof(int), cudaMemcpyHostToDevice));

    CHECK(cudaMalloc(&mDeviceW, sizeof(int)));
    CHECK(cudaMemcpy(mDeviceW, &hostW, sizeof(int), cudaMemcpyHostToDevice));

    // Upload the slope values consumed by enqueue().
    CHECK(cudaMalloc(&mDeviceSlope, slopeBytes));
    CHECK(cudaMemcpy(mDeviceSlope, d, slopeBytes, cudaMemcpyHostToDevice));
}
// Writes the layer state into `buffer`.
//
// Field order: slope count, data type, C, H, W, element count, followed by
// the slope values converted to mDataType. Asserts that exactly
// getSerializationSize() bytes were written.
void PReLULayer::serialize(void* buffer)
{
    char* const a = static_cast<char*>(buffer); // start of the output buffer
    char* d = a;                                // write cursor

    write(d, mSlope.count);
    write(d, mDataType);
    write(d, mC);
    write(d, mH);
    write(d, mW);
    write(d, mCount);

    convertAndCopyToBuffer(d, mSlope);

    assert(d == a + getSerializationSize());
}
// Records the first input's dimensions and returns them as the output shape.
//
// Side effects: stores d[0], d[1], d[2] of the first input in mC, mH, mW and
// sets mCount to their product. `index` and `nbInputDims` are not inspected.
Dims PReLULayer::getOutputDimensions(int index, const Dims* inputDims, int nbInputDims)
{
    const Dims& firstInput = inputDims[0];

    mC = firstInput.d[0];
    mH = firstInput.d[1];
    mW = firstInput.d[2];
    mCount = mC*mH*mW;

    // The output has the same three extents as the input.
    return DimsCHW(firstInput.d[0], firstInput.d[1], firstInput.d[2]);
}
// Performs no work; always returns 0.
int PReLULayer::initialize()
{
    const int status = 0;
    return status;
}
void PReLULayer::convertAndCopyToBuffer(char*& buffer, const Weights& weights)
{
if (weights.type != mDataType)
for (int64_t v = 0; v < weights.count; ++v)
if (mDataType == DataType::kFLOAT)
reinterpret_cast<float*>(buffer)[v] = fp16::__half2float(static_cast<const __half*>(weights.values)[v]);
else
reinterpret_cast<__half*>(buffer)[v] = fp16::__float2half(static_cast<const float*>(weights.values)[v]);
else
memcpy(buffer, weights.values, weights.count * type2size(mDataType));
buffer += weights.count * type2size(mDataType);
}
// Runs the PReLU forward pass for one batch on `stream`.
//
// Forwards batchSize*mCount as the element count, together with mC, mDim,
// the input/output pointer arrays and the device slope buffer, to
// PReLUForward. The workspace argument is unused. Always returns 0.
int PReLULayer::enqueue(int batchSize, const void*const *inputs, void** outputs, void*, cudaStream_t stream)
{
    const auto totalElements = batchSize*mCount;
    PReLUForward(totalElements, mC, mDim, inputs, outputs, mDeviceSlope, stream);
    return 0;
}
// Returns the number of bytes serialize() writes: the slope count, the data
// type, four values the size of mW (mC, mH, mW, mCount), and the slope
// values stored as mDataType.
size_t PReLULayer::getSerializationSize()
{
    const size_t headerBytes = sizeof(mSlope.count) + 4*sizeof(mW) + sizeof(mDataType);
    const size_t slopeBytes = (mSlope.count) * type2size(mDataType);
    return headerBytes + slopeBytes;
}
// Intentionally does nothing: none of the arguments are used here.
void PReLULayer::configure(const Dims*inputs, int nbInputs, const Dims* outputs, int nbOutputs, int)
{
    // Mark the named parameters as deliberately unused.
    (void)inputs;
    (void)nbInputs;
    (void)outputs;
    (void)nbOutputs;
}
// Returns the element size in bytes for `type`: sizeof(float) for kFLOAT,
// sizeof(__half) for every other value.
size_t PReLULayer::type2size(DataType type)
{
    if (type == DataType::kFLOAT)
    {
        return sizeof(float);
    }
    return sizeof(__half);
}