But
context.enqueue(batchSize, &buffers[0],stream,nullptr);
is async api,should not it return immediatly?
Such as
cudaMemcpyAsync()
?
I guess it can not return immediatly because of a custom plugin named “FlattenConcat” define in sappleUffSSD.cpp.
the code is
class FlattenConcat : public IPluginV2
{
public:
FlattenConcat(int concatAxis, bool ignoreBatch)
: mIgnoreBatch(ignoreBatch)
, mConcatAxisID(concatAxis)
{
assert(mConcatAxisID == 1 || mConcatAxisID == 2 || mConcatAxisID == 3);
}
//clone constructor
FlattenConcat(int concatAxis, bool ignoreBatch, int numInputs, int outputConcatAxis, int* inputConcatAxis)
: mIgnoreBatch(ignoreBatch)
, mConcatAxisID(concatAxis)
, mOutputConcatAxis(outputConcatAxis)
, mNumInputs(numInputs)
{
CHECK(cudaMallocHost((void**) &mInputConcatAxis, mNumInputs * sizeof(int)));
for (int i = 0; i < mNumInputs; ++i)
mInputConcatAxis[i] = inputConcatAxis[i];
}
FlattenConcat(const void* data, size_t length)
{
const char *d = reinterpret_cast<const char*>(data), *a = d;
mIgnoreBatch = read<bool>(d);
mConcatAxisID = read<int>(d);
assert(mConcatAxisID == 1 || mConcatAxisID == 2 || mConcatAxisID == 3);
mOutputConcatAxis = read<int>(d);
mNumInputs = read<int>(d);
CHECK(cudaMallocHost((void**) &mInputConcatAxis, mNumInputs * sizeof(int)));
CHECK(cudaMallocHost((void**) &mCopySize, mNumInputs * sizeof(int)));
std::for_each(mInputConcatAxis, mInputConcatAxis + mNumInputs, [&](int& inp) { inp = read<int>(d); });
mCHW = read<nvinfer1::DimsCHW>(d);
std::for_each(mCopySize, mCopySize + mNumInputs, [&](size_t& inp) { inp = read<size_t>(d); });
assert(d == a + length);
}
~FlattenConcat()
{
if (mInputConcatAxis)
CHECK(cudaFreeHost(mInputConcatAxis));
if (mCopySize)
CHECK(cudaFreeHost(mCopySize));
}
int getNbOutputs() const override { return 1; }
Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override
{
assert(nbInputDims >= 1);
assert(index == 0);
mNumInputs = nbInputDims;
CHECK(cudaMallocHost((void**) &mInputConcatAxis, mNumInputs * sizeof(int)));
mOutputConcatAxis = 0;
#ifdef SSD_INT8_DEBUG
std::cout << " Concat nbInputs " << nbInputDims << "\n";
std::cout << " Concat axis " << mConcatAxisID << "\n";
for (int i = 0; i < 6; ++i)
for (int j = 0; j < 3; ++j)
std::cout << " Concat InputDims[" << i << "]"
<< "d[" << j << " is " << inputs[i].d[j] << "\n";
#endif
for (int i = 0; i < nbInputDims; ++i)
{
int flattenInput = 0;
assert(inputs[i].nbDims == 3);
if (mConcatAxisID != 1)
assert(inputs[i].d[0] == inputs[0].d[0]);
if (mConcatAxisID != 2)
assert(inputs[i].d[1] == inputs[0].d[1]);
if (mConcatAxisID != 3)
assert(inputs[i].d[2] == inputs[0].d[2]);
flattenInput = inputs[i].d[0] * inputs[i].d[1] * inputs[i].d[2];
mInputConcatAxis[i] = flattenInput;
mOutputConcatAxis += mInputConcatAxis[i];
}
return DimsCHW(mConcatAxisID == 1 ? mOutputConcatAxis : 1,
mConcatAxisID == 2 ? mOutputConcatAxis : 1,
mConcatAxisID == 3 ? mOutputConcatAxis : 1);
}
int initialize() override
{
CHECK(cublasCreate(&mCublas));
return 0;
}
void terminate() override
{
CHECK(cublasDestroy(mCublas));
}
size_t getWorkspaceSize(int) const override { return 0; }
int enqueue(int batchSize, const void* const* inputs, void** outputs, void*, cudaStream_t stream) override
{
int numConcats = 1;
assert(mConcatAxisID != 0);
numConcats = std::accumulate(mCHW.d, mCHW.d + mConcatAxisID - 1, 1, std::multiplies<int>());
if (!mIgnoreBatch)
numConcats *= batchSize;
float* output = reinterpret_cast<float*>(outputs[0]);
int offset = 0;
for (int i = 0; i < mNumInputs; ++i)
{
const float* input = reinterpret_cast<const float*>(inputs[i]);
float* inputTemp;
CHECK(cudaMalloc(&inputTemp, mCopySize[i] * batchSize));
CHECK(cudaMemcpyAsync(inputTemp, input, mCopySize[i] * batchSize, cudaMemcpyDeviceToDevice, stream));
for (int n = 0; n < numConcats; ++n)
{
CHECK(cublasScopy(mCublas, mInputConcatAxis[i],
inputTemp + n * mInputConcatAxis[i], 1,
output + (n * mOutputConcatAxis + offset), 1));
}
CHECK(cudaFree(inputTemp));
offset += mInputConcatAxis[i];
}
return 0;
}
size_t getSerializationSize() const override
{
return sizeof(bool) + sizeof(int) * (3 + mNumInputs) + sizeof(nvinfer1::Dims) + (sizeof(mCopySize) * mNumInputs);
}
void serialize(void* buffer) const override
{
char *d = reinterpret_cast<char*>(buffer), *a = d;
write(d, mIgnoreBatch);
write(d, mConcatAxisID);
write(d, mOutputConcatAxis);
write(d, mNumInputs);
for (int i = 0; i < mNumInputs; ++i)
{
write(d, mInputConcatAxis[i]);
}
write(d, mCHW);
for (int i = 0; i < mNumInputs; ++i)
{
write(d, mCopySize[i]);
}
assert(d == a + getSerializationSize());
}
void configureWithFormat(const Dims* inputs, int nbInputs, const Dims* outputDims, int nbOutputs, nvinfer1::DataType type, nvinfer1::PluginFormat format, int maxBatchSize) override
{
assert(nbOutputs == 1);
mCHW = inputs[0];
assert(inputs[0].nbDims == 3);
CHECK(cudaMallocHost((void**) &mCopySize, nbInputs * sizeof(int)));
for (int i = 0; i < nbInputs; ++i)
{
mCopySize[i] = inputs[i].d[0] * inputs[i].d[1] * inputs[i].d[2] * sizeof(float);
}
}
bool supportsFormat(DataType type, PluginFormat format) const override
{
return (type == DataType::kFLOAT && format == PluginFormat::kNCHW);
}
const char* getPluginType() const override { return "FlattenConcat_TRT"; }
const char* getPluginVersion() const override { return "1"; }
void destroy() override { delete this; }
IPluginV2* clone() const override
{
return new FlattenConcat(mConcatAxisID, mIgnoreBatch, mNumInputs, mOutputConcatAxis, mInputConcatAxis);
}
void setPluginNamespace(const char* libNamespace) override { mNamespace = libNamespace; }
const char* getPluginNamespace() const override { return mNamespace.c_str(); }
private:
template <typename T>
void write(char*& buffer, const T& val) const
{
*reinterpret_cast<T*>(buffer) = val;
buffer += sizeof(T);
}
template <typename T>
T read(const char*& buffer)
{
T val = *reinterpret_cast<const T*>(buffer);
buffer += sizeof(T);
return val;
}
size_t* mCopySize = nullptr;
bool mIgnoreBatch{false};
int mConcatAxisID{0}, mOutputConcatAxis{0}, mNumInputs{0};
int* mInputConcatAxis = nullptr;
nvinfer1::Dims mCHW;
cublasHandle_t mCublas;
std::string mNamespace;
};
namespace
{
const char* FLATTENCONCAT_PLUGIN_VERSION{"1"};
const char* FLATTENCONCAT_PLUGIN_NAME{"FlattenConcat_TRT"};
} // namespace
class FlattenConcatPluginCreator : public IPluginCreator
{
public:
FlattenConcatPluginCreator()
{
mPluginAttributes.emplace_back(PluginField("axis", nullptr, PluginFieldType::kINT32, 1));
mPluginAttributes.emplace_back(PluginField("ignoreBatch", nullptr, PluginFieldType::kINT32, 1));
mFC.nbFields = mPluginAttributes.size();
mFC.fields = mPluginAttributes.data();
}
~FlattenConcatPluginCreator() {}
const char* getPluginName() const override { return FLATTENCONCAT_PLUGIN_NAME; }
const char* getPluginVersion() const override { return FLATTENCONCAT_PLUGIN_VERSION; }
const PluginFieldCollection* getFieldNames() override { return &mFC; }
IPluginV2* createPlugin(const char* name, const PluginFieldCollection* fc) override
{
const PluginField* fields = fc->fields;
for (int i = 0; i < fc->nbFields; ++i)
{
const char* attrName = fields[i].name;
if (!strcmp(attrName, "axis"))
{
assert(fields[i].type == PluginFieldType::kINT32);
mConcatAxisID = *(static_cast<const int*>(fields[i].data));
}
if (!strcmp(attrName, "ignoreBatch"))
{
assert(fields[i].type == PluginFieldType::kINT32);
mIgnoreBatch = *(static_cast<const bool*>(fields[i].data));
}
}
return new FlattenConcat(mConcatAxisID, mIgnoreBatch);
}
IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override
{
//This object will be deleted when the network is destroyed, which will
//call Concat::destroy()
return new FlattenConcat(serialData, serialLength);
}
void setPluginNamespace(const char* libNamespace) override { mNamespace = libNamespace; }
const char* getPluginNamespace() const override { return mNamespace.c_str(); }
private:
static PluginFieldCollection mFC;
bool mIgnoreBatch{false};
int mConcatAxisID;
static std::vector<PluginField> mPluginAttributes;
std::string mNamespace = "";
};
PluginFieldCollection FlattenConcatPluginCreator::mFC{};
std::vector<PluginField> FlattenConcatPluginCreator::mPluginAttributes;
REGISTER_TENSORRT_PLUGIN(FlattenConcatPluginCreator);
Am I right?
Is function “cublasScopy()” in “int enqueue(int batchSize, const void* const* inputs, void** outputs, void*, cudaStream_t stream)” a asynch api?