I observe that for a custom plugin layer in C++, this is the flow:
// Plugin enqueue override: stages the first input into a newly allocated device
// buffer (inputTemp) before the processing that is elided below.
// NOTE(review): the signature looks like a TensorRT IPlugin-style enqueue, in which
// case the unnamed void* parameter would be the plugin workspace — confirm.
int enqueue(int batchSize, const void* const* inputs, void** outputs, void*, cudaStream_t stream) override
{
// Reinterpret the first input binding as float data.
const float* input = reinterpret_cast<const float*>(inputs[0]);
float* inputTemp;
// Allocates inputsize bytes of device memory on every call.
// NOTE(review): cudaMalloc is synchronous and comparatively slow, and no matching
// cudaFree is visible in this excerpt — confirm the buffer is released, or allocate
// it once outside enqueue (or use the plugin workspace) and reuse it.
CHECK(cudaMalloc(&inputTemp, inputsize));
// Device-to-device copy of inputsize bytes from the input into inputTemp, issued on `stream`.
// NOTE(review): if the elided code only reads the data, `input` could presumably be
// used directly and this copy dropped; it is only required if the input must not be
// modified in place — confirm against the elided code.
CHECK(cudaMemcpyAsync(inputTemp, input, inputsize, cudaMemcpyDeviceToDevice, stream));
......
}
My question: can this copy somehow be avoided?
CHECK(cudaMemcpyAsync(inputTemp, input, inputsize, cudaMemcpyDeviceToDevice, stream))
;
I observed that this is a huge overhead at runtime.