Can the cudaMemcpyAsync of the input be avoided for custom layers?

I observe that, for a custom plugin layer in C++, this is the flow:

// Plugin enqueue step as shown in the post: stages inputs[0] into a freshly
// allocated buffer before the (elided) layer work runs.
// Only inputs[0] and stream are used in the visible lines; batchSize, outputs
// and the unnamed void* parameter are not referenced in the shown portion.
int enqueue(int batchSize, const void* const* inputs, void** outputs, void*, cudaStream_t stream) override
    {
            // View the first input buffer as float data.
            const float* input = reinterpret_cast<const float*>(inputs[0]);
            float* inputTemp;
            // Allocates inputsize bytes on every enqueue call. cudaMalloc is
            // synchronous and comparatively slow, so this alone can dominate runtime.
            // NOTE(review): no matching cudaFree is visible in the shown portion --
            // confirm the buffer is released in the elided code, or allocate it once
            // and reuse it across calls.
            CHECK(cudaMalloc(&inputTemp, inputsize));
            // Copy of the input into inputTemp, enqueued on `stream` with kind
            // cudaMemcpyDeviceToDevice (i.e. `input` is treated as device memory).
            // NOTE(review): if the elided code only reads the input, it could
            // presumably read `input` directly, making both this copy and the
            // allocation above unnecessary -- confirm against the elided code.
            CHECK(cudaMemcpyAsync(inputTemp, input, inputsize, cudaMemcpyDeviceToDevice, stream));
            ......
    }

My question: can this copy somehow be avoided?

CHECK(cudaMemcpyAsync(inputTemp, input, inputsize, cudaMemcpyDeviceToDevice, stream))

;
I observed that this is a huge overhead at runtime.

cudaMemcpyAsync of input can be avoided