I want to use TensorRT to accelerate my application. There is a PReLU layer, which TensorRT doesn't support, so I want to implement it myself as a plugin. I have referenced the demo.
The enqueue code looks like this:
// TensorRT plugin entry point: applies PReLU to inputs[0], writing outputs[0].
// Forwards batch size, weight count (used as channel count), and spatial dims
// to the host-side launcher.  Returns 0 (success) unconditionally.
//
// NOTE(review): the `stream` argument is ignored — calcPReLU launches its
// kernel on the default stream.  TensorRT enqueues the rest of the network on
// `stream`, so this kernel is not ordered with the surrounding layers and can
// read inputs[0] before the previous layer has produced it.  This alone can
// make "the result seem wrong"; the kernel must be launched on `stream`.
//
// NOTE(review): mPReLuWeights.values must be a *device* pointer here.  Weights
// handed to a plugin at build time are host memory; verify they were copied to
// the GPU (e.g. in initialize()) before being passed to the kernel.
virtual int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override
{
calcPReLU(reinterpret_cast<const float *>(inputs[0]), (float*)outputs[0],
reinterpret_cast<const float*>(mPReLuWeights.values),
batchSize, mPReLuWeights.count, width, height);
return 0;
}
The kernel code looks like this:
// PReLU element-wise kernel.
//
// The N×C×H×W tensor is treated as a 2D problem:
//   x — index within one channel plane (width  = H*W elements),
//   y — index of the plane             (height = N*C planes, batch-major),
// so y = b*channels + c and (y % channels) recovers the channel index.
// `weights` holds one learned slope per channel.
//
// Expected launch: 2D grid/block covering (width, height); threads past either
// edge exit early, so the grid may over-cover the data.
__global__ void calcPReLUKernel(const float * __restrict__ input,
                                float * __restrict__ output,
                                const float * __restrict__ weights,
                                int width, int height, int channels)
{
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    if (x >= width || y >= height) {
        return;
    }
    // Load once instead of three separate global reads: without __restrict__
    // the compiler could not assume input and output do not alias, and had to
    // keep the redundant loads.
    const int idx = y * width + x;
    const float v = input[idx];
    // PReLU: identity for positive values, per-channel slope otherwise.
    output[idx] = v > 0.0f ? v : v * weights[y % channels];
}
// Host-side launcher for calcPReLUKernel.
//
// Flattens the N×C×H×W tensor into a 2D launch: H*W columns (x) and N*C rows
// (y), one row per channel plane.  `channels` is forwarded so the kernel can
// map a row back to its per-channel slope via (row % channels).
//
// `stream` was added with a default of 0 (legacy default stream) so existing
// callers keep compiling, but code running inside a TensorRT enqueue() MUST
// pass the stream TensorRT provides — launching on the default stream leaves
// this kernel unordered with respect to the rest of the network and produces
// wrong results.
void calcPReLU(const float *input, float *output, const float* weights, int batchSize, int channels, int width, int height, cudaStream_t stream = 0)
{
    // 32x32 = 1024 threads per block (the hardware per-block maximum).
    // Ceil-divide both dimensions so the tail is covered; out-of-range threads
    // are guarded inside the kernel.
    dim3 blocks(32, 32);
    dim3 grids((width * height + 31) / 32, (batchSize * channels + 31) / 32);
    calcPReLUKernel<<<grids, blocks, 0, stream>>>(
        input, output, weights, width * height, channels * batchSize, channels);
    // NOTE: kernel launches fail silently; callers should check
    // cudaGetLastError() after this returns (and cudaStreamSynchronize(stream)
    // in debug builds) to surface bad-configuration or execution errors.
}
But the result seems wrong. How can I debug this?