How can I speed up image resizing?

Does TensorRT provide a resize function?

I have written the resize in CUDA. It takes 6 ms to run, but the requirement is under 1 ms.

The resize code is called from enqueue:

#include <cuda.h>
#include <cuda_runtime.h>
#include "time.h"

// Bilinear resize of interleaved float images (channel index innermost, i.e.
// element (y, x, k) lives at (y * width + x) * depth + k).
//
// Sampling uses the half-pixel-centre mapping
//     srcCoord = (dstCoord + 0.5) * (srcSize / dstSize) - 0.5
// and clamps the coordinate to the source image, so border pixels replicate
// the nearest edge sample instead of extrapolating or reading out of bounds.
//
// Work distribution: destination pixels are visited with grid-stride loops
// over x and y, so any launch shape covers the whole image. Each thread writes
// all `depth` channels of the pixels it visits. The thread's z index selects
// which image of a batch is processed.
//
// Launch preconditions:
//   - gridDim.z * blockDim.z must not exceed the number of images stored in
//     src/dst; the kernel has no batch-size argument to check it against.
//   - src holds srcHeight * srcWidth * depth floats per image and dst holds
//     dstHeight * dstWidth * depth floats per image, stored back to back.
//   - No shared memory is used.
//
// T is unused; it is kept so existing resizeKernel<float> launches compile.
template <typename T>
__global__ void resizeKernel(const  float* src, int srcHeight, int srcWidth, int dstHeight,
                              int dstWidth, int depth,float * dst)
{
    // Empty source, empty destination or no channels: nothing to compute.
    if (srcHeight <= 0 || srcWidth <= 0 || dstHeight <= 0 || dstWidth <= 0 || depth <= 0) {
        return;
    }

    const float scale_x = (float) srcWidth / dstWidth;
    const float scale_y = (float) srcHeight / dstHeight;

    // Offsets are computed in size_t so large batches/images cannot overflow int.
    const size_t z = (size_t) blockIdx.z * blockDim.z + threadIdx.z;
    const float* srcImage = src + z * srcHeight * srcWidth * depth;
    float* dstImage = dst + z * dstHeight * dstWidth * depth;

    const size_t strideY = (size_t) gridDim.y * blockDim.y;
    const size_t strideX = (size_t) gridDim.x * blockDim.x;

    for (size_t y = (size_t) blockIdx.y * blockDim.y + threadIdx.y; y < (size_t) dstHeight;
         y += strideY) {
        // Clamp before truncating: the value is then non-negative, so the
        // int cast is a true floor and the fraction stays in [0, 1).
        float srcYf = (y + 0.5f) * scale_y - 0.5f;
        srcYf = fminf(fmaxf(srcYf, 0.0f), (float) (srcHeight - 1));
        const int sy0 = (int) srcYf;
        const int sy1 = (sy0 + 1 < srcHeight) ? sy0 + 1 : sy0;  // stay inside the image
        const float wy1 = srcYf - sy0;   // weight of row sy1
        const float wy0 = 1.0f - wy1;    // weight of row sy0

        const float* row0 = srcImage + (size_t) sy0 * srcWidth * depth;
        const float* row1 = srcImage + (size_t) sy1 * srcWidth * depth;
        float* dstRow = dstImage + y * dstWidth * depth;

        for (size_t x = (size_t) blockIdx.x * blockDim.x + threadIdx.x; x < (size_t) dstWidth;
             x += strideX) {
            float srcXf = (x + 0.5f) * scale_x - 0.5f;
            srcXf = fminf(fmaxf(srcXf, 0.0f), (float) (srcWidth - 1));
            const int sx0 = (int) srcXf;
            const int sx1 = (sx0 + 1 < srcWidth) ? sx0 + 1 : sx0;
            const float wx1 = srcXf - sx0;   // weight of column sx1
            const float wx0 = 1.0f - wx1;    // weight of column sx0

            const float* p00 = row0 + (size_t) sx0 * depth;
            const float* p01 = row0 + (size_t) sx1 * depth;
            const float* p10 = row1 + (size_t) sx0 * depth;
            const float* p11 = row1 + (size_t) sx1 * depth;
            float* out = dstRow + x * depth;

            // Interpolate directly in float. Source and destination are float,
            // so a fixed-point (x2048, >>22) path would only truncate results
            // to integers and overflow int for values of 512 or more.
            for (int k = 0; k < depth; k++) {
                const float top = wx0 * p00[k] + wx1 * p01[k];
                const float bottom = wx0 * p10[k] + wx1 * p11[k];
                out[k] = wy0 * top + wy1 * bottom;
            }
        }
    }
}


// Enqueues resizeKernel<float> on `stream` to resize a batch of float images.
//
// Parameters:
//   stream         CUDA stream the kernel is launched on.
//   image          device pointer to the input, reinterpreted as const float*.
//   batch_size     number of images; used as the grid's z dimension.
//   input_height,
//   input_width    size of each source image.
//   resize_height,
//   resize_width   size of each destination image.
//   depth          number of channels, passed through to the kernel.
//   output         device pointer to the output, reinterpreted as float*.
//
// Returns 0 when the launch was enqueued (or there was nothing to do) and 1
// when the arguments are unusable or the launch itself was rejected.
//
// The launch is asynchronous and this function does not synchronize, so
// faults raised while the kernel runs are not reported here.
int ResizeInference(cudaStream_t stream,  const void* image,
                    int batch_size, int input_height, int input_width, int resize_height,
                    int resize_width, int depth, void* output)
{
    // An empty output needs no launch; a zero-sized grid would be an invalid
    // launch configuration.
    if (batch_size <= 0 || resize_height <= 0 || resize_width <= 0 || depth <= 0) {
        return 0;
    }
    if (!image || !output || input_height <= 0 || input_width <= 0) {
        return 1;
    }

    // The kernel derives its pixel coordinates from the x and y thread
    // indices, so the grid has to be 2-D over the destination image. The
    // block is 32 wide so the 32 threads of a warp work on horizontally
    // adjacent pixels.
    const dim3 block(32, 8, 1);
    const dim3 grid((resize_width + block.x - 1) / block.x,
                    (resize_height + block.y - 1) / block.y,
                    batch_size);

    resizeKernel<float> <<< grid, block, 0, stream >>>(static_cast<const float*>(image),
                                                       input_height, input_width,
                                                       resize_height, resize_width, depth,
                                                       static_cast<float*>(output));

    // Kernel launches return no status; configuration errors (for example a
    // grid dimension above the device limit) surface through cudaGetLastError.
    return cudaGetLastError() == cudaSuccess ? 0 : 1;
}
