About speed of image_resize?

I have written an implementation of image resize. Here is the code:

#include <cuda.h>
#include <cuda_runtime.h>
#include "time.h"

// Bilinear image-resize kernel: one thread per (x, y) output pixel of a
// single channel-interleaved (HWC) image.
//
// Expected launch: 2D grid/block covering dstWidth x dstHeight; the z
// dimension is unused. Preconditions: srcWidth >= 2 and srcHeight >= 2
// (the clamp below reads rows sy and sy+1, columns sx and sx+1).
//
// src: srcHeight x srcWidth x depth input, HWC layout
// dst: dstHeight x dstWidth x depth output, HWC layout
//
// Fixes vs. the original posting:
//  - cbufy[2] was read out of bounds on a short[2] (typo for cbufy[0]).
//  - The off-diagonal taps had their y/x weights cross-paired: the
//    (sy+1, sx) sample was weighted by cbufy[0]*cbufx[1] and (sy, sx+1)
//    by cbufy[1]*cbufx[0], i.e. swapped.
//  - The 11-bit fixed-point scheme ((int)(...) >> 22) is an 8-bit-pixel
//    trick (as in OpenCV); applied to float pixels it truncates nearly all
//    precision. Plain float weights are both correct and faster here.
//  - (int)f truncates toward zero, so a small negative source coordinate
//    kept sy == 0 with a negative fractional weight; floorf fixes that.
//  - The template parameter T was declared but unused; it now types the
//    pixel data (resizeKernel<float> keeps the original instantiation).
template <typename T>
__global__ void resizeKernel(const T* src, int srcHeight, int srcWidth, int dstHeight,
                             int dstWidth, int depth, T* dst)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= dstWidth || y >= dstHeight) return;

    const float scale_x = (float)srcWidth / dstWidth;
    const float scale_y = (float)srcHeight / dstHeight;

    // Source coordinate with half-pixel-center alignment, then split into
    // integer sample index and fractional blend weight.
    float fy = (y + 0.5f) * scale_y - 0.5f;
    int sy = (int)floorf(fy);
    fy -= sy;
    if (sy < 0)              { fy = 0.f; sy = 0; }
    if (sy >= srcHeight - 1) { fy = 0.f; sy = srcHeight - 2; }

    float fx = (x + 0.5f) * scale_x - 0.5f;
    int sx = (int)floorf(fx);
    fx -= sx;
    if (sx < 0)             { fx = 0.f; sx = 0; }
    if (sx >= srcWidth - 1) { fx = 0.f; sx = srcWidth - 2; }

    // Bilinear weights for the 2x2 neighborhood (row, col).
    const float w00 = (1.f - fy) * (1.f - fx);
    const float w01 = (1.f - fy) * fx;
    const float w10 = fy * (1.f - fx);
    const float w11 = fy * fx;

    const T* row0 = src + ((size_t)sy * srcWidth + sx) * depth;  // (sy,   sx)
    const T* row1 = row0 + (size_t)srcWidth * depth;             // (sy+1, sx)
    T* out = dst + ((size_t)y * dstWidth + x) * depth;

    for (int k = 0; k < depth; ++k) {
        out[k] = (T)(w00 * row0[k] + w01 * row0[depth + k] +
                     w10 * row1[k] + w11 * row1[depth + k]);
    }
}

// Host-side launcher: bilinearly resizes a batch of HWC float images on the
// given stream. Returns 0 (cudaSuccess) on success, a nonzero cudaError_t
// value if the launch configuration was rejected.
//
// Fixes vs. the original posting:
//  - `block` was declared twice, and `dim3 block(uint,uint);` is not valid
//    C++ — the function did not compile.
//  - batch_size was folded into a flat 1D grid, but the kernel indexes in
//    2D (x, y) and never consumes a batch index, so the launch geometry did
//    not match the kernel; each image is now launched separately with a
//    matching 2D grid.
//  - The launch result is now surfaced via cudaGetLastError() instead of
//    being silently dropped (still 0 on success, preserving the original
//    "return 0" contract for callers that check == 0).
int ResizeInference(cudaStream_t stream, const void* image,
                    int batch_size, int input_height, int input_width, int resize_height,
                    int resize_width, int depth, void* output)
{
    // 32x8 = 256 threads, x-major so consecutive threads of a warp touch
    // consecutive pixels along a row (coalesced global accesses).
    const dim3 block(32, 8);
    const dim3 grid((resize_width + block.x - 1) / block.x,
                    (resize_height + block.y - 1) / block.y);

    const float* src = static_cast<const float*>(image);
    float* dst = static_cast<float*>(output);
    const size_t src_image_size = (size_t)input_height * input_width * depth;
    const size_t dst_image_size = (size_t)resize_height * resize_width * depth;

    // One launch per image; all launches are asynchronous on `stream`, so
    // they queue back-to-back without host-side synchronization.
    for (int b = 0; b < batch_size; ++b) {
        resizeKernel<float><<<grid, block, 0, stream>>>(
            src + b * src_image_size, input_height, input_width,
            resize_height, resize_width, depth,
            dst + b * dst_image_size);
    }

    return static_cast<int>(cudaGetLastError());
}

I use the TensorRT enqueue function to call it. The running time of the resize is 6 ms. Is there some way to improve the speed? The requirement is under 1 ms.

You might consider the batched resize functions in the NVIDIA NPP library, which can resize every image in the batch with a single call:

https://docs.nvidia.com/cuda/npp/group__image__resize__batch.html