I have written an image-resize implementation in CUDA. Here is the code:
#include <cuda.h>
#include <cuda_runtime.h>
#include "time.h"
/**
 * Bilinear resize of a single image stored row-major with interleaved
 * channels, i.e. element (row, col, k) lives at (row * width + col) * depth + k.
 * NOTE(review): TensorRT tensors are commonly planar (NCHW) - confirm that the
 * caller really hands this kernel channel-interleaved data.
 *
 * Launch layout: one thread per destination pixel, using the x and y grid
 * dimensions (x -> column, y -> row). Each thread writes all `depth` channels
 * of its pixel. No shared memory is used. Threads outside the destination
 * extent exit immediately, so the grid may overshoot the image.
 *
 * Sampling uses the half-pixel-centre mapping
 *     srcCoord = (dstCoord + 0.5) * (srcSize / dstSize) - 0.5
 * with clamp-to-edge at the borders. Interpolation weights are fixed point
 * with 11 fractional bits per axis (22 bits for a 2-D weight product).
 *
 * NOTE(review): the blended value is converted to int and shifted right by
 * 22 bits, so results are truncated to whole numbers and the conversion is
 * only well defined while |pixel| * 2^22 fits in a 32-bit int (|pixel| < 512).
 * This matches the original arithmetic; switch to plain float blending if the
 * input is not in an 8-bit-style value range.
 *
 * @tparam T        Not used by the body; pixels are always float.
 * @param src       Source image, srcHeight * srcWidth * depth floats.
 * @param srcHeight Source rows.
 * @param srcWidth  Source columns.
 * @param dstHeight Destination rows.
 * @param dstWidth  Destination columns.
 * @param depth     Channels per pixel.
 * @param dst       Destination image, dstHeight * dstWidth * depth floats.
 *                  Must not overlap src.
 */
template <typename T>
__global__ void resizeKernel(const float* __restrict__ src, int srcHeight, int srcWidth, int dstHeight,
                             int dstWidth, int depth, float* __restrict__ dst)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= dstWidth || y >= dstHeight) return;
    // Nothing can be sampled from an empty source, and there is nothing to write
    // without channels.
    if (srcHeight <= 0 || srcWidth <= 0 || depth <= 0) return;

    const int kWeightBits = 11;                // fractional bits per 1-D weight
    const int kWeightOne  = 1 << kWeightBits;  // fixed-point 1.0 (2048)

    const float scale_x = (float)srcWidth / dstWidth;
    const float scale_y = (float)srcHeight / dstHeight;

    // ---- vertical tap pair -------------------------------------------------
    float fy = (y + 0.5f) * scale_y - 0.5f;
    // floorf, not an (int) cast: the cast rounds toward zero, which leaves a
    // negative fraction for coordinates in (-1, 0) and yields weights outside
    // [0, 2048].
    int sy = (int)floorf(fy);
    fy -= sy;
    if (sy < 0) {
        sy = 0;
        fy = 0.f;
    }
    // At or beyond the last row: take the last row with full weight.
    if (sy >= srcHeight - 1) {
        sy = srcHeight - 1;
        fy = 0.f;
    }
    // Second tap is clamped so it never leaves the image (also covers height 1).
    const int sy1 = (sy + 1 < srcHeight) ? sy + 1 : sy;
    const int wy0 = (int)((1.f - fy) * kWeightOne);  // weight of row sy
    const int wy1 = kWeightOne - wy0;                // weight of row sy1

    // ---- horizontal tap pair -----------------------------------------------
    float fx = (x + 0.5f) * scale_x - 0.5f;
    int sx = (int)floorf(fx);
    fx -= sx;
    if (sx < 0) {
        sx = 0;
        fx = 0.f;
    }
    if (sx >= srcWidth - 1) {
        sx = srcWidth - 1;
        fx = 0.f;
    }
    const int sx1 = (sx + 1 < srcWidth) ? sx + 1 : sx;
    const int wx0 = (int)((1.f - fx) * kWeightOne);  // weight of column sx
    const int wx1 = kWeightOne - wx0;                // weight of column sx1

    // 2-D weights: row weight times column weight of the SAME tap. Each
    // product is at most 2^22, so the float conversion is exact.
    const float w00 = (float)(wy0 * wx0);  // (sy , sx )
    const float w01 = (float)(wy0 * wx1);  // (sy , sx1)
    const float w10 = (float)(wy1 * wx0);  // (sy1, sx )
    const float w11 = (float)(wy1 * wx1);  // (sy1, sx1)

    // Offsets are computed in size_t so large images cannot overflow int.
    const size_t row0 = (size_t)sy * srcWidth;
    const size_t row1 = (size_t)sy1 * srcWidth;
    const float* p00 = src + (row0 + sx) * depth;
    const float* p01 = src + (row0 + sx1) * depth;
    const float* p10 = src + (row1 + sx) * depth;
    const float* p11 = src + (row1 + sx1) * depth;
    float* out = dst + ((size_t)y * dstWidth + x) * depth;

    for (int k = 0; k < depth; ++k) {
        const float acc = p00[k] * w00 + p01[k] * w01 + p10[k] * w10 + p11[k] * w11;
        // Drop the 22 fractional bits of the fixed-point weight product.
        out[k] = (float)((int)acc >> (2 * kWeightBits));
    }
}
/**
 * Enqueues a bilinear resize of `batch_size` images on `stream`.
 *
 * Images are treated as densely packed one after another, each holding
 * height * width * depth floats. The call is asynchronous: it only queues
 * work and does not wait for the stream.
 * NOTE(review): `image` and `output` are assumed to be device pointers to
 * float data - confirm against the caller.
 *
 * resizeKernel derives the destination column from the x grid dimension and
 * the row from the y grid dimension, and handles one image per launch, so it
 * is launched once per batch item with a 2-D grid covering
 * resize_width x resize_height.
 *
 * @param stream        CUDA stream to launch on.
 * @param image         Input buffer (batch_size * input_height * input_width * depth floats).
 * @param batch_size    Number of images.
 * @param input_height  Source rows.
 * @param input_width   Source columns.
 * @param resize_height Destination rows.
 * @param resize_width  Destination columns.
 * @param depth         Channels per pixel.
 * @param output        Output buffer (batch_size * resize_height * resize_width * depth floats).
 * @return 0 on success (including an empty request), non-zero if the
 *         arguments are invalid or a kernel launch was rejected.
 */
int ResizeInference(cudaStream_t stream, const void* image,
                    int batch_size, int input_height, int input_width, int resize_height,
                    int resize_width, int depth, void* output)
{
    if (batch_size < 0 || resize_height < 0 || resize_width < 0 || depth < 0) {
        return 1;
    }
    // Empty output: nothing to do.
    if (batch_size == 0 || resize_height == 0 || resize_width == 0 || depth == 0) {
        return 0;
    }
    if (image == nullptr || output == nullptr || input_height <= 0 || input_width <= 0) {
        return 1;
    }

    const float* src = static_cast<const float*>(image);
    float* dst = static_cast<float*>(output);

    // Per-image element counts, in size_t so large batches cannot overflow int.
    const size_t srcImageSize = (size_t)input_height * input_width * depth;
    const size_t dstImageSize = (size_t)resize_height * resize_width * depth;

    // 32 x 8 = 256 threads per block; a warp spans 32 consecutive columns of
    // one output row.
    const dim3 threads(32, 8, 1);
    const dim3 blocks((resize_width + threads.x - 1) / threads.x,
                      (resize_height + threads.y - 1) / threads.y,
                      1);

    for (int n = 0; n < batch_size; ++n) {
        resizeKernel<float><<<blocks, threads, 0, stream>>>(
            src + n * srcImageSize, input_height, input_width,
            resize_height, resize_width, depth,
            dst + n * dstImageSize);
    }

    // Kernel launches do not return a status; a bad configuration is reported here.
    return (cudaGetLastError() == cudaSuccess) ? 0 : 1;
}
I call it from the TensorRT enqueue function. The resize currently takes 6 ms. Is there any way to improve its speed? The requirement is under 1 ms.