Hello,
I am currently zero padding a batch of images using the below cuda kernel. I allocate a chunk of memory of the desired size full of 0’s, then use the kernel to move the smaller values into their respective positions. Unfortunately it is very slow when profiled giving me a time of 2ms + for the current settings. In the process of doing FFT convolution this padding takes more time than the rest of the operations. Ideally I would like to modify an FFT convolution algorithm to stay within the frequency domain only computing circular convolution, but without an efficient way to pad this will be a very slow process.
What is the fastest way to zero pad using Cuda?
/*
* Writing a Cuda kernel to pad the input
* Reference: https://devblogs.nvidia.com/even-easier-introduction-cuda/
*/
// Include CUDA runtime and CUFFT
#include<cuda_runtime.h>
#include<stdio.h>
#include<stdlib.h>
// CUDA Kernel With Threading
__global__
void Pad3DArray(int *img, int *padded, int batchSize, int imgH, int imgW, int padH, int padW)
{
int indX = blockIdx.x * blockDim.x + threadIdx.x;
int strideX = blockDim.x * gridDim.x;
int indY = blockIdx.y * blockDim.y + threadIdx.y;
int strideY = blockDim.y * gridDim.y;
int indZ = blockIdx.z * blockDim.z + threadIdx.z;
int strideZ = blockDim.z * gridDim.z;
for(int i = indX; i < batchSize; i += strideX)
{
for(int j = indY; j < imgH; j += strideY)
{
for(int k = indZ; k < imgW; k += strideZ)
{
padded[i*padH*padW + j*padW + k] = img[i*imgW*imgH + j*imgW + k];
}
}
}
}
int main()
{
int imgH = 64, imgW = 64;
int padH = imgH + 2, padW = imgW + 2;
int N = imgH * imgW;
int blockSize = 256;
int numBlocks = (N + blockSize - 1) / blockSize;
/*
* 3D Case
*/
printf("\n");
// Make an array of pointers to batchSize number of arrays
int *imgs, *pad3D;
int batchSize = 500;
cudaMallocManaged(&imgs, imgH * imgW * batchSize * sizeof(int));
cudaMallocManaged(&pad3D, padH * padW * batchSize * sizeof(int));
// Fill the array
for(int i = 0; i < batchSize * imgH * imgW; i++)
{
imgs[i] = i;
}
for(int i = 0; i < 50; i++)
{
// Run kernel
Pad3DArray<<<numBlocks,blockSize>>>(imgs,pad3D,batchSize,imgH,imgW,padH,padW);
}
cudaDeviceSynchronize();
cudaFree(imgs);
cudaFree(pad3D);
}