The approach I am trying is shown below:-

```
void GaussianFilterSeparable::applyXDirection(const float *input, float *output, const float *kernel, float sd, int dim, int W, int H) {
dim3 blockDims(16, 16, 1);
dim3 gridDims(static_cast<int>(ceil(W / blockDims.x)), static_cast<int>(ceil(H / blockDims.y)));
size_t sharedBufferSize = ((blockDims.y*(blockDims.x + 4*static_cast<int>(sd)))*dim);//TO-DO: remove hard coding channels.
filterGaussianX_device << <gridDims, blockDims, sharedBufferSize * sizeof(float) >> > (input, output, kernel, sd, dim, W, H, sharedBufferSize);
}
void GaussianFilterSeparable::applyYDirection(const float *input, float *output, const float *kernel, float sd, int dim, int W, int H) {
dim3 blockDims(16, 16, 1);
dim3 gridDims(static_cast<int>(ceil(W / blockDims.x)), static_cast<int>(ceil(H / blockDims.y)));
size_t sharedBufferSize = ((blockDims.y + 4 * static_cast<int>(sd))*blockDims.x)*dim;//TO-DO: remove hard coding channels.
filterGaussianY_device << <gridDims, blockDims, sharedBufferSize * sizeof(float) >> > (input, output, kernel, sd, dim, W, H, sharedBufferSize);
}
__global__
void filterGaussianX_device(const float *input, float *output, const float *kernel, float sd, int dim, int W, int H, size_t sharedSize) {
int x = blockIdx.x*blockDim.x + threadIdx.x;
int y = blockIdx.y*blockDim.y + threadIdx.y;
if (x >= W || y >= H) {
return;
}
extern __shared__ float distData[];
__syncthreads();
//Load data into shared memory.
int twoSD = 2 * (int)sd;
int idxLeftGlobal = x - twoSD;
int idxRightGlobal = x + twoSD;
int localIdxLeft = threadIdx.y*(blockDim.x + 2 * twoSD) + (threadIdx.x - twoSD);
int localIdxRight = threadIdx.y*(blockDim.x + 2 * twoSD) + (threadIdx.x + twoSD);
for (int i = 0; i < dim; i++) {
int sharedIdx = localIdxLeft*dim + i;
if (sharedIdx >= 0 && sharedIdx < sharedSize) {
distData[sharedIdx] = (idxLeftGlobal > 0) ? input[(y*W + idxLeftGlobal)*dim + i] : 0.0;
}
sharedIdx = localIdxRight*dim + i;
if (sharedIdx >= 0 && sharedIdx < sharedSize) {
distData[sharedIdx] = (idxRightGlobal < W) ? input[(y*W + idxRightGlobal)*dim + i] : 0.0;
}
}
__syncthreads();
//Do the convolution.
float *channelSums = new float[dim];
for (int i = 0; i < dim; i++) {
channelSums[i] = 0.0;
}
int rad = (int)sd;
float normaliser = 0.0;
int sharedIdx = 0;
for (int r = -rad; r <= rad; r++) {
normaliser += kernel[rad - r];
for (int i = 0; i < dim; i++) {
sharedIdx = (threadIdx.y*(blockDim.x + 2 * twoSD) + threadIdx.x + r)*dim + i;
if (sharedIdx >= 0 && sharedIdx < sharedSize) {
channelSums[i] += distData[sharedIdx] * kernel[rad - r];
}
}
}
if (normaliser > 0.0 || normaliser < 0.0) {
for (int i = 0; i < dim; i++) {
output[(y*W + x)*dim + i] = channelSums[i] / normaliser;
}
}
delete[] channelSums;
}
__global__
void filterGaussianY_device(const float *input, float *output, const float *kernel, float sd, int dim, int W, int H, size_t sharedSize) {
int x = blockIdx.x*blockDim.x + threadIdx.x;
int y = blockIdx.y*blockDim.y + threadIdx.y;
if (x >= W || y >= H) {
return;
}
extern __shared__ float distData[];
__syncthreads();
//Load data into shared memory.
int twoSD = 2 * (int)sd;
int idxUpGlobal = y - twoSD;
int idxDownGlobal = y + twoSD;
int localIdxUp = (threadIdx.y - twoSD)*blockDim.x + threadIdx.x;
int localIdxDown = (threadIdx.y + twoSD)*blockDim.x + threadIdx.x;
for (int i = 0; i < dim; i++) {
int sharedIdx = localIdxUp*dim + i;
if (sharedIdx >= 0 && sharedIdx < sharedSize) {
distData[sharedIdx] = (idxUpGlobal > 0) ? input[(idxUpGlobal*W + x)*dim + i] : 0.0;
}
sharedIdx = localIdxDown*dim + i;
if (sharedIdx >= 0 && sharedIdx < sharedSize) {
distData[sharedIdx] = (idxDownGlobal < H) ? input[(idxDownGlobal*W + x)*dim + i] : 0.0;
}
}
__syncthreads();
//Do the convolution.
float *channelSums = new float[dim];
for (int i = 0; i < dim; i++) {
channelSums[i] = 0.0;
}
int rad = (int)sd;
float normaliser = 0.0;
int sharedIdx = 0;
for (int r = -rad; r <= rad; r++) {
normaliser += kernel[rad - r];
for (int i = 0; i < dim; i++) {
sharedIdx = ((threadIdx.y + r)*blockDim.x + threadIdx.x)*dim + i;
if (sharedIdx >= 0 && sharedIdx < sharedSize) {
channelSums[i] += distData[sharedIdx] * kernel[rad - r];
}
}
}
if (normaliser > 0.0 || normaliser < 0.0) {
for (int i = 0; i < dim; i++) {
output[(y*W + x)*dim + i] = channelSums[i] / normaliser;
}
}
delete[] channelSums;
}
```

Essentially, I am loading for each thread in the block the two elements at the kernel size distance away on each side.

I am aware that it induces a race condition, but I believe it is benign.

The input is a 3D tensor, and the i,j,k element is accessed as follows:-

```
input[x][y][z] = (y*WIDTH + x)*DEPTH + z
```

Thanks for reading.