when to use shared memory

Hello, does using shared memory always perform better? I am trying to do a convolution calculation on a 3D volume along the x, y, and z directions. The 3D volume is stored as a linear memory buffer, ordered row first, column second, and slice third. I use the kernels below with the following block/thread configuration: a 2D grid (256 by 256 blocks) and a 1D block (256 threads), so each thread accesses one voxel of the 256×256×256 volume. To speed up the convolution I load the current row or column into shared memory (I am doing a separable Gaussian filter here). With shared memory, the first kernel gaussianFilterX gets a speedup, but the second kernel gaussianFilterY actually slows down considerably. The two kernels are identical except for the mapping of blockIdx and threadIdx to the x, y, z coordinates in the volume.
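In case it matters, this is roughly how I launch the two kernels from the host; the names d_a, d_b, d_kernel and the half-width value are just illustrative of my setup:

// d_a, d_b, d_kernel are assumed already allocated with cudaMalloc and filled
dim3 grid(256, 256);                       // blockIdx.x / blockIdx.y cover two axes of the volume
dim3 block(256);                           // threadIdx.x covers the remaining axis
size_t sharedBytes = 256 * sizeof(float);  // one row or column of the volume per block

// timing each pass separately; d_kernel holds the Gaussian taps
gaussianFilterX<<<grid, block, sharedBytes>>>(d_b, d_a, d_kernel, halfKernelWidth);
gaussianFilterY<<<grid, block, sharedBytes>>>(d_b, d_a, d_kernel, halfKernelWidth);
cudaDeviceSynchronize();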

Any help? I am not very familiar with CUDA yet.

Thanks in advance!

__global__ void gaussianFilterX( float* d_b, float* d_a, float* kernel, int halfKernelWidth)
// this one gets sped up by shared memory
{
    int x, y, z;

    z = blockIdx.y;
    y = blockIdx.x;
    x = threadIdx.x;   // each thread handles one voxel along the x (row) direction

    int imageSize = 256*256;
    int lineSize = 256;

    int xx;
    float sum = 0.0f;

    // stage the current row in shared memory (float, to match the volume data)
    extern __shared__ float s_data[];
    s_data[threadIdx.x] = d_a[z*imageSize + y*lineSize + x];
    __syncthreads();

    for (int i = -halfKernelWidth; i <= halfKernelWidth; i++)
    {
        // wrap around the 256-wide line (the bitmask works because 256 is a power of two)
        xx = (threadIdx.x + i + 256) & 255;

        sum += s_data[xx] * kernel[i+halfKernelWidth];
    }

    d_b[z*imageSize + y*lineSize + x] = sum;
}

__global__ void gaussianFilterY( float* d_b, float* d_a, float* kernel, int halfKernelWidth)
// this one is slowed down by shared memory
{
    int x, y, z;

    z = blockIdx.y;
    x = blockIdx.x;
    y = threadIdx.x;   // each thread handles one voxel along the y (column) direction

    int imageSize = 256*256;
    int lineSize = 256;

    int xx;
    float sum = 0.0f;

    // stage the current column in shared memory (float, to match the volume data)
    extern __shared__ float s_data[];
    s_data[threadIdx.x] = d_a[z*imageSize + y*lineSize + x];
    __syncthreads();

    for (int i = -halfKernelWidth; i <= halfKernelWidth; i++)
    {
        // wrap around the 256-wide column (the bitmask works because 256 is a power of two)
        xx = (threadIdx.x + i + 256) & 255;

        sum += s_data[xx] * kernel[i+halfKernelWidth];
    }

    d_b[z*imageSize + y*lineSize + x] = sum;
}