Basic image kernel operation not working

I am trying to implement a very simple image kernel operation. I know this is not optimal; I am just trying to learn how to do it the straightforward way first. Both the source and the destination are allocated in global device memory. However, for dir == DX and dir == DY I get the EXACT same results. I am not sure why this fails to work — does anyone have a clue?

// Vertical 3-tap mean: each thread writes one destination pixel holding the
// integer average of the source pixel at (x, y) and its neighbours directly
// above and below.
//
// Launch layout: 2D grid of 2D blocks, one thread per pixel; (x, y) is the
// thread's global index. No shared memory is used.
//
// d_data  - destination pixels (short), row y starts at d_data + d_pitch * y
// s_data  - source pixels (unsigned char), row y starts at s_data + s_pitch * y
// d_pitch - destination row stride, used here as a count of short elements
// s_pitch - source row stride, used here as a count of unsigned char elements
//
// NOTE(review): the indexing treats both pitches as element counts. If
// d_pitch is a byte pitch (e.g. as returned by cudaMallocPitch), the short
// destination index is twice too large - TODO confirm against the caller.
__global__ void conv_mean_dy(short* d_data, unsigned char* s_data, int d_pitch, int s_pitch)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    // No image height is passed in, so the rows covered by the launch are the
    // only ones known to be addressable. Clamp the neighbour rows to that
    // range (edge replication) instead of reading row -1 or one row past the
    // last launched row.
    const int last_row = (int)(gridDim.y * blockDim.y) - 1;
    const int y_above = (y > 0) ? y - 1 : 0;
    const int y_below = (y < last_row) ? y + 1 : last_row;

    const int sum = s_data[s_pitch * y_above + x] +
                    s_data[s_pitch * y + x] +
                    s_data[s_pitch * y_below + x];

    d_data[d_pitch * y + x] = (short)(sum / 3);
}

// Horizontal 3-tap mean: each thread writes one destination pixel holding the
// integer average of the source pixel at (x, y) and its left and right
// neighbours on the same row.
//
// Launch layout: 2D grid of 2D blocks, one thread per pixel; (x, y) is the
// thread's global index. No shared memory is used.
//
// d_data  - destination pixels (short), row y starts at d_data + d_pitch * y
// s_data  - source pixels (unsigned char), row y starts at s_data + s_pitch * y
// d_pitch - destination row stride, used here as a count of short elements
// s_pitch - source row stride, used here as a count of unsigned char elements
//
// NOTE(review): the indexing treats both pitches as element counts. If
// d_pitch is a byte pitch (e.g. as returned by cudaMallocPitch), the short
// destination index is twice too large - TODO confirm against the caller.
__global__ void conv_mean_dx(short* d_data, unsigned char* s_data, int d_pitch, int s_pitch)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    // No image width is passed in, so the columns covered by the launch are
    // the only ones known to belong to this row. Clamp the neighbour columns
    // to that range (edge replication) instead of reading the previous row's
    // tail at x == 0 or whatever follows the last launched column.
    const int last_col = (int)(gridDim.x * blockDim.x) - 1;
    const int x_left = (x > 0) ? x - 1 : 0;
    const int x_right = (x < last_col) ? x + 1 : last_col;

    const unsigned char* src_row = s_data + s_pitch * y;
    const int sum = src_row[x_left] + src_row[x] + src_row[x_right];

    d_data[d_pitch * y + x] = (short)(sum / 3);
}

I then launch them like this:

// Launch configuration: one thread per destination pixel, grouped into
// BLOCK_SIZE x BLOCK_SIZE blocks.
// The integer divisions truncate: when width or height is not a multiple of
// BLOCK_SIZE, the remaining right-hand columns / bottom rows get no thread and
// are left unwritten. The kernels are not given the image size and so cannot
// bounds-check, which is why the grid is not rounded up here.
int2 grid_size;
grid_size.x = pDest->width / BLOCK_SIZE;
grid_size.y = pDest->height / BLOCK_SIZE;

dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
dim3 grid(grid_size.x, grid_size.y);

if (dir == DX)
    conv_mean_dx<<<grid, threads>>>((short*)(pDest->pixels), (unsigned char*)(pSrc->pixels), pDest->pitch, pSrc->pitch);
else
    conv_mean_dy<<<grid, threads>>>((short*)(pDest->pixels), (unsigned char*)(pSrc->pixels), pDest->pitch, pSrc->pitch);

// A kernel launch returns no status. A rejected configuration (too many
// threads per block, a zero-sized grid, ...) is only visible through
// cudaGetLastError(), and a fault inside the kernel only through the next
// synchronising call. Without these checks a failed launch silently leaves
// the destination untouched, whichever kernel was selected.
// cudaDeviceSynchronize() replaces the deprecated cudaThreadSynchronize().
cudaError_t conv_status = cudaGetLastError();
if (conv_status == cudaSuccess)
    conv_status = cudaDeviceSynchronize();
// conv_status now holds the first failure, if any; cudaGetErrorString(conv_status)
// gives the readable message. Report or propagate it before using pDest.

I am sorry, didn’t mean to triple post.