I am trying to implement a very simple image kernel operation. I know this is not optimal; I am just trying to learn how to do it the straightforward way first. Both the source and the destination are allocated in global device memory. However, for dir == DX and dir == DY I get exactly the same results. I am not sure why this fails to work; does anyone have a clue?
/**
 * Vertical 3-tap mean filter.
 *
 * Each thread writes one destination pixel: the integer mean (truncating
 * division by 3) of the source pixel at (x, y) and its neighbours in the
 * rows directly above and below.
 *
 * Launch layout: 2D grid of 2D blocks, one thread per pixel. The kernel has
 * no width/height parameters, so the region processed is exactly the
 * gridDim * blockDim extent and the caller must ensure it fits in both images.
 *
 * Border handling: neighbour rows are clamped to the rows covered by the
 * launch (edge replication), so row 0 never reads in front of the buffer and
 * the last launched row never reads past it.
 *
 * @param d_data   destination pixels (device pointer)
 * @param s_data   source pixels (device pointer)
 * @param d_pitch  destination row stride, used as a count of short elements
 * @param s_pitch  source row stride, used as a count of unsigned char elements
 *
 * NOTE(review): if the caller passes byte pitches (as cudaMallocPitch
 * returns), d_pitch is twice the required element stride for the short
 * destination -- confirm against the allocation code.
 */
__global__ void conv_mean_dy(short* d_data, unsigned char* s_data, int d_pitch, int s_pitch)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Number of rows this launch covers; used as the vertical clamp limit.
    int rows = gridDim.y * blockDim.y;
    int y_above = (y > 0) ? (y - 1) : 0;
    int y_below = (y + 1 < rows) ? (y + 1) : (rows - 1);

    // unsigned char operands promote to int, so the sum cannot overflow.
    int sum = s_data[s_pitch * y_above + x] +
              s_data[s_pitch * y + x] +
              s_data[s_pitch * y_below + x];

    d_data[d_pitch * y + x] = (short)(sum / 3);
}
/**
 * Horizontal 3-tap mean filter.
 *
 * Each thread writes one destination pixel: the integer mean (truncating
 * division by 3) of the source pixel at (x, y) and its left and right
 * neighbours in the same row.
 *
 * Launch layout: 2D grid of 2D blocks, one thread per pixel. The kernel has
 * no width/height parameters, so the region processed is exactly the
 * gridDim * blockDim extent and the caller must ensure it fits in both images.
 *
 * Border handling: neighbour columns are clamped to the columns covered by
 * the launch (edge replication), so column 0 no longer reads the tail of the
 * previous row (or memory in front of the buffer) and the last launched
 * column no longer reads into the next row.
 *
 * @param d_data   destination pixels (device pointer)
 * @param s_data   source pixels (device pointer)
 * @param d_pitch  destination row stride, used as a count of short elements
 * @param s_pitch  source row stride, used as a count of unsigned char elements
 *
 * NOTE(review): if the caller passes byte pitches (as cudaMallocPitch
 * returns), d_pitch is twice the required element stride for the short
 * destination -- confirm against the allocation code.
 */
__global__ void conv_mean_dx(short* d_data, unsigned char* s_data, int d_pitch, int s_pitch)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Number of columns this launch covers; used as the horizontal clamp limit.
    int cols = gridDim.x * blockDim.x;
    int x_left = (x > 0) ? (x - 1) : 0;
    int x_right = (x + 1 < cols) ? (x + 1) : (cols - 1);

    // All three taps come from the same source row.
    const unsigned char* src_row = s_data + s_pitch * y;

    // unsigned char operands promote to int, so the sum cannot overflow.
    int sum = src_row[x_left] + src_row[x] + src_row[x_right];

    d_data[d_pitch * y + x] = (short)(sum / 3);
}
The kernels are then launched like this:
// Launch geometry: BLOCK_SIZE x BLOCK_SIZE threads per block, one thread per
// destination pixel. The integer divisions round down, so when the image size
// is not a multiple of BLOCK_SIZE the remaining right/bottom strip is not
// processed. The kernels take no width/height, so rounding up here would make
// them write outside the image.
int2 grid_size;
grid_size.x = pDest->width / BLOCK_SIZE;
grid_size.y = pDest->height / BLOCK_SIZE;
dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
dim3 grid(grid_size.x, grid_size.y);

// Select the horizontal or vertical filter; both take the same arguments.
if (dir == DX)
    conv_mean_dx<<<grid, threads>>>((short*)(pDest->pixels), (unsigned char*)(pSrc->pixels), pDest->pitch, pSrc->pitch);
else
    conv_mean_dy<<<grid, threads>>>((short*)(pDest->pixels), (unsigned char*)(pSrc->pixels), pDest->pitch, pSrc->pitch);

{
    // A kernel launch reports nothing by itself. Configuration errors (for
    // example a zero-sized grid, or more threads per block than the device
    // supports) are only visible through cudaGetLastError(), and faults inside
    // the kernel only at the next synchronizing call. If the launch fails, the
    // destination is left untouched -- which looks like "both directions give
    // the same result".
    cudaError_t conv_status = cudaGetLastError();
    if (conv_status == cudaSuccess)
        conv_status = cudaDeviceSynchronize();  // replaces the deprecated cudaThreadSynchronize()
    if (conv_status != cudaSuccess)
        printf("conv_mean kernel failed: %s\n", cudaGetErrorString(conv_status));
}