I have written a Sum of Absolute Differences (SAD) kernel that reads its inputs through textures. The search areas are generally about 100x100 and the masks about 50x50, so each thread performs roughly 50x50 difference calculations. Can I use shared memory for this in addition to textures? Would that speed it up at all, or is texture memory already fairly fast for this access pattern? Could I load the two 50x50 arrays into shared memory in one loop, synchronise the threads, and then run a second loop to compute the differences? Would that be faster or slower than the code below? Any help would be much appreciated.
/**
 * Sum-of-absolute-differences (SAD) kernel.
 *
 * Each thread handles one candidate offset (x, y) and accumulates
 * |texImage(x + S_x + i, y + S_y + j) - texRef(i, j)| over the
 * MS_x x MS_y mask window, then stores the sum in `output`.
 *
 * Launch layout: 2D grid of 2D blocks covering at least
 * (SS_x - MS_x) x (SS_y - MS_y) threads; surplus threads exit early.
 * No shared memory is used.
 *
 * `texImage` and `texRef` are not declared in this block; they are
 * presumably file-scope 2D float textures bound by the host before
 * launch -- TODO confirm against the rest of the file.
 *
 * @param output  Result buffer, row-major with a row length of
 *                (SS_x - MS_x); must hold (SS_x - MS_x) * (SS_y - MS_y) floats.
 * @param W, H    Unused by this kernel (kept for interface compatibility).
 * @param SS_x, SS_y  Search-area extent in x and y.
 * @param MS_x, MS_y  Mask extent in x and y.
 * @param S_x, S_y    Offset of the search area's origin inside texImage.
 * @param M_x, M_y    Unused by this kernel (kept for interface compatibility).
 *
 * NOTE(review): a mask of size MS inside a search area of size SS has
 * SS - MS + 1 valid placements per axis; this kernel evaluates only
 * SS - MS of them. The extent is left as-is because it also defines the
 * output layout callers rely on -- confirm whether that is intended.
 */
__global__ void SAD(float* output, int W, int H, int SS_x, int SS_y, int MS_x, int MS_y, int S_x, int S_y, int M_x, int M_y)
{
    // Compute the output extents as signed values first. Comparing an
    // unsigned thread index directly against a negative extent would convert
    // the extent to a huge unsigned number and let every thread through.
    const int out_w = SS_x - MS_x;
    const int out_h = SS_y - MS_y;
    if (out_w <= 0 || out_h <= 0)
    {
        return;
    }

    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads outside the output extent have nothing to do.
    if (x >= (unsigned int)out_w || y >= (unsigned int)out_h)
    {
        return;
    }

    // Top-left corner of this thread's window inside the image texture.
    const int x_image_base = (int)x + S_x;
    const int y_image_base = (int)y + S_y;

    float Total = 0.0f;

    // The accumulation order (i outer, j inner) is kept unchanged so the
    // floating-point sum matches the previous implementation.
    for (int i = 0; i < MS_x; i++)
    {
        const int x_image = x_image_base + i;
        for (int j = 0; j < MS_y; j++)
        {
            const int y_image = y_image_base + j;
            // fabsf is the single-precision absolute value; it avoids any
            // ambiguity with the integer abs overload.
            Total += fabsf(tex2D(texImage, x_image, y_image) - tex2D(texRef, i, j));
        }
    }

    // Row-major store; index widened to size_t to avoid 32-bit wraparound.
    output[(size_t)out_w * y + x] = Total;
}