i got this kernel,
__device__ int4 matchblock(
int blockSize,
int posX,
int posY,
int dsplX,
int dsplY
){
int sum = 0;
int pX = posX + dsplX;
int pY = posY + dsplY;
// iterate over block
for (int y = 0; y < blockSize; y++){
for (int x = 0; x < blockSize; x++){
sum = fabsf( tex2D(img, pX + x, pY + y ) -
tex2D(imgRef, posX + x, posY + y ));
//sum += fabsf( tex2D(img, pX + x, pY + y ) -
// tex2D(imgRef, posX + x, posY + y ));
}
}
printf ("Summe = %d \n", sum);
int4 ret = make_int4(sum, pX , pY ,100);
return ret;
}
called by
_global__ void blockmatchul (
int *blkOrigins,
int blockSize,
int dspl,
int4 *values,
int pitch
){
// init thread indexing
int tx = -threadIdx.x;
int ty = threadIdx.y;
// block index for current block
int bx = blockIdx.x;
int by = blockIdx.y;
// block start positions
int curPosInGrid = by * gridDim.x + bx;
int curPosInBlock = threadIdx.y * blockDim.x + threadIdx.x;
int posX = blkOrigins[curPosInGrid * 2];
int posY = blkOrigins[curPosInGrid * 2 + 1];
// match single block and save result, positions to values
int4* vals = (int4*)((char*)values + curPosInGrid * pitch);
vals[curPosInBlock] = matchblock(blockSize, posX, posY, tx, ty);
__syncthreads();
// sort the results and write back the one with the lowest sum
}
with execution parameters. The value are so low for testing issues. normaly the threads are 16x16, and grid is 20x15.
dim3 threads( 1, 1);
dim3 grid(2,2);
working on textures with adressmodewrap, size 640x480, pixel values between 0 and 255. Normally it should sum up the square-difference of two blocks in an image. for testing i let it just compute the fabf of the two pixels, but the same happens, some values doesn´t match the expected.
So i compiled in deviceemulation mode an added some print-outs. It´s a litte bit weird
some times the sum is like expected (between 0 and 255) but some times the value is complete aut of range. (around 2147483647 some times negativ some positiv) The weird thing is, that the texture values involved in the sum are correct, only the sum gets this strange value.