```
__forceinline__ __device__ void texPatch( volatile float s_roi[roiWidth][roiWidth + roiWidthPadding],
const int alephIndexX,
const int alephIndexY,
const int tavekIndexX,
const int tavekIndexY,
const int betIndexX,
const int betIndexY,
const int wrappedX,
const int wrappedY,
const int bitIndex,
const int outThread,
volatile unsigned int *out) {
// This assumes an 8x8 patch. As there are only 32 threads per warp, each thread will pull two values from each thread.
// The access pattern is interleaved to decrease the amount of shared memory padding necessary to avoid bank conflicts:
// each thread pulls a verticle pair from each patch.
const register float tavek0 = s_roi[tavekIndexY + wrappedY ][tavekIndexX + wrappedX]; // Tavek means "between".
const register float tavek1 = s_roi[tavekIndexY + wrappedY+1][tavekIndexX + wrappedX]; // It is our root patch.
const register float aleph0 = s_roi[alephIndexY + wrappedY ][alephIndexX + wrappedX]; // Aleph is "A"
const register float aleph1 = s_roi[alephIndexY + wrappedY+1][alephIndexX + wrappedX]; // Similarity to aleph is denoted by a bit set to 0
const register float bet0 = s_roi[betIndexY + wrappedY ][betIndexX + wrappedX]; // Bet is "B"
const register float bet1 = s_roi[betIndexY + wrappedY+1][betIndexX + wrappedX]; // Similarity to bet is denoted by a bit set to 1
// This variant is for sum of absolute differences... invariant to absolute pixel intensity scale
register float alephDiff0 = fabs(tavek0 - aleph0);
register float alephDiff1 = fabs(tavek1 - aleph1);
register float betDiff0 = fabs(tavek0 - bet0);
register float betDiff1 = fabs(tavek1 - bet1);
// Now we compute the sum of squared differences between both patch pairs.
// Now, differences:
// register float alephDiff0 = (tavek0 - aleph0);
// register float alephDiff1 = (tavek1 - aleph1);
// register float betDiff0 = (tavek0 - bet0);
// register float betDiff1 = (tavek1 - bet1);
//
// // Squared differences
// alephDiff0 *= alephDiff0;
// alephDiff1 *= alephDiff1;
// betDiff0 *= betDiff0;
// betDiff1 *= betDiff1;
alephDiff0 += alephDiff1; // Merge both interleaved squared differences, to make upcoming warp reduction faster
betDiff0 += betDiff1;
alephDiff0 -= betDiff0; // Easiest to just take this difference now, then reduce, then compare to 0. Same as reduce then compare relative to each other.
alephDiff0 += __shfl_xor(alephDiff0, 1);
alephDiff0 += __shfl_xor(alephDiff0, 2);
alephDiff0 += __shfl_xor(alephDiff0, 4);
alephDiff0 += __shfl_xor(alephDiff0, 8);
alephDiff0 += __shfl_xor(alephDiff0, 16); // By xor shfling, every thread has the resulting sum.
// One thread sets a specific bit high if tavek is closer to bet.
if (alephDiff0 < 0 && threadIdx.x == outThread) {
*out |= (1<<bitIndex);
}
}
// Launch as 32x32
__global__ void latch( const float *g_img,
const int *g_K,
unsigned int *g_D/*,
float *g_roi*/) {
volatile __shared__ int s_kpOffset;
volatile __shared__ float s_roi[roiHeight][roiWidth + roiWidthPadding];
volatile __shared__ unsigned int s_out[warpsPerBlock];
{
register int kpOffset;
{ // Give everyone the global memory offset to the bottom left corner of the image. (Assuming y-up, row-major.)
if (threadIdx.y == 0 && threadIdx.x < 2) { // 2 threads, 2 coordinates
register int k;
k = g_K[blockIdx.x*2 + threadIdx.x];
k -= _warpSize; // We want to index from bottom left of patch... saves us a lot of negative signs later.
if (k < 0) {
k = -999999; // Make sure the kpOffset will be negative, so everyone gets the signal to bail.
} else if (threadIdx.x == 0 && imgWidth-2*_warpSize < k) {
k = -999999;
}
if (threadIdx.x == 1) {
if (imgHeight-2*_warpSize < k) {
k = -999999;
} else {
k *= imgWidth;
}
}
k += __shfl_down(k, 1, 2);
if (threadIdx.x == 0) {
s_kpOffset = k;
kpOffset = k;
}
}
__threadfence_block();
__syncthreads();
if (threadIdx.y != 0 && threadIdx.x == 0) {
kpOffset = s_kpOffset;
}
kpOffset = __shfl(kpOffset, 0, _warpSize);
}
if (kpOffset < 0) {
return;
}
// 64 by 64 region of interest means four 32 by 32 loads.
s_roi[threadIdx.y ][threadIdx.x ] = g_img[kpOffset + ( threadIdx.y)*imgWidth + ( threadIdx.x)];
s_roi[threadIdx.y ][threadIdx.x + _warpSize] = g_img[kpOffset + ( threadIdx.y)*imgWidth + (_warpSize + threadIdx.x)];
s_roi[threadIdx.y + _warpSize][threadIdx.x ] = g_img[kpOffset + (_warpSize + threadIdx.y)*imgWidth + ( threadIdx.x)];
s_roi[threadIdx.y + _warpSize][threadIdx.x + _warpSize] = g_img[kpOffset + (_warpSize + threadIdx.y)*imgWidth + (_warpSize + threadIdx.x)];
}
register unsigned int out = 0;
const register int wrappedX = threadIdx.x % patchSize; // Offset for patch, interlaced to decrease padding needed for shared memory bank conflict avoidance
const register int wrappedY = 2 * (threadIdx.x / patchSize); // Each thread will use both wrappedY and wrappedY+1
__syncthreads();
__threadfence_block();
#pragma unroll
for (register int i=0; i<16; i++) {
texPatch(s_roi, tex2D(texRef, 6*i , threadIdx.y),
tex2D(texRef, 6*i+1, threadIdx.y),
tex2D(texRef, 6*i+2, threadIdx.y),
tex2D(texRef, 6*i+3, threadIdx.y),
tex2D(texRef, 6*i+4, threadIdx.y),
tex2D(texRef, 6*i+5, threadIdx.y),
wrappedX, wrappedY, 16*(threadIdx.y & 1) + i, 0, &out);
}
if (threadIdx.x == 0) { // In this case, only thread 0 ever has important data.
s_out[threadIdx.y] = out;
}
__syncthreads();
__threadfence_block();
if (threadIdx.y == 0) {
out = s_out[threadIdx.x]; // Warp 0 now has all the data we need to output.
__syncthreads();
__threadfence_block();
out |= __shfl_down(out, 1, _warpSize); // Each warp computed half a 32 bit word. Merge them before output.
out = __shfl(out, 2*threadIdx.x); // Only even threads have useful data after above shfl_down.
if (threadIdx.x < bitsPerDescriptor / bitsPerUInt32) { // 512 / 32 = 16
g_D[((paddingBitsPerDescriptor + bitsPerDescriptor) / bitsPerUInt32)*blockIdx.x + threadIdx.x] = out; // And that's it, with this write to global memory we're done with this descriptor.
}
}
}
```

This is the first pass at implementing this paper: http://arxiv.org/pdf/1501.03719.pdf

The function call out to texPatch has a lot of tex2D lookups that are terribly redundant… I will be fixing that, of course. I just switched over to using textures.