I use #defines so that a literal ‘32’ does not appear in several places with different meanings (contexts); this also improves readability and maintainability.
It also makes it easier to test the performance of different block sizes, i.e. “threads per block”.
// Tuning constants for the kernel(s). Each value is named once here so that the
// same number is never hard-coded in several places with different meanings.

// Block size, i.e. "threads per block".
// NOTE(review): 512 == SEARCHTHREADS * MAXPERSEARCHTHREAD (32 * 16), and the
// previous 1024 == 32 * 32; presumably these must be changed together - confirm.
#define THREADSPERBLOCK 512
// Number of search threads; used as the upper bound when iterating over the
// per-search-thread entries of shCounts / shPosns.
#define SEARCHTHREADS 32
// Number of shPosns slots reserved per search thread (the stride between one
// search thread's slots and the next).
#define MAXPERSEARCHTHREAD 16
//#define MAXPERSEARCHTHREAD 32 // 16 works better, get 3 blocks per MP instead of 1, THREADSPERBLOCK also reduced from 1024
// NOTE(review): the use of OVERLAP is not visible in this excerpt - document its
// meaning/units where it is consumed.
#define OVERLAP 32
// NOTE(review): the use of INSTREAMS is not visible in this excerpt; presumably a
// count of input streams - confirm.
#define INSTREAMS 4
// Can change INSTREAMS
---- e.g. this —
// Visit search threads 1 .. SEARCHTHREADS-1 (index 0 is not visited by this loop).
for ( int tt = 1; tt < SEARCHTHREADS; tt++)
{
// Add the previous search thread's count to the running total.
used = used + shCounts[tt-1];
// Only threads with threadIdx.x below MAXPERSEARCHTHREAD read; each one reads
// its own slot within search thread tt's MAXPERSEARCHTHREAD-wide span of shPosns.
if ( threadIdx.x < MAXPERSEARCHTHREAD )
val = shPosns[tt*MAXPERSEARCHTHREAD + threadIdx.x];
…
}
— is preferable to —
// Same loop written with bare literals (the discouraged form): the 32 in the loop
// bound plays the role of SEARCHTHREADS, while the 32 in the threadIdx.x test and
// in the index stride plays the role of MAXPERSEARCHTHREAD. Nothing in the code
// tells the two meanings apart, so changing one without the other is error-prone.
for ( int tt = 1; tt < 32; tt++)
{
// Add the previous search thread's count to the running total.
used = used + shCounts[tt-1];
// Here 32 is the per-search-thread slot count, not the number of search threads.
if ( threadIdx.x < 32)
val = shPosns[tt*32+ threadIdx.x];
…
}