- Floating-point register (FPR) indexing within kernels should be efficient, assuming there is only a single dedicated integer register and 128 FPRs per hardware CUDA core:

```
// Grid size via ceil-division: covers all N elements without launching a
// wasted extra block when N is an exact multiple of nThreads (the original
// ( N / nThreads ) + 1 always added one block too many in that case).
const uint32_t nThreads = 512, nBlocks = ( N + nThreads - 1 ) / nThreads;
/*
...
// Guard: float can represent integers exactly only up to 2^24 (16777216),
// so a float-typed index is trustworthy only below that bound.
if ( ( nThreads * nBlocks ) > 16777215 ) { printf( "FPR addressing error!\n" ); return; }
...
*/
// Demo kernel: prints each thread's flat global index and its block index.
// Expects a 1D grid of 1D blocks. d_in is currently unused (kept for the
// example's interface). Device printf is for debugging only — it serializes.
__global__ void fooKernel( float *d_in )
{
// Compute the flat global index in integer arithmetic. The original used a
// float, which represents integers exactly only up to 2^24 and is the wrong
// type for addressing; unsigned int is exact for any realistic launch here.
const unsigned int tdx = blockIdx.x * blockDim.x + threadIdx.x;
// Original passed blockDim.x (block SIZE) while labelling it "block";
// blockIdx.x (block INDEX) is what the label promises. %u matches unsigned.
printf( "thread[%u].block[%u]\n", tdx, blockIdx.x );
}
```

- Cumulative error is prone to failing silently — even when 64-bit double is used. Please consider host code with sequential, compensated accumulation:

```
// Accurate sequential accumulation of a vector on the host.
//
// Uses Kahan (compensated) summation in double precision: the rounding error
// of each addition is captured in a compensation term and fed back into the
// next addition, so error does not accumulate linearly with the element count.
//
// This replaces the original split integer/fraction scheme, which had three
// defects: (1) its overflow test `llArr[0] + llArr[1] > LLONG_MAX` performed
// the signed addition first — undefined behavior on overflow — and could
// never evaluate true; (2) the fractional part accumulated in a plain float,
// reintroducing the cumulative error the routine was meant to remove; and
// (3) `LLONG_MAX * lloverflow` at the return overflowed as well.
//
// Parameters:
//   inVal - input values; read only, not modified.
// Returns:
//   The compensated sum of all elements, cast back to T.
//   An empty vector yields T(0).
template < typename T >
inline T localAccumulate( std::vector< T >& inVal )
{
double sum = 0.0; // running total
double comp = 0.0; // compensation: low-order bits lost by the last addition
for ( const T& x : inVal )
{
const double y = static_cast< double >( x ) - comp; // re-inject lost bits
const double t = sum + y;
comp = ( t - sum ) - y; // algebraically zero; in FP it is the rounding error
sum = t;
}
return static_cast< T >( sum );
}
```

Post scriptum: some C++11 / CUDA C++11 / Linux programming examples are available in a free repository:

https://github.com/PiotrLenarczykAnonim