```
#ifndef _TEMPLATE_KERNEL_H_
#define _TEMPLATE_KERNEL_H_
#include <stdio.h>
/**
 * Applies a polynomial map with linear (R..) and quadratic (T...) coefficients
 * to six per-element coordinate arrays, updating five of them in place.
 *
 * Each thread handles exactly one element, at index
 * blockIdx.x * blockDim.x + threadIdx.x. Only the .x launch dimensions are
 * used, so the kernel expects a 1D grid of 1D blocks.
 *
 * Preconditions:
 *  - There is NO bounds guard: every array must hold at least
 *    gridDim.x * blockDim.x elements (the signature carries no element count,
 *    so the caller has to pad the data to a whole number of blocks).
 *  - All six pointers must be device-accessible.
 *
 * @param x, px, y, py, t  read at the thread's index and overwritten with the
 *                         mapped values.
 * @param dp               read only; this kernel never writes it back.
 * @param R11..R56         coefficients of the terms linear in the inputs.
 * @param R66              accepted for interface compatibility; not used here.
 * @param T111..T544       coefficients of the terms quadratic in the inputs.
 *
 * NOTE(review): the argument list is 6 pointers plus 54 floats (216 bytes of
 * floats, 264 bytes in total with 8-byte pointers). Older CUDA devices are
 * documented as limiting kernel arguments to 256 bytes - confirm the limit of
 * the target device/toolkit if results come back as NaN or unchanged.
 */
__global__ void testKernel( float* x, float* px, float* y, float* py, float* t, float* dp,
float R11, float R12, float R16, float R21, float R22, float R26, float R33, float R34, float R43, float R44, float R51, float R52, float R55, float R56, float R66,
float T111, float T112, float T122, float T116, float T126, float T166, float T133, float T134, float T144,
float T211, float T212, float T222, float T216, float T226, float T266, float T233, float T234, float T244,
float T313, float T314, float T323, float T324, float T336, float T346,
float T413, float T414, float T423, float T424, float T436, float T446,
float T511, float T512, float T522, float T516, float T526, float T566, float T533, float T534, float T544)
{
    // Flat global index. A plain multiply widened to size_t is used instead of
    // __mul24: __mul24 only multiplies the low 24 bits of its operands, so it
    // yields a wrong index once blockIdx.x needs more than 24 bits, and the
    // widening also keeps the product from wrapping at 32 bits.
    const size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x;

    // Load this element's six input coordinates from global memory once.
    const float xIn  = x[tid];
    const float pxIn = px[tid];
    const float yIn  = y[tid];
    const float pyIn = py[tid];
    const float tIn  = t[tid];
    const float dpIn = dp[tid];

    // Evaluate the map. Every output is computed from the *input* values, which
    // is why the results are held in locals until all five are ready.
    const float xOut = R11*xIn + R12*pxIn + R16*dpIn
        + T111*xIn*xIn + T112*xIn*pxIn + T122*pxIn*pxIn + T116*xIn*dpIn + T126*pxIn*dpIn
        + T166*dpIn*dpIn + T133*yIn*yIn + T134*yIn*pyIn + T144*pyIn*pyIn;

    const float pxOut = R21*xIn + R22*pxIn + R26*dpIn
        + T211*xIn*xIn + T212*xIn*pxIn + T222*pxIn*pxIn + T216*xIn*dpIn + T226*pxIn*dpIn
        + T266*dpIn*dpIn + T233*yIn*yIn + T234*yIn*pyIn + T244*pyIn*pyIn;

    const float yOut = R33*yIn + R34*pyIn
        + T313*xIn*yIn + T314*xIn*pyIn + T323*pxIn*yIn + T324*pxIn*pyIn + T336*yIn*dpIn + T346*pyIn*dpIn;

    const float pyOut = R43*yIn + R44*pyIn
        + T413*xIn*yIn + T414*xIn*pyIn + T423*pxIn*yIn + T424*pxIn*pyIn + T436*yIn*dpIn + T446*pyIn*dpIn;

    const float tOut = R51*xIn + R52*pxIn + R55*tIn + R56*dpIn
        + T511*xIn*xIn + T512*xIn*pxIn + T522*pxIn*pxIn + T516*xIn*dpIn + T526*pxIn*dpIn
        + T566*dpIn*dpIn + T533*yIn*yIn + T534*yIn*pyIn + T544*pyIn*pyIn;

    // Store the results back over the inputs; dp is intentionally left as read.
    x[tid]  = xOut;
    px[tid] = pxOut;
    y[tid]  = yOut;
    py[tid] = pyOut;
    t[tid]  = tOut;
}
#endif // #ifndef _TEMPLATE_KERNEL_H_
```

…and the kernel call…

```
// setup execution parameters: a 1D grid of num_blocks blocks, each a 1D block
// of num_threads threads (the kernel only uses the .x dimensions)
dim3 grid( num_blocks, 1, 1);
dim3 threads( num_threads, 1, 1);
// execute the kernel; the device arrays are updated in place
testKernel<<< grid, threads >>>( d_i_x, d_i_px, d_i_y, d_i_py, d_i_t, d_i_dp,
R11, R12, R16, R21, R22, R26, R33, R34, R43, R44, R51, R52, R55, R56, R66,
T111, T112, T122, T116, T126, T166, T133, T134, T144,
T211, T212, T222, T216, T226, T266, T233, T234, T244,
T313, T314, T323, T324, T336, T346,
T413, T414, T423, T424, T436, T446,
T511, T512, T522, T516, T526, T566, T533, T534, T544);
// A kernel launch returns no status of its own, so a rejected launch (bad
// configuration, oversized argument list, ...) would otherwise go unnoticed and
// the output arrays would simply hold stale or garbage data. Peek rather than
// "get" so the error state is left intact for any later check.
{
    const cudaError_t launchStatus = cudaPeekAtLastError();
    if (launchStatus != cudaSuccess) {
        fprintf(stderr, "CUDA error after testKernel launch: %s\n", cudaGetErrorString(launchStatus));
    }
}
```

I have also changed the C++ code that ensures the number of particles is a multiple of 256 using padding. Basically, if I change the number of threads per block to a different number, then exactly that many particles end up with a value of #QNAN. I guess that means that only the first ‘block’ is affected.