Hi,
I suspect the following code has some synchronization bugs that cause the result to be faulty. Any assistance is more than welcome.
Thanks in advance.
// Fragment from inside a kernel body: for each of smInputLoops iterations,
// thread 0 publishes a new input offset, every thread loads two samples at
// that offset, and each thread adds the square of its first sample to its own
// slot of smResults. After the loop each thread stores its slot to
// pDeviceOutput1.
//
// smInputLoops is a shared memory int
// NOTE(review): the other sm* variables are presumably __shared__ as well
// (naming only) -- their declarations and initialisation are not visible here.
// NOTE(review): smResults is only ever updated with +=, so it must hold its
// starting value (presumably 0) before this loop -- verify at the caller side
// of this fragment.
for ( int i = 0; i < smInputLoops; i++ )
{
// Block-wide barrier before thread 0 overwrites smInputPos / smInputIndex.
// On the first iteration it also orders whatever precedes this fragment.
// NOTE(review): on later iterations the barrier that follows the sample loads
// already separates the reads of smInputPos from this update, so this one
// looks redundant there -- confirm before removing.
__syncthreads();
// find where the input starts for the current iteration for all threads.
// Only thread 0 writes, so the update happens exactly once per iteration.
if ( 0 == threadIdx.x )
{
smInputPos = pDeviceInputPerVAGC[ smInputIndex ];
smInputIndex++;
}
// Make thread 0's new smInputPos visible to every thread in the block.
__syncthreads();
// load the shared information
// Each thread fills only its own threadIdx.x slot. The second sample is read
// constKernelParams[ 7 ] elements after the first.
// NOTE(review): iTimeIndex is defined outside this fragment; presumably a
// per-thread index -- confirm, and confirm both reads stay in bounds.
// NOTE(review): assumes smSample1/smSample2/smResults have at least
// blockDim.x elements -- TODO confirm.
smSample1[ threadIdx.x ] = pDeviceInput[ smInputPos + iTimeIndex ];
smSample2[ threadIdx.x ] = pDeviceInput[ smInputPos + constKernelParams[ 7 ] + iTimeIndex ];
// All reads of smInputPos are finished once every thread passes this barrier.
// The sample arrays themselves are only read back by the thread that wrote
// them in the visible code.
__syncthreads();
// Accumulate the results into shared memory
// NOTE(review): smSample2 is loaded above but never read in this fragment;
// the sum uses smSample1 squared. Confirm whether smSample1 * smSample2 (or a
// second accumulator) was intended.
smResults[ threadIdx.x ] += smSample1[ threadIdx.x ] * smSample1[ threadIdx.x ];
}
// Barrier after the loop; in the visible code each thread only reads back its
// own smResults slot below.
__syncthreads();
// Write accumulated result from shared memory to global memory
// One element per thread, at offset smOutputPos + threadIdx.x.
pDeviceOutput1[ smOutputPos + threadIdx.x ] = smResults[ threadIdx.x ];