Hi all,

I have a kernel which puzzles me, below is the kernel in its simplest way:

```
for ( int iTraceChunk = iTraceChunkStart; iTraceChunk < smTracesChunks; iTraceChunk++ )
{
__syncthreads();
int iCurrentTraceChunkSize = TRACE_SRXO_CHUNK_SIZE;
if ( threadIdx.x < iCurrentTraceChunkSize ) // One thread per trace - maybe should do 2 or 3 per trace ????.
{
// Do the algorithm's main calculation - uses ~30 registers. the smXXX paramters are the output params.
CalculateTraceParams( .... , smnb[ threadIdx.x ], smNb[ threadIdx.x ], smnumMoveSamples[ threadIdx.x ], smw1[ threadIdx.x ], smw2[ threadIdx.x ] );
}
__syncthreads();
////////////////////////////////////////////////////////////////////////////////////
// if I break out of the kernel now â€“ I get ~145ms which is amazing due to the fact that this is the
// main calculation including: 30 registers, asin, atan, sqrt etcâ€¦
////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////
// Continuing the kernel with the code below brings the time of the kernel to ~1900ms !!!!!!
// I donâ€™t get it. Its only additions and work with shared memory. Why would it take so much time?
////////////////////////////////////////////////////////////////////////////////////
// Go over the samples chunks.
for ( int iTimeChunk = 0; iTimeChunk < iTimeBlock; iTimeChunk++ )
{
float fOut1 = 0.0f, fOut2 = 0.0f, fOut3 = 0.0f;
int iTimeIndex = iTimeChunk * BLOCK_THREAD_SIZE + threadIdx.x;
// Go over the traces in the current chunk using the loop which I unrolled manually.
for( int iTraceIndex = 0; iTraceIndex < iCurrentTraceChunkSize; iTraceIndex += 8 )
{
int iCurrentTraceIndex = iTraceChunk * TRACE_SRXO_CHUNK_SIZE + iTraceIndex;
if ( ( smNb[ iTraceIndex ] != -1 ) && ( iTimeIndex >= smNb[ iTraceIndex ] ) && ( iTimeIndex < smNb[ iTraceIndex ] + smnumMoveSamples[ iTraceIndex ] ) )
{
fOut1 += smw1[ iTraceIndex ] + smw2[ iTraceIndex ];
fOut2 += smw1[ iTraceIndex ] - smw2[ iTraceIndex ];
fOut3 += smw1[ iTraceIndex ] * smw2[ iTraceIndex ];
}
iCurrentTraceIndex++;
if ( ( smNb[ iTraceIndex + 1 ] != -1 ) && ( iTimeIndex >= smNb[ iTraceIndex + 1 ] ) && ( iTimeIndex < smNb[ iTraceIndex + 1 ] + smnumMoveSamples[ iTraceIndex + 1 ] ) )
{
fOut1 += smw1[ iTraceIndex + 1 ] + smw2[ iTraceIndex + 1 ];
fOut2 += smw1[ iTraceIndex + 1 ] - smw2[ iTraceIndex + 1 ];
fOut3 += smw1[ iTraceIndex + 1 ] * smw2[ iTraceIndex + 1 ];
}
iCurrentTraceIndex++;
.......
.......
iCurrentTraceIndex++;
if ( ( smNb[ iTraceIndex + 7 ] != -1 ) && ( iTimeIndex >= smNb[ iTraceIndex + 7 ] ) && ( iTimeIndex < smNb[ iTraceIndex + 7 ] + smnumMoveSamples[ iTraceIndex + 7 ] ) )
{
fOut1 += smw1[ iTraceIndex + 7 ] + smw2[ iTraceIndex + 7 ];
fOut2 += smw1[ iTraceIndex + 7 ] - smw2[ iTraceIndex + 7 ];
fOut3 += smw1[ iTraceIndex + 7 ] * smw2[ iTraceIndex + 7 ];
}
}
// Write data to gmem
pTemp1[ smOutputPos + iTimeIndex ] += fOut1;
pTemp2[ smOutputPos + iTimeIndex ] += fOut2;
pTemp3[ smOutputPos + iTimeIndex ] += fOut3;
}
}
```

iTimeChunk == 4 and iCurrentTraceChunkSize == 80

Why does it take so much time? bank conflicts? if so what can be done?

Thanks in advance