Hello,
In the following simple kernel, each column (i) is subtracted from column (i+1).
Then the squared magnitude (Re*Re + Im*Im) of the result is written to the destination.
The code works correctly on a matrix with 12800 rows and 40 columns.
I checked the data at the start and at the end of the destination buffer; all of it is correct.
Should I call cudaDeviceSynchronize() after launching such a kernel?
I measured the elapsed time with and without cudaDeviceSynchronize().
With cudaDeviceSynchronize(), the measured time is much longer.
/**************************************************************************************/
/*
 * mat_col_sub_kernel
 *
 * For every row r and every column c in [1, Cols-1] of a row-major source
 * matrix (Cols elements per row), computes the difference
 *     d = pSrc[r][c] - pSrc[r][c-1]
 * component-wise on .Re and .Im, and stores d.Re*d.Re + d.Im*d.Im into
 * pDest[r*(Cols-1) + (c-1)], i.e. the destination has Cols-1 entries per row.
 *
 * The differences and the sum of squares are held in uint32_t, so the
 * arithmetic wraps modulo 2^32 for large differences.
 * NOTE(review): the field types of ComplexInt32 are not visible here;
 * presumably int32_t Re/Im - confirm that wrap-around is acceptable.
 *
 * Parameters:
 *   pSrc  - source matrix, Rows*Cols elements, must be device-accessible.
 *   pDest - destination, Rows*(Cols-1) elements, must be device-accessible.
 *   Cols  - number of source columns; nothing is written when Cols < 2.
 *   Rows  - number of rows; nothing is written when Rows < 1.
 *
 * Launch layout: any 1D or 2D grid/block shape produces the complete result.
 * The x and y thread coordinates are flattened into one linear id and each
 * thread strides over the flat output index. Threads with consecutive linear
 * ids (threadIdx.x varies fastest) therefore touch consecutive output
 * elements and nearly consecutive source elements, instead of addresses that
 * are Cols elements apart. The z dimension is not used for work distribution.
 * No shared memory is required.
 */
__global__ void mat_col_sub_kernel (ComplexInt32 *pSrc, uint32_t *pDest, int Cols, int Rows)
{
    /* Signed check first: a negative dimension must not be turned into a
       huge unsigned bound. */
    if (Cols < 2 || Rows < 1)
        return;

    /* size_t indices so Rows*Cols cannot wrap in 32 bits. */
    const size_t srcCols = (size_t)Cols;
    const size_t outCols = srcCols - 1;
    const size_t total   = (size_t)Rows * outCols;

    /* Linear id of this thread within the whole (x, y) launch. */
    const size_t threadsPerBlock = (size_t)blockDim.x * blockDim.y;
    const size_t blockId         = (size_t)blockIdx.y * gridDim.x + blockIdx.x;
    const size_t threadInBlock   = (size_t)threadIdx.y * blockDim.x + threadIdx.x;
    const size_t first           = blockId * threadsPerBlock + threadInBlock;
    const size_t stride          = threadsPerBlock * gridDim.x * gridDim.y;

    for (size_t out = first; out < total; out += stride)
    {
        const size_t row = out / outCols;
        const size_t col = out % outCols + 1;      /* source column, 1..Cols-1 */
        const size_t idx = row * srcCols + col;    /* col >= 1, so idx-1 stays in the same row */

        const uint32_t Re = pSrc[idx].Re - pSrc[idx-1].Re;
        const uint32_t Im = pSrc[idx].Im - pSrc[idx-1].Im;

        pDest[out] = Re*Re+Im*Im;
    }
}
/**************************************************************************************/
/*
 * mat_col_sub
 *
 * Host-side launcher for mat_col_sub_kernel. Builds a 2D launch of
 * DIMX x DIMY thread blocks that covers at least Rows x Cols threads, and
 * records wall-clock timestamps around the work in the file-level variables
 * Before and After (defined outside this block).
 *
 * Parameters:
 *   pSrc, pDest - passed through to the kernel unchanged; they must be
 *                 pointers the device can dereference.
 *   Cols, Rows  - matrix dimensions. When Rows < 1 or Cols < 2 there is no
 *                 output element, so no kernel is launched (a grid dimension
 *                 of 0 would be an invalid launch configuration).
 *
 * Timing: a kernel launch returns to the host before the kernel has run, so
 * a timestamp taken right after the launch only measures launch overhead.
 * The function therefore waits for the device before taking After, which
 * makes the Before..After interval include the kernel execution. This also
 * means the function is blocking.
 *
 * Errors: the function returns void and does not call cudaGetLastError(), so
 * a launch or execution error stays recorded in the runtime and the caller
 * can query it with cudaGetLastError() after this function returns.
 */
void mat_col_sub (ComplexInt32 *pSrc, uint32_t *pDest, int Cols, int Rows)
{
    const bool hasWork = (Rows > 0) && (Cols > 1);

    clock_gettime (CLOCK_REALTIME, &Before);
    if (hasWork)
    {
        dim3 dimBlock(DIMX, DIMY);
        dim3 dimGrid;
        /* Ceiling division so partial blocks at the edges are included. */
        dimGrid.x = ((unsigned int)Rows + dimBlock.x - 1) / dimBlock.x;
        dimGrid.y = ((unsigned int)Cols + dimBlock.y - 1) / dimBlock.y;

        mat_col_sub_kernel <<<dimGrid, dimBlock>>> (pSrc, pDest, Cols, Rows);

        /* Wait for the kernel to finish so After reflects execution time,
           not just the asynchronous launch. Status intentionally left for
           the caller (see header comment). */
        (void) cudaDeviceSynchronize ();
    }
    clock_gettime (CLOCK_REALTIME, &After);
}
Thank you,
Zvika