Kernel optimization: reducing register usage and branching

Hello all,

Please see the code below.

This is an image processing application, and I'm trying to use CUDA for it.

__global__ void RenderFrame( float* Ipframes, float* slice, ... )  /* 5 input params */
{
    int nIndex = blockIdx.x * blockDim.x + threadIdx.x;
    if( nIndex >= MAX_DIM )
        return;

    // some calculation //

    for( ... )  // approximately 500 iterations
    {
        if( <condition> )
            continue;

        if( <condition> )
        {
            // some calculation
            if( <condition> )
                continue;
            if( <condition> )
                continue;

            // bilinear interpolation
            fpOp[ uiIndx ] = fpOp[ uiIndx ] + fIntensity;
        }
        else
        {
            // some calculation
            fpOp[ uiIndx ] = fpOp[ uiIndx ] + fIntensity;
        }
    }
}

I cannot run this kernel with more than 384 threads per block (a kernel launch error occurs: "too many resources requested for launch"). Furthermore, with 2900 blocks, it takes around 154 ms to complete.

But if I comment out the "fpOp[ uiIndx ] = fpOp[ uiIndx ] + fIntensity;" line, the kernel runs in 1.4 ms, and I can also increase the threads per block to the maximum (512).

fpOp is a float array in global memory.

I'm a newbie in CUDA.

Please advise me about optimizations.

Thanks in advance

You have two "fpOp[ uiIndx ] = fpOp[ uiIndx ] + fIntensity;" lines. When you comment out both, the compiler (smarter than you think!!) sees that the kernel no longer writes anything to global memory, decides that the whole kernel does nothing, and simply rewrites it as an empty body. That is a rough picture of what is happening.
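A minimal sketch of the effect (the kernel and array names here are made up for illustration): every computation that only feeds a commented-out global store is dead code, so nvcc strips it all out, and the "1.4 ms" timing measures an empty kernel.

__global__ void DeadCodeDemo( const float* fpIn, float* fpOut, int nMax )
{
    int nIndex = blockIdx.x * blockDim.x + threadIdx.x;
    if( nIndex >= nMax )
        return;

    float fAcc = 0.0f;
    for( int i = 0; i < 500; ++i )
        fAcc += fpIn[ nIndex ] * 0.5f;   // "heavy" work

    // fpOut[ nIndex ] = fAcc;  // with this store commented out, nothing
                                // above is observable from outside, so the
                                // compiler deletes the loop and the loads
}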

Thanks for the reply.

Well ... there are a lot more calculations going on in there.

I changed it a little bit so that the array won't be accessed inside the loop; I declared a temporary float variable outside instead.

The aggregation is done by

fTempVal = fTempVal + fIntensity;

and finally, outside the for loop:

__syncthreads();

fpOpVolumeSlice[ uiIndx ] = fTempVal;

With this, the kernel hanged. And without the __syncthreads(), the timing is the same as earlier.

What's wrong?

This seems to be correct :blink:

What could be done next?
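A likely cause of the hang, sketched below with the names from the pseudocode above: __syncthreads() is a barrier that every thread in the block must reach, but the early return for out-of-range threads means some of them never arrive. Since each thread writes only its own output element, no barrier is needed at all:

__global__ void RenderFrame( /* ... 5 input params ... */ )
{
    int nIndex = blockIdx.x * blockDim.x + threadIdx.x;
    if( nIndex >= MAX_DIM )    // threads taking this return can never
        return;                // reach a __syncthreads() placed later

    float fTempVal = 0.0f;
    for( int i = 0; i < 500; ++i )   // ~500 iterations, as in the original
    {
        float fIntensity = 0.0f;
        // ... per-iteration calculation producing fIntensity ...
        fTempVal = fTempVal + fIntensity;   // accumulate in a register
    }

    // no __syncthreads() needed: this thread is the only writer of
    // fpOp[ nIndex ], so there is nothing to synchronize with
    fpOp[ nIndex ] = fTempVal;
}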

Maybe you’ll find it useful:

http://forums.nvidia.com/index.php?showtopic=72854&hl=

Thanks,

I tried the shared-memory array method from the link and was able to shave off only one register; it is now 15 regs (after replacing the loop variable).

Any variable modified inside the loop increases the register count to 17.
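Assuming a compute capability 1.0/1.1 card with an 8192-register file per multiprocessor (an assumption; the card model isn't stated in this thread), those counts line up with the 384-thread launch limit reported above:

17 registers/thread * 512 threads/block = 8704 > 8192   ->  "too many resources requested for launch"
17 registers/thread * 384 threads/block = 6528 <= 8192  ->  launch succeeds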

I'm attaching the code; could someone please comment on it?

__global__ void RenderFrame( float* fpIpProjectionframes,
                             float* fpOpVolumeSlice,
                             int    nSliceProjectionZ,
                             int    nRowOffset,
                             int    nSliceNumRows )
{
    // the m_n* / f* globals used below are constants defined elsewhere in the file
    extern __shared__ int blockData[];
    int* myData = blockData + threadIdx.x * 3;
    float fTempVal;

    int nIndex = blockIdx.x * blockDim.x + threadIdx.x;
    if( nIndex >= m_nOPVolumeSliceDim )
        return;

    int nRowIndexY    = nIndex / m_nVolumetricDataDimensionWidthX;
    int nColumnIndexX = nIndex - ( nRowIndexY * m_nVolumetricDataDimensionWidthX );

    int   nColumnRelativeIndexX = nColumnIndexX - ((m_nVolumetricDataDimensionWidthX - 1) >> 1);
    float fColumnRelativeIndexX = (nColumnRelativeIndexX - fCenter_X) * fVoxel_Size_X;

    int   nRowRelativeIndexY = nRowIndexY - ((m_nVolumetricDataDimensionHeightY - 1) >> 1);
    float fRowRelativeIndexY = (nRowRelativeIndexY - fCenter_Y) * fVoxel_Size_Y;

    int   nSliceRelativeIndexZ = nSliceProjectionZ - ((m_nVolumetricDataDimensionSlisesZ - 1) >> 1);
    float fSliceRelativeIndexZ = (nSliceRelativeIndexZ - fCenter_Z) * fVoxel_Size_Z;

    nIndex = nSliceProjectionZ * m_nVolumetricDataDimensionWidthX * m_nVolumetricDataDimensionHeightY
           + nRowIndexY * m_nVolumetricDataDimensionWidthX + nColumnIndexX;

    fTempVal = fpOpVolumeSlice[ nIndex ];

    // the loop counter lives in shared memory instead of a register
    // (the register-reduction trick from the linked thread)
    #define wframeNum (myData[0])
    for( wframeNum = 0; wframeNum < m_nTotal_Projections; ++wframeNum )
    {
        float fT, fS;
        {
            float fBeta    = 0.72f * wframeNum * -0.0175f;   // 0.0175f ~ pi/180 (degrees to radians)
            float fSinBeta = sinf( fBeta );
            float fCosBeta = cosf( fBeta );

            fT =  fColumnRelativeIndexX * fCosBeta + fRowRelativeIndexY * fSinBeta;
            fS = -fColumnRelativeIndexX * fSinBeta + fRowRelativeIndexY * fCosBeta;
        }

        int   nFrameColumnIndex, nFrameRowIndex;
        float fColumnDecimalValue, fRowDecimalValue;
        {
            float fColumnU = ( fT * m_fDistance_From_Source_Object ) / ( m_fDistance_From_Source_Object - fS );
            fColumnU = (fColumnU / fPixel_Size_Cols) + float((m_nEach_Frame_Size_Columns + 1) >> 1) + fOffsetCols;

            float fRowV = ( fSliceRelativeIndexZ * m_fDistance_From_Source_Object ) / ( m_fDistance_From_Source_Object - fS );
            fRowV = (fRowV / fPixel_Size_Rows) + float((m_nEach_Frame_Size_Rows + 1) >> 1) + fOffsetRows;
            fRowV = fRowV - nRowOffset;

            nFrameColumnIndex = (int)fColumnU;
            nFrameRowIndex    = (int)fRowV;

            fColumnDecimalValue = fColumnU - nFrameColumnIndex;
            fRowDecimalValue    = fRowV - nFrameRowIndex;
        }

        if( nFrameColumnIndex < 0 || nFrameRowIndex < 0
            || nFrameColumnIndex >= m_nEach_Frame_Size_Columns
            || nFrameRowIndex >= m_nEach_Frame_Size_Rows )
        {
            continue;
        }

        float fIntensity = 0.00f;
        float fPower = (m_fDistance_From_Source_Object - fS) * (m_fDistance_From_Source_Object - fS);

        int nIndexPreCalc = wframeNum * m_nEach_Frame_Size_Columns * nSliceNumRows;
        float fp1 = fpIpProjectionframes[ nIndexPreCalc + nFrameRowIndex * m_nEach_Frame_Size_Columns + nFrameColumnIndex ];

        if( fColumnDecimalValue > 0.0f || fRowDecimalValue > 0.0f )
        {
            // bilinear interpolation
            // read ahead.. optimization
            int nFrameColumnIndex_Plus_1 = nFrameColumnIndex + 1;
            if( nFrameColumnIndex_Plus_1 >= m_nEach_Frame_Size_Columns )
            {
                continue;
            }
            float fp2 = fpIpProjectionframes[ nIndexPreCalc + nFrameRowIndex * m_nEach_Frame_Size_Columns + nFrameColumnIndex_Plus_1 ];

            int nFrameRowIndex_Plus_1 = nFrameRowIndex + 1;
            if( nFrameRowIndex_Plus_1 >= m_nEach_Frame_Size_Rows )
            {
                continue;
            }
            float fp3 = fpIpProjectionframes[ nIndexPreCalc + nFrameRowIndex_Plus_1 * m_nEach_Frame_Size_Columns + nFrameColumnIndex ];
            float fp4 = fpIpProjectionframes[ nIndexPreCalc + nFrameRowIndex_Plus_1 * m_nEach_Frame_Size_Columns + nFrameColumnIndex_Plus_1 ];

            float fClmnWght = 1.0f - fColumnDecimalValue;
            float fRowWght  = 1.0f - fRowDecimalValue;

            fp1 = fClmnWght * fRowWght * fp1;
            fp2 = fColumnDecimalValue * fRowWght * fp2;
            fp3 = fRowDecimalValue * fClmnWght * fp3;
            fp4 = fColumnDecimalValue * fRowDecimalValue * fp4;

            fIntensity = fp1 + fp2 + fp3 + fp4;
            fIntensity = fIntensity * (fDistanceSquare / fPower);

            fTempVal = fTempVal + fIntensity;
        }
        else
        {
            fIntensity = fp1 * (fDistanceSquare / fPower);
            fTempVal = fTempVal + fIntensity;
        }
    }
    #undef wframeNum

    fpOpVolumeSlice[ nIndex ] = fTempVal;
}

Thanks

What was the question again: you want to know how to optimize your kernel? The same advice applies here that applies to every other kernel discussed on these forums.

Number 1: Coalesce memory loads/stores.
Run it through the Visual Profiler and check the number of uncoalesced loads. Your reads/writes into fpOpVolumeSlice are coalesced, but I suspect that the reads from fpIpProjectionframes are not. Reading them through a texture with tex1Dfetch could boost your performance.
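A sketch of what that could look like with the texture reference API (the device pointer and size names here are placeholders; error checking omitted):

// at file scope
texture<float, 1, cudaReadModeElementType> texProjections;

// on the host, once, after copying the frames to the device
cudaBindTexture( 0, texProjections, d_fpIpProjectionframes,
                 nTotalProjections * nFrameSizeBytes );

// in the kernel, replace the global-memory reads, e.g.:
float fp1 = tex1Dfetch( texProjections,
                        nIndexPreCalc + nFrameRowIndex * m_nEach_Frame_Size_Columns + nFrameColumnIndex );

The texture cache does not require coalesced access patterns, which is why it can help with the scattered per-thread indices computed inside the loop.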

Number 2: Evaluate your performance.
Count the number of GiB/s of memory read and written by your kernel and compare it to the theoretical maximum for your card; you should be able to get close to the max. Similarly, count the GFLOP/s. How close you get to the max there may be limited by the memory reads, so don't necessarily expect to reach the device maximum.
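For example, a back-of-the-envelope check with the numbers from this thread (the four-floats-per-iteration figure is an assumption; substitute the real count of loads and stores per loop trip):

#include <cstdio>

int main()
{
    double dThreads      = 2900.0 * 384.0;         // blocks * threads/block, from the posts above
    double dIterations   = 500.0;                  // loop trips per thread
    double dBytesPerIter = 4.0 * sizeof( float );  // assumed: ~4 float reads per trip
    double dSeconds      = 154e-3;                 // measured kernel time

    double dGiB = dThreads * dIterations * dBytesPerIter / ( 1024.0 * 1024.0 * 1024.0 );
    printf( "effective bandwidth: %.1f GiB/s\n", dGiB / dSeconds );
    return 0;
}

If the result lands far below the card's theoretical bandwidth, uncoalesced access is the usual suspect.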