Hello!
I’m working on some task related to graph traversal (Viterbi algorithm)
Each time step I have a compacted set of active states, some job is done in each state, and than results are propagated through outgoing arcs to each arc’s destination state and so new active set of states is built.
The problem is that number of outgoing arcs varies very heavily , from two or three to several thousands. So compute threads are loaded very ineffectively. I try to share the job like this:

int tx = threaIdx.x;

extern shared int smem;

int *stateSet_s = smem;
int arcSet_s = &(smem[Q_LEN]);
float scores_s = (float)&(smem[2
Q_LEN]);

shared int arcCnt;
shared int stateCnt;

if ( tx == 0 )
{
arcCnt = 0;
stateCnt = 0;
}

//load state index from compacted list of state indexes
int stateId = activeSetIn_g[gtx];

float srcCost = scores_g[ stateId ];
int startId = outputArcStartIds_g[stateId];

int nArcs = outputArcCounts_g[stateId]; //amount of job to be done

/////////////////////////////////////////////
/// prepare arc set
/// !!! that is the troubled code I think !!!

int myPos = atomicAdd ( &arcCnt, nArcs );

while ( nArcs > 0 ) && ( myPos < Q_LEN ) )
{
scores_s[myPos] = srcCost;
arcSet_s[myPos] = startId + nArcs - 1;

``````myPos++;
nArcs--;
``````

}

//////////////////////////////////////
/// parallel propagate arc set

if ( arcSet_s[tx] > 0 )
{
FstArc arc = arcs_g[ arcSet_s[tx] ];
float srcCost_ = scores_s[tx];

``````DoSomeJob ( &srcCost_ );

int *dst = &(transitionData_g[arc.dst]);

int old = atomicMax( dst, FloatToInt ( srcCost_ ) );

////////////////////////////////
//// new active set

if ( old == ILZERO )
{
int pos = atomicAdd ( &stateCnt, 1 );
stateSet_s[ pos ] = arc.dst;
}
``````

}

/////////////////////////////////////////////
/// transfer new active set from smem to gmem

shared int gPos;

if ( tx == 0 )
{
done = 0;
gPos = atomicAdd ( activeSetOutSz_g, stateCnt );
}

if ( tx < stateCnt )
{
activeSetOut_g[gPos + tx] = stateSet_s[tx];
}