Actually, I removed all the rest of the code and it still freezes. I am now trying to run just the following:

int tx = (blockDim.x * blockIdx.x) + threadIdx.x;

int k;

int count = 0;

for (k = 1; k <= *no_elements_Y_device; k++) {

count = count + 4;

__syncthreads();

if(threadIdx.x == 1)

atomicAdd(lock, 1);

while(count != *lock);

}

}

The full code is supposed to look like this:

__global__ static void P7Viterbi_cuda_device_loop_two(int *imx, int *mmx, int *dmx, int *no_elements_Y_device, int *no_elements_X_device, int *xmx, unsigned char *dsq, int * xsc, int *tsc, int * bsc, int * msc, int *isc, int* esc, int * lock){

```
int tx = (blockDim.x * blockIdx.x) + threadIdx.x;
int i = tx * (*no_elements_Y_device + 2);
int i_minus_one = (tx-1) * (*no_elements_Y_device + 2);
int i_xmx = tx * 4;
int i_xmx_minus_one = (tx-1) * 4;
int k, count = 0;
int sc;
int *mc, *dc, *ic; /* pointers to rows of mmx, dmx, imx */
int *ms, *is; /* pointers to msc[i], isc[i] */
int *mpp, *mpc, *ip; /* ptrs to mmx[i-1], mmx[i], imx[i-1] */
int *bp; /* ptr into bsc[] */
int *ep; /* ptr into esc[] */
int xmb; /* value of xmx[i-1][XMB] */
int xme; /* max for xmx[i][XME] */
int *dpp; /* ptr into dmx[i-1] (previous row) */
int *tpmm, *tpmi, *tpmd, *tpim, *tpii, *tpdm, *tpdd; /* ptrs into tsc */
if((i > 0) && (i <= ((*no_elements_Y_device + 2)* (*no_elements_X_device)))){
tpmm = &tsc[TMM];
tpim = &tsc[TIM];
tpdm = &tsc[TDM];
tpmd = &tsc[TMD];
tpdd = &tsc[TDD];
tpmi = &tsc[TMI];
tpii = &tsc[TII];
bp = bsc;
mc = &mmx[i];
dc = &dmx[i];
ic = &imx[i];
mpp = &mmx[i_minus_one];
dpp = &dmx[i_minus_one];
ip = &imx[i_minus_one];
xmb = xmx[i_xmx + XMB];
ms = &msc[dsq[tx]];
is = &isc[dsq[tx]];
mc[0] = -INFTY;
dc[0] = -INFTY;
ic[0] = -INFTY;
for (k = 1; k <= *no_elements_Y_device; k++) {
count = count + 4;
mc[k] = mpp[k-1] + tpmm[k-1];
if ((sc = ip[k-1] + tpim[k-1]) > mc[k]) mc[k] = sc;
if ((sc = dpp[k-1] + tpdm[k-1]) > mc[k]) mc[k] = sc;
if ((sc = xmb + bp[k]) > mc[k]) mc[k] = sc;
mc[k] += ms[k];
if (mc[k] < -INFTY) mc[k] = -INFTY;
dc[k] = dc[k-1] + tpdd[k-1];
if ((sc = mc[k-1] + tpmd[k-1]) > dc[k]) dc[k] = sc;
if (dc[k] < -INFTY) dc[k] = -INFTY;
if (k < (*no_elements_Y_device)) {
ic[k] = mpp[k] + tpmi[k];
if ((sc = ip[k] + tpii[k]) > ic[k]) ic[k] = sc;
ic[k] += is[k];
if (ic[k] < -INFTY) ic[k] = -INFTY;
}
__syncthreads();
if(threadIdx.x == 1)
atomicAdd(lock, 1);
while(count != *lock);
}
}
```

Basically, element [i][k] depends on [i-1][k-1], so I am trying to use the 4 multiprocessors (using 4 blocks) in a synchronized manner to compute [all i’s][k], then [all i’s][k+1], then [all i’s][k+2], and so on.

Basically, when a block finishes all of its i’s for the current k, it adds 1 to lock (we have 4 blocks, so count will be 4 on the first iteration). When count == lock, all blocks have computed all i’s for that k and can go on to compute all i’s for the next k (k+1), where each block will again add 1 to lock, bringing it to 8, and so on.

Does that make sense?