Here is a complete example of “policeman” struct that does not work to me:

```
#include <stdio.h>
#include <iostream>
#include <iomanip>
#define NTH 2 // Number of threads
#define N 1<<5 // Number of total calculations
using namespace std;
struct policeman
{
bool deja;
int inThere[N]; // Is equal to 0 if the calculation (i) is not waiting in queue, 1 if it's waiting and 2 if it's his turn
bool ends[N]; // Is equal to "true" if the calculation has ended
int i;
__host__ __device__ void init() // Initialization
{
deja = false;
i = 0;
for(int i = 1; i < N; i++)
inThere[i] = 0;
for(int i = 0; i < N; i++)
ends[i] = false;
}
__host__ __device__ void end(int const& n) // Called when a thread has finished
{
ends[n] = true;
}
__host__ __device__ int const& getIT(int const& n) // Called to retrieve the value of inThere
{
return inThere[n];
}
__host__ __device__ void canI(int const& n) // Called for entering the queue
{
inThere[n] = 1;
}
__host__ __device__ void done(int const& n) // Called when calculation is done
{
inThere[n] = 0;
}
__host__ __device__ void handle() // Called by the thread 0, to handle accesses to memory
{
volatile int infinity = 1;
while(infinity)
{
ends[0] = true;
for(i = 1; i < N; i++) // If all calculations are done, finish this thread too
if(!ends[i])
{
i = N;
infinity = 0;
}
deja = false;
for(i = 1; i < N; i++) // If someone already access to the memory, "deja" will be true
if(inThere[i] == 2)
{
i = N;
deja = true;
}
if(!deja) // If noone is accessing the memory
for(i = 1; i < N; i++)
if(inThere[i] == 1)
{
i = N;
inThere[i] = 2; // Give the access to the memory to the first thread encountered
}
}
}
};
__global__
void saxpy(int n, policeman *x, double *val)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if(i == 0) // If it's the thread 0 of the block 0, handle
{
x->handle();
}
else if (i < n && threadIdx.x != 0) // Else...
{
x->canI(i); // Ask for permission...
volatile int infinity = 1;
while(infinity) // ... until he gets it, and then...
if(x->getIT(i) == 2)
infinity = 0;
(*val) += 1; // ... do the calculation and...
x->done(i); // ... tell it!
}
if (i != 0 && i < n)
x->end(i);
}
int main(void)
{
policeman *x, *d_x;
double *val, *d_val;
clock_t start, end;
val = (double*)malloc(sizeof(double));
val[0] = 0;
x = (policeman*)malloc(sizeof(policeman));
x->init();
start = clock();
cudaMalloc((void**)&d_x, sizeof(policeman));
cudaMalloc(&d_val, sizeof(double));
cudaMemcpy(d_x, x, sizeof(policeman), cudaMemcpyHostToDevice);
cudaMemcpy(d_val, val, sizeof(double), cudaMemcpyHostToDevice);
saxpy<<<(N+NTH-1)/NTH, NTH>>>(N, d_x, d_val);
cudaMemcpy(x, d_x, sizeof(policeman), cudaMemcpyDeviceToHost);
cudaMemcpy(val, d_val, sizeof(double), cudaMemcpyDeviceToHost);
end = clock();
double time_taken = double(end - start) / double(CLOCKS_PER_SEC);
cout << "Time taken by program is : " << fixed
<< time_taken << setprecision(5);
cout << " sec : " << val[0] << " " << x->i << endl;
cudaFree(d_x);
cudaFree(d_val);
free(x);
free(val);
}
```

For a dark reason that I don’t understand, the infinite loop from the “handle” function seems to work, but not the other one directly in the kernel (but I might be wrong).

Thanks a lot again,

ChoCChoK.