I am trying to implement a parallel version of Breadth-first search using Nvidia CUDA.

My algorithm gives the correct result for any small graph that I have made, and with an undirected graph with about 1.9 million nodes. The problems occur when I try with larger graphs (more than 2 mill nodes). The result is varying for each time, and I get a lot of errors.

My question is if there is some obvious synchronization issue here I am missing? Or maybe something else. Is my use of atomicCAS or atomicAdd naive in some way?

A little explanation of my algorithm, and the most important code:

I use 2 queues, one read_queue containing the vertices that I want to explore the neighbors of, and a write_queue which will contain the vertices for the next iteration.

The goal is to calculate the cost array, the minimum number of edges to each node from the source. Each cost start at -1, and are updated as soon as they are discovered for the first time. I compare my resulting cost array with one from a sequential bfs.

I have made sure that I have enough threads for each vertex, and I have tried with block_size of everything from 32 to 1024. My GPU is Nvidia GeForce MX-150 with Compute Capability 6.1

```
__global__ void bfs(int vertices, int *offset, int *adj, int *cost,
int *read_queue, int rqueue_size, int *write_queue, int *wqueue_size) {
int tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid < rqueue_size) {
int u = read_queue[tid];
int next_cost = cost[u] + 1;
for (int j = offset[u]; j < offset[u+1]; j++) {
int v = adj[j];
if (cost[v] == -1 && atomicCAS(&cost[v], -1, next_cost) == -1) {
int pos = atomicAdd(wqueue_size, 1);
write_queue[pos] = v;
}
}
}
}
```

And the initialization code

```
int vertices = graph->vertices;
int bytes = vertices * sizeof(int);
int block_size = 256;
int *d_cost;
int *adj;
int *offset;
int *read_queue;
int *write_queue;
int rqueue_size;
int *wqueue_size;
cudaMalloc(&offset, bytes + sizeof(int));
cudaMemcpy(offset, graph->offset, bytes + sizeof(int), cudaMemcpyHostToDevice);
cudaMalloc(&adj, graph->edges * sizeof(int));
cudaMemcpy(adj, graph->adj, graph->edges * sizeof(int), cudaMemcpyHostToDevice);
fill(h_cost, vertices, -1);
h_cost[source] = 0;
cudaMalloc(&d_cost, bytes);
cudaMemcpy(d_cost, h_cost, bytes, cudaMemcpyHostToDevice);
//Allocate the read and write queue, and insert the source vertex.
cudaMalloc(&read_queue, bytes);
cudaMemcpy(read_queue, &source, sizeof(int), cudaMemcpyHostToDevice);
cudaMalloc(&write_queue, bytes);
cudaMalloc(&wqueue_size, sizeof(int));
cudaMemset(wqueue_size, 0, sizeof(int));
int level = 0;
rqueue_size = 1;
int blocks;
while (rqueue_size > 0) {
blocks = (block_size + rqueue_size - 1) / block_size;
bfs<<<blocks, block_size>>>(vertices, offset, adj, d_cost, read_queue, rqueue_size, write_queue, wqueue_size);
cudaMemcpy(&rqueue_size, wqueue_size, sizeof(int), cudaMemcpyDeviceToHost);
cudaMemset(wqueue_size, 0, sizeof(int));
int *tmp = write_queue;
read_queue = write_queue;
write_queue = tmp;
level++;
}
```