I have been trying to get the BFS Algorithm to work on my GPU using CUDA but I can’t get it to work for more than 10 vertices.

I am running Cuda toolkit 12.2 on Arch Linux

Intel i5 12450H

Nvidia RTX 3050 (laptop)

```
#include <bits/stdc++.h>
using std::cout;
using std::endl;
using std::find;
using std::queue;
using std::vector;
#define blockSize 256
#define numVert 512
#define numEdge 1024
#define gridSize (numVert + blockSize - 1) / blockSize
void GenGraph(int *rowPtr, int *colIdx, int *weights)
{
rowPtr[0] = 0;
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_int_distribution<> vertDist(1, numEdge / numVert);
std::uniform_int_distribution<> colDist(0, numVert - 1);
std::uniform_int_distribution<> weightDist(1, 99);
int k = 0;
for (int i = 0; i < numVert; i++)
{
int numEdgePerVert = vertDist(gen);
for (int j = 0; j < numEdgePerVert; j++)
{
int vert = colDist(gen);
int weight = weightDist(gen);
colIdx[k] = (vert);
weights[k] = (weight);
k++;
}
rowPtr[i + 1] = rowPtr[i] + numEdgePerVert;
}
}
void PrintGraphCSR(int *rowPtr, int *colIdx, int *weights)
{
for (int i = 0; i < numVert; i++)
{
for (int j = rowPtr[i]; j < rowPtr[i + 1]; j++)
{
cout << "Index: " << i << " Edge to vertex: " << colIdx[j] << " Weight Of Edge: " << weights[j] << endl;
}
}
}
__global__ void cudaBFS(int *rowPtr, int *colIdx, int *weights, bool *frontier, int *cost, bool *visited)
{
int tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid < numVert && frontier[tid])
{
frontier[tid] = false;
visited[tid] = true;
for (int i = rowPtr[tid]; i < rowPtr[tid + 1]; i++)
{
int vert = colIdx[i];
int weight = weights[i];
if (!visited[vert])
{
int old_cost = atomicMin(&cost[vert], cost[tid] + weight);
if (old_cost > cost[tid] + weight)
{
// The atomicMin operation updated the cost array, mark the neighbor as a new frontier
frontier[vert] = true;
}
}
}
}
}
void BFS(int *rowPtr, int *colIdx, int *weights, queue<int> frontier, int *cost)
{
while (!frontier.empty())
{
int tid = frontier.front();
frontier.pop();
for (int i = rowPtr[tid]; i < rowPtr[tid + 1]; i++)
{
int vert = colIdx[i];
int weight = weights[i];
if (cost[vert] == INT32_MAX)
{
cost[vert] = cost[tid] + weight;
frontier.push(vert);
}
}
}
}
int main()
{
int *rowPtr, *colIdx, *weights;
cudaMallocManaged(&rowPtr, (numVert + 1) * sizeof(int));
cudaMallocManaged(&colIdx, numEdge * sizeof(int));
cudaMallocManaged(&weights, numEdge * sizeof(int));
cudaDeviceSynchronize();
GenGraph(rowPtr, colIdx, weights);
PrintGraphCSR(rowPtr, colIdx, weights);
// initialising all the common variables
int *cost; //-1 denotes infinity;
bool *visited;
bool *frontier;
cudaMallocManaged(&frontier, numVert * sizeof(bool));
cudaMallocManaged(&cost, numVert * sizeof(int));
cudaMallocManaged(&visited, numVert * sizeof(bool));
for (int i = 0; i < numVert; i++)
{
cost[i] = INT32_MAX;
visited[i] = false;
frontier[i] = false;
}
cudaDeviceSynchronize();
int source = 0;
frontier[source] = true;
cost[source] = 0;
visited[source] = true;
while (true) // Infinite loop - BFS will terminate based on the state of the frontier array
{
cudaBFS<<<gridSize, blockSize>>>(rowPtr, colIdx, weights, frontier, cost, visited);
cudaDeviceSynchronize();
// Check if any new nodes were added to the frontier
bool frontierEmpty = true;
for (int i = 0; i < numVert; i++)
{
if (frontier[i])
{
frontierEmpty = false;
break;
}
}
if (frontierEmpty)
break; // BFS is completed, terminate the loop
}
int *costCPU = new int[numVert];
for (int i = 0; i < numVert; i++)
{
costCPU[i] = INT32_MAX;
}
costCPU[source] = 0;
queue<int> frontierCPU;
frontierCPU.push(source);
BFS(rowPtr, colIdx, weights, frontierCPU, costCPU);
// assert(memcmp(cost, costCPU, numVert * sizeof(int)) == 0);
// cout << "Correct Answer" << endl;
int firstDifferenceIndex = -1;
for (int i = 0; i < numVert; i++)
{
if (cost[i] != costCPU[i])
{
firstDifferenceIndex = i;
break;
}
}
if (firstDifferenceIndex == -1)
{
cout << "Arrays cost and costCPU are the same.\n";
}
else
{
cout << "cost: " << cost[firstDifferenceIndex] << " costCPU " << costCPU[firstDifferenceIndex] << " at index " << firstDifferenceIndex << endl;
}
return 0;
}
```

I wrote the kernel below at first -

```
__global__ void cudaBFS(int *rowPtr, int *colIdx, int *weights, bool *frontier, int *cost, bool *visited)
{
int tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid < numVert)
{
if (frontier[tid])
{
frontier[tid] = false;
visited[tid] = true;
__syncthreads();
for (int i = rowPtr[tid]; i < rowPtr[tid + 1]; i++)
{
intvert= colIdx[i];
int weight = weights[i];
if (!visited[col])
{
cost[col] = cost[tid] + weight;
frontier[col] = true;
}
}
}
}
}
```

I’ve been at it for hours now, any help is appreciated.

I tried to use __syncthreads() first to avoid racing, which is the only problem I can identify in the implementation. That didn;t work so I tried to use atomicMin to avoid racing at the cost of performance. But that is also not working.