BFS gives different answer on GPU and CPU

I have been trying to get the BFS algorithm to work on my GPU using CUDA, but I can't get it to work for more than 10 vertices.

I am running CUDA Toolkit 12.2 on Arch Linux
Intel i5-12450H
NVIDIA RTX 3050 (laptop)

#include <bits/stdc++.h>

using std::cout;
using std::endl;
using std::find;
using std::queue;
using std::vector;

#define blockSize 256
#define numVert 512
#define numEdge 1024
#define gridSize ((numVert + blockSize - 1) / blockSize)

void GenGraph(int *rowPtr, int *colIdx, int *weights)
{
    rowPtr[0] = 0;

    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_int_distribution<> vertDist(1, numEdge / numVert);
    std::uniform_int_distribution<> colDist(0, numVert - 1);
    std::uniform_int_distribution<> weightDist(1, 99);

    int k = 0;
    for (int i = 0; i < numVert; i++)
    {
        int numEdgePerVert = vertDist(gen);
        for (int j = 0; j < numEdgePerVert; j++)
        {
            int vert = colDist(gen);
            int weight = weightDist(gen);
            colIdx[k] = vert;
            weights[k] = weight;
            k++;
        }
        rowPtr[i + 1] = rowPtr[i] + numEdgePerVert;
    }
}

void PrintGraphCSR(int *rowPtr, int *colIdx, int *weights)
{
    for (int i = 0; i < numVert; i++)
    {
        for (int j = rowPtr[i]; j < rowPtr[i + 1]; j++)
        {
            cout << "Index: " << i << " Edge to vertex: " << colIdx[j] << " Weight Of Edge: " << weights[j] << endl;
        }
    }
}

__global__ void cudaBFS(int *rowPtr, int *colIdx, int *weights, bool *frontier, int *cost, bool *visited)
{
    int tid = blockDim.x * blockIdx.x + threadIdx.x;

    if (tid < numVert && frontier[tid])
    {
        frontier[tid] = false;
        visited[tid] = true;

        for (int i = rowPtr[tid]; i < rowPtr[tid + 1]; i++)
        {
            int vert = colIdx[i];
            int weight = weights[i];

            if (!visited[vert])
            {
                int old_cost = atomicMin(&cost[vert], cost[tid] + weight);
                if (old_cost > cost[tid] + weight)
                {
                    // The atomicMin operation updated the cost array, mark the neighbor as a new frontier
                    frontier[vert] = true;
                }
            }
        }
    }
}

void BFS(int *rowPtr, int *colIdx, int *weights, queue<int> frontier, int *cost)
{
    while (!frontier.empty())
    {
        int tid = frontier.front();
        frontier.pop();
        for (int i = rowPtr[tid]; i < rowPtr[tid + 1]; i++)
        {
            int vert = colIdx[i];
            int weight = weights[i];
            if (cost[vert] == INT32_MAX)
            {
                cost[vert] = cost[tid] + weight;
                frontier.push(vert);
            }
        }
    }
}

int main()
{
    int *rowPtr, *colIdx, *weights;
    cudaMallocManaged(&rowPtr, (numVert + 1) * sizeof(int));
    cudaMallocManaged(&colIdx, numEdge * sizeof(int));
    cudaMallocManaged(&weights, numEdge * sizeof(int));

    cudaDeviceSynchronize();
    GenGraph(rowPtr, colIdx, weights);
    PrintGraphCSR(rowPtr, colIdx, weights);

    // initialising all the common variables
    int *cost; // INT32_MAX denotes infinity
    bool *visited;
    bool *frontier;
    cudaMallocManaged(&frontier, numVert * sizeof(bool));
    cudaMallocManaged(&cost, numVert * sizeof(int));
    cudaMallocManaged(&visited, numVert * sizeof(bool));

    for (int i = 0; i < numVert; i++)
    {
        cost[i] = INT32_MAX;
        visited[i] = false;
        frontier[i] = false;
    }

    cudaDeviceSynchronize();
    int source = 0;
    frontier[source] = true;
    cost[source] = 0;
    visited[source] = true;

    while (true) // Infinite loop - BFS will terminate based on the state of the frontier array
    {
        cudaBFS<<<gridSize, blockSize>>>(rowPtr, colIdx, weights, frontier, cost, visited);
        cudaDeviceSynchronize();

        // Check if any new nodes were added to the frontier
        bool frontierEmpty = true;
        for (int i = 0; i < numVert; i++)
        {
            if (frontier[i])
            {
                frontierEmpty = false;
                break;
            }
        }

        if (frontierEmpty)
            break; // BFS is completed, terminate the loop
    }

    int *costCPU = new int[numVert];
    for (int i = 0; i < numVert; i++)
    {
        costCPU[i] = INT32_MAX;
    }
    costCPU[source] = 0;
    queue<int> frontierCPU;
    frontierCPU.push(source);
    BFS(rowPtr, colIdx, weights, frontierCPU, costCPU);

    // assert(memcmp(cost, costCPU, numVert * sizeof(int)) == 0);
    // cout << "Correct Answer" << endl;

    int firstDifferenceIndex = -1;
    for (int i = 0; i < numVert; i++)
    {
        if (cost[i] != costCPU[i])
        {
            firstDifferenceIndex = i;
            break;
        }
    }

    if (firstDifferenceIndex == -1)
    {
        cout << "Arrays cost and costCPU are the same.\n";
    }
    else
    {
        cout << "cost: " << cost[firstDifferenceIndex] << " costCPU " << costCPU[firstDifferenceIndex] << " at index " << firstDifferenceIndex << endl;
    }

    return 0;
}

I wrote the kernel below at first -

__global__ void cudaBFS(int *rowPtr, int *colIdx, int *weights, bool *frontier, int *cost, bool *visited)
{
    int tid = blockDim.x * blockIdx.x + threadIdx.x;

    if (tid < numVert)
    {
        if (frontier[tid])
        {
            frontier[tid] = false;
            visited[tid] = true;
            __syncthreads();

            for (int i = rowPtr[tid]; i < rowPtr[tid + 1]; i++)
            {
                int vert = colIdx[i];
                int weight = weights[i];

                if (!visited[vert])
                {
                    cost[vert] = cost[tid] + weight;
                    frontier[vert] = true;
                }
            }
        }
    }
}

I’ve been at it for hours now, any help is appreciated.

I tried to use __syncthreads() first to avoid racing, which was the only problem I could identify in the implementation. That didn't work, so I then tried atomicMin to avoid the race at the cost of some performance, but that is not working either.

(It looks like a cross-posting of this question is here.)

Suppose thread 0 runs, and eventually decides that it should set frontier[256] to be true. Then, some time later, thread 256 begins to run, and immediately sets frontier[256] to be false. Is that your intent?

(note that the chosen frontier index isn’t terribly important. I could ask the same question with frontier[32]).

We can tighten this thought experiment a bit further since there is a loop in your kernel.

Suppose thread 0 (or 31, if you wish) runs, and eventually decides that it should set frontier[32] to be true. At about the same time, one femtosecond after that first setting of frontier[32] to be true, thread 32 starts, and immediately sets frontier[32] to be false. Is that your intent?

Yes, that is by design. In your example, thread 0 sets frontier[256] to true; when thread 256 starts to run, frontier[256] should become false, since vertex 256 is no longer part of the frontier but is now one of the visited nodes.

Edit - I am the one who posted this on Stack Overflow as well. I am new to CUDA and still learning.

OK let’s reverse the experiment. Suppose that thread 0 starts and gets to the point where it is about to set frontier[256] to be true. Let’s suppose that thread 256 starts at that point and sets frontier[256] to be false, and then immediately after that thread 0 sets frontier[256] to be true. Is that also your intent?

No that is not my intent, I can see how this is creating a problem.

This is a race condition. Communicating between threads using global variables is somewhat difficult in CUDA.

In CUDA, threads (even in the same warp, although that is not the case I have proposed here) can execute in any order. You have a situation where two different threads are interacting with the same global location (reading and writing), and depending on the order of execution, which is totally uncontrolled by you in your code, you can be at the exact same point in execution, but the global value can have one of two different values.

The only way that could be sensible is if the contents of that global variable are irrelevant to the behavior of your code.

I haven’t studied your entire code, but this sort of mechanism probably won’t work, and requires more thought as well as probably additional execution and visibility controls, to make it work.
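
Two things that may help frame it: __syncthreads() is a barrier only for the threads of one block, so it cannot order frontier reads and writes performed by threads in different blocks; and the read/write hazard on frontier goes away if the current level's frontier is treated as read-only and newly discovered vertices are written into a separate "next" frontier that you swap between kernel launches. A rough, untested sketch of that idea using your variable names (nextFrontier is a new array I am assuming here, allocated the same way as frontier):

__global__ void cudaBFSLevel(const int *rowPtr, const int *colIdx, const int *weights,
                             const bool *frontier, bool *nextFrontier,
                             int *cost, bool *visited)
{
    int tid = blockDim.x * blockIdx.x + threadIdx.x;

    // Expand only vertices in the *current* frontier. This kernel never writes
    // to `frontier`, so no thread can clear a flag that another thread just set.
    if (tid < numVert && frontier[tid])
    {
        visited[tid] = true;

        for (int i = rowPtr[tid]; i < rowPtr[tid + 1]; i++)
        {
            int vert = colIdx[i];
            int newCost = cost[tid] + weights[i];

            int oldCost = atomicMin(&cost[vert], newCost);
            if (oldCost > newCost)
            {
                // This thread improved the neighbor's cost: queue it for the next level.
                nextFrontier[vert] = true;
            }
        }
    }
}

// Host side (sketch): clear the next frontier, run one level, then swap buffers.
// The kernel-launch boundary is the only grid-wide synchronization point here.
while (true)
{
    cudaMemset(nextFrontier, 0, numVert * sizeof(bool));
    cudaBFSLevel<<<gridSize, blockSize>>>(rowPtr, colIdx, weights, frontier, nextFrontier, cost, visited);
    cudaDeviceSynchronize();
    // ... scan nextFrontier; if it is empty, stop; otherwise std::swap(frontier, nextFrontier) ...
}

I am not claiming this is a complete fix for your program; it is just one way to remove that particular race.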

Ok, so I need to make sure that race conditions don't happen. I thought that the distance was getting updated by two or more threads simultaneously, so I used an atomic function there, but I didn't realise that the frontier may also be updated concurrently.
I don't know yet how to resolve that, but thanks for the help! I was definitely not looking in the right place.

I don’t know where all the concerns may be in your code. But in any situation where you have more than one thread interacting with a particular global location (with at least one thread writing), special attention is warranted. The atomics are certainly one possible way to sort this out, if you are doing some sort of reduction (e.g. sum, minimum, maximum, etc.)

But you should consider every situation like this. We’ve discussed frontier. You may want to look at your usage of visited also.
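
For example, visited is read and written by different threads with no ordering guarantee, just like frontier. If you ever need a guarantee that exactly one thread "claims" a newly discovered vertex, note that bool arrays cannot be used with CUDA atomics; one illustrative option (a sketch, not taken from your code) is to store visited as int and claim it with atomicExch:

// visited stored as int: 0 = not visited, 1 = visited, so it can be used with atomics
__device__ bool claimVertex(int *visited, int v)
{
    // atomicExch returns the previous value; only the first thread to flip
    // 0 -> 1 observes 0 and therefore owns the vertex.
    return atomicExch(&visited[v], 1) == 0;
}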

I was looking at other BFS code online and found one that uses almost the same algorithm as mine (a frontier array Fa and a visited array Xa), yet that code seems to work:


#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda.h>
#include <device_functions.h>
#include <cuda_runtime_api.h>

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <conio.h>
#define NUM_NODES 5

typedef struct
{
	int start;     // Index of first adjacent node in Ea	
	int length;    // Number of adjacent nodes 
} Node;

__global__ void CUDA_BFS_KERNEL(Node *Va, int *Ea, bool *Fa, bool *Xa, int *Ca,bool *done)
{

	int id = threadIdx.x + blockIdx.x * blockDim.x;
	if (id > NUM_NODES)
		*done = false;


	if (Fa[id] == true && Xa[id] == false)
	{
		printf("%d ", id); //This printf gives the order of vertices in BFS	
		Fa[id] = false;
		Xa[id] = true;
		__syncthreads(); 
		int k = 0;
		int i;
		int start = Va[id].start;
		int end = start + Va[id].length;
		for (int i = start; i < end; i++) 
		{
			int nid = Ea[i];

			if (Xa[nid] == false)
			{
				Ca[nid] = Ca[id] + 1;
				Fa[nid] = true;
				*done = false;
			}

		}

	}

}

// The BFS frontier corresponds to all the nodes being processed at the current level.


int main()
{




	 Node node[NUM_NODES];
	
	
	//int edgesSize = 2 * NUM_NODES;
	int edges[NUM_NODES];

	node[0].start = 0;
	node[0].length = 2;

	node[1].start = 2;
	node[1].length = 1;

	node[2].start = 3;
	node[2].length = 1;

	node[3].start = 4;
	node[3].length = 1;

	node[4].start = 5;
	node[4].length = 0;

	edges[0] = 1;
	edges[1] = 2;	
	edges[2] = 4;
	edges[3] = 3;
	edges[4] = 4;

	bool frontier[NUM_NODES] = { false };
	bool visited[NUM_NODES] = { false };
	int cost[NUM_NODES] = { 0 };

	int source = 0;
	frontier[source] = true;

	Node* Va;
	cudaMalloc((void**)&Va, sizeof(Node)*NUM_NODES);
	cudaMemcpy(Va, node, sizeof(Node)*NUM_NODES, cudaMemcpyHostToDevice);

	int* Ea;
	cudaMalloc((void**)&Ea, sizeof(Node)*NUM_NODES);
	cudaMemcpy(Ea, edges, sizeof(Node)*NUM_NODES, cudaMemcpyHostToDevice);

	bool* Fa;
	cudaMalloc((void**)&Fa, sizeof(bool)*NUM_NODES);
	cudaMemcpy(Fa, frontier, sizeof(bool)*NUM_NODES, cudaMemcpyHostToDevice);

	bool* Xa;
	cudaMalloc((void**)&Xa, sizeof(bool)*NUM_NODES);
	cudaMemcpy(Xa, visited, sizeof(bool)*NUM_NODES, cudaMemcpyHostToDevice);

	int* Ca;
	cudaMalloc((void**)&Ca, sizeof(int)*NUM_NODES);
	cudaMemcpy(Ca, cost, sizeof(int)*NUM_NODES, cudaMemcpyHostToDevice);

	

	int num_blks = 1;
	int threads = 5;



	bool done;
	bool* d_done;
	cudaMalloc((void**)&d_done, sizeof(bool));
	printf("\n\n");
	int count = 0;

	printf("Order: \n\n");
	do {
		count++;
		done = true;
		cudaMemcpy(d_done, &done, sizeof(bool), cudaMemcpyHostToDevice);
		CUDA_BFS_KERNEL <<<num_blks, threads >>>(Va, Ea, Fa, Xa, Ca,d_done);
		cudaMemcpy(&done, d_done , sizeof(bool), cudaMemcpyDeviceToHost);

	} while (!done);




	cudaMemcpy(cost, Ca, sizeof(int)*NUM_NODES, cudaMemcpyDeviceToHost);
	
	printf("Number of times the kernel is called : %d \n", count);


	printf("\nCost: ");
	for (int i = 0; i<NUM_NODES; i++)
		printf( "%d    ", cost[i]);
	printf("\n");
	_getch();
	
}

You mean it works for one block of 5 threads with 5 nodes?

That might not be a test case that accurately reflects the behavior when many threads and blocks are involved.
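
If you do scale it up, note that more than the graph size has to change: the launch configuration is hard-coded to one block of 5 threads, and the kernel reads Fa[id] without a bounds check. Something along these lines would be a minimal starting point (illustrative only, untested):

// In the kernel: guard threads beyond the node count before touching any array
if (id >= NUM_NODES)
    return;

// On the host: derive the launch configuration from the graph size
int threads = 128;
int num_blks = (NUM_NODES + threads - 1) / threads;
CUDA_BFS_KERNEL<<<num_blks, threads>>>(Va, Ea, Fa, Xa, Ca, d_done);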

Yeah, the graph it's using is very, very small. I will check this with a reasonably sized graph.