Optimizing Multiple BFS Calls in cuGraph for Better Performance in Multithreaded Applications

I am currently working on a CUDA and cuGraph-based application where I need to perform multiple BFS (Breadth-First Search) operations in a multithreaded environment. The application experiences bottlenecks, particularly with frequent and concurrent BFS calls. Here is a simplified version of my current implementation:

auto upstream_mr = rmm::mr::get_current_device_resource();
std::size_t initial_pool_size = 4ULL * 1024 * 1024 * 1024; // 4 GB
std::size_t maximum_pool_size = 5ULL * 1024 * 1024 * 1024; // 5 GB
rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource> pool_mr(upstream_mr, initial_pool_size, maximum_pool_size);
rmm::mr::set_current_device_resource(&pool_mr);

raft::handle_t handle2;
rmm::cuda_stream_view stream_view2(handle2.get_stream());
GraphX::createSrcsDsts(graphRecovery1);
GraphX::constructRecoveryGraphInGPU(graphRecovery1, handle2, stream_view2);

fprintf(stderr, "CUGRAPH CREATED\n");
GraphX::createSourcesInGpu(sources1, stream_view2);
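
For reference, resources used inside the parallel region below is a raft handle that owns a CUDA stream pool; I have omitted its construction, but a minimal sketch (assuming one pool stream per worker thread, using rmm::cuda_stream_pool from <rmm/cuda_stream_pool.hpp>) would be:

// Sketch of the omitted setup: a handle with a stream pool sized to the number
// of OpenMP worker threads, so each thread can grab its own stream below.
auto stream_pool = std::make_shared<rmm::cuda_stream_pool>(min_thread);
raft::handle_t resources(rmm::cuda_stream_per_thread, stream_pool);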

#pragma omp parallel num_threads(min_thread)
{
    int threadNum = omp_get_thread_num();

    // One stream per thread from the shared stream pool, and a thread-local handle on it.
    rmm::cuda_stream_view stream_view = resources.get_stream_from_stream_pool(threadNum % min_thread);
    raft::handle_t local_handle(stream_view);

    // Per-thread scratch buffers, reused across all BFS calls made by this thread.
    rmm::device_uvector<int64_t> local_distances(graphRecovery1->size(), stream_view);
    rmm::device_uvector<int64_t> local_distances2(graphRecovery1->size(), stream_view);
    rmm::device_uvector<int64_t> local_predecessors(0, stream_view);
    rmm::device_uvector<int64_t> local_predecessors2(0, stream_view);

#pragma omp for schedule(dynamic) nowait

    for (int i = 0; i < v_edgeWeight.size(); i++)
    {
        const auto& v_edges = v_edgeWeight[i];
        for (long j = 0; j < v_edges.size(); j++)
        {
            long edgeWeight = 0;
            edge_t e0;
            e0 = v_edges[j].e_t;
            edgeWeight = v_edges[j].weight;
            vertex_t vv1_t = source(e0, (*G0));
            vertex_t vv2_t = target(e0, (*G0));

            if (vv1_t != vv2_t) {
                if (GraphX::isPathExistRecoveryIndexCuGraph((long)vertex_indices[vv1_t], (long)vertex_indices[vv2_t],
                                                            (long)vv1_t, (long)vv2_t,
                                                            local_handle, stream_view,
                                                            local_distances, local_distances2,
                                                            local_predecessors, local_predecessors2))
                {
                    valid1 = false;
                }
            }
        }
    }
}

In graphx.cpp:

// Take the handle by const reference; copying a raft::handle_t on every call is unnecessary overhead.
bool GraphX::isPathExistRecoveryIndexCuGraph(int64_t i, int64_t j, int64_t v1, int64_t v2,
                                             raft::handle_t const& local_handle, rmm::cuda_stream_view stream_view,
                                             rmm::device_uvector<int64_t>& local_distances, rmm::device_uvector<int64_t>& local_distances2,
                                             rmm::device_uvector<int64_t>& local_predecessors, rmm::device_uvector<int64_t>& local_predecessors2) {

    if (!GraphX::graphView) {
        std::cerr << "Graph is not constructed. Please call constructGraphInGPU first." << std::endl;
        return false;  // bail out instead of dereferencing a null graph view below
    }
    // BFS from the source at index i (i.e., from v1); distances land in local_distances.
    cugraph::bfs(
        local_handle,
        *GraphX::graphView,
        local_distances.data(),
        local_predecessors.data(),
        d_sources->begin() + i,
        1,
        false,
        std::numeric_limits<int64_t>::max(),
        false);

    // BFS from the source at index j (i.e., from v2); distances land in local_distances2.
    cugraph::bfs(
        local_handle,
        *GraphX::graphView,
        local_distances2.data(),
        local_predecessors2.data(),
        d_sources->begin() + j,
        1,
        false,
        std::numeric_limits<int64_t>::max(),
        false);
    int64_t distance_to_v2;
    int64_t distance_to_v1;
    raft::update_host(&distance_to_v2, local_distances.data() + v2, 1, stream_view);
    raft::update_host(&distance_to_v1, local_distances2.data() + v1, 1, stream_view);
    stream_view.synchronize();  // update_host copies asynchronously; wait before reading the values
    return distance_to_v2 != std::numeric_limits<int64_t>::max() ||
           distance_to_v1 != std::numeric_limits<int64_t>::max();
}

Performance bottleneck: the frequent launching of BFS seems to be the dominant cost, especially when the launches come from multiple threads concurrently.
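
One direction I am considering (just a sketch on my side, not something I have benchmarked) is to launch the second BFS only when the first one has not already established reachability, which avoids one of the two bfs() launches whenever v2 turns out to be reachable from v1:

// Sketch: short-circuit after the first BFS when it already answers the query.
cugraph::bfs(local_handle, *GraphX::graphView,
             local_distances.data(), local_predecessors.data(),
             d_sources->begin() + i, 1, false,
             std::numeric_limits<int64_t>::max(), false);

int64_t distance_to_v2;
raft::update_host(&distance_to_v2, local_distances.data() + v2, 1, stream_view);
stream_view.synchronize();
if (distance_to_v2 != std::numeric_limits<int64_t>::max()) { return true; }

cugraph::bfs(local_handle, *GraphX::graphView,
             local_distances2.data(), local_predecessors2.data(),
             d_sources->begin() + j, 1, false,
             std::numeric_limits<int64_t>::max(), false);

int64_t distance_to_v1;
raft::update_host(&distance_to_v1, local_distances2.data() + v1, 1, stream_view);
stream_view.synchronize();
return distance_to_v1 != std::numeric_limits<int64_t>::max();

Even with that change, each reachability query still costs at least one full BFS plus a host round trip, so I would like to know whether there is a better way to batch these queries or reduce the per-call launch overhead of cugraph::bfs.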

Any suggestions from the RAPIDS community on reducing the per-call overhead or better structuring these concurrent BFS calls would be appreciated.