Hello.
In 11.6 cudaDeviceSynchronize was deprecated. In the CUDA 12 New Features, graphs were presented for dynamic parallelism. I am not sure how graphs work as I am new to the concept.
I was hoping to parallelize a depth first search algorithm using dynamic parallelism like so::
// this is non working code , just wanted to get the idea across
#include <iostream>
#include <cuda_runtime.h>
__global__ void processData(int* data, int *results, int itemtoprocess)
{
}
__global__ void rundfs(int *itemstosearch, int *data, int **placetostore, int *size_of_place_to_store, int tid)
{
int itemtosearch = tid;
// Process data...
int *newArr = new int[100];
process<<<block,thread>>>(data, newArr,itemtosearch);
if (/*conditions met from processed data*/)
{
int **newplacetostore;
int *newsize_of_place_to_store;
int newtid = tid + 1;
// Allocate memory for new placetostore and size_of_place_to_store
int **newplacetostore = new *int[1000];
//cudaMallocManaged(&newplacetostore, sizeof(int *));
int *newsizeofplacetostore = new int[100]; //cudaMallocManaged(&newsize_of_place_to_store, sizeof(int));
rundfs<<blocks, threads>>>(itemstosearch, data, newplacetostore, newsize_of_place_to_store, newtid);
cudaDeviceSynchronize();
*size_of_place_to_store += *newsize_of_place_to_store;
// Free the allocated memory
cudaFree(newplacetostore);
cudaFree(newsize_of_place_to_store);
}
// Store the results
placetostore[tid] = /*store here*/;
}
int main()
{
int *itemstosearch;
int *data;
int **placetostore;
int *size_of_place_to_store;
// Allocate and initialize memory for itemstosearch, data, placetostore, and size_of_place_to_store
// ...
rundfs<<<blocks, threads>>>(itemstosearch, data, placetostore, size_of_place_to_store, 0);
cudaDeviceSynchronize();
// Process the results and clean up the memory
// ...
// copy results back and print them
return 0;
}
How would I go on about doing this since I cannot call cudaDeviceSynchronize in the global code?