cudaDeviceSynchronize() takes time

I’ve written a code in C++/Cuda. I think something is wrong with the cudaDeviceSynchronize(); that I’m using in my program. Can someone check the issue and help me in debugging it?

I’m very new to CUDA

It’s expected that cudaDeviceSynchronize(); “takes time”. It is waiting for the GPU to finish its work, such as kernel calls, that you have previously issued to it. A kernel launch in CUDA is asynchronous, it does not wait for the kernel to complete.

Will it be slower than the usual CPU performance? Because, it surpassed an hour for now and didn’t complete either. I forced stopped it. But the CPU performance ends in less than 3 minutes.

Is it because of the issue in my code?

It’s not possible for me to explain the behavior of code that you haven’t shown. However it is a common observation that the synchronizing step after a kernel launch seems to “absorb” the time taken by the CUDA kernel to execute. That is normal.

GPU performance might be slower than CPU performance. It might be faster. It depends on what you are doing. If I were writing a piece of code that spent an hour on a single kernel call and did not complete, I would suspect a coding defect on my part, for example inadvertently creating an endless loop. But I have no idea about your case.

This is my code


#include <iostream>
#include <vector>
#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdlib>
#include <ctime>
#include <cuda_runtime.h>

// Define the 3D point structure
struct Point3D {
    int x;
    int y;
    int z;
};

// Device function to check if a point is valid within the environment boundaries
__device__ bool isValidPoint(int x, int y, int z, int width, int height, int depth) {
    return (x >= 0 && x < width&& y >= 0 && y < height&& z >= 0 && z < depth);
}

// Device function to calculate the 1D index of a 3D point
int get1DIndex(int x, int y, int z, int width, int height) {
    return x + y * width + z * width * height;
}

// Device function to calculate the 1D index of a 3D point
__device__ int get1DIndexDevice(int x, int y, int z, int width, int height, int* openSetForward) {
    int index = x + y * width + z * width * height;
    int oldValue = atomicCAS(&openSetForward[0], -1, index);
    if (oldValue != -1) {
        return oldValue;
    }
    else {
        return index;
    }
}

// Device function to calculate the heuristic (Euclidean distance) between two points
__device__ float heuristic(int x1, int y1, int z1, int x2, int y2, int z2) {
    int dx = x2 - x1;
    int dy = y2 - y1;
    int dz = z2 - z1;
    return sqrtf(static_cast<float>(dx * dx + dy * dy + dz * dz));
}

// Device function to perform the late line of sight check
__device__ bool hasLineOfSight(const unsigned char* environment, int x1, int y1, int z1, int x2, int y2, int z2, int width, int height, int depth) {
    int dx = x2 - x1;
    int dy = y2 - y1;
    int dz = z2 - z1;
    int steps = max(max(abs(dx), abs(dy)), abs(dz));

    if (steps == 0) {
        return true;
    }

    float stepX = static_cast<float>(dx) / steps;
    float stepY = static_cast<float>(dy) / steps;
    float stepZ = static_cast<float>(dz) / steps;

    float x = static_cast<float>(x1);
    float y = static_cast<float>(y1);
    float z = static_cast<float>(z1);

    for (int i = 0; i <= steps; ++i) {
        // Explicitly cast width, height, and depth to int
        if (environment[static_cast<int>(x) + static_cast<int>(y) * width + static_cast<int>(z) * width * height] == 1) {
            return false;
        }
        x += stepX;
        y += stepY;
        z += stepZ;
    }

    return true;
}

// Device function to perform the parallel bidirectional A* search
__global__ void parallelBidirectionalAStar(const unsigned char* environment, Point3D* startNode, Point3D* goalNode, int width, int height, int depth, int* cameFromForward, int* cameFromBackward, int* gScoreForward, int* gScoreBackward, float* fScoreForward, float* fScoreBackward, int* openSetForward, int* openSetBackward, int* closedSet, int* pathFound) {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    // Check if the environment is valid
    if (!isValidPoint(startNode->x, startNode->y, startNode->z, width, height, depth)) {
        return;
    }
    if (!isValidPoint(goalNode->x, goalNode->y, goalNode->z, width, height, depth)) {
        return;
    }

    // Initialize the open and closed sets for both directions
    if (index == 0) {
        openSetForward[0] = get1DIndexDevice(startNode->x, startNode->y, startNode->z, width, height, openSetForward);
        openSetBackward[0] = get1DIndexDevice(goalNode->x, goalNode->y, goalNode->z, width, height, openSetForward);
    }

    __syncthreads();

    while (true) {
        __syncthreads();

        // Check if any thread found the path or the search is exhausted
        if (pathFound[0] != 0 || (openSetForward[0] == -1 && openSetBackward[0] == -1)) {
            return;
        }

        __syncthreads();

        // Choose the node with the lowest fScore from the forward search
        int currentForward = -1;
        float lowestFScoreForward = INFINITY;
        for (int i = index; i < width * height * depth; i += stride) {
            if (openSetForward[0] == -1) {
                break;
            }
            if (fScoreForward[i] < lowestFScoreForward && closedSet[i] == 0) {
                lowestFScoreForward = fScoreForward[i];
                currentForward = i;
            }
        }

        __syncthreads();

        // Choose the node with the lowest fScore from the backward search
        int currentBackward = -1;
        float lowestFScoreBackward = INFINITY;
        for (int i = index; i < width * height * depth; i += stride) {
            if (openSetBackward[0] == -1) {
                break;
            }
            if (fScoreBackward[i] < lowestFScoreBackward && closedSet[i] == 0) {
                lowestFScoreBackward = fScoreBackward[i];
                currentBackward = i;
            }
        }

        __syncthreads();

        // Check if the current nodes meet
        if (currentForward != -1 && currentBackward != -1 && currentForward == currentBackward) {
            if (index == 0) {
                pathFound[0] = 1;
            }
            return;
        }

        // Process the forward node
        if (currentForward != -1) {
            if (index == 0) {
                openSetForward[0] = -1;
                closedSet[currentForward] = 1;
            }

            __syncthreads();

            int x = currentForward % width;
            int y = (currentForward / width) % height;
            int z = currentForward / (width * height);

            for (int dx = -1; dx <= 1; ++dx) {
                for (int dy = -1; dy <= 1; ++dy) {
                    for (int dz = -1; dz <= 1; ++dz) {
                        int newX = x + dx;
                        int newY = y + dy;
                        int newZ = z + dz;

                        if (isValidPoint(newX, newY, newZ, width, height, depth)) {
                            int neighborNode = get1DIndexDevice(newX, newY, newZ, width, height, openSetForward);
                            if (closedSet[neighborNode] == 0 && hasLineOfSight(environment, x, y, z, newX, newY, newZ, width, height, depth)) {
                                int tentativeGScore = gScoreForward[currentForward] + 1;
                                if (tentativeGScore < gScoreForward[neighborNode]) {
                                    cameFromForward[neighborNode] = currentForward;
                                    gScoreForward[neighborNode] = tentativeGScore;
                                    fScoreForward[neighborNode] = gScoreForward[neighborNode] + heuristic(newX, newY, newZ, goalNode->x, goalNode->y, goalNode->z);
                                    if (atomicCAS(&openSetForward[0], -1, neighborNode) == -1) {
                                        closedSet[neighborNode] = 0;
                                    }
                                }
                            }
                        }
                    }

                    __syncthreads();

                    // Process the backward node
                    if (currentBackward != -1) {
                        if (index == 0) {
                            openSetBackward[0] = -1;
                            closedSet[currentBackward] = 1;
                        }

                        __syncthreads();

                        int x = currentBackward % width;
                        int y = (currentBackward / width) % height;
                        int z = currentBackward / (width * height);

                        for (int dx = -1; dx <= 1; ++dx) {
                            for (int dy = -1; dy <= 1; ++dy) {
                                for (int dz = -1; dz <= 1; ++dz) {
                                    int newX = x + dx;
                                    int newY = y + dy;
                                    int newZ = z + dz;

                                    if (isValidPoint(newX, newY, newZ, width, height, depth)) {
                                        int neighborNode = get1DIndexDevice(newX, newY, newZ, width, height, openSetForward);
                                        if (closedSet[neighborNode] == 0 && hasLineOfSight(environment, x, y, z, newX, newY, newZ, width, height, depth)) {
                                            int tentativeGScore = gScoreBackward[currentBackward] + 1;
                                            if (tentativeGScore < gScoreBackward[neighborNode]) {
                                                cameFromBackward[neighborNode] = currentBackward;
                                                gScoreBackward[neighborNode] = tentativeGScore;
                                                fScoreBackward[neighborNode] = gScoreBackward[neighborNode] + heuristic(newX, newY, newZ, startNode->x, startNode->y, startNode->z);
                                                if (atomicCAS(&openSetBackward[0], -1, neighborNode) == -1) {
                                                    closedSet[neighborNode] = 0;
                                                }
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }
}

// Function to get a random valid point within the environment
Point3D getRandomPoint(int width, int height, int depth, const std::vector<unsigned char>& environment) {
    while (true) {
        int x = std::rand() % width;
        int y = std::rand() % height;
        int z = std::rand() % depth;
        if (environment[get1DIndex(x, y, z, width, height)] == 0) {
            Point3D point{ x, y, z };
            return point;
        }
    }
}



int main() {
    // Set the random seed
    std::srand(static_cast<unsigned>(std::time(nullptr)));
    std::cout << "Program starts" << std::endl;
    // Define the size of the environment
    int width = 10;
    int height = 10;
    int depth = 10;

    // Generate a random 3D environment
    std::vector<unsigned char> environment(width * height * depth);
    for (int i = 0; i < width * height * depth; ++i) {
        environment[i] = std::rand() % 3 == 0 ? 1 : 0;  // 1/3 probability of an obstacle
    }

    // Generate random start and goal points
    // Point3D startNode{ std::rand() % width, std::rand() % height, std::rand() % depth };
    // Point3D goalNode{ std::rand() % width, std::rand() % height, std::rand() % depth };

    // Generate random start and goal points until valid ones are found
    // Generate random start and goal points
    Point3D startNode = getRandomPoint(width, height, depth, environment);
    Point3D goalNode = getRandomPoint(width, height, depth, environment);


    // Define the device memory pointers
    unsigned char* devEnvironment;
    Point3D* devStartNode;
    Point3D* devGoalNode;
    int* devCameFromForward;
    int* devCameFromBackward;
    int* devGScoreForward;
    int* devGScoreBackward;
    float* devFScoreForward;
    float* devFScoreBackward;
    int* devOpenSetForward;
    int* devOpenSetBackward;
    int* devClosedSet;
    int* devPathFound;

    // Allocate memory on the device
    cudaMalloc((void**)&devEnvironment, width * height * depth * sizeof(unsigned char));
    cudaMalloc((void**)&devStartNode, sizeof(Point3D));
    cudaMalloc((void**)&devGoalNode, sizeof(Point3D));
    cudaMalloc((void**)&devCameFromForward, width * height * depth * sizeof(int));
    cudaMalloc((void**)&devCameFromBackward, width * height * depth * sizeof(int));
    cudaMalloc((void**)&devGScoreForward, width * height * depth * sizeof(int));
    cudaMalloc((void**)&devGScoreBackward, width * height * depth * sizeof(int));
    cudaMalloc((void**)&devFScoreForward, width * height * depth * sizeof(float));
    cudaMalloc((void**)&devFScoreBackward, width * height * depth * sizeof(float));
    cudaMalloc((void**)&devOpenSetForward, sizeof(int));
    cudaMalloc((void**)&devOpenSetBackward, sizeof(int));
    cudaMalloc((void**)&devClosedSet, width * height * depth * sizeof(int));
    cudaMalloc((void**)&devPathFound, sizeof(int));

    // Copy data from host to device
    cudaMemcpy(devEnvironment, environment.data(), width * height * depth * sizeof(unsigned char), cudaMemcpyHostToDevice);
    cudaMemcpy(devStartNode, &startNode, sizeof(Point3D), cudaMemcpyHostToDevice);
    cudaMemcpy(devGoalNode, &goalNode, sizeof(Point3D), cudaMemcpyHostToDevice);

    // Initialize device variables
    cudaMemset(devCameFromForward, -1, width * height * depth * sizeof(int));
    cudaMemset(devCameFromBackward, -1, width * height * depth * sizeof(int));
    cudaMemset(devGScoreForward, INT_MAX, width * height * depth * sizeof(int));
    cudaMemset(devGScoreBackward, INT_MAX, width * height * depth * sizeof(int));
    // Initialize the temporary float variable to infinity
    float infinity = INFINITY;

    // Copy the infinity value to the device memory for devFScoreForward
    cudaMemcpy(devFScoreForward, &infinity, sizeof(float), cudaMemcpyHostToDevice);

    // Copy the infinity value to the device memory for devFScoreBackward
    cudaMemcpy(devFScoreBackward, &infinity, sizeof(float), cudaMemcpyHostToDevice);

    cudaMemset(devOpenSetForward, -1, sizeof(int));
    cudaMemset(devOpenSetBackward, -1, sizeof(int));
    cudaMemset(devClosedSet, 0, width * height * depth * sizeof(int));
    cudaMemset(devPathFound, 0, sizeof(int));

    // Launch the parallel bidirectional A* search on the device
    int blockSize = 256;
    int numBlocks = (width * height * depth + blockSize - 1) / blockSize;
    parallelBidirectionalAStar << <numBlocks, blockSize >> > (devEnvironment, devStartNode, devGoalNode, width, height, depth, devCameFromForward, devCameFromBackward, devGScoreForward, devGScoreBackward, devFScoreForward, devFScoreBackward, devOpenSetForward, devOpenSetBackward, devClosedSet, devPathFound);
    std::cout << "parallelBidirectionalAStar ends" << std::endl;
    // Wait for the GPU to finish
    cudaDeviceSynchronize();
    std::cout << "cudaDeviceSynchronize ends" << std::endl;
    // Copy the results back from the device to the host
    cudaMemcpy(&startNode, devStartNode, sizeof(Point3D), cudaMemcpyDeviceToHost);
    cudaMemcpy(&goalNode, devGoalNode, sizeof(Point3D), cudaMemcpyDeviceToHost);
    int pathFound;
    cudaMemcpy(&pathFound, devPathFound, sizeof(int), cudaMemcpyDeviceToHost);

    if (pathFound == 1) {
        // Reconstruct the path from the start node to the goal node
        int currentForward = get1DIndex(goalNode.x, goalNode.y, goalNode.z, width, height);
        std::vector<int> pathForward;
        while (currentForward != -1) {
            pathForward.push_back(currentForward);
            currentForward = devCameFromForward[currentForward];
        }
        std::reverse(pathForward.begin(), pathForward.end());

        int currentBackward = get1DIndex(startNode.x, startNode.y, startNode.z, width, height);
        std::vector<int> pathBackward;
        while (currentBackward != -1) {
            pathBackward.push_back(currentBackward);
            currentBackward = devCameFromBackward[currentBackward];
        }

        // Merge the two paths
        pathForward.insert(pathForward.end(), pathBackward.begin(), pathBackward.end());

        // Print the path
        std::cout << "Path Found!" << std::endl;
        std::cout << "Path Length: " << pathForward.size() << std::endl;
        std::cout << "Nodes Opened: " << std::count(environment.begin(), environment.end(), 1) << std::endl;
        std::cout << "Environment Details:" << std::endl;
        for (int z = 0; z < depth; ++z) {
            std::cout << "Layer " << z + 1 << std::endl;
            for (int y = 0; y < height; ++y) {
                for (int x = 0; x < width; ++x) {
                    std::cout << static_cast<int>(environment[get1DIndex(x, y, z, width, height)]) << " ";
                }
                std::cout << std::endl;
            }
            std::cout << std::endl;
        }
        std::cout << "Start Point: (" << startNode.x << ", " << startNode.y << ", " << startNode.z << ")" << std::endl;
        std::cout << "Goal Point: (" << goalNode.x << ", " << goalNode.y << ", " << goalNode.z << ")" << std::endl;
    }
    else {
        std::cout << "Path not found!" << std::endl;
    }

    // Free device memory
    cudaFree(devEnvironment);
    cudaFree(devStartNode);
    cudaFree(devGoalNode);
    cudaFree(devCameFromForward);
    cudaFree(devCameFromBackward);
    cudaFree(devGScoreForward);
    cudaFree(devGScoreBackward);
    cudaFree(devFScoreForward);
    cudaFree(devFScoreBackward);
    cudaFree(devOpenSetForward);
    cudaFree(devOpenSetBackward);
    cudaFree(devClosedSet);
    cudaFree(devPathFound);

    return 0;
}

when posting code on these forums, please format it correctly. A simple method to do that is as follows:

  • edit your post by clicking the pencil icon underneath it.
  • select the code
  • click the </> button at the top of the edit window
  • save your changes

Please do that now.

I also recommend using proper CUDA error checking (you can google for that) and also run your code with compute-sanitizer.

1 Like

Doesn’t it seem like that opens the possibility for an endless loop, if you’ve made any sort of mistake in the handling of the things to be tested in the body of the loop?

compute-sanitizer may not give a complete picture if your code is spending a large amount of time in a single place. You might want to use debugging techniques to identify where the kernel is spending its time.

It’s a bit chaotic, but based on your report of this spending over an hour, I would immediately try to ascertain whether the kernel is getting trapped in that while (true statement.

No it is not going to. Because it is going to get terminated when the path is found/not found.

It seems evident to me that some of your threads are not hitting a termination condition (path found or not found).

Here’s how I modified your code:

#include <iostream>
#include <vector>
#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdlib>
#include <ctime>
#include <cuda_runtime.h>
#include <cassert>
#include <cstdio>

// Define the 3D point structure
struct Point3D {
    int x;
    int y;
    int z;
};

// Device function to check if a point is valid within the environment boundaries
__device__ bool isValidPoint(int x, int y, int z, int width, int height, int depth) {
    return (x >= 0 && x < width&& y >= 0 && y < height&& z >= 0 && z < depth);
}

// Device function to calculate the 1D index of a 3D point
int get1DIndex(int x, int y, int z, int width, int height) {
    return x + y * width + z * width * height;
}

// Device function to calculate the 1D index of a 3D point
__device__ int get1DIndexDevice(int x, int y, int z, int width, int height, int* openSetForward) {
    int index = x + y * width + z * width * height;
    int oldValue = atomicCAS(&openSetForward[0], -1, index);
    if (oldValue != -1) {
        return oldValue;
    }
    else {
        return index;
    }
}

// Device function to calculate the heuristic (Euclidean distance) between two points
__device__ float heuristic(int x1, int y1, int z1, int x2, int y2, int z2) {
    int dx = x2 - x1;
    int dy = y2 - y1;
    int dz = z2 - z1;
    return sqrtf(static_cast<float>(dx * dx + dy * dy + dz * dz));
}

// Device function to perform the late line of sight check
__device__ bool hasLineOfSight(const unsigned char* environment, int x1, int y1, int z1, int x2, int y2, int z2, int width, int height, int depth) {
    int dx = x2 - x1;
    int dy = y2 - y1;
    int dz = z2 - z1;
    int steps = max(max(abs(dx), abs(dy)), abs(dz));

    if (steps == 0) {
        return true;
    }

    float stepX = static_cast<float>(dx) / steps;
    float stepY = static_cast<float>(dy) / steps;
    float stepZ = static_cast<float>(dz) / steps;

    float x = static_cast<float>(x1);
    float y = static_cast<float>(y1);
    float z = static_cast<float>(z1);

    for (int i = 0; i <= steps; ++i) {
        // Explicitly cast width, height, and depth to int
        if (environment[static_cast<int>(x) + static_cast<int>(y) * width + static_cast<int>(z) * width * height] == 1) {
            return false;
        }
        x += stepX;
        y += stepY;
        z += stepZ;
    }

    return true;
}

// Device function to perform the parallel bidirectional A* search
__global__ void parallelBidirectionalAStar(const unsigned char* environment, Point3D* startNode, Point3D* goalNode, int width, int height, int depth, int* cameFromForward, int* cameFromBackward, int* gScoreForward, int* gScoreBackward, float* fScoreForward, float* fScoreBackward, int* openSetForward, int* openSetBackward, int* closedSet, int* pathFound) {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    // Check if the environment is valid
    if (!isValidPoint(startNode->x, startNode->y, startNode->z, width, height, depth)) {
        return;
    }
    if (!isValidPoint(goalNode->x, goalNode->y, goalNode->z, width, height, depth)) {
        return;
    }

    // Initialize the open and closed sets for both directions
    if (index == 0) {
        openSetForward[0] = get1DIndexDevice(startNode->x, startNode->y, startNode->z, width, height, openSetForward);
        openSetBackward[0] = get1DIndexDevice(goalNode->x, goalNode->y, goalNode->z, width, height, openSetForward);
    }

    __syncthreads();

    while (true) {
    //    __syncthreads();

        // Check if any thread found the path or the search is exhausted
        if (pathFound[0] != 0 || (openSetForward[0] == -1 && openSetBackward[0] == -1)) {
        printf("1thread: %d returns\n", index);
        //assert(0);
                return;
        }

        __syncthreads();

        // Choose the node with the lowest fScore from the forward search
        int currentForward = -1;
        float lowestFScoreForward = INFINITY;
        for (int i = index; i < width * height * depth; i += stride) {
            if (openSetForward[0] == -1) {
                break;
            }
            if (fScoreForward[i] < lowestFScoreForward && closedSet[i] == 0) {
                lowestFScoreForward = fScoreForward[i];
                currentForward = i;
            }
        }

        __syncthreads();

        // Choose the node with the lowest fScore from the backward search
        int currentBackward = -1;
        float lowestFScoreBackward = INFINITY;
        for (int i = index; i < width * height * depth; i += stride) {
            if (openSetBackward[0] == -1) {
                break;
            }
            if (fScoreBackward[i] < lowestFScoreBackward && closedSet[i] == 0) {
                lowestFScoreBackward = fScoreBackward[i];
                currentBackward = i;
            }
        }

        __syncthreads();

        // Check if the current nodes meet
        if (currentForward != -1 && currentBackward != -1 && currentForward == currentBackward) {
            if (index == 0) {
                pathFound[0] = 1;
            }
        printf("%d\n", index);
        //assert(0);
                return;
        }

        // Process the forward node
        if (currentForward != -1) {
            if (index == 0) {
                openSetForward[0] = -1;
                closedSet[currentForward] = 1;
            }

            __syncthreads();

            int x = currentForward % width;
            int y = (currentForward / width) % height;
            int z = currentForward / (width * height);

            for (int dx = -1; dx <= 1; ++dx) {
                for (int dy = -1; dy <= 1; ++dy) {
                    for (int dz = -1; dz <= 1; ++dz) {
                        int newX = x + dx;
                        int newY = y + dy;
                        int newZ = z + dz;

                        if (isValidPoint(newX, newY, newZ, width, height, depth)) {
                            int neighborNode = get1DIndexDevice(newX, newY, newZ, width, height, openSetForward);
                            if (closedSet[neighborNode] == 0 && hasLineOfSight(environment, x, y, z, newX, newY, newZ, width, height, depth)) {
                                int tentativeGScore = gScoreForward[currentForward] + 1;
                                if (tentativeGScore < gScoreForward[neighborNode]) {
                                    cameFromForward[neighborNode] = currentForward;
                                    gScoreForward[neighborNode] = tentativeGScore;
                                    fScoreForward[neighborNode] = gScoreForward[neighborNode] + heuristic(newX, newY, newZ, goalNode->x, goalNode->y, goalNode->z);
                                    if (atomicCAS(&openSetForward[0], -1, neighborNode) == -1) {
                                        closedSet[neighborNode] = 0;
                                    }
                                }
                            }
                        }
                    }

                    __syncthreads();

                    // Process the backward node
                    if (currentBackward != -1) {
                        if (index == 0) {
                            openSetBackward[0] = -1;
                            closedSet[currentBackward] = 1;
                        }

                        __syncthreads();

                        int x = currentBackward % width;
                        int y = (currentBackward / width) % height;
                        int z = currentBackward / (width * height);

                        for (int dx = -1; dx <= 1; ++dx) {
                            for (int dy = -1; dy <= 1; ++dy) {
                                for (int dz = -1; dz <= 1; ++dz) {
                                    int newX = x + dx;
                                    int newY = y + dy;
                                    int newZ = z + dz;

                                    if (isValidPoint(newX, newY, newZ, width, height, depth)) {
                                        int neighborNode = get1DIndexDevice(newX, newY, newZ, width, height, openSetForward);
                                        if (closedSet[neighborNode] == 0 && hasLineOfSight(environment, x, y, z, newX, newY, newZ, width, height, depth)) {
                                            int tentativeGScore = gScoreBackward[currentBackward] + 1;
                                            if (tentativeGScore < gScoreBackward[neighborNode]) {
                                                cameFromBackward[neighborNode] = currentBackward;
                                                gScoreBackward[neighborNode] = tentativeGScore;
                                                fScoreBackward[neighborNode] = gScoreBackward[neighborNode] + heuristic(newX, newY, newZ, startNode->x, startNode->y, startNode->z);
                                                if (atomicCAS(&openSetBackward[0], -1, neighborNode) == -1) {
                                                    closedSet[neighborNode] = 0;
                                                }
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }
}

// Function to get a random valid point within the environment
Point3D getRandomPoint(int width, int height, int depth, const std::vector<unsigned char>& environment) {
    while (true) {
        int x = std::rand() % width;
        int y = std::rand() % height;
        int z = std::rand() % depth;
        if (environment[get1DIndex(x, y, z, width, height)] == 0) {
            Point3D point{ x, y, z };
            return point;
        }
    }
}



int main() {
    // Set the random seed
    std::srand(static_cast<unsigned>(std::time(nullptr)));
    std::cout << "Program starts" << std::endl;
    // Define the size of the environment
    int width = 10;
    int height = 10;
    int depth = 10;

    // Generate a random 3D environment
    std::vector<unsigned char> environment(width * height * depth);
    for (int i = 0; i < width * height * depth; ++i) {
        environment[i] = std::rand() % 3 == 0 ? 1 : 0;  // 1/3 probability of an obstacle
    }

    // Generate random start and goal points
    // Point3D startNode{ std::rand() % width, std::rand() % height, std::rand() % depth };
    // Point3D goalNode{ std::rand() % width, std::rand() % height, std::rand() % depth };

    // Generate random start and goal points until valid ones are found
    // Generate random start and goal points
    Point3D startNode = getRandomPoint(width, height, depth, environment);
    Point3D goalNode = getRandomPoint(width, height, depth, environment);


    // Define the device memory pointers
    unsigned char* devEnvironment;
    Point3D* devStartNode;
    Point3D* devGoalNode;
    int* devCameFromForward;
    int* devCameFromBackward;
    int* devGScoreForward;
    int* devGScoreBackward;
    float* devFScoreForward;
    float* devFScoreBackward;
    int* devOpenSetForward;
    int* devOpenSetBackward;
    int* devClosedSet;
    int* devPathFound;

    // Allocate memory on the device
    cudaMalloc((void**)&devEnvironment, width * height * depth * sizeof(unsigned char));
    cudaMalloc((void**)&devStartNode, sizeof(Point3D));
    cudaMalloc((void**)&devGoalNode, sizeof(Point3D));
    cudaMalloc((void**)&devCameFromForward, width * height * depth * sizeof(int));
    cudaMalloc((void**)&devCameFromBackward, width * height * depth * sizeof(int));
    cudaMalloc((void**)&devGScoreForward, width * height * depth * sizeof(int));
    cudaMalloc((void**)&devGScoreBackward, width * height * depth * sizeof(int));
    cudaMalloc((void**)&devFScoreForward, width * height * depth * sizeof(float));
    cudaMalloc((void**)&devFScoreBackward, width * height * depth * sizeof(float));
    cudaMalloc((void**)&devOpenSetForward, sizeof(int));
    cudaMalloc((void**)&devOpenSetBackward, sizeof(int));
    cudaMalloc((void**)&devClosedSet, width * height * depth * sizeof(int));
    cudaMalloc((void**)&devPathFound, sizeof(int));

    // Copy data from host to device
    cudaMemcpy(devEnvironment, environment.data(), width * height * depth * sizeof(unsigned char), cudaMemcpyHostToDevice);
    cudaMemcpy(devStartNode, &startNode, sizeof(Point3D), cudaMemcpyHostToDevice);
    cudaMemcpy(devGoalNode, &goalNode, sizeof(Point3D), cudaMemcpyHostToDevice);

    // Initialize device variables
    cudaMemset(devCameFromForward, -1, width * height * depth * sizeof(int));
    cudaMemset(devCameFromBackward, -1, width * height * depth * sizeof(int));
    cudaMemset(devGScoreForward, INT_MAX, width * height * depth * sizeof(int));
    cudaMemset(devGScoreBackward, INT_MAX, width * height * depth * sizeof(int));
    // Initialize the temporary float variable to infinity
    float infinity = INFINITY;

    // Copy the infinity value to the device memory for devFScoreForward
    cudaMemcpy(devFScoreForward, &infinity, sizeof(float), cudaMemcpyHostToDevice);

    // Copy the infinity value to the device memory for devFScoreBackward
    cudaMemcpy(devFScoreBackward, &infinity, sizeof(float), cudaMemcpyHostToDevice);

    cudaMemset(devOpenSetForward, -1, sizeof(int));
    cudaMemset(devOpenSetBackward, -1, sizeof(int));
    cudaMemset(devClosedSet, 0, width * height * depth * sizeof(int));
    cudaMemset(devPathFound, 0, sizeof(int));

    // Launch the parallel bidirectional A* search on the device
    int blockSize = 256;
    int numBlocks = (width * height * depth + blockSize - 1) / blockSize;
    std::cout << "launching " << blockSize*numBlocks << " threads" << std::endl;
    parallelBidirectionalAStar << <numBlocks, blockSize >> > (devEnvironment, devStartNode, devGoalNode, width, height, depth, devCameFromForward, devCameFromBackward, devGScoreForward, devGScoreBackward, devFScoreForward, devFScoreBackward, devOpenSetForward, devOpenSetBackward, devClosedSet, devPathFound);
    std::cout << "parallelBidirectionalAStar ends" << std::endl;
    // Wait for the GPU to finish
    cudaDeviceSynchronize();
    std::cout << "cudaDeviceSynchronize ends" << std::endl;
    // Copy the results back from the device to the host
    cudaMemcpy(&startNode, devStartNode, sizeof(Point3D), cudaMemcpyDeviceToHost);
    cudaMemcpy(&goalNode, devGoalNode, sizeof(Point3D), cudaMemcpyDeviceToHost);
    int pathFound;
    cudaMemcpy(&pathFound, devPathFound, sizeof(int), cudaMemcpyDeviceToHost);

    if (pathFound == 1) {
        // Reconstruct the path from the start node to the goal node
        int currentForward = get1DIndex(goalNode.x, goalNode.y, goalNode.z, width, height);
        std::vector<int> pathForward;
        while (currentForward != -1) {
            pathForward.push_back(currentForward);
            currentForward = devCameFromForward[currentForward];
        }
        std::reverse(pathForward.begin(), pathForward.end());

        int currentBackward = get1DIndex(startNode.x, startNode.y, startNode.z, width, height);
        std::vector<int> pathBackward;
        while (currentBackward != -1) {
            pathBackward.push_back(currentBackward);
            currentBackward = devCameFromBackward[currentBackward];
        }

        // Merge the two paths
        pathForward.insert(pathForward.end(), pathBackward.begin(), pathBackward.end());

        // Print the path
        std::cout << "Path Found!" << std::endl;
        std::cout << "Path Length: " << pathForward.size() << std::endl;
        std::cout << "Nodes Opened: " << std::count(environment.begin(), environment.end(), 1) << std::endl;
        std::cout << "Environment Details:" << std::endl;
        for (int z = 0; z < depth; ++z) {
            std::cout << "Layer " << z + 1 << std::endl;
            for (int y = 0; y < height; ++y) {
                for (int x = 0; x < width; ++x) {
                    std::cout << static_cast<int>(environment[get1DIndex(x, y, z, width, height)]) << " ";
                }
                std::cout << std::endl;
            }
            std::cout << std::endl;
        }
        std::cout << "Start Point: (" << startNode.x << ", " << startNode.y << ", " << startNode.z << ")" << std::endl;
        std::cout << "Goal Point: (" << goalNode.x << ", " << goalNode.y << ", " << goalNode.z << ")" << std::endl;
    }
    else {
        std::cout << "Path not found!" << std::endl;
    }

    // Free device memory
    cudaFree(devEnvironment);
    cudaFree(devStartNode);
    cudaFree(devGoalNode);
    cudaFree(devCameFromForward);
    cudaFree(devCameFromBackward);
    cudaFree(devGScoreForward);
    cudaFree(devGScoreBackward);
    cudaFree(devFScoreForward);
    cudaFree(devFScoreBackward);
    cudaFree(devOpenSetForward);
    cudaFree(devOpenSetBackward);
    cudaFree(devClosedSet);
    cudaFree(devPathFound);

    return 0;
}

When I compile and run that code after about 10 seconds, I get output like this:

$ ./t9
Program starts
launching 1024 threads
parallelBidirectionalAStar ends
992
993
994
995
996
997
998
999
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607

After that, I can wait 10 minutes or more and observe no additional output. If that were my code, I would pick a thread that is not in the list above, and start trying to find out why it never hits a terminating condition. You can use in-kernel printf() the way I have demonstrated.

Or use the debugging techniques I already linked. You have a logic error in your code that is preventing some threads from discovering a termination condition.

I will also point out that you seem to be making extensive use of __syncthreads(); I’m not sure why. It is worthwhile to note that use of syncthreads is only considered legal in conditional code if the condition evaluates the same across all threads in the block. If you have liberally scattered syncthreads through your code as a debugging technique, that may be a bad idea.

FWIW I commented out all the usage of __syncthreads(); in your code, and my observations did not change, so I don’t think the usage of __syncthreads() is the crux of the issue. I think the crux of the issue is that some of your threads are not discovering a terminating condition. I’ve indicated which threads I think those are.

Thanks. I’ve made some changes and this code works. I mean no issue in completion. But the problem now is, it is not finding the path. means, it returns no path even when i remove all the obstacles

My guess would be that would be an even easier debug scenario. You know which threads should find a path. Chase the data through your calculation, and see where it is falling apart.

I’m trying all the possibilities but still it is giving wrong answers. As of now what i see is, the two arrays that have the path details are set wrongly during the execution of parallel Astar algorithm.

#include <iostream>
#include <vector>
#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdlib>
#include <ctime>
#include <cuda_runtime.h>
#include <cassert>
#include <cstdio>
#include <stdio.h>
#include <stack>
// Include the math library for INFINITY constant
#include <math.h>
#include <cuda.h>

// Define the 3D point structure
struct Point3D {
    int x;
    int y;
    int z;
};

// Device function to check if a point is valid within the environment boundaries
__device__ bool isValidPoint(int x, int y, int z, int width, int height, int depth) {
    return (x >= 0 && x < width&& y >= 0 && y < height&& z >= 0 && z < depth);
}

// Host function to check if a point is valid within the environment boundaries
bool isValidPointHost(int x, int y, int z, int width, int height, int depth) {
    return (x >= 0 && x < width&& y >= 0 && y < height&& z >= 0 && z < depth);
}


// Device function to calculate the 1D index of a 3D point
int get1DIndex(int x, int y, int z, int width, int height) {
    return x + y * width + z * width * height;
}

// Device function to calculate the 1D index of a 3D point
__device__ int get1DIndexV2(int x, int y, int z, int width, int height) {
    return x + y * width + z * width * height;
}

// Device function to calculate the 1D index of a 3D point
__device__ int get1DIndexDevice(int x, int y, int z, int width, int height, int openSetForward) {
    int index = x + y * width + z * width * height;
    int oldValue = atomicCAS(&openSetForward, -1, index);
    if (oldValue != -1) {
        return oldValue;
    }
    else {
        return index;
    }
}

// Device function to calculate the heuristic (Euclidean distance) between two points
/*__device__ float heuristic(int x1, int y1, int z1, int x2, int y2, int z2) {
    int dx = x2 - x1;
    int dy = y2 - y1;
    int dz = z2 - z1;
    int steps = max(max(abs(dx), abs(dy)), abs(dz));
    float manhattanDistance = abs(dx) + abs(dy) + abs(dz);
    float euclideanDistance = sqrtf(static_cast<float>(dx * dx + dy * dy + dz * dz));
    float diagonalPenalty = 0.5f;
    return manhattanDistance + euclideanDistance * diagonalPenalty + (steps - 1);
}*/

// Device function to calculate the heuristic (Euclidean distance) between two points
__device__ float heuristic(int x, int y, int z, int goalX, int goalY, int goalZ) {
    float dx = x - goalX;
    float dy = y - goalY;
    float dz = z - goalZ;
    float octileDistance = 0.5 * (abs(dx) + abs(dy) + abs(dz));
    if (dx > 0 && dy > 0) {
        octileDistance += 0.5;
    }
    else if (dx < 0 && dy < 0) {
        octileDistance += 0.5;
    }
    return octileDistance;
}

/*__device__ float heuristic(int x, int y, int z, int goalX, int goalY, int goalZ) {
    return abs(x - goalX) + abs(y - goalY) + abs(z - goalZ);
}*/

// Device function to perform the late line of sight check
/*__device__ bool hasLineOfSight(const unsigned char* devEnvironment, int x1, int y1, int z1, int x2, int y2, int z2, int width, int height, int depth) {
    int dx = x2 - x1;
    int dy = y2 - y1;
    int dz = z2 - z1;
    int steps = max(max(abs(dx), abs(dy)), abs(dz));

    if (steps == 0) {
        return true;
    }

    float stepX = static_cast<float>(dx) / steps;
    float stepY = static_cast<float>(dy) / steps;
    float stepZ = static_cast<float>(dz) / steps;

    float x = static_cast<float>(x1);
    float y = static_cast<float>(y1);
    float z = static_cast<float>(z1);

    for (int i = 0; i <= steps; ++i) {
        // Explicitly cast width, height, and depth to int
        if (!isValidPoint(static_cast<int>(x), static_cast<int>(y), static_cast<int>(z), width, height, depth)) {
            return false;
        }
        if (devEnvironment[static_cast<int>(x) + static_cast<int>(y) * width + static_cast<int>(z) * width * height] == 1) {
            return false;
        }
        x += stepX;
        y += stepY;
        z += stepZ;
    }

    return true;
}*/
// Device function to perform the line of sight check
// Device function to perform the line of sight check
__device__ bool hasLineOfSight(const unsigned char* devEnvironment, int x1, int y1, int z1, int x2, int y2, int z2, int width, int height, int depth) {
    int dx = abs(x2 - x1);
    int dy = abs(y2 - y1);
    int dz = abs(z2 - z1);
    int steps = max(max(dx, dy), dz);

    if (steps == 0) {
        return true;
    }

    int incX = (x2 > x1) ? 1 : ((x2 < x1) ? -1 : 0);
    int incY = (y2 > y1) ? 1 : ((y2 < y1) ? -1 : 0);
    int incZ = (z2 > z1) ? 1 : ((z2 < z1) ? -1 : 0);

    int x = x1;
    int y = y1;
    int z = z1;

    for (int i = 0; i <= steps; ++i) {
        // Check if the current position is a valid point within the environment
        if (!isValidPoint(x, y, z, width, height, depth)) {
            return false; // The position is out of bounds
        }

        if (devEnvironment[get1DIndexV2(x, y, z, width, height)] == 1) {
            return false; // There is an obstacle at this position
        }

        int e1 = 2 * dy - dx;
        int e2 = 2 * dz - dx;
        if (e1 > -dx) {
            x += incX;
            if (e2 > -dx) {
                z += incZ;
                e2 -= 2 * dx;
            }
            e1 -= 2 * dx;
        }
        if (e1 < (dy - dx)) {
            y += incY;
            if (e2 < (dz - dx)) {
                z += incZ;
                e2 += 2 * dx;
            }
            e1 += 2 * dy;
        }
    }

    return true; // The line of sight is clear
}


// Helper function to set a value in the open set atomically
__device__ void setOpenSetValue(int* devOpenSet, int index, int value) {
    atomicCAS(&devOpenSet[index], -1, value);
    //atomicCAS(&devCameFromForward[index], -1, parent);
    //atomicCAS(&devCameFromBackward[index], -1, parent);
}
/*void setOpenSetValue(int* devOpenSet, int node, int parent) {
    int index = node % width * height * depth;
    devOpenSet[index] = parent;
}*/

// Helper function to set a node as closed atomically
__device__ void setClosedSet(int* devClosedSet, int index) {
    atomicExch(&devClosedSet[index], -2);
}

//parallelBidirectionalAStar << <numBlocks, blockSize >> > (devEnvironment, devStartNode, devGoalNode, width, height, depth, devCameFromForward, devCameFromBackward, devGScoreForward, devGScoreBackward, devFScoreForward, devFScoreBackward, devOpenSetForward, devOpenSetBackward, devClosedSet, devPathFound, lambda);
__global__ void parallelBidirectionalAStar(const unsigned char* devEnvironment, const Point3D* devStartNode, const Point3D* devGoalNode, int width, int height, int depth, int* devCameFromForward, int* devCameFromBackward, int* devGScoreForward, int* devGScoreBackward, float* devFScoreForward, float* devFScoreBackward, int* devOpenSetForward, int* devOpenSetBackward, int* devClosedSet, int* devPathFound, float lambda) {
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    int stride = blockDim.x * gridDim.x;
    __shared__ int pathMeeting;

    if (index == 0) {
        pathMeeting = 0;
    }
    __syncthreads();

    int localOpenSetForward = devOpenSetForward[0];
    int localOpenSetBackward = devOpenSetBackward[0];

    while (pathMeeting == 0 || (localOpenSetForward != -1 && localOpenSetBackward != -1)) {
        // Check if any thread found the path or the search is exhausted
        if (pathMeeting != 0) {
            //printf("pathMeeting != 0 || (localOpenSetForward == -1 && localOpenSetBackward == -1)");
            atomicExch(devPathFound, pathMeeting);
            /*if (index == 0) {
                atomicExch(devPathFound, pathMeeting);
            }*/
            return;
        }

        __syncthreads();

        // Choose the node with the lowest fScore from the forward search
        int currentForward = -1;
        float lowestFScoreForward = INFINITY;
        for (int i = localOpenSetForward + index; i < width * height * depth; i += stride) {
            if (i >= 0 && i < width * height * depth && devFScoreForward[i] < lowestFScoreForward && devClosedSet[i] == 0) {
                lowestFScoreForward = devFScoreForward[i];
                currentForward = i;
            }
        }

        __syncthreads(); 

        // Process the forward node
        if (currentForward != -1) {
            if (index == 0) {
                localOpenSetForward = -1;
                devClosedSet[currentForward] = 1;
            }

            __syncthreads();

            int x = currentForward % width;
            int y = (currentForward / width) % height;
            int z = currentForward / (width * height);

            for (int dx = -1; dx <= 1; ++dx) {
                for (int dy = -1; dy <= 1; ++dy) {
                    for (int dz = -1; dz <= 1; ++dz) {
                        int newX = x + dx;
                        int newY = y + dy;
                        int newZ = z + dz;

                        if (isValidPoint(newX, newY, newZ, width, height, depth)) {
                            int neighborNode = get1DIndexDevice(newX, newY, newZ, width, height, localOpenSetForward);
                            if (devClosedSet[neighborNode] == 0 && hasLineOfSight(devEnvironment, x, y, z, newX, newY, newZ, width, height, depth)) {
                                int tentativeGScore = devGScoreForward[currentForward] + 1;
                                int oldCameFrom = devCameFromForward[neighborNode];
                                int newCameFrom = currentForward; // Initialize newCameFrom

                                while (true) {
                                    int prevCameFrom = atomicExch(&devCameFromForward[neighborNode], newCameFrom);
                                    if (prevCameFrom == oldCameFrom) {
                                        // Successfully updated devCameFromForward[neighborNode], now update other arrays atomically
                                        devGScoreForward[neighborNode] = tentativeGScore;

                                        // Calculate the fScore of the neighbor node
                                        devFScoreForward[neighborNode] = devGScoreForward[neighborNode] + lambda * heuristic(newX, newY, newZ, devGoalNode->x, devGoalNode->y, devGoalNode->z);

                                        // Use the helper function to update the open set value atomically
                                        setOpenSetValue(devOpenSetForward, neighborNode, currentForward);

                                        // Set the node as closed
                                        setClosedSet(devClosedSet, neighborNode);

                                        break; // Exit the loop since the update was successful
                                    }
                                    else {
                                        // Another thread modified devCameFromForward[neighborNode] before this thread, revert the update and try again
                                        newCameFrom = prevCameFrom;
                                    }
                                }
                            }
                        }

                    }
                }
            }

            // Update the next forward node to be expanded globally
            int nextForward = -1;
            float lowestFScoreForward = INFINITY;
            for (int i = localOpenSetForward + index; i < width * height * depth; i += stride) {
                if (i >= 0 && i < width * height * depth && devFScoreForward[i] < lowestFScoreForward && devClosedSet[i] == 0) {
                    lowestFScoreForward = devFScoreForward[i];
                    nextForward = i;
                }
            }
            atomicMin(&devOpenSetForward[0], nextForward);

            // Set the node as closed
            setClosedSet(devClosedSet, currentForward);
        }
        __syncthreads();

        int currentBackward = -1;
        float lowestFScoreBackward = INFINITY;
        for (int i = localOpenSetBackward + index; i < width * height * depth; i += stride) {
            if (i >= 0 && i < width * height * depth && devFScoreBackward[i] < lowestFScoreBackward && devClosedSet[i] == 0) {
                lowestFScoreBackward = devFScoreBackward[i];
                currentBackward = i;
            }
        }

        // Process the backward node
        if (currentBackward != -1) {
            if (index == 0) {
                localOpenSetBackward = -1;
                devClosedSet[currentBackward] = 1;
            }

            __syncthreads();

            int x = currentBackward % width;
            int y = (currentBackward / width) % height;
            int z = currentBackward / (width * height);

            for (int dx = -1; dx <= 1; ++dx) {
                for (int dy = -1; dy <= 1; ++dy) {
                    for (int dz = -1; dz <= 1; ++dz) {
                        int newX = x + dx;
                        int newY = y + dy;
                        int newZ = z + dz;

                        if (isValidPoint(newX, newY, newZ, width, height, depth)) {
                            int neighborNode = get1DIndexDevice(newX, newY, newZ, width, height, localOpenSetBackward);
                            if (devClosedSet[neighborNode] == 0 && hasLineOfSight(devEnvironment, x, y, z, newX, newY, newZ, width, height, depth)) {
                                int tentativeGScore = devGScoreBackward[currentBackward] + 1;
                                int oldCameFrom = devCameFromBackward[neighborNode];
                                int newCameFrom = currentBackward; // Initialize newCameFrom

                                while (true) {
                                    int prevCameFrom = atomicExch(&devCameFromBackward[neighborNode], newCameFrom);
                                    if (prevCameFrom == oldCameFrom) {
                                        // Successfully updated devCameFromBackward[neighborNode], now update other arrays atomically
                                        devGScoreBackward[neighborNode] = tentativeGScore;

                                        devFScoreBackward[neighborNode] = devGScoreBackward[neighborNode] + lambda * heuristic(newX, newY, newZ, devStartNode->x, devStartNode->y, devStartNode->z);

                                        // Use the helper function to update the open set value atomically
                                        setOpenSetValue(devOpenSetBackward, neighborNode, currentBackward);

                                        // Set the node as closed
                                        setClosedSet(devClosedSet, neighborNode);

                                        break; // Exit the loop since the update was successful
                                    }
                                    else {
                                        // Another thread modified devCameFromBackward[neighborNode] before this thread, revert the update and try again
                                        newCameFrom = prevCameFrom;
                                    }
                                }
                            }
                        }

                    }
                }
            }

            // Update the next backward node to be expanded globally
            int nextBackward = -1;
            float lowestFScoreBackward = INFINITY;
            for (int i = localOpenSetBackward + index; i < width * height * depth; i += stride) {
                if (i >= 0 && i < width * height * depth && devFScoreBackward[i] < lowestFScoreBackward && devClosedSet[i] == 0) {
                    lowestFScoreBackward = devFScoreBackward[i];
                    nextBackward = i;
                }
            }
            atomicMin(&devOpenSetBackward[0], nextBackward);

            // Set the node as closed
            setClosedSet(devClosedSet, currentBackward);
        }
        __syncthreads();
    }
}







// Function to get a random valid point within the environment
Point3D getRandomPoint(int width, int height, int depth, const std::vector<unsigned char>& environment) {
    while (true) {
        int x = std::rand() % width;
        int y = std::rand() % height;
        int z = std::rand() % depth;
        if (environment[get1DIndex(x, y, z, width, height)] == 0) {
            Point3D point{ x, y, z };
            return point;
        }
    }
}


// Host function to calculate the heuristic (Euclidean distance) between two points
// Host function to calculate the heuristic (Euclidean distance) between two points
float heuristicGet(int x1, int y1, int z1, int x2, int y2, int z2, int width, int height, int depth, const unsigned char* environment) {
    int dx = x2 - x1;
    int dy = y2 - y1;
    int dz = z2 - z1;
    int steps = max(max(abs(dx), abs(dy)), abs(dz));

    if (steps == 0) {
        return 0.0f;
    }

    float stepX = static_cast<float>(dx) / steps;
    float stepY = static_cast<float>(dy) / steps;
    float stepZ = static_cast<float>(dz) / steps;

    float x = static_cast<float>(x1);
    float y = static_cast<float>(y1);
    float z = static_cast<float>(z1);

    for (int i = 0; i <= steps; ++i) {
        // Check if the current position is a valid point within the environment
        if (isValidPointHost(static_cast<int>(x), static_cast<int>(y), static_cast<int>(z), width, height, depth)) {
            if (environment[get1DIndex(static_cast<int>(x), static_cast<int>(y), static_cast<int>(z), width, height)] == 1) {
                return INFINITY; // Return infinity if there is an obstacle in the path
            }
        }
        else {
            return INFINITY; // Return infinity if the position is out of bounds
        }

        x += stepX;
        y += stepY;
        z += stepZ;
    }

    int ddx = x2 - static_cast<int>(x);
    int ddy = y2 - static_cast<int>(y);
    int ddz = z2 - static_cast<int>(z);
    int deltaSteps = max(max(abs(ddx), abs(ddy)), abs(ddz));

    // Add extra steps to consider a conservative estimate of the remaining distance
    steps += deltaSteps;

    return sqrtf(static_cast<float>(dx * dx + dy * dy + dz * dz)) + (steps - 1);
}


// Function to print the environment (assuming a 3D environment represented as a linear array)
void printEnvironment(const unsigned char* devEnvironment, int width, int height, int depth) {
    // Copy environment data from the device to the host
    std::vector<unsigned char> environment(width * height * depth);
    cudaMemcpy(environment.data(), devEnvironment, width * height * depth * sizeof(unsigned char), cudaMemcpyDeviceToHost);

    // Print the environment
    for (int z = 0; z < depth; ++z) {
        printf("Layer %d\n", z);
        for (int y = 0; y < height; ++y) {
            for (int x = 0; x < width; ++x) {
                int index = x + y * width + z * width * height;
                printf("%c ", environment[index] ? '#' : '.');
            }
            printf("\n");
        }
        printf("\n");
    }
}


int main() {
    // Set the random seed
    std::srand(static_cast<unsigned>(std::time(nullptr)));
    std::cout << "Program starts" << std::endl;
    // Define the size of the environment
    int width = 10;
    int height = 10;
    int depth = 10;

    // Generate a random 3D environment
    std::vector<unsigned char> environment(width * height * depth);
    for (int i = 0; i < width * height * depth; ++i) {
        environment[i] = std::rand() % 20 == 0 ? 1 : 0;  // 1/3 probability of an obstacle
    }

    // Generate random start and goal points
    // Point3D startNode{ std::rand() % width, std::rand() % height, std::rand() % depth };
    // Point3D goalNode{ std::rand() % width, std::rand() % height, std::rand() % depth };

    // Generate random start and goal points until valid ones are found
    // Generate random start and goal points
    Point3D startNode = getRandomPoint(width, height, depth, environment);
    Point3D goalNode = getRandomPoint(width, height, depth, environment);
    // Print the start and goal points
    std::cout << "Start Point: (" << startNode.x << ", " << startNode.y << ", " << startNode.z << ")" << std::endl;
    std::cout << "Goal Point: (" << goalNode.x << ", " << goalNode.y << ", " << goalNode.z << ")" << std::endl;


    // Define the device memory pointers
    unsigned char* devEnvironment;
    Point3D* devStartNode;
    Point3D* devGoalNode;
    int* devCameFromForward;
    int* devCameFromBackward;
    int* devGScoreForward;
    int* devGScoreBackward;
    float* devFScoreForward;
    float* devFScoreBackward;
    int* devOpenSetForward;
    int* devOpenSetBackward;
    int* devClosedSet;
    int* devPathFound;

    int* devPathNotFound;
    cudaMalloc((void**)&devPathNotFound, sizeof(int));
    cudaMemset(devPathNotFound, 0, sizeof(int));

    // Allocate memory on the device
    cudaMalloc((void**)&devEnvironment, width * height * depth * sizeof(unsigned char));
    cudaMalloc((void**)&devStartNode, sizeof(Point3D));
    cudaMalloc((void**)&devGoalNode, sizeof(Point3D));
    cudaMalloc((void**)&devCameFromForward, width * height * depth * sizeof(int));
    cudaMalloc((void**)&devCameFromBackward, width * height * depth * sizeof(int));
    cudaMalloc((void**)&devGScoreForward, width * height * depth * sizeof(int));
    cudaMalloc((void**)&devGScoreBackward, width * height * depth * sizeof(int));
    cudaMalloc((void**)&devFScoreForward, width * height * depth * sizeof(float));
    cudaMalloc((void**)&devFScoreBackward, width * height * depth * sizeof(float));
    cudaMalloc((void**)&devOpenSetForward, sizeof(int));
    cudaMalloc((void**)&devOpenSetBackward, sizeof(int));
    cudaMalloc((void**)&devClosedSet, width * height * depth * sizeof(int));
    cudaMalloc((void**)&devPathFound, sizeof(int));

    // Copy data from host to device
    cudaMemcpy(devEnvironment, environment.data(), width * height * depth * sizeof(unsigned char), cudaMemcpyHostToDevice);
    cudaMemcpy(devStartNode, &startNode, sizeof(Point3D), cudaMemcpyHostToDevice);
    cudaMemcpy(devGoalNode, &goalNode, sizeof(Point3D), cudaMemcpyHostToDevice);

    //Newly added 4 lines
    //float hScoreStartForward = heuristicGet(startNode.x, startNode.y, startNode.z, goalNode.x, goalNode.y, goalNode.z, width, height, depth, environment.data());
    //float hScoreStartBackward = heuristicGet(goalNode.x, goalNode.y, goalNode.z, startNode.x, startNode.y, startNode.z, width, height, depth, environment.data());

    // Copy the heuristic values to the device memory for devFScoreForward and devFScoreBackward
    //cudaMemcpy(devFScoreForward, &hScoreStartForward, sizeof(float), cudaMemcpyHostToDevice);
    //cudaMemcpy(devFScoreBackward, &hScoreStartBackward, sizeof(float), cudaMemcpyHostToDevice);


    // Initialize device variables
    cudaMemset(devCameFromForward, -1, width * height * depth * sizeof(int));
    cudaMemset(devCameFromBackward, -1, width * height * depth * sizeof(int));
    cudaMemset(devGScoreForward, INT_MAX, width * height * depth * sizeof(int));
    cudaMemset(devGScoreBackward, INT_MAX, width * height * depth * sizeof(int));
    // Initialize the temporary float variable to infinity
    float infinity = INFINITY;

    // Copy the infinity value to the device memory for devFScoreForward
    cudaMemcpy(devFScoreForward, &infinity, sizeof(float), cudaMemcpyHostToDevice);

    // Copy the infinity value to the device memory for devFScoreBackward
    cudaMemcpy(devFScoreBackward, &infinity, sizeof(float), cudaMemcpyHostToDevice);

    cudaMemset(devOpenSetForward, -1, sizeof(int));
    cudaMemset(devOpenSetBackward, -1, sizeof(int));
    cudaMemset(devClosedSet, 0, width * height * depth * sizeof(int));
    cudaMemset(devPathFound, 0, sizeof(int));


    // Print the environment before launching the kernel
    //printf("Initial Environment:\n");
    //printEnvironment(devEnvironment, width, height, depth);
    int pathFound = 1;  // Initialize pathFound to 0 (no path found)
    cudaMemcpy(devPathFound, &pathFound, sizeof(int), cudaMemcpyHostToDevice);


    // Launch the parallel bidirectional A* search on the device
    int blockSize = 256;
    int numBlocks = (width * height * depth + blockSize - 1) / blockSize;
    std::cout << "launching " << blockSize * numBlocks << " threads" << std::endl;
    //parallelBidirectionalAStar << <numBlocks, blockSize >> > (devEnvironment, devStartNode, devGoalNode, width, height, depth, devCameFromForward, devCameFromBackward, devGScoreForward, devGScoreBackward, devFScoreForward, devFScoreBackward, devOpenSetForward, devOpenSetBackward, devClosedSet, devPathFound);
    float lambda = 1;
    parallelBidirectionalAStar << <numBlocks, blockSize >> > (devEnvironment, devStartNode, devGoalNode, width, height, depth, devCameFromForward, devCameFromBackward, devGScoreForward, devGScoreBackward, devFScoreForward, devFScoreBackward, devOpenSetForward, devOpenSetBackward, devClosedSet, devPathFound, lambda); // For A* (lambda = 0.5)

    std::cout << "parallelBidirectionalAStar ends" << std::endl;
    // Wait for the GPU to finish
    cudaDeviceSynchronize();
    std::cout << "cudaDeviceSynchronize ends" << std::endl;
    // Copy the results back from the device to the host
    cudaMemcpy(&startNode, devStartNode, sizeof(Point3D), cudaMemcpyDeviceToHost);
    cudaMemcpy(&goalNode, devGoalNode, sizeof(Point3D), cudaMemcpyDeviceToHost);

    std::cout << "Start Point: (" << startNode.x << ", " << startNode.y << ", " << startNode.z << ")" << std::endl;
    std::cout << "Goal Point: (" << goalNode.x << ", " << goalNode.y << ", " << goalNode.z << ")" << std::endl;

    // Print the environment after the kernel has completed (data might be updated)
    //printf("Final Environment:\n");
    //printEnvironment(devEnvironment, width, height, depth);

    //int pathFound=1;
    std::cout << "Pathfound starts : " << pathFound << std::endl;
    cudaMemcpy(&pathFound, devPathFound, sizeof(int), cudaMemcpyDeviceToHost);

    //cudaMemcpy(&pathFound, devPathFound, sizeof(int), cudaMemcpyDeviceToHost);
    std::cout << "Pathfound ends : " << pathFound << std::endl;

    int* cameFromForward = new int[width * height * depth];
    int* cameFromBackward = new int[width * height * depth];
    cudaMemcpy(cameFromForward, devCameFromForward, width* height* depth * sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(cameFromBackward, devCameFromBackward, width* height* depth * sizeof(int), cudaMemcpyDeviceToHost);

    std::cout << "cameFromForward starts : " << cameFromForward << std::endl;
    for (int i = 0; i < sizeof(cameFromForward)/ sizeof(cameFromForward[0]); ++i) {
        std::cout << cameFromForward[i] << " ";
    }
    std::cout << std::endl;

    std::cout << "cameFromForward ends : " << cameFromBackward << std::endl;
    std::cout << "cameFromBackward starts : " << cameFromForward << std::endl;
    for (int i = 0; i < sizeof(cameFromBackward) / sizeof(cameFromBackward[0]); ++i) {
        std::cout << cameFromBackward[i] << " ";
    }
    std::cout << std::endl;
    std::cout << "cameFromBackward ends : " << cameFromForward << std::endl;

    if (pathFound == 1) {
        // Reconstruct the path from the start node to the goal node
        int currentForward = get1DIndex(goalNode.x, goalNode.y, goalNode.z, width, height);
        std::cout << "Path Generation" << std::endl;

        std::vector<int> pathForward;
        while (currentForward != -1) {
            pathForward.push_back(currentForward);
            currentForward = cameFromForward[currentForward];
            std::cout << "currentForward" << currentForward << std::endl;
        }
        std::cout << "Exited while loop" << currentForward << std::endl;
        std::reverse(pathForward.begin(), pathForward.end());

        int currentBackward = get1DIndex(startNode.x, startNode.y, startNode.z, width, height);
        std::vector<int> pathBackward;
        std::cout << "currentBackward" << currentBackward << std::endl;
        while (currentBackward != -1) {
            pathBackward.push_back(currentBackward);
            currentBackward = cameFromBackward[currentBackward];
        }

        // Reverse the pathBackward array
        std::reverse(pathBackward.begin(), pathBackward.end());

        // Merge the two paths
        pathForward.insert(pathForward.end(), pathBackward.begin(), pathBackward.end());

        // Print the path
        std::cout << "Path Found!" << std::endl;
        std::cout << "Path Length: " << pathForward.size() << std::endl;
        std::cout << "Nodes Opened: " << std::count(environment.begin(), environment.end(), 1) << std::endl;
        std::cout << "Environment Details:" << std::endl;
        for (int z = 0; z < depth; ++z) {
            std::cout << "Layer " << z + 1 << std::endl;
            for (int y = 0; y < height; ++y) {
                for (int x = 0; x < width; ++x) {
                    std::cout << static_cast<int>(environment[get1DIndex(x, y, z, width, height)]) << " ";
                }
                std::cout << std::endl;
            }
            std::cout << std::endl;
        }
        std::cout << "Start Point: (" << startNode.x << ", " << startNode.y << ", " << startNode.z << ")" << std::endl;
        std::cout << "Goal Point: (" << goalNode.x << ", " << goalNode.y << ", " << goalNode.z << ")" << std::endl;
    }
    else {
        std::cout << "Path not found!" << std::endl;
    }


    std::cout << "Program ends" << std::endl;
    // Free device memory
    cudaFree(devEnvironment);
    cudaFree(devStartNode);
    cudaFree(devGoalNode);
    cudaFree(devCameFromForward);
    cudaFree(devCameFromBackward);
    cudaFree(devGScoreForward);
    cudaFree(devGScoreBackward);
    cudaFree(devFScoreForward);
    cudaFree(devFScoreBackward);
    cudaFree(devOpenSetForward);
    cudaFree(devOpenSetBackward);
    cudaFree(devClosedSet);
    cudaFree(devPathFound);

    return 0;
}

In path generation, i’m getting issue in

while (currentForward != -1) {
            pathForward.push_back(currentForward);
            currentForward = cameFromForward[currentForward];
            std::cout << "currentForward" << currentForward << std::endl;
        }

section.

I traced back and found that the cameFromForward and cameFromBackward are not updated correctly. Then i traced back and started debugging. The issue remains the same.

The output I obtained was

Program starts
Start Point: (4, 3, 6)
Goal Point: (7, 5, 5)
launching 1024 threads
parallelBidirectionalAStar ends
cudaDeviceSynchronize ends
Start Point: (4, 3, 6)
Goal Point: (7, 5, 5)
Pathfound starts : 1
Pathfound ends : 1
cameFromForward starts : 0000023AC9F21740
-898730464 570
cameFromForward ends : 0000023ACA6E7620
cameFromBackward starts : 0000023AC9F21740
-1051328176 570
cameFromBackward ends : 0000023AC9F21740
Path Generation
currentForward0
currentForward-898730464

And it waited for few sec and terminated.