Nsight Compute: specific kernel launch failure

Trying to debug the application for the parallelisation of the A* algorithm through Nsight Compute, the debug fails when it is up to the pathfindingKernel, returning an error saying the kernel is not even launched.
The main scripts are:

main.cu
#pragma once
#include "pathfinding.cuh"
#include <iostream>

int main() {
    curandState* d_state;
    
    Node h_startNode{};
    Node h_targetNode{};
    Node* h_map = new Node[mapWidth * mapHeight];
    int h_openListSize = 0;
    Node* h_openList = new Node[h_openListSize];

    Node* d_map;
    Node* d_startNode;
    Node* d_targetNode;
    
    // Initialize the h_map with zeros
    for (int i = 0; i < mapWidth * mapHeight; i++) {
        h_map[i] = Node();
    }

    //Device memory allocation
    cudaMalloc(&d_startNode, sizeof(Node));
    cudaMalloc(&d_targetNode, sizeof(Node));
    cudaMalloc(&d_state, mapWidth * mapHeight * sizeof(curandState));
    cudaMalloc(&d_map, mapWidth * mapHeight * sizeof(Node));

    randomizeStartAndEndNodesHost(&h_startNode, &h_targetNode);

    // Memcpy from host -> device 
    cudaMemcpy(d_map, h_map, mapWidth * mapHeight * sizeof(Node), cudaMemcpyHostToDevice);
    cudaMemcpy(d_startNode, &h_startNode, sizeof(Node), cudaMemcpyHostToDevice);
    cudaMemcpy(d_targetNode, &h_targetNode, sizeof(Node), cudaMemcpyHostToDevice);
    
    initCurand << <1, mapHeight * mapWidth >> > (d_state, time(NULL));
    randomWalkablePropAssignment << <1, mapHeight * mapWidth>> > (d_map, d_startNode, d_targetNode, d_state);
    
    cudaDeviceSynchronize();  
    
    // Memcpy from device -> host to print initial board
    cudaMemcpy(h_map, d_map, mapWidth * mapHeight * sizeof(Node), cudaMemcpyDeviceToHost);
    printf("Initial board:\n\n");
	printBoard(&h_startNode, &h_targetNode, h_map);

    pathfindingKernel << <1, 8 >> > (d_map, d_startNode, d_targetNode, h_openList, h_openListSize);
    
    cudaDeviceSynchronize();

    // Memcpy from device -> host to print final board
    cudaMemcpy(h_map, d_map, mapWidth * mapHeight * sizeof(Node), cudaMemcpyDeviceToHost);        
    printf("Final board:\n\n");
    printBoard(&h_startNode, &h_targetNode, h_map);

    //occupancyCompute(1, mapHeight * mapWidth);

    // Free device memory
    cudaFree(d_startNode);
	cudaFree(d_targetNode);
    cudaFree(d_map);
    cudaFree(d_state);

    // Free host memory
    delete[] h_openList;
    delete[] h_map;

    return 0;}

and

pathfinding.cu
#pragma once
#include "pathfinding.cuh"

void randomizeStartAndEndNodesHost(Node* h_startNode, Node* h_endNode) {
    srand(time(NULL)); 
    h_startNode->x = rand() % mapWidth;
    h_startNode->y = rand() % mapHeight;
    h_endNode->x = rand() % mapWidth;
    h_endNode->y = rand() % mapHeight;
    h_startNode->walkable = true;
    h_endNode->walkable = true;
    h_startNode->open = true;
    h_endNode->open = false; 
}

void printBoard(Node* start, Node* end, Node* h_map) {
    HANDLE hConsole = GetStdHandle(STD_OUTPUT_HANDLE);
    for (int i = 0; i < mapHeight; i++) {
        for (int j = 0; j < mapWidth; j++) {
            Node& node = h_map[i * mapWidth + j];
            if ((i == start->y && j == start->x) 
                || (i == end->y && j == end->x)) {
                SetConsoleTextAttribute(hConsole, 9); // color: blue
                printf("%3d: %d ", i * mapWidth + j, node.walkable ? 1 : 0);
                SetConsoleTextAttribute(hConsole, 7); // color: white
            }
            else {
                if (node.walkable) {
                    SetConsoleTextAttribute(hConsole, 10); // walkable --> green 
                }
                else {
                    SetConsoleTextAttribute(hConsole, 12); // not walkable --> red 
                }
                printf("%3d: %d ", i * mapWidth + j, node.walkable ? 1 : 0);
                SetConsoleTextAttribute(hConsole, 7);
            }
        }
        printf("\n");
    }
};

void occupancyCompute(int grid_size, int block_size) {
    int dev = 0;
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);
    int gS = grid_size;
    int bS = block_size;
    double occupancy;

    int warpSize = deviceProp.warpSize;
    int maxThreadsPerMultiProcessor = deviceProp.maxThreadsPerMultiProcessor;

    int activeWarps = gS * bS / warpSize;
    int maxWarps = maxThreadsPerMultiProcessor / warpSize;

    occupancy = (double)activeWarps / maxWarps * 100;

    printf("\nMax Warps: %d\nActive Warps: %d\n", maxWarps, activeWarps);
    printf("Max Occupancy reached: %.2f%%\n", occupancy);
}

__global__ void initCurand(curandState* state, unsigned long seed) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    curand_init(seed, idx, 0, &state[idx]);
}

__global__ void randomWalkablePropAssignment(Node* d_map, Node* start, Node* end, curandState* state) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (!(d_map[idx].x == start->x && d_map[idx].y == start->y) 
        && !(d_map[idx].x == end->x && d_map[idx].y == end->y)){
            if (idx < mapWidth* mapHeight) {
            d_map[idx].walkable = (curand_uniform(&state[idx]) > 0.5) ? 1: 0;
            }
    }
}

__device__ void sortOpenList(Node** openList, int* openListSize) {

    void syncthreads();

    for (int i = 0; i < *openListSize - 1; i++) {
        for (int j = 0; j < *openListSize - i - 1; j++) {
            if (openList[j]->F > openList[j + 1]->F) {
                Node* temp = openList[j];
                openList[j] = openList[j + 1];
                openList[j + 1] = temp;
            }
        }
    }
}

__global__ void pathfindingKernel(Node* map, Node* startNode, Node* targetNode, Node* openList, int openListSize)
{
    __shared__ int* shared_openListSize;
    __shared__ Node** shared_openList;

    if (threadIdx.x == 0) {
        openList[0] = *startNode;
        openListSize++;
        *shared_openListSize = openListSize;
        *shared_openList = openList;
    }

    void __syncthreads();

    if (threadIdx.x < *shared_openListSize) {
        *shared_openList[threadIdx.x] = openList[threadIdx.x];
    }

    while (*shared_openListSize > 0) {
        sortOpenList(shared_openList, shared_openListSize);
        Node* currentNode = shared_openList[0];
        currentNode->closed = true;

        for (int i = 0; i < 8; i++) {
            int newX = currentNode->x + (i % 3) - 1;
            int newY = currentNode->y + (i / 3) - 1;
            if (newX >= 0 && newX < mapWidth && newY >= 0 && newY < mapHeight) {
                Node* adjacentNode = &map[newY * mapWidth + newX];
                if (!adjacentNode->walkable || adjacentNode->closed) continue;

                if (!adjacentNode->open) {
                    adjacentNode->open = true;
                    adjacentNode->parent = currentNode;
                    adjacentNode->G = currentNode->G + 1;
                    adjacentNode->F = adjacentNode->G + abs(targetNode->x - adjacentNode->x) + abs(targetNode->y - adjacentNode->y);
                    shared_openList[*shared_openListSize] = adjacentNode;
                    (*shared_openListSize)++;
                }
                else {
                    int newG = currentNode->G + 1;
                    if (newG < adjacentNode->G) {
                        adjacentNode->parent = currentNode;
                        adjacentNode->G = newG;
                        adjacentNode->F = newG + abs(targetNode->x - adjacentNode->x) + abs(targetNode->y - adjacentNode->y);
                    }
                }
            }
        }

        if (targetNode->open) {
            return;
        }
    }
}

Nsight Compute returns the following:

Launch succeeded.
Profiling...
==PROF== Connected to process 4392 (C:\MyPersonalFolder\GPU_Computing\astar_CUDA\astar_CUDA\x64\Debug\astar_CUDA.exe)

==PROF== Profiling "initCurand" - 0: 
==PROF== Profiling "randomWalkablePropAssignment" - 1: 
==PROF== Profiling "pathfindingKernel" - 2: 
==ERROR== LaunchFailed

==PROF== Trying to shutdown target application

Process terminated.

I tried to correct the shared memory usage to improve operation sync in the problematic kernel but anything has worked so far…

Hi, @Faber98

Sorry for the issue you met.
include “pathfinding.cuh” indicates a .cuh file required.

Are you building the project in visual studio ? Please clarify the build steps.
Is it possible to provide your whole project for us to have a repro ?

Yes, the project has been developed and builded in Visual Studio and then debugged using Nsight Compute. Here the required missing script:

pathfinding.cuh
#pragma once
#include <cuda_runtime.h>
#include <curand.h>
#include <curand_kernel.h>
#include <device_launch_parameters.h>
#include <cstdlib>
#include <windows.h> 
#include <iostream>

typedef struct Node {
    int x, y;
    int G, F;
    bool walkable;
    bool closed;
    bool open;
    bool isPath;
    struct Node* parent;
} Node;

const int mapWidth = 10;
const int mapHeight = 10;

void printBoard(Node* start, Node* end, Node* h_map);

void randomizeStartAndEndNodesHost(Node* h_startNode, Node* h_endNode);

void occupancyCompute(int grid_size, int block_size);

__global__ void initCurand(curandState* state, unsigned long seed);

__global__ void randomWalkablePropAssignment(Node* d_map, Node* start, Node* end, curandState* state);

__device__ void sortOpenList(Node** openList, int* openListSize);

__global__ void pathfindingKernel(Node* map, Node* startNode, Node* targetNode, Node* openList, int openListSize);

Hi, @Faber98

Thanks for the update ! We can reproduce your issue internal. After some investigation, we found it is due to there is error in your kernel.

If you use compute-sanitizer to detect the sample, you’ll get below error. So please check your implementation in pathfinding.cu:98

========= Invalid global write of size 4 bytes
========= at pathfindingKernel(Node *, Node *, Node *, Node *, int)+0x720 in D:/02-tests/forum280826/pathfinding.cu:98
========= by thread (0,0,0) in block (0,0,0)
========= Address 0x19c319a4b14 is out of bounds
========= and is 1,740,107,085,333 bytes after the nearest allocation at 0x70b234c00 of size 2,304 bytes
========= Saved host backtrace up to driver entry point at kernel launch time

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.