Nsight Compute: specific kernel launch failure

Faber98 · January 31, 2024, 6:58pm

Trying to debug the application for the parallelisation of the A* algorithm through Nsight Compute, the debug fails when it is up to the pathfindingKernel, returning an error saying the kernel is not even launched.
The main scripts are:

main.cu

#pragma once
#include "pathfinding.cuh"
#include <iostream>

int main() {
    curandState* d_state;
    
    Node h_startNode{};
    Node h_targetNode{};
    Node* h_map = new Node[mapWidth * mapHeight];
    int h_openListSize = 0;
    Node* h_openList = new Node[h_openListSize];

    Node* d_map;
    Node* d_startNode;
    Node* d_targetNode;
    
    // Initialize the h_map with zeros
    for (int i = 0; i < mapWidth * mapHeight; i++) {
        h_map[i] = Node();
    }

    //Device memory allocation
    cudaMalloc(&d_startNode, sizeof(Node));
    cudaMalloc(&d_targetNode, sizeof(Node));
    cudaMalloc(&d_state, mapWidth * mapHeight * sizeof(curandState));
    cudaMalloc(&d_map, mapWidth * mapHeight * sizeof(Node));

    randomizeStartAndEndNodesHost(&h_startNode, &h_targetNode);

    // Memcpy from host -> device 
    cudaMemcpy(d_map, h_map, mapWidth * mapHeight * sizeof(Node), cudaMemcpyHostToDevice);
    cudaMemcpy(d_startNode, &h_startNode, sizeof(Node), cudaMemcpyHostToDevice);
    cudaMemcpy(d_targetNode, &h_targetNode, sizeof(Node), cudaMemcpyHostToDevice);
    
    initCurand << <1, mapHeight * mapWidth >> > (d_state, time(NULL));
    randomWalkablePropAssignment << <1, mapHeight * mapWidth>> > (d_map, d_startNode, d_targetNode, d_state);
    
    cudaDeviceSynchronize();  
    
    // Memcpy from device -> host to print initial board
    cudaMemcpy(h_map, d_map, mapWidth * mapHeight * sizeof(Node), cudaMemcpyDeviceToHost);
    printf("Initial board:\n\n");
	printBoard(&h_startNode, &h_targetNode, h_map);

    pathfindingKernel << <1, 8 >> > (d_map, d_startNode, d_targetNode, h_openList, h_openListSize);
    
    cudaDeviceSynchronize();

    // Memcpy from device -> host to print final board
    cudaMemcpy(h_map, d_map, mapWidth * mapHeight * sizeof(Node), cudaMemcpyDeviceToHost);        
    printf("Final board:\n\n");
    printBoard(&h_startNode, &h_targetNode, h_map);

    //occupancyCompute(1, mapHeight * mapWidth);

    // Free device memory
    cudaFree(d_startNode);
	cudaFree(d_targetNode);
    cudaFree(d_map);
    cudaFree(d_state);

    // Free host memory
    delete[] h_openList;
    delete[] h_map;

    return 0;}

and

pathfinding.cu

#pragma once
#include "pathfinding.cuh"

void randomizeStartAndEndNodesHost(Node* h_startNode, Node* h_endNode) {
    srand(time(NULL)); 
    h_startNode->x = rand() % mapWidth;
    h_startNode->y = rand() % mapHeight;
    h_endNode->x = rand() % mapWidth;
    h_endNode->y = rand() % mapHeight;
    h_startNode->walkable = true;
    h_endNode->walkable = true;
    h_startNode->open = true;
    h_endNode->open = false; 
}

void printBoard(Node* start, Node* end, Node* h_map) {
    HANDLE hConsole = GetStdHandle(STD_OUTPUT_HANDLE);
    for (int i = 0; i < mapHeight; i++) {
        for (int j = 0; j < mapWidth; j++) {
            Node& node = h_map[i * mapWidth + j];
            if ((i == start->y && j == start->x) 
                || (i == end->y && j == end->x)) {
                SetConsoleTextAttribute(hConsole, 9); // color: blue
                printf("%3d: %d ", i * mapWidth + j, node.walkable ? 1 : 0);
                SetConsoleTextAttribute(hConsole, 7); // color: white
            }
            else {
                if (node.walkable) {
                    SetConsoleTextAttribute(hConsole, 10); // walkable --> green 
                }
                else {
                    SetConsoleTextAttribute(hConsole, 12); // not walkable --> red 
                }
                printf("%3d: %d ", i * mapWidth + j, node.walkable ? 1 : 0);
                SetConsoleTextAttribute(hConsole, 7);
            }
        }
        printf("\n");
    }
};

void occupancyCompute(int grid_size, int block_size) {
    int dev = 0;
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);
    int gS = grid_size;
    int bS = block_size;
    double occupancy;

    int warpSize = deviceProp.warpSize;
    int maxThreadsPerMultiProcessor = deviceProp.maxThreadsPerMultiProcessor;

    int activeWarps = gS * bS / warpSize;
    int maxWarps = maxThreadsPerMultiProcessor / warpSize;

    occupancy = (double)activeWarps / maxWarps * 100;

    printf("\nMax Warps: %d\nActive Warps: %d\n", maxWarps, activeWarps);
    printf("Max Occupancy reached: %.2f%%\n", occupancy);
}

__global__ void initCurand(curandState* state, unsigned long seed) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    curand_init(seed, idx, 0, &state[idx]);
}

__global__ void randomWalkablePropAssignment(Node* d_map, Node* start, Node* end, curandState* state) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (!(d_map[idx].x == start->x && d_map[idx].y == start->y) 
        && !(d_map[idx].x == end->x && d_map[idx].y == end->y)){
            if (idx < mapWidth* mapHeight) {
            d_map[idx].walkable = (curand_uniform(&state[idx]) > 0.5) ? 1: 0;
            }
    }
}

__device__ void sortOpenList(Node** openList, int* openListSize) {

    void syncthreads();

    for (int i = 0; i < *openListSize - 1; i++) {
        for (int j = 0; j < *openListSize - i - 1; j++) {
            if (openList[j]->F > openList[j + 1]->F) {
                Node* temp = openList[j];
                openList[j] = openList[j + 1];
                openList[j + 1] = temp;
            }
        }
    }
}

__global__ void pathfindingKernel(Node* map, Node* startNode, Node* targetNode, Node* openList, int openListSize)
{
    __shared__ int* shared_openListSize;
    __shared__ Node** shared_openList;

    if (threadIdx.x == 0) {
        openList[0] = *startNode;
        openListSize++;
        *shared_openListSize = openListSize;
        *shared_openList = openList;
    }

    void __syncthreads();

    if (threadIdx.x < *shared_openListSize) {
        *shared_openList[threadIdx.x] = openList[threadIdx.x];
    }

    while (*shared_openListSize > 0) {
        sortOpenList(shared_openList, shared_openListSize);
        Node* currentNode = shared_openList[0];
        currentNode->closed = true;

        for (int i = 0; i < 8; i++) {
            int newX = currentNode->x + (i % 3) - 1;
            int newY = currentNode->y + (i / 3) - 1;
            if (newX >= 0 && newX < mapWidth && newY >= 0 && newY < mapHeight) {
                Node* adjacentNode = &map[newY * mapWidth + newX];
                if (!adjacentNode->walkable || adjacentNode->closed) continue;

                if (!adjacentNode->open) {
                    adjacentNode->open = true;
                    adjacentNode->parent = currentNode;
                    adjacentNode->G = currentNode->G + 1;
                    adjacentNode->F = adjacentNode->G + abs(targetNode->x - adjacentNode->x) + abs(targetNode->y - adjacentNode->y);
                    shared_openList[*shared_openListSize] = adjacentNode;
                    (*shared_openListSize)++;
                }
                else {
                    int newG = currentNode->G + 1;
                    if (newG < adjacentNode->G) {
                        adjacentNode->parent = currentNode;
                        adjacentNode->G = newG;
                        adjacentNode->F = newG + abs(targetNode->x - adjacentNode->x) + abs(targetNode->y - adjacentNode->y);
                    }
                }
            }
        }

        if (targetNode->open) {
            return;
        }
    }
}

Nsight Compute returns the following:

Launch succeeded.
Profiling...
==PROF== Connected to process 4392 (C:\MyPersonalFolder\GPU_Computing\astar_CUDA\astar_CUDA\x64\Debug\astar_CUDA.exe)

==PROF== Profiling "initCurand" - 0: 
==PROF== Profiling "randomWalkablePropAssignment" - 1: 
==PROF== Profiling "pathfindingKernel" - 2: 
==ERROR== LaunchFailed

==PROF== Trying to shutdown target application

Process terminated.

I tried to correct the shared memory usage to improve operation sync in the problematic kernel but anything has worked so far…

veraj · February 1, 2024, 3:11am

Hi, @Faber98

Sorry for the issue you met.
include “pathfinding.cuh” indicates a .cuh file required.

Are you building the project in visual studio ? Please clarify the build steps.
Is it possible to provide your whole project for us to have a repro ?

Faber98 · February 1, 2024, 12:41pm

Yes, the project has been developed and builded in Visual Studio and then debugged using Nsight Compute. Here the required missing script:

pathfinding.cuh

#pragma once
#include <cuda_runtime.h>
#include <curand.h>
#include <curand_kernel.h>
#include <device_launch_parameters.h>
#include <cstdlib>
#include <windows.h> 
#include <iostream>

typedef struct Node {
    int x, y;
    int G, F;
    bool walkable;
    bool closed;
    bool open;
    bool isPath;
    struct Node* parent;
} Node;

const int mapWidth = 10;
const int mapHeight = 10;

void printBoard(Node* start, Node* end, Node* h_map);

void randomizeStartAndEndNodesHost(Node* h_startNode, Node* h_endNode);

void occupancyCompute(int grid_size, int block_size);

__global__ void initCurand(curandState* state, unsigned long seed);

__global__ void randomWalkablePropAssignment(Node* d_map, Node* start, Node* end, curandState* state);

__device__ void sortOpenList(Node** openList, int* openListSize);

__global__ void pathfindingKernel(Node* map, Node* startNode, Node* targetNode, Node* openList, int openListSize);

veraj · February 2, 2024, 5:25am

Hi, @Faber98

Thanks for the update ! We can reproduce your issue internal. After some investigation, we found it is due to there is error in your kernel.

If you use compute-sanitizer to detect the sample, you’ll get below error. So please check your implementation in pathfinding.cu:98

========= Invalid global write of size 4 bytes
========= at pathfindingKernel(Node *, Node *, Node *, Node *, int)+0x720 in D:/02-tests/forum280826/pathfinding.cu:98
========= by thread (0,0,0) in block (0,0,0)
========= Address 0x19c319a4b14 is out of bounds
========= and is 1,740,107,085,333 bytes after the nearest allocation at 0x70b234c00 of size 2,304 bytes
========= Saved host backtrace up to driver entry point at kernel launch time

system · February 18, 2024, 6:36pm

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.

Topic		Replies	Views
Ncu does not detect kernels, ==ERROR== The application returned an error code (11) Nsight Compute kernel , profiling	6	1867	December 13, 2023
Nsight Compute crash with message: free(): invalid pointer Nsight Compute cuda , nsight	36	634	March 20, 2025
Nsight Compute not detecting kernel launch Nsight Compute profiling	13	3072	May 6, 2021
Run ncu command in ubuntu 20.04 Nsight Compute	7	5344	August 8, 2022
==ERROR== Failed to prepare kernel for profiling (0xc00000fd) but CUDA sample works Nsight Compute kernel , nvbugs	13	2051	November 6, 2021
Option to profile only master process Nsight Compute cuda	23	3518	December 1, 2023
NSight Compute not finding kernels Nsight Compute	24	705	October 24, 2024
Nsight in holohub not working Profiling Linux Targets	7	30	March 20, 2025
NVIDIA NSight Compute: The profiler returned an error code:1 Nsight Compute	13	1913	March 18, 2024
Kernels after a persistent kernel isn't executed unless running under Nsight System CUDA Programming and Performance cuda , kernel , nsight , nvcc	12	1122	August 25, 2023

Nsight Compute: specific kernel launch failure

Related topics