Trying to debug the application for the parallelisation of the A* algorithm through Nsight Compute, the debug fails when it is up to the pathfindingKernel
, returning an error saying the kernel is not even launched.
The main scripts are:
main.cu
#pragma once
#include "pathfinding.cuh"
#include <iostream>
int main() {
curandState* d_state;
Node h_startNode{};
Node h_targetNode{};
Node* h_map = new Node[mapWidth * mapHeight];
int h_openListSize = 0;
Node* h_openList = new Node[h_openListSize];
Node* d_map;
Node* d_startNode;
Node* d_targetNode;
// Initialize the h_map with zeros
for (int i = 0; i < mapWidth * mapHeight; i++) {
h_map[i] = Node();
}
//Device memory allocation
cudaMalloc(&d_startNode, sizeof(Node));
cudaMalloc(&d_targetNode, sizeof(Node));
cudaMalloc(&d_state, mapWidth * mapHeight * sizeof(curandState));
cudaMalloc(&d_map, mapWidth * mapHeight * sizeof(Node));
randomizeStartAndEndNodesHost(&h_startNode, &h_targetNode);
// Memcpy from host -> device
cudaMemcpy(d_map, h_map, mapWidth * mapHeight * sizeof(Node), cudaMemcpyHostToDevice);
cudaMemcpy(d_startNode, &h_startNode, sizeof(Node), cudaMemcpyHostToDevice);
cudaMemcpy(d_targetNode, &h_targetNode, sizeof(Node), cudaMemcpyHostToDevice);
initCurand << <1, mapHeight * mapWidth >> > (d_state, time(NULL));
randomWalkablePropAssignment << <1, mapHeight * mapWidth>> > (d_map, d_startNode, d_targetNode, d_state);
cudaDeviceSynchronize();
// Memcpy from device -> host to print initial board
cudaMemcpy(h_map, d_map, mapWidth * mapHeight * sizeof(Node), cudaMemcpyDeviceToHost);
printf("Initial board:\n\n");
printBoard(&h_startNode, &h_targetNode, h_map);
pathfindingKernel << <1, 8 >> > (d_map, d_startNode, d_targetNode, h_openList, h_openListSize);
cudaDeviceSynchronize();
// Memcpy from device -> host to print final board
cudaMemcpy(h_map, d_map, mapWidth * mapHeight * sizeof(Node), cudaMemcpyDeviceToHost);
printf("Final board:\n\n");
printBoard(&h_startNode, &h_targetNode, h_map);
//occupancyCompute(1, mapHeight * mapWidth);
// Free device memory
cudaFree(d_startNode);
cudaFree(d_targetNode);
cudaFree(d_map);
cudaFree(d_state);
// Free host memory
delete[] h_openList;
delete[] h_map;
return 0;}
and
pathfinding.cu
#pragma once
#include "pathfinding.cuh"
void randomizeStartAndEndNodesHost(Node* h_startNode, Node* h_endNode) {
srand(time(NULL));
h_startNode->x = rand() % mapWidth;
h_startNode->y = rand() % mapHeight;
h_endNode->x = rand() % mapWidth;
h_endNode->y = rand() % mapHeight;
h_startNode->walkable = true;
h_endNode->walkable = true;
h_startNode->open = true;
h_endNode->open = false;
}
void printBoard(Node* start, Node* end, Node* h_map) {
HANDLE hConsole = GetStdHandle(STD_OUTPUT_HANDLE);
for (int i = 0; i < mapHeight; i++) {
for (int j = 0; j < mapWidth; j++) {
Node& node = h_map[i * mapWidth + j];
if ((i == start->y && j == start->x)
|| (i == end->y && j == end->x)) {
SetConsoleTextAttribute(hConsole, 9); // color: blue
printf("%3d: %d ", i * mapWidth + j, node.walkable ? 1 : 0);
SetConsoleTextAttribute(hConsole, 7); // color: white
}
else {
if (node.walkable) {
SetConsoleTextAttribute(hConsole, 10); // walkable --> green
}
else {
SetConsoleTextAttribute(hConsole, 12); // not walkable --> red
}
printf("%3d: %d ", i * mapWidth + j, node.walkable ? 1 : 0);
SetConsoleTextAttribute(hConsole, 7);
}
}
printf("\n");
}
};
void occupancyCompute(int grid_size, int block_size) {
int dev = 0;
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, dev);
int gS = grid_size;
int bS = block_size;
double occupancy;
int warpSize = deviceProp.warpSize;
int maxThreadsPerMultiProcessor = deviceProp.maxThreadsPerMultiProcessor;
int activeWarps = gS * bS / warpSize;
int maxWarps = maxThreadsPerMultiProcessor / warpSize;
occupancy = (double)activeWarps / maxWarps * 100;
printf("\nMax Warps: %d\nActive Warps: %d\n", maxWarps, activeWarps);
printf("Max Occupancy reached: %.2f%%\n", occupancy);
}
__global__ void initCurand(curandState* state, unsigned long seed) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
curand_init(seed, idx, 0, &state[idx]);
}
__global__ void randomWalkablePropAssignment(Node* d_map, Node* start, Node* end, curandState* state) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (!(d_map[idx].x == start->x && d_map[idx].y == start->y)
&& !(d_map[idx].x == end->x && d_map[idx].y == end->y)){
if (idx < mapWidth* mapHeight) {
d_map[idx].walkable = (curand_uniform(&state[idx]) > 0.5) ? 1: 0;
}
}
}
__device__ void sortOpenList(Node** openList, int* openListSize) {
void syncthreads();
for (int i = 0; i < *openListSize - 1; i++) {
for (int j = 0; j < *openListSize - i - 1; j++) {
if (openList[j]->F > openList[j + 1]->F) {
Node* temp = openList[j];
openList[j] = openList[j + 1];
openList[j + 1] = temp;
}
}
}
}
__global__ void pathfindingKernel(Node* map, Node* startNode, Node* targetNode, Node* openList, int openListSize)
{
__shared__ int* shared_openListSize;
__shared__ Node** shared_openList;
if (threadIdx.x == 0) {
openList[0] = *startNode;
openListSize++;
*shared_openListSize = openListSize;
*shared_openList = openList;
}
void __syncthreads();
if (threadIdx.x < *shared_openListSize) {
*shared_openList[threadIdx.x] = openList[threadIdx.x];
}
while (*shared_openListSize > 0) {
sortOpenList(shared_openList, shared_openListSize);
Node* currentNode = shared_openList[0];
currentNode->closed = true;
for (int i = 0; i < 8; i++) {
int newX = currentNode->x + (i % 3) - 1;
int newY = currentNode->y + (i / 3) - 1;
if (newX >= 0 && newX < mapWidth && newY >= 0 && newY < mapHeight) {
Node* adjacentNode = &map[newY * mapWidth + newX];
if (!adjacentNode->walkable || adjacentNode->closed) continue;
if (!adjacentNode->open) {
adjacentNode->open = true;
adjacentNode->parent = currentNode;
adjacentNode->G = currentNode->G + 1;
adjacentNode->F = adjacentNode->G + abs(targetNode->x - adjacentNode->x) + abs(targetNode->y - adjacentNode->y);
shared_openList[*shared_openListSize] = adjacentNode;
(*shared_openListSize)++;
}
else {
int newG = currentNode->G + 1;
if (newG < adjacentNode->G) {
adjacentNode->parent = currentNode;
adjacentNode->G = newG;
adjacentNode->F = newG + abs(targetNode->x - adjacentNode->x) + abs(targetNode->y - adjacentNode->y);
}
}
}
}
if (targetNode->open) {
return;
}
}
}
Nsight Compute returns the following:
Launch succeeded.
Profiling...
==PROF== Connected to process 4392 (C:\MyPersonalFolder\GPU_Computing\astar_CUDA\astar_CUDA\x64\Debug\astar_CUDA.exe)
==PROF== Profiling "initCurand" - 0:
==PROF== Profiling "randomWalkablePropAssignment" - 1:
==PROF== Profiling "pathfindingKernel" - 2:
==ERROR== LaunchFailed
==PROF== Trying to shutdown target application
Process terminated.
I tried to correct the shared memory usage to improve operation sync in the problematic kernel but anything has worked so far…