Hello everyone…
So, I’m currently building a CUDA program. I have tracked the bug into the kernel due to the Unspecified Launch Failure that keeps appearing.
Here is a copy of the kernel in question:
#ifndef _ASEARCH_KERNEL_H_
#define _ASEARCH_KERNEL_H_
#include <map>
#include <math.h>
#include "agraph.h"
#include "binheap.cu"
// returns the guessing heuristic value for the distance to the
// end point
// only callable from within device, runs on device
__device__ float
heuristic( AVertex *d_g, int source, int end )
{
int s_posx, s_posy, e_posx, e_posy;
s_posx = d_g[source].posx;
s_posy = d_g[source].posy;
e_posx = d_g[end].posx;
e_posy = d_g[end].posy;
float result = sqrt(pow(float(e_posx-s_posx),2)+pow(float(e_posy-s_posy),2));
return result;
}
// the speculative searching A* kernel
__global__ void
aSearch( AVertex *d_g, int *d_seedID, HeapStruct *d_pqueue, int end_id, int num_steps)
{
// declare some things we will be using
AVertex currentVertex;
float cost, distance;
int step;
// load your thread-specific values
int seedID = d_seedID[threadIdx.x];
HeapStruct mypqueue = d_pqueue[threadIdx.x];
// clean out the heap
MakeEmpty(&mypqueue);
Packet currentPacket;
if(d_g[seedID].distance == -1)
{
d_g[seedID].distance = heuristic(d_g,seedID,end_id);
}
//Packet temp = newPacket(d_g[seedID].cost+d_g[seedID].distance,seedID);
Insert(newPacket(d_g[seedID].cost+d_g[seedID].distance,seedID),&mypqueue);
//Insert(temp,&mypqueue);
// do as many steps of search as we are allowed
for(step=0;step<num_steps;step++)
{
// get the smallest in the queue
// also removes it....
currentPacket = DeleteMin(&mypqueue);
// pull out the Vertex via ID
currentVertex = d_g[currentPacket.vertex_id];
currentVertex.taken = true;
// if we've reached the endpoint, quit out.
if(currentVertex.vertex_id == end_id){
step = num_steps;
}else{
// find the neighbors and calculate costs
for(int v=0; v<currentVertex.adj_len; v++)
{
// cost is cost to get to current + cost to get to next node
cost = currentVertex.cost + heuristic(d_g,currentVertex.vertex_id,currentVertex.adjacency[v]);
// calculate the distance if not already calculated
// extra work, but avoids an if statement
d_g[currentVertex.adjacency[v]].distance = heuristic(d_g,currentVertex.adjacency[v],end_id);
distance = d_g[currentVertex.adjacency[v]].distance;
// the cost to get there is less using this node
// i.e. we are on the shorter path to get there
// assign the path and update the cost value
if(cost < d_g[currentVertex.adjacency[v]].cost)
{
d_g[currentVertex.adjacency[v]].cost = cost;
d_g[currentVertex.adjacency[v]].prev_vertex_id = currentVertex.vertex_id;
// insert into the queue the cost of the node, but only
// if our cost is less than the one it has
// this prevents us from backtracking and lets us take over nodes where our thread
// produced a better cost value
Insert(newPacket(cost+distance,currentVertex.adjacency[v]),&mypqueue);
}else if(d_g[currentVertex.adjacency[v]].cost != 16384.0*2.0)
{
// we've arrived at a node, we took a slower path and someone got here with less cost
}
}
}
// loop, pick the new smallest f(x) as current vector, and go on
}
}
#endif
AVertex* d_g is a graph built using an array of AVertex structures. They are indexed by vertexID tags.
int *d_seedID is the pointer to an array of starting locations (vertexIDs) for the thread to start on the graph.
HeapStruct *d_pqueue is the pointer to an array of allocated space for a personal Priority Queue. HeapStruct is the structure using a static array to implement a binary heap. Currently built with 20 elements hardcoded. Each element is called a Packet.
end_id is the vertexID tag for the target vertex.
num_steps is how many steps along the search we’re allowed to do per thread.
I am currently calling this with only 1 thread, 1 block, and a graph of 5 nodes.
I am using a GeForce8800 GPU (190.62 drivers), WindowsXP (Visual Studio 9.0), and CUDA SDK 2.3.
Any ideas?