Yet another Unspecified Launch Failure…

Hello everyone…

So, I’m currently building a CUDA program, and I have tracked the bug down to the kernel, which keeps failing with an Unspecified Launch Failure.

Here is a copy of the kernel in question:

#ifndef _ASEARCH_KERNEL_H_
#define _ASEARCH_KERNEL_H_

#include <map>
#include <math.h>
#include "agraph.h"
#include "binheap.cu"

// returns the guessing heuristic value for the distance to the end point
// only callable from within device, runs on device
__device__ float
heuristic( AVertex *d_g, int source, int end )
{
	int s_posx, s_posy, e_posx, e_posy;
	s_posx = d_g[source].posx;
	s_posy = d_g[source].posy;
	e_posx = d_g[end].posx;
	e_posy = d_g[end].posy;
	float result = sqrt(pow(float(e_posx-s_posx),2)+pow(float(e_posy-s_posy),2));
	return result;
}

// the speculative searching A* kernel
__global__ void
aSearch( AVertex *d_g, int *d_seedID, HeapStruct *d_pqueue, int end_id, int num_steps )
{
	// declare some things we will be using
	AVertex currentVertex;
	float cost, distance;
	int step;

	// load your thread-specific values
	int seedID = d_seedID[threadIdx.x];
	HeapStruct mypqueue = d_pqueue[threadIdx.x];

	// clean out the heap
	MakeEmpty(&mypqueue);

	Packet currentPacket;

	if(d_g[seedID].distance == -1)
	{
		d_g[seedID].distance = heuristic(d_g,seedID,end_id);
	}

	//Packet temp = newPacket(d_g[seedID].cost+d_g[seedID].distance,seedID);
	Insert(newPacket(d_g[seedID].cost+d_g[seedID].distance,seedID),&mypqueue);
	//Insert(temp,&mypqueue);

	// do as many steps of search as we are allowed
	for(step=0;step<num_steps;step++)
	{
		// get the smallest in the queue (also removes it)
		currentPacket = DeleteMin(&mypqueue);

		// pull out the Vertex via ID
		currentVertex = d_g[currentPacket.vertex_id];
		currentVertex.taken = true;

		// if we've reached the endpoint, quit out.
		if(currentVertex.vertex_id == end_id){
			step = num_steps;
		}else{
			// find the neighbors and calculate costs
			for(int v=0; v<currentVertex.adj_len; v++)
			{
				// cost is cost to get to current + cost to get to next node
				cost = currentVertex.cost + heuristic(d_g,currentVertex.vertex_id,currentVertex.adjacency[v]);

				// calculate the distance if not already calculated
				// extra work, but avoids an if statement
				d_g[currentVertex.adjacency[v]].distance = heuristic(d_g,currentVertex.adjacency[v],end_id);
				distance = d_g[currentVertex.adjacency[v]].distance;

				// the cost to get there is less using this node
				// i.e. we are on the shorter path to get there
				// assign the path and update the cost value
				if(cost < d_g[currentVertex.adjacency[v]].cost)
				{
					d_g[currentVertex.adjacency[v]].cost = cost;
					d_g[currentVertex.adjacency[v]].prev_vertex_id = currentVertex.vertex_id;

					// insert into the queue the cost of the node, but only
					// if our cost is less than the one it has
					// this prevents us from backtracking and lets us take over nodes where our thread
					// produced a better cost value
					Insert(newPacket(cost+distance,currentVertex.adjacency[v]),&mypqueue);
				}else if(d_g[currentVertex.adjacency[v]].cost != 16384.0*2.0)
				{
					// we've arrived at a node, we took a slower path and someone got here with less cost
				}
			}
		}
		// loop, pick the new smallest f(x) as current vertex, and go on
	}
}

#endif

AVertex *d_g is the graph, built as an array of AVertex structures indexed by vertex ID.

int *d_seedID is a pointer to an array of starting locations (vertex IDs), one per thread.

HeapStruct *d_pqueue is a pointer to an array of pre-allocated per-thread priority queues. HeapStruct implements a binary heap on a static array, currently hardcoded to 20 elements; each element is a Packet. (A rough sketch of these structures is below.)

end_id is the vertex ID tag for the target vertex.

num_steps is how many steps of the search each thread is allowed to take.
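For reference, here is roughly what the structures look like. This is only a sketch: the field names come from the kernel above, but the exact types, the typedefs, and the sentinel in Elements[0] are my guesses, not the real contents of agraph.h / binheap.cu.

// Hypothetical sketch of the structures implied by the kernel; not the real headers.
#define MAX_HEAP 20              // the hardcoded heap size mentioned above

typedef struct {
	int   vertex_id;
	int   posx, posy;            // coordinates used by heuristic()
	float cost;                  // g(x): best known cost to reach this vertex
	float distance;              // h(x): heuristic distance to end_id (-1 = not computed)
	int   prev_vertex_id;        // back-pointer for path reconstruction
	bool  taken;
	int  *adjacency;             // neighbor vertex IDs (the pointer that caused the bug)
	int   adj_len;               // number of neighbors
} AVertex;

typedef struct {
	float priority;              // f(x) = cost + distance
	int   vertex_id;
} Packet;

typedef struct {
	int    Size;
	Packet Elements[MAX_HEAP];   // Elements[0] presumably holds a low-priority sentinel,
	                             // given how Insert() (shown further down) walks to index 1
} HeapStruct;

// guesses based on how Insert()/DeleteMin() are called from the kernel
typedef Packet      ElementType;
typedef HeapStruct *PriorityQueue;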

I am currently calling this with only 1 thread, 1 block, and a graph of 5 nodes.
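For what it's worth, the launch and the error check that reports the failure look roughly like this. This is only a sketch of the call site, assuming the device pointers above have already been allocated and filled; the error-query calls are the standard runtime API ones from the 2.3 toolkit.

	// minimal sketch of the launch: 1 block, 1 thread
	aSearch<<<1, 1>>>(d_g, d_seedID, d_pqueue, end_id, num_steps);

	// the launch is asynchronous, so the unspecified launch failure only
	// shows up once we synchronize and then query the error state
	cudaError_t err = cudaThreadSynchronize();
	if (err == cudaSuccess)
		err = cudaGetLastError();
	if (err != cudaSuccess)
		printf("aSearch failed: %s\n", cudaGetErrorString(err));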

I am using a GeForce 8800 GPU (190.62 drivers), Windows XP with Visual Studio 9.0, and the CUDA 2.3 SDK.

Any ideas?

Update:

I have discovered, by commenting code out piece by piece, that the call to this function is the culprit:

__device__ void
Insert( ElementType X, PriorityQueue H )
{
	int i;

	//if( IsFull( H ) )
	//{
	//	Error( "Priority queue is full" );
	//	return;
	//}

	++H->Size;
	for( i = H->Size; H->Elements[ i / 2 ].priority > X.priority; i /= 2 ){
		H->Elements[ i ].priority = H->Elements[ i / 2 ].priority;
		H->Elements[ i ].vertex_id = H->Elements[ i / 2 ].vertex_id;
	}
	H->Elements[ i ].priority = X.priority;
	H->Elements[ i ].vertex_id = X.vertex_id;
}

I will eventually need to put the IsFull check back in, but for now each queue holds 20 elements and there are only 5 nodes, so overflow should never happen.
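When I do put the check back, something like this should work on the device. This is just a sketch: HEAP_CAPACITY is my own placeholder (20 hardcoded slots minus what I believe is a sentinel at index 0), and since Error() from the original CPU heap code isn't callable from the device, the insert is simply dropped instead.

#define HEAP_CAPACITY 19       // placeholder: 20 hardcoded slots minus the sentinel at index 0

__device__ void
Insert( ElementType X, PriorityQueue H )
{
	int i;

	// device-friendly replacement for the IsFull()/Error() pair:
	// bail out instead of calling a host-side error routine
	if( H->Size >= HEAP_CAPACITY )
		return;

	++H->Size;
	for( i = H->Size; H->Elements[ i / 2 ].priority > X.priority; i /= 2 ){
		H->Elements[ i ].priority = H->Elements[ i / 2 ].priority;
		H->Elements[ i ].vertex_id = H->Elements[ i / 2 ].vertex_id;
	}
	H->Elements[ i ].priority = X.priority;
	H->Elements[ i ].vertex_id = X.vertex_id;
}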

Does it help that I also get the following compiler warning?

“Warning: Cannot tell what pointer points to, assuming global memory space”

I ran the exact same code on the CPU, minus the CUDA copying, and it worked. So I’m thinking I’ve done something wrong that is specific to CUDA…

SOLUTION!

Here’s a warning for all you enterprising coders who want to use structs: if your structure contains a pointer, make sure you first copy the object it points to (an array or whatnot) over to the device, then re-assign the pointer in your host copy of the struct to the new device memory location, and only THEN copy the struct itself over.

For example:

// need to also copy each adjacency list within each AVertex before we copy them over
		int *d_adjlist;   // declared here for completeness
		for(int i=0;i<num_nodes;i++)
		{
			CUDA_SAFE_CALL(cudaMalloc((void**) &d_adjlist, h_g[i].adj_len*sizeof(int)));
			CUDA_SAFE_CALL(cudaMemcpy(d_adjlist, h_g[i].adjacency, h_g[i].adj_len*sizeof(int),
				cudaMemcpyHostToDevice));
			h_g[i].adjacency = d_adjlist;
		}

		// actual copy of the graph
		CUDA_SAFE_CALL(cudaMemcpy(d_g, h_g, num_nodes * sizeof(AVertex),
			cudaMemcpyHostToDevice));

Then just do the reverse when you get it all back from the device!
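The copy back looks something like this. One caveat with this sketch: the loop above overwrites h_g[i].adjacency with device addresses, so it assumes the original host pointers were saved in a separate array (h_adjlists is a hypothetical name for it) and restores them after the copy.

		// bring the graph back from the device
		CUDA_SAFE_CALL(cudaMemcpy(h_g, d_g, num_nodes * sizeof(AVertex),
			cudaMemcpyDeviceToHost));

		// each h_g[i].adjacency now holds a device pointer, so copy each
		// adjacency list back into its original host buffer and restore the pointer
		for(int i=0;i<num_nodes;i++)
		{
			int *d_adjlist = h_g[i].adjacency;      // device copy of this vertex's list
			CUDA_SAFE_CALL(cudaMemcpy(h_adjlists[i], d_adjlist,
				h_g[i].adj_len*sizeof(int), cudaMemcpyDeviceToHost));
			h_g[i].adjacency = h_adjlists[i];       // point back at host memory
			CUDA_SAFE_CALL(cudaFree(d_adjlist));    // done with the device-side list
		}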