Error 700: Illegal address during kernel execution

Hello to everyone.

I have some troubles with this function (#pragma acc routine):

#pragma acc routine
template<typename Node1, typename Node2, typename Edge1, typename Edge2,
		typename NodeComparisonFunctor, typename EdgeComparisonFunctor>
int VF3ParallelSubState<Node1, Node2, Edge1, Edge2, NodeComparisonFunctor,
		EdgeComparisonFunctor>::ParallelIsFeasiblePair(node_id node1,
		node_id node2) {

	int isFeasible = 0;

	if ((node1 < n1) && (node2 < n2) && (core_1[node1] == NULL_NODE)
			&& (core_2[node2] == NULL_NODE)) {

		if (!nf(g1->GetNodeAttr(node1), g2->GetNodeAttr(node2)))
			isFeasible++;

		if (g1->InEdgeCount(node1) > g2->InEdgeCount(node2)
				|| g1->OutEdgeCount(node1) > g2->OutEdgeCount(node2))
			isFeasible++;

		int oec1 = g1->OutEdgeCount(node1);
		int iec1 = g1->InEdgeCount(node1);
		int oec2 = g2->OutEdgeCount(node2);
		int iec2 = g2->InEdgeCount(node2);

		const Edge1 *restrict vecEdgesOut1 = g1->GetOutEdgeAttrVec(node1);
		const Edge1 *restrict vecEdgesIn1 = g1->GetInEdgeAttrVec(node1);
		const node_id *restrict vecNodesOut1 = g1->GetOutEdgeNodeVec(node1);
		const node_id *restrict vecNodesIn1 = g1->GetInEdgeNodeVec(node1);
		const node_id *restrict vecNodesOut2 = g2->GetOutEdgeNodeVec(node2);
		const node_id *restrict vecNodesIn2 = g2->GetInEdgeNodeVec(node2);

		// Check the 'out' edges of node1
#pragma acc loop independent reduction(+:isFeasible)
		for (int i = 0; i < oec1; i++) {
			Edge1 eattr1 = vecEdgesOut1[i];
			node_id other1 = vecNodesOut1[i];
			int c_other = class_1[other1];
			if (core_1[other1] != NULL_NODE) {
				Edge2 eattr2;
				node_id other2 = core_1[other1];
				if (!g2->HasEdge(node2, other2, eattr2) || !ef(eattr1, eattr2))
					isFeasible++;
			}
		}

		// Check the 'in' edges of node1
#pragma acc loop independent reduction(+:isFeasible)
		for (int i = 0; i < iec1; i++) {
			Edge1 eattr1 = vecEdgesIn1[i];
			node_id other1 = vecNodesIn1[i];
			int c_other = class_1[other1];
			if (core_1[other1] != NULL_NODE) {
				Edge2 eattr2;
				node_id other2 = core_1[other1];
				if (!g2->HasEdge(other2, node2, eattr2) || !ef(eattr1, eattr2))
					isFeasible++;
			}
		}

		// Check the 'out' edges of node2
#pragma acc loop independent reduction(+:isFeasible)
		for (int i = 0; i < oec2; i++) {
			node_id other2 = vecNodesOut2[i];
			int c_other = class_2[other2];
			if (core_2[other2] != NULL_NODE) {
				node_id other1 = core_2[other2];
				if (!g1->HasEdge(node1, other1))
					isFeasible++;
			}
		}

		// Check the 'in' edges of node2
#pragma acc loop independent reduction(+:isFeasible)
		for (int i = 0; i < iec2; i++) {
			node_id other2 = vecNodesIn2[i];
			int c_other = class_2[other2];
			if (core_2[other2] != NULL_NODE) {
				node_id other1 = core_2[other2];
				if (!g1->HasEdge(other1, node1))
					isFeasible++;
			}
		}

	} else
		isFeasible++;

	return isFeasible;

}

This acc routine is called from this another piece of code shown below:

template<typename Node1, typename Node2, typename Edge1, typename Edge2,
		typename NodeComparisonFunctor, typename EdgeComparisonFunctor>
void VF3ParallelSubState<Node1, Node2, Edge1, Edge2, NodeComparisonFunctor,
		EdgeComparisonFunctor>::ParallelNextPairFromS0(node_id* restrict &prova) {

	node_id curr_n1 = order[core_len];
	int c = class_1[curr_n1];

	prova = (node_id*) malloc(sizeof(node_id) * n2);

#pragma acc parallel loop copy(prova[:n2])
	for (int i = 0; i < n2; i++) {
		if (class_2[i] == c) {
			if (ParallelIsFeasiblePair(curr_n1, (node_id) i)==0) {
				prova[i] = (node_id) i + 1;
			} else {
				prova[i] = (node_id) -1;
			}
		} else
		prova[i] = (node_id) 0;
	}
}

When I run it, I get the following error:

Accelerator Kernel Timing data
/home/christian/eclipse-workspace/ParallelVF3/src/../include/vf3_parallel_sub_state.hpp
  _ZN19VF3ParallelSubStateIii5EmptyS0_18EqualityComparatorIiiES1_IS0_S0_EE22ParallelNextPairFromS0ERrPt  NVIDIA  devicenum=0
    time(us): 14
    518: compute region reached 1 time
        518: kernel launched 1 time
            grid: [1]  block: [128]
             device time(us): total=0 max=0 min=0 avg=0
    518: data region reached 2 times
        34: kernel launched 1 time
            grid: [1]  block: [128]
            elapsed time(us): total=820 max=820 min=820 avg=820
        518: data copyin transfers: 2
             device time(us): total=14 max=11 min=3 avg=7
call to cuMemFreeHost returned error 700: Illegal address during kernel execution
Makefile:38: set di istruzioni per l'obiettivo "run_cuda" non riuscito
make: *** [run_cuda] Errore 1

but I don’t know why. Can you help me please?

Hi khrishino,

How are the arrays and pointers (like core_1 and g2) created and copied to the device? Are these class variables or global variables?

My guess is that you don’t have these variables over on the device, hence are trying to dereference host addresses.

Assuming that these variables are dynamically allocated, you can try using the flag “-ta=tesla:managed” to use CUDA Unified Memory. With unified memory, you can access host addresses on the device and the CUDA driver will implicitly copy the data to the device/host for you. Unified Memory is particularly useful for C++ codes like this where manually managing the device data can be difficult.

-Mat

Hi mkcolg,

thank you for your reply. I’m actually using Cuda Unified Memory, but I’m still having a lot of troubles.
Of course, the variables are dynamically allocated.

Are the variables class data members within the same class method or are they global variables?

If they are global variables, you’ll still need to put the pointer in a “declare create” directive so they will have a device reference that the device routine can access. Also, you’ll need to update the global pointer with the address to the array data. With Unified Memory, the address can be a host address, but still needs to be populated on the device.

-Mat

Hello Mat,

the variables are Class Variables and (I suppose) I copy them on the device in the class constructor:

template<typename Node1, typename Node2, typename Edge1, typename Edge2,
		typename NodeComparisonFunctor, typename EdgeComparisonFunctor>
VF3ParallelSubState<Node1, Node2, Edge1, Edge2, NodeComparisonFunctor,
		EdgeComparisonFunctor>::VF3ParallelSubState(ARGraph<Node1, Edge1> *ag1,
		ARGraph<Node2, Edge2> *ag2, int *restrict class1, int *restrict class2,
		int nclass, node_id *restrict orderVec) {

	if (class1 != NULL && class2 != NULL) {
		g1 = ag1;
		g2 = ag2;
		n1 = g1->NodeCount();
		n2 = g2->NodeCount();
		last_candidate_index = 0;
		classes_count = nclass;
		core_len = orig_core_len = 0;
		added_node1 = NULL_NODE;

		core_1 = (node_id*) malloc(sizeof(node_id) * n1);
		core_2 = (node_id*) malloc(sizeof(node_id) * n2);
		core_len_c = (node_id*) malloc(sizeof(node_id) * classes_count);
		predecessors = (node_id*) malloc(sizeof(node_id) * n1);
		dir = (node_dir_t*) malloc(sizeof(node_dir_t) * n1);
		order = (node_id*) malloc(sizeof(node_id) * n1);
		class_1 = (int*) malloc(sizeof(int) * n1);
		class_2 = (int*) malloc(sizeof(int) * n2);

#pragma acc enter data copyin(this[:1]) \
	create(g1[0:1], g2[0:1], core_1[:n1], core_2[:n2], core_len_c[:classes_count], \
			predecessors[:n1], dir[:n1], order[:n1], class_1[:n1], class_2[:n2])

#pragma acc parallel \
	present(g1[0:1],core_1[:n1], core_2[:n2], core_len_c[:classes_count], predecessors[:n1], dir[:n1], order[:n1], class_1[:n1], class_2[:n2]) \
	vector_length(32) \
	copyin(orderVec[:n1], class1[:n1], class2[:n2])
		{
#pragma acc loop independent
			for (int i = 0; i < n1; i++) {
				core_1[i] = NULL_NODE;
				dir[i] = NODE_DIR_NONE;
				predecessors[i] = NULL_NODE;
				order[i] = orderVec[i];
				class_1[i] = class1[i];
			}

#pragma acc loop independent
			for (int i = 0; i < n2; i++) {
				core_2[i] = NULL_NODE;
				class_2[i] = class2[i];
			}
			ComputeFirstGraphTraversing();
		}
	}
}

But when I try to call n1 or core_1 in the previous_#pragma acc routine_ I get the following error:

call to cuMemFreeHost returned error 700: Illegal address during kernel execution

I’m really hopeless because I don’t know what to do :(

Moreover, if I run this code:

template<typename Node1, typename Node2, typename Edge1, typename Edge2,
		typename NodeComparisonFunctor, typename EdgeComparisonFunctor>
int VF3ParallelSubState<Node1, Node2, Edge1, Edge2, NodeComparisonFunctor,
		EdgeComparisonFunctor>::ParallelIsFeasiblePair(node_id node1,
		node_id node2) {

	int isFeasible = 0;

	if ((node1 < this->n1) && (node2 < n2) && (core_1[node1] == NULL_NODE)
			&& (core_2[node2] == NULL_NODE)) {

		if (!nf(g1->GetNodeAttr(node1), g2->GetNodeAttr(node2))) {
			isFeasible++;

			if (g1->InEdgeCount(node1) > g2->InEdgeCount(node2)
					|| g1->OutEdgeCount(node1) > g2->OutEdgeCount(node2))
				isFeasible++;

			int oec1 = g1->OutEdgeCount(node1);
			int iec1 = g1->InEdgeCount(node1);
			int oec2 = g2->OutEdgeCount(node2);
			int iec2 = g2->InEdgeCount(node2);

			const Edge1 *restrict vecEdgesOut1 = g1->GetOutEdgeAttrVec(node1);
			const Edge1 *restrict vecEdgesIn1 = g1->GetInEdgeAttrVec(node1);
			const node_id *restrict vecNodesOut1 = g1->GetOutEdgeNodeVec(node1);
			const node_id *restrict vecNodesIn1 = g1->GetInEdgeNodeVec(node1);
			const node_id *restrict vecNodesOut2 = g2->GetOutEdgeNodeVec(node2);
			const node_id *restrict vecNodesIn2 = g2->GetInEdgeNodeVec(node2);

#pragma acc parallel reduction(+:isFeasible)
			{
				// Check the 'out' edges of node1
#pragma acc loop independent reduction(+:isFeasible)
				for (int i = 0; i < oec1; i++) {
					Edge1 eattr1;
					node_id other1;
					eattr1 = vecEdgesOut1[i];
					other1 = vecNodesOut1[i];
					int c_other = class_1[other1];
					if (core_1[other1] != NULL_NODE) {
						Edge2 eattr2;
						node_id other2 = core_1[other1];
						if (!g2->HasEdge(node2, other2, eattr2)
								|| !ef(eattr1, eattr2))
							isFeasible++;
					}
				}

				// Check the 'in' edges of node1
#pragma acc loop independent reduction(+:isFeasible)
				for (int i = 0; i < iec1; i++) {
					Edge1 eattr1;
					node_id other1;
					eattr1 = vecEdgesIn1[i];
					other1 = vecNodesIn1[i];
					int c_other = class_1[other1];
					if (core_1[other1] != NULL_NODE) {
						Edge2 eattr2;
						node_id other2 = core_1[other1];
						if (!g2->HasEdge(other2, node2, eattr2)
								|| !ef(eattr1, eattr2))
							isFeasible++;
					}
				}

				// Check the 'out' edges of node2
#pragma acc loop independent reduction(+:isFeasible)
				for (int i = 0; i < oec2; i++) {
					node_id other2;
					other2 = vecNodesOut2[i];
					int c_other = class_2[other2];
					if (core_2[other2] != NULL_NODE) {
						node_id other1 = core_2[other2];
						if (!g1->HasEdge(node1, other1))
							isFeasible++;
					}
				}

				// Check the 'in' edges of node2
#pragma acc loop independent reduction(+:isFeasible)
				for (int i = 0; i < iec2; i++) {
					node_id other2;
					other2 = vecNodesIn2[i];
					int c_other = class_2[other2];
					if (core_2[other2] != NULL_NODE) {
						node_id other1 = core_2[other2];
						if (!g1->HasEdge(other1, node1))
							isFeasible++;
					}
				}
			}

		} else
			isFeasible++;

		return isFeasible;

	}
}

it works correctly. Why?

The problem occurs when I try to run it in an acc routine (so the compute region is outside the function, because it isn’t possibile to have nested parallelism with PGI compiler).

Can anyone help me please?