Illegal Lane Address In Thread

Hi!

I’ve been learning CUDA for a school project, and I’m having a small issue. The kernel stops, seemingly randomly, when I run the program, and the output is always this (block ID being the iteration count of my algorithm) :

.....

Point sample at 0 : (0.329302, -0.150232, 0.110162)

Entering block ID 355

GPU: Kernel run: unspecified launch failure

If I run it under cuda-gdb, however, it stops, most of the time, at one of the first iterations, with :

.....

[Launch of CUDA kernel 7 (MLSIter) on Device 0]

Program received signal CUDA_EXCEPTION_1, Lane Illegal Address

[Switching to CUDA kernel 7 (<<<(7,0),(0,0,0)>>>)]

0x00000000011368f8 in MLSIter<<<(256,1),(3,1,1)>>> (...) at pmini_cli.cu:226

226                int cur = dpModelGridIdx[gid] + start + blockIdx.x; // Current vertex in grid cell

I believe it’s roughly an equivalent to a segmentation fault, so I looked into the gid variable :

(cuda-gdb) print gid

waring: Variable is not live at this point. Returning garbage value.

$1 = 0

The strange thing is, if I break just before the segmentation fault, there is no warning and the value is shown correctly.

So here is what I’m stil wondering about :

  1. What is the Lane Illegal Address exception, exactly? Is it really some kind of segfault?

  2. Why is the debugger showing these warnings when I print gid?

  3. I have searched for an answer on google, but to no avail. Perhaps this happens because I’m initializing the cur variable in every thread, and it’s causing some concurrency issues? I don’t think so, but nevertheless, it’s a possibility.

  4. Can you initialize a variable inside a for loop? I mean, can you do something like that?

for (int i...) {

	int var1 = 35 + threadIdx.x; // With a threadIdx

	int var2 = 35; // Without a threadIdx

}

Below are some snippets from my code, if you need to look at them :

// Run kernel on specified subset of data

MLSIter <<< blocks, threads >>> (dpModelGridIdx, dpModelGridPoints, dpModelInP, dpModelInN, dpModelOutP, dpModelOutN, ps.V.size(), pointGridSize, pNumBlock, pointMLSSize, vi, vj, vk, j);

cudaThreadSynchronize();

checkCUDAError ("Kernel run");
__global__ void MLSIter (int* dpModelGridIdx, int* dpModelGridPoints, float* dpModelInP, float* dpModelInN, float* dpModelOutP, float* dpModelOutN, int dVecCount, float dPointGridSize, int dNumBlock, float dPointMLSSize, int vi, int vj, int vk, int start) {

	// Initialize registers

	int sphGridSqSize = (int)(dPointGridSize * dNumBlock); sphGridSqSize = sphGridSqSize*sphGridSqSize; // Size of grid consecutive blocks to process

	int gid = GridId(dNumBlock, vi, vj, vk); // Current Grid Block Index

	__syncthreads();

	int gid_max = dNumBlock*dNumBlock*dNumBlock;

	int cur = dpModelGridIdx[gid] + start + blockIdx.x; // Current vertex in grid cell

	// Initialize barycenter computation

	float bary_sum_p[3] = {0., 0., 0.};

	float bary_sum_n[3] = {0., 0., 0.};

	float bary_sum_c[3] = {0., 0., 0.};

	float bary_coeff = 0.;

	int gid_it = 0;

	int pt_start = 0;

	int pt_end = 0;

	__syncthreads(); // Synchronize, in order to prevent values from being overwritten

	for (int dvi = -sphGridSqSize; dvi <= sphGridSqSize; dvi++) {

		for (int dvj = -sphGridSqSize; dvj <= sphGridSqSize; dvj++) {

			for (int dvk = -sphGridSqSize; dvk <= sphGridSqSize; dvk++) {

				if (vi+dvi >= dNumBlock || vi+dvi < 0 || vj+dvj >= dNumBlock || vj+dvj < 0 || vk+dvk >= dNumBlock || vk+dvk < 0) continue;

				gid_it = GridId (dNumBlock, vi+dvi, vj+dvj, vk+dvk);

				pt_start = dpModelGridIdx[gid_it];

				pt_end = gid_it<gid_max-1 ? dpModelGridIdx[gid_it+1] : dVecCount;

				for (int pt=pt_start; pt<pt_end; pt++) {

					float dx = dpModelInP[3*dpModelGridPoints[pt]]   - dpModelInP[3*dpModelGridPoints[cur]];

					float dy = dpModelInP[3*dpModelGridPoints[pt] + 1] - dpModelInP[3*dpModelGridPoints[cur] + 1];

					float dz = dpModelInP[3*dpModelGridPoints[pt] + 2] - dpModelInP[3*dpModelGridPoints[cur] + 2];

					float dist = dx*dx + dy*dy + dz*dz;

					if (dist > dPointMLSSize) continue;

					float kern = MLSKernel (dPointMLSSize, dist);

					float a = (dpModelInP[3*dpModelGridPoints[pt] + threadIdx.x] * kern);

					float b = (dpModelInN[3*dpModelGridPoints[pt] + threadIdx.x] * kern);

					bary_sum_p[threadIdx.x] += a;

					bary_sum_n[threadIdx.x] += b;

					bary_coeff += kern;

				}

			}

		}

	}

	__syncthreads(); // Now we've got everything inside bary_sum

	bary_sum_p[threadIdx.x] /= bary_coeff;

	float bary_sum_n_norm = sqrt (bary_sum_n[0]*bary_sum_n[0] + bary_sum_n[1]*bary_sum_n[1] + bary_sum_n[2]*bary_sum_n[2]);

	bary_sum_n[threadIdx.x] /= bary_sum_n_norm;

	bary_sum_c[threadIdx.x] = dpModelInP[3*dpModelGridPoints[cur] + threadIdx.x] - bary_sum_p[threadIdx.x];

	__syncthreads();

	/* Project on bary_sum_p, bary_sum_n */

	float w = bary_sum_c[0]*bary_sum_n[0] + bary_sum_c[1]*bary_sum_n[1] + bary_sum_c[2]*bary_sum_n[2];

	dpModelOutP[3*blockIdx.x + threadIdx.x] = dpModelInP[3*dpModelGridPoints[cur] + threadIdx.x] - w*bary_sum_n[threadIdx.x];

	dpModelOutN[3*blockIdx.x + threadIdx.x] = bary_sum_n[threadIdx.x];

}

I’m hoping someone can explain ;)