Hi!
I’ve been learning CUDA for a school project, and I’m having a small issue: the kernel stops, seemingly at random, when I run the program, and the output always looks like this (the block ID being the iteration count of my algorithm):
.....
Point sample at 0 : (0.329302, -0.150232, 0.110162)
Entering block ID 355
GPU: Kernel run: unspecified launch failure
If I run it under cuda-gdb, however, it stops most of the time at one of the first iterations, with:
.....
[Launch of CUDA kernel 7 (MLSIter) on Device 0]
Program received signal CUDA_EXCEPTION_1, Lane Illegal Address
[Switching to CUDA kernel 7 (<<<(7,0),(0,0,0)>>>)]
0x00000000011368f8 in MLSIter<<<(256,1),(3,1,1)>>> (...) at pmini_cli.cu:226
226 int cur = dpModelGridIdx[gid] + start + blockIdx.x; // Current vertex in grid cell
I believe it’s roughly the equivalent of a segmentation fault, so I looked into the gid variable:
(cuda-gdb) print gid
warning: Variable is not live at this point. Returning garbage value.
$1 = 0
The strange thing is, if I break just before the segmentation fault, there is no warning and the value is shown correctly.
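For reference, that second session is just a breakpoint on the line from the backtrace:
(cuda-gdb) break pmini_cli.cu:226
(cuda-gdb) run
(cuda-gdb) print gid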
So here is what I’m still wondering about:
- What is the Lane Illegal Address exception, exactly? Is it really some kind of segfault?
- Why is the debugger showing these warnings when I print gid?
- I have searched for an answer on Google, but to no avail. Perhaps this happens because I’m initializing the cur variable in every thread, and it’s causing some concurrency issues? I don’t think so, but nevertheless, it’s a possibility.
- Can you initialize a variable inside a for loop? I mean, can you do something like this (see the sketch just after this snippet)?
for (int i...) {
    int var1 = 35 + threadIdx.x; // With a threadIdx
    int var2 = 35;               // Without a threadIdx
}
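To make that last question concrete, here’s a tiny self-contained kernel of the kind I mean (loopLocals and out are made-up names, just for the example):

__global__ void loopLocals (int* out) {
    for (int i = 0; i < 4; i++) {
        int var1 = 35 + threadIdx.x; // Re-initialized each iteration, with a threadIdx
        int var2 = 35;               // Re-initialized each iteration, without a threadIdx
        out[threadIdx.x] = var1 + var2 + i; // Use the values so the compiler keeps them
    }
}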
Below are some snippets from my code, in case you need to look at them:
// Run kernel on specified subset of data
MLSIter <<< blocks, threads >>> (dpModelGridIdx, dpModelGridPoints, dpModelInP, dpModelInN, dpModelOutP, dpModelOutN, ps.V.size(), pointGridSize, pNumBlock, pointMLSSize, vi, vj, vk, j);
cudaThreadSynchronize();
checkCUDAError ("Kernel run");
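checkCUDAError is the usual error-checking helper, along these lines (a sketch that matches the "GPU: Kernel run: ..." format of the message above):

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

void checkCUDAError (const char* msg) {
    cudaError_t err = cudaGetLastError(); // Last error from a runtime call or kernel launch
    if (err != cudaSuccess) {
        fprintf (stderr, "GPU: %s: %s\n", msg, cudaGetErrorString (err));
        exit (EXIT_FAILURE);
    }
}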
__global__ void MLSIter (int* dpModelGridIdx, int* dpModelGridPoints, float* dpModelInP, float* dpModelInN, float* dpModelOutP, float* dpModelOutN, int dVecCount, float dPointGridSize, int dNumBlock, float dPointMLSSize, int vi, int vj, int vk, int start) {
    // Initialize registers
    int sphGridSqSize = (int)(dPointGridSize * dNumBlock);
    sphGridSqSize = sphGridSqSize*sphGridSqSize; // Size of grid consecutive blocks to process
    int gid = GridId (dNumBlock, vi, vj, vk); // Current Grid Block Index
    __syncthreads();
    int gid_max = dNumBlock*dNumBlock*dNumBlock;
    int cur = dpModelGridIdx[gid] + start + blockIdx.x; // Current vertex in grid cell

    // Initialize barycenter computation
    float bary_sum_p[3] = {0., 0., 0.};
    float bary_sum_n[3] = {0., 0., 0.};
    float bary_sum_c[3] = {0., 0., 0.};
    float bary_coeff = 0.;
    int gid_it = 0;
    int pt_start = 0;
    int pt_end = 0;
    __syncthreads(); // Synchronize, in order to prevent values from being overwritten

    for (int dvi = -sphGridSqSize; dvi <= sphGridSqSize; dvi++) {
        for (int dvj = -sphGridSqSize; dvj <= sphGridSqSize; dvj++) {
            for (int dvk = -sphGridSqSize; dvk <= sphGridSqSize; dvk++) {
                if (vi+dvi >= dNumBlock || vi+dvi < 0 || vj+dvj >= dNumBlock || vj+dvj < 0 || vk+dvk >= dNumBlock || vk+dvk < 0) continue;
                gid_it = GridId (dNumBlock, vi+dvi, vj+dvj, vk+dvk);
                pt_start = dpModelGridIdx[gid_it];
                pt_end = gid_it < gid_max-1 ? dpModelGridIdx[gid_it+1] : dVecCount;
                for (int pt = pt_start; pt < pt_end; pt++) {
                    float dx = dpModelInP[3*dpModelGridPoints[pt]]     - dpModelInP[3*dpModelGridPoints[cur]];
                    float dy = dpModelInP[3*dpModelGridPoints[pt] + 1] - dpModelInP[3*dpModelGridPoints[cur] + 1];
                    float dz = dpModelInP[3*dpModelGridPoints[pt] + 2] - dpModelInP[3*dpModelGridPoints[cur] + 2];
                    float dist = dx*dx + dy*dy + dz*dz;
                    if (dist > dPointMLSSize) continue;
                    float kern = MLSKernel (dPointMLSSize, dist);
                    float a = (dpModelInP[3*dpModelGridPoints[pt] + threadIdx.x] * kern);
                    float b = (dpModelInN[3*dpModelGridPoints[pt] + threadIdx.x] * kern);
                    bary_sum_p[threadIdx.x] += a;
                    bary_sum_n[threadIdx.x] += b;
                    bary_coeff += kern;
                }
            }
        }
    }
    __syncthreads(); // Now we've got everything inside bary_sum

    bary_sum_p[threadIdx.x] /= bary_coeff;
    float bary_sum_n_norm = sqrt (bary_sum_n[0]*bary_sum_n[0] + bary_sum_n[1]*bary_sum_n[1] + bary_sum_n[2]*bary_sum_n[2]);
    bary_sum_n[threadIdx.x] /= bary_sum_n_norm;
    bary_sum_c[threadIdx.x] = dpModelInP[3*dpModelGridPoints[cur] + threadIdx.x] - bary_sum_p[threadIdx.x];
    __syncthreads();

    /* Project on bary_sum_p, bary_sum_n */
    float w = bary_sum_c[0]*bary_sum_n[0] + bary_sum_c[1]*bary_sum_n[1] + bary_sum_c[2]*bary_sum_n[2];
    dpModelOutP[3*blockIdx.x + threadIdx.x] = dpModelInP[3*dpModelGridPoints[cur] + threadIdx.x] - w*bary_sum_n[threadIdx.x];
    dpModelOutN[3*blockIdx.x + threadIdx.x] = bary_sum_n[threadIdx.x];
}
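In case they matter, GridId and MLSKernel are simple __device__ helpers. They boil down to something like this (sketched from memory; the exact linearization order and weight formula shouldn’t matter for the crash):

__device__ int GridId (int numBlock, int i, int j, int k) {
    // Linear index of grid cell (i, j, k) in a numBlock x numBlock x numBlock grid
    return (i*numBlock + j)*numBlock + k;
}

__device__ float MLSKernel (float h, float sqDist) {
    // Gaussian-style MLS weight for a squared distance
    return expf (-sqDist / (h*h));
}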
I’m hoping someone can explain what’s going on!