My kernel has a segmentation fault deep inside libcuda.so. cuda-gdb says that the crash is occurring at the opening curly brace of the kernel before any statements are executed, so setting breakpoints doesn’t seem to help; cuda-memcheck doesn’t reveal anything further. I’ve included the printout from cuda-gdb below, after I’ve attached to the process that will start the kernel. Does anyone have a suggestion that would help me determine the cause of this error?
I’m using Cuda 4.0 on a CentOS 5.6 Linux box and a very recent Nvidia driver. My device is a 580. The line of cuda-gdb output below in bold red gives the location of the opening curly brace of the kernel.
cuda-gdb program 11906
NVIDIA (R) CUDA Debugger
4.0 release
Portions Copyright (C) 2007-2011 NVIDIA Corporation
GNU gdb 6.6
Copyright (C) 2006 Free Software Foundation, Inc.
GDB is free software, covered by the GNU General Public License, and you are
welcome to change it and/or distribute copies of it under certain conditions.
Type “show copying” to see the conditions.
There is absolutely no warranty for GDB. Type “show warranty” for details.
This GDB was configured as “x86_64-unknown-linux-gnu”…
No symbol table is loaded. Use the “file” command.
program: No such file or directory.
Attaching to process 11906
Reading symbols from /home/kmccall/Cuda2/mcdm…done.
Using host libthread_db library “/lib64/libthread_db.so.1”.
Reading symbols from /lib64/librt.so.1…done.
Loaded symbols for /lib64/librt.so.1
Reading symbols from /usr/local/cuda/lib64/libcudart.so.4…done.
Loaded symbols for /usr/local/cuda/lib64/libcudart.so.4
Reading symbols from /usr/lib64/libstdc++.so.6…done.
Loaded symbols for /usr/lib64/libstdc++.so.6
Reading symbols from /lib64/libm.so.6…done.
Loaded symbols for /lib64/libm.so.6
Reading symbols from /lib64/libgcc_s.so.1…done.
Loaded symbols for /lib64/libgcc_s.so.1
Reading symbols from /lib64/libc.so.6…done.
Loaded symbols for /lib64/libc.so.6
Reading symbols from /lib64/libpthread.so.0…done.
[Thread debugging using libthread_db enabled]
[New Thread 47133281174816 (LWP 11906)]
Loaded symbols for /lib64/libpthread.so.0
Reading symbols from /lib64/ld-linux-x86-64.so.2…done.
Loaded symbols for /lib64/ld-linux-x86-64.so.2
Reading symbols from /lib64/libdl.so.2…done.
Loaded symbols for /lib64/libdl.so.2
Reading symbols from /usr/lib64/libcuda.so…done.
Loaded symbols for /usr/lib64/libcuda.so
Reading symbols from /usr/lib64/libz.so.1…done.
Loaded symbols for /usr/lib64/libz.so.1
warning: no loadable sections found in added symbol-file system-supplied DSO at 0x7fff7cbfc000
0x00000030b78d4ef5 in recv () from /lib64/libc.so.6
Breakpoint 1 at 0x470e0e: file KernelGroupKNN.cu, line 1282.
Breakpoint 2 at 0x46ed0d: file KernelGroupKNN.cu, line 1675.
(cuda-gdb) c
Continuing.
[Switching to Thread 47133281174816 (LWP 11906)]
Breakpoint 1, KernelGroupKNN::generateVars (this=0x1d5e8bf0, device=1, buffer_ndx=0, d_buf=0x200315200, d_min=0x200401000, d_max=0x200401600, d_cvars=0x200401c00, starting_ndx=0, n_to_generate=15) at KernelGroupKNN.cu:1282
1282 kernelGenerateCvars <<<grid, block, Ns>>> (
(cuda-gdb) c
Continuing.
[b]Program received signal SIGSEGV, Segmentation fault.
0x00002ade1289a1b3 in ?? () from /usr/lib64/libcuda.so[/b]
(cuda-gdb) where
#0 0x00002ade1289a1b3 in ?? () from /usr/lib64/libcuda.so
#1 0x00002ade128eeb1e in ?? () from /usr/lib64/libcuda.so
#2 0x00002ade12890479 in ?? () from /usr/lib64/libcuda.so
#3 0x00002ade128a1953 in ?? () from /usr/lib64/libcuda.so
#4 0x00002ade128714e5 in ?? () from /usr/lib64/libcuda.so
#5 0x00002ade128626ac in ?? () from /usr/lib64/libcuda.so
#6 0x00002ade1255a453 in ?? () from /usr/local/cuda/lib64/libcudart.so.4
#7 0x00002ade125900ca in cudaLaunch () from /usr/local/cuda/lib64/libcudart.so.4
#8 0x0000000000455039 in cudaLaunch (entry=0x46ec3c “UH\211?H\201??”) at /usr/local/cuda/bin/…/include/cuda_runtime.h:949
#9 0x000000000046ec3a in __device_stub__Z19kernelGenerateCvarsiiiiiiiiiiiiPfS_S_S_S_PiS_S_S_P15CompoundVarKDEC (__par0=1576, __par1=15, __par2=15, __par3=0, __par4=0, __par5=1, __par6=-1, __par7=64, __par8=6656, __par9=6656,
__par10=512, __par11=512, __par12=0x200200000, __par13=0x200400000, __par14=0x200400200, __par15=0x200500000, __par16=0x200501e00, __par17=0x200400e00, __par18=0x200315200, __par19=0x200401000, __par20=0x200401600,
__par21=0x200401c00) at /tmp/tmpxft_000037d6_00000000-1_<b>KernelGroupKNN.cudafe1.stub.c:5</b>
[b]#10 0x000000000046ed0d in kernelGenerateCvars (__cuda_0=1576, __cuda_1=15, __cuda_2=15, __cuda_3=0, __cuda_4=0, __cuda_5=1, __cuda_6=-1, __cuda_7=64, __cuda_8=6656, __cuda_9=6656, __cuda_10=512, __cuda_11=512, __cuda_12=0x200200000,
__cuda_13=0x200400000, __cuda_14=0x200400200, __cuda_15=0x200500000, __cuda_16=0x200501e00, __cuda_17=0x200400e00, __cuda_18=0x200315200, __cuda_19=0x200401000, __cuda_20=0x200401600, __cuda_21=0x200401c00)
at KernelGroupKNN.cu:1668 THIS IS THE OPENING CURLY BRACE OF THE KERNEL [/b]
#11 0x0000000000471036 in KernelGroupKNN::generateVars (this=0x1d5e8bf0, device=1, buffer_ndx=0, d_buf=0x200315200, d_min=0x200401000, d_max=0x200401600, d_cvars=0x200401c00, starting_ndx=0, n_to_generate=15) at KernelGroupKNN.cu:1282
#12 0x0000000000472cfd in KernelGroupKNN::runKernels (this=0x1d5e8bf0, device=1, device_relinquished=false, pass=0xd561ac) at KernelGroupKNN.cu:355
#13 0x000000000043cd62 in GpuServer::gpuProcessLoop (this=0x1d5c71c0) at GpuServer.cxx:327
#14 0x0000000000438d75 in multiProcessLoop () at GpuMain.cxx:424
#15 0x0000000000439df9 in main (argc=1, argv=0x7fff7cbcdae8) at GpuMain.cxx:148
Here is the kernel:
// Builds individual and compound ("cvar") variables from Monte Carlo sample
// data: one thread per sample, writing results into the global buffer d_buf_
// and per-block min/max partials via the device helpers
// kernelGenIndividualVar() and kernelGenCompoundVar() (defined elsewhere in
// this file; their internals — including how they use the shared
// min_sh/max_sh buffers passed to them — are not visible here).
//
// Launch expectations (derived from the code below):
//   * 1-D grid, 1-D block; global index sampx = blockIdx.x*blockDim.x +
//     threadIdx.x selects one Monte Carlo sample, guarded by
//     sampx < n_samples.
//   * Dynamic shared memory (the Ns launch argument) must be at least
//     2 * blockDim.x * sizeof(FLOAT_TYPE_KNN): min_sh and max_sh each take
//     blockDim.x elements of the extern shared array.
//
// NOTE(review): every __syncthreads() below sits inside the
// "if (sampx < n_samples)" branch, and the early "return" statements exit
// past later barriers. Threads with sampx >= n_samples never reach any
// barrier, so these are divergent barriers — undefined behavior per the
// CUDA programming guide (often tolerated on Fermi-era hardware, but the
// kernel should be restructured so every thread of a block reaches each
// barrier). Whether this relates to the host-side segfault inside
// libcuda.so during cudaLaunch is unconfirmed — verify with
// cuda-memcheck/synccheck and by checking all host API return codes.
__global__ void kernelGenerateCvars(
    const int n_samples,                    // # of Monte Carlo samples (runs)
    const int n_input_vars_,                // # of individual variables to combine
    const int n_cvars_now_,                 // # of indiv/compound vars to generate
    const int cvar_ndx_,                    // base absolute indiv/compound var ndx
    const int ivar_ndx1_,                   // individual input var index 1
    const int ivar_ndx2_,                   // individual input var index 2
    const int op_,                          // operation code for the two vars
    const int n_blocks,                     // presumably # of blocks in the grid — TODO confirm (original comment was just "NOTE")
    const int d_monte_carlo_pitch_,         // pitch value for Monte Carlo matrix (in elements, given the indexing below)
    const int d_buf_pitch_,                 // pitch value for d_buf_ array
    const int d_min_per_block_pitch_,       // pitch value for d_min_per_block
    const int d_max_per_block_pitch_,       // pitch value for d_max_per_block
    FLOAT_TYPE_MINMAX *d_monte_carlo_data_, // Monte Carlo data matrix
    FLOAT_TYPE_MINMAX *d_min_mc_,           // minimums for Monte Carlo variables
    FLOAT_TYPE_MINMAX *d_max_mc_,           // maximums for Monte Carlo variables
    FLOAT_TYPE_KNN *d_min_per_block_,       // array of partial minimums
    FLOAT_TYPE_KNN *d_max_per_block_,       // array of partial maximums
    int *d_var_list_,                       // array of input variable indices
    FLOAT_TYPE_KNN *d_buf_,                 // indiv/compound vars data matrix
    FLOAT_TYPE_KNN *d_min_,                 // indiv/compound vars minimums array
    FLOAT_TYPE_KNN *d_max_,                 // indiv/compound vars maximums array
    CompoundVarKDEC *d_cvars_               // indiv/compound vars specifications
    )
{   // <== line 1668, where the reported segfault occurs (author's annotation)
    // NOTE: need to fix shared memory function (original author's note)
    // Split the dynamic shared array into two blockDim.x-element halves.
    extern __shared__ FLOAT_TYPE_KNN shared_memory[];
    FLOAT_TYPE_KNN *min_sh = shared_memory;
    FLOAT_TYPE_KNN *max_sh = shared_memory + blockDim.x;
    int tx = threadIdx.x;
    int bx = blockIdx.x;
    int sampx = bx * blockDim.x + tx;   // global sample index for this thread
    int v1ndx, v2ndx, vid1, vid2, start_in_loop;
    bool all_pass = false;              // becomes true once all three op stages have run
    // NOTE(review): cvid holds an integer variable id but is declared with the
    // floating-point type FLOAT_TYPE_KNN; if that type is float, ids above
    // 2^24 silently lose precision. Presumably the helpers expect this type —
    // confirm before changing it to int.
    FLOAT_TYPE_KNN xs1, xs2, cxs, cvid, sign_xs2;
    // a thread for each Monte Carlo sample
    if (sampx < n_samples)
    {
        // this logic makes it possible to start at any of the three operations
        // (NONE, subtract or divide) in the loops below, corresponding to the
        // variable indices and op code arguments
        // NOTE(review): start_in_loop remains uninitialized if op_ matches
        // none of the three codes; the "start_in_loop <= k" tests below would
        // then read garbage. Add a default (or an assert) if other op codes
        // can reach this kernel.
        if (op_ == CompoundVarKDEC::OP_NONE_) start_in_loop = 0;
        else if (op_ == CompoundVarKDEC::OP_SUBTRACT_) start_in_loop = 1;
        else if (op_ == CompoundVarKDEC::OP_DIVIDE_) start_in_loop = 2;
        cvid = cvar_ndx_;
        v1ndx = ivar_ndx1_;
        v2ndx = ivar_ndx2_;
        // variable 1 loop. We include the very last input var so that it will be
        // written individually out to global memory, not as part of a compound var.
        // That is done immediately below by kernelGenIndividualVar().
        while (v1ndx < n_input_vars_)
        {
            // each thread reads one Monte Carlo sample from global memory for
            // this run (sampx)
            // NOTE(review): divergent barrier — see header comment.
            __syncthreads();
            vid1 = d_var_list_[v1ndx];
            xs1 = d_monte_carlo_data_[vid1 * d_monte_carlo_pitch_ + sampx];
            if (all_pass || start_in_loop <= 0)
            {
                // place the individual variable array formed by all xs1 into
                // the global memory buffer d_buf_, which is where the compound
                // variables will also go. Note that the argument cvid is incremented
                kernelGenIndividualVar(n_samples, sampx, tx, bx, cvar_ndx_, cvid++, vid1,
                    n_blocks, d_buf_pitch_, d_min_per_block_pitch_, d_max_per_block_pitch_,
                    xs1, d_min_mc_, d_max_mc_, d_buf_,
                    d_min_per_block_, d_max_per_block_, d_cvars_);
                // stop once the requested number of vars has been generated
                // NOTE(review): early return past later barriers — see header.
                if (cvid - cvar_ndx_ >= n_cvars_now_) return;
            }
            // variable2 loop. It won't execute if vid1 is at the last MC input
            // variable
            while (v2ndx < n_input_vars_ && v1ndx < n_input_vars_ - 1)
            {
                // each thread reads one Monte Carlo sample from global memory
                // NOTE(review): divergent barrier — see header comment.
                __syncthreads();
                vid2 = d_var_list_[v2ndx];
                xs2 = d_monte_carlo_data_[vid2 * d_monte_carlo_pitch_ + sampx];
                // a compound variable created from the difference of the two MC
                // variables
                cxs = xs1 - xs2;
                if (all_pass || start_in_loop <= 1)
                {
                    // generate a compound variable. Note that cvid is incremented
                    kernelGenCompoundVar(n_samples, sampx, tx, bx, vid1, vid2, cvar_ndx_,
                        cvid++, n_blocks, d_buf_pitch_, d_min_per_block_pitch_,
                        d_max_per_block_pitch_, cxs, CompoundVarKDEC::OP_SUBTRACT_, min_sh, max_sh,
                        d_buf_, d_min_per_block_, d_max_per_block_, d_cvars_);
                    // NOTE(review): early return past later barriers — see header.
                    if (cvid - cvar_ndx_ >= n_cvars_now_) return;
                }
                // NOTE(review): divergent barrier — see header comment.
                __syncthreads();
                // a compound variable created from the quotient of the two MC
                // variables. Clamp xs2 away from zero (preserving its sign) to
                // avoid division by ~0.
                // NOTE(review): 0.0 / 1.0 / 1.0e-06 are double literals; if
                // FLOAT_TYPE_KNN is float this forces double arithmetic —
                // consider f-suffixed literals. Confirm the intended type.
                sign_xs2 = (xs2 >= 0.0 ? 1.0 : -1.0);
                if (xs2 > -1.0e-06 && xs2 < 1.0e-06) xs2 = sign_xs2 * 1.0e-06;
                cxs = xs1 / xs2;
                if (all_pass || start_in_loop <= 2)
                {
                    // generate a compound variable. Note that cvid is incremented
                    kernelGenCompoundVar(n_samples, sampx, tx, bx, vid1, vid2, cvar_ndx_,
                        cvid++, n_blocks, d_buf_pitch_, d_min_per_block_pitch_,
                        d_max_per_block_pitch_, cxs, CompoundVarKDEC::OP_DIVIDE_, min_sh, max_sh,
                        d_buf_, d_min_per_block_, d_max_per_block_, d_cvars_);
                    // NOTE(review): early return past later barriers — see header.
                    if (cvid - cvar_ndx_ >= n_cvars_now_) return;
                }
                // from here on, every op stage runs regardless of start_in_loop
                all_pass = true;
                v2ndx++;
            }
            // advance to the next variable-1; variable-2 restarts just past it
            v1ndx++;
            v2ndx = v1ndx + 1;
        }
    }
}