NVreg_InitializeSystemMemoryAllocations and CUDA UVM Bus errors

When we use the nvidia driver’s kernel module option:

options nvidia NVreg_InitializeSystemMemoryAllocations=0

in the /etc/modprobe.d/nvidia.conf file, the nvidia driver skips
zeroing the system memory buffers it allocates for use with the GPU.
We prefer to disable this clearing of buffer memory for efficiency reasons.

When this option is set to 0 (clearing disabled), the CUDA v6.0 sample
programs “UnifiedMemoryStreams” and “conjugateGradientUM” will usually
terminate with a “Bus error” on GPUs with compute capability 3.5 or greater.

However, when this option is set to 1 (the default nvidia driver setting),
these sample programs run successfully with no Bus error issues.

For example, with NVreg_InitializeSystemMemoryAllocations disabled:

UnifiedMemoryStreams

UnifiedMemoryStreams
GPU Device 0: “GeForce GT 640” with compute capability 3.5

Bus error

echo $?

263

We’ve tried both the NVIDIA-Linux-x86_64-337.25.run driver and the
331.62 .run driver. The failure occurs with various kernel versions,
including the CentOS 6.5 kernel:

uname -a

Linux mickey 2.6.32-431.el6.x86_64 #1 SMP Fri Nov 22 03:15:09 UTC 2013
x86_64 x86_64 x86_64 GNU/Linux

The GPU being used is:

lspci -s 83:0.0

83:00.0 VGA compatible controller:
NVIDIA Corporation GK208 [GeForce GT 640 Rev. 2] (rev a1)

lspci -s 83:0.0 -n

83:00.0 0300: 10de:1282 (rev a1)

Here is some cuda-gdb output from “conjugateGradientUM”
when it hits the Bus error:

cuda-gdb conjugateGradientUM

NVIDIA (R) CUDA Debugger
6.0 release
Portions Copyright (C) 2007-2014 NVIDIA Corporation
GNU gdb (GDB) 7.6
Copyright (C) 2013 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later http://gnu.org/licenses/gpl.html
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law. Type “show copying”
and “show warranty” for details.
This GDB was configured as “x86_64-unknown-linux-gnu”.
For bug reporting instructions, please see:
http://www.gnu.org/software/gdb/bugs/
Reading symbols from /usr/share/doc/nvidia/cuda/samples/bin/x86_64/linux/release/conjugateGradientUM…(no debugging sym
(cuda-gdb) run
Starting program: /usr/share/doc/nvidia/cuda/samples/bin/x86_64/linux/release/conjugateGradientUM
[Thread debugging using libthread_db enabled]
Using host libthread_db library “/lib64/libthread_db.so.1”.
Starting [conjugateGradientManaged]…
GPU Device 0: “GeForce GT 640” with compute capability 3.5

GPU device has 2 Multi-Processors, SM 3.5 compute capabilities

[New Thread 0x7ffff2f38700 (LWP 9479)]

Program received signal CUDA_EXCEPTION_15, Invalid Managed Memory Access.
0x00000000004031c1 in genTridiag(int*, int*, float*, int, int) ()

(cuda-gdb) info stack
#0 0x00000000004031c1 in genTridiag(int*, int*, float*, int, int) ()
#1 0x0000000000403537 in main ()

(cuda-gdb) info reg
rax 0x900000000 38654705664
rbx 0x900420000 38659031040
rcx 0x100000 1048576
rdx 0x901020000 38671613952
rsi 0x900420000 38659031040
rdi 0x900000000 38654705664
rbp 0x7fffffffe4a0 0x7fffffffe4a0
rsp 0x7fffffffe460 0x7fffffffe460
r8 0x2ffffe 3145726
r9 0x0 0
r10 0xffffffff 4294967295
r11 0xaccd98 11324824
r12 0x402b60 4205408
r13 0x7fffffffe930 140737488349488
r14 0x0 0
r15 0x0 0
rip 0x4031c1 0x4031c1 <genTridiag(int*, int*, float*, int, int)+32>
eflags 0x10206 [ PF IF RF ]
cs 0x33 51
ss 0x2b 43
ds 0x0 0
es 0x0 0
fs 0x0 0
gs 0x0 0

Dump of assembler code for function _Z10genTridiagPiS_Pfii:
0x00000000004031a1 <+0>: push %rbp
0x00000000004031a2 <+1>: mov %rsp,%rbp
0x00000000004031a5 <+4>: push %rbx
0x00000000004031a6 <+5>: sub $0x38,%rsp
0x00000000004031aa <+9>: mov %rdi,-0x28(%rbp)
0x00000000004031ae <+13>: mov %rsi,-0x30(%rbp)
0x00000000004031b2 <+17>: mov %rdx,-0x38(%rbp)
0x00000000004031b6 <+21>: mov %ecx,-0x3c(%rbp)
0x00000000004031b9 <+24>: mov %r8d,-0x40(%rbp)
0x00000000004031bd <+28>: mov -0x28(%rbp),%rax
=> 0x00000000004031c1 <+32>: movl $0x0,(%rax)
0x00000000004031c7 <+38>: mov -0x30(%rbp),%rax
0x00000000004031cb <+42>: movl $0x0,(%rax)
0x00000000004031d1 <+48>: mov -0x30(%rbp),%rax
0x00000000004031d5 <+52>: add $0x4,%rax
0x00000000004031d9 <+56>: movl $0x1,(%rax)
0x00000000004031df <+62>: callq 0x402a10 rand@plt
0x00000000004031e4 <+67>: cvtsi2ss %eax,%xmm0

And some cuda-gdb output from UnifiedMemoryStreams
when it hits the Bus error:

cuda-gdb UnifiedMemoryStreams

NVIDIA (R) CUDA Debugger
6.0 release
Portions Copyright (C) 2007-2014 NVIDIA Corporation
GNU gdb (GDB) 7.6
Copyright (C) 2013 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later http://gnu.org/licenses/gpl.html
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law. Type “show copying”
and “show warranty” for details.
This GDB was configured as “x86_64-unknown-linux-gnu”.
For bug reporting instructions, please see:
http://www.gnu.org/software/gdb/bugs/
Reading symbols from /usr/share/doc/nvidia/cuda/samples/bin/x86_64/linux/release/UnifiedMemoryStreams…(no debugging sy
(cuda-gdb) run
Starting program: /usr/share/doc/nvidia/cuda/samples/bin/x86_64/linux/release/UnifiedMemoryStreams
[Thread debugging using libthread_db enabled]
Using host libthread_db library “/lib64/libthread_db.so.1”.
GPU Device 0: “GeForce GT 640” with compute capability 3.5

[New Thread 0x7ffff5376700 (LWP 6210)]

Program received signal CUDA_EXCEPTION_15, Invalid Managed Memory Access.
0x00000000004048f7 in Task::allocate(unsigned int, unsigned int) ()

(cuda-gdb) info stack
#0 0x00000000004048f7 in Task::allocate(unsigned int, unsigned int) ()
#1 0x0000000000404219 in void initialise_tasks(std::vector<Task, std::allocator<Task > >&) ()
#2 0x0000000000403428 in main ()

(cuda-gdb) info reg
rax 0x0 0
rbx 0x900000000 38654705664
rcx 0x63780 407424
rdx 0x24f63780 620115840
rsi 0x2519400000000 652646050430976
rdi 0x31c73933d0 213795812304
rbp 0x7fffffffe4a0 0x7fffffffe4a0
rsp 0x7fffffffe470 0x7fffffffe470
r8 0x5deece66d 25214903917
r9 0x0 0
r10 0xffffffff 4294967295
r11 0x0 0
r12 0x402be0 4205536
r13 0x7fffffffe930 140737488349488
r14 0x0 0
r15 0x0 0
rip 0x4048f7 0x4048f7 <Task::allocate(unsigned int, unsigned int)+299>
eflags 0x10202 [ IF RF ]
cs 0x33 51
ss 0x2b 43
ds 0x0 0
es 0x0 0
fs 0x0 0
gs 0x0 0

(cuda-gdb) disas 0x4048f7
Dump of assembler code for function _ZN4TaskIdE8allocateEjj:
0x00000000004047cc <+0>: push %rbp
0x00000000004047cd <+1>: mov %rsp,%rbp
0x00000000004047d0 <+4>: push %rbx
0x00000000004047d1 <+5>: sub $0x28,%rsp
0x00000000004047d5 <+9>: mov %rdi,-0x28(%rbp)
0x00000000004047d9 <+13>: mov %esi,-0x2c(%rbp)
0x00000000004047dc <+16>: mov %edx,-0x30(%rbp)
0x00000000004047df <+19>: mov -0x28(%rbp),%rax
0x00000000004047e3 <+23>: mov -0x30(%rbp),%edx
0x00000000004047e6 <+26>: mov %edx,0x4(%rax)
0x00000000004047e9 <+29>: mov -0x28(%rbp),%rax
0x00000000004047ed <+33>: mov -0x2c(%rbp),%edx
0x00000000004047f0 <+36>: mov %edx,(%rax)
0x00000000004047f2 <+38>: mov -0x28(%rbp),%rax
0x00000000004047f6 <+42>: mov (%rax),%eax
0x00000000004047f8 <+44>: mov %eax,%edx
0x00000000004047fa <+46>: mov -0x28(%rbp),%rax
0x00000000004047fe <+50>: mov (%rax),%eax
0x0000000000404800 <+52>: mov %eax,%eax
0x0000000000404802 <+54>: imul %rdx,%rax
0x0000000000404806 <+58>: lea 0x0(,%rax,8),%rcx
0x000000000040480e <+66>: mov -0x28(%rbp),%rax
0x0000000000404812 <+70>: add $0x8,%rax
0x0000000000404816 <+74>: mov $0x1,%edx
0x000000000040481b <+79>: mov %rcx,%rsi
0x000000000040481e <+82>: mov %rax,%rdi
0x0000000000404821 <+85>: callq 0x404b9c <_ZN91_GLOBAL__N__67_tmpxft_00001f5a_00000000_15_UnifiedMemoryStreams_co
0x0000000000404826 <+90>: mov $0x4d,%ecx
0x000000000040482b <+95>: mov $0x44e769,%edx
0x0000000000404830 <+100>: mov $0x44ec00,%esi
0x0000000000404835 <+105>: mov %eax,%edi
0x0000000000404837 <+107>: callq 0x403edd <_Z5checkI9cudaErrorEvT_PKcS3_i>
0x000000000040483c <+112>: mov -0x28(%rbp),%rax
0x0000000000404840 <+116>: mov (%rax),%eax
0x0000000000404842 <+118>: mov %eax,%eax
0x0000000000404844 <+120>: lea 0x0(,%rax,8),%rcx
0x000000000040484c <+128>: mov -0x28(%rbp),%rax
0x0000000000404850 <+132>: add $0x10,%rax
0x0000000000404854 <+136>: mov $0x1,%edx
0x0000000000404859 <+141>: mov %rcx,%rsi
0x000000000040485c <+144>: mov %rax,%rdi
0x000000000040485f <+147>: callq 0x404b9c <_ZN91_GLOBAL__N__67_tmpxft_00001f5a_00000000_15_UnifiedMemoryStreams_co
0x0000000000404864 <+152>: mov $0x4e,%ecx
0x0000000000404869 <+157>: mov $0x44e769,%edx
0x000000000040486e <+162>: mov $0x44ec30,%esi
0x0000000000404873 <+167>: mov %eax,%edi
0x0000000000404875 <+169>: callq 0x403edd <_Z5checkI9cudaErrorEvT_PKcS3_i>
0x000000000040487a <+174>: mov -0x28(%rbp),%rax
0x000000000040487e <+178>: mov (%rax),%eax
0x0000000000404880 <+180>: mov %eax,%eax
0x0000000000404882 <+182>: lea 0x0(,%rax,8),%rcx
0x000000000040488a <+190>: mov -0x28(%rbp),%rax
0x000000000040488e <+194>: add $0x18,%rax
0x0000000000404892 <+198>: mov $0x1,%edx
0x0000000000404897 <+203>: mov %rcx,%rsi
0x000000000040489a <+206>: mov %rax,%rdi
0x000000000040489d <+209>: callq 0x404b9c <_ZN91_GLOBAL__N__67_tmpxft_00001f5a_00000000_15_UnifiedMemoryStreams_co
0x00000000004048a2 <+214>: mov $0x4f,%ecx
0x00000000004048a7 <+219>: mov $0x44e769,%edx
0x00000000004048ac <+224>: mov $0x44ec60,%esi
0x00000000004048b1 <+229>: mov %eax,%edi
0x00000000004048b3 <+231>: callq 0x403edd <_Z5checkI9cudaErrorEvT_PKcS3_i>
0x00000000004048b8 <+236>: callq 0x43ad10
0x00000000004048bd <+241>: mov $0x50,%ecx
0x00000000004048c2 <+246>: mov $0x44e769,%edx
0x00000000004048c7 <+251>: mov $0x44e8c7,%esi
0x00000000004048cc <+256>: mov %eax,%edi
0x00000000004048ce <+258>: callq 0x403edd <_Z5checkI9cudaErrorEvT_PKcS3_i>
0x00000000004048d3 <+263>: movl $0x0,-0x18(%rbp)
0x00000000004048da <+270>: jmp 0x4048ff <_ZN4TaskIdE8allocateEjj+307>
0x00000000004048dc <+272>: mov -0x28(%rbp),%rax
0x00000000004048e0 <+276>: mov 0x8(%rax),%rax
0x00000000004048e4 <+280>: mov -0x18(%rbp),%edx
0x00000000004048e7 <+283>: movslq %edx,%rdx
0x00000000004048ea <+286>: shl $0x3,%rdx
0x00000000004048ee <+290>: lea (%rax,%rdx,1),%rbx
0x00000000004048f2 <+294>: callq 0x402bd0 drand48@plt
=> 0x00000000004048f7 <+299>: movsd %xmm0,(%rbx)
0x00000000004048fb <+303>: addl $0x1,-0x18(%rbp)
0x00000000004048ff <+307>: mov -0x18(%rbp),%edx
0x0000000000404902 <+310>: mov -0x28(%rbp),%rax
0x0000000000404906 <+314>: mov (%rax),%ecx
0x0000000000404908 <+316>: mov -0x28(%rbp),%rax