cufft 3.1 crash in cufftSetCompatibilityMode()

Hi,

When I call cufftSetCompatibilityMode(plan, CUFFT_COMPATIBILITY_FFTW_ALL) on a successfully created plan, the following happens:

*** glibc detected *** /home/beekhof/work/src/cvmlcpp/trunk/testing/a.out: free(): invalid next size (normal): 0x00000000009ba260 ***
======= Backtrace: =========
/lib/libc.so.6(+0x775b6)[0x7ffff54095b6]
/lib/libc.so.6(cfree+0x73)[0x7ffff540fe53]
/usr/local/cuda/lib64/libcufft.so.3(+0x250ec)[0x7ffff65460ec]
/usr/local/cuda/lib64/libcufft.so.3(+0x24a02)[0x7ffff6545a02]
/usr/local/cuda/lib64/libcufft.so.3(cufftDestroy+0x3d)[0x7ffff65318bd]
/usr/local/cuda/lib64/libcufft.so.3(cufftSetCompatibilityMode+0xb3)[0x7ffff6531e63]
/home/beekhof/work/src/cvmlcpp/trunk/testing/a.out[0x40e1bc]
/home/beekhof/work/src/cvmlcpp/trunk/testing/a.out[0x40c6c4]
/home/beekhof/work/src/cvmlcpp/trunk/testing/a.out[0x40a73d]
/home/beekhof/work/src/cvmlcpp/trunk/testing/a.out[0x408efc]
/home/beekhof/work/src/cvmlcpp/trunk/testing/a.out[0x40741a]
/home/beekhof/work/src/cvmlcpp/trunk/testing/a.out[0x4055d8]
/home/beekhof/work/src/cvmlcpp/trunk/testing/a.out[0x404291]
/home/beekhof/work/src/cvmlcpp/trunk/testing/a.out[0x403340]
/home/beekhof/work/src/cvmlcpp/trunk/testing/a.out[0x401b77]
/lib/libc.so.6(__libc_start_main+0xfd)[0x7ffff53b0c4d]
/home/beekhof/work/src/cvmlcpp/trunk/testing/a.out[0x401999]
======= Memory map: ========
00400000-00423000 r-xp 00000000 00:19 1210402 /home/beekhof/work/src/cvmlcpp/trunk/testing/a.out
00622000-00623000 r--p 00022000 00:19 1210402 /home/beekhof/work/src/cvmlcpp/trunk/testing/a.out
00623000-00624000 rw-p 00023000 00:19 1210402 /home/beekhof/work/src/cvmlcpp/trunk/testing/a.out
00624000-009c8000 rw-p 00000000 00:00 0 [heap]
7fffec000000-7fffec021000 rw-p 00000000 00:00 0
7fffec021000-7ffff0000000 ---p 00000000 00:00 0
7ffff2f56000-7ffff2f57000 ---p 00000000 00:00 0
7ffff2f57000-7ffff3757000 rwxp 00000000 00:00 0
7ffff3757000-7ffff3780000 rw-p 00000000 00:00 0
7ffff3780000-7ffff3880000 rw-s 64c77000 00:05 5121 /dev/nvidia0
7ffff3880000-7ffff3980000 rw-s 125f5000 00:05 5121 /dev/nvidia0
7ffff3980000-7ffff3a80000 rw-s 5f4f2000 00:05 5121 /dev/nvidia0
7ffff3a80000-7ffff3b80000 rw-s 7938c000 00:05 5121 /dev/nvidia0
7ffff3b80000-7ffff3b81000 rw-s 79389000 00:05 5121 /dev/nvidia0
7ffff3b81000-7ffff3b82000 rw-s dec08000 00:05 5121 /dev/nvidia0
7ffff3b82000-7ffff3b83000 rw-s 79388000 00:05 5121 /dev/nvidia0
7ffff3b83000-7ffff3f85000 rw-s 64c88000 00:05 5121 /dev/nvidia0
7ffff3f85000-7ffff4387000 rw-s 5c8f8000 00:05 5121 /dev/nvidia0
7ffff4387000-7ffff439d000 r-xp 00000000 08:01 16548105 /lib/libz.so.1.2.3.3
7ffff439d000-7ffff459c000 ---p 00016000 08:01 16548105 /lib/libz.so.1.2.3.3
7ffff459c000-7ffff459d000 r--p 00015000 08:01 16548105 /lib/libz.so.1.2.3.3
7ffff459d000-7ffff459e000 rw-p 00016000 08:01 16548105 /lib/libz.so.1.2.3.3
7ffff459e000-7ffff4caf000 r-xp 00000000 08:01 2295838 /usr/lib/libcuda.so.256.40
7ffff4caf000-7ffff4eae000 ---p 00711000 08:01 2295838 /usr/lib/libcuda.so.256.40
7ffff4eae000-7ffff4f62000 rw-p 00710000 08:01 2295838 /usr/lib/libcuda.so.256.40
7ffff4f62000-7ffff4f86000 rw-p 00000000 00:00 0
7ffff4f86000-7ffff4f8d000 r-xp 00000000 08:01 6948329 /lib/librt-2.11.1.so
7ffff4f8d000-7ffff518c000 ---p 00007000 08:01 6948329 /lib/librt-2.11.1.so
7ffff518c000-7ffff518d000 r--p 00006000 08:01 6948329 /lib/librt-2.11.1.so
7ffff518d000-7ffff518e000 rw-p 00007000 08:01 6948329 /lib/librt-2.11.1.so
7ffff518e000-7ffff5190000 r-xp 00000000 08:01 6948316 /lib/libdl-2.11.1.so
7ffff5190000-7ffff5390000 ---p 00002000 08:01 6948316 /lib/libdl-2.11.1.so
7ffff5390000-7ffff5391000 r--p 00002000 08:01 6948316 /lib/libdl-2.11.1.so
7ffff5391000-7ffff5392000 rw-p 00003000 08:01 6948316 /lib/libdl-2.11.1.so
7ffff5392000-7ffff550c000 r-xp 00000000 08:01 6948313 /lib/libc-2.11.1.so
7ffff550c000-7ffff570b000 ---p 0017a000 08:01 6948313 /lib/libc-2.11.1.so
7ffff570b000-7ffff570f000 r--p 00179000 08:01 6948313 /lib/libc-2.11.1.so
7ffff570f000-7ffff5710000 rw-p 0017d000 08:01 6948313 /lib/libc-2.11.1.so
7ffff5710000-7ffff5715000 rw-p 00000000 00:00 0
7ffff5715000-7ffff572d000 r-xp 00000000 08:01 6948327 /lib/libpthread-2.11.1.so
7ffff572d000-7ffff592c000 ---p 00018000 08:01 6948327 /lib/libpthread-2.11.1.so
7ffff592c000-7ffff592d000 r--p 00017000 08:01 6948327 /lib/libpthread-2.11.1.so
7ffff592d000-7ffff592e000 rw-p 00018000 08:01 6948327 /lib/libpthread-2.11.1.so
7ffff592e000-7ffff5932000 rw-p 00000000 00:00 0
7ffff5932000-7ffff5948000 r-xp 00000000 08:01 6946970 /lib/libgcc_s.so.1
7ffff5948000-7ffff5b47000 ---p 00016000 08:01 6946970 /lib/libgcc_s.so.1
7ffff5b47000-7ffff5b48000 r--p 00015000 08:01 6946970 /lib/libgcc_s.so.1
7ffff5b48000-7ffff5b49000 rw-p 00016000 08:01 6946970 /lib/libgcc_s.so.1
7ffff5b49000-7ffff5b56000 r-xp 00000000 08:01 2294018 /usr/lib/libgomp.so.1.0.0
7ffff5b56000-7ffff5d55000 ---p 0000d000 08:01 2294018 /usr/lib/libgomp.so.1.0.0
7ffff5d55000-7ffff5d56000 r--p 0000c000 08:01 2294018 /usr/lib/libgomp.so.1.0.0
7ffff5d56000-7ffff5d57000 rw-p 0000d000 08:01 2294018 /usr/lib/libgomp.so.1.0.0
7ffff5d57000-7ffff5dd9000 r-xp 00000000 08:01 6948317 /lib/libm-2.11.1.so
7ffff5dd9000-7ffff5fd8000 ---p 00082000 08:01 6948317 /lib/libm-2.11.1.so
7ffff5fd8000-7ffff5fd9000 r--p 00081000 08:01 6948317 /lib/libm-2.11.1.so
7ffff5fd9000-7ffff5fda000 rw-p 00082000 08:01 6948317 /lib/libm-2.11.1.so
7ffff5fda000-7ffff60d0000 r-xp 00000000 08:01 2293866 /usr/lib/libstdc++.so.6.0.13
Program received signal SIGABRT, Aborted.
0x00007ffff53c5a75 in *__GI_raise (sig=) at ../nptl/sysdeps/unix/sysv/linux/raise.c:64
64 ../nptl/sysdeps/unix/sysv/linux/raise.c: No such file or directory.

According to the debugger, that happened in:

#0 0x00007ffff53c5a75 in *__GI_raise (sig=) at ../nptl/sysdeps/unix/sysv/linux/raise.c:64
#1 0x00007ffff53c95c0 in *__GI_abort () at abort.c:92
#2 0x00007ffff53ff4fb in __libc_message (do_abort=, fmt=) at ../sysdeps/unix/sysv/linux/libc_fatal.c:189
#3 0x00007ffff54095b6 in malloc_printerr (action=3, str=0x7ffff54dcc28 "free(): invalid next size (normal)", ptr=) at malloc.c:6264
#4 0x00007ffff540fe53 in *__GI___libc_free (mem=) at malloc.c:3738
#5 0x00007ffff65460ec in ?? () from /usr/local/cuda/lib64/libcufft.so.3
#6 0x00007ffff6545a02 in ?? () from /usr/local/cuda/lib64/libcufft.so.3
#7 0x00007ffff65318bd in cufftDestroy () from /usr/local/cuda/lib64/libcufft.so.3
#8 0x00007ffff6531e63 in cufftSetCompatibilityMode () from /usr/local/cuda/lib64/libcufft.so.3

Does anybody have an idea what’s going on? I think that function is not supposed to crash under any circumstances…

More interesting bits from dmesg:

[ 40.782511] ioremap: invalid physical address 38544550580000
[ 40.782514] ------------[ cut here ]------------
[ 40.782522] WARNING: at /build/buildd/linux-2.6.32/arch/x86/mm/ioremap.c:120 __ioremap_caller+0x360/0x3d0()
[ 40.782524] Hardware name: OptiPlex 745
[ 40.782525] Modules linked in: snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device snd dell_wmi dcdbas nvidia(P) psmouse lp serio_raw soundcore intel_agp snd_page_alloc parport usbhid hid tg3
[ 40.782545] Pid: 999, comm: Xorg Tainted: P 2.6.32-24-generic #38-Ubuntu
[ 40.782547] Call Trace:
[ 40.782552] [] warn_slowpath_common+0x7b/0xc0
[ 40.782554] [] warn_slowpath_null+0x14/0x20
[ 40.782557] [] __ioremap_caller+0x360/0x3d0
[ 40.782755] [] ? os_map_kernel_space+0x8d/0xe0 [nvidia]
[ 40.782881] [] ? _nv020103rm+0xa2/0x164 [nvidia]
[ 40.782884] [] ioremap_nocache+0x17/0x20
[ 40.783037] [] os_map_kernel_space+0x8d/0xe0 [nvidia]
[ 40.783196] [] _nv021093rm+0x55/0x71 [nvidia]
[ 40.783320] [] ? _nv020133rm+0xad/0xbf [nvidia]
[ 40.783445] [] ? _nv020150rm+0x190/0x2f8 [nvidia]
[ 40.783570] [] ? _nv020200rm+0xaa/0x19a [nvidia]
[ 40.783694] [] ? _nv020149rm+0x50/0x5d [nvidia]
[ 40.783819] [] ? _nv020141rm+0x6e/0x78 [nvidia]
[ 40.783974] [] ? _nv017238rm+0x69/0x121 [nvidia]
[ 40.784129] [] ? _nv017257rm+0xde/0xf7 [nvidia]
[ 40.784258] [] ? _nv003941rm+0x68/0x19e [nvidia]
[ 40.784455] [] ? _nv014039rm+0x177/0x473 [nvidia]
[ 40.784652] [] ? _nv014320rm+0xc9/0x13a [nvidia]
[ 40.784773] [] ? _nv014506rm+0xd/0x12 [nvidia]
[ 40.784929] [] ? _nv002101rm+0x160/0x26f [nvidia]
[ 40.785084] [] ? _nv002095rm+0x493/0x65b [nvidia]
[ 40.785240] [] ? rm_init_adapter+0x83/0xf1 [nvidia]
[ 40.785393] [] ? nv_kern_open+0x59e/0x740 [nvidia]
[ 40.785398] [] ? chrdev_open+0x13a/0x240
[ 40.785401] [] ? chrdev_open+0x0/0x240
[ 40.785403] [] ? __dentry_open+0x113/0x370
[ 40.785407] [] ? security_inode_permission+0x1f/0x30
[ 40.785410] [] ? inode_permission+0xaf/0xd0
[ 40.785413] [] ? nameidata_to_filp+0x57/0x70
[ 40.785415] [] ? do_filp_open+0x2da/0xba0
[ 40.785418] [] ? notify_change+0x237/0x350
[ 40.785421] [] ? alloc_fd+0x10a/0x150
[ 40.785424] [] ? do_sys_open+0x69/0x170
[ 40.785426] [] ? sys_open+0x20/0x30
[ 40.785430] [] ? system_call_fastpath+0x16/0x1b
[ 40.785432] ---[ end trace 5f9e1c32f9ed1415 ]---
[ 41.232716] ioremap: invalid physical address 38544550580000
[ 41.232732] ioremap: invalid physical address 38544550580000
[ 41.232743] ioremap: invalid physical address 38544550580000
[ 41.232754] ioremap: invalid physical address 38544550580000
[ 41.232764] ioremap: invalid physical address 38544550580000

Here is the binary with debugging symbols and all. It’s zipped twice because otherwise I couldn’t upload it.
crash.bz2.gz (176 KB)

… and the answer is: memory corruption caused by a careless cudaMemcpy earlier in the program!

All fine now.

Nonetheless, if that kernel problem is related, we’ve just found a way to cause kernel problems from userland…