When running the code below, a kernel call trace (soft lockup) can be observed in dmesg. Could you please suggest a solution or workaround? Thanks in advance.
System Configuration:
CPU: AMD Ryzen Threadripper 2990WX 32-Core Processor
MOTHERBOARD: ASUS ROG STRIX X399-E GAMING
OS: Ubuntu 18.04 LTS
RAM: 128 GB (16 GB * 8)
GPUs: GeForce RTX 2080, GeForce RTX 2080
NVIDIA Driver Version: 410.104
CUDA Version: 10.0
Issue faced:
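- For any buffer larger than ~2.5 GB, cudaFreeHost triggers a kernel soft-lockup trace and takes a very long time (in the logs below, every attempt whose allocation succeeded, i.e. attempts 1, 2 and 4, shows this trace and spends roughly 33 seconds in cudaFreeHost)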
- For any buffer larger than ~2.5 GB, cudaHostAlloc sometimes fails with an error (in the logs below, attempts 3 and 5 fail this way)
Note: There is sufficient free RAM available when performing these tests.
Code:
#include <cstdlib>
#include <iostream>
#include <string>
#include <unistd.h>
// Evaluates a CUDA runtime call exactly once and reports the outcome on
// stdout: "Success" when the call returns cudaSuccess, otherwise the error
// string, the numeric error code and the file/line of the call site.
// Execution continues after a failure (no abort), so a caller can keep going
// and observe later calls as well.
//
// The body is wrapped in do { ... } while (0) so the macro expands to a single
// statement and stays valid inside an unbraced if/else. The temporary carries
// an unusual name so that passing a variable called `error` as the argument
// does not turn into a self-initialization.
#define checkCudaErrors(func) do { \
    const cudaError_t cuda_check_status_ = (func); \
    if (cuda_check_status_ != cudaSuccess) { \
        std::cout << "Cuda failure\nError: " << cudaGetErrorString(cuda_check_status_) \
                  << " val: " << cuda_check_status_ << " at: " << __FILE__ \
                  << ":" << __LINE__ << std::endl; \
    } \
    else { \
        std::cout << "Success" << std::endl; \
    } \
} while (0)
// Writes a marker line of the form
//   ================ <stage> - Attempt <attempt> ================
// to /dev/kmsg through the shell, so kernel messages in dmesg can be matched
// to the CUDA call that was running. The marker is best-effort: a failing
// shell command is reported on stderr but does not stop the run.
static void logKernelMarker(const std::string &stage, int attempt)
{
    const std::string command = "echo ================ " + stage +
                                " - Attempt " + std::to_string(attempt) +
                                " ================ >> /dev/kmsg";
    if (std::system(command.c_str()) != 0) {
        std::cerr << "Warning: failed to write marker to /dev/kmsg: "
                  << stage << " - Attempt " << attempt << std::endl;
    }
}

// Reproducer: allocates and releases a 5,000,000,000-byte pinned host buffer
// five times in a row, bracketing every CUDA call with a /dev/kmsg marker.
// Always returns 0; failures are reported on the console only.
int main(void)
{
    const size_t data_size = 5000000000;  // requested size in bytes
    const int num_attempts = 5;

    for (int attempt = 1; attempt <= num_attempts; ++attempt) {
        // Fresh null pointer per attempt so a failed allocation can never
        // leave a stale address from an earlier, already-freed buffer.
        float2 *buf {nullptr};

        logKernelMarker("Before cudaHostAlloc", attempt);
        const cudaError_t alloc_status =
            cudaHostAlloc((void **)&buf, data_size, cudaHostAllocDefault);
        checkCudaErrors(alloc_status);

        logKernelMarker("Before cudaFreeHost", attempt);
        // Only release what was actually allocated; report (instead of
        // silently dropping) any error returned by cudaFreeHost.
        if (alloc_status == cudaSuccess) {
            const cudaError_t free_status = cudaFreeHost(buf);
            if (free_status != cudaSuccess) {
                std::cerr << "cudaFreeHost failed: "
                          << cudaGetErrorString(free_status)
                          << " val: " << free_status << std::endl;
            }
        }
        logKernelMarker("After cudaFreeHost", attempt);
    }
    return 0;
}
Console Log:
Success
Success
Cuda failure
Error: OS call failed or operation not supported on this OS val: 63 at: test.cu:37
Success
Cuda failure
Error: OS call failed or operation not supported on this OS val: 63 at: test.cu:49
dmesg:
[99877.259244] ================ Before cudaHostAlloc - Attempt 1 ================
[99878.589486] ================ Before cudaFreeHost - Attempt 1 ================
[99905.159010] watchdog: BUG: soft lockup - CPU#20 stuck for 22s! [a.out:120203]
[99905.159013] Modules linked in: rfcomm ccm nf_conntrack_ipv4 nf_defrag_ipv4 xt_tcpudp xt_conntrack nf_conntrack libcrc32c ip6table_filter ip6_tables cmac bnep iptable_filter snd_hda_codec_hdmi binfmt_misc nvidia_uvm(POE) nvidia_drm(POE) nvidia_modeset(POE) snd_hda_codec_realtek snd_hda_codec_generic nls_iso8859_1 snd_hda_intel snd_hda_codec edac_mce_amd snd_hda_core snd_hwdep kvm snd_pcm irqbypass nvidia(POE) joydev input_leds arc4 crct10dif_pclmul crc32_pclmul btusb snd_seq_midi btrtl ghash_clmulni_intel snd_seq_midi_event btbcm snd_rawmidi btintel r8822be(C) pcbc bluetooth drm_kms_helper ecdh_generic snd_seq drm mac80211 aesni_intel snd_seq_device ipmi_devintf snd_timer aes_x86_64 ipmi_msghandler crypto_simd fb_sys_fops cfg80211 syscopyarea snd glue_helper sysfillrect cryptd sysimgblt soundcore
[99905.159045] ccp eeepc_wmi asus_wmi sparse_keymap k10temp shpchp video wmi_bmof mxm_wmi wmi mac_hid sch_fq_codel parport_pc ppdev lp parport ip_tables x_tables autofs4 igb i2c_algo_bit dca ptp pps_core ahci nvme libahci i2c_piix4 nvme_core gpio_amdpt gpio_generic hid_generic usbhid hid
[99905.159061] CPU: 20 PID: 120203 Comm: a.out Tainted: P C OEL 4.15.0-47-generic #50-Ubuntu
[99905.159061] Hardware name: System manufacturer System Product Name/ROG STRIX X399-E GAMING, BIOS 0808 10/12/2018
[99905.159067] RIP: 0010:iommu_unmap_page+0x11/0x100
[99905.159068] RSP: 0018:ffffb16452f87a08 EFLAGS: 00000206 ORIG_RAX: ffffffffffffff11
[99905.159069] RAX: 0000008000000000 RBX: 00008d0f4bbbf000 RCX: 0000000000000027
[99905.159070] RDX: 0000000000001000 RSI: 00008d0f4bbbe000 RDI: ffff9a75f2782000
[99905.159071] RBP: ffffb16452f87a28 R08: 0000000000000000 R09: 0000000000000000
[99905.159071] R10: ffffb16452f879f0 R11: 000000000000001b R12: ffff9a75f2782000
[99905.159072] R13: 00008fe9fffff000 R14: 00000000ffffffff R15: 00007fea00000000
[99905.159073] FS: 00007fea541a1b80(0000) GS:ffff9a75fd900000(0000) knlGS:0000000000000000
[99905.159073] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[99905.159074] CR2: 000055c8fd398360 CR3: 0000001670ef2000 CR4: 00000000003406e0
[99905.159074] Call Trace:
[99905.159078] __unmap_single.isra.24+0x62/0xf0
[99905.159079] unmap_sg+0x5f/0x70
[99905.159187] nv_unmap_dma_map_scatterlist+0x5f/0xa0 [nvidia]
[99905.159254] nv_dma_unmap_pages+0x62/0xb0 [nvidia]
[99905.159319] nv_dma_unmap_alloc+0x16/0x30 [nvidia]
[99905.159422] _nv030926rm+0x11e/0x210 [nvidia]
[99905.159526] ? _nv007745rm+0x5c/0x80 [nvidia]
[99905.159684] ? _nv007876rm+0x2d/0x50 [nvidia]
[99905.159829] ? _nv026970rm+0xae/0x200 [nvidia]
[99905.159940] ? _nv027057rm+0x535/0x610 [nvidia]
[99905.160053] ? _nv003653rm+0xd/0x20 [nvidia]
[99905.160167] ? _nv004259rm+0x15/0x80 [nvidia]
[99905.160280] ? _nv012034rm+0x194/0x290 [nvidia]
[99905.160388] ? _nv035122rm+0xf8/0x1a0 [nvidia]
[99905.160501] ? _nv035121rm+0x2c0/0x330 [nvidia]
[99905.160612] ? _nv001083rm+0x63/0xa0 [nvidia]
[99905.160719] ? _nv007937rm+0x52/0xd0 [nvidia]
[99905.160815] ? _nv001129rm+0x5ce/0x850 [nvidia]
[99905.160911] ? rm_ioctl+0x73/0x100 [nvidia]
[99905.160914] ? __kmalloc+0xa0/0x220
[99905.160916] ? __check_object_size+0xaf/0x1b0
[99905.160978] ? nvidia_ioctl+0x538/0x7c0 [nvidia]
[99905.161039] ? nvidia_frontend_unlocked_ioctl+0x42/0x50 [nvidia]
[99905.161042] ? do_vfs_ioctl+0xa8/0x630
[99905.161044] ? handle_mm_fault+0xb1/0x1f0
[99905.161047] ? __do_page_fault+0x270/0x4d0
[99905.161048] ? SyS_ioctl+0x79/0x90
[99905.161050] ? do_syscall_64+0x73/0x130
[99905.161053] ? entry_SYSCALL_64_after_hwframe+0x3d/0xa2
[99905.161054] Code: da 48 21 c3 48 21 f2 48 21 ca 48 09 d3 eb a2 31 db eb 9e e8 e2 9d a5 ff 66 90 0f 1f 44 00 00 55 48 89 e5 41 57 41 56 41 55 41 54 <53> 48 83 ec 10 65 48 8b 04 25 28 00 00 00 48 89 45 d0 31 c0 48
[99911.432462] ================ After cudaFreeHost - Attempt 1 ================
[99911.437183] ================ Before cudaHostAlloc - Attempt 2 ================
[99912.582346] ================ Before cudaFreeHost - Attempt 2 ================
[99937.131258] watchdog: BUG: soft lockup - CPU#8 stuck for 22s! [a.out:120203]
[99937.131262] Modules linked in: rfcomm ccm nf_conntrack_ipv4 nf_defrag_ipv4 xt_tcpudp xt_conntrack nf_conntrack libcrc32c ip6table_filter ip6_tables cmac bnep iptable_filter snd_hda_codec_hdmi binfmt_misc nvidia_uvm(POE) nvidia_drm(POE) nvidia_modeset(POE) snd_hda_codec_realtek snd_hda_codec_generic nls_iso8859_1 snd_hda_intel snd_hda_codec edac_mce_amd snd_hda_core snd_hwdep kvm snd_pcm irqbypass nvidia(POE) joydev input_leds arc4 crct10dif_pclmul crc32_pclmul btusb snd_seq_midi btrtl ghash_clmulni_intel snd_seq_midi_event btbcm snd_rawmidi btintel r8822be(C) pcbc bluetooth drm_kms_helper ecdh_generic snd_seq drm mac80211 aesni_intel snd_seq_device ipmi_devintf snd_timer aes_x86_64 ipmi_msghandler crypto_simd fb_sys_fops cfg80211 syscopyarea snd glue_helper sysfillrect cryptd sysimgblt soundcore
[99937.131289] ccp eeepc_wmi asus_wmi sparse_keymap k10temp shpchp video wmi_bmof mxm_wmi wmi mac_hid sch_fq_codel parport_pc ppdev lp parport ip_tables x_tables autofs4 igb i2c_algo_bit dca ptp pps_core ahci nvme libahci i2c_piix4 nvme_core gpio_amdpt gpio_generic hid_generic usbhid hid
[99937.131304] CPU: 8 PID: 120203 Comm: a.out Tainted: P C OEL 4.15.0-47-generic #50-Ubuntu
[99937.131305] Hardware name: System manufacturer System Product Name/ROG STRIX X399-E GAMING, BIOS 0808 10/12/2018
[99937.131310] RIP: 0010:iommu_unmap_page+0xa8/0x100
[99937.131311] RSP: 0018:ffffb16452f879f0 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff11
[99937.131312] RAX: 0000000000000000 RBX: 0000000000001000 RCX: 0000000000000027
[99937.131313] RDX: 00008c08e2eca000 RSI: 0000000000000000 RDI: 0000000000000004
[99937.131314] RBP: ffffb16452f87a28 R08: 0000000000000000 R09: 0000000000000000
[99937.131314] R10: ffffb16452f879f0 R11: 000000000000001b R12: ffff9a75f2782098
[99937.131315] R13: ffff9a75f2782000 R14: 0000000000000000 R15: 00008c08e2eca000
[99937.131316] FS: 00007fea541a1b80(0000) GS:ffff9a75fdc00000(0000) knlGS:0000000000000000
[99937.131316] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[99937.131317] CR2: 00000843315c4000 CR3: 0000001670ef2000 CR4: 00000000003406e0
[99937.131317] Call Trace:
[99937.131321] __unmap_single.isra.24+0x62/0xf0
[99937.131323] unmap_sg+0x5f/0x70
[99937.131401] nv_unmap_dma_map_scatterlist+0x5f/0xa0 [nvidia]
[99937.131467] nv_dma_unmap_pages+0x62/0xb0 [nvidia]
[99937.131531] nv_dma_unmap_alloc+0x16/0x30 [nvidia]
[99937.131631] _nv030926rm+0x11e/0x210 [nvidia]
[99937.131735] ? _nv007745rm+0x5c/0x80 [nvidia]
[99937.131889] ? _nv007876rm+0x2d/0x50 [nvidia]
[99937.132032] ? _nv026970rm+0xae/0x200 [nvidia]
[99937.132140] ? _nv027057rm+0x535/0x610 [nvidia]
[99937.132251] ? _nv003653rm+0xd/0x20 [nvidia]
[99937.132363] ? _nv004259rm+0x15/0x80 [nvidia]
[99937.132478] ? _nv012034rm+0x194/0x290 [nvidia]
[99937.132581] ? _nv035122rm+0xf8/0x1a0 [nvidia]
[99937.132689] ? _nv035121rm+0x2c0/0x330 [nvidia]
[99937.132790] ? _nv001083rm+0x63/0xa0 [nvidia]
[99937.132890] ? _nv007937rm+0x52/0xd0 [nvidia]
[99937.132986] ? _nv001129rm+0x5ce/0x850 [nvidia]
[99937.133082] ? rm_ioctl+0x73/0x100 [nvidia]
[99937.133085] ? __kmalloc+0xa0/0x220
[99937.133087] ? __check_object_size+0xaf/0x1b0
[99937.133148] ? nvidia_ioctl+0x538/0x7c0 [nvidia]
[99937.133209] ? nvidia_frontend_unlocked_ioctl+0x42/0x50 [nvidia]
[99937.133211] ? do_vfs_ioctl+0xa8/0x630
[99937.133213] ? handle_mm_fault+0xb1/0x1f0
[99937.133216] ? __do_page_fault+0x270/0x4d0
[99937.133217] ? SyS_ioctl+0x79/0x90
[99937.133219] ? do_syscall_64+0x73/0x130
[99937.133222] ? entry_SYSCALL_64_after_hwframe+0x3d/0xa2
[99937.133222] Code: 83 e9 0c 48 f7 e1 48 c1 ea 03 48 8d 04 d2 ba 01 00 00 00 48 29 c1 31 c0 48 d3 e2 48 c7 04 c6 00 00 00 00 48 83 c0 01 39 c2 7f f0 <48> 8b 45 c8 48 89 c2 49 01 c6 48 f7 da 49 21 d7 49 01 c7 4c 39
[99945.392599] ================ After cudaFreeHost - Attempt 2 ================
[99945.397365] ================ Before cudaHostAlloc - Attempt 3 ================
[99946.527821] 0000:0a:00.0: IOMMU mapping error in map_sg (io-pages: 1048575)
[99946.527829] NVRM: failed to create a DMA mapping!
[99947.111112] ================ Before cudaFreeHost - Attempt 3 ================
[99947.116263] ================ After cudaFreeHost - Attempt 3 ================
[99947.121425] ================ Before cudaHostAlloc - Attempt 4 ================
[99948.307501] ================ Before cudaFreeHost - Attempt 4 ================
[99973.115535] watchdog: BUG: soft lockup - CPU#1 stuck for 23s! [a.out:120203]
[99973.115538] Modules linked in: rfcomm ccm nf_conntrack_ipv4 nf_defrag_ipv4 xt_tcpudp xt_conntrack nf_conntrack libcrc32c ip6table_filter ip6_tables cmac bnep iptable_filter snd_hda_codec_hdmi binfmt_misc nvidia_uvm(POE) nvidia_drm(POE) nvidia_modeset(POE) snd_hda_codec_realtek snd_hda_codec_generic nls_iso8859_1 snd_hda_intel snd_hda_codec edac_mce_amd snd_hda_core snd_hwdep kvm snd_pcm irqbypass nvidia(POE) joydev input_leds arc4 crct10dif_pclmul crc32_pclmul btusb snd_seq_midi btrtl ghash_clmulni_intel snd_seq_midi_event btbcm snd_rawmidi btintel r8822be(C) pcbc bluetooth drm_kms_helper ecdh_generic snd_seq drm mac80211 aesni_intel snd_seq_device ipmi_devintf snd_timer aes_x86_64 ipmi_msghandler crypto_simd fb_sys_fops cfg80211 syscopyarea snd glue_helper sysfillrect cryptd sysimgblt soundcore
[99973.115565] ccp eeepc_wmi asus_wmi sparse_keymap k10temp shpchp video wmi_bmof mxm_wmi wmi mac_hid sch_fq_codel parport_pc ppdev lp parport ip_tables x_tables autofs4 igb i2c_algo_bit dca ptp pps_core ahci nvme libahci i2c_piix4 nvme_core gpio_amdpt gpio_generic hid_generic usbhid hid
[99973.115579] CPU: 1 PID: 120203 Comm: a.out Tainted: P C OEL 4.15.0-47-generic #50-Ubuntu
[99973.115580] Hardware name: System manufacturer System Product Name/ROG STRIX X399-E GAMING, BIOS 0808 10/12/2018
[99973.115584] RIP: 0010:iommu_unmap_page+0xef/0x100
[99973.115584] RSP: 0018:ffffb16452f879f0 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff11
[99973.115586] RAX: 0000007fffffffff RBX: 0000000000001000 RCX: 0000000000000027
[99973.115586] RDX: ffffff8000000000 RSI: 0000000000000000 RDI: 0000000000000004
[99973.115587] RBP: ffffb16452f87a28 R08: 0000000000000000 R09: 0000000000000000
[99973.115587] R10: ffffb16452f879f0 R11: 000000000000001b R12: ffff9a75f2782098
[99973.115588] R13: ffff9a75f2782000 R14: 0000008000000000 R15: 00008c0000000000
[99973.115588] FS: 00007fea541a1b80(0000) GS:ffff9a65ff240000(0000) knlGS:0000000000000000
[99973.115589] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[99973.115590] CR2: 0000301514d70030 CR3: 0000001670ef2000 CR4: 00000000003406e0
[99973.115590] Call Trace:
[99973.115593] __unmap_single.isra.24+0x62/0xf0
[99973.115595] unmap_sg+0x5f/0x70
[99973.115702] nv_unmap_dma_map_scatterlist+0x5f/0xa0 [nvidia]
[99973.115767] nv_dma_unmap_pages+0x62/0xb0 [nvidia]
[99973.115831] nv_dma_unmap_alloc+0x16/0x30 [nvidia]
[99973.115933] _nv030926rm+0x11e/0x210 [nvidia]
[99973.116036] ? _nv007745rm+0x5c/0x80 [nvidia]
[99973.116191] ? _nv007876rm+0x2d/0x50 [nvidia]
[99973.116335] ? _nv026970rm+0xae/0x200 [nvidia]
[99973.116444] ? _nv027057rm+0x535/0x610 [nvidia]
[99973.116556] ? _nv003653rm+0xd/0x20 [nvidia]
[99973.116669] ? _nv004259rm+0x15/0x80 [nvidia]
[99973.116781] ? _nv012034rm+0x194/0x290 [nvidia]
[99973.116887] ? _nv035122rm+0xf8/0x1a0 [nvidia]
[99973.116999] ? _nv035121rm+0x2c0/0x330 [nvidia]
[99973.117105] ? _nv001083rm+0x63/0xa0 [nvidia]
[99973.117211] ? _nv007937rm+0x52/0xd0 [nvidia]
[99973.117316] ? _nv001129rm+0x5ce/0x850 [nvidia]
[99973.117413] ? rm_ioctl+0x73/0x100 [nvidia]
[99973.117416] ? __kmalloc+0xa0/0x220
[99973.117417] ? __check_object_size+0xaf/0x1b0
[99973.117479] ? nvidia_ioctl+0x538/0x7c0 [nvidia]
[99973.117541] ? nvidia_frontend_unlocked_ioctl+0x42/0x50 [nvidia]
[99973.117543] ? do_vfs_ioctl+0xa8/0x630
[99973.117545] ? handle_mm_fault+0xb1/0x1f0
[99973.117547] ? __do_page_fault+0x270/0x4d0
[99973.117548] ? SyS_ioctl+0x79/0x90
[99973.117549] ? do_syscall_64+0x73/0x130
[99973.117552] ? entry_SYSCALL_64_after_hwframe+0x3d/0xa2
[99973.117552] Code: 23 48 8b 5d d0 65 48 33 1c 25 28 00 00 00 4c 89 f0 75 1c 48 83 c4 10 5b 41 5c 41 5d 41 5e 41 5f 5d c3 0f 0b 49 8d 46 ff 4c 85 f0 <74> d4 0f 0b e8 e8 9c a5 ff 0f 1f 84 00 00 00 00 00 0f 1f 44 00
[99981.849534] ================ After cudaFreeHost - Attempt 4 ================
[99981.854843] ================ Before cudaHostAlloc - Attempt 5 ================
[99982.914550] 0000:0a:00.0: IOMMU mapping error in map_sg (io-pages: 1048575)
[99982.914557] NVRM: failed to create a DMA mapping!
[99983.453835] ================ Before cudaFreeHost - Attempt 5 ================
[99983.458348] ================ After cudaFreeHost - Attempt 5 ================
nvidia-bug-report.log.gz (943 KB)