Internal Error from Kernel When Connecting a Bluetooth Device

Hi All,

Since JetPack 5.0.1, I’m not able to connect any Bluetooth device to any Orin device. I have had this problem with all of the AGX Orin, Orin NX and Orin Nano Dev Kits. The problem is that when I try to connect a Logitech mouse or keyboard via Bluetooth, I see the device listed in the Bluetooth device list, and once I click on it, the whole system freezes and reboots itself.

I tried the solution mentioned here: Nvidia Jetson Xavier NX Bluetooth connection issue - #47 by ekta.singh and it gives me the following error just before it gets stuck:

Message from syslogd@ubuntu at Jun 21 09:07:20 ...
 kernel:[58724.453229] Internal error: Oops: 96000004 [#1] PREEMPT SMP

What could be causing this issue? Are there any workarounds?

Which JetPack version are you using now?
Could you try re-flashing your AGX Orin devkits with JetPack 5.1.1 and try again?

Thanks for the quick reply. I’m currently using JetPack 5.1.1, but I also faced the issue on 5.0.2. I also just tested it by disconnecting the AW-CB375NF WiFi-BT module and plugging in a TP-Link BT USB dongle, but the problem persists.

We used the RTL8822CE for our test without any problem; we will try to find another dongle to see if we can reproduce the problem.

Also, you should dump the log from the serial console when the kernel panic happens.

Syslog generally won’t record kernel panic …

I’m able to replicate this on my Jetson Orin Nano (DevKit) running JetPack 5.1.1. I’ve attached the log from serial console:

[  106.420435] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000
[  106.429564] Mem abort info:
[  106.432537]   ESR = 0x96000004
[  106.435853]   EC = 0x25: DABT (current EL), IL = 32 bits
[  106.441361]   SET = 0, FnV = 0
[  106.444576]   EA = 0, S1PTW = 0
[  106.447888] Data abort info:
[  106.450905]   ISV = 0, ISS = 0x00000004
[  106.454980]   CM = 0, WnR = 0
[  106.458061] user pgtable: 4k pages, 48-bit VAs, pgdp=0000000140376000
[  106.464739] [0000000000000000] pgd=0000000000000000, p4d=0000000000000000
[  106.471809] Internal error: Oops: 96000004 [#1] PREEMPT SMP
[  106.477538] Modules linked in: nvidia_modeset(O) fuse xt_conntrack xt_MASQUERADE nf_conntrack_netlink nfnetlink xt_addrtype iptable_filter iptable_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 libcrc32c br_netfilter lzo_rle lzo_compress zram overlay ramoops reed_solomon bnep loop hid_logitech_hidpp snd_soc_tegra210_ope snd_soc_tegra186_dspk snd_soc_tegra210_iqc snd_soc_tegra186_asrc snd_soc_tegra210_mixer snd_soc_tegra186_arad snd_soc_tegra210_mvc snd_soc_tegra210_afc snd_soc_tegra210_dmic snd_soc_tegra210_adx snd_soc_tegra210_amx snd_soc_tegra210_i2s snd_soc_tegra210_admaif snd_soc_tegra210_sfc snd_soc_tegra_pcm input_leds aes_ce_blk crypto_simd cryptd aes_ce_cipher ghash_ce hid_logitech_dj sha2_ce sha256_arm64 sha1_ce rtk_btusb snd_soc_tegra_machine_driver btusb btrtl btbcm snd_soc_spdif_tx btintel fusb301 snd_soc_tegra210_adsp snd_soc_tegra_utils snd_soc_simple_card_utils snd_hda_codec_hdmi snd_soc_tegra210_ahub nvadsp snd_hda_tegra tegra210_adma tegra_bpmp_thermal snd_hda_codec
[  106.477635]  snd_hda_core userspace_alert nv_imx219 spi_tegra114 rtl8822ce r8168 cfg80211 r8169 realtek nvidia(O) binfmt_misc ina3221 pwm_fan nvgpu nvmap ip_tables x_tables [last unloaded: mtd]
[  106.587293] CPU: 0 PID: 95 Comm: kworker/u13:0 Tainted: G           O      5.10.104-tegra #1
[  106.595949] Hardware name: Unknown NVIDIA Orin Nano Developer Kit/NVIDIA Orin Nano Developer Kit, BIOS 3.1-32827747 03/19/2023
[  106.607641] Workqueue: hci0 hci_rx_work
[  106.611570] pstate: 20c00009 (nzCv daif +PAN +UAO -TCO BTYPE=--)
[  106.617725] pc : vli_mult+0x74/0x100
[  106.621390] lr : vli_mult+0x2c/0x100
[  106.625054] sp : ffff8000110f3820
[  106.628448] x29: ffff8000110f3820 x28: 0000000000002010 
[  106.633890] x27: ffff54e407200010 x26: 0000000000000040 
[  106.639342] x25: ffffca093c327380 x24: ffff54e454904680 
[  106.644794] x23: 0000000000000000 x22: ffff8000110f3898 
[  106.650233] x21: 0000000000000000 x20: ffff54e454904680 
[  106.655675] x19: 0000000000000004 x18: 0000000000000018 
[  106.661120] x17: ffff8000110f3798 x16: 0000000000000003 
[  106.666567] x15: ffff8000110f3750 x14: ffff8000110f37b0 
[  106.672014] x13: ffff8000110f3790 x12: ffff8000110f3738 
[  106.677453] x11: 0000000000000000 x10: 0000000000000000 
[  106.682897] x9 : b55171de7242460c x8 : 0000000000000000 
[  106.688349] x7 : 3260eb02a6086b19 x6 : 0000000000000000 
[  106.693795] x5 : 0000000000000000 x4 : 0000000000000000 
[  106.699239] x3 : 00000000fffffffd x2 : 0000000000000001 
[  106.704677] x1 : 0000000000000000 x0 : 0000000000000007 
[  106.710126] Call trace:
[  106.712633]  vli_mult+0x74/0x100
[  106.715942]  vli_mod_mult_fast+0x58/0xa0
[  106.719954]  ecc_is_pubkey_valid_partial+0x114/0x1e0
[  106.725031]  ecc_is_pubkey_valid_full+0x30/0xb0
[  106.729674]  ecc_make_pub_key+0xb4/0x140
[  106.733687]  ecdh_compute_value+0x164/0x180
[  106.737965]  generate_ecdh_public_key+0xe0/0x1a0
[  106.742693]  generate_ecdh_keys+0x3c/0x50
[  106.746799]  sc_send_public_key+0xf0/0x1a0
[  106.750993]  smp_recv_cb+0x13c4/0x16e0
[  106.754834]  l2cap_recv_frame+0xb40/0x16a0
[  106.759034]  l2cap_recv_acldata+0x20c/0x210
[  106.763324]  hci_rx_work+0x230/0x290
[  106.766987]  process_one_work+0x1c4/0x4a0
[  106.771104]  worker_thread+0x54/0x430
[  106.774858]  kthread+0x148/0x170
[  106.778167]  ret_from_fork+0x10/0x24
[  106.781829] Code: 7a441260 54000409 d280000a 4b040165 (f8645aa9) 
[  106.788081] ---[ end trace 3160c9483a3231aa ]---
[  106.798393] Kernel panic - not syncing: Oops: Fatal exception
[  106.804282] SMP: stopping secondary CPUs
[  106.808454] Kernel Offset: 0x4a092a4f0000 from 0xffff800010000000
[  106.814698] PHYS_OFFSET: 0xffffab1d00000000
[  106.818987] CPU features: 0x0040006,4a80aa38
[  106.823357] Memory Limit: none
[  106.831985] ---[ end Kernel panic - not syncing: Oops: Fatal exception ]---

I am able to connect to some devices without issue (my logitech Ergo K860 Keyboard doesn’t seem to have issues), but other devices cause the KP. I am able to reproduce this reliably by attempting to connect my Bluetooth Stadia controller.

1 Like

I’m actually having the issue with the Logitech MX Master 3S mouse and Logitech MX Keys. Both cause the same issue once I connect them over BT. It is strange that you are not having any issue with your Ergo keyboard.

@kayccc do you have any ideas after what @Eric567 shared?

@WayneWWW @kayccc FYI, it looks like my call trace exactly matches this one:

R35.1 crashes when connecting BLE Xbox Controller - Jetson & Embedded Systems / Jetson AGX Orin - NVIDIA Developer Forums

Given that this has been an ongoing issue now for several months, is there an expected timeframe for a resolution? I purchased this device for robotics applications, and need to be able to run headless while still providing a mechanism for manual control.

I have confirmed that this issue also occurs with my Logitech Ergo M575 Trackball.

This KP can be reproduced very reliably; simply connect to the device via bluetoothctl.

Thank you.

We have tested with Lenovo BT mouse, HP BT keyboard, TS BT controller.
We have not observed any crash yet.
Not sure where the difference is.

1 Like

Thanks @kayccc .

Do you have an XBox Bluetooth gaming controller that you could test? Both the stadia gaming controller and Xbox Bluetooth gaming controller reliably cause this issue for me.

I wonder if perhaps the issue is that these are low energy devices? Possibly unrelated, but I noticed that CONFIG_BT_LE is not set when the kernel is built.

What is the TS BT controller? Tegra Shield?

Here’s some free QA from your customers ;)

Boards

  • 1 x nVidia Xavier AGX @ L4T r35.3 @ 5.10.104-tegra - not reproducible
  • 3 x nVidia Orin @ L4T r35.3 @ 5.10.104-tegra - reproducible on all devices

Our BT devices are:

  • Dell Wireless 1820 (DW1820) on Qualcomm Atheros QCA6174A - reproducible on Orin
  • AW-CB375NF REV05 on Realtek RTL8822CE - reproducible on Orin

Steps to reproduce using LightBlue

As I understand it, this is reproducible only with LE devices / GATT clients. So to reproduce the issue 100% of the time, you need a BLE device. The easiest way is to emulate a BLE device using the free LightBlue app on iOS.

On phone:

  1. Install LightBlue app on iOS and open it
  2. Go to “Virtual Devices”
  3. Click “+” button in top right corner
  4. Choose “HID OVER GATT”

On Orin board

  1. Open bluetoothctl
  2. Run scan on
  3. Wait 30 seconds
  4. Run devices
  5. In output of devices command find the MAC of device named HID OVER GATT
  6. Run pair MAC_FROM_PREVIOUS_STEP
  7. Device will hang immediately after pair

Steps to reproduce using 2 (two) Jetson Orin boards

Using this guide, we will create a BLE advertisement using another Jetson board, in case you don’t want to use LightBlue for some reason.

On Board 1 (one)

  1. Run bluetoothctl. In bluetoothctl shell run the following code step by step:
agent off
agent NoInputNoOutput
menu advertise
uuids 0x180D
appearance 833
name Jetson_demo
discoverable on
back
advertise on

On Board 2 (two)

  1. Run bluetoothctl
  2. Run scan on
  3. Wait 30 seconds
  4. Run devices
  5. In output of devices command find the MAC of device named Jetson_demo
  6. Run pair MAC_FROM_PREVIOUS_STEP
  7. Device will hang immediately after pair

Part of Journalctl log file from Orin board

journalctl.ble.bug.log (1.9 KB)

1 Like

I believe that BT_LE is responsible only for enabling the LE Isochronous Channels option and likely does not affect the overall functioning of BLE.

1 Like

Tried compiling the kernel with the enabled CONFIG_BT_LE (everything else remained exactly the same; I took the official kernel from nVidia and compiled it following the instructions). The problem persists, nothing has changed.

We also tried using an external USB adapter Edimax BT-8500 with the same result: the device hangs.

Additionally, there was an attempt to compile the latest version of bluez (5.68), but it didn’t help.

Dear nVidia developers, what other part of your work can I help with? Send you a joystick with BLE?

I’ve been debugging this a little this afternoon, and I’ve tracked the issue down to the curve struct being partially uninitialized (curve->a and curve->b). Checking for this and throwing an error in ecc_is_pubkey_valid_partial at least prevents the entire kernel from going down, and I’ve actually been able to connect my controller on some occasions (perhaps the curve that’s generated is sometimes initialized?)

Here’s a snippet of the function I’ve modified (kernel/kernel-5.10/crypto/ecc.c)

/* SP800-56A section 5.6.2.3.4 partial verification: ephemeral keys only */
int ecc_is_pubkey_valid_partial(const struct ecc_curve *curve,
                                struct ecc_point *pk)
{
        if (!curve->p || !pk->x || !pk->y || !curve->a || !curve->b)
        {
                pr_err("NULL pointer within data structures passed to ecc_is_pubkey_valid_partial: curve->p=%p, pk->x=%p, pk->y=%p, curve->a=%p, curve->b=%p\n",
                curve->p, pk->x, pk->y, curve->a, curve->b);
                return -EINVAL;
        }

Obviously not a viable fix, but at least keeps the kernel from crashing while I spam requests to connect my bluetooth controller

dmesg after successful connection (rare):

[  517.154562] Bluetooth: hci0: unexpected SMP command 0x0b from e0:c0:c5:b7:e0:59
[  517.164460] rtk_btcoex: update_hid_active_state: handle 0x0010, interval 6
[  517.177212] NULL pointer within data structures passed to ecc_is_pubkey_valid_partial: curve->p=0000000052bfa0e5, pk->x=00000000879bc47e, pk->y=000000005033df44, curve->a=0000000000000000, curve->b=0000000000000000
[  519.159766] input: Stadia27KP-e059 as /devices/virtual/misc/uhid/0005:18D1:9400.000A/input/input20
[  519.160206] hid-generic 0005:18D1:9400.000A: input,hidraw5: BLUETOOTH HID v1.00 Gamepad [Stadia27KP-e059] on b4:8c:9d:34:ce:5a
1 Like

Looks like Nvidia somehow managed to mangle the NIST P256 curve? Here’s the correct one:

--- kernel/kernel-5.10/crypto/ecc.c	2023-07-29 21:04:43.469231617 -0400
+++ kernel/kernel-5.10/crypto/ecc.c	2023-07-29 21:07:47.188380995 -0400
@@ -70,6 +70,10 @@ static u64 nist_p256_p[] = { 0xFFFFFFFFF
 				0x0000000000000000ull, 0xFFFFFFFF00000001ull };
 static u64 nist_p256_n[] = { 0xF3B9CAC2FC632551ull, 0xBCE6FAADA7179E84ull,
 				0xFFFFFFFFFFFFFFFFull, 0xFFFFFFFF00000000ull };
+static u64 nist_p256_a[] = { 0xFFFFFFFFFFFFFFFCull, 0x00000000FFFFFFFFull,
+				0x0000000000000000ull, 0xFFFFFFFF00000001ull };
+static u64 nist_p256_b[] = { 0x3BCE3C3E27D2604Bull, 0x651D06B0CC53B0F6ull,
+				0xB3EBBD55769886BCull, 0x5AC635D8AA3A93E7ull };
 static struct ecc_curve nist_p256 = {
 	.name = "nist_256",
 	.g = {
@@ -78,7 +82,9 @@ static struct ecc_curve nist_p256 = {
 		.ndigits = 4,
 	},
 	.p = nist_p256_p,
-	.n = nist_p256_n
+	.n = nist_p256_n,
+	.a = nist_p256_a,
+	.b = nist_p256_b
 };
 
 /* BrainPool P-256 */

Note that this is exactly the same as the one here: ecc_curve_defs.h - crypto/ecc_curve_defs.h - Linux source code (v5.10.148) - Bootlin

To fix, update crypto/ecc.c with this patch, rebuild the kernel, and copy the Image to /boot.

Tested on both XBox and Stadia bluetooth controllers.

@doruk.sonmez1 @eugenyshcheglov , can you confirm this fixes the issues for you?

Edit: Also tested my logitech mouse that was failing earlier, and it works great now.

2 Likes

Yeah, looks like they moved the code from ecc_curve_defs.h directly into ecc.c

I even found a patch that is responsible for it: crypto:ecc: separate out ecc and ecdh · OE4T/linux-tegra-4.9@91cbe12 · GitHub (Don’t mind that this is the OE4T repository, it’s a fork of the original repository)

The history goes like this: long ago, 6 years ago, an nVidia employee made changes, in which he added the then-current NIST P256 into the code (specifically: nist_p256_g_x, nist_p256_g_y, nist_p256_p, and nist_p256_n).

However, time goes on, and new nist_p256_a and nist_p256_b were added to the kernel, which the nVidia developers did not incorporate into their modified code.

I haven’t tested the kernel with the changes yet, I’ll update you once I do.

Dear NVIDIA developers, we’ve done all the work for you. Maybe we deserve at least a couple of treats in the shape of your logo, huh?

Yep, and here’s the merge (5.10) that resulted in the bug:

https://nv-tegra.nvidia.com/r/plugins/gitiles/linux-5.10/+/32a12d12ce6e3324a0a336419493952a9300fff7

Check the diffs for ecc.c vs ecc_curve_defs.h.

Looks like it also dropped a and b parameters for nist_p192 curve. Wonder if that’s causing issues elsewhere?

Here’s the updated patch:

diff --git a/crypto/ecc.c b/crypto/ecc.c
index ee0e044403f6..6ee23f0bee32 100644
--- a/crypto/ecc.c
+++ b/crypto/ecc.c
@@ -50,6 +50,10 @@ static u64 nist_p192_p[] = { 0xFFFFFFFFFFFFFFFFull, 0xFFFFFFFFFFFFFFFEull,
                                0xFFFFFFFFFFFFFFFFull };
 static u64 nist_p192_n[] = { 0x146BC9B1B4D22831ull, 0xFFFFFFFF99DEF836ull,
                                0xFFFFFFFFFFFFFFFFull };
+static u64 nist_p192_a[] = { 0xFFFFFFFFFFFFFFFCull, 0xFFFFFFFFFFFFFFFEull,
+                               0xFFFFFFFFFFFFFFFFull };
+static u64 nist_p192_b[] = { 0xFEB8DEECC146B9B1ull, 0x0FA7E9AB72243049ull,
+                               0x64210519E59C80E7ull };
 static struct ecc_curve nist_p192 = {
        .name = "nist_192",
        .g = {
@@ -58,7 +62,9 @@ static struct ecc_curve nist_p192 = {
                .ndigits = 3,
        },
        .p = nist_p192_p,
-       .n = nist_p192_n
+       .n = nist_p192_n,
+       .a = nist_p192_a,
+       .b = nist_p192_b
 };
 
 /* NIST P-256 */
@@ -70,6 +76,10 @@ static u64 nist_p256_p[] = { 0xFFFFFFFFFFFFFFFFull, 0x00000000FFFFFFFFull,
                                0x0000000000000000ull, 0xFFFFFFFF00000001ull };
 static u64 nist_p256_n[] = { 0xF3B9CAC2FC632551ull, 0xBCE6FAADA7179E84ull,
                                0xFFFFFFFFFFFFFFFFull, 0xFFFFFFFF00000000ull };
+static u64 nist_p256_a[] = { 0xFFFFFFFFFFFFFFFCull, 0x00000000FFFFFFFFull,
+                               0x0000000000000000ull, 0xFFFFFFFF00000001ull };
+static u64 nist_p256_b[] = { 0x3BCE3C3E27D2604Bull, 0x651D06B0CC53B0F6ull,
+                               0xB3EBBD55769886BCull, 0x5AC635D8AA3A93E7ull };
 static struct ecc_curve nist_p256 = {
        .name = "nist_256",
        .g = {
@@ -78,7 +88,9 @@ static struct ecc_curve nist_p256 = {
                .ndigits = 4,
        },
        .p = nist_p256_p,
-       .n = nist_p256_n
+       .n = nist_p256_n,
+       .a = nist_p256_a,
+       .b = nist_p256_b
 };
 
 /* BrainPool P-256 */

Hi @Eric567

Thanks for root-causing this issue. I have forwarded it to the internal team to see how to correct this error.

FYI, Issue also replicates on Orin devices with r35.4.1 (JetPack 5.1.2). Fortunately this ^ fix works there as well!