diff mbox series

patch 46/47 causes NULL pointer deref on mt7921

Message ID 20240711175156.4465-1-spasswolf@web.de
State New
Headers show
Series patch 46/47 causes NULL pointer deref on mt7921 | expand

Commit Message

Bert Karwatzki July 11, 2024, 5:51 p.m. UTC
Since linux-next-20240711 my linux system fails to start with a NULL pointer
error. Hardware: MSI Alpha 15 Laptop Ryzen 5800H with
04:00.0 Network controller: MEDIATEK Corp. MT7921K (RZ608) Wi-Fi 6E 80MHz

[  T843] BUG: unable to handle page fault for address: ffffffffffffffa0
[  T843] #PF: supervisor read access in kernel mode
[  T843] #PF: error_code(0x0000) - not-present page
[  T843] PGD e5c81a067 P4D e5c81a067 PUD e5c81c067 PMD 0 
[  T843] Oops: Oops: 0000 [#1] PREEMPT SMP NOPTI
[  T843] CPU: 2 UID: 0 PID: 843 Comm: NetworkManager Not tainted 6.10.0-rc7-next-20240711-dirty #9
[  T843] Hardware name: Micro-Star International Co., Ltd. Alpha 15 B5EEK/MS-158L, BIOS E158LAMS.107 11/10/2021
[  T843] RIP: 0010:mt792x_mac_link_bss_remove+0x24/0x110 [mt792x_lib]
[  T843] Code: 84 00 00 00 00 00 f3 0f 1e fa 41 56 41 55 41 54 49 89 f4 55 48 89 fd 53 48 8b 46 18 48 89 d3 44 0f b7 aa b8 00 00 00 8b 56 60 <66> 83 78 a0 00 74 0f 83 fa 0e 77 0a 4c 8b b4 d0 28 ff ff ff eb 07
[  T843] RSP: 0018:ffffbc1b43b3b688 EFLAGS: 00010282
[  T843] RAX: 0000000000000000 RBX: ffff906ab80a9f00 RCX: 000fffffffe00000
[  T843] RDX: 0000000000000000 RSI: ffff906ab80a9e20 RDI: ffff9069c1712000
[  T843] RBP: ffff9069c1712000 R08: ffff9069c0402018 R09: ffffffffab226720
[  T843] R10: 0000000000000000 R11: 0000000000000000 R12: ffff906ab80a9e20
[  T843] R13: 0000000000000013 R14: 0000000000000000 R15: ffff906ab80a9990
[  T843] FS:  00007fb2edd7b500(0000) GS:ffff90786e680000(0000) knlGS:0000000000000000
[  T843] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  T843] CR2: ffffffffffffffa0 CR3: 000000010a104000 CR4: 0000000000750ef0
[  T843] PKRU: 55555554
[  T843] Call Trace:
[  T843]  <TASK>
[  T843]  ? __die+0x1e/0x60
[  T843]  ? page_fault_oops+0x157/0x450
[  T843]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T843]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T843]  ? search_bpf_extables+0x5a/0x80
[  T843]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T843]  ? exc_page_fault+0x2bb/0x670
[  T843]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T843]  ? lock_timer_base+0x71/0x90
[  T843]  ? asm_exc_page_fault+0x26/0x30
[  T843]  ? mt792x_mac_link_bss_remove+0x24/0x110 [mt792x_lib]
[  T843]  ? mt792x_remove_interface+0x6e/0x90 [mt792x_lib]
[  T843]  ? ieee80211_do_stop+0x507/0x7e0 [mac80211]
[  T843]  ? ieee80211_stop+0x53/0x190 [mac80211]
[  T843]  ? __dev_close_many+0xa5/0x120
[  T843]  ? __dev_change_flags+0x18c/0x220
[  T843]  ? dev_change_flags+0x21/0x60
[  T843]  ? do_setlink+0xdf9/0x11d0
[  T843]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T843]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T843]  ? security_sock_rcv_skb+0x33/0x50
[  T843]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T843]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T843]  ? __nla_validate_parse+0x61/0xd10
[  T843]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T843]  ? genl_done+0x53/0x80
[  T843]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T843]  ? netlink_dump+0x357/0x410
[  T843]  ? __rtnl_newlink+0x5d6/0x980
[  T843]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T843]  ? genl_family_rcv_msg_dumpit+0xdf/0xf0
[  T843]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T843]  ? __kmalloc_cache_noprof+0x44/0x210
[  T843]  ? rtnl_newlink+0x42/0x60
[  T843]  ? rtnetlink_rcv_msg+0x152/0x3f0
[  T843]  ? mptcp_pm_nl_dump_addr+0x180/0x180
[  T843]  ? rtnl_calcit.isra.0+0x130/0x130
[  T843]  ? netlink_rcv_skb+0x56/0x100
[  T843]  ? netlink_unicast+0x199/0x290
[  T843]  ? netlink_sendmsg+0x21d/0x490
[  T843]  ? __sock_sendmsg+0x78/0x80
[  T843]  ? ____sys_sendmsg+0x23f/0x2e0
[  T843]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T843]  ? copy_msghdr_from_user+0x68/0xa0
[  T843]  ? ___sys_sendmsg+0x81/0xd0
[  T843]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T843]  ? crng_fast_key_erasure+0xbc/0xf0
[  T843]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T843]  ? get_random_bytes_user+0x126/0x140
[  T843]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T843]  ? __fdget+0xb1/0xe0
[  T843]  ? __sys_sendmsg+0x56/0xa0
[  T843]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T843]  ? do_syscall_64+0x5f/0x170
[  T843]  ? entry_SYSCALL_64_after_hwframe+0x55/0x5d
[  T843]  </TASK>
[  T843] Modules linked in: bnep nls_ascii nls_cp437 vfat fat snd_ctl_led snd_hda_codec_realtek snd_hda_codec_generic snd_hda_scodec_component snd_hda_codec_hdmi btusb btrtl btintel snd_hda_intel uvcvideo btbcm snd_intel_dspcfg btmtk snd_hda_codec snd_soc_dmic snd_acp3x_pdm_dma snd_acp3x_rn videobuf2_vmalloc snd_hwdep uvc bluetooth videobuf2_memops snd_soc_core snd_hda_core videobuf2_v4l2 snd_pcm_oss snd_mixer_oss videodev snd_pcm snd_rn_pci_acp3x videobuf2_common snd_acp_config snd_timer msi_wmi snd_soc_acpi ecdh_generic amd_atl mc ecc sparse_keymap edac_mce_amd wmi_bmof snd ccp k10temp snd_pci_acp3x soundcore ac battery button hid_sensor_prox hid_sensor_gyro_3d hid_sensor_als hid_sensor_accel_3d hid_sensor_magn_3d hid_sensor_trigger industrialio_triggered_buffer joydev kfifo_buf industrialio amd_pmc hid_sensor_iio_common evdev hid_multitouch serio_raw mt7921e mt7921_common mt792x_lib mt76_connac_lib mt76 mac80211 libarc4 cfg80211 rfkill msr fuse nvme_fabrics configfs efi_pstore efivarfs autofs4 ext4
[  T843]  crc32c_generic mbcache jbd2 usbhid amdgpu i2c_algo_bit drm_ttm_helper xhci_pci ttm drm_exec drm_suballoc_helper xhci_hcd amdxcp drm_buddy hid_sensor_hub usbcore nvme gpu_sched mfd_core hid_generic crc32c_intel psmouse drm_display_helper amd_sfh i2c_piix4 usb_common nvme_core crc16 r8169 i2c_hid_acpi i2c_hid hid i2c_designware_platform i2c_designware_core
[  T843] CR2: ffffffffffffffa0
[  T843] ---[ end trace 0000000000000000 ]---

I bisected the error between linux-6.10-rc7 and linux-next-20240711 and the
first offending commit which showed a NULL pointer error was
https://lore.kernel.org/all/20240613030241.5771-47-sean.wang@kernel.org/
but the error is actually a different but most likely related NULL pointer error.
To debug this I added some printk to the offending commit:


With these I get the following error message on startup:

[  T858] mt792x_remove_interface 157: hw = ffff92dc11560900 vif = ffff92dbe072d970 mvif = ffff92dbe072de00 dev = ffff92dc11562000
[  T858] mt792x_link_conf_to_mconf 263: vif = ffff92dbe072d970 mvif = ffff92dbe072de00
[  T858] mt792x_vif_to_link 233: vif = ffff92dbe072d970
[  T858] mt792x_mac_link_bss_remove 122
[  T858] mt792x_mac_link_bss_remove 125
[  T858] mt792x_mac_link_bss_remove 127
[  T858] mt792x_mac_link_bss_remove 129
[  T858] mt792x_mac_link_bss_remove 132
[  T858] mt792x_mac_link_bss_remove 135: mconf->vif = 0000000000000000
[  T858] BUG: kernel NULL pointer dereference, address: 00000000000004b8
[  T858] #PF: supervisor read access in kernel mode
[  T858] #PF: error_code(0x0000) - not-present page
[  T858] PGD 0 P4D 0 
[  T858] Oops: Oops: 0000 [#1] PREEMPT SMP NOPTI
[  T858] CPU: 0 PID: 858 Comm: NetworkManager Not tainted 6.10.0-rc5-debug-01238-g1541d63c5fe2-dirty #30
[  T858] Hardware name: Micro-Star International Co., Ltd. Alpha 15 B5EEK/MS-158L, BIOS E158LAMS.107 11/10/2021
[  T858] RIP: 0010:mt792x_remove_interface+0x1df/0x2e0 [mt792x_lib]
[  T858] Code: 85 f0 30 00 00 49 8b 4f 18 e8 5d 4f f0 f7 49 8b 47 18 41 0f b6 4f 01 ba 89 00 00 00 48 c7 c6 90 24 05 c1 48 c7 c7 36 33 05 c1 <48> 8b 80 b8 04 00 00 49 d3 e4 49 f7 d4 4c 21 a0 10 27 00 00 4c 8d
[  T858] RSP: 0018:ffff9fcf03db7698 EFLAGS: 00010246
[  T858] RAX: 0000000000000000 RBX: ffff92dbe072d970 RCX: 0000000000000000
[  T858] RDX: 0000000000000089 RSI: ffffffffc1052490 RDI: ffffffffc1053336
[  T858] RBP: ffff92dc11562000 R08: 0000000000000000 R09: 0000000000000003
[  T858] R10: ffff9fcf03db7550 R11: ffffffffba099d28 R12: 0000000000000001
[  T858] R13: ffff92dbe072ded8 R14: ffff92dc1156a150 R15: ffff92dbe072de00
[  T858] FS:  00007fa13c515500(0000) GS:ffff92ea6e600000(0000) knlGS:0000000000000000
[  T858] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  T858] CR2: 00000000000004b8 CR3: 00000001046a2000 CR4: 0000000000750ef0
[  T858] PKRU: 55555554
[  T858] Call Trace:
[  T858]  <TASK>
[  T858]  ? __die+0x1e/0x60
[  T858]  ? page_fault_oops+0x157/0x450
[  T858]  ? _prb_read_valid+0x273/0x2e0
[  T858]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T858]  ? exc_page_fault+0x331/0x670
[  T858]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T858]  ? prb_read_valid+0x16/0x20
[  T858]  ? asm_exc_page_fault+0x26/0x30
[  T858]  ? mt792x_remove_interface+0x1df/0x2e0 [mt792x_lib]
[  T858]  ? mt792x_remove_interface+0x1c3/0x2e0 [mt792x_lib]
[  T858]  ? ieee80211_do_stop+0x507/0x7e0 [mac80211]
[  T858]  ? ieee80211_stop+0x53/0x190 [mac80211]
[  T858]  ? __dev_close_many+0xa5/0x120
[  T858]  ? __dev_change_flags+0x18c/0x220
[  T858]  ? dev_change_flags+0x21/0x60
[  T858]  ? do_setlink+0xdf9/0x11d0
[  T858]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T858]  ? __kmalloc_large_node+0x7e/0xb0
[  T858]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T858]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T858]  ? security_sock_rcv_skb+0x33/0x50
[  T858]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T858]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T858]  ? __nla_validate_parse+0x61/0xd10
[  T858]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T858]  ? genl_done+0x53/0x80
[  T858]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T858]  ? netlink_dump+0x357/0x410
[  T858]  ? __rtnl_newlink+0x5d1/0x980
[  T858]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T858]  ? genl_family_rcv_msg_dumpit+0xdf/0xf0
[  T858]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T858]  ? kmalloc_trace_noprof+0x44/0x210
[  T858]  ? rtnl_newlink+0x42/0x60
[  T858]  ? rtnetlink_rcv_msg+0x14d/0x3f0
[  T858]  ? mptcp_pm_nl_dump_addr+0x180/0x180
[  T858]  ? rtnl_calcit.isra.0+0x130/0x130
[  T858]  ? netlink_rcv_skb+0x56/0x100
[  T858]  ? netlink_unicast+0x199/0x290
[  T858]  ? netlink_sendmsg+0x21d/0x490
[  T858]  ? __sock_sendmsg+0x78/0x80
[  T858]  ? ____sys_sendmsg+0x23f/0x2e0
[  T858]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T858]  ? copy_msghdr_from_user+0x68/0xa0
[  T858]  ? ___sys_sendmsg+0x81/0xd0
[  T858]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T858]  ? crng_fast_key_erasure+0xbc/0xf0
[  T858]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T858]  ? get_random_bytes_user+0x126/0x140
[  T858]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T858]  ? __fdget+0xb1/0xe0
[  T858]  ? __sys_sendmsg+0x56/0xa0
[  T858]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T858]  ? do_syscall_64+0x5f/0x170
[  T858]  ? entry_SYSCALL_64_after_hwframe+0x55/0x5d
[  T858]  </TASK>
[  T858] Modules linked in: cmac bnep nls_ascii nls_cp437 vfat fat snd_ctl_led snd_hda_codec_realtek snd_hda_codec_generic snd_hda_scodec_component btusb snd_hda_codec_hdmi btrtl btintel btbcm btmtk snd_hda_intel amd_atl snd_intel_dspcfg bluetooth snd_acp3x_pdm_dma snd_soc_dmic snd_acp3x_rn snd_hda_codec uvcvideo snd_soc_core videobuf2_vmalloc uvc snd_hwdep videobuf2_memops videobuf2_v4l2 snd_hda_core videodev snd_pcm_oss snd_mixer_oss snd_pcm snd_rn_pci_acp3x snd_acp_config videobuf2_common snd_timer msi_wmi snd_soc_acpi ecdh_generic ecc mc sparse_keymap edac_mce_amd snd wmi_bmof k10temp ccp snd_pci_acp3x soundcore battery ac button joydev hid_sensor_magn_3d hid_sensor_gyro_3d hid_sensor_als hid_sensor_accel_3d hid_sensor_prox hid_sensor_trigger industrialio_triggered_buffer kfifo_buf industrialio amd_pmc hid_sensor_iio_common evdev hid_multitouch serio_raw mt7921e mt7921_common mt792x_lib mt76_connac_lib mt76 mac80211 libarc4 cfg80211 rfkill msr fuse nvme_fabrics efi_pstore configfs efivarfs autofs4 ext4
[  T858]  crc32c_generic crc16 mbcache jbd2 usbhid amdgpu i2c_algo_bit xhci_pci drm_ttm_helper ttm xhci_hcd drm_exec drm_suballoc_helper amdxcp nvme drm_buddy hid_sensor_hub usbcore gpu_sched nvme_core mfd_core hid_generic crc32c_intel psmouse amd_sfh i2c_piix4 usb_common t10_pi drm_display_helper r8169 i2c_hid_acpi i2c_hid hid i2c_designware_platform i2c_designware_core
[  T858] CR2: 00000000000004b8
[  T858] ---[ end trace 0000000000000000 ]---

So the problem is here that mconf->vif is still NULL probably because
on mt7921 nobody is bothering to set it.

I did a similar investigation for the error in linux-next-20240711

void mt792x_mac_link_bss_remove(struct mt792x_dev *dev,
				struct mt792x_bss_conf *mconf,
				struct mt792x_link_sta *mlink)
{
	struct ieee80211_vif *vif = container_of((void *)mconf->vif,
						 struct ieee80211_vif, drv_priv);
	struct ieee80211_bss_conf *link_conf;
	int idx = mlink->wcid.idx;

	printk(KERN_INFO "%s %d: dev = %px mconf = %px mlink = %px vif = %px\n",
			__func__, __LINE__, dev, mconf, mlink, vif);
	link_conf = mt792x_vif_to_bss_conf(vif, mconf->link_id);

This leads to the following message on startup
 
[  T848] mt792x_mac_link_bss_remove 147: dev = ffff9403c1672000 mconf = ffff9403c1a35e20 mlink = ffff9403c1a35f00 vif = fffffffffffffb70
[  T848] BUG: unable to handle page fault for address: ffffffffffffffa0
[  T848] #PF: supervisor read access in kernel mode
[  T848] #PF: error_code(0x0000) - not-present page
[skipped backtrace]

showing that vif is an invalid (though not NULL) pointer here, too.

Bert Karwatzki

Comments

Bert Karwatzki July 17, 2024, 2:38 p.m. UTC | #1
Am Freitag, dem 12.07.2024 um 13:06 +0200 schrieb Bert Karwatzki:
> Am Donnerstag, dem 11.07.2024 um 18:40 -0500 schrieb Sean Wang:
> > Hi Bert,
> >
> > Thanks for the detailed debug log. I've quickly made a change to fix
> > the issue. Right now, I can't access the test environment, but I'll
> > test it and send it out as soon as possible. Here's the patch.
> >
> > diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> > b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> > index 2e6268cb06c0..1bab93d049df 100644
> > --- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> > +++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> > @@ -303,6 +303,7 @@ mt7921_add_interface(struct ieee80211_hw *hw,
> > struct ieee80211_vif *vif)
> >
> >         mvif->bss_conf.mt76.omac_idx = mvif->bss_conf.mt76.idx;
> >         mvif->phy = phy;
> > +       mvif->bss_conf.vif = mvif;
> >         mvif->bss_conf.mt76.band_idx = 0;
> >         mvif->bss_conf.mt76.wmm_idx = mvif->bss_conf.mt76.idx %
> > MT76_CONNAC_MAX_WMM_SETS;
> >
>
> I wrote earlier that this patch works fine with linux-next-20240711 and at first
> it did, but then another NULL pointer error occured. I'm not sure if I can
> bisect this as it does not trigger automatically it seems. Also I'm currently
> bisecting the problem with linux-20240712
>
> Bert Karwatzki

Now the above mentioned NULL pointer dereference has happended again, this time
on linux-next-20240716. It cann be triggered by repeatedly turning the Wifi off
and on again. For further investigation I created this patch:

diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
index 2e6268cb06c0..3ecedf7bc9f3 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
@@ -303,6 +303,8 @@ mt7921_add_interface(struct ieee80211_hw *hw, struct
ieee80211_vif *vif)

 	mvif->bss_conf.mt76.omac_idx = mvif->bss_conf.mt76.idx;
 	mvif->phy = phy;
+	WARN(!phy, "%s: phy = NULL\n", __func__);
+	mvif->bss_conf.vif = mvif;
 	mvif->bss_conf.mt76.band_idx = 0;
 	mvif->bss_conf.mt76.wmm_idx = mvif->bss_conf.mt76.idx %
MT76_CONNAC_MAX_WMM_SETS;

@@ -1182,6 +1184,12 @@ static void mt7921_ipv6_addr_change(struct ieee80211_hw
*hw,
 				    struct inet6_dev *idev)
 {
 	struct mt792x_vif *mvif = (struct mt792x_vif *)vif->drv_priv;
+	printk(KERN_INFO "%s: mvif = %px\n", __func__, mvif);
+	printk(KERN_INFO "%s: mvif->phy = %px\n", __func__, mvif->phy);
+	if (!mvif->phy) {
+		printk(KERN_INFO "%s: mvif->phy = NULL\n", __func__);
+		return;
+	}
 	struct mt792x_dev *dev = mvif->phy->dev;
 	struct inet6_ifaddr *ifa;
 	struct in6_addr ns_addrs[IEEE80211_BSS_ARP_ADDR_LIST_LEN];

And the result is this (the WARN in mt7921_add_interface did not trigger):

[  367.121740] [    T861] wlp4s0: deauthenticating from 54:67:51:3d:a2:d2 by
local choice (Reason: 3=DEAUTH_LEAVING)
[  367.209603] [    T861] mt7921_ipv6_addr_change: mvif = ffff954e7500de40
[  367.209615] [    T861] mt7921_ipv6_addr_change: mvif->phy = 0000000000000000
[  367.209621] [    T861] mt7921_ipv6_addr_change: mvif->phy = NULL
[  367.250026] [    T861] mt7921_ipv6_addr_change: mvif = ffff954e7500de40
[  367.250034] [    T861] mt7921_ipv6_addr_change: mvif->phy = ffff954e44427768
[  367.251537] [    T861] mt7921_ipv6_addr_change: mvif = ffff954e7500de40
[  367.251542] [    T861] mt7921_ipv6_addr_change: mvif->phy = ffff954e44427768
[  369.977123] [    T862] wlp4s0: authenticate with 54:67:51:3d:a2:d2 (local
address=c8:94:02:c1:bd:69)
[  369.984864] [    T862] wlp4s0: send auth to 54:67:51:3d:a2:d2 (try 1/3)
[  370.006199] [    T104] wlp4s0: authenticated
[  370.006680] [    T104] wlp4s0: associate with 54:67:51:3d:a2:d2 (try 1/3)
[  370.059080] [     T98] wlp4s0: RX AssocResp from 54:67:51:3d:a2:d2
(capab=0x511 status=0 aid=2)
[  370.064067] [     T98] wlp4s0: associated

So mvif->phy can be NULL at the start of mt7921_ipv6_addr_change. The early
return in that case avoids the NULL pointer and mvif->phy has its usual value
again on the next call to mt7921_ipv6_addr_change so Wifi is working again. I
don't know how this could happen but perhaps you have an idea.

Bert Karwatzki
Felix Fietkau July 17, 2024, 3:25 p.m. UTC | #2
On 17.07.24 16:38, Bert Karwatzki wrote:
> Am Freitag, dem 12.07.2024 um 13:06 +0200 schrieb Bert Karwatzki:
>> Am Donnerstag, dem 11.07.2024 um 18:40 -0500 schrieb Sean Wang:
>> > Hi Bert,
>> >
>> > Thanks for the detailed debug log. I've quickly made a change to fix
>> > the issue. Right now, I can't access the test environment, but I'll
>> > test it and send it out as soon as possible. Here's the patch.
>> >
>> > diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
>> > b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
>> > index 2e6268cb06c0..1bab93d049df 100644
>> > --- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
>> > +++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
>> > @@ -303,6 +303,7 @@ mt7921_add_interface(struct ieee80211_hw *hw,
>> > struct ieee80211_vif *vif)
>> >
>> >         mvif->bss_conf.mt76.omac_idx = mvif->bss_conf.mt76.idx;
>> >         mvif->phy = phy;
>> > +       mvif->bss_conf.vif = mvif;
>> >         mvif->bss_conf.mt76.band_idx = 0;
>> >         mvif->bss_conf.mt76.wmm_idx = mvif->bss_conf.mt76.idx %
>> > MT76_CONNAC_MAX_WMM_SETS;
>> >
>>
>> I wrote earlier that this patch works fine with linux-next-20240711 and at first
>> it did, but then another NULL pointer error occured. I'm not sure if I can
>> bisect this as it does not trigger automatically it seems. Also I'm currently
>> bisecting the problem with linux-20240712
>>
>> Bert Karwatzki
> 
> Now the above mentioned NULL pointer dereference has happended again, this time
> on linux-next-20240716. It cann be triggered by repeatedly turning the Wifi off
> and on again. For further investigation I created this patch:
> 
> diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> index 2e6268cb06c0..3ecedf7bc9f3 100644
> --- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> +++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> @@ -303,6 +303,8 @@ mt7921_add_interface(struct ieee80211_hw *hw, struct
> ieee80211_vif *vif)
> 
>   	mvif->bss_conf.mt76.omac_idx = mvif->bss_conf.mt76.idx;
>   	mvif->phy = phy;
> +	WARN(!phy, "%s: phy = NULL\n", __func__);
> +	mvif->bss_conf.vif = mvif;
>   	mvif->bss_conf.mt76.band_idx = 0;
>   	mvif->bss_conf.mt76.wmm_idx = mvif->bss_conf.mt76.idx %
> MT76_CONNAC_MAX_WMM_SETS;
> 
> @@ -1182,6 +1184,12 @@ static void mt7921_ipv6_addr_change(struct ieee80211_hw
> *hw,
>   				    struct inet6_dev *idev)
>   {
>   	struct mt792x_vif *mvif = (struct mt792x_vif *)vif->drv_priv;
> +	printk(KERN_INFO "%s: mvif = %px\n", __func__, mvif);
> +	printk(KERN_INFO "%s: mvif->phy = %px\n", __func__, mvif->phy);
> +	if (!mvif->phy) {
> +		printk(KERN_INFO "%s: mvif->phy = NULL\n", __func__);
> +		return;
> +	}
>   	struct mt792x_dev *dev = mvif->phy->dev;
>   	struct inet6_ifaddr *ifa;
>   	struct in6_addr ns_addrs[IEEE80211_BSS_ARP_ADDR_LIST_LEN];
> 
> And the result is this (the WARN in mt7921_add_interface did not trigger):
> 
> [  367.121740] [    T861] wlp4s0: deauthenticating from 54:67:51:3d:a2:d2 by
> local choice (Reason: 3=DEAUTH_LEAVING)
> [  367.209603] [    T861] mt7921_ipv6_addr_change: mvif = ffff954e7500de40
> [  367.209615] [    T861] mt7921_ipv6_addr_change: mvif->phy = 0000000000000000
> [  367.209621] [    T861] mt7921_ipv6_addr_change: mvif->phy = NULL
> [  367.250026] [    T861] mt7921_ipv6_addr_change: mvif = ffff954e7500de40
> [  367.250034] [    T861] mt7921_ipv6_addr_change: mvif->phy = ffff954e44427768
> [  367.251537] [    T861] mt7921_ipv6_addr_change: mvif = ffff954e7500de40
> [  367.251542] [    T861] mt7921_ipv6_addr_change: mvif->phy = ffff954e44427768
> [  369.977123] [    T862] wlp4s0: authenticate with 54:67:51:3d:a2:d2 (local
> address=c8:94:02:c1:bd:69)
> [  369.984864] [    T862] wlp4s0: send auth to 54:67:51:3d:a2:d2 (try 1/3)
> [  370.006199] [    T104] wlp4s0: authenticated
> [  370.006680] [    T104] wlp4s0: associate with 54:67:51:3d:a2:d2 (try 1/3)
> [  370.059080] [     T98] wlp4s0: RX AssocResp from 54:67:51:3d:a2:d2
> (capab=0x511 status=0 aid=2)
> [  370.064067] [     T98] wlp4s0: associated
> 
> So mvif->phy can be NULL at the start of mt7921_ipv6_addr_change. The early
> return in that case avoids the NULL pointer and mvif->phy has its usual value
> again on the next call to mt7921_ipv6_addr_change so Wifi is working again. I
> don't know how this could happen but perhaps you have an idea.

This change should fix it: https://nbd.name/p/0747f54f
Please test.

Thanks,

- Felix
Bert Karwatzki July 17, 2024, 5:05 p.m. UTC | #3
Am Mittwoch, dem 17.07.2024 um 17:25 +0200 schrieb Felix Fietkau:
> On 17.07.24 16:38, Bert Karwatzki wrote:
> > Am Freitag, dem 12.07.2024 um 13:06 +0200 schrieb Bert Karwatzki:
> > > Am Donnerstag, dem 11.07.2024 um 18:40 -0500 schrieb Sean Wang:
> > > > Hi Bert,
> > > >
> > > > Thanks for the detailed debug log. I've quickly made a change to fix
> > > > the issue. Right now, I can't access the test environment, but I'll
> > > > test it and send it out as soon as possible. Here's the patch.
> > > >
> > > > diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> > > > b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> > > > index 2e6268cb06c0..1bab93d049df 100644
> > > > --- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> > > > +++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> > > > @@ -303,6 +303,7 @@ mt7921_add_interface(struct ieee80211_hw *hw,
> > > > struct ieee80211_vif *vif)
> > > >
> > > >         mvif->bss_conf.mt76.omac_idx = mvif->bss_conf.mt76.idx;
> > > >         mvif->phy = phy;
> > > > +       mvif->bss_conf.vif = mvif;
> > > >         mvif->bss_conf.mt76.band_idx = 0;
> > > >         mvif->bss_conf.mt76.wmm_idx = mvif->bss_conf.mt76.idx %
> > > > MT76_CONNAC_MAX_WMM_SETS;
> > > >
> > >
> > > I wrote earlier that this patch works fine with linux-next-20240711 and at first
> > > it did, but then another NULL pointer error occured. I'm not sure if I can
> > > bisect this as it does not trigger automatically it seems. Also I'm currently
> > > bisecting the problem with linux-20240712
> > >
> > > Bert Karwatzki
> >
> > Now the above mentioned NULL pointer dereference has happended again, this time
> > on linux-next-20240716. It cann be triggered by repeatedly turning the Wifi off
> > and on again. For further investigation I created this patch:
> >
> > diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> > b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> > index 2e6268cb06c0..3ecedf7bc9f3 100644
> > --- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> > +++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> > @@ -303,6 +303,8 @@ mt7921_add_interface(struct ieee80211_hw *hw, struct
> > ieee80211_vif *vif)
> >
> >   	mvif->bss_conf.mt76.omac_idx = mvif->bss_conf.mt76.idx;
> >   	mvif->phy = phy;
> > +	WARN(!phy, "%s: phy = NULL\n", __func__);
> > +	mvif->bss_conf.vif = mvif;
> >   	mvif->bss_conf.mt76.band_idx = 0;
> >   	mvif->bss_conf.mt76.wmm_idx = mvif->bss_conf.mt76.idx %
> > MT76_CONNAC_MAX_WMM_SETS;
> >
> > @@ -1182,6 +1184,12 @@ static void mt7921_ipv6_addr_change(struct ieee80211_hw
> > *hw,
> >   				    struct inet6_dev *idev)
> >   {
> >   	struct mt792x_vif *mvif = (struct mt792x_vif *)vif->drv_priv;
> > +	printk(KERN_INFO "%s: mvif = %px\n", __func__, mvif);
> > +	printk(KERN_INFO "%s: mvif->phy = %px\n", __func__, mvif->phy);
> > +	if (!mvif->phy) {
> > +		printk(KERN_INFO "%s: mvif->phy = NULL\n", __func__);
> > +		return;
> > +	}
> >   	struct mt792x_dev *dev = mvif->phy->dev;
> >   	struct inet6_ifaddr *ifa;
> >   	struct in6_addr ns_addrs[IEEE80211_BSS_ARP_ADDR_LIST_LEN];
> >
> > And the result is this (the WARN in mt7921_add_interface did not trigger):
> >
> > [  367.121740] [    T861] wlp4s0: deauthenticating from 54:67:51:3d:a2:d2 by
> > local choice (Reason: 3=DEAUTH_LEAVING)
> > [  367.209603] [    T861] mt7921_ipv6_addr_change: mvif = ffff954e7500de40
> > [  367.209615] [    T861] mt7921_ipv6_addr_change: mvif->phy = 0000000000000000
> > [  367.209621] [    T861] mt7921_ipv6_addr_change: mvif->phy = NULL
> > [  367.250026] [    T861] mt7921_ipv6_addr_change: mvif = ffff954e7500de40
> > [  367.250034] [    T861] mt7921_ipv6_addr_change: mvif->phy = ffff954e44427768
> > [  367.251537] [    T861] mt7921_ipv6_addr_change: mvif = ffff954e7500de40
> > [  367.251542] [    T861] mt7921_ipv6_addr_change: mvif->phy = ffff954e44427768
> > [  369.977123] [    T862] wlp4s0: authenticate with 54:67:51:3d:a2:d2 (local
> > address=c8:94:02:c1:bd:69)
> > [  369.984864] [    T862] wlp4s0: send auth to 54:67:51:3d:a2:d2 (try 1/3)
> > [  370.006199] [    T104] wlp4s0: authenticated
> > [  370.006680] [    T104] wlp4s0: associate with 54:67:51:3d:a2:d2 (try 1/3)
> > [  370.059080] [     T98] wlp4s0: RX AssocResp from 54:67:51:3d:a2:d2
> > (capab=0x511 status=0 aid=2)
> > [  370.064067] [     T98] wlp4s0: associated
> >
> > So mvif->phy can be NULL at the start of mt7921_ipv6_addr_change. The early
> > return in that case avoids the NULL pointer and mvif->phy has its usual value
> > again on the next call to mt7921_ipv6_addr_change so Wifi is working again. I
> > don't know how this could happen but perhaps you have an idea.
>
> This change should fix it: https://nbd.name/p/0747f54f
> Please test.
>
> Thanks,
>
> - Felix
>

Your fix works. (I added a WARN() statement on the early return, to see if mvif-
>phy actually was NULL during testing).

Bert Karwatzki
Bert Karwatzki July 18, 2024, 1:10 a.m. UTC | #4
Am Mittwoch, dem 17.07.2024 um 19:05 +0200 schrieb Bert Karwatzki:
> Am
>
> Your fix works. (I added a WARN() statement on the early return, to see if mvif-
> > phy actually was NULL during testing).
>
> Bert Karwatzki
>

While your fix works there was still the question why the driver is suddenly
forgetting mvif->phy? So I've been testing with this script:

#!/bin/sh
for i in $(seq 1 100);
do
		nmcli radio wifi off
		nmcli radio wifi on
	# wait for wifi
	sleep 3
done

together with this patch:

diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
index 4f30426afbb7..206f10473d92 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
@@ -1182,6 +1182,10 @@ static void mt7921_ipv6_addr_change(struct ieee80211_hw
*hw,
                                    struct inet6_dev *idev)
 {
        struct mt792x_vif *mvif = (struct mt792x_vif *)vif->drv_priv;
+       if (!mvif->phy) {
+               WARN(1, "mvif->phy == NULL\n");
+               return;
+       }
        struct mt792x_dev *dev = mvif->phy->dev;
        struct inet6_ifaddr *ifa;
        struct in6_addr ns_addrs[IEEE80211_BSS_ARP_ADDR_LIST_LEN];


On linux-6.10 the script can be run several times without triggering a warning
while on linux-next-20240716 running the script will trigger several warnings.
In the end the result of a bisection is this as the first commit to trigger the
warning:
commit 574e609c4e6a0843a9ed53de79e00da8fb3e7437
Author: Felix Fietkau <nbd@nbd.name>
Date:   Thu Jul 4 15:09:47 2024 +0200

    wifi: mac80211: clear vif drv_priv after remove_interface when stopping

    Avoid reusing stale driver data when an interface is brought down and up
    again. In order to avoid having to duplicate the memset in every single
    driver, do it here.

    Signed-off-by: Felix Fietkau <nbd@nbd.name>
    Link: https://patch.msgid.link/20240704130947.48609-1-nbd@nbd.name
    Signed-off-by: Johannes Berg <johannes.berg@intel.com>

diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 6d969d9f1ac9..97aee0a1a39a 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -689,8 +689,12 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data
*sdata, bool going_do

 		fallthrough;
 	default:
-		if (going_down)
-			drv_remove_interface(local, sdata);
+		if (!going_down)
+			break;
+		drv_remove_interface(local, sdata);
+
+		/* Clear private driver data to prevent reuse */
+		memset(sdata->vif.drv_priv, 0, local->hw.vif_data_size);
 	}

 	ieee80211_recalc_ps(local);

As this is in generic mac80211 code this could probably also affect other
drivers and their ipv6_addr_change function. (which in turn could easily be
fixed by an early exit when mvif->phy == NULL)

Bert Karwatzki
Bert Karwatzki July 18, 2024, 10:43 a.m. UTC | #5
I looked more close at the call trace of the warning and it seems that the
problems 
occur when shutting down the interface:
[  T847] Call Trace:
[  T847]  <TASK>
[  T847]  ? __warn+0x6a/0xc0
[  T847]  ? mt7921_ipv6_addr_change+0x1d0/0x1f0 [mt7921_common]
[  T847]  ? report_bug+0x142/0x180
[  T847]  ? handle_bug+0x3a/0x70
[  T847]  ? exc_invalid_op+0x17/0x70
[  T847]  ? asm_exc_invalid_op+0x1a/0x20
[  T847]  ? mt7921_ipv6_addr_change+0x1d0/0x1f0 [mt7921_common]
[  T847]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T847]  ? __ipv6_ifa_notify+0x16f/0x4d0
[  T847]  ? ieee80211_ifa6_changed+0x5e/0x70 [mac80211]
[  T847]  ? atomic_notifier_call_chain+0x51/0x80
[  T847]  ? addrconf_ifdown.isra.0+0x43f/0x810
[  T847]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T847]  ? addrconf_notify+0x15d/0x760
[  T847]  ? __timer_delete_sync+0x70/0xd0
[  T847]  ? raw_notifier_call_chain+0x43/0x60
[  T847]  ? dev_close_many+0xea/0x160
[  T847]  ? dev_close+0x65/0x80
[  T847]  ? cfg80211_shutdown_all_interfaces+0x48/0xe0 [cfg80211]
[  T847]  ? cfg80211_rfkill_set_block+0x25/0x40 [cfg80211]
[  T847]  ? rfkill_set_block+0x8f/0x160 [rfkill]
[  T847]  ? rfkill_fop_write+0x14e/0x1e0 [rfkill]
[  T847]  ? vfs_write+0xf3/0x420
[  T847]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T847]  ? ksys_write+0xae/0xe0
[  T847]  ? do_syscall_64+0x5f/0x170
[  T847]  ? entry_SYSCALL_64_after_hwframe+0x55/0x5d
[  T847]  </TASK>
[  T847] ---[ end trace 0000000000000000 ]---

I think there's a race happening on shutdown between ipv6_addr_change (which
uses mvif->phy)
and ieee80211_do_stop (which zeros the private data including mvif->phy).

Resend with fixed formatting.

Bert Karwatzki
Bert Karwatzki July 29, 2024, 11:12 a.m. UTC | #6
Am Mittwoch, dem 17.07.2024 um 17:25 +0200 schrieb Felix Fietkau:
> On 17.07.24 16:38, Bert Karwatzki wrote:
> > Am Freitag, dem 12.07.2024 um 13:06 +0200 schrieb Bert Karwatzki:
> > > Am Donnerstag, dem 11.07.2024 um 18:40 -0500 schrieb Sean Wang:
> > > > Hi Bert,
> > > >
> > > > Thanks for the detailed debug log. I've quickly made a change to fix
> > > > the issue. Right now, I can't access the test environment, but I'll
> > > > test it and send it out as soon as possible. Here's the patch.
> > > >
> > > > diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> > > > b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> > > > index 2e6268cb06c0..1bab93d049df 100644
> > > > --- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> > > > +++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> > > > @@ -303,6 +303,7 @@ mt7921_add_interface(struct ieee80211_hw *hw,
> > > > struct ieee80211_vif *vif)
> > > >
> > > >         mvif->bss_conf.mt76.omac_idx = mvif->bss_conf.mt76.idx;
> > > >         mvif->phy = phy;
> > > > +       mvif->bss_conf.vif = mvif;
> > > >         mvif->bss_conf.mt76.band_idx = 0;
> > > >         mvif->bss_conf.mt76.wmm_idx = mvif->bss_conf.mt76.idx %
> > > > MT76_CONNAC_MAX_WMM_SETS;
> > > >
> > >
> > > I wrote earlier that this patch works fine with linux-next-20240711 and at first
> > > it did, but then another NULL pointer error occured. I'm not sure if I can
> > > bisect this as it does not trigger automatically it seems. Also I'm currently
> > > bisecting the problem with linux-20240712
> > >
> > > Bert Karwatzki
> >
> > Now the above mentioned NULL pointer dereference has happended again, this time
> > on linux-next-20240716. It cann be triggered by repeatedly turning the Wifi off
> > and on again. For further investigation I created this patch:
> >
> > diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> > b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> > index 2e6268cb06c0..3ecedf7bc9f3 100644
> > --- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> > +++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> > @@ -303,6 +303,8 @@ mt7921_add_interface(struct ieee80211_hw *hw, struct
> > ieee80211_vif *vif)
> >
> >   	mvif->bss_conf.mt76.omac_idx = mvif->bss_conf.mt76.idx;
> >   	mvif->phy = phy;
> > +	WARN(!phy, "%s: phy = NULL\n", __func__);
> > +	mvif->bss_conf.vif = mvif;
> >   	mvif->bss_conf.mt76.band_idx = 0;
> >   	mvif->bss_conf.mt76.wmm_idx = mvif->bss_conf.mt76.idx %
> > MT76_CONNAC_MAX_WMM_SETS;
> >
> > @@ -1182,6 +1184,12 @@ static void mt7921_ipv6_addr_change(struct ieee80211_hw
> > *hw,
> >   				    struct inet6_dev *idev)
> >   {
> >   	struct mt792x_vif *mvif = (struct mt792x_vif *)vif->drv_priv;
> > +	printk(KERN_INFO "%s: mvif = %px\n", __func__, mvif);
> > +	printk(KERN_INFO "%s: mvif->phy = %px\n", __func__, mvif->phy);
> > +	if (!mvif->phy) {
> > +		printk(KERN_INFO "%s: mvif->phy = NULL\n", __func__);
> > +		return;
> > +	}
> >   	struct mt792x_dev *dev = mvif->phy->dev;
> >   	struct inet6_ifaddr *ifa;
> >   	struct in6_addr ns_addrs[IEEE80211_BSS_ARP_ADDR_LIST_LEN];
> >
> > And the result is this (the WARN in mt7921_add_interface did not trigger):
> >
> > [  367.121740] [    T861] wlp4s0: deauthenticating from 54:67:51:3d:a2:d2 by
> > local choice (Reason: 3=DEAUTH_LEAVING)
> > [  367.209603] [    T861] mt7921_ipv6_addr_change: mvif = ffff954e7500de40
> > [  367.209615] [    T861] mt7921_ipv6_addr_change: mvif->phy = 0000000000000000
> > [  367.209621] [    T861] mt7921_ipv6_addr_change: mvif->phy = NULL
> > [  367.250026] [    T861] mt7921_ipv6_addr_change: mvif = ffff954e7500de40
> > [  367.250034] [    T861] mt7921_ipv6_addr_change: mvif->phy = ffff954e44427768
> > [  367.251537] [    T861] mt7921_ipv6_addr_change: mvif = ffff954e7500de40
> > [  367.251542] [    T861] mt7921_ipv6_addr_change: mvif->phy = ffff954e44427768
> > [  369.977123] [    T862] wlp4s0: authenticate with 54:67:51:3d:a2:d2 (local
> > address=c8:94:02:c1:bd:69)
> > [  369.984864] [    T862] wlp4s0: send auth to 54:67:51:3d:a2:d2 (try 1/3)
> > [  370.006199] [    T104] wlp4s0: authenticated
> > [  370.006680] [    T104] wlp4s0: associate with 54:67:51:3d:a2:d2 (try 1/3)
> > [  370.059080] [     T98] wlp4s0: RX AssocResp from 54:67:51:3d:a2:d2
> > (capab=0x511 status=0 aid=2)
> > [  370.064067] [     T98] wlp4s0: associated
> >
> > So mvif->phy can be NULL at the start of mt7921_ipv6_addr_change. The early
> > return in that case avoids the NULL pointer and mvif->phy has its usual value
> > again on the next call to mt7921_ipv6_addr_change so Wifi is working again. I
> > don't know how this could happen but perhaps you have an idea.
>
> This change should fix it: https://nbd.name/p/0747f54f
> Please test.
>
> Thanks,
>
> - Felix
>

The BUG is still present in linux-6.11-rc1.

Bert Karwatzki
Kalle Valo July 31, 2024, 8:51 a.m. UTC | #7
Bert Karwatzki <spasswolf@web.de> writes:

> Am Mittwoch, dem 17.07.2024 um 17:25 +0200 schrieb Felix Fietkau:
>
>> On 17.07.24 16:38, Bert Karwatzki wrote:
>>
>> > So mvif->phy can be NULL at the start of mt7921_ipv6_addr_change. The early
>> > return in that case avoids the NULL pointer and mvif->phy has its usual value
>> > again on the next call to mt7921_ipv6_addr_change so Wifi is working again. I
>> > don't know how this could happen but perhaps you have an idea.
>>
>> This change should fix it: https://nbd.name/p/0747f54f
>> Please test.
>
> The BUG is still present in linux-6.11-rc1.

I'm not sure what's the status with this. There's one mt76 patch going
to v6.11-rc2:

https://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless.git/commit/?id=6557a28f3e3a54cff4f0dcdd1dfa649b26557ab3

But that looks to be a fix for a different problem, right? Felix, are
you planning to submit that 0747f54f as a proper patch? I could then
take it to wireless tree.
Bert Karwatzki Aug. 6, 2024, 11:22 a.m. UTC | #8
Am Mittwoch, dem 31.07.2024 um 11:51 +0300 schrieb Kalle Valo:
> Bert Karwatzki <spasswolf@web.de> writes:
>
> > Am Mittwoch, dem 17.07.2024 um 17:25 +0200 schrieb Felix Fietkau:
> >
> > > On 17.07.24 16:38, Bert Karwatzki wrote:
> > >
> > > > So mvif->phy can be NULL at the start of mt7921_ipv6_addr_change. The early
> > > > return in that case avoids the NULL pointer and mvif->phy has its usual value
> > > > again on the next call to mt7921_ipv6_addr_change so Wifi is working again. I
> > > > don't know how this could happen but perhaps you have an idea.
> > >
> > > This change should fix it: https://nbd.name/p/0747f54f
> > > Please test.
> >
> > The BUG is still present in linux-6.11-rc1.
>
> I'm not sure what's the status with this. There's one mt76 patch going
> to v6.11-rc2:
>
> https://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless.git/commit/?id=6557a28f3e3a54cff4f0dcdd1dfa649b26557ab3
>
> But that looks to be a fix for a different problem, right? Felix, are
> you planning to submit that 0747f54f as a proper patch? I could then
> take it to wireless tree.
>
The Bug is still present in linux-6.11-rc2 and linux-next-20240806. Also the
mvif->phy NULL check in the original patch is not neccessary (and feels a little
out of place as mvif->phy is not needed anymore). This patch is sufficient to
fix the NULL pointer dereference:
diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
index 1bab93d049df..23b228804289 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
@@ -1183,7 +1183,7 @@ static void mt7921_ipv6_addr_change(struct ieee80211_hw
*hw,
                                    struct inet6_dev *idev)
 {
        struct mt792x_vif *mvif = (struct mt792x_vif *)vif->drv_priv;
-       struct mt792x_dev *dev = mvif->phy->dev;
+       struct mt792x_dev *dev = mt792x_hw_dev(hw);
        struct inet6_ifaddr *ifa;
        struct in6_addr ns_addrs[IEEE80211_BSS_ARP_ADDR_LIST_LEN];
        struct sk_buff *skb;

Bert Karwatzki
Bert Karwatzki Aug. 12, 2024, 8:57 a.m. UTC | #9
Am Dienstag, dem 06.08.2024 um 13:22 +0200 schrieb Bert Karwatzki:
> Am Mittwoch, dem 31.07.2024 um 11:51 +0300 schrieb Kalle Valo:
> > Bert Karwatzki <spasswolf@web.de> writes:
> >
> > > Am Mittwoch, dem 17.07.2024 um 17:25 +0200 schrieb Felix Fietkau:
> > >
> > > > On 17.07.24 16:38, Bert Karwatzki wrote:
> > > >
> > > > > So mvif->phy can be NULL at the start of mt7921_ipv6_addr_change. The early
> > > > > return in that case avoids the NULL pointer and mvif->phy has its usual value
> > > > > again on the next call to mt7921_ipv6_addr_change so Wifi is working again. I
> > > > > don't know how this could happen but perhaps you have an idea.
> > > >
> > > > This change should fix it: https://nbd.name/p/0747f54f
> > > > Please test.
> > >
> > > The BUG is still present in linux-6.11-rc1.
> >
> > I'm not sure what's the status with this. There's one mt76 patch going
> > to v6.11-rc2:
> >
> > https://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless.git/commit/?id=6557a28f3e3a54cff4f0dcdd1dfa649b26557ab3
> >
> > But that looks to be a fix for a different problem, right? Felix, are
> > you planning to submit that 0747f54f as a proper patch? I could then
> > take it to wireless tree.
> >
> The Bug is still present in linux-6.11-rc2 and linux-next-20240806. Also the
> mvif->phy NULL check in the original patch is not neccessary (and feels a little
> out of place as mvif->phy is not needed anymore). This patch is sufficient to
> fix the NULL pointer dereference:
> diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> index 1bab93d049df..23b228804289 100644
> --- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> +++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> @@ -1183,7 +1183,7 @@ static void mt7921_ipv6_addr_change(struct ieee80211_hw
> *hw,
>                                     struct inet6_dev *idev)
>  {
>         struct mt792x_vif *mvif = (struct mt792x_vif *)vif->drv_priv;
> -       struct mt792x_dev *dev = mvif->phy->dev;
> +       struct mt792x_dev *dev = mt792x_hw_dev(hw);
>         struct inet6_ifaddr *ifa;
>         struct in6_addr ns_addrs[IEEE80211_BSS_ARP_ADDR_LIST_LEN];
>         struct sk_buff *skb;
>
> Bert Karwatzki

This error is still present in v6.11-rc3.

Bert Karwatzki
Kalle Valo Aug. 12, 2024, 10:29 a.m. UTC | #10
Bert Karwatzki <spasswolf@web.de> writes:

> Am Dienstag, dem 06.08.2024 um 13:22 +0200 schrieb Bert Karwatzki:
>> Am Mittwoch, dem 31.07.2024 um 11:51 +0300 schrieb Kalle Valo:
>> > Bert Karwatzki <spasswolf@web.de> writes:
>> >
>> > > Am Mittwoch, dem 17.07.2024 um 17:25 +0200 schrieb Felix Fietkau:
>> > >
>> > > > On 17.07.24 16:38, Bert Karwatzki wrote:
>> > > >
>> > > > > So mvif->phy can be NULL at the start of mt7921_ipv6_addr_change. The early
>> > > > > return in that case avoids the NULL pointer and mvif->phy
>> > > > > has its usual value
>> > > > > again on the next call to mt7921_ipv6_addr_change so Wifi is
>> > > > > working again. I
>> > > > > don't know how this could happen but perhaps you have an idea.
>> > > >
>> > > > This change should fix it: https://nbd.name/p/0747f54f
>> > > > Please test.
>> > >
>> > > The BUG is still present in linux-6.11-rc1.
>> >
>> > I'm not sure what's the status with this. There's one mt76 patch going
>> > to v6.11-rc2:
>> >
>> > https://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless.git/commit/?id=6557a28f3e3a54cff4f0dcdd1dfa649b26557ab3
>> >
>> > But that looks to be a fix for a different problem, right? Felix, are
>> > you planning to submit that 0747f54f as a proper patch? I could then
>> > take it to wireless tree.
>> >
>> The Bug is still present in linux-6.11-rc2 and linux-next-20240806. Also the
>> mvif->phy NULL check in the original patch is not neccessary (and feels a little
>> out of place as mvif->phy is not needed anymore). This patch is sufficient to
>> fix the NULL pointer dereference:
>> diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
>> b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
>> index 1bab93d049df..23b228804289 100644
>> --- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
>> +++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
>> @@ -1183,7 +1183,7 @@ static void mt7921_ipv6_addr_change(struct ieee80211_hw
>> *hw,
>>                                     struct inet6_dev *idev)
>>  {
>>         struct mt792x_vif *mvif = (struct mt792x_vif *)vif->drv_priv;
>> -       struct mt792x_dev *dev = mvif->phy->dev;
>> +       struct mt792x_dev *dev = mt792x_hw_dev(hw);
>>         struct inet6_ifaddr *ifa;
>>         struct in6_addr ns_addrs[IEEE80211_BSS_ARP_ADDR_LIST_LEN];
>>         struct sk_buff *skb;
>>
>> Bert Karwatzki
>
> This error is still present in v6.11-rc3.

Bert, can you send your fix as a proper patch? More information in the
wiki below and please mark it for wireless tree.
diff mbox series

Patch

diff --git a/drivers/net/wireless/mediatek/mt76/mt792x.h b/drivers/net/wireless/mediatek/mt76/mt792x.h
index 69eb8dac0b70..c17195559b82 100644
--- a/drivers/net/wireless/mediatek/mt76/mt792x.h
+++ b/drivers/net/wireless/mediatek/mt76/mt792x.h
@@ -230,6 +230,7 @@  mt792x_vif_to_link(struct mt792x_vif *mvif, u8 link_id)
 	struct ieee80211_vif *vif;
 
 	vif = container_of((void *)mvif, struct ieee80211_vif, drv_priv);
+	printk(KERN_INFO "%s %d: vif = %px\n", __func__, __LINE__, vif);
 
 	if (!ieee80211_vif_is_mld(vif) ||
 	    link_id >= IEEE80211_LINK_UNSPECIFIED)
@@ -259,6 +260,7 @@  mt792x_link_conf_to_mconf(struct ieee80211_bss_conf *link_conf)
 {
 	struct ieee80211_vif *vif = link_conf->vif;
 	struct mt792x_vif *mvif = (struct mt792x_vif *)vif->drv_priv;
+	printk(KERN_INFO "%s %d: vif = %px mvif = %px\n", __func__, __LINE__, vif, mvif);
 
 	return mt792x_vif_to_link(mvif, link_conf->link_id);
 }
diff --git a/drivers/net/wireless/mediatek/mt76/mt792x_core.c b/drivers/net/wireless/mediatek/mt76/mt792x_core.c
index 813296fad0ed..ff627f5986bd 100644
--- a/drivers/net/wireless/mediatek/mt76/mt792x_core.c
+++ b/drivers/net/wireless/mediatek/mt76/mt792x_core.c
@@ -119,23 +119,34 @@  static void mt792x_mac_link_bss_remove(struct mt792x_dev *dev,
 {
 	struct mt792x_bss_conf *mconf = mt792x_link_conf_to_mconf(link_conf);
 	int idx = mlink->wcid.idx;
+	printk(KERN_INFO "%s %d\n", __func__, __LINE__);
 
 	mt792x_mutex_acquire(dev);
+	printk(KERN_INFO "%s %d\n", __func__, __LINE__);
 	mt76_connac_free_pending_tx_skbs(&dev->pm, &mlink->wcid);
+	printk(KERN_INFO "%s %d\n", __func__, __LINE__);
 	mt76_connac_mcu_uni_add_dev(&dev->mphy, link_conf, &mlink->wcid, false);
+	printk(KERN_INFO "%s %d\n", __func__, __LINE__);
 
 	rcu_assign_pointer(dev->mt76.wcid[idx], NULL);
+	printk(KERN_INFO "%s %d\n", __func__, __LINE__);
 
 	dev->mt76.vif_mask &= ~BIT_ULL(mconf->mt76.idx);
+	printk(KERN_INFO "%s %d: mconf->vif = %px\n", __func__, __LINE__, mconf->vif);
 	mconf->vif->phy->omac_mask &= ~BIT_ULL(mconf->mt76.omac_idx);
+	printk(KERN_INFO "%s %d\n", __func__, __LINE__);
 	mt792x_mutex_release(dev);
+	printk(KERN_INFO "%s %d\n", __func__, __LINE__);
 
 	spin_lock_bh(&dev->mt76.sta_poll_lock);
+	printk(KERN_INFO "%s %d\n", __func__, __LINE__);
 	if (!list_empty(&mlink->wcid.poll_list))
 		list_del_init(&mlink->wcid.poll_list);
 	spin_unlock_bh(&dev->mt76.sta_poll_lock);
+	printk(KERN_INFO "%s %d\n", __func__, __LINE__);
 
 	mt76_wcid_cleanup(&dev->mt76, &mlink->wcid);
+	printk(KERN_INFO "%s %d\n", __func__, __LINE__);
 }
 
 void mt792x_remove_interface(struct ieee80211_hw *hw,
@@ -143,6 +154,8 @@  void mt792x_remove_interface(struct ieee80211_hw *hw,
 {
 	struct mt792x_vif *mvif = (struct mt792x_vif *)vif->drv_priv;
 	struct mt792x_dev *dev = mt792x_hw_dev(hw);
+	printk(KERN_INFO "%s %d: hw = %px vif = %px mvif = %px dev = %px\n",
+			__func__, __LINE__, hw, vif, mvif, dev);
 
 	mt792x_mac_link_bss_remove(dev, &vif->bss_conf, &mvif->sta.deflink);
 }