Message ID | 20240711175156.4465-1-spasswolf@web.de |
---|---|
State | New |
Headers | show |
Series | patch 46/47 causes NULL pointer deref on mt7921 | expand |
Am Freitag, dem 12.07.2024 um 13:06 +0200 schrieb Bert Karwatzki: > Am Donnerstag, dem 11.07.2024 um 18:40 -0500 schrieb Sean Wang: > > Hi Bert, > > > > Thanks for the detailed debug log. I've quickly made a change to fix > > the issue. Right now, I can't access the test environment, but I'll > > test it and send it out as soon as possible. Here's the patch. > > > > diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c > > b/drivers/net/wireless/mediatek/mt76/mt7921/main.c > > index 2e6268cb06c0..1bab93d049df 100644 > > --- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c > > +++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c > > @@ -303,6 +303,7 @@ mt7921_add_interface(struct ieee80211_hw *hw, > > struct ieee80211_vif *vif) > > > > mvif->bss_conf.mt76.omac_idx = mvif->bss_conf.mt76.idx; > > mvif->phy = phy; > > + mvif->bss_conf.vif = mvif; > > mvif->bss_conf.mt76.band_idx = 0; > > mvif->bss_conf.mt76.wmm_idx = mvif->bss_conf.mt76.idx % > > MT76_CONNAC_MAX_WMM_SETS; > > > > I wrote earlier that this patch works fine with linux-next-20240711 and at first > it did, but then another NULL pointer error occured. I'm not sure if I can > bisect this as it does not trigger automatically it seems. Also I'm currently > bisecting the problem with linux-20240712 > > Bert Karwatzki Now the above mentioned NULL pointer dereference has happended again, this time on linux-next-20240716. It cann be triggered by repeatedly turning the Wifi off and on again. For further investigation I created this patch: diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c b/drivers/net/wireless/mediatek/mt76/mt7921/main.c index 2e6268cb06c0..3ecedf7bc9f3 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c +++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c @@ -303,6 +303,8 @@ mt7921_add_interface(struct ieee80211_hw *hw, struct ieee80211_vif *vif) mvif->bss_conf.mt76.omac_idx = mvif->bss_conf.mt76.idx; mvif->phy = phy; + WARN(!phy, "%s: phy = NULL\n", __func__); + mvif->bss_conf.vif = mvif; mvif->bss_conf.mt76.band_idx = 0; mvif->bss_conf.mt76.wmm_idx = mvif->bss_conf.mt76.idx % MT76_CONNAC_MAX_WMM_SETS; @@ -1182,6 +1184,12 @@ static void mt7921_ipv6_addr_change(struct ieee80211_hw *hw, struct inet6_dev *idev) { struct mt792x_vif *mvif = (struct mt792x_vif *)vif->drv_priv; + printk(KERN_INFO "%s: mvif = %px\n", __func__, mvif); + printk(KERN_INFO "%s: mvif->phy = %px\n", __func__, mvif->phy); + if (!mvif->phy) { + printk(KERN_INFO "%s: mvif->phy = NULL\n", __func__); + return; + } struct mt792x_dev *dev = mvif->phy->dev; struct inet6_ifaddr *ifa; struct in6_addr ns_addrs[IEEE80211_BSS_ARP_ADDR_LIST_LEN]; And the result is this (the WARN in mt7921_add_interface did not trigger): [ 367.121740] [ T861] wlp4s0: deauthenticating from 54:67:51:3d:a2:d2 by local choice (Reason: 3=DEAUTH_LEAVING) [ 367.209603] [ T861] mt7921_ipv6_addr_change: mvif = ffff954e7500de40 [ 367.209615] [ T861] mt7921_ipv6_addr_change: mvif->phy = 0000000000000000 [ 367.209621] [ T861] mt7921_ipv6_addr_change: mvif->phy = NULL [ 367.250026] [ T861] mt7921_ipv6_addr_change: mvif = ffff954e7500de40 [ 367.250034] [ T861] mt7921_ipv6_addr_change: mvif->phy = ffff954e44427768 [ 367.251537] [ T861] mt7921_ipv6_addr_change: mvif = ffff954e7500de40 [ 367.251542] [ T861] mt7921_ipv6_addr_change: mvif->phy = ffff954e44427768 [ 369.977123] [ T862] wlp4s0: authenticate with 54:67:51:3d:a2:d2 (local address=c8:94:02:c1:bd:69) [ 369.984864] [ T862] wlp4s0: send auth to 54:67:51:3d:a2:d2 (try 1/3) [ 370.006199] [ T104] wlp4s0: authenticated [ 370.006680] [ T104] wlp4s0: associate with 54:67:51:3d:a2:d2 (try 1/3) [ 370.059080] [ T98] wlp4s0: RX AssocResp from 54:67:51:3d:a2:d2 (capab=0x511 status=0 aid=2) [ 370.064067] [ T98] wlp4s0: associated So mvif->phy can be NULL at the start of mt7921_ipv6_addr_change. The early return in that case avoids the NULL pointer and mvif->phy has its usual value again on the next call to mt7921_ipv6_addr_change so Wifi is working again. I don't know how this could happen but perhaps you have an idea. Bert Karwatzki
On 17.07.24 16:38, Bert Karwatzki wrote: > Am Freitag, dem 12.07.2024 um 13:06 +0200 schrieb Bert Karwatzki: >> Am Donnerstag, dem 11.07.2024 um 18:40 -0500 schrieb Sean Wang: >> > Hi Bert, >> > >> > Thanks for the detailed debug log. I've quickly made a change to fix >> > the issue. Right now, I can't access the test environment, but I'll >> > test it and send it out as soon as possible. Here's the patch. >> > >> > diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c >> > b/drivers/net/wireless/mediatek/mt76/mt7921/main.c >> > index 2e6268cb06c0..1bab93d049df 100644 >> > --- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c >> > +++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c >> > @@ -303,6 +303,7 @@ mt7921_add_interface(struct ieee80211_hw *hw, >> > struct ieee80211_vif *vif) >> > >> > mvif->bss_conf.mt76.omac_idx = mvif->bss_conf.mt76.idx; >> > mvif->phy = phy; >> > + mvif->bss_conf.vif = mvif; >> > mvif->bss_conf.mt76.band_idx = 0; >> > mvif->bss_conf.mt76.wmm_idx = mvif->bss_conf.mt76.idx % >> > MT76_CONNAC_MAX_WMM_SETS; >> > >> >> I wrote earlier that this patch works fine with linux-next-20240711 and at first >> it did, but then another NULL pointer error occured. I'm not sure if I can >> bisect this as it does not trigger automatically it seems. Also I'm currently >> bisecting the problem with linux-20240712 >> >> Bert Karwatzki > > Now the above mentioned NULL pointer dereference has happended again, this time > on linux-next-20240716. It cann be triggered by repeatedly turning the Wifi off > and on again. For further investigation I created this patch: > > diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c > b/drivers/net/wireless/mediatek/mt76/mt7921/main.c > index 2e6268cb06c0..3ecedf7bc9f3 100644 > --- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c > +++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c > @@ -303,6 +303,8 @@ mt7921_add_interface(struct ieee80211_hw *hw, struct > ieee80211_vif *vif) > > mvif->bss_conf.mt76.omac_idx = mvif->bss_conf.mt76.idx; > mvif->phy = phy; > + WARN(!phy, "%s: phy = NULL\n", __func__); > + mvif->bss_conf.vif = mvif; > mvif->bss_conf.mt76.band_idx = 0; > mvif->bss_conf.mt76.wmm_idx = mvif->bss_conf.mt76.idx % > MT76_CONNAC_MAX_WMM_SETS; > > @@ -1182,6 +1184,12 @@ static void mt7921_ipv6_addr_change(struct ieee80211_hw > *hw, > struct inet6_dev *idev) > { > struct mt792x_vif *mvif = (struct mt792x_vif *)vif->drv_priv; > + printk(KERN_INFO "%s: mvif = %px\n", __func__, mvif); > + printk(KERN_INFO "%s: mvif->phy = %px\n", __func__, mvif->phy); > + if (!mvif->phy) { > + printk(KERN_INFO "%s: mvif->phy = NULL\n", __func__); > + return; > + } > struct mt792x_dev *dev = mvif->phy->dev; > struct inet6_ifaddr *ifa; > struct in6_addr ns_addrs[IEEE80211_BSS_ARP_ADDR_LIST_LEN]; > > And the result is this (the WARN in mt7921_add_interface did not trigger): > > [ 367.121740] [ T861] wlp4s0: deauthenticating from 54:67:51:3d:a2:d2 by > local choice (Reason: 3=DEAUTH_LEAVING) > [ 367.209603] [ T861] mt7921_ipv6_addr_change: mvif = ffff954e7500de40 > [ 367.209615] [ T861] mt7921_ipv6_addr_change: mvif->phy = 0000000000000000 > [ 367.209621] [ T861] mt7921_ipv6_addr_change: mvif->phy = NULL > [ 367.250026] [ T861] mt7921_ipv6_addr_change: mvif = ffff954e7500de40 > [ 367.250034] [ T861] mt7921_ipv6_addr_change: mvif->phy = ffff954e44427768 > [ 367.251537] [ T861] mt7921_ipv6_addr_change: mvif = ffff954e7500de40 > [ 367.251542] [ T861] mt7921_ipv6_addr_change: mvif->phy = ffff954e44427768 > [ 369.977123] [ T862] wlp4s0: authenticate with 54:67:51:3d:a2:d2 (local > address=c8:94:02:c1:bd:69) > [ 369.984864] [ T862] wlp4s0: send auth to 54:67:51:3d:a2:d2 (try 1/3) > [ 370.006199] [ T104] wlp4s0: authenticated > [ 370.006680] [ T104] wlp4s0: associate with 54:67:51:3d:a2:d2 (try 1/3) > [ 370.059080] [ T98] wlp4s0: RX AssocResp from 54:67:51:3d:a2:d2 > (capab=0x511 status=0 aid=2) > [ 370.064067] [ T98] wlp4s0: associated > > So mvif->phy can be NULL at the start of mt7921_ipv6_addr_change. The early > return in that case avoids the NULL pointer and mvif->phy has its usual value > again on the next call to mt7921_ipv6_addr_change so Wifi is working again. I > don't know how this could happen but perhaps you have an idea. This change should fix it: https://nbd.name/p/0747f54f Please test. Thanks, - Felix
Am Mittwoch, dem 17.07.2024 um 17:25 +0200 schrieb Felix Fietkau: > On 17.07.24 16:38, Bert Karwatzki wrote: > > Am Freitag, dem 12.07.2024 um 13:06 +0200 schrieb Bert Karwatzki: > > > Am Donnerstag, dem 11.07.2024 um 18:40 -0500 schrieb Sean Wang: > > > > Hi Bert, > > > > > > > > Thanks for the detailed debug log. I've quickly made a change to fix > > > > the issue. Right now, I can't access the test environment, but I'll > > > > test it and send it out as soon as possible. Here's the patch. > > > > > > > > diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c > > > > b/drivers/net/wireless/mediatek/mt76/mt7921/main.c > > > > index 2e6268cb06c0..1bab93d049df 100644 > > > > --- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c > > > > +++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c > > > > @@ -303,6 +303,7 @@ mt7921_add_interface(struct ieee80211_hw *hw, > > > > struct ieee80211_vif *vif) > > > > > > > > mvif->bss_conf.mt76.omac_idx = mvif->bss_conf.mt76.idx; > > > > mvif->phy = phy; > > > > + mvif->bss_conf.vif = mvif; > > > > mvif->bss_conf.mt76.band_idx = 0; > > > > mvif->bss_conf.mt76.wmm_idx = mvif->bss_conf.mt76.idx % > > > > MT76_CONNAC_MAX_WMM_SETS; > > > > > > > > > > I wrote earlier that this patch works fine with linux-next-20240711 and at first > > > it did, but then another NULL pointer error occured. I'm not sure if I can > > > bisect this as it does not trigger automatically it seems. Also I'm currently > > > bisecting the problem with linux-20240712 > > > > > > Bert Karwatzki > > > > Now the above mentioned NULL pointer dereference has happended again, this time > > on linux-next-20240716. It cann be triggered by repeatedly turning the Wifi off > > and on again. For further investigation I created this patch: > > > > diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c > > b/drivers/net/wireless/mediatek/mt76/mt7921/main.c > > index 2e6268cb06c0..3ecedf7bc9f3 100644 > > --- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c > > +++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c > > @@ -303,6 +303,8 @@ mt7921_add_interface(struct ieee80211_hw *hw, struct > > ieee80211_vif *vif) > > > > mvif->bss_conf.mt76.omac_idx = mvif->bss_conf.mt76.idx; > > mvif->phy = phy; > > + WARN(!phy, "%s: phy = NULL\n", __func__); > > + mvif->bss_conf.vif = mvif; > > mvif->bss_conf.mt76.band_idx = 0; > > mvif->bss_conf.mt76.wmm_idx = mvif->bss_conf.mt76.idx % > > MT76_CONNAC_MAX_WMM_SETS; > > > > @@ -1182,6 +1184,12 @@ static void mt7921_ipv6_addr_change(struct ieee80211_hw > > *hw, > > struct inet6_dev *idev) > > { > > struct mt792x_vif *mvif = (struct mt792x_vif *)vif->drv_priv; > > + printk(KERN_INFO "%s: mvif = %px\n", __func__, mvif); > > + printk(KERN_INFO "%s: mvif->phy = %px\n", __func__, mvif->phy); > > + if (!mvif->phy) { > > + printk(KERN_INFO "%s: mvif->phy = NULL\n", __func__); > > + return; > > + } > > struct mt792x_dev *dev = mvif->phy->dev; > > struct inet6_ifaddr *ifa; > > struct in6_addr ns_addrs[IEEE80211_BSS_ARP_ADDR_LIST_LEN]; > > > > And the result is this (the WARN in mt7921_add_interface did not trigger): > > > > [ 367.121740] [ T861] wlp4s0: deauthenticating from 54:67:51:3d:a2:d2 by > > local choice (Reason: 3=DEAUTH_LEAVING) > > [ 367.209603] [ T861] mt7921_ipv6_addr_change: mvif = ffff954e7500de40 > > [ 367.209615] [ T861] mt7921_ipv6_addr_change: mvif->phy = 0000000000000000 > > [ 367.209621] [ T861] mt7921_ipv6_addr_change: mvif->phy = NULL > > [ 367.250026] [ T861] mt7921_ipv6_addr_change: mvif = ffff954e7500de40 > > [ 367.250034] [ T861] mt7921_ipv6_addr_change: mvif->phy = ffff954e44427768 > > [ 367.251537] [ T861] mt7921_ipv6_addr_change: mvif = ffff954e7500de40 > > [ 367.251542] [ T861] mt7921_ipv6_addr_change: mvif->phy = ffff954e44427768 > > [ 369.977123] [ T862] wlp4s0: authenticate with 54:67:51:3d:a2:d2 (local > > address=c8:94:02:c1:bd:69) > > [ 369.984864] [ T862] wlp4s0: send auth to 54:67:51:3d:a2:d2 (try 1/3) > > [ 370.006199] [ T104] wlp4s0: authenticated > > [ 370.006680] [ T104] wlp4s0: associate with 54:67:51:3d:a2:d2 (try 1/3) > > [ 370.059080] [ T98] wlp4s0: RX AssocResp from 54:67:51:3d:a2:d2 > > (capab=0x511 status=0 aid=2) > > [ 370.064067] [ T98] wlp4s0: associated > > > > So mvif->phy can be NULL at the start of mt7921_ipv6_addr_change. The early > > return in that case avoids the NULL pointer and mvif->phy has its usual value > > again on the next call to mt7921_ipv6_addr_change so Wifi is working again. I > > don't know how this could happen but perhaps you have an idea. > > This change should fix it: https://nbd.name/p/0747f54f > Please test. > > Thanks, > > - Felix > Your fix works. (I added a WARN() statement on the early return, to see if mvif- >phy actually was NULL during testing). Bert Karwatzki
Am Mittwoch, dem 17.07.2024 um 19:05 +0200 schrieb Bert Karwatzki: > Am > > Your fix works. (I added a WARN() statement on the early return, to see if mvif- > > phy actually was NULL during testing). > > Bert Karwatzki > While your fix works there was still the question why the driver is suddenly forgetting mvif->phy? So I've been testing with this script: #!/bin/sh for i in $(seq 1 100); do nmcli radio wifi off nmcli radio wifi on # wait for wifi sleep 3 done together with this patch: diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c b/drivers/net/wireless/mediatek/mt76/mt7921/main.c index 4f30426afbb7..206f10473d92 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c +++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c @@ -1182,6 +1182,10 @@ static void mt7921_ipv6_addr_change(struct ieee80211_hw *hw, struct inet6_dev *idev) { struct mt792x_vif *mvif = (struct mt792x_vif *)vif->drv_priv; + if (!mvif->phy) { + WARN(1, "mvif->phy == NULL\n"); + return; + } struct mt792x_dev *dev = mvif->phy->dev; struct inet6_ifaddr *ifa; struct in6_addr ns_addrs[IEEE80211_BSS_ARP_ADDR_LIST_LEN]; On linux-6.10 the script can be run several times without triggering a warning while on linux-next-20240716 running the script will trigger several warnings. In the end the result of a bisection is this as the first commit to trigger the warning: commit 574e609c4e6a0843a9ed53de79e00da8fb3e7437 Author: Felix Fietkau <nbd@nbd.name> Date: Thu Jul 4 15:09:47 2024 +0200 wifi: mac80211: clear vif drv_priv after remove_interface when stopping Avoid reusing stale driver data when an interface is brought down and up again. In order to avoid having to duplicate the memset in every single driver, do it here. Signed-off-by: Felix Fietkau <nbd@nbd.name> Link: https://patch.msgid.link/20240704130947.48609-1-nbd@nbd.name Signed-off-by: Johannes Berg <johannes.berg@intel.com> diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 6d969d9f1ac9..97aee0a1a39a 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -689,8 +689,12 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do fallthrough; default: - if (going_down) - drv_remove_interface(local, sdata); + if (!going_down) + break; + drv_remove_interface(local, sdata); + + /* Clear private driver data to prevent reuse */ + memset(sdata->vif.drv_priv, 0, local->hw.vif_data_size); } ieee80211_recalc_ps(local); As this is in generic mac80211 code this could probably also affect other drivers and their ipv6_addr_change function. (which in turn could easily be fixed by an early exit when mvif->phy == NULL) Bert Karwatzki
I looked more close at the call trace of the warning and it seems that the problems occur when shutting down the interface: [ T847] Call Trace: [ T847] <TASK> [ T847] ? __warn+0x6a/0xc0 [ T847] ? mt7921_ipv6_addr_change+0x1d0/0x1f0 [mt7921_common] [ T847] ? report_bug+0x142/0x180 [ T847] ? handle_bug+0x3a/0x70 [ T847] ? exc_invalid_op+0x17/0x70 [ T847] ? asm_exc_invalid_op+0x1a/0x20 [ T847] ? mt7921_ipv6_addr_change+0x1d0/0x1f0 [mt7921_common] [ T847] ? srso_alias_return_thunk+0x5/0xfbef5 [ T847] ? __ipv6_ifa_notify+0x16f/0x4d0 [ T847] ? ieee80211_ifa6_changed+0x5e/0x70 [mac80211] [ T847] ? atomic_notifier_call_chain+0x51/0x80 [ T847] ? addrconf_ifdown.isra.0+0x43f/0x810 [ T847] ? srso_alias_return_thunk+0x5/0xfbef5 [ T847] ? addrconf_notify+0x15d/0x760 [ T847] ? __timer_delete_sync+0x70/0xd0 [ T847] ? raw_notifier_call_chain+0x43/0x60 [ T847] ? dev_close_many+0xea/0x160 [ T847] ? dev_close+0x65/0x80 [ T847] ? cfg80211_shutdown_all_interfaces+0x48/0xe0 [cfg80211] [ T847] ? cfg80211_rfkill_set_block+0x25/0x40 [cfg80211] [ T847] ? rfkill_set_block+0x8f/0x160 [rfkill] [ T847] ? rfkill_fop_write+0x14e/0x1e0 [rfkill] [ T847] ? vfs_write+0xf3/0x420 [ T847] ? srso_alias_return_thunk+0x5/0xfbef5 [ T847] ? ksys_write+0xae/0xe0 [ T847] ? do_syscall_64+0x5f/0x170 [ T847] ? entry_SYSCALL_64_after_hwframe+0x55/0x5d [ T847] </TASK> [ T847] ---[ end trace 0000000000000000 ]--- I think there's a race happening on shutdown between ipv6_addr_change (which uses mvif->phy) and ieee80211_do_stop (which zeros the private data including mvif->phy). Resend with fixed formatting. Bert Karwatzki
Am Mittwoch, dem 17.07.2024 um 17:25 +0200 schrieb Felix Fietkau: > On 17.07.24 16:38, Bert Karwatzki wrote: > > Am Freitag, dem 12.07.2024 um 13:06 +0200 schrieb Bert Karwatzki: > > > Am Donnerstag, dem 11.07.2024 um 18:40 -0500 schrieb Sean Wang: > > > > Hi Bert, > > > > > > > > Thanks for the detailed debug log. I've quickly made a change to fix > > > > the issue. Right now, I can't access the test environment, but I'll > > > > test it and send it out as soon as possible. Here's the patch. > > > > > > > > diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c > > > > b/drivers/net/wireless/mediatek/mt76/mt7921/main.c > > > > index 2e6268cb06c0..1bab93d049df 100644 > > > > --- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c > > > > +++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c > > > > @@ -303,6 +303,7 @@ mt7921_add_interface(struct ieee80211_hw *hw, > > > > struct ieee80211_vif *vif) > > > > > > > > mvif->bss_conf.mt76.omac_idx = mvif->bss_conf.mt76.idx; > > > > mvif->phy = phy; > > > > + mvif->bss_conf.vif = mvif; > > > > mvif->bss_conf.mt76.band_idx = 0; > > > > mvif->bss_conf.mt76.wmm_idx = mvif->bss_conf.mt76.idx % > > > > MT76_CONNAC_MAX_WMM_SETS; > > > > > > > > > > I wrote earlier that this patch works fine with linux-next-20240711 and at first > > > it did, but then another NULL pointer error occured. I'm not sure if I can > > > bisect this as it does not trigger automatically it seems. Also I'm currently > > > bisecting the problem with linux-20240712 > > > > > > Bert Karwatzki > > > > Now the above mentioned NULL pointer dereference has happended again, this time > > on linux-next-20240716. It cann be triggered by repeatedly turning the Wifi off > > and on again. For further investigation I created this patch: > > > > diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c > > b/drivers/net/wireless/mediatek/mt76/mt7921/main.c > > index 2e6268cb06c0..3ecedf7bc9f3 100644 > > --- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c > > +++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c > > @@ -303,6 +303,8 @@ mt7921_add_interface(struct ieee80211_hw *hw, struct > > ieee80211_vif *vif) > > > > mvif->bss_conf.mt76.omac_idx = mvif->bss_conf.mt76.idx; > > mvif->phy = phy; > > + WARN(!phy, "%s: phy = NULL\n", __func__); > > + mvif->bss_conf.vif = mvif; > > mvif->bss_conf.mt76.band_idx = 0; > > mvif->bss_conf.mt76.wmm_idx = mvif->bss_conf.mt76.idx % > > MT76_CONNAC_MAX_WMM_SETS; > > > > @@ -1182,6 +1184,12 @@ static void mt7921_ipv6_addr_change(struct ieee80211_hw > > *hw, > > struct inet6_dev *idev) > > { > > struct mt792x_vif *mvif = (struct mt792x_vif *)vif->drv_priv; > > + printk(KERN_INFO "%s: mvif = %px\n", __func__, mvif); > > + printk(KERN_INFO "%s: mvif->phy = %px\n", __func__, mvif->phy); > > + if (!mvif->phy) { > > + printk(KERN_INFO "%s: mvif->phy = NULL\n", __func__); > > + return; > > + } > > struct mt792x_dev *dev = mvif->phy->dev; > > struct inet6_ifaddr *ifa; > > struct in6_addr ns_addrs[IEEE80211_BSS_ARP_ADDR_LIST_LEN]; > > > > And the result is this (the WARN in mt7921_add_interface did not trigger): > > > > [ 367.121740] [ T861] wlp4s0: deauthenticating from 54:67:51:3d:a2:d2 by > > local choice (Reason: 3=DEAUTH_LEAVING) > > [ 367.209603] [ T861] mt7921_ipv6_addr_change: mvif = ffff954e7500de40 > > [ 367.209615] [ T861] mt7921_ipv6_addr_change: mvif->phy = 0000000000000000 > > [ 367.209621] [ T861] mt7921_ipv6_addr_change: mvif->phy = NULL > > [ 367.250026] [ T861] mt7921_ipv6_addr_change: mvif = ffff954e7500de40 > > [ 367.250034] [ T861] mt7921_ipv6_addr_change: mvif->phy = ffff954e44427768 > > [ 367.251537] [ T861] mt7921_ipv6_addr_change: mvif = ffff954e7500de40 > > [ 367.251542] [ T861] mt7921_ipv6_addr_change: mvif->phy = ffff954e44427768 > > [ 369.977123] [ T862] wlp4s0: authenticate with 54:67:51:3d:a2:d2 (local > > address=c8:94:02:c1:bd:69) > > [ 369.984864] [ T862] wlp4s0: send auth to 54:67:51:3d:a2:d2 (try 1/3) > > [ 370.006199] [ T104] wlp4s0: authenticated > > [ 370.006680] [ T104] wlp4s0: associate with 54:67:51:3d:a2:d2 (try 1/3) > > [ 370.059080] [ T98] wlp4s0: RX AssocResp from 54:67:51:3d:a2:d2 > > (capab=0x511 status=0 aid=2) > > [ 370.064067] [ T98] wlp4s0: associated > > > > So mvif->phy can be NULL at the start of mt7921_ipv6_addr_change. The early > > return in that case avoids the NULL pointer and mvif->phy has its usual value > > again on the next call to mt7921_ipv6_addr_change so Wifi is working again. I > > don't know how this could happen but perhaps you have an idea. > > This change should fix it: https://nbd.name/p/0747f54f > Please test. > > Thanks, > > - Felix > The BUG is still present in linux-6.11-rc1. Bert Karwatzki
Bert Karwatzki <spasswolf@web.de> writes: > Am Mittwoch, dem 17.07.2024 um 17:25 +0200 schrieb Felix Fietkau: > >> On 17.07.24 16:38, Bert Karwatzki wrote: >> >> > So mvif->phy can be NULL at the start of mt7921_ipv6_addr_change. The early >> > return in that case avoids the NULL pointer and mvif->phy has its usual value >> > again on the next call to mt7921_ipv6_addr_change so Wifi is working again. I >> > don't know how this could happen but perhaps you have an idea. >> >> This change should fix it: https://nbd.name/p/0747f54f >> Please test. > > The BUG is still present in linux-6.11-rc1. I'm not sure what's the status with this. There's one mt76 patch going to v6.11-rc2: https://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless.git/commit/?id=6557a28f3e3a54cff4f0dcdd1dfa649b26557ab3 But that looks to be a fix for a different problem, right? Felix, are you planning to submit that 0747f54f as a proper patch? I could then take it to wireless tree.
Am Mittwoch, dem 31.07.2024 um 11:51 +0300 schrieb Kalle Valo: > Bert Karwatzki <spasswolf@web.de> writes: > > > Am Mittwoch, dem 17.07.2024 um 17:25 +0200 schrieb Felix Fietkau: > > > > > On 17.07.24 16:38, Bert Karwatzki wrote: > > > > > > > So mvif->phy can be NULL at the start of mt7921_ipv6_addr_change. The early > > > > return in that case avoids the NULL pointer and mvif->phy has its usual value > > > > again on the next call to mt7921_ipv6_addr_change so Wifi is working again. I > > > > don't know how this could happen but perhaps you have an idea. > > > > > > This change should fix it: https://nbd.name/p/0747f54f > > > Please test. > > > > The BUG is still present in linux-6.11-rc1. > > I'm not sure what's the status with this. There's one mt76 patch going > to v6.11-rc2: > > https://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless.git/commit/?id=6557a28f3e3a54cff4f0dcdd1dfa649b26557ab3 > > But that looks to be a fix for a different problem, right? Felix, are > you planning to submit that 0747f54f as a proper patch? I could then > take it to wireless tree. > The Bug is still present in linux-6.11-rc2 and linux-next-20240806. Also the mvif->phy NULL check in the original patch is not neccessary (and feels a little out of place as mvif->phy is not needed anymore). This patch is sufficient to fix the NULL pointer dereference: diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c b/drivers/net/wireless/mediatek/mt76/mt7921/main.c index 1bab93d049df..23b228804289 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c +++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c @@ -1183,7 +1183,7 @@ static void mt7921_ipv6_addr_change(struct ieee80211_hw *hw, struct inet6_dev *idev) { struct mt792x_vif *mvif = (struct mt792x_vif *)vif->drv_priv; - struct mt792x_dev *dev = mvif->phy->dev; + struct mt792x_dev *dev = mt792x_hw_dev(hw); struct inet6_ifaddr *ifa; struct in6_addr ns_addrs[IEEE80211_BSS_ARP_ADDR_LIST_LEN]; struct sk_buff *skb; Bert Karwatzki
Am Dienstag, dem 06.08.2024 um 13:22 +0200 schrieb Bert Karwatzki: > Am Mittwoch, dem 31.07.2024 um 11:51 +0300 schrieb Kalle Valo: > > Bert Karwatzki <spasswolf@web.de> writes: > > > > > Am Mittwoch, dem 17.07.2024 um 17:25 +0200 schrieb Felix Fietkau: > > > > > > > On 17.07.24 16:38, Bert Karwatzki wrote: > > > > > > > > > So mvif->phy can be NULL at the start of mt7921_ipv6_addr_change. The early > > > > > return in that case avoids the NULL pointer and mvif->phy has its usual value > > > > > again on the next call to mt7921_ipv6_addr_change so Wifi is working again. I > > > > > don't know how this could happen but perhaps you have an idea. > > > > > > > > This change should fix it: https://nbd.name/p/0747f54f > > > > Please test. > > > > > > The BUG is still present in linux-6.11-rc1. > > > > I'm not sure what's the status with this. There's one mt76 patch going > > to v6.11-rc2: > > > > https://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless.git/commit/?id=6557a28f3e3a54cff4f0dcdd1dfa649b26557ab3 > > > > But that looks to be a fix for a different problem, right? Felix, are > > you planning to submit that 0747f54f as a proper patch? I could then > > take it to wireless tree. > > > The Bug is still present in linux-6.11-rc2 and linux-next-20240806. Also the > mvif->phy NULL check in the original patch is not neccessary (and feels a little > out of place as mvif->phy is not needed anymore). This patch is sufficient to > fix the NULL pointer dereference: > diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c > b/drivers/net/wireless/mediatek/mt76/mt7921/main.c > index 1bab93d049df..23b228804289 100644 > --- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c > +++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c > @@ -1183,7 +1183,7 @@ static void mt7921_ipv6_addr_change(struct ieee80211_hw > *hw, > struct inet6_dev *idev) > { > struct mt792x_vif *mvif = (struct mt792x_vif *)vif->drv_priv; > - struct mt792x_dev *dev = mvif->phy->dev; > + struct mt792x_dev *dev = mt792x_hw_dev(hw); > struct inet6_ifaddr *ifa; > struct in6_addr ns_addrs[IEEE80211_BSS_ARP_ADDR_LIST_LEN]; > struct sk_buff *skb; > > Bert Karwatzki This error is still present in v6.11-rc3. Bert Karwatzki
Bert Karwatzki <spasswolf@web.de> writes: > Am Dienstag, dem 06.08.2024 um 13:22 +0200 schrieb Bert Karwatzki: >> Am Mittwoch, dem 31.07.2024 um 11:51 +0300 schrieb Kalle Valo: >> > Bert Karwatzki <spasswolf@web.de> writes: >> > >> > > Am Mittwoch, dem 17.07.2024 um 17:25 +0200 schrieb Felix Fietkau: >> > > >> > > > On 17.07.24 16:38, Bert Karwatzki wrote: >> > > > >> > > > > So mvif->phy can be NULL at the start of mt7921_ipv6_addr_change. The early >> > > > > return in that case avoids the NULL pointer and mvif->phy >> > > > > has its usual value >> > > > > again on the next call to mt7921_ipv6_addr_change so Wifi is >> > > > > working again. I >> > > > > don't know how this could happen but perhaps you have an idea. >> > > > >> > > > This change should fix it: https://nbd.name/p/0747f54f >> > > > Please test. >> > > >> > > The BUG is still present in linux-6.11-rc1. >> > >> > I'm not sure what's the status with this. There's one mt76 patch going >> > to v6.11-rc2: >> > >> > https://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless.git/commit/?id=6557a28f3e3a54cff4f0dcdd1dfa649b26557ab3 >> > >> > But that looks to be a fix for a different problem, right? Felix, are >> > you planning to submit that 0747f54f as a proper patch? I could then >> > take it to wireless tree. >> > >> The Bug is still present in linux-6.11-rc2 and linux-next-20240806. Also the >> mvif->phy NULL check in the original patch is not neccessary (and feels a little >> out of place as mvif->phy is not needed anymore). This patch is sufficient to >> fix the NULL pointer dereference: >> diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c >> b/drivers/net/wireless/mediatek/mt76/mt7921/main.c >> index 1bab93d049df..23b228804289 100644 >> --- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c >> +++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c >> @@ -1183,7 +1183,7 @@ static void mt7921_ipv6_addr_change(struct ieee80211_hw >> *hw, >> struct inet6_dev *idev) >> { >> struct mt792x_vif *mvif = (struct mt792x_vif *)vif->drv_priv; >> - struct mt792x_dev *dev = mvif->phy->dev; >> + struct mt792x_dev *dev = mt792x_hw_dev(hw); >> struct inet6_ifaddr *ifa; >> struct in6_addr ns_addrs[IEEE80211_BSS_ARP_ADDR_LIST_LEN]; >> struct sk_buff *skb; >> >> Bert Karwatzki > > This error is still present in v6.11-rc3. Bert, can you send your fix as a proper patch? More information in the wiki below and please mark it for wireless tree.
diff --git a/drivers/net/wireless/mediatek/mt76/mt792x.h b/drivers/net/wireless/mediatek/mt76/mt792x.h index 69eb8dac0b70..c17195559b82 100644 --- a/drivers/net/wireless/mediatek/mt76/mt792x.h +++ b/drivers/net/wireless/mediatek/mt76/mt792x.h @@ -230,6 +230,7 @@ mt792x_vif_to_link(struct mt792x_vif *mvif, u8 link_id) struct ieee80211_vif *vif; vif = container_of((void *)mvif, struct ieee80211_vif, drv_priv); + printk(KERN_INFO "%s %d: vif = %px\n", __func__, __LINE__, vif); if (!ieee80211_vif_is_mld(vif) || link_id >= IEEE80211_LINK_UNSPECIFIED) @@ -259,6 +260,7 @@ mt792x_link_conf_to_mconf(struct ieee80211_bss_conf *link_conf) { struct ieee80211_vif *vif = link_conf->vif; struct mt792x_vif *mvif = (struct mt792x_vif *)vif->drv_priv; + printk(KERN_INFO "%s %d: vif = %px mvif = %px\n", __func__, __LINE__, vif, mvif); return mt792x_vif_to_link(mvif, link_conf->link_id); } diff --git a/drivers/net/wireless/mediatek/mt76/mt792x_core.c b/drivers/net/wireless/mediatek/mt76/mt792x_core.c index 813296fad0ed..ff627f5986bd 100644 --- a/drivers/net/wireless/mediatek/mt76/mt792x_core.c +++ b/drivers/net/wireless/mediatek/mt76/mt792x_core.c @@ -119,23 +119,34 @@ static void mt792x_mac_link_bss_remove(struct mt792x_dev *dev, { struct mt792x_bss_conf *mconf = mt792x_link_conf_to_mconf(link_conf); int idx = mlink->wcid.idx; + printk(KERN_INFO "%s %d\n", __func__, __LINE__); mt792x_mutex_acquire(dev); + printk(KERN_INFO "%s %d\n", __func__, __LINE__); mt76_connac_free_pending_tx_skbs(&dev->pm, &mlink->wcid); + printk(KERN_INFO "%s %d\n", __func__, __LINE__); mt76_connac_mcu_uni_add_dev(&dev->mphy, link_conf, &mlink->wcid, false); + printk(KERN_INFO "%s %d\n", __func__, __LINE__); rcu_assign_pointer(dev->mt76.wcid[idx], NULL); + printk(KERN_INFO "%s %d\n", __func__, __LINE__); dev->mt76.vif_mask &= ~BIT_ULL(mconf->mt76.idx); + printk(KERN_INFO "%s %d: mconf->vif = %px\n", __func__, __LINE__, mconf->vif); mconf->vif->phy->omac_mask &= ~BIT_ULL(mconf->mt76.omac_idx); + printk(KERN_INFO "%s %d\n", __func__, __LINE__); mt792x_mutex_release(dev); + printk(KERN_INFO "%s %d\n", __func__, __LINE__); spin_lock_bh(&dev->mt76.sta_poll_lock); + printk(KERN_INFO "%s %d\n", __func__, __LINE__); if (!list_empty(&mlink->wcid.poll_list)) list_del_init(&mlink->wcid.poll_list); spin_unlock_bh(&dev->mt76.sta_poll_lock); + printk(KERN_INFO "%s %d\n", __func__, __LINE__); mt76_wcid_cleanup(&dev->mt76, &mlink->wcid); + printk(KERN_INFO "%s %d\n", __func__, __LINE__); } void mt792x_remove_interface(struct ieee80211_hw *hw, @@ -143,6 +154,8 @@ void mt792x_remove_interface(struct ieee80211_hw *hw, { struct mt792x_vif *mvif = (struct mt792x_vif *)vif->drv_priv; struct mt792x_dev *dev = mt792x_hw_dev(hw); + printk(KERN_INFO "%s %d: hw = %px vif = %px mvif = %px dev = %px\n", + __func__, __LINE__, hw, vif, mvif, dev); mt792x_mac_link_bss_remove(dev, &vif->bss_conf, &mvif->sta.deflink); }