Message ID | 1606404547-10737-9-git-send-email-loic.poulain@linaro.org |
---|---|
State | Superseded |
Headers | show |
Series | mhi: pci_generic: Misc improvements | expand |
On Thu, Nov 26, 2020 at 04:29:06PM +0100, Loic Poulain wrote: > If the modem crashes for any reason, we may not be able to detect > it at MHI level (MHI registers not reachable anymore). > > This patch implements a health-check mechanism to check regularly > that device is alive (MHI layer can communicate with). If device > is not alive (because a crash or unexpected reset), the recovery > procedure is triggered. > > Tested successfully with Telit FN980m module. > > Signed-off-by: Loic Poulain <loic.poulain@linaro.org> Bhaumik, Hemant, does this looks reasonable to you? Or we can do a better job in the MHI stack. To me this is not a specific usecase for Telit. If you guys plan to implement it later, I can just apply this patch in the meantime as it looks good to me. Reviewed-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org> Thanks, Mani > --- > drivers/bus/mhi/pci_generic.c | 34 ++++++++++++++++++++++++++++++++++ > 1 file changed, 34 insertions(+) > > diff --git a/drivers/bus/mhi/pci_generic.c b/drivers/bus/mhi/pci_generic.c > index 3ac5cd2..26c2dfa 100644 > --- a/drivers/bus/mhi/pci_generic.c > +++ b/drivers/bus/mhi/pci_generic.c > @@ -14,12 +14,15 @@ > #include <linux/mhi.h> > #include <linux/module.h> > #include <linux/pci.h> > +#include <linux/timer.h> > #include <linux/workqueue.h> > > #define MHI_PCI_DEFAULT_BAR_NUM 0 > > #define DEV_RESET_REG (0xB0) > > +#define HEALTH_CHECK_PERIOD (HZ * 5) > + > /** > * struct mhi_pci_dev_info - MHI PCI device specific information > * @config: MHI controller configuration > @@ -190,6 +193,7 @@ struct mhi_pci_device { > struct mhi_controller mhi_cntrl; > struct pci_saved_state *pci_state; > struct work_struct recovery_work; > + struct timer_list health_check_timer; > unsigned long status; > }; > > @@ -332,6 +336,8 @@ static void mhi_pci_recovery_work(struct work_struct *work) > > dev_warn(&pdev->dev, "device recovery started\n"); > > + del_timer(&mhi_pdev->health_check_timer); > + > /* Clean up MHI state */ > if (test_and_clear_bit(MHI_PCI_DEV_STARTED, &mhi_pdev->status)) { > mhi_power_down(mhi_cntrl, false); > @@ -355,6 +361,7 @@ static void mhi_pci_recovery_work(struct work_struct *work) > goto err_unprepare; > > set_bit(MHI_PCI_DEV_STARTED, &mhi_pdev->status); > + mod_timer(&mhi_pdev->health_check_timer, jiffies + HEALTH_CHECK_PERIOD); > return; > > err_unprepare: > @@ -364,6 +371,21 @@ static void mhi_pci_recovery_work(struct work_struct *work) > dev_err(&pdev->dev, "Recovery failed\n"); > } > > +static void health_check(struct timer_list *t) > +{ > + struct mhi_pci_device *mhi_pdev = from_timer(mhi_pdev, t, health_check_timer); > + struct mhi_controller *mhi_cntrl = &mhi_pdev->mhi_cntrl; > + > + if (!mhi_pci_is_alive(mhi_cntrl)) { > + dev_err(mhi_cntrl->cntrl_dev, "Device died\n"); > + queue_work(system_long_wq, &mhi_pdev->recovery_work); > + return; > + } > + > + /* reschedule in two seconds */ > + mod_timer(&mhi_pdev->health_check_timer, jiffies + HEALTH_CHECK_PERIOD); > +} > + > static int mhi_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) > { > const struct mhi_pci_dev_info *info = (struct mhi_pci_dev_info *) id->driver_data; > @@ -379,6 +401,7 @@ static int mhi_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) > return -ENOMEM; > > INIT_WORK(&mhi_pdev->recovery_work, mhi_pci_recovery_work); > + timer_setup(&mhi_pdev->health_check_timer, health_check, 0); > > mhi_cntrl_config = info->config; > mhi_cntrl = &mhi_pdev->mhi_cntrl; > @@ -431,6 +454,9 @@ static int mhi_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) > > set_bit(MHI_PCI_DEV_STARTED, &mhi_pdev->status); > > + /* start health check */ > + mod_timer(&mhi_pdev->health_check_timer, jiffies + HEALTH_CHECK_PERIOD); > + > return 0; > > err_unprepare: > @@ -446,6 +472,7 @@ static void mhi_pci_remove(struct pci_dev *pdev) > struct mhi_pci_device *mhi_pdev = pci_get_drvdata(pdev); > struct mhi_controller *mhi_cntrl = &mhi_pdev->mhi_cntrl; > > + del_timer(&mhi_pdev->health_check_timer); > cancel_work_sync(&mhi_pdev->recovery_work); > > if (test_and_clear_bit(MHI_PCI_DEV_STARTED, &mhi_pdev->status)) { > @@ -466,6 +493,8 @@ void mhi_pci_reset_prepare(struct pci_dev *pdev) > > dev_info(&pdev->dev, "reset\n"); > > + del_timer(&mhi_pdev->health_check_timer); > + > /* Clean up MHI state */ > if (test_and_clear_bit(MHI_PCI_DEV_STARTED, &mhi_pdev->status)) { > mhi_power_down(mhi_cntrl, false); > @@ -509,6 +538,7 @@ void mhi_pci_reset_done(struct pci_dev *pdev) > } > > set_bit(MHI_PCI_DEV_STARTED, &mhi_pdev->status); > + mod_timer(&mhi_pdev->health_check_timer, jiffies + HEALTH_CHECK_PERIOD); > } > > static pci_ers_result_t mhi_pci_error_detected(struct pci_dev *pdev, > @@ -569,6 +599,7 @@ int __maybe_unused mhi_pci_suspend(struct device *dev) > struct mhi_pci_device *mhi_pdev = dev_get_drvdata(dev); > struct mhi_controller *mhi_cntrl = &mhi_pdev->mhi_cntrl; > > + del_timer(&mhi_pdev->health_check_timer); > cancel_work_sync(&mhi_pdev->recovery_work); > > /* Transition to M3 state */ > @@ -604,6 +635,9 @@ static int __maybe_unused mhi_pci_resume(struct device *dev) > goto err_recovery; > } > > + /* Resume health check */ > + mod_timer(&mhi_pdev->health_check_timer, jiffies + HEALTH_CHECK_PERIOD); > + > return 0; > > err_recovery: > -- > 2.7.4 >
Hi Mani, On 11/27/20 9:59 PM, Manivannan Sadhasivam wrote: > On Thu, Nov 26, 2020 at 04:29:06PM +0100, Loic Poulain wrote: >> If the modem crashes for any reason, we may not be able to detect >> it at MHI level (MHI registers not reachable anymore). >> >> This patch implements a health-check mechanism to check regularly >> that device is alive (MHI layer can communicate with). If device >> is not alive (because a crash or unexpected reset), the recovery >> procedure is triggered. >> >> Tested successfully with Telit FN980m module. >> >> Signed-off-by: Loic Poulain <loic.poulain@linaro.org> > > Bhaumik, Hemant, does this looks reasonable to you? Or we can do a > better job in the MHI stack. To me this is not a specific usecase for > Telit. As far as i understood the change, MHI is unaware of this because this check is for underlying transport e.g. pci. This change looks good to me. Please apply this patch. Thanks, Hemant > > If you guys plan to implement it later, I can just apply this patch in > the meantime as it looks good to me. > > Reviewed-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org> > > Thanks, > Mani > >> --- >> drivers/bus/mhi/pci_generic.c | 34 ++++++++++++++++++++++++++++++++++ >> 1 file changed, 34 insertions(+) >> >> diff --git a/drivers/bus/mhi/pci_generic.c b/drivers/bus/mhi/pci_generic.c >> index 3ac5cd2..26c2dfa 100644 >> --- a/drivers/bus/mhi/pci_generic.c >> +++ b/drivers/bus/mhi/pci_generic.c >> @@ -14,12 +14,15 @@ >> #include <linux/mhi.h> >> #include <linux/module.h> >> #include <linux/pci.h> >> +#include <linux/timer.h> >> #include <linux/workqueue.h> >> >> #define MHI_PCI_DEFAULT_BAR_NUM 0 >> >> #define DEV_RESET_REG (0xB0) >> >> +#define HEALTH_CHECK_PERIOD (HZ * 5) >> + >> /** >> * struct mhi_pci_dev_info - MHI PCI device specific information >> * @config: MHI controller configuration >> @@ -190,6 +193,7 @@ struct mhi_pci_device { >> struct mhi_controller mhi_cntrl; >> struct pci_saved_state *pci_state; >> struct work_struct recovery_work; >> + struct timer_list health_check_timer; >> unsigned long status; >> }; >> >> @@ -332,6 +336,8 @@ static void mhi_pci_recovery_work(struct work_struct *work) >> >> dev_warn(&pdev->dev, "device recovery started\n"); >> >> + del_timer(&mhi_pdev->health_check_timer); >> + >> /* Clean up MHI state */ >> if (test_and_clear_bit(MHI_PCI_DEV_STARTED, &mhi_pdev->status)) { >> mhi_power_down(mhi_cntrl, false); >> @@ -355,6 +361,7 @@ static void mhi_pci_recovery_work(struct work_struct *work) >> goto err_unprepare; >> >> set_bit(MHI_PCI_DEV_STARTED, &mhi_pdev->status); >> + mod_timer(&mhi_pdev->health_check_timer, jiffies + HEALTH_CHECK_PERIOD); >> return; >> >> err_unprepare: >> @@ -364,6 +371,21 @@ static void mhi_pci_recovery_work(struct work_struct *work) >> dev_err(&pdev->dev, "Recovery failed\n"); >> } >> >> +static void health_check(struct timer_list *t) >> +{ >> + struct mhi_pci_device *mhi_pdev = from_timer(mhi_pdev, t, health_check_timer); >> + struct mhi_controller *mhi_cntrl = &mhi_pdev->mhi_cntrl; >> + >> + if (!mhi_pci_is_alive(mhi_cntrl)) { >> + dev_err(mhi_cntrl->cntrl_dev, "Device died\n"); >> + queue_work(system_long_wq, &mhi_pdev->recovery_work); >> + return; >> + } >> + >> + /* reschedule in two seconds */ >> + mod_timer(&mhi_pdev->health_check_timer, jiffies + HEALTH_CHECK_PERIOD); >> +} >> + >> static int mhi_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) >> { >> const struct mhi_pci_dev_info *info = (struct mhi_pci_dev_info *) id->driver_data; >> @@ -379,6 +401,7 @@ static int mhi_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) >> return -ENOMEM; >> >> INIT_WORK(&mhi_pdev->recovery_work, mhi_pci_recovery_work); >> + timer_setup(&mhi_pdev->health_check_timer, health_check, 0); >> >> mhi_cntrl_config = info->config; >> mhi_cntrl = &mhi_pdev->mhi_cntrl; >> @@ -431,6 +454,9 @@ static int mhi_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) >> >> set_bit(MHI_PCI_DEV_STARTED, &mhi_pdev->status); >> >> + /* start health check */ >> + mod_timer(&mhi_pdev->health_check_timer, jiffies + HEALTH_CHECK_PERIOD); >> + >> return 0; >> >> err_unprepare: >> @@ -446,6 +472,7 @@ static void mhi_pci_remove(struct pci_dev *pdev) >> struct mhi_pci_device *mhi_pdev = pci_get_drvdata(pdev); >> struct mhi_controller *mhi_cntrl = &mhi_pdev->mhi_cntrl; >> >> + del_timer(&mhi_pdev->health_check_timer); >> cancel_work_sync(&mhi_pdev->recovery_work); >> >> if (test_and_clear_bit(MHI_PCI_DEV_STARTED, &mhi_pdev->status)) { >> @@ -466,6 +493,8 @@ void mhi_pci_reset_prepare(struct pci_dev *pdev) >> >> dev_info(&pdev->dev, "reset\n"); >> >> + del_timer(&mhi_pdev->health_check_timer); >> + >> /* Clean up MHI state */ >> if (test_and_clear_bit(MHI_PCI_DEV_STARTED, &mhi_pdev->status)) { >> mhi_power_down(mhi_cntrl, false); >> @@ -509,6 +538,7 @@ void mhi_pci_reset_done(struct pci_dev *pdev) >> } >> >> set_bit(MHI_PCI_DEV_STARTED, &mhi_pdev->status); >> + mod_timer(&mhi_pdev->health_check_timer, jiffies + HEALTH_CHECK_PERIOD); >> } >> >> static pci_ers_result_t mhi_pci_error_detected(struct pci_dev *pdev, >> @@ -569,6 +599,7 @@ int __maybe_unused mhi_pci_suspend(struct device *dev) >> struct mhi_pci_device *mhi_pdev = dev_get_drvdata(dev); >> struct mhi_controller *mhi_cntrl = &mhi_pdev->mhi_cntrl; >> >> + del_timer(&mhi_pdev->health_check_timer); >> cancel_work_sync(&mhi_pdev->recovery_work); >> >> /* Transition to M3 state */ >> @@ -604,6 +635,9 @@ static int __maybe_unused mhi_pci_resume(struct device *dev) >> goto err_recovery; >> } >> >> + /* Resume health check */ >> + mod_timer(&mhi_pdev->health_check_timer, jiffies + HEALTH_CHECK_PERIOD); >> + >> return 0; >> >> err_recovery: >> -- >> 2.7.4 >> -- The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum, a Linux Foundation Collaborative Project
On 11/26/20 7:29 AM, Loic Poulain wrote: > If the modem crashes for any reason, we may not be able to detect > it at MHI level (MHI registers not reachable anymore). > > This patch implements a health-check mechanism to check regularly > that device is alive (MHI layer can communicate with). If device > is not alive (because a crash or unexpected reset), the recovery > procedure is triggered. > > Tested successfully with Telit FN980m module. > > Signed-off-by: Loic Poulain <loic.poulain@linaro.org> > --- Reviewed-by: Hemant Kumar <hemantk@codeaurora.org> -- The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum, a Linux Foundation Collaborative Project
diff --git a/drivers/bus/mhi/pci_generic.c b/drivers/bus/mhi/pci_generic.c index 3ac5cd2..26c2dfa 100644 --- a/drivers/bus/mhi/pci_generic.c +++ b/drivers/bus/mhi/pci_generic.c @@ -14,12 +14,15 @@ #include <linux/mhi.h> #include <linux/module.h> #include <linux/pci.h> +#include <linux/timer.h> #include <linux/workqueue.h> #define MHI_PCI_DEFAULT_BAR_NUM 0 #define DEV_RESET_REG (0xB0) +#define HEALTH_CHECK_PERIOD (HZ * 5) + /** * struct mhi_pci_dev_info - MHI PCI device specific information * @config: MHI controller configuration @@ -190,6 +193,7 @@ struct mhi_pci_device { struct mhi_controller mhi_cntrl; struct pci_saved_state *pci_state; struct work_struct recovery_work; + struct timer_list health_check_timer; unsigned long status; }; @@ -332,6 +336,8 @@ static void mhi_pci_recovery_work(struct work_struct *work) dev_warn(&pdev->dev, "device recovery started\n"); + del_timer(&mhi_pdev->health_check_timer); + /* Clean up MHI state */ if (test_and_clear_bit(MHI_PCI_DEV_STARTED, &mhi_pdev->status)) { mhi_power_down(mhi_cntrl, false); @@ -355,6 +361,7 @@ static void mhi_pci_recovery_work(struct work_struct *work) goto err_unprepare; set_bit(MHI_PCI_DEV_STARTED, &mhi_pdev->status); + mod_timer(&mhi_pdev->health_check_timer, jiffies + HEALTH_CHECK_PERIOD); return; err_unprepare: @@ -364,6 +371,21 @@ static void mhi_pci_recovery_work(struct work_struct *work) dev_err(&pdev->dev, "Recovery failed\n"); } +static void health_check(struct timer_list *t) +{ + struct mhi_pci_device *mhi_pdev = from_timer(mhi_pdev, t, health_check_timer); + struct mhi_controller *mhi_cntrl = &mhi_pdev->mhi_cntrl; + + if (!mhi_pci_is_alive(mhi_cntrl)) { + dev_err(mhi_cntrl->cntrl_dev, "Device died\n"); + queue_work(system_long_wq, &mhi_pdev->recovery_work); + return; + } + + /* reschedule in two seconds */ + mod_timer(&mhi_pdev->health_check_timer, jiffies + HEALTH_CHECK_PERIOD); +} + static int mhi_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { const struct mhi_pci_dev_info *info = (struct mhi_pci_dev_info *) id->driver_data; @@ -379,6 +401,7 @@ static int mhi_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) return -ENOMEM; INIT_WORK(&mhi_pdev->recovery_work, mhi_pci_recovery_work); + timer_setup(&mhi_pdev->health_check_timer, health_check, 0); mhi_cntrl_config = info->config; mhi_cntrl = &mhi_pdev->mhi_cntrl; @@ -431,6 +454,9 @@ static int mhi_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) set_bit(MHI_PCI_DEV_STARTED, &mhi_pdev->status); + /* start health check */ + mod_timer(&mhi_pdev->health_check_timer, jiffies + HEALTH_CHECK_PERIOD); + return 0; err_unprepare: @@ -446,6 +472,7 @@ static void mhi_pci_remove(struct pci_dev *pdev) struct mhi_pci_device *mhi_pdev = pci_get_drvdata(pdev); struct mhi_controller *mhi_cntrl = &mhi_pdev->mhi_cntrl; + del_timer(&mhi_pdev->health_check_timer); cancel_work_sync(&mhi_pdev->recovery_work); if (test_and_clear_bit(MHI_PCI_DEV_STARTED, &mhi_pdev->status)) { @@ -466,6 +493,8 @@ void mhi_pci_reset_prepare(struct pci_dev *pdev) dev_info(&pdev->dev, "reset\n"); + del_timer(&mhi_pdev->health_check_timer); + /* Clean up MHI state */ if (test_and_clear_bit(MHI_PCI_DEV_STARTED, &mhi_pdev->status)) { mhi_power_down(mhi_cntrl, false); @@ -509,6 +538,7 @@ void mhi_pci_reset_done(struct pci_dev *pdev) } set_bit(MHI_PCI_DEV_STARTED, &mhi_pdev->status); + mod_timer(&mhi_pdev->health_check_timer, jiffies + HEALTH_CHECK_PERIOD); } static pci_ers_result_t mhi_pci_error_detected(struct pci_dev *pdev, @@ -569,6 +599,7 @@ int __maybe_unused mhi_pci_suspend(struct device *dev) struct mhi_pci_device *mhi_pdev = dev_get_drvdata(dev); struct mhi_controller *mhi_cntrl = &mhi_pdev->mhi_cntrl; + del_timer(&mhi_pdev->health_check_timer); cancel_work_sync(&mhi_pdev->recovery_work); /* Transition to M3 state */ @@ -604,6 +635,9 @@ static int __maybe_unused mhi_pci_resume(struct device *dev) goto err_recovery; } + /* Resume health check */ + mod_timer(&mhi_pdev->health_check_timer, jiffies + HEALTH_CHECK_PERIOD); + return 0; err_recovery:
If the modem crashes for any reason, we may not be able to detect it at MHI level (MHI registers not reachable anymore). This patch implements a health-check mechanism to check regularly that device is alive (MHI layer can communicate with). If device is not alive (because a crash or unexpected reset), the recovery procedure is triggered. Tested successfully with Telit FN980m module. Signed-off-by: Loic Poulain <loic.poulain@linaro.org> --- drivers/bus/mhi/pci_generic.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) -- 2.7.4