diff mbox series

[v3,8/9] mhi: pci_generic: Add health-check

Message ID 1606404547-10737-9-git-send-email-loic.poulain@linaro.org
State Superseded
Headers show
Series mhi: pci_generic: Misc improvements | expand

Commit Message

Loic Poulain Nov. 26, 2020, 3:29 p.m. UTC
If the modem crashes for any reason, we may not be able to detect
it at MHI level (MHI registers not reachable anymore).

This patch implements a health-check mechanism to check regularly
that device is alive (MHI layer can communicate with). If device
is not alive (because a crash or unexpected reset), the recovery
procedure is triggered.

Tested successfully with Telit FN980m module.

Signed-off-by: Loic Poulain <loic.poulain@linaro.org>

---
 drivers/bus/mhi/pci_generic.c | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

-- 
2.7.4

Comments

Manivannan Sadhasivam Nov. 28, 2020, 5:59 a.m. UTC | #1
On Thu, Nov 26, 2020 at 04:29:06PM +0100, Loic Poulain wrote:
> If the modem crashes for any reason, we may not be able to detect

> it at MHI level (MHI registers not reachable anymore).

> 

> This patch implements a health-check mechanism to check regularly

> that device is alive (MHI layer can communicate with). If device

> is not alive (because a crash or unexpected reset), the recovery

> procedure is triggered.

> 

> Tested successfully with Telit FN980m module.

> 

> Signed-off-by: Loic Poulain <loic.poulain@linaro.org>


Bhaumik, Hemant, does this looks reasonable to you? Or we can do a
better job in the MHI stack. To me this is not a specific usecase for
Telit.

If you guys plan to implement it later, I can just apply this patch in
the meantime as it looks good to me.

Reviewed-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>


Thanks,
Mani

> ---

>  drivers/bus/mhi/pci_generic.c | 34 ++++++++++++++++++++++++++++++++++

>  1 file changed, 34 insertions(+)

> 

> diff --git a/drivers/bus/mhi/pci_generic.c b/drivers/bus/mhi/pci_generic.c

> index 3ac5cd2..26c2dfa 100644

> --- a/drivers/bus/mhi/pci_generic.c

> +++ b/drivers/bus/mhi/pci_generic.c

> @@ -14,12 +14,15 @@

>  #include <linux/mhi.h>

>  #include <linux/module.h>

>  #include <linux/pci.h>

> +#include <linux/timer.h>

>  #include <linux/workqueue.h>

>  

>  #define MHI_PCI_DEFAULT_BAR_NUM 0

>  

>  #define DEV_RESET_REG (0xB0)

>  

> +#define HEALTH_CHECK_PERIOD (HZ * 5)

> +

>  /**

>   * struct mhi_pci_dev_info - MHI PCI device specific information

>   * @config: MHI controller configuration

> @@ -190,6 +193,7 @@ struct mhi_pci_device {

>  	struct mhi_controller mhi_cntrl;

>  	struct pci_saved_state *pci_state;

>  	struct work_struct recovery_work;

> +	struct timer_list health_check_timer;

>  	unsigned long status;

>  };

>  

> @@ -332,6 +336,8 @@ static void mhi_pci_recovery_work(struct work_struct *work)

>  

>  	dev_warn(&pdev->dev, "device recovery started\n");

>  

> +	del_timer(&mhi_pdev->health_check_timer);

> +

>  	/* Clean up MHI state */

>  	if (test_and_clear_bit(MHI_PCI_DEV_STARTED, &mhi_pdev->status)) {

>  		mhi_power_down(mhi_cntrl, false);

> @@ -355,6 +361,7 @@ static void mhi_pci_recovery_work(struct work_struct *work)

>  		goto err_unprepare;

>  

>  	set_bit(MHI_PCI_DEV_STARTED, &mhi_pdev->status);

> +	mod_timer(&mhi_pdev->health_check_timer, jiffies + HEALTH_CHECK_PERIOD);

>  	return;

>  

>  err_unprepare:

> @@ -364,6 +371,21 @@ static void mhi_pci_recovery_work(struct work_struct *work)

>  		dev_err(&pdev->dev, "Recovery failed\n");

>  }

>  

> +static void health_check(struct timer_list *t)

> +{

> +	struct mhi_pci_device *mhi_pdev = from_timer(mhi_pdev, t, health_check_timer);

> +	struct mhi_controller *mhi_cntrl = &mhi_pdev->mhi_cntrl;

> +

> +	if (!mhi_pci_is_alive(mhi_cntrl)) {

> +		dev_err(mhi_cntrl->cntrl_dev, "Device died\n");

> +		queue_work(system_long_wq, &mhi_pdev->recovery_work);

> +		return;

> +	}

> +

> +	/* reschedule in two seconds */

> +	mod_timer(&mhi_pdev->health_check_timer, jiffies + HEALTH_CHECK_PERIOD);

> +}

> +

>  static int mhi_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)

>  {

>  	const struct mhi_pci_dev_info *info = (struct mhi_pci_dev_info *) id->driver_data;

> @@ -379,6 +401,7 @@ static int mhi_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)

>  		return -ENOMEM;

>  

>  	INIT_WORK(&mhi_pdev->recovery_work, mhi_pci_recovery_work);

> +	timer_setup(&mhi_pdev->health_check_timer, health_check, 0);

>  

>  	mhi_cntrl_config = info->config;

>  	mhi_cntrl = &mhi_pdev->mhi_cntrl;

> @@ -431,6 +454,9 @@ static int mhi_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)

>  

>  	set_bit(MHI_PCI_DEV_STARTED, &mhi_pdev->status);

>  

> +	/* start health check */

> +	mod_timer(&mhi_pdev->health_check_timer, jiffies + HEALTH_CHECK_PERIOD);

> +

>  	return 0;

>  

>  err_unprepare:

> @@ -446,6 +472,7 @@ static void mhi_pci_remove(struct pci_dev *pdev)

>  	struct mhi_pci_device *mhi_pdev = pci_get_drvdata(pdev);

>  	struct mhi_controller *mhi_cntrl = &mhi_pdev->mhi_cntrl;

>  

> +	del_timer(&mhi_pdev->health_check_timer);

>  	cancel_work_sync(&mhi_pdev->recovery_work);

>  

>  	if (test_and_clear_bit(MHI_PCI_DEV_STARTED, &mhi_pdev->status)) {

> @@ -466,6 +493,8 @@ void mhi_pci_reset_prepare(struct pci_dev *pdev)

>  

>  	dev_info(&pdev->dev, "reset\n");

>  

> +	del_timer(&mhi_pdev->health_check_timer);

> +

>  	/* Clean up MHI state */

>  	if (test_and_clear_bit(MHI_PCI_DEV_STARTED, &mhi_pdev->status)) {

>  		mhi_power_down(mhi_cntrl, false);

> @@ -509,6 +538,7 @@ void mhi_pci_reset_done(struct pci_dev *pdev)

>  	}

>  

>  	set_bit(MHI_PCI_DEV_STARTED, &mhi_pdev->status);

> +	mod_timer(&mhi_pdev->health_check_timer, jiffies + HEALTH_CHECK_PERIOD);

>  }

>  

>  static pci_ers_result_t mhi_pci_error_detected(struct pci_dev *pdev,

> @@ -569,6 +599,7 @@ int  __maybe_unused mhi_pci_suspend(struct device *dev)

>  	struct mhi_pci_device *mhi_pdev = dev_get_drvdata(dev);

>  	struct mhi_controller *mhi_cntrl = &mhi_pdev->mhi_cntrl;

>  

> +	del_timer(&mhi_pdev->health_check_timer);

>  	cancel_work_sync(&mhi_pdev->recovery_work);

>  

>  	/* Transition to M3 state */

> @@ -604,6 +635,9 @@ static int __maybe_unused mhi_pci_resume(struct device *dev)

>  		goto err_recovery;

>  	}

>  

> +	/* Resume health check */

> +	mod_timer(&mhi_pdev->health_check_timer, jiffies + HEALTH_CHECK_PERIOD);

> +

>  	return 0;

>  

>  err_recovery:

> -- 

> 2.7.4

>
Hemant Kumar Dec. 1, 2020, 12:59 a.m. UTC | #2
Hi Mani,

On 11/27/20 9:59 PM, Manivannan Sadhasivam wrote:
> On Thu, Nov 26, 2020 at 04:29:06PM +0100, Loic Poulain wrote:

>> If the modem crashes for any reason, we may not be able to detect

>> it at MHI level (MHI registers not reachable anymore).

>>

>> This patch implements a health-check mechanism to check regularly

>> that device is alive (MHI layer can communicate with). If device

>> is not alive (because a crash or unexpected reset), the recovery

>> procedure is triggered.

>>

>> Tested successfully with Telit FN980m module.

>>

>> Signed-off-by: Loic Poulain <loic.poulain@linaro.org>

> 

> Bhaumik, Hemant, does this looks reasonable to you? Or we can do a

> better job in the MHI stack. To me this is not a specific usecase for

> Telit.


As far as i understood the change, MHI is unaware of this because this 
check is for underlying transport e.g. pci. This change looks good to 
me. Please apply this patch.

Thanks,
Hemant
> 

> If you guys plan to implement it later, I can just apply this patch in

> the meantime as it looks good to me.

> 

> Reviewed-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>

> 

> Thanks,

> Mani

> 

>> ---

>>   drivers/bus/mhi/pci_generic.c | 34 ++++++++++++++++++++++++++++++++++

>>   1 file changed, 34 insertions(+)

>>

>> diff --git a/drivers/bus/mhi/pci_generic.c b/drivers/bus/mhi/pci_generic.c

>> index 3ac5cd2..26c2dfa 100644

>> --- a/drivers/bus/mhi/pci_generic.c

>> +++ b/drivers/bus/mhi/pci_generic.c

>> @@ -14,12 +14,15 @@

>>   #include <linux/mhi.h>

>>   #include <linux/module.h>

>>   #include <linux/pci.h>

>> +#include <linux/timer.h>

>>   #include <linux/workqueue.h>

>>   

>>   #define MHI_PCI_DEFAULT_BAR_NUM 0

>>   

>>   #define DEV_RESET_REG (0xB0)

>>   

>> +#define HEALTH_CHECK_PERIOD (HZ * 5)

>> +

>>   /**

>>    * struct mhi_pci_dev_info - MHI PCI device specific information

>>    * @config: MHI controller configuration

>> @@ -190,6 +193,7 @@ struct mhi_pci_device {

>>   	struct mhi_controller mhi_cntrl;

>>   	struct pci_saved_state *pci_state;

>>   	struct work_struct recovery_work;

>> +	struct timer_list health_check_timer;

>>   	unsigned long status;

>>   };

>>   

>> @@ -332,6 +336,8 @@ static void mhi_pci_recovery_work(struct work_struct *work)

>>   

>>   	dev_warn(&pdev->dev, "device recovery started\n");

>>   

>> +	del_timer(&mhi_pdev->health_check_timer);

>> +

>>   	/* Clean up MHI state */

>>   	if (test_and_clear_bit(MHI_PCI_DEV_STARTED, &mhi_pdev->status)) {

>>   		mhi_power_down(mhi_cntrl, false);

>> @@ -355,6 +361,7 @@ static void mhi_pci_recovery_work(struct work_struct *work)

>>   		goto err_unprepare;

>>   

>>   	set_bit(MHI_PCI_DEV_STARTED, &mhi_pdev->status);

>> +	mod_timer(&mhi_pdev->health_check_timer, jiffies + HEALTH_CHECK_PERIOD);

>>   	return;

>>   

>>   err_unprepare:

>> @@ -364,6 +371,21 @@ static void mhi_pci_recovery_work(struct work_struct *work)

>>   		dev_err(&pdev->dev, "Recovery failed\n");

>>   }

>>   

>> +static void health_check(struct timer_list *t)

>> +{

>> +	struct mhi_pci_device *mhi_pdev = from_timer(mhi_pdev, t, health_check_timer);

>> +	struct mhi_controller *mhi_cntrl = &mhi_pdev->mhi_cntrl;

>> +

>> +	if (!mhi_pci_is_alive(mhi_cntrl)) {

>> +		dev_err(mhi_cntrl->cntrl_dev, "Device died\n");

>> +		queue_work(system_long_wq, &mhi_pdev->recovery_work);

>> +		return;

>> +	}

>> +

>> +	/* reschedule in two seconds */

>> +	mod_timer(&mhi_pdev->health_check_timer, jiffies + HEALTH_CHECK_PERIOD);

>> +}

>> +

>>   static int mhi_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)

>>   {

>>   	const struct mhi_pci_dev_info *info = (struct mhi_pci_dev_info *) id->driver_data;

>> @@ -379,6 +401,7 @@ static int mhi_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)

>>   		return -ENOMEM;

>>   

>>   	INIT_WORK(&mhi_pdev->recovery_work, mhi_pci_recovery_work);

>> +	timer_setup(&mhi_pdev->health_check_timer, health_check, 0);

>>   

>>   	mhi_cntrl_config = info->config;

>>   	mhi_cntrl = &mhi_pdev->mhi_cntrl;

>> @@ -431,6 +454,9 @@ static int mhi_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)

>>   

>>   	set_bit(MHI_PCI_DEV_STARTED, &mhi_pdev->status);

>>   

>> +	/* start health check */

>> +	mod_timer(&mhi_pdev->health_check_timer, jiffies + HEALTH_CHECK_PERIOD);

>> +

>>   	return 0;

>>   

>>   err_unprepare:

>> @@ -446,6 +472,7 @@ static void mhi_pci_remove(struct pci_dev *pdev)

>>   	struct mhi_pci_device *mhi_pdev = pci_get_drvdata(pdev);

>>   	struct mhi_controller *mhi_cntrl = &mhi_pdev->mhi_cntrl;

>>   

>> +	del_timer(&mhi_pdev->health_check_timer);

>>   	cancel_work_sync(&mhi_pdev->recovery_work);

>>   

>>   	if (test_and_clear_bit(MHI_PCI_DEV_STARTED, &mhi_pdev->status)) {

>> @@ -466,6 +493,8 @@ void mhi_pci_reset_prepare(struct pci_dev *pdev)

>>   

>>   	dev_info(&pdev->dev, "reset\n");

>>   

>> +	del_timer(&mhi_pdev->health_check_timer);

>> +

>>   	/* Clean up MHI state */

>>   	if (test_and_clear_bit(MHI_PCI_DEV_STARTED, &mhi_pdev->status)) {

>>   		mhi_power_down(mhi_cntrl, false);

>> @@ -509,6 +538,7 @@ void mhi_pci_reset_done(struct pci_dev *pdev)

>>   	}

>>   

>>   	set_bit(MHI_PCI_DEV_STARTED, &mhi_pdev->status);

>> +	mod_timer(&mhi_pdev->health_check_timer, jiffies + HEALTH_CHECK_PERIOD);

>>   }

>>   

>>   static pci_ers_result_t mhi_pci_error_detected(struct pci_dev *pdev,

>> @@ -569,6 +599,7 @@ int  __maybe_unused mhi_pci_suspend(struct device *dev)

>>   	struct mhi_pci_device *mhi_pdev = dev_get_drvdata(dev);

>>   	struct mhi_controller *mhi_cntrl = &mhi_pdev->mhi_cntrl;

>>   

>> +	del_timer(&mhi_pdev->health_check_timer);

>>   	cancel_work_sync(&mhi_pdev->recovery_work);

>>   

>>   	/* Transition to M3 state */

>> @@ -604,6 +635,9 @@ static int __maybe_unused mhi_pci_resume(struct device *dev)

>>   		goto err_recovery;

>>   	}

>>   

>> +	/* Resume health check */

>> +	mod_timer(&mhi_pdev->health_check_timer, jiffies + HEALTH_CHECK_PERIOD);

>> +

>>   	return 0;

>>   

>>   err_recovery:

>> -- 

>> 2.7.4

>>


-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project
Hemant Kumar Dec. 1, 2020, 1:02 a.m. UTC | #3
On 11/26/20 7:29 AM, Loic Poulain wrote:
> If the modem crashes for any reason, we may not be able to detect

> it at MHI level (MHI registers not reachable anymore).

> 

> This patch implements a health-check mechanism to check regularly

> that device is alive (MHI layer can communicate with). If device

> is not alive (because a crash or unexpected reset), the recovery

> procedure is triggered.

> 

> Tested successfully with Telit FN980m module.

> 

> Signed-off-by: Loic Poulain <loic.poulain@linaro.org>

> ---


Reviewed-by: Hemant Kumar <hemantk@codeaurora.org>

-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project
diff mbox series

Patch

diff --git a/drivers/bus/mhi/pci_generic.c b/drivers/bus/mhi/pci_generic.c
index 3ac5cd2..26c2dfa 100644
--- a/drivers/bus/mhi/pci_generic.c
+++ b/drivers/bus/mhi/pci_generic.c
@@ -14,12 +14,15 @@ 
 #include <linux/mhi.h>
 #include <linux/module.h>
 #include <linux/pci.h>
+#include <linux/timer.h>
 #include <linux/workqueue.h>
 
 #define MHI_PCI_DEFAULT_BAR_NUM 0
 
 #define DEV_RESET_REG (0xB0)
 
+#define HEALTH_CHECK_PERIOD (HZ * 5)
+
 /**
  * struct mhi_pci_dev_info - MHI PCI device specific information
  * @config: MHI controller configuration
@@ -190,6 +193,7 @@  struct mhi_pci_device {
 	struct mhi_controller mhi_cntrl;
 	struct pci_saved_state *pci_state;
 	struct work_struct recovery_work;
+	struct timer_list health_check_timer;
 	unsigned long status;
 };
 
@@ -332,6 +336,8 @@  static void mhi_pci_recovery_work(struct work_struct *work)
 
 	dev_warn(&pdev->dev, "device recovery started\n");
 
+	del_timer(&mhi_pdev->health_check_timer);
+
 	/* Clean up MHI state */
 	if (test_and_clear_bit(MHI_PCI_DEV_STARTED, &mhi_pdev->status)) {
 		mhi_power_down(mhi_cntrl, false);
@@ -355,6 +361,7 @@  static void mhi_pci_recovery_work(struct work_struct *work)
 		goto err_unprepare;
 
 	set_bit(MHI_PCI_DEV_STARTED, &mhi_pdev->status);
+	mod_timer(&mhi_pdev->health_check_timer, jiffies + HEALTH_CHECK_PERIOD);
 	return;
 
 err_unprepare:
@@ -364,6 +371,21 @@  static void mhi_pci_recovery_work(struct work_struct *work)
 		dev_err(&pdev->dev, "Recovery failed\n");
 }
 
+static void health_check(struct timer_list *t)
+{
+	struct mhi_pci_device *mhi_pdev = from_timer(mhi_pdev, t, health_check_timer);
+	struct mhi_controller *mhi_cntrl = &mhi_pdev->mhi_cntrl;
+
+	if (!mhi_pci_is_alive(mhi_cntrl)) {
+		dev_err(mhi_cntrl->cntrl_dev, "Device died\n");
+		queue_work(system_long_wq, &mhi_pdev->recovery_work);
+		return;
+	}
+
+	/* reschedule in two seconds */
+	mod_timer(&mhi_pdev->health_check_timer, jiffies + HEALTH_CHECK_PERIOD);
+}
+
 static int mhi_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 {
 	const struct mhi_pci_dev_info *info = (struct mhi_pci_dev_info *) id->driver_data;
@@ -379,6 +401,7 @@  static int mhi_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 		return -ENOMEM;
 
 	INIT_WORK(&mhi_pdev->recovery_work, mhi_pci_recovery_work);
+	timer_setup(&mhi_pdev->health_check_timer, health_check, 0);
 
 	mhi_cntrl_config = info->config;
 	mhi_cntrl = &mhi_pdev->mhi_cntrl;
@@ -431,6 +454,9 @@  static int mhi_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 
 	set_bit(MHI_PCI_DEV_STARTED, &mhi_pdev->status);
 
+	/* start health check */
+	mod_timer(&mhi_pdev->health_check_timer, jiffies + HEALTH_CHECK_PERIOD);
+
 	return 0;
 
 err_unprepare:
@@ -446,6 +472,7 @@  static void mhi_pci_remove(struct pci_dev *pdev)
 	struct mhi_pci_device *mhi_pdev = pci_get_drvdata(pdev);
 	struct mhi_controller *mhi_cntrl = &mhi_pdev->mhi_cntrl;
 
+	del_timer(&mhi_pdev->health_check_timer);
 	cancel_work_sync(&mhi_pdev->recovery_work);
 
 	if (test_and_clear_bit(MHI_PCI_DEV_STARTED, &mhi_pdev->status)) {
@@ -466,6 +493,8 @@  void mhi_pci_reset_prepare(struct pci_dev *pdev)
 
 	dev_info(&pdev->dev, "reset\n");
 
+	del_timer(&mhi_pdev->health_check_timer);
+
 	/* Clean up MHI state */
 	if (test_and_clear_bit(MHI_PCI_DEV_STARTED, &mhi_pdev->status)) {
 		mhi_power_down(mhi_cntrl, false);
@@ -509,6 +538,7 @@  void mhi_pci_reset_done(struct pci_dev *pdev)
 	}
 
 	set_bit(MHI_PCI_DEV_STARTED, &mhi_pdev->status);
+	mod_timer(&mhi_pdev->health_check_timer, jiffies + HEALTH_CHECK_PERIOD);
 }
 
 static pci_ers_result_t mhi_pci_error_detected(struct pci_dev *pdev,
@@ -569,6 +599,7 @@  int  __maybe_unused mhi_pci_suspend(struct device *dev)
 	struct mhi_pci_device *mhi_pdev = dev_get_drvdata(dev);
 	struct mhi_controller *mhi_cntrl = &mhi_pdev->mhi_cntrl;
 
+	del_timer(&mhi_pdev->health_check_timer);
 	cancel_work_sync(&mhi_pdev->recovery_work);
 
 	/* Transition to M3 state */
@@ -604,6 +635,9 @@  static int __maybe_unused mhi_pci_resume(struct device *dev)
 		goto err_recovery;
 	}
 
+	/* Resume health check */
+	mod_timer(&mhi_pdev->health_check_timer, jiffies + HEALTH_CHECK_PERIOD);
+
 	return 0;
 
 err_recovery: