[v3,4/8] drm/msm: Fix cx collapse issue during recovery

Message ID: 20220730150952.v3.4.I4ac27a0b34ea796ce0f938bb509e257516bc6f57@changeid
State: New
Series: Improve GPU Recovery

Commit Message

Akhil P Oommen July 30, 2022, 9:40 a.m. UTC
There is some hardware logic under the CX domain. For a successful
recovery, we should ensure that the cx headswitch collapses so that all
stale states are cleared out. This is especially true for the a6xx family,
where we have a GMU co-processor.

Currently, cx doesn't collapse due to a devlink between gpu and its
smmu. So the *struct gpu device* needs to be runtime suspended to ensure
that the iommu driver removes its vote on cx gdsc.

Signed-off-by: Akhil P Oommen <quic_akhilpo@quicinc.com>
---

Changes in v3:
- Simplified the pm refcount drop since we have just a single refcount now
for all active submits

 drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 24 +++++++++++++++++++++---
 drivers/gpu/drm/msm/msm_gpu.c         |  4 +---
 2 files changed, 22 insertions(+), 6 deletions(-)
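
For background on why a full runtime suspend of the GPU device is needed: the
SMMU driver sets up a runtime-PM device link with the GPU as the consumer and
the SMMU as the supplier, so the SMMU (and with it the vote on the cx gdsc)
stays powered for as long as the GPU device is runtime active. The sketch
below is illustrative only; the helper name is made up and this is not the
actual arm-smmu code.

#include <linux/device.h>

/*
 * Illustrative sketch: a runtime-PM device link between the GPU (consumer)
 * and its SMMU (supplier).  While the GPU is runtime active, the link keeps
 * the SMMU active, and the SMMU keeps its vote on the cx gdsc, so only a
 * runtime suspend of the GPU device lets cx collapse.
 */
static int example_link_gpu_to_smmu(struct device *gpu_dev,
				    struct device *smmu_dev)
{
	if (!device_link_add(gpu_dev, smmu_dev,
			     DL_FLAG_PM_RUNTIME | DL_FLAG_AUTOREMOVE_SUPPLIER))
		return -ENODEV;

	return 0;
}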

Comments

Rob Clark July 31, 2022, 4:22 p.m. UTC | #1
On Sat, Jul 30, 2022 at 2:41 AM Akhil P Oommen <quic_akhilpo@quicinc.com> wrote:
>
> There is some hardware logic under the CX domain. For a successful
> recovery, we should ensure that the cx headswitch collapses so that all
> stale states are cleared out. This is especially true for the a6xx family,
> where we have a GMU co-processor.
>
> Currently, cx doesn't collapse due to a devlink between gpu and its
> smmu. So the *struct gpu device* needs to be runtime suspended to ensure
> that the iommu driver removes its vote on cx gdsc.
>
> Signed-off-by: Akhil P Oommen <quic_akhilpo@quicinc.com>
> ---
>
> Changes in v3:
> - Simplified the pm refcount drop since we have just a single refcount now
> for all active submits
>
>  drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 24 +++++++++++++++++++++---
>  drivers/gpu/drm/msm/msm_gpu.c         |  4 +---
>  2 files changed, 22 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> index 42ed9a3..1b049c5 100644
> --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> @@ -1193,7 +1193,7 @@ static void a6xx_recover(struct msm_gpu *gpu)
>  {
>         struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
>         struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
> -       int i;
> +       int i, active_submits;
>
>         adreno_dump_info(gpu);
>
> @@ -1210,8 +1210,26 @@ static void a6xx_recover(struct msm_gpu *gpu)
>          */
>         gmu_write(&a6xx_gpu->gmu, REG_A6XX_GMU_GMU_PWR_COL_KEEPALIVE, 0);
>
> -       gpu->funcs->pm_suspend(gpu);
> -       gpu->funcs->pm_resume(gpu);
> +       pm_runtime_dont_use_autosuspend(&gpu->pdev->dev);
> +
> +       /* active_submit won't change until we make a submission */
> +       mutex_lock(&gpu->active_lock);
> +       active_submits = gpu->active_submits;
> +       mutex_unlock(&gpu->active_lock);
> +
> +       /* Drop the rpm refcount from active submits */
> +       if (active_submits)
> +               pm_runtime_put(&gpu->pdev->dev);

Couldn't this race with an incoming submit triggering active_submits
to transition 0 -> 1?  Moving the mutex_unlock() would solve this.

Actually, maybe just move the mutex_unlock() to the end of the entire
sequence.  You could also clear gpu->active_submits and restore it
before unlocking, so you can drop the removal of the WARN_ON_ONCE
(patch 6/8) which should otherwise be squashed into this patch to keep
things bisectable
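
Something along these lines, perhaps (untested, purely to illustrate the
suggestion; the same a6xx_recover() fragment, but with active_lock held
across the whole sequence and active_submits stashed and restored):

	pm_runtime_dont_use_autosuspend(&gpu->pdev->dev);

	/* Hold active_lock for the whole sequence so a submit cannot race */
	mutex_lock(&gpu->active_lock);
	active_submits = gpu->active_submits;
	/* Temporarily report zero active submits to the suspend path */
	gpu->active_submits = 0;

	/* Drop the rpm refcount from active submits */
	if (active_submits)
		pm_runtime_put(&gpu->pdev->dev);

	/* And the final one from the recover worker */
	pm_runtime_put_sync(&gpu->pdev->dev);

	pm_runtime_use_autosuspend(&gpu->pdev->dev);

	if (active_submits)
		pm_runtime_get(&gpu->pdev->dev);

	pm_runtime_get_sync(&gpu->pdev->dev);

	/* Restore the count before letting new submits see it */
	gpu->active_submits = active_submits;
	mutex_unlock(&gpu->active_lock);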

> +
> +       /* And the final one from recover worker */
> +       pm_runtime_put_sync(&gpu->pdev->dev);
> +
> +       pm_runtime_use_autosuspend(&gpu->pdev->dev);
> +
> +       if (active_submits)
> +               pm_runtime_get(&gpu->pdev->dev);
> +
> +       pm_runtime_get_sync(&gpu->pdev->dev);
>
>         msm_gpu_hw_init(gpu);
>  }
> diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
> index 1945efb..07e55a6 100644
> --- a/drivers/gpu/drm/msm/msm_gpu.c
> +++ b/drivers/gpu/drm/msm/msm_gpu.c
> @@ -426,9 +426,7 @@ static void recover_worker(struct kthread_work *work)
>                 /* retire completed submits, plus the one that hung: */
>                 retire_submits(gpu);
>
> -               pm_runtime_get_sync(&gpu->pdev->dev);
>                 gpu->funcs->recover(gpu);
> -               pm_runtime_put_sync(&gpu->pdev->dev);

Hmm, could this have some fallout on earlier gens?

I guess I should extend the igt msm_recovery test to run on things
prior to a6xx..

BR,
-R

>
>                 /*
>                  * Replay all remaining submits starting with highest priority
> @@ -445,7 +443,7 @@ static void recover_worker(struct kthread_work *work)
>                 }
>         }
>
> -       pm_runtime_put_sync(&gpu->pdev->dev);
> +       pm_runtime_put(&gpu->pdev->dev);
>
>         mutex_unlock(&gpu->lock);
>
> --
> 2.7.4
>
Akhil P Oommen Aug. 1, 2022, 3:10 p.m. UTC | #2
On 7/31/2022 9:52 PM, Rob Clark wrote:
> On Sat, Jul 30, 2022 at 2:41 AM Akhil P Oommen <quic_akhilpo@quicinc.com> wrote:
>> There is some hardware logic under the CX domain. For a successful
>> recovery, we should ensure that the cx headswitch collapses so that all
>> stale states are cleared out. This is especially true for the a6xx family,
>> where we have a GMU co-processor.
>>
>> Currently, cx doesn't collapse due to a devlink between gpu and its
>> smmu. So the *struct gpu device* needs to be runtime suspended to ensure
>> that the iommu driver removes its vote on cx gdsc.
>>
>> Signed-off-by: Akhil P Oommen <quic_akhilpo@quicinc.com>
>> ---
>>
>> Changes in v3:
>> - Simplified the pm refcount drop since we have just a single refcount now
>> for all active submits
>>
>>   drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 24 +++++++++++++++++++++---
>>   drivers/gpu/drm/msm/msm_gpu.c         |  4 +---
>>   2 files changed, 22 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
>> index 42ed9a3..1b049c5 100644
>> --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
>> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
>> @@ -1193,7 +1193,7 @@ static void a6xx_recover(struct msm_gpu *gpu)
>>   {
>>          struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
>>          struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
>> -       int i;
>> +       int i, active_submits;
>>
>>          adreno_dump_info(gpu);
>>
>> @@ -1210,8 +1210,26 @@ static void a6xx_recover(struct msm_gpu *gpu)
>>           */
>>          gmu_write(&a6xx_gpu->gmu, REG_A6XX_GMU_GMU_PWR_COL_KEEPALIVE, 0);
>>
>> -       gpu->funcs->pm_suspend(gpu);
>> -       gpu->funcs->pm_resume(gpu);
>> +       pm_runtime_dont_use_autosuspend(&gpu->pdev->dev);
>> +
>> +       /* active_submit won't change until we make a submission */
>> +       mutex_lock(&gpu->active_lock);
>> +       active_submits = gpu->active_submits;
>> +       mutex_unlock(&gpu->active_lock);
>> +
>> +       /* Drop the rpm refcount from active submits */
>> +       if (active_submits)
>> +               pm_runtime_put(&gpu->pdev->dev);
> Couldn't this race with an incoming submit triggering active_submits
> to transition 0 -> 1?  Moving the mutex_unlock() would solve this.
>
> Actually, maybe just move the mutex_unlock() to the end of the entire
> sequence.  You could also clear gpu->active_submits and restore it
> before unlocking, so you can drop the removal of the WARN_ON_ONCE
> (patch 6/8) which should otherwise be squashed into this patch to keep
> things bisectable
Because we are holding gpu->lock, there won't be any new submissions to the
gpu. But I agree with both of your suggestions.
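
Roughly, the serialization looks like this (illustrative pseudo-code, not the
exact driver code):

	/* submit path: always runs with gpu->lock held */
	mutex_lock(&gpu->lock);
	mutex_lock(&gpu->active_lock);
	gpu->active_submits++;		/* 0 -> 1 transition happens here */
	mutex_unlock(&gpu->active_lock);
	/* ... queue the submit to the ringbuffer ... */
	mutex_unlock(&gpu->lock);

	/* recover_worker(): holds gpu->lock across the recover callback */
	mutex_lock(&gpu->lock);
	gpu->funcs->recover(gpu);	/* a6xx_recover() executes here */
	mutex_unlock(&gpu->lock);

Since both paths take gpu->lock, active_submits cannot change underneath
a6xx_recover().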

-Akhil.
>
>> +
>> +       /* And the final one from recover worker */
>> +       pm_runtime_put_sync(&gpu->pdev->dev);
>> +
>> +       pm_runtime_use_autosuspend(&gpu->pdev->dev);
>> +
>> +       if (active_submits)
>> +               pm_runtime_get(&gpu->pdev->dev);
>> +
>> +       pm_runtime_get_sync(&gpu->pdev->dev);
>>
>>          msm_gpu_hw_init(gpu);
>>   }
>> diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
>> index 1945efb..07e55a6 100644
>> --- a/drivers/gpu/drm/msm/msm_gpu.c
>> +++ b/drivers/gpu/drm/msm/msm_gpu.c
>> @@ -426,9 +426,7 @@ static void recover_worker(struct kthread_work *work)
>>                  /* retire completed submits, plus the one that hung: */
>>                  retire_submits(gpu);
>>
>> -               pm_runtime_get_sync(&gpu->pdev->dev);
>>                  gpu->funcs->recover(gpu);
>> -               pm_runtime_put_sync(&gpu->pdev->dev);
> Hmm, could this have some fallout on earlier gens?
>
> I guess I should extend the igt msm_recovery test to run on things
> prior to a6xx..
>
> BR,
> -R
No, because of patch 3/8 in this series.

-Akhil.
>
>>                  /*
>>                   * Replay all remaining submits starting with highest priority
>> @@ -445,7 +443,7 @@ static void recover_worker(struct kthread_work *work)
>>                  }
>>          }
>>
>> -       pm_runtime_put_sync(&gpu->pdev->dev);
>> +       pm_runtime_put(&gpu->pdev->dev);
>>
>>          mutex_unlock(&gpu->lock);
>>
>> --
>> 2.7.4
>>

Patch

diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
index 42ed9a3..1b049c5 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
@@ -1193,7 +1193,7 @@  static void a6xx_recover(struct msm_gpu *gpu)
 {
 	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
 	struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
-	int i;
+	int i, active_submits;
 
 	adreno_dump_info(gpu);
 
@@ -1210,8 +1210,26 @@  static void a6xx_recover(struct msm_gpu *gpu)
 	 */
 	gmu_write(&a6xx_gpu->gmu, REG_A6XX_GMU_GMU_PWR_COL_KEEPALIVE, 0);
 
-	gpu->funcs->pm_suspend(gpu);
-	gpu->funcs->pm_resume(gpu);
+	pm_runtime_dont_use_autosuspend(&gpu->pdev->dev);
+
+	/* active_submit won't change until we make a submission */
+	mutex_lock(&gpu->active_lock);
+	active_submits = gpu->active_submits;
+	mutex_unlock(&gpu->active_lock);
+
+	/* Drop the rpm refcount from active submits */
+	if (active_submits)
+		pm_runtime_put(&gpu->pdev->dev);
+
+	/* And the final one from recover worker */
+	pm_runtime_put_sync(&gpu->pdev->dev);
+
+	pm_runtime_use_autosuspend(&gpu->pdev->dev);
+
+	if (active_submits)
+		pm_runtime_get(&gpu->pdev->dev);
+
+	pm_runtime_get_sync(&gpu->pdev->dev);
 
 	msm_gpu_hw_init(gpu);
 }
diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
index 1945efb..07e55a6 100644
--- a/drivers/gpu/drm/msm/msm_gpu.c
+++ b/drivers/gpu/drm/msm/msm_gpu.c
@@ -426,9 +426,7 @@  static void recover_worker(struct kthread_work *work)
 		/* retire completed submits, plus the one that hung: */
 		retire_submits(gpu);
 
-		pm_runtime_get_sync(&gpu->pdev->dev);
 		gpu->funcs->recover(gpu);
-		pm_runtime_put_sync(&gpu->pdev->dev);
 
 		/*
 		 * Replay all remaining submits starting with highest priority
@@ -445,7 +443,7 @@  static void recover_worker(struct kthread_work *work)
 		}
 	}
 
-	pm_runtime_put_sync(&gpu->pdev->dev);
+	pm_runtime_put(&gpu->pdev->dev);
 
 	mutex_unlock(&gpu->lock);