diff mbox series

[v2,3/8] drm/panfrost: Hold runtime PM reference until jobs complete

Message ID 20190823021216.5862-4-robh@kernel.org
State New
Headers show
Series [v2,1/8] drm/panfrost: Fix possible suspend in panfrost_remove | expand

Commit Message

Rob Herring Aug. 23, 2019, 2:12 a.m. UTC
Doing a pm_runtime_put as soon as a job is submitted is wrong as it should
not happen until the job completes. It works because we are relying on the
autosuspend timeout to keep the h/w enabled.

Cc: Tomeu Vizoso <tomeu.vizoso@collabora.com>
Cc: Steven Price <steven.price@arm.com>
Cc: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Cc: David Airlie <airlied@linux.ie>
Cc: Daniel Vetter <daniel@ffwll.ch>
Signed-off-by: Rob Herring <robh@kernel.org>
---
v2: new patch

 drivers/gpu/drm/panfrost/panfrost_job.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

Comments

Steven Price Aug. 23, 2019, 2:50 p.m. UTC | #1
On 23/08/2019 03:12, Rob Herring wrote:
> Doing a pm_runtime_put as soon as a job is submitted is wrong as it should
> not happen until the job completes. It works because we are relying on the
> autosuspend timeout to keep the h/w enabled.
> 
> Cc: Tomeu Vizoso <tomeu.vizoso@collabora.com>
> Cc: Steven Price <steven.price@arm.com>
> Cc: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
> Cc: David Airlie <airlied@linux.ie>
> Cc: Daniel Vetter <daniel@ffwll.ch>
> Signed-off-by: Rob Herring <robh@kernel.org>

Nice find, but I'm a bit worried about one thing.

> ---
> v2: new patch
> 
>   drivers/gpu/drm/panfrost/panfrost_job.c | 16 +++++++++-------
>   1 file changed, 9 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c
> index 05c85f45a0de..80c9cab9a01b 100644
> --- a/drivers/gpu/drm/panfrost/panfrost_job.c
> +++ b/drivers/gpu/drm/panfrost/panfrost_job.c
> @@ -151,7 +151,7 @@ static void panfrost_job_hw_submit(struct panfrost_job *job, int js)
>   		return;
>   
>   	if (WARN_ON(job_read(pfdev, JS_COMMAND_NEXT(js))))
> -		goto end;
> +		return;
>   
>   	cfg = panfrost_mmu_as_get(pfdev, &job->file_priv->mmu);
>   
> @@ -187,10 +187,6 @@ static void panfrost_job_hw_submit(struct panfrost_job *job, int js)
>   	job_write(pfdev, JS_COMMAND_NEXT(js), JS_COMMAND_START);
>   
>   	spin_unlock_irqrestore(&pfdev->hwaccess_lock, flags);
> -
> -end:
> -	pm_runtime_mark_last_busy(pfdev->dev);
> -	pm_runtime_put_autosuspend(pfdev->dev);
>   }
>   
>   static void panfrost_acquire_object_fences(struct drm_gem_object **bos,
> @@ -388,9 +384,13 @@ static void panfrost_job_timedout(struct drm_sched_job *sched_job)
>   
>   	mutex_lock(&pfdev->reset_lock);
>   
> -	for (i = 0; i < NUM_JOB_SLOTS; i++)
> +	for (i = 0; i < NUM_JOB_SLOTS; i++) {
>   		drm_sched_stop(&pfdev->js->queue[i].sched, sched_job);
> -
> +		if (pfdev->jobs[i]) {
> +			pm_runtime_put_noidle(pfdev->dev);
> +			pfdev->jobs[i] = NULL;

I can't see what prevents this racing with panfrost_job_irq_handler() - 
the job could be completing at the same time as we assign NULL. Then 
panfrost_job_irq_handler() will happily dereference the NULL pointer...

Admittedly this patch is an improvement over the situation before :)

Steve

> +		}
> +	}
>   	if (sched_job)
>   		drm_sched_increase_karma(sched_job);
>   
> @@ -455,7 +455,9 @@ static irqreturn_t panfrost_job_irq_handler(int irq, void *data)
>   			pfdev->jobs[j] = NULL;
>   			panfrost_mmu_as_put(pfdev, &job->file_priv->mmu);
>   			panfrost_devfreq_record_transition(pfdev, j);
> +
>   			dma_fence_signal(job->done_fence);
> +			pm_runtime_put_autosuspend(pfdev->dev);
>   		}
>   
>   		status &= ~mask;
>
Rob Herring Aug. 23, 2019, 3:13 p.m. UTC | #2
On Fri, Aug 23, 2019 at 9:50 AM Steven Price <steven.price@arm.com> wrote:
>
> On 23/08/2019 03:12, Rob Herring wrote:
> > Doing a pm_runtime_put as soon as a job is submitted is wrong as it should
> > not happen until the job completes. It works because we are relying on the
> > autosuspend timeout to keep the h/w enabled.
> >
> > Cc: Tomeu Vizoso <tomeu.vizoso@collabora.com>
> > Cc: Steven Price <steven.price@arm.com>
> > Cc: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
> > Cc: David Airlie <airlied@linux.ie>
> > Cc: Daniel Vetter <daniel@ffwll.ch>
> > Signed-off-by: Rob Herring <robh@kernel.org>
>
> Nice find, but I'm a bit worried about one thing.
>
> > ---
> > v2: new patch
> >
> >   drivers/gpu/drm/panfrost/panfrost_job.c | 16 +++++++++-------
> >   1 file changed, 9 insertions(+), 7 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c
> > index 05c85f45a0de..80c9cab9a01b 100644
> > --- a/drivers/gpu/drm/panfrost/panfrost_job.c
> > +++ b/drivers/gpu/drm/panfrost/panfrost_job.c
> > @@ -151,7 +151,7 @@ static void panfrost_job_hw_submit(struct panfrost_job *job, int js)
> >               return;
> >
> >       if (WARN_ON(job_read(pfdev, JS_COMMAND_NEXT(js))))
> > -             goto end;
> > +             return;
> >
> >       cfg = panfrost_mmu_as_get(pfdev, &job->file_priv->mmu);
> >
> > @@ -187,10 +187,6 @@ static void panfrost_job_hw_submit(struct panfrost_job *job, int js)
> >       job_write(pfdev, JS_COMMAND_NEXT(js), JS_COMMAND_START);
> >
> >       spin_unlock_irqrestore(&pfdev->hwaccess_lock, flags);
> > -
> > -end:
> > -     pm_runtime_mark_last_busy(pfdev->dev);
> > -     pm_runtime_put_autosuspend(pfdev->dev);
> >   }
> >
> >   static void panfrost_acquire_object_fences(struct drm_gem_object **bos,
> > @@ -388,9 +384,13 @@ static void panfrost_job_timedout(struct drm_sched_job *sched_job)
> >
> >       mutex_lock(&pfdev->reset_lock);
> >
> > -     for (i = 0; i < NUM_JOB_SLOTS; i++)
> > +     for (i = 0; i < NUM_JOB_SLOTS; i++) {
> >               drm_sched_stop(&pfdev->js->queue[i].sched, sched_job);
> > -
> > +             if (pfdev->jobs[i]) {
> > +                     pm_runtime_put_noidle(pfdev->dev);
> > +                     pfdev->jobs[i] = NULL;
>
> I can't see what prevents this racing with panfrost_job_irq_handler() -
> the job could be completing at the same time as we assign NULL. Then
> panfrost_job_irq_handler() will happily dereference the NULL pointer...

The fact that 1 job's timeout handler is cleaning up all the other
jobs is messy. I'm not sure if there's a better way...

We could perhaps disable the job interrupt at the beginning though I
think we can still have a race as we can't use disable_irq() with a
shared irq. Or do this after the reset.

> Admittedly this patch is an improvement over the situation before :)

Yes, and I'd like to stop digging a deeper hole...

Rob
diff mbox series

Patch

diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c
index 05c85f45a0de..80c9cab9a01b 100644
--- a/drivers/gpu/drm/panfrost/panfrost_job.c
+++ b/drivers/gpu/drm/panfrost/panfrost_job.c
@@ -151,7 +151,7 @@  static void panfrost_job_hw_submit(struct panfrost_job *job, int js)
 		return;
 
 	if (WARN_ON(job_read(pfdev, JS_COMMAND_NEXT(js))))
-		goto end;
+		return;
 
 	cfg = panfrost_mmu_as_get(pfdev, &job->file_priv->mmu);
 
@@ -187,10 +187,6 @@  static void panfrost_job_hw_submit(struct panfrost_job *job, int js)
 	job_write(pfdev, JS_COMMAND_NEXT(js), JS_COMMAND_START);
 
 	spin_unlock_irqrestore(&pfdev->hwaccess_lock, flags);
-
-end:
-	pm_runtime_mark_last_busy(pfdev->dev);
-	pm_runtime_put_autosuspend(pfdev->dev);
 }
 
 static void panfrost_acquire_object_fences(struct drm_gem_object **bos,
@@ -388,9 +384,13 @@  static void panfrost_job_timedout(struct drm_sched_job *sched_job)
 
 	mutex_lock(&pfdev->reset_lock);
 
-	for (i = 0; i < NUM_JOB_SLOTS; i++)
+	for (i = 0; i < NUM_JOB_SLOTS; i++) {
 		drm_sched_stop(&pfdev->js->queue[i].sched, sched_job);
-
+		if (pfdev->jobs[i]) {
+			pm_runtime_put_noidle(pfdev->dev);
+			pfdev->jobs[i] = NULL;
+		}
+	}
 	if (sched_job)
 		drm_sched_increase_karma(sched_job);
 
@@ -455,7 +455,9 @@  static irqreturn_t panfrost_job_irq_handler(int irq, void *data)
 			pfdev->jobs[j] = NULL;
 			panfrost_mmu_as_put(pfdev, &job->file_priv->mmu);
 			panfrost_devfreq_record_transition(pfdev, j);
+
 			dma_fence_signal(job->done_fence);
+			pm_runtime_put_autosuspend(pfdev->dev);
 		}
 
 		status &= ~mask;