diff mbox series

drm/msm/a6xx+: Insert a fence wait before SMMU table update

Message ID 20240913195132.8282-1-robdclark@gmail.com
State New
Headers show
Series drm/msm/a6xx+: Insert a fence wait before SMMU table update | expand

Commit Message

Rob Clark Sept. 13, 2024, 7:51 p.m. UTC
From: Rob Clark <robdclark@chromium.org>

The CP_SMMU_TABLE_UPDATE _should_ be waiting for idle, but on some
devices (x1-85, possibly others), it seems to pass that barrier while
there are still things in the event completion FIFO waiting to be
written back to memory.

Work around that by adding a fence wait before context switch.  The
CP_EVENT_WRITE that writes the fence is the last write from a submit,
so seeing this value hit memory is a reliable indication that it is
safe to proceed with the context switch.

Closes: https://gitlab.freedesktop.org/drm/msm/-/issues/63
Signed-off-by: Rob Clark <robdclark@chromium.org>
---
 drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

Comments

Konrad Dybcio Sept. 17, 2024, 11:37 p.m. UTC | #1
On 17.09.2024 5:30 PM, Rob Clark wrote:
> On Tue, Sep 17, 2024 at 6:47 AM Konrad Dybcio <konradybcio@kernel.org> wrote:
>>
>> On 13.09.2024 9:51 PM, Rob Clark wrote:
>>> From: Rob Clark <robdclark@chromium.org>
>>>
>>> The CP_SMMU_TABLE_UPDATE _should_ be waiting for idle, but on some
>>> devices (x1-85, possibly others), it seems to pass that barrier while
>>> there are still things in the event completion FIFO waiting to be
>>> written back to memory.
>>
>> Can we try to force-fault around here on other GPUs and perhaps
>> limit this workaround?
> 
> not sure what you mean by "force-fault"...

I suppose 'reproduce' is what I meant

> we could probably limit
> this to certain GPUs, the only reason I didn't is (a) it should be
> harmless when it is not needed,

Do we have any realistic perf hits here?

> and (b) I have no real good way to get
> an exhaustive list of where it is needed.  Maybe/hopefully it is only
> x1-85, but idk.
> 
> It does bring up an interesting question about preemption, though

Yeah..

Do we know what windows does here?

Konrad
Connor Abbott Sept. 18, 2024, 4:51 p.m. UTC | #2
On Fri, Sep 13, 2024 at 8:51 PM Rob Clark <robdclark@gmail.com> wrote:
>
> From: Rob Clark <robdclark@chromium.org>
>
> The CP_SMMU_TABLE_UPDATE _should_ be waiting for idle, but on some
> devices (x1-85, possibly others), it seems to pass that barrier while
> there are still things in the event completion FIFO waiting to be
> written back to memory.
>
> Work around that by adding a fence wait before context switch.  The
> CP_EVENT_WRITE that writes the fence is the last write from a submit,
> so seeing this value hit memory is a reliable indication that it is
> safe to proceed with the context switch.
>
> Closes: https://gitlab.freedesktop.org/drm/msm/-/issues/63
> Signed-off-by: Rob Clark <robdclark@chromium.org>
> ---
>  drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 14 +++++++++++---
>  1 file changed, 11 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> index bcaec86ac67a..ba5b35502e6d 100644
> --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> @@ -101,9 +101,10 @@ static void get_stats_counter(struct msm_ringbuffer *ring, u32 counter,
>  }
>
>  static void a6xx_set_pagetable(struct a6xx_gpu *a6xx_gpu,
> -               struct msm_ringbuffer *ring, struct msm_file_private *ctx)
> +               struct msm_ringbuffer *ring, struct msm_gem_submit *submit)
>  {
>         bool sysprof = refcount_read(&a6xx_gpu->base.base.sysprof_active) > 1;
> +       struct msm_file_private *ctx = submit->queue->ctx;
>         struct adreno_gpu *adreno_gpu = &a6xx_gpu->base;
>         phys_addr_t ttbr;
>         u32 asid;
> @@ -115,6 +116,13 @@ static void a6xx_set_pagetable(struct a6xx_gpu *a6xx_gpu,
>         if (msm_iommu_pagetable_params(ctx->aspace->mmu, &ttbr, &asid))
>                 return;
>
> +       /* Wait for previous submit to complete before continuing: */
> +       OUT_PKT7(ring, CP_WAIT_TIMESTAMP, 4);

CP_WAIT_TIMESTAMP doesn't exist on a6xx, so this won't work there. I
don't know if the bug exists on a6xx, but I'd be inclined to say it
has always existed and we just never hit it because it requires some
very specific timing conditions. We can make it work on a6xx by using
CP_WAIT_REG_MEM and waiting for it to equal the exact value.

Connor

> +       OUT_RING(ring, 0);
> +       OUT_RING(ring, lower_32_bits(rbmemptr(ring, fence)));
> +       OUT_RING(ring, upper_32_bits(rbmemptr(ring, fence)));
> +       OUT_RING(ring, submit->seqno - 1);
> +
>         if (!sysprof) {
>                 if (!adreno_is_a7xx(adreno_gpu)) {
>                         /* Turn off protected mode to write to special registers */
> @@ -193,7 +201,7 @@ static void a6xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
>         struct msm_ringbuffer *ring = submit->ring;
>         unsigned int i, ibs = 0;
>
> -       a6xx_set_pagetable(a6xx_gpu, ring, submit->queue->ctx);
> +       a6xx_set_pagetable(a6xx_gpu, ring, submit);
>
>         get_stats_counter(ring, REG_A6XX_RBBM_PERFCTR_CP(0),
>                 rbmemptr_stats(ring, index, cpcycles_start));
> @@ -283,7 +291,7 @@ static void a7xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
>         OUT_PKT7(ring, CP_THREAD_CONTROL, 1);
>         OUT_RING(ring, CP_THREAD_CONTROL_0_SYNC_THREADS | CP_SET_THREAD_BR);
>
> -       a6xx_set_pagetable(a6xx_gpu, ring, submit->queue->ctx);
> +       a6xx_set_pagetable(a6xx_gpu, ring, submit);
>
>         get_stats_counter(ring, REG_A7XX_RBBM_PERFCTR_CP(0),
>                 rbmemptr_stats(ring, index, cpcycles_start));
> --
> 2.46.0
>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
index bcaec86ac67a..ba5b35502e6d 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
@@ -101,9 +101,10 @@  static void get_stats_counter(struct msm_ringbuffer *ring, u32 counter,
 }
 
 static void a6xx_set_pagetable(struct a6xx_gpu *a6xx_gpu,
-		struct msm_ringbuffer *ring, struct msm_file_private *ctx)
+		struct msm_ringbuffer *ring, struct msm_gem_submit *submit)
 {
 	bool sysprof = refcount_read(&a6xx_gpu->base.base.sysprof_active) > 1;
+	struct msm_file_private *ctx = submit->queue->ctx;
 	struct adreno_gpu *adreno_gpu = &a6xx_gpu->base;
 	phys_addr_t ttbr;
 	u32 asid;
@@ -115,6 +116,13 @@  static void a6xx_set_pagetable(struct a6xx_gpu *a6xx_gpu,
 	if (msm_iommu_pagetable_params(ctx->aspace->mmu, &ttbr, &asid))
 		return;
 
+	/* Wait for previous submit to complete before continuing: */
+	OUT_PKT7(ring, CP_WAIT_TIMESTAMP, 4);
+	OUT_RING(ring, 0);
+	OUT_RING(ring, lower_32_bits(rbmemptr(ring, fence)));
+	OUT_RING(ring, upper_32_bits(rbmemptr(ring, fence)));
+	OUT_RING(ring, submit->seqno - 1);
+
 	if (!sysprof) {
 		if (!adreno_is_a7xx(adreno_gpu)) {
 			/* Turn off protected mode to write to special registers */
@@ -193,7 +201,7 @@  static void a6xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
 	struct msm_ringbuffer *ring = submit->ring;
 	unsigned int i, ibs = 0;
 
-	a6xx_set_pagetable(a6xx_gpu, ring, submit->queue->ctx);
+	a6xx_set_pagetable(a6xx_gpu, ring, submit);
 
 	get_stats_counter(ring, REG_A6XX_RBBM_PERFCTR_CP(0),
 		rbmemptr_stats(ring, index, cpcycles_start));
@@ -283,7 +291,7 @@  static void a7xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
 	OUT_PKT7(ring, CP_THREAD_CONTROL, 1);
 	OUT_RING(ring, CP_THREAD_CONTROL_0_SYNC_THREADS | CP_SET_THREAD_BR);
 
-	a6xx_set_pagetable(a6xx_gpu, ring, submit->queue->ctx);
+	a6xx_set_pagetable(a6xx_gpu, ring, submit);
 
 	get_stats_counter(ring, REG_A7XX_RBBM_PERFCTR_CP(0),
 		rbmemptr_stats(ring, index, cpcycles_start));