[v4,2/2] drm/msm: Hangcheck progress detection

Message ID	20221114193049.1533391-3-robdclark@gmail.com
State	New
Headers	show Return-Path: <linux-arm-msm-owner@kernel.org> From: Rob Clark <robdclark@gmail.com> To: dri-devel@lists.freedesktop.org Cc: freedreno@lists.freedesktop.org, linux-arm-msm@vger.kernel.org, Rob Clark <robdclark@chromium.org>, Chia-I Wu <olvaffe@gmail.com>, Akhil P Oommen <quic_akhilpo@quicinc.com>, Rob Clark <robdclark@gmail.com>, Abhinav Kumar <quic_abhinavk@quicinc.com>, Dmitry Baryshkov <dmitry.baryshkov@linaro.org>, Sean Paul <sean@poorly.run>, David Airlie <airlied@gmail.com>, Daniel Vetter <daniel@ffwll.ch>, Konrad Dybcio <konrad.dybcio@somainline.org>, Douglas Anderson <dianders@chromium.org>, linux-kernel@vger.kernel.org (open list) Subject: [PATCH v4 2/2] drm/msm: Hangcheck progress detection Date: Mon, 14 Nov 2022 11:30:41 -0800 Message-Id: <20221114193049.1533391-3-robdclark@gmail.com> In-Reply-To: <20221114193049.1533391-1-robdclark@gmail.com> References: <20221114193049.1533391-1-robdclark@gmail.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Precedence: bulk
Series	drm/msm: Improved hang detection \| expand [v4,0/2] drm/msm: Improved hang detection [v4,1/2] drm/msm/adreno: Simplify read64/write64 helpers [v4,2/2] drm/msm: Hangcheck progress detection

diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
index 1ff605c18ee6..7fe60c65a1eb 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
@@ -1843,6 +1843,46 @@ static uint32_t a6xx_get_rptr(struct msm_gpu *gpu, struct msm_ringbuffer *ring)
 	return ring->memptrs->rptr = gpu_read(gpu, REG_A6XX_CP_RB_RPTR);
 }
 
+/*
+ * Snapshot the CP's IB1/IB2 fetch state and compare it against the snapshot
+ * saved in ring->last_cp_state by the previous call.  The new snapshot is
+ * always stored, so each call compares against the one before it.
+ *
+ * Returns true if any field of the snapshot differs from the previous one.
+ */
+static bool a6xx_progress(struct msm_gpu *gpu, struct msm_ringbuffer *ring)
+{
+	struct msm_cp_state cp_state = {
+		.ib1_base = gpu_read64(gpu, REG_A6XX_CP_IB1_BASE),
+		.ib2_base = gpu_read64(gpu, REG_A6XX_CP_IB2_BASE),
+		.ib1_rem = gpu_read(gpu, REG_A6XX_CP_IB1_REM_SIZE),
+		.ib2_rem = gpu_read(gpu, REG_A6XX_CP_IB2_REM_SIZE),
+	};
+	bool progress;
+
+	/*
+	 * Adjust the remaining data to account for what has already been
+	 * fetched from memory, but not yet consumed by the SQE.
+	 *
+	 * This is not *technically* correct, the amount buffered could
+	 * exceed the IB size due to hw prefetching ahead, but:
+	 *
+	 * (1) We aren't trying to find the exact position, just whether
+	 *     progress has been made
+	 * (2) The CP_REG_TO_MEM at the end of a submit should be enough
+	 *     to prevent prefetching into an unrelated submit.  (And
+	 *     either way, at some point the ROQ will be full.)
+	 */
+	cp_state.ib1_rem += gpu_read(gpu, REG_A6XX_CP_CSQ_IB1_STAT) >> 16;
+	cp_state.ib2_rem += gpu_read(gpu, REG_A6XX_CP_CSQ_IB2_STAT) >> 16;
+
+	progress = !!memcmp(&cp_state, &ring->last_cp_state, sizeof(cp_state));
+
+	ring->last_cp_state = cp_state;
+
+	return progress;
+}
+
 static u32 a618_get_speed_bin(u32 fuse)
 {
 	if (fuse == 0)
@@ -1961,6 +2001,7 @@ static const struct adreno_gpu_funcs funcs = {
 		.create_address_space = a6xx_create_address_space,
 		.create_private_address_space = a6xx_create_private_address_space,
 		.get_rptr = a6xx_get_rptr,
+		.progress = a6xx_progress,
 	},
 	.get_timestamp = a6xx_get_timestamp,
 };
diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c
index 670651cdfa79..c3b77b44b2aa 100644
--- a/drivers/gpu/drm/msm/msm_drv.c
+++ b/drivers/gpu/drm/msm/msm_drv.c
@@ -419,7 +419,6 @@ static int msm_drm_init(struct device *dev, const struct drm_driver *drv)
 	priv->dev = ddev;
 
 	priv->wq = alloc_ordered_workqueue("msm", 0);
-	priv->hangcheck_period = DRM_MSM_HANGCHECK_DEFAULT_PERIOD;
 
 	INIT_LIST_HEAD(&priv->objects);
 	mutex_init(&priv->obj_lock);
diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h
index 0609daf4fa4c..876d8d5eec2f 100644
--- a/drivers/gpu/drm/msm/msm_drv.h
+++ b/drivers/gpu/drm/msm/msm_drv.h
@@ -225,7 +225,13 @@ struct msm_drm_private {
 
 	struct drm_atomic_state *pm_state;
 
-	/* For hang detection, in ms */
+	/**
+	 * hangcheck_period: For hang detection, in ms
+	 *
+	 * Note that in practice, a submit/job will get at least two hangcheck
+	 * periods, due to checking for progress being implemented as simply
+	 * "have the CP position registers changed since last time?"
+	 */
 	unsigned int hangcheck_period;
 
 	/**
diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
index 3dffee54a951..bfef659d3a5c 100644
--- a/drivers/gpu/drm/msm/msm_gpu.c
+++ b/drivers/gpu/drm/msm/msm_gpu.c
@@ -500,6 +500,28 @@ static void hangcheck_timer_reset(struct msm_gpu *gpu)
 			round_jiffies_up(jiffies + msecs_to_jiffies(priv->hangcheck_period)));
 }
 
+/*
+ * Decide whether a ring whose fence has not advanced gets another hangcheck
+ * period.  Returns true only when fewer than
+ * DRM_MSM_HANGCHECK_PROGRESS_RETRIES extra periods have been granted, the
+ * GPU implements ->progress(), and that callback reports progress.  Each
+ * true return consumes one retry.
+ */
+static bool made_progress(struct msm_gpu *gpu, struct msm_ringbuffer *ring)
+{
+	if (ring->hangcheck_progress_retries >= DRM_MSM_HANGCHECK_PROGRESS_RETRIES)
+		return false;
+
+	if (!gpu->funcs->progress)
+		return false;
+
+	if (!gpu->funcs->progress(gpu, ring))
+		return false;
+
+	ring->hangcheck_progress_retries++;
+	return true;
+}
+
 static void hangcheck_handler(struct timer_list *t)
 {
 	struct msm_gpu *gpu = from_timer(gpu, t, hangcheck_timer);
@@ -511,9 +533,12 @@ static void hangcheck_handler(struct timer_list *t)
 	if (fence != ring->hangcheck_fence) {
 		/* some progress has been made.. ya! */
 		ring->hangcheck_fence = fence;
-	} else if (fence_before(fence, ring->fctx->last_fence)) {
+		ring->hangcheck_progress_retries = 0;
+	} else if (fence_before(fence, ring->fctx->last_fence) &&
+		   !made_progress(gpu, ring)) {
 		/* no progress and not done.. hung! */
 		ring->hangcheck_fence = fence;
+		ring->hangcheck_progress_retries = 0;
 		DRM_DEV_ERROR(dev->dev, "%s: hangcheck detected gpu lockup rb %d!\n",
 				gpu->name, ring->id);
 		DRM_DEV_ERROR(dev->dev, "%s: completed fence: %u\n",
@@ -845,6 +870,7 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev,
 		struct msm_gpu *gpu, const struct msm_gpu_funcs *funcs,
 		const char *name, struct msm_gpu_config *config)
 {
+	struct msm_drm_private *priv = drm->dev_private;
 	int i, ret, nr_rings = config->nr_rings;
 	void *memptrs;
 	uint64_t memptrs_iova;
@@ -872,6 +898,16 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev,
 	kthread_init_work(&gpu->recover_work, recover_worker);
 	kthread_init_work(&gpu->fault_work, fault_worker);
 
+	priv->hangcheck_period = DRM_MSM_HANGCHECK_DEFAULT_PERIOD;
+
+	/*
+	 * If progress detection is supported, halve the hangcheck timer
+	 * duration, as it takes two iterations of the hangcheck handler
+	 * to detect a hang.
+	 */
+	if (funcs->progress)
+		priv->hangcheck_period /= 2;
+
 	timer_setup(&gpu->hangcheck_timer, hangcheck_handler, 0);
 
 	spin_lock_init(&gpu->perf_lock);
diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
index 585fd9c8d45a..651786bc55e5 100644
--- a/drivers/gpu/drm/msm/msm_gpu.h
+++ b/drivers/gpu/drm/msm/msm_gpu.h
@@ -78,6 +78,15 @@ struct msm_gpu_funcs {
 	struct msm_gem_address_space *(*create_private_address_space)
 		(struct msm_gpu *gpu);
 	uint32_t (*get_rptr)(struct msm_gpu *gpu, struct msm_ringbuffer *ring);
+
+	/**
+	 * progress: Has the GPU made progress?
+	 *
+	 * Return true if GPU position in cmdstream has advanced (or changed)
+	 * since the last call.  To avoid false negatives, this should account
+	 * for cmdstream that is buffered in the FIFO upstream of the CP fw.
+	 */
+	bool (*progress)(struct msm_gpu *gpu, struct msm_ringbuffer *ring);
 };
 
 /* Additional state for iommu faults: */
@@ -237,6 +246,7 @@ struct msm_gpu {
 #define DRM_MSM_INACTIVE_PERIOD   66 /* in ms (roughly four frames) */
 
 #define DRM_MSM_HANGCHECK_DEFAULT_PERIOD 500 /* in ms */
+#define DRM_MSM_HANGCHECK_PROGRESS_RETRIES 3
 	struct timer_list hangcheck_timer;
 
 	/* Fault info for most recent iova fault: */
diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.h b/drivers/gpu/drm/msm/msm_ringbuffer.h
index 2a5045abe46e..698b333abccd 100644
--- a/drivers/gpu/drm/msm/msm_ringbuffer.h
+++ b/drivers/gpu/drm/msm/msm_ringbuffer.h
@@ -35,6 +35,11 @@ struct msm_rbmemptrs {
 	volatile u64 ttbr0;
 };
 
+struct msm_cp_state {
+	uint64_t ib1_base, ib2_base;
+	uint32_t ib1_rem, ib2_rem;
+};
+
 struct msm_ringbuffer {
 	struct msm_gpu *gpu;
 	int id;
@@ -64,6 +69,29 @@ struct msm_ringbuffer {
 	uint64_t memptrs_iova;
 	struct msm_fence_context *fctx;
 
+	/**
+	 * hangcheck_progress_retries:
+	 *
+	 * The number of extra hangcheck duration cycles that we have given
+	 * due to it appearing that the GPU is making forward progress.
+	 *
+	 * For GPU generations which support progress detection (see
+	 * msm_gpu_funcs::progress()), if the GPU appears to be making progress
+	 * (ie. the CP has advanced in the command stream), we'll allow up to
+	 * DRM_MSM_HANGCHECK_PROGRESS_RETRIES expirations of the hangcheck timer
+	 * before killing the job.  But to detect progress we need two sample
+	 * points, so the duration of the hangcheck timer is halved.  In other
+	 * words we'll let the submit run for up to:
+	 *
+	 * (DRM_MSM_HANGCHECK_DEFAULT_PERIOD / 2) * (DRM_MSM_HANGCHECK_PROGRESS_RETRIES + 1)
+	 */
+	int hangcheck_progress_retries;
+
+	/**
+	 * last_cp_state: The state of the CP at the last call to gpu->progress()
+	 */
+	struct msm_cp_state last_cp_state;
+
 	/*
 	 * preempt_lock protects preemption and serializes wptr updates against
 	 * preemption.  Can be aquired from irq context.

[v4,2/2] drm/msm: Hangcheck progress detection

Commit Message

Patch