drm/vc4: improve throughput by pipelining binning and rendering jobs

Message ID	1455716301-14586-1-git-send-email-varadgautam@gmail.com
State	Accepted
Commit	ca26d28bbaa39f31d5e7e4812603b015c8d54207
Headers	show Delivered-To: patch@linaro.org Received-SPF: pass (google.com: best guess record for domain of dri-devel-bounces@lists.freedesktop.org designates 131.252.210.177 as permitted sender) client-ip=131.252.210.177; From: Varad Gautam <varadgautam@gmail.com> To: dri-devel@lists.freedesktop.org Subject: [PATCH] drm/vc4: improve throughput by pipelining binning and rendering jobs Date: Wed, 17 Feb 2016 19:08:21 +0530 Message-Id: <1455716301-14586-1-git-send-email-varadgautam@gmail.com> Precedence: list MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: base64 Errors-To: dri-devel-bounces@lists.freedesktop.org Sender: "dri-devel" <dri-devel-bounces@lists.freedesktop.org>

diff --git a/drivers/gpu/drm/vc4/vc4_drv.h b/drivers/gpu/drm/vc4/vc4_drv.h index 4c734d0..9c2304b 100644 --- a/drivers/gpu/drm/vc4/vc4_drv.h +++ b/drivers/gpu/drm/vc4/vc4_drv.h @@ -52,7 +52,7 @@ struct vc4_dev { /* Protects bo_cache and the BO stats. */ struct mutex bo_lock; - /* Sequence number for the last job queued in job_list. + /* Sequence number for the last job queued in bin_job_list. * Starts at 0 (no jobs emitted). */ uint64_t emit_seqno; @@ -62,11 +62,19 @@ struct vc4_dev { */ uint64_t finished_seqno; - /* List of all struct vc4_exec_info for jobs to be executed. - * The first job in the list is the one currently programmed - * into ct0ca/ct1ca for execution. + /* List of all struct vc4_exec_info for jobs to be executed in + * the binner. The first job in the list is the one currently + * programmed into ct0ca for execution. */ - struct list_head job_list; + struct list_head bin_job_list; + + /* List of all struct vc4_exec_info for jobs that have + * completed binning and are ready for rendering. The first + * job in the list is the one currently programmed into ct1ca + * for execution. + */ + struct list_head render_job_list; + /* List of the finished vc4_exec_infos waiting to be freed by * job_done_work. */ @@ -276,11 +284,20 @@ struct vc4_exec_info { }; static inline struct vc4_exec_info * -vc4_first_job(struct vc4_dev *vc4) +vc4_first_bin_job(struct vc4_dev *vc4) +{ + if (list_empty(&vc4->bin_job_list)) + return NULL; + return list_first_entry(&vc4->bin_job_list, struct vc4_exec_info, head); +} + +static inline struct vc4_exec_info * +vc4_first_render_job(struct vc4_dev *vc4) { - if (list_empty(&vc4->job_list)) + if (list_empty(&vc4->render_job_list)) return NULL; - return list_first_entry(&vc4->job_list, struct vc4_exec_info, head); + return list_first_entry(&vc4->render_job_list, + struct vc4_exec_info, head); } /** @@ -394,7 +411,9 @@ int vc4_wait_seqno_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); int vc4_wait_bo_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); -void vc4_submit_next_job(struct drm_device *dev); +void vc4_submit_next_bin_job(struct drm_device *dev); +void vc4_submit_next_render_job(struct drm_device *dev); +void vc4_move_job_to_render(struct drm_device *dev, struct vc4_exec_info *exec); int vc4_wait_for_seqno(struct drm_device *dev, uint64_t seqno, uint64_t timeout_ns, bool interruptible); void vc4_job_handle_completed(struct vc4_dev *vc4); diff --git a/drivers/gpu/drm/vc4/vc4_gem.c b/drivers/gpu/drm/vc4/vc4_gem.c index 48ce30a..b08f74b 100644 --- a/drivers/gpu/drm/vc4/vc4_gem.c +++ b/drivers/gpu/drm/vc4/vc4_gem.c @@ -140,10 +140,10 @@ vc4_save_hang_state(struct drm_device *dev) struct vc4_dev *vc4 = to_vc4_dev(dev); struct drm_vc4_get_hang_state *state; struct vc4_hang_state *kernel_state; - struct vc4_exec_info *exec; + struct vc4_exec_info *exec[2]; struct vc4_bo *bo; unsigned long irqflags; - unsigned int i, unref_list_count; + unsigned int i, j, unref_list_count, prev_idx; kernel_state = kcalloc(1, sizeof(*kernel_state), GFP_KERNEL); if (!kernel_state) @@ -152,37 +152,55 @@ vc4_save_hang_state(struct drm_device *dev) state = &kernel_state->user_state; spin_lock_irqsave(&vc4->job_lock, irqflags); - exec = vc4_first_job(vc4); - if (!exec) { + exec[0] = vc4_first_bin_job(vc4); + exec[1] = vc4_first_render_job(vc4); + if (!exec[0] && !exec[1]) { spin_unlock_irqrestore(&vc4->job_lock, irqflags); return; } - unref_list_count = 0; - list_for_each_entry(bo, &exec->unref_list, unref_head) - unref_list_count++; + /* Get the bos from both binner and renderer into hang state. */ + state->bo_count = 0; + for (i = 0; i < 2; i++) { + if (!exec[i]) + continue; + + unref_list_count = 0; + list_for_each_entry(bo, &exec[i]->unref_list, unref_head) + unref_list_count++; + state->bo_count += exec[i]->bo_count + unref_list_count; + } + + kernel_state->bo = kcalloc(state->bo_count, + sizeof(*kernel_state->bo), GFP_ATOMIC); - state->bo_count = exec->bo_count + unref_list_count; - kernel_state->bo = kcalloc(state->bo_count, sizeof(*kernel_state->bo), - GFP_ATOMIC); if (!kernel_state->bo) { spin_unlock_irqrestore(&vc4->job_lock, irqflags); return; } - for (i = 0; i < exec->bo_count; i++) { - drm_gem_object_reference(&exec->bo[i]->base); - kernel_state->bo[i] = &exec->bo[i]->base; - } + prev_idx = 0; + for (i = 0; i < 2; i++) { + if (!exec[i]) + continue; - list_for_each_entry(bo, &exec->unref_list, unref_head) { - drm_gem_object_reference(&bo->base.base); - kernel_state->bo[i] = &bo->base.base; - i++; + for (j = 0; j < exec[i]->bo_count; j++) { + drm_gem_object_reference(&exec[i]->bo[j]->base); + kernel_state->bo[j + prev_idx] = &exec[i]->bo[j]->base; + } + + list_for_each_entry(bo, &exec[i]->unref_list, unref_head) { + drm_gem_object_reference(&bo->base.base); + kernel_state->bo[j + prev_idx] = &bo->base.base; + j++; + } + prev_idx = j + 1; } - state->start_bin = exec->ct0ca; - state->start_render = exec->ct1ca; + if (exec[0]) + state->start_bin = exec[0]->ct0ca; + if (exec[1]) + state->start_render = exec[1]->ct1ca; spin_unlock_irqrestore(&vc4->job_lock, irqflags); @@ -259,7 +277,7 @@ vc4_hangcheck_elapsed(unsigned long data) uint32_t ct0ca, ct1ca; /* If idle, we can stop watching for hangs. */ - if (list_empty(&vc4->job_list)) + if (list_empty(&vc4->bin_job_list) && list_empty(&vc4->render_job_list)) return; ct0ca = V3D_READ(V3D_CTNCA(0)); @@ -373,11 +391,13 @@ vc4_flush_caches(struct drm_device *dev) * The job_lock should be held during this. */ void -vc4_submit_next_job(struct drm_device *dev) +vc4_submit_next_bin_job(struct drm_device *dev) { struct vc4_dev *vc4 = to_vc4_dev(dev); - struct vc4_exec_info *exec = vc4_first_job(vc4); + struct vc4_exec_info *exec; +again: + exec = vc4_first_bin_job(vc4); if (!exec) return; @@ -387,11 +407,40 @@ vc4_submit_next_job(struct drm_device *dev) V3D_WRITE(V3D_BPOA, 0); V3D_WRITE(V3D_BPOS, 0); - if (exec->ct0ca != exec->ct0ea) + /* Either put the job in the binner if it uses the binner, or + * immediately move it to the to-be-rendered queue. + */ + if (exec->ct0ca != exec->ct0ea) { submit_cl(dev, 0, exec->ct0ca, exec->ct0ea); + } else { + vc4_move_job_to_render(dev, exec); + goto again; + } +} + +void +vc4_submit_next_render_job(struct drm_device *dev) +{ + struct vc4_dev *vc4 = to_vc4_dev(dev); + struct vc4_exec_info *exec = vc4_first_render_job(vc4); + + if (!exec) + return; + submit_cl(dev, 1, exec->ct1ca, exec->ct1ea); } +void +vc4_move_job_to_render(struct drm_device *dev, struct vc4_exec_info *exec) +{ + struct vc4_dev *vc4 = to_vc4_dev(dev); + bool was_empty = list_empty(&vc4->render_job_list); + + list_move_tail(&exec->head, &vc4->render_job_list); + if (was_empty) + vc4_submit_next_render_job(dev); +} + static void vc4_update_bo_seqnos(struct vc4_exec_info *exec, uint64_t seqno) { @@ -430,14 +479,14 @@ vc4_queue_submit(struct drm_device *dev, struct vc4_exec_info *exec) exec->seqno = seqno; vc4_update_bo_seqnos(exec, seqno); - list_add_tail(&exec->head, &vc4->job_list); + list_add_tail(&exec->head, &vc4->bin_job_list); /* If no job was executing, kick ours off. Otherwise, it'll - * get started when the previous job's frame done interrupt + * get started when the previous job's flush done interrupt * occurs. */ - if (vc4_first_job(vc4) == exec) { - vc4_submit_next_job(dev); + if (vc4_first_bin_job(vc4) == exec) { + vc4_submit_next_bin_job(dev); vc4_queue_hangcheck(dev); } @@ -828,7 +877,8 @@ vc4_gem_init(struct drm_device *dev) { struct vc4_dev *vc4 = to_vc4_dev(dev); - INIT_LIST_HEAD(&vc4->job_list); + INIT_LIST_HEAD(&vc4->bin_job_list); + INIT_LIST_HEAD(&vc4->render_job_list); INIT_LIST_HEAD(&vc4->job_done_list); INIT_LIST_HEAD(&vc4->seqno_cb_list); spin_lock_init(&vc4->job_lock); diff --git a/drivers/gpu/drm/vc4/vc4_irq.c b/drivers/gpu/drm/vc4/vc4_irq.c index b68060e..b02e728 100644 --- a/drivers/gpu/drm/vc4/vc4_irq.c +++ b/drivers/gpu/drm/vc4/vc4_irq.c @@ -30,6 +30,10 @@ * disables that specific interrupt, and 0s written are ignored * (reading either one returns the set of enabled interrupts). * + * When we take a binning flush done interrupt, we need to submit the + * next frame for binning and move the finished frame to the render + * thread. + * * When we take a render frame interrupt, we need to wake the * processes waiting for some frame to be done, and get the next frame * submitted ASAP (so the hardware doesn't sit idle when there's work @@ -44,6 +48,7 @@ #include "vc4_regs.h" #define V3D_DRIVER_IRQS (V3D_INT_OUTOMEM | \ + V3D_INT_FLDONE | \ V3D_INT_FRDONE) DECLARE_WAIT_QUEUE_HEAD(render_wait); @@ -77,7 +82,7 @@ vc4_overflow_mem_work(struct work_struct *work) unsigned long irqflags; spin_lock_irqsave(&vc4->job_lock, irqflags); - current_exec = vc4_first_job(vc4); + current_exec = vc4_first_bin_job(vc4); if (current_exec) { vc4->overflow_mem->seqno = vc4->finished_seqno + 1; list_add_tail(&vc4->overflow_mem->unref_head, @@ -98,17 +103,43 @@ vc4_overflow_mem_work(struct work_struct *work) } static void -vc4_irq_finish_job(struct drm_device *dev) +vc4_irq_finish_bin_job(struct drm_device *dev) +{ + struct vc4_dev *vc4 = to_vc4_dev(dev); + struct vc4_exec_info *exec = vc4_first_bin_job(vc4); + + if (!exec) + return; + + vc4_move_job_to_render(dev, exec); + vc4_submit_next_bin_job(dev); +} + +static void +vc4_cancel_bin_job(struct drm_device *dev) +{ + struct vc4_dev *vc4 = to_vc4_dev(dev); + struct vc4_exec_info *exec = vc4_first_bin_job(vc4); + + if (!exec) + return; + + list_move_tail(&exec->head, &vc4->bin_job_list); + vc4_submit_next_bin_job(dev); +} + +static void +vc4_irq_finish_render_job(struct drm_device *dev) { struct vc4_dev *vc4 = to_vc4_dev(dev); - struct vc4_exec_info *exec = vc4_first_job(vc4); + struct vc4_exec_info *exec = vc4_first_render_job(vc4); if (!exec) return; vc4->finished_seqno++; list_move_tail(&exec->head, &vc4->job_done_list); - vc4_submit_next_job(dev); + vc4_submit_next_render_job(dev); wake_up_all(&vc4->job_wait_queue); schedule_work(&vc4->job_done_work); @@ -125,9 +156,10 @@ vc4_irq(int irq, void *arg) barrier(); intctl = V3D_READ(V3D_INTCTL); - /* Acknowledge the interrupts we're handling here. The render - * frame done interrupt will be cleared, while OUTOMEM will - * stay high until the underlying cause is cleared. + /* Acknowledge the interrupts we're handling here. The binner + * last flush / render frame done interrupt will be cleared, + * while OUTOMEM will stay high until the underlying cause is + * cleared. */ V3D_WRITE(V3D_INTCTL, intctl); @@ -138,9 +170,16 @@ vc4_irq(int irq, void *arg) status = IRQ_HANDLED; } + if (intctl & V3D_INT_FLDONE) { + spin_lock(&vc4->job_lock); + vc4_irq_finish_bin_job(dev); + spin_unlock(&vc4->job_lock); + status = IRQ_HANDLED; + } + if (intctl & V3D_INT_FRDONE) { spin_lock(&vc4->job_lock); - vc4_irq_finish_job(dev); + vc4_irq_finish_render_job(dev); spin_unlock(&vc4->job_lock); status = IRQ_HANDLED; } @@ -205,6 +244,7 @@ void vc4_irq_reset(struct drm_device *dev) V3D_WRITE(V3D_INTENA, V3D_DRIVER_IRQS); spin_lock_irqsave(&vc4->job_lock, irqflags); - vc4_irq_finish_job(dev); + vc4_cancel_bin_job(dev); + vc4_irq_finish_render_job(dev); spin_unlock_irqrestore(&vc4->job_lock, irqflags); }

drm/vc4: improve throughput by pipelining binning and rendering jobs

Commit Message

Comments

Patch