[08/16] target/arm: Use SVEContLdSt in sve_ld1_r

Message ID 20200311064420.30606-9-richard.henderson@linaro.org
State Superseded
Series: target/arm: sve load/store improvements

Commit Message

Richard Henderson March 11, 2020, 6:44 a.m.
First use of the new helper functions, so we can remove the
unused markup.  No longer need a scratch for user-only, as
we completely probe the page set before reading; system mode
still requires a scratch for MMIO.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

---
 target/arm/sve_helper.c | 188 +++++++++++++++++++++-------------------
 1 file changed, 97 insertions(+), 91 deletions(-)
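
The fast-path loop shape this patch introduces can be modelled in
isolation.  Below is a minimal standalone sketch (not QEMU code; the
function name, element size, and vector length are illustrative): the
predicate is read one 64-bit word at a time as vg[reg_off >> 6], each
element's active bit is (pg >> (reg_off & 63)) & 1, and inactive
elements keep the zeroes written by the initial memset.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/*
 * Predicated contiguous load of 1-byte elements (esz == msz == 0):
 * copy host[i] into vd[i] wherever the governing predicate bit is set;
 * inactive elements are left zeroed.
 */
static void predicated_ld1b(uint8_t *vd, const uint64_t *vg,
                            const uint8_t *host, intptr_t reg_max)
{
    intptr_t reg_off = 0;

    memset(vd, 0, reg_max);
    while (reg_off < reg_max) {
        /* One 64-bit predicate word covers 64 byte offsets. */
        uint64_t pg = vg[reg_off >> 6];
        do {
            if ((pg >> (reg_off & 63)) & 1) {
                vd[reg_off] = host[reg_off];
            }
            reg_off += 1;   /* 1 << esz, with esz == 0 */
        } while (reg_off < reg_max && (reg_off & 63));
    }
}

int main(void)
{
    uint8_t mem[128], vd[128];
    /* Odd-numbered elements inactive in the first word; second word clear. */
    uint64_t vg[2] = { 0x5555555555555555ull, 0 };

    for (int i = 0; i < 128; i++) {
        mem[i] = (uint8_t)(i + 1);
    }
    predicated_ld1b(vd, vg, mem, 128);
    printf("%d %d %d %d\n", vd[0], vd[1], vd[2], vd[64]);  /* 1 0 3 0 */
    return 0;
}

The inner do/while walks at most one predicate word before refetching,
which is why the real code advances reg_off by 1 << esz and re-tests
(reg_off & 63) rather than scanning for the next active element.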

-- 
2.20.1

Comments

Peter Maydell April 16, 2020, 1:26 p.m. | #1
On Wed, 11 Mar 2020 at 06:44, Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> First use of the new helper functions, so we can remove the
> unused markup.  No longer need a scratch for user-only, as
> we completely probe the page set before reading; system mode
> still requires a scratch for MMIO.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

> +    /* The entire operation is in RAM, on valid pages. */
> +
> +    memset(vd, 0, reg_max);
> +    mem_off = info.mem_off_first[0];
> +    reg_off = info.reg_off_first[0];
> +    reg_last = info.reg_off_last[0];
> +    host = info.page[0].host;
> +
> +    while (reg_off <= reg_last) {
> +        uint64_t pg = vg[reg_off >> 6];
> +        do {
> +            if ((pg >> (reg_off & 63)) & 1) {
> +                host_fn(vd, reg_off, host + mem_off);
> +            }
> +            reg_off += 1 << esz;
> +            mem_off += 1 << msz;
> +        } while (reg_off <= reg_last && (reg_off & 63));
> +    }
> +
> +    /*
> +     * Use the slow path to manage the cross-page misalignment.
> +     * But we know this is RAM and cannot trap.
> +     */
> +    mem_off = info.mem_off_split;
> +    if (unlikely(mem_off >= 0)) {
> +        tlb_fn(env, vd, info.reg_off_split, addr + mem_off, retaddr);
> +    }
> +
> +    mem_off = info.mem_off_first[1];
> +    if (unlikely(mem_off >= 0)) {
> +        reg_off = info.reg_off_first[1];
> +        reg_last = info.reg_off_last[1];
> +        host = info.page[1].host;
> +
> +        do {
> +            uint64_t pg = vg[reg_off >> 6];
> +            do {
> +                if ((pg >> (reg_off & 63)) & 1) {
> +                    host_fn(vd, reg_off, host + mem_off);
> +                }
> +                reg_off += 1 << esz;
> +                mem_off += 1 << msz;
> +            } while (reg_off & 63);
> +        } while (reg_off <= reg_last);

Does this loop for the second page need to be phrased
differently than the loop for the first page was? I was
expecting the two chunks of code to be identical, and they
almost are, but not quite...

Either way
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>


thanks
-- PMM
Richard Henderson April 18, 2020, 3:41 a.m. | #2
On 4/16/20 6:26 AM, Peter Maydell wrote:
> On Wed, 11 Mar 2020 at 06:44, Richard Henderson
> <richard.henderson@linaro.org> wrote:
>>
>> First use of the new helper functions, so we can remove the
>> unused markup.  No longer need a scratch for user-only, as
>> we completely probe the page set before reading; system mode
>> still requires a scratch for MMIO.
>>
>> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
>
>> +    /* The entire operation is in RAM, on valid pages. */
>> +
>> +    memset(vd, 0, reg_max);
>> +    mem_off = info.mem_off_first[0];
>> +    reg_off = info.reg_off_first[0];
>> +    reg_last = info.reg_off_last[0];
>> +    host = info.page[0].host;
>> +
>> +    while (reg_off <= reg_last) {
>> +        uint64_t pg = vg[reg_off >> 6];
>> +        do {
>> +            if ((pg >> (reg_off & 63)) & 1) {
>> +                host_fn(vd, reg_off, host + mem_off);
>> +            }
>> +            reg_off += 1 << esz;
>> +            mem_off += 1 << msz;
>> +        } while (reg_off <= reg_last && (reg_off & 63));
>> +    }
>> +
>> +    /*
>> +     * Use the slow path to manage the cross-page misalignment.
>> +     * But we know this is RAM and cannot trap.
>> +     */
>> +    mem_off = info.mem_off_split;
>> +    if (unlikely(mem_off >= 0)) {
>> +        tlb_fn(env, vd, info.reg_off_split, addr + mem_off, retaddr);
>> +    }
>> +
>> +    mem_off = info.mem_off_first[1];
>> +    if (unlikely(mem_off >= 0)) {
>> +        reg_off = info.reg_off_first[1];
>> +        reg_last = info.reg_off_last[1];
>> +        host = info.page[1].host;
>> +
>> +        do {
>> +            uint64_t pg = vg[reg_off >> 6];
>> +            do {
>> +                if ((pg >> (reg_off & 63)) & 1) {
>> +                    host_fn(vd, reg_off, host + mem_off);
>> +                }
>> +                reg_off += 1 << esz;
>> +                mem_off += 1 << msz;
>> +            } while (reg_off & 63);
>> +        } while (reg_off <= reg_last);
>
> Does this loop for the second page need to be phrased
> differently than the loop for the first page was? I was
> expecting the two chunks of code to be identical, and they
> almost are, but not quite...

Yes, they do need to be different.  In particular, the first page may
contain only the one element that crosses the page boundary, so we may
have reg_first > reg_last.  The second page never has that problem.
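
As a minimal standalone illustration of that asymmetry (not QEMU code;
the values below are hypothetical): an entry-tested loop correctly skips
an empty first-page range, while an exit-tested loop would run its body
once.

#include <stdio.h>

int main(void)
{
    /*
     * Hypothetical page-0 values: the sole element there is the split
     * element, which is handled separately, so the in-page range is
     * empty and reg_off already exceeds reg_last.
     */
    long reg_off = 8, reg_last = 0;
    int runs = 0;

    /* Entry-tested, as for page 0: the body never runs. */
    while (reg_off <= reg_last) {
        runs++;
        reg_off += 8;
    }
    printf("while: %d iterations\n", runs);     /* 0 */

    /*
     * Exit-tested, as for page 1: this would wrongly run once on an
     * empty range, but page 1 always begins with at least one whole
     * element, so there reg_off <= reg_last holds on entry.
     */
    reg_off = 8;
    runs = 0;
    do {
        runs++;
        reg_off += 8;
    } while (reg_off <= reg_last);
    printf("do/while: %d iterations\n", runs);  /* 1 */
    return 0;
}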


r~

Patch

diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index 1397c2b634..b827900a4e 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -4227,9 +4227,9 @@  typedef struct {
  * final element on each page.  Identify any single element that spans
  * the page boundary.  Return true if there are any active elements.
  */
-static bool __attribute__((unused))
-sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr, uint64_t *vg,
-                       intptr_t reg_max, int esz, int msize)
+static bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr,
+                                   uint64_t *vg, intptr_t reg_max,
+                                   int esz, int msize)
 {
     const int esize = 1 << esz;
     const uint64_t pg_mask = pred_esz_masks[esz];
@@ -4319,10 +4319,9 @@  sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr, uint64_t *vg,
  * Control the generation of page faults with @fault.  Return false if
  * there is no work to do, which can only happen with @fault == FAULT_NO.
  */
-static bool __attribute__((unused))
-sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault, CPUARMState *env,
-                    target_ulong addr, MMUAccessType access_type,
-                    uintptr_t retaddr)
+static bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
+                                CPUARMState *env, target_ulong addr,
+                                MMUAccessType access_type, uintptr_t retaddr)
 {
     int mmu_idx = cpu_mmu_index(env, false);
     int mem_off = info->mem_off_first[0];
@@ -4394,109 +4393,116 @@  static inline bool test_host_page(void *host)
 /*
  * Common helper for all contiguous one-register predicated loads.
  */
-static void sve_ld1_r(CPUARMState *env, void *vg, const target_ulong addr,
-                      uint32_t desc, const uintptr_t retaddr,
-                      const int esz, const int msz,
-                      sve_ldst1_host_fn *host_fn,
-                      sve_ldst1_tlb_fn *tlb_fn)
+static inline QEMU_ALWAYS_INLINE
+void sve_ld1_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
+               uint32_t desc, const uintptr_t retaddr,
+               const int esz, const int msz,
+               sve_ldst1_host_fn *host_fn,
+               sve_ldst1_tlb_fn *tlb_fn)
 {
-    const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
-    const int mmu_idx = get_mmuidx(oi);
     const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
     void *vd = &env->vfp.zregs[rd];
-    const int diffsz = esz - msz;
     const intptr_t reg_max = simd_oprsz(desc);
-    const intptr_t mem_max = reg_max >> diffsz;
-    ARMVectorReg scratch;
+    intptr_t reg_off, reg_last, mem_off;
+    SVEContLdSt info;
     void *host;
-    intptr_t split, reg_off, mem_off;
+    int flags;
 
-    /* Find the first active element.  */
-    reg_off = find_next_active(vg, 0, reg_max, esz);
-    if (unlikely(reg_off == reg_max)) {
+    /* Find the active elements.  */
+    if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
         /* The entire predicate was false; no load occurs.  */
         memset(vd, 0, reg_max);
         return;
     }
-    mem_off = reg_off >> diffsz;
 
-    /*
-     * If the (remaining) load is entirely within a single page, then:
-     * For softmmu, and the tlb hits, then no faults will occur;
-     * For user-only, either the first load will fault or none will.
-     * We can thus perform the load directly to the destination and
-     * Vd will be unmodified on any exception path.
-     */
-    split = max_for_page(addr, mem_off, mem_max);
-    if (likely(split == mem_max)) {
-        host = tlb_vaddr_to_host(env, addr + mem_off, MMU_DATA_LOAD, mmu_idx);
-        if (test_host_page(host)) {
-            intptr_t i = reg_off;
-            host -= mem_off;
-            do {
-                host_fn(vd, i, host + (i >> diffsz));
-                i = find_next_active(vg, i + (1 << esz), reg_max, esz);
-            } while (i < reg_max);
-            /* After having taken any fault, zero leading inactive elements. */
-            swap_memzero(vd, reg_off);
-            return;
-        }
-    }
+    /* Probe the page(s).  Exit with exception for any invalid page. */
+    sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);
 
-    /*
-     * Perform the predicated read into a temporary, thus ensuring
-     * if the load of the last element faults, Vd is not modified.
-     */
+    flags = info.page[0].flags | info.page[1].flags;
+    if (unlikely(flags != 0)) {
 #ifdef CONFIG_USER_ONLY
-    swap_memzero(&scratch, reg_off);
-    host = g2h(addr);
-    do {
-        host_fn(&scratch, reg_off, host + (reg_off >> diffsz));
-        reg_off += 1 << esz;
-        reg_off = find_next_active(vg, reg_off, reg_max, esz);
-    } while (reg_off < reg_max);
+        g_assert_not_reached();
 #else
-    memset(&scratch, 0, reg_max);
-    goto start;
-    while (1) {
-        reg_off = find_next_active(vg, reg_off, reg_max, esz);
-        if (reg_off >= reg_max) {
-            break;
-        }
-        mem_off = reg_off >> diffsz;
-        split = max_for_page(addr, mem_off, mem_max);
+        /*
+         * At least one page includes MMIO (or watchpoints).
+         * Any bus operation can fail with cpu_transaction_failed,
+         * which for ARM will raise SyncExternal.  Perform the load
+         * into scratch memory to preserve register state until the end.
+         */
+        ARMVectorReg scratch;
 
-    start:
-        if (split - mem_off >= (1 << msz)) {
-            /* At least one whole element on this page.  */
-            host = tlb_vaddr_to_host(env, addr + mem_off,
-                                     MMU_DATA_LOAD, mmu_idx);
-            if (host) {
-                host -= mem_off;
-                do {
-                    host_fn(&scratch, reg_off, host + mem_off);
-                    reg_off += 1 << esz;
-                    reg_off = find_next_active(vg, reg_off, reg_max, esz);
-                    mem_off = reg_off >> diffsz;
-                } while (split - mem_off >= (1 << msz));
-                continue;
+        memset(&scratch, 0, reg_max);
+        mem_off = info.mem_off_first[0];
+        reg_off = info.reg_off_first[0];
+        reg_last = info.reg_off_last[1];
+        if (reg_last < 0) {
+            reg_last = info.reg_off_split;
+            if (reg_last < 0) {
+                reg_last = info.reg_off_last[0];
             }
         }
 
-        /*
-         * Perform one normal read.  This may fault, longjmping out to the
-         * main loop in order to raise an exception.  It may succeed, and
-         * as a side-effect load the TLB entry for the next round.  Finally,
-         * in the extremely unlikely case we're performing this operation
-         * on I/O memory, it may succeed but not bring in the TLB entry.
-         * But even then we have still made forward progress.
-         */
-        tlb_fn(env, &scratch, reg_off, addr + mem_off, retaddr);
-        reg_off += 1 << esz;
-    }
-#endif
+        do {
+            uint64_t pg = vg[reg_off >> 6];
+            do {
+                if ((pg >> (reg_off & 63)) & 1) {
+                    tlb_fn(env, &scratch, reg_off, addr + mem_off, retaddr);
+                }
+                reg_off += 1 << esz;
+                mem_off += 1 << msz;
+            } while (reg_off & 63);
+        } while (reg_off <= reg_last);
 
-    memcpy(vd, &scratch, reg_max);
+        memcpy(vd, &scratch, reg_max);
+        return;
+#endif
+    }
+
+    /* The entire operation is in RAM, on valid pages. */
+
+    memset(vd, 0, reg_max);
+    mem_off = info.mem_off_first[0];
+    reg_off = info.reg_off_first[0];
+    reg_last = info.reg_off_last[0];
+    host = info.page[0].host;
+
+    while (reg_off <= reg_last) {
+        uint64_t pg = vg[reg_off >> 6];
+        do {
+            if ((pg >> (reg_off & 63)) & 1) {
+                host_fn(vd, reg_off, host + mem_off);
+            }
+            reg_off += 1 << esz;
+            mem_off += 1 << msz;
+        } while (reg_off <= reg_last && (reg_off & 63));
+    }
+
+    /*
+     * Use the slow path to manage the cross-page misalignment.
+     * But we know this is RAM and cannot trap.
+     */
+    mem_off = info.mem_off_split;
+    if (unlikely(mem_off >= 0)) {
+        tlb_fn(env, vd, info.reg_off_split, addr + mem_off, retaddr);
+    }
+
+    mem_off = info.mem_off_first[1];
+    if (unlikely(mem_off >= 0)) {
+        reg_off = info.reg_off_first[1];
+        reg_last = info.reg_off_last[1];
+        host = info.page[1].host;
+
+        do {
+            uint64_t pg = vg[reg_off >> 6];
+            do {
+                if ((pg >> (reg_off & 63)) & 1) {
+                    host_fn(vd, reg_off, host + mem_off);
+                }
+                reg_off += 1 << esz;
+                mem_off += 1 << msz;
+            } while (reg_off & 63);
+        } while (reg_off <= reg_last);
+    }
 }
 
 #define DO_LD1_1(NAME, ESZ) \