diff mbox series

[RFC,v3,31/34] Hexagon (target/hexagon) translation

Message ID 1597765847-16637-32-git-send-email-tsimpson@quicinc.com
State New
Headers show
Series Hexagon patch series | expand

Commit Message

Taylor Simpson Aug. 18, 2020, 3:50 p.m. UTC
Read the instruction memory
Create a packet data structure
Generate TCG code for the start of the packet
Invoke the generate function for each instruction
Generate TCG code for the end of the packet

Signed-off-by: Taylor Simpson <tsimpson@quicinc.com>
---
 target/hexagon/translate.h | 103 +++++++
 target/hexagon/translate.c | 730 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 833 insertions(+)
 create mode 100644 target/hexagon/translate.h
 create mode 100644 target/hexagon/translate.c

Comments

Taylor Simpson Aug. 30, 2020, 7:37 p.m. UTC | #1
> -----Original Message-----

> From: Richard Henderson <richard.henderson@linaro.org>

> Sent: Friday, August 28, 2020 8:50 PM

> To: Taylor Simpson <tsimpson@quicinc.com>; qemu-devel@nongnu.org

> Cc: philmd@redhat.com; laurent@vivier.eu; riku.voipio@iki.fi;

> aleksandar.m.mail@gmail.com; ale@rev.ng

> Subject: Re: [RFC PATCH v3 31/34] Hexagon (target/hexagon) translation

>

> > +static inline void ctx_log_reg_write(DisasContext *ctx, int rnum)

> > +{

> > +#if HEX_DEBUG

> > +    int i;

> > +    for (i = 0; i < ctx->reg_log_idx; i++) {

> > +        if (ctx->reg_log[i] == rnum) {

> > +            HEX_DEBUG_LOG("WARNING: Multiple writes to r%d\n", rnum);

> > +        }

> > +    }

> > +#endif

> > +    ctx->reg_log[ctx->reg_log_idx] = rnum;

> > +    ctx->reg_log_idx++;

> > +}

>

> Why not just keep a bitmask of the rnum written?

> Does the order of this log really matter?


OK

> > +static inline void gen_slot_cancelled_check(TCGv check, int slot_num)

> > +{

> > +    TCGv mask = tcg_const_tl(1 << slot_num);

> > +    TCGv one = tcg_const_tl(1);

> > +    TCGv zero = tcg_const_tl(0);

> > +

> > +    tcg_gen_and_tl(mask, hex_slot_cancelled, mask);

> > +    tcg_gen_movcond_tl(TCG_COND_NE, check, mask, zero, one, zero);

>

> This is a bit silly.  Better as

>

>     tcg_gen_extract_i32(check, hex_slot_cancelled, slot_num, 1);


OK

> > +static int read_packet_words(CPUHexagonState *env, DisasContext *ctx,

> > +                             uint32_t words[])

> > +{

> > +    bool found_end = false;

> > +    int max_words;

> > +    int nwords;

> > +    int i;

> > +

> > +    /* Make sure we don't cross a page boundary */

> > +    max_words = -(ctx->base.pc_next | TARGET_PAGE_MASK) /

> sizeof(uint32_t);

> > +    if (max_words < PACKET_WORDS_MAX) {

> > +        /* Might cross a page boundary */

> > +        if (ctx->base.num_insns == 1) {

> > +            /* OK if it's the first packet in the TB */

> > +            max_words = PACKET_WORDS_MAX;

> > +        }

> > +    } else {

> > +        max_words = PACKET_WORDS_MAX;

> > +    }

> > +

> > +    memset(words, 0, PACKET_WORDS_MAX * sizeof(uint32_t));

> > +    for (nwords = 0; !found_end && nwords < max_words; nwords++) {

> > +        words[nwords] = cpu_ldl_code(env,

> > +                                ctx->base.pc_next + nwords * sizeof(uint32_t));

> > +        found_end = is_packet_end(words[nwords]);

> > +    }

> > +    if (!found_end) {

> > +        if (nwords == PACKET_WORDS_MAX) {

> > +            /* Read too many words without finding the end */

> > +            gen_exception(HEX_EXCP_INVALID_PACKET);

> > +            ctx->base.is_jmp = DISAS_NORETURN;

> > +            return 0;

> > +        }

> > +        /* Crosses page boundary - defer to next TB */

> > +        ctx->base.is_jmp = DISAS_TOO_MANY;

>

> The problem with this is that the translator has asked for the next insn, and

> you havn't provided it.

>

> One way to fix this might be to decrement ctx->base.num_insns, to

> compensate

> for the increment that *will* happen after you return.

>

> Another way, which involves less poking about into internals is to look for the

> next packet once you've completed the current packet.  This is what Arm

> does

> for thumb2 insns:

>

> >     if (dc->base.is_jmp == DISAS_NEXT

> >         && (dc->base.pc_next - dc->page_start >= TARGET_PAGE_SIZE

> >             || (dc->base.pc_next - dc->page_start >= TARGET_PAGE_SIZE - 3

> >                 && insn_crosses_page(env, dc)))) {

> >         dc->base.is_jmp = DISAS_TOO_MANY;

> >     }

>

> I.e. you only have to do this for the few packets that are near enough to the

> end of the page that PACKET_WORDS_MAX crosses.


I'm actually checking two conditions here.
1) packet crossing a page boundary
2) reading too many words without finding the end of the packet.
I guess it would be better to separate them.

What is the correct behavior for the second case?  Should we return an error code from here and have the higher level code generate the invalid packet exception?

> > +    tcg_gen_movi_tl(hex_gpr[HEX_REG_PC], ctx->base.pc_next);

> > +    tcg_gen_movi_tl(hex_slot_cancelled, 0);

> > +    if (pkt->pkt_has_cof) {

> > +        tcg_gen_movi_tl(hex_branch_taken, 0);

> > +        tcg_gen_movi_tl(hex_next_PC, next_PC);

> > +    }

> > +    tcg_gen_movi_tl(hex_pred_written, 0);

> > +}

>

> Surely you don't need to actually set PC for every PC?


What do other targets do?

> Nor set hex_slot_cancelled if the packet contains nothing that can cancel

> anything.  Nor set hex_pred_written if no predicates are written.


Checking for instructions that can cancel is pretty straightforward because we have the CONDEXEC attribute.  Checking if any predicates are written will be more complex.  I'll scratch my head and figure out the cleanest way to do this.

>

> > +    TCGv cancelled = tcg_temp_local_new();

> > +    TCGLabel *label_end = gen_new_label();

> > +

> > +    /* Don't do anything if the slot was cancelled */

> > +    gen_slot_cancelled_check(cancelled, slot_num);

> > +    tcg_gen_brcondi_tl(TCG_COND_NE, cancelled, 0, label_end);

>

> cancelled does not need to be local; it is consumed by the branch and not

> consumed afterward.  Just free it here.


OK

> > +        /*

> > +         * If we know the width from the DisasContext, we can

> > +         * generate much cleaner code.

> > +         * Unfortunately, not all instructions execute the fSTORE

> > +         * macro during code generation.  Anything that uses the

> > +         * generic helper will have this problem.  Instructions

> > +         * that use fWRAP to generate proper TCG code will be OK.

> > +         */

>

> OMG.  How disgusting.


The word "generic" is a typo - should be "generated".  In order to keep this series small, we're only overriding the helper for the minimal number of instructions.  Over time, we'll override all the stores and we can eliminate this.


> > +            value = tcg_temp_new();

> > +            tcg_gen_mov_tl(value, hex_store_val32[slot_num]);

> > +            tcg_gen_qemu_st8(value, address, ctx->mem_idx);

> > +            tcg_temp_free(value);

>

> Why are you copying to a temporary?


Will fix.

> > +        default:

> > +            /*

> > +             * If we get to here, we don't know the width at

> > +             * TCG generation time, we'll generate branching

> > +             * based on the width at runtime.

> > +             */

> > +            label_w2 = gen_new_label();

> > +            label_w4 = gen_new_label();

> > +            label_w8 = gen_new_label();

> > +            TCGv width = tcg_temp_local_new();

>

> You might as well make this a helper.  This is going to generate a *lot* of

> code.


This is the part that will go away when we override all the stores.

> > +static void gen_exec_counters(packet_t *pkt)

> > +{

> > +    int num_insns = pkt->num_insns;

> > +    int num_real_insns = 0;

> > +    int i;

> > +

> > +    for (i = 0; i < num_insns; i++) {

> > +        if (!pkt->insn[i].is_endloop &&

> > +            !pkt->insn[i].part1 &&

> > +            !GET_ATTRIB(pkt->insn[i].opcode, A_IT_NOP)) {

> > +            num_real_insns++;

> > +        }

> > +    }

> > +

> > +    tcg_gen_addi_tl(hex_gpr[HEX_REG_QEMU_PKT_CNT],

> > +                    hex_gpr[HEX_REG_QEMU_PKT_CNT], 1);

> > +    if (num_real_insns) {

> > +        tcg_gen_addi_tl(hex_gpr[HEX_REG_QEMU_INSN_CNT],

> > +                        hex_gpr[HEX_REG_QEMU_INSN_CNT], num_real_insns);

> > +    }

>

> tcg_gen_addi_tl will check for the immediate == 0.


OK, great.  I also see that tcg_gen_mov_i32 will check that the source and dest are different.  So no TCG code will be generated.

> As with updating PC for every insn, this is going to be expensive.

>

> You could accumulate these values through the TB and then update them at

> the

> end.  You'd need to store the intermediate values in the space managed by

> TARGET_INSN_START_EXTRA_WORDS, so that you can update them on any

> exceptional

> path out of the TB, in restore_state_to_opc().


OK

> > +    if (end_tb) {

> > +        tcg_gen_exit_tb(NULL, 0);

>

> This misses out on ctx->base.singlestep_enabled, and almost certainly

> belongs

> in hexagon_tr_tb_stop.  Use

>

> #define DISAS_EXIT  DISAS_TARGET_0

>

> or some other appropriate naming.


OK
Richard Henderson Aug. 30, 2020, 11:08 p.m. UTC | #2
On 8/30/20 12:37 PM, Taylor Simpson wrote:
> I'm actually checking two conditions here.
> 1) packet crossing a page boundary
> 2) reading too many words without finding the end of the packet.
> I guess it would be better to separate them.
> 
> What is the correct behavior for the second case?  Should we return an error code from here and have the higher level code generate the invalid packet exception?

I would return an error code.

In fact, I would also pass in the max number of words to read:

static int read_packet_words(CPUHexagonState *env,
                             DisasContext *ctx,
                             uint32_t words[PACKET_WORDS_MAX],
                             int max_words)
{
   // stuff
   return !found_end ? 0 : nwords;
}


Then, in translate_packet,


    uint32_t words[PACKET_WORDS_MAX];
    int nwords = read_packet_words(env, ctx, words,
                                   PACKET_WORDS_MAX);

    if (nwords == 0) {
        // raise exception
        return;
    }

    decode_and_translate_packet(env, ctx, words, nwords);

    /* If we're going to try for another packet... */
    if (ctx->base.is_jmp == DISAS_NEXT &&
        ctx->base.num_insns < ctx->base.num_insns) {
        /*
         * Remember the end of the page containing the
         * first packet.  Note that the first packet
         * is allowed to span two pages, so this is not
         * necessarily the same as the end of the page
         * containing ctx->base.pc_start.
         */
        if (ctx->base.num_insns == 1) {
            ctx->page_end
                = TARGET_PAGE_ALIGN(ctx->base.pc_next);
        }

        /*
         * If there are not PACKET_WORDS_MAX remaining on
         * the page, check to see if a full packet remains.
         * If not, split the TB so that the packet that
         * crosses the page begins the next TB.
         */
        target_long left = ctx->page_end - ctx->base.pc_next;
        tcg_debug_assert(left >= 0);
        if (left == 0
            || (left < PACKET_WORDS_MAX * 4 &&
                !read_packet_words(env, ctx, words, left / 4)) {
            ctx->base.is_jmp = DISAS_TOO_MANY;
        }
    }


The reason for all this is to properly capture the behaviour of instruction
execution vs SIGSEGV.

First, during translate we do not want to read from the next page unless
absolutely necessary.  Doing so could raise SIGSEGV before it would be
appropriate, e.g. because the TB should have branched away (or raised SIGFPE,
or anything else) before getting that far.

Second, when dispatching a TB, we check the 1 or 2 pages that the TB occupies
for validity.  If the second page is invalid, we raise SIGSEGV without
executing the TB at all.  Which makes it appear as if the SIGSEGV happened at
the first insn of the TB.  Which is correct if and only if the first insn is
the one that did cross the page.

>> Surely you don't need to actually set PC for every PC?
> 
> What do other targets do?

If you have a pc-relative instruction, e.g. x86_64's

  lea  offset(%rip), %rax

then you just use the known immediate for %rip:

  tcg_gen_movi_tl(cpu_reg[eax], ctx->base.pc_next + offset);

Normally, PC is only valid when explicitly returning to the cpu loop
(tcg_gen_exit_tb, static exception), for indirect branching
(tcg_gen_lookup_and_goto_ptr), or after dynamic exception unwinding
(restore_state_to_opc).

When using goto_tb, we can get away with *assuming* static state, because the
values get baked into the link to a specific next-TB.  That's why the general
form is

  tcg_gen_goto_tb(n);
  gen_set_pc_im(s, dest);
  tcg_gen_exit_tb(s->base.tb, n);

The first time we cross link N, the link is unset, which causes the goto_tb to
continue to the next tcg opcode.  Which then sets all of the static state that
has been assumed (often, as here, just the pc).  We then exit, telling the
cpu_loop to examine cpu state, locate the next TB, and fill in link N from the
current TB.

The second time we cross link N, the link is set, which causes the goto_tb to
continue immediately to the next TB.  We do not execute the store to PC, as it
is implied by next_tb->pc.


r~
diff mbox series

Patch

diff --git a/target/hexagon/translate.h b/target/hexagon/translate.h
new file mode 100644
index 0000000..144140f
--- /dev/null
+++ b/target/hexagon/translate.h
@@ -0,0 +1,103 @@ 
+/*
+ *  Copyright(c) 2019-2020 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef HEXAGON_TRANSLATE_H
+#define HEXAGON_TRANSLATE_H
+
+#include "cpu.h"
+#include "exec/translator.h"
+#include "tcg/tcg-op.h"
+#include "internal.h"
+
+typedef struct DisasContext {
+    DisasContextBase base;
+    uint32_t mem_idx;
+    int reg_log[REG_WRITES_MAX];
+    int reg_log_idx;
+    int preg_log[PRED_WRITES_MAX];
+    int preg_log_idx;
+    uint8_t store_width[STORES_MAX];
+} DisasContext;
+
+static inline void ctx_log_reg_write(DisasContext *ctx, int rnum)
+{
+#if HEX_DEBUG
+    int i;
+    for (i = 0; i < ctx->reg_log_idx; i++) {
+        if (ctx->reg_log[i] == rnum) {
+            HEX_DEBUG_LOG("WARNING: Multiple writes to r%d\n", rnum);
+        }
+    }
+#endif
+    ctx->reg_log[ctx->reg_log_idx] = rnum;
+    ctx->reg_log_idx++;
+}
+
+static inline void ctx_log_pred_write(DisasContext *ctx, int pnum)
+{
+    ctx->preg_log[ctx->preg_log_idx] = pnum;
+    ctx->preg_log_idx++;
+}
+
+static inline bool is_preloaded(DisasContext *ctx, int num)
+{
+    int i;
+    for (i = 0; i < ctx->reg_log_idx; i++) {
+        if (ctx->reg_log[i] == num) {
+            return true;
+        }
+    }
+    return false;
+}
+
+extern TCGv hex_gpr[TOTAL_PER_THREAD_REGS];
+extern TCGv hex_pred[NUM_PREGS];
+extern TCGv hex_next_PC;
+extern TCGv hex_this_PC;
+extern TCGv hex_slot_cancelled;
+extern TCGv hex_branch_taken;
+extern TCGv hex_new_value[TOTAL_PER_THREAD_REGS];
+extern TCGv hex_reg_written[TOTAL_PER_THREAD_REGS];
+extern TCGv hex_new_pred_value[NUM_PREGS];
+extern TCGv hex_pred_written;
+extern TCGv hex_store_addr[STORES_MAX];
+extern TCGv hex_store_width[STORES_MAX];
+extern TCGv hex_store_val32[STORES_MAX];
+extern TCGv_i64 hex_store_val64[STORES_MAX];
+extern TCGv hex_dczero_addr;
+extern TCGv hex_llsc_addr;
+extern TCGv hex_llsc_val;
+extern TCGv_i64 hex_llsc_val_i64;
+
+static inline void gen_slot_cancelled_check(TCGv check, int slot_num)
+{
+    TCGv mask = tcg_const_tl(1 << slot_num);
+    TCGv one = tcg_const_tl(1);
+    TCGv zero = tcg_const_tl(0);
+
+    tcg_gen_and_tl(mask, hex_slot_cancelled, mask);
+    tcg_gen_movcond_tl(TCG_COND_NE, check, mask, zero, one, zero);
+
+    tcg_temp_free(one);
+    tcg_temp_free(zero);
+    tcg_temp_free(mask);
+}
+
+extern void gen_exception(int excp);
+extern void gen_exception_debug(void);
+
+#endif
diff --git a/target/hexagon/translate.c b/target/hexagon/translate.c
new file mode 100644
index 0000000..9e3f4af
--- /dev/null
+++ b/target/hexagon/translate.c
@@ -0,0 +1,730 @@ 
+/*
+ *  Copyright(c) 2019-2020 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#define QEMU_GENERATE
+#include "qemu/osdep.h"
+#include "cpu.h"
+#include "tcg/tcg-op.h"
+#include "exec/cpu_ldst.h"
+#include "exec/log.h"
+#include "internal.h"
+#include "attribs.h"
+#include "insn.h"
+#include "decode.h"
+#include "translate.h"
+#include "printinsn.h"
+
+TCGv hex_gpr[TOTAL_PER_THREAD_REGS];
+TCGv hex_pred[NUM_PREGS];
+TCGv hex_next_PC;
+TCGv hex_this_PC;
+TCGv hex_slot_cancelled;
+TCGv hex_branch_taken;
+TCGv hex_new_value[TOTAL_PER_THREAD_REGS];
+#if HEX_DEBUG
+TCGv hex_reg_written[TOTAL_PER_THREAD_REGS];
+#endif
+TCGv hex_new_pred_value[NUM_PREGS];
+TCGv hex_pred_written;
+TCGv hex_store_addr[STORES_MAX];
+TCGv hex_store_width[STORES_MAX];
+TCGv hex_store_val32[STORES_MAX];
+TCGv_i64 hex_store_val64[STORES_MAX];
+TCGv hex_pkt_has_store_s1;
+TCGv hex_dczero_addr;
+TCGv hex_llsc_addr;
+TCGv hex_llsc_val;
+TCGv_i64 hex_llsc_val_i64;
+
+static const char * const hexagon_prednames[] = {
+  "p0", "p1", "p2", "p3"
+};
+
+void gen_exception(int excp)
+{
+    TCGv_i32 helper_tmp = tcg_const_i32(excp);
+    gen_helper_raise_exception(cpu_env, helper_tmp);
+    tcg_temp_free_i32(helper_tmp);
+}
+
+void gen_exception_debug(void)
+{
+    gen_exception(EXCP_DEBUG);
+}
+
+#if HEX_DEBUG
+#define PACKET_BUFFER_LEN              1028
+static void print_pkt(packet_t *pkt)
+{
+    char buf[PACKET_BUFFER_LEN];
+    snprint_a_pkt(buf, PACKET_BUFFER_LEN, pkt);
+    HEX_DEBUG_LOG("%s", buf);
+}
+#define HEX_DEBUG_PRINT_PKT(pkt)  print_pkt(pkt)
+#else
+#define HEX_DEBUG_PRINT_PKT(pkt)  /* nothing */
+#endif
+
+static int read_packet_words(CPUHexagonState *env, DisasContext *ctx,
+                             uint32_t words[])
+{
+    bool found_end = false;
+    int max_words;
+    int nwords;
+    int i;
+
+    /* Make sure we don't cross a page boundary */
+    max_words = -(ctx->base.pc_next | TARGET_PAGE_MASK) / sizeof(uint32_t);
+    if (max_words < PACKET_WORDS_MAX) {
+        /* Might cross a page boundary */
+        if (ctx->base.num_insns == 1) {
+            /* OK if it's the first packet in the TB */
+            max_words = PACKET_WORDS_MAX;
+        }
+    } else {
+        max_words = PACKET_WORDS_MAX;
+    }
+
+    memset(words, 0, PACKET_WORDS_MAX * sizeof(uint32_t));
+    for (nwords = 0; !found_end && nwords < max_words; nwords++) {
+        words[nwords] = cpu_ldl_code(env,
+                                ctx->base.pc_next + nwords * sizeof(uint32_t));
+        found_end = is_packet_end(words[nwords]);
+    }
+    if (!found_end) {
+        if (nwords == PACKET_WORDS_MAX) {
+            /* Read too many words without finding the end */
+            gen_exception(HEX_EXCP_INVALID_PACKET);
+            ctx->base.is_jmp = DISAS_NORETURN;
+            return 0;
+        }
+        /* Crosses page boundary - defer to next TB */
+        ctx->base.is_jmp = DISAS_TOO_MANY;
+        return 0;
+    }
+
+    HEX_DEBUG_LOG("decode_packet: pc = 0x%x\n", ctx->base.pc_next);
+    HEX_DEBUG_LOG("    words = { ");
+    for (i = 0; i < nwords; i++) {
+        HEX_DEBUG_LOG("0x%x, ", words[i]);
+    }
+    HEX_DEBUG_LOG("}\n");
+
+    return nwords;
+}
+
+static void gen_start_packet(DisasContext *ctx, packet_t *pkt)
+{
+    target_ulong next_PC = ctx->base.pc_next + pkt->encod_pkt_size_in_bytes;
+    int i;
+
+    /* Clear out the disassembly context */
+    ctx->reg_log_idx = 0;
+    ctx->preg_log_idx = 0;
+    for (i = 0; i < STORES_MAX; i++) {
+        ctx->store_width[i] = 0;
+    }
+    tcg_gen_movi_tl(hex_pkt_has_store_s1, pkt->pkt_has_store_s1);
+
+#if HEX_DEBUG
+    /* Handy place to set a breakpoint before the packet executes */
+    gen_helper_debug_start_packet(cpu_env);
+    tcg_gen_movi_tl(hex_this_PC, ctx->base.pc_next);
+#endif
+
+    /* Initialize the runtime state for packet semantics */
+    tcg_gen_movi_tl(hex_gpr[HEX_REG_PC], ctx->base.pc_next);
+    tcg_gen_movi_tl(hex_slot_cancelled, 0);
+    if (pkt->pkt_has_cof) {
+        tcg_gen_movi_tl(hex_branch_taken, 0);
+        tcg_gen_movi_tl(hex_next_PC, next_PC);
+    }
+    tcg_gen_movi_tl(hex_pred_written, 0);
+}
+
+/*
+ * The LOG_*_WRITE macros mark most of the writes in a packet
+ * However, there are some implicit writes marked as attributes
+ * of the applicable instructions.
+ */
+static void mark_implicit_reg_write(DisasContext *ctx, insn_t *insn,
+                                    int attrib, int rnum)
+{
+    if (GET_ATTRIB(insn->opcode, attrib)) {
+        int is_predicated = GET_ATTRIB(insn->opcode, A_CONDEXEC);
+        if (is_predicated && !is_preloaded(ctx, rnum)) {
+            tcg_gen_mov_tl(hex_new_value[rnum], hex_gpr[rnum]);
+        }
+
+        ctx->reg_log[ctx->reg_log_idx] = rnum;
+        ctx->reg_log_idx++;
+    }
+}
+
+static void mark_implicit_pred_write(DisasContext *ctx, insn_t *insn,
+                                     int attrib, int pnum)
+{
+    if (GET_ATTRIB(insn->opcode, attrib)) {
+        ctx->preg_log[ctx->preg_log_idx] = pnum;
+        ctx->preg_log_idx++;
+    }
+}
+
+static void mark_implicit_writes(DisasContext *ctx, insn_t *insn)
+{
+    mark_implicit_reg_write(ctx, insn, A_IMPLICIT_WRITES_FP,  HEX_REG_FP);
+    mark_implicit_reg_write(ctx, insn, A_IMPLICIT_WRITES_SP,  HEX_REG_SP);
+    mark_implicit_reg_write(ctx, insn, A_IMPLICIT_WRITES_LR,  HEX_REG_LR);
+    mark_implicit_reg_write(ctx, insn, A_IMPLICIT_WRITES_LC0, HEX_REG_LC0);
+    mark_implicit_reg_write(ctx, insn, A_IMPLICIT_WRITES_SA0, HEX_REG_SA0);
+    mark_implicit_reg_write(ctx, insn, A_IMPLICIT_WRITES_LC1, HEX_REG_LC1);
+    mark_implicit_reg_write(ctx, insn, A_IMPLICIT_WRITES_SA1, HEX_REG_SA1);
+
+    mark_implicit_pred_write(ctx, insn, A_IMPLICIT_WRITES_P0, 0);
+    mark_implicit_pred_write(ctx, insn, A_IMPLICIT_WRITES_P1, 1);
+    mark_implicit_pred_write(ctx, insn, A_IMPLICIT_WRITES_P2, 2);
+    mark_implicit_pred_write(ctx, insn, A_IMPLICIT_WRITES_P3, 3);
+}
+
+static void gen_insn(CPUHexagonState *env, DisasContext *ctx,
+                     insn_t *insn, packet_t *pkt)
+{
+    if (insn->generate) {
+        mark_implicit_writes(ctx, insn);
+        insn->generate(env, ctx, insn, pkt);
+    } else {
+        gen_exception(HEX_EXCP_INVALID_OPCODE);
+        ctx->base.is_jmp = DISAS_NORETURN;
+    }
+}
+
+/*
+ * Helpers for generating the packet commit
+ */
+static void gen_reg_writes(DisasContext *ctx)
+{
+    int i;
+
+    for (i = 0; i < ctx->reg_log_idx; i++) {
+        int reg_num = ctx->reg_log[i];
+
+        tcg_gen_mov_tl(hex_gpr[reg_num], hex_new_value[reg_num]);
+    }
+}
+
+static void gen_pred_writes(DisasContext *ctx, packet_t *pkt)
+{
+    /* Early exit if the log is empty */
+    if (!ctx->preg_log_idx) {
+        return;
+    }
+
+    TCGv zero = tcg_const_tl(0);
+    TCGv control_reg = tcg_temp_new();
+    TCGv pval = tcg_temp_new();
+    int i;
+
+    /*
+     * Only endloop instructions will conditionally
+     * write a predicate.  If there are no endloop
+     * instructions, we can use the non-conditional
+     * write of the predicates.
+     */
+    if (pkt->pkt_has_endloop) {
+        TCGv pred_written = tcg_temp_new();
+        for (i = 0; i < ctx->preg_log_idx; i++) {
+            int pred_num = ctx->preg_log[i];
+
+            tcg_gen_andi_tl(pred_written, hex_pred_written, 1 << pred_num);
+            tcg_gen_movcond_tl(TCG_COND_NE, hex_pred[pred_num],
+                               pred_written, zero,
+                               hex_new_pred_value[pred_num],
+                               hex_pred[pred_num]);
+        }
+        tcg_temp_free(pred_written);
+    } else {
+        for (i = 0; i < ctx->preg_log_idx; i++) {
+            int pred_num = ctx->preg_log[i];
+            tcg_gen_mov_tl(hex_pred[pred_num], hex_new_pred_value[pred_num]);
+#if HEX_DEBUG
+            /* Do this so HELPER(debug_commit_end) will know */
+            tcg_gen_ori_tl(hex_pred_written, hex_pred_written, 1 << pred_num);
+#endif
+        }
+    }
+
+    tcg_temp_free(zero);
+    tcg_temp_free(control_reg);
+    tcg_temp_free(pval);
+}
+
+#if HEX_DEBUG
+static inline void gen_check_store_width(DisasContext *ctx, int slot_num)
+{
+    TCGv slot = tcg_const_tl(slot_num);
+    TCGv check = tcg_const_tl(ctx->store_width[slot_num]);
+    gen_helper_debug_check_store_width(cpu_env, slot, check);
+    tcg_temp_free(slot);
+    tcg_temp_free(check);
+}
+#define HEX_DEBUG_GEN_CHECK_STORE_WIDTH(ctx, slot_num) \
+    gen_check_store_width(ctx, slot_num)
+#else
+#define HEX_DEBUG_GEN_CHECK_STORE_WIDTH(ctx, slot_num)  /* nothing */
+#endif
+
+static void process_store(DisasContext *ctx, int slot_num)
+{
+    TCGv cancelled = tcg_temp_local_new();
+    TCGLabel *label_end = gen_new_label();
+
+    /* Don't do anything if the slot was cancelled */
+    gen_slot_cancelled_check(cancelled, slot_num);
+    tcg_gen_brcondi_tl(TCG_COND_NE, cancelled, 0, label_end);
+    {
+        int ctx_width = ctx->store_width[slot_num];
+        TCGv address = tcg_temp_local_new();
+        tcg_gen_mov_tl(address, hex_store_addr[slot_num]);
+
+        /*
+         * If we know the width from the DisasContext, we can
+         * generate much cleaner code.
+         * Unfortunately, not all instructions execute the fSTORE
+         * macro during code generation.  Anything that uses the
+         * generic helper will have this problem.  Instructions
+         * that use fWRAP to generate proper TCG code will be OK.
+         */
+        TCGv value;
+        TCGv_i64 value_i64;
+        TCGLabel *label_w2;
+        TCGLabel *label_w4;
+        TCGLabel *label_w8;
+        switch (ctx_width) {
+        case 1:
+            HEX_DEBUG_GEN_CHECK_STORE_WIDTH(ctx, slot_num);
+            value = tcg_temp_new();
+            tcg_gen_mov_tl(value, hex_store_val32[slot_num]);
+            tcg_gen_qemu_st8(value, address, ctx->mem_idx);
+            tcg_temp_free(value);
+            break;
+        case 2:
+            HEX_DEBUG_GEN_CHECK_STORE_WIDTH(ctx, slot_num);
+            value = tcg_temp_new();
+            tcg_gen_mov_tl(value, hex_store_val32[slot_num]);
+            tcg_gen_qemu_st16(value, address, ctx->mem_idx);
+            tcg_temp_free(value);
+            break;
+        case 4:
+            HEX_DEBUG_GEN_CHECK_STORE_WIDTH(ctx, slot_num);
+            value = tcg_temp_new();
+            tcg_gen_mov_tl(value, hex_store_val32[slot_num]);
+            tcg_gen_qemu_st32(value, address, ctx->mem_idx);
+            tcg_temp_free(value);
+            break;
+        case 8:
+            HEX_DEBUG_GEN_CHECK_STORE_WIDTH(ctx, slot_num);
+            value_i64 = tcg_temp_new_i64();
+            tcg_gen_mov_i64(value_i64, hex_store_val64[slot_num]);
+            tcg_gen_qemu_st64(value_i64, address, ctx->mem_idx);
+            tcg_temp_free_i64(value_i64);
+            break;
+        default:
+            /*
+             * If we get to here, we don't know the width at
+             * TCG generation time, we'll generate branching
+             * based on the width at runtime.
+             */
+            label_w2 = gen_new_label();
+            label_w4 = gen_new_label();
+            label_w8 = gen_new_label();
+            TCGv width = tcg_temp_local_new();
+
+            tcg_gen_mov_tl(width, hex_store_width[slot_num]);
+            tcg_gen_brcondi_tl(TCG_COND_NE, width, 1, label_w2);
+            {
+                /* Width is 1 byte */
+                TCGv value = tcg_temp_new();
+                tcg_gen_mov_tl(value, hex_store_val32[slot_num]);
+                tcg_gen_qemu_st8(value, address, ctx->mem_idx);
+                tcg_gen_br(label_end);
+                tcg_temp_free(value);
+            }
+            gen_set_label(label_w2);
+            tcg_gen_brcondi_tl(TCG_COND_NE, width, 2, label_w4);
+            {
+                /* Width is 2 bytes */
+                TCGv value = tcg_temp_new();
+                tcg_gen_mov_tl(value, hex_store_val32[slot_num]);
+                tcg_gen_qemu_st16(value, address, ctx->mem_idx);
+                tcg_gen_br(label_end);
+                tcg_temp_free(value);
+            }
+            gen_set_label(label_w4);
+            tcg_gen_brcondi_tl(TCG_COND_NE, width, 4, label_w8);
+            {
+                /* Width is 4 bytes */
+                TCGv value = tcg_temp_new();
+                tcg_gen_mov_tl(value, hex_store_val32[slot_num]);
+                tcg_gen_qemu_st32(value, address, ctx->mem_idx);
+                tcg_gen_br(label_end);
+                tcg_temp_free(value);
+            }
+            gen_set_label(label_w8);
+            {
+                /* Width is 8 bytes */
+                TCGv_i64 value = tcg_temp_new_i64();
+                tcg_gen_mov_i64(value, hex_store_val64[slot_num]);
+                tcg_gen_qemu_st64(value, address, ctx->mem_idx);
+                tcg_gen_br(label_end);
+                tcg_temp_free_i64(value);
+            }
+
+            tcg_temp_free(width);
+        }
+        tcg_temp_free(address);
+    }
+    gen_set_label(label_end);
+
+    tcg_temp_free(cancelled);
+}
+
+static void process_store_log(DisasContext *ctx, packet_t *pkt)
+{
+    /*
+     *  When a packet has two stores, the hardware processes
+     *  slot 1 and then slot 2.  This will be important when
+     *  the memory accesses overlap.
+     */
+    if (pkt->pkt_has_store_s1 && !pkt->pkt_has_dczeroa) {
+        process_store(ctx, 1);
+    }
+    if (pkt->pkt_has_store_s0 && !pkt->pkt_has_dczeroa) {
+        process_store(ctx, 0);
+    }
+}
+
+/* Zero out a 32-bit cache line */
+static void process_dczeroa(DisasContext *ctx, packet_t *pkt)
+{
+    if (pkt->pkt_has_dczeroa) {
+        /* Store 32 bytes of zero starting at (addr & ~0x1f) */
+        TCGv addr = tcg_temp_new();
+        TCGv_i64 zero = tcg_const_i64(0);
+
+        tcg_gen_andi_tl(addr, hex_dczero_addr, ~0x1f);
+        tcg_gen_qemu_st64(zero, addr, ctx->mem_idx);
+        tcg_gen_addi_tl(addr, addr, 8);
+        tcg_gen_qemu_st64(zero, addr, ctx->mem_idx);
+        tcg_gen_addi_tl(addr, addr, 8);
+        tcg_gen_qemu_st64(zero, addr, ctx->mem_idx);
+        tcg_gen_addi_tl(addr, addr, 8);
+        tcg_gen_qemu_st64(zero, addr, ctx->mem_idx);
+
+        tcg_temp_free(addr);
+        tcg_temp_free_i64(zero);
+    }
+}
+
+static bool process_change_of_flow(DisasContext *ctx, packet_t *pkt)
+{
+    if (pkt->pkt_has_cof) {
+        tcg_gen_mov_tl(hex_gpr[HEX_REG_PC], hex_next_PC);
+        return true;
+    }
+    return false;
+}
+
+static void gen_exec_counters(packet_t *pkt)
+{
+    int num_insns = pkt->num_insns;
+    int num_real_insns = 0;
+    int i;
+
+    for (i = 0; i < num_insns; i++) {
+        if (!pkt->insn[i].is_endloop &&
+            !pkt->insn[i].part1 &&
+            !GET_ATTRIB(pkt->insn[i].opcode, A_IT_NOP)) {
+            num_real_insns++;
+        }
+    }
+
+    tcg_gen_addi_tl(hex_gpr[HEX_REG_QEMU_PKT_CNT],
+                    hex_gpr[HEX_REG_QEMU_PKT_CNT], 1);
+    if (num_real_insns) {
+        tcg_gen_addi_tl(hex_gpr[HEX_REG_QEMU_INSN_CNT],
+                        hex_gpr[HEX_REG_QEMU_INSN_CNT], num_real_insns);
+    }
+}
+
+static void gen_commit_packet(DisasContext *ctx, packet_t *pkt)
+{
+    bool end_tb = false;
+
+    gen_reg_writes(ctx);
+    gen_pred_writes(ctx, pkt);
+    process_store_log(ctx, pkt);
+    process_dczeroa(ctx, pkt);
+    end_tb |= process_change_of_flow(ctx, pkt);
+    gen_exec_counters(pkt);
+#if HEX_DEBUG
+    {
+        TCGv has_st0 =
+            tcg_const_tl(pkt->pkt_has_store_s0 && !pkt->pkt_has_dczeroa);
+        TCGv has_st1 =
+            tcg_const_tl(pkt->pkt_has_store_s1 && !pkt->pkt_has_dczeroa);
+
+        /* Handy place to set a breakpoint at the end of execution */
+        gen_helper_debug_commit_end(cpu_env, has_st0, has_st1);
+
+        tcg_temp_free(has_st0);
+        tcg_temp_free(has_st1);
+    }
+#endif
+
+    if (end_tb) {
+        tcg_gen_exit_tb(NULL, 0);
+        ctx->base.is_jmp = DISAS_NORETURN;
+    }
+}
+
+static void decode_and_translate_packet(CPUHexagonState *env, DisasContext *ctx)
+{
+    uint32_t words[PACKET_WORDS_MAX];
+    int nwords;
+    packet_t pkt;
+    int i;
+
+    nwords = read_packet_words(env, ctx, words);
+    if (!nwords) {
+        return;
+    }
+
+    if (decode_this(nwords, words, &pkt)) {
+        HEX_DEBUG_PRINT_PKT(&pkt);
+        gen_start_packet(ctx, &pkt);
+        for (i = 0; i < pkt.num_insns; i++) {
+            gen_insn(env, ctx, &pkt.insn[i], &pkt);
+        }
+        gen_commit_packet(ctx, &pkt);
+        ctx->base.pc_next += pkt.encod_pkt_size_in_bytes;
+    } else {
+        gen_exception(HEX_EXCP_INVALID_PACKET);
+        ctx->base.is_jmp = DISAS_NORETURN;
+    }
+}
+
+static void hexagon_tr_init_disas_context(DisasContextBase *dcbase,
+                                          CPUState *cs)
+{
+    DisasContext *ctx = container_of(dcbase, DisasContext, base);
+
+    ctx->mem_idx = MMU_USER_IDX;
+}
+
+static void hexagon_tr_tb_start(DisasContextBase *db, CPUState *cpu)
+{
+}
+
+static void hexagon_tr_insn_start(DisasContextBase *dcbase, CPUState *cpu)
+{
+    DisasContext *ctx = container_of(dcbase, DisasContext, base);
+
+    tcg_gen_insn_start(ctx->base.pc_next);
+}
+
+static bool hexagon_tr_breakpoint_check(DisasContextBase *dcbase, CPUState *cpu,
+                                        const CPUBreakpoint *bp)
+{
+    DisasContext *ctx = container_of(dcbase, DisasContext, base);
+
+    tcg_gen_movi_tl(hex_gpr[HEX_REG_PC], ctx->base.pc_next);
+    ctx->base.is_jmp = DISAS_NORETURN;
+    gen_exception_debug();
+    /*
+     * The address covered by the breakpoint must be included in
+     * [tb->pc, tb->pc + tb->size) in order to for it to be
+     * properly cleared -- thus we increment the PC here so that
+     * the logic setting tb->size below does the right thing.
+     */
+    ctx->base.pc_next += 4;
+    return true;
+}
+
+
+static void hexagon_tr_translate_packet(DisasContextBase *dcbase, CPUState *cpu)
+{
+    DisasContext *ctx = container_of(dcbase, DisasContext, base);
+    CPUHexagonState *env = cpu->env_ptr;
+
+    decode_and_translate_packet(env, ctx);
+
+    if (ctx->base.is_jmp == DISAS_NEXT) {
+        target_ulong page_start;
+
+        page_start = ctx->base.pc_first & TARGET_PAGE_MASK;
+        if (ctx->base.pc_next - page_start >= TARGET_PAGE_SIZE) {
+            ctx->base.is_jmp = DISAS_TOO_MANY;
+        }
+
+        /*
+         * The CPU log is used to compare against LLDB single stepping,
+         * so end the TLB after every packet.
+         */
+        if (qemu_loglevel_mask(CPU_LOG_TB_CPU)) {
+            ctx->base.is_jmp = DISAS_TOO_MANY;
+        }
+#if HEX_DEBUG
+        /* When debugging, only put one packet per TB */
+        ctx->base.is_jmp = DISAS_TOO_MANY;
+#endif
+    }
+}
+
+static void hexagon_tr_tb_stop(DisasContextBase *dcbase, CPUState *cpu)
+{
+    DisasContext *ctx = container_of(dcbase, DisasContext, base);
+
+    switch (ctx->base.is_jmp) {
+    case DISAS_TOO_MANY:
+        tcg_gen_movi_tl(hex_gpr[HEX_REG_PC], ctx->base.pc_next);
+        if (ctx->base.singlestep_enabled) {
+            gen_exception_debug();
+        } else {
+            tcg_gen_exit_tb(NULL, 0);
+        }
+        break;
+    case DISAS_NORETURN:
+        break;
+    default:
+        g_assert_not_reached();
+    }
+}
+
+static void hexagon_tr_disas_log(const DisasContextBase *dcbase, CPUState *cpu)
+{
+    qemu_log("IN: %s\n", lookup_symbol(dcbase->pc_first));
+    log_target_disas(cpu, dcbase->pc_first, dcbase->tb->size);
+}
+
+
+static const TranslatorOps hexagon_tr_ops = {
+    .init_disas_context = hexagon_tr_init_disas_context,
+    .tb_start           = hexagon_tr_tb_start,
+    .insn_start         = hexagon_tr_insn_start,
+    .breakpoint_check   = hexagon_tr_breakpoint_check,
+    .translate_insn     = hexagon_tr_translate_packet,
+    .tb_stop            = hexagon_tr_tb_stop,
+    .disas_log          = hexagon_tr_disas_log,
+};
+
+void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int max_insns)
+{
+    DisasContext ctx;
+
+    translator_loop(&hexagon_tr_ops, &ctx.base, cs, tb, max_insns);
+}
+
+#define NAME_LEN               64
+static char new_value_names[TOTAL_PER_THREAD_REGS][NAME_LEN];
+#if HEX_DEBUG
+static char reg_written_names[TOTAL_PER_THREAD_REGS][NAME_LEN];
+#endif
+static char new_pred_value_names[NUM_PREGS][NAME_LEN];
+static char store_addr_names[STORES_MAX][NAME_LEN];
+static char store_width_names[STORES_MAX][NAME_LEN];
+static char store_val32_names[STORES_MAX][NAME_LEN];
+static char store_val64_names[STORES_MAX][NAME_LEN];
+
+void hexagon_translate_init(void)
+{
+    int i;
+
+    opcode_init();
+
+    for (i = 0; i < TOTAL_PER_THREAD_REGS; i++) {
+        hex_gpr[i] = tcg_global_mem_new(cpu_env,
+            offsetof(CPUHexagonState, gpr[i]),
+            hexagon_regnames[i]);
+
+        snprintf(new_value_names[i], NAME_LEN, "new_%s", hexagon_regnames[i]);
+        hex_new_value[i] = tcg_global_mem_new(cpu_env,
+            offsetof(CPUHexagonState, new_value[i]),
+            new_value_names[i]);
+
+#if HEX_DEBUG
+        snprintf(reg_written_names[i], NAME_LEN, "reg_written_%s",
+                 hexagon_regnames[i]);
+        hex_reg_written[i] = tcg_global_mem_new(cpu_env,
+            offsetof(CPUHexagonState, reg_written[i]),
+            reg_written_names[i]);
+#endif
+    }
+    for (i = 0; i < NUM_PREGS; i++) {
+        hex_pred[i] = tcg_global_mem_new(cpu_env,
+            offsetof(CPUHexagonState, pred[i]),
+            hexagon_prednames[i]);
+
+        snprintf(new_pred_value_names[i], NAME_LEN, "new_pred_%s",
+                 hexagon_prednames[i]);
+        hex_new_pred_value[i] = tcg_global_mem_new(cpu_env,
+            offsetof(CPUHexagonState, new_pred_value[i]),
+            new_pred_value_names[i]);
+    }
+    hex_pred_written = tcg_global_mem_new(cpu_env,
+        offsetof(CPUHexagonState, pred_written), "pred_written");
+    hex_next_PC = tcg_global_mem_new(cpu_env,
+        offsetof(CPUHexagonState, next_PC), "next_PC");
+    hex_this_PC = tcg_global_mem_new(cpu_env,
+        offsetof(CPUHexagonState, this_PC), "this_PC");
+    hex_slot_cancelled = tcg_global_mem_new(cpu_env,
+        offsetof(CPUHexagonState, slot_cancelled), "slot_cancelled");
+    hex_branch_taken = tcg_global_mem_new(cpu_env,
+        offsetof(CPUHexagonState, branch_taken), "branch_taken");
+    hex_pkt_has_store_s1 = tcg_global_mem_new(cpu_env,
+        offsetof(CPUHexagonState, pkt_has_store_s1), "pkt_has_store_s1");
+    hex_dczero_addr = tcg_global_mem_new(cpu_env,
+        offsetof(CPUHexagonState, dczero_addr), "dczero_addr");
+    hex_llsc_addr = tcg_global_mem_new(cpu_env,
+        offsetof(CPUHexagonState, llsc_addr), "llsc_addr");
+    hex_llsc_val = tcg_global_mem_new(cpu_env,
+        offsetof(CPUHexagonState, llsc_val), "llsc_val");
+    hex_llsc_val_i64 = tcg_global_mem_new_i64(cpu_env,
+        offsetof(CPUHexagonState, llsc_val_i64), "llsc_val_i64");
+    for (i = 0; i < STORES_MAX; i++) {
+        snprintf(store_addr_names[i], NAME_LEN, "store_addr_%d", i);
+        hex_store_addr[i] = tcg_global_mem_new(cpu_env,
+            offsetof(CPUHexagonState, mem_log_stores[i].va),
+            store_addr_names[i]);
+
+        snprintf(store_width_names[i], NAME_LEN, "store_width_%d", i);
+        hex_store_width[i] = tcg_global_mem_new(cpu_env,
+            offsetof(CPUHexagonState, mem_log_stores[i].width),
+            store_width_names[i]);
+
+        snprintf(store_val32_names[i], NAME_LEN, "store_val32_%d", i);
+        hex_store_val32[i] = tcg_global_mem_new(cpu_env,
+            offsetof(CPUHexagonState, mem_log_stores[i].data32),
+            store_val32_names[i]);
+
+        snprintf(store_val64_names[i], NAME_LEN, "store_val64_%d", i);
+        hex_store_val64[i] = tcg_global_mem_new_i64(cpu_env,
+            offsetof(CPUHexagonState, mem_log_stores[i].data64),
+            store_val64_names[i]);
+    }
+
+    init_genptr();
+}