diff mbox series

[v2,20/21] accel/tcg: allow plugin instrumentation to be disabled via cflags

Message ID 20210210221053.18050-21-alex.bennee@linaro.org
State New
Headers show
Series plugins/next pre-PR (hwprofile, regression fixes, icount count fix) | expand

Commit Message

Alex Bennée Feb. 10, 2021, 10:10 p.m. UTC
When icount is enabled and we recompile an MMIO access we end up
double counting the instruction execution. To avoid this we introduce
the CF_NOINSTR cflag which disables instrumentation for the next TB.
As this is part of the hashed compile flags we will only execute the
generated TB while coming out of a cpu_io_recompile.

While we are at it delete the old TODO. We might as well keep the
translation handy as it's likely you will repeatedly hit it on each
MMIO access.

Reported-by: Aaron Lindsay <aaron@os.amperecomputing.com>
Signed-off-by: Alex Bennée <alex.bennee@linaro.org>

Reviewed-by: Richard Henderson <richard.henderson@linaro.org>

Message-Id: <20210209182749.31323-12-alex.bennee@linaro.org>

---
v2
  - squashed CF_HASH_MASK to ~CF_INVALID
---
 include/exec/exec-all.h   |  6 +++---
 accel/tcg/translate-all.c | 17 ++++++++---------
 accel/tcg/translator.c    |  2 +-
 3 files changed, 12 insertions(+), 13 deletions(-)

-- 
2.20.1

Comments

Aaron Lindsay Feb. 12, 2021, 12:53 a.m. UTC | #1
On Feb 10 22:10, Alex Bennée wrote:
> When icount is enabled and we recompile an MMIO access we end up

> double counting the instruction execution. To avoid this we introduce

> the CF_NOINSTR cflag which disables instrumentation for the next TB.

> As this is part of the hashed compile flags we will only execute the

> generated TB while coming out of a cpu_io_recompile.


Unfortunately this patch works a little too well!

With this change, the memory access callbacks registered via
`qemu_plugin_register_vcpu_mem_cb()` are never called for the
re-translated instruction making the IO access, since we've disabled all
instrumentation.

Is it possible to selectively disable only instruction callbacks using
this mechanism, while still allowing others that would not yet have been
called for the re-translated instruction?

-Aaron

> While we are at it delete the old TODO. We might as well keep the

> translation handy as it's likely you will repeatedly hit it on each

> MMIO access.

> 

> Reported-by: Aaron Lindsay <aaron@os.amperecomputing.com>

> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>

> Reviewed-by: Richard Henderson <richard.henderson@linaro.org>

> Message-Id: <20210209182749.31323-12-alex.bennee@linaro.org>

> 

> ---

> v2

>   - squashed CH_HASHMASK to ~CF_INVALID

> ---

>  include/exec/exec-all.h   |  6 +++---

>  accel/tcg/translate-all.c | 17 ++++++++---------

>  accel/tcg/translator.c    |  2 +-

>  3 files changed, 12 insertions(+), 13 deletions(-)

> 

> diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h

> index e08179de34..299282cc59 100644

> --- a/include/exec/exec-all.h

> +++ b/include/exec/exec-all.h

> @@ -454,14 +454,14 @@ struct TranslationBlock {

>      uint32_t cflags;    /* compile flags */

>  #define CF_COUNT_MASK  0x00007fff

>  #define CF_LAST_IO     0x00008000 /* Last insn may be an IO access.  */

> +#define CF_NOINSTR     0x00010000 /* Disable instrumentation of TB */

>  #define CF_USE_ICOUNT  0x00020000

>  #define CF_INVALID     0x00040000 /* TB is stale. Set with @jmp_lock held */

>  #define CF_PARALLEL    0x00080000 /* Generate code for a parallel context */

>  #define CF_CLUSTER_MASK 0xff000000 /* Top 8 bits are cluster ID */

>  #define CF_CLUSTER_SHIFT 24

> -/* cflags' mask for hashing/comparison */

> -#define CF_HASH_MASK   \

> -    (CF_COUNT_MASK | CF_LAST_IO | CF_USE_ICOUNT | CF_PARALLEL | CF_CLUSTER_MASK)

> +/* cflags' mask for hashing/comparison, basically ignore CF_INVALID */

> +#define CF_HASH_MASK   (~CF_INVALID)

>  

>      /* Per-vCPU dynamic tracing state used to generate this TB */

>      uint32_t trace_vcpu_dstate;

> diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c

> index 0666f9ef14..32a3d8fe24 100644

> --- a/accel/tcg/translate-all.c

> +++ b/accel/tcg/translate-all.c

> @@ -2399,7 +2399,8 @@ void tb_check_watchpoint(CPUState *cpu, uintptr_t retaddr)

>  }

>  

>  #ifndef CONFIG_USER_ONLY

> -/* in deterministic execution mode, instructions doing device I/Os

> +/*

> + * In deterministic execution mode, instructions doing device I/Os

>   * must be at the end of the TB.

>   *

>   * Called by softmmu_template.h, with iothread mutex not held.

> @@ -2430,19 +2431,17 @@ void cpu_io_recompile(CPUState *cpu, uintptr_t retaddr)

>          n = 2;

>      }

>  

> -    /* Generate a new TB executing the I/O insn.  */

> -    cpu->cflags_next_tb = curr_cflags() | CF_LAST_IO | n;

> +    /*

> +     * Exit the loop and potentially generate a new TB executing the

> +     * just the I/O insns. We also disable instrumentation so we don't

> +     * double count the instruction.

> +     */

> +    cpu->cflags_next_tb = curr_cflags() | CF_NOINSTR | CF_LAST_IO | n;

>  

>      qemu_log_mask_and_addr(CPU_LOG_EXEC, tb->pc,

>                             "cpu_io_recompile: rewound execution of TB to "

>                             TARGET_FMT_lx "\n", tb->pc);

>  

> -    /* TODO: If env->pc != tb->pc (i.e. the faulting instruction was not

> -     * the first in the TB) then we end up generating a whole new TB and

> -     *  repeating the fault, which is horribly inefficient.

> -     *  Better would be to execute just this insn uncached, or generate a

> -     *  second new TB.

> -     */

>      cpu_loop_exit_noexc(cpu);

>  }

>  

> diff --git a/accel/tcg/translator.c b/accel/tcg/translator.c

> index a49a794065..14d1ea795d 100644

> --- a/accel/tcg/translator.c

> +++ b/accel/tcg/translator.c

> @@ -58,7 +58,7 @@ void translator_loop(const TranslatorOps *ops, DisasContextBase *db,

>      ops->tb_start(db, cpu);

>      tcg_debug_assert(db->is_jmp == DISAS_NEXT);  /* no early exit */

>  

> -    plugin_enabled = plugin_gen_tb_start(cpu, tb);

> +    plugin_enabled = !(tb_cflags(db->tb) & CF_NOINSTR) && plugin_gen_tb_start(cpu, tb);

>  

>      while (true) {

>          db->num_insns++;

> -- 

> 2.20.1

>
Alex Bennée Feb. 12, 2021, 11:22 a.m. UTC | #2
Aaron Lindsay <aaron@os.amperecomputing.com> writes:

> On Feb 10 22:10, Alex Bennée wrote:

>> When icount is enabled and we recompile an MMIO access we end up

>> double counting the instruction execution. To avoid this we introduce

>> the CF_NOINSTR cflag which disables instrumentation for the next TB.

>> As this is part of the hashed compile flags we will only execute the

>> generated TB while coming out of a cpu_io_recompile.

>

> Unfortunately this patch works a little too well!

>

> With this change, the memory access callbacks registered via

> `qemu_plugin_register_vcpu_mem_cb()` are never called for the

> re-translated instruction making the IO access, since we've disabled all

> instrumentation.


Hmm well we correctly don't instrument stores (as we have already
executed the plugin for them) - but of course the load instrumentation
is after the fact so we are now missing them.


> Is it possible to selectively disable only instruction callbacks using

> this mechanism, while still allowing others that would not yet have been

> called for the re-translated instruction?


Hmmm let me see if I can finesse the CF_NOINSTR logic to allow
plugin_gen_insn_end() without the rest? It probably needs a better name
for the flag as well. 

>

> -Aaron

>

>> While we are at it delete the old TODO. We might as well keep the

>> translation handy as it's likely you will repeatedly hit it on each

>> MMIO access.

>> 

>> Reported-by: Aaron Lindsay <aaron@os.amperecomputing.com>

>> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>

>> Reviewed-by: Richard Henderson <richard.henderson@linaro.org>

>> Message-Id: <20210209182749.31323-12-alex.bennee@linaro.org>

>> 

>> ---

>> v2

>>   - squashed CH_HASHMASK to ~CF_INVALID

>> ---

>>  include/exec/exec-all.h   |  6 +++---

>>  accel/tcg/translate-all.c | 17 ++++++++---------

>>  accel/tcg/translator.c    |  2 +-

>>  3 files changed, 12 insertions(+), 13 deletions(-)

>> 

>> diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h

>> index e08179de34..299282cc59 100644

>> --- a/include/exec/exec-all.h

>> +++ b/include/exec/exec-all.h

>> @@ -454,14 +454,14 @@ struct TranslationBlock {

>>      uint32_t cflags;    /* compile flags */

>>  #define CF_COUNT_MASK  0x00007fff

>>  #define CF_LAST_IO     0x00008000 /* Last insn may be an IO access.  */

>> +#define CF_NOINSTR     0x00010000 /* Disable instrumentation of TB */

>>  #define CF_USE_ICOUNT  0x00020000

>>  #define CF_INVALID     0x00040000 /* TB is stale. Set with @jmp_lock held */

>>  #define CF_PARALLEL    0x00080000 /* Generate code for a parallel context */

>>  #define CF_CLUSTER_MASK 0xff000000 /* Top 8 bits are cluster ID */

>>  #define CF_CLUSTER_SHIFT 24

>> -/* cflags' mask for hashing/comparison */

>> -#define CF_HASH_MASK   \

>> -    (CF_COUNT_MASK | CF_LAST_IO | CF_USE_ICOUNT | CF_PARALLEL | CF_CLUSTER_MASK)

>> +/* cflags' mask for hashing/comparison, basically ignore CF_INVALID */

>> +#define CF_HASH_MASK   (~CF_INVALID)

>>  

>>      /* Per-vCPU dynamic tracing state used to generate this TB */

>>      uint32_t trace_vcpu_dstate;

>> diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c

>> index 0666f9ef14..32a3d8fe24 100644

>> --- a/accel/tcg/translate-all.c

>> +++ b/accel/tcg/translate-all.c

>> @@ -2399,7 +2399,8 @@ void tb_check_watchpoint(CPUState *cpu, uintptr_t retaddr)

>>  }

>>  

>>  #ifndef CONFIG_USER_ONLY

>> -/* in deterministic execution mode, instructions doing device I/Os

>> +/*

>> + * In deterministic execution mode, instructions doing device I/Os

>>   * must be at the end of the TB.

>>   *

>>   * Called by softmmu_template.h, with iothread mutex not held.

>> @@ -2430,19 +2431,17 @@ void cpu_io_recompile(CPUState *cpu, uintptr_t retaddr)

>>          n = 2;

>>      }

>>  

>> -    /* Generate a new TB executing the I/O insn.  */

>> -    cpu->cflags_next_tb = curr_cflags() | CF_LAST_IO | n;

>> +    /*

>> +     * Exit the loop and potentially generate a new TB executing the

>> +     * just the I/O insns. We also disable instrumentation so we don't

>> +     * double count the instruction.

>> +     */

>> +    cpu->cflags_next_tb = curr_cflags() | CF_NOINSTR | CF_LAST_IO | n;

>>  

>>      qemu_log_mask_and_addr(CPU_LOG_EXEC, tb->pc,

>>                             "cpu_io_recompile: rewound execution of TB to "

>>                             TARGET_FMT_lx "\n", tb->pc);

>>  

>> -    /* TODO: If env->pc != tb->pc (i.e. the faulting instruction was not

>> -     * the first in the TB) then we end up generating a whole new TB and

>> -     *  repeating the fault, which is horribly inefficient.

>> -     *  Better would be to execute just this insn uncached, or generate a

>> -     *  second new TB.

>> -     */

>>      cpu_loop_exit_noexc(cpu);

>>  }

>>  

>> diff --git a/accel/tcg/translator.c b/accel/tcg/translator.c

>> index a49a794065..14d1ea795d 100644

>> --- a/accel/tcg/translator.c

>> +++ b/accel/tcg/translator.c

>> @@ -58,7 +58,7 @@ void translator_loop(const TranslatorOps *ops, DisasContextBase *db,

>>      ops->tb_start(db, cpu);

>>      tcg_debug_assert(db->is_jmp == DISAS_NEXT);  /* no early exit */

>>  

>> -    plugin_enabled = plugin_gen_tb_start(cpu, tb);

>> +    plugin_enabled = !(tb_cflags(db->tb) & CF_NOINSTR) && plugin_gen_tb_start(cpu, tb);

>>  

>>      while (true) {

>>          db->num_insns++;

>> -- 

>> 2.20.1

>> 



-- 
Alex Bennée
Aaron Lindsay Feb. 12, 2021, 2:31 p.m. UTC | #3
On Feb 12 11:22, Alex Bennée wrote:
> Aaron Lindsay <aaron@os.amperecomputing.com> writes:

> > On Feb 10 22:10, Alex Bennée wrote:

> >> When icount is enabled and we recompile an MMIO access we end up

> >> double counting the instruction execution. To avoid this we introduce

> >> the CF_NOINSTR cflag which disables instrumentation for the next TB.

> >> As this is part of the hashed compile flags we will only execute the

> >> generated TB while coming out of a cpu_io_recompile.

> >

> > Unfortunately this patch works a little too well!

> >

> > With this change, the memory access callbacks registered via

> > `qemu_plugin_register_vcpu_mem_cb()` are never called for the

> > re-translated instruction making the IO access, since we've disabled all

> > instrumentation.

> 

> Hmm well we correctly don't instrument stores (as we have already

> executed the plugin for them) - but of course the load instrumentation

> is after the fact so we are now missing them.


I do not believe I am seeing memory callbacks for stores, either. Are
you saying I definitely should be?

My original observation was that the callbacks for store instructions to
IO followed the same pattern as loads:

1) Initial instruction callback (presumably as part of larger block)
2) Second instruction callback (presumably as part of single-instruction block)
3) Memory callback (presumably as part of single-instruction block)

After applying v2 of your patchset I now see only 1), even for stores.

> > Is it possible to selectively disable only instruction callbacks using

> > this mechanism, while still allowing others that would not yet have been

> > called for the re-translated instruction?

> 

> Hmmm let me see if I can finesse the CF_NOINSTR logic to allow

> plugin_gen_insn_end() without the rest? It probably needs a better name

> for the flag as well. 


Funny, the first time reading through this patch I was unsure for a
second whether "CF_NOINSTR" stood for "NO INSTRuction callbacks" or "NO
INSTRumentation"!

-Aaron
Alex Bennée Feb. 12, 2021, 2:43 p.m. UTC | #4
Aaron Lindsay <aaron@os.amperecomputing.com> writes:

> On Feb 10 22:10, Alex Bennée wrote:

>> When icount is enabled and we recompile an MMIO access we end up

>> double counting the instruction execution. To avoid this we introduce

>> the CF_NOINSTR cflag which disables instrumentation for the next TB.

>> As this is part of the hashed compile flags we will only execute the

>> generated TB while coming out of a cpu_io_recompile.

>

> Unfortunately this patch works a little too well!

>

> With this change, the memory access callbacks registered via

> `qemu_plugin_register_vcpu_mem_cb()` are never called for the

> re-translated instruction making the IO access, since we've disabled all

> instrumentation.

>

> Is it possible to selectively disable only instruction callbacks using

> this mechanism, while still allowing others that would not yet have been

> called for the re-translated instruction?


Can you try the following fugly patch on top of this series:

--8<---------------cut here---------------start------------->8---
diff --git a/include/exec/plugin-gen.h b/include/exec/plugin-gen.h
index 4834a9e2f4..b1b72b5d90 100644
--- a/include/exec/plugin-gen.h
+++ b/include/exec/plugin-gen.h
@@ -19,7 +19,7 @@ struct DisasContextBase;
 
 #ifdef CONFIG_PLUGIN
 
-bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb);
+bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb, bool supress);
 void plugin_gen_tb_end(CPUState *cpu);
 void plugin_gen_insn_start(CPUState *cpu, const struct DisasContextBase *db);
 void plugin_gen_insn_end(void);
@@ -41,7 +41,7 @@ static inline void plugin_insn_append(const void *from, size_t size)
 #else /* !CONFIG_PLUGIN */
 
 static inline
-bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb)
+bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb, bool supress)
 {
     return false;
 }
diff --git a/include/qemu/plugin.h b/include/qemu/plugin.h
index 841deed79c..2a26a2277f 100644
--- a/include/qemu/plugin.h
+++ b/include/qemu/plugin.h
@@ -92,6 +92,7 @@ struct qemu_plugin_dyn_cb {
     };
 };
 
+/* Internal context for instrumenting an instruction */
 struct qemu_plugin_insn {
     GByteArray *data;
     uint64_t vaddr;
@@ -99,6 +100,7 @@ struct qemu_plugin_insn {
     GArray *cbs[PLUGIN_N_CB_TYPES][PLUGIN_N_CB_SUBTYPES];
     bool calls_helpers;
     bool mem_helper;
+    bool store_only;
 };
 
 /*
@@ -128,6 +130,7 @@ static inline struct qemu_plugin_insn *qemu_plugin_insn_alloc(void)
     return insn;
 }
 
+/* Internal context for this TranslationBlock */
 struct qemu_plugin_tb {
     GPtrArray *insns;
     size_t n;
@@ -135,6 +138,7 @@ struct qemu_plugin_tb {
     uint64_t vaddr2;
     void *haddr1;
     void *haddr2;
+    bool store_only;
     GArray *cbs[PLUGIN_N_CB_SUBTYPES];
 };
 
diff --git a/accel/tcg/plugin-gen.c b/accel/tcg/plugin-gen.c
index 8a1bb801e0..137b91282e 100644
--- a/accel/tcg/plugin-gen.c
+++ b/accel/tcg/plugin-gen.c
@@ -842,7 +842,7 @@ static void plugin_gen_inject(const struct qemu_plugin_tb *plugin_tb)
     pr_ops();
 }
 
-bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb)
+bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb, bool store_only)
 {
     struct qemu_plugin_tb *ptb = tcg_ctx->plugin_tb;
     bool ret = false;
@@ -855,6 +855,7 @@ bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb)
         ptb->vaddr2 = -1;
         get_page_addr_code_hostp(cpu->env_ptr, tb->pc, &ptb->haddr1);
         ptb->haddr2 = NULL;
+        ptb->store_only = store_only;
 
         plugin_gen_empty_callback(PLUGIN_GEN_FROM_TB);
     }
diff --git a/accel/tcg/translator.c b/accel/tcg/translator.c
index 14d1ea795d..082f2c8ee1 100644
--- a/accel/tcg/translator.c
+++ b/accel/tcg/translator.c
@@ -58,7 +58,7 @@ void translator_loop(const TranslatorOps *ops, DisasContextBase *db,
     ops->tb_start(db, cpu);
     tcg_debug_assert(db->is_jmp == DISAS_NEXT);  /* no early exit */
 
-    plugin_enabled = !(tb_cflags(db->tb) & CF_NOINSTR) && plugin_gen_tb_start(cpu, tb);
+    plugin_enabled = plugin_gen_tb_start(cpu, tb, tb_cflags(db->tb) & CF_NOINSTR);
 
     while (true) {
         db->num_insns++;
@@ -100,6 +100,8 @@ void translator_loop(const TranslatorOps *ops, DisasContextBase *db,
             gen_io_start();
             ops->translate_insn(db, cpu);
         } else {
+            /* we should only see NOINSTR for io_recompile */
+            g_assert(!(tb_cflags(db->tb) & CF_NOINSTR));
             ops->translate_insn(db, cpu);
         }
 
diff --git a/plugins/api.c b/plugins/api.c
index 5dc8e6f934..ac8475707d 100644
--- a/plugins/api.c
+++ b/plugins/api.c
@@ -84,15 +84,19 @@ void qemu_plugin_register_vcpu_tb_exec_cb(struct qemu_plugin_tb *tb,
                                           enum qemu_plugin_cb_flags flags,
                                           void *udata)
 {
-    plugin_register_dyn_cb__udata(&tb->cbs[PLUGIN_CB_REGULAR],
-                                  cb, flags, udata);
+    if (!tb->store_only) {
+        plugin_register_dyn_cb__udata(&tb->cbs[PLUGIN_CB_REGULAR],
+                                      cb, flags, udata);
+    }
 }
 
 void qemu_plugin_register_vcpu_tb_exec_inline(struct qemu_plugin_tb *tb,
                                               enum qemu_plugin_op op,
                                               void *ptr, uint64_t imm)
 {
-    plugin_register_inline_op(&tb->cbs[PLUGIN_CB_INLINE], 0, op, ptr, imm);
+    if (!tb->store_only) {
+        plugin_register_inline_op(&tb->cbs[PLUGIN_CB_INLINE], 0, op, ptr, imm);
+    }
 }
 
 void qemu_plugin_register_vcpu_insn_exec_cb(struct qemu_plugin_insn *insn,
@@ -100,16 +104,20 @@ void qemu_plugin_register_vcpu_insn_exec_cb(struct qemu_plugin_insn *insn,
                                             enum qemu_plugin_cb_flags flags,
                                             void *udata)
 {
-    plugin_register_dyn_cb__udata(&insn->cbs[PLUGIN_CB_INSN][PLUGIN_CB_REGULAR],
-        cb, flags, udata);
+    if (!insn->store_only) {
+        plugin_register_dyn_cb__udata(&insn->cbs[PLUGIN_CB_INSN][PLUGIN_CB_REGULAR],
+                                      cb, flags, udata);
+    }
 }
 
 void qemu_plugin_register_vcpu_insn_exec_inline(struct qemu_plugin_insn *insn,
                                                 enum qemu_plugin_op op,
                                                 void *ptr, uint64_t imm)
 {
-    plugin_register_inline_op(&insn->cbs[PLUGIN_CB_INSN][PLUGIN_CB_INLINE],
-                              0, op, ptr, imm);
+    if (!insn->store_only) {
+        plugin_register_inline_op(&insn->cbs[PLUGIN_CB_INSN][PLUGIN_CB_INLINE],
+                                  0, op, ptr, imm);
+    }
 }
 
 
@@ -120,8 +128,13 @@ void qemu_plugin_register_vcpu_mem_cb(struct qemu_plugin_insn *insn,
                                       enum qemu_plugin_mem_rw rw,
                                       void *udata)
 {
-    plugin_register_vcpu_mem_cb(&insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_REGULAR],
-                                cb, flags, rw, udata);
+    if (insn->store_only && (rw & QEMU_PLUGIN_MEM_W)) {
+        plugin_register_vcpu_mem_cb(&insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_REGULAR],
+                                    cb, flags, QEMU_PLUGIN_MEM_W, udata);
+    } else {
+        plugin_register_vcpu_mem_cb(&insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_REGULAR],
+                                    cb, flags, rw, udata);
+    }
 }
 
 void qemu_plugin_register_vcpu_mem_inline(struct qemu_plugin_insn *insn,
@@ -129,8 +142,10 @@ void qemu_plugin_register_vcpu_mem_inline(struct qemu_plugin_insn *insn,
                                           enum qemu_plugin_op op, void *ptr,
                                           uint64_t imm)
 {
-    plugin_register_inline_op(&insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_INLINE],
-        rw, op, ptr, imm);
+    if (!insn->store_only) {
+        plugin_register_inline_op(&insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_INLINE],
+                                  rw, op, ptr, imm);
+    }
 }
 
 void qemu_plugin_register_vcpu_tb_trans_cb(qemu_plugin_id_t id,
@@ -181,10 +196,13 @@ uint64_t qemu_plugin_tb_vaddr(const struct qemu_plugin_tb *tb)
 struct qemu_plugin_insn *
 qemu_plugin_tb_get_insn(const struct qemu_plugin_tb *tb, size_t idx)
 {
+    struct qemu_plugin_insn *insn;
     if (unlikely(idx >= tb->n)) {
         return NULL;
     }
-    return g_ptr_array_index(tb->insns, idx);
+    insn = g_ptr_array_index(tb->insns, idx);
+    insn->store_only = tb->store_only;
+    return insn;
 }
 
 /*
--8<---------------cut here---------------end--------------->8---

-- 
Alex Bennée
Alex Bennée Feb. 12, 2021, 2:59 p.m. UTC | #5
Aaron Lindsay <aaron@os.amperecomputing.com> writes:

> On Feb 12 11:22, Alex Bennée wrote:

>> Aaron Lindsay <aaron@os.amperecomputing.com> writes:

>> > On Feb 10 22:10, Alex Bennée wrote:

>> >> When icount is enabled and we recompile an MMIO access we end up

>> >> double counting the instruction execution. To avoid this we introduce

>> >> the CF_NOINSTR cflag which disables instrumentation for the next TB.

>> >> As this is part of the hashed compile flags we will only execute the

>> >> generated TB while coming out of a cpu_io_recompile.

>> >

>> > Unfortunately this patch works a little too well!

>> >

>> > With this change, the memory access callbacks registered via

>> > `qemu_plugin_register_vcpu_mem_cb()` are never called for the

>> > re-translated instruction making the IO access, since we've disabled all

>> > instrumentation.

>> 

>> Hmm well we correctly don't instrument stores (as we have already

>> executed the plugin for them) - but of course the load instrumentation

>> is after the fact so we are now missing them.

>

> I do not believe I am seeing memory callbacks for stores, either. Are

> you saying I definitely should be?

>

> My original observation was that the callbacks for store instructions to

> IO followed the same pattern as loads:

>

> 1) Initial instruction callback (presumably as part of larger block)

> 2) Second instruction callback (presumably as part of single-instruction block)

> 3) Memory callback (presumably as part of single-instruction block)

>

> After applying v2 of your patchset I now see only 1), even for stores.


Right - but any pre-instruction instrumentation shouldn't be done in the
(now badly named CF_NOINSTR) case. It's also confusing because we have
pre and post helpers, and inline callbacks are always pre (you can only
count, so you don't see data).

Can you check the patch in my other email and see if that works better?

>

>> > Is it possible to selectively disable only instruction callbacks using

>> > this mechanism, while still allowing others that would not yet have been

>> > called for the re-translated instruction?

>> 

>> Hmmm let me see if I can finesse the CF_NOINSTR logic to allow

>> plugin_gen_insn_end() without the rest? It probably needs a better name

>> for the flag as well. 

>

> Funny, the first time reading through this patch I was unsure for a

> second whether "CF_NOINSTR" stood for "NO INSTRuction callbacks" or "NO

> INSTRumentation"!

>

> -Aaron



-- 
Alex Bennée
Aaron Lindsay Feb. 12, 2021, 3:41 p.m. UTC | #6
On Feb 12 14:43, Alex Bennée wrote:
> Aaron Lindsay <aaron@os.amperecomputing.com> writes:

> > On Feb 10 22:10, Alex Bennée wrote:

> >> When icount is enabled and we recompile an MMIO access we end up

> >> double counting the instruction execution. To avoid this we introduce

> >> the CF_NOINSTR cflag which disables instrumentation for the next TB.

> >> As this is part of the hashed compile flags we will only execute the

> >> generated TB while coming out of a cpu_io_recompile.

> >

> > Unfortunately this patch works a little too well!

> >

> > With this change, the memory access callbacks registered via

> > `qemu_plugin_register_vcpu_mem_cb()` are never called for the

> > re-translated instruction making the IO access, since we've disabled all

> > instrumentation.

> >

> > Is it possible to selectively disable only instruction callbacks using

> > this mechanism, while still allowing others that would not yet have been

> > called for the re-translated instruction?

> 

> Can you try the following fugly patch on top of this series:


This patch does allow me to successfully observe memory callbacks for
stores in this case. It seems from looking at the patch that you
intentionally only allowed memory callbacks for stores in this case, and
I still don't see any callbacks for loads.

-Aaron

> --8<---------------cut here---------------start------------->8---

> diff --git a/include/exec/plugin-gen.h b/include/exec/plugin-gen.h

> index 4834a9e2f4..b1b72b5d90 100644

> --- a/include/exec/plugin-gen.h

> +++ b/include/exec/plugin-gen.h

> @@ -19,7 +19,7 @@ struct DisasContextBase;

>  

>  #ifdef CONFIG_PLUGIN

>  

> -bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb);

> +bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb, bool supress);

>  void plugin_gen_tb_end(CPUState *cpu);

>  void plugin_gen_insn_start(CPUState *cpu, const struct DisasContextBase *db);

>  void plugin_gen_insn_end(void);

> @@ -41,7 +41,7 @@ static inline void plugin_insn_append(const void *from, size_t size)

>  #else /* !CONFIG_PLUGIN */

>  

>  static inline

> -bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb)

> +bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb, bool supress)

>  {

>      return false;

>  }

> diff --git a/include/qemu/plugin.h b/include/qemu/plugin.h

> index 841deed79c..2a26a2277f 100644

> --- a/include/qemu/plugin.h

> +++ b/include/qemu/plugin.h

> @@ -92,6 +92,7 @@ struct qemu_plugin_dyn_cb {

>      };

>  };

>  

> +/* Internal context for instrumenting an instruction */

>  struct qemu_plugin_insn {

>      GByteArray *data;

>      uint64_t vaddr;

> @@ -99,6 +100,7 @@ struct qemu_plugin_insn {

>      GArray *cbs[PLUGIN_N_CB_TYPES][PLUGIN_N_CB_SUBTYPES];

>      bool calls_helpers;

>      bool mem_helper;

> +    bool store_only;

>  };

>  

>  /*

> @@ -128,6 +130,7 @@ static inline struct qemu_plugin_insn *qemu_plugin_insn_alloc(void)

>      return insn;

>  }

>  

> +/* Internal context for this TranslationBlock */

>  struct qemu_plugin_tb {

>      GPtrArray *insns;

>      size_t n;

> @@ -135,6 +138,7 @@ struct qemu_plugin_tb {

>      uint64_t vaddr2;

>      void *haddr1;

>      void *haddr2;

> +    bool store_only;

>      GArray *cbs[PLUGIN_N_CB_SUBTYPES];

>  };

>  

> diff --git a/accel/tcg/plugin-gen.c b/accel/tcg/plugin-gen.c

> index 8a1bb801e0..137b91282e 100644

> --- a/accel/tcg/plugin-gen.c

> +++ b/accel/tcg/plugin-gen.c

> @@ -842,7 +842,7 @@ static void plugin_gen_inject(const struct qemu_plugin_tb *plugin_tb)

>      pr_ops();

>  }

>  

> -bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb)

> +bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb, bool store_only)

>  {

>      struct qemu_plugin_tb *ptb = tcg_ctx->plugin_tb;

>      bool ret = false;

> @@ -855,6 +855,7 @@ bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb)

>          ptb->vaddr2 = -1;

>          get_page_addr_code_hostp(cpu->env_ptr, tb->pc, &ptb->haddr1);

>          ptb->haddr2 = NULL;

> +        ptb->store_only = store_only;

>  

>          plugin_gen_empty_callback(PLUGIN_GEN_FROM_TB);

>      }

> diff --git a/accel/tcg/translator.c b/accel/tcg/translator.c

> index 14d1ea795d..082f2c8ee1 100644

> --- a/accel/tcg/translator.c

> +++ b/accel/tcg/translator.c

> @@ -58,7 +58,7 @@ void translator_loop(const TranslatorOps *ops, DisasContextBase *db,

>      ops->tb_start(db, cpu);

>      tcg_debug_assert(db->is_jmp == DISAS_NEXT);  /* no early exit */

>  

> -    plugin_enabled = !(tb_cflags(db->tb) & CF_NOINSTR) && plugin_gen_tb_start(cpu, tb);

> +    plugin_enabled = plugin_gen_tb_start(cpu, tb, tb_cflags(db->tb) & CF_NOINSTR);

>  

>      while (true) {

>          db->num_insns++;

> @@ -100,6 +100,8 @@ void translator_loop(const TranslatorOps *ops, DisasContextBase *db,

>              gen_io_start();

>              ops->translate_insn(db, cpu);

>          } else {

> +            /* we should only see NOINSTR for io_recompile */

> +            g_assert(!(tb_cflags(db->tb) & CF_NOINSTR));

>              ops->translate_insn(db, cpu);

>          }

>  

> diff --git a/plugins/api.c b/plugins/api.c

> index 5dc8e6f934..ac8475707d 100644

> --- a/plugins/api.c

> +++ b/plugins/api.c

> @@ -84,15 +84,19 @@ void qemu_plugin_register_vcpu_tb_exec_cb(struct qemu_plugin_tb *tb,

>                                            enum qemu_plugin_cb_flags flags,

>                                            void *udata)

>  {

> -    plugin_register_dyn_cb__udata(&tb->cbs[PLUGIN_CB_REGULAR],

> -                                  cb, flags, udata);

> +    if (!tb->store_only) {

> +        plugin_register_dyn_cb__udata(&tb->cbs[PLUGIN_CB_REGULAR],

> +                                      cb, flags, udata);

> +    }

>  }

>  

>  void qemu_plugin_register_vcpu_tb_exec_inline(struct qemu_plugin_tb *tb,

>                                                enum qemu_plugin_op op,

>                                                void *ptr, uint64_t imm)

>  {

> -    plugin_register_inline_op(&tb->cbs[PLUGIN_CB_INLINE], 0, op, ptr, imm);

> +    if (!tb->store_only) {

> +        plugin_register_inline_op(&tb->cbs[PLUGIN_CB_INLINE], 0, op, ptr, imm);

> +    }

>  }

>  

>  void qemu_plugin_register_vcpu_insn_exec_cb(struct qemu_plugin_insn *insn,

> @@ -100,16 +104,20 @@ void qemu_plugin_register_vcpu_insn_exec_cb(struct qemu_plugin_insn *insn,

>                                              enum qemu_plugin_cb_flags flags,

>                                              void *udata)

>  {

> -    plugin_register_dyn_cb__udata(&insn->cbs[PLUGIN_CB_INSN][PLUGIN_CB_REGULAR],

> -        cb, flags, udata);

> +    if (!insn->store_only) {

> +        plugin_register_dyn_cb__udata(&insn->cbs[PLUGIN_CB_INSN][PLUGIN_CB_REGULAR],

> +                                      cb, flags, udata);

> +    }

>  }

>  

>  void qemu_plugin_register_vcpu_insn_exec_inline(struct qemu_plugin_insn *insn,

>                                                  enum qemu_plugin_op op,

>                                                  void *ptr, uint64_t imm)

>  {

> -    plugin_register_inline_op(&insn->cbs[PLUGIN_CB_INSN][PLUGIN_CB_INLINE],

> -                              0, op, ptr, imm);

> +    if (!insn->store_only) {

> +        plugin_register_inline_op(&insn->cbs[PLUGIN_CB_INSN][PLUGIN_CB_INLINE],

> +                                  0, op, ptr, imm);

> +    }

>  }

>  

>  

> @@ -120,8 +128,13 @@ void qemu_plugin_register_vcpu_mem_cb(struct qemu_plugin_insn *insn,

>                                        enum qemu_plugin_mem_rw rw,

>                                        void *udata)

>  {

> -    plugin_register_vcpu_mem_cb(&insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_REGULAR],

> -                                cb, flags, rw, udata);

> +    if (insn->store_only && (rw & QEMU_PLUGIN_MEM_W)) {

> +        plugin_register_vcpu_mem_cb(&insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_REGULAR],

> +                                    cb, flags, QEMU_PLUGIN_MEM_W, udata);

> +    } else {

> +        plugin_register_vcpu_mem_cb(&insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_REGULAR],

> +                                    cb, flags, rw, udata);

> +    }

>  }

>  

>  void qemu_plugin_register_vcpu_mem_inline(struct qemu_plugin_insn *insn,

> @@ -129,8 +142,10 @@ void qemu_plugin_register_vcpu_mem_inline(struct qemu_plugin_insn *insn,

>                                            enum qemu_plugin_op op, void *ptr,

>                                            uint64_t imm)

>  {

> -    plugin_register_inline_op(&insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_INLINE],

> -        rw, op, ptr, imm);

> +    if (!insn->store_only) {

> +        plugin_register_inline_op(&insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_INLINE],

> +                                  rw, op, ptr, imm);

> +    }

>  }

>  

>  void qemu_plugin_register_vcpu_tb_trans_cb(qemu_plugin_id_t id,

> @@ -181,10 +196,13 @@ uint64_t qemu_plugin_tb_vaddr(const struct qemu_plugin_tb *tb)

>  struct qemu_plugin_insn *

>  qemu_plugin_tb_get_insn(const struct qemu_plugin_tb *tb, size_t idx)

>  {

> +    struct qemu_plugin_insn *insn;

>      if (unlikely(idx >= tb->n)) {

>          return NULL;

>      }

> -    return g_ptr_array_index(tb->insns, idx);

> +    insn = g_ptr_array_index(tb->insns, idx);

> +    insn->store_only = tb->store_only;

> +    return insn;

>  }

>  

>  /*

> --8<---------------cut here---------------end--------------->8---

> 

> -- 

> Alex Bennée
Alex Bennée Feb. 12, 2021, 4 p.m. UTC | #7
Alex Bennée <alex.bennee@linaro.org> writes:

> Aaron Lindsay <aaron@os.amperecomputing.com> writes:

>

>> On Feb 10 22:10, Alex Bennée wrote:

>>> When icount is enabled and we recompile an MMIO access we end up

>>> double counting the instruction execution. To avoid this we introduce

>>> the CF_NOINSTR cflag which disables instrumentation for the next TB.

>>> As this is part of the hashed compile flags we will only execute the

>>> generated TB while coming out of a cpu_io_recompile.

>>

>> Unfortunately this patch works a little too well!

>>

>> With this change, the memory access callbacks registered via

>> `qemu_plugin_register_vcpu_mem_cb()` are never called for the

>> re-translated instruction making the IO access, since we've disabled all

>> instrumentation.

>>

>> Is it possible to selectively disable only instruction callbacks using

>> this mechanism, while still allowing others that would not yet have been

>> called for the re-translated instruction?

>

> Can you try the following fugly patch on top of this series:

>

<snip>
> @@ -120,8 +128,13 @@ void qemu_plugin_register_vcpu_mem_cb(struct qemu_plugin_insn *insn,

>                                        enum qemu_plugin_mem_rw rw,

>                                        void *udata)

>  {

> -    plugin_register_vcpu_mem_cb(&insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_REGULAR],

> -                                cb, flags, rw, udata);

> +    if (insn->store_only && (rw & QEMU_PLUGIN_MEM_W)) {

> +        plugin_register_vcpu_mem_cb(&insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_REGULAR],

> +                                    cb, flags, QEMU_PLUGIN_MEM_W, udata);

> +    } else {

> +        plugin_register_vcpu_mem_cb(&insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_REGULAR],

> +                                    cb, flags, rw, udata);

> +    }

>  }

<snip>

Actually I'm wondering if I've got my sense the wrong way around. Should
it be loads only:

  void qemu_plugin_register_vcpu_mem_cb(struct qemu_plugin_insn *insn,
                                        qemu_plugin_vcpu_mem_cb_t cb,
                                        enum qemu_plugin_cb_flags flags,
                                        enum qemu_plugin_mem_rw rw,
                                        void *udata)
  {
      if (insn->store_only && (rw & QEMU_PLUGIN_MEM_R)) {
          plugin_register_vcpu_mem_cb(&insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_REGULAR],
                                      cb, flags, QEMU_PLUGIN_MEM_R, udata);
      } else {
          plugin_register_vcpu_mem_cb(&insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_REGULAR],
                                      cb, flags, rw, udata);
      }
  }


obviously I'd have to rename the variables :-/

-- 
Alex Bennée
Alex Bennée Feb. 12, 2021, 4:04 p.m. UTC | #8
Do you see two stores or one store? I think I got the sense the wrong
way around because the store is instrumented before the mmu code,
hence should be skipped on a re-instrumented block.

On Fri, 12 Feb 2021 at 15:41, Aaron Lindsay
<aaron@os.amperecomputing.com> wrote:
>

> On Feb 12 14:43, Alex Bennée wrote:

> > Aaron Lindsay <aaron@os.amperecomputing.com> writes:

> > > On Feb 10 22:10, Alex Bennée wrote:

> > >> When icount is enabled and we recompile an MMIO access we end up

> > >> double counting the instruction execution. To avoid this we introduce

> > >> the CF_NOINSTR cflag which disables instrumentation for the next TB.

> > >> As this is part of the hashed compile flags we will only execute the

> > >> generated TB while coming out of a cpu_io_recompile.

> > >

> > > Unfortunately this patch works a little too well!

> > >

> > > With this change, the memory access callbacks registered via

> > > `qemu_plugin_register_vcpu_mem_cb()` are never called for the

> > > re-translated instruction making the IO access, since we've disabled all

> > > instrumentation.

> > >

> > > Is it possible to selectively disable only instruction callbacks using

> > > this mechanism, while still allowing others that would not yet have been

> > > called for the re-translated instruction?

> >

> > Can you try the following fugly patch on top of this series:

>

> This patch does allow me to successfully observe memory callbacks for

> stores in this case. It seems from looking at the patch that you

> intentionally only allowed memory callbacks for stores in this case, and

> I still don't see callbacks any for loads.

>

> -Aaron

>

> > --8<---------------cut here---------------start------------->8---

> > diff --git a/include/exec/plugin-gen.h b/include/exec/plugin-gen.h

> > index 4834a9e2f4..b1b72b5d90 100644

> > --- a/include/exec/plugin-gen.h

> > +++ b/include/exec/plugin-gen.h

> > @@ -19,7 +19,7 @@ struct DisasContextBase;

> >

> >  #ifdef CONFIG_PLUGIN

> >

> > -bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb);

> > +bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb, bool supress);

> >  void plugin_gen_tb_end(CPUState *cpu);

> >  void plugin_gen_insn_start(CPUState *cpu, const struct DisasContextBase *db);

> >  void plugin_gen_insn_end(void);

> > @@ -41,7 +41,7 @@ static inline void plugin_insn_append(const void *from, size_t size)

> >  #else /* !CONFIG_PLUGIN */

> >

> >  static inline

> > -bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb)

> > +bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb, bool supress)

> >  {

> >      return false;

> >  }

> > diff --git a/include/qemu/plugin.h b/include/qemu/plugin.h

> > index 841deed79c..2a26a2277f 100644

> > --- a/include/qemu/plugin.h

> > +++ b/include/qemu/plugin.h

> > @@ -92,6 +92,7 @@ struct qemu_plugin_dyn_cb {

> >      };

> >  };

> >

> > +/* Internal context for instrumenting an instruction */

> >  struct qemu_plugin_insn {

> >      GByteArray *data;

> >      uint64_t vaddr;

> > @@ -99,6 +100,7 @@ struct qemu_plugin_insn {

> >      GArray *cbs[PLUGIN_N_CB_TYPES][PLUGIN_N_CB_SUBTYPES];

> >      bool calls_helpers;

> >      bool mem_helper;

> > +    bool store_only;

> >  };

> >

> >  /*

> > @@ -128,6 +130,7 @@ static inline struct qemu_plugin_insn *qemu_plugin_insn_alloc(void)

> >      return insn;

> >  }

> >

> > +/* Internal context for this TranslationBlock */

> >  struct qemu_plugin_tb {

> >      GPtrArray *insns;

> >      size_t n;

> > @@ -135,6 +138,7 @@ struct qemu_plugin_tb {

> >      uint64_t vaddr2;

> >      void *haddr1;

> >      void *haddr2;

> > +    bool store_only;

> >      GArray *cbs[PLUGIN_N_CB_SUBTYPES];

> >  };

> >

> > diff --git a/accel/tcg/plugin-gen.c b/accel/tcg/plugin-gen.c

> > index 8a1bb801e0..137b91282e 100644

> > --- a/accel/tcg/plugin-gen.c

> > +++ b/accel/tcg/plugin-gen.c

> > @@ -842,7 +842,7 @@ static void plugin_gen_inject(const struct qemu_plugin_tb *plugin_tb)

> >      pr_ops();

> >  }

> >

> > -bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb)

> > +bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb, bool store_only)

> >  {

> >      struct qemu_plugin_tb *ptb = tcg_ctx->plugin_tb;

> >      bool ret = false;

> > @@ -855,6 +855,7 @@ bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb)

> >          ptb->vaddr2 = -1;

> >          get_page_addr_code_hostp(cpu->env_ptr, tb->pc, &ptb->haddr1);

> >          ptb->haddr2 = NULL;

> > +        ptb->store_only = store_only;

> >

> >          plugin_gen_empty_callback(PLUGIN_GEN_FROM_TB);

> >      }

> > diff --git a/accel/tcg/translator.c b/accel/tcg/translator.c

> > index 14d1ea795d..082f2c8ee1 100644

> > --- a/accel/tcg/translator.c

> > +++ b/accel/tcg/translator.c

> > @@ -58,7 +58,7 @@ void translator_loop(const TranslatorOps *ops, DisasContextBase *db,

> >      ops->tb_start(db, cpu);

> >      tcg_debug_assert(db->is_jmp == DISAS_NEXT);  /* no early exit */

> >

> > -    plugin_enabled = !(tb_cflags(db->tb) & CF_NOINSTR) && plugin_gen_tb_start(cpu, tb);

> > +    plugin_enabled = plugin_gen_tb_start(cpu, tb, tb_cflags(db->tb) & CF_NOINSTR);

> >

> >      while (true) {

> >          db->num_insns++;

> > @@ -100,6 +100,8 @@ void translator_loop(const TranslatorOps *ops, DisasContextBase *db,

> >              gen_io_start();

> >              ops->translate_insn(db, cpu);

> >          } else {

> > +            /* we should only see NOINSTR for io_recompile */

> > +            g_assert(!(tb_cflags(db->tb) & CF_NOINSTR));

> >              ops->translate_insn(db, cpu);

> >          }

> >

> > diff --git a/plugins/api.c b/plugins/api.c

> > index 5dc8e6f934..ac8475707d 100644

> > --- a/plugins/api.c

> > +++ b/plugins/api.c

> > @@ -84,15 +84,19 @@ void qemu_plugin_register_vcpu_tb_exec_cb(struct qemu_plugin_tb *tb,

> >                                            enum qemu_plugin_cb_flags flags,

> >                                            void *udata)

> >  {

> > -    plugin_register_dyn_cb__udata(&tb->cbs[PLUGIN_CB_REGULAR],

> > -                                  cb, flags, udata);

> > +    if (!tb->store_only) {

> > +        plugin_register_dyn_cb__udata(&tb->cbs[PLUGIN_CB_REGULAR],

> > +                                      cb, flags, udata);

> > +    }

> >  }

> >

> >  void qemu_plugin_register_vcpu_tb_exec_inline(struct qemu_plugin_tb *tb,

> >                                                enum qemu_plugin_op op,

> >                                                void *ptr, uint64_t imm)

> >  {

> > -    plugin_register_inline_op(&tb->cbs[PLUGIN_CB_INLINE], 0, op, ptr, imm);

> > +    if (!tb->store_only) {

> > +        plugin_register_inline_op(&tb->cbs[PLUGIN_CB_INLINE], 0, op, ptr, imm);

> > +    }

> >  }

> >

> >  void qemu_plugin_register_vcpu_insn_exec_cb(struct qemu_plugin_insn *insn,

> > @@ -100,16 +104,20 @@ void qemu_plugin_register_vcpu_insn_exec_cb(struct qemu_plugin_insn *insn,

> >                                              enum qemu_plugin_cb_flags flags,

> >                                              void *udata)

> >  {

> > -    plugin_register_dyn_cb__udata(&insn->cbs[PLUGIN_CB_INSN][PLUGIN_CB_REGULAR],

> > -        cb, flags, udata);

> > +    if (!insn->store_only) {

> > +        plugin_register_dyn_cb__udata(&insn->cbs[PLUGIN_CB_INSN][PLUGIN_CB_REGULAR],

> > +                                      cb, flags, udata);

> > +    }

> >  }

> >

> >  void qemu_plugin_register_vcpu_insn_exec_inline(struct qemu_plugin_insn *insn,

> >                                                  enum qemu_plugin_op op,

> >                                                  void *ptr, uint64_t imm)

> >  {

> > -    plugin_register_inline_op(&insn->cbs[PLUGIN_CB_INSN][PLUGIN_CB_INLINE],

> > -                              0, op, ptr, imm);

> > +    if (!insn->store_only) {

> > +        plugin_register_inline_op(&insn->cbs[PLUGIN_CB_INSN][PLUGIN_CB_INLINE],

> > +                                  0, op, ptr, imm);

> > +    }

> >  }

> >

> >

> > @@ -120,8 +128,13 @@ void qemu_plugin_register_vcpu_mem_cb(struct qemu_plugin_insn *insn,

> >                                        enum qemu_plugin_mem_rw rw,

> >                                        void *udata)

> >  {

> > -    plugin_register_vcpu_mem_cb(&insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_REGULAR],

> > -                                cb, flags, rw, udata);

> > +    if (insn->store_only && (rw & QEMU_PLUGIN_MEM_W)) {

> > +        plugin_register_vcpu_mem_cb(&insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_REGULAR],

> > +                                    cb, flags, QEMU_PLUGIN_MEM_W, udata);

> > +    } else {

> > +        plugin_register_vcpu_mem_cb(&insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_REGULAR],

> > +                                    cb, flags, rw, udata);

> > +    }

> >  }

> >

> >  void qemu_plugin_register_vcpu_mem_inline(struct qemu_plugin_insn *insn,

> > @@ -129,8 +142,10 @@ void qemu_plugin_register_vcpu_mem_inline(struct qemu_plugin_insn *insn,

> >                                            enum qemu_plugin_op op, void *ptr,

> >                                            uint64_t imm)

> >  {

> > -    plugin_register_inline_op(&insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_INLINE],

> > -        rw, op, ptr, imm);

> > +    if (!insn->store_only) {

> > +        plugin_register_inline_op(&insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_INLINE],

> > +                                  rw, op, ptr, imm);

> > +    }

> >  }

> >

> >  void qemu_plugin_register_vcpu_tb_trans_cb(qemu_plugin_id_t id,

> > @@ -181,10 +196,13 @@ uint64_t qemu_plugin_tb_vaddr(const struct qemu_plugin_tb *tb)

> >  struct qemu_plugin_insn *

> >  qemu_plugin_tb_get_insn(const struct qemu_plugin_tb *tb, size_t idx)

> >  {

> > +    struct qemu_plugin_insn *insn;

> >      if (unlikely(idx >= tb->n)) {

> >          return NULL;

> >      }

> > -    return g_ptr_array_index(tb->insns, idx);

> > +    insn = g_ptr_array_index(tb->insns, idx);

> > +    insn->store_only = tb->store_only;

> > +    return insn;

> >  }

> >

> >  /*

> > --8<---------------cut here---------------end--------------->8---

> >

> > --

> > Alex Bennée




-- 
Alex Bennée
KVM/QEMU Hacker for Linaro
Aaron Lindsay Feb. 12, 2021, 4:50 p.m. UTC | #9
On Feb 12 16:04, Alex Bennée wrote:
> Do you see two stores or one store? I think I got the sense the wrong

> way around because the store is instrumented before the mmu code,

> hence should be skipped on a re-instrumented block.


I only see one store between the instruction callback for the store and
the instruction callback for the subsequent instruction.

-Aaron

> On Fri, 12 Feb 2021 at 15:41, Aaron Lindsay

> <aaron@os.amperecomputing.com> wrote:

> >

> > On Feb 12 14:43, Alex Bennée wrote:

> > > Aaron Lindsay <aaron@os.amperecomputing.com> writes:

> > > > On Feb 10 22:10, Alex Bennée wrote:

> > > >> When icount is enabled and we recompile an MMIO access we end up

> > > >> double counting the instruction execution. To avoid this we introduce

> > > >> the CF_NOINSTR cflag which disables instrumentation for the next TB.

> > > >> As this is part of the hashed compile flags we will only execute the

> > > >> generated TB while coming out of a cpu_io_recompile.

> > > >

> > > > Unfortunately this patch works a little too well!

> > > >

> > > > With this change, the memory access callbacks registered via

> > > > `qemu_plugin_register_vcpu_mem_cb()` are never called for the

> > > > re-translated instruction making the IO access, since we've disabled all

> > > > instrumentation.

> > > >

> > > > Is it possible to selectively disable only instruction callbacks using

> > > > this mechanism, while still allowing others that would not yet have been

> > > > called for the re-translated instruction?

> > >

> > > Can you try the following fugly patch on top of this series:

> >

> > This patch does allow me to successfully observe memory callbacks for

> > stores in this case. It seems from looking at the patch that you

> > intentionally only allowed memory callbacks for stores in this case, and

> > I still don't see callbacks any for loads.

> >

> > -Aaron

> >

> > > --8<---------------cut here---------------start------------->8---

> > > diff --git a/include/exec/plugin-gen.h b/include/exec/plugin-gen.h

> > > index 4834a9e2f4..b1b72b5d90 100644

> > > --- a/include/exec/plugin-gen.h

> > > +++ b/include/exec/plugin-gen.h

> > > @@ -19,7 +19,7 @@ struct DisasContextBase;

> > >

> > >  #ifdef CONFIG_PLUGIN

> > >

> > > -bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb);

> > > +bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb, bool supress);

> > >  void plugin_gen_tb_end(CPUState *cpu);

> > >  void plugin_gen_insn_start(CPUState *cpu, const struct DisasContextBase *db);

> > >  void plugin_gen_insn_end(void);

> > > @@ -41,7 +41,7 @@ static inline void plugin_insn_append(const void *from, size_t size)

> > >  #else /* !CONFIG_PLUGIN */

> > >

> > >  static inline

> > > -bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb)

> > > +bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb, bool supress)

> > >  {

> > >      return false;

> > >  }

> > > diff --git a/include/qemu/plugin.h b/include/qemu/plugin.h

> > > index 841deed79c..2a26a2277f 100644

> > > --- a/include/qemu/plugin.h

> > > +++ b/include/qemu/plugin.h

> > > @@ -92,6 +92,7 @@ struct qemu_plugin_dyn_cb {

> > >      };

> > >  };

> > >

> > > +/* Internal context for instrumenting an instruction */

> > >  struct qemu_plugin_insn {

> > >      GByteArray *data;

> > >      uint64_t vaddr;

> > > @@ -99,6 +100,7 @@ struct qemu_plugin_insn {

> > >      GArray *cbs[PLUGIN_N_CB_TYPES][PLUGIN_N_CB_SUBTYPES];

> > >      bool calls_helpers;

> > >      bool mem_helper;

> > > +    bool store_only;

> > >  };

> > >

> > >  /*

> > > @@ -128,6 +130,7 @@ static inline struct qemu_plugin_insn *qemu_plugin_insn_alloc(void)

> > >      return insn;

> > >  }

> > >

> > > +/* Internal context for this TranslationBlock */

> > >  struct qemu_plugin_tb {

> > >      GPtrArray *insns;

> > >      size_t n;

> > > @@ -135,6 +138,7 @@ struct qemu_plugin_tb {

> > >      uint64_t vaddr2;

> > >      void *haddr1;

> > >      void *haddr2;

> > > +    bool store_only;

> > >      GArray *cbs[PLUGIN_N_CB_SUBTYPES];

> > >  };

> > >

> > > diff --git a/accel/tcg/plugin-gen.c b/accel/tcg/plugin-gen.c

> > > index 8a1bb801e0..137b91282e 100644

> > > --- a/accel/tcg/plugin-gen.c

> > > +++ b/accel/tcg/plugin-gen.c

> > > @@ -842,7 +842,7 @@ static void plugin_gen_inject(const struct qemu_plugin_tb *plugin_tb)

> > >      pr_ops();

> > >  }

> > >

> > > -bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb)

> > > +bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb, bool store_only)

> > >  {

> > >      struct qemu_plugin_tb *ptb = tcg_ctx->plugin_tb;

> > >      bool ret = false;

> > > @@ -855,6 +855,7 @@ bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb)

> > >          ptb->vaddr2 = -1;

> > >          get_page_addr_code_hostp(cpu->env_ptr, tb->pc, &ptb->haddr1);

> > >          ptb->haddr2 = NULL;

> > > +        ptb->store_only = store_only;

> > >

> > >          plugin_gen_empty_callback(PLUGIN_GEN_FROM_TB);

> > >      }

> > > diff --git a/accel/tcg/translator.c b/accel/tcg/translator.c

> > > index 14d1ea795d..082f2c8ee1 100644

> > > --- a/accel/tcg/translator.c

> > > +++ b/accel/tcg/translator.c

> > > @@ -58,7 +58,7 @@ void translator_loop(const TranslatorOps *ops, DisasContextBase *db,

> > >      ops->tb_start(db, cpu);

> > >      tcg_debug_assert(db->is_jmp == DISAS_NEXT);  /* no early exit */

> > >

> > > -    plugin_enabled = !(tb_cflags(db->tb) & CF_NOINSTR) && plugin_gen_tb_start(cpu, tb);

> > > +    plugin_enabled = plugin_gen_tb_start(cpu, tb, tb_cflags(db->tb) & CF_NOINSTR);

> > >

> > >      while (true) {

> > >          db->num_insns++;

> > > @@ -100,6 +100,8 @@ void translator_loop(const TranslatorOps *ops, DisasContextBase *db,

> > >              gen_io_start();

> > >              ops->translate_insn(db, cpu);

> > >          } else {

> > > +            /* we should only see NOINSTR for io_recompile */

> > > +            g_assert(!(tb_cflags(db->tb) & CF_NOINSTR));

> > >              ops->translate_insn(db, cpu);

> > >          }

> > >

> > > diff --git a/plugins/api.c b/plugins/api.c

> > > index 5dc8e6f934..ac8475707d 100644

> > > --- a/plugins/api.c

> > > +++ b/plugins/api.c

> > > @@ -84,15 +84,19 @@ void qemu_plugin_register_vcpu_tb_exec_cb(struct qemu_plugin_tb *tb,

> > >                                            enum qemu_plugin_cb_flags flags,

> > >                                            void *udata)

> > >  {

> > > -    plugin_register_dyn_cb__udata(&tb->cbs[PLUGIN_CB_REGULAR],

> > > -                                  cb, flags, udata);

> > > +    if (!tb->store_only) {

> > > +        plugin_register_dyn_cb__udata(&tb->cbs[PLUGIN_CB_REGULAR],

> > > +                                      cb, flags, udata);

> > > +    }

> > >  }

> > >

> > >  void qemu_plugin_register_vcpu_tb_exec_inline(struct qemu_plugin_tb *tb,

> > >                                                enum qemu_plugin_op op,

> > >                                                void *ptr, uint64_t imm)

> > >  {

> > > -    plugin_register_inline_op(&tb->cbs[PLUGIN_CB_INLINE], 0, op, ptr, imm);

> > > +    if (!tb->store_only) {

> > > +        plugin_register_inline_op(&tb->cbs[PLUGIN_CB_INLINE], 0, op, ptr, imm);

> > > +    }

> > >  }

> > >

> > >  void qemu_plugin_register_vcpu_insn_exec_cb(struct qemu_plugin_insn *insn,

> > > @@ -100,16 +104,20 @@ void qemu_plugin_register_vcpu_insn_exec_cb(struct qemu_plugin_insn *insn,

> > >                                              enum qemu_plugin_cb_flags flags,

> > >                                              void *udata)

> > >  {

> > > -    plugin_register_dyn_cb__udata(&insn->cbs[PLUGIN_CB_INSN][PLUGIN_CB_REGULAR],

> > > -        cb, flags, udata);

> > > +    if (!insn->store_only) {

> > > +        plugin_register_dyn_cb__udata(&insn->cbs[PLUGIN_CB_INSN][PLUGIN_CB_REGULAR],

> > > +                                      cb, flags, udata);

> > > +    }

> > >  }

> > >

> > >  void qemu_plugin_register_vcpu_insn_exec_inline(struct qemu_plugin_insn *insn,

> > >                                                  enum qemu_plugin_op op,

> > >                                                  void *ptr, uint64_t imm)

> > >  {

> > > -    plugin_register_inline_op(&insn->cbs[PLUGIN_CB_INSN][PLUGIN_CB_INLINE],

> > > -                              0, op, ptr, imm);

> > > +    if (!insn->store_only) {

> > > +        plugin_register_inline_op(&insn->cbs[PLUGIN_CB_INSN][PLUGIN_CB_INLINE],

> > > +                                  0, op, ptr, imm);

> > > +    }

> > >  }

> > >

> > >

> > > @@ -120,8 +128,13 @@ void qemu_plugin_register_vcpu_mem_cb(struct qemu_plugin_insn *insn,

> > >                                        enum qemu_plugin_mem_rw rw,

> > >                                        void *udata)

> > >  {

> > > -    plugin_register_vcpu_mem_cb(&insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_REGULAR],

> > > -                                cb, flags, rw, udata);

> > > +    if (insn->store_only && (rw & QEMU_PLUGIN_MEM_W)) {

> > > +        plugin_register_vcpu_mem_cb(&insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_REGULAR],

> > > +                                    cb, flags, QEMU_PLUGIN_MEM_W, udata);

> > > +    } else {

> > > +        plugin_register_vcpu_mem_cb(&insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_REGULAR],

> > > +                                    cb, flags, rw, udata);

> > > +    }

> > >  }

> > >

> > >  void qemu_plugin_register_vcpu_mem_inline(struct qemu_plugin_insn *insn,

> > > @@ -129,8 +142,10 @@ void qemu_plugin_register_vcpu_mem_inline(struct qemu_plugin_insn *insn,

> > >                                            enum qemu_plugin_op op, void *ptr,

> > >                                            uint64_t imm)

> > >  {

> > > -    plugin_register_inline_op(&insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_INLINE],

> > > -        rw, op, ptr, imm);

> > > +    if (!insn->store_only) {

> > > +        plugin_register_inline_op(&insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_INLINE],

> > > +                                  rw, op, ptr, imm);

> > > +    }

> > >  }

> > >

> > >  void qemu_plugin_register_vcpu_tb_trans_cb(qemu_plugin_id_t id,

> > > @@ -181,10 +196,13 @@ uint64_t qemu_plugin_tb_vaddr(const struct qemu_plugin_tb *tb)

> > >  struct qemu_plugin_insn *

> > >  qemu_plugin_tb_get_insn(const struct qemu_plugin_tb *tb, size_t idx)

> > >  {

> > > +    struct qemu_plugin_insn *insn;

> > >      if (unlikely(idx >= tb->n)) {

> > >          return NULL;

> > >      }

> > > -    return g_ptr_array_index(tb->insns, idx);

> > > +    insn = g_ptr_array_index(tb->insns, idx);

> > > +    insn->store_only = tb->store_only;

> > > +    return insn;

> > >  }

> > >

> > >  /*

> > > --8<---------------cut here---------------end--------------->8---

> > >

> > > --

> > > Alex Bennée

> 

> 

> 

> -- 

> Alex Bennée

> KVM/QEMU Hacker for Linaro
Aaron Lindsay Feb. 12, 2021, 5:04 p.m. UTC | #10
On Feb 12 16:00, Alex Bennée wrote:
> 

> Alex Bennée <alex.bennee@linaro.org> writes:

> 

> > Aaron Lindsay <aaron@os.amperecomputing.com> writes:

> >

> >> On Feb 10 22:10, Alex Bennée wrote:

> >>> When icount is enabled and we recompile an MMIO access we end up

> >>> double counting the instruction execution. To avoid this we introduce

> >>> the CF_NOINSTR cflag which disables instrumentation for the next TB.

> >>> As this is part of the hashed compile flags we will only execute the

> >>> generated TB while coming out of a cpu_io_recompile.

> >>

> >> Unfortunately this patch works a little too well!

> >>

> >> With this change, the memory access callbacks registered via

> >> `qemu_plugin_register_vcpu_mem_cb()` are never called for the

> >> re-translated instruction making the IO access, since we've disabled all

> >> instrumentation.

> >>

> >> Is it possible to selectively disable only instruction callbacks using

> >> this mechanism, while still allowing others that would not yet have been

> >> called for the re-translated instruction?

> >

> > Can you try the following fugly patch on top of this series:

> >

> <snip>

> > @@ -120,8 +128,13 @@ void qemu_plugin_register_vcpu_mem_cb(struct qemu_plugin_insn *insn,

> >                                        enum qemu_plugin_mem_rw rw,

> >                                        void *udata)

> >  {

> > -    plugin_register_vcpu_mem_cb(&insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_REGULAR],

> > -                                cb, flags, rw, udata);

> > +    if (insn->store_only && (rw & QEMU_PLUGIN_MEM_W)) {

> > +        plugin_register_vcpu_mem_cb(&insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_REGULAR],

> > +                                    cb, flags, QEMU_PLUGIN_MEM_W, udata);

> > +    } else {

> > +        plugin_register_vcpu_mem_cb(&insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_REGULAR],

> > +                                    cb, flags, rw, udata);

> > +    }

> >  }

> <snip>

> 

> Actually I'm wondering if I've got my sense the wrong way around. Should

> it be loads only:

> 

>   void qemu_plugin_register_vcpu_mem_cb(struct qemu_plugin_insn *insn,

>                                         qemu_plugin_vcpu_mem_cb_t cb,

>                                         enum qemu_plugin_cb_flags flags,

>                                         enum qemu_plugin_mem_rw rw,

>                                         void *udata)

>   {

>       if (insn->store_only && (rw & QEMU_PLUGIN_MEM_R)) {

>           plugin_register_vcpu_mem_cb(&insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_REGULAR],

>                                       cb, flags, QEMU_PLUGIN_MEM_R, udata);

>       } else {

>           plugin_register_vcpu_mem_cb(&insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_REGULAR],

>                                       cb, flags, rw, udata);

>       }

>   }

> 

> obviously I'd have to rename the variables :-/


This gets me only loads and no stores. I've modified it to be just:

/*
 * Register a memory-access callback for a single instruction.
 *
 * This variant has no store_only special-casing: cb, flags, rw and udata
 * are forwarded unchanged to plugin_register_vcpu_mem_cb(), targeting the
 * instruction's PLUGIN_CB_MEM / PLUGIN_CB_REGULAR callback list.
 */
void qemu_plugin_register_vcpu_mem_cb(struct qemu_plugin_insn *insn,
                                      qemu_plugin_vcpu_mem_cb_t cb,
                                      enum qemu_plugin_cb_flags flags,
                                      enum qemu_plugin_mem_rw rw,
                                      void *udata)
{
    plugin_register_vcpu_mem_cb(&insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_REGULAR],
                                cb, flags, rw, udata);
}

And that appears to get me one memory callback for both loads and stores.

-Aaron
Alex Bennée Feb. 12, 2021, 5:19 p.m. UTC | #11
Aaron Lindsay <aaron@os.amperecomputing.com> writes:

> On Feb 12 16:04, Alex Bennée wrote:

>> Do you see two stores or one store? I think I got the sense the wrong

>> way around because the store is instrumented before the mmu code,

>> hence should be skipped on a re-instrumented block.

>

> I only see one store between the instruction callback for the store and

> the instruction callback for the subsequent instruction.


OK - having looked more closely and reminded myself what's going on, I
think the difference is memory callbacks versus memory inline ops. All
inline calls happen before the actual instructions. The callbacks have a
pre and post memory helper, where the actual callback comes after the
operation. Those are what we want to preserve.

Let me re-spin the patch and see if I can add a test case to compare the
counts between inline and cb (which should be the same with
deterministic icount).

>

> -Aaron

>

>> On Fri, 12 Feb 2021 at 15:41, Aaron Lindsay

>> <aaron@os.amperecomputing.com> wrote:

>> >

>> > On Feb 12 14:43, Alex Bennée wrote:

>> > > Aaron Lindsay <aaron@os.amperecomputing.com> writes:

>> > > > On Feb 10 22:10, Alex Bennée wrote:

>> > > >> When icount is enabled and we recompile an MMIO access we end up

>> > > >> double counting the instruction execution. To avoid this we introduce

>> > > >> the CF_NOINSTR cflag which disables instrumentation for the next TB.

>> > > >> As this is part of the hashed compile flags we will only execute the

>> > > >> generated TB while coming out of a cpu_io_recompile.

>> > > >

>> > > > Unfortunately this patch works a little too well!

>> > > >

>> > > > With this change, the memory access callbacks registered via

>> > > > `qemu_plugin_register_vcpu_mem_cb()` are never called for the

>> > > > re-translated instruction making the IO access, since we've disabled all

>> > > > instrumentation.

>> > > >

>> > > > Is it possible to selectively disable only instruction callbacks using

>> > > > this mechanism, while still allowing others that would not yet have been

>> > > > called for the re-translated instruction?

>> > >

>> > > Can you try the following fugly patch on top of this series:

>> >

>> > This patch does allow me to successfully observe memory callbacks for

>> > stores in this case. It seems from looking at the patch that you

>> > intentionally only allowed memory callbacks for stores in this case, and

>> > I still don't see any callbacks for loads.

>> >

>> > -Aaron

>> >

>> > > --8<---------------cut here---------------start------------->8---

>> > > diff --git a/include/exec/plugin-gen.h b/include/exec/plugin-gen.h

>> > > index 4834a9e2f4..b1b72b5d90 100644

>> > > --- a/include/exec/plugin-gen.h

>> > > +++ b/include/exec/plugin-gen.h

>> > > @@ -19,7 +19,7 @@ struct DisasContextBase;

>> > >

>> > >  #ifdef CONFIG_PLUGIN

>> > >

>> > > -bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb);

>> > > +bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb, bool supress);

>> > >  void plugin_gen_tb_end(CPUState *cpu);

>> > >  void plugin_gen_insn_start(CPUState *cpu, const struct DisasContextBase *db);

>> > >  void plugin_gen_insn_end(void);

>> > > @@ -41,7 +41,7 @@ static inline void plugin_insn_append(const void *from, size_t size)

>> > >  #else /* !CONFIG_PLUGIN */

>> > >

>> > >  static inline

>> > > -bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb)

>> > > +bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb, bool supress)

>> > >  {

>> > >      return false;

>> > >  }

>> > > diff --git a/include/qemu/plugin.h b/include/qemu/plugin.h

>> > > index 841deed79c..2a26a2277f 100644

>> > > --- a/include/qemu/plugin.h

>> > > +++ b/include/qemu/plugin.h

>> > > @@ -92,6 +92,7 @@ struct qemu_plugin_dyn_cb {

>> > >      };

>> > >  };

>> > >

>> > > +/* Internal context for instrumenting an instruction */

>> > >  struct qemu_plugin_insn {

>> > >      GByteArray *data;

>> > >      uint64_t vaddr;

>> > > @@ -99,6 +100,7 @@ struct qemu_plugin_insn {

>> > >      GArray *cbs[PLUGIN_N_CB_TYPES][PLUGIN_N_CB_SUBTYPES];

>> > >      bool calls_helpers;

>> > >      bool mem_helper;

>> > > +    bool store_only;

>> > >  };

>> > >

>> > >  /*

>> > > @@ -128,6 +130,7 @@ static inline struct qemu_plugin_insn *qemu_plugin_insn_alloc(void)

>> > >      return insn;

>> > >  }

>> > >

>> > > +/* Internal context for this TranslationBlock */

>> > >  struct qemu_plugin_tb {

>> > >      GPtrArray *insns;

>> > >      size_t n;

>> > > @@ -135,6 +138,7 @@ struct qemu_plugin_tb {

>> > >      uint64_t vaddr2;

>> > >      void *haddr1;

>> > >      void *haddr2;

>> > > +    bool store_only;

>> > >      GArray *cbs[PLUGIN_N_CB_SUBTYPES];

>> > >  };

>> > >

>> > > diff --git a/accel/tcg/plugin-gen.c b/accel/tcg/plugin-gen.c

>> > > index 8a1bb801e0..137b91282e 100644

>> > > --- a/accel/tcg/plugin-gen.c

>> > > +++ b/accel/tcg/plugin-gen.c

>> > > @@ -842,7 +842,7 @@ static void plugin_gen_inject(const struct qemu_plugin_tb *plugin_tb)

>> > >      pr_ops();

>> > >  }

>> > >

>> > > -bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb)

>> > > +bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb, bool store_only)

>> > >  {

>> > >      struct qemu_plugin_tb *ptb = tcg_ctx->plugin_tb;

>> > >      bool ret = false;

>> > > @@ -855,6 +855,7 @@ bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb)

>> > >          ptb->vaddr2 = -1;

>> > >          get_page_addr_code_hostp(cpu->env_ptr, tb->pc, &ptb->haddr1);

>> > >          ptb->haddr2 = NULL;

>> > > +        ptb->store_only = store_only;

>> > >

>> > >          plugin_gen_empty_callback(PLUGIN_GEN_FROM_TB);

>> > >      }

>> > > diff --git a/accel/tcg/translator.c b/accel/tcg/translator.c

>> > > index 14d1ea795d..082f2c8ee1 100644

>> > > --- a/accel/tcg/translator.c

>> > > +++ b/accel/tcg/translator.c

>> > > @@ -58,7 +58,7 @@ void translator_loop(const TranslatorOps *ops, DisasContextBase *db,

>> > >      ops->tb_start(db, cpu);

>> > >      tcg_debug_assert(db->is_jmp == DISAS_NEXT);  /* no early exit */

>> > >

>> > > -    plugin_enabled = !(tb_cflags(db->tb) & CF_NOINSTR) && plugin_gen_tb_start(cpu, tb);

>> > > +    plugin_enabled = plugin_gen_tb_start(cpu, tb, tb_cflags(db->tb) & CF_NOINSTR);

>> > >

>> > >      while (true) {

>> > >          db->num_insns++;

>> > > @@ -100,6 +100,8 @@ void translator_loop(const TranslatorOps *ops, DisasContextBase *db,

>> > >              gen_io_start();

>> > >              ops->translate_insn(db, cpu);

>> > >          } else {

>> > > +            /* we should only see NOINSTR for io_recompile */

>> > > +            g_assert(!(tb_cflags(db->tb) & CF_NOINSTR));

>> > >              ops->translate_insn(db, cpu);

>> > >          }

>> > >

>> > > diff --git a/plugins/api.c b/plugins/api.c

>> > > index 5dc8e6f934..ac8475707d 100644

>> > > --- a/plugins/api.c

>> > > +++ b/plugins/api.c

>> > > @@ -84,15 +84,19 @@ void qemu_plugin_register_vcpu_tb_exec_cb(struct qemu_plugin_tb *tb,

>> > >                                            enum qemu_plugin_cb_flags flags,

>> > >                                            void *udata)

>> > >  {

>> > > -    plugin_register_dyn_cb__udata(&tb->cbs[PLUGIN_CB_REGULAR],

>> > > -                                  cb, flags, udata);

>> > > +    if (!tb->store_only) {

>> > > +        plugin_register_dyn_cb__udata(&tb->cbs[PLUGIN_CB_REGULAR],

>> > > +                                      cb, flags, udata);

>> > > +    }

>> > >  }

>> > >

>> > >  void qemu_plugin_register_vcpu_tb_exec_inline(struct qemu_plugin_tb *tb,

>> > >                                                enum qemu_plugin_op op,

>> > >                                                void *ptr, uint64_t imm)

>> > >  {

>> > > -    plugin_register_inline_op(&tb->cbs[PLUGIN_CB_INLINE], 0, op, ptr, imm);

>> > > +    if (!tb->store_only) {

>> > > +        plugin_register_inline_op(&tb->cbs[PLUGIN_CB_INLINE], 0, op, ptr, imm);

>> > > +    }

>> > >  }

>> > >

>> > >  void qemu_plugin_register_vcpu_insn_exec_cb(struct qemu_plugin_insn *insn,

>> > > @@ -100,16 +104,20 @@ void qemu_plugin_register_vcpu_insn_exec_cb(struct qemu_plugin_insn *insn,

>> > >                                              enum qemu_plugin_cb_flags flags,

>> > >                                              void *udata)

>> > >  {

>> > > -    plugin_register_dyn_cb__udata(&insn->cbs[PLUGIN_CB_INSN][PLUGIN_CB_REGULAR],

>> > > -        cb, flags, udata);

>> > > +    if (!insn->store_only) {

>> > > +        plugin_register_dyn_cb__udata(&insn->cbs[PLUGIN_CB_INSN][PLUGIN_CB_REGULAR],

>> > > +                                      cb, flags, udata);

>> > > +    }

>> > >  }

>> > >

>> > >  void qemu_plugin_register_vcpu_insn_exec_inline(struct qemu_plugin_insn *insn,

>> > >                                                  enum qemu_plugin_op op,

>> > >                                                  void *ptr, uint64_t imm)

>> > >  {

>> > > -    plugin_register_inline_op(&insn->cbs[PLUGIN_CB_INSN][PLUGIN_CB_INLINE],

>> > > -                              0, op, ptr, imm);

>> > > +    if (!insn->store_only) {

>> > > +        plugin_register_inline_op(&insn->cbs[PLUGIN_CB_INSN][PLUGIN_CB_INLINE],

>> > > +                                  0, op, ptr, imm);

>> > > +    }

>> > >  }

>> > >

>> > >

>> > > @@ -120,8 +128,13 @@ void qemu_plugin_register_vcpu_mem_cb(struct qemu_plugin_insn *insn,

>> > >                                        enum qemu_plugin_mem_rw rw,

>> > >                                        void *udata)

>> > >  {

>> > > -    plugin_register_vcpu_mem_cb(&insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_REGULAR],

>> > > -                                cb, flags, rw, udata);

>> > > +    if (insn->store_only && (rw & QEMU_PLUGIN_MEM_W)) {

>> > > +        plugin_register_vcpu_mem_cb(&insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_REGULAR],

>> > > +                                    cb, flags, QEMU_PLUGIN_MEM_W, udata);

>> > > +    } else {

>> > > +        plugin_register_vcpu_mem_cb(&insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_REGULAR],

>> > > +                                    cb, flags, rw, udata);

>> > > +    }

>> > >  }

>> > >

>> > >  void qemu_plugin_register_vcpu_mem_inline(struct qemu_plugin_insn *insn,

>> > > @@ -129,8 +142,10 @@ void qemu_plugin_register_vcpu_mem_inline(struct qemu_plugin_insn *insn,

>> > >                                            enum qemu_plugin_op op, void *ptr,

>> > >                                            uint64_t imm)

>> > >  {

>> > > -    plugin_register_inline_op(&insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_INLINE],

>> > > -        rw, op, ptr, imm);

>> > > +    if (!insn->store_only) {

>> > > +        plugin_register_inline_op(&insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_INLINE],

>> > > +                                  rw, op, ptr, imm);

>> > > +    }

>> > >  }

>> > >

>> > >  void qemu_plugin_register_vcpu_tb_trans_cb(qemu_plugin_id_t id,

>> > > @@ -181,10 +196,13 @@ uint64_t qemu_plugin_tb_vaddr(const struct qemu_plugin_tb *tb)

>> > >  struct qemu_plugin_insn *

>> > >  qemu_plugin_tb_get_insn(const struct qemu_plugin_tb *tb, size_t idx)

>> > >  {

>> > > +    struct qemu_plugin_insn *insn;

>> > >      if (unlikely(idx >= tb->n)) {

>> > >          return NULL;

>> > >      }

>> > > -    return g_ptr_array_index(tb->insns, idx);

>> > > +    insn = g_ptr_array_index(tb->insns, idx);

>> > > +    insn->store_only = tb->store_only;

>> > > +    return insn;

>> > >  }

>> > >

>> > >  /*

>> > > --8<---------------cut here---------------end--------------->8---

>> > >

>> > > --

>> > > Alex Bennée

>> 

>> 

>> 

>> -- 

>> Alex Bennée

>> KVM/QEMU Hacker for Linaro



-- 
Alex Bennée
Alex Bennée Feb. 16, 2021, 10:34 a.m. UTC | #12
Aaron Lindsay <aaron@os.amperecomputing.com> writes:

> On Feb 12 16:04, Alex Bennée wrote:

>> Do you see two stores or one store? I think I got the sense the wrong

>> way around because the store is instrumented before the mmu code,

>> hence should be skipped on a re-instrumented block.

>

> I only see one store between the instruction callback for the store and

> the instruction callback for the subsequent instruction.


I've posted:

  Subject: [PATCH  v3 00/23] plugins/next pre-PR (hwprofile, regression fixes, icount count fix)
  Date: Sat, 13 Feb 2021 13:03:02 +0000
  Message-Id: <20210213130325.14781-1-alex.bennee@linaro.org>

which I think solves it. Could you have a look?

>

> -Aaron

>

>> On Fri, 12 Feb 2021 at 15:41, Aaron Lindsay

>> <aaron@os.amperecomputing.com> wrote:

>> >

>> > On Feb 12 14:43, Alex Bennée wrote:

>> > > Aaron Lindsay <aaron@os.amperecomputing.com> writes:

>> > > > On Feb 10 22:10, Alex Bennée wrote:

>> > > >> When icount is enabled and we recompile an MMIO access we end up

>> > > >> double counting the instruction execution. To avoid this we introduce

>> > > >> the CF_NOINSTR cflag which disables instrumentation for the next TB.

>> > > >> As this is part of the hashed compile flags we will only execute the

>> > > >> generated TB while coming out of a cpu_io_recompile.

>> > > >

>> > > > Unfortunately this patch works a little too well!

>> > > >

>> > > > With this change, the memory access callbacks registered via

>> > > > `qemu_plugin_register_vcpu_mem_cb()` are never called for the

>> > > > re-translated instruction making the IO access, since we've disabled all

>> > > > instrumentation.

>> > > >

>> > > > Is it possible to selectively disable only instruction callbacks using

>> > > > this mechanism, while still allowing others that would not yet have been

>> > > > called for the re-translated instruction?

>> > >

>> > > Can you try the following fugly patch on top of this series:

>> >

>> > This patch does allow me to successfully observe memory callbacks for

>> > stores in this case. It seems from looking at the patch that you

>> > intentionally only allowed memory callbacks for stores in this case, and

>> > I still don't see any callbacks for loads.

>> >

>> > -Aaron

>> >

>> > > --8<---------------cut here---------------start------------->8---

>> > > diff --git a/include/exec/plugin-gen.h b/include/exec/plugin-gen.h

>> > > index 4834a9e2f4..b1b72b5d90 100644

>> > > --- a/include/exec/plugin-gen.h

>> > > +++ b/include/exec/plugin-gen.h

>> > > @@ -19,7 +19,7 @@ struct DisasContextBase;

>> > >

>> > >  #ifdef CONFIG_PLUGIN

>> > >

>> > > -bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb);

>> > > +bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb, bool supress);

>> > >  void plugin_gen_tb_end(CPUState *cpu);

>> > >  void plugin_gen_insn_start(CPUState *cpu, const struct DisasContextBase *db);

>> > >  void plugin_gen_insn_end(void);

>> > > @@ -41,7 +41,7 @@ static inline void plugin_insn_append(const void *from, size_t size)

>> > >  #else /* !CONFIG_PLUGIN */

>> > >

>> > >  static inline

>> > > -bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb)

>> > > +bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb, bool supress)

>> > >  {

>> > >      return false;

>> > >  }

>> > > diff --git a/include/qemu/plugin.h b/include/qemu/plugin.h

>> > > index 841deed79c..2a26a2277f 100644

>> > > --- a/include/qemu/plugin.h

>> > > +++ b/include/qemu/plugin.h

>> > > @@ -92,6 +92,7 @@ struct qemu_plugin_dyn_cb {

>> > >      };

>> > >  };

>> > >

>> > > +/* Internal context for instrumenting an instruction */

>> > >  struct qemu_plugin_insn {

>> > >      GByteArray *data;

>> > >      uint64_t vaddr;

>> > > @@ -99,6 +100,7 @@ struct qemu_plugin_insn {

>> > >      GArray *cbs[PLUGIN_N_CB_TYPES][PLUGIN_N_CB_SUBTYPES];

>> > >      bool calls_helpers;

>> > >      bool mem_helper;

>> > > +    bool store_only;

>> > >  };

>> > >

>> > >  /*

>> > > @@ -128,6 +130,7 @@ static inline struct qemu_plugin_insn *qemu_plugin_insn_alloc(void)

>> > >      return insn;

>> > >  }

>> > >

>> > > +/* Internal context for this TranslationBlock */

>> > >  struct qemu_plugin_tb {

>> > >      GPtrArray *insns;

>> > >      size_t n;

>> > > @@ -135,6 +138,7 @@ struct qemu_plugin_tb {

>> > >      uint64_t vaddr2;

>> > >      void *haddr1;

>> > >      void *haddr2;

>> > > +    bool store_only;

>> > >      GArray *cbs[PLUGIN_N_CB_SUBTYPES];

>> > >  };

>> > >

>> > > diff --git a/accel/tcg/plugin-gen.c b/accel/tcg/plugin-gen.c

>> > > index 8a1bb801e0..137b91282e 100644

>> > > --- a/accel/tcg/plugin-gen.c

>> > > +++ b/accel/tcg/plugin-gen.c

>> > > @@ -842,7 +842,7 @@ static void plugin_gen_inject(const struct qemu_plugin_tb *plugin_tb)

>> > >      pr_ops();

>> > >  }

>> > >

>> > > -bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb)

>> > > +bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb, bool store_only)

>> > >  {

>> > >      struct qemu_plugin_tb *ptb = tcg_ctx->plugin_tb;

>> > >      bool ret = false;

>> > > @@ -855,6 +855,7 @@ bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb)

>> > >          ptb->vaddr2 = -1;

>> > >          get_page_addr_code_hostp(cpu->env_ptr, tb->pc, &ptb->haddr1);

>> > >          ptb->haddr2 = NULL;

>> > > +        ptb->store_only = store_only;

>> > >

>> > >          plugin_gen_empty_callback(PLUGIN_GEN_FROM_TB);

>> > >      }

>> > > diff --git a/accel/tcg/translator.c b/accel/tcg/translator.c

>> > > index 14d1ea795d..082f2c8ee1 100644

>> > > --- a/accel/tcg/translator.c

>> > > +++ b/accel/tcg/translator.c

>> > > @@ -58,7 +58,7 @@ void translator_loop(const TranslatorOps *ops, DisasContextBase *db,

>> > >      ops->tb_start(db, cpu);

>> > >      tcg_debug_assert(db->is_jmp == DISAS_NEXT);  /* no early exit */

>> > >

>> > > -    plugin_enabled = !(tb_cflags(db->tb) & CF_NOINSTR) && plugin_gen_tb_start(cpu, tb);

>> > > +    plugin_enabled = plugin_gen_tb_start(cpu, tb, tb_cflags(db->tb) & CF_NOINSTR);

>> > >

>> > >      while (true) {

>> > >          db->num_insns++;

>> > > @@ -100,6 +100,8 @@ void translator_loop(const TranslatorOps *ops, DisasContextBase *db,

>> > >              gen_io_start();

>> > >              ops->translate_insn(db, cpu);

>> > >          } else {

>> > > +            /* we should only see NOINSTR for io_recompile */

>> > > +            g_assert(!(tb_cflags(db->tb) & CF_NOINSTR));

>> > >              ops->translate_insn(db, cpu);

>> > >          }

>> > >

>> > > diff --git a/plugins/api.c b/plugins/api.c

>> > > index 5dc8e6f934..ac8475707d 100644

>> > > --- a/plugins/api.c

>> > > +++ b/plugins/api.c

>> > > @@ -84,15 +84,19 @@ void qemu_plugin_register_vcpu_tb_exec_cb(struct qemu_plugin_tb *tb,

>> > >                                            enum qemu_plugin_cb_flags flags,

>> > >                                            void *udata)

>> > >  {

>> > > -    plugin_register_dyn_cb__udata(&tb->cbs[PLUGIN_CB_REGULAR],

>> > > -                                  cb, flags, udata);

>> > > +    if (!tb->store_only) {

>> > > +        plugin_register_dyn_cb__udata(&tb->cbs[PLUGIN_CB_REGULAR],

>> > > +                                      cb, flags, udata);

>> > > +    }

>> > >  }

>> > >

>> > >  void qemu_plugin_register_vcpu_tb_exec_inline(struct qemu_plugin_tb *tb,

>> > >                                                enum qemu_plugin_op op,

>> > >                                                void *ptr, uint64_t imm)

>> > >  {

>> > > -    plugin_register_inline_op(&tb->cbs[PLUGIN_CB_INLINE], 0, op, ptr, imm);

>> > > +    if (!tb->store_only) {

>> > > +        plugin_register_inline_op(&tb->cbs[PLUGIN_CB_INLINE], 0, op, ptr, imm);

>> > > +    }

>> > >  }

>> > >

>> > >  void qemu_plugin_register_vcpu_insn_exec_cb(struct qemu_plugin_insn *insn,

>> > > @@ -100,16 +104,20 @@ void qemu_plugin_register_vcpu_insn_exec_cb(struct qemu_plugin_insn *insn,

>> > >                                              enum qemu_plugin_cb_flags flags,

>> > >                                              void *udata)

>> > >  {

>> > > -    plugin_register_dyn_cb__udata(&insn->cbs[PLUGIN_CB_INSN][PLUGIN_CB_REGULAR],

>> > > -        cb, flags, udata);

>> > > +    if (!insn->store_only) {

>> > > +        plugin_register_dyn_cb__udata(&insn->cbs[PLUGIN_CB_INSN][PLUGIN_CB_REGULAR],

>> > > +                                      cb, flags, udata);

>> > > +    }

>> > >  }

>> > >

>> > >  void qemu_plugin_register_vcpu_insn_exec_inline(struct qemu_plugin_insn *insn,

>> > >                                                  enum qemu_plugin_op op,

>> > >                                                  void *ptr, uint64_t imm)

>> > >  {

>> > > -    plugin_register_inline_op(&insn->cbs[PLUGIN_CB_INSN][PLUGIN_CB_INLINE],

>> > > -                              0, op, ptr, imm);

>> > > +    if (!insn->store_only) {

>> > > +        plugin_register_inline_op(&insn->cbs[PLUGIN_CB_INSN][PLUGIN_CB_INLINE],

>> > > +                                  0, op, ptr, imm);

>> > > +    }

>> > >  }

>> > >

>> > >

>> > > @@ -120,8 +128,13 @@ void qemu_plugin_register_vcpu_mem_cb(struct qemu_plugin_insn *insn,

>> > >                                        enum qemu_plugin_mem_rw rw,

>> > >                                        void *udata)

>> > >  {

>> > > -    plugin_register_vcpu_mem_cb(&insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_REGULAR],

>> > > -                                cb, flags, rw, udata);

>> > > +    if (insn->store_only && (rw & QEMU_PLUGIN_MEM_W)) {

>> > > +        plugin_register_vcpu_mem_cb(&insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_REGULAR],

>> > > +                                    cb, flags, QEMU_PLUGIN_MEM_W, udata);

>> > > +    } else {

>> > > +        plugin_register_vcpu_mem_cb(&insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_REGULAR],

>> > > +                                    cb, flags, rw, udata);

>> > > +    }

>> > >  }

>> > >

>> > >  void qemu_plugin_register_vcpu_mem_inline(struct qemu_plugin_insn *insn,

>> > > @@ -129,8 +142,10 @@ void qemu_plugin_register_vcpu_mem_inline(struct qemu_plugin_insn *insn,

>> > >                                            enum qemu_plugin_op op, void *ptr,

>> > >                                            uint64_t imm)

>> > >  {

>> > > -    plugin_register_inline_op(&insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_INLINE],

>> > > -        rw, op, ptr, imm);

>> > > +    if (!insn->store_only) {

>> > > +        plugin_register_inline_op(&insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_INLINE],

>> > > +                                  rw, op, ptr, imm);

>> > > +    }

>> > >  }

>> > >

>> > >  void qemu_plugin_register_vcpu_tb_trans_cb(qemu_plugin_id_t id,

>> > > @@ -181,10 +196,13 @@ uint64_t qemu_plugin_tb_vaddr(const struct qemu_plugin_tb *tb)

>> > >  struct qemu_plugin_insn *

>> > >  qemu_plugin_tb_get_insn(const struct qemu_plugin_tb *tb, size_t idx)

>> > >  {

>> > > +    struct qemu_plugin_insn *insn;

>> > >      if (unlikely(idx >= tb->n)) {

>> > >          return NULL;

>> > >      }

>> > > -    return g_ptr_array_index(tb->insns, idx);

>> > > +    insn = g_ptr_array_index(tb->insns, idx);

>> > > +    insn->store_only = tb->store_only;

>> > > +    return insn;

>> > >  }

>> > >

>> > >  /*

>> > > --8<---------------cut here---------------end--------------->8---

>> > >

>> > > --

>> > > Alex Bennée

>> 

>> 

>> 

>> -- 

>> Alex Bennée

>> KVM/QEMU Hacker for Linaro



-- 
Alex Bennée
Zhijian Li (Fujitsu)" via Feb. 17, 2021, 4:32 p.m. UTC | #13
On Feb 16 10:34, Alex Bennée wrote:
> 

> Aaron Lindsay <aaron@os.amperecomputing.com> writes:

> 

> > On Feb 12 16:04, Alex Bennée wrote:

> >> Do you see two stores or one store? I think I got the sense the wrong

> >> way around because the store is instrumented before the mmu code,

> >> hence should be skipped on a re-instrumented block.

> >

> > I only see one store between the instruction callback for the store and

> > the instruction callback for the subsequent instruction.

> 

> I've posted:

> 

>   Subject: [PATCH  v3 00/23] plugins/next pre-PR (hwprofile, regression fixes, icount count fix)

>   Date: Sat, 13 Feb 2021 13:03:02 +0000

>   Message-Id: <20210213130325.14781-1-alex.bennee@linaro.org>

> 

> which I think solves it. Could you have a look?


Just did, and it looks good to me - Thanks!

-Aaron

> >

> > -Aaron

> >

> >> On Fri, 12 Feb 2021 at 15:41, Aaron Lindsay

> >> <aaron@os.amperecomputing.com> wrote:

> >> >

> >> > On Feb 12 14:43, Alex Bennée wrote:

> >> > > Aaron Lindsay <aaron@os.amperecomputing.com> writes:

> >> > > > On Feb 10 22:10, Alex Bennée wrote:

> >> > > >> When icount is enabled and we recompile an MMIO access we end up

> >> > > >> double counting the instruction execution. To avoid this we introduce

> >> > > >> the CF_NOINSTR cflag which disables instrumentation for the next TB.

> >> > > >> As this is part of the hashed compile flags we will only execute the

> >> > > >> generated TB while coming out of a cpu_io_recompile.

> >> > > >

> >> > > > Unfortunately this patch works a little too well!

> >> > > >

> >> > > > With this change, the memory access callbacks registered via

> >> > > > `qemu_plugin_register_vcpu_mem_cb()` are never called for the

> >> > > > re-translated instruction making the IO access, since we've disabled all

> >> > > > instrumentation.

> >> > > >

> >> > > > Is it possible to selectively disable only instruction callbacks using

> >> > > > this mechanism, while still allowing others that would not yet have been

> >> > > > called for the re-translated instruction?

> >> > >

> >> > > Can you try the following fugly patch on top of this series:

> >> >

> >> > This patch does allow me to successfully observe memory callbacks for

> >> > stores in this case. It seems from looking at the patch that you

> >> > intentionally only allowed memory callbacks for stores in this case, and

> >> > I still don't see any callbacks for loads.

> >> >

> >> > -Aaron

> >> >

> >> > > --8<---------------cut here---------------start------------->8---

> >> > > diff --git a/include/exec/plugin-gen.h b/include/exec/plugin-gen.h

> >> > > index 4834a9e2f4..b1b72b5d90 100644

> >> > > --- a/include/exec/plugin-gen.h

> >> > > +++ b/include/exec/plugin-gen.h

> >> > > @@ -19,7 +19,7 @@ struct DisasContextBase;

> >> > >

> >> > >  #ifdef CONFIG_PLUGIN

> >> > >

> >> > > -bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb);

> >> > > +bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb, bool supress);

> >> > >  void plugin_gen_tb_end(CPUState *cpu);

> >> > >  void plugin_gen_insn_start(CPUState *cpu, const struct DisasContextBase *db);

> >> > >  void plugin_gen_insn_end(void);

> >> > > @@ -41,7 +41,7 @@ static inline void plugin_insn_append(const void *from, size_t size)

> >> > >  #else /* !CONFIG_PLUGIN */

> >> > >

> >> > >  static inline

> >> > > -bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb)

> >> > > +bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb, bool supress)

> >> > >  {

> >> > >      return false;

> >> > >  }

> >> > > diff --git a/include/qemu/plugin.h b/include/qemu/plugin.h

> >> > > index 841deed79c..2a26a2277f 100644

> >> > > --- a/include/qemu/plugin.h

> >> > > +++ b/include/qemu/plugin.h

> >> > > @@ -92,6 +92,7 @@ struct qemu_plugin_dyn_cb {

> >> > >      };

> >> > >  };

> >> > >

> >> > > +/* Internal context for instrumenting an instruction */

> >> > >  struct qemu_plugin_insn {

> >> > >      GByteArray *data;

> >> > >      uint64_t vaddr;

> >> > > @@ -99,6 +100,7 @@ struct qemu_plugin_insn {

> >> > >      GArray *cbs[PLUGIN_N_CB_TYPES][PLUGIN_N_CB_SUBTYPES];

> >> > >      bool calls_helpers;

> >> > >      bool mem_helper;

> >> > > +    bool store_only;

> >> > >  };

> >> > >

> >> > >  /*

> >> > > @@ -128,6 +130,7 @@ static inline struct qemu_plugin_insn *qemu_plugin_insn_alloc(void)

> >> > >      return insn;

> >> > >  }

> >> > >

> >> > > +/* Internal context for this TranslationBlock */

> >> > >  struct qemu_plugin_tb {

> >> > >      GPtrArray *insns;

> >> > >      size_t n;

> >> > > @@ -135,6 +138,7 @@ struct qemu_plugin_tb {

> >> > >      uint64_t vaddr2;

> >> > >      void *haddr1;

> >> > >      void *haddr2;

> >> > > +    bool store_only;

> >> > >      GArray *cbs[PLUGIN_N_CB_SUBTYPES];

> >> > >  };

> >> > >

> >> > > diff --git a/accel/tcg/plugin-gen.c b/accel/tcg/plugin-gen.c

> >> > > index 8a1bb801e0..137b91282e 100644

> >> > > --- a/accel/tcg/plugin-gen.c

> >> > > +++ b/accel/tcg/plugin-gen.c

> >> > > @@ -842,7 +842,7 @@ static void plugin_gen_inject(const struct qemu_plugin_tb *plugin_tb)

> >> > >      pr_ops();

> >> > >  }

> >> > >

> >> > > -bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb)

> >> > > +bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb, bool store_only)

> >> > >  {

> >> > >      struct qemu_plugin_tb *ptb = tcg_ctx->plugin_tb;

> >> > >      bool ret = false;

> >> > > @@ -855,6 +855,7 @@ bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb)

> >> > >          ptb->vaddr2 = -1;

> >> > >          get_page_addr_code_hostp(cpu->env_ptr, tb->pc, &ptb->haddr1);

> >> > >          ptb->haddr2 = NULL;

> >> > > +        ptb->store_only = store_only;

> >> > >

> >> > >          plugin_gen_empty_callback(PLUGIN_GEN_FROM_TB);

> >> > >      }

> >> > > diff --git a/accel/tcg/translator.c b/accel/tcg/translator.c

> >> > > index 14d1ea795d..082f2c8ee1 100644

> >> > > --- a/accel/tcg/translator.c

> >> > > +++ b/accel/tcg/translator.c

> >> > > @@ -58,7 +58,7 @@ void translator_loop(const TranslatorOps *ops, DisasContextBase *db,

> >> > >      ops->tb_start(db, cpu);

> >> > >      tcg_debug_assert(db->is_jmp == DISAS_NEXT);  /* no early exit */

> >> > >

> >> > > -    plugin_enabled = !(tb_cflags(db->tb) & CF_NOINSTR) && plugin_gen_tb_start(cpu, tb);

> >> > > +    plugin_enabled = plugin_gen_tb_start(cpu, tb, tb_cflags(db->tb) & CF_NOINSTR);

> >> > >

> >> > >      while (true) {

> >> > >          db->num_insns++;

> >> > > @@ -100,6 +100,8 @@ void translator_loop(const TranslatorOps *ops, DisasContextBase *db,

> >> > >              gen_io_start();

> >> > >              ops->translate_insn(db, cpu);

> >> > >          } else {

> >> > > +            /* we should only see NOINSTR for io_recompile */

> >> > > +            g_assert(!(tb_cflags(db->tb) & CF_NOINSTR));

> >> > >              ops->translate_insn(db, cpu);

> >> > >          }

> >> > >

> >> > > diff --git a/plugins/api.c b/plugins/api.c

> >> > > index 5dc8e6f934..ac8475707d 100644

> >> > > --- a/plugins/api.c

> >> > > +++ b/plugins/api.c

> >> > > @@ -84,15 +84,19 @@ void qemu_plugin_register_vcpu_tb_exec_cb(struct qemu_plugin_tb *tb,

> >> > >                                            enum qemu_plugin_cb_flags flags,

> >> > >                                            void *udata)

> >> > >  {

> >> > > -    plugin_register_dyn_cb__udata(&tb->cbs[PLUGIN_CB_REGULAR],

> >> > > -                                  cb, flags, udata);

> >> > > +    if (!tb->store_only) {

> >> > > +        plugin_register_dyn_cb__udata(&tb->cbs[PLUGIN_CB_REGULAR],

> >> > > +                                      cb, flags, udata);

> >> > > +    }

> >> > >  }

> >> > >

> >> > >  void qemu_plugin_register_vcpu_tb_exec_inline(struct qemu_plugin_tb *tb,

> >> > >                                                enum qemu_plugin_op op,

> >> > >                                                void *ptr, uint64_t imm)

> >> > >  {

> >> > > -    plugin_register_inline_op(&tb->cbs[PLUGIN_CB_INLINE], 0, op, ptr, imm);

> >> > > +    if (!tb->store_only) {

> >> > > +        plugin_register_inline_op(&tb->cbs[PLUGIN_CB_INLINE], 0, op, ptr, imm);

> >> > > +    }

> >> > >  }

> >> > >

> >> > >  void qemu_plugin_register_vcpu_insn_exec_cb(struct qemu_plugin_insn *insn,

> >> > > @@ -100,16 +104,20 @@ void qemu_plugin_register_vcpu_insn_exec_cb(struct qemu_plugin_insn *insn,

> >> > >                                              enum qemu_plugin_cb_flags flags,

> >> > >                                              void *udata)

> >> > >  {

> >> > > -    plugin_register_dyn_cb__udata(&insn->cbs[PLUGIN_CB_INSN][PLUGIN_CB_REGULAR],

> >> > > -        cb, flags, udata);

> >> > > +    if (!insn->store_only) {

> >> > > +        plugin_register_dyn_cb__udata(&insn->cbs[PLUGIN_CB_INSN][PLUGIN_CB_REGULAR],

> >> > > +                                      cb, flags, udata);

> >> > > +    }

> >> > >  }

> >> > >

> >> > >  void qemu_plugin_register_vcpu_insn_exec_inline(struct qemu_plugin_insn *insn,

> >> > >                                                  enum qemu_plugin_op op,

> >> > >                                                  void *ptr, uint64_t imm)

> >> > >  {

> >> > > -    plugin_register_inline_op(&insn->cbs[PLUGIN_CB_INSN][PLUGIN_CB_INLINE],

> >> > > -                              0, op, ptr, imm);

> >> > > +    if (!insn->store_only) {

> >> > > +        plugin_register_inline_op(&insn->cbs[PLUGIN_CB_INSN][PLUGIN_CB_INLINE],

> >> > > +                                  0, op, ptr, imm);

> >> > > +    }

> >> > >  }

> >> > >

> >> > >

> >> > > @@ -120,8 +128,13 @@ void qemu_plugin_register_vcpu_mem_cb(struct qemu_plugin_insn *insn,

> >> > >                                        enum qemu_plugin_mem_rw rw,

> >> > >                                        void *udata)

> >> > >  {

> >> > > -    plugin_register_vcpu_mem_cb(&insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_REGULAR],

> >> > > -                                cb, flags, rw, udata);

> >> > > +    if (insn->store_only && (rw & QEMU_PLUGIN_MEM_W)) {

> >> > > +        plugin_register_vcpu_mem_cb(&insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_REGULAR],

> >> > > +                                    cb, flags, QEMU_PLUGIN_MEM_W, udata);

> >> > > +    } else {

> >> > > +        plugin_register_vcpu_mem_cb(&insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_REGULAR],

> >> > > +                                    cb, flags, rw, udata);

> >> > > +    }

> >> > >  }

> >> > >

> >> > >  void qemu_plugin_register_vcpu_mem_inline(struct qemu_plugin_insn *insn,

> >> > > @@ -129,8 +142,10 @@ void qemu_plugin_register_vcpu_mem_inline(struct qemu_plugin_insn *insn,

> >> > >                                            enum qemu_plugin_op op, void *ptr,

> >> > >                                            uint64_t imm)

> >> > >  {

> >> > > -    plugin_register_inline_op(&insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_INLINE],

> >> > > -        rw, op, ptr, imm);

> >> > > +    if (!insn->store_only) {

> >> > > +        plugin_register_inline_op(&insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_INLINE],

> >> > > +                                  rw, op, ptr, imm);

> >> > > +    }

> >> > >  }

> >> > >

> >> > >  void qemu_plugin_register_vcpu_tb_trans_cb(qemu_plugin_id_t id,

> >> > > @@ -181,10 +196,13 @@ uint64_t qemu_plugin_tb_vaddr(const struct qemu_plugin_tb *tb)

> >> > >  struct qemu_plugin_insn *

> >> > >  qemu_plugin_tb_get_insn(const struct qemu_plugin_tb *tb, size_t idx)

> >> > >  {

> >> > > +    struct qemu_plugin_insn *insn;

> >> > >      if (unlikely(idx >= tb->n)) {

> >> > >          return NULL;

> >> > >      }

> >> > > -    return g_ptr_array_index(tb->insns, idx);

> >> > > +    insn = g_ptr_array_index(tb->insns, idx);

> >> > > +    insn->store_only = tb->store_only;

> >> > > +    return insn;

> >> > >  }

> >> > >

> >> > >  /*

> >> > > --8<---------------cut here---------------end--------------->8---

> >> > >

> >> > > --

> >> > > Alex Bennée

> >> 

> >> 

> >> 

> >> -- 

> >> Alex Bennée

> >> KVM/QEMU Hacker for Linaro

> 

> 

> -- 

> Alex Bennée
diff mbox series

Patch

diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index e08179de34..299282cc59 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -454,14 +454,14 @@  struct TranslationBlock {
     uint32_t cflags;    /* compile flags */
 #define CF_COUNT_MASK  0x00007fff
 #define CF_LAST_IO     0x00008000 /* Last insn may be an IO access.  */
+#define CF_NOINSTR     0x00010000 /* Disable instrumentation of TB */
 #define CF_USE_ICOUNT  0x00020000
 #define CF_INVALID     0x00040000 /* TB is stale. Set with @jmp_lock held */
 #define CF_PARALLEL    0x00080000 /* Generate code for a parallel context */
 #define CF_CLUSTER_MASK 0xff000000 /* Top 8 bits are cluster ID */
 #define CF_CLUSTER_SHIFT 24
-/* cflags' mask for hashing/comparison */
-#define CF_HASH_MASK   \
-    (CF_COUNT_MASK | CF_LAST_IO | CF_USE_ICOUNT | CF_PARALLEL | CF_CLUSTER_MASK)
+/* cflags' mask for hashing/comparison, basically ignore CF_INVALID */
+#define CF_HASH_MASK   (~CF_INVALID)
 
     /* Per-vCPU dynamic tracing state used to generate this TB */
     uint32_t trace_vcpu_dstate;
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index 0666f9ef14..32a3d8fe24 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -2399,7 +2399,8 @@  void tb_check_watchpoint(CPUState *cpu, uintptr_t retaddr)
 }
 
 #ifndef CONFIG_USER_ONLY
-/* in deterministic execution mode, instructions doing device I/Os
+/*
+ * In deterministic execution mode, instructions doing device I/Os
  * must be at the end of the TB.
  *
  * Called by softmmu_template.h, with iothread mutex not held.
@@ -2430,19 +2431,17 @@  void cpu_io_recompile(CPUState *cpu, uintptr_t retaddr)
         n = 2;
     }
 
-    /* Generate a new TB executing the I/O insn.  */
-    cpu->cflags_next_tb = curr_cflags() | CF_LAST_IO | n;
+    /*
+     * Exit the loop and potentially generate a new TB executing
+     * just the I/O insns. We also disable instrumentation so we don't
+     * double count the instruction.
+     */
+    cpu->cflags_next_tb = curr_cflags() | CF_NOINSTR | CF_LAST_IO | n;
 
     qemu_log_mask_and_addr(CPU_LOG_EXEC, tb->pc,
                            "cpu_io_recompile: rewound execution of TB to "
                            TARGET_FMT_lx "\n", tb->pc);
 
-    /* TODO: If env->pc != tb->pc (i.e. the faulting instruction was not
-     * the first in the TB) then we end up generating a whole new TB and
-     *  repeating the fault, which is horribly inefficient.
-     *  Better would be to execute just this insn uncached, or generate a
-     *  second new TB.
-     */
     cpu_loop_exit_noexc(cpu);
 }
 
diff --git a/accel/tcg/translator.c b/accel/tcg/translator.c
index a49a794065..14d1ea795d 100644
--- a/accel/tcg/translator.c
+++ b/accel/tcg/translator.c
@@ -58,7 +58,7 @@  void translator_loop(const TranslatorOps *ops, DisasContextBase *db,
     ops->tb_start(db, cpu);
     tcg_debug_assert(db->is_jmp == DISAS_NEXT);  /* no early exit */
 
-    plugin_enabled = plugin_gen_tb_start(cpu, tb);
+    plugin_enabled = !(tb_cflags(db->tb) & CF_NOINSTR) && plugin_gen_tb_start(cpu, tb);
 
     while (true) {
         db->num_insns++;