diff mbox series

[v2,01/14] plugins: implement inline operation relative to cpu_index

Message ID 20240118032400.3762658-2-pierrick.bouvier@linaro.org
State New
Headers show
Series TCG Plugin inline operation enhancement | expand

Commit Message

Pierrick Bouvier Jan. 18, 2024, 3:23 a.m. UTC
Instead of working on a fixed memory location, allow to address it based
on cpu_index, an element size and a given offset.
Result address: ptr + offset + cpu_index * element_size.

With this, we can target a member in a struct array from a base pointer.

Current semantic is not modified, thus inline operation still targets
always the same memory location.

Signed-off-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
---
 accel/tcg/plugin-gen.c | 67 +++++++++++++++++++++++++++++++++++-------
 include/qemu/plugin.h  |  3 ++
 plugins/api.c          |  7 +++--
 plugins/core.c         | 18 +++++++++---
 plugins/plugin.h       |  6 ++--
 5 files changed, 81 insertions(+), 20 deletions(-)

Comments

Alex Bennée Jan. 26, 2024, 12:07 p.m. UTC | #1
Pierrick Bouvier <pierrick.bouvier@linaro.org> writes:

> Instead of working on a fixed memory location, allow to address it based
> on cpu_index, an element size and a given offset.
> Result address: ptr + offset + cpu_index * element_size.
>
> With this, we can target a member in a struct array from a base pointer.
>
> Current semantic is not modified, thus inline operation still targets
> always the same memory location.
>
> Signed-off-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
> ---
>  accel/tcg/plugin-gen.c | 67 +++++++++++++++++++++++++++++++++++-------
>  include/qemu/plugin.h  |  3 ++
>  plugins/api.c          |  7 +++--
>  plugins/core.c         | 18 +++++++++---
>  plugins/plugin.h       |  6 ++--
>  5 files changed, 81 insertions(+), 20 deletions(-)
>
> diff --git a/accel/tcg/plugin-gen.c b/accel/tcg/plugin-gen.c
> index b37ce7683e6..1a2375d7779 100644
> --- a/accel/tcg/plugin-gen.c
> +++ b/accel/tcg/plugin-gen.c
> @@ -132,16 +132,28 @@ static void gen_empty_udata_cb_no_rwg(void)
>   */
>  static void gen_empty_inline_cb(void)
>  {
> +    TCGv_i32 cpu_index = tcg_temp_ebb_new_i32();
> +    TCGv_ptr cpu_index_as_ptr = tcg_temp_ebb_new_ptr();
>      TCGv_i64 val = tcg_temp_ebb_new_i64();
>      TCGv_ptr ptr = tcg_temp_ebb_new_ptr();
>  
> +    tcg_gen_ld_i32(cpu_index, tcg_env,
> +                   -offsetof(ArchCPU, env) + offsetof(CPUState, cpu_index));
> +    /* pass an immediate != 0 so that it doesn't get optimized away */
> +    tcg_gen_muli_i32(cpu_index, cpu_index, 0xdeadbeef);
> +    tcg_gen_ext_i32_ptr(cpu_index_as_ptr, cpu_index);
> +
>      tcg_gen_movi_ptr(ptr, 0);
> +    tcg_gen_add_ptr(ptr, ptr, cpu_index_as_ptr);
>      tcg_gen_ld_i64(val, ptr, 0);
>      /* pass an immediate != 0 so that it doesn't get optimized away */
>      tcg_gen_addi_i64(val, val, 0xdeadface);
> +
>      tcg_gen_st_i64(val, ptr, 0);
>      tcg_temp_free_ptr(ptr);
>      tcg_temp_free_i64(val);
> +    tcg_temp_free_ptr(cpu_index_as_ptr);
> +    tcg_temp_free_i32(cpu_index);
>  }
>  
<snip>
>      if (UINTPTR_MAX == UINT32_MAX) {
> @@ -395,18 +439,19 @@ static TCGOp *append_inline_cb(const struct qemu_plugin_dyn_cb *cb,
>                                 TCGOp *begin_op, TCGOp *op,
>                                 int *unused)
>  {
> -    /* const_ptr */
> -    op = copy_const_ptr(&begin_op, op, cb->userp);
> -
> -    /* ld_i64 */
> +    char *ptr = cb->userp;
> +    if (!cb->inline_direct_ptr) {
> +        /* dereference userp once to get access to memory location */
> +        ptr = *(char **)cb->userp;
> +    }

I'm confused here, probably because inline_direct_ptr gets removed later
on?

> +    op = copy_ld_i32(&begin_op, op);
> +    op = copy_mul_i32(&begin_op, op, cb->inline_element_size);
> +    op = copy_ext_i32_ptr(&begin_op, op);
> +    op = copy_const_ptr(&begin_op, op, ptr + cb->inline_offset);
> +    op = copy_add_ptr(&begin_op, op);
>      op = copy_ld_i64(&begin_op, op);
> -
> -    /* add_i64 */
>      op = copy_add_i64(&begin_op, op, cb->inline_insn.imm);
> -
> -    /* st_i64 */
>      op = copy_st_i64(&begin_op, op);
> -
>      return op;
>  }
>  
> diff --git a/include/qemu/plugin.h b/include/qemu/plugin.h
> index b0c5ac68293..9346249145d 100644
> --- a/include/qemu/plugin.h
> +++ b/include/qemu/plugin.h
> @@ -86,6 +86,9 @@ enum plugin_dyn_cb_subtype {
>  struct qemu_plugin_dyn_cb {
>      union qemu_plugin_cb_sig f;
>      void *userp;
> +    size_t inline_offset;
> +    size_t inline_element_size;
> +    bool inline_direct_ptr;

Ahh here it is.

(I seem to recall there is a setting for git to order headers first that
helps with this).

We could do with some documentation here. I can guess the offset and
element size values but what inline_direct_ptr means is not clear.

>      enum plugin_dyn_cb_subtype type;
>      /* @rw applies to mem callbacks only (both regular and inline) */
>      enum qemu_plugin_mem_rw rw;
> diff --git a/plugins/api.c b/plugins/api.c
> index 8d5cca53295..e777eb4e9fc 100644
> --- a/plugins/api.c
> +++ b/plugins/api.c
> @@ -106,7 +106,8 @@ void qemu_plugin_register_vcpu_tb_exec_inline(struct qemu_plugin_tb *tb,
>                                                void *ptr, uint64_t imm)
>  {
>      if (!tb->mem_only) {
> -        plugin_register_inline_op(&tb->cbs[PLUGIN_CB_INLINE], 0, op, ptr, imm);
> +        plugin_register_inline_op(&tb->cbs[PLUGIN_CB_INLINE],
> +                                  0, op, ptr, 0, sizeof(uint64_t), true, imm);
>      }
>  }
>  
> @@ -131,7 +132,7 @@ void qemu_plugin_register_vcpu_insn_exec_inline(struct qemu_plugin_insn *insn,
>  {
>      if (!insn->mem_only) {
>          plugin_register_inline_op(&insn->cbs[PLUGIN_CB_INSN][PLUGIN_CB_INLINE],
> -                                  0, op, ptr, imm);
> +                                  0, op, ptr, 0, sizeof(uint64_t), true, imm);
>      }
>  }
>  
> @@ -156,7 +157,7 @@ void qemu_plugin_register_vcpu_mem_inline(struct qemu_plugin_insn *insn,
>                                            uint64_t imm)
>  {
>      plugin_register_inline_op(&insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_INLINE],
> -                              rw, op, ptr, imm);
> +                              rw, op, ptr, 0, sizeof(uint64_t), true, imm);
>  }
>  
>  void qemu_plugin_register_vcpu_tb_trans_cb(qemu_plugin_id_t id,
> diff --git a/plugins/core.c b/plugins/core.c
> index 49588285dd0..e07b9cdf229 100644
> --- a/plugins/core.c
> +++ b/plugins/core.c
> @@ -280,13 +280,18 @@ static struct qemu_plugin_dyn_cb *plugin_get_dyn_cb(GArray **arr)
>  
>  void plugin_register_inline_op(GArray **arr,
>                                 enum qemu_plugin_mem_rw rw,
> -                               enum qemu_plugin_op op, void *ptr,
> +                               enum qemu_plugin_op op,
> +                               void *ptr, size_t offset, size_t element_size,
> +                               bool direct_ptr,
>                                 uint64_t imm)
>  {
>      struct qemu_plugin_dyn_cb *dyn_cb;
>  
>      dyn_cb = plugin_get_dyn_cb(arr);
>      dyn_cb->userp = ptr;
> +    dyn_cb->inline_element_size = element_size;
> +    dyn_cb->inline_offset = offset;
> +    dyn_cb->inline_direct_ptr = direct_ptr;
>      dyn_cb->type = PLUGIN_CB_INLINE;
>      dyn_cb->rw = rw;
>      dyn_cb->inline_insn.op = op;
> @@ -431,9 +436,14 @@ void qemu_plugin_flush_cb(void)
>      plugin_cb__simple(QEMU_PLUGIN_EV_FLUSH);
>  }
>  
> -void exec_inline_op(struct qemu_plugin_dyn_cb *cb)
> +void exec_inline_op(struct qemu_plugin_dyn_cb *cb, int cpu_index)
>  {
> -    uint64_t *val = cb->userp;
> +    char *ptr = cb->userp;
> +    if (!cb->inline_direct_ptr) {
> +        ptr = *(char **) cb->userp;
> +    }
> +    ptr += cb->inline_offset;
> +    uint64_t *val = (uint64_t *)(ptr + cpu_index * cb->inline_element_size);
>  
>      switch (cb->inline_insn.op) {
>      case QEMU_PLUGIN_INLINE_ADD_U64:
> @@ -466,7 +476,7 @@ void qemu_plugin_vcpu_mem_cb(CPUState *cpu, uint64_t vaddr,
>                             vaddr, cb->userp);
>              break;
>          case PLUGIN_CB_INLINE:
> -            exec_inline_op(cb);
> +            exec_inline_op(cb, cpu->cpu_index);
>              break;
>          default:
>              g_assert_not_reached();
> diff --git a/plugins/plugin.h b/plugins/plugin.h
> index 5eb2fdbc85e..2c278379b70 100644
> --- a/plugins/plugin.h
> +++ b/plugins/plugin.h
> @@ -66,7 +66,9 @@ struct qemu_plugin_ctx *plugin_id_to_ctx_locked(qemu_plugin_id_t id);
>  
>  void plugin_register_inline_op(GArray **arr,
>                                 enum qemu_plugin_mem_rw rw,
> -                               enum qemu_plugin_op op, void *ptr,
> +                               enum qemu_plugin_op op,
> +                               void *ptr, size_t offset, size_t element_size,
> +                               bool direct_ptr,
>                                 uint64_t imm);
>  
>  void plugin_reset_uninstall(qemu_plugin_id_t id,
> @@ -95,6 +97,6 @@ void plugin_register_vcpu_mem_cb(GArray **arr,
>                                   enum qemu_plugin_mem_rw rw,
>                                   void *udata);
>  
> -void exec_inline_op(struct qemu_plugin_dyn_cb *cb);
> +void exec_inline_op(struct qemu_plugin_dyn_cb *cb, int cpu_index);
>  
>  #endif /* PLUGIN_H */
Pierrick Bouvier Jan. 29, 2024, 10:10 a.m. UTC | #2
On 1/26/24 16:07, Alex Bennée wrote:
> Pierrick Bouvier <pierrick.bouvier@linaro.org> writes:
> 
>> Instead of working on a fixed memory location, allow to address it based
>> on cpu_index, an element size and a given offset.
>> Result address: ptr + offset + cpu_index * element_size.
>>
>> With this, we can target a member in a struct array from a base pointer.
>>
>> Current semantic is not modified, thus inline operation still targets
>> always the same memory location.
>>
>> Signed-off-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
>> ---
>>   accel/tcg/plugin-gen.c | 67 +++++++++++++++++++++++++++++++++++-------
>>   include/qemu/plugin.h  |  3 ++
>>   plugins/api.c          |  7 +++--
>>   plugins/core.c         | 18 +++++++++---
>>   plugins/plugin.h       |  6 ++--
>>   5 files changed, 81 insertions(+), 20 deletions(-)
>>
>> diff --git a/accel/tcg/plugin-gen.c b/accel/tcg/plugin-gen.c
>> index b37ce7683e6..1a2375d7779 100644
>> --- a/accel/tcg/plugin-gen.c
>> +++ b/accel/tcg/plugin-gen.c
>> @@ -132,16 +132,28 @@ static void gen_empty_udata_cb_no_rwg(void)
>>    */
>>   static void gen_empty_inline_cb(void)
>>   {
>> +    TCGv_i32 cpu_index = tcg_temp_ebb_new_i32();
>> +    TCGv_ptr cpu_index_as_ptr = tcg_temp_ebb_new_ptr();
>>       TCGv_i64 val = tcg_temp_ebb_new_i64();
>>       TCGv_ptr ptr = tcg_temp_ebb_new_ptr();
>>   
>> +    tcg_gen_ld_i32(cpu_index, tcg_env,
>> +                   -offsetof(ArchCPU, env) + offsetof(CPUState, cpu_index));
>> +    /* pass an immediate != 0 so that it doesn't get optimized away */
>> +    tcg_gen_muli_i32(cpu_index, cpu_index, 0xdeadbeef);
>> +    tcg_gen_ext_i32_ptr(cpu_index_as_ptr, cpu_index);
>> +
>>       tcg_gen_movi_ptr(ptr, 0);
>> +    tcg_gen_add_ptr(ptr, ptr, cpu_index_as_ptr);
>>       tcg_gen_ld_i64(val, ptr, 0);
>>       /* pass an immediate != 0 so that it doesn't get optimized away */
>>       tcg_gen_addi_i64(val, val, 0xdeadface);
>> +
>>       tcg_gen_st_i64(val, ptr, 0);
>>       tcg_temp_free_ptr(ptr);
>>       tcg_temp_free_i64(val);
>> +    tcg_temp_free_ptr(cpu_index_as_ptr);
>> +    tcg_temp_free_i32(cpu_index);
>>   }
>>   
> <snip>
>>       if (UINTPTR_MAX == UINT32_MAX) {
>> @@ -395,18 +439,19 @@ static TCGOp *append_inline_cb(const struct qemu_plugin_dyn_cb *cb,
>>                                  TCGOp *begin_op, TCGOp *op,
>>                                  int *unused)
>>   {
>> -    /* const_ptr */
>> -    op = copy_const_ptr(&begin_op, op, cb->userp);
>> -
>> -    /* ld_i64 */
>> +    char *ptr = cb->userp;
>> +    if (!cb->inline_direct_ptr) {
>> +        /* dereference userp once to get access to memory location */
>> +        ptr = *(char **)cb->userp;
>> +    }
> 
> I'm confused here, probably because inline_direct_ptr gets removed later
> on?
> 

Yes, we temporarily need two code paths for this patch series. Else, 
plugins should be updated at the same time than we make all changes to 
not break anything.

>> +    op = copy_ld_i32(&begin_op, op);
>> +    op = copy_mul_i32(&begin_op, op, cb->inline_element_size);
>> +    op = copy_ext_i32_ptr(&begin_op, op);
>> +    op = copy_const_ptr(&begin_op, op, ptr + cb->inline_offset);
>> +    op = copy_add_ptr(&begin_op, op);
>>       op = copy_ld_i64(&begin_op, op);
>> -
>> -    /* add_i64 */
>>       op = copy_add_i64(&begin_op, op, cb->inline_insn.imm);
>> -
>> -    /* st_i64 */
>>       op = copy_st_i64(&begin_op, op);
>> -
>>       return op;
>>   }
>>   
>> diff --git a/include/qemu/plugin.h b/include/qemu/plugin.h
>> index b0c5ac68293..9346249145d 100644
>> --- a/include/qemu/plugin.h
>> +++ b/include/qemu/plugin.h
>> @@ -86,6 +86,9 @@ enum plugin_dyn_cb_subtype {
>>   struct qemu_plugin_dyn_cb {
>>       union qemu_plugin_cb_sig f;
>>       void *userp;
>> +    size_t inline_offset;
>> +    size_t inline_element_size;
>> +    bool inline_direct_ptr;
> 
> Ahh here it is.
> 
> (I seem to recall there is a setting for git to order headers first that
> helps with this).
> 

Indeed, found it (thanks):
https://gitlab.com/qemu-project/qemu/-/blob/master/scripts/git.orderfile

> We could do with some documentation here. I can guess the offset and
> element size values but what inline_direct_ptr means is not clear.
> 

It represents if the userp is a pointer to data, or a pointer to pointer 
to data. Any suggestion for name having this detail?

I have another patch that replace all this by a qemu_plugin_u64_t (from 
scoreboard), that I'll integrate in a v3, which is much clearer. But 
there will still be one commit needed with this.

>>       enum plugin_dyn_cb_subtype type;
>>       /* @rw applies to mem callbacks only (both regular and inline) */
>>       enum qemu_plugin_mem_rw rw;
>> diff --git a/plugins/api.c b/plugins/api.c
>> index 8d5cca53295..e777eb4e9fc 100644
>> --- a/plugins/api.c
>> +++ b/plugins/api.c
>> @@ -106,7 +106,8 @@ void qemu_plugin_register_vcpu_tb_exec_inline(struct qemu_plugin_tb *tb,
>>                                                 void *ptr, uint64_t imm)
>>   {
>>       if (!tb->mem_only) {
>> -        plugin_register_inline_op(&tb->cbs[PLUGIN_CB_INLINE], 0, op, ptr, imm);
>> +        plugin_register_inline_op(&tb->cbs[PLUGIN_CB_INLINE],
>> +                                  0, op, ptr, 0, sizeof(uint64_t), true, imm);
>>       }
>>   }
>>   
>> @@ -131,7 +132,7 @@ void qemu_plugin_register_vcpu_insn_exec_inline(struct qemu_plugin_insn *insn,
>>   {
>>       if (!insn->mem_only) {
>>           plugin_register_inline_op(&insn->cbs[PLUGIN_CB_INSN][PLUGIN_CB_INLINE],
>> -                                  0, op, ptr, imm);
>> +                                  0, op, ptr, 0, sizeof(uint64_t), true, imm);
>>       }
>>   }
>>   
>> @@ -156,7 +157,7 @@ void qemu_plugin_register_vcpu_mem_inline(struct qemu_plugin_insn *insn,
>>                                             uint64_t imm)
>>   {
>>       plugin_register_inline_op(&insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_INLINE],
>> -                              rw, op, ptr, imm);
>> +                              rw, op, ptr, 0, sizeof(uint64_t), true, imm);
>>   }
>>   
>>   void qemu_plugin_register_vcpu_tb_trans_cb(qemu_plugin_id_t id,
>> diff --git a/plugins/core.c b/plugins/core.c
>> index 49588285dd0..e07b9cdf229 100644
>> --- a/plugins/core.c
>> +++ b/plugins/core.c
>> @@ -280,13 +280,18 @@ static struct qemu_plugin_dyn_cb *plugin_get_dyn_cb(GArray **arr)
>>   
>>   void plugin_register_inline_op(GArray **arr,
>>                                  enum qemu_plugin_mem_rw rw,
>> -                               enum qemu_plugin_op op, void *ptr,
>> +                               enum qemu_plugin_op op,
>> +                               void *ptr, size_t offset, size_t element_size,
>> +                               bool direct_ptr,
>>                                  uint64_t imm)
>>   {
>>       struct qemu_plugin_dyn_cb *dyn_cb;
>>   
>>       dyn_cb = plugin_get_dyn_cb(arr);
>>       dyn_cb->userp = ptr;
>> +    dyn_cb->inline_element_size = element_size;
>> +    dyn_cb->inline_offset = offset;
>> +    dyn_cb->inline_direct_ptr = direct_ptr;
>>       dyn_cb->type = PLUGIN_CB_INLINE;
>>       dyn_cb->rw = rw;
>>       dyn_cb->inline_insn.op = op;
>> @@ -431,9 +436,14 @@ void qemu_plugin_flush_cb(void)
>>       plugin_cb__simple(QEMU_PLUGIN_EV_FLUSH);
>>   }
>>   
>> -void exec_inline_op(struct qemu_plugin_dyn_cb *cb)
>> +void exec_inline_op(struct qemu_plugin_dyn_cb *cb, int cpu_index)
>>   {
>> -    uint64_t *val = cb->userp;
>> +    char *ptr = cb->userp;
>> +    if (!cb->inline_direct_ptr) {
>> +        ptr = *(char **) cb->userp;
>> +    }
>> +    ptr += cb->inline_offset;
>> +    uint64_t *val = (uint64_t *)(ptr + cpu_index * cb->inline_element_size);
>>   
>>       switch (cb->inline_insn.op) {
>>       case QEMU_PLUGIN_INLINE_ADD_U64:
>> @@ -466,7 +476,7 @@ void qemu_plugin_vcpu_mem_cb(CPUState *cpu, uint64_t vaddr,
>>                              vaddr, cb->userp);
>>               break;
>>           case PLUGIN_CB_INLINE:
>> -            exec_inline_op(cb);
>> +            exec_inline_op(cb, cpu->cpu_index);
>>               break;
>>           default:
>>               g_assert_not_reached();
>> diff --git a/plugins/plugin.h b/plugins/plugin.h
>> index 5eb2fdbc85e..2c278379b70 100644
>> --- a/plugins/plugin.h
>> +++ b/plugins/plugin.h
>> @@ -66,7 +66,9 @@ struct qemu_plugin_ctx *plugin_id_to_ctx_locked(qemu_plugin_id_t id);
>>   
>>   void plugin_register_inline_op(GArray **arr,
>>                                  enum qemu_plugin_mem_rw rw,
>> -                               enum qemu_plugin_op op, void *ptr,
>> +                               enum qemu_plugin_op op,
>> +                               void *ptr, size_t offset, size_t element_size,
>> +                               bool direct_ptr,
>>                                  uint64_t imm);
>>   
>>   void plugin_reset_uninstall(qemu_plugin_id_t id,
>> @@ -95,6 +97,6 @@ void plugin_register_vcpu_mem_cb(GArray **arr,
>>                                    enum qemu_plugin_mem_rw rw,
>>                                    void *udata);
>>   
>> -void exec_inline_op(struct qemu_plugin_dyn_cb *cb);
>> +void exec_inline_op(struct qemu_plugin_dyn_cb *cb, int cpu_index);
>>   
>>   #endif /* PLUGIN_H */
>
diff mbox series

Patch

diff --git a/accel/tcg/plugin-gen.c b/accel/tcg/plugin-gen.c
index b37ce7683e6..1a2375d7779 100644
--- a/accel/tcg/plugin-gen.c
+++ b/accel/tcg/plugin-gen.c
@@ -132,16 +132,28 @@  static void gen_empty_udata_cb_no_rwg(void)
  */
 static void gen_empty_inline_cb(void)
 {
+    TCGv_i32 cpu_index = tcg_temp_ebb_new_i32();
+    TCGv_ptr cpu_index_as_ptr = tcg_temp_ebb_new_ptr();
     TCGv_i64 val = tcg_temp_ebb_new_i64();
     TCGv_ptr ptr = tcg_temp_ebb_new_ptr();
 
+    tcg_gen_ld_i32(cpu_index, tcg_env,
+                   -offsetof(ArchCPU, env) + offsetof(CPUState, cpu_index));
+    /* pass an immediate != 0 so that it doesn't get optimized away */
+    tcg_gen_muli_i32(cpu_index, cpu_index, 0xdeadbeef);
+    tcg_gen_ext_i32_ptr(cpu_index_as_ptr, cpu_index);
+
     tcg_gen_movi_ptr(ptr, 0);
+    tcg_gen_add_ptr(ptr, ptr, cpu_index_as_ptr);
     tcg_gen_ld_i64(val, ptr, 0);
     /* pass an immediate != 0 so that it doesn't get optimized away */
     tcg_gen_addi_i64(val, val, 0xdeadface);
+
     tcg_gen_st_i64(val, ptr, 0);
     tcg_temp_free_ptr(ptr);
     tcg_temp_free_i64(val);
+    tcg_temp_free_ptr(cpu_index_as_ptr);
+    tcg_temp_free_i32(cpu_index);
 }
 
 static void gen_empty_mem_cb(TCGv_i64 addr, uint32_t info)
@@ -289,12 +301,37 @@  static TCGOp *copy_const_ptr(TCGOp **begin_op, TCGOp *op, void *ptr)
     return op;
 }
 
+static TCGOp *copy_ld_i32(TCGOp **begin_op, TCGOp *op)
+{
+    return copy_op(begin_op, op, INDEX_op_ld_i32);
+}
+
+static TCGOp *copy_ext_i32_ptr(TCGOp **begin_op, TCGOp *op)
+{
+    if (UINTPTR_MAX == UINT32_MAX) {
+        op = copy_op(begin_op, op, INDEX_op_mov_i32);
+    } else {
+        op = copy_op(begin_op, op, INDEX_op_ext_i32_i64);
+    }
+    return op;
+}
+
+static TCGOp *copy_add_ptr(TCGOp **begin_op, TCGOp *op)
+{
+    if (UINTPTR_MAX == UINT32_MAX) {
+        op = copy_op(begin_op, op, INDEX_op_add_i32);
+    } else {
+        op = copy_op(begin_op, op, INDEX_op_add_i64);
+    }
+    return op;
+}
+
 static TCGOp *copy_ld_i64(TCGOp **begin_op, TCGOp *op)
 {
     if (TCG_TARGET_REG_BITS == 32) {
         /* 2x ld_i32 */
-        op = copy_op(begin_op, op, INDEX_op_ld_i32);
-        op = copy_op(begin_op, op, INDEX_op_ld_i32);
+        op = copy_ld_i32(begin_op, op);
+        op = copy_ld_i32(begin_op, op);
     } else {
         /* ld_i64 */
         op = copy_op(begin_op, op, INDEX_op_ld_i64);
@@ -330,6 +367,13 @@  static TCGOp *copy_add_i64(TCGOp **begin_op, TCGOp *op, uint64_t v)
     return op;
 }
 
+static TCGOp *copy_mul_i32(TCGOp **begin_op, TCGOp *op, uint32_t v)
+{
+    op = copy_op(begin_op, op, INDEX_op_mul_i32);
+    op->args[2] = tcgv_i32_arg(tcg_constant_i32(v));
+    return op;
+}
+
 static TCGOp *copy_st_ptr(TCGOp **begin_op, TCGOp *op)
 {
     if (UINTPTR_MAX == UINT32_MAX) {
@@ -395,18 +439,19 @@  static TCGOp *append_inline_cb(const struct qemu_plugin_dyn_cb *cb,
                                TCGOp *begin_op, TCGOp *op,
                                int *unused)
 {
-    /* const_ptr */
-    op = copy_const_ptr(&begin_op, op, cb->userp);
-
-    /* ld_i64 */
+    char *ptr = cb->userp;
+    if (!cb->inline_direct_ptr) {
+        /* dereference userp once to get access to memory location */
+        ptr = *(char **)cb->userp;
+    }
+    op = copy_ld_i32(&begin_op, op);
+    op = copy_mul_i32(&begin_op, op, cb->inline_element_size);
+    op = copy_ext_i32_ptr(&begin_op, op);
+    op = copy_const_ptr(&begin_op, op, ptr + cb->inline_offset);
+    op = copy_add_ptr(&begin_op, op);
     op = copy_ld_i64(&begin_op, op);
-
-    /* add_i64 */
     op = copy_add_i64(&begin_op, op, cb->inline_insn.imm);
-
-    /* st_i64 */
     op = copy_st_i64(&begin_op, op);
-
     return op;
 }
 
diff --git a/include/qemu/plugin.h b/include/qemu/plugin.h
index b0c5ac68293..9346249145d 100644
--- a/include/qemu/plugin.h
+++ b/include/qemu/plugin.h
@@ -86,6 +86,9 @@  enum plugin_dyn_cb_subtype {
 struct qemu_plugin_dyn_cb {
     union qemu_plugin_cb_sig f;
     void *userp;
+    size_t inline_offset;
+    size_t inline_element_size;
+    bool inline_direct_ptr;
     enum plugin_dyn_cb_subtype type;
     /* @rw applies to mem callbacks only (both regular and inline) */
     enum qemu_plugin_mem_rw rw;
diff --git a/plugins/api.c b/plugins/api.c
index 8d5cca53295..e777eb4e9fc 100644
--- a/plugins/api.c
+++ b/plugins/api.c
@@ -106,7 +106,8 @@  void qemu_plugin_register_vcpu_tb_exec_inline(struct qemu_plugin_tb *tb,
                                               void *ptr, uint64_t imm)
 {
     if (!tb->mem_only) {
-        plugin_register_inline_op(&tb->cbs[PLUGIN_CB_INLINE], 0, op, ptr, imm);
+        plugin_register_inline_op(&tb->cbs[PLUGIN_CB_INLINE],
+                                  0, op, ptr, 0, sizeof(uint64_t), true, imm);
     }
 }
 
@@ -131,7 +132,7 @@  void qemu_plugin_register_vcpu_insn_exec_inline(struct qemu_plugin_insn *insn,
 {
     if (!insn->mem_only) {
         plugin_register_inline_op(&insn->cbs[PLUGIN_CB_INSN][PLUGIN_CB_INLINE],
-                                  0, op, ptr, imm);
+                                  0, op, ptr, 0, sizeof(uint64_t), true, imm);
     }
 }
 
@@ -156,7 +157,7 @@  void qemu_plugin_register_vcpu_mem_inline(struct qemu_plugin_insn *insn,
                                           uint64_t imm)
 {
     plugin_register_inline_op(&insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_INLINE],
-                              rw, op, ptr, imm);
+                              rw, op, ptr, 0, sizeof(uint64_t), true, imm);
 }
 
 void qemu_plugin_register_vcpu_tb_trans_cb(qemu_plugin_id_t id,
diff --git a/plugins/core.c b/plugins/core.c
index 49588285dd0..e07b9cdf229 100644
--- a/plugins/core.c
+++ b/plugins/core.c
@@ -280,13 +280,18 @@  static struct qemu_plugin_dyn_cb *plugin_get_dyn_cb(GArray **arr)
 
 void plugin_register_inline_op(GArray **arr,
                                enum qemu_plugin_mem_rw rw,
-                               enum qemu_plugin_op op, void *ptr,
+                               enum qemu_plugin_op op,
+                               void *ptr, size_t offset, size_t element_size,
+                               bool direct_ptr,
                                uint64_t imm)
 {
     struct qemu_plugin_dyn_cb *dyn_cb;
 
     dyn_cb = plugin_get_dyn_cb(arr);
     dyn_cb->userp = ptr;
+    dyn_cb->inline_element_size = element_size;
+    dyn_cb->inline_offset = offset;
+    dyn_cb->inline_direct_ptr = direct_ptr;
     dyn_cb->type = PLUGIN_CB_INLINE;
     dyn_cb->rw = rw;
     dyn_cb->inline_insn.op = op;
@@ -431,9 +436,14 @@  void qemu_plugin_flush_cb(void)
     plugin_cb__simple(QEMU_PLUGIN_EV_FLUSH);
 }
 
-void exec_inline_op(struct qemu_plugin_dyn_cb *cb)
+void exec_inline_op(struct qemu_plugin_dyn_cb *cb, int cpu_index)
 {
-    uint64_t *val = cb->userp;
+    char *ptr = cb->userp;
+    if (!cb->inline_direct_ptr) {
+        ptr = *(char **) cb->userp;
+    }
+    ptr += cb->inline_offset;
+    uint64_t *val = (uint64_t *)(ptr + cpu_index * cb->inline_element_size);
 
     switch (cb->inline_insn.op) {
     case QEMU_PLUGIN_INLINE_ADD_U64:
@@ -466,7 +476,7 @@  void qemu_plugin_vcpu_mem_cb(CPUState *cpu, uint64_t vaddr,
                            vaddr, cb->userp);
             break;
         case PLUGIN_CB_INLINE:
-            exec_inline_op(cb);
+            exec_inline_op(cb, cpu->cpu_index);
             break;
         default:
             g_assert_not_reached();
diff --git a/plugins/plugin.h b/plugins/plugin.h
index 5eb2fdbc85e..2c278379b70 100644
--- a/plugins/plugin.h
+++ b/plugins/plugin.h
@@ -66,7 +66,9 @@  struct qemu_plugin_ctx *plugin_id_to_ctx_locked(qemu_plugin_id_t id);
 
 void plugin_register_inline_op(GArray **arr,
                                enum qemu_plugin_mem_rw rw,
-                               enum qemu_plugin_op op, void *ptr,
+                               enum qemu_plugin_op op,
+                               void *ptr, size_t offset, size_t element_size,
+                               bool direct_ptr,
                                uint64_t imm);
 
 void plugin_reset_uninstall(qemu_plugin_id_t id,
@@ -95,6 +97,6 @@  void plugin_register_vcpu_mem_cb(GArray **arr,
                                  enum qemu_plugin_mem_rw rw,
                                  void *udata);
 
-void exec_inline_op(struct qemu_plugin_dyn_cb *cb);
+void exec_inline_op(struct qemu_plugin_dyn_cb *cb, int cpu_index);
 
 #endif /* PLUGIN_H */