diff mbox series

[v5,05/36] tcg: Add TCG_CALL_{RET,ARG}_BY_REF

Message ID 20230126043824.54819-6-richard.henderson@linaro.org
State Superseded
Headers show
Series tcg: Support for Int128 with helpers | expand

Commit Message

Richard Henderson Jan. 26, 2023, 4:37 a.m. UTC
These will be used by some hosts, both 32 and 64-bit, to pass and
return i128.  Not yet used, because allocation is not yet enabled.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tcg-internal.h |   3 +
 tcg/tcg.c          | 135 ++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 135 insertions(+), 3 deletions(-)

Comments

Alex Bennée Jan. 27, 2023, 10:40 a.m. UTC | #1
Richard Henderson <richard.henderson@linaro.org> writes:

> These will be used by some hosts, both 32 and 64-bit, to pass and
> return i128.  Not yet used, because allocation is not yet enabled.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  tcg/tcg-internal.h |   3 +
>  tcg/tcg.c          | 135 ++++++++++++++++++++++++++++++++++++++++++++-
>  2 files changed, 135 insertions(+), 3 deletions(-)
>
> diff --git a/tcg/tcg-internal.h b/tcg/tcg-internal.h
> index 6e50aeba3a..2ec1ea01df 100644
> --- a/tcg/tcg-internal.h
> +++ b/tcg/tcg-internal.h
> @@ -36,6 +36,7 @@
>   */
>  typedef enum {
>      TCG_CALL_RET_NORMAL,         /* by registers */
> +    TCG_CALL_RET_BY_REF,         /* for i128, by reference */
>  } TCGCallReturnKind;
>  
>  typedef enum {
> @@ -44,6 +45,8 @@ typedef enum {
>      TCG_CALL_ARG_EXTEND,         /* for i32, as a sign/zero-extended i64 */
>      TCG_CALL_ARG_EXTEND_U,       /*      ... as a zero-extended i64 */
>      TCG_CALL_ARG_EXTEND_S,       /*      ... as a sign-extended i64 */
> +    TCG_CALL_ARG_BY_REF,         /* for i128, by reference, first */
> +    TCG_CALL_ARG_BY_REF_N,       /*       ... by reference, subsequent */
>  } TCGCallArgumentKind;
>  
>  typedef struct TCGCallArgumentLoc {
> diff --git a/tcg/tcg.c b/tcg/tcg.c
> index a561ef3ced..644dc53196 100644
> --- a/tcg/tcg.c
> +++ b/tcg/tcg.c
> @@ -104,8 +104,7 @@ static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg1,
>  static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg);
>  static void tcg_out_movi(TCGContext *s, TCGType type,
>                           TCGReg ret, tcg_target_long arg);
> -static void tcg_out_addi_ptr(TCGContext *s, TCGReg, TCGReg, tcg_target_long)
> -    __attribute__((unused));
> +static void tcg_out_addi_ptr(TCGContext *s, TCGReg, TCGReg, tcg_target_long);
>  static void tcg_out_exit_tb(TCGContext *s, uintptr_t arg);
>  static void tcg_out_goto_tb(TCGContext *s, int which);
>  static void tcg_out_op(TCGContext *s, TCGOpcode opc,
> @@ -683,6 +682,38 @@ static void layout_arg_normal_n(TCGCumulativeArgs *cum,
>      cum->arg_slot += n;
>  }
>  
> +static void layout_arg_by_ref(TCGCumulativeArgs *cum, TCGHelperInfo *info)
> +{
> +    TCGCallArgumentLoc *loc = &info->in[cum->info_in_idx];
> +    int n = 128 / TCG_TARGET_REG_BITS;
> +
> +    /* The first subindex carries the pointer. */
> +    layout_arg_1(cum, info, TCG_CALL_ARG_BY_REF);
> +
> +    /*
> +     * The callee is allowed to clobber memory associated with
> +     * structure pass by-reference.  Therefore we must make copies.
> +     * Allocate space from "ref_slot", which will be adjusted to
> +     * follow the parameters on the stack.
> +     */
> +    loc[0].ref_slot = cum->ref_slot;
> +
> +    /*
> +     * Subsequent words also go into the reference slot, but
> +     * do not accumulate into the regular arguments.
> +     */
> +    for (int i = 1; i < n; ++i) {
> +        loc[i] = (TCGCallArgumentLoc){
> +            .kind = TCG_CALL_ARG_BY_REF_N,
> +            .arg_idx = cum->arg_idx,
> +            .tmp_subindex = i,
> +            .ref_slot = cum->ref_slot + i,
> +        };
> +    }
> +    cum->info_in_idx += n;
> +    cum->ref_slot += n;
> +}

I'm surprised this is in the core TCG. Are the stack frames organised
the same way across all our host architectures?

> +
>  static void init_call_layout(TCGHelperInfo *info)
>  {
>      int max_reg_slots = ARRAY_SIZE(tcg_target_call_iarg_regs);
> @@ -718,6 +749,14 @@ static void init_call_layout(TCGHelperInfo *info)
>          case TCG_CALL_RET_NORMAL:
>              assert(info->nr_out <= ARRAY_SIZE(tcg_target_call_oarg_regs));
>              break;
> +        case TCG_CALL_RET_BY_REF:
> +            /*
> +             * Allocate the first argument to the output.
> +             * We don't need to store this anywhere, just make it
> +             * unavailable for use in the input loop below.
> +             */
> +            cum.arg_slot = 1;
> +            break;

It would have helped if ->typemask was documented or accessed with
something like dh_get_typemask(0) for my understanding here.

>          default:
>              qemu_build_not_reached();
>          }
> @@ -796,6 +835,9 @@ static void init_call_layout(TCGHelperInfo *info)
>              case TCG_CALL_ARG_NORMAL:
>                  layout_arg_normal_n(&cum, info, 128 / TCG_TARGET_REG_BITS);
>                  break;
> +            case TCG_CALL_ARG_BY_REF:
> +                layout_arg_by_ref(&cum, info);
> +                break;
>              default:
>                  qemu_build_not_reached();
>              }
> @@ -811,7 +853,39 @@ static void init_call_layout(TCGHelperInfo *info)
>      assert(cum.info_in_idx <= ARRAY_SIZE(info->in));
>      /* Validate the backend has enough argument space. */
>      assert(cum.arg_slot <= max_reg_slots + max_stk_slots);
> -    assert(cum.ref_slot <= max_stk_slots);
> +
> +    /*
> +     * Relocate the "ref_slot" area to the end of the parameters.
> +     * Minimizing this stack offset helps code size for x86,
> +     * which has a signed 8-bit offset encoding.
> +     */

I don't quite follow this. Are we free to move parameters around in the
stack frame? I thought the position would be directly related to the
argument number?

> +    if (cum.ref_slot != 0) {
> +        int ref_base = 0;
> +
> +        if (cum.arg_slot > max_reg_slots) {
> +            int align = __alignof(Int128) / sizeof(tcg_target_long);
> +
> +            ref_base = cum.arg_slot - max_reg_slots;
> +            if (align > 1) {
> +                ref_base = ROUND_UP(ref_base, align);
> +            }
> +        }
> +        assert(ref_base + cum.ref_slot <= max_stk_slots);
> +
> +        if (ref_base != 0) {
> +            for (int i = cum.info_in_idx - 1; i >= 0; --i) {
> +                TCGCallArgumentLoc *loc = &info->in[i];
> +                switch (loc->kind) {
> +                case TCG_CALL_ARG_BY_REF:
> +                case TCG_CALL_ARG_BY_REF_N:
> +                    loc->ref_slot += ref_base;
> +                    break;
> +                default:
> +                    break;
> +                }
> +            }
> +        }
> +    }
>  }
>  
>  static int indirect_reg_alloc_order[ARRAY_SIZE(tcg_target_reg_alloc_order)];
> @@ -1738,6 +1812,8 @@ void tcg_gen_callN(void *func, TCGTemp *ret, int nargs, TCGTemp **args)
>  
>          switch (loc->kind) {
>          case TCG_CALL_ARG_NORMAL:
> +        case TCG_CALL_ARG_BY_REF:
> +        case TCG_CALL_ARG_BY_REF_N:
>              op->args[pi++] = temp_arg(ts);
>              break;
>  
> @@ -4404,6 +4480,27 @@ static void load_arg_normal(TCGContext *s, const TCGCallArgumentLoc *l,
>      }
>  }
>  
> +static void load_arg_ref(TCGContext *s, int arg_slot, TCGReg ref_base,
> +                         intptr_t ref_off, TCGRegSet *allocated_regs)
> +{
> +    TCGReg reg;
> +    int stk_slot = arg_slot - ARRAY_SIZE(tcg_target_call_iarg_regs);
> +
> +    if (stk_slot < 0) {
> +        reg = tcg_target_call_iarg_regs[arg_slot];
> +        tcg_reg_free(s, reg, *allocated_regs);
> +        tcg_out_addi_ptr(s, reg, ref_base, ref_off);
> +        tcg_regset_set_reg(*allocated_regs, reg);
> +    } else {
> +        reg = tcg_reg_alloc(s, tcg_target_available_regs[TCG_TYPE_PTR],
> +                            *allocated_regs, 0, false);
> +        tcg_out_addi_ptr(s, reg, ref_base, ref_off);
> +        tcg_out_st(s, TCG_TYPE_PTR, reg, TCG_REG_CALL_STACK,
> +                   TCG_TARGET_CALL_STACK_OFFSET
> +                   + stk_slot * sizeof(tcg_target_long));
> +    }
> +}
> +
>  static void tcg_reg_alloc_call(TCGContext *s, TCGOp *op)
>  {
>      const int nb_oargs = TCGOP_CALLO(op);
> @@ -4427,6 +4524,16 @@ static void tcg_reg_alloc_call(TCGContext *s, TCGOp *op)
>          case TCG_CALL_ARG_EXTEND_S:
>              load_arg_normal(s, loc, ts, &allocated_regs);
>              break;
> +        case TCG_CALL_ARG_BY_REF:
> +            load_arg_stk(s, loc->ref_slot, ts, allocated_regs);
> +            load_arg_ref(s, loc->arg_slot, TCG_REG_CALL_STACK,
> +                         TCG_TARGET_CALL_STACK_OFFSET
> +                         + loc->ref_slot * sizeof(tcg_target_long),
> +                         &allocated_regs);
> +            break;
> +        case TCG_CALL_ARG_BY_REF_N:
> +            load_arg_stk(s, loc->ref_slot, ts, allocated_regs);
> +            break;
>          default:
>              g_assert_not_reached();
>          }
> @@ -4458,6 +4565,19 @@ static void tcg_reg_alloc_call(TCGContext *s, TCGOp *op)
>          save_globals(s, allocated_regs);
>      }
>  
> +    /*
> +     * If the ABI passes a pointer to the returned struct as the first
> +     * argument, load that now.  Pass a pointer to the output home slot.
> +     */
> +    if (info->out_kind == TCG_CALL_RET_BY_REF) {
> +        TCGTemp *ts = arg_temp(op->args[0]);
> +
> +        if (!ts->mem_allocated) {
> +            temp_allocate_frame(s, ts);
> +        }
> +        load_arg_ref(s, 0, ts->mem_base->reg, ts->mem_offset, &allocated_regs);
> +    }
> +
>      tcg_out_call(s, tcg_call_func(op), info);
>  
>      /* Assign output registers and emit moves if needed.  */
> @@ -4474,6 +4594,15 @@ static void tcg_reg_alloc_call(TCGContext *s, TCGOp *op)
>              ts->mem_coherent = 0;
>          }
>          break;
> +
> +    case TCG_CALL_RET_BY_REF:
> +        /* The callee has performed a write through the reference. */
> +        for (i = 0; i < nb_oargs; i++) {
> +            TCGTemp *ts = arg_temp(op->args[i]);
> +            ts->val_type = TEMP_VAL_MEM;
> +        }
> +        break;
> +
>      default:
>          g_assert_not_reached();
>      }
Richard Henderson Jan. 27, 2023, 6:48 p.m. UTC | #2
On 1/27/23 00:40, Alex Bennée wrote:
> 
> Richard Henderson <richard.henderson@linaro.org> writes:
> 
>> These will be used by some hosts, both 32 and 64-bit, to pass and
>> return i128.  Not yet used, because allocation is not yet enabled.
>>
>> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
>> ---
>>   tcg/tcg-internal.h |   3 +
>>   tcg/tcg.c          | 135 ++++++++++++++++++++++++++++++++++++++++++++-
>>   2 files changed, 135 insertions(+), 3 deletions(-)
>>
>> diff --git a/tcg/tcg-internal.h b/tcg/tcg-internal.h
>> index 6e50aeba3a..2ec1ea01df 100644
>> --- a/tcg/tcg-internal.h
>> +++ b/tcg/tcg-internal.h
>> @@ -36,6 +36,7 @@
>>    */
>>   typedef enum {
>>       TCG_CALL_RET_NORMAL,         /* by registers */
>> +    TCG_CALL_RET_BY_REF,         /* for i128, by reference */
>>   } TCGCallReturnKind;
>>   
>>   typedef enum {
>> @@ -44,6 +45,8 @@ typedef enum {
>>       TCG_CALL_ARG_EXTEND,         /* for i32, as a sign/zero-extended i64 */
>>       TCG_CALL_ARG_EXTEND_U,       /*      ... as a zero-extended i64 */
>>       TCG_CALL_ARG_EXTEND_S,       /*      ... as a sign-extended i64 */
>> +    TCG_CALL_ARG_BY_REF,         /* for i128, by reference, first */
>> +    TCG_CALL_ARG_BY_REF_N,       /*       ... by reference, subsequent */
>>   } TCGCallArgumentKind;
>>   
>>   typedef struct TCGCallArgumentLoc {
>> diff --git a/tcg/tcg.c b/tcg/tcg.c
>> index a561ef3ced..644dc53196 100644
>> --- a/tcg/tcg.c
>> +++ b/tcg/tcg.c
>> @@ -104,8 +104,7 @@ static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg1,
>>   static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg);
>>   static void tcg_out_movi(TCGContext *s, TCGType type,
>>                            TCGReg ret, tcg_target_long arg);
>> -static void tcg_out_addi_ptr(TCGContext *s, TCGReg, TCGReg, tcg_target_long)
>> -    __attribute__((unused));
>> +static void tcg_out_addi_ptr(TCGContext *s, TCGReg, TCGReg, tcg_target_long);
>>   static void tcg_out_exit_tb(TCGContext *s, uintptr_t arg);
>>   static void tcg_out_goto_tb(TCGContext *s, int which);
>>   static void tcg_out_op(TCGContext *s, TCGOpcode opc,
>> @@ -683,6 +682,38 @@ static void layout_arg_normal_n(TCGCumulativeArgs *cum,
>>       cum->arg_slot += n;
>>   }
>>   
>> +static void layout_arg_by_ref(TCGCumulativeArgs *cum, TCGHelperInfo *info)
>> +{
>> +    TCGCallArgumentLoc *loc = &info->in[cum->info_in_idx];
>> +    int n = 128 / TCG_TARGET_REG_BITS;
>> +
>> +    /* The first subindex carries the pointer. */
>> +    layout_arg_1(cum, info, TCG_CALL_ARG_BY_REF);
>> +
>> +    /*
>> +     * The callee is allowed to clobber memory associated with
>> +     * structure pass by-reference.  Therefore we must make copies.
>> +     * Allocate space from "ref_slot", which will be adjusted to
>> +     * follow the parameters on the stack.
>> +     */
>> +    loc[0].ref_slot = cum->ref_slot;
>> +
>> +    /*
>> +     * Subsequent words also go into the reference slot, but
>> +     * do not accumulate into the regular arguments.
>> +     */
>> +    for (int i = 1; i < n; ++i) {
>> +        loc[i] = (TCGCallArgumentLoc){
>> +            .kind = TCG_CALL_ARG_BY_REF_N,
>> +            .arg_idx = cum->arg_idx,
>> +            .tmp_subindex = i,
>> +            .ref_slot = cum->ref_slot + i,
>> +        };
>> +    }
>> +    cum->info_in_idx += n;
>> +    cum->ref_slot += n;
>> +}
> 
> I'm surprised this is in the core TCG. Are the stack frames organised
> the same way across all our host architectures?

Where else would it be?  It's the core TCG that has to take care of moving the data to the 
correct place, either register or stack.  The backend can't do that because the backend 
doesn't know how other registers are allocated.

Our host architectures use the same general form -- place arguments in N integer registers 
before placing them on the stack, incrementing from some offset O.  The differences 
between the hosts are expressed precisely by these TCG_CALL_ARG_* enumerations.

>> @@ -718,6 +749,14 @@ static void init_call_layout(TCGHelperInfo *info)
>>           case TCG_CALL_RET_NORMAL:
>>               assert(info->nr_out <= ARRAY_SIZE(tcg_target_call_oarg_regs));
>>               break;
>> +        case TCG_CALL_RET_BY_REF:
>> +            /*
>> +             * Allocate the first argument to the output.
>> +             * We don't need to store this anywhere, just make it
>> +             * unavailable for use in the input loop below.
>> +             */
>> +            cum.arg_slot = 1;
>> +            break;
> 
> It would have helped if ->typemask was documented or accessed with
> something like dh_get_typemask(0) for my understanding here.

Where?  Above this fragment?  Yes, that seems like a good cleanup.

>> +
>> +    /*
>> +     * Relocate the "ref_slot" area to the end of the parameters.
>> +     * Minimizing this stack offset helps code size for x86,
>> +     * which has a signed 8-bit offset encoding.
>> +     */
> 
> I don't quite follow this. Are we free to move parameters around in the
> stack frame? I thought the position would be directly related to the
> argument number?

This is a reference.  The actual argument is a pointer:

> +    /* The first subindex carries the pointer. */
> +    layout_arg_1(cum, info, TCG_CALL_ARG_BY_REF);

and later

>> +static void load_arg_ref(TCGContext *s, int arg_slot, TCGReg ref_base,
>> +                         intptr_t ref_off, TCGRegSet *allocated_regs)
>> +{
>> +    TCGReg reg;
>> +    int stk_slot = arg_slot - ARRAY_SIZE(tcg_target_call_iarg_regs);
>> +
>> +    if (stk_slot < 0) {
>> +        reg = tcg_target_call_iarg_regs[arg_slot];
>> +        tcg_reg_free(s, reg, *allocated_regs);
>> +        tcg_out_addi_ptr(s, reg, ref_base, ref_off);
>> +        tcg_regset_set_reg(*allocated_regs, reg);
>> +    } else {
>> +        reg = tcg_reg_alloc(s, tcg_target_available_regs[TCG_TYPE_PTR],
>> +                            *allocated_regs, 0, false);
>> +        tcg_out_addi_ptr(s, reg, ref_base, ref_off);
>> +        tcg_out_st(s, TCG_TYPE_PTR, reg, TCG_REG_CALL_STACK,
>> +                   TCG_TARGET_CALL_STACK_OFFSET
>> +                   + stk_slot * sizeof(tcg_target_long));
>> +    }
>> +}

where the pointer is computed, as stack + ref_off.
We can move ref_off anywhere we like because the pointer carries the information.

The need for a copy of the data into *some* spot on the stack was detailed near the beginning:

> +    /*
> +     * The callee is allowed to clobber memory associated with
> +     * structure pass by-reference.  Therefore we must make copies.
> +     * Allocate space from "ref_slot", which will be adjusted to
> +     * follow the parameters on the stack.
> +     */


r~
diff mbox series

Patch

diff --git a/tcg/tcg-internal.h b/tcg/tcg-internal.h
index 6e50aeba3a..2ec1ea01df 100644
--- a/tcg/tcg-internal.h
+++ b/tcg/tcg-internal.h
@@ -36,6 +36,7 @@ 
  */
 typedef enum {
     TCG_CALL_RET_NORMAL,         /* by registers */
+    TCG_CALL_RET_BY_REF,         /* for i128, by reference */
 } TCGCallReturnKind;
 
 typedef enum {
@@ -44,6 +45,8 @@  typedef enum {
     TCG_CALL_ARG_EXTEND,         /* for i32, as a sign/zero-extended i64 */
     TCG_CALL_ARG_EXTEND_U,       /*      ... as a zero-extended i64 */
     TCG_CALL_ARG_EXTEND_S,       /*      ... as a sign-extended i64 */
+    TCG_CALL_ARG_BY_REF,         /* for i128, by reference, first */
+    TCG_CALL_ARG_BY_REF_N,       /*       ... by reference, subsequent */
 } TCGCallArgumentKind;
 
 typedef struct TCGCallArgumentLoc {
diff --git a/tcg/tcg.c b/tcg/tcg.c
index a561ef3ced..644dc53196 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -104,8 +104,7 @@  static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg1,
 static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg);
 static void tcg_out_movi(TCGContext *s, TCGType type,
                          TCGReg ret, tcg_target_long arg);
-static void tcg_out_addi_ptr(TCGContext *s, TCGReg, TCGReg, tcg_target_long)
-    __attribute__((unused));
+static void tcg_out_addi_ptr(TCGContext *s, TCGReg, TCGReg, tcg_target_long);
 static void tcg_out_exit_tb(TCGContext *s, uintptr_t arg);
 static void tcg_out_goto_tb(TCGContext *s, int which);
 static void tcg_out_op(TCGContext *s, TCGOpcode opc,
@@ -683,6 +682,38 @@  static void layout_arg_normal_n(TCGCumulativeArgs *cum,
     cum->arg_slot += n;
 }
 
+static void layout_arg_by_ref(TCGCumulativeArgs *cum, TCGHelperInfo *info)
+{
+    TCGCallArgumentLoc *loc = &info->in[cum->info_in_idx];
+    int n = 128 / TCG_TARGET_REG_BITS;
+
+    /* The first subindex carries the pointer. */
+    layout_arg_1(cum, info, TCG_CALL_ARG_BY_REF);
+
+    /*
+     * The callee is allowed to clobber memory associated with
+     * structure pass by-reference.  Therefore we must make copies.
+     * Allocate space from "ref_slot", which will be adjusted to
+     * follow the parameters on the stack.
+     */
+    loc[0].ref_slot = cum->ref_slot;
+
+    /*
+     * Subsequent words also go into the reference slot, but
+     * do not accumulate into the regular arguments.
+     */
+    for (int i = 1; i < n; ++i) {
+        loc[i] = (TCGCallArgumentLoc){
+            .kind = TCG_CALL_ARG_BY_REF_N,
+            .arg_idx = cum->arg_idx,
+            .tmp_subindex = i,
+            .ref_slot = cum->ref_slot + i,
+        };
+    }
+    cum->info_in_idx += n;
+    cum->ref_slot += n;
+}
+
 static void init_call_layout(TCGHelperInfo *info)
 {
     int max_reg_slots = ARRAY_SIZE(tcg_target_call_iarg_regs);
@@ -718,6 +749,14 @@  static void init_call_layout(TCGHelperInfo *info)
         case TCG_CALL_RET_NORMAL:
             assert(info->nr_out <= ARRAY_SIZE(tcg_target_call_oarg_regs));
             break;
+        case TCG_CALL_RET_BY_REF:
+            /*
+             * Allocate the first argument to the output.
+             * We don't need to store this anywhere, just make it
+             * unavailable for use in the input loop below.
+             */
+            cum.arg_slot = 1;
+            break;
         default:
             qemu_build_not_reached();
         }
@@ -796,6 +835,9 @@  static void init_call_layout(TCGHelperInfo *info)
             case TCG_CALL_ARG_NORMAL:
                 layout_arg_normal_n(&cum, info, 128 / TCG_TARGET_REG_BITS);
                 break;
+            case TCG_CALL_ARG_BY_REF:
+                layout_arg_by_ref(&cum, info);
+                break;
             default:
                 qemu_build_not_reached();
             }
@@ -811,7 +853,39 @@  static void init_call_layout(TCGHelperInfo *info)
     assert(cum.info_in_idx <= ARRAY_SIZE(info->in));
     /* Validate the backend has enough argument space. */
     assert(cum.arg_slot <= max_reg_slots + max_stk_slots);
-    assert(cum.ref_slot <= max_stk_slots);
+
+    /*
+     * Relocate the "ref_slot" area to the end of the parameters.
+     * Minimizing this stack offset helps code size for x86,
+     * which has a signed 8-bit offset encoding.
+     */
+    if (cum.ref_slot != 0) {
+        int ref_base = 0;
+
+        if (cum.arg_slot > max_reg_slots) {
+            int align = __alignof(Int128) / sizeof(tcg_target_long);
+
+            ref_base = cum.arg_slot - max_reg_slots;
+            if (align > 1) {
+                ref_base = ROUND_UP(ref_base, align);
+            }
+        }
+        assert(ref_base + cum.ref_slot <= max_stk_slots);
+
+        if (ref_base != 0) {
+            for (int i = cum.info_in_idx - 1; i >= 0; --i) {
+                TCGCallArgumentLoc *loc = &info->in[i];
+                switch (loc->kind) {
+                case TCG_CALL_ARG_BY_REF:
+                case TCG_CALL_ARG_BY_REF_N:
+                    loc->ref_slot += ref_base;
+                    break;
+                default:
+                    break;
+                }
+            }
+        }
+    }
 }
 
 static int indirect_reg_alloc_order[ARRAY_SIZE(tcg_target_reg_alloc_order)];
@@ -1738,6 +1812,8 @@  void tcg_gen_callN(void *func, TCGTemp *ret, int nargs, TCGTemp **args)
 
         switch (loc->kind) {
         case TCG_CALL_ARG_NORMAL:
+        case TCG_CALL_ARG_BY_REF:
+        case TCG_CALL_ARG_BY_REF_N:
             op->args[pi++] = temp_arg(ts);
             break;
 
@@ -4404,6 +4480,27 @@  static void load_arg_normal(TCGContext *s, const TCGCallArgumentLoc *l,
     }
 }
 
+static void load_arg_ref(TCGContext *s, int arg_slot, TCGReg ref_base,
+                         intptr_t ref_off, TCGRegSet *allocated_regs)
+{
+    TCGReg reg;
+    int stk_slot = arg_slot - ARRAY_SIZE(tcg_target_call_iarg_regs);
+
+    if (stk_slot < 0) {
+        reg = tcg_target_call_iarg_regs[arg_slot];
+        tcg_reg_free(s, reg, *allocated_regs);
+        tcg_out_addi_ptr(s, reg, ref_base, ref_off);
+        tcg_regset_set_reg(*allocated_regs, reg);
+    } else {
+        reg = tcg_reg_alloc(s, tcg_target_available_regs[TCG_TYPE_PTR],
+                            *allocated_regs, 0, false);
+        tcg_out_addi_ptr(s, reg, ref_base, ref_off);
+        tcg_out_st(s, TCG_TYPE_PTR, reg, TCG_REG_CALL_STACK,
+                   TCG_TARGET_CALL_STACK_OFFSET
+                   + stk_slot * sizeof(tcg_target_long));
+    }
+}
+
 static void tcg_reg_alloc_call(TCGContext *s, TCGOp *op)
 {
     const int nb_oargs = TCGOP_CALLO(op);
@@ -4427,6 +4524,16 @@  static void tcg_reg_alloc_call(TCGContext *s, TCGOp *op)
         case TCG_CALL_ARG_EXTEND_S:
             load_arg_normal(s, loc, ts, &allocated_regs);
             break;
+        case TCG_CALL_ARG_BY_REF:
+            load_arg_stk(s, loc->ref_slot, ts, allocated_regs);
+            load_arg_ref(s, loc->arg_slot, TCG_REG_CALL_STACK,
+                         TCG_TARGET_CALL_STACK_OFFSET
+                         + loc->ref_slot * sizeof(tcg_target_long),
+                         &allocated_regs);
+            break;
+        case TCG_CALL_ARG_BY_REF_N:
+            load_arg_stk(s, loc->ref_slot, ts, allocated_regs);
+            break;
         default:
             g_assert_not_reached();
         }
@@ -4458,6 +4565,19 @@  static void tcg_reg_alloc_call(TCGContext *s, TCGOp *op)
         save_globals(s, allocated_regs);
     }
 
+    /*
+     * If the ABI passes a pointer to the returned struct as the first
+     * argument, load that now.  Pass a pointer to the output home slot.
+     */
+    if (info->out_kind == TCG_CALL_RET_BY_REF) {
+        TCGTemp *ts = arg_temp(op->args[0]);
+
+        if (!ts->mem_allocated) {
+            temp_allocate_frame(s, ts);
+        }
+        load_arg_ref(s, 0, ts->mem_base->reg, ts->mem_offset, &allocated_regs);
+    }
+
     tcg_out_call(s, tcg_call_func(op), info);
 
     /* Assign output registers and emit moves if needed.  */
@@ -4474,6 +4594,15 @@  static void tcg_reg_alloc_call(TCGContext *s, TCGOp *op)
             ts->mem_coherent = 0;
         }
         break;
+
+    case TCG_CALL_RET_BY_REF:
+        /* The callee has performed a write through the reference. */
+        for (i = 0; i < nb_oargs; i++) {
+            TCGTemp *ts = arg_temp(op->args[i]);
+            ts->val_type = TEMP_VAL_MEM;
+        }
+        break;
+
     default:
         g_assert_not_reached();
     }