
[7/7] tcg/ppc: Use prefixed instructions for tcg_out_goto_tb

Message ID 20230804213355.294443-8-richard.henderson@linaro.org
State New
Series tcg/ppc: Support power10 prefixed instructions

Commit Message

Richard Henderson Aug. 4, 2023, 9:33 p.m. UTC
When a direct branch is out of range, we can load the destination for
the indirect branch using PLA (for 16GB worth of buffer) and PLD from
the TranslationBlock for everything larger.

This means the patch affects exactly one instruction: B (plus filler),
PLA or PLD.  Which means we can update and execute the patch atomically.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/ppc/tcg-target.c.inc | 76 ++++++++++++++++++++++++++++++----------
 1 file changed, 58 insertions(+), 18 deletions(-)
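
The "16GB worth of buffer" figure comes from the 34-bit signed displacement
a Power10 prefixed instruction can encode (a 2^34-byte span). The sketch
below shows how such a displacement is split between the prefix and suffix
words, mirroring the insn1/insn2 values built in the patch; the standalone
helpers here are hypothetical, and in the patch itself the same bits come
from the OPCD()/TAI() macros in tcg-target.c.inc.

    #include <assert.h>
    #include <stdint.h>

    /* Same semantics as QEMU's sextract64(). */
    static int64_t sextract64(uint64_t value, int start, int length)
    {
        return ((int64_t)(value << (64 - length - start))) >> (64 - length);
    }

    /*
     * PLA rt,disp: prefixed MLS:D form of paddi with R=1 (PC-relative).
     * PLD uses the same 18/16-bit split, but with prefix form 8LS:D
     * (the 2u << 24 bits absent) and a load opcode in the suffix.
     */
    static void encode_pla(unsigned rt, int64_t disp,
                           uint32_t *prefix, uint32_t *suffix)
    {
        assert(disp == sextract64(disp, 0, 34));      /* PLA/PLD reach */

        *prefix = (1u << 26)                  /* primary opcode 1: prefix word */
                | (2u << 24)                  /* prefix form MLS:D (paddi)     */
                | (1u << 20)                  /* R=1: PC-relative              */
                | (uint32_t)((disp >> 16) & 0x3ffff); /* disp bits 33:16       */

        *suffix = (14u << 26)                 /* addi primary opcode           */
                | (rt << 21)                  /* RT; RA stays 0 (required with R=1) */
                | (uint32_t)(disp & 0xffff);  /* disp bits 15:0                */
    }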

Comments

Nicholas Piggin Aug. 6, 2023, 12:55 p.m. UTC | #1
On Sat Aug 5, 2023 at 7:33 AM AEST, Richard Henderson wrote:
> When a direct branch is out of range, we can load the destination for
> the indirect branch using PLA (for 16GB worth of buffer) and PLD from
> the TranslationBlock for everything larger.
>
> This means the patch affects exactly one instruction: B (plus filler),
> PLA or PLD.  Which means we can update and execute the patch atomically.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  tcg/ppc/tcg-target.c.inc | 76 ++++++++++++++++++++++++++++++----------
>  1 file changed, 58 insertions(+), 18 deletions(-)
>
> diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
> index 5b243b2353..47c71bb5f2 100644
> --- a/tcg/ppc/tcg-target.c.inc
> +++ b/tcg/ppc/tcg-target.c.inc
> @@ -2642,31 +2642,41 @@ static void tcg_out_goto_tb(TCGContext *s, int which)
>      uintptr_t ptr = get_jmp_target_addr(s, which);
>  
>      if (USE_REG_TB) {
> +        /*
> +         * With REG_TB, we must always use indirect branching,
> +         * so that the branch destination and TCG_REG_TB match.
> +         */
>          ptrdiff_t offset = tcg_tbrel_diff(s, (void *)ptr);
>          tcg_out_mem_long(s, LD, LDX, TCG_REG_TB, TCG_REG_TB, offset);
> -    
> -        /* TODO: Use direct branches when possible. */
> -        set_jmp_insn_offset(s, which);
>          tcg_out32(s, MTSPR | RS(TCG_REG_TB) | CTR);
> -
>          tcg_out32(s, BCCTR | BO_ALWAYS);
>  
>          /* For the unlinked case, need to reset TCG_REG_TB.  */
>          set_jmp_reset_offset(s, which);
>          tcg_out_mem_long(s, ADDI, ADD, TCG_REG_TB, TCG_REG_TB,
>                           -tcg_current_code_size(s));
> +        return;
> +    }
> +
> +    if (have_isa_3_10) {
> +        /* Align, so that we can patch 8 bytes atomically. */
> +        if ((uintptr_t)s->code_ptr & 7) {
> +            tcg_out32(s, NOP);
> +        }
> +        set_jmp_insn_offset(s, which);
> +        /* Direct branch will be patched by tb_target_set_jmp_target. */
> +        tcg_out_mls_d(s, ADDI, TCG_REG_TMP1, 0, 0, 1);
>      } else {
>          /* Direct branch will be patched by tb_target_set_jmp_target. */
> -        set_jmp_insn_offset(s, which);
> -        tcg_out32(s, NOP);
> -
> +        tcg_out32(s, B);
>          /* When branch is out of range, fall through to indirect. */
>          tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP1, ptr - (int16_t)ptr);
>          tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_REG_TMP1, (int16_t)ptr);
> -        tcg_out32(s, MTSPR | RS(TCG_REG_TMP1) | CTR);
> -        tcg_out32(s, BCCTR | BO_ALWAYS);
> -        set_jmp_reset_offset(s, which);
>      }
> +
> +    tcg_out32(s, MTSPR | RS(TCG_REG_TMP1) | CTR);
> +    tcg_out32(s, BCCTR | BO_ALWAYS);
> +    set_jmp_reset_offset(s, which);
>  }
>  
>  void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
> @@ -2674,20 +2684,50 @@ void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
>  {
>      uintptr_t addr = tb->jmp_target_addr[n];
>      intptr_t diff = addr - jmp_rx;
> -    tcg_insn_unit insn;
>  
>      if (USE_REG_TB) {
>          return;
>      }
>  
> -    if (in_range_b(diff)) {
> -        insn = B | (diff & 0x3fffffc);
> -    } else {
> -        insn = NOP;
> -    }
> +    if (have_isa_3_10) {
> +        tcg_insn_unit insn1, insn2;
> +        uint64_t pair;
>  
> -    qatomic_set((uint32_t *)jmp_rw, insn);
> -    flush_idcache_range(jmp_rx, jmp_rw, 4);
> +        if (in_range_b(diff)) {
> +            insn1 = B | (diff & 0x3fffffc);
> +            insn2 = NOP;
> +        } else if (diff == sextract64(diff, 0, 34)) {
> +            /* PLA tmp1, diff */
> +            insn1 = OPCD(1) | (2 << 24) | (1 << 20) | ((diff >> 16) & 0x3ffff);
> +            insn2 = ADDI | TAI(TCG_REG_TMP1, 0, diff);
> +        } else {
> +            addr = (uintptr_t)&tb->jmp_target_addr[n];
> +            diff = addr - jmp_rx;
> +            tcg_debug_assert(diff == sextract64(diff, 0, 34));
> +            /* PLD tmp1, diff */
> +            insn1 = OPCD(1) | (1 << 20) | ((diff >> 16) & 0x3ffff);
> +            insn2 = PLD | TAI(TCG_REG_TMP1, 0, diff);
> +        }

B is a "patch class" word instruction as per CMODX in the ISA, which may
be patched to/from other instructions without a flush+isync sequence
between. So that part is okay, at least if you were just patching the B
word. But patching between the PLA and PLD I don't think is kosher per
ISA.

I struggle a bit with this part of the ISA, particularly with prefix
instructions (it only talks about patching 4 bytes at a time).

If we patch something it has to go through a patch instruction, which
is a direct branch, trap, or nop. I think that makes this non-trivial.

It could work if you only patched between B and PLD. B->PLD would have
to patch the suffix word first, possibly with an interleaving sync, and
then the prefix. PLD->B could just patch the B word.

How much would losing the PLA hurt?

Thanks,
Nick

> +
> +        if (HOST_BIG_ENDIAN) {
> +            pair = ((uint64_t)insn1) << 32 | insn2;
> +        } else {
> +            pair = ((uint64_t)insn2) << 32 | insn1;
> +        }
> +
> +        qatomic_set((uint64_t *)jmp_rw, pair);
> +        flush_idcache_range(jmp_rx, jmp_rw, 8);
> +    } else {
> +        tcg_insn_unit insn;
> +
> +        if (in_range_b(diff)) {
> +            insn = B | (diff & 0x3fffffc);
> +        } else {
> +            insn = NOP;
> +        }
> +        qatomic_set((uint32_t *)jmp_rw, insn);
> +        flush_idcache_range(jmp_rx, jmp_rw, 4);
> +    }
>  }
>  
>  static void tcg_out_op(TCGContext *s, TCGOpcode opc,
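
For background on the in_range_b() test used throughout this discussion:
an I-form branch (B) carries a 26-bit signed, word-aligned byte
displacement, so direct branches reach only +/-32MB. A minimal standalone
sketch, assuming QEMU's definitions (the real helpers live in
tcg/ppc/tcg-target.c.inc):

    #include <stdbool.h>
    #include <stdint.h>

    /* Does diff fit in a 26-bit signed field?  Equivalent to
     * diff == sextract64(diff, 0, 26) in QEMU terms. */
    static bool in_range_b(int64_t diff)
    {
        return diff == (((int64_t)((uint64_t)diff << 38)) >> 38);
    }

    /* B = OPCD(18); the mask keeps the 24-bit LI field, word aligned.
     * B, NOP and traps are all "patch class" per the CMODX rules Nick
     * cites, so they may be patched among themselves freely. */
    static uint32_t encode_b(int64_t diff)
    {
        return (18u << 26) | (uint32_t)((uint64_t)diff & 0x3fffffc);
    }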
Richard Henderson Aug. 6, 2023, 2:13 p.m. UTC | #2
On 8/6/23 05:55, Nicholas Piggin wrote:
> On Sat Aug 5, 2023 at 7:33 AM AEST, Richard Henderson wrote:
>> When a direct branch is out of range, we can load the destination for
>> the indirect branch using PLA (for 16GB worth of buffer) and PLD from
>> the TranslationBlock for everything larger.
>>
>> This means the patch affects exactly one instruction: B (plus filler),
>> PLA or PLD.  Which means we can update and execute the patch atomically.
>>
>> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
>> ---
>>   tcg/ppc/tcg-target.c.inc | 76 ++++++++++++++++++++++++++++++----------
>>   1 file changed, 58 insertions(+), 18 deletions(-)
>>
>> diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
>> index 5b243b2353..47c71bb5f2 100644
>> --- a/tcg/ppc/tcg-target.c.inc
>> +++ b/tcg/ppc/tcg-target.c.inc
>> @@ -2642,31 +2642,41 @@ static void tcg_out_goto_tb(TCGContext *s, int which)
>>       uintptr_t ptr = get_jmp_target_addr(s, which);
>>   
>>       if (USE_REG_TB) {
>> +        /*
>> +         * With REG_TB, we must always use indirect branching,
>> +         * so that the branch destination and TCG_REG_TB match.
>> +         */
>>           ptrdiff_t offset = tcg_tbrel_diff(s, (void *)ptr);
>>           tcg_out_mem_long(s, LD, LDX, TCG_REG_TB, TCG_REG_TB, offset);
>> -
>> -        /* TODO: Use direct branches when possible. */
>> -        set_jmp_insn_offset(s, which);
>>           tcg_out32(s, MTSPR | RS(TCG_REG_TB) | CTR);
>> -
>>           tcg_out32(s, BCCTR | BO_ALWAYS);
>>   
>>           /* For the unlinked case, need to reset TCG_REG_TB.  */
>>           set_jmp_reset_offset(s, which);
>>           tcg_out_mem_long(s, ADDI, ADD, TCG_REG_TB, TCG_REG_TB,
>>                            -tcg_current_code_size(s));
>> +        return;
>> +    }
>> +
>> +    if (have_isa_3_10) {
>> +        /* Align, so that we can patch 8 bytes atomically. */
>> +        if ((uintptr_t)s->code_ptr & 7) {
>> +            tcg_out32(s, NOP);
>> +        }
>> +        set_jmp_insn_offset(s, which);
>> +        /* Direct branch will be patched by tb_target_set_jmp_target. */
>> +        tcg_out_mls_d(s, ADDI, TCG_REG_TMP1, 0, 0, 1);
>>       } else {
>>           /* Direct branch will be patched by tb_target_set_jmp_target. */
>> -        set_jmp_insn_offset(s, which);
>> -        tcg_out32(s, NOP);
>> -
>> +        tcg_out32(s, B);
>>           /* When branch is out of range, fall through to indirect. */
>>           tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP1, ptr - (int16_t)ptr);
>>           tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_REG_TMP1, (int16_t)ptr);
>> -        tcg_out32(s, MTSPR | RS(TCG_REG_TMP1) | CTR);
>> -        tcg_out32(s, BCCTR | BO_ALWAYS);
>> -        set_jmp_reset_offset(s, which);
>>       }
>> +
>> +    tcg_out32(s, MTSPR | RS(TCG_REG_TMP1) | CTR);
>> +    tcg_out32(s, BCCTR | BO_ALWAYS);
>> +    set_jmp_reset_offset(s, which);
>>   }
>>   
>>   void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
>> @@ -2674,20 +2684,50 @@ void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
>>   {
>>       uintptr_t addr = tb->jmp_target_addr[n];
>>       intptr_t diff = addr - jmp_rx;
>> -    tcg_insn_unit insn;
>>   
>>       if (USE_REG_TB) {
>>           return;
>>       }
>>   
>> -    if (in_range_b(diff)) {
>> -        insn = B | (diff & 0x3fffffc);
>> -    } else {
>> -        insn = NOP;
>> -    }
>> +    if (have_isa_3_10) {
>> +        tcg_insn_unit insn1, insn2;
>> +        uint64_t pair;
>>   
>> -    qatomic_set((uint32_t *)jmp_rw, insn);
>> -    flush_idcache_range(jmp_rx, jmp_rw, 4);
>> +        if (in_range_b(diff)) {
>> +            insn1 = B | (diff & 0x3fffffc);
>> +            insn2 = NOP;
>> +        } else if (diff == sextract64(diff, 0, 34)) {
>> +            /* PLA tmp1, diff */
>> +            insn1 = OPCD(1) | (2 << 24) | (1 << 20) | ((diff >> 16) & 0x3ffff);
>> +            insn2 = ADDI | TAI(TCG_REG_TMP1, 0, diff);
>> +        } else {
>> +            addr = (uintptr_t)&tb->jmp_target_addr[n];
>> +            diff = addr - jmp_rx;
>> +            tcg_debug_assert(diff == sextract64(diff, 0, 34));
>> +            /* PLD tmp1, diff */
>> +            insn1 = OPCD(1) | (1 << 20) | ((diff >> 16) & 0x3ffff);
>> +            insn2 = PLD | TAI(TCG_REG_TMP1, 0, diff);
>> +        }
> 
> B is a "patch class" word instruction as per CMODX in the ISA, which may
> be patched to/from other instructions without a flush+isync sequence
> between. So that part is okay, at least if you were just patching the B
> word. But patching between the PLA and PLD I don't think is kosher per
> ISA.
> 
> I struggle a bit with this part of the ISA, particularly with prefix
> instructions (it only talks about patching 4 bytes at a time).
> 
> If we patch something it has to go through a patch instruction, which
> is a direct branch, trap, or nop. I think that makes this non-trivial.
> 
> It could work if you only patched between B and PLD. B->PLD would have
> to patch the suffix word first, possibly with an interleaving sync, and
> then the prefix. PLD->B could just patch the B word.
> 
> How much would losing the PLA hurt?

Really?  I can't imagine how some icache would see a torn prefixed insn given an atomic 
store (CMODX talks about prefixed instructions which "may be unaligned" -- but what if 
they are not?).

But if patching an aligned prefixed insn isn't allowed, I would patch between B and NOP, 
leave the PLD alone on the fall-through path, and drop the PLA.


r~
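
Concretely, the fallback Richard proposes would emit something like the
sketch below, using the emitter helpers visible elsewhere in this series.
Only the first word is ever patched, and both of its possible values (NOP
and B) are patch class; the displacement handling is simplified here and
glosses over prefix alignment:

    /* Patchable slot: NOP when unlinked, B <target> when linked. */
    set_jmp_insn_offset(s, which);      /* record the patchable word */
    tcg_out32(s, NOP);

    /* Fixed fall-through, never repatched: load the target from the
     * TranslationBlock and branch indirectly.  A real version must
     * account for the prefix-alignment NOP (see the
     * tcg_pcrel_diff_for_prefix() helper introduced below). */
    tcg_out_8ls_d(s, PLD, TCG_REG_TMP1, 0, tcg_pcrel_diff(s, (void *)ptr), 1);
    tcg_out32(s, MTSPR | RS(TCG_REG_TMP1) | CTR);
    tcg_out32(s, BCCTR | BO_ALWAYS);
    set_jmp_reset_offset(s, which);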
Nicholas Piggin Aug. 7, 2023, 1:51 a.m. UTC | #3
On Mon Aug 7, 2023 at 12:13 AM AEST, Richard Henderson wrote:
> On 8/6/23 05:55, Nicholas Piggin wrote:
> > On Sat Aug 5, 2023 at 7:33 AM AEST, Richard Henderson wrote:
> >> When a direct branch is out of range, we can load the destination for
> >> the indirect branch using PLA (for 16GB worth of buffer) and PLD from
> >> the TranslationBlock for everything larger.
> >>
> >> This means the patch affects exactly one instruction: B (plus filler),
> >> PLA or PLD.  Which means we can update and execute the patch atomically.
> >>
> >> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> >> ---
> >>   tcg/ppc/tcg-target.c.inc | 76 ++++++++++++++++++++++++++++++----------
> >>   1 file changed, 58 insertions(+), 18 deletions(-)
> >>
> >> diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
> >> index 5b243b2353..47c71bb5f2 100644
> >> --- a/tcg/ppc/tcg-target.c.inc
> >> +++ b/tcg/ppc/tcg-target.c.inc
> >> @@ -2642,31 +2642,41 @@ static void tcg_out_goto_tb(TCGContext *s, int which)
> >>       uintptr_t ptr = get_jmp_target_addr(s, which);
> >>   
> >>       if (USE_REG_TB) {
> >> +        /*
> >> +         * With REG_TB, we must always use indirect branching,
> >> +         * so that the branch destination and TCG_REG_TB match.
> >> +         */
> >>           ptrdiff_t offset = tcg_tbrel_diff(s, (void *)ptr);
> >>           tcg_out_mem_long(s, LD, LDX, TCG_REG_TB, TCG_REG_TB, offset);
> >> -
> >> -        /* TODO: Use direct branches when possible. */
> >> -        set_jmp_insn_offset(s, which);
> >>           tcg_out32(s, MTSPR | RS(TCG_REG_TB) | CTR);
> >> -
> >>           tcg_out32(s, BCCTR | BO_ALWAYS);
> >>   
> >>           /* For the unlinked case, need to reset TCG_REG_TB.  */
> >>           set_jmp_reset_offset(s, which);
> >>           tcg_out_mem_long(s, ADDI, ADD, TCG_REG_TB, TCG_REG_TB,
> >>                            -tcg_current_code_size(s));
> >> +        return;
> >> +    }
> >> +
> >> +    if (have_isa_3_10) {
> >> +        /* Align, so that we can patch 8 bytes atomically. */
> >> +        if ((uintptr_t)s->code_ptr & 7) {
> >> +            tcg_out32(s, NOP);
> >> +        }
> >> +        set_jmp_insn_offset(s, which);
> >> +        /* Direct branch will be patched by tb_target_set_jmp_target. */
> >> +        tcg_out_mls_d(s, ADDI, TCG_REG_TMP1, 0, 0, 1);
> >>       } else {
> >>           /* Direct branch will be patched by tb_target_set_jmp_target. */
> >> -        set_jmp_insn_offset(s, which);
> >> -        tcg_out32(s, NOP);
> >> -
> >> +        tcg_out32(s, B);
> >>           /* When branch is out of range, fall through to indirect. */
> >>           tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP1, ptr - (int16_t)ptr);
> >>           tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_REG_TMP1, (int16_t)ptr);
> >> -        tcg_out32(s, MTSPR | RS(TCG_REG_TMP1) | CTR);
> >> -        tcg_out32(s, BCCTR | BO_ALWAYS);
> >> -        set_jmp_reset_offset(s, which);
> >>       }
> >> +
> >> +    tcg_out32(s, MTSPR | RS(TCG_REG_TMP1) | CTR);
> >> +    tcg_out32(s, BCCTR | BO_ALWAYS);
> >> +    set_jmp_reset_offset(s, which);
> >>   }
> >>   
> >>   void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
> >> @@ -2674,20 +2684,50 @@ void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
> >>   {
> >>       uintptr_t addr = tb->jmp_target_addr[n];
> >>       intptr_t diff = addr - jmp_rx;
> >> -    tcg_insn_unit insn;
> >>   
> >>       if (USE_REG_TB) {
> >>           return;
> >>       }
> >>   
> >> -    if (in_range_b(diff)) {
> >> -        insn = B | (diff & 0x3fffffc);
> >> -    } else {
> >> -        insn = NOP;
> >> -    }
> >> +    if (have_isa_3_10) {
> >> +        tcg_insn_unit insn1, insn2;
> >> +        uint64_t pair;
> >>   
> >> -    qatomic_set((uint32_t *)jmp_rw, insn);
> >> -    flush_idcache_range(jmp_rx, jmp_rw, 4);
> >> +        if (in_range_b(diff)) {
> >> +            insn1 = B | (diff & 0x3fffffc);
> >> +            insn2 = NOP;
> >> +        } else if (diff == sextract64(diff, 0, 34)) {
> >> +            /* PLA tmp1, diff */
> >> +            insn1 = OPCD(1) | (2 << 24) | (1 << 20) | ((diff >> 16) & 0x3ffff);
> >> +            insn2 = ADDI | TAI(TCG_REG_TMP1, 0, diff);
> >> +        } else {
> >> +            addr = (uintptr_t)&tb->jmp_target_addr[n];
> >> +            diff = addr - jmp_rx;
> >> +            tcg_debug_assert(diff == sextract64(diff, 0, 34));
> >> +            /* PLD tmp1, diff */
> >> +            insn1 = OPCD(1) | (1 << 20) | ((diff >> 16) & 0x3ffff);
> >> +            insn2 = PLD | TAI(TCG_REG_TMP1, 0, diff);
> >> +        }
> > 
> > B is a "patch class" word instruction as per CMODX in the ISA, which may
> > be patched to/from other instructions without a flush+isync sequence
> > between. So that part is okay, at least if you were just patching the B
> > word. But patching between the PLA and PLD I don't think is kosher per
> > ISA.
> > 
> > I struggle a bit with this part of the ISA, particularly with prefix
> > instructions (it only talks about patching 4 bytes at a time).
> > 
> > If we patch something it has to go through a patch instruction, which
> > is a direct branch, trap, or nop. I think that makes this non-trivial.
> > 
> > It could work if you only patched between B and PLD. B->PLD would have
> > to patch the suffix word first, possibly with an interleaving sync, and
> > then the prefix. PLD->B could just patch the B word.
> > 
> > How much would losing the PLA hurt?
>
> Really?  I can't imagine how some icache would see a torn prefixed insn given an atomic 
> store (CMODX talks about prefixed instructions which "may be unaligned" -- but what if 
> they are not?).

Good question, that might just be a case of not wanting to define and
verify it. Right now I think the way to CMODX a suffix is to patch the
prefix such that the suffix word can no longer be concurrently executed,
(e.g., with a branch or trap), then patch the suffix word, then patch a
the desired prefix word back in. I'd be almost certain it would work
correctly to patch an aligned dword insn at once, but better to go by
the book.

But there is an additional issue which is not just about torn writes:
implementations may fetch an instruction more than once. The same
restriction exists for 4-byte instructions; you aren't meant to patch
an ld to an add, for example.

I don't know how much of that is historical, or whether it's just to
reduce verification space. ISTR there are some real concerns around
pipeline recovery corner cases.

> But if patching an aligned prefixed insn isn't allowed, I would patch between B and NOP, 
> leave the PLD alone on the fall-through path, and drop the PLA.

For now, if you do that, maybe just leave a comment that we can't
patch PLA due to CMODX restrictions; I would be less nervous about it.

I might ask about this internally; a few months ago I had some
similar questions about problems with patching prefixes but didn't
have a good example.

Thanks,
Nick
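
A sketch of the by-the-book procedure Nick describes, under the
assumption that it is applied to an aligned prefix/suffix pair.
qatomic_set() and flush_idcache_range() are the QEMU helpers already used
in this patch; TRAP_INSN, the function name, and the exact flush/sync
placement are assumptions, and the necessity of each intermediate flush
is precisely the open question here:

    #include <stdint.h>

    #define TRAP_INSN 0x7fe00008u   /* tw 31,0,0: unconditional trap */

    static void repatch_prefixed(uint32_t *rw, uintptr_t rx,
                                 uint32_t new_prefix, uint32_t new_suffix)
    {
        /* 1. Make the old suffix unreachable: overwrite the prefix word
         *    with a patch-class instruction (a trap, or a branch over
         *    the pair). */
        qatomic_set(&rw[0], TRAP_INSN);
        flush_idcache_range(rx, (uintptr_t)rw, 4);

        /* 2. The suffix word can now be rewritten safely. */
        qatomic_set(&rw[1], new_suffix);
        flush_idcache_range(rx + 4, (uintptr_t)(rw + 1), 4);

        /* 3. Install the desired prefix word last. */
        qatomic_set(&rw[0], new_prefix);
        flush_idcache_range(rx, (uintptr_t)rw, 4);
    }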
Jordan Niethe Aug. 7, 2023, 4:08 a.m. UTC | #4
On Sat, Aug 5, 2023 at 7:34 AM Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> When a direct branch is out of range, we can load the destination for
> the indirect branch using PLA (for 16GB worth of buffer) and PLD from
> the TranslationBlock for everything larger.
>
> This means the patch affects exactly one instruction: B (plus filler),
> PLA or PLD.  Which means we can update and execute the patch atomically.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  tcg/ppc/tcg-target.c.inc | 76 ++++++++++++++++++++++++++++++----------
>  1 file changed, 58 insertions(+), 18 deletions(-)
>
> diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
> index 5b243b2353..47c71bb5f2 100644
> --- a/tcg/ppc/tcg-target.c.inc
> +++ b/tcg/ppc/tcg-target.c.inc
> @@ -2642,31 +2642,41 @@ static void tcg_out_goto_tb(TCGContext *s, int which)
>      uintptr_t ptr = get_jmp_target_addr(s, which);
>
>      if (USE_REG_TB) {
> +        /*
> +         * With REG_TB, we must always use indirect branching,
> +         * so that the branch destination and TCG_REG_TB match.
> +         */
>          ptrdiff_t offset = tcg_tbrel_diff(s, (void *)ptr);
>          tcg_out_mem_long(s, LD, LDX, TCG_REG_TB, TCG_REG_TB, offset);
> -
> -        /* TODO: Use direct branches when possible. */
> -        set_jmp_insn_offset(s, which);
>          tcg_out32(s, MTSPR | RS(TCG_REG_TB) | CTR);
> -
>          tcg_out32(s, BCCTR | BO_ALWAYS);
>
>          /* For the unlinked case, need to reset TCG_REG_TB.  */
>          set_jmp_reset_offset(s, which);
>          tcg_out_mem_long(s, ADDI, ADD, TCG_REG_TB, TCG_REG_TB,
>                           -tcg_current_code_size(s));
> +        return;
> +    }
> +
> +    if (have_isa_3_10) {
> +        /* Align, so that we can patch 8 bytes atomically. */
> +        if ((uintptr_t)s->code_ptr & 7) {
> +            tcg_out32(s, NOP);
> +        }
> +        set_jmp_insn_offset(s, which);
> +        /* Direct branch will be patched by tb_target_set_jmp_target. */
> +        tcg_out_mls_d(s, ADDI, TCG_REG_TMP1, 0, 0, 1);
>      } else {
>          /* Direct branch will be patched by tb_target_set_jmp_target. */
> -        set_jmp_insn_offset(s, which);

It looks like 32-bit loses its set_jmp_insn_offset(); is that intended?

> -        tcg_out32(s, NOP);
> -
> +        tcg_out32(s, B);
>          /* When branch is out of range, fall through to indirect. */
>          tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP1, ptr - (int16_t)ptr);
>          tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_REG_TMP1, (int16_t)ptr);
> -        tcg_out32(s, MTSPR | RS(TCG_REG_TMP1) | CTR);
> -        tcg_out32(s, BCCTR | BO_ALWAYS);
> -        set_jmp_reset_offset(s, which);
>      }
> +
> +    tcg_out32(s, MTSPR | RS(TCG_REG_TMP1) | CTR);
> +    tcg_out32(s, BCCTR | BO_ALWAYS);
> +    set_jmp_reset_offset(s, which);
>  }
>
>  void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
> @@ -2674,20 +2684,50 @@ void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
>  {
>      uintptr_t addr = tb->jmp_target_addr[n];
>      intptr_t diff = addr - jmp_rx;
> -    tcg_insn_unit insn;
>
>      if (USE_REG_TB) {
>          return;
>      }
>
> -    if (in_range_b(diff)) {
> -        insn = B | (diff & 0x3fffffc);
> -    } else {
> -        insn = NOP;
> -    }
> +    if (have_isa_3_10) {
> +        tcg_insn_unit insn1, insn2;
> +        uint64_t pair;
>
> -    qatomic_set((uint32_t *)jmp_rw, insn);
> -    flush_idcache_range(jmp_rx, jmp_rw, 4);
> +        if (in_range_b(diff)) {
> +            insn1 = B | (diff & 0x3fffffc);
> +            insn2 = NOP;
> +        } else if (diff == sextract64(diff, 0, 34)) {
> +            /* PLA tmp1, diff */
> +            insn1 = OPCD(1) | (2 << 24) | (1 << 20) | ((diff >> 16) & 0x3ffff);
> +            insn2 = ADDI | TAI(TCG_REG_TMP1, 0, diff);
> +        } else {
> +            addr = (uintptr_t)&tb->jmp_target_addr[n];
> +            diff = addr - jmp_rx;
> +            tcg_debug_assert(diff == sextract64(diff, 0, 34));
> +            /* PLD tmp1, diff */
> +            insn1 = OPCD(1) | (1 << 20) | ((diff >> 16) & 0x3ffff);
> +            insn2 = PLD | TAI(TCG_REG_TMP1, 0, diff);
> +        }
> +
> +        if (HOST_BIG_ENDIAN) {
> +            pair = ((uint64_t)insn1) << 32 | insn2;
> +        } else {
> +            pair = ((uint64_t)insn2) << 32 | insn1;
> +        }
> +
> +        qatomic_set((uint64_t *)jmp_rw, pair);
> +        flush_idcache_range(jmp_rx, jmp_rw, 8);
> +    } else {
> +        tcg_insn_unit insn;
> +
> +        if (in_range_b(diff)) {
> +            insn = B | (diff & 0x3fffffc);
> +        } else {
> +            insn = NOP;
> +        }
> +        qatomic_set((uint32_t *)jmp_rw, insn);
> +        flush_idcache_range(jmp_rx, jmp_rw, 4);
> +    }
>  }
>
>  static void tcg_out_op(TCGContext *s, TCGOpcode opc,
> --
> 2.34.1
>
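
Jordan's catch looks real: in the rewritten pre-power10 path,
tcg_out32(s, B) is emitted without a preceding
set_jmp_insn_offset(s, which), so tb_target_set_jmp_target() has no
recorded offset to patch. The presumed intent, which the incremental
diff later in the thread restores, is:

    set_jmp_insn_offset(s, which);  /* record where the patchable word is */
    tcg_out32(s, B);                /* rewritten by tb_target_set_jmp_target */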
Nicholas Piggin Aug. 7, 2023, 7:29 a.m. UTC | #5
On Mon Aug 7, 2023 at 12:13 AM AEST, Richard Henderson wrote:
> On 8/6/23 05:55, Nicholas Piggin wrote:
> > On Sat Aug 5, 2023 at 7:33 AM AEST, Richard Henderson wrote:
> >> When a direct branch is out of range, we can load the destination for
> >> the indirect branch using PLA (for 16GB worth of buffer) and PLD from
> >> the TranslationBlock for everything larger.
> >>
> >> This means the patch affects exactly one instruction: B (plus filler),
> >> PLA or PLD.  Which means we can update and execute the patch atomically.
> >>
> >> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> >> ---
> >>   tcg/ppc/tcg-target.c.inc | 76 ++++++++++++++++++++++++++++++----------
> >>   1 file changed, 58 insertions(+), 18 deletions(-)
> >>
> >> diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
> >> index 5b243b2353..47c71bb5f2 100644
> >> --- a/tcg/ppc/tcg-target.c.inc
> >> +++ b/tcg/ppc/tcg-target.c.inc
> >> @@ -2642,31 +2642,41 @@ static void tcg_out_goto_tb(TCGContext *s, int which)
> >>       uintptr_t ptr = get_jmp_target_addr(s, which);
> >>   
> >>       if (USE_REG_TB) {
> >> +        /*
> >> +         * With REG_TB, we must always use indirect branching,
> >> +         * so that the branch destination and TCG_REG_TB match.
> >> +         */
> >>           ptrdiff_t offset = tcg_tbrel_diff(s, (void *)ptr);
> >>           tcg_out_mem_long(s, LD, LDX, TCG_REG_TB, TCG_REG_TB, offset);
> >> -
> >> -        /* TODO: Use direct branches when possible. */
> >> -        set_jmp_insn_offset(s, which);
> >>           tcg_out32(s, MTSPR | RS(TCG_REG_TB) | CTR);
> >> -
> >>           tcg_out32(s, BCCTR | BO_ALWAYS);
> >>   
> >>           /* For the unlinked case, need to reset TCG_REG_TB.  */
> >>           set_jmp_reset_offset(s, which);
> >>           tcg_out_mem_long(s, ADDI, ADD, TCG_REG_TB, TCG_REG_TB,
> >>                            -tcg_current_code_size(s));
> >> +        return;
> >> +    }
> >> +
> >> +    if (have_isa_3_10) {
> >> +        /* Align, so that we can patch 8 bytes atomically. */
> >> +        if ((uintptr_t)s->code_ptr & 7) {
> >> +            tcg_out32(s, NOP);
> >> +        }
> >> +        set_jmp_insn_offset(s, which);
> >> +        /* Direct branch will be patched by tb_target_set_jmp_target. */
> >> +        tcg_out_mls_d(s, ADDI, TCG_REG_TMP1, 0, 0, 1);
> >>       } else {
> >>           /* Direct branch will be patched by tb_target_set_jmp_target. */
> >> -        set_jmp_insn_offset(s, which);
> >> -        tcg_out32(s, NOP);
> >> -
> >> +        tcg_out32(s, B);
> >>           /* When branch is out of range, fall through to indirect. */
> >>           tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP1, ptr - (int16_t)ptr);
> >>           tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_REG_TMP1, (int16_t)ptr);
> >> -        tcg_out32(s, MTSPR | RS(TCG_REG_TMP1) | CTR);
> >> -        tcg_out32(s, BCCTR | BO_ALWAYS);
> >> -        set_jmp_reset_offset(s, which);
> >>       }
> >> +
> >> +    tcg_out32(s, MTSPR | RS(TCG_REG_TMP1) | CTR);
> >> +    tcg_out32(s, BCCTR | BO_ALWAYS);
> >> +    set_jmp_reset_offset(s, which);
> >>   }
> >>   
> >>   void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
> >> @@ -2674,20 +2684,50 @@ void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
> >>   {
> >>       uintptr_t addr = tb->jmp_target_addr[n];
> >>       intptr_t diff = addr - jmp_rx;
> >> -    tcg_insn_unit insn;
> >>   
> >>       if (USE_REG_TB) {
> >>           return;
> >>       }
> >>   
> >> -    if (in_range_b(diff)) {
> >> -        insn = B | (diff & 0x3fffffc);
> >> -    } else {
> >> -        insn = NOP;
> >> -    }
> >> +    if (have_isa_3_10) {
> >> +        tcg_insn_unit insn1, insn2;
> >> +        uint64_t pair;
> >>   
> >> -    qatomic_set((uint32_t *)jmp_rw, insn);
> >> -    flush_idcache_range(jmp_rx, jmp_rw, 4);
> >> +        if (in_range_b(diff)) {
> >> +            insn1 = B | (diff & 0x3fffffc);
> >> +            insn2 = NOP;
> >> +        } else if (diff == sextract64(diff, 0, 34)) {
> >> +            /* PLA tmp1, diff */
> >> +            insn1 = OPCD(1) | (2 << 24) | (1 << 20) | ((diff >> 16) & 0x3ffff);
> >> +            insn2 = ADDI | TAI(TCG_REG_TMP1, 0, diff);
> >> +        } else {
> >> +            addr = (uintptr_t)&tb->jmp_target_addr[n];
> >> +            diff = addr - jmp_rx;
> >> +            tcg_debug_assert(diff == sextract64(diff, 0, 34));
> >> +            /* PLD tmp1, diff */
> >> +            insn1 = OPCD(1) | (1 << 20) | ((diff >> 16) & 0x3ffff);
> >> +            insn2 = PLD | TAI(TCG_REG_TMP1, 0, diff);
> >> +        }
> > 
> > B is a "patch class" word instruction as per CMODX in the ISA, which may
> > be patched to/from other instructions without a flush+isync sequence
> > between. So that part is okay, at least if you were just patching the B
> > word. But patching between the PLA and PLD I don't think is kosher per
> > ISA.
> > 
> > I struggle a bit with this part of the ISA, particularly with prefix
> > instructions (it only talks about patching 4 bytes at a time).
> > 
> > If we patch something it has to go through a patch instruction, which
> > is a direct branch, trap, or nop. I think that makes this non-trivial.
> > 
> > It could work if you only patched between B and PLD. B->PLD would have
> > to patch the suffix word first, possibly with an interleaving sync, and
> > then the prefix. PLD->B could just patch the B word.
> > 
> > How much would losing the PLA hurt?
>
> Really?  I can't imagine how some icache would see a torn prefixed insn given an atomic 
> store (CMODX talks about prefixed instructions which "may be unaligned" -- but what if 
> they are not?).
>
> But if patching an aligned prefixed insn isn't allowed, I would patch between B and NOP, 
> leave the PLD alone on the fall-through path, and drop the PLA.

Hmm, even patching two B instructions with different offsets in some
sequence with a PLD would go against the ISA, because PLD is not a
patch class instruction. The only case where you can patch those
requires a sequence of exactly two instruction values. So I think you
need the B first, and you can patch that between any offset of B or a
NOP to fall through to PLD. NOP and B are both patch class, so any
sequence of them is allowed.

Thanks,
Nick
Nicholas Piggin Aug. 7, 2023, 9:38 a.m. UTC | #6
On Mon Aug 7, 2023 at 5:29 PM AEST, Nicholas Piggin wrote:
> On Mon Aug 7, 2023 at 12:13 AM AEST, Richard Henderson wrote:
> > On 8/6/23 05:55, Nicholas Piggin wrote:
> > > On Sat Aug 5, 2023 at 7:33 AM AEST, Richard Henderson wrote:
> > >> When a direct branch is out of range, we can load the destination for
> > >> the indirect branch using PLA (for 16GB worth of buffer) and PLD from
> > >> the TranslationBlock for everything larger.
> > >>
> > >> This means the patch affects exactly one instruction: B (plus filler),
> > >> PLA or PLD.  Which means we can update and execute the patch atomically.
> > >>
> > >> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> > >> ---
> > >>   tcg/ppc/tcg-target.c.inc | 76 ++++++++++++++++++++++++++++++----------
> > >>   1 file changed, 58 insertions(+), 18 deletions(-)
> > >>
> > >> diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
> > >> index 5b243b2353..47c71bb5f2 100644
> > >> --- a/tcg/ppc/tcg-target.c.inc
> > >> +++ b/tcg/ppc/tcg-target.c.inc
> > >> @@ -2642,31 +2642,41 @@ static void tcg_out_goto_tb(TCGContext *s, int which)
> > >>       uintptr_t ptr = get_jmp_target_addr(s, which);
> > >>   
> > >>       if (USE_REG_TB) {
> > >> +        /*
> > >> +         * With REG_TB, we must always use indirect branching,
> > >> +         * so that the branch destination and TCG_REG_TB match.
> > >> +         */
> > >>           ptrdiff_t offset = tcg_tbrel_diff(s, (void *)ptr);
> > >>           tcg_out_mem_long(s, LD, LDX, TCG_REG_TB, TCG_REG_TB, offset);
> > >> -
> > >> -        /* TODO: Use direct branches when possible. */
> > >> -        set_jmp_insn_offset(s, which);
> > >>           tcg_out32(s, MTSPR | RS(TCG_REG_TB) | CTR);
> > >> -
> > >>           tcg_out32(s, BCCTR | BO_ALWAYS);
> > >>   
> > >>           /* For the unlinked case, need to reset TCG_REG_TB.  */
> > >>           set_jmp_reset_offset(s, which);
> > >>           tcg_out_mem_long(s, ADDI, ADD, TCG_REG_TB, TCG_REG_TB,
> > >>                            -tcg_current_code_size(s));
> > >> +        return;
> > >> +    }
> > >> +
> > >> +    if (have_isa_3_10) {
> > >> +        /* Align, so that we can patch 8 bytes atomically. */
> > >> +        if ((uintptr_t)s->code_ptr & 7) {
> > >> +            tcg_out32(s, NOP);
> > >> +        }
> > >> +        set_jmp_insn_offset(s, which);
> > >> +        /* Direct branch will be patched by tb_target_set_jmp_target. */
> > >> +        tcg_out_mls_d(s, ADDI, TCG_REG_TMP1, 0, 0, 1);
> > >>       } else {
> > >>           /* Direct branch will be patched by tb_target_set_jmp_target. */
> > >> -        set_jmp_insn_offset(s, which);
> > >> -        tcg_out32(s, NOP);
> > >> -
> > >> +        tcg_out32(s, B);
> > >>           /* When branch is out of range, fall through to indirect. */
> > >>           tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP1, ptr - (int16_t)ptr);
> > >>           tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_REG_TMP1, (int16_t)ptr);
> > >> -        tcg_out32(s, MTSPR | RS(TCG_REG_TMP1) | CTR);
> > >> -        tcg_out32(s, BCCTR | BO_ALWAYS);
> > >> -        set_jmp_reset_offset(s, which);
> > >>       }
> > >> +
> > >> +    tcg_out32(s, MTSPR | RS(TCG_REG_TMP1) | CTR);
> > >> +    tcg_out32(s, BCCTR | BO_ALWAYS);
> > >> +    set_jmp_reset_offset(s, which);
> > >>   }
> > >>   
> > >>   void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
> > >> @@ -2674,20 +2684,50 @@ void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
> > >>   {
> > >>       uintptr_t addr = tb->jmp_target_addr[n];
> > >>       intptr_t diff = addr - jmp_rx;
> > >> -    tcg_insn_unit insn;
> > >>   
> > >>       if (USE_REG_TB) {
> > >>           return;
> > >>       }
> > >>   
> > >> -    if (in_range_b(diff)) {
> > >> -        insn = B | (diff & 0x3fffffc);
> > >> -    } else {
> > >> -        insn = NOP;
> > >> -    }
> > >> +    if (have_isa_3_10) {
> > >> +        tcg_insn_unit insn1, insn2;
> > >> +        uint64_t pair;
> > >>   
> > >> -    qatomic_set((uint32_t *)jmp_rw, insn);
> > >> -    flush_idcache_range(jmp_rx, jmp_rw, 4);
> > >> +        if (in_range_b(diff)) {
> > >> +            insn1 = B | (diff & 0x3fffffc);
> > >> +            insn2 = NOP;
> > >> +        } else if (diff == sextract64(diff, 0, 34)) {
> > >> +            /* PLA tmp1, diff */
> > >> +            insn1 = OPCD(1) | (2 << 24) | (1 << 20) | ((diff >> 16) & 0x3ffff);
> > >> +            insn2 = ADDI | TAI(TCG_REG_TMP1, 0, diff);
> > >> +        } else {
> > >> +            addr = (uintptr_t)&tb->jmp_target_addr[n];
> > >> +            diff = addr - jmp_rx;
> > >> +            tcg_debug_assert(diff == sextract64(diff, 0, 34));
> > >> +            /* PLD tmp1, diff */
> > >> +            insn1 = OPCD(1) | (1 << 20) | ((diff >> 16) & 0x3ffff);
> > >> +            insn2 = PLD | TAI(TCG_REG_TMP1, 0, diff);
> > >> +        }
> > > 
> > > B is a "patch class" word instruction as per CMODX in the ISA, which may
> > > be patched to/from other instructions without a flush+isync sequence
> > > between. So that part is okay, at least if you were just patching the B
> > > word. But patching between the PLA and PLD I don't think is kosher per
> > > ISA.
> > > 
> > > I struggle a bit with this part of the ISA, particularly with prefix
> > > instructions (it only talks about patching 4 bytes at a time).
> > > 
> > > If we patch something it has to go through a patch instruction, which
> > > is a direct branch, trap, or nop. I think that makes this non-trivial.
> > > 
> > > It could work if you only patched between B and PLD. B->PLD would have
> > > to patch the suffix word first, possibly with an interleaving sync, and
> > > then the prefix. PLD->B could just patch the B word.
> > > 
> > > How much would losing the PLA hurt?
> >
> > Really?  I can't imagine how some icache would see a torn prefixed insn given an atomic 
> > store (CMODX talks about prefixed instructions which "may be unaligned" -- but what if 
> > they are not?).
> >
> > But if patching an aligned prefixed insn isn't allowed, I would patch between B and NOP, 
> > leave the PLD alone on the fall-through path, and drop the PLA.
>
> Hmm, even patching two B instructions with different offsets in some
> sequence with a PLD would go against the ISA, because PLD is not a
> patch class instruction. The only case where you can patch those
> requires a sequence of exactly two instruction values. So I think you
> need the B first, and you can patch that between any offset of B or a
> NOP to fall through to PLD. NOP and B are both patch class, so any
> sequence of them is allowed.

Here is an incremental diff that hopefully solves those issues. Sadly,
it removes the nice PLA patching :( But it seems to be at or close to
the original performance in my test.

Feel free to squash it in or take inspiration (or dispute it).

Thanks,
Nick

---
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index 47c71bb5f2..569c2e3647 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -748,6 +748,11 @@ static void tcg_out_prefix_align(TCGContext *s)
     }
 }
 
+static ptrdiff_t tcg_pcrel_diff_for_prefix(TCGContext *s, const void *target)
+{
+    return tcg_pcrel_diff(s, target) - (tcg_out_need_prefix_align(s) ? 4 : 0);
+}
+
 /* Output Type 00 Prefix - 8-Byte Load/Store Form (8LS:D) */
 static void tcg_out_8ls_d(TCGContext *s, tcg_insn_unit opc, unsigned rt,
                           unsigned ra, tcg_target_long imm, bool r)
@@ -1072,8 +1077,7 @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret,
             return;
         }
 
-        tmp = tcg_out_need_prefix_align(s) * 4;
-        tmp = tcg_pcrel_diff(s, (void *)arg) - tmp;
+        tmp = tcg_pcrel_diff_for_prefix(s, (void *)arg);
         if (tmp == sextract64(tmp, 0, 34)) {
             /* pla ret,value = paddi ret,0,value,1 */
             tcg_out_mls_d(s, ADDI, ret, 0, tmp, 1);
@@ -2658,18 +2662,19 @@ static void tcg_out_goto_tb(TCGContext *s, int which)
         return;
     }
 
+    /* Direct branch will be patched by tb_target_set_jmp_target. */
+    set_jmp_insn_offset(s, which);
+    tcg_out32(s, NOP);
+
+    /* When branch is out of range, fall through to indirect. */
     if (have_isa_3_10) {
-        /* Align, so that we can patch 8 bytes atomically. */
-        if ((uintptr_t)s->code_ptr & 7) {
-            tcg_out32(s, NOP);
-        }
-        set_jmp_insn_offset(s, which);
-        /* Direct branch will be patched by tb_target_set_jmp_target. */
-        tcg_out_mls_d(s, ADDI, TCG_REG_TMP1, 0, 0, 1);
+        ptrdiff_t offset = tcg_pcrel_diff_for_prefix(s, (void *)ptr);
+        /*
+         * Would be nice to use PLA if offset is in range,
+         * but CMODX rules make that difficult.
+         */
+        tcg_out_8ls_d(s, PLD, TCG_REG_TMP1, 0, offset, 1);
     } else {
-        /* Direct branch will be patched by tb_target_set_jmp_target. */
-        tcg_out32(s, B);
-        /* When branch is out of range, fall through to indirect. */
         tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP1, ptr - (int16_t)ptr);
         tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_REG_TMP1, (int16_t)ptr);
     }
@@ -2684,50 +2689,19 @@ void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
 {
     uintptr_t addr = tb->jmp_target_addr[n];
     intptr_t diff = addr - jmp_rx;
+    tcg_insn_unit insn;
 
     if (USE_REG_TB) {
         return;
     }
 
-    if (have_isa_3_10) {
-        tcg_insn_unit insn1, insn2;
-        uint64_t pair;
-
-        if (in_range_b(diff)) {
-            insn1 = B | (diff & 0x3fffffc);
-            insn2 = NOP;
-        } else if (diff == sextract64(diff, 0, 34)) {
-            /* PLA tmp1, diff */
-            insn1 = OPCD(1) | (2 << 24) | (1 << 20) | ((diff >> 16) & 0x3ffff);
-            insn2 = ADDI | TAI(TCG_REG_TMP1, 0, diff);
-        } else {
-            addr = (uintptr_t)&tb->jmp_target_addr[n];
-            diff = addr - jmp_rx;
-            tcg_debug_assert(diff == sextract64(diff, 0, 34));
-            /* PLD tmp1, diff */
-            insn1 = OPCD(1) | (1 << 20) | ((diff >> 16) & 0x3ffff);
-            insn2 = PLD | TAI(TCG_REG_TMP1, 0, diff);
-        }
-
-        if (HOST_BIG_ENDIAN) {
-            pair = ((uint64_t)insn1) << 32 | insn2;
-        } else {
-            pair = ((uint64_t)insn2) << 32 | insn1;
-        }
-
-        qatomic_set((uint64_t *)jmp_rw, pair);
-        flush_idcache_range(jmp_rx, jmp_rw, 8);
+    if (in_range_b(diff)) {
+        insn = B | (diff & 0x3fffffc);
     } else {
-        tcg_insn_unit insn;
-
-        if (in_range_b(diff)) {
-            insn = B | (diff & 0x3fffffc);
-        } else {
-            insn = NOP;
-        }
-        qatomic_set((uint32_t *)jmp_rw, insn);
-        flush_idcache_range(jmp_rx, jmp_rw, 4);
+        insn = NOP;
     }
+    qatomic_set((uint32_t *)jmp_rw, insn);
+    flush_idcache_range(jmp_rx, jmp_rw, 4);
 }
 
 static void tcg_out_op(TCGContext *s, TCGOpcode opc,
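
For context on the tcg_out_need_prefix_align()/tcg_pcrel_diff_for_prefix()
pair this diff relies on: a prefixed instruction may not cross a 64-byte
instruction address boundary, so the emitter inserts one NOP when the
prefix would otherwise start in the last word of a 64-byte block, and that
NOP shifts the address from which the PC-relative displacement is
measured. A hypothetical standalone version of the check:

    #include <stdbool.h>
    #include <stdint.h>

    /* Would a prefixed instruction emitted at code_ptr cross a 64-byte
     * boundary?  If so, one alignment NOP is emitted first, and any
     * PC-relative displacement must be computed 4 bytes later. */
    static bool need_prefix_align(uintptr_t code_ptr)
    {
        return (code_ptr & 0x3f) == 0x3c;
    }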

Patch

diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index 5b243b2353..47c71bb5f2 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -2642,31 +2642,41 @@  static void tcg_out_goto_tb(TCGContext *s, int which)
     uintptr_t ptr = get_jmp_target_addr(s, which);
 
     if (USE_REG_TB) {
+        /*
+         * With REG_TB, we must always use indirect branching,
+         * so that the branch destination and TCG_REG_TB match.
+         */
         ptrdiff_t offset = tcg_tbrel_diff(s, (void *)ptr);
         tcg_out_mem_long(s, LD, LDX, TCG_REG_TB, TCG_REG_TB, offset);
-    
-        /* TODO: Use direct branches when possible. */
-        set_jmp_insn_offset(s, which);
         tcg_out32(s, MTSPR | RS(TCG_REG_TB) | CTR);
-
         tcg_out32(s, BCCTR | BO_ALWAYS);
 
         /* For the unlinked case, need to reset TCG_REG_TB.  */
         set_jmp_reset_offset(s, which);
         tcg_out_mem_long(s, ADDI, ADD, TCG_REG_TB, TCG_REG_TB,
                          -tcg_current_code_size(s));
+        return;
+    }
+
+    if (have_isa_3_10) {
+        /* Align, so that we can patch 8 bytes atomically. */
+        if ((uintptr_t)s->code_ptr & 7) {
+            tcg_out32(s, NOP);
+        }
+        set_jmp_insn_offset(s, which);
+        /* Direct branch will be patched by tb_target_set_jmp_target. */
+        tcg_out_mls_d(s, ADDI, TCG_REG_TMP1, 0, 0, 1);
     } else {
         /* Direct branch will be patched by tb_target_set_jmp_target. */
-        set_jmp_insn_offset(s, which);
-        tcg_out32(s, NOP);
-
+        tcg_out32(s, B);
         /* When branch is out of range, fall through to indirect. */
         tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP1, ptr - (int16_t)ptr);
         tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_REG_TMP1, (int16_t)ptr);
-        tcg_out32(s, MTSPR | RS(TCG_REG_TMP1) | CTR);
-        tcg_out32(s, BCCTR | BO_ALWAYS);
-        set_jmp_reset_offset(s, which);
     }
+
+    tcg_out32(s, MTSPR | RS(TCG_REG_TMP1) | CTR);
+    tcg_out32(s, BCCTR | BO_ALWAYS);
+    set_jmp_reset_offset(s, which);
 }
 
 void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
@@ -2674,20 +2684,50 @@  void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
 {
     uintptr_t addr = tb->jmp_target_addr[n];
     intptr_t diff = addr - jmp_rx;
-    tcg_insn_unit insn;
 
     if (USE_REG_TB) {
         return;
     }
 
-    if (in_range_b(diff)) {
-        insn = B | (diff & 0x3fffffc);
-    } else {
-        insn = NOP;
-    }
+    if (have_isa_3_10) {
+        tcg_insn_unit insn1, insn2;
+        uint64_t pair;
 
-    qatomic_set((uint32_t *)jmp_rw, insn);
-    flush_idcache_range(jmp_rx, jmp_rw, 4);
+        if (in_range_b(diff)) {
+            insn1 = B | (diff & 0x3fffffc);
+            insn2 = NOP;
+        } else if (diff == sextract64(diff, 0, 34)) {
+            /* PLA tmp1, diff */
+            insn1 = OPCD(1) | (2 << 24) | (1 << 20) | ((diff >> 16) & 0x3ffff);
+            insn2 = ADDI | TAI(TCG_REG_TMP1, 0, diff);
+        } else {
+            addr = (uintptr_t)&tb->jmp_target_addr[n];
+            diff = addr - jmp_rx;
+            tcg_debug_assert(diff == sextract64(diff, 0, 34));
+            /* PLD tmp1, diff */
+            insn1 = OPCD(1) | (1 << 20) | ((diff >> 16) & 0x3ffff);
+            insn2 = PLD | TAI(TCG_REG_TMP1, 0, diff);
+        }
+
+        if (HOST_BIG_ENDIAN) {
+            pair = ((uint64_t)insn1) << 32 | insn2;
+        } else {
+            pair = ((uint64_t)insn2) << 32 | insn1;
+        }
+
+        qatomic_set((uint64_t *)jmp_rw, pair);
+        flush_idcache_range(jmp_rx, jmp_rw, 8);
+    } else {
+        tcg_insn_unit insn;
+
+        if (in_range_b(diff)) {
+            insn = B | (diff & 0x3fffffc);
+        } else {
+            insn = NOP;
+        }
+        qatomic_set((uint32_t *)jmp_rw, insn);
+        flush_idcache_range(jmp_rx, jmp_rw, 4);
+    }
 }
 
 static void tcg_out_op(TCGContext *s, TCGOpcode opc,