[edk2,v3,3/4] MdeModulePkg/EbxDxe AARCH64: use tail call for EBC to native thunk

Message ID 1471445945-19239-4-git-send-email-ard.biesheuvel@linaro.org
State New
Headers show

Commit Message

Ard Biesheuvel Aug. 17, 2016, 2:59 p.m.
Instead of pessimistically copying at least 64 bytes from the VM stack
to the native stack, and popping off the register arguments again
before doing the native call, try to avoid touching the stack completely
if the VM stack frame is < 64 bytes. Also, if the stack frame does exceed
64 bytes, there is no need to copy the first 64 bytes, since we are passing
those in registers anyway.

Contributed-under: TianoCore Contribution Agreement 1.0
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>

---
 MdeModulePkg/Universal/EbcDxe/AArch64/EbcLowLevel.S | 73 +++++++++++++++-----
 1 file changed, 55 insertions(+), 18 deletions(-)

-- 
2.7.4

_______________________________________________
edk2-devel mailing list
edk2-devel@lists.01.org
https://lists.01.org/mailman/listinfo/edk2-devel

Comments

Leif Lindholm Aug. 26, 2016, 12:54 p.m. | #1
On Wed, Aug 17, 2016 at 04:59:04PM +0200, Ard Biesheuvel wrote:
> Instead of pessimistically copying at least 64 bytes from the VM stack

> to the native stack, and popping off the register arguments again

> before doing the native call, try to avoid touching the stack completely

> if the VM stack frame is < 64 bytes. Also, if the stack frame does exceed


Should this say "does not"?

> 64 bytes, there is no need to copy the first 64 bytes, since we are passing

> those in registers anyway.

> 

> Contributed-under: TianoCore Contribution Agreement 1.0

> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>

> ---

>  MdeModulePkg/Universal/EbcDxe/AArch64/EbcLowLevel.S | 73 +++++++++++++++-----

>  1 file changed, 55 insertions(+), 18 deletions(-)

> 

> diff --git a/MdeModulePkg/Universal/EbcDxe/AArch64/EbcLowLevel.S b/MdeModulePkg/Universal/EbcDxe/AArch64/EbcLowLevel.S

> index cb7a70b5a4f8..d95713e82b0f 100644

> --- a/MdeModulePkg/Universal/EbcDxe/AArch64/EbcLowLevel.S

> +++ b/MdeModulePkg/Universal/EbcDxe/AArch64/EbcLowLevel.S

> @@ -35,30 +35,67 @@ ASM_GLOBAL ASM_PFX(mEbcInstructionBufferTemplate)

>  //****************************************************************************

>  // UINTN EbcLLCALLEXNative(UINTN FuncAddr, UINTN NewStackPointer, VOID *FramePtr)

>  ASM_PFX(EbcLLCALLEXNative):

> -      stp  x19, x20, [sp, #-16]!

> -      stp  x29, x30, [sp, #-16]!

> +    mov     x8, x0                 // Preserve x0

> +    mov     x9, x1                 // Preserve x1

>  

> -      mov  x19, x0

> -      mov  x20, sp

> -      sub  x2, x2, x1   // Length = NewStackPointer-FramePtr

> -      sub  sp, sp, x2

> -      sub  sp, sp, #64  // Make sure there is room for at least 8 args in the new stack

> -      mov  x0, sp

> +    //

> +    // If the EBC stack frame is smaller than or equal to 64 bytes, we know there

> +    // are no stacked arguments #9 and beyond that we need to copy to the native

> +    // stack. In this case, we can perform a tail call which is much more

> +    // efficient, since there is no need to touch the native stack at all.

> +    //

> +    sub     x3, x2, x1              // Length = NewStackPointer - FramePtr

> +    cmp     x3, #64

> +    b.gt    1f

>  

> -      bl   CopyMem      // Sp, NewStackPointer, Length

> +    adr     x0, 0f

> +    sub     x0, x0, x3, lsr #1

> +    br      x0

>  

> -      ldp  x0, x1, [sp], #16

> -      ldp  x2, x3, [sp], #16

> -      ldp  x4, x5, [sp], #16

> -      ldp  x6, x7, [sp], #16

> +    ldr     x7, [x9, #56]

> +    ldr     x6, [x9, #48]

> +    ldr     x5, [x9, #40]

> +    ldr     x4, [x9, #32]

> +    ldr     x3, [x9, #24]

> +    ldr     x2, [x9, #16]

> +    ldr     x1, [x9, #8]

> +    ldr     x0, [x9]


Why not keep using ldp, but with x9?

>  

> -      blr  x19

> +0:  br      x8

>  

> -      mov  sp,  x20

> -      ldp  x29, x30, [sp], #16

> -      ldp  x19, x20, [sp], #16

> +    //

> +    // More than 64 bytes: we need to build the full native stack frame and copy

> +    // the part of the VM stack exceeding 64 bytes (which may contain stacked

> +    // arguments) to the native stack

> +    //

> +1:  stp     x29, x30, [sp, #-16]!

> +    mov     x29, sp

>  

> -      ret

> +    //

> +    // Ensure that the stack pointer remains 16 byte aligned,

> +    // even if the size of the VM stack frame is not a multiple of 16

> +    //

> +    add     x1, x1, #64             // Skip over [potential] reg params

> +    tbz     x3, #3, 2f              // Multiple of 16?

> +    ldr     x4, [x2, #-8]!          // No? Then push one word

> +    str     x4, [sp, #-16]!         // ... but use two slots

> +    b       3f

> +

> +2:  ldp     x4, x5, [x2, #-16]!

> +    stp     x4, x5, [sp, #-16]!

> +3:  cmp     x2, x1

> +    b.gt    2b

> +

> +    ldp     x0, x1, [x9]

> +    ldp     x2, x3, [x9, #16]

> +    ldp     x4, x5, [x9, #32]

> +    ldp     x6, x7, [x9, #48]

> +

> +    blr     x8

> +

> +    mov     sp, x29

> +    ldp     x29, x30, [sp], #16

> +    ret

>  

>  //****************************************************************************

>  // EbcLLEbcInterpret

> -- 

> 2.7.4

> 

_______________________________________________
edk2-devel mailing list
edk2-devel@lists.01.org
https://lists.01.org/mailman/listinfo/edk2-devel
Ard Biesheuvel Aug. 26, 2016, 5:14 p.m. | #2
On 26 August 2016 at 13:54, Leif Lindholm <leif.lindholm@linaro.org> wrote:
> On Wed, Aug 17, 2016 at 04:59:04PM +0200, Ard Biesheuvel wrote:

>> Instead of pessimistically copying at least 64 bytes from the VM stack

>> to the native stack, and popping off the register arguments again

>> before doing the native call, try to avoid touching the stack completely

>> if the VM stack frame is < 64 bytes. Also, if the stack frame does exceed

>

> Should this say "does not"?

>


No, if it exceeds 64 bytes, we may have arguments that should be
passed on the stack (but we're not sure, unfortunately, since the VM
stack frame is not annotated, i.e., it could simply be local vars in
the caller) In this case, there is no need to push all arguments onto
the native stack, and pop off the first 8 into registers again, which
is what the original code was doing.

>> 64 bytes, there is no need to copy the first 64 bytes, since we are passing

>> those in registers anyway.

>>

>> Contributed-under: TianoCore Contribution Agreement 1.0

>> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>

>> ---

>>  MdeModulePkg/Universal/EbcDxe/AArch64/EbcLowLevel.S | 73 +++++++++++++++-----

>>  1 file changed, 55 insertions(+), 18 deletions(-)

>>

>> diff --git a/MdeModulePkg/Universal/EbcDxe/AArch64/EbcLowLevel.S b/MdeModulePkg/Universal/EbcDxe/AArch64/EbcLowLevel.S

>> index cb7a70b5a4f8..d95713e82b0f 100644

>> --- a/MdeModulePkg/Universal/EbcDxe/AArch64/EbcLowLevel.S

>> +++ b/MdeModulePkg/Universal/EbcDxe/AArch64/EbcLowLevel.S

>> @@ -35,30 +35,67 @@ ASM_GLOBAL ASM_PFX(mEbcInstructionBufferTemplate)

>>  //****************************************************************************

>>  // UINTN EbcLLCALLEXNative(UINTN FuncAddr, UINTN NewStackPointer, VOID *FramePtr)

>>  ASM_PFX(EbcLLCALLEXNative):

>> -      stp  x19, x20, [sp, #-16]!

>> -      stp  x29, x30, [sp, #-16]!

>> +    mov     x8, x0                 // Preserve x0

>> +    mov     x9, x1                 // Preserve x1

>>

>> -      mov  x19, x0

>> -      mov  x20, sp

>> -      sub  x2, x2, x1   // Length = NewStackPointer-FramePtr

>> -      sub  sp, sp, x2

>> -      sub  sp, sp, #64  // Make sure there is room for at least 8 args in the new stack

>> -      mov  x0, sp

>> +    //

>> +    // If the EBC stack frame is smaller than or equal to 64 bytes, we know there

>> +    // are no stacked arguments #9 and beyond that we need to copy to the native

>> +    // stack. In this case, we can perform a tail call which is much more

>> +    // efficient, since there is no need to touch the native stack at all.

>> +    //

>> +    sub     x3, x2, x1              // Length = NewStackPointer - FramePtr

>> +    cmp     x3, #64

>> +    b.gt    1f

>>

>> -      bl   CopyMem      // Sp, NewStackPointer, Length

>> +    adr     x0, 0f

>> +    sub     x0, x0, x3, lsr #1

>> +    br      x0

>>

>> -      ldp  x0, x1, [sp], #16

>> -      ldp  x2, x3, [sp], #16

>> -      ldp  x4, x5, [sp], #16

>> -      ldp  x6, x7, [sp], #16

>> +    ldr     x7, [x9, #56]

>> +    ldr     x6, [x9, #48]

>> +    ldr     x5, [x9, #40]

>> +    ldr     x4, [x9, #32]

>> +    ldr     x3, [x9, #24]

>> +    ldr     x2, [x9, #16]

>> +    ldr     x1, [x9, #8]

>> +    ldr     x0, [x9]

>

> Why not keep using ldp, but with x9?

>


This is a computed jump, depending on the size of the stack frame,
hence the inverted order, and the LDRs. While probably harmless in
most cases, it is arguably incorrect to copy random data from memory
into these registers (and could even be a security issue)


>>

>> -      blr  x19

>> +0:  br      x8

>>

>> -      mov  sp,  x20

>> -      ldp  x29, x30, [sp], #16

>> -      ldp  x19, x20, [sp], #16

>> +    //

>> +    // More than 64 bytes: we need to build the full native stack frame and copy

>> +    // the part of the VM stack exceeding 64 bytes (which may contain stacked

>> +    // arguments) to the native stack

>> +    //

>> +1:  stp     x29, x30, [sp, #-16]!

>> +    mov     x29, sp

>>

>> -      ret

>> +    //

>> +    // Ensure that the stack pointer remains 16 byte aligned,

>> +    // even if the size of the VM stack frame is not a multiple of 16

>> +    //

>> +    add     x1, x1, #64             // Skip over [potential] reg params

>> +    tbz     x3, #3, 2f              // Multiple of 16?

>> +    ldr     x4, [x2, #-8]!          // No? Then push one word

>> +    str     x4, [sp, #-16]!         // ... but use two slots

>> +    b       3f

>> +

>> +2:  ldp     x4, x5, [x2, #-16]!

>> +    stp     x4, x5, [sp, #-16]!

>> +3:  cmp     x2, x1

>> +    b.gt    2b

>> +

>> +    ldp     x0, x1, [x9]

>> +    ldp     x2, x3, [x9, #16]

>> +    ldp     x4, x5, [x9, #32]

>> +    ldp     x6, x7, [x9, #48]

>> +

>> +    blr     x8

>> +

>> +    mov     sp, x29

>> +    ldp     x29, x30, [sp], #16

>> +    ret

>>

>>  //****************************************************************************

>>  // EbcLLEbcInterpret

>> --

>> 2.7.4

>>

_______________________________________________
edk2-devel mailing list
edk2-devel@lists.01.org
https://lists.01.org/mailman/listinfo/edk2-devel
Leif Lindholm Aug. 26, 2016, 6:10 p.m. | #3
On Fri, Aug 26, 2016 at 06:14:10PM +0100, Ard Biesheuvel wrote:
> On 26 August 2016 at 13:54, Leif Lindholm <leif.lindholm@linaro.org> wrote:

> > On Wed, Aug 17, 2016 at 04:59:04PM +0200, Ard Biesheuvel wrote:

> >> Instead of pessimistically copying at least 64 bytes from the VM stack

> >> to the native stack, and popping off the register arguments again

> >> before doing the native call, try to avoid touching the stack completely

> >> if the VM stack frame is < 64 bytes. Also, if the stack frame does exceed

> >

> > Should this say "does not"?

> 

> No, if it exceeds 64 bytes, we may have arguments that should be

> passed on the stack (but we're not sure, unfortunately, since the VM

> stack frame is not annotated, i.e., it could simply be local vars in

> the caller) In this case, there is no need to push all arguments onto

> the native stack, and pop off the first 8 into registers again, which

> is what the original code was doing.


OK, thanks.
Should that not be <= 64 bytes above then?

> >> 64 bytes, there is no need to copy the first 64 bytes, since we are passing

> >> those in registers anyway.

> >>

> >> Contributed-under: TianoCore Contribution Agreement 1.0

> >> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>

> >> ---

> >>  MdeModulePkg/Universal/EbcDxe/AArch64/EbcLowLevel.S | 73 +++++++++++++++-----

> >>  1 file changed, 55 insertions(+), 18 deletions(-)

> >>

> >> diff --git a/MdeModulePkg/Universal/EbcDxe/AArch64/EbcLowLevel.S b/MdeModulePkg/Universal/EbcDxe/AArch64/EbcLowLevel.S

> >> index cb7a70b5a4f8..d95713e82b0f 100644

> >> --- a/MdeModulePkg/Universal/EbcDxe/AArch64/EbcLowLevel.S

> >> +++ b/MdeModulePkg/Universal/EbcDxe/AArch64/EbcLowLevel.S

> >> @@ -35,30 +35,67 @@ ASM_GLOBAL ASM_PFX(mEbcInstructionBufferTemplate)

> >>  //****************************************************************************

> >>  // UINTN EbcLLCALLEXNative(UINTN FuncAddr, UINTN NewStackPointer, VOID *FramePtr)

> >>  ASM_PFX(EbcLLCALLEXNative):

> >> -      stp  x19, x20, [sp, #-16]!

> >> -      stp  x29, x30, [sp, #-16]!

> >> +    mov     x8, x0                 // Preserve x0

> >> +    mov     x9, x1                 // Preserve x1

> >>

> >> -      mov  x19, x0

> >> -      mov  x20, sp

> >> -      sub  x2, x2, x1   // Length = NewStackPointer-FramePtr

> >> -      sub  sp, sp, x2

> >> -      sub  sp, sp, #64  // Make sure there is room for at least 8 args in the new stack

> >> -      mov  x0, sp

> >> +    //

> >> +    // If the EBC stack frame is smaller than or equal to 64 bytes, we know there

> >> +    // are no stacked arguments #9 and beyond that we need to copy to the native

> >> +    // stack. In this case, we can perform a tail call which is much more

> >> +    // efficient, since there is no need to touch the native stack at all.

> >> +    //

> >> +    sub     x3, x2, x1              // Length = NewStackPointer - FramePtr

> >> +    cmp     x3, #64

> >> +    b.gt    1f

> >>

> >> -      bl   CopyMem      // Sp, NewStackPointer, Length

> >> +    adr     x0, 0f

> >> +    sub     x0, x0, x3, lsr #1

> >> +    br      x0

> >>

> >> -      ldp  x0, x1, [sp], #16

> >> -      ldp  x2, x3, [sp], #16

> >> -      ldp  x4, x5, [sp], #16

> >> -      ldp  x6, x7, [sp], #16

> >> +    ldr     x7, [x9, #56]

> >> +    ldr     x6, [x9, #48]

> >> +    ldr     x5, [x9, #40]

> >> +    ldr     x4, [x9, #32]

> >> +    ldr     x3, [x9, #24]

> >> +    ldr     x2, [x9, #16]

> >> +    ldr     x1, [x9, #8]

> >> +    ldr     x0, [x9]

> >

> > Why not keep using ldp, but with x9?

> 

> This is a computed jump, depending on the size of the stack frame,

> hence the inverted order, and the LDRs. While probably harmless in

> most cases, it is arguably incorrect to copy random data from memory

> into these registers (and could even be a security issue)


OK, that's way too clever for someone like me to realise from the
existing code (which arguably, you have simply corrected). Yes, even
with the block comment above.
Could we add some kind of for-dummies comment by this code to indicate
what is going on at the actual instructions.

Say:

+    ldr     x7, [x9, #56]  // Branch target for 8 arguments
+    ldr     x6, [x9, #48]  // |
+    ldr     x5, [x9, #40]  // |
+    ldr     x4, [x9, #32]  // |
+    ldr     x3, [x9, #24]  // |
+    ldr     x2, [x9, #16]  // |
+    ldr     x1, [x9, #8]   // V
+    ldr     x0, [x9]       // Branch target for 1 argument

-      blr  x19
+0:  br      x8             // Branch target for 0 arguments

(Unless I'm still misunderstanding.)

/
    Leif

> 

> >>

> >> -      blr  x19

> >> +0:  br      x8

> >>

> >> -      mov  sp,  x20

> >> -      ldp  x29, x30, [sp], #16

> >> -      ldp  x19, x20, [sp], #16

> >> +    //

> >> +    // More than 64 bytes: we need to build the full native stack frame and copy

> >> +    // the part of the VM stack exceeding 64 bytes (which may contain stacked

> >> +    // arguments) to the native stack

> >> +    //

> >> +1:  stp     x29, x30, [sp, #-16]!

> >> +    mov     x29, sp

> >>

> >> -      ret

> >> +    //

> >> +    // Ensure that the stack pointer remains 16 byte aligned,

> >> +    // even if the size of the VM stack frame is not a multiple of 16

> >> +    //

> >> +    add     x1, x1, #64             // Skip over [potential] reg params

> >> +    tbz     x3, #3, 2f              // Multiple of 16?

> >> +    ldr     x4, [x2, #-8]!          // No? Then push one word

> >> +    str     x4, [sp, #-16]!         // ... but use two slots

> >> +    b       3f

> >> +

> >> +2:  ldp     x4, x5, [x2, #-16]!

> >> +    stp     x4, x5, [sp, #-16]!

> >> +3:  cmp     x2, x1

> >> +    b.gt    2b

> >> +

> >> +    ldp     x0, x1, [x9]

> >> +    ldp     x2, x3, [x9, #16]

> >> +    ldp     x4, x5, [x9, #32]

> >> +    ldp     x6, x7, [x9, #48]

> >> +

> >> +    blr     x8

> >> +

> >> +    mov     sp, x29

> >> +    ldp     x29, x30, [sp], #16

> >> +    ret

> >>

> >>  //****************************************************************************

> >>  // EbcLLEbcInterpret

> >> --

> >> 2.7.4

> >>

_______________________________________________
edk2-devel mailing list
edk2-devel@lists.01.org
https://lists.01.org/mailman/listinfo/edk2-devel
Ard Biesheuvel Aug. 26, 2016, 6:41 p.m. | #4
> On 26 aug. 2016, at 19:10, Leif Lindholm <leif.lindholm@linaro.org> wrote:

> 

>> On Fri, Aug 26, 2016 at 06:14:10PM +0100, Ard Biesheuvel wrote:

>>> On 26 August 2016 at 13:54, Leif Lindholm <leif.lindholm@linaro.org> wrote:

>>>> On Wed, Aug 17, 2016 at 04:59:04PM +0200, Ard Biesheuvel wrote:

>>>> Instead of pessimistically copying at least 64 bytes from the VM stack

>>>> to the native stack, and popping off the register arguments again

>>>> before doing the native call, try to avoid touching the stack completely

>>>> if the VM stack frame is < 64 bytes. Also, if the stack frame does exceed

>>> 

>>> Should this say "does not"?

>> 

>> No, if it exceeds 64 bytes, we may have arguments that should be

>> passed on the stack (but we're not sure, unfortunately, since the VM

>> stack frame is not annotated, i.e., it could simply be local vars in

>> the caller) In this case, there is no need to push all arguments onto

>> the native stack, and pop off the first 8 into registers again, which

>> is what the original code was doing.

> 

> OK, thanks.

> Should that not be <= 64 bytes above then?

> 


Correct

>>>> 64 bytes, there is no need to copy the first 64 bytes, since we are passing

>>>> those in registers anyway.

>>>> 

>>>> Contributed-under: TianoCore Contribution Agreement 1.0

>>>> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>

>>>> ---

>>>> MdeModulePkg/Universal/EbcDxe/AArch64/EbcLowLevel.S | 73 +++++++++++++++-----

>>>> 1 file changed, 55 insertions(+), 18 deletions(-)

>>>> 

>>>> diff --git a/MdeModulePkg/Universal/EbcDxe/AArch64/EbcLowLevel.S b/MdeModulePkg/Universal/EbcDxe/AArch64/EbcLowLevel.S

>>>> index cb7a70b5a4f8..d95713e82b0f 100644

>>>> --- a/MdeModulePkg/Universal/EbcDxe/AArch64/EbcLowLevel.S

>>>> +++ b/MdeModulePkg/Universal/EbcDxe/AArch64/EbcLowLevel.S

>>>> @@ -35,30 +35,67 @@ ASM_GLOBAL ASM_PFX(mEbcInstructionBufferTemplate)

>>>> //****************************************************************************

>>>> // UINTN EbcLLCALLEXNative(UINTN FuncAddr, UINTN NewStackPointer, VOID *FramePtr)

>>>> ASM_PFX(EbcLLCALLEXNative):

>>>> -      stp  x19, x20, [sp, #-16]!

>>>> -      stp  x29, x30, [sp, #-16]!

>>>> +    mov     x8, x0                 // Preserve x0

>>>> +    mov     x9, x1                 // Preserve x1

>>>> 

>>>> -      mov  x19, x0

>>>> -      mov  x20, sp

>>>> -      sub  x2, x2, x1   // Length = NewStackPointer-FramePtr

>>>> -      sub  sp, sp, x2

>>>> -      sub  sp, sp, #64  // Make sure there is room for at least 8 args in the new stack

>>>> -      mov  x0, sp

>>>> +    //

>>>> +    // If the EBC stack frame is smaller than or equal to 64 bytes, we know there

>>>> +    // are no stacked arguments #9 and beyond that we need to copy to the native

>>>> +    // stack. In this case, we can perform a tail call which is much more

>>>> +    // efficient, since there is no need to touch the native stack at all.

>>>> +    //

>>>> +    sub     x3, x2, x1              // Length = NewStackPointer - FramePtr

>>>> +    cmp     x3, #64

>>>> +    b.gt    1f

>>>> 

>>>> -      bl   CopyMem      // Sp, NewStackPointer, Length

>>>> +    adr     x0, 0f

>>>> +    sub     x0, x0, x3, lsr #1

>>>> +    br      x0

>>>> 

>>>> -      ldp  x0, x1, [sp], #16

>>>> -      ldp  x2, x3, [sp], #16

>>>> -      ldp  x4, x5, [sp], #16

>>>> -      ldp  x6, x7, [sp], #16

>>>> +    ldr     x7, [x9, #56]

>>>> +    ldr     x6, [x9, #48]

>>>> +    ldr     x5, [x9, #40]

>>>> +    ldr     x4, [x9, #32]

>>>> +    ldr     x3, [x9, #24]

>>>> +    ldr     x2, [x9, #16]

>>>> +    ldr     x1, [x9, #8]

>>>> +    ldr     x0, [x9]

>>> 

>>> Why not keep using ldp, but with x9?

>> 

>> This is a computed jump, depending on the size of the stack frame,

>> hence the inverted order, and the LDRs. While probably harmless in

>> most cases, it is arguably incorrect to copy random data from memory

>> into these registers (and could even be a security issue)

> 

> OK, that's way too clever for someone like me to realise from the

> existing code (which arguably, you have simply corrected). Yes, even

> with the block comment above.

> Could we add some kind of for-dummies comment by this code to indicate

> what is going on at the actual instructions.

> 

> Say:

> 

> +    ldr     x7, [x9, #56]  // Branch target for 8 arguments

> +    ldr     x6, [x9, #48]  // |

> +    ldr     x5, [x9, #40]  // |

> +    ldr     x4, [x9, #32]  // |

> +    ldr     x3, [x9, #24]  // |

> +    ldr     x2, [x9, #16]  // |

> +    ldr     x1, [x9, #8]   // V

> +    ldr     x0, [x9]       // Branch target for 1 argument

> 

> -      blr  x19

> +0:  br      x8             // Branch target for 0 arguments

> 

> (Unless I'm still misunderstanding.)

> 


No, that is accurate, and a worthwhile clarification

> /

>    Leif

> 

>> 

>>>> 

>>>> -      blr  x19

>>>> +0:  br      x8

>>>> 

>>>> -      mov  sp,  x20

>>>> -      ldp  x29, x30, [sp], #16

>>>> -      ldp  x19, x20, [sp], #16

>>>> +    //

>>>> +    // More than 64 bytes: we need to build the full native stack frame and copy

>>>> +    // the part of the VM stack exceeding 64 bytes (which may contain stacked

>>>> +    // arguments) to the native stack

>>>> +    //

>>>> +1:  stp     x29, x30, [sp, #-16]!

>>>> +    mov     x29, sp

>>>> 

>>>> -      ret

>>>> +    //

>>>> +    // Ensure that the stack pointer remains 16 byte aligned,

>>>> +    // even if the size of the VM stack frame is not a multiple of 16

>>>> +    //

>>>> +    add     x1, x1, #64             // Skip over [potential] reg params

>>>> +    tbz     x3, #3, 2f              // Multiple of 16?

>>>> +    ldr     x4, [x2, #-8]!          // No? Then push one word

>>>> +    str     x4, [sp, #-16]!         // ... but use two slots

>>>> +    b       3f

>>>> +

>>>> +2:  ldp     x4, x5, [x2, #-16]!

>>>> +    stp     x4, x5, [sp, #-16]!

>>>> +3:  cmp     x2, x1

>>>> +    b.gt    2b

>>>> +

>>>> +    ldp     x0, x1, [x9]

>>>> +    ldp     x2, x3, [x9, #16]

>>>> +    ldp     x4, x5, [x9, #32]

>>>> +    ldp     x6, x7, [x9, #48]

>>>> +

>>>> +    blr     x8

>>>> +

>>>> +    mov     sp, x29

>>>> +    ldp     x29, x30, [sp], #16

>>>> +    ret

>>>> 

>>>> //****************************************************************************

>>>> // EbcLLEbcInterpret

>>>> --

>>>> 2.7.4

>>>> 

_______________________________________________
edk2-devel mailing list
edk2-devel@lists.01.org
https://lists.01.org/mailman/listinfo/edk2-devel

Patch

diff --git a/MdeModulePkg/Universal/EbcDxe/AArch64/EbcLowLevel.S b/MdeModulePkg/Universal/EbcDxe/AArch64/EbcLowLevel.S
index cb7a70b5a4f8..d95713e82b0f 100644
--- a/MdeModulePkg/Universal/EbcDxe/AArch64/EbcLowLevel.S
+++ b/MdeModulePkg/Universal/EbcDxe/AArch64/EbcLowLevel.S
@@ -35,30 +35,67 @@  ASM_GLOBAL ASM_PFX(mEbcInstructionBufferTemplate)
 //****************************************************************************
 // UINTN EbcLLCALLEXNative(UINTN FuncAddr, UINTN NewStackPointer, VOID *FramePtr)
 ASM_PFX(EbcLLCALLEXNative):
-      stp  x19, x20, [sp, #-16]!
-      stp  x29, x30, [sp, #-16]!
+    mov     x8, x0                 // Preserve x0
+    mov     x9, x1                 // Preserve x1
 
-      mov  x19, x0
-      mov  x20, sp
-      sub  x2, x2, x1   // Length = NewStackPointer-FramePtr
-      sub  sp, sp, x2
-      sub  sp, sp, #64  // Make sure there is room for at least 8 args in the new stack
-      mov  x0, sp
+    //
+    // If the EBC stack frame is smaller than or equal to 64 bytes, we know there
+    // are no stacked arguments #9 and beyond that we need to copy to the native
+    // stack. In this case, we can perform a tail call which is much more
+    // efficient, since there is no need to touch the native stack at all.
+    //
+    sub     x3, x2, x1              // Length = NewStackPointer - FramePtr
+    cmp     x3, #64
+    b.gt    1f
 
-      bl   CopyMem      // Sp, NewStackPointer, Length
+    adr     x0, 0f
+    sub     x0, x0, x3, lsr #1
+    br      x0
 
-      ldp  x0, x1, [sp], #16
-      ldp  x2, x3, [sp], #16
-      ldp  x4, x5, [sp], #16
-      ldp  x6, x7, [sp], #16
+    ldr     x7, [x9, #56]
+    ldr     x6, [x9, #48]
+    ldr     x5, [x9, #40]
+    ldr     x4, [x9, #32]
+    ldr     x3, [x9, #24]
+    ldr     x2, [x9, #16]
+    ldr     x1, [x9, #8]
+    ldr     x0, [x9]
 
-      blr  x19
+0:  br      x8
 
-      mov  sp,  x20
-      ldp  x29, x30, [sp], #16
-      ldp  x19, x20, [sp], #16
+    //
+    // More than 64 bytes: we need to build the full native stack frame and copy
+    // the part of the VM stack exceeding 64 bytes (which may contain stacked
+    // arguments) to the native stack
+    //
+1:  stp     x29, x30, [sp, #-16]!
+    mov     x29, sp
 
-      ret
+    //
+    // Ensure that the stack pointer remains 16 byte aligned,
+    // even if the size of the VM stack frame is not a multiple of 16
+    //
+    add     x1, x1, #64             // Skip over [potential] reg params
+    tbz     x3, #3, 2f              // Multiple of 16?
+    ldr     x4, [x2, #-8]!          // No? Then push one word
+    str     x4, [sp, #-16]!         // ... but use two slots
+    b       3f
+
+2:  ldp     x4, x5, [x2, #-16]!
+    stp     x4, x5, [sp, #-16]!
+3:  cmp     x2, x1
+    b.gt    2b
+
+    ldp     x0, x1, [x9]
+    ldp     x2, x3, [x9, #16]
+    ldp     x4, x5, [x9, #32]
+    ldp     x6, x7, [x9, #48]
+
+    blr     x8
+
+    mov     sp, x29
+    ldp     x29, x30, [sp], #16
+    ret
 
 //****************************************************************************
 // EbcLLEbcInterpret