
[Xen-devel,v3,05/17] xen/arm64: Implement a fast path for handling SMCCC_ARCH_WORKAROUND_1

Message ID 20180215150248.28922-6-julien.grall@arm.com
State Superseded
Series xen/arm: PSCI 1.1 and SMCCC-1.1 support and XSA-254 variant 2 update

Commit Message

Julien Grall Feb. 15, 2018, 3:02 p.m. UTC
The function SMCCC_ARCH_WORKAROUND_1 will be called by the guest to
harden the branch predictor, so we want the handling to be as fast as
possible.

As the mitigation is applied on every guest exit, we can check for the
call before saving all the context and return very early.

For now, only provide a fast path for HVC64 calls. Because the code relies
on two registers, x0 and x1 are saved in advance.

Signed-off-by: Julien Grall <julien.grall@arm.com>
Reviewed-by: Volodymyr Babchuk <volodymyr.babchuk@epam.com>

---
    guest_sync only handles 64-bit guests, so I have only implemented the
    64-bit side for now. We can discuss whether it is useful to
    implement it for 32-bit guests.

    We could also consider implementing the fast path for SMC64,
    although a guest should always use HVC.

    Changes in v2:
        - Add Volodymyr's reviewed-by
---
 xen/arch/arm/arm64/entry.S      | 56 +++++++++++++++++++++++++++++++++++++++--
 xen/include/asm-arm/processor.h |  2 ++
 2 files changed, 56 insertions(+), 2 deletions(-)

Comments

Stefano Stabellini Feb. 21, 2018, 12:37 a.m. UTC | #1
On Thu, 15 Feb 2018, Julien Grall wrote:
> The function SMCCC_ARCH_WORKAROUND_1 will be called by the guest for
> hardening the branch predictor. So we want the handling to be as fast as
> possible.
> 
> As the mitigation is applied on every guest exit, we can check for the
> call before saving all the context and return very early.
> 
> For now, only provide a fast path for HVC64 call. Because the code rely
> on 2 registers, x0 and x1 are saved in advance.
> 
> Signed-off-by: Julien Grall <julien.grall@arm.com>
> Reviewed-by: Volodymyr Babchuk <volodymyr.babchuk@epam.com>

Nice!

Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>


> ---
>     guest_sync only handle 64-bit guest, so I have only implemented the
>     64-bit side for now. We can discuss whether it is useful to
>     implement it for 32-bit guests.
> 
>     We could also consider to implement the fast path for SMC64,
>     althought a guest should always use HVC.
> 
>     Changes in v2:
>         - Add Volodymyr's reviewed-by
> ---
>  xen/arch/arm/arm64/entry.S      | 56 +++++++++++++++++++++++++++++++++++++++--
>  xen/include/asm-arm/processor.h |  2 ++
>  2 files changed, 56 insertions(+), 2 deletions(-)
> 
> diff --git a/xen/arch/arm/arm64/entry.S b/xen/arch/arm/arm64/entry.S
> index 6d99e46f0f..67f96d518f 100644
> --- a/xen/arch/arm/arm64/entry.S
> +++ b/xen/arch/arm/arm64/entry.S
> @@ -1,6 +1,7 @@
>  #include <asm/asm_defns.h>
>  #include <asm/regs.h>
>  #include <asm/alternative.h>
> +#include <asm/smccc.h>
>  #include <public/xen.h>
>  
>  /*
> @@ -90,8 +91,12 @@ lr      .req    x30             /* link register */
>          .endm
>  /*
>   * Save state on entry to hypervisor, restore on exit
> + *
> + * save_x0_x1: Does the macro needs to save x0/x1 (default 1). If 0,
> + * we rely on the on x0/x1 to have been saved at the correct position on
> + * the stack before.
>   */
> -        .macro  entry, hyp, compat
> +        .macro  entry, hyp, compat, save_x0_x1=1
>          sub     sp, sp, #(UREGS_SPSR_el1 - UREGS_LR) /* CPSR, PC, SP, LR */
>          push    x28, x29
>          push    x26, x27
> @@ -107,7 +112,16 @@ lr      .req    x30             /* link register */
>          push    x6, x7
>          push    x4, x5
>          push    x2, x3
> +        /*
> +         * The caller may already have saved x0/x1 on the stack at the
> +         * correct address and corrupt them with another value. Only
> +         * save them if save_x0_x1 == 1.
> +         */
> +        .if \save_x0_x1 == 1
>          push    x0, x1
> +        .else
> +        sub     sp, sp, #16
> +        .endif
>  
>          .if \hyp == 1        /* Hypervisor mode */
>  
> @@ -200,7 +214,45 @@ hyp_irq:
>          exit    hyp=1
>  
>  guest_sync:
> -        entry   hyp=0, compat=0
> +        /*
> +         * Save x0, x1 in advance
> +         */
> +        stp     x0, x1, [sp, #-(UREGS_kernel_sizeof - UREGS_X0)]
> +
> +        /*
> +         * x1 is used because x0 may contain the function identifier.
> +         * This avoids to restore x0 from the stack.
> +         */
> +        mrs     x1, esr_el2
> +        lsr     x1, x1, #HSR_EC_SHIFT           /* x1 = ESR_EL2.EC */
> +        cmp     x1, #HSR_EC_HVC64
> +        b.ne    1f                              /* Not a HVC skip fastpath. */
> +
> +        mrs     x1, esr_el2
> +        and     x1, x1, #0xffff                 /* Check the immediate [0:16] */
> +        cbnz    x1, 1f                          /* should be 0 for HVC #0 */
> +
> +        /*
> +         * Fastest path possible for ARM_SMCCC_ARCH_WORKAROUND_1.
> +         * The workaround has already been applied on the exception
> +         * entry from the guest, so let's quickly get back to the guest.
> +         */
> +        eor     w0, w0, #ARM_SMCCC_ARCH_WORKAROUND_1_FID
> +        cbnz    w0, 1f
> +
> +        /*
> +         * Clobber both x0 and x1 to prevent leakage. Note that thanks
> +         * the eor, x0 = 0.
> +         */
> +        mov     x1, x0
> +        eret
> +
> +1:
> +        /*
> +         * x0/x1 may have been scratch by the fast path above, so avoid
> +         * to save them.
> +         */
> +        entry   hyp=0, compat=0, save_x0_x1=0
>          /*
>           * The vSError will be checked while SKIP_SYNCHRONIZE_SERROR_ENTRY_EXIT
>           * is not set. If a vSError took place, the initial exception will be
> diff --git a/xen/include/asm-arm/processor.h b/xen/include/asm-arm/processor.h
> index c0f79d0093..222a02dd99 100644
> --- a/xen/include/asm-arm/processor.h
> +++ b/xen/include/asm-arm/processor.h
> @@ -306,6 +306,8 @@
>  #define HDCR_TPM        (_AC(1,U)<<6)           /* Trap Performance Monitors accesses */
>  #define HDCR_TPMCR      (_AC(1,U)<<5)           /* Trap PMCR accesses */
>  
> +#define HSR_EC_SHIFT                26
> +
>  #define HSR_EC_UNKNOWN              0x00
>  #define HSR_EC_WFI_WFE              0x01
>  #define HSR_EC_CP15_32              0x03
> -- 
> 2.11.0
>
Andre Przywara Feb. 21, 2018, 2:27 p.m. UTC | #2
Hi,

On 15/02/18 15:02, Julien Grall wrote:
> The function SMCCC_ARCH_WORKAROUND_1 will be called by the guest for
> hardening the branch predictor. So we want the handling to be as fast as
> possible.
> 
> As the mitigation is applied on every guest exit, we can check for the
> call before saving all the context and return very early.
> 
> For now, only provide a fast path for HVC64 call. Because the code rely
> on 2 registers, x0 and x1 are saved in advance.
> 
> Signed-off-by: Julien Grall <julien.grall@arm.com>
> Reviewed-by: Volodymyr Babchuk <volodymyr.babchuk@epam.com>
> 
> ---
>     guest_sync only handle 64-bit guest, so I have only implemented the
>     64-bit side for now. We can discuss whether it is useful to
>     implement it for 32-bit guests.
> 
>     We could also consider to implement the fast path for SMC64,
>     althought a guest should always use HVC.
> 
>     Changes in v2:
>         - Add Volodymyr's reviewed-by
> ---
>  xen/arch/arm/arm64/entry.S      | 56 +++++++++++++++++++++++++++++++++++++++--
>  xen/include/asm-arm/processor.h |  2 ++
>  2 files changed, 56 insertions(+), 2 deletions(-)
> 
> diff --git a/xen/arch/arm/arm64/entry.S b/xen/arch/arm/arm64/entry.S
> index 6d99e46f0f..67f96d518f 100644
> --- a/xen/arch/arm/arm64/entry.S
> +++ b/xen/arch/arm/arm64/entry.S
> @@ -1,6 +1,7 @@
>  #include <asm/asm_defns.h>
>  #include <asm/regs.h>
>  #include <asm/alternative.h>
> +#include <asm/smccc.h>
>  #include <public/xen.h>
>  
>  /*
> @@ -90,8 +91,12 @@ lr      .req    x30             /* link register */
>          .endm
>  /*
>   * Save state on entry to hypervisor, restore on exit
> + *
> + * save_x0_x1: Does the macro needs to save x0/x1 (default 1). If 0,

A bit confusing. What about:
    * save_x0_x1: Does the macro need to save x0 and x1? Defaults to 1.
    * If 0, ....

> + * we rely on the on x0/x1 to have been saved at the correct position on
> + * the stack before.
>   */
> -        .macro  entry, hyp, compat
> +        .macro  entry, hyp, compat, save_x0_x1=1
>          sub     sp, sp, #(UREGS_SPSR_el1 - UREGS_LR) /* CPSR, PC, SP, LR */
>          push    x28, x29
>          push    x26, x27
> @@ -107,7 +112,16 @@ lr      .req    x30             /* link register */
>          push    x6, x7
>          push    x4, x5
>          push    x2, x3
> +        /*
> +         * The caller may already have saved x0/x1 on the stack at the
> +         * correct address and corrupt them with another value. Only
> +         * save them if save_x0_x1 == 1.
> +         */
> +        .if \save_x0_x1 == 1
>          push    x0, x1
> +        .else
> +        sub     sp, sp, #16
> +        .endif
>  
>          .if \hyp == 1        /* Hypervisor mode */
>  
> @@ -200,7 +214,45 @@ hyp_irq:
>          exit    hyp=1
>  
>  guest_sync:
> -        entry   hyp=0, compat=0
> +        /*
> +         * Save x0, x1 in advance
> +         */
> +        stp     x0, x1, [sp, #-(UREGS_kernel_sizeof - UREGS_X0)]
> +
> +        /*
> +         * x1 is used because x0 may contain the function identifier.
> +         * This avoids to restore x0 from the stack.
> +         */
> +        mrs     x1, esr_el2
> +        lsr     x1, x1, #HSR_EC_SHIFT           /* x1 = ESR_EL2.EC */
> +        cmp     x1, #HSR_EC_HVC64
> +        b.ne    1f                              /* Not a HVC skip fastpath. */
> +
> +        mrs     x1, esr_el2
> +        and     x1, x1, #0xffff                 /* Check the immediate [0:16] */
> +        cbnz    x1, 1f                          /* should be 0 for HVC #0 */
> +
> +        /*
> +         * Fastest path possible for ARM_SMCCC_ARCH_WORKAROUND_1.
> +         * The workaround has already been applied on the exception
> +         * entry from the guest, so let's quickly get back to the guest.
> +         */
> +        eor     w0, w0, #ARM_SMCCC_ARCH_WORKAROUND_1_FID
> +        cbnz    w0, 1f

It would be good to mention that this function identifier can't be
encoded as an immediate for the "cmp" instruction, but it can be encoded
as a bitmask immediate for logical instructions such as eor. Hence the
exclusive OR.
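
For reference, a minimal sketch of the encoding difference (not part of
the patch; it assumes the SMCCC 1.1 value 0x80008000 for
ARM_SMCCC_ARCH_WORKAROUND_1_FID, and the "1f" label is purely
illustrative):

        cmp     w0, #0x80008000         /* rejected by the assembler: cmp
                                         * (subs) only takes a 12-bit
                                         * immediate, optionally LSL #12 */
        eor     w0, w0, #0x80008000     /* accepted: logical instructions take
                                         * bitmask immediates, and 0x80008000
                                         * is the 16-bit pattern 0x8000
                                         * replicated */
        cbnz    w0, 1f                  /* after the eor, w0 == 0 exactly when
                                         * it held the function identifier */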

> +
> +        /*
> +         * Clobber both x0 and x1 to prevent leakage. Note that thanks
> +         * the eor, x0 = 0.
> +         */
> +        mov     x1, x0
> +        eret

I think this is more readable:

	   /* Clobber x1 to prevent leakage. x0 is already 0. */
	   mov     x1, xzr
	   eret

Cheers,
Andre.

> +
> +1:
> +        /*
> +         * x0/x1 may have been scratch by the fast path above, so avoid
> +         * to save them.
> +         */
> +        entry   hyp=0, compat=0, save_x0_x1=0
>          /*
>           * The vSError will be checked while SKIP_SYNCHRONIZE_SERROR_ENTRY_EXIT
>           * is not set. If a vSError took place, the initial exception will be
> diff --git a/xen/include/asm-arm/processor.h b/xen/include/asm-arm/processor.h
> index c0f79d0093..222a02dd99 100644
> --- a/xen/include/asm-arm/processor.h
> +++ b/xen/include/asm-arm/processor.h
> @@ -306,6 +306,8 @@
>  #define HDCR_TPM        (_AC(1,U)<<6)           /* Trap Performance Monitors accesses */
>  #define HDCR_TPMCR      (_AC(1,U)<<5)           /* Trap PMCR accesses */
>  
> +#define HSR_EC_SHIFT                26
> +
>  #define HSR_EC_UNKNOWN              0x00
>  #define HSR_EC_WFI_WFE              0x01
>  #define HSR_EC_CP15_32              0x03
>
Julien Grall Feb. 22, 2018, 1:58 p.m. UTC | #3
On 21/02/18 14:27, Andre Przywara wrote:
> Hi,

Hi Andre,

> On 15/02/18 15:02, Julien Grall wrote:
>> The function SMCCC_ARCH_WORKAROUND_1 will be called by the guest for
>> hardening the branch predictor. So we want the handling to be as fast as
>> possible.
>>
>> As the mitigation is applied on every guest exit, we can check for the
>> call before saving all the context and return very early.
>>
>> For now, only provide a fast path for HVC64 call. Because the code rely
>> on 2 registers, x0 and x1 are saved in advance.
>>
>> Signed-off-by: Julien Grall <julien.grall@arm.com>
>> Reviewed-by: Volodymyr Babchuk <volodymyr.babchuk@epam.com>
>>
>> ---
>>      guest_sync only handle 64-bit guest, so I have only implemented the
>>      64-bit side for now. We can discuss whether it is useful to
>>      implement it for 32-bit guests.
>>
>>      We could also consider to implement the fast path for SMC64,
>>      althought a guest should always use HVC.
>>
>>      Changes in v2:
>>          - Add Volodymyr's reviewed-by
>> ---
>>   xen/arch/arm/arm64/entry.S      | 56 +++++++++++++++++++++++++++++++++++++++--
>>   xen/include/asm-arm/processor.h |  2 ++
>>   2 files changed, 56 insertions(+), 2 deletions(-)
>>
>> diff --git a/xen/arch/arm/arm64/entry.S b/xen/arch/arm/arm64/entry.S
>> index 6d99e46f0f..67f96d518f 100644
>> --- a/xen/arch/arm/arm64/entry.S
>> +++ b/xen/arch/arm/arm64/entry.S
>> @@ -1,6 +1,7 @@
>>   #include <asm/asm_defns.h>
>>   #include <asm/regs.h>
>>   #include <asm/alternative.h>
>> +#include <asm/smccc.h>
>>   #include <public/xen.h>
>>   
>>   /*
>> @@ -90,8 +91,12 @@ lr      .req    x30             /* link register */
>>           .endm
>>   /*
>>    * Save state on entry to hypervisor, restore on exit
>> + *
>> + * save_x0_x1: Does the macro needs to save x0/x1 (default 1). If 0,
> 
> A bit confusing. What about:
>      * save_x0_x1: Does the macro needs to save x0 and x1? Defaults to 1.
>      * If 0, ....

Sounds good to me. I will update the comment.

> 
>> + * we rely on the on x0/x1 to have been saved at the correct position on
>> + * the stack before.
>>    */
>> -        .macro  entry, hyp, compat
>> +        .macro  entry, hyp, compat, save_x0_x1=1
>>           sub     sp, sp, #(UREGS_SPSR_el1 - UREGS_LR) /* CPSR, PC, SP, LR */
>>           push    x28, x29
>>           push    x26, x27
>> @@ -107,7 +112,16 @@ lr      .req    x30             /* link register */
>>           push    x6, x7
>>           push    x4, x5
>>           push    x2, x3
>> +        /*
>> +         * The caller may already have saved x0/x1 on the stack at the
>> +         * correct address and corrupt them with another value. Only
>> +         * save them if save_x0_x1 == 1.
>> +         */
>> +        .if \save_x0_x1 == 1
>>           push    x0, x1
>> +        .else
>> +        sub     sp, sp, #16
>> +        .endif
>>   
>>           .if \hyp == 1        /* Hypervisor mode */
>>   
>> @@ -200,7 +214,45 @@ hyp_irq:
>>           exit    hyp=1
>>   
>>   guest_sync:
>> -        entry   hyp=0, compat=0
>> +        /*
>> +         * Save x0, x1 in advance
>> +         */
>> +        stp     x0, x1, [sp, #-(UREGS_kernel_sizeof - UREGS_X0)]
>> +
>> +        /*
>> +         * x1 is used because x0 may contain the function identifier.
>> +         * This avoids to restore x0 from the stack.
>> +         */
>> +        mrs     x1, esr_el2
>> +        lsr     x1, x1, #HSR_EC_SHIFT           /* x1 = ESR_EL2.EC */
>> +        cmp     x1, #HSR_EC_HVC64
>> +        b.ne    1f                              /* Not a HVC skip fastpath. */
>> +
>> +        mrs     x1, esr_el2
>> +        and     x1, x1, #0xffff                 /* Check the immediate [0:16] */
>> +        cbnz    x1, 1f                          /* should be 0 for HVC #0 */
>> +
>> +        /*
>> +         * Fastest path possible for ARM_SMCCC_ARCH_WORKAROUND_1.
>> +         * The workaround has already been applied on the exception
>> +         * entry from the guest, so let's quickly get back to the guest.
>> +         */
>> +        eor     w0, w0, #ARM_SMCCC_ARCH_WORKAROUND_1_FID
>> +        cbnz    w0, 1f
> 
> It would be good to mention that this function identifier can't be
> encoded as an immediate for the "cmp" instruction, but it can with
> arithmetic instructions. Hence the exclusive OR.

I will.

> 
>> +
>> +        /*
>> +         * Clobber both x0 and x1 to prevent leakage. Note that thanks
>> +         * the eor, x0 = 0.
>> +         */
>> +        mov     x1, x0
>> +        eret
> 
> I think this is more readable:
> 
> 	   /* Clobber x1 to prevent leakage. x0 is already 0. */

I would prefer to say "clobber x0 and x1 to prevent leakage. Note that 
x0 is already 0 thanks to the eor". So it is clear that we expect both 
registers to be clobbered.

> 	   mov     x1, xzr
> 	   eret
Cheers,

Patch

diff --git a/xen/arch/arm/arm64/entry.S b/xen/arch/arm/arm64/entry.S
index 6d99e46f0f..67f96d518f 100644
--- a/xen/arch/arm/arm64/entry.S
+++ b/xen/arch/arm/arm64/entry.S
@@ -1,6 +1,7 @@ 
 #include <asm/asm_defns.h>
 #include <asm/regs.h>
 #include <asm/alternative.h>
+#include <asm/smccc.h>
 #include <public/xen.h>
 
 /*
@@ -90,8 +91,12 @@  lr      .req    x30             /* link register */
         .endm
 /*
  * Save state on entry to hypervisor, restore on exit
+ *
+ * save_x0_x1: Does the macro needs to save x0/x1 (default 1). If 0,
+ * we rely on the on x0/x1 to have been saved at the correct position on
+ * the stack before.
  */
-        .macro  entry, hyp, compat
+        .macro  entry, hyp, compat, save_x0_x1=1
         sub     sp, sp, #(UREGS_SPSR_el1 - UREGS_LR) /* CPSR, PC, SP, LR */
         push    x28, x29
         push    x26, x27
@@ -107,7 +112,16 @@  lr      .req    x30             /* link register */
         push    x6, x7
         push    x4, x5
         push    x2, x3
+        /*
+         * The caller may already have saved x0/x1 on the stack at the
+         * correct address and corrupt them with another value. Only
+         * save them if save_x0_x1 == 1.
+         */
+        .if \save_x0_x1 == 1
         push    x0, x1
+        .else
+        sub     sp, sp, #16
+        .endif
 
         .if \hyp == 1        /* Hypervisor mode */
 
@@ -200,7 +214,45 @@  hyp_irq:
         exit    hyp=1
 
 guest_sync:
-        entry   hyp=0, compat=0
+        /*
+         * Save x0, x1 in advance
+         */
+        stp     x0, x1, [sp, #-(UREGS_kernel_sizeof - UREGS_X0)]
+
+        /*
+         * x1 is used because x0 may contain the function identifier.
+         * This avoids to restore x0 from the stack.
+         */
+        mrs     x1, esr_el2
+        lsr     x1, x1, #HSR_EC_SHIFT           /* x1 = ESR_EL2.EC */
+        cmp     x1, #HSR_EC_HVC64
+        b.ne    1f                              /* Not a HVC skip fastpath. */
+
+        mrs     x1, esr_el2
+        and     x1, x1, #0xffff                 /* Check the immediate [0:16] */
+        cbnz    x1, 1f                          /* should be 0 for HVC #0 */
+
+        /*
+         * Fastest path possible for ARM_SMCCC_ARCH_WORKAROUND_1.
+         * The workaround has already been applied on the exception
+         * entry from the guest, so let's quickly get back to the guest.
+         */
+        eor     w0, w0, #ARM_SMCCC_ARCH_WORKAROUND_1_FID
+        cbnz    w0, 1f
+
+        /*
+         * Clobber both x0 and x1 to prevent leakage. Note that thanks
+         * the eor, x0 = 0.
+         */
+        mov     x1, x0
+        eret
+
+1:
+        /*
+         * x0/x1 may have been scratch by the fast path above, so avoid
+         * to save them.
+         */
+        entry   hyp=0, compat=0, save_x0_x1=0
         /*
          * The vSError will be checked while SKIP_SYNCHRONIZE_SERROR_ENTRY_EXIT
          * is not set. If a vSError took place, the initial exception will be
diff --git a/xen/include/asm-arm/processor.h b/xen/include/asm-arm/processor.h
index c0f79d0093..222a02dd99 100644
--- a/xen/include/asm-arm/processor.h
+++ b/xen/include/asm-arm/processor.h
@@ -306,6 +306,8 @@ 
 #define HDCR_TPM        (_AC(1,U)<<6)           /* Trap Performance Monitors accesses */
 #define HDCR_TPMCR      (_AC(1,U)<<5)           /* Trap PMCR accesses */
 
+#define HSR_EC_SHIFT                26
+
 #define HSR_EC_UNKNOWN              0x00
 #define HSR_EC_WFI_WFE              0x01
 #define HSR_EC_CP15_32              0x03