| Message ID | 20180205132011.27996-8-julien.grall@arm.com |
|---|---|
| State | Superseded |
| Series | xen/arm: PSCI 1.1 and SMCCC-1.1 support and XSA-254 variant 2 update |
Hi,

On 5 February 2018 at 15:20, Julien Grall <julien.grall@arm.com> wrote:
> The function SMCCC_ARCH_WORKAROUND_1 will be called by the guest for
> hardening the branch predictor. So we want the handling to be as fast as
> possible.
>
> As the mitigation is applied on every guest exit, we can check for the
> call before saving all the context and return very early.
Have you tried any benchmarks? How big is the benefit?

>
> For now, only provide a fast path for HVC64 call. Because the code rely
> on 2 registers, x0 and x1 are saved in advanced.
Is there a typo? Should it be "advance"?

>
> Signed-off-by: Julien Grall <julien.grall@arm.com>
>
> ---
> guest_sync only handle 64-bit guest, so I have only implemented the
> 64-bit side for now. We can discuss whether it is useful to
> implement it for 32-bit guests.
>
> We could also consider to implement the fast path for SMC64,
> althought a guest should always use HVC.
I can imagine a guest that knows nothing about virtualization and uses
SMC as a conduit. But I can't provide a real-world example, though.
> ---
>  xen/arch/arm/arm64/entry.S      | 56 +++++++++++++++++++++++++++++++++++++++--
>  xen/include/asm-arm/processor.h |  2 ++
>  2 files changed, 56 insertions(+), 2 deletions(-)
>
> diff --git a/xen/arch/arm/arm64/entry.S b/xen/arch/arm/arm64/entry.S
> index 6d99e46f0f..67f96d518f 100644
> --- a/xen/arch/arm/arm64/entry.S
> +++ b/xen/arch/arm/arm64/entry.S
> @@ -1,6 +1,7 @@
>  #include <asm/asm_defns.h>
>  #include <asm/regs.h>
>  #include <asm/alternative.h>
> +#include <asm/smccc.h>
>  #include <public/xen.h>
>
>  /*
> @@ -90,8 +91,12 @@ lr      .req    x30             /* link register */
>          .endm
>  /*
>   * Save state on entry to hypervisor, restore on exit
> + *
> + * save_x0_x1: Does the macro needs to save x0/x1 (default 1). If 0,
> + * we rely on the on x0/x1 to have been saved at the correct position on
> + * the stack before.
>   */
> -        .macro  entry, hyp, compat
> +        .macro  entry, hyp, compat, save_x0_x1=1
>          sub     sp, sp, #(UREGS_SPSR_el1 - UREGS_LR) /* CPSR, PC, SP, LR */
>          push    x28, x29
>          push    x26, x27
> @@ -107,7 +112,16 @@ lr      .req    x30             /* link register */
>          push    x6, x7
>          push    x4, x5
>          push    x2, x3
> +        /*
> +         * The caller may already have saved x0/x1 on the stack at the
> +         * correct address and corrupt them with another value. Only
> +         * save them if save_x0_x1 == 1.
> +         */
> +        .if \save_x0_x1 == 1
>          push    x0, x1
> +        .else
> +        sub     sp, sp, #16
> +        .endif
>
>          .if \hyp == 1        /* Hypervisor mode */
>
> @@ -200,7 +214,45 @@ hyp_irq:
>          exit    hyp=1
>
>  guest_sync:
> -        entry   hyp=0, compat=0
> +        /*
> +         * Save x0, x1 in advance
> +         */
> +        stp     x0, x1, [sp, #-(UREGS_kernel_sizeof - UREGS_X0)]
> +
> +        /*
> +         * x1 is used because x0 may contain the function identifier.
> +         * This avoids to restore x0 from the stack.
> +         */
> +        mrs     x1, esr_el2
> +        lsr     x1, x1, #HSR_EC_SHIFT           /* x1 = ESR_EL2.EC */
> +        cmp     x1, #HSR_EC_HVC64
> +        b.ne    1f                              /* Not a HVC skip fastpath. */
> +
> +        mrs     x1, esr_el2
> +        and     x1, x1, #0xffff                 /* Check the immediate [0:16] */
> +        cbnz    x1, 1f                          /* should be 0 for HVC #0 */
> +
> +        /*
> +         * Fastest path possible for ARM_SMCCC_ARCH_WORKAROUND_1.
> +         * The workaround has already been applied on the exception
> +         * entry from the guest, so let's quickly get back to the guest.
> +         */
> +        eor     w0, w0, #ARM_SMCCC_ARCH_WORKAROUND_1_FID
> +        cbnz    w0, 1f
> +
> +        /*
> +         * Clobber both x0 and x1 to prevent leakage. Note that thanks
> +         * the eor, x0 = 0.
> +         */
> +        mov     x1, x0
> +        eret
> +
> +1:
> +        /*
> +         * x0/x1 may have been scratch by the fast path above, so avoid
> +         * to save them.
> +         */
> +        entry   hyp=0, compat=0, save_x0_x1=0
>  /*
>   * The vSError will be checked while SKIP_SYNCHRONIZE_SERROR_ENTRY_EXIT
>   * is not set. If a vSError took place, the initial exception will be
> diff --git a/xen/include/asm-arm/processor.h b/xen/include/asm-arm/processor.h
> index c0f79d0093..222a02dd99 100644
> --- a/xen/include/asm-arm/processor.h
> +++ b/xen/include/asm-arm/processor.h
> @@ -306,6 +306,8 @@
>  #define HDCR_TPM        (_AC(1,U)<<6)          /* Trap Performance Monitors accesses */
>  #define HDCR_TPMCR      (_AC(1,U)<<5)          /* Trap PMCR accesses */
>
> +#define HSR_EC_SHIFT                26
> +
>  #define HSR_EC_UNKNOWN              0x00
>  #define HSR_EC_WFI_WFE              0x01
>  #define HSR_EC_CP15_32              0x03
> --
> 2.11.0
>
>
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xenproject.org
> https://lists.xenproject.org/mailman/listinfo/xen-devel
Hi,

On 02/06/2018 04:36 PM, Volodymyr Babchuk wrote:
> On 5 February 2018 at 15:20, Julien Grall <julien.grall@arm.com> wrote:
>> The function SMCCC_ARCH_WORKAROUND_1 will be called by the guest for
>> hardening the branch predictor. So we want the handling to be as fast as
>> possible.
>>
>> As the mitigation is applied on every guest exit, we can check for the
>> call before saving all the context and return very early.
> Have you tried any benchmarks? How big is the benefit?

I have benchmarked but I can't share the result. I can give you an idea of
how this could benefit Xen.

Linux will call the workaround on every context switch between processes.
So imagine that for each context switch you will enter Xen and go through
the following steps:
    1) enter Xen
    2) apply the workaround, which means calling EL3
    3) save part of the guest context
    4) call enter_hypervisor_head, which will sync the vGIC state
    5) detect that you actually called SMCCC_ARCH_WORKAROUND_1, which does
       nothing
    6) call leave_hypervisor_tail, which will sync back the vGIC state and
       execute softirqs (that could reschedule the vCPU)
    7) restore the guest context
    8) return to the guest

So effectively, instead of executing hundreds (if not thousands) of
instructions each time, you will end up executing fewer than 50
instructions.

>>
>> For now, only provide a fast path for HVC64 call. Because the code rely
>> on 2 registers, x0 and x1 are saved in advanced.
> Is there a typo? Should it be "advance"?
>
>>
>> Signed-off-by: Julien Grall <julien.grall@arm.com>
>>
>> ---
>> guest_sync only handle 64-bit guest, so I have only implemented the
>> 64-bit side for now. We can discuss whether it is useful to
>> implement it for 32-bit guests.
>>
>> We could also consider to implement the fast path for SMC64,
>> althought a guest should always use HVC.
> I can imagine a guest that knows nothing about virtualization and uses
> SMC as a conduit. But I can't provide a real-world example, though.

Someone can easily send a follow-up patch for that if it is deemed
necessary.

Cheers,
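[To make the flow described above concrete: the trigger on the guest side is an SMCCC 1.1 fast call issued over HVC. The snippet below is only an illustrative sketch, not part of the patch or of Linux; it assumes the standard 0x80008000 function identifier for ARM_SMCCC_ARCH_WORKAROUND_1 and the HVC conduit, which is exactly what the new guest_sync fast path intercepts.]

    /*
     * Illustrative sketch only: how a 64-bit guest kernel typically
     * invokes the workaround. The function identifier is placed in x0
     * and "hvc #0" traps to the hypervisor's guest_sync vector, where
     * the fast path returns immediately.
     */
    static inline void smccc_arch_workaround_1_hvc(void)
    {
        /* 0x80008000 is the SMCCC ARM_SMCCC_ARCH_WORKAROUND_1 identifier. */
        register unsigned long x0 asm("x0") = 0x80008000UL;

        /* Per SMCCC, x0-x3 may be clobbered by the callee; Xen zeroes x0/x1. */
        asm volatile("hvc #0"
                     : "+r" (x0)
                     :
                     : "x1", "x2", "x3", "memory");
    }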
Hi Julien,

On 6 February 2018 at 20:33, Julien Grall <julien.grall@arm.com> wrote:
> Hi,
>
> On 02/06/2018 04:36 PM, Volodymyr Babchuk wrote:
>>
>> On 5 February 2018 at 15:20, Julien Grall <julien.grall@arm.com> wrote:
>>>
>>> The function SMCCC_ARCH_WORKAROUND_1 will be called by the guest for
>>> hardening the branch predictor. So we want the handling to be as fast as
>>> possible.
>>>
>>> As the mitigation is applied on every guest exit, we can check for the
>>> call before saving all the context and return very early.
>>
>> Have you tried any benchmarks? How big is the benefit?
>
>
> I have benchmarked but I can't share the result. I can give you an idea of
> how this could benefit Xen.
>
> Linux will call the workaround on every context switch between processes.
> So imagine that for each context switch you will enter Xen and go through
> the following steps:
>     1) enter Xen
>     2) apply the workaround, which means calling EL3
>     3) save part of the guest context
>     4) call enter_hypervisor_head, which will sync the vGIC state
>     5) detect that you actually called SMCCC_ARCH_WORKAROUND_1, which does
>        nothing
>     6) call leave_hypervisor_tail, which will sync back the vGIC state and
>        execute softirqs (that could reschedule the vCPU)
>     7) restore the guest context
>     8) return to the guest
>
> So effectively, instead of executing hundreds (if not thousands) of
> instructions each time, you will end up executing fewer than 50
> instructions.
Sounds fine.

>
>>>
>>> For now, only provide a fast path for HVC64 call. Because the code rely
>>> on 2 registers, x0 and x1 are saved in advanced.
>>
>> Is there a typo? Should it be "advance"?
>>
>>>
>>> Signed-off-by: Julien Grall <julien.grall@arm.com>

Provided that the above typo is fixed:

Reviewed-by: Volodymyr Babchuk <volodymyr_babchuk@epam.com>

>>>
>>> ---
>>> guest_sync only handle 64-bit guest, so I have only implemented the
>>> 64-bit side for now. We can discuss whether it is useful to
>>> implement it for 32-bit guests.
>>>
>>> We could also consider to implement the fast path for SMC64,
>>> althought a guest should always use HVC.
>>
>> I can imagine a guest that knows nothing about virtualization and uses
>> SMC as a conduit. But I can't provide a real-world example, though.
I'm okay with this.
diff --git a/xen/arch/arm/arm64/entry.S b/xen/arch/arm/arm64/entry.S
index 6d99e46f0f..67f96d518f 100644
--- a/xen/arch/arm/arm64/entry.S
+++ b/xen/arch/arm/arm64/entry.S
@@ -1,6 +1,7 @@
 #include <asm/asm_defns.h>
 #include <asm/regs.h>
 #include <asm/alternative.h>
+#include <asm/smccc.h>
 #include <public/xen.h>

 /*
@@ -90,8 +91,12 @@ lr      .req    x30             /* link register */
         .endm
 /*
  * Save state on entry to hypervisor, restore on exit
+ *
+ * save_x0_x1: Does the macro needs to save x0/x1 (default 1). If 0,
+ * we rely on the on x0/x1 to have been saved at the correct position on
+ * the stack before.
  */
-        .macro  entry, hyp, compat
+        .macro  entry, hyp, compat, save_x0_x1=1
         sub     sp, sp, #(UREGS_SPSR_el1 - UREGS_LR) /* CPSR, PC, SP, LR */
         push    x28, x29
         push    x26, x27
@@ -107,7 +112,16 @@ lr      .req    x30             /* link register */
         push    x6, x7
         push    x4, x5
         push    x2, x3
+        /*
+         * The caller may already have saved x0/x1 on the stack at the
+         * correct address and corrupt them with another value. Only
+         * save them if save_x0_x1 == 1.
+         */
+        .if \save_x0_x1 == 1
         push    x0, x1
+        .else
+        sub     sp, sp, #16
+        .endif

         .if \hyp == 1        /* Hypervisor mode */

@@ -200,7 +214,45 @@ hyp_irq:
         exit    hyp=1

 guest_sync:
-        entry   hyp=0, compat=0
+        /*
+         * Save x0, x1 in advance
+         */
+        stp     x0, x1, [sp, #-(UREGS_kernel_sizeof - UREGS_X0)]
+
+        /*
+         * x1 is used because x0 may contain the function identifier.
+         * This avoids to restore x0 from the stack.
+         */
+        mrs     x1, esr_el2
+        lsr     x1, x1, #HSR_EC_SHIFT           /* x1 = ESR_EL2.EC */
+        cmp     x1, #HSR_EC_HVC64
+        b.ne    1f                              /* Not a HVC skip fastpath. */
+
+        mrs     x1, esr_el2
+        and     x1, x1, #0xffff                 /* Check the immediate [0:16] */
+        cbnz    x1, 1f                          /* should be 0 for HVC #0 */
+
+        /*
+         * Fastest path possible for ARM_SMCCC_ARCH_WORKAROUND_1.
+         * The workaround has already been applied on the exception
+         * entry from the guest, so let's quickly get back to the guest.
+         */
+        eor     w0, w0, #ARM_SMCCC_ARCH_WORKAROUND_1_FID
+        cbnz    w0, 1f
+
+        /*
+         * Clobber both x0 and x1 to prevent leakage. Note that thanks
+         * the eor, x0 = 0.
+         */
+        mov     x1, x0
+        eret
+
+1:
+        /*
+         * x0/x1 may have been scratch by the fast path above, so avoid
+         * to save them.
+         */
+        entry   hyp=0, compat=0, save_x0_x1=0
 /*
  * The vSError will be checked while SKIP_SYNCHRONIZE_SERROR_ENTRY_EXIT
  * is not set. If a vSError took place, the initial exception will be
diff --git a/xen/include/asm-arm/processor.h b/xen/include/asm-arm/processor.h
index c0f79d0093..222a02dd99 100644
--- a/xen/include/asm-arm/processor.h
+++ b/xen/include/asm-arm/processor.h
@@ -306,6 +306,8 @@
 #define HDCR_TPM        (_AC(1,U)<<6)           /* Trap Performance Monitors accesses */
 #define HDCR_TPMCR      (_AC(1,U)<<5)           /* Trap PMCR accesses */

+#define HSR_EC_SHIFT                26
+
 #define HSR_EC_UNKNOWN              0x00
 #define HSR_EC_WFI_WFE              0x01
 #define HSR_EC_CP15_32              0x03
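[For readers less familiar with the ESR_EL2 encoding, the condition the new fast path tests before falling back to the full entry macro can be written in C roughly as below. This is an illustrative sketch mirroring the assembly above, not code from the patch; HSR_EC_HVC64 is 0x16 in Xen's processor.h, and the function identifier is assumed to be the standard 0x80008000 value behind ARM_SMCCC_ARCH_WORKAROUND_1_FID.]

    #include <stdbool.h>
    #include <stdint.h>

    #define HSR_EC_SHIFT    26
    #define HSR_EC_HVC64    0x16    /* HVC executed in AArch64 state */

    /*
     * Sketch of the fast-path predicate: an HVC64 exception with
     * immediate #0 whose w0 holds ARM_SMCCC_ARCH_WORKAROUND_1.
     */
    static bool take_workaround_fast_path(uint64_t esr_el2, uint64_t x0)
    {
        return (esr_el2 >> HSR_EC_SHIFT) == HSR_EC_HVC64 && /* EC field; upper ESR bits are RES0 */
               (esr_el2 & 0xffff) == 0 &&                   /* HVC immediate must be 0 */
               (uint32_t)x0 == 0x80008000U;                 /* assumed workaround identifier */
    }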
The function SMCCC_ARCH_WORKAROUND_1 will be called by the guest for
hardening the branch predictor. So we want the handling to be as fast as
possible.

As the mitigation is applied on every guest exit, we can check for the
call before saving all the context and return very early.

For now, only provide a fast path for HVC64 call. Because the code rely
on 2 registers, x0 and x1 are saved in advanced.

Signed-off-by: Julien Grall <julien.grall@arm.com>

---
guest_sync only handle 64-bit guest, so I have only implemented the
64-bit side for now. We can discuss whether it is useful to
implement it for 32-bit guests.

We could also consider to implement the fast path for SMC64,
althought a guest should always use HVC.
---
 xen/arch/arm/arm64/entry.S      | 56 +++++++++++++++++++++++++++++++++++++++--
 xen/include/asm-arm/processor.h |  2 ++
 2 files changed, 56 insertions(+), 2 deletions(-)
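[One detail behind "x0 and x1 are saved in advance" may be worth spelling out. On the assumption that UREGS_X0 is the offset of x0 within the register frame that the entry macro builds and UREGS_kernel_sizeof is that frame's total size, the early store places x0/x1 exactly where the macro would have pushed them:

    sp_after_entry = sp_on_trap - UREGS_kernel_sizeof
    &frame->x0     = sp_after_entry + UREGS_X0
                   = sp_on_trap - (UREGS_kernel_sizeof - UREGS_X0)

which matches the address used by "stp x0, x1, [sp, #-(UREGS_kernel_sizeof - UREGS_X0)]". That is why entry can later run with save_x0_x1=0 and merely reserve the 16 bytes instead of pushing the (by then clobbered) registers.]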