diff mbox

[v3,09/11] KVM: arm: implement lazy world switch for debug registers

Message ID 1434969694-7432-10-git-send-email-zhichao.huang@linaro.org
State New
Headers show

Commit Message

Zhichao Huang June 22, 2015, 10:41 a.m. UTC
Implement switching of the debug registers. While the number
of registers is massive, CPUs usually don't implement them all
(A15 has 6 breakpoints and 4 watchpoints, which gives us a total
of 22 registers "only").

Notice that, for ARMv7, if the CONFIG_HAVE_HW_BREAKPOINT is set in
the guest, debug is always actively in use (ARM_DSCR_MDBGEN set).

We have to do the save/restore dance in this case, because the host
and the guest might use their respective debug registers at any moment.

If the CONFIG_HAVE_HW_BREAKPOINT is not set, and if no one flagged
the debug registers as dirty, we only save/resotre DBGDSCR.

Signed-off-by: Zhichao Huang <zhichao.huang@linaro.org>
---
 arch/arm/kvm/interrupts.S      |  16 +++
 arch/arm/kvm/interrupts_head.S | 249 ++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 263 insertions(+), 2 deletions(-)

Comments

Christoffer Dall June 30, 2015, 1:15 p.m. UTC | #1
On Mon, Jun 22, 2015 at 06:41:32PM +0800, Zhichao Huang wrote:
> Implement switching of the debug registers. While the number
> of registers is massive, CPUs usually don't implement them all
> (A15 has 6 breakpoints and 4 watchpoints, which gives us a total
> of 22 registers "only").
> 
> Notice that, for ARMv7, if the CONFIG_HAVE_HW_BREAKPOINT is set in
> the guest, debug is always actively in use (ARM_DSCR_MDBGEN set).
> 
> We have to do the save/restore dance in this case, because the host
> and the guest might use their respective debug registers at any moment.

this sounds expensive, and I suggested an alternative approach in the
previsou patch.  In any case, measuring the impact on this on hardware
would be a great idea...

> 
> If the CONFIG_HAVE_HW_BREAKPOINT is not set, and if no one flagged
> the debug registers as dirty, we only save/resotre DBGDSCR.

restore

> 
> Signed-off-by: Zhichao Huang <zhichao.huang@linaro.org>
> ---
>  arch/arm/kvm/interrupts.S      |  16 +++
>  arch/arm/kvm/interrupts_head.S | 249 ++++++++++++++++++++++++++++++++++++++++-
>  2 files changed, 263 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S
> index 79caf79..d626275 100644
> --- a/arch/arm/kvm/interrupts.S
> +++ b/arch/arm/kvm/interrupts.S
> @@ -116,6 +116,12 @@ ENTRY(__kvm_vcpu_run)
>  	read_cp15_state store_to_vcpu = 0
>  	write_cp15_state read_from_vcpu = 1
>  
> +	@ Store hardware CP14 state and load guest state
> +	compute_debug_state 1f
> +	bl __save_host_debug_regs
> +	bl __restore_guest_debug_regs
> +
> +1:
>  	@ If the host kernel has not been configured with VFPv3 support,
>  	@ then it is safer if we deny guests from using it as well.
>  #ifdef CONFIG_VFPv3
> @@ -201,6 +207,16 @@ after_vfp_restore:
>  	mrc	p15, 0, r2, c0, c0, 5
>  	mcr	p15, 4, r2, c0, c0, 5
>  
> +	@ Store guest CP14 state and restore host state
> +	skip_debug_state 1f
> +	bl __save_guest_debug_regs
> +	bl __restore_host_debug_regs
> +	/* Clear the dirty flag for the next run, as all the state has
> +	 * already been saved. Note that we nuke the whole 32bit word.
> +	 * If we ever add more flags, we'll have to be more careful...
> +	 */
> +	clear_debug_dirty_bit
> +1:
>  	@ Store guest CP15 state and restore host state
>  	read_cp15_state store_to_vcpu = 1
>  	write_cp15_state read_from_vcpu = 0
> diff --git a/arch/arm/kvm/interrupts_head.S b/arch/arm/kvm/interrupts_head.S
> index 5662c39..ed406be 100644
> --- a/arch/arm/kvm/interrupts_head.S
> +++ b/arch/arm/kvm/interrupts_head.S
> @@ -7,6 +7,7 @@
>  #define VCPU_USR_SP		(VCPU_USR_REG(13))
>  #define VCPU_USR_LR		(VCPU_USR_REG(14))
>  #define CP15_OFFSET(_cp15_reg_idx) (VCPU_CP15 + (_cp15_reg_idx * 4))
> +#define CP14_OFFSET(_cp14_reg_idx) (VCPU_CP14 + ((_cp14_reg_idx) * 4))
>  
>  /*
>   * Many of these macros need to access the VCPU structure, which is always
> @@ -168,8 +169,7 @@ vcpu	.req	r0		@ vcpu pointer always in r0
>   * Clobbers *all* registers.
>   */
>  .macro restore_guest_regs
> -	/* reset DBGDSCR to disable debug mode */
> -	mov	r2, #0
> +	ldr	r2, [vcpu, #CP14_OFFSET(cp14_DBGDSCRext)]
>  	mcr	p14, 0, r2, c0, c2, 2
>  
>  	restore_guest_regs_mode svc, #VCPU_SVC_REGS
> @@ -250,6 +250,10 @@ vcpu	.req	r0		@ vcpu pointer always in r0
>  	save_guest_regs_mode abt, #VCPU_ABT_REGS
>  	save_guest_regs_mode und, #VCPU_UND_REGS
>  	save_guest_regs_mode irq, #VCPU_IRQ_REGS
> +
> +	/* DBGDSCR reg */
> +	mrc	p14, 0, r2, c0, c1, 0
> +	str	r2, [vcpu, #CP14_OFFSET(cp14_DBGDSCRext)]
>  .endm
>  
>  /* Reads cp15 registers from hardware and stores them in memory
> @@ -449,6 +453,231 @@ vcpu	.req	r0		@ vcpu pointer always in r0
>  	str	r5, [vcpu, #VCPU_DEBUG_FLAGS]
>  .endm
>  
> +/* Assume r11/r12 in used, clobbers r2-r10 */
> +.macro cp14_read_and_push Op2 skip_num
> +	cmp	\skip_num, #8
> +	// if (skip_num >= 8) then skip c8-c15 directly
> +	bge	1f
> +	adr	r2, 9998f
> +	add	r2, r2, \skip_num, lsl #2
> +	bx	r2
> +1:
> +	adr	r2, 9999f
> +	sub	r3, \skip_num, #8
> +	add	r2, r2, r3, lsl #2
> +	bx	r2
> +9998:
> +	mrc	p14, 0, r10, c0, c15, \Op2
> +	mrc	p14, 0, r9, c0, c14, \Op2
> +	mrc	p14, 0, r8, c0, c13, \Op2
> +	mrc	p14, 0, r7, c0, c12, \Op2
> +	mrc	p14, 0, r6, c0, c11, \Op2
> +	mrc	p14, 0, r5, c0, c10, \Op2
> +	mrc	p14, 0, r4, c0, c9, \Op2
> +	mrc	p14, 0, r3, c0, c8, \Op2
> +	push	{r3-r10}

you probably don't want to do more stores to memory than required

> +9999:
> +	mrc	p14, 0, r10, c0, c7, \Op2
> +	mrc	p14, 0, r9, c0, c6, \Op2
> +	mrc	p14, 0, r8, c0, c5, \Op2
> +	mrc	p14, 0, r7, c0, c4, \Op2
> +	mrc	p14, 0, r6, c0, c3, \Op2
> +	mrc	p14, 0, r5, c0, c2, \Op2
> +	mrc	p14, 0, r4, c0, c1, \Op2
> +	mrc	p14, 0, r3, c0, c0, \Op2
> +	push	{r3-r10}

same

> +.endm
> +
> +/* Assume r11/r12 in used, clobbers r2-r10 */
> +.macro cp14_pop_and_write Op2 skip_num
> +	cmp	\skip_num, #8
> +	// if (skip_num >= 8) then skip c8-c15 directly
> +	bge	1f
> +	adr	r2, 9998f
> +	add	r2, r2, \skip_num, lsl #2
> +	pop	{r3-r10}

you probably don't want to do more loads from memory than required

> +	bx	r2
> +1:
> +	adr	r2, 9999f
> +	sub	r3, \skip_num, #8
> +	add	r2, r2, r3, lsl #2
> +	pop	{r3-r10}

same

> +	bx	r2
> +
> +9998:
> +	mcr	p14, 0, r10, c0, c15, \Op2
> +	mcr	p14, 0, r9, c0, c14, \Op2
> +	mcr	p14, 0, r8, c0, c13, \Op2
> +	mcr	p14, 0, r7, c0, c12, \Op2
> +	mcr	p14, 0, r6, c0, c11, \Op2
> +	mcr	p14, 0, r5, c0, c10, \Op2
> +	mcr	p14, 0, r4, c0, c9, \Op2
> +	mcr	p14, 0, r3, c0, c8, \Op2
> +
> +	pop	{r3-r10}
> +9999:
> +	mcr	p14, 0, r10, c0, c7, \Op2
> +	mcr	p14, 0, r9, c0, c6, \Op2
> +	mcr	p14, 0, r8, c0, c5, \Op2
> +	mcr	p14, 0, r7, c0, c4, \Op2
> +	mcr	p14, 0, r6, c0, c3, \Op2
> +	mcr	p14, 0, r5, c0, c2, \Op2
> +	mcr	p14, 0, r4, c0, c1, \Op2
> +	mcr	p14, 0, r3, c0, c0, \Op2
> +.endm
> +
> +/* Assume r11/r12 in used, clobbers r2-r3 */
> +.macro cp14_read_and_str Op2 cp14_reg0 skip_num
> +	adr	r3, 1f
> +	add	r3, r3, \skip_num, lsl #3
> +	bx	r3
> +1:
> +	mrc	p14, 0, r2, c0, c15, \Op2
> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+15)]
> +	mrc	p14, 0, r2, c0, c14, \Op2
> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+14)]
> +	mrc	p14, 0, r2, c0, c13, \Op2
> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+13)]
> +	mrc	p14, 0, r2, c0, c12, \Op2
> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+12)]
> +	mrc	p14, 0, r2, c0, c11, \Op2
> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+11)]
> +	mrc	p14, 0, r2, c0, c10, \Op2
> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+10)]
> +	mrc	p14, 0, r2, c0, c9, \Op2
> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+9)]
> +	mrc	p14, 0, r2, c0, c8, \Op2
> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+8)]
> +	mrc	p14, 0, r2, c0, c7, \Op2
> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+7)]
> +	mrc	p14, 0, r2, c0, c6, \Op2
> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+6)]
> +	mrc	p14, 0, r2, c0, c5, \Op2
> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+5)]
> +	mrc	p14, 0, r2, c0, c4, \Op2
> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+4)]
> +	mrc	p14, 0, r2, c0, c3, \Op2
> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+3)]
> +	mrc	p14, 0, r2, c0, c2, \Op2
> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+2)]
> +	mrc	p14, 0, r2, c0, c1, \Op2
> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+1)]
> +	mrc	p14, 0, r2, c0, c0, \Op2
> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0)]
> +.endm
> +
> +/* Assume r11/r12 in used, clobbers r2-r3 */
> +.macro cp14_ldr_and_write Op2 cp14_reg0 skip_num
> +	adr	r3, 1f
> +	add	r3, r3, \skip_num, lsl #3
> +	bx	r3
> +1:
> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+15)]
> +	mcr	p14, 0, r2, c0, c15, \Op2
> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+14)]
> +	mcr	p14, 0, r2, c0, c14, \Op2
> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+13)]
> +	mcr	p14, 0, r2, c0, c13, \Op2
> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+12)]
> +	mcr	p14, 0, r2, c0, c12, \Op2
> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+11)]
> +	mcr	p14, 0, r2, c0, c11, \Op2
> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+10)]
> +	mcr	p14, 0, r2, c0, c10, \Op2
> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+9)]
> +	mcr	p14, 0, r2, c0, c9, \Op2
> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+8)]
> +	mcr	p14, 0, r2, c0, c8, \Op2
> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+7)]
> +	mcr	p14, 0, r2, c0, c7, \Op2
> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+6)]
> +	mcr	p14, 0, r2, c0, c6, \Op2
> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+5)]
> +	mcr	p14, 0, r2, c0, c5, \Op2
> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+4)]
> +	mcr	p14, 0, r2, c0, c4, \Op2
> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+3)]
> +	mcr	p14, 0, r2, c0, c3, \Op2
> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+2)]
> +	mcr	p14, 0, r2, c0, c2, \Op2
> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+1)]
> +	mcr	p14, 0, r2, c0, c1, \Op2
> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0)]
> +	mcr	p14, 0, r2, c0, c0, \Op2
> +.endm

can you not find some way of unifying cp14_pop_and_write with
cp14_ldr_and_write and cp14_read_and_push with cp14_read_and_str ?

Probably having two separate structs for the VFP state on the vcpu struct
for both the guest and the host state is one possible way of doing so.

> +
> +/* Get extract number of BRPs and WRPs. Saved in r11/r12 */
> +.macro read_hw_dbg_num
> +	mrc	p14, 0, r2, c0, c0, 0
> +	ubfx	r11, r2, #24, #4
> +	add	r11, r11, #1		// Extract BRPs
> +	ubfx	r12, r2, #28, #4
> +	add	r12, r12, #1		// Extract WRPs
> +	mov	r2, #16
> +	sub	r11, r2, r11		// How many BPs to skip
> +	sub	r12, r2, r12		// How many WPs to skip
> +.endm
> +
> +/* Reads cp14 registers from hardware.

You have a lot of multi-line comments in these patches which don't start
with a separate '/*' line, as dictated by the Linux kernel coding style.
So far, I've ignored this, but please fix all these throughout the
series when you respin.

> + * Writes cp14 registers in-order to the stack.
> + *
> + * Assumes vcpu pointer in vcpu reg
> + *
> + * Clobbers r2-r12
> + */
> +.macro save_host_debug_regs
> +	read_hw_dbg_num
> +	cp14_read_and_push #4, r11	@ DBGBVR
> +	cp14_read_and_push #5, r11	@ DBGBCR
> +	cp14_read_and_push #6, r12	@ DBGWVR
> +	cp14_read_and_push #7, r12	@ DBGWCR
> +.endm
> +
> +/* Reads cp14 registers from hardware.
> + * Writes cp14 registers in-order to the VCPU struct pointed to by vcpup.
> + *
> + * Assumes vcpu pointer in vcpu reg
> + *
> + * Clobbers r2-r12
> + */
> +.macro save_guest_debug_regs
> +	read_hw_dbg_num
> +	cp14_read_and_str #4, cp14_DBGBVR0, r11

why do you need the has before the op2 field?

> +	cp14_read_and_str #5, cp14_DBGBCR0, r11
> +	cp14_read_and_str #6, cp14_DBGWVR0, r12
> +	cp14_read_and_str #7, cp14_DBGWCR0, r12
> +.endm
> +
> +/* Reads cp14 registers in-order from the stack.
> + * Writes cp14 registers to hardware.
> + *
> + * Assumes vcpu pointer in vcpu reg
> + *
> + * Clobbers r2-r12
> + */
> +.macro restore_host_debug_regs
> +	read_hw_dbg_num
> +	cp14_pop_and_write #4, r11	@ DBGBVR
> +	cp14_pop_and_write #5, r11	@ DBGBCR
> +	cp14_pop_and_write #6, r12	@ DBGWVR
> +	cp14_pop_and_write #7, r12	@ DBGWCR
> +.endm
> +
> +/* Reads cp14 registers in-order from the VCPU struct pointed to by vcpup
> + * Writes cp14 registers to hardware.
> + *
> + * Assumes vcpu pointer in vcpu reg
> + *
> + * Clobbers r2-r12
> + */
> +.macro restore_guest_debug_regs
> +	read_hw_dbg_num
> +	cp14_ldr_and_write #4, cp14_DBGBVR0, r11
> +	cp14_ldr_and_write #5, cp14_DBGBCR0, r11
> +	cp14_ldr_and_write #6, cp14_DBGWVR0, r12
> +	cp14_ldr_and_write #7, cp14_DBGWCR0, r12
> +.endm
> +
>  /*
>   * Save the VGIC CPU state into memory
>   *
> @@ -684,3 +913,19 @@ ARM_BE8(rev	r6, r6  )
>  .macro load_vcpu
>  	mrc	p15, 4, vcpu, c13, c0, 2	@ HTPIDR
>  .endm
> +
> +__save_host_debug_regs:
> +	save_host_debug_regs
> +	bx	lr
> +
> +__save_guest_debug_regs:
> +	save_guest_debug_regs
> +	bx	lr
> +
> +__restore_host_debug_regs:
> +	restore_host_debug_regs
> +	bx	lr
> +
> +__restore_guest_debug_regs:
> +	restore_guest_debug_regs
> +	bx	lr
> -- 
> 1.7.12.4
> 

Thanks,
-Christoffer
Zhichao Huang July 3, 2015, 10:06 a.m. UTC | #2
On June 30, 2015 9:15:22 PM GMT+08:00, Christoffer Dall <christoffer.dall@linaro.org> wrote:
>On Mon, Jun 22, 2015 at 06:41:32PM +0800, Zhichao Huang wrote:
>> Implement switching of the debug registers. While the number
>> of registers is massive, CPUs usually don't implement them all
>> (A15 has 6 breakpoints and 4 watchpoints, which gives us a total
>> of 22 registers "only").
>> 
>> Notice that, for ARMv7, if the CONFIG_HAVE_HW_BREAKPOINT is set in
>> the guest, debug is always actively in use (ARM_DSCR_MDBGEN set).
>> 
>> We have to do the save/restore dance in this case, because the host
>> and the guest might use their respective debug registers at any moment.
>
>this sounds expensive, and I suggested an alternative approach in the
>previsou patch.  In any case, measuring the impact on this on hardware
>would be a great idea...
>
>> 
>> If the CONFIG_HAVE_HW_BREAKPOINT is not set, and if no one flagged
>> the debug registers as dirty, we only save/resotre DBGDSCR.
>
>restore
>
>> 
>> Signed-off-by: Zhichao Huang <zhichao.huang@linaro.org>
>> ---
>>  arch/arm/kvm/interrupts.S      |  16 +++
>>  arch/arm/kvm/interrupts_head.S | 249 ++++++++++++++++++++++++++++++++++++++++-
>>  2 files changed, 263 insertions(+), 2 deletions(-)
>> 
>> diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S
>> index 79caf79..d626275 100644
>> --- a/arch/arm/kvm/interrupts.S
>> +++ b/arch/arm/kvm/interrupts.S
>> @@ -116,6 +116,12 @@ ENTRY(__kvm_vcpu_run)
>>  	read_cp15_state store_to_vcpu = 0
>>  	write_cp15_state read_from_vcpu = 1
>>  
>> +	@ Store hardware CP14 state and load guest state
>> +	compute_debug_state 1f
>> +	bl __save_host_debug_regs
>> +	bl __restore_guest_debug_regs
>> +
>> +1:
>>  	@ If the host kernel has not been configured with VFPv3 support,
>>  	@ then it is safer if we deny guests from using it as well.
>>  #ifdef CONFIG_VFPv3
>> @@ -201,6 +207,16 @@ after_vfp_restore:
>>  	mrc	p15, 0, r2, c0, c0, 5
>>  	mcr	p15, 4, r2, c0, c0, 5
>>  
>> +	@ Store guest CP14 state and restore host state
>> +	skip_debug_state 1f
>> +	bl __save_guest_debug_regs
>> +	bl __restore_host_debug_regs
>> +	/* Clear the dirty flag for the next run, as all the state has
>> +	 * already been saved. Note that we nuke the whole 32bit word.
>> +	 * If we ever add more flags, we'll have to be more careful...
>> +	 */
>> +	clear_debug_dirty_bit
>> +1:
>>  	@ Store guest CP15 state and restore host state
>>  	read_cp15_state store_to_vcpu = 1
>>  	write_cp15_state read_from_vcpu = 0
>> diff --git a/arch/arm/kvm/interrupts_head.S b/arch/arm/kvm/interrupts_head.S
>> index 5662c39..ed406be 100644
>> --- a/arch/arm/kvm/interrupts_head.S
>> +++ b/arch/arm/kvm/interrupts_head.S
>> @@ -7,6 +7,7 @@
>>  #define VCPU_USR_SP		(VCPU_USR_REG(13))
>>  #define VCPU_USR_LR		(VCPU_USR_REG(14))
>>  #define CP15_OFFSET(_cp15_reg_idx) (VCPU_CP15 + (_cp15_reg_idx * 4))
>> +#define CP14_OFFSET(_cp14_reg_idx) (VCPU_CP14 + ((_cp14_reg_idx) *
>4))
>>  
>>  /*
>>   * Many of these macros need to access the VCPU structure, which is
>always
>> @@ -168,8 +169,7 @@ vcpu	.req	r0		@ vcpu pointer always in r0
>>   * Clobbers *all* registers.
>>   */
>>  .macro restore_guest_regs
>> -	/* reset DBGDSCR to disable debug mode */
>> -	mov	r2, #0
>> +	ldr	r2, [vcpu, #CP14_OFFSET(cp14_DBGDSCRext)]
>>  	mcr	p14, 0, r2, c0, c2, 2
>>  
>>  	restore_guest_regs_mode svc, #VCPU_SVC_REGS
>> @@ -250,6 +250,10 @@ vcpu	.req	r0		@ vcpu pointer always in r0
>>  	save_guest_regs_mode abt, #VCPU_ABT_REGS
>>  	save_guest_regs_mode und, #VCPU_UND_REGS
>>  	save_guest_regs_mode irq, #VCPU_IRQ_REGS
>> +
>> +	/* DBGDSCR reg */
>> +	mrc	p14, 0, r2, c0, c1, 0
>> +	str	r2, [vcpu, #CP14_OFFSET(cp14_DBGDSCRext)]
>>  .endm
>>  
>>  /* Reads cp15 registers from hardware and stores them in memory
>> @@ -449,6 +453,231 @@ vcpu	.req	r0		@ vcpu pointer always in r0
>>  	str	r5, [vcpu, #VCPU_DEBUG_FLAGS]
>>  .endm
>>  
>> +/* Assume r11/r12 in used, clobbers r2-r10 */
>> +.macro cp14_read_and_push Op2 skip_num
>> +	cmp	\skip_num, #8
>> +	// if (skip_num >= 8) then skip c8-c15 directly
>> +	bge	1f
>> +	adr	r2, 9998f
>> +	add	r2, r2, \skip_num, lsl #2
>> +	bx	r2
>> +1:
>> +	adr	r2, 9999f
>> +	sub	r3, \skip_num, #8
>> +	add	r2, r2, r3, lsl #2
>> +	bx	r2
>> +9998:
>> +	mrc	p14, 0, r10, c0, c15, \Op2
>> +	mrc	p14, 0, r9, c0, c14, \Op2
>> +	mrc	p14, 0, r8, c0, c13, \Op2
>> +	mrc	p14, 0, r7, c0, c12, \Op2
>> +	mrc	p14, 0, r6, c0, c11, \Op2
>> +	mrc	p14, 0, r5, c0, c10, \Op2
>> +	mrc	p14, 0, r4, c0, c9, \Op2
>> +	mrc	p14, 0, r3, c0, c8, \Op2
>> +	push	{r3-r10}
>
>you probably don't want to do more stores to memory than required

Yeah, there is no need to push some registers, but I can't find a better 
way to optimize it, is there any precedents that I can refer to?

Imaging that there are only 2 hwbrpts available, BCR0/BCR1, and we
can code like this:

Save:
		 jump to 1
    save BCR2 to r5
1:
    save BCR1 to r4
    save BCR0 to r3
    push {r3-r5}

Restore:
    pop {r3-r5}
    jump to 1
    restore r5 to BCR2
1:
    restore r4 to BCR1
    restore r3 to BCR0

Buf if we want to only push the registers we acutally need, we have to
code like this:

Save:
    jump to 1
    save BCR2 to r5
    push r5
1:
    save BCR1 to r4
    push r4
    save BCR0 to r3
    push r3

Resotre:
    jump to 1
    pop r5
    restore r5 to BCR2
1:
    pop r4
    restore r4 to BCR1
    pop r3
    restore r3 to BCR0

Then we might entercounter a mistake on restoring, as we want to pop
r4, but actually we pop r3.

>
>> +9999:
>> +	mrc	p14, 0, r10, c0, c7, \Op2
>> +	mrc	p14, 0, r9, c0, c6, \Op2
>> +	mrc	p14, 0, r8, c0, c5, \Op2
>> +	mrc	p14, 0, r7, c0, c4, \Op2
>> +	mrc	p14, 0, r6, c0, c3, \Op2
>> +	mrc	p14, 0, r5, c0, c2, \Op2
>> +	mrc	p14, 0, r4, c0, c1, \Op2
>> +	mrc	p14, 0, r3, c0, c0, \Op2
>> +	push	{r3-r10}
>
>same
>
>> +.endm
>> +
>> +/* Assume r11/r12 in used, clobbers r2-r10 */
>> +.macro cp14_pop_and_write Op2 skip_num
>> +	cmp	\skip_num, #8
>> +	// if (skip_num >= 8) then skip c8-c15 directly
>> +	bge	1f
>> +	adr	r2, 9998f
>> +	add	r2, r2, \skip_num, lsl #2
>> +	pop	{r3-r10}
>
>you probably don't want to do more loads from memory than required
>
>> +	bx	r2
>> +1:
>> +	adr	r2, 9999f
>> +	sub	r3, \skip_num, #8
>> +	add	r2, r2, r3, lsl #2
>> +	pop	{r3-r10}
>
>same
>
>> +	bx	r2
>> +
>> +9998:
>> +	mcr	p14, 0, r10, c0, c15, \Op2
>> +	mcr	p14, 0, r9, c0, c14, \Op2
>> +	mcr	p14, 0, r8, c0, c13, \Op2
>> +	mcr	p14, 0, r7, c0, c12, \Op2
>> +	mcr	p14, 0, r6, c0, c11, \Op2
>> +	mcr	p14, 0, r5, c0, c10, \Op2
>> +	mcr	p14, 0, r4, c0, c9, \Op2
>> +	mcr	p14, 0, r3, c0, c8, \Op2
>> +
>> +	pop	{r3-r10}
>> +9999:
>> +	mcr	p14, 0, r10, c0, c7, \Op2
>> +	mcr	p14, 0, r9, c0, c6, \Op2
>> +	mcr	p14, 0, r8, c0, c5, \Op2
>> +	mcr	p14, 0, r7, c0, c4, \Op2
>> +	mcr	p14, 0, r6, c0, c3, \Op2
>> +	mcr	p14, 0, r5, c0, c2, \Op2
>> +	mcr	p14, 0, r4, c0, c1, \Op2
>> +	mcr	p14, 0, r3, c0, c0, \Op2
>> +.endm
>> +
>> +/* Assume r11/r12 in used, clobbers r2-r3 */
>> +.macro cp14_read_and_str Op2 cp14_reg0 skip_num
>> +	adr	r3, 1f
>> +	add	r3, r3, \skip_num, lsl #3
>> +	bx	r3
>> +1:
>> +	mrc	p14, 0, r2, c0, c15, \Op2
>> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+15)]
>> +	mrc	p14, 0, r2, c0, c14, \Op2
>> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+14)]
>> +	mrc	p14, 0, r2, c0, c13, \Op2
>> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+13)]
>> +	mrc	p14, 0, r2, c0, c12, \Op2
>> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+12)]
>> +	mrc	p14, 0, r2, c0, c11, \Op2
>> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+11)]
>> +	mrc	p14, 0, r2, c0, c10, \Op2
>> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+10)]
>> +	mrc	p14, 0, r2, c0, c9, \Op2
>> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+9)]
>> +	mrc	p14, 0, r2, c0, c8, \Op2
>> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+8)]
>> +	mrc	p14, 0, r2, c0, c7, \Op2
>> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+7)]
>> +	mrc	p14, 0, r2, c0, c6, \Op2
>> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+6)]
>> +	mrc	p14, 0, r2, c0, c5, \Op2
>> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+5)]
>> +	mrc	p14, 0, r2, c0, c4, \Op2
>> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+4)]
>> +	mrc	p14, 0, r2, c0, c3, \Op2
>> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+3)]
>> +	mrc	p14, 0, r2, c0, c2, \Op2
>> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+2)]
>> +	mrc	p14, 0, r2, c0, c1, \Op2
>> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+1)]
>> +	mrc	p14, 0, r2, c0, c0, \Op2
>> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0)]
>> +.endm
>> +
>> +/* Assume r11/r12 in used, clobbers r2-r3 */
>> +.macro cp14_ldr_and_write Op2 cp14_reg0 skip_num
>> +	adr	r3, 1f
>> +	add	r3, r3, \skip_num, lsl #3
>> +	bx	r3
>> +1:
>> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+15)]
>> +	mcr	p14, 0, r2, c0, c15, \Op2
>> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+14)]
>> +	mcr	p14, 0, r2, c0, c14, \Op2
>> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+13)]
>> +	mcr	p14, 0, r2, c0, c13, \Op2
>> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+12)]
>> +	mcr	p14, 0, r2, c0, c12, \Op2
>> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+11)]
>> +	mcr	p14, 0, r2, c0, c11, \Op2
>> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+10)]
>> +	mcr	p14, 0, r2, c0, c10, \Op2
>> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+9)]
>> +	mcr	p14, 0, r2, c0, c9, \Op2
>> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+8)]
>> +	mcr	p14, 0, r2, c0, c8, \Op2
>> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+7)]
>> +	mcr	p14, 0, r2, c0, c7, \Op2
>> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+6)]
>> +	mcr	p14, 0, r2, c0, c6, \Op2
>> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+5)]
>> +	mcr	p14, 0, r2, c0, c5, \Op2
>> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+4)]
>> +	mcr	p14, 0, r2, c0, c4, \Op2
>> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+3)]
>> +	mcr	p14, 0, r2, c0, c3, \Op2
>> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+2)]
>> +	mcr	p14, 0, r2, c0, c2, \Op2
>> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+1)]
>> +	mcr	p14, 0, r2, c0, c1, \Op2
>> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0)]
>> +	mcr	p14, 0, r2, c0, c0, \Op2
>> +.endm
>
>can you not find some way of unifying cp14_pop_and_write with
>cp14_ldr_and_write and cp14_read_and_push with cp14_read_and_str ?
>
>Probably having two separate structs for the VFP state on the vcpu
>struct
>for both the guest and the host state is one possible way of doing so.
>

OK, I will do it.
Would you like me to rename the struct vfp_hard_struct, and add 
host_cp14_state in there, or add a new struct host_cp14_state in the
kvm_vcpu_arch?

>> +
>> +/* Get extract number of BRPs and WRPs. Saved in r11/r12 */
>> +.macro read_hw_dbg_num
>> +	mrc	p14, 0, r2, c0, c0, 0
>> +	ubfx	r11, r2, #24, #4
>> +	add	r11, r11, #1		// Extract BRPs
>> +	ubfx	r12, r2, #28, #4
>> +	add	r12, r12, #1		// Extract WRPs
>> +	mov	r2, #16
>> +	sub	r11, r2, r11		// How many BPs to skip
>> +	sub	r12, r2, r12		// How many WPs to skip
>> +.endm
>> +
>> +/* Reads cp14 registers from hardware.
>
>You have a lot of multi-line comments in these patches which don't
>start
>with a separate '/*' line, as dictated by the Linux kernel coding
>style.
>So far, I've ignored this, but please fix all these throughout the
>series when you respin.
>
>> + * Writes cp14 registers in-order to the stack.
>> + *
>> + * Assumes vcpu pointer in vcpu reg
>> + *
>> + * Clobbers r2-r12
>> + */
>> +.macro save_host_debug_regs
>> +	read_hw_dbg_num
>> +	cp14_read_and_push #4, r11	@ DBGBVR
>> +	cp14_read_and_push #5, r11	@ DBGBCR
>> +	cp14_read_and_push #6, r12	@ DBGWVR
>> +	cp14_read_and_push #7, r12	@ DBGWCR
>> +.endm
>> +
>> +/* Reads cp14 registers from hardware.
>> + * Writes cp14 registers in-order to the VCPU struct pointed to by
>vcpup.
>> + *
>> + * Assumes vcpu pointer in vcpu reg
>> + *
>> + * Clobbers r2-r12
>> + */
>> +.macro save_guest_debug_regs
>> +	read_hw_dbg_num
>> +	cp14_read_and_str #4, cp14_DBGBVR0, r11
>
>why do you need the has before the op2 field?

Sorry, I can't quite understand.

>
>> +	cp14_read_and_str #5, cp14_DBGBCR0, r11
>> +	cp14_read_and_str #6, cp14_DBGWVR0, r12
>> +	cp14_read_and_str #7, cp14_DBGWCR0, r12
>> +.endm
>> +
>> +/* Reads cp14 registers in-order from the stack.
>> + * Writes cp14 registers to hardware.
>> + *
>> + * Assumes vcpu pointer in vcpu reg
>> + *
>> + * Clobbers r2-r12
>> + */
>> +.macro restore_host_debug_regs
>> +	read_hw_dbg_num
>> +	cp14_pop_and_write #4, r11	@ DBGBVR
>> +	cp14_pop_and_write #5, r11	@ DBGBCR
>> +	cp14_pop_and_write #6, r12	@ DBGWVR
>> +	cp14_pop_and_write #7, r12	@ DBGWCR
>> +.endm
>> +
>> +/* Reads cp14 registers in-order from the VCPU struct pointed to by
>vcpup
>> + * Writes cp14 registers to hardware.
>> + *
>> + * Assumes vcpu pointer in vcpu reg
>> + *
>> + * Clobbers r2-r12
>> + */
>> +.macro restore_guest_debug_regs
>> +	read_hw_dbg_num
>> +	cp14_ldr_and_write #4, cp14_DBGBVR0, r11
>> +	cp14_ldr_and_write #5, cp14_DBGBCR0, r11
>> +	cp14_ldr_and_write #6, cp14_DBGWVR0, r12
>> +	cp14_ldr_and_write #7, cp14_DBGWCR0, r12
>> +.endm
>> +
>>  /*
>>   * Save the VGIC CPU state into memory
>>   *
>> @@ -684,3 +913,19 @@ ARM_BE8(rev	r6, r6  )
>>  .macro load_vcpu
>>  	mrc	p15, 4, vcpu, c13, c0, 2	@ HTPIDR
>>  .endm
>> +
>> +__save_host_debug_regs:
>> +	save_host_debug_regs
>> +	bx	lr
>> +
>> +__save_guest_debug_regs:
>> +	save_guest_debug_regs
>> +	bx	lr
>> +
>> +__restore_host_debug_regs:
>> +	restore_host_debug_regs
>> +	bx	lr
>> +
>> +__restore_guest_debug_regs:
>> +	restore_guest_debug_regs
>> +	bx	lr
>> -- 
>> 1.7.12.4
>> 
>
>Thanks,
>-Christoffer
Christoffer Dall July 3, 2015, 9:05 p.m. UTC | #3
On Fri, Jul 03, 2015 at 06:06:48PM +0800, Zhichao Huang wrote:
> 
> 
> On June 30, 2015 9:15:22 PM GMT+08:00, Christoffer Dall <christoffer.dall@linaro.org> wrote:
> >On Mon, Jun 22, 2015 at 06:41:32PM +0800, Zhichao Huang wrote:
> >> Implement switching of the debug registers. While the number
> >> of registers is massive, CPUs usually don't implement them all
> >> (A15 has 6 breakpoints and 4 watchpoints, which gives us a total
> >> of 22 registers "only").
> >> 
> >> Notice that, for ARMv7, if the CONFIG_HAVE_HW_BREAKPOINT is set in
> >> the guest, debug is always actively in use (ARM_DSCR_MDBGEN set).
> >> 
> >> We have to do the save/restore dance in this case, because the host
> >> and the guest might use their respective debug registers at any moment.
> >
> >this sounds expensive, and I suggested an alternative approach in the
> >previsou patch.  In any case, measuring the impact on this on hardware
> >would be a great idea...
> >
> >> 
> >> If the CONFIG_HAVE_HW_BREAKPOINT is not set, and if no one flagged
> >> the debug registers as dirty, we only save/resotre DBGDSCR.
> >
> >restore
> >
> >> 
> >> Signed-off-by: Zhichao Huang <zhichao.huang@linaro.org>
> >> ---
> >>  arch/arm/kvm/interrupts.S      |  16 +++
> >>  arch/arm/kvm/interrupts_head.S | 249 ++++++++++++++++++++++++++++++++++++++++-
> >>  2 files changed, 263 insertions(+), 2 deletions(-)
> >> 
> >> diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S
> >> index 79caf79..d626275 100644
> >> --- a/arch/arm/kvm/interrupts.S
> >> +++ b/arch/arm/kvm/interrupts.S
> >> @@ -116,6 +116,12 @@ ENTRY(__kvm_vcpu_run)
> >>  	read_cp15_state store_to_vcpu = 0
> >>  	write_cp15_state read_from_vcpu = 1
> >>  
> >> +	@ Store hardware CP14 state and load guest state
> >> +	compute_debug_state 1f
> >> +	bl __save_host_debug_regs
> >> +	bl __restore_guest_debug_regs
> >> +
> >> +1:
> >>  	@ If the host kernel has not been configured with VFPv3 support,
> >>  	@ then it is safer if we deny guests from using it as well.
> >>  #ifdef CONFIG_VFPv3
> >> @@ -201,6 +207,16 @@ after_vfp_restore:
> >>  	mrc	p15, 0, r2, c0, c0, 5
> >>  	mcr	p15, 4, r2, c0, c0, 5
> >>  
> >> +	@ Store guest CP14 state and restore host state
> >> +	skip_debug_state 1f
> >> +	bl __save_guest_debug_regs
> >> +	bl __restore_host_debug_regs
> >> +	/* Clear the dirty flag for the next run, as all the state has
> >> +	 * already been saved. Note that we nuke the whole 32bit word.
> >> +	 * If we ever add more flags, we'll have to be more careful...
> >> +	 */
> >> +	clear_debug_dirty_bit
> >> +1:
> >>  	@ Store guest CP15 state and restore host state
> >>  	read_cp15_state store_to_vcpu = 1
> >>  	write_cp15_state read_from_vcpu = 0
> >> diff --git a/arch/arm/kvm/interrupts_head.S b/arch/arm/kvm/interrupts_head.S
> >> index 5662c39..ed406be 100644
> >> --- a/arch/arm/kvm/interrupts_head.S
> >> +++ b/arch/arm/kvm/interrupts_head.S
> >> @@ -7,6 +7,7 @@
> >>  #define VCPU_USR_SP		(VCPU_USR_REG(13))
> >>  #define VCPU_USR_LR		(VCPU_USR_REG(14))
> >>  #define CP15_OFFSET(_cp15_reg_idx) (VCPU_CP15 + (_cp15_reg_idx * 4))
> >> +#define CP14_OFFSET(_cp14_reg_idx) (VCPU_CP14 + ((_cp14_reg_idx) *
> >4))
> >>  
> >>  /*
> >>   * Many of these macros need to access the VCPU structure, which is
> >always
> >> @@ -168,8 +169,7 @@ vcpu	.req	r0		@ vcpu pointer always in r0
> >>   * Clobbers *all* registers.
> >>   */
> >>  .macro restore_guest_regs
> >> -	/* reset DBGDSCR to disable debug mode */
> >> -	mov	r2, #0
> >> +	ldr	r2, [vcpu, #CP14_OFFSET(cp14_DBGDSCRext)]
> >>  	mcr	p14, 0, r2, c0, c2, 2
> >>  
> >>  	restore_guest_regs_mode svc, #VCPU_SVC_REGS
> >> @@ -250,6 +250,10 @@ vcpu	.req	r0		@ vcpu pointer always in r0
> >>  	save_guest_regs_mode abt, #VCPU_ABT_REGS
> >>  	save_guest_regs_mode und, #VCPU_UND_REGS
> >>  	save_guest_regs_mode irq, #VCPU_IRQ_REGS
> >> +
> >> +	/* DBGDSCR reg */
> >> +	mrc	p14, 0, r2, c0, c1, 0
> >> +	str	r2, [vcpu, #CP14_OFFSET(cp14_DBGDSCRext)]
> >>  .endm
> >>  
> >>  /* Reads cp15 registers from hardware and stores them in memory
> >> @@ -449,6 +453,231 @@ vcpu	.req	r0		@ vcpu pointer always in r0
> >>  	str	r5, [vcpu, #VCPU_DEBUG_FLAGS]
> >>  .endm
> >>  
> >> +/* Assume r11/r12 in used, clobbers r2-r10 */
> >> +.macro cp14_read_and_push Op2 skip_num
> >> +	cmp	\skip_num, #8
> >> +	// if (skip_num >= 8) then skip c8-c15 directly
> >> +	bge	1f
> >> +	adr	r2, 9998f
> >> +	add	r2, r2, \skip_num, lsl #2
> >> +	bx	r2
> >> +1:
> >> +	adr	r2, 9999f
> >> +	sub	r3, \skip_num, #8
> >> +	add	r2, r2, r3, lsl #2
> >> +	bx	r2
> >> +9998:
> >> +	mrc	p14, 0, r10, c0, c15, \Op2
> >> +	mrc	p14, 0, r9, c0, c14, \Op2
> >> +	mrc	p14, 0, r8, c0, c13, \Op2
> >> +	mrc	p14, 0, r7, c0, c12, \Op2
> >> +	mrc	p14, 0, r6, c0, c11, \Op2
> >> +	mrc	p14, 0, r5, c0, c10, \Op2
> >> +	mrc	p14, 0, r4, c0, c9, \Op2
> >> +	mrc	p14, 0, r3, c0, c8, \Op2
> >> +	push	{r3-r10}
> >
> >you probably don't want to do more stores to memory than required
> 
> Yeah, there is no need to push some registers, but I can't find a better 
> way to optimize it, is there any precedents that I can refer to?

Can you not simply do what you do below where you read the coproc
register and then do the store of that and so on?

If you unify the two approaches you should be in the clear on this one
anyway...

> 
> Imaging that there are only 2 hwbrpts available, BCR0/BCR1, and we
> can code like this:
> 
> Save:
> 		 jump to 1
>     save BCR2 to r5
> 1:
>     save BCR1 to r4
>     save BCR0 to r3
>     push {r3-r5}
> 
> Restore:
>     pop {r3-r5}
>     jump to 1
>     restore r5 to BCR2
> 1:
>     restore r4 to BCR1
>     restore r3 to BCR0
> 
> Buf if we want to only push the registers we acutally need, we have to
> code like this:
> 
> Save:
>     jump to 1
>     save BCR2 to r5
>     push r5
> 1:
>     save BCR1 to r4
>     push r4
>     save BCR0 to r3
>     push r3
> 
> Resotre:
>     jump to 1
>     pop r5
>     restore r5 to BCR2
> 1:
>     pop r4
>     restore r4 to BCR1
>     pop r3
>     restore r3 to BCR0
> 
> Then we might entercounter a mistake on restoring, as we want to pop
> r4, but actually we pop r3.
> 
> >
> >> +9999:
> >> +	mrc	p14, 0, r10, c0, c7, \Op2
> >> +	mrc	p14, 0, r9, c0, c6, \Op2
> >> +	mrc	p14, 0, r8, c0, c5, \Op2
> >> +	mrc	p14, 0, r7, c0, c4, \Op2
> >> +	mrc	p14, 0, r6, c0, c3, \Op2
> >> +	mrc	p14, 0, r5, c0, c2, \Op2
> >> +	mrc	p14, 0, r4, c0, c1, \Op2
> >> +	mrc	p14, 0, r3, c0, c0, \Op2
> >> +	push	{r3-r10}
> >
> >same
> >
> >> +.endm
> >> +
> >> +/* Assume r11/r12 in used, clobbers r2-r10 */
> >> +.macro cp14_pop_and_write Op2 skip_num
> >> +	cmp	\skip_num, #8
> >> +	// if (skip_num >= 8) then skip c8-c15 directly
> >> +	bge	1f
> >> +	adr	r2, 9998f
> >> +	add	r2, r2, \skip_num, lsl #2
> >> +	pop	{r3-r10}
> >
> >you probably don't want to do more loads from memory than required
> >
> >> +	bx	r2
> >> +1:
> >> +	adr	r2, 9999f
> >> +	sub	r3, \skip_num, #8
> >> +	add	r2, r2, r3, lsl #2
> >> +	pop	{r3-r10}
> >
> >same
> >
> >> +	bx	r2
> >> +
> >> +9998:
> >> +	mcr	p14, 0, r10, c0, c15, \Op2
> >> +	mcr	p14, 0, r9, c0, c14, \Op2
> >> +	mcr	p14, 0, r8, c0, c13, \Op2
> >> +	mcr	p14, 0, r7, c0, c12, \Op2
> >> +	mcr	p14, 0, r6, c0, c11, \Op2
> >> +	mcr	p14, 0, r5, c0, c10, \Op2
> >> +	mcr	p14, 0, r4, c0, c9, \Op2
> >> +	mcr	p14, 0, r3, c0, c8, \Op2
> >> +
> >> +	pop	{r3-r10}
> >> +9999:
> >> +	mcr	p14, 0, r10, c0, c7, \Op2
> >> +	mcr	p14, 0, r9, c0, c6, \Op2
> >> +	mcr	p14, 0, r8, c0, c5, \Op2
> >> +	mcr	p14, 0, r7, c0, c4, \Op2
> >> +	mcr	p14, 0, r6, c0, c3, \Op2
> >> +	mcr	p14, 0, r5, c0, c2, \Op2
> >> +	mcr	p14, 0, r4, c0, c1, \Op2
> >> +	mcr	p14, 0, r3, c0, c0, \Op2
> >> +.endm
> >> +
> >> +/* Assume r11/r12 in used, clobbers r2-r3 */
> >> +.macro cp14_read_and_str Op2 cp14_reg0 skip_num
> >> +	adr	r3, 1f
> >> +	add	r3, r3, \skip_num, lsl #3
> >> +	bx	r3
> >> +1:
> >> +	mrc	p14, 0, r2, c0, c15, \Op2
> >> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+15)]
> >> +	mrc	p14, 0, r2, c0, c14, \Op2
> >> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+14)]
> >> +	mrc	p14, 0, r2, c0, c13, \Op2
> >> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+13)]
> >> +	mrc	p14, 0, r2, c0, c12, \Op2
> >> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+12)]
> >> +	mrc	p14, 0, r2, c0, c11, \Op2
> >> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+11)]
> >> +	mrc	p14, 0, r2, c0, c10, \Op2
> >> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+10)]
> >> +	mrc	p14, 0, r2, c0, c9, \Op2
> >> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+9)]
> >> +	mrc	p14, 0, r2, c0, c8, \Op2
> >> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+8)]
> >> +	mrc	p14, 0, r2, c0, c7, \Op2
> >> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+7)]
> >> +	mrc	p14, 0, r2, c0, c6, \Op2
> >> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+6)]
> >> +	mrc	p14, 0, r2, c0, c5, \Op2
> >> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+5)]
> >> +	mrc	p14, 0, r2, c0, c4, \Op2
> >> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+4)]
> >> +	mrc	p14, 0, r2, c0, c3, \Op2
> >> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+3)]
> >> +	mrc	p14, 0, r2, c0, c2, \Op2
> >> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+2)]
> >> +	mrc	p14, 0, r2, c0, c1, \Op2
> >> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+1)]
> >> +	mrc	p14, 0, r2, c0, c0, \Op2
> >> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0)]
> >> +.endm
> >> +
> >> +/* Assume r11/r12 in used, clobbers r2-r3 */
> >> +.macro cp14_ldr_and_write Op2 cp14_reg0 skip_num
> >> +	adr	r3, 1f
> >> +	add	r3, r3, \skip_num, lsl #3
> >> +	bx	r3
> >> +1:
> >> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+15)]
> >> +	mcr	p14, 0, r2, c0, c15, \Op2
> >> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+14)]
> >> +	mcr	p14, 0, r2, c0, c14, \Op2
> >> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+13)]
> >> +	mcr	p14, 0, r2, c0, c13, \Op2
> >> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+12)]
> >> +	mcr	p14, 0, r2, c0, c12, \Op2
> >> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+11)]
> >> +	mcr	p14, 0, r2, c0, c11, \Op2
> >> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+10)]
> >> +	mcr	p14, 0, r2, c0, c10, \Op2
> >> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+9)]
> >> +	mcr	p14, 0, r2, c0, c9, \Op2
> >> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+8)]
> >> +	mcr	p14, 0, r2, c0, c8, \Op2
> >> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+7)]
> >> +	mcr	p14, 0, r2, c0, c7, \Op2
> >> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+6)]
> >> +	mcr	p14, 0, r2, c0, c6, \Op2
> >> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+5)]
> >> +	mcr	p14, 0, r2, c0, c5, \Op2
> >> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+4)]
> >> +	mcr	p14, 0, r2, c0, c4, \Op2
> >> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+3)]
> >> +	mcr	p14, 0, r2, c0, c3, \Op2
> >> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+2)]
> >> +	mcr	p14, 0, r2, c0, c2, \Op2
> >> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+1)]
> >> +	mcr	p14, 0, r2, c0, c1, \Op2
> >> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0)]
> >> +	mcr	p14, 0, r2, c0, c0, \Op2
> >> +.endm
> >
> >can you not find some way of unifying cp14_pop_and_write with
> >cp14_ldr_and_write and cp14_read_and_push with cp14_read_and_str ?
> >
> >Probably having two separate structs for the VFP state on the vcpu
> >struct
> >for both the guest and the host state is one possible way of doing so.
> >
> 
> OK, I will do it.
> Would you like me to rename the struct vfp_hard_struct, and add 
> host_cp14_state in there, or add a new struct host_cp14_state in the
> kvm_vcpu_arch?
> 

Not sure I understand the question exactly.  I would probably define
kvm_cpu_context_t as a new struct that includes the VFP state instead of
it being a typedef, but I haven't looked at it too carefully.


> >> +
> >> +/* Get extract number of BRPs and WRPs. Saved in r11/r12 */
> >> +.macro read_hw_dbg_num
> >> +	mrc	p14, 0, r2, c0, c0, 0
> >> +	ubfx	r11, r2, #24, #4
> >> +	add	r11, r11, #1		// Extract BRPs
> >> +	ubfx	r12, r2, #28, #4
> >> +	add	r12, r12, #1		// Extract WRPs
> >> +	mov	r2, #16
> >> +	sub	r11, r2, r11		// How many BPs to skip
> >> +	sub	r12, r2, r12		// How many WPs to skip
> >> +.endm
> >> +
> >> +/* Reads cp14 registers from hardware.
> >
> >You have a lot of multi-line comments in these patches which don't
> >start
> >with a separate '/*' line, as dictated by the Linux kernel coding
> >style.
> >So far, I've ignored this, but please fix all these throughout the
> >series when you respin.
> >
> >> + * Writes cp14 registers in-order to the stack.
> >> + *
> >> + * Assumes vcpu pointer in vcpu reg
> >> + *
> >> + * Clobbers r2-r12
> >> + */
> >> +.macro save_host_debug_regs
> >> +	read_hw_dbg_num
> >> +	cp14_read_and_push #4, r11	@ DBGBVR
> >> +	cp14_read_and_push #5, r11	@ DBGBCR
> >> +	cp14_read_and_push #6, r12	@ DBGWVR
> >> +	cp14_read_and_push #7, r12	@ DBGWCR
> >> +.endm
> >> +
> >> +/* Reads cp14 registers from hardware.
> >> + * Writes cp14 registers in-order to the VCPU struct pointed to by
> >vcpup.
> >> + *
> >> + * Assumes vcpu pointer in vcpu reg
> >> + *
> >> + * Clobbers r2-r12
> >> + */
> >> +.macro save_guest_debug_regs
> >> +	read_hw_dbg_num
> >> +	cp14_read_and_str #4, cp14_DBGBVR0, r11
> >
> >why do you need the has before the op2 field?
> 
> Sorry, I can't quite understand.
> 

heh, I meant hash, why is is '#4' instead of '4' ?

Thanks,
-Christoffer
diff mbox

Patch

diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S
index 79caf79..d626275 100644
--- a/arch/arm/kvm/interrupts.S
+++ b/arch/arm/kvm/interrupts.S
@@ -116,6 +116,12 @@  ENTRY(__kvm_vcpu_run)
 	read_cp15_state store_to_vcpu = 0
 	write_cp15_state read_from_vcpu = 1
 
+	@ Store hardware CP14 state and load guest state
+	compute_debug_state 1f
+	bl __save_host_debug_regs
+	bl __restore_guest_debug_regs
+
+1:
 	@ If the host kernel has not been configured with VFPv3 support,
 	@ then it is safer if we deny guests from using it as well.
 #ifdef CONFIG_VFPv3
@@ -201,6 +207,16 @@  after_vfp_restore:
 	mrc	p15, 0, r2, c0, c0, 5
 	mcr	p15, 4, r2, c0, c0, 5
 
+	@ Store guest CP14 state and restore host state
+	skip_debug_state 1f
+	bl __save_guest_debug_regs
+	bl __restore_host_debug_regs
+	/* Clear the dirty flag for the next run, as all the state has
+	 * already been saved. Note that we nuke the whole 32bit word.
+	 * If we ever add more flags, we'll have to be more careful...
+	 */
+	clear_debug_dirty_bit
+1:
 	@ Store guest CP15 state and restore host state
 	read_cp15_state store_to_vcpu = 1
 	write_cp15_state read_from_vcpu = 0
diff --git a/arch/arm/kvm/interrupts_head.S b/arch/arm/kvm/interrupts_head.S
index 5662c39..ed406be 100644
--- a/arch/arm/kvm/interrupts_head.S
+++ b/arch/arm/kvm/interrupts_head.S
@@ -7,6 +7,7 @@ 
 #define VCPU_USR_SP		(VCPU_USR_REG(13))
 #define VCPU_USR_LR		(VCPU_USR_REG(14))
 #define CP15_OFFSET(_cp15_reg_idx) (VCPU_CP15 + (_cp15_reg_idx * 4))
+#define CP14_OFFSET(_cp14_reg_idx) (VCPU_CP14 + ((_cp14_reg_idx) * 4))
 
 /*
  * Many of these macros need to access the VCPU structure, which is always
@@ -168,8 +169,7 @@  vcpu	.req	r0		@ vcpu pointer always in r0
  * Clobbers *all* registers.
  */
 .macro restore_guest_regs
-	/* reset DBGDSCR to disable debug mode */
-	mov	r2, #0
+	ldr	r2, [vcpu, #CP14_OFFSET(cp14_DBGDSCRext)]
 	mcr	p14, 0, r2, c0, c2, 2
 
 	restore_guest_regs_mode svc, #VCPU_SVC_REGS
@@ -250,6 +250,10 @@  vcpu	.req	r0		@ vcpu pointer always in r0
 	save_guest_regs_mode abt, #VCPU_ABT_REGS
 	save_guest_regs_mode und, #VCPU_UND_REGS
 	save_guest_regs_mode irq, #VCPU_IRQ_REGS
+
+	/* DBGDSCR reg */
+	mrc	p14, 0, r2, c0, c1, 0
+	str	r2, [vcpu, #CP14_OFFSET(cp14_DBGDSCRext)]
 .endm
 
 /* Reads cp15 registers from hardware and stores them in memory
@@ -449,6 +453,231 @@  vcpu	.req	r0		@ vcpu pointer always in r0
 	str	r5, [vcpu, #VCPU_DEBUG_FLAGS]
 .endm
 
+/* Assume r11/r12 in used, clobbers r2-r10 */
+.macro cp14_read_and_push Op2 skip_num
+	cmp	\skip_num, #8
+	// if (skip_num >= 8) then skip c8-c15 directly
+	bge	1f
+	adr	r2, 9998f
+	add	r2, r2, \skip_num, lsl #2
+	bx	r2
+1:
+	adr	r2, 9999f
+	sub	r3, \skip_num, #8
+	add	r2, r2, r3, lsl #2
+	bx	r2
+9998:
+	mrc	p14, 0, r10, c0, c15, \Op2
+	mrc	p14, 0, r9, c0, c14, \Op2
+	mrc	p14, 0, r8, c0, c13, \Op2
+	mrc	p14, 0, r7, c0, c12, \Op2
+	mrc	p14, 0, r6, c0, c11, \Op2
+	mrc	p14, 0, r5, c0, c10, \Op2
+	mrc	p14, 0, r4, c0, c9, \Op2
+	mrc	p14, 0, r3, c0, c8, \Op2
+	push	{r3-r10}
+9999:
+	mrc	p14, 0, r10, c0, c7, \Op2
+	mrc	p14, 0, r9, c0, c6, \Op2
+	mrc	p14, 0, r8, c0, c5, \Op2
+	mrc	p14, 0, r7, c0, c4, \Op2
+	mrc	p14, 0, r6, c0, c3, \Op2
+	mrc	p14, 0, r5, c0, c2, \Op2
+	mrc	p14, 0, r4, c0, c1, \Op2
+	mrc	p14, 0, r3, c0, c0, \Op2
+	push	{r3-r10}
+.endm
+
+/* Assume r11/r12 in used, clobbers r2-r10 */
+.macro cp14_pop_and_write Op2 skip_num
+	cmp	\skip_num, #8
+	// if (skip_num >= 8) then skip c8-c15 directly
+	bge	1f
+	adr	r2, 9998f
+	add	r2, r2, \skip_num, lsl #2
+	pop	{r3-r10}
+	bx	r2
+1:
+	adr	r2, 9999f
+	sub	r3, \skip_num, #8
+	add	r2, r2, r3, lsl #2
+	pop	{r3-r10}
+	bx	r2
+
+9998:
+	mcr	p14, 0, r10, c0, c15, \Op2
+	mcr	p14, 0, r9, c0, c14, \Op2
+	mcr	p14, 0, r8, c0, c13, \Op2
+	mcr	p14, 0, r7, c0, c12, \Op2
+	mcr	p14, 0, r6, c0, c11, \Op2
+	mcr	p14, 0, r5, c0, c10, \Op2
+	mcr	p14, 0, r4, c0, c9, \Op2
+	mcr	p14, 0, r3, c0, c8, \Op2
+
+	pop	{r3-r10}
+9999:
+	mcr	p14, 0, r10, c0, c7, \Op2
+	mcr	p14, 0, r9, c0, c6, \Op2
+	mcr	p14, 0, r8, c0, c5, \Op2
+	mcr	p14, 0, r7, c0, c4, \Op2
+	mcr	p14, 0, r6, c0, c3, \Op2
+	mcr	p14, 0, r5, c0, c2, \Op2
+	mcr	p14, 0, r4, c0, c1, \Op2
+	mcr	p14, 0, r3, c0, c0, \Op2
+.endm
+
+/* Assume r11/r12 in used, clobbers r2-r3 */
+.macro cp14_read_and_str Op2 cp14_reg0 skip_num
+	adr	r3, 1f
+	add	r3, r3, \skip_num, lsl #3
+	bx	r3
+1:
+	mrc	p14, 0, r2, c0, c15, \Op2
+	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+15)]
+	mrc	p14, 0, r2, c0, c14, \Op2
+	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+14)]
+	mrc	p14, 0, r2, c0, c13, \Op2
+	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+13)]
+	mrc	p14, 0, r2, c0, c12, \Op2
+	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+12)]
+	mrc	p14, 0, r2, c0, c11, \Op2
+	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+11)]
+	mrc	p14, 0, r2, c0, c10, \Op2
+	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+10)]
+	mrc	p14, 0, r2, c0, c9, \Op2
+	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+9)]
+	mrc	p14, 0, r2, c0, c8, \Op2
+	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+8)]
+	mrc	p14, 0, r2, c0, c7, \Op2
+	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+7)]
+	mrc	p14, 0, r2, c0, c6, \Op2
+	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+6)]
+	mrc	p14, 0, r2, c0, c5, \Op2
+	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+5)]
+	mrc	p14, 0, r2, c0, c4, \Op2
+	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+4)]
+	mrc	p14, 0, r2, c0, c3, \Op2
+	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+3)]
+	mrc	p14, 0, r2, c0, c2, \Op2
+	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+2)]
+	mrc	p14, 0, r2, c0, c1, \Op2
+	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+1)]
+	mrc	p14, 0, r2, c0, c0, \Op2
+	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0)]
+.endm
+
+/* Assume r11/r12 in used, clobbers r2-r3 */
+.macro cp14_ldr_and_write Op2 cp14_reg0 skip_num
+	adr	r3, 1f
+	add	r3, r3, \skip_num, lsl #3
+	bx	r3
+1:
+	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+15)]
+	mcr	p14, 0, r2, c0, c15, \Op2
+	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+14)]
+	mcr	p14, 0, r2, c0, c14, \Op2
+	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+13)]
+	mcr	p14, 0, r2, c0, c13, \Op2
+	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+12)]
+	mcr	p14, 0, r2, c0, c12, \Op2
+	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+11)]
+	mcr	p14, 0, r2, c0, c11, \Op2
+	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+10)]
+	mcr	p14, 0, r2, c0, c10, \Op2
+	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+9)]
+	mcr	p14, 0, r2, c0, c9, \Op2
+	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+8)]
+	mcr	p14, 0, r2, c0, c8, \Op2
+	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+7)]
+	mcr	p14, 0, r2, c0, c7, \Op2
+	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+6)]
+	mcr	p14, 0, r2, c0, c6, \Op2
+	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+5)]
+	mcr	p14, 0, r2, c0, c5, \Op2
+	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+4)]
+	mcr	p14, 0, r2, c0, c4, \Op2
+	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+3)]
+	mcr	p14, 0, r2, c0, c3, \Op2
+	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+2)]
+	mcr	p14, 0, r2, c0, c2, \Op2
+	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+1)]
+	mcr	p14, 0, r2, c0, c1, \Op2
+	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0)]
+	mcr	p14, 0, r2, c0, c0, \Op2
+.endm
+
+/* Get extract number of BRPs and WRPs. Saved in r11/r12 */
+.macro read_hw_dbg_num
+	mrc	p14, 0, r2, c0, c0, 0
+	ubfx	r11, r2, #24, #4
+	add	r11, r11, #1		// Extract BRPs
+	ubfx	r12, r2, #28, #4
+	add	r12, r12, #1		// Extract WRPs
+	mov	r2, #16
+	sub	r11, r2, r11		// How many BPs to skip
+	sub	r12, r2, r12		// How many WPs to skip
+.endm
+
+/* Reads cp14 registers from hardware.
+ * Writes cp14 registers in-order to the stack.
+ *
+ * Assumes vcpu pointer in vcpu reg
+ *
+ * Clobbers r2-r12
+ */
+.macro save_host_debug_regs
+	read_hw_dbg_num
+	cp14_read_and_push #4, r11	@ DBGBVR
+	cp14_read_and_push #5, r11	@ DBGBCR
+	cp14_read_and_push #6, r12	@ DBGWVR
+	cp14_read_and_push #7, r12	@ DBGWCR
+.endm
+
+/* Reads cp14 registers from hardware.
+ * Writes cp14 registers in-order to the VCPU struct pointed to by vcpup.
+ *
+ * Assumes vcpu pointer in vcpu reg
+ *
+ * Clobbers r2-r12
+ */
+.macro save_guest_debug_regs
+	read_hw_dbg_num
+	cp14_read_and_str #4, cp14_DBGBVR0, r11
+	cp14_read_and_str #5, cp14_DBGBCR0, r11
+	cp14_read_and_str #6, cp14_DBGWVR0, r12
+	cp14_read_and_str #7, cp14_DBGWCR0, r12
+.endm
+
+/* Reads cp14 registers in-order from the stack.
+ * Writes cp14 registers to hardware.
+ *
+ * Assumes vcpu pointer in vcpu reg
+ *
+ * Clobbers r2-r12
+ */
+.macro restore_host_debug_regs
+	read_hw_dbg_num
+	cp14_pop_and_write #4, r11	@ DBGBVR
+	cp14_pop_and_write #5, r11	@ DBGBCR
+	cp14_pop_and_write #6, r12	@ DBGWVR
+	cp14_pop_and_write #7, r12	@ DBGWCR
+.endm
+
+/* Reads cp14 registers in-order from the VCPU struct pointed to by vcpup
+ * Writes cp14 registers to hardware.
+ *
+ * Assumes vcpu pointer in vcpu reg
+ *
+ * Clobbers r2-r12
+ */
+.macro restore_guest_debug_regs
+	read_hw_dbg_num
+	cp14_ldr_and_write #4, cp14_DBGBVR0, r11
+	cp14_ldr_and_write #5, cp14_DBGBCR0, r11
+	cp14_ldr_and_write #6, cp14_DBGWVR0, r12
+	cp14_ldr_and_write #7, cp14_DBGWCR0, r12
+.endm
+
 /*
  * Save the VGIC CPU state into memory
  *
@@ -684,3 +913,19 @@  ARM_BE8(rev	r6, r6  )
 .macro load_vcpu
 	mrc	p15, 4, vcpu, c13, c0, 2	@ HTPIDR
 .endm
+
+__save_host_debug_regs:
+	save_host_debug_regs
+	bx	lr
+
+__save_guest_debug_regs:
+	save_guest_debug_regs
+	bx	lr
+
+__restore_host_debug_regs:
+	restore_host_debug_regs
+	bx	lr
+
+__restore_guest_debug_regs:
+	restore_guest_debug_regs
+	bx	lr