[Xen-devel,RFC,24/49] ARM: new VGIC: Add IRQ sync/flush framework

Message ID 20180209143937.28866-25-andre.przywara@linaro.org
State New
Headers show
Series
  • New VGIC(-v2) implementation
Related show

Commit Message

Andre Przywara Feb. 9, 2018, 2:39 p.m.
Implement the framework for syncing IRQs between our emulation and the
list registers, which represent the guest's view of IRQs.
This is done in kvm_vgic_flush_hwstate and kvm_vgic_sync_hwstate, which
gets called on guest entry and exit.
The code talking to the actual GICv2/v3 hardware is added in the
following patches.

This is based on Linux commit 0919e84c0fc1, written by Marc Zyngier.

Signed-off-by: Andre Przywara <andre.przywara@linaro.org>
---
 xen/arch/arm/vgic/vgic.c | 246 +++++++++++++++++++++++++++++++++++++++++++++++
 xen/arch/arm/vgic/vgic.h |   2 +
 2 files changed, 248 insertions(+)

Comments

Julien Grall Feb. 13, 2018, 12:41 p.m. | #1
Hi Andre,

On 09/02/18 14:39, Andre Przywara wrote:
> Implement the framework for syncing IRQs between our emulation and the
> list registers, which represent the guest's view of IRQs.
> This is done in kvm_vgic_flush_hwstate and kvm_vgic_sync_hwstate, which

You probably want to update the names here.

> gets called on guest entry and exit.
> The code talking to the actual GICv2/v3 hardware is added in the
> following patches.
> 
> This is based on Linux commit 0919e84c0fc1, written by Marc Zyngier.
> 
> Signed-off-by: Andre Przywara <andre.przywara@linaro.org>
> ---
>   xen/arch/arm/vgic/vgic.c | 246 +++++++++++++++++++++++++++++++++++++++++++++++
>   xen/arch/arm/vgic/vgic.h |   2 +
>   2 files changed, 248 insertions(+)
> 
> diff --git a/xen/arch/arm/vgic/vgic.c b/xen/arch/arm/vgic/vgic.c
> index a4efd1fd03..a1f77130d4 100644
> --- a/xen/arch/arm/vgic/vgic.c
> +++ b/xen/arch/arm/vgic/vgic.c
> @@ -380,6 +380,252 @@ int vgic_inject_irq(struct domain *d, struct vcpu *vcpu, unsigned int intid,
>       return 0;
>   }
>   
> +/**
> + * vgic_prune_ap_list - Remove non-relevant interrupts from the list
> + *
> + * @vcpu: The VCPU pointer
> + *
> + * Go over the list of "interesting" interrupts, and prune those that we
> + * won't have to consider in the near future.
> + */
> +static void vgic_prune_ap_list(struct vcpu *vcpu)
> +{
> +    struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
> +    struct vgic_irq *irq, *tmp;
> +    unsigned long flags;
> +
> +retry:
> +    spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags);
> +
> +    list_for_each_entry_safe( irq, tmp, &vgic_cpu->ap_list_head, ap_list )

See my comment on patch #22, this is where I am worry about going 
through the list every time we enter to the hypervisor from the guest.

> +    {
> +        struct vcpu *target_vcpu, *vcpuA, *vcpuB;
> +
> +        spin_lock(&irq->irq_lock);
> +
> +        BUG_ON(vcpu != irq->vcpu);
> +
> +        target_vcpu = vgic_target_oracle(irq);
> +
> +        if ( !target_vcpu )
> +        {
> +            /*
> +             * We don't need to process this interrupt any
> +             * further, move it off the list.
> +             */
> +            list_del(&irq->ap_list);
> +            irq->vcpu = NULL;
> +            spin_unlock(&irq->irq_lock);
> +
> +            /*
> +             * This vgic_put_irq call matches the
> +             * vgic_get_irq_kref in vgic_queue_irq_unlock,
> +             * where we added the LPI to the ap_list. As
> +             * we remove the irq from the list, we drop
> +             * also drop the refcount.
> +             */
> +            vgic_put_irq(vcpu->domain, irq);
> +            continue;
> +        }
> +
> +        if ( target_vcpu == vcpu )
> +        {
> +            /* We're on the right CPU */
> +            spin_unlock(&irq->irq_lock);
> +            continue;
> +        }
> +
> +        /* This interrupt looks like it has to be migrated. */
> +
> +        spin_unlock(&irq->irq_lock);
> +        spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags);
> +
> +        /*
> +         * Ensure locking order by always locking the smallest
> +         * ID first.
> +         */
> +        if ( vcpu->vcpu_id < target_vcpu->vcpu_id )
> +        {
> +            vcpuA = vcpu;
> +            vcpuB = target_vcpu;
> +        }
> +        else
> +        {
> +            vcpuA = target_vcpu;
> +            vcpuB = vcpu;
> +        }
> +
> +        spin_lock_irqsave(&vcpuA->arch.vgic_cpu.ap_list_lock, flags);
> +        spin_lock(&vcpuB->arch.vgic_cpu.ap_list_lock);
> +        spin_lock(&irq->irq_lock);
> +
> +        /*
> +         * If the affinity has been preserved, move the
> +         * interrupt around. Otherwise, it means things have
> +         * changed while the interrupt was unlocked, and we
> +         * need to replay this.
> +         *
> +         * In all cases, we cannot trust the list not to have
> +         * changed, so we restart from the beginning.
> +         */
> +        if ( target_vcpu == vgic_target_oracle(irq) )
> +        {
> +            struct vgic_cpu *new_cpu = &target_vcpu->arch.vgic_cpu;
> +
> +            list_del(&irq->ap_list);
> +            irq->vcpu = target_vcpu;
> +            list_add_tail(&irq->ap_list, &new_cpu->ap_list_head);
> +        }
> +
> +        spin_unlock(&irq->irq_lock);
> +        spin_unlock(&vcpuB->arch.vgic_cpu.ap_list_lock);
> +        spin_unlock_irqrestore(&vcpuA->arch.vgic_cpu.ap_list_lock, flags);
> +        goto retry;
> +    }
> +
> +    spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags);
> +}
> +
> +static inline void vgic_fold_lr_state(struct vcpu *vcpu)
> +{
> +}
> +
> +/* Requires the irq_lock to be held. */
> +static inline void vgic_populate_lr(struct vcpu *vcpu,
> +                                    struct vgic_irq *irq, int lr)
> +{
> +    ASSERT(spin_is_locked(&irq->irq_lock));
> +}
> +
> +static inline void vgic_clear_lr(struct vcpu *vcpu, int lr)
> +{
> +}
> +
> +static inline void vgic_set_underflow(struct vcpu *vcpu)
> +{
> +}
> +
> +/* Requires the ap_list_lock to be held. */
> +static int compute_ap_list_depth(struct vcpu *vcpu)
> +{
> +    struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
> +    struct vgic_irq *irq;
> +    int count = 0;
> +
> +    ASSERT(spin_is_locked(&vgic_cpu->ap_list_lock));
> +
> +    list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list)

Here another example.

> +    {
> +        spin_lock(&irq->irq_lock);
> +        /* GICv2 SGIs can count for more than one... */
> +        if ( vgic_irq_is_sgi(irq->intid) && irq->source )
> +            count += hweight8(irq->source);
> +        else
> +            count++;
> +        spin_unlock(&irq->irq_lock);
> +    }
> +    return count;
> +}
> +
> +/* Requires the VCPU's ap_list_lock to be held. */
> +static void vgic_flush_lr_state(struct vcpu *vcpu)
> +{
> +    struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
> +    struct vgic_irq *irq;
> +    int count = 0;
> +
> +    ASSERT(spin_is_locked(&vgic_cpu->ap_list_lock));
> +
> +    if ( compute_ap_list_depth(vcpu) > gic_get_nr_lrs() )
> +        vgic_sort_ap_list(vcpu);
> +
> +    list_for_each_entry( irq, &vgic_cpu->ap_list_head, ap_list )
> +    {
> +        spin_lock(&irq->irq_lock);
> +
> +        if ( unlikely(vgic_target_oracle(irq) != vcpu) )
> +            goto next;
> +
> +        /*
> +         * If we get an SGI with multiple sources, try to get
> +         * them in all at once.
> +         */
> +        do
> +        {
> +            vgic_populate_lr(vcpu, irq, count++);
> +        } while ( irq->source && count < gic_get_nr_lrs() );
> +
> +next:
> +        spin_unlock(&irq->irq_lock);
> +
> +        if ( count == gic_get_nr_lrs() )
> +        {
> +            if ( !list_is_last(&irq->ap_list, &vgic_cpu->ap_list_head) )
> +                vgic_set_underflow(vcpu);
> +            break;
> +        }
> +    }
> +
> +    vcpu->arch.vgic_cpu.used_lrs = count;
> +
> +    /* Nuke remaining LRs */
> +    for ( ; count < gic_get_nr_lrs(); count++)
> +        vgic_clear_lr(vcpu, count);
> +}
> +
> +/*
> + * gic_clear_lrs() - Update the VGIC state from hardware after a guest's run.
> + * @vcpu: the VCPU.
> + *
> + * Sync back the hardware VGIC state after the guest has run, into our
> + * VGIC emulation structures, It reads the LRs and updates the respective
> + * struct vgic_irq, taking level/edge into account.
> + * This is the high level function which takes care of the conditions,
> + * also bails out early if there were no interrupts queued.
> + * Was: kvm_vgic_sync_hwstate()
> + */
> +void gic_clear_lrs(struct vcpu *vcpu)

I think I would prefer if we stick with the KVM name.

> +{
> +    struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
> +
> +    /* An empty ap_list_head implies used_lrs == 0 */
> +    if ( list_empty(&vcpu->arch.vgic_cpu.ap_list_head) )
> +        return;
> +
> +    if ( vgic_cpu->used_lrs )
> +        vgic_fold_lr_state(vcpu);
> +    vgic_prune_ap_list(vcpu);
> +}
> +
> +/*
> + * gic_inject() - flush the emulation state into the hardware on guest entry
> + *
> + * Before we enter a guest, we have to translate the virtual GIC state of a
> + * VCPU into the GIC virtualization hardware registers, namely the LRs.
> + * This is the high level function which takes care about the conditions
> + * and the locking, also bails out early if there are no interrupts queued.
> + * Was: kvm_vgic_flush_hwstate()

Ditto.

> + */
> +void gic_inject(void)
> +{
> +    /*
> +     * If there are no virtual interrupts active or pending for this
> +     * VCPU, then there is no work to do and we can bail out without
> +     * taking any lock.  There is a potential race with someone injecting
> +     * interrupts to the VCPU, but it is a benign race as the VCPU will
> +     * either observe the new interrupt before or after doing this check,
> +     * and introducing additional synchronization mechanism doesn't change
> +     * this.
> +     */
> +    if ( list_empty(&current->arch.vgic_cpu.ap_list_head) )
> +        return;
> +
> +    ASSERT(!local_irq_is_enabled());
> +
> +    spin_lock(&current->arch.vgic_cpu.ap_list_lock);
> +    vgic_flush_lr_state(current);
> +    spin_unlock(&current->arch.vgic_cpu.ap_list_lock);
> +}
>   /*
>    * Local variables:
>    * mode: C
> diff --git a/xen/arch/arm/vgic/vgic.h b/xen/arch/arm/vgic/vgic.h
> index 5127739f0f..47fc58b81e 100644
> --- a/xen/arch/arm/vgic/vgic.h
> +++ b/xen/arch/arm/vgic/vgic.h
> @@ -17,6 +17,8 @@
>   #ifndef __XEN_ARM_VGIC_NEW_H__
>   #define __XEN_ARM_VGIC_NEW_H__
>   
> +#define vgic_irq_is_sgi(intid) ((intid) < VGIC_NR_SGIS)
> +
>   static inline bool irq_is_pending(struct vgic_irq *irq)
>   {
>       if ( irq->config == VGIC_CONFIG_EDGE )
> 

Cheers,
Julien Grall Feb. 13, 2018, 2:31 p.m. | #2
Hi Andre,

On 09/02/18 14:39, Andre Przywara wrote:
> +/* Requires the VCPU's ap_list_lock to be held. */
> +static void vgic_flush_lr_state(struct vcpu *vcpu)
> +{
> +    struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
> +    struct vgic_irq *irq;
> +    int count = 0;
> +
> +    ASSERT(spin_is_locked(&vgic_cpu->ap_list_lock));
> +
> +    if ( compute_ap_list_depth(vcpu) > gic_get_nr_lrs() )
> +        vgic_sort_ap_list(vcpu);
> +
> +    list_for_each_entry( irq, &vgic_cpu->ap_list_head, ap_list )
> +    {
> +        spin_lock(&irq->irq_lock);
> +
> +        if ( unlikely(vgic_target_oracle(irq) != vcpu) )
> +            goto next;
> +
> +        /*
> +         * If we get an SGI with multiple sources, try to get
> +         * them in all at once.
> +         */
> +        do
> +        {
> +            vgic_populate_lr(vcpu, irq, count++);
> +        } while ( irq->source && count < gic_get_nr_lrs() );
> +
> +next:
> +        spin_unlock(&irq->irq_lock);
> +
> +        if ( count == gic_get_nr_lrs() )
> +        {
> +            if ( !list_is_last(&irq->ap_list, &vgic_cpu->ap_list_head) )
> +                vgic_set_underflow(vcpu);
> +            break;
> +        }
> +    }
> +
> +    vcpu->arch.vgic_cpu.used_lrs = count;
> +
> +    /* Nuke remaining LRs */
> +    for ( ; count < gic_get_nr_lrs(); count++)
> +        vgic_clear_lr(vcpu, count);

Why do you need to nuke the LRs here, don't you always zero them when 
clearing it?

Cheers,
Andre Przywara Feb. 13, 2018, 2:56 p.m. | #3
Hi,

On 13/02/18 14:31, Julien Grall wrote:
> Hi Andre,
> 
> On 09/02/18 14:39, Andre Przywara wrote:
>> +/* Requires the VCPU's ap_list_lock to be held. */
>> +static void vgic_flush_lr_state(struct vcpu *vcpu)
>> +{
>> +    struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
>> +    struct vgic_irq *irq;
>> +    int count = 0;
>> +
>> +    ASSERT(spin_is_locked(&vgic_cpu->ap_list_lock));
>> +
>> +    if ( compute_ap_list_depth(vcpu) > gic_get_nr_lrs() )
>> +        vgic_sort_ap_list(vcpu);
>> +
>> +    list_for_each_entry( irq, &vgic_cpu->ap_list_head, ap_list )
>> +    {
>> +        spin_lock(&irq->irq_lock);
>> +
>> +        if ( unlikely(vgic_target_oracle(irq) != vcpu) )
>> +            goto next;
>> +
>> +        /*
>> +         * If we get an SGI with multiple sources, try to get
>> +         * them in all at once.
>> +         */
>> +        do
>> +        {
>> +            vgic_populate_lr(vcpu, irq, count++);
>> +        } while ( irq->source && count < gic_get_nr_lrs() );
>> +
>> +next:
>> +        spin_unlock(&irq->irq_lock);
>> +
>> +        if ( count == gic_get_nr_lrs() )
>> +        {
>> +            if ( !list_is_last(&irq->ap_list, &vgic_cpu->ap_list_head) )
>> +                vgic_set_underflow(vcpu);
>> +            break;
>> +        }
>> +    }
>> +
>> +    vcpu->arch.vgic_cpu.used_lrs = count;
>> +
>> +    /* Nuke remaining LRs */
>> +    for ( ; count < gic_get_nr_lrs(); count++)
>> +        vgic_clear_lr(vcpu, count);
> 
> Why do you need to nuke the LRs here, don't you always zero them when
> clearing it?

We nuke our internal LR copies in here.
It might be interesting to see if we can get rid of those in Xen,
because we can always write to the LRs directly. But this is an
optimization I am not too keen on addressing too early, because this
deviates from the KVM VGIC architecture.

Cheers,
Andre.
Julien Grall Feb. 13, 2018, 3:01 p.m. | #4
On 13/02/18 14:56, Andre Przywara wrote:
> Hi,
> 
> On 13/02/18 14:31, Julien Grall wrote:
>> Hi Andre,
>>
>> On 09/02/18 14:39, Andre Przywara wrote:
>>> +/* Requires the VCPU's ap_list_lock to be held. */
>>> +static void vgic_flush_lr_state(struct vcpu *vcpu)
>>> +{
>>> +    struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
>>> +    struct vgic_irq *irq;
>>> +    int count = 0;
>>> +
>>> +    ASSERT(spin_is_locked(&vgic_cpu->ap_list_lock));
>>> +
>>> +    if ( compute_ap_list_depth(vcpu) > gic_get_nr_lrs() )
>>> +        vgic_sort_ap_list(vcpu);
>>> +
>>> +    list_for_each_entry( irq, &vgic_cpu->ap_list_head, ap_list )
>>> +    {
>>> +        spin_lock(&irq->irq_lock);
>>> +
>>> +        if ( unlikely(vgic_target_oracle(irq) != vcpu) )
>>> +            goto next;
>>> +
>>> +        /*
>>> +         * If we get an SGI with multiple sources, try to get
>>> +         * them in all at once.
>>> +         */
>>> +        do
>>> +        {
>>> +            vgic_populate_lr(vcpu, irq, count++);
>>> +        } while ( irq->source && count < gic_get_nr_lrs() );
>>> +
>>> +next:
>>> +        spin_unlock(&irq->irq_lock);
>>> +
>>> +        if ( count == gic_get_nr_lrs() )
>>> +        {
>>> +            if ( !list_is_last(&irq->ap_list, &vgic_cpu->ap_list_head) )
>>> +                vgic_set_underflow(vcpu);
>>> +            break;
>>> +        }
>>> +    }
>>> +
>>> +    vcpu->arch.vgic_cpu.used_lrs = count;
>>> +
>>> +    /* Nuke remaining LRs */
>>> +    for ( ; count < gic_get_nr_lrs(); count++)
>>> +        vgic_clear_lr(vcpu, count);
>>
>> Why do you need to nuke the LRs here, don't you always zero them when
>> clearing it?
> 
> We nuke our internal LR copies in here.
> It might be interesting to see if we can get rid of those in Xen,
> because we can always write to the LRs directly. But this is an
> optimization I am not too keen on addressing too early, because this
> deviates from the KVM VGIC architecture.

Oh, I thought you were writing back in the hardware when clearing. My 
mistake.

Cheers,
Andre Przywara Feb. 13, 2018, 3:40 p.m. | #5
Hi,

On 13/02/18 12:41, Julien Grall wrote:
> Hi Andre,
> 
> On 09/02/18 14:39, Andre Przywara wrote:
>> Implement the framework for syncing IRQs between our emulation and the
>> list registers, which represent the guest's view of IRQs.
>> This is done in kvm_vgic_flush_hwstate and kvm_vgic_sync_hwstate, which
> 
> You probably want to update the names here.

Sure.

>> gets called on guest entry and exit.
>> The code talking to the actual GICv2/v3 hardware is added in the
>> following patches.
>>
>> This is based on Linux commit 0919e84c0fc1, written by Marc Zyngier.
>>
>> Signed-off-by: Andre Przywara <andre.przywara@linaro.org>
>> ---
>>   xen/arch/arm/vgic/vgic.c | 246
>> +++++++++++++++++++++++++++++++++++++++++++++++
>>   xen/arch/arm/vgic/vgic.h |   2 +
>>   2 files changed, 248 insertions(+)
>>
>> diff --git a/xen/arch/arm/vgic/vgic.c b/xen/arch/arm/vgic/vgic.c
>> index a4efd1fd03..a1f77130d4 100644
>> --- a/xen/arch/arm/vgic/vgic.c
>> +++ b/xen/arch/arm/vgic/vgic.c
>> @@ -380,6 +380,252 @@ int vgic_inject_irq(struct domain *d, struct
>> vcpu *vcpu, unsigned int intid,
>>       return 0;
>>   }
>>   +/**
>> + * vgic_prune_ap_list - Remove non-relevant interrupts from the list
>> + *
>> + * @vcpu: The VCPU pointer
>> + *
>> + * Go over the list of "interesting" interrupts, and prune those that we
>> + * won't have to consider in the near future.
>> + */
>> +static void vgic_prune_ap_list(struct vcpu *vcpu)
>> +{
>> +    struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
>> +    struct vgic_irq *irq, *tmp;
>> +    unsigned long flags;
>> +
>> +retry:
>> +    spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags);
>> +
>> +    list_for_each_entry_safe( irq, tmp, &vgic_cpu->ap_list_head,
>> ap_list )
> 
> See my comment on patch #22, this is where I am worry about going
> through the list every time we enter to the hypervisor from the guest.

I am not sure we can avoid this here, as this function is crucial to the
VGIC state machine.
We might later look into if we can avoid iterating through the whole
list or if we can shortcut some interrupts somehow, but I really would
be careful tinkering with this function too much.

>> +    {
>> +        struct vcpu *target_vcpu, *vcpuA, *vcpuB;
>> +
>> +        spin_lock(&irq->irq_lock);
>> +
>> +        BUG_ON(vcpu != irq->vcpu);
>> +
>> +        target_vcpu = vgic_target_oracle(irq);
>> +
>> +        if ( !target_vcpu )
>> +        {
>> +            /*
>> +             * We don't need to process this interrupt any
>> +             * further, move it off the list.
>> +             */
>> +            list_del(&irq->ap_list);
>> +            irq->vcpu = NULL;
>> +            spin_unlock(&irq->irq_lock);
>> +
>> +            /*
>> +             * This vgic_put_irq call matches the
>> +             * vgic_get_irq_kref in vgic_queue_irq_unlock,
>> +             * where we added the LPI to the ap_list. As
>> +             * we remove the irq from the list, we drop
>> +             * also drop the refcount.
>> +             */
>> +            vgic_put_irq(vcpu->domain, irq);
>> +            continue;
>> +        }
>> +
>> +        if ( target_vcpu == vcpu )
>> +        {
>> +            /* We're on the right CPU */
>> +            spin_unlock(&irq->irq_lock);
>> +            continue;
>> +        }
>> +
>> +        /* This interrupt looks like it has to be migrated. */
>> +
>> +        spin_unlock(&irq->irq_lock);
>> +        spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags);
>> +
>> +        /*
>> +         * Ensure locking order by always locking the smallest
>> +         * ID first.
>> +         */
>> +        if ( vcpu->vcpu_id < target_vcpu->vcpu_id )
>> +        {
>> +            vcpuA = vcpu;
>> +            vcpuB = target_vcpu;
>> +        }
>> +        else
>> +        {
>> +            vcpuA = target_vcpu;
>> +            vcpuB = vcpu;
>> +        }
>> +
>> +        spin_lock_irqsave(&vcpuA->arch.vgic_cpu.ap_list_lock, flags);
>> +        spin_lock(&vcpuB->arch.vgic_cpu.ap_list_lock);
>> +        spin_lock(&irq->irq_lock);
>> +
>> +        /*
>> +         * If the affinity has been preserved, move the
>> +         * interrupt around. Otherwise, it means things have
>> +         * changed while the interrupt was unlocked, and we
>> +         * need to replay this.
>> +         *
>> +         * In all cases, we cannot trust the list not to have
>> +         * changed, so we restart from the beginning.
>> +         */
>> +        if ( target_vcpu == vgic_target_oracle(irq) )
>> +        {
>> +            struct vgic_cpu *new_cpu = &target_vcpu->arch.vgic_cpu;
>> +
>> +            list_del(&irq->ap_list);
>> +            irq->vcpu = target_vcpu;
>> +            list_add_tail(&irq->ap_list, &new_cpu->ap_list_head);
>> +        }
>> +
>> +        spin_unlock(&irq->irq_lock);
>> +        spin_unlock(&vcpuB->arch.vgic_cpu.ap_list_lock);
>> +        spin_unlock_irqrestore(&vcpuA->arch.vgic_cpu.ap_list_lock,
>> flags);
>> +        goto retry;
>> +    }
>> +
>> +    spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags);
>> +}
>> +
>> +static inline void vgic_fold_lr_state(struct vcpu *vcpu)
>> +{
>> +}
>> +
>> +/* Requires the irq_lock to be held. */
>> +static inline void vgic_populate_lr(struct vcpu *vcpu,
>> +                                    struct vgic_irq *irq, int lr)
>> +{
>> +    ASSERT(spin_is_locked(&irq->irq_lock));
>> +}
>> +
>> +static inline void vgic_clear_lr(struct vcpu *vcpu, int lr)
>> +{
>> +}
>> +
>> +static inline void vgic_set_underflow(struct vcpu *vcpu)
>> +{
>> +}
>> +
>> +/* Requires the ap_list_lock to be held. */
>> +static int compute_ap_list_depth(struct vcpu *vcpu)
>> +{
>> +    struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
>> +    struct vgic_irq *irq;
>> +    int count = 0;
>> +
>> +    ASSERT(spin_is_locked(&vgic_cpu->ap_list_lock));
>> +
>> +    list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list)
> 
> Here another example.

This can be short cut, indeed. The only place we call this function is
where we compare it against the number of LRs below.
So we don't need to know the actual number of elements, but could bail
out once we reached the number of LRs.
This function here would then return a bool, comparing internally
against the number of LRs already.

>> +    {
>> +        spin_lock(&irq->irq_lock);
>> +        /* GICv2 SGIs can count for more than one... */
>> +        if ( vgic_irq_is_sgi(irq->intid) && irq->source )
>> +            count += hweight8(irq->source);
>> +        else
>> +            count++;
>> +        spin_unlock(&irq->irq_lock);
>> +    }
>> +    return count;
>> +}
>> +
>> +/* Requires the VCPU's ap_list_lock to be held. */
>> +static void vgic_flush_lr_state(struct vcpu *vcpu)
>> +{
>> +    struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
>> +    struct vgic_irq *irq;
>> +    int count = 0;
>> +
>> +    ASSERT(spin_is_locked(&vgic_cpu->ap_list_lock));
>> +
>> +    if ( compute_ap_list_depth(vcpu) > gic_get_nr_lrs() )
>> +        vgic_sort_ap_list(vcpu);
>> +
>> +    list_for_each_entry( irq, &vgic_cpu->ap_list_head, ap_list )
>> +    {
>> +        spin_lock(&irq->irq_lock);
>> +
>> +        if ( unlikely(vgic_target_oracle(irq) != vcpu) )
>> +            goto next;
>> +
>> +        /*
>> +         * If we get an SGI with multiple sources, try to get
>> +         * them in all at once.
>> +         */
>> +        do
>> +        {
>> +            vgic_populate_lr(vcpu, irq, count++);
>> +        } while ( irq->source && count < gic_get_nr_lrs() );
>> +
>> +next:
>> +        spin_unlock(&irq->irq_lock);
>> +
>> +        if ( count == gic_get_nr_lrs() )
>> +        {
>> +            if ( !list_is_last(&irq->ap_list, &vgic_cpu->ap_list_head) )
>> +                vgic_set_underflow(vcpu);
>> +            break;
>> +        }
>> +    }
>> +
>> +    vcpu->arch.vgic_cpu.used_lrs = count;
>> +
>> +    /* Nuke remaining LRs */
>> +    for ( ; count < gic_get_nr_lrs(); count++)
>> +        vgic_clear_lr(vcpu, count);
>> +}
>> +
>> +/*
>> + * gic_clear_lrs() - Update the VGIC state from hardware after a
>> guest's run.
>> + * @vcpu: the VCPU.
>> + *
>> + * Sync back the hardware VGIC state after the guest has run, into our
>> + * VGIC emulation structures, It reads the LRs and updates the
>> respective
>> + * struct vgic_irq, taking level/edge into account.
>> + * This is the high level function which takes care of the conditions,
>> + * also bails out early if there were no interrupts queued.
>> + * Was: kvm_vgic_sync_hwstate()
>> + */
>> +void gic_clear_lrs(struct vcpu *vcpu)
> 
> I think I would prefer if we stick with the KVM name.

Yes, please! ;-)
I found those names always confusing, especially gic_inject(). To be
honest I never know which is which in the KVM case as well (sync vs.
flush), so I was wondering if we call both the entry and the exit
handler "sync", but denote the direction, like:
vgic_sync_from_lrs() and vgic_sync_to_lrs().

Cheers,
Andre.

>> +{
>> +    struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
>> +
>> +    /* An empty ap_list_head implies used_lrs == 0 */
>> +    if ( list_empty(&vcpu->arch.vgic_cpu.ap_list_head) )
>> +        return;
>> +
>> +    if ( vgic_cpu->used_lrs )
>> +        vgic_fold_lr_state(vcpu);
>> +    vgic_prune_ap_list(vcpu);
>> +}
>> +
>> +/*
>> + * gic_inject() - flush the emulation state into the hardware on
>> guest entry
>> + *
>> + * Before we enter a guest, we have to translate the virtual GIC
>> state of a
>> + * VCPU into the GIC virtualization hardware registers, namely the LRs.
>> + * This is the high level function which takes care about the conditions
>> + * and the locking, also bails out early if there are no interrupts
>> queued.
>> + * Was: kvm_vgic_flush_hwstate()
> 
> Ditto.
> 
>> + */
>> +void gic_inject(void)
>> +{
>> +    /*
>> +     * If there are no virtual interrupts active or pending for this
>> +     * VCPU, then there is no work to do and we can bail out without
>> +     * taking any lock.  There is a potential race with someone
>> injecting
>> +     * interrupts to the VCPU, but it is a benign race as the VCPU will
>> +     * either observe the new interrupt before or after doing this
>> check,
>> +     * and introducing additional synchronization mechanism doesn't
>> change
>> +     * this.
>> +     */
>> +    if ( list_empty(&current->arch.vgic_cpu.ap_list_head) )
>> +        return;
>> +
>> +    ASSERT(!local_irq_is_enabled());
>> +
>> +    spin_lock(&current->arch.vgic_cpu.ap_list_lock);
>> +    vgic_flush_lr_state(current);
>> +    spin_unlock(&current->arch.vgic_cpu.ap_list_lock);
>> +}
>>   /*
>>    * Local variables:
>>    * mode: C
>> diff --git a/xen/arch/arm/vgic/vgic.h b/xen/arch/arm/vgic/vgic.h
>> index 5127739f0f..47fc58b81e 100644
>> --- a/xen/arch/arm/vgic/vgic.h
>> +++ b/xen/arch/arm/vgic/vgic.h
>> @@ -17,6 +17,8 @@
>>   #ifndef __XEN_ARM_VGIC_NEW_H__
>>   #define __XEN_ARM_VGIC_NEW_H__
>>   +#define vgic_irq_is_sgi(intid) ((intid) < VGIC_NR_SGIS)
>> +
>>   static inline bool irq_is_pending(struct vgic_irq *irq)
>>   {
>>       if ( irq->config == VGIC_CONFIG_EDGE )
>>
> 
> Cheers,
>
Julien Grall Feb. 16, 2018, 3:22 p.m. | #6
On 13/02/18 15:40, Andre Przywara wrote:
> Hi,
> 
> On 13/02/18 12:41, Julien Grall wrote:
>> Hi Andre,
>>
>> On 09/02/18 14:39, Andre Przywara wrote:
>>> gets called on guest entry and exit.
>>> The code talking to the actual GICv2/v3 hardware is added in the
>>> following patches.
>>>
>>> This is based on Linux commit 0919e84c0fc1, written by Marc Zyngier.
>>>
>>> Signed-off-by: Andre Przywara <andre.przywara@linaro.org>
>>> ---
>>>    xen/arch/arm/vgic/vgic.c | 246
>>> +++++++++++++++++++++++++++++++++++++++++++++++
>>>    xen/arch/arm/vgic/vgic.h |   2 +
>>>    2 files changed, 248 insertions(+)
>>>
>>> diff --git a/xen/arch/arm/vgic/vgic.c b/xen/arch/arm/vgic/vgic.c
>>> index a4efd1fd03..a1f77130d4 100644
>>> --- a/xen/arch/arm/vgic/vgic.c
>>> +++ b/xen/arch/arm/vgic/vgic.c
>>> @@ -380,6 +380,252 @@ int vgic_inject_irq(struct domain *d, struct
>>> vcpu *vcpu, unsigned int intid,
>>>        return 0;
>>>    }
>>>    +/**
>>> + * vgic_prune_ap_list - Remove non-relevant interrupts from the list
>>> + *
>>> + * @vcpu: The VCPU pointer
>>> + *
>>> + * Go over the list of "interesting" interrupts, and prune those that we
>>> + * won't have to consider in the near future.
>>> + */
>>> +static void vgic_prune_ap_list(struct vcpu *vcpu)
>>> +{
>>> +    struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
>>> +    struct vgic_irq *irq, *tmp;
>>> +    unsigned long flags;
>>> +
>>> +retry:
>>> +    spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags);
>>> +
>>> +    list_for_each_entry_safe( irq, tmp, &vgic_cpu->ap_list_head,
>>> ap_list )
>>
>> See my comment on patch #22, this is where I am worry about going
>> through the list every time we enter to the hypervisor from the guest.
> 
> I am not sure we can avoid this here, as this function is crucial to the
> VGIC state machine.
> We might later look into if we can avoid iterating through the whole
> list or if we can shortcut some interrupts somehow, but I really would
> be careful tinkering with this function too much.

My biggest concern is how big the list can be. If the list is quite 
small, then this function is ok to call when entering to the hypervisor. 
If you tell me it can be bigger, then I am quite reluctant to see this 
code in Xen. Obviously this could be solved afterwards but clearly 
before we make this a default option in Xen.

To be clear, I am asking how much a guest can control the size of the list.

[...]

>>> +/* Requires the ap_list_lock to be held. */
>>> +static int compute_ap_list_depth(struct vcpu *vcpu)
>>> +{
>>> +    struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
>>> +    struct vgic_irq *irq;
>>> +    int count = 0;
>>> +
>>> +    ASSERT(spin_is_locked(&vgic_cpu->ap_list_lock));
>>> +
>>> +    list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list)
>>
>> Here another example.
> 
> This can be short cut, indeed. The only place we call this function is
> where we compare it against the number of LRs below.
> So we don't need to know the actual number of elements, but could bail
> out once we reached the number of LRs.
> This function here would then return a bool, comparing internally
> against the number of LRs already.

This would only solve one part of the problem. Then you go sorting the 
list if you have more vIRQ in the list than the number of LRs.

See above for my thoughts on the list.

>>> + * gic_clear_lrs() - Update the VGIC state from hardware after a
>>> guest's run.
>>> + * @vcpu: the VCPU.
>>> + *
>>> + * Sync back the hardware VGIC state after the guest has run, into our
>>> + * VGIC emulation structures, It reads the LRs and updates the
>>> respective
>>> + * struct vgic_irq, taking level/edge into account.
>>> + * This is the high level function which takes care of the conditions,
>>> + * also bails out early if there were no interrupts queued.
>>> + * Was: kvm_vgic_sync_hwstate()
>>> + */
>>> +void gic_clear_lrs(struct vcpu *vcpu)
>>
>> I think I would prefer if we stick with the KVM name.
> 
> Yes, please! ;-)
> I found those names always confusing, especially gic_inject(). To be
> honest I never know which is which in the KVM case as well (sync vs.
> flush), so I was wondering if we call both the entry and the exit
> handler "sync", but denote the direction, like:
> vgic_sync_from_lrs() and vgic_sync_to_lrs().

+1 on the suggested naming :).

Cheers,

Patch

diff --git a/xen/arch/arm/vgic/vgic.c b/xen/arch/arm/vgic/vgic.c
index a4efd1fd03..a1f77130d4 100644
--- a/xen/arch/arm/vgic/vgic.c
+++ b/xen/arch/arm/vgic/vgic.c
@@ -380,6 +380,252 @@  int vgic_inject_irq(struct domain *d, struct vcpu *vcpu, unsigned int intid,
     return 0;
 }
 
+/**
+ * vgic_prune_ap_list - Remove non-relevant interrupts from the list
+ *
+ * @vcpu: The VCPU pointer
+ *
+ * Go over the list of "interesting" interrupts, and prune those that we
+ * won't have to consider in the near future.
+ */
+static void vgic_prune_ap_list(struct vcpu *vcpu)
+{
+    struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+    struct vgic_irq *irq, *tmp;
+    unsigned long flags;
+
+retry:
+    spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags);
+
+    list_for_each_entry_safe( irq, tmp, &vgic_cpu->ap_list_head, ap_list )
+    {
+        struct vcpu *target_vcpu, *vcpuA, *vcpuB;
+
+        spin_lock(&irq->irq_lock);
+
+        BUG_ON(vcpu != irq->vcpu);
+
+        target_vcpu = vgic_target_oracle(irq);
+
+        if ( !target_vcpu )
+        {
+            /*
+             * We don't need to process this interrupt any
+             * further, move it off the list.
+             */
+            list_del(&irq->ap_list);
+            irq->vcpu = NULL;
+            spin_unlock(&irq->irq_lock);
+
+            /*
+             * This vgic_put_irq call matches the
+             * vgic_get_irq_kref in vgic_queue_irq_unlock,
+             * where we added the LPI to the ap_list. As
+             * we remove the irq from the list, we drop
+             * also drop the refcount.
+             */
+            vgic_put_irq(vcpu->domain, irq);
+            continue;
+        }
+
+        if ( target_vcpu == vcpu )
+        {
+            /* We're on the right CPU */
+            spin_unlock(&irq->irq_lock);
+            continue;
+        }
+
+        /* This interrupt looks like it has to be migrated. */
+
+        spin_unlock(&irq->irq_lock);
+        spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags);
+
+        /*
+         * Ensure locking order by always locking the smallest
+         * ID first.
+         */
+        if ( vcpu->vcpu_id < target_vcpu->vcpu_id )
+        {
+            vcpuA = vcpu;
+            vcpuB = target_vcpu;
+        }
+        else
+        {
+            vcpuA = target_vcpu;
+            vcpuB = vcpu;
+        }
+
+        spin_lock_irqsave(&vcpuA->arch.vgic_cpu.ap_list_lock, flags);
+        spin_lock(&vcpuB->arch.vgic_cpu.ap_list_lock);
+        spin_lock(&irq->irq_lock);
+
+        /*
+         * If the affinity has been preserved, move the
+         * interrupt around. Otherwise, it means things have
+         * changed while the interrupt was unlocked, and we
+         * need to replay this.
+         *
+         * In all cases, we cannot trust the list not to have
+         * changed, so we restart from the beginning.
+         */
+        if ( target_vcpu == vgic_target_oracle(irq) )
+        {
+            struct vgic_cpu *new_cpu = &target_vcpu->arch.vgic_cpu;
+
+            list_del(&irq->ap_list);
+            irq->vcpu = target_vcpu;
+            list_add_tail(&irq->ap_list, &new_cpu->ap_list_head);
+        }
+
+        spin_unlock(&irq->irq_lock);
+        spin_unlock(&vcpuB->arch.vgic_cpu.ap_list_lock);
+        spin_unlock_irqrestore(&vcpuA->arch.vgic_cpu.ap_list_lock, flags);
+        goto retry;
+    }
+
+    spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags);
+}
+
+static inline void vgic_fold_lr_state(struct vcpu *vcpu)
+{
+}
+
+/* Requires the irq_lock to be held. */
+static inline void vgic_populate_lr(struct vcpu *vcpu,
+                                    struct vgic_irq *irq, int lr)
+{
+    ASSERT(spin_is_locked(&irq->irq_lock));
+}
+
+static inline void vgic_clear_lr(struct vcpu *vcpu, int lr)
+{
+}
+
+static inline void vgic_set_underflow(struct vcpu *vcpu)
+{
+}
+
+/* Requires the ap_list_lock to be held. */
+static int compute_ap_list_depth(struct vcpu *vcpu)
+{
+    struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+    struct vgic_irq *irq;
+    int count = 0;
+
+    ASSERT(spin_is_locked(&vgic_cpu->ap_list_lock));
+
+    list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list)
+    {
+        spin_lock(&irq->irq_lock);
+        /* GICv2 SGIs can count for more than one... */
+        if ( vgic_irq_is_sgi(irq->intid) && irq->source )
+            count += hweight8(irq->source);
+        else
+            count++;
+        spin_unlock(&irq->irq_lock);
+    }
+    return count;
+}
+
+/* Requires the VCPU's ap_list_lock to be held. */
+static void vgic_flush_lr_state(struct vcpu *vcpu)
+{
+    struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+    struct vgic_irq *irq;
+    int count = 0;
+
+    ASSERT(spin_is_locked(&vgic_cpu->ap_list_lock));
+
+    if ( compute_ap_list_depth(vcpu) > gic_get_nr_lrs() )
+        vgic_sort_ap_list(vcpu);
+
+    list_for_each_entry( irq, &vgic_cpu->ap_list_head, ap_list )
+    {
+        spin_lock(&irq->irq_lock);
+
+        if ( unlikely(vgic_target_oracle(irq) != vcpu) )
+            goto next;
+
+        /*
+         * If we get an SGI with multiple sources, try to get
+         * them in all at once.
+         */
+        do
+        {
+            vgic_populate_lr(vcpu, irq, count++);
+        } while ( irq->source && count < gic_get_nr_lrs() );
+
+next:
+        spin_unlock(&irq->irq_lock);
+
+        if ( count == gic_get_nr_lrs() )
+        {
+            if ( !list_is_last(&irq->ap_list, &vgic_cpu->ap_list_head) )
+                vgic_set_underflow(vcpu);
+            break;
+        }
+    }
+
+    vcpu->arch.vgic_cpu.used_lrs = count;
+
+    /* Nuke remaining LRs */
+    for ( ; count < gic_get_nr_lrs(); count++)
+        vgic_clear_lr(vcpu, count);
+}
+
+/*
+ * gic_clear_lrs() - Update the VGIC state from hardware after a guest's run.
+ * @vcpu: the VCPU.
+ *
+ * Sync back the hardware VGIC state after the guest has run, into our
+ * VGIC emulation structures, It reads the LRs and updates the respective
+ * struct vgic_irq, taking level/edge into account.
+ * This is the high level function which takes care of the conditions,
+ * also bails out early if there were no interrupts queued.
+ * Was: kvm_vgic_sync_hwstate()
+ */
+void gic_clear_lrs(struct vcpu *vcpu)
+{
+    struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+
+    /* An empty ap_list_head implies used_lrs == 0 */
+    if ( list_empty(&vcpu->arch.vgic_cpu.ap_list_head) )
+        return;
+
+    if ( vgic_cpu->used_lrs )
+        vgic_fold_lr_state(vcpu);
+    vgic_prune_ap_list(vcpu);
+}
+
+/*
+ * gic_inject() - flush the emulation state into the hardware on guest entry
+ *
+ * Before we enter a guest, we have to translate the virtual GIC state of a
+ * VCPU into the GIC virtualization hardware registers, namely the LRs.
+ * This is the high level function which takes care about the conditions
+ * and the locking, also bails out early if there are no interrupts queued.
+ * Was: kvm_vgic_flush_hwstate()
+ */
+void gic_inject(void)
+{
+    /*
+     * If there are no virtual interrupts active or pending for this
+     * VCPU, then there is no work to do and we can bail out without
+     * taking any lock.  There is a potential race with someone injecting
+     * interrupts to the VCPU, but it is a benign race as the VCPU will
+     * either observe the new interrupt before or after doing this check,
+     * and introducing additional synchronization mechanism doesn't change
+     * this.
+     */
+    if ( list_empty(&current->arch.vgic_cpu.ap_list_head) )
+        return;
+
+    ASSERT(!local_irq_is_enabled());
+
+    spin_lock(&current->arch.vgic_cpu.ap_list_lock);
+    vgic_flush_lr_state(current);
+    spin_unlock(&current->arch.vgic_cpu.ap_list_lock);
+}
 /*
  * Local variables:
  * mode: C
diff --git a/xen/arch/arm/vgic/vgic.h b/xen/arch/arm/vgic/vgic.h
index 5127739f0f..47fc58b81e 100644
--- a/xen/arch/arm/vgic/vgic.h
+++ b/xen/arch/arm/vgic/vgic.h
@@ -17,6 +17,8 @@ 
 #ifndef __XEN_ARM_VGIC_NEW_H__
 #define __XEN_ARM_VGIC_NEW_H__
 
+#define vgic_irq_is_sgi(intid) ((intid) < VGIC_NR_SGIS)
+
 static inline bool irq_is_pending(struct vgic_irq *irq)
 {
     if ( irq->config == VGIC_CONFIG_EDGE )