[Xen-devel,RFC,22/49] ARM: new VGIC: Implement virtual IRQ injection

Message ID: 20180209143937.28866-23-andre.przywara@linaro.org
State: New
Series: New VGIC(-v2) implementation

Commit Message

Andre Przywara Feb. 9, 2018, 2:39 p.m. UTC
Provide a vgic_queue_irq_unlock() function which decides whether a
given IRQ needs to be queued to a VCPU's ap_list.
This should be called whenever an IRQ becomes pending or enabled,
either as a result of a hardware IRQ injection, from devices emulated by
Xen (like the architected timer) or from MMIO accesses to the distributor
emulation.
Also provide the necessary functions to allow injecting an IRQ into a guest.
Since this is the first code that starts using our locking mechanism,
we add some (hopefully) clear documentation of our locking strategy and
requirements along with this patch.

This is based on Linux commit 81eeb95ddbab, written by Christoffer Dall.

Signed-off-by: Andre Przywara <andre.przywara@linaro.org>
---
 xen/arch/arm/vgic/vgic.c | 224 +++++++++++++++++++++++++++++++++++++++++++++++
 xen/arch/arm/vgic/vgic.h |  10 +++
 2 files changed, 234 insertions(+)

Comments

Julien Grall Feb. 12, 2018, 6:59 p.m. UTC | #1
Hi Andre,

On 09/02/18 14:39, Andre Przywara wrote:
> Provide a vgic_queue_irq_unlock() function which decides whether a
> given IRQ needs to be queued to a VCPU's ap_list.
> This should be called whenever an IRQ becomes pending or enabled,
> either as a result of a hardware IRQ injection, from devices emulated by
> Xen (like the architected timer) or from MMIO accesses to the distributor
> emulation.
> Also provides the necessary functions to allow to inject an IRQ to a guest.
> Since this is the first code that starts using our locking mechanism,
> we add some (hopefully) clear documentation of our locking strategy and
> requirements along with this patch.
> 
> This is based on Linux commit 81eeb95ddbab, written by Christoffer Dall.
> 
> Signed-off-by: Andre Przywara <andre.przywara@linaro.org>
> ---
>   xen/arch/arm/vgic/vgic.c | 224 +++++++++++++++++++++++++++++++++++++++++++++++
>   xen/arch/arm/vgic/vgic.h |  10 +++
>   2 files changed, 234 insertions(+)
> 
> diff --git a/xen/arch/arm/vgic/vgic.c b/xen/arch/arm/vgic/vgic.c
> index 3075091caa..f517df6d00 100644
> --- a/xen/arch/arm/vgic/vgic.c
> +++ b/xen/arch/arm/vgic/vgic.c
> @@ -21,6 +21,32 @@
>   #include <asm/arm_vgic.h>
>   #include "vgic.h"
>   
> +/*
> + * Locking order is always:
> + * kvm->lock (mutex)

You probably want to update the locking order to match the Xen one. In that
case, I am not sure whether we need to take the domain lock in the code at all.


> + *   its->cmd_lock (mutex)
> + *     its->its_lock (mutex)

> + *       vgic_cpu->ap_list_lock
> + *         kvm->lpi_list_lock
> + *           vgic_irq->irq_lock
> + *
> + * If you need to take multiple locks, always take the upper lock first,
> + * then the lower ones, e.g. first take the its_lock, then the irq_lock.
> + * If you are already holding a lock and need to take a higher one, you
> + * have to drop the lower ranking lock first and re-aquire it after having

s/re-aquire/re-acquire/

> + * taken the upper one.
> + *
> + * When taking more than one ap_list_lock at the same time, always take the
> + * lowest numbered VCPU's ap_list_lock first, so:
> + *   vcpuX->vcpu_id < vcpuY->vcpu_id:
> + *     spin_lock(vcpuX->arch.vgic_cpu.ap_list_lock);
> + *     spin_lock(vcpuY->arch.vgic_cpu.ap_list_lock);
> + *
> + * Since the VGIC must support injecting virtual interrupts from ISRs, we have
> + * to use the spin_lock_irqsave/spin_unlock_irqrestore versions of outer
> + * spinlocks for any lock that may be taken while injecting an interrupt.

It is quite nice to see the locking explained in the file and in general 
a lot of explanation within the code :).

> + */
> +
>   /*
>    * Iterate over the VM's list of mapped LPIs to find the one with a
>    * matching interrupt ID and return a reference to the IRQ structure.
> @@ -97,6 +123,204 @@ void vgic_put_irq(struct domain *d, struct vgic_irq *irq)
>       xfree(irq);
>   }
>   
> +/**
> + * vgic_target_oracle - compute the target vcpu for an irq
> + *
> + * @irq:    The irq to route. Must be already locked.
> + *
> + * Based on the current state of the interrupt (enabled, pending,
> + * active, vcpu and target_vcpu), compute the next vcpu this should be
> + * given to. Return NULL if this shouldn't be injected at all.
> + *
> + * Requires the IRQ lock to be held.
> + */
> +static struct vcpu *vgic_target_oracle(struct vgic_irq *irq)
> +{
> +    ASSERT(spin_is_locked(&irq->irq_lock));
> +
> +    /* If the interrupt is active, it must stay on the current vcpu */
> +    if ( irq->active )
> +        return irq->vcpu ? : irq->target_vcpu;
I am not sure I understand why you check whether irq->vcpu is NULL. If 
the interrupt is active, then irq->vcpu should be NULL. Did I miss anything?

> +
> +    /*
> +     * If the IRQ is not active but enabled and pending, we should direct
> +     * it to its configured target VCPU.
> +     * If the distributor is disabled, pending interrupts shouldn't be
> +     * forwarded.
> +     */
> +    if ( irq->enabled && irq_is_pending(irq) )
> +    {
> +        if ( unlikely(irq->target_vcpu &&
> +                 !irq->target_vcpu->domain->arch.vgic.enabled) )

The indentation looks wrong here.

> +            return NULL;
> +
> +        return irq->target_vcpu;
> +    }
> +
> +    /* If neither active nor pending and enabled, then this IRQ should not

Comment style:

/*
 * ...

> +     * be queued to any VCPU.
> +     */
> +    return NULL;
> +}
> +
> +/*
> + * Only valid injection if changing level for level-triggered IRQs or for a
> + * rising edge.
> + */
> +static bool vgic_validate_injection(struct vgic_irq *irq, bool level)
> +{
> +    switch (irq->config)

switch ( ... )

> +    {
> +    case VGIC_CONFIG_LEVEL:
> +        return irq->line_level != level;
> +    case VGIC_CONFIG_EDGE:
> +        return level;
> +    }
> +

I would add an ASSERT_UNREACHABLE().

> +    return false;
> +}
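
Combined with the switch style fix above, vgic_validate_injection() would
then look roughly like this (illustration only, untested):

static bool vgic_validate_injection(struct vgic_irq *irq, bool level)
{
    switch ( irq->config )
    {
    case VGIC_CONFIG_LEVEL:
        return irq->line_level != level;
    case VGIC_CONFIG_EDGE:
        return level;
    }

    /* Both configurations are handled above, so this should never run. */
    ASSERT_UNREACHABLE();

    return false;
}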
> +
> +/*
> + * Check whether an IRQ needs to (and can) be queued to a VCPU's ap list.
> + * Do the queuing if necessary, taking the right locks in the right order.
> + * Returns true when the IRQ was queued, false otherwise.
> + *
> + * Needs to be entered with the IRQ lock already held, but will return
> + * with all locks dropped.
> + */
> +bool vgic_queue_irq_unlock(struct domain *d, struct vgic_irq *irq,
> +               unsigned long flags)

Indentation. Also, same remark as for vgic_inject_irq: no-one seems to 
care about the return value (even in KVM :)).

> +{
> +    struct vcpu *vcpu;
> +    bool running;
> +
> +    ASSERT(spin_is_locked(&irq->irq_lock));
> +
> +retry:
> +    vcpu = vgic_target_oracle(irq);
> +    if ( irq->vcpu || !vcpu )
> +    {
> +        /*
> +         * If this IRQ is already on a VCPU's ap_list, then it
> +         * cannot be moved or modified and there is no more work for
> +         * us to do.
> +         *
> +         * Otherwise, if the irq is not pending and enabled, it does
> +         * not need to be inserted into an ap_list and there is also
> +         * no more work for us to do.
> +         */
> +        spin_unlock_irqrestore(&irq->irq_lock, flags);
> +
> +        /*
> +         * We have to kick the VCPU here, because we could be
> +         * queueing an edge-triggered interrupt for which we
> +         * get no EOI maintenance interrupt. In that case,
> +         * while the IRQ is already on the VCPU's AP list, the
> +         * VCPU could have EOI'ed the original interrupt and
> +         * won't see this one until it exits for some other
> +         * reason.
> +         */
> +        if ( vcpu )
> +            vcpu_unblock(vcpu);

vcpu_unblock will only "unblock" a vCPU that is blocked. It won't notify 
a running vCPU. So you want to have something similar to:

vcpu_unblock(vcpu);
if ( running && vcpu != current )
   smp_send_event_check_mask(...);

It is probably worth introducing a helper for that.
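
For illustration, such a helper could look like the sketch below (untested;
the name vgic_kick_vcpu() is made up here, it merely wraps the pattern
already used further down in this patch):

/*
 * Wake up a blocked vCPU, or send an event-check IPI to a vCPU that is
 * already running on another pCPU, so that it re-evaluates its pending
 * virtual interrupts.
 */
static void vgic_kick_vcpu(struct vcpu *vcpu)
{
    bool running = vcpu->is_running;

    vcpu_unblock(vcpu);
    if ( running && vcpu != current )
        smp_send_event_check_mask(cpumask_of(vcpu->processor));
}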

> +        return false;
> +    }
> +
> +    /*
> +     * We must unlock the irq lock to take the ap_list_lock where
> +     * we are going to insert this new pending interrupt.
> +     */
> +    spin_unlock_irqrestore(&irq->irq_lock, flags);
> +
> +    /* someone can do stuff here, which we re-check below */
> +
> +    spin_lock_irqsave(&vcpu->arch.vgic_cpu.ap_list_lock, flags);
> +    spin_lock(&irq->irq_lock);
> +
> +    /*
> +     * Did something change behind our backs?
> +     *
> +     * There are two cases:
> +     * 1) The irq lost its pending state or was disabled behind our
> +     *    backs and/or it was queued to another VCPU's ap_list.
> +     * 2) Someone changed the affinity on this irq behind our
> +     *    backs and we are now holding the wrong ap_list_lock.
> +     *
> +     * In both cases, drop the locks and retry.
> +     */
> +
> +    if ( unlikely(irq->vcpu || vcpu != vgic_target_oracle(irq)) )
> +    {
> +        spin_unlock(&irq->irq_lock);
> +        spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock, flags);
> +
> +        spin_lock_irqsave(&irq->irq_lock, flags);
> +        goto retry;
> +    }
> +
> +    /*
> +     * Grab a reference to the irq to reflect the fact that it is
> +     * now in the ap_list.
> +     */
> +    vgic_get_irq_kref(irq);
> +    list_add_tail(&irq->ap_list, &vcpu->arch.vgic_cpu.ap_list_head);

I was expecting the list to be sorted here. But you seem to do it only 
in vgic_flush_lr_state(), which is quite interesting.

I can foresee quite a few issues with this choice on Xen:
  1) You compute the size of the ap list in vgic_flush_lr_state() and take 
     the lock on every IRQ one by one. A guest could be nasty and make that 
     list quite big by making IRQs pending but never activating them (i.e. 
     never reading IAR).
  2) This might be an issue while checking whether you need to deliver an 
     interrupt (vgic_vcpu_pending_irq), because the list is not sorted.

> +    irq->vcpu = vcpu;
> +
> +    spin_unlock(&irq->irq_lock);
> +    spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock, flags);
> +
> +    running = vcpu->is_running;
> +    vcpu_unblock(vcpu);
> +    if ( running && vcpu != current )
> +        smp_send_event_check_mask(cpumask_of(vcpu->processor));
> +
> +    return true;
> +}
> +
> +/**
> + * vgic_inject_irq - Inject an IRQ from a device to the vgic
> + * @d:       The domain pointer
> + * @vcpu:    The vCPU for PPIs
> + * @intid:   The INTID to inject a new state to.
> + * @level:   Edge-triggered:  true:  to trigger the interrupt
> + *                false: to ignore the call
> + *       Level-sensitive  true:  raise the input signal
> + *                false: lower the input signal
> + *
> + * The VGIC is not concerned with devices being active-LOW or active-HIGH for
> + * level-sensitive interrupts.  You can think of the level parameter as 1
> + * being HIGH and 0 being LOW and all devices being active-HIGH.
> + */
> +int vgic_inject_irq(struct domain *d, struct vcpu *vcpu, unsigned int intid,
> +            bool level)

Indentation.

> +{
> +    struct vgic_irq *irq;
> +    unsigned long flags;
> +
> +    irq = vgic_get_irq(d, vcpu, intid);
> +    if ( !irq )
> +        return -EINVAL;
> +
> +    spin_lock_irqsave(&irq->irq_lock, flags);
> +
> +    if ( !vgic_validate_injection(irq, level) )
> +    {
> +        /* Nothing to see here, move along... */
> +        spin_unlock_irqrestore(&irq->irq_lock, flags);
> +        vgic_put_irq(d, irq);
> +        return 0;
> +    }
> +
> +    if ( irq->config == VGIC_CONFIG_LEVEL )
> +        irq->line_level = level;
> +    else
> +        irq->pending_latch = true;
> +
> +    vgic_queue_irq_unlock(d, irq, flags);
> +    vgic_put_irq(d, irq);
> +
> +    return 0;
> +}
> +
>   /*
>    * Local variables:
>    * mode: C
> diff --git a/xen/arch/arm/vgic/vgic.h b/xen/arch/arm/vgic/vgic.h
> index 7a15cfdd79..5127739f0f 100644
> --- a/xen/arch/arm/vgic/vgic.h
> +++ b/xen/arch/arm/vgic/vgic.h
> @@ -17,9 +17,19 @@
>   #ifndef __XEN_ARM_VGIC_NEW_H__
>   #define __XEN_ARM_VGIC_NEW_H__
>   
> +static inline bool irq_is_pending(struct vgic_irq *irq)
> +{
> +    if ( irq->config == VGIC_CONFIG_EDGE )
> +        return irq->pending_latch;
> +    else
> +        return irq->pending_latch || irq->line_level;
> +}
> +
>   struct vgic_irq *vgic_get_irq(struct domain *d, struct vcpu *vcpu,
>                                 u32 intid);
>   void vgic_put_irq(struct domain *d, struct vgic_irq *irq);
> +bool vgic_queue_irq_unlock(struct domain *d, struct vgic_irq *irq,
> +               unsigned long flags);
>   
>   static inline void vgic_get_irq_kref(struct vgic_irq *irq)
>   {
> 

Cheers,
Andre Przywara Feb. 27, 2018, 10:17 a.m. UTC | #2
Hi,

On 12/02/18 18:59, Julien Grall wrote:
> Hi Andre,
> 
> On 09/02/18 14:39, Andre Przywara wrote:
>> Provide a vgic_queue_irq_unlock() function which decides whether a
>> given IRQ needs to be queued to a VCPU's ap_list.
>> This should be called whenever an IRQ becomes pending or enabled,
>> either as a result of a hardware IRQ injection, from devices emulated by
>> Xen (like the architected timer) or from MMIO accesses to the distributor
>> emulation.
>> Also provides the necessary functions to allow to inject an IRQ to a
>> guest.
>> Since this is the first code that starts using our locking mechanism,
>> we add some (hopefully) clear documentation of our locking strategy and
>> requirements along with this patch.
>>
>> This is based on Linux commit 81eeb95ddbab, written by Christoffer Dall.
>>
>> Signed-off-by: Andre Przywara <andre.przywara@linaro.org>
>> ---
>>   xen/arch/arm/vgic/vgic.c | 224
>> +++++++++++++++++++++++++++++++++++++++++++++++
>>   xen/arch/arm/vgic/vgic.h |  10 +++
>>   2 files changed, 234 insertions(+)
>>
>> diff --git a/xen/arch/arm/vgic/vgic.c b/xen/arch/arm/vgic/vgic.c
>> index 3075091caa..f517df6d00 100644
>> --- a/xen/arch/arm/vgic/vgic.c
>> +++ b/xen/arch/arm/vgic/vgic.c
>> @@ -21,6 +21,32 @@
>>   #include <asm/arm_vgic.h>
>>   #include "vgic.h"
>>   +/*
>> + * Locking order is always:
>> + * kvm->lock (mutex)
> 
> You probably want to update the locking order to match Xen one. In that
> case, I am not sure if we need to take the domain lock in the code?
> 
> 
>> + *   its->cmd_lock (mutex)
>> + *     its->its_lock (mutex)
> 
>> + *       vgic_cpu->ap_list_lock
>> + *         kvm->lpi_list_lock
>> + *           vgic_irq->irq_lock
>> + *
>> + * If you need to take multiple locks, always take the upper lock first,
>> + * then the lower ones, e.g. first take the its_lock, then the irq_lock.
>> + * If you are already holding a lock and need to take a higher one, you
>> + * have to drop the lower ranking lock first and re-aquire it after
>> having
> 
> s/re-aquire/re-acquire/
> 
>> + * taken the upper one.
>> + *
>> + * When taking more than one ap_list_lock at the same time, always
>> take the
>> + * lowest numbered VCPU's ap_list_lock first, so:
>> + *   vcpuX->vcpu_id < vcpuY->vcpu_id:
>> + *     spin_lock(vcpuX->arch.vgic_cpu.ap_list_lock);
>> + *     spin_lock(vcpuY->arch.vgic_cpu.ap_list_lock);
>> + *
>> + * Since the VGIC must support injecting virtual interrupts from
>> ISRs, we have
>> + * to use the spin_lock_irqsave/spin_unlock_irqrestore versions of outer
>> + * spinlocks for any lock that may be taken while injecting an
>> interrupt.
> 
> It is quite nice to see the locking explained in the file and in general
> a lot of explanation within the code :).
> 
>> + */
>> +
>>   /*
>>    * Iterate over the VM's list of mapped LPIs to find the one with a
>>    * matching interrupt ID and return a reference to the IRQ structure.
>> @@ -97,6 +123,204 @@ void vgic_put_irq(struct domain *d, struct
>> vgic_irq *irq)
>>       xfree(irq);
>>   }
>>   +/**
>> + * vgic_target_oracle - compute the target vcpu for an irq
>> + *
>> + * @irq:    The irq to route. Must be already locked.
>> + *
>> + * Based on the current state of the interrupt (enabled, pending,
>> + * active, vcpu and target_vcpu), compute the next vcpu this should be
>> + * given to. Return NULL if this shouldn't be injected at all.
>> + *
>> + * Requires the IRQ lock to be held.
>> + */
>> +static struct vcpu *vgic_target_oracle(struct vgic_irq *irq)
>> +{
>> +    ASSERT(spin_is_locked(&irq->irq_lock));
>> +
>> +    /* If the interrupt is active, it must stay on the current vcpu */
>> +    if ( irq->active )
>> +        return irq->vcpu ? : irq->target_vcpu;
> I am not sure to understand why you check whether irq->vcpu is NULL. If
> the interrupt is active, then irq->vcpu should be NULL. Did I miss
                                                ^ "not NULL", you mean?
> anything?

Not if it has been explicitly activated via ISACTIVER. This is not
implemented in Xen at the moment, but will be in the future, so I'd like
to keep this in.

>> +
>> +    /*
>> +     * If the IRQ is not active but enabled and pending, we should
>> direct
>> +     * it to its configured target VCPU.
>> +     * If the distributor is disabled, pending interrupts shouldn't be
>> +     * forwarded.
>> +     */
>> +    if ( irq->enabled && irq_is_pending(irq) )
>> +    {
>> +        if ( unlikely(irq->target_vcpu &&
>> +                 !irq->target_vcpu->domain->arch.vgic.enabled) )
> 
> The indentation looks wrong here.
> 
>> +            return NULL;
>> +
>> +        return irq->target_vcpu;
>> +    }
>> +
>> +    /* If neither active nor pending and enabled, then this IRQ
>> should not
> 
> Comment style:
> 
> /*
>  * ...
> 
>> +     * be queued to any VCPU.
>> +     */
>> +    return NULL;
>> +}
>> +
>> +/*
>> + * Only valid injection if changing level for level-triggered IRQs or
>> for a
>> + * rising edge.
>> + */
>> +static bool vgic_validate_injection(struct vgic_irq *irq, bool level)
>> +{
>> +    switch (irq->config)
> 
> switch ( ... )
> 
>> +    {
>> +    case VGIC_CONFIG_LEVEL:
>> +        return irq->line_level != level;
>> +    case VGIC_CONFIG_EDGE:
>> +        return level;
>> +    }
>> +
> 
> I would add an ASSERT_UNREACHABLE().
> 
>> +    return false;
>> +}
>> +
>> +/*
>> + * Check whether an IRQ needs to (and can) be queued to a VCPU's ap
>> list.
>> + * Do the queuing if necessary, taking the right locks in the right
>> order.
>> + * Returns true when the IRQ was queued, false otherwise.
>> + *
>> + * Needs to be entered with the IRQ lock already held, but will return
>> + * with all locks dropped.
>> + */
>> +bool vgic_queue_irq_unlock(struct domain *d, struct vgic_irq *irq,
>> +               unsigned long flags)
> 
> Indentation. Also same remark as from vgic_inject_irq. No-one seems to
> care about the return (even in KVM :)).
> 
>> +{
>> +    struct vcpu *vcpu;
>> +    bool running;
>> +
>> +    ASSERT(spin_is_locked(&irq->irq_lock));
>> +
>> +retry:
>> +    vcpu = vgic_target_oracle(irq);
>> +    if ( irq->vcpu || !vcpu )
>> +    {
>> +        /*
>> +         * If this IRQ is already on a VCPU's ap_list, then it
>> +         * cannot be moved or modified and there is no more work for
>> +         * us to do.
>> +         *
>> +         * Otherwise, if the irq is not pending and enabled, it does
>> +         * not need to be inserted into an ap_list and there is also
>> +         * no more work for us to do.
>> +         */
>> +        spin_unlock_irqrestore(&irq->irq_lock, flags);
>> +
>> +        /*
>> +         * We have to kick the VCPU here, because we could be
>> +         * queueing an edge-triggered interrupt for which we
>> +         * get no EOI maintenance interrupt. In that case,
>> +         * while the IRQ is already on the VCPU's AP list, the
>> +         * VCPU could have EOI'ed the original interrupt and
>> +         * won't see this one until it exits for some other
>> +         * reason.
>> +         */
>> +        if ( vcpu )
>> +            vcpu_unblock(vcpu);
> 
> vcpu_unblock will only "unblock" a vCPU that is blocked. It won't notify
> a running vCPU. So you want to have something similar to:
> 
> vcpu_unblock(vcpu);
> if ( running && vcpu != current )
>   smp_send_event_check_mask(...);
> 
> It is probably worth to introduce an helper for that.
> 
>> +        return false;
>> +    }
>> +
>> +    /*
>> +     * We must unlock the irq lock to take the ap_list_lock where
>> +     * we are going to insert this new pending interrupt.
>> +     */
>> +    spin_unlock_irqrestore(&irq->irq_lock, flags);
>> +
>> +    /* someone can do stuff here, which we re-check below */
>> +
>> +    spin_lock_irqsave(&vcpu->arch.vgic_cpu.ap_list_lock, flags);
>> +    spin_lock(&irq->irq_lock);
>> +
>> +    /*
>> +     * Did something change behind our backs?
>> +     *
>> +     * There are two cases:
>> +     * 1) The irq lost its pending state or was disabled behind our
>> +     *    backs and/or it was queued to another VCPU's ap_list.
>> +     * 2) Someone changed the affinity on this irq behind our
>> +     *    backs and we are now holding the wrong ap_list_lock.
>> +     *
>> +     * In both cases, drop the locks and retry.
>> +     */
>> +
>> +    if ( unlikely(irq->vcpu || vcpu != vgic_target_oracle(irq)) )
>> +    {
>> +        spin_unlock(&irq->irq_lock);
>> +        spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock,
>> flags);
>> +
>> +        spin_lock_irqsave(&irq->irq_lock, flags);
>> +        goto retry;
>> +    }
>> +
>> +    /*
>> +     * Grab a reference to the irq to reflect the fact that it is
>> +     * now in the ap_list.
>> +     */
>> +    vgic_get_irq_kref(irq);
>> +    list_add_tail(&irq->ap_list, &vcpu->arch.vgic_cpu.ap_list_head);
> 
> I was expecting the list to be sorted here. But you seem to do it only
> in vgic_flush_lr_state() which is quite interesting.

The list is not sorted on insertion because this is not necessary most
of the time. In fact the hardware VGIC will do the sorting (kind of)
within the LRs. So as long as we don't have more than <number of LRs>
IRQs in the list, sorting is a waste of time. Experiments in the past
showed that the number of used LRs is less than 4 almost every time, and
since 4 is the most common number of implemented LRs, we virtually never
need the sorting. So we avoid doing that on every insertion and instead
do it only when it's necessary.

> I can foresee quite a few issues with this choice on Xen:
>     1) You compute the size of ap list in vgic_flush_lr_state() and take
> lock on every IRQ one by one. A guest could be nasty and make that list
> quite big by make IRQs pending but never "active" them (i.e read IAR).

Yeah, we could try to shortcut a bit here.

>     2) This might be an issue while checking whether you need to deliver
> an interrupt (vgic_vcpu_pending_irq) because the list is not sorted.

Most of the time the list is very short, storing one, two or actually no
IRQs. In the function where we check for pending IRQs we bail out as
soon as we find the first eligible interrupt, so sorting does not help
in the majority of cases.
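
For reference, that check looks roughly like the sketch below, modelled on
the KVM counterpart this series ports (the actual Xen function is not part
of this patch and may differ in detail):

static bool vgic_vcpu_pending_irq_sketch(struct vcpu *vcpu)
{
    struct vgic_irq *irq;
    unsigned long flags;
    bool pending = false;

    spin_lock_irqsave(&vcpu->arch.vgic_cpu.ap_list_lock, flags);

    list_for_each_entry(irq, &vcpu->arch.vgic_cpu.ap_list_head, ap_list)
    {
        spin_lock(&irq->irq_lock);
        pending = irq_is_pending(irq) && irq->enabled;
        spin_unlock(&irq->irq_lock);

        if ( pending )      /* bail out at the first eligible interrupt */
            break;
    }

    spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock, flags);

    return pending;
}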

If you are really concerned about that list growing too long, we could
think about mitigations:
1) Try to avoid iterating the whole list while checking whether it needs
to be sorted.
2) Store a flag that notes whether the list has already been sorted. As
long as we don't change anything, we don't need to sort again. It would
be good to test whether this is actually helpful, but we would need to
keep this flag up-to-date, which sounds a bit fragile to get right.
3) Switch to sort-on-insertion once we have reached a certain number of
IRQs on the list, to mitigate DoS attacks from the guest. This should
avoid list iterations in hot paths with IRQs disabled (see the rough
sketch below).
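
A very rough illustration of what 3) could boil down to (untested; the
priority field of struct vgic_irq is assumed from the KVM original, and
nothing here is part of this patch):

/*
 * Insert an IRQ into the ap_list in priority order instead of at the tail.
 * Must be called with the ap_list_lock and the irq_lock held.
 */
static void vgic_insert_sorted(struct vcpu *vcpu, struct vgic_irq *irq)
{
    struct vgic_irq *itr;

    list_for_each_entry(itr, &vcpu->arch.vgic_cpu.ap_list_head, ap_list)
    {
        /* A lower priority value means a higher priority on the GIC. */
        if ( irq->priority < itr->priority )
        {
            /* list_add_tail() on a list member inserts just before it. */
            list_add_tail(&irq->ap_list, &itr->ap_list);
            return;
        }
    }

    list_add_tail(&irq->ap_list, &vcpu->arch.vgic_cpu.ap_list_head);
}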

But all of these sound a bit hackish to me and would just spoil the very
clean and robust code we have today. Also, I am not sure we can avoid
list iterations in every case (for instance in prune_ap_list()). There is
an upper limit today (the number of SPIs), so we might be happy with that
for now.
To be honest I would very much dislike changing the code at this point.
I believe a follow-up patch series would be better, also to actually
have some numbers on the impact of this.

Cheers,
Andre.

>> +    irq->vcpu = vcpu;
>> +
>> +    spin_unlock(&irq->irq_lock);
>> +    spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock, flags);
>> +
>> +    running = vcpu->is_running;
>> +    vcpu_unblock(vcpu);
>> +    if ( running && vcpu != current )
>> +        smp_send_event_check_mask(cpumask_of(vcpu->processor));
>> +
>> +    return true;
>> +}
>> +
>> +/**
>> + * vgic_inject_irq - Inject an IRQ from a device to the vgic
>> + * @d:       The domain pointer
>> + * @vcpu:    The vCPU for PPIs
>> + * @intid:   The INTID to inject a new state to.
>> + * @level:   Edge-triggered:  true:  to trigger the interrupt
>> + *                false: to ignore the call
>> + *       Level-sensitive  true:  raise the input signal
>> + *                false: lower the input signal
>> + *
>> + * The VGIC is not concerned with devices being active-LOW or
>> active-HIGH for
>> + * level-sensitive interrupts.  You can think of the level parameter
>> as 1
>> + * being HIGH and 0 being LOW and all devices being active-HIGH.
>> + */
>> +int vgic_inject_irq(struct domain *d, struct vcpu *vcpu, unsigned int
>> intid,
>> +            bool level)
> 
> Indentation.
> 
>> +{
>> +    struct vgic_irq *irq;
>> +    unsigned long flags;
>> +
>> +    irq = vgic_get_irq(d, vcpu, intid);
>> +    if ( !irq )
>> +        return -EINVAL;
>> +
>> +    spin_lock_irqsave(&irq->irq_lock, flags);
>> +
>> +    if ( !vgic_validate_injection(irq, level) )
>> +    {
>> +        /* Nothing to see here, move along... */
>> +        spin_unlock_irqrestore(&irq->irq_lock, flags);
>> +        vgic_put_irq(d, irq);
>> +        return 0;
>> +    }
>> +
>> +    if ( irq->config == VGIC_CONFIG_LEVEL )
>> +        irq->line_level = level;
>> +    else
>> +        irq->pending_latch = true;
>> +
>> +    vgic_queue_irq_unlock(d, irq, flags);
>> +    vgic_put_irq(d, irq);
>> +
>> +    return 0;
>> +}
>> +
>>   /*
>>    * Local variables:
>>    * mode: C
>> diff --git a/xen/arch/arm/vgic/vgic.h b/xen/arch/arm/vgic/vgic.h
>> index 7a15cfdd79..5127739f0f 100644
>> --- a/xen/arch/arm/vgic/vgic.h
>> +++ b/xen/arch/arm/vgic/vgic.h
>> @@ -17,9 +17,19 @@
>>   #ifndef __XEN_ARM_VGIC_NEW_H__
>>   #define __XEN_ARM_VGIC_NEW_H__
>>   +static inline bool irq_is_pending(struct vgic_irq *irq)
>> +{
>> +    if ( irq->config == VGIC_CONFIG_EDGE )
>> +        return irq->pending_latch;
>> +    else
>> +        return irq->pending_latch || irq->line_level;
>> +}
>> +
>>   struct vgic_irq *vgic_get_irq(struct domain *d, struct vcpu *vcpu,
>>                                 u32 intid);
>>   void vgic_put_irq(struct domain *d, struct vgic_irq *irq);
>> +bool vgic_queue_irq_unlock(struct domain *d, struct vgic_irq *irq,
>> +               unsigned long flags);
>>     static inline void vgic_get_irq_kref(struct vgic_irq *irq)
>>   {
>>
> 
> Cheers,
>
Julien Grall Feb. 27, 2018, 10:43 a.m. UTC | #3
On 27/02/18 10:17, Andre Przywara wrote:
> Hi,

Hi Andre,

> On 12/02/18 18:59, Julien Grall wrote:
>> On 09/02/18 14:39, Andre Przywara wrote:
>>>    /*
>>>     * Iterate over the VM's list of mapped LPIs to find the one with a
>>>     * matching interrupt ID and return a reference to the IRQ structure.
>>> @@ -97,6 +123,204 @@ void vgic_put_irq(struct domain *d, struct
>>> vgic_irq *irq)
>>>        xfree(irq);
>>>    }
>>>    +/**
>>> + * vgic_target_oracle - compute the target vcpu for an irq
>>> + *
>>> + * @irq:    The irq to route. Must be already locked.
>>> + *
>>> + * Based on the current state of the interrupt (enabled, pending,
>>> + * active, vcpu and target_vcpu), compute the next vcpu this should be
>>> + * given to. Return NULL if this shouldn't be injected at all.
>>> + *
>>> + * Requires the IRQ lock to be held.
>>> + */
>>> +static struct vcpu *vgic_target_oracle(struct vgic_irq *irq)
>>> +{
>>> +    ASSERT(spin_is_locked(&irq->irq_lock));
>>> +
>>> +    /* If the interrupt is active, it must stay on the current vcpu */
>>> +    if ( irq->active )
>>> +        return irq->vcpu ? : irq->target_vcpu;
>> I am not sure to understand why you check whether irq->vcpu is NULL. If
>> the interrupt is active, then irq->vcpu should be NULL. Did I miss
>                                                  ^ "not NULL", you mean?

Yes, not NULL.

>> anything?
> 
> Not if it has been explicitly activated via ISACTIVER. This is not
> implemented in Xen at the moment, but would be in the future. So I like
> to keep this in.

Oh, I missed that case. Thank you for the explanation :).

[...]

> 
> The list is not sorted on insertion because this is not necessary most
> of the time. In fact the hardware VGIC will do the sorting (kind of)
> within the LRs. So as long as we don't have more than <number of LRS>
> IRQs in the list, sorting is a waste of time. Experiments in the past
> showed that the number of used LRs is less than 4 almost every time. And
> since 4 is the mostly used number of implemented LRs, we virtually never
> need the sorting. So we avoid doing that on every insertion, instead
> doing that only if it's necessary.
> 
>> I can foresee quite a few issues with this choice on Xen:
>>      1) You compute the size of ap list in vgic_flush_lr_state() and take
>> lock on every IRQ one by one. A guest could be nasty and make that list
>> quite big by make IRQs pending but never "active" them (i.e read IAR).
> 
> Yeah, we could try to shortcut a bit here.
> 
>>      2) This might be an issue while checking whether you need to deliver
>> an interrupt (vgic_vcpu_pending_irq) because the list is not sorted.
> 
> Most of the time the list is very short, storing one, two or actually no
> IRQs. In the function where we check for pending IRQs we bail out as
> soon as we found the first eligible interrupt. So sorting does not help
> in the majority of cases.

As you say: "most of the time". Malicious guests are unusual, but just 
frequent enough to keep both the hypervisor and the security team busy.

> 
> If you are really concerned about that list growing too long, we could
> think about mitigations:
> 1) Try to avoid iterating the whole list while checking whether it needs
> to be sorted.
> 2) Store a flag that notes if the list has already been sorted. As long
> as we don't change anything, we don't need to sort again. Would be good
> to test whether this is actually helpful. But we would need to keep this
> flag up-to-date, which sounds a bit fragile to get right.
> 3) Switch to sort-on-insertion once we reached a certain number of IRQs
> on the list, to mitigate DOS attacks from the guest. This should avoid
> list iterations in hot paths, with IRQs disabled.
> 
> But all of these sound a bit hackish to me and just would spoil the very
> clean and robust code we have today. Also I am not sure we can avoid
> list iterations every time (for instance in prune_ap_list()). There is
> an upper limit today (number of SPIs), so we might be happy with that
> for now.
> To be honest I would very much dislike changing the code at this point.
> I believe a patch series afterwards would be better, also to actually
> have some numbers on the impact of this.

More robust than the current vGIC, yes. However, clean code is not an 
excuse to dismiss a valid (but unusual) use case. So even though I quite 
like the new vGIC, I really don't want to deal with yet another security 
issue.

Thankfully the passthrough case is not currently security supported (see 
SUPPORT.md), so we can probably defer this, although I would like to keep 
track of the known pitfalls of the new vGIC.

Cheers,

Patch

diff --git a/xen/arch/arm/vgic/vgic.c b/xen/arch/arm/vgic/vgic.c
index 3075091caa..f517df6d00 100644
--- a/xen/arch/arm/vgic/vgic.c
+++ b/xen/arch/arm/vgic/vgic.c
@@ -21,6 +21,32 @@ 
 #include <asm/arm_vgic.h>
 #include "vgic.h"
 
+/*
+ * Locking order is always:
+ * kvm->lock (mutex)
+ *   its->cmd_lock (mutex)
+ *     its->its_lock (mutex)
+ *       vgic_cpu->ap_list_lock
+ *         kvm->lpi_list_lock
+ *           vgic_irq->irq_lock
+ *
+ * If you need to take multiple locks, always take the upper lock first,
+ * then the lower ones, e.g. first take the its_lock, then the irq_lock.
+ * If you are already holding a lock and need to take a higher one, you
+ * have to drop the lower ranking lock first and re-aquire it after having
+ * taken the upper one.
+ *
+ * When taking more than one ap_list_lock at the same time, always take the
+ * lowest numbered VCPU's ap_list_lock first, so:
+ *   vcpuX->vcpu_id < vcpuY->vcpu_id:
+ *     spin_lock(vcpuX->arch.vgic_cpu.ap_list_lock);
+ *     spin_lock(vcpuY->arch.vgic_cpu.ap_list_lock);
+ *
+ * Since the VGIC must support injecting virtual interrupts from ISRs, we have
+ * to use the spin_lock_irqsave/spin_unlock_irqrestore versions of outer
+ * spinlocks for any lock that may be taken while injecting an interrupt.
+ */
+
 /*
  * Iterate over the VM's list of mapped LPIs to find the one with a
  * matching interrupt ID and return a reference to the IRQ structure.
@@ -97,6 +123,204 @@  void vgic_put_irq(struct domain *d, struct vgic_irq *irq)
     xfree(irq);
 }
 
+/**
+ * vgic_target_oracle - compute the target vcpu for an irq
+ *
+ * @irq:    The irq to route. Must be already locked.
+ *
+ * Based on the current state of the interrupt (enabled, pending,
+ * active, vcpu and target_vcpu), compute the next vcpu this should be
+ * given to. Return NULL if this shouldn't be injected at all.
+ *
+ * Requires the IRQ lock to be held.
+ */
+static struct vcpu *vgic_target_oracle(struct vgic_irq *irq)
+{
+    ASSERT(spin_is_locked(&irq->irq_lock));
+
+    /* If the interrupt is active, it must stay on the current vcpu */
+    if ( irq->active )
+        return irq->vcpu ? : irq->target_vcpu;
+
+    /*
+     * If the IRQ is not active but enabled and pending, we should direct
+     * it to its configured target VCPU.
+     * If the distributor is disabled, pending interrupts shouldn't be
+     * forwarded.
+     */
+    if ( irq->enabled && irq_is_pending(irq) )
+    {
+        if ( unlikely(irq->target_vcpu &&
+                 !irq->target_vcpu->domain->arch.vgic.enabled) )
+            return NULL;
+
+        return irq->target_vcpu;
+    }
+
+    /* If neither active nor pending and enabled, then this IRQ should not
+     * be queued to any VCPU.
+     */
+    return NULL;
+}
+
+/*
+ * Only valid injection if changing level for level-triggered IRQs or for a
+ * rising edge.
+ */
+static bool vgic_validate_injection(struct vgic_irq *irq, bool level)
+{
+    switch (irq->config)
+    {
+    case VGIC_CONFIG_LEVEL:
+        return irq->line_level != level;
+    case VGIC_CONFIG_EDGE:
+        return level;
+    }
+
+    return false;
+}
+
+/*
+ * Check whether an IRQ needs to (and can) be queued to a VCPU's ap list.
+ * Do the queuing if necessary, taking the right locks in the right order.
+ * Returns true when the IRQ was queued, false otherwise.
+ *
+ * Needs to be entered with the IRQ lock already held, but will return
+ * with all locks dropped.
+ */
+bool vgic_queue_irq_unlock(struct domain *d, struct vgic_irq *irq,
+               unsigned long flags)
+{
+    struct vcpu *vcpu;
+    bool running;
+
+    ASSERT(spin_is_locked(&irq->irq_lock));
+
+retry:
+    vcpu = vgic_target_oracle(irq);
+    if ( irq->vcpu || !vcpu )
+    {
+        /*
+         * If this IRQ is already on a VCPU's ap_list, then it
+         * cannot be moved or modified and there is no more work for
+         * us to do.
+         *
+         * Otherwise, if the irq is not pending and enabled, it does
+         * not need to be inserted into an ap_list and there is also
+         * no more work for us to do.
+         */
+        spin_unlock_irqrestore(&irq->irq_lock, flags);
+
+        /*
+         * We have to kick the VCPU here, because we could be
+         * queueing an edge-triggered interrupt for which we
+         * get no EOI maintenance interrupt. In that case,
+         * while the IRQ is already on the VCPU's AP list, the
+         * VCPU could have EOI'ed the original interrupt and
+         * won't see this one until it exits for some other
+         * reason.
+         */
+        if ( vcpu )
+            vcpu_unblock(vcpu);
+        return false;
+    }
+
+    /*
+     * We must unlock the irq lock to take the ap_list_lock where
+     * we are going to insert this new pending interrupt.
+     */
+    spin_unlock_irqrestore(&irq->irq_lock, flags);
+
+    /* someone can do stuff here, which we re-check below */
+
+    spin_lock_irqsave(&vcpu->arch.vgic_cpu.ap_list_lock, flags);
+    spin_lock(&irq->irq_lock);
+
+    /*
+     * Did something change behind our backs?
+     *
+     * There are two cases:
+     * 1) The irq lost its pending state or was disabled behind our
+     *    backs and/or it was queued to another VCPU's ap_list.
+     * 2) Someone changed the affinity on this irq behind our
+     *    backs and we are now holding the wrong ap_list_lock.
+     *
+     * In both cases, drop the locks and retry.
+     */
+
+    if ( unlikely(irq->vcpu || vcpu != vgic_target_oracle(irq)) )
+    {
+        spin_unlock(&irq->irq_lock);
+        spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock, flags);
+
+        spin_lock_irqsave(&irq->irq_lock, flags);
+        goto retry;
+    }
+
+    /*
+     * Grab a reference to the irq to reflect the fact that it is
+     * now in the ap_list.
+     */
+    vgic_get_irq_kref(irq);
+    list_add_tail(&irq->ap_list, &vcpu->arch.vgic_cpu.ap_list_head);
+    irq->vcpu = vcpu;
+
+    spin_unlock(&irq->irq_lock);
+    spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock, flags);
+
+    running = vcpu->is_running;
+    vcpu_unblock(vcpu);
+    if ( running && vcpu != current )
+        smp_send_event_check_mask(cpumask_of(vcpu->processor));
+
+    return true;
+}
+
+/**
+ * vgic_inject_irq - Inject an IRQ from a device to the vgic
+ * @d:       The domain pointer
+ * @vcpu:    The vCPU for PPIs
+ * @intid:   The INTID to inject a new state to.
+ * @level:   Edge-triggered:  true:  to trigger the interrupt
+ *                false: to ignore the call
+ *       Level-sensitive  true:  raise the input signal
+ *                false: lower the input signal
+ *
+ * The VGIC is not concerned with devices being active-LOW or active-HIGH for
+ * level-sensitive interrupts.  You can think of the level parameter as 1
+ * being HIGH and 0 being LOW and all devices being active-HIGH.
+ */
+int vgic_inject_irq(struct domain *d, struct vcpu *vcpu, unsigned int intid,
+            bool level)
+{
+    struct vgic_irq *irq;
+    unsigned long flags;
+
+    irq = vgic_get_irq(d, vcpu, intid);
+    if ( !irq )
+        return -EINVAL;
+
+    spin_lock_irqsave(&irq->irq_lock, flags);
+
+    if ( !vgic_validate_injection(irq, level) )
+    {
+        /* Nothing to see here, move along... */
+        spin_unlock_irqrestore(&irq->irq_lock, flags);
+        vgic_put_irq(d, irq);
+        return 0;
+    }
+
+    if ( irq->config == VGIC_CONFIG_LEVEL )
+        irq->line_level = level;
+    else
+        irq->pending_latch = true;
+
+    vgic_queue_irq_unlock(d, irq, flags);
+    vgic_put_irq(d, irq);
+
+    return 0;
+}
+
 /*
  * Local variables:
  * mode: C
diff --git a/xen/arch/arm/vgic/vgic.h b/xen/arch/arm/vgic/vgic.h
index 7a15cfdd79..5127739f0f 100644
--- a/xen/arch/arm/vgic/vgic.h
+++ b/xen/arch/arm/vgic/vgic.h
@@ -17,9 +17,19 @@ 
 #ifndef __XEN_ARM_VGIC_NEW_H__
 #define __XEN_ARM_VGIC_NEW_H__
 
+static inline bool irq_is_pending(struct vgic_irq *irq)
+{
+    if ( irq->config == VGIC_CONFIG_EDGE )
+        return irq->pending_latch;
+    else
+        return irq->pending_latch || irq->line_level;
+}
+
 struct vgic_irq *vgic_get_irq(struct domain *d, struct vcpu *vcpu,
                               u32 intid);
 void vgic_put_irq(struct domain *d, struct vgic_irq *irq);
+bool vgic_queue_irq_unlock(struct domain *d, struct vgic_irq *irq,
+               unsigned long flags);
 
 static inline void vgic_get_irq_kref(struct vgic_irq *irq)
 {