
[Xen-devel,v4,2/4] xen/arm: inflight irqs during migration

Message ID: 1402076908-26740-2-git-send-email-stefano.stabellini@eu.citrix.com
State: New

Commit Message

Stefano Stabellini June 6, 2014, 5:48 p.m. UTC
We need to take special care when migrating irqs that are already
inflight from one vcpu to another. In fact the lr_pending and inflight
lists are per-vcpu. The lock we take to protect them is also per-vcpu.

In order to avoid issues, we set a new flag GIC_IRQ_GUEST_MIGRATING, so
that we can recognize when we receive an irq while the previous one is
still inflight (given that we are only dealing with hardware interrupts
here, it just means that its LR hasn't been cleared yet on the old vcpu).

If GIC_IRQ_GUEST_MIGRATING is set, we only set GIC_IRQ_GUEST_QUEUED and
interrupt the other vcpu. When clearing the LR on the old vcpu, we take
special care of injecting the interrupt into the new vcpu. To do that we
need to release the old vcpu lock and take the new vcpu lock.

Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
---
 xen/arch/arm/gic.c           |   23 +++++++++++++++++++++--
 xen/arch/arm/vgic.c          |   36 ++++++++++++++++++++++++++++++++++++
 xen/include/asm-arm/domain.h |    4 ++++
 3 files changed, 61 insertions(+), 2 deletions(-)

Comments

Ian Campbell June 10, 2014, 12:12 p.m. UTC | #1
On Fri, 2014-06-06 at 18:48 +0100, Stefano Stabellini wrote:
> We need to take special care when migrating irqs that are already
> inflight from one vcpu to another. In fact the lr_pending and inflight
> lists are per-vcpu. The lock we take to protect them is also per-vcpu.

Please can we get some references to the GIC spec to clarify/justify the
expected behaviour when writing ITARGETSRn while an affected interrupt is
pending.

> 
> In order to avoid issues, we set a new flag GIC_IRQ_GUEST_MIGRATING, so
> that we can recognize when we receive an irq while the previous one is
> still inflight (given that we are only dealing with hardware interrupts
> here, it just means that its LR hasn't been cleared yet on the old vcpu).
> 
> If GIC_IRQ_GUEST_MIGRATING is set, we only set GIC_IRQ_GUEST_QUEUED and
> interrupt the other vcpu.

other here being the vcpu which is currently handling or the new one
which will handle?

>  When clearing the LR on the old vcpu, we take
> special care of injecting the interrupt into the new vcpu. To do that we
> need to release the old vcpu lock and take the new vcpu lock.

That might be true but I'm afraid it needs an argument to be made
(rather than an assertion) about why it is safe to drop one lock before
taking the other and why it is correct to do so etc.

In particular I'm not sure what happens if v_target is messing with
ITARGETS at the same time as all this is going on.

I think it also warrants a comment in the code too.

Given that these migrations ought to be rare(ish) it might be simpler
(at least in terms of reasoning about it) for the clearing vcpu to
enqueue the irq onto a dedicated migration list which has its own lock.
Then the new target vcpu could walk that list itself and move things
onto its own lr_pending list.
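
Roughly, a sketch (the type, field and helper names below are invented
and untested):

    /* Hypothetical per-domain staging area for irqs in transit.
     * Assumes a new 'migration' list_head added to struct pending_irq. */
    struct vgic_migration {
        spinlock_t lock;
        struct list_head irqs;
    };

    /* Old vcpu, in gic_update_one_lr, holding only its own vgic lock: */
    static void park_migrating_irq(struct vgic_migration *m,
                                   struct pending_irq *p)
    {
        spin_lock(&m->lock);
        list_add_tail(&p->migration, &m->irqs);
        spin_unlock(&m->lock);
    }

    /* New target vcpu, e.g. on its next entry, holding its vgic lock: */
    static void drain_migrating_irqs(struct vcpu *v, struct vgic_migration *m)
    {
        struct pending_irq *p, *t;

        spin_lock(&m->lock);
        list_for_each_entry_safe ( p, t, &m->irqs, migration )
        {
            list_del_init(&p->migration);
            gic_add_to_lr_pending(v, p);
        }
        spin_unlock(&m->lock);
    }

That way the migration lock only ever nests inside a single vgic lock
and no vcpu ever has to take another vcpu's vgic lock.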

> 
> Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
> ---
>  xen/arch/arm/gic.c           |   23 +++++++++++++++++++++--
>  xen/arch/arm/vgic.c          |   36 ++++++++++++++++++++++++++++++++++++
>  xen/include/asm-arm/domain.h |    4 ++++
>  3 files changed, 61 insertions(+), 2 deletions(-)
> 
> diff --git a/xen/arch/arm/gic.c b/xen/arch/arm/gic.c
> index 08ae23b..92391b4 100644
> --- a/xen/arch/arm/gic.c
> +++ b/xen/arch/arm/gic.c
> @@ -677,10 +677,29 @@ static void gic_update_one_lr(struct vcpu *v, int i)
>          clear_bit(GIC_IRQ_GUEST_ACTIVE, &p->status);
>          p->lr = GIC_INVALID_LR;
>          if ( test_bit(GIC_IRQ_GUEST_ENABLED, &p->status) &&
> -             test_bit(GIC_IRQ_GUEST_QUEUED, &p->status) )
> +             test_bit(GIC_IRQ_GUEST_QUEUED, &p->status) &&
> +             !test_bit(GIC_IRQ_GUEST_MIGRATING, &p->status) )
>              gic_raise_guest_irq(v, irq, p->priority);
> -        else
> +        else {
>              list_del_init(&p->inflight);
> +
> +            if ( test_and_clear_bit(GIC_IRQ_GUEST_MIGRATING, &p->status) &&
> +                 test_bit(GIC_IRQ_GUEST_QUEUED, &p->status) )
> +            {
> +                struct vcpu *v_target;
> +
> +                spin_unlock(&v->arch.vgic.lock);
> +                v_target = vgic_get_target_vcpu(v, irq);

Handle v == v_target?

> +                spin_lock(&v_target->arch.vgic.lock);
> +
> +                gic_add_to_lr_pending(v_target, p);
> +                if ( v_target->is_running )
> +                    smp_send_event_check_mask(cpumask_of(v_target->processor));

Don't you also need to vcpu_unblock if it is sleeping?

> +                spin_unlock(&v_target->arch.vgic.lock);
> +                spin_lock(&v->arch.vgic.lock);
> +            }
> +        }
>      }
>  }
>  
> diff --git a/xen/arch/arm/vgic.c b/xen/arch/arm/vgic.c
> index e527892..54d3676 100644
> --- a/xen/arch/arm/vgic.c
> +++ b/xen/arch/arm/vgic.c
> @@ -377,6 +377,21 @@ read_as_zero:
>      return 1;
>  }
>  
> +static void vgic_migrate_irq(struct vcpu *old, struct vcpu *new, unsigned int irq)
> +{
> +    unsigned long flags;
> +    struct pending_irq *p = irq_to_pending(old, irq); 
> +
> +    /* nothing to do for virtual interrupts */
> +    if ( p->desc == NULL )
> +        return;
> +    
> +    spin_lock_irqsave(&old->arch.vgic.lock, flags);
> +    if ( !list_empty(&p->inflight) )
> +        set_bit(GIC_IRQ_GUEST_MIGRATING, &p->status);
> +    spin_unlock_irqrestore(&old->arch.vgic.lock, flags);
> +}
> +
>  struct vcpu *vgic_get_target_vcpu(struct vcpu *v, unsigned int irq)
>  {
>      int target;
> @@ -629,6 +644,21 @@ static int vgic_distr_mmio_write(struct vcpu *v, mmio_info_t *info)
>              }
>              i++;
>          }
> +        i = 0;
> +        while ( (i = find_next_bit((const unsigned long *) &tr, 32, i)) < 32 )

bit ops alignment issues?

I think you should use old_tr^new_tr to only process bits which have
changed.

> +        {
> +            unsigned int irq, target, old_target;
> +            struct vcpu *v_target, *v_old;
> +
> +            target = i % 8;
> +
> +            irq = offset + (i / 8);
> +            v_target = v->domain->vcpu[target];
> +            old_target = byte_read(rank->itargets[REG_RANK_INDEX(8, gicd_reg - GICD_ITARGETSR)], 0, i/8);
> +            v_old = v->domain->vcpu[old_target];

v_target and v_old might be the same.

> +            vgic_migrate_irq(v_old, v_target, irq);
> +            i += 8 - target;
> +        }
>          if ( dabt.size == 2 )
>              rank->itargets[REG_RANK_INDEX(8, gicd_reg - GICD_ITARGETSR)] = *r;
>          else
Stefano Stabellini June 11, 2014, 2:15 p.m. UTC | #2
On Tue, 10 Jun 2014, Ian Campbell wrote:
> On Fri, 2014-06-06 at 18:48 +0100, Stefano Stabellini wrote:
> > We need to take special care when migrating irqs that are already
> > inflight from one vcpu to another. In fact the lr_pending and inflight
> > lists are per-vcpu. The lock we take to protect them is also per-vcpu.
> 
> Please can we get some references to the GIC spec to clarify/justify the
> expected behaviour when writing ITARGETSRn while an affected interrupt is
> pending.

OK


> > In order to avoid issues, we set a new flag GIC_IRQ_GUEST_MIGRATING, so
> > that we can recognize when we receive an irq while the previous one is
> > still inflight (given that we are only dealing with hardware interrupts
> > here, it just means that its LR hasn't been cleared yet on the old vcpu).
> > 
> > If GIC_IRQ_GUEST_MIGRATING is set, we only set GIC_IRQ_GUEST_QUEUED and
> > interrupt the other vcpu.
> 
> other here being the vcpu which is currently handling or the new one
> which will handle?

"Other" is the old vcpu, where we still need to clear the LR.
I made it clear in the commit message.


> >  When clearing the LR on the old vcpu, we take
> > special care of injecting the interrupt into the new vcpu. To do that we
> > need to release the old vcpu lock and take the new vcpu lock.
> 
> That might be true but I'm afraid it needs an argument to be made
> (rather than an assertion) about why it is safe to drop one lock before
> taking the other and why it is correct to do so etc.
 
It is OK to drop the vgic lock of the current vcpu because we have
already dealt with the current LR. We don't need to hold the vgic lock
from one LR to the next, but we keep it for simplicity.

I'll improve the comments.
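
Something along these lines, as a sketch (the comment text is a
paraphrase of the argument above, not final wording):

    /*
     * At this point the LR has been cleared and p has been removed
     * from v's inflight list, so nothing on v still references it.
     * It is therefore safe to drop v's vgic lock before taking the
     * target's: we never hold two vgic locks at once, so no lock
     * ordering can be violated.  vgic_get_target_vcpu takes the rank
     * lock, so the target we read is consistent even against a
     * concurrent ITARGETSR write.
     */
    spin_unlock(&v->arch.vgic.lock);
    v_target = vgic_get_target_vcpu(v, irq);
    spin_lock(&v_target->arch.vgic.lock);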


> In particular I'm not sure what happens if v_target is messing with
> ITARGETS at the same time as all this is going on.

vgic_get_target_vcpu takes the rank lock so the view of the target vcpu
is consistent.


> I think it also warrants a comment in the code too.
> 
> Given that these migrations ought to be rare(ish) it might be simpler
> (at least in terms of reasoning about it) for the clearing vcpu to
> enqueue the irq onto a dedicated migration list which has its own lock.
> Then the new target vcpu could walk that list itself and move things
> onto its own lr_pending list.
>
> > 
> > Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
> > ---
> >  xen/arch/arm/gic.c           |   23 +++++++++++++++++++++--
> >  xen/arch/arm/vgic.c          |   36 ++++++++++++++++++++++++++++++++++++
> >  xen/include/asm-arm/domain.h |    4 ++++
> >  3 files changed, 61 insertions(+), 2 deletions(-)
> > 
> > diff --git a/xen/arch/arm/gic.c b/xen/arch/arm/gic.c
> > index 08ae23b..92391b4 100644
> > --- a/xen/arch/arm/gic.c
> > +++ b/xen/arch/arm/gic.c
> > @@ -677,10 +677,29 @@ static void gic_update_one_lr(struct vcpu *v, int i)
> >          clear_bit(GIC_IRQ_GUEST_ACTIVE, &p->status);
> >          p->lr = GIC_INVALID_LR;
> >          if ( test_bit(GIC_IRQ_GUEST_ENABLED, &p->status) &&
> > -             test_bit(GIC_IRQ_GUEST_QUEUED, &p->status) )
> > +             test_bit(GIC_IRQ_GUEST_QUEUED, &p->status) &&
> > +             !test_bit(GIC_IRQ_GUEST_MIGRATING, &p->status) )
> >              gic_raise_guest_irq(v, irq, p->priority);
> > -        else
> > +        else {
> >              list_del_init(&p->inflight);
> > +
> > +            if ( test_and_clear_bit(GIC_IRQ_GUEST_MIGRATING, &p->status) &&
> > +                 test_bit(GIC_IRQ_GUEST_QUEUED, &p->status) )
> > +            {
> > +                struct vcpu *v_target;
> > +
> > +                spin_unlock(&v->arch.vgic.lock);
> > +                v_target = vgic_get_target_vcpu(v, irq);
> 
> Handle v == v_target?

Should work


> > +                spin_lock(&v_target->arch.vgic.lock);
> > +
> > +                gic_add_to_lr_pending(v_target, p);
> > +                if ( v_target->is_running )
> > +                    smp_send_event_check_mask(cpumask_of(v_target->processor));
> 
> Don't you also need to vcpu_unblock if it is sleeping?

Good point
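
Something like this, perhaps (a sketch, not the final code):

    gic_add_to_lr_pending(v_target, p);
    /* Wake the target if it is blocked (vcpu_unblock is a no-op on a
     * runnable vcpu) and kick it if it is running on another pcpu. */
    vcpu_unblock(v_target);
    if ( v_target->is_running && v_target != current )
        smp_send_event_check_mask(cpumask_of(v_target->processor));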


> > +                spin_unlock(&v_target->arch.vgic.lock);
> > +                spin_lock(&v->arch.vgic.lock);
> > +            }
> > +        }
> >      }
> >  }
> >  
> > diff --git a/xen/arch/arm/vgic.c b/xen/arch/arm/vgic.c
> > index e527892..54d3676 100644
> > --- a/xen/arch/arm/vgic.c
> > +++ b/xen/arch/arm/vgic.c
> > @@ -377,6 +377,21 @@ read_as_zero:
> >      return 1;
> >  }
> >  
> > +static void vgic_migrate_irq(struct vcpu *old, struct vcpu *new, unsigned int irq)
> > +{
> > +    unsigned long flags;
> > +    struct pending_irq *p = irq_to_pending(old, irq); 
> > +
> > +    /* nothing to do for virtual interrupts */
> > +    if ( p->desc == NULL )
> > +        return;
> > +    
> > +    spin_lock_irqsave(&old->arch.vgic.lock, flags);
> > +    if ( !list_empty(&p->inflight) )
> > +        set_bit(GIC_IRQ_GUEST_MIGRATING, &p->status);
> > +    spin_unlock_irqrestore(&old->arch.vgic.lock, flags);
> > +}
> > +
> >  struct vcpu *vgic_get_target_vcpu(struct vcpu *v, unsigned int irq)
> >  {
> >      int target;
> > @@ -629,6 +644,21 @@ static int vgic_distr_mmio_write(struct vcpu *v, mmio_info_t *info)
> >              }
> >              i++;
> >          }
> > +        i = 0;
> > +        while ( (i = find_next_bit((const unsigned long *) &tr, 32, i)) < 32 )
> 
> bit ops alignment issues?
> 
> I think you should use old_tr^new_tr to only process bits which have
> changed.
> 
> > +        {
> > +            unsigned int irq, target, old_target;
> > +            struct vcpu *v_target, *v_old;
> > +
> > +            target = i % 8;
> > +
> > +            irq = offset + (i / 8);
> > +            v_target = v->domain->vcpu[target];
> > +            old_target = byte_read(rank->itargets[REG_RANK_INDEX(8, gicd_reg - GICD_ITARGETSR)], 0, i/8);
> > +            v_old = v->domain->vcpu[old_target];
> 
> v_target and v_old might be the same.

No, they could not: if they were, find_next_bit wouldn't find the bit set.


> > +            vgic_migrate_irq(v_old, v_target, irq);
> > +            i += 8 - target;
> > +        }
> >          if ( dabt.size == 2 )
> >              rank->itargets[REG_RANK_INDEX(8, gicd_reg - GICD_ITARGETSR)] = *r;
> >          else
> 
>
Julien Grall June 11, 2014, 2:28 p.m. UTC | #3
On 06/11/2014 03:15 PM, Stefano Stabellini wrote:
>>> +        {
>>> +            unsigned int irq, target, old_target;
>>> +            struct vcpu *v_target, *v_old;
>>> +
>>> +            target = i % 8;
>>> +
>>> +            irq = offset + (i / 8);
>>> +            v_target = v->domain->vcpu[target];
>>> +            old_target = byte_read(rank->itargets[REG_RANK_INDEX(8, gicd_reg - GICD_ITARGETSR)], 0, i/8);
>>> +            v_old = v->domain->vcpu[old_target];
>>
>> v_target and v_old might be the same.
> 
> No, they could not: if they were, find_next_bit wouldn't find the bit set.

Even though v_target is always != v_old (because of the tr = r & ~val
stuff), why do you migrate if the old_target will be in the new mask?

BTW, this code suffers from the same issue as #1, i.e. this register can
be accessed by byte.

Regards,
Stefano Stabellini June 11, 2014, 2:49 p.m. UTC | #4
On Wed, 11 Jun 2014, Julien Grall wrote:
> On 06/11/2014 03:15 PM, Stefano Stabellini wrote:
> >>> +        {
> >>> +            unsigned int irq, target, old_target;
> >>> +            struct vcpu *v_target, *v_old;
> >>> +
> >>> +            target = i % 8;
> >>> +
> >>> +            irq = offset + (i / 8);
> >>> +            v_target = v->domain->vcpu[target];
> >>> +            old_target = byte_read(rank->itargets[REG_RANK_INDEX(8, gicd_reg - GICD_ITARGETSR)], 0, i/8);
> >>> +            v_old = v->domain->vcpu[old_target];
> >>
> >> v_target and v_old might be the same.
> > 
> > No, they could not: if they were, find_next_bit wouldn't find the bit set.
> 
> Even though v_target is always != v_old (because of the tr = r & ~val
> stuff), why do you migrate if the old_target will be in the new mask?

We need to be consistent: always the lowest vcpu in the mask.


> BTW, this code suffers from the same issue as #1, i.e. this register can be
> accessed by byte.

I followed Ian's suggestion and I masked the register according to
v->domain->max_vcpus.
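
Roughly like this, as a sketch (old_tr/new_tr/changed are my names):

    /* Each irq has at most 8 target bits in its byte; mask out vcpus
     * the domain does not have, then only visit bytes whose target
     * set actually changed. */
    uint32_t valid = ((1U << min_t(unsigned int,
                                   v->domain->max_vcpus, 8)) - 1) * 0x01010101U;
    uint32_t old_tr = rank->itargets[REG_RANK_INDEX(8, gicd_reg - GICD_ITARGETSR)];
    uint32_t new_tr = *r & valid;
    /* a real unsigned long sidesteps the find_next_bit alignment concern */
    unsigned long changed = old_tr ^ new_tr;
    unsigned int b;

    for ( b = 0; b < 4; b++ )
    {
        if ( !(changed & (0xffUL << (b * 8))) )
            continue;
        /* derive the old and new target of irq (offset + b) from
         * old_tr/new_tr and migrate here ... */
    }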

> Regards,
> 
> -- 
> Julien Grall
>
Julien Grall June 11, 2014, 3:16 p.m. UTC | #5
On 06/11/2014 03:49 PM, Stefano Stabellini wrote:
> On Wed, 11 Jun 2014, Julien Grall wrote:
>> On 06/11/2014 03:15 PM, Stefano Stabellini wrote:
>>>>> +        {
>>>>> +            unsigned int irq, target, old_target;
>>>>> +            struct vcpu *v_target, *v_old;
>>>>> +
>>>>> +            target = i % 8;
>>>>> +
>>>>> +            irq = offset + (i / 8);
>>>>> +            v_target = v->domain->vcpu[target];
>>>>> +            old_target = byte_read(rank->itargets[REG_RANK_INDEX(8, gicd_reg - GICD_ITARGETSR)], 0, i/8);
>>>>> +            v_old = v->domain->vcpu[old_target];
>>>>
>>>> v_target and v_old might be the same.
>>>
>>> No, they could not: if they were, find_next_bit wouldn't find the bit set.
>>
>> Even though v_target is always != v_old (because of the tr = r & ~val
>> stuff), why do you migrate if the old_target will be in the new mask?
> 
> We need to be consistent: always the lowest vcpu in the mask.

It's not the case here. AFAIU tr only contains the list of VCPUs that are
not yet in this register.

The old vcpu can have a cpuid lower than the new vcpu. In this case,
you won't respect your consistency.

Regards,
Stefano Stabellini June 11, 2014, 3:55 p.m. UTC | #6
On Wed, 11 Jun 2014, Julien Grall wrote:
> On 06/11/2014 03:49 PM, Stefano Stabellini wrote:
> > On Wed, 11 Jun 2014, Julien Grall wrote:
> >> On 06/11/2014 03:15 PM, Stefano Stabellini wrote:
> >>>>> +        {
> >>>>> +            unsigned int irq, target, old_target;
> >>>>> +            struct vcpu *v_target, *v_old;
> >>>>> +
> >>>>> +            target = i % 8;
> >>>>> +
> >>>>> +            irq = offset + (i / 8);
> >>>>> +            v_target = v->domain->vcpu[target];
> >>>>> +            old_target = byte_read(rank->itargets[REG_RANK_INDEX(8, gicd_reg - GICD_ITARGETSR)], 0, i/8);
> >>>>> +            v_old = v->domain->vcpu[old_target];
> >>>>
> >>>> v_target and v_old might be the same.
> >>>
> >>> No, they could not: if they were, find_next_bit wouldn't find the bit set.
> >>
> >> Even though v_target is always != v_old (because of the tr = r & ~val
> >> stuff), why do you migrate if the old_target will be in the new mask?
> > 
> > We need to be consistent: always the lowest vcpu in the mask.
> 
> It's not the case here. AFAIU tr only contains the list of VCPUs that are
> not yet in this register.
> 
> The old vcpu can have a cpuid lower than the new vcpu. In this case,
> you won't respect your consistency.

You are right. I'll rewrite this code.
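
Probably something like this (a sketch; old_tr/new_tr are the masked
old and new register values, b the byte index, as in the discussion
above, and both bytes are assumed non-zero after masking):

    /* Be consistent with delivery: always pick the lowest vcpu set in
     * the *new* target byte, and compare it with the lowest vcpu in
     * the old byte, instead of looking only at bits that changed. */
    unsigned long old_mask = byte_read(old_tr, 0, b);
    unsigned long new_mask = byte_read(new_tr, 0, b);
    unsigned int old_target = find_first_bit(&old_mask, 8);
    unsigned int new_target = find_first_bit(&new_mask, 8);

    if ( old_target != new_target )
        vgic_migrate_irq(v->domain->vcpu[old_target],
                         v->domain->vcpu[new_target],
                         offset + b);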

Patch

diff --git a/xen/arch/arm/gic.c b/xen/arch/arm/gic.c
index 08ae23b..92391b4 100644
--- a/xen/arch/arm/gic.c
+++ b/xen/arch/arm/gic.c
@@ -677,10 +677,29 @@ static void gic_update_one_lr(struct vcpu *v, int i)
         clear_bit(GIC_IRQ_GUEST_ACTIVE, &p->status);
         p->lr = GIC_INVALID_LR;
         if ( test_bit(GIC_IRQ_GUEST_ENABLED, &p->status) &&
-             test_bit(GIC_IRQ_GUEST_QUEUED, &p->status) )
+             test_bit(GIC_IRQ_GUEST_QUEUED, &p->status) &&
+             !test_bit(GIC_IRQ_GUEST_MIGRATING, &p->status) )
             gic_raise_guest_irq(v, irq, p->priority);
-        else
+        else {
             list_del_init(&p->inflight);
+
+            if ( test_and_clear_bit(GIC_IRQ_GUEST_MIGRATING, &p->status) &&
+                 test_bit(GIC_IRQ_GUEST_QUEUED, &p->status) )
+            {
+                struct vcpu *v_target;
+
+                spin_unlock(&v->arch.vgic.lock);
+                v_target = vgic_get_target_vcpu(v, irq);
+                spin_lock(&v_target->arch.vgic.lock);
+
+                gic_add_to_lr_pending(v_target, p);
+                if ( v_target->is_running )
+                    smp_send_event_check_mask(cpumask_of(v_target->processor));
+
+                spin_unlock(&v_target->arch.vgic.lock);
+                spin_lock(&v->arch.vgic.lock);
+            }
+        }
     }
 }
 
diff --git a/xen/arch/arm/vgic.c b/xen/arch/arm/vgic.c
index e527892..54d3676 100644
--- a/xen/arch/arm/vgic.c
+++ b/xen/arch/arm/vgic.c
@@ -377,6 +377,21 @@ read_as_zero:
     return 1;
 }
 
+static void vgic_migrate_irq(struct vcpu *old, struct vcpu *new, unsigned int irq)
+{
+    unsigned long flags;
+    struct pending_irq *p = irq_to_pending(old, irq); 
+
+    /* nothing to do for virtual interrupts */
+    if ( p->desc == NULL )
+        return;
+    
+    spin_lock_irqsave(&old->arch.vgic.lock, flags);
+    if ( !list_empty(&p->inflight) )
+        set_bit(GIC_IRQ_GUEST_MIGRATING, &p->status);
+    spin_unlock_irqrestore(&old->arch.vgic.lock, flags);
+}
+
 struct vcpu *vgic_get_target_vcpu(struct vcpu *v, unsigned int irq)
 {
     int target;
@@ -629,6 +644,21 @@ static int vgic_distr_mmio_write(struct vcpu *v, mmio_info_t *info)
             }
             i++;
         }
+        i = 0;
+        while ( (i = find_next_bit((const unsigned long *) &tr, 32, i)) < 32 )
+        {
+            unsigned int irq, target, old_target;
+            struct vcpu *v_target, *v_old;
+
+            target = i % 8;
+
+            irq = offset + (i / 8);
+            v_target = v->domain->vcpu[target];
+            old_target = byte_read(rank->itargets[REG_RANK_INDEX(8, gicd_reg - GICD_ITARGETSR)], 0, i/8);
+            v_old = v->domain->vcpu[old_target];
+            vgic_migrate_irq(v_old, v_target, irq);
+            i += 8 - target;
+        }
         if ( dabt.size == 2 )
             rank->itargets[REG_RANK_INDEX(8, gicd_reg - GICD_ITARGETSR)] = *r;
         else
@@ -771,6 +801,12 @@ void vgic_vcpu_inject_irq(struct vcpu *v, unsigned int irq)
 
     spin_lock_irqsave(&v->arch.vgic.lock, flags);
 
+    if ( test_bit(GIC_IRQ_GUEST_MIGRATING, &n->status) )
+    {
+        set_bit(GIC_IRQ_GUEST_QUEUED, &n->status);
+        goto out;
+    }
+
     if ( !list_empty(&n->inflight) )
     {
         set_bit(GIC_IRQ_GUEST_QUEUED, &n->status);
diff --git a/xen/include/asm-arm/domain.h b/xen/include/asm-arm/domain.h
index d689675..743c020 100644
--- a/xen/include/asm-arm/domain.h
+++ b/xen/include/asm-arm/domain.h
@@ -54,11 +54,15 @@ struct pending_irq
      * GIC_IRQ_GUEST_ENABLED: the guest IRQ is enabled at the VGICD
      * level (GICD_ICENABLER/GICD_ISENABLER).
      *
+     * GIC_IRQ_GUEST_MIGRATING: the irq is being migrated to a different
+     * vcpu.
+     *
      */
 #define GIC_IRQ_GUEST_QUEUED   0
 #define GIC_IRQ_GUEST_ACTIVE   1
 #define GIC_IRQ_GUEST_VISIBLE  2
 #define GIC_IRQ_GUEST_ENABLED  3
+#define GIC_IRQ_GUEST_MIGRATING   4
     unsigned long status;
    struct irq_desc *desc; /* only set if the irq corresponds to a physical irq */
     int irq;