[Xen-devel,for-4.12,v2,15/17] xen/arm: p2m: Add support for preemption in p2m_cache_flush_range

Message ID 20181204202651.8836-16-julien.grall@arm.com
State Superseded
Series xen/arm: Implement Set/Way operations

Commit Message

Julien Grall Dec. 4, 2018, 8:26 p.m. UTC
p2m_cache_flush_range does not yet support preemption. This may be an
issue as cleaning the cache can take a long time. While the current
caller (XEN_DOMCTL_cacheflush) does not strictly require preemption, it
will be necessary for a new caller introduced in a follow-up patch.

The preemption implemented is quite simple. A counter is incremented by:
    - 1 on region skipped
    - 10 for each page requiring a flush

When the counter reaches 512 or above, we check whether preemption is
needed. If not, the counter is reset to 0. If it is, the function
stops, updates start (to allow resuming later on) and returns
-ERESTART. This allows the caller to decide how the preemption is
handled.
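
A preemption-aware caller could resume the operation along these lines
(illustrative sketch only; process_pending_softirqs() is just one possible
way to yield, not what XEN_DOMCTL_cacheflush does):

    gfn_t cur = start;
    int rc;

    for ( ; ; )
    {
        rc = p2m_cache_flush_range(d, &cur, end);
        if ( rc != -ERESTART )
            break;
        /* 'cur' has been updated, so give softirqs a chance and resume. */
        process_pending_softirqs();
    }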

For now, XEN_DOMCTL_cacheflush will continue to ignore preemption.

Signed-off-by: Julien Grall <julien.grall@arm.com>

---
    Changes in v2:
        - Patch added
---
 xen/arch/arm/domctl.c     |  8 +++++++-
 xen/arch/arm/p2m.c        | 35 ++++++++++++++++++++++++++++++++---
 xen/include/asm-arm/p2m.h |  4 +++-
 3 files changed, 42 insertions(+), 5 deletions(-)

Comments

Stefano Stabellini Dec. 6, 2018, 11:32 p.m. UTC | #1
On Tue, 4 Dec 2018, Julien Grall wrote:
> p2m_cache_flush_range does not yet support preemption, this may be an
> issue as cleaning the cache can take a long time. While the current
> caller (XEN_DOMCTL_cacheflush) does not stricly require preemption, this
> will be necessary for new caller in a follow-up patch.
> 
> The preemption implemented is quite simple, a counter is incremented by:
>     - 1 on region skipped
>     - 10 for each page requiring a flush
> 
> When the counter reach 512 or above, we will check if preemption is
> needed. If not, the counter will be reset to 0. If yes, the function
> will stop, update start (to allow resuming later on) and return
> -ERESTART. This allows the caller to decide how the preemption will be
> done.
> 
> For now, XEN_DOMCTL_cacheflush will continue to ignore the preemption.
> 
> Signed-off-by: Julien Grall <julien.grall@arm.com>
> 
> ---
>     Changes in v2:
>         - Patch added
> ---
>  xen/arch/arm/domctl.c     |  8 +++++++-
>  xen/arch/arm/p2m.c        | 35 ++++++++++++++++++++++++++++++++---
>  xen/include/asm-arm/p2m.h |  4 +++-
>  3 files changed, 42 insertions(+), 5 deletions(-)
> 
> diff --git a/xen/arch/arm/domctl.c b/xen/arch/arm/domctl.c
> index 20691528a6..9da88b8c64 100644
> --- a/xen/arch/arm/domctl.c
> +++ b/xen/arch/arm/domctl.c
> @@ -54,6 +54,7 @@ long arch_do_domctl(struct xen_domctl *domctl, struct domain *d,
>      {
>          gfn_t s = _gfn(domctl->u.cacheflush.start_pfn);
>          gfn_t e = gfn_add(s, domctl->u.cacheflush.nr_pfns);
> +        int rc;

This is unnecessary...


>          if ( domctl->u.cacheflush.nr_pfns > (1U<<MAX_ORDER) )
>              return -EINVAL;
> @@ -61,7 +62,12 @@ long arch_do_domctl(struct xen_domctl *domctl, struct domain *d,
>          if ( gfn_x(e) < gfn_x(s) )
>              return -EINVAL;
>  
> -        return p2m_cache_flush_range(d, s, e);
> +        /* XXX: Handle preemption */
> +        do
> +            rc = p2m_cache_flush_range(d, &s, e);
> +        while ( rc == -ERESTART );

... you can just do:

  while ( -ERESTART == p2m_cache_flush_range(d, &s, e) )

But given that it is just style, I'll leave it up to you.


> +        return rc;
>      }
>      case XEN_DOMCTL_bind_pt_irq:
>      {
> diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
> index db22b53bfd..ca9f0d9ebe 100644
> --- a/xen/arch/arm/p2m.c
> +++ b/xen/arch/arm/p2m.c
> @@ -1524,13 +1524,17 @@ int relinquish_p2m_mapping(struct domain *d)
>      return rc;
>  }
>  
> -int p2m_cache_flush_range(struct domain *d, gfn_t start, gfn_t end)
> +int p2m_cache_flush_range(struct domain *d, gfn_t *pstart, gfn_t end)
>  {
>      struct p2m_domain *p2m = p2m_get_hostp2m(d);
>      gfn_t next_block_gfn;
> +    gfn_t start = *pstart;
>      mfn_t mfn = INVALID_MFN;
>      p2m_type_t t;
>      unsigned int order;
> +    int rc = 0;
> +    /* Counter for preemption */
> +    unsigned long count = 0;
>  
>      /*
>       * The operation cache flush will invalidate the RAM assigned to the
> @@ -1547,6 +1551,25 @@ int p2m_cache_flush_range(struct domain *d, gfn_t start, gfn_t end)
>  
>      while ( gfn_x(start) < gfn_x(end) )
>      {
> +       /*
> +         * Cleaning the cache for the P2M may take a long time. So we
> +         * need to be able to preempt. We will arbitrarily preempt every
> +         * time count reach 512 or above.
> +
> +         *
> +         * The count will be incremented by:
> +         *  - 1 on region skipped
> +         *  - 10 for each page requiring a flush

Why this choice? A page flush should cost much more than 10x a region
skipped, more like 100x or 1000x. In fact, doing the full loop without
calling flush_page_to_ram should be cheap and fast, right? I would:

- not increase count on region skipped at all
- increase it by 1 on each page requiring a flush
- set the limit lower; with your proposal it would be about 50, though
  I am not sure what the limit should be


> +         */
> +        if ( count >= 512 )
> +        {
> +            if ( softirq_pending(smp_processor_id()) )
> +            {
> +                rc = -ERESTART;
> +                break;
> +            }
> +            count = 0;

No need to set count to 0 here


> +        }
> +
>          /*
>           * We want to flush page by page as:
>           *  - it may not be possible to map the full block (can be up to 1GB)
> @@ -1573,22 +1596,28 @@ int p2m_cache_flush_range(struct domain *d, gfn_t start, gfn_t end)
>               */
>              if ( mfn_eq(mfn, INVALID_MFN) || !p2m_is_any_ram(t) )
>              {
> +                count++;

This is just an iteration doing nothing; I would not increment count.

>                  start = next_block_gfn;
>                  continue;
>              }
>          }
>  
> +        count += 10;

This makes sense, but if we skip the count++ above, we might as well
just count++ here and have a lower limit.


>          flush_page_to_ram(mfn_x(mfn), false);
>  
>          start = gfn_add(start, 1);
>          mfn = mfn_add(mfn, 1);
>      }
>  
> -    invalidate_icache();
> +    if ( rc != -ERESTART )
> +        invalidate_icache();
>  
>      p2m_read_unlock(p2m);
>  
> -    return 0;
> +    *pstart = start;
> +
> +    return rc;
>  }
>  
>  mfn_t gfn_to_mfn(struct domain *d, gfn_t gfn)
> diff --git a/xen/include/asm-arm/p2m.h b/xen/include/asm-arm/p2m.h
> index 7c1d930b1d..a633e27cc9 100644
> --- a/xen/include/asm-arm/p2m.h
> +++ b/xen/include/asm-arm/p2m.h
> @@ -232,8 +232,10 @@ bool p2m_resolve_translation_fault(struct domain *d, gfn_t gfn);
>  /*
>   * Clean & invalidate caches corresponding to a region [start,end) of guest
>   * address space.
> + *
> + * start will get updated if the function is preempted.
>   */
> -int p2m_cache_flush_range(struct domain *d, gfn_t start, gfn_t end);
> +int p2m_cache_flush_range(struct domain *d, gfn_t *pstart, gfn_t end);
>  
>  /*
>   * Map a region in the guest p2m with a specific p2m type.
> -- 
> 2.11.0
>
Julien Grall Dec. 7, 2018, 11:15 a.m. UTC | #2
Hi Stefano,

On 06/12/2018 23:32, Stefano Stabellini wrote:
> On Tue, 4 Dec 2018, Julien Grall wrote:
>> p2m_cache_flush_range does not yet support preemption, this may be an
>> issue as cleaning the cache can take a long time. While the current
>> caller (XEN_DOMCTL_cacheflush) does not stricly require preemption, this
>> will be necessary for new caller in a follow-up patch.
>>
>> The preemption implemented is quite simple, a counter is incremented by:
>>      - 1 on region skipped
>>      - 10 for each page requiring a flush
>>
>> When the counter reach 512 or above, we will check if preemption is
>> needed. If not, the counter will be reset to 0. If yes, the function
>> will stop, update start (to allow resuming later on) and return
>> -ERESTART. This allows the caller to decide how the preemption will be
>> done.
>>
>> For now, XEN_DOMCTL_cacheflush will continue to ignore the preemption.
>>
>> Signed-off-by: Julien Grall <julien.grall@arm.com>
>>
>> ---
>>      Changes in v2:
>>          - Patch added
>> ---
>>   xen/arch/arm/domctl.c     |  8 +++++++-
>>   xen/arch/arm/p2m.c        | 35 ++++++++++++++++++++++++++++++++---
>>   xen/include/asm-arm/p2m.h |  4 +++-
>>   3 files changed, 42 insertions(+), 5 deletions(-)
>>
>> diff --git a/xen/arch/arm/domctl.c b/xen/arch/arm/domctl.c
>> index 20691528a6..9da88b8c64 100644
>> --- a/xen/arch/arm/domctl.c
>> +++ b/xen/arch/arm/domctl.c
>> @@ -54,6 +54,7 @@ long arch_do_domctl(struct xen_domctl *domctl, struct domain *d,
>>       {
>>           gfn_t s = _gfn(domctl->u.cacheflush.start_pfn);
>>           gfn_t e = gfn_add(s, domctl->u.cacheflush.nr_pfns);
>> +        int rc;
> 
> This is unnecessary...
> 
> 
>>           if ( domctl->u.cacheflush.nr_pfns > (1U<<MAX_ORDER) )
>>               return -EINVAL;
>> @@ -61,7 +62,12 @@ long arch_do_domctl(struct xen_domctl *domctl, struct domain *d,
>>           if ( gfn_x(e) < gfn_x(s) )
>>               return -EINVAL;
>>   
>> -        return p2m_cache_flush_range(d, s, e);
>> +        /* XXX: Handle preemption */
>> +        do
>> +            rc = p2m_cache_flush_range(d, &s, e);
>> +        while ( rc == -ERESTART );
> 
> ... you can just do:
> 
>    while ( -ERESTART == p2m_cache_flush_range(d, &s, e) )
> 
> But given that it is just style, I'll leave it up to you.

I don't much like the idea of having an empty loop body. This is error-prone
depending on whether you use do {} while (...) or while ( ... );

So I would prefer to stick with a temporary variable.
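
For instance (illustrative only), the empty-body form relies on a lone
semicolon that is easy to overlook or to accidentally attach to the wrong
statement:

    while ( p2m_cache_flush_range(d, &s, e) == -ERESTART )
        ;   /* nothing to do, the work happens in the condition */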

> 
> 
>> +        return rc;
>>       }
>>       case XEN_DOMCTL_bind_pt_irq:
>>       {
>> diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
>> index db22b53bfd..ca9f0d9ebe 100644
>> --- a/xen/arch/arm/p2m.c
>> +++ b/xen/arch/arm/p2m.c
>> @@ -1524,13 +1524,17 @@ int relinquish_p2m_mapping(struct domain *d)
>>       return rc;
>>   }
>>   
>> -int p2m_cache_flush_range(struct domain *d, gfn_t start, gfn_t end)
>> +int p2m_cache_flush_range(struct domain *d, gfn_t *pstart, gfn_t end)
>>   {
>>       struct p2m_domain *p2m = p2m_get_hostp2m(d);
>>       gfn_t next_block_gfn;
>> +    gfn_t start = *pstart;
>>       mfn_t mfn = INVALID_MFN;
>>       p2m_type_t t;
>>       unsigned int order;
>> +    int rc = 0;
>> +    /* Counter for preemption */
>> +    unsigned long count = 0;
>>   
>>       /*
>>        * The operation cache flush will invalidate the RAM assigned to the
>> @@ -1547,6 +1551,25 @@ int p2m_cache_flush_range(struct domain *d, gfn_t start, gfn_t end)
>>   
>>       while ( gfn_x(start) < gfn_x(end) )
>>       {
>> +       /*
>> +         * Cleaning the cache for the P2M may take a long time. So we
>> +         * need to be able to preempt. We will arbitrarily preempt every
>> +         * time count reach 512 or above.
>> +
>> +         *
>> +         * The count will be incremented by:
>> +         *  - 1 on region skipped
>> +         *  - 10 for each page requiring a flush
> 
> Why this choice? A page flush should cost much more than 10x a region
> skipped, more like 100x or 1000x. In fact, doing the full loop without
> calling flush_page_to_ram should be cheap and fast, right?.

It is cheaper than a flush of the page, but it still has a cost. You have to
walk the stage-2 page tables in software, which requires mapping the tables. As
not all of the memory is mapped in the hypervisor on arm32, this will require a
map/unmap operation. On arm64, the full memory is mapped so far, so the
map/unmap is pretty much a no-op.

> I would:
> 
> - not increase count on region skipped at all
> - increase it by 1 on each page requiring a flush
> - set the limit lower, if we go with your proposal it would be about 50,
>    I am not sure what the limit should be though

I don't think you can avoid incrementing count on region skipped. While one
lookup is pretty cheap, all the lookups for holes added together may result in
a pretty long time.

Even if stage-2 mappings are handled by the hypervisor, the guest is still
somewhat in control of them because it can balloon pages in/out. The operation
may result in shattered stage-2 mappings.

It would be feasible for a guest to shatter 1GB of memory into 4KB mappings in
the stage-2 entries and then remove all the entries. This means the stage-2
would contain 262144 holes. This would result in 262144 iterations, so no
matter how cheap each one is, the resulting time spent without preemption is
going to be quite significant.

The choice of the numbers 1 vs 10 is pretty much arbitrary. The question is how
often we want to check for pending softirqs. The check itself is pretty much
trivial, yet preempting has a cost. With the current solution, we check for
preemption every 512 skipped regions or 51 pages flushed (~204KB flushed).

This sounds OK to me. Feel free to suggest better numbers.
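
For reference, the rough arithmetic behind the current numbers (with 4KB
pages) is:

    512 / 1  = 512 skipped regions between two preemption checks
    512 / 10 =  51 flushed pages between two checks
    51 * 4KB ~= 204KB flushed between two checks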

> 
> 
>> +         */
>> +        if ( count >= 512 )
>> +        {
>> +            if ( softirq_pending(smp_processor_id()) )
>> +            {
>> +                rc = -ERESTART;
>> +                break;
>> +            }
>> +            count = 0;
> 
> No need to set count to 0 here

Well, the code would not behave the same here. If you don't reset to 0, you
would check softirq_pending() on every iteration once count has reached 512.

If you reset it to 0, you avoid checking softirq_pending() until the next time
count reaches 512.

Both are actually valid. It is just a matter of whether we assume that a
softirq will happen soon after reaching 512.

> 
> 
>> +        }
>> +
>>           /*
>>            * We want to flush page by page as:
>>            *  - it may not be possible to map the full block (can be up to 1GB)
>> @@ -1573,22 +1596,28 @@ int p2m_cache_flush_range(struct domain *d, gfn_t start, gfn_t end)
>>                */
>>               if ( mfn_eq(mfn, INVALID_MFN) || !p2m_is_any_ram(t) )
>>               {
>> +                count++;
> 
> This is just an iteration doing nothing, I would not increament count.

[...]

> This makes sense, but if we skip the count++ above, we might as well
> just count++ here and have a lower limit.

See above for why I think this can't work.

Cheers,
Stefano Stabellini Dec. 7, 2018, 10:11 p.m. UTC | #3
On Fri, 7 Dec 2018, Julien Grall wrote:
> > > @@ -1547,6 +1551,25 @@ int p2m_cache_flush_range(struct domain *d, gfn_t
> > > start, gfn_t end)
> > >         while ( gfn_x(start) < gfn_x(end) )
> > >       {
> > > +       /*
> > > +         * Cleaning the cache for the P2M may take a long time. So we
> > > +         * need to be able to preempt. We will arbitrarily preempt every
> > > +         * time count reach 512 or above.
> > > +
> > > +         *
> > > +         * The count will be incremented by:
> > > +         *  - 1 on region skipped
> > > +         *  - 10 for each page requiring a flush
> > 
> > Why this choice? A page flush should cost much more than 10x a region
> > skipped, more like 100x or 1000x. In fact, doing the full loop without
> > calling flush_page_to_ram should be cheap and fast, right?.
> 
> It is cheaper than a flush of the page but it still has a cost. You have to
> walk the stage-2 in software that will require to map the tables. As all the
> memory is not mapped in the hypervisor on arm32 this will require a map/unmap
> operation. On arm64, so far the full memory is mapped, so the map/unmap is
> pretty much a NOP.

Good point; the cost of an "empty" iteration is indeed significantly
different on arm32 and arm64.


> > I would:
> > 
> > - not increase count on region skipped at all
> > - increase it by 1 on each page requiring a flush
> > - set the limit lower, if we go with your proposal it would be about 50,
> >    I am not sure what the limit should be though
> I don't think you can avoid incrementing count on region skipped. While one
> lookup is pretty cheap, all the lookups for hole added together may result to
> a pretty long time.

Thinking of the arm32 case, where a map/unmap needs to be done, you are
right.


> Even if stage-2 mappings are handled by the hypervisor, the guest is still
> somewhat in control of it because it can balloon in/out pages. The operation
> may result to shatter stage-2 mappings.
> 
> It would be feasible for a guest to shatter 1GB of memory in 4KB mappings in
> stage-2 entries and then remove all the entries. This means the stage-2 would
> contains 262144 holes. This would result to 262144 iterations, so no matter
> how cheap it is the resulting time spent without preemption is going to be
> quite important.

OK


> The choice in the numbers 1 vs 10 is pretty much random. The question is how
> often we want to check for pending softirq. The check is pretty much trivial,
> yet it has a cost to preempt. With the current solution, we check preemption
> every 512 holes or 51 pages flushed (~204KB flushed).
> 
> This sounds ok to me. Feel free to suggest better number.

One suggestion is that we might want to treat the arm32 case differently
from the arm64 case, given the different cost of mapping/unmapping pages
during the walk. Would it be fair to say that if an arm64 empty
iteration is worth "1" unit of work, then an arm32 empty iteration is
worth at least "2" units of work? Or more? My gut feeling is that it is
more like:

empty arm64:       1
empty arm32:       5
flush arm32/arm64: 10

Or even:

empty arm64:       1
empty arm32:       10
flush arm32/arm64: 20

and the overall limits for checks could be 512 or 1024 like you
suggested.

But I don't really know; we would need precise benchmarks to have an
idea of what the best numbers are. I am not suggesting you have to do
any more benchmarks now; we'll just hand-wave it for now.
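
Just to illustrate the idea, a minimal sketch (the P2M_FLUSH_* names and the
numbers are made up for the example, assuming the usual CONFIG_ARM_32 guard):

    /* Cost of one loop iteration, used to decide when to check for softirqs. */
    #ifdef CONFIG_ARM_32
    #define P2M_FLUSH_COST_EMPTY  10  /* walk needs map/unmap of the tables */
    #else
    #define P2M_FLUSH_COST_EMPTY   1  /* all memory already mapped */
    #endif
    #define P2M_FLUSH_COST_FLUSH  20  /* flush_page_to_ram() */

    #define P2M_FLUSH_PREEMPT_LIMIT 1024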



> > > +         */
> > > +        if ( count >= 512 )
> > > +        {
> > > +            if ( softirq_pending(smp_processor_id()) )
> > > +            {
> > > +                rc = -ERESTART;
> > > +                break;
> > > +            }
> > > +            count = 0;
> > 
> > No need to set count to 0 here
> 
> Well, the code would not do the same here. If you don't reset to 0, you would
> check softirq_pending() all the iteration when count reached 512.
> 
> If you reset 0, you will avoid to check softirq_pending() until the next time
> count reached 512.
> 
> The both are actually valid. It just a matter on whether we are assuming that
> a softirq will happen soon after reaching 512?

My comment was wrong; the code is right as is. I think we want to check
softirq_pending() every 512 iterations.
Julien Grall Dec. 11, 2018, 4:11 p.m. UTC | #4
Hi,

On 07/12/2018 22:11, Stefano Stabellini wrote:
> On Fri, 7 Dec 2018, Julien Grall wrote:
>>>> @@ -1547,6 +1551,25 @@ int p2m_cache_flush_range(struct domain *d, gfn_t
>>>> start, gfn_t end)
>>>>          while ( gfn_x(start) < gfn_x(end) )
>>>>        {
>>>> +       /*
>>>> +         * Cleaning the cache for the P2M may take a long time. So we
>>>> +         * need to be able to preempt. We will arbitrarily preempt every
>>>> +         * time count reach 512 or above.
>>>> +
>>>> +         *
>>>> +         * The count will be incremented by:
>>>> +         *  - 1 on region skipped
>>>> +         *  - 10 for each page requiring a flush
>>>
>>> Why this choice? A page flush should cost much more than 10x a region
>>> skipped, more like 100x or 1000x. In fact, doing the full loop without
>>> calling flush_page_to_ram should be cheap and fast, right?.
>>
>> It is cheaper than a flush of the page but it still has a cost. You have to
>> walk the stage-2 in software that will require to map the tables. As all the
>> memory is not mapped in the hypervisor on arm32 this will require a map/unmap
>> operation. On arm64, so far the full memory is mapped, so the map/unmap is
>> pretty much a NOP.
> 
> Good point, actually the cost of an "empty" iteration is significantly
> different on arm32 and arm64.
> 
> 
>>> I would:
>>>
>>> - not increase count on region skipped at all
>>> - increase it by 1 on each page requiring a flush
>>> - set the limit lower, if we go with your proposal it would be about 50,
>>>     I am not sure what the limit should be though
>> I don't think you can avoid incrementing count on region skipped. While one
>> lookup is pretty cheap, all the lookups for hole added together may result to
>> a pretty long time.
> 
> Thinking of the arm32 case where map/unmap need to be done, you are
> right.
> 
> 
>> Even if stage-2 mappings are handled by the hypervisor, the guest is still
>> somewhat in control of it because it can balloon in/out pages. The operation
>> may result to shatter stage-2 mappings.
>>
>> It would be feasible for a guest to shatter 1GB of memory in 4KB mappings in
>> stage-2 entries and then remove all the entries. This means the stage-2 would
>> contains 262144 holes. This would result to 262144 iterations, so no matter
>> how cheap it is the resulting time spent without preemption is going to be
>> quite important.
> 
> OK
> 
> 
>> The choice in the numbers 1 vs 10 is pretty much random. The question is how
>> often we want to check for pending softirq. The check is pretty much trivial,
>> yet it has a cost to preempt. With the current solution, we check preemption
>> every 512 holes or 51 pages flushed (~204KB flushed).
>>
>> This sounds ok to me. Feel free to suggest better number.
> 
> One suggestion is that we might want to treat the arm32 case differently
> from the arm64 case, given the different cost of mapping/unmapping pages
> during the walk. Would it be fair to say that if an arm64 empty
> iteration is worth "1" unit of work, then an arm32 empty iteration is
> worth at least "2" unit of work? Or more? My gut feeling is that it is
> more like:

I don't want to treat arm32 and arm64 differently. That would be opening the
door to a security hole in Xen if we ever decide to separate the domain heap
and the xen heap on arm64.

> 
> empty arm64:       1
> empty arm32:       5
> flush arm32/arm64: 10
> 
> Or even:
> 
> empty arm64:       1
> empty arm32:       10
> flush arm32/arm64: 20

Bear in mind that in the flush case on arm32, you also need to map/unmap the 
page. So this is more like 10/30 here.

> 
> and the overall limits for checks could be 512 or 1024 like you
> suggested.

What I can suggest is:
	empty: 1
	flush: 3

Limit: 120
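
i.e., a sketch of how that would slot into the existing loop (for
illustration only):

    if ( count >= 120 )
    {
        if ( softirq_pending(smp_processor_id()) )
        {
            rc = -ERESTART;
            break;
        }
        count = 0;
    }
    ...
    count += 1;   /* region skipped */
    ...
    count += 3;   /* page requiring a flush */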

Cheers,

Patch

diff --git a/xen/arch/arm/domctl.c b/xen/arch/arm/domctl.c
index 20691528a6..9da88b8c64 100644
--- a/xen/arch/arm/domctl.c
+++ b/xen/arch/arm/domctl.c
@@ -54,6 +54,7 @@  long arch_do_domctl(struct xen_domctl *domctl, struct domain *d,
     {
         gfn_t s = _gfn(domctl->u.cacheflush.start_pfn);
         gfn_t e = gfn_add(s, domctl->u.cacheflush.nr_pfns);
+        int rc;
 
         if ( domctl->u.cacheflush.nr_pfns > (1U<<MAX_ORDER) )
             return -EINVAL;
@@ -61,7 +62,12 @@  long arch_do_domctl(struct xen_domctl *domctl, struct domain *d,
         if ( gfn_x(e) < gfn_x(s) )
             return -EINVAL;
 
-        return p2m_cache_flush_range(d, s, e);
+        /* XXX: Handle preemption */
+        do
+            rc = p2m_cache_flush_range(d, &s, e);
+        while ( rc == -ERESTART );
+
+        return rc;
     }
     case XEN_DOMCTL_bind_pt_irq:
     {
diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
index db22b53bfd..ca9f0d9ebe 100644
--- a/xen/arch/arm/p2m.c
+++ b/xen/arch/arm/p2m.c
@@ -1524,13 +1524,17 @@  int relinquish_p2m_mapping(struct domain *d)
     return rc;
 }
 
-int p2m_cache_flush_range(struct domain *d, gfn_t start, gfn_t end)
+int p2m_cache_flush_range(struct domain *d, gfn_t *pstart, gfn_t end)
 {
     struct p2m_domain *p2m = p2m_get_hostp2m(d);
     gfn_t next_block_gfn;
+    gfn_t start = *pstart;
     mfn_t mfn = INVALID_MFN;
     p2m_type_t t;
     unsigned int order;
+    int rc = 0;
+    /* Counter for preemption */
+    unsigned long count = 0;
 
     /*
      * The operation cache flush will invalidate the RAM assigned to the
@@ -1547,6 +1551,25 @@  int p2m_cache_flush_range(struct domain *d, gfn_t start, gfn_t end)
 
     while ( gfn_x(start) < gfn_x(end) )
     {
+       /*
+         * Cleaning the cache for the P2M may take a long time. So we
+         * need to be able to preempt. We will arbitrarily preempt every
+         * time count reach 512 or above.
+         *
+         * The count will be incremented by:
+         *  - 1 on region skipped
+         *  - 10 for each page requiring a flush
+         */
+        if ( count >= 512 )
+        {
+            if ( softirq_pending(smp_processor_id()) )
+            {
+                rc = -ERESTART;
+                break;
+            }
+            count = 0;
+        }
+
         /*
          * We want to flush page by page as:
          *  - it may not be possible to map the full block (can be up to 1GB)
@@ -1573,22 +1596,28 @@  int p2m_cache_flush_range(struct domain *d, gfn_t start, gfn_t end)
              */
             if ( mfn_eq(mfn, INVALID_MFN) || !p2m_is_any_ram(t) )
             {
+                count++;
                 start = next_block_gfn;
                 continue;
             }
         }
 
+        count += 10;
+
         flush_page_to_ram(mfn_x(mfn), false);
 
         start = gfn_add(start, 1);
         mfn = mfn_add(mfn, 1);
     }
 
-    invalidate_icache();
+    if ( rc != -ERESTART )
+        invalidate_icache();
 
     p2m_read_unlock(p2m);
 
-    return 0;
+    *pstart = start;
+
+    return rc;
 }
 
 mfn_t gfn_to_mfn(struct domain *d, gfn_t gfn)
diff --git a/xen/include/asm-arm/p2m.h b/xen/include/asm-arm/p2m.h
index 7c1d930b1d..a633e27cc9 100644
--- a/xen/include/asm-arm/p2m.h
+++ b/xen/include/asm-arm/p2m.h
@@ -232,8 +232,10 @@  bool p2m_resolve_translation_fault(struct domain *d, gfn_t gfn);
 /*
  * Clean & invalidate caches corresponding to a region [start,end) of guest
  * address space.
+ *
+ * start will get updated if the function is preempted.
  */
-int p2m_cache_flush_range(struct domain *d, gfn_t start, gfn_t end);
+int p2m_cache_flush_range(struct domain *d, gfn_t *pstart, gfn_t end);
 
 /*
  * Map a region in the guest p2m with a specific p2m type.