[v4,4/6] vfio/type1: check dma map request is within a valid iova range

Message ID 20180221122209.9292-5-shameerali.kolothum.thodi@huawei.com
State Superseded
Series vfio/type1: Add support for valid iova list management

Commit Message

Shameerali Kolothum Thodi Feb. 21, 2018, 12:22 p.m. UTC
This checks and rejects any dma map request outside the valid iova
range.

Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>

---
 drivers/vfio/vfio_iommu_type1.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

-- 
2.7.4
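
In practice the new check surfaces to user space as an EINVAL error from the
VFIO_IOMMU_MAP_DMA ioctl. The following is a minimal sketch of that failure
mode, assuming a type1 container fd that has already been set up; the helper
name and any iova/size values passed to it are purely illustrative and not
part of the patch.

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Try to map @size bytes of the buffer at @vaddr to the device address @iova. */
static int try_dma_map(int container, void *vaddr, __u64 iova, __u64 size)
{
	struct vfio_iommu_type1_dma_map map = {
		.argsz = sizeof(map),
		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
		.vaddr = (__u64)(unsigned long)vaddr,
		.iova  = iova,
		.size  = size,
	};

	if (ioctl(container, VFIO_IOMMU_MAP_DMA, &map)) {
		/* With this patch, an iova range that is not fully contained
		 * in one of the nodes on iommu->iova_list (for example one
		 * that overlaps a reserved MSI window) fails here with
		 * errno == EINVAL. */
		fprintf(stderr, "map at 0x%llx failed: %s\n",
			(unsigned long long)iova, strerror(errno));
		return -errno;
	}
	return 0;
}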

Comments

Eric Auger Feb. 26, 2018, 10:05 p.m. UTC | #1
Hi Shameer,

[Adding Robin in CC]
On 21/02/18 13:22, Shameer Kolothum wrote:
> This checks and rejects any dma map request outside valid iova

> range.

> 

> Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>

> ---

>  drivers/vfio/vfio_iommu_type1.c | 22 ++++++++++++++++++++++

>  1 file changed, 22 insertions(+)

> 

> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c

> index a80884e..3049393 100644

> --- a/drivers/vfio/vfio_iommu_type1.c

> +++ b/drivers/vfio/vfio_iommu_type1.c

> @@ -970,6 +970,23 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,

>  	return ret;

>  }

>  

> +/*

> + * Check dma map request is within a valid iova range

> + */

> +static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,

> +				dma_addr_t start, dma_addr_t end)

> +{

> +	struct list_head *iova = &iommu->iova_list;

> +	struct vfio_iova *node;

> +

> +	list_for_each_entry(node, iova, list) {

> +		if ((start >= node->start) && (end <= node->end))

> +			return true;

I am now confused by the fact that this change will prevent existing QEMU
from working with this series on some platforms. For instance, the QEMU virt
machine GPA space collides with the Seattle PCI host bridge windows. On ARM
the smmu and smmuv3 drivers report the PCI host bridge windows as
reserved regions, which does not seem to be the case on other platforms.
The change happened in commit 273df9635385b2156851c7ee49f40658d7bcb29d
(iommu/dma: Make PCI window reservation generic).

For background, we already discussed the topic after LPC 2016. See
https://www.spinics.net/lists/kernel/msg2379607.html.

So is it the right choice to expose PCI host bridge windows as reserved
regions? If yes, shouldn't we distinguish between those and MSI
windows in this series and not reject user space DMA_MAP attempts
within PCI host bridge windows?

Thanks

Eric
> +	}

> +

> +	return false;

> +}

> +

>  static int vfio_dma_do_map(struct vfio_iommu *iommu,

>  			   struct vfio_iommu_type1_dma_map *map)

>  {

> @@ -1008,6 +1025,11 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,

>  		goto out_unlock;

>  	}

>  

> +	if (!vfio_iommu_iova_dma_valid(iommu, iova, iova + size - 1)) {

> +		ret = -EINVAL;

> +		goto out_unlock;

> +	}

> +

>  	dma = kzalloc(sizeof(*dma), GFP_KERNEL);

>  	if (!dma) {

>  		ret = -ENOMEM;

>
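
For readers following the reserved-region discussion: the windows an IOMMU
driver reports for a group (MSI windows, and on ARM the PCI host bridge
windows mentioned above) are visible from user space through the group's
reserved_regions sysfs attribute, essentially the same per-group information
this series consults when building the valid iova list. A small sketch for
dumping them follows; the group number is hypothetical and the exact entries
depend on the platform.

#include <stdio.h>

int main(void)
{
	/* Each line is "<start> <end> <type>", e.g.
	 *   0x0000000008000000 0x00000000080fffff msi
	 * or a host bridge window reported with type "reserved". */
	const char *path = "/sys/kernel/iommu_groups/7/reserved_regions";
	FILE *f = fopen(path, "r");
	char line[128];

	if (!f) {
		perror(path);
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}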
Alex Williamson Feb. 26, 2018, 11:13 p.m. UTC | #2
On Mon, 26 Feb 2018 23:05:43 +0100
Auger Eric <eric.auger@redhat.com> wrote:

> Hi Shameer,

> 

> [Adding Robin in CC]

> On 21/02/18 13:22, Shameer Kolothum wrote:

> > This checks and rejects any dma map request outside valid iova

> > range.

> > 

> > Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>

> > ---

> >  drivers/vfio/vfio_iommu_type1.c | 22 ++++++++++++++++++++++

> >  1 file changed, 22 insertions(+)

> > 

> > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c

> > index a80884e..3049393 100644

> > --- a/drivers/vfio/vfio_iommu_type1.c

> > +++ b/drivers/vfio/vfio_iommu_type1.c

> > @@ -970,6 +970,23 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,

> >  	return ret;

> >  }

> >  

> > +/*

> > + * Check dma map request is within a valid iova range

> > + */

> > +static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,

> > +				dma_addr_t start, dma_addr_t end)

> > +{

> > +	struct list_head *iova = &iommu->iova_list;

> > +	struct vfio_iova *node;

> > +

> > +	list_for_each_entry(node, iova, list) {

> > +		if ((start >= node->start) && (end <= node->end))

> > +			return true;  

> I am now confused by the fact this change will prevent existing QEMU

> from working with this series on some platforms. For instance QEMU virt

> machine GPA space collides with Seattle PCI host bridge windows. On ARM

> the smmu and smmuv3 drivers report the PCI host bridge windows as

> reserved regions which does not seem to be the case on other platforms.

> The change happened in commit 273df9635385b2156851c7ee49f40658d7bcb29d

> (iommu/dma: Make PCI window reservation generic).

> 

> For background, we already discussed the topic after LPC 2016. See

> https://www.spinics.net/lists/kernel/msg2379607.html.

> 

> So is it the right choice to expose PCI host bridge windows as reserved

> regions? If yes shouldn't we make a difference between those and MSI

> windows in this series and do not reject any user space DMA_MAP attempt

> within PCI host bridge windows.


If the QEMU machine GPA collides with a reserved region today, then
either:

a) The mapping through the IOMMU works and the reserved region is wrong

or

b) The mapping doesn't actually work, QEMU is at risk of data loss by
being told that it worked, and we're justified in changing that
behavior.

Without knowing the specifics of SMMU, it doesn't particularly make
sense to me to mark the entire PCI hierarchy MMIO range as reserved,
unless perhaps the IOMMU is incapable of translating those IOVAs.

Are we trying to prevent untranslated p2p with this reserved range?
That's not necessarily a terrible idea, but it seems that doing it for
that purpose would need to be a lot smarter, taking into account ACS
and precisely selecting ranges within the peer address space that would
be untranslated.  Perhaps only populated MMIO within non-ACS
hierarchies.  Thanks,

Alex
Eric Auger Feb. 27, 2018, 8:26 a.m. UTC | #3
Hi,
On 27/02/18 00:13, Alex Williamson wrote:
> On Mon, 26 Feb 2018 23:05:43 +0100

> Auger Eric <eric.auger@redhat.com> wrote:

> 

>> Hi Shameer,

>>

>> [Adding Robin in CC]

>> On 21/02/18 13:22, Shameer Kolothum wrote:

>>> This checks and rejects any dma map request outside valid iova

>>> range.

>>>

>>> Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>

>>> ---

>>>  drivers/vfio/vfio_iommu_type1.c | 22 ++++++++++++++++++++++

>>>  1 file changed, 22 insertions(+)

>>>

>>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c

>>> index a80884e..3049393 100644

>>> --- a/drivers/vfio/vfio_iommu_type1.c

>>> +++ b/drivers/vfio/vfio_iommu_type1.c

>>> @@ -970,6 +970,23 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,

>>>  	return ret;

>>>  }

>>>  

>>> +/*

>>> + * Check dma map request is within a valid iova range

>>> + */

>>> +static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,

>>> +				dma_addr_t start, dma_addr_t end)

>>> +{

>>> +	struct list_head *iova = &iommu->iova_list;

>>> +	struct vfio_iova *node;

>>> +

>>> +	list_for_each_entry(node, iova, list) {

>>> +		if ((start >= node->start) && (end <= node->end))

>>> +			return true;  

>> I am now confused by the fact this change will prevent existing QEMU

>> from working with this series on some platforms. For instance QEMU virt

>> machine GPA space collides with Seattle PCI host bridge windows. On ARM

>> the smmu and smmuv3 drivers report the PCI host bridge windows as

>> reserved regions which does not seem to be the case on other platforms.

>> The change happened in commit 273df9635385b2156851c7ee49f40658d7bcb29d

>> (iommu/dma: Make PCI window reservation generic).

>>

>> For background, we already discussed the topic after LPC 2016. See

>> https://www.spinics.net/lists/kernel/msg2379607.html.

>>

>> So is it the right choice to expose PCI host bridge windows as reserved

>> regions? If yes shouldn't we make a difference between those and MSI

>> windows in this series and do not reject any user space DMA_MAP attempt

>> within PCI host bridge windows.

> 

> If the QEMU machine GPA collides with a reserved region today, then

> either:

> 

> a) The mapping through the IOMMU works and the reserved region is wrong

> 

> or

> 

> b) The mapping doesn't actually work, QEMU is at risk of data loss by

> being told that it worked, and we're justified in changing that

> behavior.

> 

> Without knowing the specifics of SMMU, it doesn't particularly make

> sense to me to mark the entire PCI hierarchy MMIO range as reserved,

> unless perhaps the IOMMU is incapable of translating those IOVAs.

To me the limitation does not come from the SMMU itself, which is a
separate HW block sitting between the root complex and the interconnect.
If ACS is not enforced by the PCIe subsystem, the transaction will never
reach the IOMMU.

In the case of such an overlap, shouldn't we just warn the end-user that
this situation is dangerous instead of forbidding a use case which
worked "in most cases" until now?

> Are we trying to prevent untranslated p2p with this reserved range?

> That's not necessarily a terrible idea, but it seems that doing it for

> that purpose would need to be a lot smarter, taking into account ACS

> and precisely selecting ranges within the peer address space that would

> be untranslated.  Perhaps only populated MMIO within non-ACS

> hierarchies.  Thanks,


Indeed taking into account the ACS capability would refine the
situations where a risk exists.

Thanks

Eric
> 

> Alex

>
Shameerali Kolothum Thodi Feb. 27, 2018, 9:57 a.m. UTC | #4
> -----Original Message-----

> From: Auger Eric [mailto:eric.auger@redhat.com]

> Sent: Tuesday, February 27, 2018 8:27 AM

> To: Alex Williamson <alex.williamson@redhat.com>

> Cc: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>;

> pmorel@linux.vnet.ibm.com; kvm@vger.kernel.org; linux-

> kernel@vger.kernel.org; Linuxarm <linuxarm@huawei.com>; John Garry

> <john.garry@huawei.com>; xuwei (O) <xuwei5@huawei.com>; Robin Murphy

> <robin.murphy@arm.com>

> Subject: Re: [PATCH v4 4/6] vfio/type1: check dma map request is within a valid

> iova range

> 

> Hi,

> On 27/02/18 00:13, Alex Williamson wrote:

> > On Mon, 26 Feb 2018 23:05:43 +0100

> > Auger Eric <eric.auger@redhat.com> wrote:

> >

> >> Hi Shameer,

> >>

> >> [Adding Robin in CC]

> >> On 21/02/18 13:22, Shameer Kolothum wrote:

> >>> This checks and rejects any dma map request outside valid iova

> >>> range.

> >>>

> >>> Signed-off-by: Shameer Kolothum

> <shameerali.kolothum.thodi@huawei.com>

> >>> ---

> >>>  drivers/vfio/vfio_iommu_type1.c | 22 ++++++++++++++++++++++

> >>>  1 file changed, 22 insertions(+)

> >>>

> >>> diff --git a/drivers/vfio/vfio_iommu_type1.c

> b/drivers/vfio/vfio_iommu_type1.c

> >>> index a80884e..3049393 100644

> >>> --- a/drivers/vfio/vfio_iommu_type1.c

> >>> +++ b/drivers/vfio/vfio_iommu_type1.c

> >>> @@ -970,6 +970,23 @@ static int vfio_pin_map_dma(struct vfio_iommu

> *iommu, struct vfio_dma *dma,

> >>>  	return ret;

> >>>  }

> >>>

> >>> +/*

> >>> + * Check dma map request is within a valid iova range

> >>> + */

> >>> +static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,

> >>> +				dma_addr_t start, dma_addr_t end)

> >>> +{

> >>> +	struct list_head *iova = &iommu->iova_list;

> >>> +	struct vfio_iova *node;

> >>> +

> >>> +	list_for_each_entry(node, iova, list) {

> >>> +		if ((start >= node->start) && (end <= node->end))

> >>> +			return true;

> >> I am now confused by the fact this change will prevent existing QEMU

> >> from working with this series on some platforms. For instance QEMU virt

> >> machine GPA space collides with Seattle PCI host bridge windows. On ARM

> >> the smmu and smmuv3 drivers report the PCI host bridge windows as

> >> reserved regions which does not seem to be the case on other platforms.

> >> The change happened in commit

> 273df9635385b2156851c7ee49f40658d7bcb29d

> >> (iommu/dma: Make PCI window reservation generic).

> >>

> >> For background, we already discussed the topic after LPC 2016. See

> >> https://www.spinics.net/lists/kernel/msg2379607.html.

> >>

> >> So is it the right choice to expose PCI host bridge windows as reserved

> >> regions? If yes shouldn't we make a difference between those and MSI

> >> windows in this series and do not reject any user space DMA_MAP attempt

> >> within PCI host bridge windows.

> >

> > If the QEMU machine GPA collides with a reserved region today, then

> > either:

> >

> > a) The mapping through the IOMMU works and the reserved region is wrong

> >

> > or

> >

> > b) The mapping doesn't actually work, QEMU is at risk of data loss by

> > being told that it worked, and we're justified in changing that

> > behavior.

> >

> > Without knowing the specifics of SMMU, it doesn't particularly make

> > sense to me to mark the entire PCI hierarchy MMIO range as reserved,

> > unless perhaps the IOMMU is incapable of translating those IOVAs.

> to me the limitation does not come from the smmu itself, which is a

> separate HW block sitting between the root complex and the interconnect.

> If ACS is not enforced by the PCIe subsystem, the transaction will never

> reach the IOMMU.


True. And we do have one such platform where ACS is not enforced, but
reserving the regions and possibly creating holes while launching the VM will
make it secure. But I do wonder how we will solve the device grouping
in such cases.

Does the Seattle PCI host bridge windows case you mentioned have any PCI quirk
to claim that it supports ACS?
 
> In the case of such overlap, shouldn't we just warn the end-user that

> this situation is dangerous instead of forbidding the use case which

> worked "in most cases" until now.


Yes, maybe something similar to the allow_unsafe_interrupts case, if
that is acceptable.

Thanks,
Shameer
 
> > Are we trying to prevent untranslated p2p with this reserved range?

> > That's not necessarily a terrible idea, but it seems that doing it for

> > that purpose would need to be a lot smarter, taking into account ACS

> > and precisely selecting ranges within the peer address space that would

> > be untranslated.  Perhaps only populated MMIO within non-ACS

> > hierarchies.  Thanks,

> 

> Indeed taking into account the ACS capability would refine the

> situations where a risk exists.

> 

> Thanks

> 

> Eric

> >

> > Alex

> >
Robin Murphy Feb. 27, 2018, 12:40 p.m. UTC | #5
Hi Alex,

On 26/02/18 23:13, Alex Williamson wrote:
> On Mon, 26 Feb 2018 23:05:43 +0100

> Auger Eric <eric.auger@redhat.com> wrote:

> 

>> Hi Shameer,

>>

>> [Adding Robin in CC]

>> On 21/02/18 13:22, Shameer Kolothum wrote:

>>> This checks and rejects any dma map request outside valid iova

>>> range.

>>>

>>> Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>

>>> ---

>>>   drivers/vfio/vfio_iommu_type1.c | 22 ++++++++++++++++++++++

>>>   1 file changed, 22 insertions(+)

>>>

>>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c

>>> index a80884e..3049393 100644

>>> --- a/drivers/vfio/vfio_iommu_type1.c

>>> +++ b/drivers/vfio/vfio_iommu_type1.c

>>> @@ -970,6 +970,23 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,

>>>   	return ret;

>>>   }

>>>   

>>> +/*

>>> + * Check dma map request is within a valid iova range

>>> + */

>>> +static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,

>>> +				dma_addr_t start, dma_addr_t end)

>>> +{

>>> +	struct list_head *iova = &iommu->iova_list;

>>> +	struct vfio_iova *node;

>>> +

>>> +	list_for_each_entry(node, iova, list) {

>>> +		if ((start >= node->start) && (end <= node->end))

>>> +			return true;

>> I am now confused by the fact this change will prevent existing QEMU

>> from working with this series on some platforms. For instance QEMU virt

>> machine GPA space collides with Seattle PCI host bridge windows. On ARM

>> the smmu and smmuv3 drivers report the PCI host bridge windows as

>> reserved regions which does not seem to be the case on other platforms.

>> The change happened in commit 273df9635385b2156851c7ee49f40658d7bcb29d

>> (iommu/dma: Make PCI window reservation generic).

>>

>> For background, we already discussed the topic after LPC 2016. See

>> https://www.spinics.net/lists/kernel/msg2379607.html.

>>

>> So is it the right choice to expose PCI host bridge windows as reserved

>> regions? If yes shouldn't we make a difference between those and MSI

>> windows in this series and do not reject any user space DMA_MAP attempt

>> within PCI host bridge windows.

> 

> If the QEMU machine GPA collides with a reserved region today, then

> either:

> 

> a) The mapping through the IOMMU works and the reserved region is wrong

> 

> or

> 

> b) The mapping doesn't actually work, QEMU is at risk of data loss by

> being told that it worked, and we're justified in changing that

> behavior.

> 

> Without knowing the specifics of SMMU, it doesn't particularly make

> sense to me to mark the entire PCI hierarchy MMIO range as reserved,

> unless perhaps the IOMMU is incapable of translating those IOVAs.


Yes, the problem in general is that the SMMU *might* be incapable of 
making any use of IOVAs shadowed by PCI p2p addresses, depending on the 
behaviour and/or integration of the root complex/host bridge.

By way of example, the machine on which I developed iommu-dma (Arm Juno) 
has a PCIe RC which doesn't support p2p at all, and happens to have its 
32-bit bridge window placed exactly where qemu-arm starts guest RAM - I 
can plug in a graphics card which claims a nice big BAR from that 
window, assign the whole PCI group to a VM, and watch the NIC blow up in 
the guest as its (IPA) DMA addresses cause the RC to send an abort back 
to the endpoint before the transactions ever get out to the SMMU.

The SMMU itself doesn't know anything about this (e.g. it's the exact 
same IP block as used in AMD Seattle, where AFAIK the AMD root complex 
doesn't suffer the same issue).

> Are we trying to prevent untranslated p2p with this reserved range?

> That's not necessarily a terrible idea, but it seems that doing it for

> that purpose would need to be a lot smarter, taking into account ACS

> and precisely selecting ranges within the peer address space that would

> be untranslated.  Perhaps only populated MMIO within non-ACS

> hierarchies.  Thanks,


The current code is just playing it as safe as it can by always 
reserving the full windows - IIRC there was some debate about whether 
the "only populated MMIO" approach as used by the x86 IOMMUs is really 
safe, given that hotplug and/or BAR reassignment could happen after the 
domain is created. I agree there probably is room to be a bit cleverer 
with respect to ACS, though.

Robin.
Alex Williamson Feb. 27, 2018, 4:57 p.m. UTC | #6
On Tue, 27 Feb 2018 09:26:37 +0100
Auger Eric <eric.auger@redhat.com> wrote:

> Hi,

> On 27/02/18 00:13, Alex Williamson wrote:

> > On Mon, 26 Feb 2018 23:05:43 +0100

> > Auger Eric <eric.auger@redhat.com> wrote:

> >   

> >> Hi Shameer,

> >>

> >> [Adding Robin in CC]

> >> On 21/02/18 13:22, Shameer Kolothum wrote:  

> >>> This checks and rejects any dma map request outside valid iova

> >>> range.

> >>>

> >>> Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>

> >>> ---

> >>>  drivers/vfio/vfio_iommu_type1.c | 22 ++++++++++++++++++++++

> >>>  1 file changed, 22 insertions(+)

> >>>

> >>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c

> >>> index a80884e..3049393 100644

> >>> --- a/drivers/vfio/vfio_iommu_type1.c

> >>> +++ b/drivers/vfio/vfio_iommu_type1.c

> >>> @@ -970,6 +970,23 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,

> >>>  	return ret;

> >>>  }

> >>>  

> >>> +/*

> >>> + * Check dma map request is within a valid iova range

> >>> + */

> >>> +static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,

> >>> +				dma_addr_t start, dma_addr_t end)

> >>> +{

> >>> +	struct list_head *iova = &iommu->iova_list;

> >>> +	struct vfio_iova *node;

> >>> +

> >>> +	list_for_each_entry(node, iova, list) {

> >>> +		if ((start >= node->start) && (end <= node->end))

> >>> +			return true;    

> >> I am now confused by the fact this change will prevent existing QEMU

> >> from working with this series on some platforms. For instance QEMU virt

> >> machine GPA space collides with Seattle PCI host bridge windows. On ARM

> >> the smmu and smmuv3 drivers report the PCI host bridge windows as

> >> reserved regions which does not seem to be the case on other platforms.

> >> The change happened in commit 273df9635385b2156851c7ee49f40658d7bcb29d

> >> (iommu/dma: Make PCI window reservation generic).

> >>

> >> For background, we already discussed the topic after LPC 2016. See

> >> https://www.spinics.net/lists/kernel/msg2379607.html.

> >>

> >> So is it the right choice to expose PCI host bridge windows as reserved

> >> regions? If yes shouldn't we make a difference between those and MSI

> >> windows in this series and do not reject any user space DMA_MAP attempt

> >> within PCI host bridge windows.  

> > 

> > If the QEMU machine GPA collides with a reserved region today, then

> > either:

> > 

> > a) The mapping through the IOMMU works and the reserved region is wrong

> > 

> > or

> > 

> > b) The mapping doesn't actually work, QEMU is at risk of data loss by

> > being told that it worked, and we're justified in changing that

> > behavior.

> > 

> > Without knowing the specifics of SMMU, it doesn't particularly make

> > sense to me to mark the entire PCI hierarchy MMIO range as reserved,

> > unless perhaps the IOMMU is incapable of translating those IOVAs.  

> to me the limitation does not come from the smmu itself, which is a

> separate HW block sitting between the root complex and the interconnect.

> If ACS is not enforced by the PCIe subsystem, the transaction will never

> reach the IOMMU.


If the limitation is not from the SMMU, then why is it being exposed
via the IOMMU API?  This seems like overreach, trying to compensate for
a limitation elsewhere by imposing a restriction at the IOMMU.

> In the case of such overlap, shouldn't we just warn the end-user that

> this situation is dangerous instead of forbidding the use case which

> worked "in most cases" until now.


How do we distinguish between reserved ranges that are really reserved
and those that are only an advisory?  This seems like it defeats the
whole purpose of the reserved ranges.  Furthermore, if our vfio IOVA
list to the user is only advisory, what's the point?

Peer-to-peer MMIO within an IOMMU group is a tough problem, and one
that we've mostly ignored as we strive towards singleton IOMMU groups,
which are more the normal case on "enterprise" x86 hardware.  The user
does have some ability to determine potential conflicts, so I don't
necessarily see this as exclusively a kernel issue to solve.  However,
if the user needs to account for potentially conflicting MMIO outside of
the IOMMU group which they're provided, then yeah, we have a bigger
issue.  Thanks,

Alex
Alex Williamson Feb. 27, 2018, 5:13 p.m. UTC | #7
On Tue, 27 Feb 2018 09:57:16 +0000
Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com> wrote:

> > -----Original Message-----

> > From: Auger Eric [mailto:eric.auger@redhat.com]

> > Sent: Tuesday, February 27, 2018 8:27 AM

> > To: Alex Williamson <alex.williamson@redhat.com>

> > Cc: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>;

> > pmorel@linux.vnet.ibm.com; kvm@vger.kernel.org; linux-

> > kernel@vger.kernel.org; Linuxarm <linuxarm@huawei.com>; John Garry

> > <john.garry@huawei.com>; xuwei (O) <xuwei5@huawei.com>; Robin Murphy

> > <robin.murphy@arm.com>

> > Subject: Re: [PATCH v4 4/6] vfio/type1: check dma map request is within a valid

> > iova range

> > 

> > Hi,

> > On 27/02/18 00:13, Alex Williamson wrote:  

> > > On Mon, 26 Feb 2018 23:05:43 +0100

> > > Auger Eric <eric.auger@redhat.com> wrote:

> > >  

> > >> Hi Shameer,

> > >>

> > >> [Adding Robin in CC]

> > >> On 21/02/18 13:22, Shameer Kolothum wrote:  

> > >>> This checks and rejects any dma map request outside valid iova

> > >>> range.

> > >>>

> > >>> Signed-off-by: Shameer Kolothum  

> > <shameerali.kolothum.thodi@huawei.com>  

> > >>> ---

> > >>>  drivers/vfio/vfio_iommu_type1.c | 22 ++++++++++++++++++++++

> > >>>  1 file changed, 22 insertions(+)

> > >>>

> > >>> diff --git a/drivers/vfio/vfio_iommu_type1.c  

> > b/drivers/vfio/vfio_iommu_type1.c  

> > >>> index a80884e..3049393 100644

> > >>> --- a/drivers/vfio/vfio_iommu_type1.c

> > >>> +++ b/drivers/vfio/vfio_iommu_type1.c

> > >>> @@ -970,6 +970,23 @@ static int vfio_pin_map_dma(struct vfio_iommu  

> > *iommu, struct vfio_dma *dma,  

> > >>>  	return ret;

> > >>>  }

> > >>>

> > >>> +/*

> > >>> + * Check dma map request is within a valid iova range

> > >>> + */

> > >>> +static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,

> > >>> +				dma_addr_t start, dma_addr_t end)

> > >>> +{

> > >>> +	struct list_head *iova = &iommu->iova_list;

> > >>> +	struct vfio_iova *node;

> > >>> +

> > >>> +	list_for_each_entry(node, iova, list) {

> > >>> +		if ((start >= node->start) && (end <= node->end))

> > >>> +			return true;  

> > >> I am now confused by the fact this change will prevent existing QEMU

> > >> from working with this series on some platforms. For instance QEMU virt

> > >> machine GPA space collides with Seattle PCI host bridge windows. On ARM

> > >> the smmu and smmuv3 drivers report the PCI host bridge windows as

> > >> reserved regions which does not seem to be the case on other platforms.

> > >> The change happened in commit  

> > 273df9635385b2156851c7ee49f40658d7bcb29d  

> > >> (iommu/dma: Make PCI window reservation generic).

> > >>

> > >> For background, we already discussed the topic after LPC 2016. See

> > >> https://www.spinics.net/lists/kernel/msg2379607.html.

> > >>

> > >> So is it the right choice to expose PCI host bridge windows as reserved

> > >> regions? If yes shouldn't we make a difference between those and MSI

> > >> windows in this series and do not reject any user space DMA_MAP attempt

> > >> within PCI host bridge windows.  

> > >

> > > If the QEMU machine GPA collides with a reserved region today, then

> > > either:

> > >

> > > a) The mapping through the IOMMU works and the reserved region is wrong

> > >

> > > or

> > >

> > > b) The mapping doesn't actually work, QEMU is at risk of data loss by

> > > being told that it worked, and we're justified in changing that

> > > behavior.

> > >

> > > Without knowing the specifics of SMMU, it doesn't particularly make

> > > sense to me to mark the entire PCI hierarchy MMIO range as reserved,

> > > unless perhaps the IOMMU is incapable of translating those IOVAs.  

> > to me the limitation does not come from the smmu itself, which is a

> > separate HW block sitting between the root complex and the interconnect.

> > If ACS is not enforced by the PCIe subsystem, the transaction will never

> > reach the IOMMU.  

> 

> True. And we do have one such platform where ACS is not enforced but 

> reserving the regions and possibly creating holes while launching VM will

> make it secure. But I do wonder how we will solve the device grouping

> in such cases. 


"creating holes... will make it secure", that's worrisome.  The purpose
of an IOVA list is to inform the user which ranges are available to
them to use.  This is informative, not security.  A malicious user can
ask the device to perform DMA anywhere they choose, regardless of what
we report as reserved.  The hardware provides the security; if we're
relying on the user to honor holes, that's only cooperative security,
which is really no security at all.  If the IOMMU groups don't already
reflect this, they're incorrect.
 
> The Seattle PCI host bridge windows case you mentioned has any pci quirk 

> to claim that they support ACS?

>  

> > In the case of such overlap, shouldn't we just warn the end-user that

> > this situation is dangerous instead of forbidding the use case which

> > worked "in most cases" until now.  

> 

> Yes, may be something similar to the allow_unsafe_interrupts case, if

> that is acceptable.


"allow_unsafe_interrupts" is basically a statement that the user trusts
their user is not malicious and will not exploit potential DMA
attacks.  I'm having a little bit of trouble equating that to allowing
the user to ignore the list of valid IOVA ranges we've provided.  Why
provide a list at all if it's ignored?  If there's grey area in the
list for things the user can choose to ignore, then the list is
invalid.  Thanks,

Alex
Eric Auger Feb. 28, 2018, 9:02 a.m. UTC | #8
Hi Shameer,

On 27/02/18 10:57, Shameerali Kolothum Thodi wrote:
> 

> 

>> -----Original Message-----

>> From: Auger Eric [mailto:eric.auger@redhat.com]

>> Sent: Tuesday, February 27, 2018 8:27 AM

>> To: Alex Williamson <alex.williamson@redhat.com>

>> Cc: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>;

>> pmorel@linux.vnet.ibm.com; kvm@vger.kernel.org; linux-

>> kernel@vger.kernel.org; Linuxarm <linuxarm@huawei.com>; John Garry

>> <john.garry@huawei.com>; xuwei (O) <xuwei5@huawei.com>; Robin Murphy

>> <robin.murphy@arm.com>

>> Subject: Re: [PATCH v4 4/6] vfio/type1: check dma map request is within a valid

>> iova range

>>

>> Hi,

>> On 27/02/18 00:13, Alex Williamson wrote:

>>> On Mon, 26 Feb 2018 23:05:43 +0100

>>> Auger Eric <eric.auger@redhat.com> wrote:

>>>

>>>> Hi Shameer,

>>>>

>>>> [Adding Robin in CC]

>>>> On 21/02/18 13:22, Shameer Kolothum wrote:

>>>>> This checks and rejects any dma map request outside valid iova

>>>>> range.

>>>>>

>>>>> Signed-off-by: Shameer Kolothum

>> <shameerali.kolothum.thodi@huawei.com>

>>>>> ---

>>>>>  drivers/vfio/vfio_iommu_type1.c | 22 ++++++++++++++++++++++

>>>>>  1 file changed, 22 insertions(+)

>>>>>

>>>>> diff --git a/drivers/vfio/vfio_iommu_type1.c

>> b/drivers/vfio/vfio_iommu_type1.c

>>>>> index a80884e..3049393 100644

>>>>> --- a/drivers/vfio/vfio_iommu_type1.c

>>>>> +++ b/drivers/vfio/vfio_iommu_type1.c

>>>>> @@ -970,6 +970,23 @@ static int vfio_pin_map_dma(struct vfio_iommu

>> *iommu, struct vfio_dma *dma,

>>>>>  	return ret;

>>>>>  }

>>>>>

>>>>> +/*

>>>>> + * Check dma map request is within a valid iova range

>>>>> + */

>>>>> +static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,

>>>>> +				dma_addr_t start, dma_addr_t end)

>>>>> +{

>>>>> +	struct list_head *iova = &iommu->iova_list;

>>>>> +	struct vfio_iova *node;

>>>>> +

>>>>> +	list_for_each_entry(node, iova, list) {

>>>>> +		if ((start >= node->start) && (end <= node->end))

>>>>> +			return true;

>>>> I am now confused by the fact this change will prevent existing QEMU

>>>> from working with this series on some platforms. For instance QEMU virt

>>>> machine GPA space collides with Seattle PCI host bridge windows. On ARM

>>>> the smmu and smmuv3 drivers report the PCI host bridge windows as

>>>> reserved regions which does not seem to be the case on other platforms.

>>>> The change happened in commit

>> 273df9635385b2156851c7ee49f40658d7bcb29d

>>>> (iommu/dma: Make PCI window reservation generic).

>>>>

>>>> For background, we already discussed the topic after LPC 2016. See

>>>> https://www.spinics.net/lists/kernel/msg2379607.html.

>>>>

>>>> So is it the right choice to expose PCI host bridge windows as reserved

>>>> regions? If yes shouldn't we make a difference between those and MSI

>>>> windows in this series and do not reject any user space DMA_MAP attempt

>>>> within PCI host bridge windows.

>>>

>>> If the QEMU machine GPA collides with a reserved region today, then

>>> either:

>>>

>>> a) The mapping through the IOMMU works and the reserved region is wrong

>>>

>>> or

>>>

>>> b) The mapping doesn't actually work, QEMU is at risk of data loss by

>>> being told that it worked, and we're justified in changing that

>>> behavior.

>>>

>>> Without knowing the specifics of SMMU, it doesn't particularly make

>>> sense to me to mark the entire PCI hierarchy MMIO range as reserved,

>>> unless perhaps the IOMMU is incapable of translating those IOVAs.

>> to me the limitation does not come from the smmu itself, which is a

>> separate HW block sitting between the root complex and the interconnect.

>> If ACS is not enforced by the PCIe subsystem, the transaction will never

>> reach the IOMMU.

> 

> True. And we do have one such platform where ACS is not enforced but 

> reserving the regions and possibly creating holes while launching VM will

> make it secure. But I do wonder how we will solve the device grouping

> in such cases. 

> 

> The Seattle PCI host bridge windows case you mentioned has any pci quirk 

> to claim that they support ACS?

No, there is none to my knowledge. I am applying Alex's non-upstream ACS
override patch.

Thanks

Eric
>  

>> In the case of such overlap, shouldn't we just warn the end-user that

>> this situation is dangerous instead of forbidding the use case which

>> worked "in most cases" until now.

> 

> Yes, may be something similar to the allow_unsafe_interrupts case, if

> that is acceptable.

> 

> Thanks,

> Shameer

>  

>>> Are we trying to prevent untranslated p2p with this reserved range?

>>> That's not necessarily a terrible idea, but it seems that doing it for

>>> that purpose would need to be a lot smarter, taking into account ACS

>>> and precisely selecting ranges within the peer address space that would

>>> be untranslated.  Perhaps only populated MMIO within non-ACS

>>> hierarchies.  Thanks,

>>

>> Indeed taking into account the ACS capability would refine the

>> situations where a risk exists.

>>

>> Thanks

>>

>> Eric

>>>

>>> Alex

>>>
Shameerali Kolothum Thodi Feb. 28, 2018, 9:25 a.m. UTC | #9
> -----Original Message-----

> From: Auger Eric [mailto:eric.auger@redhat.com]

> Sent: Wednesday, February 28, 2018 9:02 AM

> To: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>;

> Alex Williamson <alex.williamson@redhat.com>

> Cc: pmorel@linux.vnet.ibm.com; kvm@vger.kernel.org; linux-

> kernel@vger.kernel.org; Linuxarm <linuxarm@huawei.com>; John Garry

> <john.garry@huawei.com>; xuwei (O) <xuwei5@huawei.com>; Robin Murphy

> <robin.murphy@arm.com>

> Subject: Re: [PATCH v4 4/6] vfio/type1: check dma map request is within a valid

> iova range

> 

> Hi Shameer,

> 

> On 27/02/18 10:57, Shameerali Kolothum Thodi wrote:

> >

> >

> >> -----Original Message-----

> >> From: Auger Eric [mailto:eric.auger@redhat.com]

> >> Sent: Tuesday, February 27, 2018 8:27 AM

> >> To: Alex Williamson <alex.williamson@redhat.com>

> >> Cc: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>;

> >> pmorel@linux.vnet.ibm.com; kvm@vger.kernel.org; linux-

> >> kernel@vger.kernel.org; Linuxarm <linuxarm@huawei.com>; John Garry

> >> <john.garry@huawei.com>; xuwei (O) <xuwei5@huawei.com>; Robin

> Murphy

> >> <robin.murphy@arm.com>

> >> Subject: Re: [PATCH v4 4/6] vfio/type1: check dma map request is within a

> valid

> >> iova range

> >>

> >> Hi,

> >> On 27/02/18 00:13, Alex Williamson wrote:

> >>> On Mon, 26 Feb 2018 23:05:43 +0100

> >>> Auger Eric <eric.auger@redhat.com> wrote:

> >>>

> >>>> Hi Shameer,

> >>>>

> >>>> [Adding Robin in CC]

> >>>> On 21/02/18 13:22, Shameer Kolothum wrote:

> >>>>> This checks and rejects any dma map request outside valid iova

> >>>>> range.

> >>>>>

> >>>>> Signed-off-by: Shameer Kolothum

> >> <shameerali.kolothum.thodi@huawei.com>

> >>>>> ---

> >>>>>  drivers/vfio/vfio_iommu_type1.c | 22 ++++++++++++++++++++++

> >>>>>  1 file changed, 22 insertions(+)

> >>>>>

> >>>>> diff --git a/drivers/vfio/vfio_iommu_type1.c

> >> b/drivers/vfio/vfio_iommu_type1.c

> >>>>> index a80884e..3049393 100644

> >>>>> --- a/drivers/vfio/vfio_iommu_type1.c

> >>>>> +++ b/drivers/vfio/vfio_iommu_type1.c

> >>>>> @@ -970,6 +970,23 @@ static int vfio_pin_map_dma(struct vfio_iommu

> >> *iommu, struct vfio_dma *dma,

> >>>>>  	return ret;

> >>>>>  }

> >>>>>

> >>>>> +/*

> >>>>> + * Check dma map request is within a valid iova range

> >>>>> + */

> >>>>> +static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,

> >>>>> +				dma_addr_t start, dma_addr_t end)

> >>>>> +{

> >>>>> +	struct list_head *iova = &iommu->iova_list;

> >>>>> +	struct vfio_iova *node;

> >>>>> +

> >>>>> +	list_for_each_entry(node, iova, list) {

> >>>>> +		if ((start >= node->start) && (end <= node->end))

> >>>>> +			return true;

> >>>> I am now confused by the fact this change will prevent existing QEMU

> >>>> from working with this series on some platforms. For instance QEMU virt

> >>>> machine GPA space collides with Seattle PCI host bridge windows. On

> ARM

> >>>> the smmu and smmuv3 drivers report the PCI host bridge windows as

> >>>> reserved regions which does not seem to be the case on other platforms.

> >>>> The change happened in commit

> >> 273df9635385b2156851c7ee49f40658d7bcb29d

> >>>> (iommu/dma: Make PCI window reservation generic).

> >>>>

> >>>> For background, we already discussed the topic after LPC 2016. See

> >>>> https://www.spinics.net/lists/kernel/msg2379607.html.

> >>>>

> >>>> So is it the right choice to expose PCI host bridge windows as reserved

> >>>> regions? If yes shouldn't we make a difference between those and MSI

> >>>> windows in this series and do not reject any user space DMA_MAP

> attempt

> >>>> within PCI host bridge windows.

> >>>

> >>> If the QEMU machine GPA collides with a reserved region today, then

> >>> either:

> >>>

> >>> a) The mapping through the IOMMU works and the reserved region is

> wrong

> >>>

> >>> or

> >>>

> >>> b) The mapping doesn't actually work, QEMU is at risk of data loss by

> >>> being told that it worked, and we're justified in changing that

> >>> behavior.

> >>>

> >>> Without knowing the specifics of SMMU, it doesn't particularly make

> >>> sense to me to mark the entire PCI hierarchy MMIO range as reserved,

> >>> unless perhaps the IOMMU is incapable of translating those IOVAs.

> >> to me the limitation does not come from the smmu itself, which is a

> >> separate HW block sitting between the root complex and the interconnect.

> >> If ACS is not enforced by the PCIe subsystem, the transaction will never

> >> reach the IOMMU.

> >

> > True. And we do have one such platform where ACS is not enforced but

> > reserving the regions and possibly creating holes while launching VM will

> > make it secure. But I do wonder how we will solve the device grouping

> > in such cases.

> >

> > The Seattle PCI host bridge windows case you mentioned has any pci quirk

> > to claim that they support ACS?

> No there is none to my knowledge. I am applying Alex' not upstream ACS

> overwrite patch.


Ok. But isn't that patch actually applicable to cases where ACS is really supported
by the hardware but the capability is not available?  I am just trying to see whether
the argument that we should allow DMA MAP requests for this (non-ACS) case,
even if the Guest GPA conflicts with a reserved region, holds good. The fact that it
may have been working before is probably that the Guest never actually allocated any
GPA from the reserved region, or maybe I am missing something here.

Thanks,
Shameer

> Thanks

> 

> Eric

> >

> >> In the case of such overlap, shouldn't we just warn the end-user that

> >> this situation is dangerous instead of forbidding the use case which

> >> worked "in most cases" until now.

> >

> > Yes, may be something similar to the allow_unsafe_interrupts case, if

> > that is acceptable.

> >

> > Thanks,

> > Shameer

> >

> >>> Are we trying to prevent untranslated p2p with this reserved range?

> >>> That's not necessarily a terrible idea, but it seems that doing it for

> >>> that purpose would need to be a lot smarter, taking into account ACS

> >>> and precisely selecting ranges within the peer address space that would

> >>> be untranslated.  Perhaps only populated MMIO within non-ACS

> >>> hierarchies.  Thanks,

> >>

> >> Indeed taking into account the ACS capability would refine the

> >> situations where a risk exists.

> >>

> >> Thanks

> >>

> >> Eric

> >>>

> >>> Alex

> >>>
Eric Auger Feb. 28, 2018, 11:53 a.m. UTC | #10
Hi Shameer,

On 28/02/18 10:25, Shameerali Kolothum Thodi wrote:
> 

> 

>> -----Original Message-----

>> From: Auger Eric [mailto:eric.auger@redhat.com]

>> Sent: Wednesday, February 28, 2018 9:02 AM

>> To: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>;

>> Alex Williamson <alex.williamson@redhat.com>

>> Cc: pmorel@linux.vnet.ibm.com; kvm@vger.kernel.org; linux-

>> kernel@vger.kernel.org; Linuxarm <linuxarm@huawei.com>; John Garry

>> <john.garry@huawei.com>; xuwei (O) <xuwei5@huawei.com>; Robin Murphy

>> <robin.murphy@arm.com>

>> Subject: Re: [PATCH v4 4/6] vfio/type1: check dma map request is within a valid

>> iova range

>>

>> Hi Shameer,

>>

>> On 27/02/18 10:57, Shameerali Kolothum Thodi wrote:

>>>

>>>

>>>> -----Original Message-----

>>>> From: Auger Eric [mailto:eric.auger@redhat.com]

>>>> Sent: Tuesday, February 27, 2018 8:27 AM

>>>> To: Alex Williamson <alex.williamson@redhat.com>

>>>> Cc: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>;

>>>> pmorel@linux.vnet.ibm.com; kvm@vger.kernel.org; linux-

>>>> kernel@vger.kernel.org; Linuxarm <linuxarm@huawei.com>; John Garry

>>>> <john.garry@huawei.com>; xuwei (O) <xuwei5@huawei.com>; Robin

>> Murphy

>>>> <robin.murphy@arm.com>

>>>> Subject: Re: [PATCH v4 4/6] vfio/type1: check dma map request is within a

>> valid

>>>> iova range

>>>>

>>>> Hi,

>>>> On 27/02/18 00:13, Alex Williamson wrote:

>>>>> On Mon, 26 Feb 2018 23:05:43 +0100

>>>>> Auger Eric <eric.auger@redhat.com> wrote:

>>>>>

>>>>>> Hi Shameer,

>>>>>>

>>>>>> [Adding Robin in CC]

>>>>>> On 21/02/18 13:22, Shameer Kolothum wrote:

>>>>>>> This checks and rejects any dma map request outside valid iova

>>>>>>> range.

>>>>>>>

>>>>>>> Signed-off-by: Shameer Kolothum

>>>> <shameerali.kolothum.thodi@huawei.com>

>>>>>>> ---

>>>>>>>  drivers/vfio/vfio_iommu_type1.c | 22 ++++++++++++++++++++++

>>>>>>>  1 file changed, 22 insertions(+)

>>>>>>>

>>>>>>> diff --git a/drivers/vfio/vfio_iommu_type1.c

>>>> b/drivers/vfio/vfio_iommu_type1.c

>>>>>>> index a80884e..3049393 100644

>>>>>>> --- a/drivers/vfio/vfio_iommu_type1.c

>>>>>>> +++ b/drivers/vfio/vfio_iommu_type1.c

>>>>>>> @@ -970,6 +970,23 @@ static int vfio_pin_map_dma(struct vfio_iommu

>>>> *iommu, struct vfio_dma *dma,

>>>>>>>  	return ret;

>>>>>>>  }

>>>>>>>

>>>>>>> +/*

>>>>>>> + * Check dma map request is within a valid iova range

>>>>>>> + */

>>>>>>> +static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,

>>>>>>> +				dma_addr_t start, dma_addr_t end)

>>>>>>> +{

>>>>>>> +	struct list_head *iova = &iommu->iova_list;

>>>>>>> +	struct vfio_iova *node;

>>>>>>> +

>>>>>>> +	list_for_each_entry(node, iova, list) {

>>>>>>> +		if ((start >= node->start) && (end <= node->end))

>>>>>>> +			return true;

>>>>>> I am now confused by the fact this change will prevent existing QEMU

>>>>>> from working with this series on some platforms. For instance QEMU virt

>>>>>> machine GPA space collides with Seattle PCI host bridge windows. On

>> ARM

>>>>>> the smmu and smmuv3 drivers report the PCI host bridge windows as

>>>>>> reserved regions which does not seem to be the case on other platforms.

>>>>>> The change happened in commit

>>>> 273df9635385b2156851c7ee49f40658d7bcb29d

>>>>>> (iommu/dma: Make PCI window reservation generic).

>>>>>>

>>>>>> For background, we already discussed the topic after LPC 2016. See

>>>>>> https://www.spinics.net/lists/kernel/msg2379607.html.

>>>>>>

>>>>>> So is it the right choice to expose PCI host bridge windows as reserved

>>>>>> regions? If yes shouldn't we make a difference between those and MSI

>>>>>> windows in this series and do not reject any user space DMA_MAP

>> attempt

>>>>>> within PCI host bridge windows.

>>>>>

>>>>> If the QEMU machine GPA collides with a reserved region today, then

>>>>> either:

>>>>>

>>>>> a) The mapping through the IOMMU works and the reserved region is

>> wrong

>>>>>

>>>>> or

>>>>>

>>>>> b) The mapping doesn't actually work, QEMU is at risk of data loss by

>>>>> being told that it worked, and we're justified in changing that

>>>>> behavior.

>>>>>

>>>>> Without knowing the specifics of SMMU, it doesn't particularly make

>>>>> sense to me to mark the entire PCI hierarchy MMIO range as reserved,

>>>>> unless perhaps the IOMMU is incapable of translating those IOVAs.

>>>> to me the limitation does not come from the smmu itself, which is a

>>>> separate HW block sitting between the root complex and the interconnect.

>>>> If ACS is not enforced by the PCIe subsystem, the transaction will never

>>>> reach the IOMMU.

>>>

>>> True. And we do have one such platform where ACS is not enforced but

>>> reserving the regions and possibly creating holes while launching VM will

>>> make it secure. But I do wonder how we will solve the device grouping

>>> in such cases.

>>>

>>> The Seattle PCI host bridge windows case you mentioned has any pci quirk

>>> to claim that they support ACS?

>> No there is none to my knowledge. I am applying Alex' not upstream ACS

>> overwrite patch.

> 

> Ok. But isn't that patch actually applicable to cases where ACS is really supported

> by hardware but the capability is not available?


My understanding is normally yes. If you apply the patch while the HW
does not actually support ACS, then you fool the kernel into pretending
there is isolation where there is none. I don't know the exact
capability of the HW on AMD Seattle; effectively I should have looked
into it much earlier, and if the HW capability is supported but not
properly exposed we should have implemented a clean quirk for this platform.


> I am just trying to see whether
> the argument that we should allow DMA MAP requests for this(non-ACS case)

> even if the Guest GPA conflict with reserved region holds good. The fact that may

> be it was working before is that the Guest never actually allocated any GPA from

> the reserved region or maybe I am missing something here.


If my understanding is correct, in an ideal world we would report the PCI
host bridge window as reserved only in case ACS is not supported. If you
apply the patch and override ACS, then the DMA_MAP would be
allowed. In case the HW does not support ACS, you could then face
situations where one EP tries to access a GPA that never reaches the IOMMU
(because it corresponds to the BAR of another downstream EP). The same can
happen at the moment.

Thanks

Eric
> 

> Thanks,

> Shameer

> 

>> Thanks

>>

>> Eric

>>>

>>>> In the case of such overlap, shouldn't we just warn the end-user that

>>>> this situation is dangerous instead of forbidding the use case which

>>>> worked "in most cases" until now.

>>>

>>> Yes, may be something similar to the allow_unsafe_interrupts case, if

>>> that is acceptable.

>>>

>>> Thanks,

>>> Shameer

>>>

>>>>> Are we trying to prevent untranslated p2p with this reserved range?

>>>>> That's not necessarily a terrible idea, but it seems that doing it for

>>>>> that purpose would need to be a lot smarter, taking into account ACS

>>>>> and precisely selecting ranges within the peer address space that would

>>>>> be untranslated.  Perhaps only populated MMIO within non-ACS

>>>>> hierarchies.  Thanks,

>>>>

>>>> Indeed taking into account the ACS capability would refine the

>>>> situations where a risk exists.

>>>>

>>>> Thanks

>>>>

>>>> Eric

>>>>>

>>>>> Alex

>>>>>
Shameerali Kolothum Thodi Feb. 28, 2018, 1:39 p.m. UTC | #11
Hi Eric,

> -----Original Message-----

> From: Auger Eric [mailto:eric.auger@redhat.com]

> Sent: Wednesday, February 28, 2018 11:53 AM

> To: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>;

> Alex Williamson <alex.williamson@redhat.com>

> Cc: pmorel@linux.vnet.ibm.com; kvm@vger.kernel.org; linux-

> kernel@vger.kernel.org; Linuxarm <linuxarm@huawei.com>; John Garry

> <john.garry@huawei.com>; xuwei (O) <xuwei5@huawei.com>; Robin Murphy

> <robin.murphy@arm.com>

> Subject: Re: [PATCH v4 4/6] vfio/type1: check dma map request is within a valid

> iova range

> 

> Hi Shameer,

> 

> On 28/02/18 10:25, Shameerali Kolothum Thodi wrote:

> >

> >

> >> -----Original Message-----

> >> From: Auger Eric [mailto:eric.auger@redhat.com]

> >> Sent: Wednesday, February 28, 2018 9:02 AM

> >> To: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>;

> >> Alex Williamson <alex.williamson@redhat.com>

> >> Cc: pmorel@linux.vnet.ibm.com; kvm@vger.kernel.org; linux-

> >> kernel@vger.kernel.org; Linuxarm <linuxarm@huawei.com>; John Garry

> >> <john.garry@huawei.com>; xuwei (O) <xuwei5@huawei.com>; Robin

> Murphy

> >> <robin.murphy@arm.com>

> >> Subject: Re: [PATCH v4 4/6] vfio/type1: check dma map request is within a

> valid

> >> iova range

> >>

> >> Hi Shameer,

> >>

> >> On 27/02/18 10:57, Shameerali Kolothum Thodi wrote:

> >>>

> >>>

> >>>> -----Original Message-----

> >>>> From: Auger Eric [mailto:eric.auger@redhat.com]

> >>>> Sent: Tuesday, February 27, 2018 8:27 AM

> >>>> To: Alex Williamson <alex.williamson@redhat.com>

> >>>> Cc: Shameerali Kolothum Thodi

> <shameerali.kolothum.thodi@huawei.com>;

> >>>> pmorel@linux.vnet.ibm.com; kvm@vger.kernel.org; linux-

> >>>> kernel@vger.kernel.org; Linuxarm <linuxarm@huawei.com>; John Garry

> >>>> <john.garry@huawei.com>; xuwei (O) <xuwei5@huawei.com>; Robin

> >> Murphy

> >>>> <robin.murphy@arm.com>

> >>>> Subject: Re: [PATCH v4 4/6] vfio/type1: check dma map request is within a

> >> valid

> >>>> iova range

> >>>>

> >>>> Hi,

> >>>> On 27/02/18 00:13, Alex Williamson wrote:

> >>>>> On Mon, 26 Feb 2018 23:05:43 +0100

> >>>>> Auger Eric <eric.auger@redhat.com> wrote:

> >>>>>

> >>>>>> Hi Shameer,

> >>>>>>

> >>>>>> [Adding Robin in CC]

> >>>>>> On 21/02/18 13:22, Shameer Kolothum wrote:

> >>>>>>> This checks and rejects any dma map request outside valid iova

> >>>>>>> range.

> >>>>>>>

> >>>>>>> Signed-off-by: Shameer Kolothum

> >>>> <shameerali.kolothum.thodi@huawei.com>

> >>>>>>> ---

> >>>>>>>  drivers/vfio/vfio_iommu_type1.c | 22 ++++++++++++++++++++++

> >>>>>>>  1 file changed, 22 insertions(+)

> >>>>>>>

> >>>>>>> diff --git a/drivers/vfio/vfio_iommu_type1.c

> >>>> b/drivers/vfio/vfio_iommu_type1.c

> >>>>>>> index a80884e..3049393 100644

> >>>>>>> --- a/drivers/vfio/vfio_iommu_type1.c

> >>>>>>> +++ b/drivers/vfio/vfio_iommu_type1.c

> >>>>>>> @@ -970,6 +970,23 @@ static int vfio_pin_map_dma(struct

> vfio_iommu

> >>>> *iommu, struct vfio_dma *dma,

> >>>>>>>  	return ret;

> >>>>>>>  }

> >>>>>>>

> >>>>>>> +/*

> >>>>>>> + * Check dma map request is within a valid iova range

> >>>>>>> + */

> >>>>>>> +static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,

> >>>>>>> +				dma_addr_t start, dma_addr_t end)

> >>>>>>> +{

> >>>>>>> +	struct list_head *iova = &iommu->iova_list;

> >>>>>>> +	struct vfio_iova *node;

> >>>>>>> +

> >>>>>>> +	list_for_each_entry(node, iova, list) {

> >>>>>>> +		if ((start >= node->start) && (end <= node->end))

> >>>>>>> +			return true;

> >>>>>> I am now confused by the fact this change will prevent existing QEMU

> >>>>>> from working with this series on some platforms. For instance QEMU

> virt

> >>>>>> machine GPA space collides with Seattle PCI host bridge windows. On

> >> ARM

> >>>>>> the smmu and smmuv3 drivers report the PCI host bridge windows as

> >>>>>> reserved regions which does not seem to be the case on other

> platforms.

> >>>>>> The change happened in commit

> >>>> 273df9635385b2156851c7ee49f40658d7bcb29d

> >>>>>> (iommu/dma: Make PCI window reservation generic).

> >>>>>>

> >>>>>> For background, we already discussed the topic after LPC 2016. See

> >>>>>> https://www.spinics.net/lists/kernel/msg2379607.html.

> >>>>>>

> >>>>>> So is it the right choice to expose PCI host bridge windows as reserved

> >>>>>> regions? If yes shouldn't we make a difference between those and MSI

> >>>>>> windows in this series and do not reject any user space DMA_MAP

> >> attempt

> >>>>>> within PCI host bridge windows.

> >>>>>

> >>>>> If the QEMU machine GPA collides with a reserved region today, then

> >>>>> either:

> >>>>>

> >>>>> a) The mapping through the IOMMU works and the reserved region is

> >> wrong

> >>>>>

> >>>>> or

> >>>>>

> >>>>> b) The mapping doesn't actually work, QEMU is at risk of data loss by

> >>>>> being told that it worked, and we're justified in changing that

> >>>>> behavior.

> >>>>>

> >>>>> Without knowing the specifics of SMMU, it doesn't particularly make

> >>>>> sense to me to mark the entire PCI hierarchy MMIO range as reserved,

> >>>>> unless perhaps the IOMMU is incapable of translating those IOVAs.

> >>>> to me the limitation does not come from the smmu itself, which is a

> >>>> separate HW block sitting between the root complex and the

> interconnect.

> >>>> If ACS is not enforced by the PCIe subsystem, the transaction will never

> >>>> reach the IOMMU.

> >>>

> >>> True. And we do have one such platform where ACS is not enforced but

> >>> reserving the regions and possibly creating holes while launching VM will

> >>> make it secure. But I do wonder how we will solve the device grouping

> >>> in such cases.

> >>>

> >>> The Seattle PCI host bridge windows case you mentioned has any pci quirk

> >>> to claim that they support ACS?

> >> No there is none to my knowledge. I am applying Alex' not upstream ACS

> >> overwrite patch.

> >

> > Ok. But isn't that patch actually applicable to cases where ACS is really

> supported

> > by hardware but the capability is not available?

> 

> My understanding is normally yes. If you apply the patch whereas the HW

> practically does not support ACS, then you fool the kernel pretending

> there is isolation whereas there is not. I don't know the exact

> capability of the HW on AMD Seattle and effectively I should have cared

> about it much earlier and if the HW capability were supported and not

> properly exposed we should have implemented a clean quirk for this platform.


Ok. Thanks for the details.
 
> 

>   I am just trying to see whether

> > the argument that we should allow DMA MAP requests for this(non-ACS case)

> > even if the Guest GPA conflict with reserved region holds good. The fact that

> may

> > be it was working before is that the Guest never actually allocated any GPA

> from

> > the reserved region or maybe I am missing something here.

> 

> If my understanding is correct, in ideal world we would report the PCI

> host bridge window as reserved only in case ACS is not supported. If you

> apply the patch and overrides the ACS, then the DMA_MAP  would be

> allowed. In case the HW does not support ACS, then you could face

> situations where one EP tries to access GPA that never reaches the IOMMU

> (because it corresponds to the BAR of another downstream EP). Same can

> happen at the moment.


Yes, this is my understanding too.

Just wondering whether the below change to iommu_dma_get_resv_regions() is
good enough to take care of this issue or not.

Thanks,
Shameer

-- >8 --
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index f05f3cf..b6e89d5 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -34,6 +34,8 @@
 
 #define IOMMU_MAPPING_ERROR	0
 
+#define REQ_ACS_FLAGS   (PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF)
+
 struct iommu_dma_msi_page {
 	struct list_head	list;
 	dma_addr_t		iova;
@@ -183,6 +185,9 @@ void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list)
 	if (!dev_is_pci(dev))
 		return;
 
+	if (pci_acs_path_enabled(to_pci_dev(dev), NULL, REQ_ACS_FLAGS))
+		return;
+
 	bridge = pci_find_host_bridge(to_pci_dev(dev)->bus);
 	resource_list_for_each_entry(window, &bridge->windows) {
 		struct iommu_resv_region *region;
---8--


> 

> Eric

> >

> > Thanks,

> > Shameer

> >

> >> Thanks

> >>

> >> Eric

> >>>

> >>>> In the case of such overlap, shouldn't we just warn the end-user that

> >>>> this situation is dangerous instead of forbidding the use case which

> >>>> worked "in most cases" until now.

> >>>

> >>> Yes, may be something similar to the allow_unsafe_interrupts case, if

> >>> that is acceptable.

> >>>

> >>> Thanks,

> >>> Shameer

> >>>

> >>>>> Are we trying to prevent untranslated p2p with this reserved range?

> >>>>> That's not necessarily a terrible idea, but it seems that doing it for

> >>>>> that purpose would need to be a lot smarter, taking into account ACS

> >>>>> and precisely selecting ranges within the peer address space that would

> >>>>> be untranslated.  Perhaps only populated MMIO within non-ACS

> >>>>> hierarchies.  Thanks,

> >>>>

> >>>> Indeed taking into account the ACS capability would refine the

> >>>> situations where a risk exists.

> >>>>

> >>>> Thanks

> >>>>

> >>>> Eric

> >>>>>

> >>>>> Alex

> >>>>>
Alex Williamson Feb. 28, 2018, 3:24 p.m. UTC | #12
On Wed, 28 Feb 2018 12:53:27 +0100
Auger Eric <eric.auger@redhat.com> wrote:

> Hi Shameer,

> 

> On 28/02/18 10:25, Shameerali Kolothum Thodi wrote:

> > 

> >   

> >> -----Original Message-----

> >> From: Auger Eric [mailto:eric.auger@redhat.com]

> >> Sent: Wednesday, February 28, 2018 9:02 AM

> >> To: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>;

> >> Alex Williamson <alex.williamson@redhat.com>

> >> Cc: pmorel@linux.vnet.ibm.com; kvm@vger.kernel.org; linux-

> >> kernel@vger.kernel.org; Linuxarm <linuxarm@huawei.com>; John Garry

> >> <john.garry@huawei.com>; xuwei (O) <xuwei5@huawei.com>; Robin Murphy

> >> <robin.murphy@arm.com>

> >> Subject: Re: [PATCH v4 4/6] vfio/type1: check dma map request is within a valid

> >> iova range

> >>

> >> Hi Shameer,

> >>

> >> On 27/02/18 10:57, Shameerali Kolothum Thodi wrote:  

> >>>

> >>>  

> >>>> -----Original Message-----

> >>>> From: Auger Eric [mailto:eric.auger@redhat.com]

> >>>> Sent: Tuesday, February 27, 2018 8:27 AM

> >>>> To: Alex Williamson <alex.williamson@redhat.com>

> >>>> Cc: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>;

> >>>> pmorel@linux.vnet.ibm.com; kvm@vger.kernel.org; linux-

> >>>> kernel@vger.kernel.org; Linuxarm <linuxarm@huawei.com>; John Garry

> >>>> <john.garry@huawei.com>; xuwei (O) <xuwei5@huawei.com>; Robin  

> >> Murphy  

> >>>> <robin.murphy@arm.com>

> >>>> Subject: Re: [PATCH v4 4/6] vfio/type1: check dma map request is within a  

> >> valid  

> >>>> iova range

> >>>>

> >>>> Hi,

> >>>> On 27/02/18 00:13, Alex Williamson wrote:  

> >>>>> On Mon, 26 Feb 2018 23:05:43 +0100

> >>>>> Auger Eric <eric.auger@redhat.com> wrote:

> >>>>>  

> >>>>>> Hi Shameer,

> >>>>>>

> >>>>>> [Adding Robin in CC]

> >>>>>> On 21/02/18 13:22, Shameer Kolothum wrote:  

> >>>>>>> This checks and rejects any dma map request outside valid iova

> >>>>>>> range.

> >>>>>>>

> >>>>>>> Signed-off-by: Shameer Kolothum  

> >>>> <shameerali.kolothum.thodi@huawei.com>  

> >>>>>>> ---

> >>>>>>>  drivers/vfio/vfio_iommu_type1.c | 22 ++++++++++++++++++++++

> >>>>>>>  1 file changed, 22 insertions(+)

> >>>>>>>

> >>>>>>> diff --git a/drivers/vfio/vfio_iommu_type1.c  

> >>>> b/drivers/vfio/vfio_iommu_type1.c  

> >>>>>>> index a80884e..3049393 100644

> >>>>>>> --- a/drivers/vfio/vfio_iommu_type1.c

> >>>>>>> +++ b/drivers/vfio/vfio_iommu_type1.c

> >>>>>>> @@ -970,6 +970,23 @@ static int vfio_pin_map_dma(struct vfio_iommu  

> >>>> *iommu, struct vfio_dma *dma,  

> >>>>>>>  	return ret;

> >>>>>>>  }

> >>>>>>>

> >>>>>>> +/*

> >>>>>>> + * Check dma map request is within a valid iova range

> >>>>>>> + */

> >>>>>>> +static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,

> >>>>>>> +				dma_addr_t start, dma_addr_t end)

> >>>>>>> +{

> >>>>>>> +	struct list_head *iova = &iommu->iova_list;

> >>>>>>> +	struct vfio_iova *node;

> >>>>>>> +

> >>>>>>> +	list_for_each_entry(node, iova, list) {

> >>>>>>> +		if ((start >= node->start) && (end <= node->end))

> >>>>>>> +			return true;  

> >>>>>> I am now confused by the fact this change will prevent existing QEMU

> >>>>>> from working with this series on some platforms. For instance QEMU virt

> >>>>>> machine GPA space collides with Seattle PCI host bridge windows. On  

> >> ARM  

> >>>>>> the smmu and smmuv3 drivers report the PCI host bridge windows as

> >>>>>> reserved regions which does not seem to be the case on other platforms.

> >>>>>> The change happened in commit  

> >>>> 273df9635385b2156851c7ee49f40658d7bcb29d  

> >>>>>> (iommu/dma: Make PCI window reservation generic).

> >>>>>>

> >>>>>> For background, we already discussed the topic after LPC 2016. See

> >>>>>> https://www.spinics.net/lists/kernel/msg2379607.html.

> >>>>>>

> >>>>>> So is it the right choice to expose PCI host bridge windows as reserved

> >>>>>> regions? If yes shouldn't we make a difference between those and MSI

> >>>>>> windows in this series and do not reject any user space DMA_MAP  

> >> attempt  

> >>>>>> within PCI host bridge windows.  

> >>>>>

> >>>>> If the QEMU machine GPA collides with a reserved region today, then

> >>>>> either:

> >>>>>

> >>>>> a) The mapping through the IOMMU works and the reserved region is  

> >> wrong  

> >>>>>

> >>>>> or

> >>>>>

> >>>>> b) The mapping doesn't actually work, QEMU is at risk of data loss by

> >>>>> being told that it worked, and we're justified in changing that

> >>>>> behavior.

> >>>>>

> >>>>> Without knowing the specifics of SMMU, it doesn't particularly make

> >>>>> sense to me to mark the entire PCI hierarchy MMIO range as reserved,

> >>>>> unless perhaps the IOMMU is incapable of translating those IOVAs.  

> >>>> to me the limitation does not come from the smmu itself, which is a

> >>>> separate HW block sitting between the root complex and the interconnect.

> >>>> If ACS is not enforced by the PCIe subsystem, the transaction will never

> >>>> reach the IOMMU.  

> >>>

> >>> True. And we do have one such platform where ACS is not enforced but

> >>> reserving the regions and possibly creating holes while launching VM will

> >>> make it secure. But I do wonder how we will solve the device grouping

> >>> in such cases.

> >>>

> >>> The Seattle PCI host bridge windows case you mentioned has any pci quirk

> >>> to claim that they support ACS?  

> >> No there is none to my knowledge. I am applying Alex' not upstream ACS

> >> overwrite patch.  

> > 

> > Ok. But isn't that patch actually applicable to cases where ACS is really supported

> > by hardware but the capability is not available?  

> 

> My understanding is normally yes. If you apply the patch whereas the HW

> practically does not support ACS, then you fool the kernel pretending

> there is isolation whereas there is not. I don't know the exact

> capability of the HW on AMD Seattle and effectively I should have cared

> about it much earlier and if the HW capability were supported and not

> properly exposed we should have implemented a clean quirk for this platform.


Yes, there's a reason why the ACS override patch is not upstream, and
any downstream distribution that actually intends to support their
customers should not include it.  If we can verify with the hardware
vendor that ACS equivalent isolation exists, we should be putting
quirks into the kernel to enable it automatically.  Supporting users
using the ACS override patch is a non-goal.

>   I am just trying to see whether

> > the argument that we should allow DMA MAP requests for this(non-ACS case)

> > even if the Guest GPA conflict with reserved region holds good. The fact that may

> > be it was working before is that the Guest never actually allocated any GPA from

> > the reserved region or maybe I am missing something here.  

> 

> If my understanding is correct, in ideal world we would report the PCI

> host bridge window as reserved only in case ACS is not supported. If you

> apply the patch and overrides the ACS, then the DMA_MAP  would be

> allowed. In case the HW does not support ACS, then you could face

> situations where one EP tries to access GPA that never reaches the IOMMU

> (because it corresponds to the BAR of another downstream EP). Same can

> happen at the moment.


I still think you're overstretching the IOMMU reserved region interface
by trying to report possible ACS conflicts.  Are we going to update the
reserved list on device hotplug?  Are we going to update the list when
MMIO is enabled or disabled for each device?  What if the BARs are
reprogrammed or bridge apertures changed?  IMO, the IOMMU reserved list
should account for specific IOVA exclusions that apply to transactions
that actually reach the IOMMU, not aliasing that might occur in the
downstream topology.  Additionally, the IOMMU group composition must be
such that these sorts of aliasing issues can only occur within an IOMMU
group.  If a transaction can be affected by the MMIO programming of
another group, then the groups are drawn incorrectly for the isolation
capabilities of the hardware.  Thanks,

Alex
Eric Auger Feb. 28, 2018, 3:32 p.m. UTC | #13
Hi Shameer,

On 28/02/18 14:39, Shameerali Kolothum Thodi wrote:
> Hi Eric,

> 

>> -----Original Message-----

>> From: Auger Eric [mailto:eric.auger@redhat.com]

>> Sent: Wednesday, February 28, 2018 11:53 AM

>> To: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>;

>> Alex Williamson <alex.williamson@redhat.com>

>> Cc: pmorel@linux.vnet.ibm.com; kvm@vger.kernel.org; linux-

>> kernel@vger.kernel.org; Linuxarm <linuxarm@huawei.com>; John Garry

>> <john.garry@huawei.com>; xuwei (O) <xuwei5@huawei.com>; Robin Murphy

>> <robin.murphy@arm.com>

>> Subject: Re: [PATCH v4 4/6] vfio/type1: check dma map request is within a valid

>> iova range

>>

>> Hi Shameer,

>>

>> On 28/02/18 10:25, Shameerali Kolothum Thodi wrote:

>>>

>>>

>>>> -----Original Message-----

>>>> From: Auger Eric [mailto:eric.auger@redhat.com]

>>>> Sent: Wednesday, February 28, 2018 9:02 AM

>>>> To: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>;

>>>> Alex Williamson <alex.williamson@redhat.com>

>>>> Cc: pmorel@linux.vnet.ibm.com; kvm@vger.kernel.org; linux-

>>>> kernel@vger.kernel.org; Linuxarm <linuxarm@huawei.com>; John Garry

>>>> <john.garry@huawei.com>; xuwei (O) <xuwei5@huawei.com>; Robin

>> Murphy

>>>> <robin.murphy@arm.com>

>>>> Subject: Re: [PATCH v4 4/6] vfio/type1: check dma map request is within a

>> valid

>>>> iova range

>>>>

>>>> Hi Shameer,

>>>>

>>>> On 27/02/18 10:57, Shameerali Kolothum Thodi wrote:

>>>>>

>>>>>

>>>>>> -----Original Message-----

>>>>>> From: Auger Eric [mailto:eric.auger@redhat.com]

>>>>>> Sent: Tuesday, February 27, 2018 8:27 AM

>>>>>> To: Alex Williamson <alex.williamson@redhat.com>

>>>>>> Cc: Shameerali Kolothum Thodi

>> <shameerali.kolothum.thodi@huawei.com>;

>>>>>> pmorel@linux.vnet.ibm.com; kvm@vger.kernel.org; linux-

>>>>>> kernel@vger.kernel.org; Linuxarm <linuxarm@huawei.com>; John Garry

>>>>>> <john.garry@huawei.com>; xuwei (O) <xuwei5@huawei.com>; Robin

>>>> Murphy

>>>>>> <robin.murphy@arm.com>

>>>>>> Subject: Re: [PATCH v4 4/6] vfio/type1: check dma map request is within a

>>>> valid

>>>>>> iova range

>>>>>>

>>>>>> Hi,

>>>>>> On 27/02/18 00:13, Alex Williamson wrote:

>>>>>>> On Mon, 26 Feb 2018 23:05:43 +0100

>>>>>>> Auger Eric <eric.auger@redhat.com> wrote:

>>>>>>>

>>>>>>>> Hi Shameer,

>>>>>>>>

>>>>>>>> [Adding Robin in CC]

>>>>>>>> On 21/02/18 13:22, Shameer Kolothum wrote:

>>>>>>>>> This checks and rejects any dma map request outside valid iova

>>>>>>>>> range.

>>>>>>>>>

>>>>>>>>> Signed-off-by: Shameer Kolothum

>>>>>> <shameerali.kolothum.thodi@huawei.com>

>>>>>>>>> ---

>>>>>>>>>  drivers/vfio/vfio_iommu_type1.c | 22 ++++++++++++++++++++++

>>>>>>>>>  1 file changed, 22 insertions(+)

>>>>>>>>>

>>>>>>>>> diff --git a/drivers/vfio/vfio_iommu_type1.c

>>>>>> b/drivers/vfio/vfio_iommu_type1.c

>>>>>>>>> index a80884e..3049393 100644

>>>>>>>>> --- a/drivers/vfio/vfio_iommu_type1.c

>>>>>>>>> +++ b/drivers/vfio/vfio_iommu_type1.c

>>>>>>>>> @@ -970,6 +970,23 @@ static int vfio_pin_map_dma(struct

>> vfio_iommu

>>>>>> *iommu, struct vfio_dma *dma,

>>>>>>>>>  	return ret;

>>>>>>>>>  }

>>>>>>>>>

>>>>>>>>> +/*

>>>>>>>>> + * Check dma map request is within a valid iova range

>>>>>>>>> + */

>>>>>>>>> +static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,

>>>>>>>>> +				dma_addr_t start, dma_addr_t end)

>>>>>>>>> +{

>>>>>>>>> +	struct list_head *iova = &iommu->iova_list;

>>>>>>>>> +	struct vfio_iova *node;

>>>>>>>>> +

>>>>>>>>> +	list_for_each_entry(node, iova, list) {

>>>>>>>>> +		if ((start >= node->start) && (end <= node->end))

>>>>>>>>> +			return true;

>>>>>>>> I am now confused by the fact this change will prevent existing QEMU

>>>>>>>> from working with this series on some platforms. For instance QEMU

>> virt

>>>>>>>> machine GPA space collides with Seattle PCI host bridge windows. On

>>>> ARM

>>>>>>>> the smmu and smmuv3 drivers report the PCI host bridge windows as

>>>>>>>> reserved regions which does not seem to be the case on other

>> platforms.

>>>>>>>> The change happened in commit

>>>>>> 273df9635385b2156851c7ee49f40658d7bcb29d

>>>>>>>> (iommu/dma: Make PCI window reservation generic).

>>>>>>>>

>>>>>>>> For background, we already discussed the topic after LPC 2016. See

>>>>>>>> https://www.spinics.net/lists/kernel/msg2379607.html.

>>>>>>>>

>>>>>>>> So is it the right choice to expose PCI host bridge windows as reserved

>>>>>>>> regions? If yes shouldn't we make a difference between those and MSI

>>>>>>>> windows in this series and do not reject any user space DMA_MAP

>>>> attempt

>>>>>>>> within PCI host bridge windows.

>>>>>>>

>>>>>>> If the QEMU machine GPA collides with a reserved region today, then

>>>>>>> either:

>>>>>>>

>>>>>>> a) The mapping through the IOMMU works and the reserved region is

>>>> wrong

>>>>>>>

>>>>>>> or

>>>>>>>

>>>>>>> b) The mapping doesn't actually work, QEMU is at risk of data loss by

>>>>>>> being told that it worked, and we're justified in changing that

>>>>>>> behavior.

>>>>>>>

>>>>>>> Without knowing the specifics of SMMU, it doesn't particularly make

>>>>>>> sense to me to mark the entire PCI hierarchy MMIO range as reserved,

>>>>>>> unless perhaps the IOMMU is incapable of translating those IOVAs.

>>>>>> to me the limitation does not come from the smmu itself, which is a

>>>>>> separate HW block sitting between the root complex and the

>> interconnect.

>>>>>> If ACS is not enforced by the PCIe subsystem, the transaction will never

>>>>>> reach the IOMMU.

>>>>>

>>>>> True. And we do have one such platform where ACS is not enforced but

>>>>> reserving the regions and possibly creating holes while launching VM will

>>>>> make it secure. But I do wonder how we will solve the device grouping

>>>>> in such cases.

>>>>>

>>>>> The Seattle PCI host bridge windows case you mentioned has any pci quirk

>>>>> to claim that they support ACS?

>>>> No there is none to my knowledge. I am applying Alex' not upstream ACS

>>>> overwrite patch.

>>>

>>> Ok. But isn't that patch actually applicable to cases where ACS is really

>> supported

>>> by hardware but the capability is not available?

>>

>> My understanding is normally yes. If you apply the patch whereas the HW

>> practically does not support ACS, then you fool the kernel pretending

>> there is isolation whereas there is not. I don't know the exact

>> capability of the HW on AMD Seattle and effectively I should have cared

>> about it much earlier and if the HW capability were supported and not

>> properly exposed we should have implemented a clean quirk for this platform.

> 

> Ok. Thanks for the details.

>  

>>

>>   I am just trying to see whether

>>> the argument that we should allow DMA MAP requests for this(non-ACS case)

>>> even if the Guest GPA conflict with reserved region holds good. The fact that

>> may

>>> be it was working before is that the Guest never actually allocated any GPA

>> from

>>> the reserved region or maybe I am missing something here.

>>

>> If my understanding is correct, in ideal world we would report the PCI

>> host bridge window as reserved only in case ACS is not supported. If you

>> apply the patch and overrides the ACS, then the DMA_MAP  would be

>> allowed. In case the HW does not support ACS, then you could face

>> situations where one EP tries to access GPA that never reaches the IOMMU

>> (because it corresponds to the BAR of another downstream EP). Same can

>> happen at the moment.

> 

> Yes, this is my understanding too.

> 

> Just wondering the below changes to the iommu_dma_get_resv_regions() is

> good enough to take care this issue or not.


This looks sensible to me. The only question I have is whether this ACS
computation result can be altered later on for some reason.

Otherwise I confirm this fixes the reported reserved regions (no more
PCI host bridge windows) and the assignment failure on Cavium CN88xx, whose
assignment would otherwise be affected too.

Thanks

Eric
> 

> Thanks,

> Shameer

> 

> -- >8 --

> diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c

> index f05f3cf..b6e89d5 100644

> --- a/drivers/iommu/dma-iommu.c

> +++ b/drivers/iommu/dma-iommu.c

> @@ -34,6 +34,8 @@

>  

>  #define IOMMU_MAPPING_ERROR	0

>  

> +#define REQ_ACS_FLAGS   (PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF)

> +

>  struct iommu_dma_msi_page {

>  	struct list_head	list;

>  	dma_addr_t		iova;

> @@ -183,6 +185,9 @@ void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list)

>  	if (!dev_is_pci(dev))

>  		return;

>  

> +	if (pci_acs_path_enabled(to_pci_dev(dev), NULL, REQ_ACS_FLAGS))

> +		return;

> +

>  	bridge = pci_find_host_bridge(to_pci_dev(dev)->bus);

>  	resource_list_for_each_entry(window, &bridge->windows) {

>  		struct iommu_resv_region *region;

> ---8--

> 

> 

>>

>> Eric

>>>

>>> Thanks,

>>> Shameer

>>>

>>>> Thanks

>>>>

>>>> Eric

>>>>>

>>>>>> In the case of such overlap, shouldn't we just warn the end-user that

>>>>>> this situation is dangerous instead of forbidding the use case which

>>>>>> worked "in most cases" until now.

>>>>>

>>>>> Yes, may be something similar to the allow_unsafe_interrupts case, if

>>>>> that is acceptable.

>>>>>

>>>>> Thanks,

>>>>> Shameer

>>>>>

>>>>>>> Are we trying to prevent untranslated p2p with this reserved range?

>>>>>>> That's not necessarily a terrible idea, but it seems that doing it for

>>>>>>> that purpose would need to be a lot smarter, taking into account ACS

>>>>>>> and precisely selecting ranges within the peer address space that would

>>>>>>> be untranslated.  Perhaps only populated MMIO within non-ACS

>>>>>>> hierarchies.  Thanks,

>>>>>>

>>>>>> Indeed taking into account the ACS capability would refine the

>>>>>> situations where a risk exists.

>>>>>>

>>>>>> Thanks

>>>>>>

>>>>>> Eric

>>>>>>>

>>>>>>> Alex

>>>>>>>
Shameerali Kolothum Thodi March 2, 2018, 1:19 p.m. UTC | #14
> -----Original Message-----

> From: Alex Williamson [mailto:alex.williamson@redhat.com]

> Sent: Wednesday, February 28, 2018 3:24 PM

> To: Auger Eric <eric.auger@redhat.com>

> Cc: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>;

> pmorel@linux.vnet.ibm.com; kvm@vger.kernel.org; linux-

> kernel@vger.kernel.org; Linuxarm <linuxarm@huawei.com>; John Garry

> <john.garry@huawei.com>; xuwei (O) <xuwei5@huawei.com>; Robin Murphy

> <robin.murphy@arm.com>

> Subject: Re: [PATCH v4 4/6] vfio/type1: check dma map request is within a valid

> iova range

> 

> On Wed, 28 Feb 2018 12:53:27 +0100

> Auger Eric <eric.auger@redhat.com> wrote:

> 

> > Hi Shameer,

> >

> > On 28/02/18 10:25, Shameerali Kolothum Thodi wrote:

> > >

> > >

> > >> -----Original Message-----

> > >> From: Auger Eric [mailto:eric.auger@redhat.com]

> > >> Sent: Wednesday, February 28, 2018 9:02 AM

> > >> To: Shameerali Kolothum Thodi

> <shameerali.kolothum.thodi@huawei.com>;

> > >> Alex Williamson <alex.williamson@redhat.com>

> > >> Cc: pmorel@linux.vnet.ibm.com; kvm@vger.kernel.org; linux-

> > >> kernel@vger.kernel.org; Linuxarm <linuxarm@huawei.com>; John Garry

> > >> <john.garry@huawei.com>; xuwei (O) <xuwei5@huawei.com>; Robin

> Murphy

> > >> <robin.murphy@arm.com>

> > >> Subject: Re: [PATCH v4 4/6] vfio/type1: check dma map request is within a

> valid

> > >> iova range

> > >>

> > >> Hi Shameer,

> > >>

> > >> On 27/02/18 10:57, Shameerali Kolothum Thodi wrote:

> > >>>

> > >>>

> > >>>> -----Original Message-----

> > >>>> From: Auger Eric [mailto:eric.auger@redhat.com]

> > >>>> Sent: Tuesday, February 27, 2018 8:27 AM

> > >>>> To: Alex Williamson <alex.williamson@redhat.com>

> > >>>> Cc: Shameerali Kolothum Thodi

> <shameerali.kolothum.thodi@huawei.com>;

> > >>>> pmorel@linux.vnet.ibm.com; kvm@vger.kernel.org; linux-

> > >>>> kernel@vger.kernel.org; Linuxarm <linuxarm@huawei.com>; John

> Garry

> > >>>> <john.garry@huawei.com>; xuwei (O) <xuwei5@huawei.com>; Robin

> > >> Murphy

> > >>>> <robin.murphy@arm.com>

> > >>>> Subject: Re: [PATCH v4 4/6] vfio/type1: check dma map request is within

> a

> > >> valid

> > >>>> iova range

> > >>>>

> > >>>> Hi,

> > >>>> On 27/02/18 00:13, Alex Williamson wrote:

> > >>>>> On Mon, 26 Feb 2018 23:05:43 +0100

> > >>>>> Auger Eric <eric.auger@redhat.com> wrote:

> > >>>>>

> > >>>>>> Hi Shameer,

> > >>>>>>

> > >>>>>> [Adding Robin in CC]

> > >>>>>> On 21/02/18 13:22, Shameer Kolothum wrote:

> > >>>>>>> This checks and rejects any dma map request outside valid iova

> > >>>>>>> range.

> > >>>>>>>

> > >>>>>>> Signed-off-by: Shameer Kolothum

> > >>>> <shameerali.kolothum.thodi@huawei.com>

> > >>>>>>> ---

> > >>>>>>>  drivers/vfio/vfio_iommu_type1.c | 22 ++++++++++++++++++++++

> > >>>>>>>  1 file changed, 22 insertions(+)

> > >>>>>>>

> > >>>>>>> diff --git a/drivers/vfio/vfio_iommu_type1.c

> > >>>> b/drivers/vfio/vfio_iommu_type1.c

> > >>>>>>> index a80884e..3049393 100644

> > >>>>>>> --- a/drivers/vfio/vfio_iommu_type1.c

> > >>>>>>> +++ b/drivers/vfio/vfio_iommu_type1.c

> > >>>>>>> @@ -970,6 +970,23 @@ static int vfio_pin_map_dma(struct

> vfio_iommu

> > >>>> *iommu, struct vfio_dma *dma,

> > >>>>>>>  	return ret;

> > >>>>>>>  }

> > >>>>>>>

> > >>>>>>> +/*

> > >>>>>>> + * Check dma map request is within a valid iova range

> > >>>>>>> + */

> > >>>>>>> +static bool vfio_iommu_iova_dma_valid(struct vfio_iommu

> *iommu,

> > >>>>>>> +				dma_addr_t start, dma_addr_t end)

> > >>>>>>> +{

> > >>>>>>> +	struct list_head *iova = &iommu->iova_list;

> > >>>>>>> +	struct vfio_iova *node;

> > >>>>>>> +

> > >>>>>>> +	list_for_each_entry(node, iova, list) {

> > >>>>>>> +		if ((start >= node->start) && (end <= node->end))

> > >>>>>>> +			return true;

> > >>>>>> I am now confused by the fact this change will prevent existing QEMU

> > >>>>>> from working with this series on some platforms. For instance QEMU

> virt

> > >>>>>> machine GPA space collides with Seattle PCI host bridge windows. On

> > >> ARM

> > >>>>>> the smmu and smmuv3 drivers report the PCI host bridge windows as

> > >>>>>> reserved regions which does not seem to be the case on other

> platforms.

> > >>>>>> The change happened in commit

> > >>>> 273df9635385b2156851c7ee49f40658d7bcb29d

> > >>>>>> (iommu/dma: Make PCI window reservation generic).

> > >>>>>>

> > >>>>>> For background, we already discussed the topic after LPC 2016. See

> > >>>>>> https://www.spinics.net/lists/kernel/msg2379607.html.

> > >>>>>>

> > >>>>>> So is it the right choice to expose PCI host bridge windows as reserved

> > >>>>>> regions? If yes shouldn't we make a difference between those and

> MSI

> > >>>>>> windows in this series and do not reject any user space DMA_MAP

> > >> attempt

> > >>>>>> within PCI host bridge windows.

> > >>>>>

> > >>>>> If the QEMU machine GPA collides with a reserved region today, then

> > >>>>> either:

> > >>>>>

> > >>>>> a) The mapping through the IOMMU works and the reserved region is

> > >> wrong

> > >>>>>

> > >>>>> or

> > >>>>>

> > >>>>> b) The mapping doesn't actually work, QEMU is at risk of data loss by

> > >>>>> being told that it worked, and we're justified in changing that

> > >>>>> behavior.

> > >>>>>

> > >>>>> Without knowing the specifics of SMMU, it doesn't particularly make

> > >>>>> sense to me to mark the entire PCI hierarchy MMIO range as

> reserved,

> > >>>>> unless perhaps the IOMMU is incapable of translating those IOVAs.

> > >>>> to me the limitation does not come from the smmu itself, which is a

> > >>>> separate HW block sitting between the root complex and the

> interconnect.

> > >>>> If ACS is not enforced by the PCIe subsystem, the transaction will never

> > >>>> reach the IOMMU.

> > >>>

> > >>> True. And we do have one such platform where ACS is not enforced but

> > >>> reserving the regions and possibly creating holes while launching VM will

> > >>> make it secure. But I do wonder how we will solve the device grouping

> > >>> in such cases.

> > >>>

> > >>> The Seattle PCI host bridge windows case you mentioned has any pci

> quirk

> > >>> to claim that they support ACS?

> > >> No there is none to my knowledge. I am applying Alex' not upstream ACS

> > >> overwrite patch.

> > >

> > > Ok. But isn't that patch actually applicable to cases where ACS is really

> supported

> > > by hardware but the capability is not available?

> >

> > My understanding is normally yes. If you apply the patch whereas the HW

> > practically does not support ACS, then you fool the kernel pretending

> > there is isolation whereas there is not. I don't know the exact

> > capability of the HW on AMD Seattle and effectively I should have cared

> > about it much earlier and if the HW capability were supported and not

> > properly exposed we should have implemented a clean quirk for this

> platform.

> 

> Yes, there's a reason why the ACS override patch is not upstream, and

> any downstream distribution that actually intends to support their

> customers should not include it.  If we can verify with the hardware

> vendor that ACS equivalent isolation exists, we should be putting

> quirks into the kernel to enable it automatically.  Supporting users

> using the ACS override patch is a non-goal.

> 

> >   I am just trying to see whether

> > > the argument that we should allow DMA MAP requests for this(non-ACS

> case)

> > > even if the Guest GPA conflict with reserved region holds good. The fact

> that may

> > > be it was working before is that the Guest never actually allocated any GPA

> from

> > > the reserved region or maybe I am missing something here.

> >

> > If my understanding is correct, in ideal world we would report the PCI

> > host bridge window as reserved only in case ACS is not supported. If you

> > apply the patch and overrides the ACS, then the DMA_MAP  would be

> > allowed. In case the HW does not support ACS, then you could face

> > situations where one EP tries to access GPA that never reaches the IOMMU

> > (because it corresponds to the BAR of another downstream EP). Same can

> > happen at the moment.

> 

> I still think you're overstretching the IOMMU reserved region interface

> by trying to report possible ACS conflicts.  Are we going to update the

> reserved list on device hotplug?  Are we going to update the list when

> MMIO is enabled or disabled for each device?  What if the BARs are

> reprogrammed or bridge apertures changed?  IMO, the IOMMU reserved list

> should account for specific IOVA exclusions that apply to transactions

> that actually reach the IOMMU, not aliasing that might occur in the

> downstream topology.  Additionally, the IOMMU group composition must be

> such that these sorts of aliasing issues can only occur within an IOMMU

> group.  If a transaction can be affected by the MMIO programming of

> another group, then the groups are drawn incorrectly for the isolation

> capabilities of the hardware.  Thanks,


Agree that this is not a perfect thing to do for covering all scenarios. As Robin
pointed out, the current code is playing safe by reserving the full windows.

My suggestion would be to limit this reservation to non-ACS cases only. This will
make sure that ACS-capable ARM hardware is not broken by this series, and non-ACS
ones have a chance to run once QEMU is updated to take care of the reserved
regions (at least in some user scenarios).
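
For illustration only, a rough userspace-side sketch of what "taking care
of the reserved regions" could look like: checking a candidate GPA range
against the valid iova list before issuing VFIO_IOMMU_MAP_DMA. The range
layout below is just a stand-in for whatever VFIO_IOMMU_GET_INFO capability
this series ends up exposing, not the final uapi:

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical layout; the real one comes from the capability chain. */
struct iova_range {
	uint64_t start;
	uint64_t end;		/* inclusive */
};

/*
 * Return true if [gpa, gpa + size - 1] sits entirely inside one of the
 * valid ranges reported by the kernel, i.e. the DMA_MAP should succeed.
 */
static bool gpa_range_mappable(const struct iova_range *ranges, int nr,
			       uint64_t gpa, uint64_t size)
{
	uint64_t last;
	int i;

	if (!size)
		return false;

	last = gpa + size - 1;
	for (i = 0; i < nr; i++) {
		if (gpa >= ranges[i].start && last <= ranges[i].end)
			return true;
	}
	return false;
}

A VMM would run such a check (or carve holes in its GPA layout up front)
while building the guest memory map, rather than discovering the conflict
only when the DMA_MAP ioctl fails.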

If you all are fine with this, I can include the ACS check I mentioned earlier in
iommu_dma_get_resv_regions() and send out the revised series.

Please let me know your thoughts.

Thanks,
Shameer
Alex Williamson March 2, 2018, 4:04 p.m. UTC | #15
On Fri, 2 Mar 2018 13:19:58 +0000
Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com> wrote:

> > -----Original Message-----

> > From: Alex Williamson [mailto:alex.williamson@redhat.com]

> > Sent: Wednesday, February 28, 2018 3:24 PM

> > To: Auger Eric <eric.auger@redhat.com>

> > Cc: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>;

> > pmorel@linux.vnet.ibm.com; kvm@vger.kernel.org; linux-

> > kernel@vger.kernel.org; Linuxarm <linuxarm@huawei.com>; John Garry

> > <john.garry@huawei.com>; xuwei (O) <xuwei5@huawei.com>; Robin Murphy

> > <robin.murphy@arm.com>

> > Subject: Re: [PATCH v4 4/6] vfio/type1: check dma map request is within a valid

> > iova range

> > 

> > On Wed, 28 Feb 2018 12:53:27 +0100

> > Auger Eric <eric.auger@redhat.com> wrote:

> >   

> > > Hi Shameer,

> > >

> > > On 28/02/18 10:25, Shameerali Kolothum Thodi wrote:  

> > > >

> > > >  

> > > >> -----Original Message-----

> > > >> From: Auger Eric [mailto:eric.auger@redhat.com]

> > > >> Sent: Wednesday, February 28, 2018 9:02 AM

> > > >> To: Shameerali Kolothum Thodi  

> > <shameerali.kolothum.thodi@huawei.com>;  

> > > >> Alex Williamson <alex.williamson@redhat.com>

> > > >> Cc: pmorel@linux.vnet.ibm.com; kvm@vger.kernel.org; linux-

> > > >> kernel@vger.kernel.org; Linuxarm <linuxarm@huawei.com>; John Garry

> > > >> <john.garry@huawei.com>; xuwei (O) <xuwei5@huawei.com>; Robin  

> > Murphy  

> > > >> <robin.murphy@arm.com>

> > > >> Subject: Re: [PATCH v4 4/6] vfio/type1: check dma map request is within a  

> > valid  

> > > >> iova range

> > > >>

> > > >> Hi Shameer,

> > > >>

> > > >> On 27/02/18 10:57, Shameerali Kolothum Thodi wrote:  

> > > >>>

> > > >>>  

> > > >>>> -----Original Message-----

> > > >>>> From: Auger Eric [mailto:eric.auger@redhat.com]

> > > >>>> Sent: Tuesday, February 27, 2018 8:27 AM

> > > >>>> To: Alex Williamson <alex.williamson@redhat.com>

> > > >>>> Cc: Shameerali Kolothum Thodi  

> > <shameerali.kolothum.thodi@huawei.com>;  

> > > >>>> pmorel@linux.vnet.ibm.com; kvm@vger.kernel.org; linux-

> > > >>>> kernel@vger.kernel.org; Linuxarm <linuxarm@huawei.com>; John  

> > Garry  

> > > >>>> <john.garry@huawei.com>; xuwei (O) <xuwei5@huawei.com>; Robin  

> > > >> Murphy  

> > > >>>> <robin.murphy@arm.com>

> > > >>>> Subject: Re: [PATCH v4 4/6] vfio/type1: check dma map request is within  

> > a  

> > > >> valid  

> > > >>>> iova range

> > > >>>>

> > > >>>> Hi,

> > > >>>> On 27/02/18 00:13, Alex Williamson wrote:  

> > > >>>>> On Mon, 26 Feb 2018 23:05:43 +0100

> > > >>>>> Auger Eric <eric.auger@redhat.com> wrote:

> > > >>>>>  

> > > >>>>>> Hi Shameer,

> > > >>>>>>

> > > >>>>>> [Adding Robin in CC]

> > > >>>>>> On 21/02/18 13:22, Shameer Kolothum wrote:  

> > > >>>>>>> This checks and rejects any dma map request outside valid iova

> > > >>>>>>> range.

> > > >>>>>>>

> > > >>>>>>> Signed-off-by: Shameer Kolothum  

> > > >>>> <shameerali.kolothum.thodi@huawei.com>  

> > > >>>>>>> ---

> > > >>>>>>>  drivers/vfio/vfio_iommu_type1.c | 22 ++++++++++++++++++++++

> > > >>>>>>>  1 file changed, 22 insertions(+)

> > > >>>>>>>

> > > >>>>>>> diff --git a/drivers/vfio/vfio_iommu_type1.c  

> > > >>>> b/drivers/vfio/vfio_iommu_type1.c  

> > > >>>>>>> index a80884e..3049393 100644

> > > >>>>>>> --- a/drivers/vfio/vfio_iommu_type1.c

> > > >>>>>>> +++ b/drivers/vfio/vfio_iommu_type1.c

> > > >>>>>>> @@ -970,6 +970,23 @@ static int vfio_pin_map_dma(struct  

> > vfio_iommu  

> > > >>>> *iommu, struct vfio_dma *dma,  

> > > >>>>>>>  	return ret;

> > > >>>>>>>  }

> > > >>>>>>>

> > > >>>>>>> +/*

> > > >>>>>>> + * Check dma map request is within a valid iova range

> > > >>>>>>> + */

> > > >>>>>>> +static bool vfio_iommu_iova_dma_valid(struct vfio_iommu  

> > *iommu,  

> > > >>>>>>> +				dma_addr_t start, dma_addr_t end)

> > > >>>>>>> +{

> > > >>>>>>> +	struct list_head *iova = &iommu->iova_list;

> > > >>>>>>> +	struct vfio_iova *node;

> > > >>>>>>> +

> > > >>>>>>> +	list_for_each_entry(node, iova, list) {

> > > >>>>>>> +		if ((start >= node->start) && (end <= node->end))

> > > >>>>>>> +			return true;  

> > > >>>>>> I am now confused by the fact this change will prevent existing QEMU

> > > >>>>>> from working with this series on some platforms. For instance QEMU  

> > virt  

> > > >>>>>> machine GPA space collides with Seattle PCI host bridge windows. On  

> > > >> ARM  

> > > >>>>>> the smmu and smmuv3 drivers report the PCI host bridge windows as

> > > >>>>>> reserved regions which does not seem to be the case on other  

> > platforms.  

> > > >>>>>> The change happened in commit  

> > > >>>> 273df9635385b2156851c7ee49f40658d7bcb29d  

> > > >>>>>> (iommu/dma: Make PCI window reservation generic).

> > > >>>>>>

> > > >>>>>> For background, we already discussed the topic after LPC 2016. See

> > > >>>>>> https://www.spinics.net/lists/kernel/msg2379607.html.

> > > >>>>>>

> > > >>>>>> So is it the right choice to expose PCI host bridge windows as reserved

> > > >>>>>> regions? If yes shouldn't we make a difference between those and  

> > MSI  

> > > >>>>>> windows in this series and do not reject any user space DMA_MAP  

> > > >> attempt  

> > > >>>>>> within PCI host bridge windows.  

> > > >>>>>

> > > >>>>> If the QEMU machine GPA collides with a reserved region today, then

> > > >>>>> either:

> > > >>>>>

> > > >>>>> a) The mapping through the IOMMU works and the reserved region is  

> > > >> wrong  

> > > >>>>>

> > > >>>>> or

> > > >>>>>

> > > >>>>> b) The mapping doesn't actually work, QEMU is at risk of data loss by

> > > >>>>> being told that it worked, and we're justified in changing that

> > > >>>>> behavior.

> > > >>>>>

> > > >>>>> Without knowing the specifics of SMMU, it doesn't particularly make

> > > >>>>> sense to me to mark the entire PCI hierarchy MMIO range as  

> > reserved,  

> > > >>>>> unless perhaps the IOMMU is incapable of translating those IOVAs.  

> > > >>>> to me the limitation does not come from the smmu itself, which is a

> > > >>>> separate HW block sitting between the root complex and the  

> > interconnect.  

> > > >>>> If ACS is not enforced by the PCIe subsystem, the transaction will never

> > > >>>> reach the IOMMU.  

> > > >>>

> > > >>> True. And we do have one such platform where ACS is not enforced but

> > > >>> reserving the regions and possibly creating holes while launching VM will

> > > >>> make it secure. But I do wonder how we will solve the device grouping

> > > >>> in such cases.

> > > >>>

> > > >>> The Seattle PCI host bridge windows case you mentioned has any pci  

> > quirk  

> > > >>> to claim that they support ACS?  

> > > >> No there is none to my knowledge. I am applying Alex' not upstream ACS

> > > >> overwrite patch.  

> > > >

> > > > Ok. But isn't that patch actually applicable to cases where ACS is really  

> > supported  

> > > > by hardware but the capability is not available?  

> > >

> > > My understanding is normally yes. If you apply the patch whereas the HW

> > > practically does not support ACS, then you fool the kernel pretending

> > > there is isolation whereas there is not. I don't know the exact

> > > capability of the HW on AMD Seattle and effectively I should have cared

> > > about it much earlier and if the HW capability were supported and not

> > > properly exposed we should have implemented a clean quirk for this  

> > platform.

> > 

> > Yes, there's a reason why the ACS override patch is not upstream, and

> > any downstream distribution that actually intends to support their

> > customers should not include it.  If we can verify with the hardware

> > vendor that ACS equivalent isolation exists, we should be putting

> > quirks into the kernel to enable it automatically.  Supporting users

> > using the ACS override patch is a non-goal.

> >   

> > >   I am just trying to see whether  

> > > > the argument that we should allow DMA MAP requests for this(non-ACS  

> > case)  

> > > > even if the Guest GPA conflict with reserved region holds good. The fact  

> > that may  

> > > > be it was working before is that the Guest never actually allocated any GPA  

> > from  

> > > > the reserved region or maybe I am missing something here.  

> > >

> > > If my understanding is correct, in ideal world we would report the PCI

> > > host bridge window as reserved only in case ACS is not supported. If you

> > > apply the patch and overrides the ACS, then the DMA_MAP  would be

> > > allowed. In case the HW does not support ACS, then you could face

> > > situations where one EP tries to access GPA that never reaches the IOMMU

> > > (because it corresponds to the BAR of another downstream EP). Same can

> > > happen at the moment.  

> > 

> > I still think you're overstretching the IOMMU reserved region interface

> > by trying to report possible ACS conflicts.  Are we going to update the

> > reserved list on device hotplug?  Are we going to update the list when

> > MMIO is enabled or disabled for each device?  What if the BARs are

> > reprogrammed or bridge apertures changed?  IMO, the IOMMU reserved list

> > should account for specific IOVA exclusions that apply to transactions

> > that actually reach the IOMMU, not aliasing that might occur in the

> > downstream topology.  Additionally, the IOMMU group composition must be

> > such that these sorts of aliasing issues can only occur within an IOMMU

> > group.  If a transaction can be affected by the MMIO programming of

> > another group, then the groups are drawn incorrectly for the isolation

> > capabilities of the hardware.  Thanks,  

> 

> Agree that this is not a perfect thing to do covering all scenarios. As Robin

> pointed out, the current code is playing safe by reserving the full windows.

> 

> My suggestion will be to limit this reservation to non-ACS cases only.  This will

> make sure that ACS ARM hardware is not broken by this series and nos-ACS

> ones has a chance to run once Qemu is updated to take care of the reserved

> regions (at least in some user scenarios).

> 

> If you all are fine with this, I can include the ACS check I mentioned earlier in

> iommu_dma_get_resv_regions() and sent out the revised  series.

> 

> Please let me know your thoughts.


IMO, the IOMMU should be concerned with ACS as far as isolation is
concerned and reporting reserved ranges that are imposed at the IOMMU
and leave any aliasing or routing issues in the downstream topology to
other layers, or perhaps to the user.  Unfortunately, enforcing the
iova list in vfio is gated by some movement here since we can't break
existing users.  Thanks,

Alex
Robin Murphy March 2, 2018, 5:16 p.m. UTC | #16
On 02/03/18 16:04, Alex Williamson wrote:
[...]
>>> I still think you're overstretching the IOMMU reserved region interface

>>> by trying to report possible ACS conflicts.  Are we going to update the

>>> reserved list on device hotplug?  Are we going to update the list when

>>> MMIO is enabled or disabled for each device?  What if the BARs are

>>> reprogrammed or bridge apertures changed?  IMO, the IOMMU reserved list

>>> should account for specific IOVA exclusions that apply to transactions

>>> that actually reach the IOMMU, not aliasing that might occur in the

>>> downstream topology.  Additionally, the IOMMU group composition must be

>>> such that these sorts of aliasing issues can only occur within an IOMMU

>>> group.  If a transaction can be affected by the MMIO programming of

>>> another group, then the groups are drawn incorrectly for the isolation

>>> capabilities of the hardware.  Thanks,

>>

>> Agree that this is not a perfect thing to do covering all scenarios. As Robin

>> pointed out, the current code is playing safe by reserving the full windows.

>>

>> My suggestion will be to limit this reservation to non-ACS cases only.  This will

>> make sure that ACS ARM hardware is not broken by this series and nos-ACS

>> ones has a chance to run once Qemu is updated to take care of the reserved

>> regions (at least in some user scenarios).

>>

>> If you all are fine with this, I can include the ACS check I mentioned earlier in

>> iommu_dma_get_resv_regions() and sent out the revised  series.

>>

>> Please let me know your thoughts.

> 

> IMO, the IOMMU should be concerned with ACS as far as isolation is

> concerned and reporting reserved ranges that are imposed at the IOMMU

> and leave any aliasing or routing issues in the downstream topology to

> other layers, or perhaps to the user.  Unfortunately, enforcing the

> iova list in vfio is gated by some movement here since we can't break

> existing users.  Thanks,


FWIW, given the discussion we've had here I wouldn't object to pushing 
the PCI window reservation back into the DMA-specific path, such that it 
doesn't get exposed via the general IOMMU API interface. We *do* want to 
do it there where we are in total control of our own address space and 
it just avoids a whole class of problems (even with ACS I'm not sure the 
root complex can be guaranteed to send a "bad" IOVA out to the SMMU 
instead of just terminating it for looking like a peer-to-peer attempt).

I do agree that it's not scalable for the IOMMU layer to attempt to 
detect and describe upstream PCI limitations to userspace by itself - 
they are "reserved regions" rather than "may or may not work regions" 
after all. If there is a genuine integration issue between an IOMMU and 
an upstream PCI RC which restricts usable addresses on a given system, 
that probably needs to be explicitly communicated from firmware to the 
IOMMU driver anyway, at which point that driver can report the relevant 
region(s) directly from its own callback.

I suppose there's an in-between option of keeping generic window 
reservations but giving them a new "only reserve this if you're being 
super-cautious or don't know better" region type which we hide from 
userspace and ignore in VFIO, but maybe that leaves the lines a bit too 
blurred still.

Robin.
Shameerali Kolothum Thodi March 5, 2018, 11:44 a.m. UTC | #17
Hi Robin,

> -----Original Message-----

> From: Robin Murphy [mailto:robin.murphy@arm.com]

> Sent: Friday, March 02, 2018 5:17 PM

> To: Alex Williamson <alex.williamson@redhat.com>; Shameerali Kolothum

> Thodi <shameerali.kolothum.thodi@huawei.com>

> Cc: Auger Eric <eric.auger@redhat.com>; pmorel@linux.vnet.ibm.com;

> kvm@vger.kernel.org; linux-kernel@vger.kernel.org; Linuxarm

> <linuxarm@huawei.com>; John Garry <john.garry@huawei.com>; xuwei (O)

> <xuwei5@huawei.com>

> Subject: Re: [PATCH v4 4/6] vfio/type1: check dma map request is within a valid

> iova range

> 

> On 02/03/18 16:04, Alex Williamson wrote:

> [...]

> >>> I still think you're overstretching the IOMMU reserved region interface

> >>> by trying to report possible ACS conflicts.  Are we going to update the

> >>> reserved list on device hotplug?  Are we going to update the list when

> >>> MMIO is enabled or disabled for each device?  What if the BARs are

> >>> reprogrammed or bridge apertures changed?  IMO, the IOMMU reserved

> list

> >>> should account for specific IOVA exclusions that apply to transactions

> >>> that actually reach the IOMMU, not aliasing that might occur in the

> >>> downstream topology.  Additionally, the IOMMU group composition must

> be

> >>> such that these sorts of aliasing issues can only occur within an IOMMU

> >>> group.  If a transaction can be affected by the MMIO programming of

> >>> another group, then the groups are drawn incorrectly for the isolation

> >>> capabilities of the hardware.  Thanks,

> >>

> >> Agree that this is not a perfect thing to do covering all scenarios. As Robin

> >> pointed out, the current code is playing safe by reserving the full windows.

> >>

> >> My suggestion will be to limit this reservation to non-ACS cases only.  This

> will

> >> make sure that ACS ARM hardware is not broken by this series and nos-ACS

> >> ones has a chance to run once Qemu is updated to take care of the reserved

> >> regions (at least in some user scenarios).

> >>

> >> If you all are fine with this, I can include the ACS check I mentioned earlier

> in

> >> iommu_dma_get_resv_regions() and sent out the revised  series.

> >>

> >> Please let me know your thoughts.

> >

> > IMO, the IOMMU should be concerned with ACS as far as isolation is

> > concerned and reporting reserved ranges that are imposed at the IOMMU

> > and leave any aliasing or routing issues in the downstream topology to

> > other layers, or perhaps to the user.  Unfortunately, enforcing the

> > iova list in vfio is gated by some movement here since we can't break

> > existing users.  Thanks,

> 

> FWIW, given the discussion we've had here I wouldn't object to pushing

> the PCI window reservation back into the DMA-specific path, such that it

> doesn't get exposed via the general IOMMU API interface. We *do* want to

> do it there where we are in total control of our own address space and

> it just avoids a whole class of problems (even with ACS I'm not sure the

> root complex can be guaranteed to send a "bad" IOVA out to the SMMU

> instead of just terminating it for looking like a peer-to-peer attempt).

> 

> I do agree that it's not scalable for the IOMMU layer to attempt to

> detect and describe upstream PCI limitations to userspace by itself -

> they are "reserved regions" rather than "may or may not work regions"

> after all. If there is a genuine integration issue between an IOMMU and

> an upstream PCI RC which restricts usable addresses on a given system,

> that probably needs to be explicitly communicated from firmware to the

> IOMMU driver anyway, at which point that driver can report the relevant

> region(s) directly from its own callback.

> 

> I suppose there's an in-between option of keeping generic window

> reservations but giving them a new "only reserve this if you're being

> super-cautious or don't know better" region type which we hide from

> userspace and ignore in VFIO, but maybe that leaves the lines a but too

> blurred still.


Thanks for your reply and details. I have made an attempt to move the PCI
window reservation back into the DMA path. Could you please take a look
at the below changes and let me know.
(This is on top of the HW MSI reservation patches which are now part of linux-next.)

Thanks,
Shameer

-->8--
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index f05f3cf..ddcbbdb 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -167,40 +167,16 @@ EXPORT_SYMBOL(iommu_put_dma_cookie);
  * @list: Reserved region list from iommu_get_resv_regions()
  *
  * IOMMU drivers can use this to implement their .get_resv_regions callback
- * for general non-IOMMU-specific reservations. Currently, this covers host
- * bridge windows for PCI devices and GICv3 ITS region reservation on ACPI
- * based ARM platforms that may require HW MSI reservation.
+ * for general non-IOMMU-specific reservations. Currently, this covers GICv3
+ * ITS region reservation on ACPI based ARM platforms that may require HW MSI
+ * reservation.
  */
 void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list)
 {
-	struct pci_host_bridge *bridge;
-	struct resource_entry *window;
-
-	if (!is_of_node(dev->iommu_fwspec->iommu_fwnode) &&
-		iort_iommu_msi_get_resv_regions(dev, list) < 0)
-		return;
-
-	if (!dev_is_pci(dev))
-		return;
-
-	bridge = pci_find_host_bridge(to_pci_dev(dev)->bus);
-	resource_list_for_each_entry(window, &bridge->windows) {
-		struct iommu_resv_region *region;
-		phys_addr_t start;
-		size_t length;
-
-		if (resource_type(window->res) != IORESOURCE_MEM)
-			continue;
 
-		start = window->res->start - window->offset;
-		length = window->res->end - window->res->start + 1;
-		region = iommu_alloc_resv_region(start, length, 0,
-				IOMMU_RESV_RESERVED);
-		if (!region)
-			return;
+	if (!is_of_node(dev->iommu_fwspec->iommu_fwnode))
+		iort_iommu_msi_get_resv_regions(dev, list);
 
-		list_add_tail(&region->list, list);
-	}
 }
 EXPORT_SYMBOL(iommu_dma_get_resv_regions);
 
@@ -229,6 +205,23 @@ static int cookie_init_hw_msi_region(struct iommu_dma_cookie *cookie,
 	return 0;
 }
 
+static void iova_reserve_pci_windows(struct pci_dev *dev,
+		struct iova_domain *iovad)
+{
+	struct pci_host_bridge *bridge = pci_find_host_bridge(dev->bus);
+	struct resource_entry *window;
+	unsigned long lo, hi;
+
+	resource_list_for_each_entry(window, &bridge->windows) {
+		if (resource_type(window->res) != IORESOURCE_MEM)
+			continue;
+
+		lo = iova_pfn(iovad, window->res->start - window->offset);
+		hi = iova_pfn(iovad, window->res->end - window->offset);
+		reserve_iova(iovad, lo, hi);
+	}
+}
+
 static int iova_reserve_iommu_regions(struct device *dev,
 		struct iommu_domain *domain)
 {
@@ -238,6 +231,9 @@ static int iova_reserve_iommu_regions(struct device *dev,
 	LIST_HEAD(resv_regions);
 	int ret = 0;
 
+	if (dev_is_pci(dev))
+		iova_reserve_pci_windows(to_pci_dev(dev), iovad);
+
 	iommu_get_resv_regions(dev, &resv_regions);
 	list_for_each_entry(region, &resv_regions, list) {
 		unsigned long lo, hi;
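
For a concrete feel of what the new iova_reserve_pci_windows() hunk does
(assuming a 4K IOVA granule, an already-initialised iovad and an
arbitrarily chosen example window of [0x80000000, 0x8fffffff] with zero
offset):

	lo = iova_pfn(iovad, 0x80000000);	/* pfn 0x80000 */
	hi = iova_pfn(iovad, 0x8fffffff);	/* pfn 0x8ffff */
	reserve_iova(iovad, lo, hi);

i.e. the DMA API's IOVA allocator simply never hands out addresses that
overlap the window, while the window no longer shows up in
iommu_get_resv_regions(), and hence not in the VFIO valid-iova list either.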
Shameerali Kolothum Thodi March 8, 2018, 9:35 a.m. UTC | #18
> -----Original Message-----

> From: Shameerali Kolothum Thodi

> Sent: Monday, March 05, 2018 11:44 AM

> To: 'Robin Murphy' <robin.murphy@arm.com>; Alex Williamson

> <alex.williamson@redhat.com>

> Cc: Auger Eric <eric.auger@redhat.com>; pmorel@linux.vnet.ibm.com;

> kvm@vger.kernel.org; linux-kernel@vger.kernel.org; Linuxarm

> <linuxarm@huawei.com>; John Garry <john.garry@huawei.com>; xuwei (O)

> <xuwei5@huawei.com>

> Subject: RE: [PATCH v4 4/6] vfio/type1: check dma map request is within a valid

> iova range


 [...]

> > >>> I still think you're overstretching the IOMMU reserved region interface

> > >>> by trying to report possible ACS conflicts.  Are we going to update the

> > >>> reserved list on device hotplug?  Are we going to update the list when

> > >>> MMIO is enabled or disabled for each device?  What if the BARs are

> > >>> reprogrammed or bridge apertures changed?  IMO, the IOMMU

> reserved

> > list

> > >>> should account for specific IOVA exclusions that apply to transactions

> > >>> that actually reach the IOMMU, not aliasing that might occur in the

> > >>> downstream topology.  Additionally, the IOMMU group composition must

> > be

> > >>> such that these sorts of aliasing issues can only occur within an IOMMU

> > >>> group.  If a transaction can be affected by the MMIO programming of

> > >>> another group, then the groups are drawn incorrectly for the isolation

> > >>> capabilities of the hardware.  Thanks,

> > >>

> > >> Agree that this is not a perfect thing to do covering all scenarios. As Robin

> > >> pointed out, the current code is playing safe by reserving the full windows.

> > >>

> > >> My suggestion will be to limit this reservation to non-ACS cases only.  This

> > will

> > >> make sure that ACS ARM hardware is not broken by this series and nos-

> ACS

> > >> ones has a chance to run once Qemu is updated to take care of the

> reserved

> > >> regions (at least in some user scenarios).

> > >>

> > >> If you all are fine with this, I can include the ACS check I mentioned earlier

> > in

> > >> iommu_dma_get_resv_regions() and sent out the revised  series.

> > >>

> > >> Please let me know your thoughts.

> > >

> > > IMO, the IOMMU should be concerned with ACS as far as isolation is

> > > concerned and reporting reserved ranges that are imposed at the IOMMU

> > > and leave any aliasing or routing issues in the downstream topology to

> > > other layers, or perhaps to the user.  Unfortunately, enforcing the

> > > iova list in vfio is gated by some movement here since we can't break

> > > existing users.  Thanks,

> >

> > FWIW, given the discussion we've had here I wouldn't object to pushing

> > the PCI window reservation back into the DMA-specific path, such that it

> > doesn't get exposed via the general IOMMU API interface. We *do* want to

> > do it there where we are in total control of our own address space and

> > it just avoids a whole class of problems (even with ACS I'm not sure the

> > root complex can be guaranteed to send a "bad" IOVA out to the SMMU

> > instead of just terminating it for looking like a peer-to-peer attempt).

> >

> > I do agree that it's not scalable for the IOMMU layer to attempt to

> > detect and describe upstream PCI limitations to userspace by itself -

> > they are "reserved regions" rather than "may or may not work regions"

> > after all. If there is a genuine integration issue between an IOMMU and

> > an upstream PCI RC which restricts usable addresses on a given system,

> > that probably needs to be explicitly communicated from firmware to the

> > IOMMU driver anyway, at which point that driver can report the relevant

> > region(s) directly from its own callback.

> >

> > I suppose there's an in-between option of keeping generic window

> > reservations but giving them a new "only reserve this if you're being

> > super-cautious or don't know better" region type which we hide from

> > userspace and ignore in VFIO, but maybe that leaves the lines a but too

> > blurred still.

> 

> Thanks for your reply and details. I have made an attempt to revert the PCI

> window reservation back  into the DMA path. Could you please take a look

> at the below changes and let me know.

> (This is on top of HW MSI reserve patches which is now part of linux-next)


Hi Robin,

Please take a look at the below changes if you get some time and let me know.
I am not sure whether this is what you had in mind.

Thanks,
Shameer

> 

> -->8--

> diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c

> index f05f3cf..ddcbbdb 100644

> --- a/drivers/iommu/dma-iommu.c

> +++ b/drivers/iommu/dma-iommu.c

> @@ -167,40 +167,16 @@ EXPORT_SYMBOL(iommu_put_dma_cookie);

>   * @list: Reserved region list from iommu_get_resv_regions()

>   *

>   * IOMMU drivers can use this to implement their .get_resv_regions callback

> - * for general non-IOMMU-specific reservations. Currently, this covers host

> - * bridge windows for PCI devices and GICv3 ITS region reservation on ACPI

> - * based ARM platforms that may require HW MSI reservation.

> + * for general non-IOMMU-specific reservations. Currently, this covers GICv3

> + * ITS region reservation on ACPI based ARM platforms that may require HW

> MSI

> + * reservation.

>   */

>  void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list)

>  {

> -	struct pci_host_bridge *bridge;

> -	struct resource_entry *window;

> -

> -	if (!is_of_node(dev->iommu_fwspec->iommu_fwnode) &&

> -		iort_iommu_msi_get_resv_regions(dev, list) < 0)

> -		return;

> -

> -	if (!dev_is_pci(dev))

> -		return;

> -

> -	bridge = pci_find_host_bridge(to_pci_dev(dev)->bus);

> -	resource_list_for_each_entry(window, &bridge->windows) {

> -		struct iommu_resv_region *region;

> -		phys_addr_t start;

> -		size_t length;

> -

> -		if (resource_type(window->res) != IORESOURCE_MEM)

> -			continue;

> 

> -		start = window->res->start - window->offset;

> -		length = window->res->end - window->res->start + 1;

> -		region = iommu_alloc_resv_region(start, length, 0,

> -				IOMMU_RESV_RESERVED);

> -		if (!region)

> -			return;

> +	if (!is_of_node(dev->iommu_fwspec->iommu_fwnode))

> +		iort_iommu_msi_get_resv_regions(dev, list);

> 

> -		list_add_tail(&region->list, list);

> -	}

>  }

>  EXPORT_SYMBOL(iommu_dma_get_resv_regions);

> 

> @@ -229,6 +205,23 @@ static int cookie_init_hw_msi_region(struct

> iommu_dma_cookie *cookie,

>  	return 0;

>  }

> 

> +static void iova_reserve_pci_windows(struct pci_dev *dev,

> +		struct iova_domain *iovad)

> +{

> +	struct pci_host_bridge *bridge = pci_find_host_bridge(dev->bus);

> +	struct resource_entry *window;

> +	unsigned long lo, hi;

> +

> +	resource_list_for_each_entry(window, &bridge->windows) {

> +		if (resource_type(window->res) != IORESOURCE_MEM)

> +			continue;

> +

> +		lo = iova_pfn(iovad, window->res->start - window->offset);

> +		hi = iova_pfn(iovad, window->res->end - window->offset);

> +		reserve_iova(iovad, lo, hi);

> +	}

> +}

> +

>  static int iova_reserve_iommu_regions(struct device *dev,

>  		struct iommu_domain *domain)

>  {

> @@ -238,6 +231,9 @@ static int iova_reserve_iommu_regions(struct device

> *dev,

>  	LIST_HEAD(resv_regions);

>  	int ret = 0;

> 

> +	if (dev_is_pci(dev))

> +		iova_reserve_pci_windows(to_pci_dev(dev), iovad);

> +

>  	iommu_get_resv_regions(dev, &resv_regions);

>  	list_for_each_entry(region, &resv_regions, list) {

>  		unsigned long lo, hi;

> 

>
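
To make the per-driver alternative Robin describes above more concrete
(firmware communicates an integration-specific restriction, and the IOMMU
driver reports it from its own callback), a minimal sketch of such a
.get_resv_regions implementation might look like the following.  The foo_*
name and the window base/size are purely hypothetical; iommu_alloc_resv_region()
and IOMMU_RESV_RESERVED are the same helpers already used in the diffs above.

static void foo_iommu_get_resv_regions(struct device *dev,
				       struct list_head *head)
{
	struct iommu_resv_region *region;

	/*
	 * Hypothetical firmware-described window that the upstream RC
	 * cannot forward to the IOMMU; reporting it as reserved lets
	 * users such as VFIO exclude it from the valid iova list.
	 */
	region = iommu_alloc_resv_region(0x80000000, SZ_1G, 0,
					 IOMMU_RESV_RESERVED);
	if (!region)
		return;

	list_add_tail(&region->list, head);
}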
Robin Murphy March 14, 2018, 6:12 p.m. UTC | #19
Hi Shameer,

On 05/03/18 11:44, Shameerali Kolothum Thodi wrote:
> Hi Robin,

> 

>> -----Original Message-----

>> From: Robin Murphy [mailto:robin.murphy@arm.com]

>> Sent: Friday, March 02, 2018 5:17 PM

>> To: Alex Williamson <alex.williamson@redhat.com>; Shameerali Kolothum

>> Thodi <shameerali.kolothum.thodi@huawei.com>

>> Cc: Auger Eric <eric.auger@redhat.com>; pmorel@linux.vnet.ibm.com;

>> kvm@vger.kernel.org; linux-kernel@vger.kernel.org; Linuxarm

>> <linuxarm@huawei.com>; John Garry <john.garry@huawei.com>; xuwei (O)

>> <xuwei5@huawei.com>

>> Subject: Re: [PATCH v4 4/6] vfio/type1: check dma map request is within a valid

>> iova range

>>

>> On 02/03/18 16:04, Alex Williamson wrote:

>> [...]

>>>>> I still think you're overstretching the IOMMU reserved region interface

>>>>> by trying to report possible ACS conflicts.  Are we going to update the

>>>>> reserved list on device hotplug?  Are we going to update the list when

>>>>> MMIO is enabled or disabled for each device?  What if the BARs are

>>>>> reprogrammed or bridge apertures changed?  IMO, the IOMMU reserved

>> list

>>>>> should account for specific IOVA exclusions that apply to transactions

>>>>> that actually reach the IOMMU, not aliasing that might occur in the

>>>>> downstream topology.  Additionally, the IOMMU group composition must

>> be

>>>>> such that these sorts of aliasing issues can only occur within an IOMMU

>>>>> group.  If a transaction can be affected by the MMIO programming of

>>>>> another group, then the groups are drawn incorrectly for the isolation

>>>>> capabilities of the hardware.  Thanks,

>>>>

>>>> Agree that this is not a perfect thing to do covering all scenarios. As Robin

>>>> pointed out, the current code is playing safe by reserving the full windows.

>>>>

>>>> My suggestion will be to limit this reservation to non-ACS cases only.  This

>> will

>>>> make sure that ACS ARM hardware is not broken by this series and non-ACS

>>>> ones have a chance to run once QEMU is updated to take care of the reserved

>>>> regions (at least in some user scenarios).

>>>>

>>>> If you all are fine with this, I can include the ACS check I mentioned earlier

>> in

>>>> iommu_dma_get_resv_regions() and sent out the revised  series.

>>>>

>>>> Please let me know your thoughts.

>>>

>>> IMO, the IOMMU should be concerned with ACS as far as isolation is

>>> concerned and reporting reserved ranges that are imposed at the IOMMU

>>> and leave any aliasing or routing issues in the downstream topology to

>>> other layers, or perhaps to the user.  Unfortunately, enforcing the

>>> iova list in vfio is gated by some movement here since we can't break

>>> existing users.  Thanks,

>>

>> FWIW, given the discussion we've had here I wouldn't object to pushing

>> the PCI window reservation back into the DMA-specific path, such that it

>> doesn't get exposed via the general IOMMU API interface. We *do* want to

>> do it there where we are in total control of our own address space and

>> it just avoids a whole class of problems (even with ACS I'm not sure the

>> root complex can be guaranteed to send a "bad" IOVA out to the SMMU

>> instead of just terminating it for looking like a peer-to-peer attempt).

>>

>> I do agree that it's not scalable for the IOMMU layer to attempt to

>> detect and describe upstream PCI limitations to userspace by itself -

>> they are "reserved regions" rather than "may or may not work regions"

>> after all. If there is a genuine integration issue between an IOMMU and

>> an upstream PCI RC which restricts usable addresses on a given system,

>> that probably needs to be explicitly communicated from firmware to the

>> IOMMU driver anyway, at which point that driver can report the relevant

>> region(s) directly from its own callback.

>>

>> I suppose there's an in-between option of keeping generic window

>> reservations but giving them a new "only reserve this if you're being

>> super-cautious or don't know better" region type which we hide from

>> userspace and ignore in VFIO, but maybe that leaves the lines a bit too

>> blurred still.

> 

> Thanks for your reply and details. I have made an attempt to revert the PCI

> window reservation back  into the DMA path. Could you please take a look

> at the below changes and let me know.

> (This is on top of HW MSI reserve patches which is now part of linux-next)


Thanks for putting this together - seeing what's left after the patch 
makes me feel half-tempted to effectively revert 273df9635385 altogether 
and just call iort_iommu_msi_get_resv_regions() directly from SMMUv3, 
but there are other things like dma-ranges constraints which we may also 
want to handle generically somewhere so it's probably worth keeping 
iommu_dma_get_resv_regions() around as a common hook for now. In future 
it might ultimately make sense to move it out to something like 
iommu_get_fw_resv_regions(), but that can be a discussion for another 
day; right now it seems this ought to be enough to resolve everyone's 
immediate concerns.

Cheers,
Robin.
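
For concreteness, the direct-call alternative Robin mentions above could look
roughly like the sketch below in the SMMUv3 driver's .get_resv_regions
callback.  The existing software MSI reservation is elided and the surrounding
driver code is abbreviated, so treat this as an assumption-laden outline rather
than the actual change.

static void arm_smmu_get_resv_regions(struct device *dev,
				      struct list_head *head)
{
	/* existing IOMMU_RESV_SW_MSI reservation elided */

	/*
	 * Call the IORT helper directly rather than going through
	 * iommu_dma_get_resv_regions(); same check as in the quoted diff.
	 */
	if (!is_of_node(dev->iommu_fwspec->iommu_fwnode))
		iort_iommu_msi_get_resv_regions(dev, head);
}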

> 

> Thanks,

> Shameer

> 

> -->8--

> diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c

> index f05f3cf..ddcbbdb 100644

> --- a/drivers/iommu/dma-iommu.c

> +++ b/drivers/iommu/dma-iommu.c

> @@ -167,40 +167,16 @@ EXPORT_SYMBOL(iommu_put_dma_cookie);

>    * @list: Reserved region list from iommu_get_resv_regions()

>    *

>    * IOMMU drivers can use this to implement their .get_resv_regions callback

> - * for general non-IOMMU-specific reservations. Currently, this covers host

> - * bridge windows for PCI devices and GICv3 ITS region reservation on ACPI

> - * based ARM platforms that may require HW MSI reservation.

> + * for general non-IOMMU-specific reservations. Currently, this covers GICv3

> + * ITS region reservation on ACPI based ARM platforms that may require HW MSI

> + * reservation.

>    */

>   void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list)

>   {

> -	struct pci_host_bridge *bridge;

> -	struct resource_entry *window;

> -

> -	if (!is_of_node(dev->iommu_fwspec->iommu_fwnode) &&

> -		iort_iommu_msi_get_resv_regions(dev, list) < 0)

> -		return;

> -

> -	if (!dev_is_pci(dev))

> -		return;

> -

> -	bridge = pci_find_host_bridge(to_pci_dev(dev)->bus);

> -	resource_list_for_each_entry(window, &bridge->windows) {

> -		struct iommu_resv_region *region;

> -		phys_addr_t start;

> -		size_t length;

> -

> -		if (resource_type(window->res) != IORESOURCE_MEM)

> -			continue;

>   

> -		start = window->res->start - window->offset;

> -		length = window->res->end - window->res->start + 1;

> -		region = iommu_alloc_resv_region(start, length, 0,

> -				IOMMU_RESV_RESERVED);

> -		if (!region)

> -			return;

> +	if (!is_of_node(dev->iommu_fwspec->iommu_fwnode))

> +		iort_iommu_msi_get_resv_regions(dev, list);

>   

> -		list_add_tail(&region->list, list);

> -	}

>   }

>   EXPORT_SYMBOL(iommu_dma_get_resv_regions);

>   

> @@ -229,6 +205,23 @@ static int cookie_init_hw_msi_region(struct iommu_dma_cookie *cookie,

>   	return 0;

>   }

>   

> +static void iova_reserve_pci_windows(struct pci_dev *dev,

> +		struct iova_domain *iovad)

> +{

> +	struct pci_host_bridge *bridge = pci_find_host_bridge(dev->bus);

> +	struct resource_entry *window;

> +	unsigned long lo, hi;

> +

> +	resource_list_for_each_entry(window, &bridge->windows) {

> +		if (resource_type(window->res) != IORESOURCE_MEM)

> +			continue;

> +

> +		lo = iova_pfn(iovad, window->res->start - window->offset);

> +		hi = iova_pfn(iovad, window->res->end - window->offset);

> +		reserve_iova(iovad, lo, hi);

> +	}

> +}

> +

>   static int iova_reserve_iommu_regions(struct device *dev,

>   		struct iommu_domain *domain)

>   {

> @@ -238,6 +231,9 @@ static int iova_reserve_iommu_regions(struct device *dev,

>   	LIST_HEAD(resv_regions);

>   	int ret = 0;

>   

> +	if (dev_is_pci(dev))

> +		iova_reserve_pci_windows(to_pci_dev(dev), iovad);

> +

>   	iommu_get_resv_regions(dev, &resv_regions);

>   	list_for_each_entry(region, &resv_regions, list) {

>   		unsigned long lo, hi;

> 

> 

>
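
In practice, the effect of the check added in the patch below is that a
VFIO_IOMMU_MAP_DMA request whose iova range is not fully contained in any
entry of the valid iova list fails with -EINVAL.  A minimal, hypothetical
userspace sketch (container fd already set up, buffer and addresses
illustrative only):

#include <errno.h>
#include <stdio.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/*
 * Attempt one DMA mapping on an already configured VFIO type1 container.
 * With this patch applied, -EINVAL is also returned when the requested
 * iova range falls outside every valid iova range.
 */
static int try_map(int container_fd, void *buf, uint64_t iova, uint64_t size)
{
	struct vfio_iommu_type1_dma_map map = {
		.argsz = sizeof(map),
		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
		.vaddr = (uintptr_t)buf,	/* page-aligned user buffer */
		.iova  = iova,
		.size  = size,
	};

	if (ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &map) == 0)
		return 0;

	if (errno == EINVAL)
		fprintf(stderr, "map rejected (e.g. iova 0x%llx outside valid ranges)\n",
			(unsigned long long)iova);
	return -errno;
}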
diff mbox series

Patch

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index a80884e..3049393 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -970,6 +970,23 @@  static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
 	return ret;
 }
 
+/*
+ * Check dma map request is within a valid iova range
+ */
+static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,
+				dma_addr_t start, dma_addr_t end)
+{
+	struct list_head *iova = &iommu->iova_list;
+	struct vfio_iova *node;
+
+	list_for_each_entry(node, iova, list) {
+		if ((start >= node->start) && (end <= node->end))
+			return true;
+	}
+
+	return false;
+}
+
 static int vfio_dma_do_map(struct vfio_iommu *iommu,
 			   struct vfio_iommu_type1_dma_map *map)
 {
@@ -1008,6 +1025,11 @@  static int vfio_dma_do_map(struct vfio_iommu *iommu,
 		goto out_unlock;
 	}
 
+	if (!vfio_iommu_iova_dma_valid(iommu, iova, iova + size - 1)) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
 	dma = kzalloc(sizeof(*dma), GFP_KERNEL);
 	if (!dma) {
 		ret = -ENOMEM;