diff mbox series

[v4,3/6] ACPI: Add driver for the VIOT table

Message ID 20210610075130.67517-4-jean-philippe@linaro.org
State Superseded
Headers show
Series Add support for ACPI VIOT | expand

Commit Message

Jean-Philippe Brucker June 10, 2021, 7:51 a.m. UTC
The ACPI Virtual I/O Translation Table describes topology of
para-virtual platforms, similarly to vendor tables DMAR, IVRS and IORT.
For now it describes the relation between virtio-iommu and the endpoints
it manages.

Three steps are needed to configure DMA of endpoints:

(1) acpi_viot_init(): parse the VIOT table, find or create the fwnode
    associated to each vIOMMU device.

(2) When probing the vIOMMU device, the driver registers its IOMMU ops
    within the IOMMU subsystem. This step doesn't require any
    intervention from the VIOT driver.

(3) viot_iommu_configure(): before binding the endpoint to a driver,
    find the associated IOMMU ops. Register them, along with the
    endpoint ID, into the device's iommu_fwspec.

If step (3) happens before step (2), it is deferred until the IOMMU is
initialized, then retried.

Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>

---
 drivers/acpi/Kconfig      |   3 +
 drivers/iommu/Kconfig     |   1 +
 drivers/acpi/Makefile     |   2 +
 include/linux/acpi_viot.h |  19 ++
 drivers/acpi/bus.c        |   2 +
 drivers/acpi/scan.c       |   3 +
 drivers/acpi/viot.c       | 364 ++++++++++++++++++++++++++++++++++++++
 MAINTAINERS               |   8 +
 8 files changed, 402 insertions(+)
 create mode 100644 include/linux/acpi_viot.h
 create mode 100644 drivers/acpi/viot.c

-- 
2.31.1

Comments

Eric Auger June 16, 2021, 1:26 p.m. UTC | #1
Hi Jean,

On 6/10/21 9:51 AM, Jean-Philippe Brucker wrote:
> The ACPI Virtual I/O Translation Table describes topology of

> para-virtual platforms, similarly to vendor tables DMAR, IVRS and IORT.

> For now it describes the relation between virtio-iommu and the endpoints

> it manages.

>

> Three steps are needed to configure DMA of endpoints:

>

> (1) acpi_viot_init(): parse the VIOT table, find or create the fwnode

>     associated to each vIOMMU device.

>

> (2) When probing the vIOMMU device, the driver registers its IOMMU ops

>     within the IOMMU subsystem. This step doesn't require any

>     intervention from the VIOT driver.

>

> (3) viot_iommu_configure(): before binding the endpoint to a driver,

>     find the associated IOMMU ops. Register them, along with the

>     endpoint ID, into the device's iommu_fwspec.

>

> If step (3) happens before step (2), it is deferred until the IOMMU is

> initialized, then retried.

>

> Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>

> ---

>  drivers/acpi/Kconfig      |   3 +

>  drivers/iommu/Kconfig     |   1 +

>  drivers/acpi/Makefile     |   2 +

>  include/linux/acpi_viot.h |  19 ++

>  drivers/acpi/bus.c        |   2 +

>  drivers/acpi/scan.c       |   3 +

>  drivers/acpi/viot.c       | 364 ++++++++++++++++++++++++++++++++++++++

>  MAINTAINERS               |   8 +

>  8 files changed, 402 insertions(+)

>  create mode 100644 include/linux/acpi_viot.h

>  create mode 100644 drivers/acpi/viot.c

>

> diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig

> index eedec61e3476..3758c6940ed7 100644

> --- a/drivers/acpi/Kconfig

> +++ b/drivers/acpi/Kconfig

> @@ -526,6 +526,9 @@ endif

>  

>  source "drivers/acpi/pmic/Kconfig"

>  

> +config ACPI_VIOT

> +	bool

> +

>  endif	# ACPI

>  

>  config X86_PM_TIMER

> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig

> index 1f111b399bca..aff8a4830dd1 100644

> --- a/drivers/iommu/Kconfig

> +++ b/drivers/iommu/Kconfig

> @@ -403,6 +403,7 @@ config VIRTIO_IOMMU

>  	depends on ARM64

>  	select IOMMU_API

>  	select INTERVAL_TREE

> +	select ACPI_VIOT if ACPI

>  	help

>  	  Para-virtualised IOMMU driver with virtio.

>  

> diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile

> index 700b41adf2db..a6e644c48987 100644

> --- a/drivers/acpi/Makefile

> +++ b/drivers/acpi/Makefile

> @@ -118,3 +118,5 @@ video-objs			+= acpi_video.o video_detect.o

>  obj-y				+= dptf/

>  

>  obj-$(CONFIG_ARM64)		+= arm64/

> +

> +obj-$(CONFIG_ACPI_VIOT)		+= viot.o

> diff --git a/include/linux/acpi_viot.h b/include/linux/acpi_viot.h

> new file mode 100644

> index 000000000000..1eb8ee5b0e5f

> --- /dev/null

> +++ b/include/linux/acpi_viot.h

> @@ -0,0 +1,19 @@

> +/* SPDX-License-Identifier: GPL-2.0-only */

> +

> +#ifndef __ACPI_VIOT_H__

> +#define __ACPI_VIOT_H__

> +

> +#include <linux/acpi.h>

> +

> +#ifdef CONFIG_ACPI_VIOT

> +void __init acpi_viot_init(void);

> +int viot_iommu_configure(struct device *dev);

> +#else

> +static inline void acpi_viot_init(void) {}

> +static inline int viot_iommu_configure(struct device *dev)

> +{

> +	return -ENODEV;

> +}

> +#endif

> +

> +#endif /* __ACPI_VIOT_H__ */

> diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c

> index be7da23fad76..b835ca702ff0 100644

> --- a/drivers/acpi/bus.c

> +++ b/drivers/acpi/bus.c

> @@ -27,6 +27,7 @@

>  #include <linux/dmi.h>

>  #endif

>  #include <linux/acpi_iort.h>

> +#include <linux/acpi_viot.h>

>  #include <linux/pci.h>

>  #include <acpi/apei.h>

>  #include <linux/suspend.h>

> @@ -1339,6 +1340,7 @@ static int __init acpi_init(void)

>  	pci_mmcfg_late_init();

>  	acpi_iort_init();

>  	acpi_scan_init();

> +	acpi_viot_init();

>  	acpi_ec_init();

>  	acpi_debugfs_init();

>  	acpi_sleep_proc_init();

> diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c

> index 0c53c8533300..4fa684fdfda8 100644

> --- a/drivers/acpi/scan.c

> +++ b/drivers/acpi/scan.c

> @@ -9,6 +9,7 @@

>  #include <linux/kernel.h>

>  #include <linux/acpi.h>

>  #include <linux/acpi_iort.h>

> +#include <linux/acpi_viot.h>

>  #include <linux/iommu.h>

>  #include <linux/signal.h>

>  #include <linux/kthread.h>

> @@ -1556,6 +1557,8 @@ static const struct iommu_ops *acpi_iommu_configure_id(struct device *dev,

>  		return ops;

>  

>  	err = iort_iommu_configure_id(dev, id_in);

> +	if (err && err != -EPROBE_DEFER)

> +		err = viot_iommu_configure(dev);

>  

>  	/*

>  	 * If we have reason to believe the IOMMU driver missed the initial

> diff --git a/drivers/acpi/viot.c b/drivers/acpi/viot.c

> new file mode 100644

> index 000000000000..892cd9fa7b6d

> --- /dev/null

> +++ b/drivers/acpi/viot.c

> @@ -0,0 +1,364 @@

> +// SPDX-License-Identifier: GPL-2.0

> +/*

> + * Virtual I/O topology

> + *

> + * The Virtual I/O Translation Table (VIOT) describes the topology of

> + * para-virtual IOMMUs and the endpoints they manage. The OS uses it to

> + * initialize devices in the right order, preventing endpoints from issuing DMA

> + * before their IOMMU is ready.

> + *

> + * When binding a driver to a device, before calling the device driver's probe()

> + * method, the driver infrastructure calls dma_configure(). At that point the

> + * VIOT driver looks for an IOMMU associated to the device in the VIOT table.

> + * If an IOMMU exists and has been initialized, the VIOT driver initializes the

> + * device's IOMMU fwspec, allowing the DMA infrastructure to invoke the IOMMU

> + * ops when the device driver configures DMA mappings. If an IOMMU exists and

> + * hasn't yet been initialized, VIOT returns -EPROBE_DEFER to postpone probing

> + * the device until the IOMMU is available.

> + */

> +#define pr_fmt(fmt) "ACPI: VIOT: " fmt

> +

> +#include <linux/acpi_viot.h>

> +#include <linux/dma-iommu.h>

> +#include <linux/fwnode.h>

> +#include <linux/iommu.h>

> +#include <linux/list.h>

> +#include <linux/pci.h>

> +#include <linux/platform_device.h>

> +

> +struct viot_iommu {

> +	/* Node offset within the table */

> +	unsigned int			offset;

> +	struct fwnode_handle		*fwnode;

> +	struct list_head		list;

> +};

> +

> +struct viot_endpoint {

> +	union {

> +		/* PCI range */

> +		struct {

> +			u16		segment_start;

> +			u16		segment_end;

> +			u16		bdf_start;

> +			u16		bdf_end;

> +		};

> +		/* MMIO */

> +		u64			address;

> +	};

> +	u32				endpoint_id;

> +	struct viot_iommu		*viommu;

> +	struct list_head		list;

> +};

> +

> +static struct acpi_table_viot *viot;

> +static LIST_HEAD(viot_iommus);

> +static LIST_HEAD(viot_pci_ranges);

> +static LIST_HEAD(viot_mmio_endpoints);

> +

> +static int __init viot_check_bounds(const struct acpi_viot_header *hdr)

> +{

> +	struct acpi_viot_header *start, *end, *hdr_end;

> +

> +	start = ACPI_ADD_PTR(struct acpi_viot_header, viot,

> +			     max_t(size_t, sizeof(*viot), viot->node_offset));

> +	end = ACPI_ADD_PTR(struct acpi_viot_header, viot, viot->header.length);

> +	hdr_end = ACPI_ADD_PTR(struct acpi_viot_header, hdr, sizeof(*hdr));

> +

> +	if (hdr < start || hdr_end > end) {

> +		pr_err(FW_BUG "Node pointer overflows\n");

> +		return -EOVERFLOW;

> +	}

> +	if (hdr->length < sizeof(*hdr)) {

> +		pr_err(FW_BUG "Empty node\n");

> +		return -EINVAL;

> +	}

> +	return 0;

> +}

> +

> +static int __init viot_get_pci_iommu_fwnode(struct viot_iommu *viommu,

> +					    u16 segment, u16 bdf)

> +{

> +	struct pci_dev *pdev;

> +	struct fwnode_handle *fwnode;

> +

> +	pdev = pci_get_domain_bus_and_slot(segment, PCI_BUS_NUM(bdf),

> +					   bdf & 0xff);

> +	if (!pdev) {

> +		pr_err("Could not find PCI IOMMU\n");

> +		return -ENODEV;

> +	}

> +

> +	fwnode = pdev->dev.fwnode;

> +	if (!fwnode) {

> +		/*

> +		 * PCI devices aren't necessarily described by ACPI. Create a

> +		 * fwnode so the IOMMU subsystem can identify this device.

> +		 */

> +		fwnode = acpi_alloc_fwnode_static();

> +		if (!fwnode) {

> +			pci_dev_put(pdev);

> +			return -ENOMEM;

> +		}

> +		set_primary_fwnode(&pdev->dev, fwnode);

> +	}

> +	viommu->fwnode = pdev->dev.fwnode;

> +	pci_dev_put(pdev);

> +	return 0;

> +}

> +

> +static int __init viot_get_mmio_iommu_fwnode(struct viot_iommu *viommu,

> +					     u64 address)

> +{

> +	struct acpi_device *adev;

> +	struct resource res = {

> +		.start	= address,

> +		.end	= address,

> +		.flags	= IORESOURCE_MEM,

> +	};

> +

> +	adev = acpi_resource_consumer(&res);

> +	if (!adev) {

> +		pr_err("Could not find MMIO IOMMU\n");

> +		return -EINVAL;

> +	}

> +	viommu->fwnode = &adev->fwnode;

> +	return 0;

> +}

> +

> +static struct viot_iommu * __init viot_get_iommu(unsigned int offset)

> +{

> +	int ret;

> +	struct viot_iommu *viommu;

> +	struct acpi_viot_header *hdr = ACPI_ADD_PTR(struct acpi_viot_header,

> +						    viot, offset);

> +	union {

> +		struct acpi_viot_virtio_iommu_pci pci;

> +		struct acpi_viot_virtio_iommu_mmio mmio;

> +	} *node = (void *)hdr;

> +

> +	list_for_each_entry(viommu, &viot_iommus, list)

> +		if (viommu->offset == offset)

> +			return viommu;

> +

> +	if (viot_check_bounds(hdr))

> +		return NULL;

> +

> +	viommu = kzalloc(sizeof(*viommu), GFP_KERNEL);

> +	if (!viommu)

> +		return NULL;

> +

> +	viommu->offset = offset;

> +	switch (hdr->type) {

> +	case ACPI_VIOT_NODE_VIRTIO_IOMMU_PCI:

> +		if (hdr->length < sizeof(node->pci))

> +			goto err_free;

> +

> +		ret = viot_get_pci_iommu_fwnode(viommu, node->pci.segment,

> +						node->pci.bdf);

> +		break;

> +	case ACPI_VIOT_NODE_VIRTIO_IOMMU_MMIO:

> +		if (hdr->length < sizeof(node->mmio))

> +			goto err_free;

> +

> +		ret = viot_get_mmio_iommu_fwnode(viommu,

> +						 node->mmio.base_address);

> +		break;

> +	default:

> +		ret = -EINVAL;

> +	}

> +	if (ret)

> +		goto err_free;

> +

> +	list_add(&viommu->list, &viot_iommus);

> +	return viommu;

> +

> +err_free:

> +	kfree(viommu);

> +	return NULL;

> +}

> +

> +static int __init viot_parse_node(const struct acpi_viot_header *hdr)

> +{

> +	int ret = -EINVAL;

> +	struct list_head *list;

> +	struct viot_endpoint *ep;

> +	union {

> +		struct acpi_viot_mmio mmio;

> +		struct acpi_viot_pci_range pci;

> +	} *node = (void *)hdr;

> +

> +	if (viot_check_bounds(hdr))

> +		return -EINVAL;

> +

> +	if (hdr->type == ACPI_VIOT_NODE_VIRTIO_IOMMU_PCI ||

> +	    hdr->type == ACPI_VIOT_NODE_VIRTIO_IOMMU_MMIO)

> +		return 0;

> +

> +	ep = kzalloc(sizeof(*ep), GFP_KERNEL);

> +	if (!ep)

> +		return -ENOMEM;

> +

> +	switch (hdr->type) {

> +	case ACPI_VIOT_NODE_PCI_RANGE:

> +		if (hdr->length < sizeof(node->pci)) {

> +			pr_err(FW_BUG "Invalid PCI node size\n");

> +			goto err_free;

> +		}

> +

> +		ep->segment_start = node->pci.segment_start;

> +		ep->segment_end = node->pci.segment_end;

> +		ep->bdf_start = node->pci.bdf_start;

> +		ep->bdf_end = node->pci.bdf_end;

> +		ep->endpoint_id = node->pci.endpoint_start;

> +		ep->viommu = viot_get_iommu(node->pci.output_node);

> +		list = &viot_pci_ranges;

> +		break;

> +	case ACPI_VIOT_NODE_MMIO:

> +		if (hdr->length < sizeof(node->mmio)) {

> +			pr_err(FW_BUG "Invalid MMIO node size\n");

> +			goto err_free;

> +		}

> +

> +		ep->address = node->mmio.base_address;

> +		ep->endpoint_id = node->mmio.endpoint;

> +		ep->viommu = viot_get_iommu(node->mmio.output_node);

> +		list = &viot_mmio_endpoints;

> +		break;

> +	default:

> +		pr_warn("Unsupported node %x\n", hdr->type);

> +		ret = 0;

> +		goto err_free;

> +	}

> +

> +	/*

> +	 * To be compatible with future versions of the table which may include

> +	 * other node types, keep parsing.

> +	 */

nit: doesn't this comment rather apply to the default clause in the
switch. In case the PCI range node or the single MMIO endoint node does
not refer to any translation element, isn't it simply an error case?
> +	if (!ep->viommu) {

> +		pr_warn("No IOMMU node found\n");

> +		ret = 0;

> +		goto err_free;

> +	}

> +

> +	list_add(&ep->list, list);

> +	return 0;

> +

> +err_free:

> +	kfree(ep);

> +	return ret;

> +}

> +

> +/**

> + * acpi_viot_init - Parse the VIOT table

> + *

> + * Parse the VIOT table, prepare the list of endpoints to be used during DMA

> + * setup of devices.

> + */

> +void __init acpi_viot_init(void)

> +{

> +	int i;

> +	acpi_status status;

> +	struct acpi_table_header *hdr;

> +	struct acpi_viot_header *node;

> +

> +	status = acpi_get_table(ACPI_SIG_VIOT, 0, &hdr);

> +	if (ACPI_FAILURE(status)) {

> +		if (status != AE_NOT_FOUND) {

> +			const char *msg = acpi_format_exception(status);

> +

> +			pr_err("Failed to get table, %s\n", msg);

> +		}

> +		return;

> +	}

> +

> +	viot = (void *)hdr;

> +

> +	node = ACPI_ADD_PTR(struct acpi_viot_header, viot, viot->node_offset);

> +	for (i = 0; i < viot->node_count; i++) {

> +		if (viot_parse_node(node))

> +			return;

> +

> +		node = ACPI_ADD_PTR(struct acpi_viot_header, node,

> +				    node->length);

> +	}

> +}

> +

> +static int viot_dev_iommu_init(struct device *dev, struct viot_iommu *viommu,

> +			       u32 epid)

> +{

> +	const struct iommu_ops *ops;

> +

> +	if (!viommu)

> +		return -ENODEV;

> +

> +	/* We're not translating ourself */

> +	if (viommu->fwnode == dev->fwnode)

> +		return -EINVAL;

> +

> +	ops = iommu_ops_from_fwnode(viommu->fwnode);

> +	if (!ops)

> +		return IS_ENABLED(CONFIG_VIRTIO_IOMMU) ?

> +			-EPROBE_DEFER : -ENODEV;

> +

> +	return acpi_iommu_fwspec_init(dev, epid, viommu->fwnode, ops);

> +}

> +

> +static int viot_pci_dev_iommu_init(struct pci_dev *pdev, u16 dev_id, void *data)

> +{

> +	u32 epid;

> +	struct viot_endpoint *ep;

> +	u32 domain_nr = pci_domain_nr(pdev->bus);

> +

> +	list_for_each_entry(ep, &viot_pci_ranges, list) {

> +		if (domain_nr >= ep->segment_start &&

> +		    domain_nr <= ep->segment_end &&

> +		    dev_id >= ep->bdf_start &&

> +		    dev_id <= ep->bdf_end) {

> +			epid = ((domain_nr - ep->segment_start) << 16) +

> +				dev_id - ep->bdf_start + ep->endpoint_id;

> +

> +			/*

> +			 * If we found a PCI range managed by the viommu, we're

> +			 * the one that has to request ACS.

> +			 */

> +			pci_request_acs();

> +

> +			return viot_dev_iommu_init(&pdev->dev, ep->viommu,

> +						   epid);

> +		}

> +	}

> +	return -ENODEV;

> +}

> +

> +static int viot_mmio_dev_iommu_init(struct platform_device *pdev)

> +{

> +	struct resource *mem;

> +	struct viot_endpoint *ep;

> +

> +	mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);

> +	if (!mem)

> +		return -ENODEV;

> +

> +	list_for_each_entry(ep, &viot_mmio_endpoints, list) {

> +		if (ep->address == mem->start)

> +			return viot_dev_iommu_init(&pdev->dev, ep->viommu,

> +						   ep->endpoint_id);

> +	}

> +	return -ENODEV;

> +}

> +

> +/**

> + * viot_iommu_configure - Setup IOMMU ops for an endpoint described by VIOT

> + * @dev: the endpoint

> + *

> + * Return: 0 on success, <0 on failure

> + */

> +int viot_iommu_configure(struct device *dev)

> +{

> +	if (dev_is_pci(dev))

> +		return pci_for_each_dma_alias(to_pci_dev(dev),

> +					      viot_pci_dev_iommu_init, NULL);

> +	else if (dev_is_platform(dev))

> +		return viot_mmio_dev_iommu_init(to_platform_device(dev));

> +	return -ENODEV;

> +}

> diff --git a/MAINTAINERS b/MAINTAINERS

> index b706dd20ff2b..8d71591f979a 100644

> --- a/MAINTAINERS

> +++ b/MAINTAINERS

> @@ -431,6 +431,14 @@ W:	https://01.org/linux-acpi

>  B:	https://bugzilla.kernel.org

>  F:	drivers/acpi/acpi_video.c

>  

> +ACPI VIOT DRIVER

> +M:	Jean-Philippe Brucker <jean-philippe@linaro.org>

> +L:	linux-acpi@vger.kernel.org

> +L:	iommu@lists.linux-foundation.org

> +S:	Maintained

> +F:	drivers/acpi/viot.c

> +F:	include/linux/acpi_viot.h

> +

>  ACPI WMI DRIVER

>  L:	platform-driver-x86@vger.kernel.org

>  S:	Orphan

Besides
Reviewed-by: Eric Auger <eric.auger@redhat.com>


Thanks

Eric
Rafael J. Wysocki June 17, 2021, 11:50 a.m. UTC | #2
On Thu, Jun 10, 2021 at 10:03 AM Jean-Philippe Brucker
<jean-philippe@linaro.org> wrote:
>

> The ACPI Virtual I/O Translation Table describes topology of

> para-virtual platforms, similarly to vendor tables DMAR, IVRS and IORT.

> For now it describes the relation between virtio-iommu and the endpoints

> it manages.

>

> Three steps are needed to configure DMA of endpoints:

>

> (1) acpi_viot_init(): parse the VIOT table, find or create the fwnode

>     associated to each vIOMMU device.

>

> (2) When probing the vIOMMU device, the driver registers its IOMMU ops

>     within the IOMMU subsystem. This step doesn't require any

>     intervention from the VIOT driver.

>

> (3) viot_iommu_configure(): before binding the endpoint to a driver,

>     find the associated IOMMU ops. Register them, along with the

>     endpoint ID, into the device's iommu_fwspec.

>

> If step (3) happens before step (2), it is deferred until the IOMMU is

> initialized, then retried.

>

> Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>

> ---

>  drivers/acpi/Kconfig      |   3 +

>  drivers/iommu/Kconfig     |   1 +

>  drivers/acpi/Makefile     |   2 +

>  include/linux/acpi_viot.h |  19 ++

>  drivers/acpi/bus.c        |   2 +

>  drivers/acpi/scan.c       |   3 +

>  drivers/acpi/viot.c       | 364 ++++++++++++++++++++++++++++++++++++++

>  MAINTAINERS               |   8 +

>  8 files changed, 402 insertions(+)

>  create mode 100644 include/linux/acpi_viot.h

>  create mode 100644 drivers/acpi/viot.c

>

> diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig

> index eedec61e3476..3758c6940ed7 100644

> --- a/drivers/acpi/Kconfig

> +++ b/drivers/acpi/Kconfig

> @@ -526,6 +526,9 @@ endif

>

>  source "drivers/acpi/pmic/Kconfig"

>

> +config ACPI_VIOT

> +       bool

> +

>  endif  # ACPI

>

>  config X86_PM_TIMER

> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig

> index 1f111b399bca..aff8a4830dd1 100644

> --- a/drivers/iommu/Kconfig

> +++ b/drivers/iommu/Kconfig

> @@ -403,6 +403,7 @@ config VIRTIO_IOMMU

>         depends on ARM64

>         select IOMMU_API

>         select INTERVAL_TREE

> +       select ACPI_VIOT if ACPI

>         help

>           Para-virtualised IOMMU driver with virtio.

>

> diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile

> index 700b41adf2db..a6e644c48987 100644

> --- a/drivers/acpi/Makefile

> +++ b/drivers/acpi/Makefile

> @@ -118,3 +118,5 @@ video-objs                  += acpi_video.o video_detect.o

>  obj-y                          += dptf/

>

>  obj-$(CONFIG_ARM64)            += arm64/

> +

> +obj-$(CONFIG_ACPI_VIOT)                += viot.o

> diff --git a/include/linux/acpi_viot.h b/include/linux/acpi_viot.h

> new file mode 100644

> index 000000000000..1eb8ee5b0e5f

> --- /dev/null

> +++ b/include/linux/acpi_viot.h

> @@ -0,0 +1,19 @@

> +/* SPDX-License-Identifier: GPL-2.0-only */

> +

> +#ifndef __ACPI_VIOT_H__

> +#define __ACPI_VIOT_H__

> +

> +#include <linux/acpi.h>

> +

> +#ifdef CONFIG_ACPI_VIOT

> +void __init acpi_viot_init(void);

> +int viot_iommu_configure(struct device *dev);

> +#else

> +static inline void acpi_viot_init(void) {}

> +static inline int viot_iommu_configure(struct device *dev)

> +{

> +       return -ENODEV;

> +}

> +#endif

> +

> +#endif /* __ACPI_VIOT_H__ */

> diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c

> index be7da23fad76..b835ca702ff0 100644

> --- a/drivers/acpi/bus.c

> +++ b/drivers/acpi/bus.c

> @@ -27,6 +27,7 @@

>  #include <linux/dmi.h>

>  #endif

>  #include <linux/acpi_iort.h>

> +#include <linux/acpi_viot.h>

>  #include <linux/pci.h>

>  #include <acpi/apei.h>

>  #include <linux/suspend.h>

> @@ -1339,6 +1340,7 @@ static int __init acpi_init(void)

>         pci_mmcfg_late_init();

>         acpi_iort_init();

>         acpi_scan_init();

> +       acpi_viot_init();


Is there a specific reason why to call it right here?

In particular, does it need to be called after acpi_scan_init()?  And
does it need to be called before the subsequent functions?  If so,
then why?

>         acpi_ec_init();

>         acpi_debugfs_init();

>         acpi_sleep_proc_init();

> diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c

> index 0c53c8533300..4fa684fdfda8 100644

> --- a/drivers/acpi/scan.c

> +++ b/drivers/acpi/scan.c

> @@ -9,6 +9,7 @@

>  #include <linux/kernel.h>

>  #include <linux/acpi.h>

>  #include <linux/acpi_iort.h>

> +#include <linux/acpi_viot.h>

>  #include <linux/iommu.h>

>  #include <linux/signal.h>

>  #include <linux/kthread.h>

> @@ -1556,6 +1557,8 @@ static const struct iommu_ops *acpi_iommu_configure_id(struct device *dev,

>                 return ops;

>

>         err = iort_iommu_configure_id(dev, id_in);

> +       if (err && err != -EPROBE_DEFER)

> +               err = viot_iommu_configure(dev);

>

>         /*

>          * If we have reason to believe the IOMMU driver missed the initial

> diff --git a/drivers/acpi/viot.c b/drivers/acpi/viot.c

> new file mode 100644

> index 000000000000..892cd9fa7b6d

> --- /dev/null

> +++ b/drivers/acpi/viot.c

> @@ -0,0 +1,364 @@

> +// SPDX-License-Identifier: GPL-2.0

> +/*

> + * Virtual I/O topology

> + *

> + * The Virtual I/O Translation Table (VIOT) describes the topology of

> + * para-virtual IOMMUs and the endpoints they manage. The OS uses it to

> + * initialize devices in the right order, preventing endpoints from issuing DMA

> + * before their IOMMU is ready.

> + *

> + * When binding a driver to a device, before calling the device driver's probe()

> + * method, the driver infrastructure calls dma_configure(). At that point the

> + * VIOT driver looks for an IOMMU associated to the device in the VIOT table.

> + * If an IOMMU exists and has been initialized, the VIOT driver initializes the

> + * device's IOMMU fwspec, allowing the DMA infrastructure to invoke the IOMMU

> + * ops when the device driver configures DMA mappings. If an IOMMU exists and

> + * hasn't yet been initialized, VIOT returns -EPROBE_DEFER to postpone probing

> + * the device until the IOMMU is available.

> + */

> +#define pr_fmt(fmt) "ACPI: VIOT: " fmt

> +

> +#include <linux/acpi_viot.h>

> +#include <linux/dma-iommu.h>

> +#include <linux/fwnode.h>

> +#include <linux/iommu.h>

> +#include <linux/list.h>

> +#include <linux/pci.h>

> +#include <linux/platform_device.h>

> +

> +struct viot_iommu {

> +       /* Node offset within the table */

> +       unsigned int                    offset;

> +       struct fwnode_handle            *fwnode;

> +       struct list_head                list;

> +};

> +

> +struct viot_endpoint {

> +       union {

> +               /* PCI range */

> +               struct {

> +                       u16             segment_start;

> +                       u16             segment_end;

> +                       u16             bdf_start;

> +                       u16             bdf_end;

> +               };

> +               /* MMIO */

> +               u64                     address;

> +       };

> +       u32                             endpoint_id;

> +       struct viot_iommu               *viommu;

> +       struct list_head                list;

> +};

> +

> +static struct acpi_table_viot *viot;

> +static LIST_HEAD(viot_iommus);

> +static LIST_HEAD(viot_pci_ranges);

> +static LIST_HEAD(viot_mmio_endpoints);

> +

> +static int __init viot_check_bounds(const struct acpi_viot_header *hdr)

> +{

> +       struct acpi_viot_header *start, *end, *hdr_end;

> +

> +       start = ACPI_ADD_PTR(struct acpi_viot_header, viot,

> +                            max_t(size_t, sizeof(*viot), viot->node_offset));

> +       end = ACPI_ADD_PTR(struct acpi_viot_header, viot, viot->header.length);

> +       hdr_end = ACPI_ADD_PTR(struct acpi_viot_header, hdr, sizeof(*hdr));

> +

> +       if (hdr < start || hdr_end > end) {

> +               pr_err(FW_BUG "Node pointer overflows\n");

> +               return -EOVERFLOW;

> +       }

> +       if (hdr->length < sizeof(*hdr)) {

> +               pr_err(FW_BUG "Empty node\n");

> +               return -EINVAL;

> +       }

> +       return 0;

> +}

> +

> +static int __init viot_get_pci_iommu_fwnode(struct viot_iommu *viommu,

> +                                           u16 segment, u16 bdf)

> +{

> +       struct pci_dev *pdev;

> +       struct fwnode_handle *fwnode;

> +

> +       pdev = pci_get_domain_bus_and_slot(segment, PCI_BUS_NUM(bdf),

> +                                          bdf & 0xff);

> +       if (!pdev) {

> +               pr_err("Could not find PCI IOMMU\n");

> +               return -ENODEV;

> +       }

> +

> +       fwnode = pdev->dev.fwnode;

> +       if (!fwnode) {

> +               /*

> +                * PCI devices aren't necessarily described by ACPI. Create a

> +                * fwnode so the IOMMU subsystem can identify this device.

> +                */

> +               fwnode = acpi_alloc_fwnode_static();

> +               if (!fwnode) {

> +                       pci_dev_put(pdev);

> +                       return -ENOMEM;

> +               }

> +               set_primary_fwnode(&pdev->dev, fwnode);

> +       }

> +       viommu->fwnode = pdev->dev.fwnode;

> +       pci_dev_put(pdev);

> +       return 0;

> +}

> +

> +static int __init viot_get_mmio_iommu_fwnode(struct viot_iommu *viommu,

> +                                            u64 address)

> +{

> +       struct acpi_device *adev;

> +       struct resource res = {

> +               .start  = address,

> +               .end    = address,

> +               .flags  = IORESOURCE_MEM,

> +       };

> +

> +       adev = acpi_resource_consumer(&res);

> +       if (!adev) {

> +               pr_err("Could not find MMIO IOMMU\n");

> +               return -EINVAL;

> +       }

> +       viommu->fwnode = &adev->fwnode;

> +       return 0;

> +}

> +

> +static struct viot_iommu * __init viot_get_iommu(unsigned int offset)

> +{

> +       int ret;

> +       struct viot_iommu *viommu;

> +       struct acpi_viot_header *hdr = ACPI_ADD_PTR(struct acpi_viot_header,

> +                                                   viot, offset);

> +       union {

> +               struct acpi_viot_virtio_iommu_pci pci;

> +               struct acpi_viot_virtio_iommu_mmio mmio;

> +       } *node = (void *)hdr;

> +

> +       list_for_each_entry(viommu, &viot_iommus, list)

> +               if (viommu->offset == offset)

> +                       return viommu;

> +

> +       if (viot_check_bounds(hdr))

> +               return NULL;

> +

> +       viommu = kzalloc(sizeof(*viommu), GFP_KERNEL);

> +       if (!viommu)

> +               return NULL;

> +

> +       viommu->offset = offset;

> +       switch (hdr->type) {

> +       case ACPI_VIOT_NODE_VIRTIO_IOMMU_PCI:

> +               if (hdr->length < sizeof(node->pci))

> +                       goto err_free;

> +

> +               ret = viot_get_pci_iommu_fwnode(viommu, node->pci.segment,

> +                                               node->pci.bdf);

> +               break;

> +       case ACPI_VIOT_NODE_VIRTIO_IOMMU_MMIO:

> +               if (hdr->length < sizeof(node->mmio))

> +                       goto err_free;

> +

> +               ret = viot_get_mmio_iommu_fwnode(viommu,

> +                                                node->mmio.base_address);

> +               break;

> +       default:

> +               ret = -EINVAL;

> +       }

> +       if (ret)

> +               goto err_free;

> +

> +       list_add(&viommu->list, &viot_iommus);

> +       return viommu;

> +

> +err_free:

> +       kfree(viommu);

> +       return NULL;

> +}

> +

> +static int __init viot_parse_node(const struct acpi_viot_header *hdr)

> +{

> +       int ret = -EINVAL;

> +       struct list_head *list;

> +       struct viot_endpoint *ep;

> +       union {

> +               struct acpi_viot_mmio mmio;

> +               struct acpi_viot_pci_range pci;

> +       } *node = (void *)hdr;

> +

> +       if (viot_check_bounds(hdr))

> +               return -EINVAL;

> +

> +       if (hdr->type == ACPI_VIOT_NODE_VIRTIO_IOMMU_PCI ||

> +           hdr->type == ACPI_VIOT_NODE_VIRTIO_IOMMU_MMIO)

> +               return 0;

> +

> +       ep = kzalloc(sizeof(*ep), GFP_KERNEL);

> +       if (!ep)

> +               return -ENOMEM;

> +

> +       switch (hdr->type) {

> +       case ACPI_VIOT_NODE_PCI_RANGE:

> +               if (hdr->length < sizeof(node->pci)) {

> +                       pr_err(FW_BUG "Invalid PCI node size\n");

> +                       goto err_free;

> +               }

> +

> +               ep->segment_start = node->pci.segment_start;

> +               ep->segment_end = node->pci.segment_end;

> +               ep->bdf_start = node->pci.bdf_start;

> +               ep->bdf_end = node->pci.bdf_end;

> +               ep->endpoint_id = node->pci.endpoint_start;

> +               ep->viommu = viot_get_iommu(node->pci.output_node);

> +               list = &viot_pci_ranges;

> +               break;

> +       case ACPI_VIOT_NODE_MMIO:

> +               if (hdr->length < sizeof(node->mmio)) {

> +                       pr_err(FW_BUG "Invalid MMIO node size\n");

> +                       goto err_free;

> +               }

> +

> +               ep->address = node->mmio.base_address;

> +               ep->endpoint_id = node->mmio.endpoint;

> +               ep->viommu = viot_get_iommu(node->mmio.output_node);

> +               list = &viot_mmio_endpoints;

> +               break;

> +       default:

> +               pr_warn("Unsupported node %x\n", hdr->type);

> +               ret = 0;

> +               goto err_free;

> +       }

> +

> +       /*

> +        * To be compatible with future versions of the table which may include

> +        * other node types, keep parsing.

> +        */

> +       if (!ep->viommu) {

> +               pr_warn("No IOMMU node found\n");

> +               ret = 0;

> +               goto err_free;

> +       }

> +

> +       list_add(&ep->list, list);

> +       return 0;

> +

> +err_free:

> +       kfree(ep);

> +       return ret;

> +}

> +

> +/**

> + * acpi_viot_init - Parse the VIOT table

> + *

> + * Parse the VIOT table, prepare the list of endpoints to be used during DMA

> + * setup of devices.

> + */

> +void __init acpi_viot_init(void)

> +{

> +       int i;

> +       acpi_status status;

> +       struct acpi_table_header *hdr;

> +       struct acpi_viot_header *node;

> +

> +       status = acpi_get_table(ACPI_SIG_VIOT, 0, &hdr);

> +       if (ACPI_FAILURE(status)) {

> +               if (status != AE_NOT_FOUND) {

> +                       const char *msg = acpi_format_exception(status);

> +

> +                       pr_err("Failed to get table, %s\n", msg);

> +               }

> +               return;

> +       }

> +

> +       viot = (void *)hdr;

> +

> +       node = ACPI_ADD_PTR(struct acpi_viot_header, viot, viot->node_offset);

> +       for (i = 0; i < viot->node_count; i++) {

> +               if (viot_parse_node(node))

> +                       return;

> +

> +               node = ACPI_ADD_PTR(struct acpi_viot_header, node,

> +                                   node->length);

> +       }


Do you still need the table after the above is complete?  If not,
release the reference on it acquired above.

> +}

> +

> +static int viot_dev_iommu_init(struct device *dev, struct viot_iommu *viommu,

> +                              u32 epid)

> +{

> +       const struct iommu_ops *ops;

> +

> +       if (!viommu)

> +               return -ENODEV;

> +

> +       /* We're not translating ourself */

> +       if (viommu->fwnode == dev->fwnode)

> +               return -EINVAL;

> +

> +       ops = iommu_ops_from_fwnode(viommu->fwnode);

> +       if (!ops)

> +               return IS_ENABLED(CONFIG_VIRTIO_IOMMU) ?

> +                       -EPROBE_DEFER : -ENODEV;

> +

> +       return acpi_iommu_fwspec_init(dev, epid, viommu->fwnode, ops);

> +}

> +

> +static int viot_pci_dev_iommu_init(struct pci_dev *pdev, u16 dev_id, void *data)

> +{

> +       u32 epid;

> +       struct viot_endpoint *ep;

> +       u32 domain_nr = pci_domain_nr(pdev->bus);

> +

> +       list_for_each_entry(ep, &viot_pci_ranges, list) {

> +               if (domain_nr >= ep->segment_start &&

> +                   domain_nr <= ep->segment_end &&

> +                   dev_id >= ep->bdf_start &&

> +                   dev_id <= ep->bdf_end) {

> +                       epid = ((domain_nr - ep->segment_start) << 16) +

> +                               dev_id - ep->bdf_start + ep->endpoint_id;

> +

> +                       /*

> +                        * If we found a PCI range managed by the viommu, we're

> +                        * the one that has to request ACS.

> +                        */

> +                       pci_request_acs();

> +

> +                       return viot_dev_iommu_init(&pdev->dev, ep->viommu,

> +                                                  epid);

> +               }

> +       }

> +       return -ENODEV;

> +}

> +

> +static int viot_mmio_dev_iommu_init(struct platform_device *pdev)

> +{

> +       struct resource *mem;

> +       struct viot_endpoint *ep;

> +

> +       mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);

> +       if (!mem)

> +               return -ENODEV;

> +

> +       list_for_each_entry(ep, &viot_mmio_endpoints, list) {

> +               if (ep->address == mem->start)

> +                       return viot_dev_iommu_init(&pdev->dev, ep->viommu,

> +                                                  ep->endpoint_id);

> +       }

> +       return -ENODEV;

> +}

> +

> +/**

> + * viot_iommu_configure - Setup IOMMU ops for an endpoint described by VIOT

> + * @dev: the endpoint

> + *

> + * Return: 0 on success, <0 on failure

> + */

> +int viot_iommu_configure(struct device *dev)

> +{

> +       if (dev_is_pci(dev))

> +               return pci_for_each_dma_alias(to_pci_dev(dev),

> +                                             viot_pci_dev_iommu_init, NULL);

> +       else if (dev_is_platform(dev))

> +               return viot_mmio_dev_iommu_init(to_platform_device(dev));

> +       return -ENODEV;

> +}

> diff --git a/MAINTAINERS b/MAINTAINERS

> index b706dd20ff2b..8d71591f979a 100644

> --- a/MAINTAINERS

> +++ b/MAINTAINERS

> @@ -431,6 +431,14 @@ W: https://01.org/linux-acpi

>  B:     https://bugzilla.kernel.org

>  F:     drivers/acpi/acpi_video.c

>

> +ACPI VIOT DRIVER

> +M:     Jean-Philippe Brucker <jean-philippe@linaro.org>

> +L:     linux-acpi@vger.kernel.org

> +L:     iommu@lists.linux-foundation.org

> +S:     Maintained

> +F:     drivers/acpi/viot.c

> +F:     include/linux/acpi_viot.h

> +

>  ACPI WMI DRIVER

>  L:     platform-driver-x86@vger.kernel.org

>  S:     Orphan

> --

> 2.31.1

>
Jean-Philippe Brucker June 18, 2021, 7:43 a.m. UTC | #3
On Wed, Jun 16, 2021 at 03:26:08PM +0200, Eric Auger wrote:
> > +	default:

> > +		pr_warn("Unsupported node %x\n", hdr->type);

> > +		ret = 0;

> > +		goto err_free;

> > +	}

> > +

> > +	/*

> > +	 * To be compatible with future versions of the table which may include

> > +	 * other node types, keep parsing.

> > +	 */

> nit: doesn't this comment rather apply to the default clause in the

> switch.


Yes, the comment doesn't accurately explain the code below, I'll tweak it.

        /*
         * A future version of the table may use the node for other purposes.
         * Keep parsing.
         */

> In case the PCI range node or the single MMIO endoint node does

> not refer to any translation element, isn't it simply an error case?


It is permissible in my opinion. If a future version of the spec appends
new fields to the MMIO endpoint describing some PV property (I can't think
of a useful example), then the table can contain the vIOMMU topology as
usual plus one MMIO node that's only here to describe that property, and
doesn't have a translation element. If we encounter that I think we should
keep parsing.

> > +	if (!ep->viommu) {

> > +		pr_warn("No IOMMU node found\n");

> > +		ret = 0;

> > +		goto err_free;

> > +	}


> Besides

> Reviewed-by: Eric Auger <eric.auger@redhat.com>


Thanks!
Jean
Jean-Philippe Brucker June 18, 2021, 7:54 a.m. UTC | #4
On Thu, Jun 17, 2021 at 01:50:59PM +0200, Rafael J. Wysocki wrote:
> > diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c

> > index be7da23fad76..b835ca702ff0 100644

> > --- a/drivers/acpi/bus.c

> > +++ b/drivers/acpi/bus.c

> > @@ -27,6 +27,7 @@

> >  #include <linux/dmi.h>

> >  #endif

> >  #include <linux/acpi_iort.h>

> > +#include <linux/acpi_viot.h>

> >  #include <linux/pci.h>

> >  #include <acpi/apei.h>

> >  #include <linux/suspend.h>

> > @@ -1339,6 +1340,7 @@ static int __init acpi_init(void)

> >         pci_mmcfg_late_init();

> >         acpi_iort_init();

> >         acpi_scan_init();

> > +       acpi_viot_init();

> 

> Is there a specific reason why to call it right here?

> 

> In particular, does it need to be called after acpi_scan_init()?  And

> does it need to be called before the subsequent functions?  If so,

> then why?


It does need to be called after acpi_scan_init(), because it relies on
struct device and their fwnode to be initialized. In particular to find a
PCI device we call pci_get_domain_bus_and_slot(), which needs the PCI
topology made available by acpi_scan_init().

It does not need to be before the subsequent functions however, I can move
it at the end.

> > +void __init acpi_viot_init(void)

> > +{

> > +       int i;

> > +       acpi_status status;

> > +       struct acpi_table_header *hdr;

> > +       struct acpi_viot_header *node;

> > +

> > +       status = acpi_get_table(ACPI_SIG_VIOT, 0, &hdr);

> > +       if (ACPI_FAILURE(status)) {

> > +               if (status != AE_NOT_FOUND) {

> > +                       const char *msg = acpi_format_exception(status);

> > +

> > +                       pr_err("Failed to get table, %s\n", msg);

> > +               }

> > +               return;

> > +       }

> > +

> > +       viot = (void *)hdr;

> > +

> > +       node = ACPI_ADD_PTR(struct acpi_viot_header, viot, viot->node_offset);

> > +       for (i = 0; i < viot->node_count; i++) {

> > +               if (viot_parse_node(node))

> > +                       return;

> > +

> > +               node = ACPI_ADD_PTR(struct acpi_viot_header, node,

> > +                                   node->length);

> > +       }

> 

> Do you still need the table after the above is complete?  If not,

> release the reference on it acquired above.


We don't need the table anymore, I'll drop the reference

Thanks,
Jean
diff mbox series

Patch

diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig
index eedec61e3476..3758c6940ed7 100644
--- a/drivers/acpi/Kconfig
+++ b/drivers/acpi/Kconfig
@@ -526,6 +526,9 @@  endif
 
 source "drivers/acpi/pmic/Kconfig"
 
+config ACPI_VIOT
+	bool
+
 endif	# ACPI
 
 config X86_PM_TIMER
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 1f111b399bca..aff8a4830dd1 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -403,6 +403,7 @@  config VIRTIO_IOMMU
 	depends on ARM64
 	select IOMMU_API
 	select INTERVAL_TREE
+	select ACPI_VIOT if ACPI
 	help
 	  Para-virtualised IOMMU driver with virtio.
 
diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile
index 700b41adf2db..a6e644c48987 100644
--- a/drivers/acpi/Makefile
+++ b/drivers/acpi/Makefile
@@ -118,3 +118,5 @@  video-objs			+= acpi_video.o video_detect.o
 obj-y				+= dptf/
 
 obj-$(CONFIG_ARM64)		+= arm64/
+
+obj-$(CONFIG_ACPI_VIOT)		+= viot.o
diff --git a/include/linux/acpi_viot.h b/include/linux/acpi_viot.h
new file mode 100644
index 000000000000..1eb8ee5b0e5f
--- /dev/null
+++ b/include/linux/acpi_viot.h
@@ -0,0 +1,19 @@ 
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef __ACPI_VIOT_H__
+#define __ACPI_VIOT_H__
+
+#include <linux/acpi.h>
+
+#ifdef CONFIG_ACPI_VIOT
+void __init acpi_viot_init(void);
+int viot_iommu_configure(struct device *dev);
+#else
+static inline void acpi_viot_init(void) {}
+static inline int viot_iommu_configure(struct device *dev)
+{
+	return -ENODEV;
+}
+#endif
+
+#endif /* __ACPI_VIOT_H__ */
diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
index be7da23fad76..b835ca702ff0 100644
--- a/drivers/acpi/bus.c
+++ b/drivers/acpi/bus.c
@@ -27,6 +27,7 @@ 
 #include <linux/dmi.h>
 #endif
 #include <linux/acpi_iort.h>
+#include <linux/acpi_viot.h>
 #include <linux/pci.h>
 #include <acpi/apei.h>
 #include <linux/suspend.h>
@@ -1339,6 +1340,7 @@  static int __init acpi_init(void)
 	pci_mmcfg_late_init();
 	acpi_iort_init();
 	acpi_scan_init();
+	acpi_viot_init();
 	acpi_ec_init();
 	acpi_debugfs_init();
 	acpi_sleep_proc_init();
diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index 0c53c8533300..4fa684fdfda8 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -9,6 +9,7 @@ 
 #include <linux/kernel.h>
 #include <linux/acpi.h>
 #include <linux/acpi_iort.h>
+#include <linux/acpi_viot.h>
 #include <linux/iommu.h>
 #include <linux/signal.h>
 #include <linux/kthread.h>
@@ -1556,6 +1557,8 @@  static const struct iommu_ops *acpi_iommu_configure_id(struct device *dev,
 		return ops;
 
 	err = iort_iommu_configure_id(dev, id_in);
+	if (err && err != -EPROBE_DEFER)
+		err = viot_iommu_configure(dev);
 
 	/*
 	 * If we have reason to believe the IOMMU driver missed the initial
diff --git a/drivers/acpi/viot.c b/drivers/acpi/viot.c
new file mode 100644
index 000000000000..892cd9fa7b6d
--- /dev/null
+++ b/drivers/acpi/viot.c
@@ -0,0 +1,364 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Virtual I/O topology
+ *
+ * The Virtual I/O Translation Table (VIOT) describes the topology of
+ * para-virtual IOMMUs and the endpoints they manage. The OS uses it to
+ * initialize devices in the right order, preventing endpoints from issuing DMA
+ * before their IOMMU is ready.
+ *
+ * When binding a driver to a device, before calling the device driver's probe()
+ * method, the driver infrastructure calls dma_configure(). At that point the
+ * VIOT driver looks for an IOMMU associated to the device in the VIOT table.
+ * If an IOMMU exists and has been initialized, the VIOT driver initializes the
+ * device's IOMMU fwspec, allowing the DMA infrastructure to invoke the IOMMU
+ * ops when the device driver configures DMA mappings. If an IOMMU exists and
+ * hasn't yet been initialized, VIOT returns -EPROBE_DEFER to postpone probing
+ * the device until the IOMMU is available.
+ */
+#define pr_fmt(fmt) "ACPI: VIOT: " fmt
+
+#include <linux/acpi_viot.h>
+#include <linux/dma-iommu.h>
+#include <linux/fwnode.h>
+#include <linux/iommu.h>
+#include <linux/list.h>
+#include <linux/pci.h>
+#include <linux/platform_device.h>
+
+struct viot_iommu {
+	/* Node offset within the table */
+	unsigned int			offset;
+	struct fwnode_handle		*fwnode;
+	struct list_head		list;
+};
+
+struct viot_endpoint {
+	union {
+		/* PCI range */
+		struct {
+			u16		segment_start;
+			u16		segment_end;
+			u16		bdf_start;
+			u16		bdf_end;
+		};
+		/* MMIO */
+		u64			address;
+	};
+	u32				endpoint_id;
+	struct viot_iommu		*viommu;
+	struct list_head		list;
+};
+
+static struct acpi_table_viot *viot;
+static LIST_HEAD(viot_iommus);
+static LIST_HEAD(viot_pci_ranges);
+static LIST_HEAD(viot_mmio_endpoints);
+
+static int __init viot_check_bounds(const struct acpi_viot_header *hdr)
+{
+	struct acpi_viot_header *start, *end, *hdr_end;
+
+	start = ACPI_ADD_PTR(struct acpi_viot_header, viot,
+			     max_t(size_t, sizeof(*viot), viot->node_offset));
+	end = ACPI_ADD_PTR(struct acpi_viot_header, viot, viot->header.length);
+	hdr_end = ACPI_ADD_PTR(struct acpi_viot_header, hdr, sizeof(*hdr));
+
+	if (hdr < start || hdr_end > end) {
+		pr_err(FW_BUG "Node pointer overflows\n");
+		return -EOVERFLOW;
+	}
+	if (hdr->length < sizeof(*hdr)) {
+		pr_err(FW_BUG "Empty node\n");
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static int __init viot_get_pci_iommu_fwnode(struct viot_iommu *viommu,
+					    u16 segment, u16 bdf)
+{
+	struct pci_dev *pdev;
+	struct fwnode_handle *fwnode;
+
+	pdev = pci_get_domain_bus_and_slot(segment, PCI_BUS_NUM(bdf),
+					   bdf & 0xff);
+	if (!pdev) {
+		pr_err("Could not find PCI IOMMU\n");
+		return -ENODEV;
+	}
+
+	fwnode = pdev->dev.fwnode;
+	if (!fwnode) {
+		/*
+		 * PCI devices aren't necessarily described by ACPI. Create a
+		 * fwnode so the IOMMU subsystem can identify this device.
+		 */
+		fwnode = acpi_alloc_fwnode_static();
+		if (!fwnode) {
+			pci_dev_put(pdev);
+			return -ENOMEM;
+		}
+		set_primary_fwnode(&pdev->dev, fwnode);
+	}
+	viommu->fwnode = pdev->dev.fwnode;
+	pci_dev_put(pdev);
+	return 0;
+}
+
+static int __init viot_get_mmio_iommu_fwnode(struct viot_iommu *viommu,
+					     u64 address)
+{
+	struct acpi_device *adev;
+	struct resource res = {
+		.start	= address,
+		.end	= address,
+		.flags	= IORESOURCE_MEM,
+	};
+
+	adev = acpi_resource_consumer(&res);
+	if (!adev) {
+		pr_err("Could not find MMIO IOMMU\n");
+		return -EINVAL;
+	}
+	viommu->fwnode = &adev->fwnode;
+	return 0;
+}
+
+static struct viot_iommu * __init viot_get_iommu(unsigned int offset)
+{
+	int ret;
+	struct viot_iommu *viommu;
+	struct acpi_viot_header *hdr = ACPI_ADD_PTR(struct acpi_viot_header,
+						    viot, offset);
+	union {
+		struct acpi_viot_virtio_iommu_pci pci;
+		struct acpi_viot_virtio_iommu_mmio mmio;
+	} *node = (void *)hdr;
+
+	list_for_each_entry(viommu, &viot_iommus, list)
+		if (viommu->offset == offset)
+			return viommu;
+
+	if (viot_check_bounds(hdr))
+		return NULL;
+
+	viommu = kzalloc(sizeof(*viommu), GFP_KERNEL);
+	if (!viommu)
+		return NULL;
+
+	viommu->offset = offset;
+	switch (hdr->type) {
+	case ACPI_VIOT_NODE_VIRTIO_IOMMU_PCI:
+		if (hdr->length < sizeof(node->pci))
+			goto err_free;
+
+		ret = viot_get_pci_iommu_fwnode(viommu, node->pci.segment,
+						node->pci.bdf);
+		break;
+	case ACPI_VIOT_NODE_VIRTIO_IOMMU_MMIO:
+		if (hdr->length < sizeof(node->mmio))
+			goto err_free;
+
+		ret = viot_get_mmio_iommu_fwnode(viommu,
+						 node->mmio.base_address);
+		break;
+	default:
+		ret = -EINVAL;
+	}
+	if (ret)
+		goto err_free;
+
+	list_add(&viommu->list, &viot_iommus);
+	return viommu;
+
+err_free:
+	kfree(viommu);
+	return NULL;
+}
+
+static int __init viot_parse_node(const struct acpi_viot_header *hdr)
+{
+	int ret = -EINVAL;
+	struct list_head *list;
+	struct viot_endpoint *ep;
+	union {
+		struct acpi_viot_mmio mmio;
+		struct acpi_viot_pci_range pci;
+	} *node = (void *)hdr;
+
+	if (viot_check_bounds(hdr))
+		return -EINVAL;
+
+	if (hdr->type == ACPI_VIOT_NODE_VIRTIO_IOMMU_PCI ||
+	    hdr->type == ACPI_VIOT_NODE_VIRTIO_IOMMU_MMIO)
+		return 0;
+
+	ep = kzalloc(sizeof(*ep), GFP_KERNEL);
+	if (!ep)
+		return -ENOMEM;
+
+	switch (hdr->type) {
+	case ACPI_VIOT_NODE_PCI_RANGE:
+		if (hdr->length < sizeof(node->pci)) {
+			pr_err(FW_BUG "Invalid PCI node size\n");
+			goto err_free;
+		}
+
+		ep->segment_start = node->pci.segment_start;
+		ep->segment_end = node->pci.segment_end;
+		ep->bdf_start = node->pci.bdf_start;
+		ep->bdf_end = node->pci.bdf_end;
+		ep->endpoint_id = node->pci.endpoint_start;
+		ep->viommu = viot_get_iommu(node->pci.output_node);
+		list = &viot_pci_ranges;
+		break;
+	case ACPI_VIOT_NODE_MMIO:
+		if (hdr->length < sizeof(node->mmio)) {
+			pr_err(FW_BUG "Invalid MMIO node size\n");
+			goto err_free;
+		}
+
+		ep->address = node->mmio.base_address;
+		ep->endpoint_id = node->mmio.endpoint;
+		ep->viommu = viot_get_iommu(node->mmio.output_node);
+		list = &viot_mmio_endpoints;
+		break;
+	default:
+		pr_warn("Unsupported node %x\n", hdr->type);
+		ret = 0;
+		goto err_free;
+	}
+
+	/*
+	 * To be compatible with future versions of the table which may include
+	 * other node types, keep parsing.
+	 */
+	if (!ep->viommu) {
+		pr_warn("No IOMMU node found\n");
+		ret = 0;
+		goto err_free;
+	}
+
+	list_add(&ep->list, list);
+	return 0;
+
+err_free:
+	kfree(ep);
+	return ret;
+}
+
+/**
+ * acpi_viot_init - Parse the VIOT table
+ *
+ * Parse the VIOT table, prepare the list of endpoints to be used during DMA
+ * setup of devices.
+ */
+void __init acpi_viot_init(void)
+{
+	int i;
+	acpi_status status;
+	struct acpi_table_header *hdr;
+	struct acpi_viot_header *node;
+
+	status = acpi_get_table(ACPI_SIG_VIOT, 0, &hdr);
+	if (ACPI_FAILURE(status)) {
+		if (status != AE_NOT_FOUND) {
+			const char *msg = acpi_format_exception(status);
+
+			pr_err("Failed to get table, %s\n", msg);
+		}
+		return;
+	}
+
+	viot = (void *)hdr;
+
+	node = ACPI_ADD_PTR(struct acpi_viot_header, viot, viot->node_offset);
+	for (i = 0; i < viot->node_count; i++) {
+		if (viot_parse_node(node))
+			return;
+
+		node = ACPI_ADD_PTR(struct acpi_viot_header, node,
+				    node->length);
+	}
+}
+
+static int viot_dev_iommu_init(struct device *dev, struct viot_iommu *viommu,
+			       u32 epid)
+{
+	const struct iommu_ops *ops;
+
+	if (!viommu)
+		return -ENODEV;
+
+	/* We're not translating ourself */
+	if (viommu->fwnode == dev->fwnode)
+		return -EINVAL;
+
+	ops = iommu_ops_from_fwnode(viommu->fwnode);
+	if (!ops)
+		return IS_ENABLED(CONFIG_VIRTIO_IOMMU) ?
+			-EPROBE_DEFER : -ENODEV;
+
+	return acpi_iommu_fwspec_init(dev, epid, viommu->fwnode, ops);
+}
+
+static int viot_pci_dev_iommu_init(struct pci_dev *pdev, u16 dev_id, void *data)
+{
+	u32 epid;
+	struct viot_endpoint *ep;
+	u32 domain_nr = pci_domain_nr(pdev->bus);
+
+	list_for_each_entry(ep, &viot_pci_ranges, list) {
+		if (domain_nr >= ep->segment_start &&
+		    domain_nr <= ep->segment_end &&
+		    dev_id >= ep->bdf_start &&
+		    dev_id <= ep->bdf_end) {
+			epid = ((domain_nr - ep->segment_start) << 16) +
+				dev_id - ep->bdf_start + ep->endpoint_id;
+
+			/*
+			 * If we found a PCI range managed by the viommu, we're
+			 * the one that has to request ACS.
+			 */
+			pci_request_acs();
+
+			return viot_dev_iommu_init(&pdev->dev, ep->viommu,
+						   epid);
+		}
+	}
+	return -ENODEV;
+}
+
+static int viot_mmio_dev_iommu_init(struct platform_device *pdev)
+{
+	struct resource *mem;
+	struct viot_endpoint *ep;
+
+	mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!mem)
+		return -ENODEV;
+
+	list_for_each_entry(ep, &viot_mmio_endpoints, list) {
+		if (ep->address == mem->start)
+			return viot_dev_iommu_init(&pdev->dev, ep->viommu,
+						   ep->endpoint_id);
+	}
+	return -ENODEV;
+}
+
+/**
+ * viot_iommu_configure - Setup IOMMU ops for an endpoint described by VIOT
+ * @dev: the endpoint
+ *
+ * Return: 0 on success, <0 on failure
+ */
+int viot_iommu_configure(struct device *dev)
+{
+	if (dev_is_pci(dev))
+		return pci_for_each_dma_alias(to_pci_dev(dev),
+					      viot_pci_dev_iommu_init, NULL);
+	else if (dev_is_platform(dev))
+		return viot_mmio_dev_iommu_init(to_platform_device(dev));
+	return -ENODEV;
+}
diff --git a/MAINTAINERS b/MAINTAINERS
index b706dd20ff2b..8d71591f979a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -431,6 +431,14 @@  W:	https://01.org/linux-acpi
 B:	https://bugzilla.kernel.org
 F:	drivers/acpi/acpi_video.c
 
+ACPI VIOT DRIVER
+M:	Jean-Philippe Brucker <jean-philippe@linaro.org>
+L:	linux-acpi@vger.kernel.org
+L:	iommu@lists.linux-foundation.org
+S:	Maintained
+F:	drivers/acpi/viot.c
+F:	include/linux/acpi_viot.h
+
 ACPI WMI DRIVER
 L:	platform-driver-x86@vger.kernel.org
 S:	Orphan