From c12fa88c6d16ed3865072d91154cff6fd1cd9cd4 Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Thu, 10 Sep 2020 20:25:08 +0800 Subject: [PATCH 1/9] vfio: Fix typo of the device_state A typo fix ("_RUNNNG" => "_RUNNING") in comment block of the uapi header. Signed-off-by: Zenghui Yu Reviewed-by: Cornelia Huck Reviewed-by: Kirti Wankhede Signed-off-by: Alex Williamson --- include/uapi/linux/vfio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 920470502329..d4bd39e124bf 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -462,7 +462,7 @@ struct vfio_region_gfx_edid { * 5. Resumed * |--------->| * - * 0. Default state of VFIO device is _RUNNNG when the user application starts. + * 0. Default state of VFIO device is _RUNNING when the user application starts. * 1. During normal shutdown of the user application, the user application may * optionally change the VFIO device state from _RUNNING to _STOP. This * transition is optional. The vendor driver must support this transition but From eac7cc21c4e49ab7dfeef22e29abdee8432face9 Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Mon, 21 Sep 2020 12:51:15 +0800 Subject: [PATCH 2/9] vfio/pci: Remove redundant declaration of vfio_pci_driver It was added by commit 137e5531351d ("vfio/pci: Add sriov_configure support") but duplicates a forward declaration earlier in the file. Remove it. Signed-off-by: Zenghui Yu Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 1ab1f5cda4ac..da68e2f86622 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -1862,7 +1862,6 @@ static const struct vfio_device_ops vfio_pci_ops = { static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev); static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck); -static struct pci_driver vfio_pci_driver; static int vfio_pci_bus_notifier(struct notifier_block *nb, unsigned long action, void *data) From 12856e7acde4702b7c3238c15fcba86ff6aa507f Mon Sep 17 00:00:00 2001 From: Matthew Rosato Date: Thu, 10 Sep 2020 10:59:55 -0400 Subject: [PATCH 3/9] PCI/IOV: Mark VFs as not implementing PCI_COMMAND_MEMORY For VFs, the Memory Space Enable bit in the Command Register is hard-wired to 0. Add a new bit to signify devices where the Command Register Memory Space Enable bit does not control the device's response to MMIO accesses. Fixes: abafbc551fdd ("vfio-pci: Invalidate mmaps and block MMIO access on disabled memory") Signed-off-by: Matthew Rosato Acked-by: Bjorn Helgaas Signed-off-by: Alex Williamson --- drivers/pci/iov.c | 1 + include/linux/pci.h | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c index b37e08c4f9d1..4afd4ee4f7f0 100644 --- a/drivers/pci/iov.c +++ b/drivers/pci/iov.c @@ -180,6 +180,7 @@ int pci_iov_add_virtfn(struct pci_dev *dev, int id) virtfn->device = iov->vf_device; virtfn->is_virtfn = 1; virtfn->physfn = pci_dev_get(dev); + virtfn->no_command_memory = 1; if (id == 0) pci_read_vf_config_common(virtfn); diff --git a/include/linux/pci.h b/include/linux/pci.h index 835530605c0d..3ff723124ca7 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -445,6 +445,7 @@ struct pci_dev { unsigned int is_probed:1; /* Device probing in progress */ unsigned int link_active_reporting:1;/* Device capable of reporting link active */ unsigned int no_vf_scan:1; /* Don't scan for VFs after IOV enablement */ + unsigned int no_command_memory:1; /* No PCI_COMMAND_MEMORY */ pci_dev_flags_t dev_flags; atomic_t enable_cnt; /* pci_enable_device has been called */ From 7ef32e52368f62a4e041a4f0abefb4fb64e7fd4a Mon Sep 17 00:00:00 2001 From: Yan Zhao Date: Wed, 16 Sep 2020 10:28:33 +0800 Subject: [PATCH 4/9] vfio: add a singleton check for vfio_group_pin_pages Page pinning is used both to translate and pin device mappings for DMA purpose, as well as to indicate to the IOMMU backend to limit the dirty page scope to those pages that have been pinned, in the case of an IOMMU backed device. To support this, the vfio_pin_pages() interface limits itself to only singleton groups such that the IOMMU backend can consider dirty page scope only at the group level. Implement the same requirement for the vfio_group_pin_pages() interface. Fixes: 95fc87b44104 ("vfio: Selective dirty page tracking if IOMMU backed device pins pages") Signed-off-by: Yan Zhao Signed-off-by: Alex Williamson --- drivers/vfio/vfio.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 262ab0efd06c..532bcaf28c11 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -2051,6 +2051,9 @@ int vfio_group_pin_pages(struct vfio_group *group, if (!group || !user_iova_pfn || !phys_pfn || !npage) return -EINVAL; + if (group->dev_counter > 1) + return -EINVAL; + if (npage > VFIO_PIN_PAGES_MAX_ENTRIES) return -E2BIG; From 7d6e1329652ed971d1b6e0e7bea66fba5044e271 Mon Sep 17 00:00:00 2001 From: Matthew Rosato Date: Tue, 15 Sep 2020 15:05:18 -0400 Subject: [PATCH 5/9] vfio iommu: Add dma available capability Commit 492855939bdb ("vfio/type1: Limit DMA mappings per container") added the ability to limit the number of memory backed DMA mappings. However on s390x, when lazy mapping is in use, we use a very large number of concurrent mappings. Let's provide the current allowable number of DMA mappings to userspace via the IOMMU info chain so that userspace can take appropriate mitigation. Signed-off-by: Matthew Rosato Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson --- drivers/vfio/vfio_iommu_type1.c | 17 +++++++++++++++++ include/uapi/linux/vfio.h | 15 +++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 5fbf0c1f7433..15e21dbffb16 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -2609,6 +2609,20 @@ static int vfio_iommu_migration_build_caps(struct vfio_iommu *iommu, return vfio_info_add_capability(caps, &cap_mig.header, sizeof(cap_mig)); } +static int vfio_iommu_dma_avail_build_caps(struct vfio_iommu *iommu, + struct vfio_info_cap *caps) +{ + struct vfio_iommu_type1_info_dma_avail cap_dma_avail; + + cap_dma_avail.header.id = VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL; + cap_dma_avail.header.version = 1; + + cap_dma_avail.avail = iommu->dma_avail; + + return vfio_info_add_capability(caps, &cap_dma_avail.header, + sizeof(cap_dma_avail)); +} + static int vfio_iommu_type1_get_info(struct vfio_iommu *iommu, unsigned long arg) { @@ -2641,6 +2655,9 @@ static int vfio_iommu_type1_get_info(struct vfio_iommu *iommu, ret = vfio_iommu_migration_build_caps(iommu, &caps); + if (!ret) + ret = vfio_iommu_dma_avail_build_caps(iommu, &caps); + if (!ret) ret = vfio_iommu_iova_build_caps(iommu, &caps); diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 920470502329..3891e03d3af0 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -1039,6 +1039,21 @@ struct vfio_iommu_type1_info_cap_migration { __u64 max_dirty_bitmap_size; /* in bytes */ }; +/* + * The DMA available capability allows to report the current number of + * simultaneously outstanding DMA mappings that are allowed. + * + * The structure below defines version 1 of this capability. + * + * avail: specifies the current number of outstanding DMA mappings allowed. + */ +#define VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL 3 + +struct vfio_iommu_type1_info_dma_avail { + struct vfio_info_cap_header header; + __u32 avail; +}; + #define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12) /** From 08b6e22b850c28b6032da1e4d767a33116e23dfb Mon Sep 17 00:00:00 2001 From: Matthew Rosato Date: Thu, 10 Sep 2020 10:59:56 -0400 Subject: [PATCH 6/9] s390/pci: Mark all VFs as not implementing PCI_COMMAND_MEMORY For s390 we can have VFs that are passed-through without the associated PF. Firmware provides an emulation layer to allow these devices to operate independently, but is missing emulation of the Memory Space Enable bit. For these as well as linked VFs, set no_command_memory which specifies these devices do not implement PCI_COMMAND_MEMORY. Fixes: abafbc551fdd ("vfio-pci: Invalidate mmaps and block MMIO access on disabled memory") Signed-off-by: Matthew Rosato Reviewed-by: Niklas Schnelle Reviewed-by: Pierre Morel Signed-off-by: Alex Williamson --- arch/s390/pci/pci_bus.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/s390/pci/pci_bus.c b/arch/s390/pci/pci_bus.c index 5967f3014156..c93486a9989b 100644 --- a/arch/s390/pci/pci_bus.c +++ b/arch/s390/pci/pci_bus.c @@ -197,9 +197,10 @@ void pcibios_bus_add_device(struct pci_dev *pdev) * With pdev->no_vf_scan the common PCI probing code does not * perform PF/VF linking. */ - if (zdev->vfn) + if (zdev->vfn) { zpci_bus_setup_virtfn(zdev->zbus, pdev, zdev->vfn); - + pdev->no_command_memory = 1; + } } static int zpci_bus_add_device(struct zpci_bus *zbus, struct zpci_dev *zdev) From 515ecd5368f1510152fa4f9b9ce55b66ac56c334 Mon Sep 17 00:00:00 2001 From: Matthew Rosato Date: Thu, 10 Sep 2020 10:59:57 -0400 Subject: [PATCH 7/9] vfio/pci: Decouple PCI_COMMAND_MEMORY bit checks from is_virtfn While it is true that devices with is_virtfn=1 will have a Memory Space Enable bit that is hard-wired to 0, this is not the only case where we see this behavior -- For example some bare-metal hypervisors lack Memory Space Enable bit emulation for devices not setting is_virtfn (s390). Fix this by instead checking for the newly-added no_command_memory bit which directly denotes the need for PCI_COMMAND_MEMORY emulation in vfio. Fixes: abafbc551fdd ("vfio-pci: Invalidate mmaps and block MMIO access on disabled memory") Signed-off-by: Matthew Rosato Reviewed-by: Niklas Schnelle Reviewed-by: Pierre Morel Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_config.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index d98843feddce..5076d0155bc3 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -406,7 +406,7 @@ bool __vfio_pci_memory_enabled(struct vfio_pci_device *vdev) * PF SR-IOV capability, there's therefore no need to trigger * faults based on the virtual value. */ - return pdev->is_virtfn || (cmd & PCI_COMMAND_MEMORY); + return pdev->no_command_memory || (cmd & PCI_COMMAND_MEMORY); } /* @@ -520,8 +520,8 @@ static int vfio_basic_config_read(struct vfio_pci_device *vdev, int pos, count = vfio_default_config_read(vdev, pos, count, perm, offset, val); - /* Mask in virtual memory enable for SR-IOV devices */ - if (offset == PCI_COMMAND && vdev->pdev->is_virtfn) { + /* Mask in virtual memory enable */ + if (offset == PCI_COMMAND && vdev->pdev->no_command_memory) { u16 cmd = le16_to_cpu(*(__le16 *)&vdev->vconfig[PCI_COMMAND]); u32 tmp_val = le32_to_cpu(*val); @@ -589,9 +589,11 @@ static int vfio_basic_config_write(struct vfio_pci_device *vdev, int pos, * shows it disabled (phys_mem/io, then the device has * undergone some kind of backdoor reset and needs to be * restored before we allow it to enable the bars. - * SR-IOV devices will trigger this, but we catch them later + * SR-IOV devices will trigger this - for mem enable let's + * catch this now and for io enable it will be caught later */ - if ((new_mem && virt_mem && !phys_mem) || + if ((new_mem && virt_mem && !phys_mem && + !pdev->no_command_memory) || (new_io && virt_io && !phys_io) || vfio_need_bar_restore(vdev)) vfio_bar_restore(vdev); @@ -1734,12 +1736,14 @@ int vfio_config_init(struct vfio_pci_device *vdev) vconfig[PCI_INTERRUPT_PIN]); vconfig[PCI_INTERRUPT_PIN] = 0; /* Gratuitous for good VFs */ - + } + if (pdev->no_command_memory) { /* - * VFs do no implement the memory enable bit of the COMMAND - * register therefore we'll not have it set in our initial - * copy of config space after pci_enable_device(). For - * consistency with PFs, set the virtual enable bit here. + * VFs and devices that set pdev->no_command_memory do not + * implement the memory enable bit of the COMMAND register + * therefore we'll not have it set in our initial copy of + * config space after pci_enable_device(). For consistency + * with PFs, set the virtual enable bit here. */ *(__le16 *)&vconfig[PCI_COMMAND] |= cpu_to_le16(PCI_COMMAND_MEMORY); From 28b130244061863cf0437b7af1625fb45ec1a71e Mon Sep 17 00:00:00 2001 From: Yan Zhao Date: Wed, 16 Sep 2020 10:29:27 +0800 Subject: [PATCH 8/9] vfio: fix a missed vfio group put in vfio_pin_pages When error occurs, need to put vfio group after a successful get. Fixes: 95fc87b44104 ("vfio: Selective dirty page tracking if IOMMU backed device pins pages") Signed-off-by: Yan Zhao Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson --- drivers/vfio/vfio.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 532bcaf28c11..2151bc7f87ab 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -1949,8 +1949,10 @@ int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage, if (!group) return -ENODEV; - if (group->dev_counter > 1) - return -EINVAL; + if (group->dev_counter > 1) { + ret = -EINVAL; + goto err_pin_pages; + } ret = vfio_group_add_container_user(group); if (ret) From 2c5af98592f65517170c7bcc714566590d3f7397 Mon Sep 17 00:00:00 2001 From: Yan Zhao Date: Wed, 16 Sep 2020 10:30:05 +0800 Subject: [PATCH 9/9] vfio/type1: fix dirty bitmap calculation in vfio_dma_rw The count of dirtied pages is not only determined by count of copied pages, but also by the start offset. e.g. if offset = PAGE_SIZE - 1, and *copied=2, the dirty pages count is 2, instead of 1 or 0. Fixes: d6a4c185660c ("vfio iommu: Implementation of ioctl for dirty pages tracking") Signed-off-by: Yan Zhao Signed-off-by: Alex Williamson --- drivers/vfio/vfio_iommu_type1.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 5fbf0c1f7433..d0438388feeb 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -2933,7 +2933,8 @@ static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu, * size */ bitmap_set(dma->bitmap, offset >> pgshift, - *copied >> pgshift); + ((offset + *copied - 1) >> pgshift) - + (offset >> pgshift) + 1); } } else *copied = copy_from_user(data, (void __user *)vaddr,