Message ID | 20250513163438.3942405-8-tabba@google.com |
---|---|
State | New |
Headers | show |
Series | KVM: Mapping guest_memfd backed memory at the host for software protected VMs | expand |
Hi Fuad, On 5/14/25 2:34 AM, Fuad Tabba wrote: > This patch enables support for shared memory in guest_memfd, including > mapping that memory at the host userspace. This support is gated by the > configuration option KVM_GMEM_SHARED_MEM, and toggled by the guest_memfd > flag GUEST_MEMFD_FLAG_SUPPORT_SHARED, which can be set when creating a > guest_memfd instance. > > Co-developed-by: Ackerley Tng <ackerleytng@google.com> > Signed-off-by: Ackerley Tng <ackerleytng@google.com> > Signed-off-by: Fuad Tabba <tabba@google.com> > --- > arch/x86/include/asm/kvm_host.h | 10 ++++ > include/linux/kvm_host.h | 13 +++++ > include/uapi/linux/kvm.h | 1 + > virt/kvm/Kconfig | 5 ++ > virt/kvm/guest_memfd.c | 88 +++++++++++++++++++++++++++++++++ > 5 files changed, 117 insertions(+) > [...] > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h > index b6ae8ad8934b..9857022a0f0c 100644 > --- a/include/uapi/linux/kvm.h > +++ b/include/uapi/linux/kvm.h > @@ -1566,6 +1566,7 @@ struct kvm_memory_attributes { > #define KVM_MEMORY_ATTRIBUTE_PRIVATE (1ULL << 3) > > #define KVM_CREATE_GUEST_MEMFD _IOWR(KVMIO, 0xd4, struct kvm_create_guest_memfd) > +#define GUEST_MEMFD_FLAG_SUPPORT_SHARED (1UL << 0) > This would be (1ULL << 0) to be consistent with '__u64 struct kvm_create_guest_memfd::flags' Thanks, Gavin
Hi Fuad, On 5/14/25 2:34 AM, Fuad Tabba wrote: > This patch enables support for shared memory in guest_memfd, including > mapping that memory at the host userspace. This support is gated by the > configuration option KVM_GMEM_SHARED_MEM, and toggled by the guest_memfd > flag GUEST_MEMFD_FLAG_SUPPORT_SHARED, which can be set when creating a > guest_memfd instance. > > Co-developed-by: Ackerley Tng <ackerleytng@google.com> > Signed-off-by: Ackerley Tng <ackerleytng@google.com> > Signed-off-by: Fuad Tabba <tabba@google.com> > --- > arch/x86/include/asm/kvm_host.h | 10 ++++ > include/linux/kvm_host.h | 13 +++++ > include/uapi/linux/kvm.h | 1 + > virt/kvm/Kconfig | 5 ++ > virt/kvm/guest_memfd.c | 88 +++++++++++++++++++++++++++++++++ > 5 files changed, 117 insertions(+) > [...] > diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c > index 6db515833f61..8e6d1866b55e 100644 > --- a/virt/kvm/guest_memfd.c > +++ b/virt/kvm/guest_memfd.c > @@ -312,7 +312,88 @@ static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn) > return gfn - slot->base_gfn + slot->gmem.pgoff; > } > > +#ifdef CONFIG_KVM_GMEM_SHARED_MEM > + > +static bool kvm_gmem_supports_shared(struct inode *inode) > +{ > + uint64_t flags = (uint64_t)inode->i_private; > + > + return flags & GUEST_MEMFD_FLAG_SUPPORT_SHARED; > +} > + > +static vm_fault_t kvm_gmem_fault_shared(struct vm_fault *vmf) > +{ > + struct inode *inode = file_inode(vmf->vma->vm_file); > + struct folio *folio; > + vm_fault_t ret = VM_FAULT_LOCKED; > + > + filemap_invalidate_lock_shared(inode->i_mapping); > + > + folio = kvm_gmem_get_folio(inode, vmf->pgoff); > + if (IS_ERR(folio)) { > + int err = PTR_ERR(folio); > + > + if (err == -EAGAIN) > + ret = VM_FAULT_RETRY; > + else > + ret = vmf_error(err); > + > + goto out_filemap; > + } > + > + if (folio_test_hwpoison(folio)) { > + ret = VM_FAULT_HWPOISON; > + goto out_folio; > + } > + > + if (WARN_ON_ONCE(folio_test_large(folio))) { > + ret = VM_FAULT_SIGBUS; > + goto out_folio; > + } > + I don't think there is a large folio involved since the max/min folio order (stored in struct address_space::flags) should have been set to 0, meaning only order-0 is possible when the folio (page) is allocated and added to the page-cache. More details can be referred to AS_FOLIO_ORDER_MASK. It's unnecessary check but not harmful. Maybe a comment is needed to mention large folio isn't around yet, but double confirm. > + if (!folio_test_uptodate(folio)) { > + clear_highpage(folio_page(folio, 0)); > + kvm_gmem_mark_prepared(folio); > + } > + I must be missing some thing here. This chunk of code is out of sync to kvm_gmem_get_pfn(), where kvm_gmem_prepare_folio() and kvm_arch_gmem_prepare() are executed, and then PG_uptodate is set after that. In the latest ARM CCA series, kvm_arch_gmem_prepare() isn't used, but it would delegate the folio (page) with the prerequisite that the folio belongs to the private address space. I guess that kvm_arch_gmem_prepare() is skipped here because we have the assumption that the folio belongs to the shared address space? However, this assumption isn't always true. We probably need to ensure the folio range is really belonging to the shared address space by poking kvm->mem_attr_array, which can be modified by VMM through ioctl KVM_SET_MEMORY_ATTRIBUTES. > + vmf->page = folio_file_page(folio, vmf->pgoff); > + > +out_folio: > + if (ret != VM_FAULT_LOCKED) { > + folio_unlock(folio); > + folio_put(folio); > + } > + > +out_filemap: > + filemap_invalidate_unlock_shared(inode->i_mapping); > + > + return ret; > +} > + Thanks, Gavin
On Tue, May 13, 2025 at 11:37 AM Ackerley Tng <ackerleytng@google.com> wrote: > > Fuad Tabba <tabba@google.com> writes: > > diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c > > index 6db515833f61..8e6d1866b55e 100644 > > --- a/virt/kvm/guest_memfd.c > > +++ b/virt/kvm/guest_memfd.c > > @@ -312,7 +312,88 @@ static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn) > > return gfn - slot->base_gfn + slot->gmem.pgoff; > > } > > > > +#ifdef CONFIG_KVM_GMEM_SHARED_MEM > > + > > +static bool kvm_gmem_supports_shared(struct inode *inode) > > +{ > > + uint64_t flags = (uint64_t)inode->i_private; > > + > > + return flags & GUEST_MEMFD_FLAG_SUPPORT_SHARED; > > +} > > + > > +static vm_fault_t kvm_gmem_fault_shared(struct vm_fault *vmf) > > +{ > > + struct inode *inode = file_inode(vmf->vma->vm_file); > > + struct folio *folio; > > + vm_fault_t ret = VM_FAULT_LOCKED; > > + > > + filemap_invalidate_lock_shared(inode->i_mapping); > > + > > + folio = kvm_gmem_get_folio(inode, vmf->pgoff); > > + if (IS_ERR(folio)) { > > + int err = PTR_ERR(folio); > > + > > + if (err == -EAGAIN) > > + ret = VM_FAULT_RETRY; > > + else > > + ret = vmf_error(err); > > + > > + goto out_filemap; > > + } > > + > > + if (folio_test_hwpoison(folio)) { > > + ret = VM_FAULT_HWPOISON; > > + goto out_folio; > > + } nit: shmem_fault() does not include an equivalent of the above HWPOISON check, and __do_fault() already handles HWPOISON. It's very unlikely for `folio` to be hwpoison and not up-to-date, and even then, writing over poison (to zero the folio) is not usually fatal. > > + > > + if (WARN_ON_ONCE(folio_test_large(folio))) { > > + ret = VM_FAULT_SIGBUS; > > + goto out_folio; > > + } nit: I would prefer we remove this SIGBUS bit and change the below clearing logic to handle large folios. Up to you I suppose. > > + > > + if (!folio_test_uptodate(folio)) { > > + clear_highpage(folio_page(folio, 0)); > > + kvm_gmem_mark_prepared(folio); > > + } > > + > > + vmf->page = folio_file_page(folio, vmf->pgoff); > > + > > +out_folio: > > + if (ret != VM_FAULT_LOCKED) { > > + folio_unlock(folio); > > + folio_put(folio); > > + } > > + > > +out_filemap: > > + filemap_invalidate_unlock_shared(inode->i_mapping); > > Do we need to hold the filemap_invalidate_lock while zeroing? Would > holding the folio lock be enough? Do we need to hold the filemap_invalidate_lock for reading *at all*? I don't see why we need it. We're not checking gmem->bindings, and filemap_grab_folio() already synchronizes with filemap removal properly. > > > + > > + return ret; > > +}
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 709cc2a7ba66..f72722949cae 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2255,8 +2255,18 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, #ifdef CONFIG_KVM_GMEM #define kvm_arch_supports_gmem(kvm) ((kvm)->arch.supports_gmem) + +/* + * CoCo VMs with hardware support that use guest_memfd only for backing private + * memory, e.g., TDX, cannot use guest_memfd with userspace mapping enabled. + */ +#define kvm_arch_vm_supports_gmem_shared_mem(kvm) \ + (IS_ENABLED(CONFIG_KVM_GMEM_SHARED_MEM) && \ + ((kvm)->arch.vm_type == KVM_X86_SW_PROTECTED_VM || \ + (kvm)->arch.vm_type == KVM_X86_DEFAULT_VM)) #else #define kvm_arch_supports_gmem(kvm) false +#define kvm_arch_vm_supports_gmem_shared_mem(kvm) false #endif #define kvm_arch_has_readonly_mem(kvm) (!(kvm)->arch.has_protected_state) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index ae70e4e19700..2ec89c214978 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -729,6 +729,19 @@ static inline bool kvm_arch_supports_gmem(struct kvm *kvm) } #endif +/* + * Returns true if this VM supports shared mem in guest_memfd. + * + * Arch code must define kvm_arch_vm_supports_gmem_shared_mem if support for + * guest_memfd is enabled. + */ +#if !defined(kvm_arch_vm_supports_gmem_shared_mem) && !IS_ENABLED(CONFIG_KVM_GMEM) +static inline bool kvm_arch_vm_supports_gmem_shared_mem(struct kvm *kvm) +{ + return false; +} +#endif + #ifndef kvm_arch_has_readonly_mem static inline bool kvm_arch_has_readonly_mem(struct kvm *kvm) { diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index b6ae8ad8934b..9857022a0f0c 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1566,6 +1566,7 @@ struct kvm_memory_attributes { #define KVM_MEMORY_ATTRIBUTE_PRIVATE (1ULL << 3) #define KVM_CREATE_GUEST_MEMFD _IOWR(KVMIO, 0xd4, struct kvm_create_guest_memfd) +#define GUEST_MEMFD_FLAG_SUPPORT_SHARED (1UL << 0) struct kvm_create_guest_memfd { __u64 size; diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 559c93ad90be..f4e469a62a60 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -128,3 +128,8 @@ config HAVE_KVM_ARCH_GMEM_PREPARE config HAVE_KVM_ARCH_GMEM_INVALIDATE bool depends on KVM_GMEM + +config KVM_GMEM_SHARED_MEM + select KVM_GMEM + bool + prompt "Enables in-place shared memory for guest_memfd" diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 6db515833f61..8e6d1866b55e 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -312,7 +312,88 @@ static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn) return gfn - slot->base_gfn + slot->gmem.pgoff; } +#ifdef CONFIG_KVM_GMEM_SHARED_MEM + +static bool kvm_gmem_supports_shared(struct inode *inode) +{ + uint64_t flags = (uint64_t)inode->i_private; + + return flags & GUEST_MEMFD_FLAG_SUPPORT_SHARED; +} + +static vm_fault_t kvm_gmem_fault_shared(struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vmf->vma->vm_file); + struct folio *folio; + vm_fault_t ret = VM_FAULT_LOCKED; + + filemap_invalidate_lock_shared(inode->i_mapping); + + folio = kvm_gmem_get_folio(inode, vmf->pgoff); + if (IS_ERR(folio)) { + int err = PTR_ERR(folio); + + if (err == -EAGAIN) + ret = VM_FAULT_RETRY; + else + ret = vmf_error(err); + + goto out_filemap; + } + + if (folio_test_hwpoison(folio)) { + ret = VM_FAULT_HWPOISON; + goto out_folio; + } + + if (WARN_ON_ONCE(folio_test_large(folio))) { + ret = VM_FAULT_SIGBUS; + goto out_folio; + } + + if (!folio_test_uptodate(folio)) { + clear_highpage(folio_page(folio, 0)); + kvm_gmem_mark_prepared(folio); + } + + vmf->page = folio_file_page(folio, vmf->pgoff); + +out_folio: + if (ret != VM_FAULT_LOCKED) { + folio_unlock(folio); + folio_put(folio); + } + +out_filemap: + filemap_invalidate_unlock_shared(inode->i_mapping); + + return ret; +} + +static const struct vm_operations_struct kvm_gmem_vm_ops = { + .fault = kvm_gmem_fault_shared, +}; + +static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma) +{ + if (!kvm_gmem_supports_shared(file_inode(file))) + return -ENODEV; + + if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) != + (VM_SHARED | VM_MAYSHARE)) { + return -EINVAL; + } + + vma->vm_ops = &kvm_gmem_vm_ops; + + return 0; +} +#else +#define kvm_gmem_mmap NULL +#endif /* CONFIG_KVM_GMEM_SHARED_MEM */ + static struct file_operations kvm_gmem_fops = { + .mmap = kvm_gmem_mmap, .open = generic_file_open, .release = kvm_gmem_release, .fallocate = kvm_gmem_fallocate, @@ -463,6 +544,9 @@ int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args) u64 flags = args->flags; u64 valid_flags = 0; + if (kvm_arch_vm_supports_gmem_shared_mem(kvm)) + valid_flags |= GUEST_MEMFD_FLAG_SUPPORT_SHARED; + if (flags & ~valid_flags) return -EINVAL; @@ -501,6 +585,10 @@ int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot, offset + size > i_size_read(inode)) goto err; + if (kvm_gmem_supports_shared(inode) && + !kvm_arch_vm_supports_gmem_shared_mem(kvm)) + goto err; + filemap_invalidate_lock(inode->i_mapping); start = offset >> PAGE_SHIFT;