diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index fc5736839edd6c..aa70277dffb966 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6441,15 +6441,18 @@ a single guest_memfd file, but the bound ranges must not overlap). The capability KVM_CAP_GUEST_MEMFD_FLAGS enumerates the `flags` that can be specified via KVM_CREATE_GUEST_MEMFD. Currently defined flags: - ============================ ================================================ - GUEST_MEMFD_FLAG_MMAP Enable using mmap() on the guest_memfd file - descriptor. - GUEST_MEMFD_FLAG_INIT_SHARED Make all memory in the file shared during - KVM_CREATE_GUEST_MEMFD (memory files created - without INIT_SHARED will be marked private). - Shared memory can be faulted into host userspace - page tables. Private memory cannot. - ============================ ================================================ + ============================== ================================================ + GUEST_MEMFD_FLAG_MMAP Enable using mmap() on the guest_memfd file + descriptor. + GUEST_MEMFD_FLAG_INIT_SHARED Make all memory in the file shared during + KVM_CREATE_GUEST_MEMFD (memory files created + without INIT_SHARED will be marked private). + Shared memory can be faulted into host userspace + page tables. Private memory cannot. + GUEST_MEMFD_FLAG_NO_DIRECT_MAP The guest_memfd instance will unmap the memory + backing it from the kernel's address space + before passing it off to userspace or the guest. 
+ ============================== ================================================ When the KVM MMU performs a PFN lookup to service a guest fault and the backing guest_memfd has the GUEST_MEMFD_FLAG_MMAP set, then the fault will always be diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 5d5a3bbdb95e4b..681090dc2c2e1b 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -1678,6 +1679,18 @@ static __always_inline enum fgt_group_id __fgt_reg_to_group_id(enum vcpu_sysreg \ p; \ }) +#ifdef CONFIG_KVM_GUEST_MEMFD +static inline bool kvm_arch_gmem_supports_no_direct_map(struct kvm *kvm) +{ + /* + * Without FWB, direct map access is needed in kvm_pgtable_stage2_map(), + * as it calls dcache_clean_inval_poc(). + */ + return can_set_direct_map() && cpus_have_final_cap(ARM64_HAS_STAGE2_FWB); +} +#define kvm_arch_gmem_supports_no_direct_map kvm_arch_gmem_supports_no_direct_map +#endif /* CONFIG_KVM_GUEST_MEMFD */ + long kvm_get_cap_for_kvm_ioctl(unsigned int ioctl, long *ext); diff --git a/arch/arm64/include/asm/set_memory.h b/arch/arm64/include/asm/set_memory.h index 90f61b17275e1b..c71a2a6812c4dc 100644 --- a/arch/arm64/include/asm/set_memory.h +++ b/arch/arm64/include/asm/set_memory.h @@ -11,9 +11,10 @@ bool can_set_direct_map(void); int set_memory_valid(unsigned long addr, int numpages, int enable); -int set_direct_map_invalid_noflush(struct page *page); -int set_direct_map_default_noflush(struct page *page); -int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid); +int set_direct_map_invalid_noflush(const void *addr); +int set_direct_map_default_noflush(const void *addr); +int set_direct_map_valid_noflush(const void *addr, unsigned long numpages, + bool valid); bool kernel_page_present(struct page *page); int set_memory_encrypted(unsigned long addr, int numpages); diff --git 
a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 358d1dc9a576f0..5aff94e1f8b215 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -245,7 +245,7 @@ int set_memory_valid(unsigned long addr, int numpages, int enable) __pgprot(PTE_VALID)); } -int set_direct_map_invalid_noflush(struct page *page) +int set_direct_map_invalid_noflush(const void *addr) { pgprot_t clear_mask = __pgprot(PTE_VALID); pgprot_t set_mask = __pgprot(0); @@ -253,11 +253,11 @@ int set_direct_map_invalid_noflush(struct page *page) if (!can_set_direct_map()) return 0; - return update_range_prot((unsigned long)page_address(page), - PAGE_SIZE, set_mask, clear_mask); + return update_range_prot((unsigned long)addr, PAGE_SIZE, set_mask, + clear_mask); } -int set_direct_map_default_noflush(struct page *page) +int set_direct_map_default_noflush(const void *addr) { pgprot_t set_mask = __pgprot(PTE_VALID | PTE_WRITE); pgprot_t clear_mask = __pgprot(PTE_RDONLY); @@ -265,8 +265,8 @@ int set_direct_map_default_noflush(struct page *page) if (!can_set_direct_map()) return 0; - return update_range_prot((unsigned long)page_address(page), - PAGE_SIZE, set_mask, clear_mask); + return update_range_prot((unsigned long)addr, PAGE_SIZE, set_mask, + clear_mask); } static int __set_memory_enc_dec(unsigned long addr, @@ -349,14 +349,13 @@ int realm_register_memory_enc_ops(void) return arm64_mem_crypt_ops_register(&realm_crypt_ops); } -int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid) +int set_direct_map_valid_noflush(const void *addr, unsigned long numpages, + bool valid) { - unsigned long addr = (unsigned long)page_address(page); - if (!can_set_direct_map()) return 0; - return set_memory_valid(addr, nr, valid); + return set_memory_valid((unsigned long)addr, numpages, valid); } #ifdef CONFIG_DEBUG_PAGEALLOC diff --git a/arch/loongarch/include/asm/set_memory.h b/arch/loongarch/include/asm/set_memory.h index 55dfaefd02c8a6..5e9b67b2fea151 100644 --- 
a/arch/loongarch/include/asm/set_memory.h +++ b/arch/loongarch/include/asm/set_memory.h @@ -15,8 +15,9 @@ int set_memory_ro(unsigned long addr, int numpages); int set_memory_rw(unsigned long addr, int numpages); bool kernel_page_present(struct page *page); -int set_direct_map_default_noflush(struct page *page); -int set_direct_map_invalid_noflush(struct page *page); -int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid); +int set_direct_map_invalid_noflush(const void *addr); +int set_direct_map_default_noflush(const void *addr); +int set_direct_map_valid_noflush(const void *addr, unsigned long numpages, + bool valid); #endif /* _ASM_LOONGARCH_SET_MEMORY_H */ diff --git a/arch/loongarch/mm/pageattr.c b/arch/loongarch/mm/pageattr.c index f5e910b68229d3..9e08905d36242e 100644 --- a/arch/loongarch/mm/pageattr.c +++ b/arch/loongarch/mm/pageattr.c @@ -198,32 +198,29 @@ bool kernel_page_present(struct page *page) return pte_present(ptep_get(pte)); } -int set_direct_map_default_noflush(struct page *page) +int set_direct_map_default_noflush(const void *addr) { - unsigned long addr = (unsigned long)page_address(page); - - if (addr < vm_map_base) + if ((unsigned long)addr < vm_map_base) return 0; - return __set_memory(addr, 1, PAGE_KERNEL, __pgprot(0)); + return __set_memory((unsigned long)addr, 1, PAGE_KERNEL, __pgprot(0)); } -int set_direct_map_invalid_noflush(struct page *page) +int set_direct_map_invalid_noflush(const void *addr) { - unsigned long addr = (unsigned long)page_address(page); - - if (addr < vm_map_base) + if ((unsigned long)addr < vm_map_base) return 0; - return __set_memory(addr, 1, __pgprot(0), __pgprot(_PAGE_PRESENT | _PAGE_VALID)); + return __set_memory((unsigned long)addr, 1, __pgprot(0), + __pgprot(_PAGE_PRESENT | _PAGE_VALID)); } -int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid) +int set_direct_map_valid_noflush(const void *addr, unsigned long numpages, + bool valid) { - unsigned long addr = (unsigned 
long)page_address(page); pgprot_t set, clear; - if (addr < vm_map_base) + if ((unsigned long)addr < vm_map_base) return 0; if (valid) { @@ -234,5 +231,5 @@ int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid) clear = __pgprot(_PAGE_PRESENT | _PAGE_VALID); } - return __set_memory(addr, 1, set, clear); + return __set_memory((unsigned long)addr, 1, set, clear); } diff --git a/arch/riscv/include/asm/set_memory.h b/arch/riscv/include/asm/set_memory.h index 87389e93325a3b..a87eabd7fc78a5 100644 --- a/arch/riscv/include/asm/set_memory.h +++ b/arch/riscv/include/asm/set_memory.h @@ -40,9 +40,10 @@ static inline int set_kernel_memory(char *startp, char *endp, } #endif -int set_direct_map_invalid_noflush(struct page *page); -int set_direct_map_default_noflush(struct page *page); -int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid); +int set_direct_map_invalid_noflush(const void *addr); +int set_direct_map_default_noflush(const void *addr); +int set_direct_map_valid_noflush(const void *addr, unsigned long numpages, + bool valid); bool kernel_page_present(struct page *page); #endif /* __ASSEMBLER__ */ diff --git a/arch/riscv/mm/pageattr.c b/arch/riscv/mm/pageattr.c index 3f76db3d276992..0a457177a88c69 100644 --- a/arch/riscv/mm/pageattr.c +++ b/arch/riscv/mm/pageattr.c @@ -374,19 +374,20 @@ int set_memory_nx(unsigned long addr, int numpages) return __set_memory(addr, numpages, __pgprot(0), __pgprot(_PAGE_EXEC)); } -int set_direct_map_invalid_noflush(struct page *page) +int set_direct_map_invalid_noflush(const void *addr) { - return __set_memory((unsigned long)page_address(page), 1, - __pgprot(0), __pgprot(_PAGE_PRESENT)); + return __set_memory((unsigned long)addr, 1, __pgprot(0), + __pgprot(_PAGE_PRESENT)); } -int set_direct_map_default_noflush(struct page *page) +int set_direct_map_default_noflush(const void *addr) { - return __set_memory((unsigned long)page_address(page), 1, - PAGE_KERNEL, __pgprot(_PAGE_EXEC)); + return 
__set_memory((unsigned long)addr, 1, PAGE_KERNEL, + __pgprot(_PAGE_EXEC)); } -int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid) +int set_direct_map_valid_noflush(const void *addr, unsigned long numpages, + bool valid) { pgprot_t set, clear; @@ -398,7 +399,7 @@ int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid) clear = __pgprot(_PAGE_PRESENT); } - return __set_memory((unsigned long)page_address(page), nr, set, clear); + return __set_memory((unsigned long)addr, numpages, set, clear); } #ifdef CONFIG_DEBUG_PAGEALLOC diff --git a/arch/s390/include/asm/set_memory.h b/arch/s390/include/asm/set_memory.h index 94092f4ae76499..3e43c3c96e67be 100644 --- a/arch/s390/include/asm/set_memory.h +++ b/arch/s390/include/asm/set_memory.h @@ -60,9 +60,10 @@ __SET_MEMORY_FUNC(set_memory_rox, SET_MEMORY_RO | SET_MEMORY_X) __SET_MEMORY_FUNC(set_memory_rwnx, SET_MEMORY_RW | SET_MEMORY_NX) __SET_MEMORY_FUNC(set_memory_4k, SET_MEMORY_4K) -int set_direct_map_invalid_noflush(struct page *page); -int set_direct_map_default_noflush(struct page *page); -int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid); +int set_direct_map_invalid_noflush(const void *addr); +int set_direct_map_default_noflush(const void *addr); +int set_direct_map_valid_noflush(const void *addr, unsigned long numpages, + bool valid); bool kernel_page_present(struct page *page); #endif diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c index bb29c38ae6241d..8e90ff5cf50df2 100644 --- a/arch/s390/mm/pageattr.c +++ b/arch/s390/mm/pageattr.c @@ -383,17 +383,18 @@ int __set_memory(unsigned long addr, unsigned long numpages, unsigned long flags return rc; } -int set_direct_map_invalid_noflush(struct page *page) +int set_direct_map_invalid_noflush(const void *addr) { - return __set_memory((unsigned long)page_to_virt(page), 1, SET_MEMORY_INV); + return __set_memory((unsigned long)addr, 1, SET_MEMORY_INV); } -int 
set_direct_map_default_noflush(struct page *page) +int set_direct_map_default_noflush(const void *addr) { - return __set_memory((unsigned long)page_to_virt(page), 1, SET_MEMORY_DEF); + return __set_memory((unsigned long)addr, 1, SET_MEMORY_DEF); } -int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid) +int set_direct_map_valid_noflush(const void *addr, unsigned long numpages, + bool valid) { unsigned long flags; @@ -402,7 +403,7 @@ int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid) else flags = SET_MEMORY_INV; - return __set_memory((unsigned long)page_to_virt(page), nr, flags); + return __set_memory((unsigned long)addr, numpages, flags); } bool kernel_page_present(struct page *page) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ff07c45e3c731a..2d89340bfc6cd9 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -2503,4 +2504,9 @@ static inline bool kvm_arch_has_irq_bypass(void) return enable_device_posted_irqs; } +#ifdef CONFIG_KVM_GUEST_MEMFD +bool kvm_arch_gmem_supports_no_direct_map(struct kvm *kvm); +#define kvm_arch_gmem_supports_no_direct_map kvm_arch_gmem_supports_no_direct_map +#endif /* CONFIG_KVM_GUEST_MEMFD */ + #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index 4362c26aa992db..b6a4173ff2491b 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -86,9 +86,10 @@ int set_pages_wb(struct page *page, int numpages); int set_pages_ro(struct page *page, int numpages); int set_pages_rw(struct page *page, int numpages); -int set_direct_map_invalid_noflush(struct page *page); -int set_direct_map_default_noflush(struct page *page); -int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid); +int set_direct_map_invalid_noflush(const void *addr); +int 
set_direct_map_default_noflush(const void *addr); +int set_direct_map_valid_noflush(const void *addr, unsigned long numpages, + bool valid); bool kernel_page_present(struct page *page); extern int kernel_set_to_readonly; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 391f4a5ce6dd18..0ed9fa1c5611a8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -14080,6 +14080,11 @@ void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) kvm_x86_call(gmem_invalidate)(start, end); } #endif + +bool kvm_arch_gmem_supports_no_direct_map(struct kvm *kvm) +{ + return can_set_direct_map() && kvm->arch.vm_type != KVM_X86_TDX_VM; +} #endif int kvm_spec_ctrl_test_value(u64 value) diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 40581a720fe829..6aea1f470fd5f6 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -2587,9 +2587,9 @@ int set_pages_rw(struct page *page, int numpages) return set_memory_rw(addr, numpages); } -static int __set_pages_p(struct page *page, int numpages) +static int __set_pages_p(const void *addr, int numpages) { - unsigned long tempaddr = (unsigned long) page_address(page); + unsigned long tempaddr = (unsigned long)addr; struct cpa_data cpa = { .vaddr = &tempaddr, .pgd = NULL, .numpages = numpages, @@ -2606,9 +2606,9 @@ static int __set_pages_p(struct page *page, int numpages) return __change_page_attr_set_clr(&cpa, 1); } -static int __set_pages_np(struct page *page, int numpages) +static int __set_pages_np(const void *addr, int numpages) { - unsigned long tempaddr = (unsigned long) page_address(page); + unsigned long tempaddr = (unsigned long)addr; struct cpa_data cpa = { .vaddr = &tempaddr, .pgd = NULL, .numpages = numpages, @@ -2625,22 +2625,23 @@ static int __set_pages_np(struct page *page, int numpages) return __change_page_attr_set_clr(&cpa, 1); } -int set_direct_map_invalid_noflush(struct page *page) +int set_direct_map_invalid_noflush(const void *addr) { - return 
__set_pages_np(page, 1); + return __set_pages_np(addr, 1); } -int set_direct_map_default_noflush(struct page *page) +int set_direct_map_default_noflush(const void *addr) { - return __set_pages_p(page, 1); + return __set_pages_p(addr, 1); } -int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid) +int set_direct_map_valid_noflush(const void *addr, unsigned long numpages, + bool valid) { if (valid) - return __set_pages_p(page, nr); + return __set_pages_p(addr, numpages); - return __set_pages_np(page, nr); + return __set_pages_np(addr, numpages); } #ifdef CONFIG_DEBUG_PAGEALLOC diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index d42a95cbcfbc3c..d8747ec7f8a219 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -740,10 +740,22 @@ static inline u64 kvm_gmem_get_supported_flags(struct kvm *kvm) if (!kvm || kvm_arch_supports_gmem_init_shared(kvm)) flags |= GUEST_MEMFD_FLAG_INIT_SHARED; + if (!kvm || kvm_arch_gmem_supports_no_direct_map(kvm)) + flags |= GUEST_MEMFD_FLAG_NO_DIRECT_MAP; + return flags; } #endif +#ifdef CONFIG_KVM_GUEST_MEMFD +#ifndef kvm_arch_gmem_supports_no_direct_map +static inline bool kvm_arch_gmem_supports_no_direct_map(struct kvm *kvm) +{ + return false; +} +#endif +#endif /* CONFIG_KVM_GUEST_MEMFD */ + #ifndef kvm_arch_has_readonly_mem static inline bool kvm_arch_has_readonly_mem(struct kvm *kvm) { @@ -2605,6 +2617,8 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end); +#else +static inline void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) { } #endif #ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index ec442af3f88613..68c075502d91ce 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -211,6 +211,7 @@ enum mapping_flags { AS_KERNEL_FILE = 10, /* mapping for a fake 
kernel file that shouldn't account usage to user cgroups */ AS_NO_DATA_INTEGRITY = 11, /* no data integrity guarantees */ + AS_NO_DIRECT_MAP = 12, /* Folios in the mapping are not in the direct map */ /* Bits 16-25 are used for FOLIO_ORDER */ AS_FOLIO_ORDER_BITS = 5, AS_FOLIO_ORDER_MIN = 16, @@ -356,6 +357,21 @@ static inline bool mapping_no_data_integrity(const struct address_space *mapping return test_bit(AS_NO_DATA_INTEGRITY, &mapping->flags); } +static inline void mapping_set_no_direct_map(struct address_space *mapping) +{ + set_bit(AS_NO_DIRECT_MAP, &mapping->flags); +} + +static inline bool mapping_no_direct_map(const struct address_space *mapping) +{ + return test_bit(AS_NO_DIRECT_MAP, &mapping->flags); +} + +static inline bool vma_has_no_direct_map(const struct vm_area_struct *vma) +{ + return vma->vm_file && mapping_no_direct_map(vma->vm_file->f_mapping); +} + static inline gfp_t mapping_gfp_mask(const struct address_space *mapping) { return mapping->gfp_mask; diff --git a/include/linux/secretmem.h b/include/linux/secretmem.h index e918f96881f569..0ae1fb057b3d28 100644 --- a/include/linux/secretmem.h +++ b/include/linux/secretmem.h @@ -4,28 +4,10 @@ #ifdef CONFIG_SECRETMEM -extern const struct address_space_operations secretmem_aops; - -static inline bool secretmem_mapping(struct address_space *mapping) -{ - return mapping->a_ops == &secretmem_aops; -} - -bool vma_is_secretmem(struct vm_area_struct *vma); bool secretmem_active(void); #else -static inline bool vma_is_secretmem(struct vm_area_struct *vma) -{ - return false; -} - -static inline bool secretmem_mapping(struct address_space *mapping) -{ - return false; -} - static inline bool secretmem_active(void) { return false; diff --git a/include/linux/set_memory.h b/include/linux/set_memory.h index 3030d9245f5ac8..24caea2931f9a1 100644 --- a/include/linux/set_memory.h +++ b/include/linux/set_memory.h @@ -25,21 +25,31 @@ static inline int set_memory_rox(unsigned long addr, int numpages) #endif #ifndef 
CONFIG_ARCH_HAS_SET_DIRECT_MAP -static inline int set_direct_map_invalid_noflush(struct page *page) +static inline int set_direct_map_invalid_noflush(const void *addr) { return 0; } -static inline int set_direct_map_default_noflush(struct page *page) +static inline int set_direct_map_default_noflush(const void *addr) { return 0; } -static inline int set_direct_map_valid_noflush(struct page *page, - unsigned nr, bool valid) +static inline int set_direct_map_valid_noflush(const void *addr, + unsigned long numpages, + bool valid) { return 0; } +static inline int folio_zap_direct_map(struct folio *folio) +{ + return 0; +} + +static inline void folio_restore_direct_map(struct folio *folio) +{ +} + static inline bool kernel_page_present(struct page *page) { return true; @@ -56,6 +66,10 @@ static inline bool can_set_direct_map(void) } #define can_set_direct_map can_set_direct_map #endif + +int folio_zap_direct_map(struct folio *folio); +void folio_restore_direct_map(struct folio *folio); + #endif /* CONFIG_ARCH_HAS_SET_DIRECT_MAP */ #ifdef CONFIG_X86_64 diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 65500f5db37992..c83f2258ed411a 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1634,6 +1634,7 @@ struct kvm_memory_attributes { #define KVM_CREATE_GUEST_MEMFD _IOWR(KVMIO, 0xd4, struct kvm_create_guest_memfd) #define GUEST_MEMFD_FLAG_MMAP (1ULL << 0) #define GUEST_MEMFD_FLAG_INIT_SHARED (1ULL << 1) +#define GUEST_MEMFD_FLAG_NO_DIRECT_MAP (1ULL << 2) struct kvm_create_guest_memfd { __u64 size; diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 0a946932d5c17d..b6dda3a8eb6e7d 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -88,7 +88,7 @@ static inline int hibernate_restore_unprotect_page(void *page_address) {return 0 static inline void hibernate_map_page(struct page *page) { if (IS_ENABLED(CONFIG_ARCH_HAS_SET_DIRECT_MAP)) { - int ret = set_direct_map_default_noflush(page); + int ret = 
set_direct_map_default_noflush(page_address(page)); if (ret) pr_warn_once("Failed to remap page\n"); @@ -101,7 +101,7 @@ static inline void hibernate_unmap_page(struct page *page) { if (IS_ENABLED(CONFIG_ARCH_HAS_SET_DIRECT_MAP)) { unsigned long addr = (unsigned long)page_address(page); - int ret = set_direct_map_invalid_noflush(page); + int ret = set_direct_map_invalid_noflush(page_address(page)); if (ret) pr_warn_once("Failed to remap page\n"); diff --git a/lib/buildid.c b/lib/buildid.c index c4b73764062159..ba79bf28f7e69d 100644 --- a/lib/buildid.c +++ b/lib/buildid.c @@ -47,6 +47,10 @@ static int freader_get_folio(struct freader *r, loff_t file_off) freader_put_folio(r); + /* reject folios without direct map entries (e.g. from memfd_secret() or guest_memfd()) */ + if (mapping_no_direct_map(r->file->f_mapping)) + return -EFAULT; + /* only use page cache lookup - fail if not already cached */ r->folio = filemap_get_folio(r->file->f_mapping, file_off >> PAGE_SHIFT); @@ -87,8 +91,8 @@ const void *freader_fetch(struct freader *r, loff_t file_off, size_t sz) return r->data + file_off; } - /* reject secretmem folios created with memfd_secret() */ - if (secretmem_mapping(r->file->f_mapping)) { + /* reject folios without direct map entries (e.g. 
from memfd_secret() or guest_memfd()) */ + if (mapping_no_direct_map(r->file->f_mapping)) { r->err = -EFAULT; return NULL; } diff --git a/mm/execmem.c b/mm/execmem.c index 810a4ba9c9243e..220298ec87c8f7 100644 --- a/mm/execmem.c +++ b/mm/execmem.c @@ -119,7 +119,8 @@ static int execmem_set_direct_map_valid(struct vm_struct *vm, bool valid) int err = 0; for (int i = 0; i < vm->nr_pages; i += nr) { - err = set_direct_map_valid_noflush(vm->pages[i], nr, valid); + err = set_direct_map_valid_noflush(page_address(vm->pages[i]), + nr, valid); if (err) goto err_restore; updated += nr; @@ -129,7 +130,8 @@ static int execmem_set_direct_map_valid(struct vm_struct *vm, bool valid) err_restore: for (int i = 0; i < updated; i += nr) - set_direct_map_valid_noflush(vm->pages[i], nr, !valid); + set_direct_map_valid_noflush(page_address(vm->pages[i]), nr, + !valid); return err; } diff --git a/mm/gup.c b/mm/gup.c index 8e7dc2c6ee7385..a5a753da66aab7 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include @@ -1216,7 +1215,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) if ((gup_flags & FOLL_SPLIT_PMD) && is_vm_hugetlb_page(vma)) return -EOPNOTSUPP; - if (vma_is_secretmem(vma)) + if (vma_has_no_direct_map(vma)) return -EFAULT; if (write) { @@ -2724,7 +2723,7 @@ EXPORT_SYMBOL(get_user_pages_unlocked); * This call assumes the caller has pinned the folio, that the lowest page table * level still points to this folio, and that interrupts have been disabled. * - * GUP-fast must reject all secretmem folios. + * GUP-fast must reject all folios without direct map entries (such as secretmem). * * Writing to pinned file-backed dirty tracked folios is inherently problematic * (see comment describing the writable_file_mapping_allowed() function). 
We @@ -2737,32 +2736,14 @@ EXPORT_SYMBOL(get_user_pages_unlocked); */ static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags) { - bool reject_file_backed = false; struct address_space *mapping; - bool check_secretmem = false; unsigned long mapping_flags; - /* - * If we aren't pinning then no problematic write can occur. A long term - * pin is the most egregious case so this is the one we disallow. - */ - if ((flags & (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE)) == - (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE)) - reject_file_backed = true; - /* We hold a folio reference, so we can safely access folio fields. */ - - /* secretmem folios are always order-0 folios. */ - if (IS_ENABLED(CONFIG_SECRETMEM) && !folio_test_large(folio)) - check_secretmem = true; - - if (!reject_file_backed && !check_secretmem) - return true; - if (WARN_ON_ONCE(folio_test_slab(folio))) return false; - /* hugetlb neither requires dirty-tracking nor can be secretmem. */ + /* hugetlb neither requires dirty-tracking nor can be without direct map. */ if (folio_test_hugetlb(folio)) return true; @@ -2800,10 +2781,20 @@ static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags) * At this point, we know the mapping is non-null and points to an * address_space object. */ - if (check_secretmem && secretmem_mapping(mapping)) + if (mapping_no_direct_map(mapping)) return false; - /* The only remaining allowed file system is shmem. */ - return !reject_file_backed || shmem_mapping(mapping); + + /* + * If we aren't pinning then no problematic write can occur. A writable + * long term pin is the most egregious case, so this is the one we + * allow only for ... + */ + if ((flags & (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE)) != + (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE)) + return true; + + /* ... hugetlb (which we allowed above already) and shared memory. 
*/ + return shmem_mapping(mapping); } #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL diff --git a/mm/memory.c b/mm/memory.c index b0d487229b2e58..4be3b8c273a73e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -78,6 +78,7 @@ #include #include #include +#include #include @@ -7471,3 +7472,44 @@ void vma_pgtable_walk_end(struct vm_area_struct *vma) if (is_vm_hugetlb_page(vma)) hugetlb_vma_unlock_read(vma); } + +#ifdef CONFIG_ARCH_HAS_SET_DIRECT_MAP +/** + * folio_zap_direct_map - remove a folio from the kernel direct map + * @folio: folio to remove from the direct map + * + * Removes the folio from the kernel direct map and flushes the TLB. This may + * require splitting huge pages in the direct map, which can fail due to memory + * allocation. + * + * Return: 0 on success, or a negative error code on failure. + */ +int folio_zap_direct_map(struct folio *folio) +{ + const void *addr = folio_address(folio); + int ret; + + ret = set_direct_map_valid_noflush(addr, folio_nr_pages(folio), false); + flush_tlb_kernel_range((unsigned long)addr, + (unsigned long)addr + folio_size(folio)); + + return ret; +} +EXPORT_SYMBOL_FOR_MODULES(folio_zap_direct_map, "kvm"); + +/** + * folio_restore_direct_map - restore the kernel direct map entry for a folio + * @folio: folio whose direct map entry is to be restored + * + * This may only be called after a prior successful folio_zap_direct_map() on + * the same folio. Because the zap will have already split any huge pages in + * the direct map, restoration here only updates protection bits and cannot + * fail. 
+ */ +void folio_restore_direct_map(struct folio *folio) +{ + WARN_ON_ONCE(set_direct_map_valid_noflush(folio_address(folio), + folio_nr_pages(folio), true)); +} +EXPORT_SYMBOL_FOR_MODULES(folio_restore_direct_map, "kvm"); +#endif /* CONFIG_ARCH_HAS_SET_DIRECT_MAP */ diff --git a/mm/mlock.c b/mm/mlock.c index 2f699c3497a579..a6f4b3df4f3ff0 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -474,7 +474,7 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, if (newflags == oldflags || (oldflags & VM_SPECIAL) || is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) || - vma_is_dax(vma) || vma_is_secretmem(vma) || (oldflags & VM_DROPPABLE)) + vma_is_dax(vma) || vma_has_no_direct_map(vma) || (oldflags & VM_DROPPABLE)) /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */ goto out; diff --git a/mm/secretmem.c b/mm/secretmem.c index edf111e0a1bbba..80b0338eeb8854 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -53,7 +53,6 @@ static vm_fault_t secretmem_fault(struct vm_fault *vmf) struct inode *inode = file_inode(vmf->vma->vm_file); pgoff_t offset = vmf->pgoff; gfp_t gfp = vmf->gfp_mask; - unsigned long addr; struct folio *folio; vm_fault_t ret; int err; @@ -72,7 +71,7 @@ static vm_fault_t secretmem_fault(struct vm_fault *vmf) goto out; } - err = set_direct_map_invalid_noflush(folio_page(folio, 0)); + err = folio_zap_direct_map(folio); if (err) { folio_put(folio); ret = vmf_error(err); @@ -87,7 +86,7 @@ static vm_fault_t secretmem_fault(struct vm_fault *vmf) * already happened when we marked the page invalid * which guarantees that this call won't fail */ - set_direct_map_default_noflush(folio_page(folio, 0)); + folio_restore_direct_map(folio); folio_put(folio); if (err == -EEXIST) goto retry; @@ -95,9 +94,6 @@ static vm_fault_t secretmem_fault(struct vm_fault *vmf) ret = vmf_error(err); goto out; } - - addr = (unsigned long)folio_address(folio); - flush_tlb_kernel_range(addr, addr + PAGE_SIZE); } vmf->page = folio_file_page(folio, 
vmf->pgoff); @@ -134,11 +130,6 @@ static int secretmem_mmap_prepare(struct vm_area_desc *desc) return 0; } -bool vma_is_secretmem(struct vm_area_struct *vma) -{ - return vma->vm_ops == &secretmem_vm_ops; -} - static const struct file_operations secretmem_fops = { .release = secretmem_release, .mmap_prepare = secretmem_mmap_prepare, @@ -152,11 +143,11 @@ static int secretmem_migrate_folio(struct address_space *mapping, static void secretmem_free_folio(struct folio *folio) { - set_direct_map_default_noflush(folio_page(folio, 0)); + set_direct_map_default_noflush(folio_address(folio)); folio_zero_segment(folio, 0, folio_size(folio)); } -const struct address_space_operations secretmem_aops = { +static const struct address_space_operations secretmem_aops = { .dirty_folio = noop_dirty_folio, .free_folio = secretmem_free_folio, .migrate_folio = secretmem_migrate_folio, @@ -205,6 +196,7 @@ static struct file *secretmem_file_create(unsigned long flags) mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); mapping_set_unevictable(inode->i_mapping); + mapping_set_no_direct_map(inode->i_mapping); inode->i_op = &secretmem_iops; inode->i_mapping->a_ops = &secretmem_aops; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 03e1117480d5c6..e4708bbd737cb2 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3342,14 +3342,17 @@ struct vm_struct *remove_vm_area(const void *addr) } static inline void set_area_direct_map(const struct vm_struct *area, - int (*set_direct_map)(struct page *page)) + int (*set_direct_map)(const void *addr)) { int i; /* HUGE_VMALLOC passes small pages to set_direct_map */ - for (i = 0; i < area->nr_pages; i++) - if (page_address(area->pages[i])) - set_direct_map(area->pages[i]); + for (i = 0; i < area->nr_pages; i++) { + const void *addr = page_address(area->pages[i]); + + if (addr) + set_direct_map(addr); + } } /* diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c index 618c937f3c90f8..5fcb922cce446a 100644 
--- a/tools/testing/selftests/kvm/guest_memfd_test.c +++ b/tools/testing/selftests/kvm/guest_memfd_test.c @@ -403,6 +403,17 @@ static void test_guest_memfd(unsigned long vm_type) __test_guest_memfd(vm, GUEST_MEMFD_FLAG_MMAP | GUEST_MEMFD_FLAG_INIT_SHARED); + if (flags & GUEST_MEMFD_FLAG_NO_DIRECT_MAP) { + __test_guest_memfd(vm, GUEST_MEMFD_FLAG_NO_DIRECT_MAP); + if (flags & GUEST_MEMFD_FLAG_MMAP) + __test_guest_memfd(vm, GUEST_MEMFD_FLAG_NO_DIRECT_MAP | + GUEST_MEMFD_FLAG_MMAP); + if (flags & GUEST_MEMFD_FLAG_INIT_SHARED) + __test_guest_memfd(vm, GUEST_MEMFD_FLAG_NO_DIRECT_MAP | + GUEST_MEMFD_FLAG_MMAP | + GUEST_MEMFD_FLAG_INIT_SHARED); + } + kvm_vm_free(vm); } @@ -445,10 +456,14 @@ static void test_guest_memfd_guest(void) TEST_ASSERT(vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_FLAGS) & GUEST_MEMFD_FLAG_INIT_SHARED, "Default VM type should support INIT_SHARED, supported flags = 0x%x", vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_FLAGS)); + TEST_ASSERT(vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_FLAGS) & GUEST_MEMFD_FLAG_NO_DIRECT_MAP, + "Default VM type should support NO_DIRECT_MAP, supported flags = 0x%x", + vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_FLAGS)); size = vm->page_size; fd = vm_create_guest_memfd(vm, size, GUEST_MEMFD_FLAG_MMAP | - GUEST_MEMFD_FLAG_INIT_SHARED); + GUEST_MEMFD_FLAG_INIT_SHARED | + GUEST_MEMFD_FLAG_NO_DIRECT_MAP); vm_set_user_memory_region2(vm, slot, KVM_MEM_GUEST_MEMFD, gpa, size, NULL, fd, 0); mem = kvm_mmap(size, PROT_READ | PROT_WRITE, MAP_SHARED, fd); diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index 8b39cb919f4fc4..48b6ee8223aa5c 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -215,7 +215,7 @@ enum vm_guest_mode { struct vm_shape { uint32_t type; uint8_t mode; - uint8_t pad0; + uint8_t src_type; uint16_t pad1; }; @@ -223,14 +223,15 @@ kvm_static_assert(sizeof(struct vm_shape) == sizeof(uint64_t)); #define VM_TYPE_DEFAULT 0 
-#define VM_SHAPE(__mode) \ -({ \ - struct vm_shape shape = { \ - .mode = (__mode), \ - .type = VM_TYPE_DEFAULT \ - }; \ - \ - shape; \ +#define VM_SHAPE(__mode) \ +({ \ + struct vm_shape shape = { \ + .mode = (__mode), \ + .type = VM_TYPE_DEFAULT, \ + .src_type = VM_MEM_SRC_ANONYMOUS \ + }; \ + \ + shape; \ }) extern enum vm_guest_mode vm_mode_default; @@ -664,6 +665,24 @@ static inline bool is_smt_on(void) void vm_create_irqchip(struct kvm_vm *vm); +static inline uint32_t backing_src_guest_memfd_flags(enum vm_mem_backing_src_type t) +{ + uint32_t flags = 0; + + switch (t) { + case VM_MEM_SRC_GUEST_MEMFD_NO_DIRECT_MAP: + flags |= GUEST_MEMFD_FLAG_NO_DIRECT_MAP; + fallthrough; + case VM_MEM_SRC_GUEST_MEMFD: + flags |= GUEST_MEMFD_FLAG_MMAP | GUEST_MEMFD_FLAG_INIT_SHARED; + break; + default: + break; + } + + return flags; +} + static inline int __vm_create_guest_memfd(struct kvm_vm *vm, uint64_t size, uint64_t flags) { diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h index b4872ba8ed1245..ea6de20ce8ef8f 100644 --- a/tools/testing/selftests/kvm/include/test_util.h +++ b/tools/testing/selftests/kvm/include/test_util.h @@ -48,6 +48,7 @@ do { \ ssize_t test_write(int fd, const void *buf, size_t count); ssize_t test_read(int fd, void *buf, size_t count); +ssize_t test_read_bounce(int fd, void *buf, size_t count); int test_seq_read(const char *path, char **bufp, size_t *sizep); void __printf(5, 6) test_assert(bool exp, const char *exp_str, @@ -151,6 +152,8 @@ enum vm_mem_backing_src_type { VM_MEM_SRC_ANONYMOUS_HUGETLB_16GB, VM_MEM_SRC_SHMEM, VM_MEM_SRC_SHARED_HUGETLB, + VM_MEM_SRC_GUEST_MEMFD, + VM_MEM_SRC_GUEST_MEMFD_NO_DIRECT_MAP, NUM_SRC_TYPES, }; @@ -183,6 +186,11 @@ static inline bool backing_src_is_shared(enum vm_mem_backing_src_type t) return vm_mem_backing_src_alias(t)->flag & MAP_SHARED; } +static inline bool backing_src_is_guest_memfd(enum vm_mem_backing_src_type t) +{ + return t == 
VM_MEM_SRC_GUEST_MEMFD || t == VM_MEM_SRC_GUEST_MEMFD_NO_DIRECT_MAP; +} + static inline bool backing_src_can_be_huge(enum vm_mem_backing_src_type t) { return t != VM_MEM_SRC_ANONYMOUS && t != VM_MEM_SRC_SHMEM; diff --git a/tools/testing/selftests/kvm/lib/elf.c b/tools/testing/selftests/kvm/lib/elf.c index f34d926d973591..e829fbe0a11e7c 100644 --- a/tools/testing/selftests/kvm/lib/elf.c +++ b/tools/testing/selftests/kvm/lib/elf.c @@ -31,7 +31,7 @@ static void elfhdr_get(const char *filename, Elf64_Ehdr *hdrp) * the real size of the ELF header. */ unsigned char ident[EI_NIDENT]; - test_read(fd, ident, sizeof(ident)); + test_read_bounce(fd, ident, sizeof(ident)); TEST_ASSERT((ident[EI_MAG0] == ELFMAG0) && (ident[EI_MAG1] == ELFMAG1) && (ident[EI_MAG2] == ELFMAG2) && (ident[EI_MAG3] == ELFMAG3), "ELF MAGIC Mismatch,\n" @@ -79,7 +79,7 @@ static void elfhdr_get(const char *filename, Elf64_Ehdr *hdrp) offset_rv = lseek(fd, 0, SEEK_SET); TEST_ASSERT(offset_rv == 0, "Seek to ELF header failed,\n" " rv: %zi expected: %i", offset_rv, 0); - test_read(fd, hdrp, sizeof(*hdrp)); + test_read_bounce(fd, hdrp, sizeof(*hdrp)); TEST_ASSERT(hdrp->e_phentsize == sizeof(Elf64_Phdr), "Unexpected physical header size,\n" " hdrp->e_phentsize: %x\n" @@ -146,7 +146,7 @@ void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename) /* Read in the program header. */ Elf64_Phdr phdr; - test_read(fd, &phdr, sizeof(phdr)); + test_read_bounce(fd, &phdr, sizeof(phdr)); /* Skip if this header doesn't describe a loadable segment. 
*/ if (phdr.p_type != PT_LOAD) @@ -187,7 +187,7 @@ void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename) " expected: 0x%jx", n1, errno, (intmax_t) offset_rv, (intmax_t) phdr.p_offset); - test_read(fd, addr_gva2hva(vm, phdr.p_vaddr), + test_read_bounce(fd, addr_gva2hva(vm, phdr.p_vaddr), phdr.p_filesz); } } diff --git a/tools/testing/selftests/kvm/lib/io.c b/tools/testing/selftests/kvm/lib/io.c index fedb2a741f0b12..60613dce6cfdce 100644 --- a/tools/testing/selftests/kvm/lib/io.c +++ b/tools/testing/selftests/kvm/lib/io.c @@ -155,3 +155,26 @@ ssize_t test_read(int fd, void *buf, size_t count) return num_read; } + +/* Test read via intermediary buffer + * + * Same as test_read, except read(2)s happen into a bounce buffer that is memcpy'd + * to buf. For use with buffers that cannot be GUP'd (e.g. guest_memfd VMAs if + * guest_memfd was created with GUEST_MEMFD_FLAG_NO_DIRECT_MAP). + */ +ssize_t test_read_bounce(int fd, void *buf, size_t count) +{ + void *bounce_buffer; + ssize_t num_read; + + TEST_ASSERT(count > 0, "Unexpected count, count: %zu", count); + + bounce_buffer = malloc(count); + TEST_ASSERT(bounce_buffer != NULL, "Failed to allocate bounce buffer"); + + num_read = test_read(fd, bounce_buffer, count); + memcpy(buf, bounce_buffer, num_read); + free(bounce_buffer); + + return num_read; +} diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 1959bf556e88ea..824c94c64864cf 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -500,7 +500,7 @@ struct kvm_vm *__vm_create(struct vm_shape shape, uint32_t nr_runnable_vcpus, if (is_guest_memfd_required(shape)) flags |= KVM_MEM_GUEST_MEMFD; - vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 0, 0, nr_pages, flags); + vm_userspace_mem_region_add(vm, shape.src_type, 0, 0, nr_pages, flags); for (i = 0; i < NR_MEM_REGIONS; i++) vm->memslots[i] = 0; @@ -1046,6 +1046,33 @@ void vm_mem_add(struct kvm_vm 
*vm, enum vm_mem_backing_src_type src_type, alignment = 1; #endif + if (guest_memfd < 0) { + if ((flags & KVM_MEM_GUEST_MEMFD) || backing_src_is_guest_memfd(src_type)) { + uint32_t guest_memfd_flags = backing_src_guest_memfd_flags(src_type); + + TEST_ASSERT(!guest_memfd_offset, + "Offset must be zero when creating new guest_memfd"); + guest_memfd = vm_create_guest_memfd(vm, mem_size, guest_memfd_flags); + } + } else { + /* + * Install a unique fd for each memslot so that the fd + * can be closed when the region is deleted without + * needing to track if the fd is owned by the framework + * or by the caller. + */ + guest_memfd = kvm_dup(guest_memfd); + } + + if (guest_memfd >= 0) { + flags |= KVM_MEM_GUEST_MEMFD; + + region->region.guest_memfd = guest_memfd; + region->region.guest_memfd_offset = guest_memfd_offset; + } else { + region->region.guest_memfd = -1; + } + /* * When using THP mmap is not guaranteed to returned a hugepage aligned * address so we have to pad the mmap. Padding is not needed for HugeTLB @@ -1061,10 +1088,13 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, if (alignment > 1) region->mmap_size += alignment; - region->fd = -1; - if (backing_src_is_shared(src_type)) + if (backing_src_is_guest_memfd(src_type)) + region->fd = guest_memfd; + else if (backing_src_is_shared(src_type)) region->fd = kvm_memfd_alloc(region->mmap_size, src_type == VM_MEM_SRC_SHARED_HUGETLB); + else + region->fd = -1; region->mmap_start = kvm_mmap(region->mmap_size, PROT_READ | PROT_WRITE, vm_mem_backing_src_alias(src_type)->flag, @@ -1089,29 +1119,6 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, } region->backing_src_type = src_type; - - if (flags & KVM_MEM_GUEST_MEMFD) { - if (guest_memfd < 0) { - uint32_t guest_memfd_flags = 0; - TEST_ASSERT(!guest_memfd_offset, - "Offset must be zero when creating new guest_memfd"); - guest_memfd = vm_create_guest_memfd(vm, mem_size, guest_memfd_flags); - } else { - /* - * 
Install a unique fd for each memslot so that the fd - * can be closed when the region is deleted without - * needing to track if the fd is owned by the framework - * or by the caller. - */ - guest_memfd = kvm_dup(guest_memfd); - } - - region->region.guest_memfd = guest_memfd; - region->region.guest_memfd_offset = guest_memfd_offset; - } else { - region->region.guest_memfd = -1; - } - region->unused_phy_pages = sparsebit_alloc(); if (vm_arch_has_protected_memory(vm)) region->protected_phy_pages = sparsebit_alloc(); diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c index 8a1848586a8570..ce9fe027151579 100644 --- a/tools/testing/selftests/kvm/lib/test_util.c +++ b/tools/testing/selftests/kvm/lib/test_util.c @@ -306,6 +306,14 @@ const struct vm_mem_backing_src_alias *vm_mem_backing_src_alias(uint32_t i) */ .flag = MAP_SHARED, }, + [VM_MEM_SRC_GUEST_MEMFD] = { + .name = "guest_memfd", + .flag = MAP_SHARED, + }, + [VM_MEM_SRC_GUEST_MEMFD_NO_DIRECT_MAP] = { + .name = "guest_memfd_no_direct_map", + .flag = MAP_SHARED, + } }; _Static_assert(ARRAY_SIZE(aliases) == NUM_SRC_TYPES, "Missing new backing src types?"); diff --git a/tools/testing/selftests/kvm/lib/x86/sev.c b/tools/testing/selftests/kvm/lib/x86/sev.c index c3a9838f4806a5..d920880e4fc0b9 100644 --- a/tools/testing/selftests/kvm/lib/x86/sev.c +++ b/tools/testing/selftests/kvm/lib/x86/sev.c @@ -164,6 +164,7 @@ struct kvm_vm *vm_sev_create_with_one_vcpu(uint32_t type, void *guest_code, struct vm_shape shape = { .mode = VM_MODE_DEFAULT, .type = type, + .src_type = VM_MEM_SRC_ANONYMOUS, }; struct kvm_vm *vm; struct kvm_vcpu *cpus[1]; diff --git a/tools/testing/selftests/kvm/pre_fault_memory_test.c b/tools/testing/selftests/kvm/pre_fault_memory_test.c index 93e603d91311cc..8a4d5af53fab7f 100644 --- a/tools/testing/selftests/kvm/pre_fault_memory_test.c +++ b/tools/testing/selftests/kvm/pre_fault_memory_test.c @@ -165,6 +165,7 @@ static void 
__test_pre_fault_memory(unsigned long vm_type, bool private) const struct vm_shape shape = { .mode = VM_MODE_DEFAULT, .type = vm_type, + .src_type = VM_MEM_SRC_ANONYMOUS, }; struct kvm_vcpu *vcpu; struct kvm_run *run; diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c index 7fe427ff9b38ca..cb445d420e8ca7 100644 --- a/tools/testing/selftests/kvm/set_memory_region_test.c +++ b/tools/testing/selftests/kvm/set_memory_region_test.c @@ -602,6 +602,41 @@ static void test_mmio_during_vectoring(void) kvm_vm_free(vm); } + +static void guest_code_trigger_mmio(void) +{ + /* + * Read some GPA that is not backed by a memslot. KVM considers this + * as MMIO and tells userspace to emulate the read. + */ + READ_ONCE(*((uint64_t *)MEM_REGION_GPA)); + + GUEST_DONE(); +} + +static void test_guest_memfd_mmio(void) +{ + struct kvm_vm *vm; + struct kvm_vcpu *vcpu; + struct vm_shape shape = { + .mode = VM_MODE_DEFAULT, + .src_type = VM_MEM_SRC_GUEST_MEMFD_NO_DIRECT_MAP, + }; + pthread_t vcpu_thread; + + pr_info("Testing MMIO emulation for instructions in gmem\n"); + + vm = __vm_create_shape_with_one_vcpu(shape, &vcpu, 0, guest_code_trigger_mmio); + + virt_map(vm, MEM_REGION_GPA, MEM_REGION_GPA, 1); + + pthread_create(&vcpu_thread, NULL, vcpu_worker, vcpu); + + /* If the MMIO read was successfully emulated, the vcpu thread will exit */ + pthread_join(vcpu_thread, NULL); + + kvm_vm_free(vm); +} #endif int main(int argc, char *argv[]) @@ -625,10 +660,19 @@ int main(int argc, char *argv[]) test_add_max_memory_regions(); #ifdef __x86_64__ - if (kvm_has_cap(KVM_CAP_GUEST_MEMFD) && - (kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM))) { - test_add_private_memory_region(); - test_add_overlapping_private_memory_regions(); + if (kvm_has_cap(KVM_CAP_GUEST_MEMFD)) { + uint64_t valid_flags = kvm_check_cap(KVM_CAP_GUEST_MEMFD_FLAGS); + + if (kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM)) { + 
test_add_private_memory_region(); + test_add_overlapping_private_memory_regions(); + } + + if ((valid_flags & GUEST_MEMFD_FLAG_MMAP) && + (valid_flags & GUEST_MEMFD_FLAG_NO_DIRECT_MAP)) + test_guest_memfd_mmio(); + else + pr_info("Skipping tests requiring GUEST_MEMFD_FLAG_MMAP | GUEST_MEMFD_FLAG_NO_DIRECT_MAP\n"); } else { pr_info("Skipping tests for KVM_MEM_GUEST_MEMFD memory regions\n"); } diff --git a/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c b/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c index 1969f4ab9b280d..8767cb4a037e6f 100644 --- a/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c +++ b/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c @@ -367,7 +367,7 @@ static void *__test_mem_conversions(void *__vcpu) } static void test_mem_conversions(enum vm_mem_backing_src_type src_type, uint32_t nr_vcpus, - uint32_t nr_memslots) + uint32_t nr_memslots, uint64_t gmem_flags) { /* * Allocate enough memory so that each vCPU's chunk of memory can be @@ -394,7 +394,7 @@ static void test_mem_conversions(enum vm_mem_backing_src_type src_type, uint32_t vm_enable_cap(vm, KVM_CAP_EXIT_HYPERCALL, (1 << KVM_HC_MAP_GPA_RANGE)); - memfd = vm_create_guest_memfd(vm, memfd_size, 0); + memfd = vm_create_guest_memfd(vm, memfd_size, gmem_flags); for (i = 0; i < nr_memslots; i++) vm_mem_add(vm, src_type, BASE_DATA_GPA + slot_size * i, @@ -474,7 +474,8 @@ int main(int argc, char *argv[]) } } - test_mem_conversions(src_type, nr_vcpus, nr_memslots); + test_mem_conversions(src_type, nr_vcpus, nr_memslots, 0); + test_mem_conversions(src_type, nr_vcpus, nr_memslots, GUEST_MEMFD_FLAG_NO_DIRECT_MAP); return 0; } diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 923c51a3a5256c..f449b40ae628c1 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -7,6 +7,7 @@ #include #include #include +#include #include "kvm_mm.h" @@ -76,6 +77,35 @@ static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct 
kvm_memory_slot *slo return 0; } +#define KVM_GMEM_FOLIO_NO_DIRECT_MAP BIT(0) + +static bool kvm_gmem_folio_no_direct_map(struct folio *folio) +{ + return ((u64)folio->private) & KVM_GMEM_FOLIO_NO_DIRECT_MAP; +} + +static int kvm_gmem_folio_zap_direct_map(struct folio *folio) +{ + u64 gmem_flags = GMEM_I(folio_inode(folio))->flags; + int r = 0; + + if (kvm_gmem_folio_no_direct_map(folio) || !(gmem_flags & GUEST_MEMFD_FLAG_NO_DIRECT_MAP)) + goto out; + + r = folio_zap_direct_map(folio); + if (!r) + folio->private = (void *)((u64)folio->private | KVM_GMEM_FOLIO_NO_DIRECT_MAP); + +out: + return r; +} + +static void kvm_gmem_folio_restore_direct_map(struct folio *folio) +{ + folio_restore_direct_map(folio); + folio->private = (void *)((u64)folio->private & ~KVM_GMEM_FOLIO_NO_DIRECT_MAP); +} + /* * Process @folio, which contains @gfn, so that the guest can use it. * The folio must be locked and the gfn must be contained in @slot. @@ -388,11 +418,17 @@ static bool kvm_gmem_supports_mmap(struct inode *inode) return GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_MMAP; } +static bool kvm_gmem_no_direct_map(struct inode *inode) +{ + return GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_NO_DIRECT_MAP; +} + static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf) { struct inode *inode = file_inode(vmf->vma->vm_file); struct folio *folio; vm_fault_t ret = VM_FAULT_LOCKED; + int err; if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode)) return VM_FAULT_SIGBUS; @@ -418,6 +454,14 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf) folio_mark_uptodate(folio); } + if (kvm_gmem_no_direct_map(folio_inode(folio))) { + err = kvm_gmem_folio_zap_direct_map(folio); + if (err) { + ret = vmf_error(err); + goto out_folio; + } + } + vmf->page = folio_file_page(folio, vmf->pgoff); out_folio: @@ -522,24 +566,23 @@ static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *fol return MF_DELAYED; } -#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE static void 
kvm_gmem_free_folio(struct folio *folio) { struct page *page = folio_page(folio, 0); kvm_pfn_t pfn = page_to_pfn(page); int order = folio_order(folio); + if (kvm_gmem_folio_no_direct_map(folio)) + kvm_gmem_folio_restore_direct_map(folio); + kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order)); } -#endif static const struct address_space_operations kvm_gmem_aops = { .dirty_folio = noop_dirty_folio, .migrate_folio = kvm_gmem_migrate_folio, .error_remove_folio = kvm_gmem_error_folio, -#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE .free_folio = kvm_gmem_free_folio, -#endif }; static int kvm_gmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry, @@ -595,6 +638,9 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) /* Unmovable mappings are supposed to be marked unevictable as well. */ WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping)); + if (flags & GUEST_MEMFD_FLAG_NO_DIRECT_MAP) + mapping_set_no_direct_map(inode->i_mapping); + GMEM_I(inode)->flags = flags; file = alloc_file_pseudo(inode, kvm_gmem_mnt, name, O_RDWR, &kvm_gmem_fops); @@ -807,13 +853,22 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, } r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio); + if (r) + goto out_unlock; + if (kvm_gmem_no_direct_map(folio_inode(folio))) { + r = kvm_gmem_folio_zap_direct_map(folio); + if (r) + goto out_unlock; + } + + *page = folio_file_page(folio, index); folio_unlock(folio); + return 0; - if (!r) - *page = folio_file_page(folio, index); - else - folio_put(folio); +out_unlock: + folio_unlock(folio); + folio_put(folio); return r; }