diff --git a/Documentation/firmware-guide/acpi/apei/einj.rst b/Documentation/firmware-guide/acpi/apei/einj.rst index e588bccf5158370fb03dfe4db06b20ee93114108..2fa989c0a12d9e8138711179a5426b04da1e056e 100644 --- a/Documentation/firmware-guide/acpi/apei/einj.rst +++ b/Documentation/firmware-guide/acpi/apei/einj.rst @@ -181,5 +181,24 @@ You should see something like this in dmesg:: [22715.834759] EDAC sbridge MC3: PROCESSOR 0:306e7 TIME 1422553404 SOCKET 0 APIC 0 [22716.616173] EDAC MC3: 1 CE memory read error on CPU_SrcID#0_Channel#0_DIMM#0 (channel:0 slot:0 page:0x12345 offset:0x0 grain:32 syndrome:0x0 - area:DRAM err_code:0001:0090 socket:0 channel_mask:1 rank:0) +Special notes for injection into SGX enclaves: + +There may be a separate BIOS setup option to enable SGX injection. + +The injection process consists of setting some special memory controller +trigger that will inject the error on the next write to the target +address. But the h/w prevents any software outside of an SGX enclave +from accessing enclave pages (even BIOS SMM mode). + +The following sequence can be used: + 1) Determine physical address of enclave page + 2) Use "notrigger=1" mode to inject (this will setup + the injection address, but will not actually inject) + 3) Enter the enclave + 4) Store data to the virtual address matching physical address from step 1 + 5) Execute CLFLUSH for that virtual address + 6) Spin delay for 250ms + 7) Read from the virtual address. This will trigger the error + For more information about EINJ, please refer to ACPI specification version 4.0, section 17.5 and ACPI 5.0, section 18.6. diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 040fb77366dd7ed693753524b3cf616dfe56e23a..aa217be83f58b6e219a2361b7d10259aee6251a1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1967,6 +1967,7 @@ config X86_SGX select SRCU select MMU_NOTIFIER select NUMA_KEEP_MEMINFO if NUMA + select XARRAY_MULTI help Intel(R) Software Guard eXtensions (SGX) is a set of CPU instructions that can be used by applications to set aside private regions of code diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index d428d611a43a97fa4ce48671c8ac0a89b73e7b55..a3d0152a4361386eab160edab502be504b3531c1 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -858,4 +858,12 @@ enum mds_mitigations { MDS_MITIGATION_VMWERV, }; +#ifdef CONFIG_X86_SGX +int arch_memory_failure(unsigned long pfn, int flags); +#define arch_memory_failure arch_memory_failure + +bool arch_is_platform_page(u64 paddr); +#define arch_is_platform_page arch_is_platform_page +#endif + #endif /* _ASM_X86_PROCESSOR_H */ diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index 5948218f35c584e00d605a4b76115c2bd9897cda..ef47246d22f5d25126d19db570755b5649a5f197 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -2,6 +2,7 @@ #ifndef _ASM_X86_SET_MEMORY_H #define _ASM_X86_SET_MEMORY_H +#include #include #include @@ -97,6 +98,9 @@ static inline int set_mce_nospec(unsigned long pfn, bool unmap) unsigned long decoy_addr; int rc; + /* SGX pages are not in the 1:1 map */ + if (arch_is_platform_page(pfn << PAGE_SHIFT)) + return 0; /* * We would like to just call: * set_memory_XX((unsigned long)pfn_to_kaddr(pfn), 1); diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 379eeefa744a496a5f1beab8e73be5027f324c22..4036f50fc42ceed02848ab46909b4b45f68be6aa 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -20,6 +20,7 @@ struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS]; static int sgx_nr_epc_sections; static struct task_struct *ksgxd_tsk; static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq); +static DEFINE_XARRAY(sgx_epc_address_space); /* * These variables are part of the state of the reclaimer, and must be accessed @@ -60,6 +61,24 @@ static void __sgx_sanitize_pages(struct list_head *dirty_page_list) page = list_first_entry(dirty_page_list, struct sgx_epc_page, list); + /* + * Checking page->poison without holding the node->lock + * is racy, but losing the race (i.e. poison is set just + * after the check) just means __eremove() will be uselessly + * called for a page that sgx_free_epc_page() will put onto + * the node->sgx_poison_page_list later. + */ + if (page->poison) { + struct sgx_epc_section *section = &sgx_epc_sections[page->section]; + struct sgx_numa_node *node = section->node; + + spin_lock(&node->lock); + list_move(&page->list, &node->sgx_poison_page_list); + spin_unlock(&node->lock); + + continue; + } + ret = __eremove(sgx_get_epc_virt_addr(page)); if (!ret) { /* @@ -471,6 +490,7 @@ static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid) page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list); list_del_init(&page->list); + page->flags = 0; spin_unlock(&node->lock); atomic_long_dec(&sgx_nr_free_pages); @@ -624,7 +644,12 @@ void sgx_free_epc_page(struct sgx_epc_page *page) spin_lock(&node->lock); - list_add_tail(&page->list, &node->free_page_list); + page->owner = NULL; + if (page->poison) + list_add(&page->list, &node->sgx_poison_page_list); + else + list_add_tail(&page->list, &node->free_page_list); + page->flags = SGX_EPC_PAGE_IS_FREE; spin_unlock(&node->lock); atomic_long_inc(&sgx_nr_free_pages); @@ -648,17 +673,102 @@ static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size, } section->phys_addr = phys_addr; + xa_store_range(&sgx_epc_address_space, section->phys_addr, + phys_addr + size - 1, section, GFP_KERNEL); for (i = 0; i < nr_pages; i++) { section->pages[i].section = index; section->pages[i].flags = 0; section->pages[i].owner = NULL; + section->pages[i].poison = 0; list_add_tail(§ion->pages[i].list, &sgx_dirty_page_list); } return true; } +bool arch_is_platform_page(u64 paddr) +{ + return !!xa_load(&sgx_epc_address_space, paddr); +} +EXPORT_SYMBOL_GPL(arch_is_platform_page); + +static struct sgx_epc_page *sgx_paddr_to_page(u64 paddr) +{ + struct sgx_epc_section *section; + + section = xa_load(&sgx_epc_address_space, paddr); + if (!section) + return NULL; + + return §ion->pages[PFN_DOWN(paddr - section->phys_addr)]; +} + +/* + * Called in process context to handle a hardware reported + * error in an SGX EPC page. + * If the MF_ACTION_REQUIRED bit is set in flags, then the + * context is the task that consumed the poison data. Otherwise + * this is called from a kernel thread unrelated to the page. + */ +int arch_memory_failure(unsigned long pfn, int flags) +{ + struct sgx_epc_page *page = sgx_paddr_to_page(pfn << PAGE_SHIFT); + struct sgx_epc_section *section; + struct sgx_numa_node *node; + + /* + * mm/memory-failure.c calls this routine for all errors + * where there isn't a "struct page" for the address. But that + * includes other address ranges besides SGX. + */ + if (!page) + return -ENXIO; + + /* + * If poison was consumed synchronously. Send a SIGBUS to + * the task. Hardware has already exited the SGX enclave and + * will not allow re-entry to an enclave that has a memory + * error. The signal may help the task understand why the + * enclave is broken. + */ + if (flags & MF_ACTION_REQUIRED) + force_sig(SIGBUS); + + section = &sgx_epc_sections[page->section]; + node = section->node; + + spin_lock(&node->lock); + + /* Already poisoned? Nothing more to do */ + if (page->poison) + goto out; + + page->poison = 1; + + /* + * If the page is on a free list, move it to the per-node + * poison page list. + */ + if (page->flags & SGX_EPC_PAGE_IS_FREE) { + list_move(&page->list, &node->sgx_poison_page_list); + goto out; + } + + /* + * TBD: Add additional plumbing to enable pre-emptive + * action for asynchronous poison notification. Until + * then just hope that the poison: + * a) is not accessed - sgx_free_epc_page() will deal with it + * when the user gives it back + * b) results in a recoverable machine check rather than + * a fatal one + */ +out: + spin_unlock(&node->lock); + return 0; +} + /** * A section metric is concatenated in a way that @low bits 12-31 define the * bits 12-31 of the metric and @high bits 0-19 define the bits 32-51 of the @@ -713,6 +823,7 @@ static bool __init sgx_page_cache_init(void) if (!node_isset(nid, sgx_numa_mask)) { spin_lock_init(&sgx_numa_nodes[nid].lock); INIT_LIST_HEAD(&sgx_numa_nodes[nid].free_page_list); + INIT_LIST_HEAD(&sgx_numa_nodes[nid].sgx_poison_page_list); node_set(nid, sgx_numa_mask); } diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h index 4628acec000915e2d2b41895088c3edbe5033c14..9ec3136c780091ca538fb9d152820216118f7894 100644 --- a/arch/x86/kernel/cpu/sgx/sgx.h +++ b/arch/x86/kernel/cpu/sgx/sgx.h @@ -26,9 +26,13 @@ /* Pages, which are being tracked by the page reclaimer. */ #define SGX_EPC_PAGE_RECLAIMER_TRACKED BIT(0) +/* Pages on free list */ +#define SGX_EPC_PAGE_IS_FREE BIT(1) + struct sgx_epc_page { unsigned int section; - unsigned int flags; + u16 flags; + u16 poison; struct sgx_encl_page *owner; struct list_head list; }; @@ -39,6 +43,7 @@ struct sgx_epc_page { */ struct sgx_numa_node { struct list_head free_page_list; + struct list_head sgx_poison_page_list; spinlock_t lock; }; diff --git a/drivers/acpi/apei/einj.c b/drivers/acpi/apei/einj.c index 1331567595512f7ddc6b880a0d1e7ffd499677e1..4384269ad1591ec688aa43dc79a7a02b8b346c78 100644 --- a/drivers/acpi/apei/einj.c +++ b/drivers/acpi/apei/einj.c @@ -544,7 +544,8 @@ static int einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2, ((region_intersects(base_addr, size, IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE) != REGION_INTERSECTS) && (region_intersects(base_addr, size, IORESOURCE_MEM, IORES_DESC_PERSISTENT_MEMORY) - != REGION_INTERSECTS))) + != REGION_INTERSECTS) && + !arch_is_platform_page(base_addr))) return -EINVAL; inject: diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 9ac1e45dbba0f83bf77319543db878dcf09990ae..9c38c2cdd2fd218356112ccf26fb5b1cfec31c9f 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -452,7 +452,7 @@ static bool ghes_do_memory_failure(u64 physical_addr, int flags) return false; pfn = PHYS_PFN(physical_addr); - if (!pfn_valid(pfn)) { + if (!pfn_valid(pfn) && !arch_is_platform_page(physical_addr)) { pr_warn_ratelimited(FW_WARN GHES_PFX "Invalid address in generic error data: %#llx\n", physical_addr); diff --git a/include/linux/mm.h b/include/linux/mm.h index b0ddfac0486b7da51f8e97a0e75dae6e3343f251..25891b581bf4e2d0352d5754bef0071053f7b4c3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -425,6 +425,19 @@ extern unsigned int kobjsize(const void *objp); /* VMA basic access permission flags */ #define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) +#ifndef arch_memory_failure +static inline int arch_memory_failure(unsigned long pfn, int flags) +{ + return -ENXIO; +} +#endif + +#ifndef arch_is_platform_page +static inline bool arch_is_platform_page(u64 paddr) +{ + return false; +} +#endif /* * Special vmas that are non-mergable, non-mlock()able. diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 0519f20d2b57d708960c38052548ae68990df54e..bfa6d1478a7588133d4eeaad749062859ce273e8 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1370,21 +1370,28 @@ int memory_failure(unsigned long pfn, int flags) if (!sysctl_memory_failure_recovery) panic("Memory failure on page %lx", pfn); + mutex_lock(&mf_mutex); + p = pfn_to_online_page(pfn); if (!p) { + res = arch_memory_failure(pfn, flags); + if (res == 0) + goto unlock_mutex; + if (pfn_valid(pfn)) { pgmap = get_dev_pagemap(pfn, NULL); - if (pgmap) - return memory_failure_dev_pagemap(pfn, flags, - pgmap); + if (pgmap) { + res = memory_failure_dev_pagemap(pfn, flags, + pgmap); + goto unlock_mutex; + } } pr_err("Memory failure: %#lx: memory outside kernel control\n", pfn); - return -ENXIO; + res = -ENXIO; + goto unlock_mutex; } - mutex_lock(&mf_mutex); - try_again: if (PageHuge(p)) { res = memory_failure_hugetlb(pfn, flags);