From 84ddf6b0e04d8c0ec40735094a58e35ef3623fe5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 13 Nov 2020 00:01:17 +0200 Subject: [PATCH 001/674] x86/mm: Signal SIGSEGV with PF_SGX mainline inclusion from mainline-5.11 commit 74faeee06db81a06add0def6a394210c8fef0ab7 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK CVE: NA Intel-SIG: commit 74faeee06db8 x86/mm: Signal SIGSEGV with PF_SGX. Backport for SGX Foundations support -------------------------------- The x86 architecture has a set of page fault error codes. These indicate things like whether the fault occurred from a write, or whether it originated in userspace. The SGX hardware architecture has its own per-page memory management metadata (EPCM) [*] and hardware which is separate from the normal x86 MMU. The architecture has a new page fault error code: PF_SGX. This new error code bit is set whenever a page fault occurs as the result of the SGX MMU. These faults occur for a variety of reasons. For instance, an access attempt to enclave memory from outside the enclave causes a PF_SGX fault. PF_SGX would also be set for permission conflicts, such as if a write to an enclave page occurs and the page is marked read-write in the x86 page tables but is read-only in the EPCM. These faults do not always indicate errors, though. SGX pages are encrypted with a key that is destroyed at hardware reset, including suspend. Throwing a SIGSEGV allows user space software to react and recover when these events occur. Include PF_SGX in the PF error codes list and throw SIGSEGV when it is encountered. [*] Intel SDM: 36.5.1 Enclave Page Cache Map (EPCM) [ bp: Add bit 15 to the comment above enum x86_pf_error_code too. ] Signed-off-by: Sean Christopherson Signed-off-by: Jarkko Sakkinen Signed-off-by: Borislav Petkov Acked-by: Jethro Beekman Link: https://lkml.kernel.org/r/20201112220135.165028-7-jarkko@kernel.org Signed-off-by: Fan Du Signed-off-by: Zhiquan Li --- arch/x86/include/asm/trap_pf.h | 2 ++ arch/x86/mm/fault.c | 12 ++++++++++++ 2 files changed, 14 insertions(+) diff --git a/arch/x86/include/asm/trap_pf.h b/arch/x86/include/asm/trap_pf.h index 305bc1214aef..10b1de500ab1 100644 --- a/arch/x86/include/asm/trap_pf.h +++ b/arch/x86/include/asm/trap_pf.h @@ -11,6 +11,7 @@ * bit 3 == 1: use of reserved bit detected * bit 4 == 1: fault was an instruction fetch * bit 5 == 1: protection keys block access + * bit 15 == 1: SGX MMU page-fault */ enum x86_pf_error_code { X86_PF_PROT = 1 << 0, @@ -19,6 +20,7 @@ enum x86_pf_error_code { X86_PF_RSVD = 1 << 3, X86_PF_INSTR = 1 << 4, X86_PF_PK = 1 << 5, + X86_PF_SGX = 1 << 15, }; #endif /* _ASM_X86_TRAP_PF_H */ diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 34112c63b347..058ff3f6944c 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1123,6 +1123,18 @@ access_error(unsigned long error_code, struct vm_area_struct *vma) if (error_code & X86_PF_PK) return 1; + /* + * SGX hardware blocked the access. This usually happens + * when the enclave memory contents have been destroyed, like + * after a suspend/resume cycle. In any case, the kernel can't + * fix the cause of the fault. Handle the fault as an access + * error even in cases where no actual access violation + * occurred. This allows userspace to rebuild the enclave in + * response to the signal. + */ + if (unlikely(error_code & X86_PF_SGX)) + return 1; + /* * Make sure to check the VMA so that we do not perform * faults just to hit a X86_PF_PK as soon as we fill in a -- Gitee From 413a8d39e19850afc962b34f23deb0728719fc6e Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Fri, 19 Mar 2021 20:22:17 +1300 Subject: [PATCH 002/674] x86/cpufeatures: Make SGX_LC feature bit depend on SGX bit mainline inclusion from mainline-5.13 commit e9a15a40e857fc6ccfbb05fec7b184e9003057df category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK CVE: NA Intel-SIG: commit e9a15a40e857 x86/cpufeatures: Make SGX_LC feature bit depend on SGX bit. Backport for SGX virtualization support -------------------------------- Move SGX_LC feature bit to CPUID dependency table to make clearing all SGX feature bits easier. Also remove clear_sgx_caps() since it is just a wrapper of setup_clear_cpu_cap(X86_FEATURE_SGX) now. Suggested-by: Sean Christopherson Signed-off-by: Kai Huang Signed-off-by: Borislav Petkov Reviewed-by: Sean Christopherson Acked-by: Dave Hansen Acked-by: Jarkko Sakkinen Link: https://lkml.kernel.org/r/5d4220fd0a39f52af024d3fa166231c1d498dd10.1616136308.git.kai.huang@intel.com Signed-off-by: Fan Du Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/cpuid-deps.c | 1 + arch/x86/kernel/cpu/feat_ctl.c | 12 +++--------- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index d502241995a3..b34632ea6d70 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -71,6 +71,7 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_AVX512_BF16, X86_FEATURE_AVX512VL }, { X86_FEATURE_ENQCMD, X86_FEATURE_XSAVES }, { X86_FEATURE_PER_THREAD_MBA, X86_FEATURE_MBA }, + { X86_FEATURE_SGX_LC, X86_FEATURE_SGX }, {} }; diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c index 3b1b01f2b248..27533a6e04fa 100644 --- a/arch/x86/kernel/cpu/feat_ctl.c +++ b/arch/x86/kernel/cpu/feat_ctl.c @@ -93,15 +93,9 @@ static void init_vmx_capabilities(struct cpuinfo_x86 *c) } #endif /* CONFIG_X86_VMX_FEATURE_NAMES */ -static void clear_sgx_caps(void) -{ - setup_clear_cpu_cap(X86_FEATURE_SGX); - setup_clear_cpu_cap(X86_FEATURE_SGX_LC); -} - static int __init nosgx(char *str) { - clear_sgx_caps(); + setup_clear_cpu_cap(X86_FEATURE_SGX); return 0; } @@ -116,7 +110,7 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c) if (rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr)) { clear_cpu_cap(c, X86_FEATURE_VMX); - clear_sgx_caps(); + clear_cpu_cap(c, X86_FEATURE_SGX); return; } @@ -177,6 +171,6 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c) !(msr & FEAT_CTL_SGX_LC_ENABLED) || !enable_sgx) { if (enable_sgx) pr_err_once("SGX disabled by BIOS\n"); - clear_sgx_caps(); + clear_cpu_cap(c, X86_FEATURE_SGX); } } -- Gitee From 4a5aef2a96bdfe125d9f893aec4d226f573deb5c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Mar 2021 20:22:18 +1300 Subject: [PATCH 003/674] x86/cpufeatures: Add SGX1 and SGX2 sub-features mainline inclusion from mainline-5.13 commit b8921dccf3b25798409d35155b5d127085de72c2 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK CVE: NA Intel-SIG: commit b8921dccf3b2 x86/cpufeatures: Add SGX1 and SGX2 sub-features. Backport for SGX virtualization support -------------------------------- Add SGX1 and SGX2 feature flags, via CPUID.0x12.0x0.EAX, as scattered features, since adding a new leaf for only two bits would be wasteful. As part of virtualizing SGX, KVM will expose the SGX CPUID leafs to its guest, and to do so correctly needs to query hardware and kernel support for SGX1 and SGX2. Suppress both SGX1 and SGX2 from /proc/cpuinfo. SGX1 basically means SGX, and for SGX2 there is no concrete use case of using it in /proc/cpuinfo. Signed-off-by: Sean Christopherson Signed-off-by: Kai Huang Signed-off-by: Borislav Petkov Acked-by: Dave Hansen Acked-by: Jarkko Sakkinen Link: https://lkml.kernel.org/r/d787827dbfca6b3210ac3e432e3ac1202727e786.1616136308.git.kai.huang@intel.com Signed-off-by: Fan Du Signed-off-by: Zhiquan Li --- arch/x86/include/asm/cpufeatures.h | 2 ++ arch/x86/kernel/cpu/cpuid-deps.c | 2 ++ arch/x86/kernel/cpu/scattered.c | 2 ++ 3 files changed, 6 insertions(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 4da419226377..54966a376b99 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -291,6 +291,8 @@ #define X86_FEATURE_FENCE_SWAPGS_KERNEL (11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */ #define X86_FEATURE_SPLIT_LOCK_DETECT (11*32+ 6) /* #AC for split lock */ #define X86_FEATURE_PER_THREAD_MBA (11*32+ 7) /* "" Per-thread Memory Bandwidth Allocation */ +#define X86_FEATURE_SGX1 (11*32+ 8) /* "" Basic SGX */ +#define X86_FEATURE_SGX2 (11*32+ 9) /* "" SGX Enclave Dynamic Memory Management (EDMM) */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */ diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index b34632ea6d70..28c75066815c 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -72,6 +72,8 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_ENQCMD, X86_FEATURE_XSAVES }, { X86_FEATURE_PER_THREAD_MBA, X86_FEATURE_MBA }, { X86_FEATURE_SGX_LC, X86_FEATURE_SGX }, + { X86_FEATURE_SGX1, X86_FEATURE_SGX }, + { X86_FEATURE_SGX2, X86_FEATURE_SGX1 }, {} }; diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 866c9a9bcdee..839b54a08e09 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -36,6 +36,8 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_CDP_L2, CPUID_ECX, 2, 0x00000010, 2 }, { X86_FEATURE_MBA, CPUID_EBX, 3, 0x00000010, 0 }, { X86_FEATURE_PER_THREAD_MBA, CPUID_ECX, 0, 0x00000010, 3 }, + { X86_FEATURE_SGX1, CPUID_EAX, 0, 0x00000012, 0 }, + { X86_FEATURE_SGX2, CPUID_EAX, 1, 0x00000012, 0 }, { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, -- Gitee From e230e731c6d6ec793fb02b448508920d28ef2bca Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Thu, 25 Mar 2021 22:30:57 +1300 Subject: [PATCH 004/674] x86/sgx: Wipe out EREMOVE from sgx_free_epc_page() mainline inclusion from mainline-5.13 commit b0c7459be0670fabe080e30906ba9fe62df5e02c category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK CVE: NA Intel-SIG: commit b0c7459be067 x86/sgx: Wipe out EREMOVE from sgx_free_epc_page(). Backport for SGX virtualization support -------------------------------- EREMOVE takes a page and removes any association between that page and an enclave. It must be run on a page before it can be added into another enclave. Currently, EREMOVE is run as part of pages being freed into the SGX page allocator. It is not expected to fail, as it would indicate a use-after-free of EPC pages. Rather than add the page back to the pool of available EPC pages, the kernel intentionally leaks the page to avoid additional errors in the future. However, KVM does not track how guest pages are used, which means that SGX virtualization use of EREMOVE might fail. Specifically, it is legitimate that EREMOVE returns SGX_CHILD_PRESENT for EPC assigned to KVM guest, because KVM/kernel doesn't track SECS pages. To allow SGX/KVM to introduce a more permissive EREMOVE helper and to let the SGX virtualization code use the allocator directly, break out the EREMOVE call from the SGX page allocator. Rename the original sgx_free_epc_page() to sgx_encl_free_epc_page(), indicating that it is used to free an EPC page assigned to a host enclave. Replace sgx_free_epc_page() with sgx_encl_free_epc_page() in all call sites so there's no functional change. At the same time, improve the error message when EREMOVE fails, and add documentation to explain to the user what that failure means and to suggest to the user what to do when this bug happens in the case it happens. [ bp: Massage commit message, fix typos and sanitize text, simplify. ] Signed-off-by: Kai Huang Signed-off-by: Borislav Petkov Reviewed-by: Jarkko Sakkinen Link: https://lkml.kernel.org/r/20210325093057.122834-1-kai.huang@intel.com Signed-off-by: Fan Du Signed-off-by: Zhiquan Li --- Documentation/x86/sgx.rst | 25 +++++++++++++++++++++++++ arch/x86/kernel/cpu/sgx/encl.c | 31 ++++++++++++++++++++++++++----- arch/x86/kernel/cpu/sgx/encl.h | 1 + arch/x86/kernel/cpu/sgx/ioctl.c | 6 +++--- arch/x86/kernel/cpu/sgx/main.c | 14 +++++--------- arch/x86/kernel/cpu/sgx/sgx.h | 4 ++++ 6 files changed, 64 insertions(+), 17 deletions(-) diff --git a/Documentation/x86/sgx.rst b/Documentation/x86/sgx.rst index eaee1368b4fd..f90076e67cde 100644 --- a/Documentation/x86/sgx.rst +++ b/Documentation/x86/sgx.rst @@ -209,3 +209,28 @@ An application may be loaded into a container enclave which is specially configured with a library OS and run-time which permits the application to run. The enclave run-time and library OS work together to execute the application when a thread enters the enclave. + +Impact of Potential Kernel SGX Bugs +=================================== + +EPC leaks +--------- + +When EPC page leaks happen, a WARNING like this is shown in dmesg: + +"EREMOVE returned ... and an EPC page was leaked. SGX may become unusable..." + +This is effectively a kernel use-after-free of an EPC page, and due +to the way SGX works, the bug is detected at freeing. Rather than +adding the page back to the pool of available EPC pages, the kernel +intentionally leaks the page to avoid additional errors in the future. + +When this happens, the kernel will likely soon leak more EPC pages, and +SGX will likely become unusable because the memory available to SGX is +limited. However, while this may be fatal to SGX, the rest of the kernel +is unlikely to be impacted and should continue to work. + +As a result, when this happpens, user should stop running any new +SGX workloads, (or just any new workloads), and migrate all valuable +workloads. Although a machine reboot can recover all EPC memory, the bug +should be reported to Linux developers. diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 3aaa709b5054..0c9f6cdd34a4 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -212,7 +212,7 @@ static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page, ret = __sgx_encl_eldu(encl_page, epc_page, secs_page); if (ret) { - sgx_free_epc_page(epc_page); + sgx_encl_free_epc_page(epc_page); return ERR_PTR(ret); } @@ -538,7 +538,7 @@ void sgx_encl_release(struct kref *ref) if (sgx_unmark_page_reclaimable(entry->epc_page)) continue; - sgx_free_epc_page(entry->epc_page); + sgx_encl_free_epc_page(entry->epc_page); encl->secs_child_cnt--; entry->epc_page = NULL; } @@ -549,7 +549,7 @@ void sgx_encl_release(struct kref *ref) xa_destroy(&encl->page_array); if (!encl->secs_child_cnt && encl->secs.epc_page) { - sgx_free_epc_page(encl->secs.epc_page); + sgx_encl_free_epc_page(encl->secs.epc_page); encl->secs.epc_page = NULL; } @@ -557,7 +557,7 @@ void sgx_encl_release(struct kref *ref) va_page = list_first_entry(&encl->va_pages, struct sgx_va_page, list); list_del(&va_page->list); - sgx_free_epc_page(va_page->epc_page); + sgx_encl_free_epc_page(va_page->epc_page); kfree(va_page); } @@ -818,7 +818,7 @@ struct sgx_epc_page *sgx_alloc_va_page(void) ret = __epa(sgx_get_epc_virt_addr(epc_page)); if (ret) { WARN_ONCE(1, "EPA returned %d (0x%x)", ret, ret); - sgx_free_epc_page(epc_page); + sgx_encl_free_epc_page(epc_page); return ERR_PTR(-EFAULT); } @@ -867,3 +867,24 @@ bool sgx_va_page_full(struct sgx_va_page *va_page) return slot == SGX_VA_SLOT_COUNT; } + +/** + * sgx_encl_free_epc_page - free an EPC page assigned to an enclave + * @page: EPC page to be freed + * + * Free an EPC page assigned to an enclave. It does EREMOVE for the page, and + * only upon success, it puts the page back to free page list. Otherwise, it + * gives a WARNING to indicate page is leaked. + */ +void sgx_encl_free_epc_page(struct sgx_epc_page *page) +{ + int ret; + + WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED); + + ret = __eremove(sgx_get_epc_virt_addr(page)); + if (WARN_ONCE(ret, EREMOVE_ERROR_MESSAGE, ret, ret)) + return; + + sgx_free_epc_page(page); +} diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h index d8d30ccbef4c..6e74f85b6264 100644 --- a/arch/x86/kernel/cpu/sgx/encl.h +++ b/arch/x86/kernel/cpu/sgx/encl.h @@ -115,5 +115,6 @@ struct sgx_epc_page *sgx_alloc_va_page(void); unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page); void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset); bool sgx_va_page_full(struct sgx_va_page *va_page); +void sgx_encl_free_epc_page(struct sgx_epc_page *page); #endif /* _X86_ENCL_H */ diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c index 2e10367ea66c..354e309fcdb7 100644 --- a/arch/x86/kernel/cpu/sgx/ioctl.c +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -47,7 +47,7 @@ static void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page) encl->page_cnt--; if (va_page) { - sgx_free_epc_page(va_page->epc_page); + sgx_encl_free_epc_page(va_page->epc_page); list_del(&va_page->list); kfree(va_page); } @@ -117,7 +117,7 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs) return 0; err_out: - sgx_free_epc_page(encl->secs.epc_page); + sgx_encl_free_epc_page(encl->secs.epc_page); encl->secs.epc_page = NULL; err_out_backing: @@ -365,7 +365,7 @@ static int sgx_encl_add_page(struct sgx_encl *encl, unsigned long src, mmap_read_unlock(current->mm); err_out_free: - sgx_free_epc_page(epc_page); + sgx_encl_free_epc_page(epc_page); kfree(encl_page); return ret; diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 13a7599ce7d4..b227629b1e9c 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -294,7 +294,7 @@ static void sgx_reclaimer_write(struct sgx_epc_page *epc_page, sgx_encl_ewb(encl->secs.epc_page, &secs_backing); - sgx_free_epc_page(encl->secs.epc_page); + sgx_encl_free_epc_page(encl->secs.epc_page); encl->secs.epc_page = NULL; sgx_encl_put_backing(&secs_backing, true); @@ -609,19 +609,15 @@ struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim) * sgx_free_epc_page() - Free an EPC page * @page: an EPC page * - * Call EREMOVE for an EPC page and insert it back to the list of free pages. + * Put the EPC page back to the list of free pages. It's the caller's + * responsibility to make sure that the page is in uninitialized state. In other + * words, do EREMOVE, EWB or whatever operation is necessary before calling + * this function. */ void sgx_free_epc_page(struct sgx_epc_page *page) { struct sgx_epc_section *section = &sgx_epc_sections[page->section]; struct sgx_numa_node *node = section->node; - int ret; - - WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED); - - ret = __eremove(sgx_get_epc_virt_addr(page)); - if (WARN_ONCE(ret, "EREMOVE returned %d (0x%x)", ret, ret)) - return; spin_lock(&node->lock); diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h index 2a2b5c857451..65b2e88dbedd 100644 --- a/arch/x86/kernel/cpu/sgx/sgx.h +++ b/arch/x86/kernel/cpu/sgx/sgx.h @@ -13,6 +13,10 @@ #undef pr_fmt #define pr_fmt(fmt) "sgx: " fmt +#define EREMOVE_ERROR_MESSAGE \ + "EREMOVE returned %d (0x%x) and an EPC page was leaked. SGX may become unusable. " \ + "Refer to Documentation/x86/sgx.rst for more information." + #define SGX_MAX_EPC_SECTIONS 8 #define SGX_EEXTEND_BLOCK_SIZE 256 #define SGX_NR_TO_SCAN 16 -- Gitee From afb84915d55a274c8723c9d0ece90c63f28621e6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Mar 2021 20:22:20 +1300 Subject: [PATCH 005/674] x86/sgx: Add SGX_CHILD_PRESENT hardware error code mainline inclusion from mainline-5.13 commit 231d3dbdda192e3b3c7b79f4c3b0616f6c7f31b7 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK CVE: NA Intel-SIG: commit 231d3dbdda19 x86/sgx: Add SGX_CHILD_PRESENT hardware error code. Backport for SGX virtualization support -------------------------------- SGX driver can accurately track how enclave pages are used. This enables SECS to be specifically targeted and EREMOVE'd only after all child pages have been EREMOVE'd. This ensures that SGX driver will never encounter SGX_CHILD_PRESENT in normal operation. Virtual EPC is different. The host does not track how EPC pages are used by the guest, so it cannot guarantee EREMOVE success. It might, for instance, encounter a SECS with a non-zero child count. Add a definition of SGX_CHILD_PRESENT. It will be used exclusively by the SGX virtualization driver to handle recoverable EREMOVE errors when saniziting EPC pages after they are freed. Signed-off-by: Sean Christopherson Signed-off-by: Kai Huang Signed-off-by: Borislav Petkov Acked-by: Dave Hansen Acked-by: Jarkko Sakkinen Link: https://lkml.kernel.org/r/050b198e882afde7e6eba8e6a0d4da39161dbb5a.1616136308.git.kai.huang@intel.com Signed-off-by: Fan Du Signed-off-by: Zhiquan Li --- arch/x86/include/asm/sgx.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h index d96e5451d28a..14bb5f7e221c 100644 --- a/arch/x86/include/asm/sgx.h +++ b/arch/x86/include/asm/sgx.h @@ -31,12 +31,14 @@ * enum sgx_return_code - The return code type for ENCLS, ENCLU and ENCLV * %SGX_NOT_TRACKED: Previous ETRACK's shootdown sequence has not * been completed yet. + * %SGX_CHILD_PRESENT SECS has child pages present in the EPC. * %SGX_INVALID_EINITTOKEN: EINITTOKEN is invalid and enclave signer's * public key does not match IA32_SGXLEPUBKEYHASH. * %SGX_UNMASKED_EVENT: An unmasked event, e.g. INTR, was received */ enum sgx_return_code { SGX_NOT_TRACKED = 11, + SGX_CHILD_PRESENT = 13, SGX_INVALID_EINITTOKEN = 16, SGX_UNMASKED_EVENT = 128, }; -- Gitee From 4fca036fe49fc7b29c8c2fb4e67a5dc64041507c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Mar 2021 20:22:21 +1300 Subject: [PATCH 006/674] x86/sgx: Introduce virtual EPC for use by KVM guests mainline inclusion from mainline-5.13 commit 540745ddbc70eabdc7dbd3fcc00fe4fb17cd59ba category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK CVE: NA Intel-SIG: commit 540745ddbc70 x86/sgx: Introduce virtual EPC for use by KVM guests. Backport for SGX virtualization support -------------------------------- Add a misc device /dev/sgx_vepc to allow userspace to allocate "raw" Enclave Page Cache (EPC) without an associated enclave. The intended and only known use case for raw EPC allocation is to expose EPC to a KVM guest, hence the 'vepc' moniker, virt.{c,h} files and X86_SGX_KVM Kconfig. The SGX driver uses the misc device /dev/sgx_enclave to support userspace in creating an enclave. Each file descriptor returned from opening /dev/sgx_enclave represents an enclave. Unlike the SGX driver, KVM doesn't control how the guest uses the EPC, therefore EPC allocated to a KVM guest is not associated with an enclave, and /dev/sgx_enclave is not suitable for allocating EPC for a KVM guest. Having separate device nodes for the SGX driver and KVM virtual EPC also allows separate permission control for running host SGX enclaves and KVM SGX guests. To use /dev/sgx_vepc to allocate a virtual EPC instance with particular size, the hypervisor opens /dev/sgx_vepc, and uses mmap() with the intended size to get an address range of virtual EPC. Then it may use the address range to create one KVM memory slot as virtual EPC for a guest. Implement the "raw" EPC allocation in the x86 core-SGX subsystem via /dev/sgx_vepc rather than in KVM. Doing so has two major advantages: - Does not require changes to KVM's uAPI, e.g. EPC gets handled as just another memory backend for guests. - EPC management is wholly contained in the SGX subsystem, e.g. SGX does not have to export any symbols, changes to reclaim flows don't need to be routed through KVM, SGX's dirty laundry doesn't have to get aired out for the world to see, and so on and so forth. The virtual EPC pages allocated to guests are currently not reclaimable. Reclaiming an EPC page used by enclave requires a special reclaim mechanism separate from normal page reclaim, and that mechanism is not supported for virutal EPC pages. Due to the complications of handling reclaim conflicts between guest and host, reclaiming virtual EPC pages is significantly more complex than basic support for SGX virtualization. [ bp: - Massage commit message and comments - use cpu_feature_enabled() - vertically align struct members init - massage Virtual EPC clarification text - move Kconfig prompt to Virtualization ] Signed-off-by: Sean Christopherson Co-developed-by: Kai Huang Signed-off-by: Kai Huang Signed-off-by: Borislav Petkov Acked-by: Dave Hansen Acked-by: Jarkko Sakkinen Link: https://lkml.kernel.org/r/0c38ced8c8e5a69872db4d6a1c0dabd01e07cad7.1616136308.git.kai.huang@intel.com Signed-off-by: Fan Du Signed-off-by: Zhiquan Li --- Documentation/x86/sgx.rst | 16 ++ arch/x86/kernel/cpu/sgx/Makefile | 1 + arch/x86/kernel/cpu/sgx/sgx.h | 9 ++ arch/x86/kernel/cpu/sgx/virt.c | 259 +++++++++++++++++++++++++++++++ arch/x86/kvm/Kconfig | 12 ++ 5 files changed, 297 insertions(+) create mode 100644 arch/x86/kernel/cpu/sgx/virt.c diff --git a/Documentation/x86/sgx.rst b/Documentation/x86/sgx.rst index f90076e67cde..dd0ac96ff9ef 100644 --- a/Documentation/x86/sgx.rst +++ b/Documentation/x86/sgx.rst @@ -234,3 +234,19 @@ As a result, when this happpens, user should stop running any new SGX workloads, (or just any new workloads), and migrate all valuable workloads. Although a machine reboot can recover all EPC memory, the bug should be reported to Linux developers. + + +Virtual EPC +=========== + +The implementation has also a virtual EPC driver to support SGX enclaves +in guests. Unlike the SGX driver, an EPC page allocated by the virtual +EPC driver doesn't have a specific enclave associated with it. This is +because KVM doesn't track how a guest uses EPC pages. + +As a result, the SGX core page reclaimer doesn't support reclaiming EPC +pages allocated to KVM guests through the virtual EPC driver. If the +user wants to deploy SGX applications both on the host and in guests +on the same machine, the user should reserve enough EPC (by taking out +total virtual EPC size of all SGX VMs from the physical EPC size) for +host SGX applications so they can run with acceptable performance. diff --git a/arch/x86/kernel/cpu/sgx/Makefile b/arch/x86/kernel/cpu/sgx/Makefile index 91d3dc784a29..9c1656779b2a 100644 --- a/arch/x86/kernel/cpu/sgx/Makefile +++ b/arch/x86/kernel/cpu/sgx/Makefile @@ -3,3 +3,4 @@ obj-y += \ encl.o \ ioctl.o \ main.o +obj-$(CONFIG_X86_SGX_KVM) += virt.o diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h index 65b2e88dbedd..e4cbc71bf136 100644 --- a/arch/x86/kernel/cpu/sgx/sgx.h +++ b/arch/x86/kernel/cpu/sgx/sgx.h @@ -84,4 +84,13 @@ void sgx_mark_page_reclaimable(struct sgx_epc_page *page); int sgx_unmark_page_reclaimable(struct sgx_epc_page *page); struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim); +#ifdef CONFIG_X86_SGX_KVM +int __init sgx_vepc_init(void); +#else +static inline int __init sgx_vepc_init(void) +{ + return -ENODEV; +} +#endif + #endif /* _X86_SGX_H */ diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c new file mode 100644 index 000000000000..259cc46ad78c --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/virt.c @@ -0,0 +1,259 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Device driver to expose SGX enclave memory to KVM guests. + * + * Copyright(c) 2021 Intel Corporation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "encls.h" +#include "sgx.h" + +struct sgx_vepc { + struct xarray page_array; + struct mutex lock; +}; + +/* + * Temporary SECS pages that cannot be EREMOVE'd due to having child in other + * virtual EPC instances, and the lock to protect it. + */ +static struct mutex zombie_secs_pages_lock; +static struct list_head zombie_secs_pages; + +static int __sgx_vepc_fault(struct sgx_vepc *vepc, + struct vm_area_struct *vma, unsigned long addr) +{ + struct sgx_epc_page *epc_page; + unsigned long index, pfn; + int ret; + + WARN_ON(!mutex_is_locked(&vepc->lock)); + + /* Calculate index of EPC page in virtual EPC's page_array */ + index = vma->vm_pgoff + PFN_DOWN(addr - vma->vm_start); + + epc_page = xa_load(&vepc->page_array, index); + if (epc_page) + return 0; + + epc_page = sgx_alloc_epc_page(vepc, false); + if (IS_ERR(epc_page)) + return PTR_ERR(epc_page); + + ret = xa_err(xa_store(&vepc->page_array, index, epc_page, GFP_KERNEL)); + if (ret) + goto err_free; + + pfn = PFN_DOWN(sgx_get_epc_phys_addr(epc_page)); + + ret = vmf_insert_pfn(vma, addr, pfn); + if (ret != VM_FAULT_NOPAGE) { + ret = -EFAULT; + goto err_delete; + } + + return 0; + +err_delete: + xa_erase(&vepc->page_array, index); +err_free: + sgx_free_epc_page(epc_page); + return ret; +} + +static vm_fault_t sgx_vepc_fault(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + struct sgx_vepc *vepc = vma->vm_private_data; + int ret; + + mutex_lock(&vepc->lock); + ret = __sgx_vepc_fault(vepc, vma, vmf->address); + mutex_unlock(&vepc->lock); + + if (!ret) + return VM_FAULT_NOPAGE; + + if (ret == -EBUSY && (vmf->flags & FAULT_FLAG_ALLOW_RETRY)) { + mmap_read_unlock(vma->vm_mm); + return VM_FAULT_RETRY; + } + + return VM_FAULT_SIGBUS; +} + +const struct vm_operations_struct sgx_vepc_vm_ops = { + .fault = sgx_vepc_fault, +}; + +static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct sgx_vepc *vepc = file->private_data; + + if (!(vma->vm_flags & VM_SHARED)) + return -EINVAL; + + vma->vm_ops = &sgx_vepc_vm_ops; + /* Don't copy VMA in fork() */ + vma->vm_flags |= VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY; + vma->vm_private_data = vepc; + + return 0; +} + +static int sgx_vepc_free_page(struct sgx_epc_page *epc_page) +{ + int ret; + + /* + * Take a previously guest-owned EPC page and return it to the + * general EPC page pool. + * + * Guests can not be trusted to have left this page in a good + * state, so run EREMOVE on the page unconditionally. In the + * case that a guest properly EREMOVE'd this page, a superfluous + * EREMOVE is harmless. + */ + ret = __eremove(sgx_get_epc_virt_addr(epc_page)); + if (ret) { + /* + * Only SGX_CHILD_PRESENT is expected, which is because of + * EREMOVE'ing an SECS still with child, in which case it can + * be handled by EREMOVE'ing the SECS again after all pages in + * virtual EPC have been EREMOVE'd. See comments in below in + * sgx_vepc_release(). + * + * The user of virtual EPC (KVM) needs to guarantee there's no + * logical processor is still running in the enclave in guest, + * otherwise EREMOVE will get SGX_ENCLAVE_ACT which cannot be + * handled here. + */ + WARN_ONCE(ret != SGX_CHILD_PRESENT, EREMOVE_ERROR_MESSAGE, + ret, ret); + return ret; + } + + sgx_free_epc_page(epc_page); + + return 0; +} + +static int sgx_vepc_release(struct inode *inode, struct file *file) +{ + struct sgx_vepc *vepc = file->private_data; + struct sgx_epc_page *epc_page, *tmp, *entry; + unsigned long index; + + LIST_HEAD(secs_pages); + + xa_for_each(&vepc->page_array, index, entry) { + /* + * Remove all normal, child pages. sgx_vepc_free_page() + * will fail if EREMOVE fails, but this is OK and expected on + * SECS pages. Those can only be EREMOVE'd *after* all their + * child pages. Retries below will clean them up. + */ + if (sgx_vepc_free_page(entry)) + continue; + + xa_erase(&vepc->page_array, index); + } + + /* + * Retry EREMOVE'ing pages. This will clean up any SECS pages that + * only had children in this 'epc' area. + */ + xa_for_each(&vepc->page_array, index, entry) { + epc_page = entry; + /* + * An EREMOVE failure here means that the SECS page still + * has children. But, since all children in this 'sgx_vepc' + * have been removed, the SECS page must have a child on + * another instance. + */ + if (sgx_vepc_free_page(epc_page)) + list_add_tail(&epc_page->list, &secs_pages); + + xa_erase(&vepc->page_array, index); + } + + /* + * SECS pages are "pinned" by child pages, and "unpinned" once all + * children have been EREMOVE'd. A child page in this instance + * may have pinned an SECS page encountered in an earlier release(), + * creating a zombie. Since some children were EREMOVE'd above, + * try to EREMOVE all zombies in the hopes that one was unpinned. + */ + mutex_lock(&zombie_secs_pages_lock); + list_for_each_entry_safe(epc_page, tmp, &zombie_secs_pages, list) { + /* + * Speculatively remove the page from the list of zombies, + * if the page is successfully EREMOVE'd it will be added to + * the list of free pages. If EREMOVE fails, throw the page + * on the local list, which will be spliced on at the end. + */ + list_del(&epc_page->list); + + if (sgx_vepc_free_page(epc_page)) + list_add_tail(&epc_page->list, &secs_pages); + } + + if (!list_empty(&secs_pages)) + list_splice_tail(&secs_pages, &zombie_secs_pages); + mutex_unlock(&zombie_secs_pages_lock); + + kfree(vepc); + + return 0; +} + +static int sgx_vepc_open(struct inode *inode, struct file *file) +{ + struct sgx_vepc *vepc; + + vepc = kzalloc(sizeof(struct sgx_vepc), GFP_KERNEL); + if (!vepc) + return -ENOMEM; + mutex_init(&vepc->lock); + xa_init(&vepc->page_array); + + file->private_data = vepc; + + return 0; +} + +static const struct file_operations sgx_vepc_fops = { + .owner = THIS_MODULE, + .open = sgx_vepc_open, + .release = sgx_vepc_release, + .mmap = sgx_vepc_mmap, +}; + +static struct miscdevice sgx_vepc_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "sgx_vepc", + .nodename = "sgx_vepc", + .fops = &sgx_vepc_fops, +}; + +int __init sgx_vepc_init(void) +{ + /* SGX virtualization requires KVM to work */ + if (!cpu_feature_enabled(X86_FEATURE_VMX)) + return -ENODEV; + + INIT_LIST_HEAD(&zombie_secs_pages); + mutex_init(&zombie_secs_pages_lock); + + return misc_register(&sgx_vepc_dev); +} diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index f92dfd8ef10d..71fb38d83e4a 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -84,6 +84,18 @@ config KVM_INTEL To compile this as a module, choose M here: the module will be called kvm-intel. +config X86_SGX_KVM + bool "Software Guard eXtensions (SGX) Virtualization" + depends on X86_SGX && KVM_INTEL + help + + Enables KVM guests to create SGX enclaves. + + This includes support to expose "raw" unreclaimable enclave memory to + guests via a device node, e.g. /dev/sgx_vepc. + + If unsure, say N. + config KVM_AMD tristate "KVM for AMD processors support" depends on KVM -- Gitee From ecdadaa2be1be5ff54534b2df23a83185f4c4c99 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Mar 2021 20:22:58 +1300 Subject: [PATCH 007/674] x86/cpu/intel: Allow SGX virtualization without Launch Control support mainline inclusion from mainline-5.13 commit 332bfc7becf479de8a55864cc5ed0024baea28aa category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK CVE: NA Intel-SIG: commit 332bfc7becf4 x86/cpu/intel: Allow SGX virtualization without Launch Control support. Backport for SGX virtualization support -------------------------------- The kernel will currently disable all SGX support if the hardware does not support launch control. Make it more permissive to allow SGX virtualization on systems without Launch Control support. This will allow KVM to expose SGX to guests that have less-strict requirements on the availability of flexible launch control. Improve error message to distinguish between three cases. There are two cases where SGX support is completely disabled: 1) SGX has been disabled completely by the BIOS 2) SGX LC is locked by the BIOS. Bare-metal support is disabled because of LC unavailability. SGX virtualization is unavailable (because of Kconfig). One where it is partially available: 3) SGX LC is locked by the BIOS. Bare-metal support is disabled because of LC unavailability. SGX virtualization is supported. Signed-off-by: Sean Christopherson Co-developed-by: Kai Huang Signed-off-by: Kai Huang Signed-off-by: Borislav Petkov Acked-by: Jarkko Sakkinen Acked-by: Dave Hansen Link: https://lkml.kernel.org/r/b3329777076509b3b601550da288c8f3c406a865.1616136308.git.kai.huang@intel.com Signed-off-by: Fan Du Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/feat_ctl.c | 59 +++++++++++++++++++++++++--------- 1 file changed, 44 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c index 27533a6e04fa..da696eb4821a 100644 --- a/arch/x86/kernel/cpu/feat_ctl.c +++ b/arch/x86/kernel/cpu/feat_ctl.c @@ -104,8 +104,9 @@ early_param("nosgx", nosgx); void init_ia32_feat_ctl(struct cpuinfo_x86 *c) { + bool enable_sgx_kvm = false, enable_sgx_driver = false; bool tboot = tboot_enabled(); - bool enable_sgx; + bool enable_vmx; u64 msr; if (rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr)) { @@ -114,13 +115,19 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c) return; } - /* - * Enable SGX if and only if the kernel supports SGX and Launch Control - * is supported, i.e. disable SGX if the LE hash MSRs can't be written. - */ - enable_sgx = cpu_has(c, X86_FEATURE_SGX) && - cpu_has(c, X86_FEATURE_SGX_LC) && - IS_ENABLED(CONFIG_X86_SGX); + enable_vmx = cpu_has(c, X86_FEATURE_VMX) && + IS_ENABLED(CONFIG_KVM_INTEL); + + if (cpu_has(c, X86_FEATURE_SGX) && IS_ENABLED(CONFIG_X86_SGX)) { + /* + * Separate out SGX driver enabling from KVM. This allows KVM + * guests to use SGX even if the kernel SGX driver refuses to + * use it. This happens if flexible Launch Control is not + * available. + */ + enable_sgx_driver = cpu_has(c, X86_FEATURE_SGX_LC); + enable_sgx_kvm = enable_vmx && IS_ENABLED(CONFIG_X86_SGX_KVM); + } if (msr & FEAT_CTL_LOCKED) goto update_caps; @@ -136,15 +143,18 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c) * i.e. KVM is enabled, to avoid unnecessarily adding an attack vector * for the kernel, e.g. using VMX to hide malicious code. */ - if (cpu_has(c, X86_FEATURE_VMX) && IS_ENABLED(CONFIG_KVM_INTEL)) { + if (enable_vmx) { msr |= FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; if (tboot) msr |= FEAT_CTL_VMX_ENABLED_INSIDE_SMX; } - if (enable_sgx) - msr |= FEAT_CTL_SGX_ENABLED | FEAT_CTL_SGX_LC_ENABLED; + if (enable_sgx_kvm || enable_sgx_driver) { + msr |= FEAT_CTL_SGX_ENABLED; + if (enable_sgx_driver) + msr |= FEAT_CTL_SGX_LC_ENABLED; + } wrmsrl(MSR_IA32_FEAT_CTL, msr); @@ -167,10 +177,29 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c) } update_sgx: - if (!(msr & FEAT_CTL_SGX_ENABLED) || - !(msr & FEAT_CTL_SGX_LC_ENABLED) || !enable_sgx) { - if (enable_sgx) - pr_err_once("SGX disabled by BIOS\n"); + if (!(msr & FEAT_CTL_SGX_ENABLED)) { + if (enable_sgx_kvm || enable_sgx_driver) + pr_err_once("SGX disabled by BIOS.\n"); clear_cpu_cap(c, X86_FEATURE_SGX); + return; + } + + /* + * VMX feature bit may be cleared due to being disabled in BIOS, + * in which case SGX virtualization cannot be supported either. + */ + if (!cpu_has(c, X86_FEATURE_VMX) && enable_sgx_kvm) { + pr_err_once("SGX virtualization disabled due to lack of VMX.\n"); + enable_sgx_kvm = 0; + } + + if (!(msr & FEAT_CTL_SGX_LC_ENABLED) && enable_sgx_driver) { + if (!enable_sgx_kvm) { + pr_err_once("SGX Launch Control is locked. Disable SGX.\n"); + clear_cpu_cap(c, X86_FEATURE_SGX); + } else { + pr_err_once("SGX Launch Control is locked. Support SGX virtualization only.\n"); + clear_cpu_cap(c, X86_FEATURE_SGX_LC); + } } } -- Gitee From 90ec4287f6bfbc994b552351709f2e547b745453 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Fri, 19 Mar 2021 20:23:02 +1300 Subject: [PATCH 008/674] x86/sgx: Initialize virtual EPC driver even when SGX driver is disabled mainline inclusion from mainline-5.13 commit faa7d3e6f3b983a28bf0f88f82dcb1c162e61105 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK CVE: NA Intel-SIG: commit faa7d3e6f3b9 x86/sgx: Initialize virtual EPC driver even when SGX driver is disabled. Backport for SGX virtualization support -------------------------------- Modify sgx_init() to always try to initialize the virtual EPC driver, even if the SGX driver is disabled. The SGX driver might be disabled if SGX Launch Control is in locked mode, or not supported in the hardware at all. This allows (non-Linux) guests that support non-LC configurations to use SGX. [ bp: De-silli-fy the test. ] Signed-off-by: Kai Huang Signed-off-by: Borislav Petkov Reviewed-by: Sean Christopherson Acked-by: Jarkko Sakkinen Acked-by: Dave Hansen Link: https://lkml.kernel.org/r/d35d17a02bbf8feef83a536cec8b43746d4ea557.1616136308.git.kai.huang@intel.com Signed-off-by: Fan Du Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/main.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index b227629b1e9c..1c8a228b0104 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -743,8 +743,17 @@ static int __init sgx_init(void) goto err_page_cache; } + /* + * Always try to initialize the native *and* KVM drivers. + * The KVM driver is less picky than the native one and + * can function if the native one is not supported on the + * current system or fails to initialize. + * + * Error out only if both fail to initialize. + */ ret = sgx_drv_init(); - if (ret) + + if (sgx_vepc_init() && ret) goto err_kthread; return 0; -- Gitee From 42e4ea5524ab1d223cba74c8c8cde8367728b108 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Mar 2021 20:23:04 +1300 Subject: [PATCH 009/674] x86/sgx: Move ENCLS leaf definitions to sgx.h mainline inclusion from mainline-5.13 commit 9c55c78a73ce6e62a1d46ba6e4f242c23c29b812 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK CVE: NA Intel-SIG: commit 9c55c78a73ce x86/sgx: Move ENCLS leaf definitions to sgx.h. Backport for SGX virtualization support -------------------------------- Move the ENCLS leaf definitions to sgx.h so that they can be used by KVM. Signed-off-by: Sean Christopherson Signed-off-by: Kai Huang Signed-off-by: Borislav Petkov Acked-by: Jarkko Sakkinen Acked-by: Dave Hansen Link: https://lkml.kernel.org/r/2e6cd7c5c1ced620cfcd292c3c6c382827fde6b2.1616136308.git.kai.huang@intel.com Signed-off-by: Fan Du Signed-off-by: Zhiquan Li --- arch/x86/include/asm/sgx.h | 15 +++++++++++++++ arch/x86/kernel/cpu/sgx/encls.h | 15 --------------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h index 14bb5f7e221c..34f44238d1d1 100644 --- a/arch/x86/include/asm/sgx.h +++ b/arch/x86/include/asm/sgx.h @@ -27,6 +27,21 @@ /* The bitmask for the EPC section type. */ #define SGX_CPUID_EPC_MASK GENMASK(3, 0) +enum sgx_encls_function { + ECREATE = 0x00, + EADD = 0x01, + EINIT = 0x02, + EREMOVE = 0x03, + EDGBRD = 0x04, + EDGBWR = 0x05, + EEXTEND = 0x06, + ELDU = 0x08, + EBLOCK = 0x09, + EPA = 0x0A, + EWB = 0x0B, + ETRACK = 0x0C, +}; + /** * enum sgx_return_code - The return code type for ENCLS, ENCLU and ENCLV * %SGX_NOT_TRACKED: Previous ETRACK's shootdown sequence has not diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h index 443188fe7e70..be5c49689980 100644 --- a/arch/x86/kernel/cpu/sgx/encls.h +++ b/arch/x86/kernel/cpu/sgx/encls.h @@ -11,21 +11,6 @@ #include #include "sgx.h" -enum sgx_encls_function { - ECREATE = 0x00, - EADD = 0x01, - EINIT = 0x02, - EREMOVE = 0x03, - EDGBRD = 0x04, - EDGBWR = 0x05, - EEXTEND = 0x06, - ELDU = 0x08, - EBLOCK = 0x09, - EPA = 0x0A, - EWB = 0x0B, - ETRACK = 0x0C, -}; - /** * ENCLS_FAULT_FLAG - flag signifying an ENCLS return code is a trapnr * -- Gitee From f3a767a7cb95fd64fe8c5727ffce40d38c43bd93 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Mar 2021 20:23:05 +1300 Subject: [PATCH 010/674] x86/sgx: Add SGX2 ENCLS leaf definitions (EAUG, EMODPR and EMODT) mainline inclusion from mainline-5.13 commit 32ddda8e445df3de477db14d386fb3518042224a category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK CVE: NA Intel-SIG: commit 32ddda8e445d x86/sgx: Add SGX2 ENCLS leaf definitions (EAUG, EMODPR and EMODT). Backport for SGX virtualization support -------------------------------- Define the ENCLS leafs that are available with SGX2, also referred to as Enclave Dynamic Memory Management (EDMM). The leafs will be used by KVM to conditionally expose SGX2 capabilities to guests. Signed-off-by: Sean Christopherson Signed-off-by: Kai Huang Signed-off-by: Borislav Petkov Acked-by: Jarkko Sakkinen Acked-by: Dave Hansen Link: https://lkml.kernel.org/r/5f0970c251ebcc6d5add132f0d750cc753b7060f.1616136308.git.kai.huang@intel.com Signed-off-by: Fan Du Signed-off-by: Zhiquan Li --- arch/x86/include/asm/sgx.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h index 34f44238d1d1..3b025afec0a7 100644 --- a/arch/x86/include/asm/sgx.h +++ b/arch/x86/include/asm/sgx.h @@ -40,6 +40,9 @@ enum sgx_encls_function { EPA = 0x0A, EWB = 0x0B, ETRACK = 0x0C, + EAUG = 0x0D, + EMODPR = 0x0E, + EMODT = 0x0F, }; /** -- Gitee From 42413c8aded5b5c845dc7a975e6dcf26437f03d9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Mar 2021 20:23:06 +1300 Subject: [PATCH 011/674] x86/sgx: Add encls_faulted() helper mainline inclusion from mainline-5.13 commit a67136b458e5e63822b19c35794451122fe2bf3e category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK CVE: NA Intel-SIG: commit a67136b458e5 x86/sgx: Add encls_faulted() helper. Backport for SGX virtualization support -------------------------------- Add a helper to extract the fault indicator from an encoded ENCLS return value. SGX virtualization will also need to detect ENCLS faults. Signed-off-by: Sean Christopherson Signed-off-by: Kai Huang Signed-off-by: Borislav Petkov Acked-by: Jarkko Sakkinen Acked-by: Dave Hansen Link: https://lkml.kernel.org/r/c1f955898110de2f669da536fc6cf62e003dff88.1616136308.git.kai.huang@intel.com Signed-off-by: Fan Du Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/encls.h | 15 ++++++++++++++- arch/x86/kernel/cpu/sgx/ioctl.c | 2 +- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h index be5c49689980..9b204843b78d 100644 --- a/arch/x86/kernel/cpu/sgx/encls.h +++ b/arch/x86/kernel/cpu/sgx/encls.h @@ -40,6 +40,19 @@ } while (0); \ } +/* + * encls_faulted() - Check if an ENCLS leaf faulted given an error code + * @ret: the return value of an ENCLS leaf function call + * + * Return: + * - true: ENCLS leaf faulted. + * - false: Otherwise. + */ +static inline bool encls_faulted(int ret) +{ + return ret & ENCLS_FAULT_FLAG; +} + /** * encls_failed() - Check if an ENCLS function failed * @ret: the return value of an ENCLS function call @@ -50,7 +63,7 @@ */ static inline bool encls_failed(int ret) { - if (ret & ENCLS_FAULT_FLAG) + if (encls_faulted(ret)) return ENCLS_TRAPNR(ret) != X86_TRAP_PF; return !!ret; diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c index 354e309fcdb7..11e3f9635c24 100644 --- a/arch/x86/kernel/cpu/sgx/ioctl.c +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -568,7 +568,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct, } } - if (ret & ENCLS_FAULT_FLAG) { + if (encls_faulted(ret)) { if (encls_failed(ret)) ENCLS_WARN(ret, "EINIT"); -- Gitee From 74ecbf21604bc194537a72ddc330c8cc2ec8614d Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Fri, 19 Mar 2021 20:23:07 +1300 Subject: [PATCH 012/674] x86/sgx: Add helper to update SGX_LEPUBKEYHASHn MSRs mainline inclusion from mainline-5.13 commit 73916b6a0c714258f9c2619408a66c6696a761a7 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK CVE: NA Intel-SIG: commit 73916b6a0c71 x86/sgx: Add helper to update SGX_LEPUBKEYHASHn MSRs. Backport for SGX virtualization support -------------------------------- Add a helper to update SGX_LEPUBKEYHASHn MSRs. SGX virtualization also needs to update those MSRs based on guest's "virtual" SGX_LEPUBKEYHASHn before EINIT from guest. Signed-off-by: Kai Huang Signed-off-by: Borislav Petkov Acked-by: Dave Hansen Acked-by: Jarkko Sakkinen Link: https://lkml.kernel.org/r/dfb7cd39d4dd62ea27703b64afdd8bccb579f623.1616136308.git.kai.huang@intel.com Signed-off-by: Fan Du Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/ioctl.c | 5 ++--- arch/x86/kernel/cpu/sgx/main.c | 16 ++++++++++++++++ arch/x86/kernel/cpu/sgx/sgx.h | 2 ++ 3 files changed, 20 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c index 11e3f9635c24..7be9c064a640 100644 --- a/arch/x86/kernel/cpu/sgx/ioctl.c +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -495,7 +495,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct, void *token) { u64 mrsigner[4]; - int i, j, k; + int i, j; void *addr; int ret; @@ -544,8 +544,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct, preempt_disable(); - for (k = 0; k < 4; k++) - wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + k, mrsigner[k]); + sgx_update_lepubkeyhash(mrsigner); ret = __einit(sigstruct, token, addr); diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 1c8a228b0104..227f1e2ad9cf 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -727,6 +727,22 @@ static bool __init sgx_page_cache_init(void) return true; } +/* + * Update the SGX_LEPUBKEYHASH MSRs to the values specified by caller. + * Bare-metal driver requires to update them to hash of enclave's signer + * before EINIT. KVM needs to update them to guest's virtual MSR values + * before doing EINIT from guest. + */ +void sgx_update_lepubkeyhash(u64 *lepubkeyhash) +{ + int i; + + WARN_ON_ONCE(preemptible()); + + for (i = 0; i < 4; i++) + wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]); +} + static int __init sgx_init(void) { int ret; diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h index e4cbc71bf136..4628acec0009 100644 --- a/arch/x86/kernel/cpu/sgx/sgx.h +++ b/arch/x86/kernel/cpu/sgx/sgx.h @@ -93,4 +93,6 @@ static inline int __init sgx_vepc_init(void) } #endif +void sgx_update_lepubkeyhash(u64 *lepubkeyhash); + #endif /* _X86_SGX_H */ -- Gitee From 0d53491c285d1a148bb030a61592383e7bc8cb92 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Mar 2021 20:23:08 +1300 Subject: [PATCH 013/674] x86/sgx: Add helpers to expose ECREATE and EINIT to KVM mainline inclusion from mainline-5.13 commit d155030b1e7c0e448aab22a803f7a71ea2e117d7 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK CVE: NA Intel-SIG: commit d155030b1e7c x86/sgx: Add helpers to expose ECREATE and EINIT to KVM. Backport for SGX virtualization support -------------------------------- The host kernel must intercept ECREATE to impose policies on guests, and intercept EINIT to be able to write guest's virtual SGX_LEPUBKEYHASH MSR values to hardware before running guest's EINIT so it can run correctly according to hardware behavior. Provide wrappers around __ecreate() and __einit() to hide the ugliness of overloading the ENCLS return value to encode multiple error formats in a single int. KVM will trap-and-execute ECREATE and EINIT as part of SGX virtualization, and reflect ENCLS execution result to guest by setting up guest's GPRs, or on an exception, injecting the correct fault based on return value of __ecreate() and __einit(). Use host userspace addresses (provided by KVM based on guest physical address of ENCLS parameters) to execute ENCLS/EINIT when possible. Accesses to both EPC and memory originating from ENCLS are subject to segmentation and paging mechanisms. It's also possible to generate kernel mappings for ENCLS parameters by resolving PFN but using __uaccess_xx() is simpler. [ bp: Return early if the __user memory accesses fail, use cpu_feature_enabled(). ] Signed-off-by: Sean Christopherson Signed-off-by: Kai Huang Signed-off-by: Borislav Petkov Acked-by: Jarkko Sakkinen Link: https://lkml.kernel.org/r/20e09daf559aa5e9e680a0b4b5fba940f1bad86e.1616136308.git.kai.huang@intel.com Signed-off-by: Fan Du Signed-off-by: Zhiquan Li --- arch/x86/include/asm/sgx.h | 7 ++ arch/x86/kernel/cpu/sgx/virt.c | 117 +++++++++++++++++++++++++++++++++ 2 files changed, 124 insertions(+) diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h index 3b025afec0a7..954042e04102 100644 --- a/arch/x86/include/asm/sgx.h +++ b/arch/x86/include/asm/sgx.h @@ -365,4 +365,11 @@ struct sgx_sigstruct { * comment! */ +#ifdef CONFIG_X86_SGX_KVM +int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs, + int *trapnr); +int sgx_virt_einit(void __user *sigstruct, void __user *token, + void __user *secs, u64 *lepubkeyhash, int *trapnr); +#endif + #endif /* _ASM_X86_SGX_H */ diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c index 259cc46ad78c..7d221eac716a 100644 --- a/arch/x86/kernel/cpu/sgx/virt.c +++ b/arch/x86/kernel/cpu/sgx/virt.c @@ -257,3 +257,120 @@ int __init sgx_vepc_init(void) return misc_register(&sgx_vepc_dev); } + +/** + * sgx_virt_ecreate() - Run ECREATE on behalf of guest + * @pageinfo: Pointer to PAGEINFO structure + * @secs: Userspace pointer to SECS page + * @trapnr: trap number injected to guest in case of ECREATE error + * + * Run ECREATE on behalf of guest after KVM traps ECREATE for the purpose + * of enforcing policies of guest's enclaves, and return the trap number + * which should be injected to guest in case of any ECREATE error. + * + * Return: + * - 0: ECREATE was successful. + * - <0: on error. + */ +int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs, + int *trapnr) +{ + int ret; + + /* + * @secs is an untrusted, userspace-provided address. It comes from + * KVM and is assumed to be a valid pointer which points somewhere in + * userspace. This can fault and call SGX or other fault handlers when + * userspace mapping @secs doesn't exist. + * + * Add a WARN() to make sure @secs is already valid userspace pointer + * from caller (KVM), who should already have handled invalid pointer + * case (for instance, made by malicious guest). All other checks, + * such as alignment of @secs, are deferred to ENCLS itself. + */ + if (WARN_ON_ONCE(!access_ok(secs, PAGE_SIZE))) + return -EINVAL; + + __uaccess_begin(); + ret = __ecreate(pageinfo, (void *)secs); + __uaccess_end(); + + if (encls_faulted(ret)) { + *trapnr = ENCLS_TRAPNR(ret); + return -EFAULT; + } + + /* ECREATE doesn't return an error code, it faults or succeeds. */ + WARN_ON_ONCE(ret); + return 0; +} +EXPORT_SYMBOL_GPL(sgx_virt_ecreate); + +static int __sgx_virt_einit(void __user *sigstruct, void __user *token, + void __user *secs) +{ + int ret; + + /* + * Make sure all userspace pointers from caller (KVM) are valid. + * All other checks deferred to ENCLS itself. Also see comment + * for @secs in sgx_virt_ecreate(). + */ +#define SGX_EINITTOKEN_SIZE 304 + if (WARN_ON_ONCE(!access_ok(sigstruct, sizeof(struct sgx_sigstruct)) || + !access_ok(token, SGX_EINITTOKEN_SIZE) || + !access_ok(secs, PAGE_SIZE))) + return -EINVAL; + + __uaccess_begin(); + ret = __einit((void *)sigstruct, (void *)token, (void *)secs); + __uaccess_end(); + + return ret; +} + +/** + * sgx_virt_einit() - Run EINIT on behalf of guest + * @sigstruct: Userspace pointer to SIGSTRUCT structure + * @token: Userspace pointer to EINITTOKEN structure + * @secs: Userspace pointer to SECS page + * @lepubkeyhash: Pointer to guest's *virtual* SGX_LEPUBKEYHASH MSR values + * @trapnr: trap number injected to guest in case of EINIT error + * + * Run EINIT on behalf of guest after KVM traps EINIT. If SGX_LC is available + * in host, SGX driver may rewrite the hardware values at wish, therefore KVM + * needs to update hardware values to guest's virtual MSR values in order to + * ensure EINIT is executed with expected hardware values. + * + * Return: + * - 0: EINIT was successful. + * - <0: on error. + */ +int sgx_virt_einit(void __user *sigstruct, void __user *token, + void __user *secs, u64 *lepubkeyhash, int *trapnr) +{ + int ret; + + if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) { + ret = __sgx_virt_einit(sigstruct, token, secs); + } else { + preempt_disable(); + + sgx_update_lepubkeyhash(lepubkeyhash); + + ret = __sgx_virt_einit(sigstruct, token, secs); + preempt_enable(); + } + + /* Propagate up the error from the WARN_ON_ONCE in __sgx_virt_einit() */ + if (ret == -EINVAL) + return ret; + + if (encls_faulted(ret)) { + *trapnr = ENCLS_TRAPNR(ret); + return -EFAULT; + } + + return ret; +} +EXPORT_SYMBOL_GPL(sgx_virt_einit); -- Gitee From 6825a358d3e30c17aa88ed64ad06811a60aa9e3d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Mar 2021 20:23:09 +1300 Subject: [PATCH 014/674] x86/sgx: Move provisioning device creation out of SGX driver mainline inclusion from mainline-5.13 commit b3754e5d3da320af2bebb7a690002685c7f5c15c category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK CVE: NA Intel-SIG: commit b3754e5d3da3 x86/sgx: Move provisioning device creation out of SGX driver. Backport for SGX virtualization support -------------------------------- And extract sgx_set_attribute() out of sgx_ioc_enclave_provision() and export it as symbol for KVM to use. The provisioning key is sensitive. The SGX driver only allows to create an enclave which can access the provisioning key when the enclave creator has permission to open /dev/sgx_provision. It should apply to a VM as well, as the provisioning key is platform-specific, thus an unrestricted VM can also potentially compromise the provisioning key. Move the provisioning device creation out of sgx_drv_init() to sgx_init() as a preparation for adding SGX virtualization support, so that even if the SGX driver is not enabled due to flexible launch control not being available, SGX virtualization can still be enabled, and use it to restrict a VM's capability of being able to access the provisioning key. [ bp: Massage commit message. ] Signed-off-by: Sean Christopherson Signed-off-by: Kai Huang Signed-off-by: Borislav Petkov Reviewed-by: Jarkko Sakkinen Acked-by: Dave Hansen Link: https://lkml.kernel.org/r/0f4d044d621561f26d5f4ef73e8dc6cd18cc7e79.1616136308.git.kai.huang@intel.com Signed-off-by: Fan Du Signed-off-by: Zhiquan Li --- arch/x86/include/asm/sgx.h | 3 ++ arch/x86/kernel/cpu/sgx/driver.c | 17 ---------- arch/x86/kernel/cpu/sgx/ioctl.c | 16 ++------- arch/x86/kernel/cpu/sgx/main.c | 57 +++++++++++++++++++++++++++++++- 4 files changed, 61 insertions(+), 32 deletions(-) diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h index 954042e04102..a16e2c9154a3 100644 --- a/arch/x86/include/asm/sgx.h +++ b/arch/x86/include/asm/sgx.h @@ -372,4 +372,7 @@ int sgx_virt_einit(void __user *sigstruct, void __user *token, void __user *secs, u64 *lepubkeyhash, int *trapnr); #endif +int sgx_set_attribute(unsigned long *allowed_attributes, + unsigned int attribute_fd); + #endif /* _ASM_X86_SGX_H */ diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c index 8ce6d8371cfb..aa9b8b868867 100644 --- a/arch/x86/kernel/cpu/sgx/driver.c +++ b/arch/x86/kernel/cpu/sgx/driver.c @@ -136,10 +136,6 @@ static const struct file_operations sgx_encl_fops = { .get_unmapped_area = sgx_get_unmapped_area, }; -const struct file_operations sgx_provision_fops = { - .owner = THIS_MODULE, -}; - static struct miscdevice sgx_dev_enclave = { .minor = MISC_DYNAMIC_MINOR, .name = "sgx_enclave", @@ -147,13 +143,6 @@ static struct miscdevice sgx_dev_enclave = { .fops = &sgx_encl_fops, }; -static struct miscdevice sgx_dev_provision = { - .minor = MISC_DYNAMIC_MINOR, - .name = "sgx_provision", - .nodename = "sgx_provision", - .fops = &sgx_provision_fops, -}; - int __init sgx_drv_init(void) { unsigned int eax, ebx, ecx, edx; @@ -187,11 +176,5 @@ int __init sgx_drv_init(void) if (ret) return ret; - ret = misc_register(&sgx_dev_provision); - if (ret) { - misc_deregister(&sgx_dev_enclave); - return ret; - } - return 0; } diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c index 7be9c064a640..83df20e3e633 100644 --- a/arch/x86/kernel/cpu/sgx/ioctl.c +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -2,6 +2,7 @@ /* Copyright(c) 2016-20 Intel Corporation. */ #include +#include #include #include #include @@ -666,24 +667,11 @@ static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg) static long sgx_ioc_enclave_provision(struct sgx_encl *encl, void __user *arg) { struct sgx_enclave_provision params; - struct file *file; if (copy_from_user(¶ms, arg, sizeof(params))) return -EFAULT; - file = fget(params.fd); - if (!file) - return -EINVAL; - - if (file->f_op != &sgx_provision_fops) { - fput(file); - return -EINVAL; - } - - encl->attributes_mask |= SGX_ATTR_PROVISIONKEY; - - fput(file); - return 0; + return sgx_set_attribute(&encl->attributes_mask, params.fd); } long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 227f1e2ad9cf..92cb11dffd4c 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -1,14 +1,17 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright(c) 2016-20 Intel Corporation. */ +#include #include #include #include +#include #include #include #include #include #include +#include #include "driver.h" #include "encl.h" #include "encls.h" @@ -743,6 +746,51 @@ void sgx_update_lepubkeyhash(u64 *lepubkeyhash) wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]); } +const struct file_operations sgx_provision_fops = { + .owner = THIS_MODULE, +}; + +static struct miscdevice sgx_dev_provision = { + .minor = MISC_DYNAMIC_MINOR, + .name = "sgx_provision", + .nodename = "sgx_provision", + .fops = &sgx_provision_fops, +}; + +/** + * sgx_set_attribute() - Update allowed attributes given file descriptor + * @allowed_attributes: Pointer to allowed enclave attributes + * @attribute_fd: File descriptor for specific attribute + * + * Append enclave attribute indicated by file descriptor to allowed + * attributes. Currently only SGX_ATTR_PROVISIONKEY indicated by + * /dev/sgx_provision is supported. + * + * Return: + * -0: SGX_ATTR_PROVISIONKEY is appended to allowed_attributes + * -EINVAL: Invalid, or not supported file descriptor + */ +int sgx_set_attribute(unsigned long *allowed_attributes, + unsigned int attribute_fd) +{ + struct file *file; + + file = fget(attribute_fd); + if (!file) + return -EINVAL; + + if (file->f_op != &sgx_provision_fops) { + fput(file); + return -EINVAL; + } + + *allowed_attributes |= SGX_ATTR_PROVISIONKEY; + + fput(file); + return 0; +} +EXPORT_SYMBOL_GPL(sgx_set_attribute); + static int __init sgx_init(void) { int ret; @@ -759,6 +807,10 @@ static int __init sgx_init(void) goto err_page_cache; } + ret = misc_register(&sgx_dev_provision); + if (ret) + goto err_kthread; + /* * Always try to initialize the native *and* KVM drivers. * The KVM driver is less picky than the native one and @@ -770,10 +822,13 @@ static int __init sgx_init(void) ret = sgx_drv_init(); if (sgx_vepc_init() && ret) - goto err_kthread; + goto err_provision; return 0; +err_provision: + misc_deregister(&sgx_dev_provision); + err_kthread: kthread_stop(ksgxd_tsk); -- Gitee From d0cb049e9ddf3baa8e86ddde2321c1a489965b19 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 12 Apr 2021 16:21:33 +1200 Subject: [PATCH 015/674] KVM: x86: Export kvm_mmu_gva_to_gpa_{read,write}() for SGX (VMX) mainline inclusion from mainline-5.13 commit 54f958cdaa8c43c0e9b9ef850ae613a6e1bda44e category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK CVE: NA Intel-SIG: commit 54f958cdaa8c KVM: x86: Export kvm_mmu_gva_to_gpa_{read,write}() for SGX (VMX). Backport for SGX virtualization support -------------------------------- Export the gva_to_gpa() helpers for use by SGX virtualization when executing ENCLS[ECREATE] and ENCLS[EINIT] on behalf of the guest. To execute ECREATE and EINIT, KVM must obtain the GPA of the target Secure Enclave Control Structure (SECS) in order to get its corresponding HVA. Because the SECS must reside in the Enclave Page Cache (EPC), copying the SECS's data to a host-controlled buffer via existing exported helpers is not a viable option as the EPC is not readable or writable by the kernel. SGX virtualization will also use gva_to_gpa() to obtain HVAs for non-EPC pages in order to pass user pointers directly to ECREATE and EINIT, which avoids having to copy pages worth of data into the kernel. Signed-off-by: Sean Christopherson Acked-by: Jarkko Sakkinen Signed-off-by: Kai Huang Message-Id: <02f37708321bcdfaa2f9d41c8478affa6e84b04d.1618196135.git.kai.huang@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Fan Du Signed-off-by: Zhiquan Li --- arch/x86/kvm/x86.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1f857bc5ac6e..4595c7dbfa93 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6005,6 +6005,7 @@ gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); } +EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_read); gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception) @@ -6021,6 +6022,7 @@ gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, access |= PFERR_WRITE_MASK; return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); } +EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_write); /* uses this to access any guest's mapped memory without checking CPL */ gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, -- Gitee From a10fe21a2e72d80e26fb880169c49fea5316a9c9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 12 Apr 2021 16:21:34 +1200 Subject: [PATCH 016/674] KVM: x86: Define new #PF SGX error code bit mainline inclusion from mainline-5.13 commit 00e7646c3563d2f1a46a8fa1824c32373d77a8be category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK CVE: NA Intel-SIG: commit 00e7646c3563 KVM: x86: Define new #PF SGX error code bit. Backport for SGX virtualization support -------------------------------- Page faults that are signaled by the SGX Enclave Page Cache Map (EPCM), as opposed to the traditional IA32/EPT page tables, set an SGX bit in the error code to indicate that the #PF was induced by SGX. KVM will need to emulate this behavior as part of its trap-and-execute scheme for virtualizing SGX Launch Control, e.g. to inject SGX-induced #PFs if EINIT faults in the host, and to support live migration. Signed-off-by: Sean Christopherson Signed-off-by: Kai Huang Message-Id: Signed-off-by: Paolo Bonzini Signed-off-by: Fan Du Signed-off-by: Zhiquan Li --- arch/x86/include/asm/kvm_host.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a3bf158f5b12..80556cef8c48 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -216,6 +216,7 @@ enum x86_intercept_stage; #define PFERR_RSVD_BIT 3 #define PFERR_FETCH_BIT 4 #define PFERR_PK_BIT 5 +#define PFERR_SGX_BIT 15 #define PFERR_GUEST_FINAL_BIT 32 #define PFERR_GUEST_PAGE_BIT 33 @@ -225,6 +226,7 @@ enum x86_intercept_stage; #define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT) #define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT) #define PFERR_PK_MASK (1U << PFERR_PK_BIT) +#define PFERR_SGX_MASK (1U << PFERR_SGX_BIT) #define PFERR_GUEST_FINAL_MASK (1ULL << PFERR_GUEST_FINAL_BIT) #define PFERR_GUEST_PAGE_MASK (1ULL << PFERR_GUEST_PAGE_BIT) -- Gitee From 09afd8d6d01f3a8f5f9e84180796ee07ba93f5c2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 12 Apr 2021 16:21:35 +1200 Subject: [PATCH 017/674] KVM: x86: Add support for reverse CPUID lookup of scattered features mainline inclusion from mainline-5.13 commit 4e66c0cb79b732b01b82e094b21b8e22a20dff83 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK CVE: NA Intel-SIG: commit 4e66c0cb79b7 KVM: x86: Add support for reverse CPUID lookup of scattered features. Backport for SGX virtualization support -------------------------------- Introduce a scheme that allows KVM's CPUID magic to support features that are scattered in the kernel's feature words. To advertise and/or query guest support for CPUID-based features, KVM requires the bit number of an X86_FEATURE_* to match the bit number in its associated CPUID entry. For scattered features, this does not hold true. Add a framework to allow defining KVM-only words, stored in kvm_cpu_caps after the shared kernel caps, that can be used to gather the scattered feature bits by translating X86_FEATURE_* flags into their KVM-defined feature. Note, because reverse_cpuid_check() effectively forces kvm_cpu_caps lookups to be resolved at compile time, there is no runtime cost for translating from kernel-defined to kvm-defined features. More details here: https://lkml.kernel.org/r/X/jxCOLG+HUO4QlZ@google.com Signed-off-by: Sean Christopherson Signed-off-by: Kai Huang Message-Id: <16cad8d00475f67867fb36701fc7fb7c1ec86ce1.1618196135.git.kai.huang@intel.com> Signed-off-by: Fan Du Signed-off-by: Zhiquan Li --- arch/x86/kvm/cpuid.c | 32 +++++++++++++++++++++++++++----- arch/x86/kvm/cpuid.h | 39 ++++++++++++++++++++++++++++++++++----- 2 files changed, 61 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 41b0dc37720e..c5fa5c62e391 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -28,7 +28,7 @@ * Unlike "struct cpuinfo_x86.x86_capability", kvm_cpu_caps doesn't need to be * aligned to sizeof(unsigned long) because it's not accessed via bitops. */ -u32 kvm_cpu_caps[NCAPINTS] __read_mostly; +u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly; EXPORT_SYMBOL_GPL(kvm_cpu_caps); static u32 xstate_required_size(u64 xstate_bv, bool compacted) @@ -53,6 +53,7 @@ static u32 xstate_required_size(u64 xstate_bv, bool compacted) } #define F feature_bit +#define SF(name) (boot_cpu_has(X86_FEATURE_##name) ? F(name) : 0) static inline struct kvm_cpuid_entry2 *cpuid_entry2_find( struct kvm_cpuid_entry2 *entries, int nent, u32 function, u32 index) @@ -330,13 +331,13 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, return r; } -static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask) +/* Mask kvm_cpu_caps for @leaf with the raw CPUID capabilities of this CPU. */ +static __always_inline void __kvm_cpu_cap_mask(enum cpuid_leafs leaf) { const struct cpuid_reg cpuid = x86_feature_cpuid(leaf * 32); struct kvm_cpuid_entry2 entry; reverse_cpuid_check(leaf); - kvm_cpu_caps[leaf] &= mask; cpuid_count(cpuid.function, cpuid.index, &entry.eax, &entry.ebx, &entry.ecx, &entry.edx); @@ -344,6 +345,26 @@ static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask) kvm_cpu_caps[leaf] &= *__cpuid_entry_get_reg(&entry, cpuid.reg); } +static __always_inline void kvm_cpu_cap_init_scattered(enum cpuid_leafs leaf, u32 mask) +{ + /* Use kvm_cpu_cap_mask for non-scattered leafs. */ + BUILD_BUG_ON(leaf < NCAPINTS); + + kvm_cpu_caps[leaf] = mask; + + __kvm_cpu_cap_mask(leaf); +} + +static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask) +{ + /* Use kvm_cpu_cap_init_scattered for scattered leafs. */ + BUILD_BUG_ON(leaf >= NCAPINTS); + + kvm_cpu_caps[leaf] &= mask; + + __kvm_cpu_cap_mask(leaf); +} + void kvm_set_cpu_caps(void) { unsigned int f_nx = is_efer_nx() ? F(NX) : 0; @@ -354,12 +375,13 @@ void kvm_set_cpu_caps(void) unsigned int f_gbpages = 0; unsigned int f_lm = 0; #endif + memset(kvm_cpu_caps, 0, sizeof(kvm_cpu_caps)); - BUILD_BUG_ON(sizeof(kvm_cpu_caps) > + BUILD_BUG_ON(sizeof(kvm_cpu_caps) - (NKVMCAPINTS * sizeof(*kvm_cpu_caps)) > sizeof(boot_cpu_data.x86_capability)); memcpy(&kvm_cpu_caps, &boot_cpu_data.x86_capability, - sizeof(kvm_cpu_caps)); + sizeof(kvm_cpu_caps) - (NKVMCAPINTS * sizeof(*kvm_cpu_caps))); kvm_cpu_cap_mask(CPUID_1_ECX, /* diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index dc921d76e42e..2041e2f07347 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -7,7 +7,20 @@ #include #include -extern u32 kvm_cpu_caps[NCAPINTS] __read_mostly; +/* + * Hardware-defined CPUID leafs that are scattered in the kernel, but need to + * be directly used by KVM. Note, these word values conflict with the kernel's + * "bug" caps, but KVM doesn't use those. + */ +enum kvm_only_cpuid_leafs { + NR_KVM_CPU_CAPS = NCAPINTS, + + NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS, +}; + +#define X86_KVM_FEATURE(w, f) ((w)*32 + (f)) + +extern u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly; void kvm_set_cpu_caps(void); void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu); @@ -83,6 +96,20 @@ static __always_inline void reverse_cpuid_check(unsigned int x86_leaf) BUILD_BUG_ON(reverse_cpuid[x86_leaf].function == 0); } +/* + * Translate feature bits that are scattered in the kernel's cpufeatures word + * into KVM feature words that align with hardware's definitions. + */ +static __always_inline u32 __feature_translate(int x86_feature) +{ + return x86_feature; +} + +static __always_inline u32 __feature_leaf(int x86_feature) +{ + return __feature_translate(x86_feature) / 32; +} + /* * Retrieve the bit mask from an X86_FEATURE_* definition. Features contain * the hardware defined bit number (stored in bits 4:0) and a software defined @@ -91,6 +118,8 @@ static __always_inline void reverse_cpuid_check(unsigned int x86_leaf) */ static __always_inline u32 __feature_bit(int x86_feature) { + x86_feature = __feature_translate(x86_feature); + reverse_cpuid_check(x86_feature / 32); return 1 << (x86_feature & 31); } @@ -99,7 +128,7 @@ static __always_inline u32 __feature_bit(int x86_feature) static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned int x86_feature) { - unsigned int x86_leaf = x86_feature / 32; + unsigned int x86_leaf = __feature_leaf(x86_feature); reverse_cpuid_check(x86_leaf); return reverse_cpuid[x86_leaf]; @@ -291,7 +320,7 @@ static inline bool cpuid_fault_enabled(struct kvm_vcpu *vcpu) static __always_inline void kvm_cpu_cap_clear(unsigned int x86_feature) { - unsigned int x86_leaf = x86_feature / 32; + unsigned int x86_leaf = __feature_leaf(x86_feature); reverse_cpuid_check(x86_leaf); kvm_cpu_caps[x86_leaf] &= ~__feature_bit(x86_feature); @@ -299,7 +328,7 @@ static __always_inline void kvm_cpu_cap_clear(unsigned int x86_feature) static __always_inline void kvm_cpu_cap_set(unsigned int x86_feature) { - unsigned int x86_leaf = x86_feature / 32; + unsigned int x86_leaf = __feature_leaf(x86_feature); reverse_cpuid_check(x86_leaf); kvm_cpu_caps[x86_leaf] |= __feature_bit(x86_feature); @@ -307,7 +336,7 @@ static __always_inline void kvm_cpu_cap_set(unsigned int x86_feature) static __always_inline u32 kvm_cpu_cap_get(unsigned int x86_feature) { - unsigned int x86_leaf = x86_feature / 32; + unsigned int x86_leaf = __feature_leaf(x86_feature); reverse_cpuid_check(x86_leaf); return kvm_cpu_caps[x86_leaf] & __feature_bit(x86_feature); -- Gitee From c66f4510fb74765e574864d5144f149a4b059d02 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 12 Apr 2021 16:21:36 +1200 Subject: [PATCH 018/674] KVM: x86: Add reverse-CPUID lookup support for scattered SGX features mainline inclusion from mainline-5.13 commit 01de8682b32d3ed4f0136f7379e1e3ae2e563308 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK CVE: NA Intel-SIG: commit 01de8682b32d KVM: x86: Add reverse-CPUID lookup support for scattered SGX features. Backport for SGX virtualization support -------------------------------- Define a new KVM-only feature word for advertising and querying SGX sub-features in CPUID.0x12.0x0.EAX. Because SGX1 and SGX2 are scattered in the kernel's feature word, they need to be translated so that the bit numbers match those of hardware. Signed-off-by: Sean Christopherson Signed-off-by: Kai Huang Message-Id: Signed-off-by: Paolo Bonzini Signed-off-by: Fan Du Signed-off-by: Zhiquan Li --- arch/x86/kvm/cpuid.h | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 2041e2f07347..fbef5b730805 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -13,12 +13,17 @@ * "bug" caps, but KVM doesn't use those. */ enum kvm_only_cpuid_leafs { - NR_KVM_CPU_CAPS = NCAPINTS, + CPUID_12_EAX = NCAPINTS, + NR_KVM_CPU_CAPS, NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS, }; -#define X86_KVM_FEATURE(w, f) ((w)*32 + (f)) +#define KVM_X86_FEATURE(w, f) ((w)*32 + (f)) + +/* Intel-defined SGX sub-features, CPUID level 0x12 (EAX). */ +#define KVM_X86_FEATURE_SGX1 KVM_X86_FEATURE(CPUID_12_EAX, 0) +#define KVM_X86_FEATURE_SGX2 KVM_X86_FEATURE(CPUID_12_EAX, 1) extern u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly; void kvm_set_cpu_caps(void); @@ -76,6 +81,7 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_8000_0007_EBX] = {0x80000007, 0, CPUID_EBX}, [CPUID_7_EDX] = { 7, 0, CPUID_EDX}, [CPUID_7_1_EAX] = { 7, 1, CPUID_EAX}, + [CPUID_12_EAX] = {0x00000012, 0, CPUID_EAX}, }; /* @@ -102,6 +108,11 @@ static __always_inline void reverse_cpuid_check(unsigned int x86_leaf) */ static __always_inline u32 __feature_translate(int x86_feature) { + if (x86_feature == X86_FEATURE_SGX1) + return KVM_X86_FEATURE_SGX1; + else if (x86_feature == X86_FEATURE_SGX2) + return KVM_X86_FEATURE_SGX2; + return x86_feature; } -- Gitee From 08f41fc48faf19e8926e2d8fed0ae31967c63ed0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 12 Apr 2021 16:21:37 +1200 Subject: [PATCH 019/674] KVM: VMX: Add basic handling of VM-Exit from SGX enclave mainline inclusion from mainline-5.13 commit 3c0c2ad1ae75963c05bf89ec91918c6a53a72696 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK CVE: NA Intel-SIG: commit 3c0c2ad1ae75 KVM: VMX: Add basic handling of VM-Exit from SGX enclave Backport for SGX virtualization support -------------------------------- Add support for handling VM-Exits that originate from a guest SGX enclave. In SGX, an "enclave" is a new CPL3-only execution environment, wherein the CPU and memory state is protected by hardware to make the state inaccesible to code running outside of the enclave. When exiting an enclave due to an asynchronous event (from the perspective of the enclave), e.g. exceptions, interrupts, and VM-Exits, the enclave's state is automatically saved and scrubbed (the CPU loads synthetic state), and then reloaded when re-entering the enclave. E.g. after an instruction based VM-Exit from an enclave, vmcs.GUEST_RIP will not contain the RIP of the enclave instruction that trigered VM-Exit, but will instead point to a RIP in the enclave's untrusted runtime (the guest userspace code that coordinates entry/exit to/from the enclave). To help a VMM recognize and handle exits from enclaves, SGX adds bits to existing VMCS fields, VM_EXIT_REASON.VMX_EXIT_REASON_FROM_ENCLAVE and GUEST_INTERRUPTIBILITY_INFO.GUEST_INTR_STATE_ENCLAVE_INTR. Define the new architectural bits, and add a boolean to struct vcpu_vmx to cache VMX_EXIT_REASON_FROM_ENCLAVE. Clear the bit in exit_reason so that checks against exit_reason do not need to account for SGX, e.g. "if (exit_reason == EXIT_REASON_EXCEPTION_NMI)" continues to work. KVM is a largely a passive observer of the new bits, e.g. KVM needs to account for the bits when propagating information to a nested VMM, but otherwise doesn't need to act differently for the majority of VM-Exits from enclaves. The one scenario that is directly impacted is emulation, which is for all intents and purposes impossible[1] since KVM does not have access to the RIP or instruction stream that triggered the VM-Exit. The inability to emulate is a non-issue for KVM, as most instructions that might trigger VM-Exit unconditionally #UD in an enclave (before the VM-Exit check. For the few instruction that conditionally #UD, KVM either never sets the exiting control, e.g. PAUSE_EXITING[2], or sets it if and only if the feature is not exposed to the guest in order to inject a #UD, e.g. RDRAND_EXITING. But, because it is still possible for a guest to trigger emulation, e.g. MMIO, inject a #UD if KVM ever attempts emulation after a VM-Exit from an enclave. This is architecturally accurate for instruction VM-Exits, and for MMIO it's the least bad choice, e.g. it's preferable to killing the VM. In practice, only broken or particularly stupid guests should ever encounter this behavior. Add a WARN in skip_emulated_instruction to detect any attempt to modify the guest's RIP during an SGX enclave VM-Exit as all such flows should either be unreachable or must handle exits from enclaves before getting to skip_emulated_instruction. [1] Impossible for all practical purposes. Not truly impossible since KVM could implement some form of para-virtualization scheme. [2] PAUSE_LOOP_EXITING only affects CPL0 and enclaves exist only at CPL3, so we also don't need to worry about that interaction. Signed-off-by: Sean Christopherson Signed-off-by: Kai Huang Message-Id: <315f54a8507d09c292463ef29104e1d4c62e9090.1618196135.git.kai.huang@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Fan Du Signed-off-by: Zhiquan Li --- arch/x86/include/asm/vmx.h | 1 + arch/x86/include/uapi/asm/vmx.h | 1 + arch/x86/kvm/vmx/nested.c | 2 ++ arch/x86/kvm/vmx/vmx.c | 45 +++++++++++++++++++++++++++++++-- 4 files changed, 47 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index f8ba5289ecb0..c6f028bac3ff 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -371,6 +371,7 @@ enum vmcs_field { #define GUEST_INTR_STATE_MOV_SS 0x00000002 #define GUEST_INTR_STATE_SMI 0x00000004 #define GUEST_INTR_STATE_NMI 0x00000008 +#define GUEST_INTR_STATE_ENCLAVE_INTR 0x00000010 /* GUEST_ACTIVITY_STATE flags */ #define GUEST_ACTIVITY_ACTIVE 0 diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index b8ff9e8ac0d5..df6707a76a3d 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -27,6 +27,7 @@ #define VMX_EXIT_REASONS_FAILED_VMENTRY 0x80000000 +#define VMX_EXIT_REASONS_SGX_ENCLAVE_MODE 0x08000000 #define EXIT_REASON_EXCEPTION_NMI 0 #define EXIT_REASON_EXTERNAL_INTERRUPT 1 diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 0c2389d0fdaf..0f871faeff1a 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4142,6 +4142,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, { /* update exit information fields: */ vmcs12->vm_exit_reason = vm_exit_reason; + if (to_vmx(vcpu)->exit_reason.enclave_mode) + vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE; vmcs12->exit_qualification = exit_qualification; vmcs12->vm_exit_intr_info = exit_intr_info; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 79889d27aa5b..e6f8c3f52ffa 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1630,12 +1630,25 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len) { + /* + * Emulation of instructions in SGX enclaves is impossible as RIP does + * not point tthe failing instruction, and even if it did, the code + * stream is inaccessible. Inject #UD instead of exiting to userspace + * so that guest userspace can't DoS the guest simply by triggering + * emulation (enclaves are CPL3 only). + */ + if (to_vmx(vcpu)->exit_reason.enclave_mode) { + kvm_queue_exception(vcpu, UD_VECTOR); + return false; + } return true; } static int skip_emulated_instruction(struct kvm_vcpu *vcpu) { + union vmx_exit_reason exit_reason = to_vmx(vcpu)->exit_reason; unsigned long rip, orig_rip; + u32 instr_len; /* * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on @@ -1646,9 +1659,33 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu) * i.e. we end up advancing IP with some random value. */ if (!static_cpu_has(X86_FEATURE_HYPERVISOR) || - to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) { + exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) { + instr_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + + /* + * Emulating an enclave's instructions isn't supported as KVM + * cannot access the enclave's memory or its true RIP, e.g. the + * vmcs.GUEST_RIP points at the exit point of the enclave, not + * the RIP that actually triggered the VM-Exit. But, because + * most instructions that cause VM-Exit will #UD in an enclave, + * most instruction-based VM-Exits simply do not occur. + * + * There are a few exceptions, notably the debug instructions + * INT1ICEBRK and INT3, as they are allowed in debug enclaves + * and generate #DB/#BP as expected, which KVM might intercept. + * But again, the CPU does the dirty work and saves an instr + * length of zero so VMMs don't shoot themselves in the foot. + * WARN if KVM tries to skip a non-zero length instruction on + * a VM-Exit from an enclave. + */ + if (!instr_len) + goto rip_updated; + + WARN(exit_reason.enclave_mode, + "KVM: skipping instruction after SGX enclave VM-Exit"); + orig_rip = kvm_rip_read(vcpu); - rip = orig_rip + vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + rip = orig_rip + instr_len; #ifdef CONFIG_X86_64 /* * We need to mask out the high 32 bits of RIP if not in 64-bit @@ -1664,6 +1701,7 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu) return 0; } +rip_updated: /* skipping an emulated instruction also counts */ vmx_set_interrupt_shadow(vcpu, 0); @@ -5508,6 +5546,9 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) { gpa_t gpa; + if (!vmx_can_emulate_instruction(vcpu, NULL, 0)) + return 1; + /* * A nested guest cannot optimize MMIO vmexits, because we have an * nGPA here instead of the required GPA. -- Gitee From e4e22234d060efeaae35860cf411bdf68ba6bfe7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 12 Apr 2021 16:21:38 +1200 Subject: [PATCH 020/674] KVM: VMX: Frame in ENCLS handler for SGX virtualization mainline inclusion from mainline-5.13 commit 9798adbc04cf1b14325dc7e2c882639693516a69 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK CVE: NA Intel-SIG: commit 9798adbc04cf KVM: VMX: Frame in ENCLS handler for SGX virtualization. Backport for SGX virtualization support -------------------------------- Introduce sgx.c and sgx.h, along with the framework for handling ENCLS VM-Exits. Add a bool, enable_sgx, that will eventually be wired up to a module param to control whether or not SGX virtualization is enabled at runtime. Signed-off-by: Sean Christopherson Signed-off-by: Kai Huang Message-Id: <1c782269608b2f5e1034be450f375a8432fb705d.1618196135.git.kai.huang@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Fan Du Signed-off-by: Zhiquan Li --- arch/x86/kvm/Makefile | 2 ++ arch/x86/kvm/vmx/sgx.c | 50 ++++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/vmx/sgx.h | 15 +++++++++++++ arch/x86/kvm/vmx/vmx.c | 9 +++++--- 4 files changed, 73 insertions(+), 3 deletions(-) create mode 100644 arch/x86/kvm/vmx/sgx.c create mode 100644 arch/x86/kvm/vmx/sgx.h diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index b804444e16d4..1c6c9adcb730 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -20,6 +20,8 @@ kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \ kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \ vmx/evmcs.o vmx/nested.o vmx/posted_intr.o +kvm-intel-$(CONFIG_X86_SGX_KVM) += vmx/sgx.o + kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o svm/sev.o obj-$(CONFIG_KVM) += kvm.o diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c new file mode 100644 index 000000000000..d874eb180b7d --- /dev/null +++ b/arch/x86/kvm/vmx/sgx.c @@ -0,0 +1,50 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2021 Intel Corporation. */ + +#include + +#include "cpuid.h" +#include "kvm_cache_regs.h" +#include "sgx.h" +#include "vmx.h" +#include "x86.h" + +bool __read_mostly enable_sgx; + +static inline bool encls_leaf_enabled_in_guest(struct kvm_vcpu *vcpu, u32 leaf) +{ + if (!enable_sgx || !guest_cpuid_has(vcpu, X86_FEATURE_SGX)) + return false; + + if (leaf >= ECREATE && leaf <= ETRACK) + return guest_cpuid_has(vcpu, X86_FEATURE_SGX1); + + if (leaf >= EAUG && leaf <= EMODT) + return guest_cpuid_has(vcpu, X86_FEATURE_SGX2); + + return false; +} + +static inline bool sgx_enabled_in_guest_bios(struct kvm_vcpu *vcpu) +{ + const u64 bits = FEAT_CTL_SGX_ENABLED | FEAT_CTL_LOCKED; + + return (to_vmx(vcpu)->msr_ia32_feature_control & bits) == bits; +} + +int handle_encls(struct kvm_vcpu *vcpu) +{ + u32 leaf = (u32)kvm_rax_read(vcpu); + + if (!encls_leaf_enabled_in_guest(vcpu, leaf)) { + kvm_queue_exception(vcpu, UD_VECTOR); + } else if (!sgx_enabled_in_guest_bios(vcpu)) { + kvm_inject_gp(vcpu, 0); + } else { + WARN(1, "KVM: unexpected exit on ENCLS[%u]", leaf); + vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; + vcpu->run->hw.hardware_exit_reason = EXIT_REASON_ENCLS; + return 0; + } + return 1; +} diff --git a/arch/x86/kvm/vmx/sgx.h b/arch/x86/kvm/vmx/sgx.h new file mode 100644 index 000000000000..6e17ecd4aca3 --- /dev/null +++ b/arch/x86/kvm/vmx/sgx.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_SGX_H +#define __KVM_X86_SGX_H + +#include + +#ifdef CONFIG_X86_SGX_KVM +extern bool __read_mostly enable_sgx; + +int handle_encls(struct kvm_vcpu *vcpu); +#else +#define enable_sgx 0 +#endif + +#endif /* __KVM_X86_SGX_H */ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e6f8c3f52ffa..578dc24b6195 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -58,6 +58,7 @@ #include "mmu.h" #include "nested.h" #include "pmu.h" +#include "sgx.h" #include "trace.h" #include "vmcs.h" #include "vmcs12.h" @@ -5800,16 +5801,18 @@ static int handle_vmx_instruction(struct kvm_vcpu *vcpu) return 1; } +#ifndef CONFIG_X86_SGX_KVM static int handle_encls(struct kvm_vcpu *vcpu) { /* - * SGX virtualization is not yet supported. There is no software - * enable bit for SGX, so we have to trap ENCLS and inject a #UD - * to prevent the guest from executing ENCLS. + * SGX virtualization is disabled. There is no software enable bit for + * SGX, so KVM intercepts all ENCLS leafs and injects a #UD to prevent + * the guest from executing ENCLS (when SGX is supported by hardware). */ kvm_queue_exception(vcpu, UD_VECTOR); return 1; } +#endif /* CONFIG_X86_SGX_KVM */ /* * The exit handlers return 1 if the exit was handled fully and guest execution -- Gitee From 6ad6cc5bd16c19b3d0680fb0a95a8e2282dfd825 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 12 Apr 2021 16:21:39 +1200 Subject: [PATCH 021/674] KVM: VMX: Add SGX ENCLS[ECREATE] handler to enforce CPUID restrictions mainline inclusion from mainline-5.13 commit 70210c044b4ea8f05e93ec62abc30cab4233ec88 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK CVE: NA Intel-SIG: commit 70210c044b4e KVM: VMX: Add SGX ENCLS[ECREATE] handler to enforce CPUID restrictions. Backport for SGX virtualization support -------------------------------- Add an ECREATE handler that will be used to intercept ECREATE for the purpose of enforcing and enclave's MISCSELECT, ATTRIBUTES and XFRM, i.e. to allow userspace to restrict SGX features via CPUID. ECREATE will be intercepted when any of the aforementioned masks diverges from hardware in order to enforce the desired CPUID model, i.e. inject #GP if the guest attempts to set a bit that hasn't been enumerated as allowed-1 in CPUID. Note, access to the PROVISIONKEY is not yet supported. Signed-off-by: Sean Christopherson Co-developed-by: Kai Huang Signed-off-by: Kai Huang Message-Id: Signed-off-by: Paolo Bonzini Signed-off-by: Fan Du Signed-off-by: Zhiquan Li --- arch/x86/include/asm/kvm_host.h | 3 + arch/x86/kvm/vmx/sgx.c | 275 ++++++++++++++++++++++++++++++++ 2 files changed, 278 insertions(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 80556cef8c48..559063fd8907 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -994,6 +994,9 @@ struct kvm_arch { bool bus_lock_detection_enabled; + /* Guest can access the SGX PROVISIONKEY. */ + bool sgx_provisioning_allowed; + /* Deflect RDMSR and WRMSR to user space when they trigger a #GP */ u32 user_space_msr_mask; diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c index d874eb180b7d..2cf9d7cf818c 100644 --- a/arch/x86/kvm/vmx/sgx.c +++ b/arch/x86/kvm/vmx/sgx.c @@ -11,6 +11,279 @@ bool __read_mostly enable_sgx; +/* + * ENCLS's memory operands use a fixed segment (DS) and a fixed + * address size based on the mode. Related prefixes are ignored. + */ +static int sgx_get_encls_gva(struct kvm_vcpu *vcpu, unsigned long offset, + int size, int alignment, gva_t *gva) +{ + struct kvm_segment s; + bool fault; + + /* Skip vmcs.GUEST_DS retrieval for 64-bit mode to avoid VMREADs. */ + *gva = offset; + if (!is_long_mode(vcpu)) { + vmx_get_segment(vcpu, &s, VCPU_SREG_DS); + *gva += s.base; + } + + if (!IS_ALIGNED(*gva, alignment)) { + fault = true; + } else if (likely(is_long_mode(vcpu))) { + fault = is_noncanonical_address(*gva, vcpu); + } else { + *gva &= 0xffffffff; + fault = (s.unusable) || + (s.type != 2 && s.type != 3) || + (*gva > s.limit) || + ((s.base != 0 || s.limit != 0xffffffff) && + (((u64)*gva + size - 1) > s.limit + 1)); + } + if (fault) + kvm_inject_gp(vcpu, 0); + return fault ? -EINVAL : 0; +} + +static void sgx_handle_emulation_failure(struct kvm_vcpu *vcpu, u64 addr, + unsigned int size) +{ + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 2; + vcpu->run->internal.data[0] = addr; + vcpu->run->internal.data[1] = size; +} + +static int sgx_read_hva(struct kvm_vcpu *vcpu, unsigned long hva, void *data, + unsigned int size) +{ + if (__copy_from_user(data, (void __user *)hva, size)) { + sgx_handle_emulation_failure(vcpu, hva, size); + return -EFAULT; + } + + return 0; +} + +static int sgx_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t gva, bool write, + gpa_t *gpa) +{ + struct x86_exception ex; + + if (write) + *gpa = kvm_mmu_gva_to_gpa_write(vcpu, gva, &ex); + else + *gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, &ex); + + if (*gpa == UNMAPPED_GVA) { + kvm_inject_emulated_page_fault(vcpu, &ex); + return -EFAULT; + } + + return 0; +} + +static int sgx_gpa_to_hva(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned long *hva) +{ + *hva = kvm_vcpu_gfn_to_hva(vcpu, PFN_DOWN(gpa)); + if (kvm_is_error_hva(*hva)) { + sgx_handle_emulation_failure(vcpu, gpa, 1); + return -EFAULT; + } + + *hva |= gpa & ~PAGE_MASK; + + return 0; +} + +static int sgx_inject_fault(struct kvm_vcpu *vcpu, gva_t gva, int trapnr) +{ + struct x86_exception ex; + + /* + * A non-EPCM #PF indicates a bad userspace HVA. This *should* check + * for PFEC.SGX and not assume any #PF on SGX2 originated in the EPC, + * but the error code isn't (yet) plumbed through the ENCLS helpers. + */ + if (trapnr == PF_VECTOR && !boot_cpu_has(X86_FEATURE_SGX2)) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + return 0; + } + + /* + * If the guest thinks it's running on SGX2 hardware, inject an SGX + * #PF if the fault matches an EPCM fault signature (#GP on SGX1, + * #PF on SGX2). The assumption is that EPCM faults are much more + * likely than a bad userspace address. + */ + if ((trapnr == PF_VECTOR || !boot_cpu_has(X86_FEATURE_SGX2)) && + guest_cpuid_has(vcpu, X86_FEATURE_SGX2)) { + memset(&ex, 0, sizeof(ex)); + ex.vector = PF_VECTOR; + ex.error_code = PFERR_PRESENT_MASK | PFERR_WRITE_MASK | + PFERR_SGX_MASK; + ex.address = gva; + ex.error_code_valid = true; + ex.nested_page_fault = false; + kvm_inject_page_fault(vcpu, &ex); + } else { + kvm_inject_gp(vcpu, 0); + } + return 1; +} + +static int __handle_encls_ecreate(struct kvm_vcpu *vcpu, + struct sgx_pageinfo *pageinfo, + unsigned long secs_hva, + gva_t secs_gva) +{ + struct sgx_secs *contents = (struct sgx_secs *)pageinfo->contents; + struct kvm_cpuid_entry2 *sgx_12_0, *sgx_12_1; + u64 attributes, xfrm, size; + u32 miscselect; + u8 max_size_log2; + int trapnr, ret; + + sgx_12_0 = kvm_find_cpuid_entry(vcpu, 0x12, 0); + sgx_12_1 = kvm_find_cpuid_entry(vcpu, 0x12, 1); + if (!sgx_12_0 || !sgx_12_1) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + return 0; + } + + miscselect = contents->miscselect; + attributes = contents->attributes; + xfrm = contents->xfrm; + size = contents->size; + + /* Enforce restriction of access to the PROVISIONKEY. */ + if (!vcpu->kvm->arch.sgx_provisioning_allowed && + (attributes & SGX_ATTR_PROVISIONKEY)) { + if (sgx_12_1->eax & SGX_ATTR_PROVISIONKEY) + pr_warn_once("KVM: SGX PROVISIONKEY advertised but not allowed\n"); + kvm_inject_gp(vcpu, 0); + return 1; + } + + /* Enforce CPUID restrictions on MISCSELECT, ATTRIBUTES and XFRM. */ + if ((u32)miscselect & ~sgx_12_0->ebx || + (u32)attributes & ~sgx_12_1->eax || + (u32)(attributes >> 32) & ~sgx_12_1->ebx || + (u32)xfrm & ~sgx_12_1->ecx || + (u32)(xfrm >> 32) & ~sgx_12_1->edx) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + /* Enforce CPUID restriction on max enclave size. */ + max_size_log2 = (attributes & SGX_ATTR_MODE64BIT) ? sgx_12_0->edx >> 8 : + sgx_12_0->edx; + if (size >= BIT_ULL(max_size_log2)) + kvm_inject_gp(vcpu, 0); + + /* + * sgx_virt_ecreate() returns: + * 1) 0: ECREATE was successful + * 2) -EFAULT: ECREATE was run but faulted, and trapnr was set to the + * exception number. + * 3) -EINVAL: access_ok() on @secs_hva failed. This should never + * happen as KVM checks host addresses at memslot creation. + * sgx_virt_ecreate() has already warned in this case. + */ + ret = sgx_virt_ecreate(pageinfo, (void __user *)secs_hva, &trapnr); + if (!ret) + return kvm_skip_emulated_instruction(vcpu); + if (ret == -EFAULT) + return sgx_inject_fault(vcpu, secs_gva, trapnr); + + return ret; +} + +static int handle_encls_ecreate(struct kvm_vcpu *vcpu) +{ + gva_t pageinfo_gva, secs_gva; + gva_t metadata_gva, contents_gva; + gpa_t metadata_gpa, contents_gpa, secs_gpa; + unsigned long metadata_hva, contents_hva, secs_hva; + struct sgx_pageinfo pageinfo; + struct sgx_secs *contents; + struct x86_exception ex; + int r; + + if (sgx_get_encls_gva(vcpu, kvm_rbx_read(vcpu), 32, 32, &pageinfo_gva) || + sgx_get_encls_gva(vcpu, kvm_rcx_read(vcpu), 4096, 4096, &secs_gva)) + return 1; + + /* + * Copy the PAGEINFO to local memory, its pointers need to be + * translated, i.e. we need to do a deep copy/translate. + */ + r = kvm_read_guest_virt(vcpu, pageinfo_gva, &pageinfo, + sizeof(pageinfo), &ex); + if (r == X86EMUL_PROPAGATE_FAULT) { + kvm_inject_emulated_page_fault(vcpu, &ex); + return 1; + } else if (r != X86EMUL_CONTINUE) { + sgx_handle_emulation_failure(vcpu, pageinfo_gva, + sizeof(pageinfo)); + return 0; + } + + if (sgx_get_encls_gva(vcpu, pageinfo.metadata, 64, 64, &metadata_gva) || + sgx_get_encls_gva(vcpu, pageinfo.contents, 4096, 4096, + &contents_gva)) + return 1; + + /* + * Translate the SECINFO, SOURCE and SECS pointers from GVA to GPA. + * Resume the guest on failure to inject a #PF. + */ + if (sgx_gva_to_gpa(vcpu, metadata_gva, false, &metadata_gpa) || + sgx_gva_to_gpa(vcpu, contents_gva, false, &contents_gpa) || + sgx_gva_to_gpa(vcpu, secs_gva, true, &secs_gpa)) + return 1; + + /* + * ...and then to HVA. The order of accesses isn't architectural, i.e. + * KVM doesn't have to fully process one address at a time. Exit to + * userspace if a GPA is invalid. + */ + if (sgx_gpa_to_hva(vcpu, metadata_gpa, &metadata_hva) || + sgx_gpa_to_hva(vcpu, contents_gpa, &contents_hva) || + sgx_gpa_to_hva(vcpu, secs_gpa, &secs_hva)) + return 0; + + /* + * Copy contents into kernel memory to prevent TOCTOU attack. E.g. the + * guest could do ECREATE w/ SECS.SGX_ATTR_PROVISIONKEY=0, and + * simultaneously set SGX_ATTR_PROVISIONKEY to bypass the check to + * enforce restriction of access to the PROVISIONKEY. + */ + contents = (struct sgx_secs *)__get_free_page(GFP_KERNEL_ACCOUNT); + if (!contents) + return -ENOMEM; + + /* Exit to userspace if copying from a host userspace address fails. */ + if (sgx_read_hva(vcpu, contents_hva, (void *)contents, PAGE_SIZE)) { + free_page((unsigned long)contents); + return 0; + } + + pageinfo.metadata = metadata_hva; + pageinfo.contents = (u64)contents; + + r = __handle_encls_ecreate(vcpu, &pageinfo, secs_hva, secs_gva); + + free_page((unsigned long)contents); + + return r; +} + static inline bool encls_leaf_enabled_in_guest(struct kvm_vcpu *vcpu, u32 leaf) { if (!enable_sgx || !guest_cpuid_has(vcpu, X86_FEATURE_SGX)) @@ -41,6 +314,8 @@ int handle_encls(struct kvm_vcpu *vcpu) } else if (!sgx_enabled_in_guest_bios(vcpu)) { kvm_inject_gp(vcpu, 0); } else { + if (leaf == ECREATE) + return handle_encls_ecreate(vcpu); WARN(1, "KVM: unexpected exit on ENCLS[%u]", leaf); vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; vcpu->run->hw.hardware_exit_reason = EXIT_REASON_ENCLS; -- Gitee From 1766b14ea2c20f12432e24e2cf3fc6f403d9445c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 12 Apr 2021 16:21:40 +1200 Subject: [PATCH 022/674] KVM: VMX: Add emulation of SGX Launch Control LE hash MSRs mainline inclusion from mainline-5.13 commit 8f102445d4045384799627c53d82c45ca2cad3a5 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK CVE: NA Intel-SIG: commit 8f102445d404 KVM: VMX: Add emulation of SGX Launch Control LE hash MSRs. Backport for SGX virtualization support -------------------------------- Emulate the four Launch Enclave public key hash MSRs (LE hash MSRs) that exist on CPUs that support SGX Launch Control (LC). SGX LC modifies the behavior of ENCLS[EINIT] to use the LE hash MSRs when verifying the key used to sign an enclave. On CPUs without LC support, the LE hash is hardwired into the CPU to an Intel controlled key (the Intel key is also the reset value of the LE hash MSRs). Track the guest's desired hash so that a future patch can stuff the hash into the hardware MSRs when executing EINIT on behalf of the guest, when those MSRs are writable in host. Signed-off-by: Sean Christopherson Co-developed-by: Kai Huang Signed-off-by: Kai Huang Message-Id: [Add a comment regarding the MSRs being available until SGX is locked. - Paolo] Signed-off-by: Paolo Bonzini Signed-off-by: Fan Du Signed-off-by: Zhiquan Li --- arch/x86/kvm/vmx/sgx.c | 35 +++++++++++++++++++++++++++++++++++ arch/x86/kvm/vmx/sgx.h | 6 ++++++ arch/x86/kvm/vmx/vmx.c | 31 +++++++++++++++++++++++++++++++ arch/x86/kvm/vmx/vmx.h | 3 +++ 4 files changed, 75 insertions(+) diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c index 2cf9d7cf818c..d53e6b17b320 100644 --- a/arch/x86/kvm/vmx/sgx.c +++ b/arch/x86/kvm/vmx/sgx.c @@ -11,6 +11,9 @@ bool __read_mostly enable_sgx; +/* Initial value of guest's virtual SGX_LEPUBKEYHASHn MSRs */ +static u64 sgx_pubkey_hash[4] __ro_after_init; + /* * ENCLS's memory operands use a fixed segment (DS) and a fixed * address size based on the mode. Related prefixes are ignored. @@ -323,3 +326,35 @@ int handle_encls(struct kvm_vcpu *vcpu) } return 1; } + +void setup_default_sgx_lepubkeyhash(void) +{ + /* + * Use Intel's default value for Skylake hardware if Launch Control is + * not supported, i.e. Intel's hash is hardcoded into silicon, or if + * Launch Control is supported and enabled, i.e. mimic the reset value + * and let the guest write the MSRs at will. If Launch Control is + * supported but disabled, then use the current MSR values as the hash + * MSRs exist but are read-only (locked and not writable). + */ + if (!enable_sgx || boot_cpu_has(X86_FEATURE_SGX_LC) || + rdmsrl_safe(MSR_IA32_SGXLEPUBKEYHASH0, &sgx_pubkey_hash[0])) { + sgx_pubkey_hash[0] = 0xa6053e051270b7acULL; + sgx_pubkey_hash[1] = 0x6cfbe8ba8b3b413dULL; + sgx_pubkey_hash[2] = 0xc4916d99f2b3735dULL; + sgx_pubkey_hash[3] = 0xd4f8c05909f9bb3bULL; + } else { + /* MSR_IA32_SGXLEPUBKEYHASH0 is read above */ + rdmsrl(MSR_IA32_SGXLEPUBKEYHASH1, sgx_pubkey_hash[1]); + rdmsrl(MSR_IA32_SGXLEPUBKEYHASH2, sgx_pubkey_hash[2]); + rdmsrl(MSR_IA32_SGXLEPUBKEYHASH3, sgx_pubkey_hash[3]); + } +} + +void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + memcpy(vmx->msr_ia32_sgxlepubkeyhash, sgx_pubkey_hash, + sizeof(sgx_pubkey_hash)); +} diff --git a/arch/x86/kvm/vmx/sgx.h b/arch/x86/kvm/vmx/sgx.h index 6e17ecd4aca3..6502fa52c7e9 100644 --- a/arch/x86/kvm/vmx/sgx.h +++ b/arch/x86/kvm/vmx/sgx.h @@ -8,8 +8,14 @@ extern bool __read_mostly enable_sgx; int handle_encls(struct kvm_vcpu *vcpu); + +void setup_default_sgx_lepubkeyhash(void); +void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu); #else #define enable_sgx 0 + +static inline void setup_default_sgx_lepubkeyhash(void) { } +static inline void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu) { } #endif #endif /* __KVM_X86_SGX_H */ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 578dc24b6195..df1cdca49fd2 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1964,6 +1964,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_FEAT_CTL: msr_info->data = vmx->msr_ia32_feature_control; break; + case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC)) + return 1; + msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash + [msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0]; + break; case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: if (!nested_vmx_allowed(vcpu)) return 1; @@ -2259,6 +2266,26 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (msr_info->host_initiated && data == 0) vmx_leave_nested(vcpu); break; + case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3: + /* + * On real hardware, the LE hash MSRs are writable before + * the firmware sets bit 0 in MSR 0x7a ("activating" SGX), + * at which point SGX related bits in IA32_FEATURE_CONTROL + * become writable. + * + * KVM does not emulate SGX activation for simplicity, so + * allow writes to the LE hash MSRs if IA32_FEATURE_CONTROL + * is unlocked. This is technically not architectural + * behavior, but it's close enough. + */ + if (!msr_info->host_initiated && + (!guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC) || + ((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) && + !(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED)))) + return 1; + vmx->msr_ia32_sgxlepubkeyhash + [msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data; + break; case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: if (!msr_info->host_initiated) return 1; /* they are read-only */ @@ -7145,6 +7172,8 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) else memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs)); + vcpu_setup_sgx_lepubkeyhash(vcpu); + vmx->nested.posted_intr_nv = -1; vmx->nested.current_vmptr = -1ull; @@ -8087,6 +8116,8 @@ static __init int hardware_setup(void) if (!enable_ept || !cpu_has_vmx_intel_pt()) pt_mode = PT_MODE_SYSTEM; + setup_default_sgx_lepubkeyhash(); + if (nested) { nested_vmx_setup_ctls_msrs(&vmcs_config.nested, vmx_capability.ept); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 05eca210a5ff..1848fef1f96e 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -324,6 +324,9 @@ struct vcpu_vmx { */ u64 msr_ia32_feature_control; u64 msr_ia32_feature_control_valid_bits; + /* SGX Launch Control public key hash */ + u64 msr_ia32_sgxlepubkeyhash[4]; + u64 ept_pointer; u64 msr_ia32_mcu_opt_ctrl; bool disable_fb_clear; -- Gitee From 97c47cac100800f3bcb258ad401e290dcd20a056 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 12 Apr 2021 16:21:41 +1200 Subject: [PATCH 023/674] KVM: VMX: Add ENCLS[EINIT] handler to support SGX Launch Control (LC) mainline inclusion from mainline-5.13 commit b6f084ca553845135ccade79ce6548035e52884a category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK CVE: NA Intel-SIG: commit b6f084ca5538 KVM: VMX: Add ENCLS[EINIT] handler to support SGX Launch Control (LC). Backport for SGX virtualization support -------------------------------- Add a VM-Exit handler to trap-and-execute EINIT when SGX LC is enabled in the host. When SGX LC is enabled, the host kernel may rewrite the hardware values at will, e.g. to launch enclaves with different signers, thus KVM needs to intercept EINIT to ensure it is executed with the correct LE hash (even if the guest sees a hardwired hash). Switching the LE hash MSRs on VM-Enter/VM-Exit is not a viable option as writing the MSRs is prohibitively expensive, e.g. on SKL hardware each WRMSR is ~400 cycles. And because EINIT takes tens of thousands of cycles to execute, the ~1500 cycle overhead to trap-and-execute EINIT is unlikely to be noticed by the guest, let alone impact its overall SGX performance. Signed-off-by: Sean Christopherson Signed-off-by: Kai Huang Message-Id: <57c92fa4d2083eb3be9e6355e3882fc90cffea87.1618196135.git.kai.huang@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Fan Du Signed-off-by: Zhiquan Li --- arch/x86/kvm/vmx/sgx.c | 64 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c index d53e6b17b320..2eed5da91698 100644 --- a/arch/x86/kvm/vmx/sgx.c +++ b/arch/x86/kvm/vmx/sgx.c @@ -287,6 +287,68 @@ static int handle_encls_ecreate(struct kvm_vcpu *vcpu) return r; } +static int handle_encls_einit(struct kvm_vcpu *vcpu) +{ + unsigned long sig_hva, secs_hva, token_hva, rflags; + struct vcpu_vmx *vmx = to_vmx(vcpu); + gva_t sig_gva, secs_gva, token_gva; + gpa_t sig_gpa, secs_gpa, token_gpa; + int ret, trapnr; + + if (sgx_get_encls_gva(vcpu, kvm_rbx_read(vcpu), 1808, 4096, &sig_gva) || + sgx_get_encls_gva(vcpu, kvm_rcx_read(vcpu), 4096, 4096, &secs_gva) || + sgx_get_encls_gva(vcpu, kvm_rdx_read(vcpu), 304, 512, &token_gva)) + return 1; + + /* + * Translate the SIGSTRUCT, SECS and TOKEN pointers from GVA to GPA. + * Resume the guest on failure to inject a #PF. + */ + if (sgx_gva_to_gpa(vcpu, sig_gva, false, &sig_gpa) || + sgx_gva_to_gpa(vcpu, secs_gva, true, &secs_gpa) || + sgx_gva_to_gpa(vcpu, token_gva, false, &token_gpa)) + return 1; + + /* + * ...and then to HVA. The order of accesses isn't architectural, i.e. + * KVM doesn't have to fully process one address at a time. Exit to + * userspace if a GPA is invalid. Note, all structures are aligned and + * cannot split pages. + */ + if (sgx_gpa_to_hva(vcpu, sig_gpa, &sig_hva) || + sgx_gpa_to_hva(vcpu, secs_gpa, &secs_hva) || + sgx_gpa_to_hva(vcpu, token_gpa, &token_hva)) + return 0; + + ret = sgx_virt_einit((void __user *)sig_hva, (void __user *)token_hva, + (void __user *)secs_hva, + vmx->msr_ia32_sgxlepubkeyhash, &trapnr); + + if (ret == -EFAULT) + return sgx_inject_fault(vcpu, secs_gva, trapnr); + + /* + * sgx_virt_einit() returns -EINVAL when access_ok() fails on @sig_hva, + * @token_hva or @secs_hva. This should never happen as KVM checks host + * addresses at memslot creation. sgx_virt_einit() has already warned + * in this case, so just return. + */ + if (ret < 0) + return ret; + + rflags = vmx_get_rflags(vcpu) & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | + X86_EFLAGS_AF | X86_EFLAGS_SF | + X86_EFLAGS_OF); + if (ret) + rflags |= X86_EFLAGS_ZF; + else + rflags &= ~X86_EFLAGS_ZF; + vmx_set_rflags(vcpu, rflags); + + kvm_rax_write(vcpu, ret); + return kvm_skip_emulated_instruction(vcpu); +} + static inline bool encls_leaf_enabled_in_guest(struct kvm_vcpu *vcpu, u32 leaf) { if (!enable_sgx || !guest_cpuid_has(vcpu, X86_FEATURE_SGX)) @@ -319,6 +381,8 @@ int handle_encls(struct kvm_vcpu *vcpu) } else { if (leaf == ECREATE) return handle_encls_ecreate(vcpu); + if (leaf == EINIT) + return handle_encls_einit(vcpu); WARN(1, "KVM: unexpected exit on ENCLS[%u]", leaf); vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; vcpu->run->hw.hardware_exit_reason = EXIT_REASON_ENCLS; -- Gitee From 5698b7e86eb371e324b632402364f89956e59e38 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 12 Apr 2021 16:21:42 +1200 Subject: [PATCH 024/674] KVM: VMX: Enable SGX virtualization for SGX1, SGX2 and LC mainline inclusion from mainline-5.13 commit 72add915fbd5bf5c57deee3da5b2605e966ac199 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK CVE: NA Intel-SIG: commit 72add915fbd5 KVM: VMX: Enable SGX virtualization for SGX1, SGX2 and LC. Backport for SGX virtualization support -------------------------------- Enable SGX virtualization now that KVM has the VM-Exit handlers needed to trap-and-execute ENCLS to ensure correctness and/or enforce the CPU model exposed to the guest. Add a KVM module param, "sgx", to allow an admin to disable SGX virtualization independent of the kernel. When supported in hardware and the kernel, advertise SGX1, SGX2 and SGX LC to userspace via CPUID and wire up the ENCLS_EXITING bitmap based on the guest's SGX capabilities, i.e. to allow ENCLS to be executed in an SGX-enabled guest. With the exception of the provision key, all SGX attribute bits may be exposed to the guest. Guest access to the provision key, which is controlled via securityfs, will be added in a future patch. Note, KVM does not yet support exposing ENCLS_C leafs or ENCLV leafs. Signed-off-by: Sean Christopherson Signed-off-by: Kai Huang Message-Id: Signed-off-by: Paolo Bonzini Signed-off-by: Fan Du Signed-off-by: Zhiquan Li --- arch/x86/kvm/cpuid.c | 57 +++++++++++++++++++++++++++- arch/x86/kvm/vmx/nested.c | 27 +++++++++++-- arch/x86/kvm/vmx/nested.h | 5 +++ arch/x86/kvm/vmx/sgx.c | 80 ++++++++++++++++++++++++++++++++++++++- arch/x86/kvm/vmx/sgx.h | 13 +++++++ arch/x86/kvm/vmx/vmcs12.c | 1 + arch/x86/kvm/vmx/vmcs12.h | 4 +- arch/x86/kvm/vmx/vmx.c | 26 ++++++++++++- 8 files changed, 204 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index c5fa5c62e391..6d6130ecf250 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "cpuid.h" #include "lapic.h" #include "mmu.h" @@ -170,6 +171,21 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) vcpu->arch.guest_supported_xcr0 = (best->eax | ((u64)best->edx << 32)) & supported_xcr0; + /* + * Bits 127:0 of the allowed SECS.ATTRIBUTES (CPUID.0x12.0x1) enumerate + * the supported XSAVE Feature Request Mask (XFRM), i.e. the enclave's + * requested XCR0 value. The enclave's XFRM must be a subset of XCRO + * at the time of EENTER, thus adjust the allowed XFRM by the guest's + * supported XCR0. Similar to XCR0 handling, FP and SSE are forced to + * '1' even on CPUs that don't support XSAVE. + */ + best = kvm_find_cpuid_entry(vcpu, 0x12, 0x1); + if (best) { + best->ecx &= vcpu->arch.guest_supported_xcr0 & 0xffffffff; + best->edx &= vcpu->arch.guest_supported_xcr0 >> 32; + best->ecx |= XFEATURE_MASK_FPSSE; + } + kvm_update_pv_runtime(vcpu); vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); @@ -412,7 +428,7 @@ void kvm_set_cpu_caps(void) ); kvm_cpu_cap_mask(CPUID_7_0_EBX, - F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | + F(FSGSBASE) | F(SGX) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS) | 0 /*INVPCID*/ | F(RTM) | 0 /*MPX*/ | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) | @@ -423,7 +439,8 @@ void kvm_set_cpu_caps(void) F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(RDPID) | F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) | - F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ + F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ | + F(SGX_LC) ); /* Set LA57 based on hardware capability. */ if (cpuid_ecx(7) & F(LA57)) @@ -462,6 +479,10 @@ void kvm_set_cpu_caps(void) F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES) ); + kvm_cpu_cap_init_scattered(CPUID_12_EAX, + SF(SGX1) | SF(SGX2) + ); + kvm_cpu_cap_mask(CPUID_8000_0001_ECX, F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | @@ -785,6 +806,38 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->edx = 0; } break; + case 0x12: + /* Intel SGX */ + if (!kvm_cpu_cap_has(X86_FEATURE_SGX)) { + entry->eax = entry->ebx = entry->ecx = entry->edx = 0; + break; + } + + /* + * Index 0: Sub-features, MISCSELECT (a.k.a extended features) + * and max enclave sizes. The SGX sub-features and MISCSELECT + * are restricted by kernel and KVM capabilities (like most + * feature flags), while enclave size is unrestricted. + */ + cpuid_entry_override(entry, CPUID_12_EAX); + entry->ebx &= SGX_MISC_EXINFO; + + entry = do_host_cpuid(array, function, 1); + if (!entry) + goto out; + + /* + * Index 1: SECS.ATTRIBUTES. ATTRIBUTES are restricted a la + * feature flags. Advertise all supported flags, including + * privileged attributes that require explicit opt-in from + * userspace. ATTRIBUTES.XFRM is not adjusted as userspace is + * expected to derive it from supported XCR0. + */ + entry->eax &= SGX_ATTR_DEBUG | SGX_ATTR_MODE64BIT | + /* PROVISIONKEY | */ SGX_ATTR_EINITTOKENKEY | + SGX_ATTR_KSS; + entry->ebx &= 0; + break; /* Intel PT */ case 0x14: if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) { diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 0f871faeff1a..d2a0464e95c7 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -11,6 +11,7 @@ #include "mmu.h" #include "nested.h" #include "pmu.h" +#include "sgx.h" #include "trace.h" #include "x86.h" @@ -2326,6 +2327,9 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; + if (exec_control & SECONDARY_EXEC_ENCLS_EXITING) + vmx_write_encls_bitmap(&vmx->vcpu, vmcs12); + secondary_exec_controls_set(vmx, exec_control); } @@ -5738,6 +5742,21 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, return false; } +static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + u32 encls_leaf; + + if (!guest_cpuid_has(vcpu, X86_FEATURE_SGX) || + !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING)) + return false; + + encls_leaf = kvm_rax_read(vcpu); + if (encls_leaf > 62) + encls_leaf = 63; + return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf); +} + static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, gpa_t bitmap) { @@ -5835,9 +5854,6 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, case EXIT_REASON_VMFUNC: /* VM functions are emulated through L2->L0 vmexits. */ return true; - case EXIT_REASON_ENCLS: - /* SGX is never exposed to L1 */ - return true; default: break; } @@ -5961,6 +5977,8 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, case EXIT_REASON_TPAUSE: return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE); + case EXIT_REASON_ENCLS: + return nested_vmx_exit_handled_encls(vcpu, vmcs12); default: return true; } @@ -6536,6 +6554,9 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) msrs->secondary_ctls_high |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + if (enable_sgx) + msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING; + /* miscellaneous data */ rdmsr(MSR_IA32_VMX_MISC, msrs->misc_low, diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index 197148d76b8f..184418baeb3c 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -244,6 +244,11 @@ static inline bool nested_exit_on_intr(struct kvm_vcpu *vcpu) PIN_BASED_EXT_INTR_MASK; } +static inline bool nested_cpu_has_encls_exit(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING); +} + /* * if fixed0[i] == 1: val[i] must be 1 * if fixed1[i] == 0: val[i] must be 0 diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c index 2eed5da91698..6693ebdc0770 100644 --- a/arch/x86/kvm/vmx/sgx.c +++ b/arch/x86/kvm/vmx/sgx.c @@ -5,11 +5,13 @@ #include "cpuid.h" #include "kvm_cache_regs.h" +#include "nested.h" #include "sgx.h" #include "vmx.h" #include "x86.h" -bool __read_mostly enable_sgx; +bool __read_mostly enable_sgx = 1; +module_param_named(sgx, enable_sgx, bool, 0444); /* Initial value of guest's virtual SGX_LEPUBKEYHASHn MSRs */ static u64 sgx_pubkey_hash[4] __ro_after_init; @@ -422,3 +424,79 @@ void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu) memcpy(vmx->msr_ia32_sgxlepubkeyhash, sgx_pubkey_hash, sizeof(sgx_pubkey_hash)); } + +/* + * ECREATE must be intercepted to enforce MISCSELECT, ATTRIBUTES and XFRM + * restrictions if the guest's allowed-1 settings diverge from hardware. + */ +static bool sgx_intercept_encls_ecreate(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *guest_cpuid; + u32 eax, ebx, ecx, edx; + + if (!vcpu->kvm->arch.sgx_provisioning_allowed) + return true; + + guest_cpuid = kvm_find_cpuid_entry(vcpu, 0x12, 0); + if (!guest_cpuid) + return true; + + cpuid_count(0x12, 0, &eax, &ebx, &ecx, &edx); + if (guest_cpuid->ebx != ebx || guest_cpuid->edx != edx) + return true; + + guest_cpuid = kvm_find_cpuid_entry(vcpu, 0x12, 1); + if (!guest_cpuid) + return true; + + cpuid_count(0x12, 1, &eax, &ebx, &ecx, &edx); + if (guest_cpuid->eax != eax || guest_cpuid->ebx != ebx || + guest_cpuid->ecx != ecx || guest_cpuid->edx != edx) + return true; + + return false; +} + +void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) +{ + /* + * There is no software enable bit for SGX that is virtualized by + * hardware, e.g. there's no CR4.SGXE, so when SGX is disabled in the + * guest (either by the host or by the guest's BIOS) but enabled in the + * host, trap all ENCLS leafs and inject #UD/#GP as needed to emulate + * the expected system behavior for ENCLS. + */ + u64 bitmap = -1ull; + + /* Nothing to do if hardware doesn't support SGX */ + if (!cpu_has_vmx_encls_vmexit()) + return; + + if (guest_cpuid_has(vcpu, X86_FEATURE_SGX) && + sgx_enabled_in_guest_bios(vcpu)) { + if (guest_cpuid_has(vcpu, X86_FEATURE_SGX1)) { + bitmap &= ~GENMASK_ULL(ETRACK, ECREATE); + if (sgx_intercept_encls_ecreate(vcpu)) + bitmap |= (1 << ECREATE); + } + + if (guest_cpuid_has(vcpu, X86_FEATURE_SGX2)) + bitmap &= ~GENMASK_ULL(EMODT, EAUG); + + /* + * Trap and execute EINIT if launch control is enabled in the + * host using the guest's values for launch control MSRs, even + * if the guest's values are fixed to hardware default values. + * The MSRs are not loaded/saved on VM-Enter/VM-Exit as writing + * the MSRs is extraordinarily expensive. + */ + if (boot_cpu_has(X86_FEATURE_SGX_LC)) + bitmap |= (1 << EINIT); + + if (!vmcs12 && is_guest_mode(vcpu)) + vmcs12 = get_vmcs12(vcpu); + if (vmcs12 && nested_cpu_has_encls_exit(vmcs12)) + bitmap |= vmcs12->encls_exiting_bitmap; + } + vmcs_write64(ENCLS_EXITING_BITMAP, bitmap); +} diff --git a/arch/x86/kvm/vmx/sgx.h b/arch/x86/kvm/vmx/sgx.h index 6502fa52c7e9..a400888b376d 100644 --- a/arch/x86/kvm/vmx/sgx.h +++ b/arch/x86/kvm/vmx/sgx.h @@ -4,6 +4,9 @@ #include +#include "capabilities.h" +#include "vmx_ops.h" + #ifdef CONFIG_X86_SGX_KVM extern bool __read_mostly enable_sgx; @@ -11,11 +14,21 @@ int handle_encls(struct kvm_vcpu *vcpu); void setup_default_sgx_lepubkeyhash(void); void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu); + +void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12); #else #define enable_sgx 0 static inline void setup_default_sgx_lepubkeyhash(void) { } static inline void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu) { } + +static inline void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + /* Nothing to do if hardware doesn't support SGX */ + if (cpu_has_vmx_encls_vmexit()) + vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); +} #endif #endif /* __KVM_X86_SGX_H */ diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c index c8e51c004f78..034adb6404dc 100644 --- a/arch/x86/kvm/vmx/vmcs12.c +++ b/arch/x86/kvm/vmx/vmcs12.c @@ -50,6 +50,7 @@ const unsigned short vmcs_field_to_offset_table[] = { FIELD64(VMREAD_BITMAP, vmread_bitmap), FIELD64(VMWRITE_BITMAP, vmwrite_bitmap), FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap), + FIELD64(ENCLS_EXITING_BITMAP, encls_exiting_bitmap), FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address), FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer), FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl), diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h index 80232daf00ff..13494956d0e9 100644 --- a/arch/x86/kvm/vmx/vmcs12.h +++ b/arch/x86/kvm/vmx/vmcs12.h @@ -69,7 +69,8 @@ struct __packed vmcs12 { u64 vm_function_control; u64 eptp_list_address; u64 pml_address; - u64 padding64[3]; /* room for future expansion */ + u64 encls_exiting_bitmap; + u64 padding64[2]; /* room for future expansion */ /* * To allow migration of L1 (complete with its L2 guests) between * machines of different natural widths (32 or 64 bit), we cannot have @@ -256,6 +257,7 @@ static inline void vmx_check_vmcs12_offsets(void) CHECK_OFFSET(vm_function_control, 296); CHECK_OFFSET(eptp_list_address, 304); CHECK_OFFSET(pml_address, 312); + CHECK_OFFSET(encls_exiting_bitmap, 320); CHECK_OFFSET(cr0_guest_host_mask, 344); CHECK_OFFSET(cr4_guest_host_mask, 352); CHECK_OFFSET(cr0_read_shadow, 360); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index df1cdca49fd2..6bb07e495eca 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2265,6 +2265,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vmx->msr_ia32_feature_control = data; if (msr_info->host_initiated && data == 0) vmx_leave_nested(vcpu); + + /* SGX may be enabled/disabled by guest's firmware */ + vmx_write_encls_bitmap(vcpu, NULL); break; case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3: /* @@ -4554,8 +4557,7 @@ static void init_vmcs(struct vcpu_vmx *vmx) vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); } - if (cpu_has_vmx_encls_vmexit()) - vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); + vmx_write_encls_bitmap(&vmx->vcpu, NULL); if (vmx_pt_mode_is_host_guest()) { memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc)); @@ -7500,6 +7502,19 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) set_cr4_guest_host_mask(vmx); + vmx_write_encls_bitmap(vcpu, NULL); + if (guest_cpuid_has(vcpu, X86_FEATURE_SGX)) + vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED; + else + vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED; + + if (guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC)) + vmx->msr_ia32_feature_control_valid_bits |= + FEAT_CTL_SGX_LC_ENABLED; + else + vmx->msr_ia32_feature_control_valid_bits &= + ~FEAT_CTL_SGX_LC_ENABLED; + /* Refresh #PF interception to account for MAXPHYADDR changes. */ update_exception_bitmap(vcpu); } @@ -7520,6 +7535,13 @@ static __init void vmx_set_cpu_caps(void) if (vmx_pt_mode_is_host_guest()) kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT); + if (!enable_sgx) { + kvm_cpu_cap_clear(X86_FEATURE_SGX); + kvm_cpu_cap_clear(X86_FEATURE_SGX_LC); + kvm_cpu_cap_clear(X86_FEATURE_SGX1); + kvm_cpu_cap_clear(X86_FEATURE_SGX2); + } + if (vmx_umip_emulated()) kvm_cpu_cap_set(X86_FEATURE_UMIP); -- Gitee From 2c1e5e2fae715defb064ad661e8f8812ded8f3c3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 12 Apr 2021 16:21:43 +1200 Subject: [PATCH 025/674] KVM: x86: Add capability to grant VM access to privileged SGX attribute mainline inclusion from mainline-5.13 commit fe7e948837f312d87853b3fce743795d1ae3715a category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK CVE: NA Intel-SIG: commit fe7e948837f3 KVM: x86: Add capability to grant VM access to privileged SGX attribute. Backport for SGX virtualization support -------------------------------- Add a capability, KVM_CAP_SGX_ATTRIBUTE, that can be used by userspace to grant a VM access to a priveleged attribute, with args[0] holding a file handle to a valid SGX attribute file. The SGX subsystem restricts access to a subset of enclave attributes to provide additional security for an uncompromised kernel, e.g. to prevent malware from using the PROVISIONKEY to ensure its nodes are running inside a geniune SGX enclave and/or to obtain a stable fingerprint. To prevent userspace from circumventing such restrictions by running an enclave in a VM, KVM restricts guest access to privileged attributes by default. Cc: Andy Lutomirski Signed-off-by: Sean Christopherson Signed-off-by: Kai Huang Message-Id: <0b099d65e933e068e3ea934b0523bab070cb8cea.1618196135.git.kai.huang@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Fan Du Signed-off-by: Zhiquan Li --- Documentation/virt/kvm/api.rst | 23 +++++++++++++++++++++++ arch/x86/kvm/cpuid.c | 2 +- arch/x86/kvm/x86.c | 21 +++++++++++++++++++++ include/uapi/linux/kvm.h | 1 + 4 files changed, 46 insertions(+), 1 deletion(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 56deaceab625..671bbd90f15b 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6197,6 +6197,29 @@ KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR exit notifications which user space can then handle to implement model specific MSR handling and/or user notifications to inform a user that an MSR was not handled. +7.25 KVM_CAP_SGX_ATTRIBUTE +---------------------- + +:Architectures: x86 +:Target: VM +:Parameters: args[0] is a file handle of a SGX attribute file in securityfs +:Returns: 0 on success, -EINVAL if the file handle is invalid or if a requested + attribute is not supported by KVM. + +KVM_CAP_SGX_ATTRIBUTE enables a userspace VMM to grant a VM access to one or +more priveleged enclave attributes. args[0] must hold a file handle to a valid +SGX attribute file corresponding to an attribute that is supported/restricted +by KVM (currently only PROVISIONKEY). + +The SGX subsystem restricts access to a subset of enclave attributes to provide +additional security for an uncompromised kernel, e.g. use of the PROVISIONKEY +is restricted to deter malware from using the PROVISIONKEY to obtain a stable +system fingerprint. To prevent userspace from circumventing such restrictions +by running an enclave in a VM, KVM prevents access to privileged attributes by +default. + +See Documentation/x86/sgx/2.Kernel-internals.rst for more details. + 8. Other capabilities. ====================== diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 6d6130ecf250..72e7db3a0ad1 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -834,7 +834,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) * expected to derive it from supported XCR0. */ entry->eax &= SGX_ATTR_DEBUG | SGX_ATTR_MODE64BIT | - /* PROVISIONKEY | */ SGX_ATTR_EINITTOKENKEY | + SGX_ATTR_PROVISIONKEY | SGX_ATTR_EINITTOKENKEY | SGX_ATTR_KSS; entry->ebx &= 0; break; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4595c7dbfa93..4e28a6d6dd1a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -74,6 +74,7 @@ #include #include #include +#include #include #define CREATE_TRACE_POINTS @@ -3842,6 +3843,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_X86_USER_SPACE_MSR: case KVM_CAP_X86_MSR_FILTER: case KVM_CAP_ENFORCE_PV_FEATURE_CPUID: +#ifdef CONFIG_X86_SGX_KVM + case KVM_CAP_SGX_ATTRIBUTE: +#endif r = 1; break; case KVM_CAP_SYNC_REGS: @@ -5394,6 +5398,23 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, kvm->arch.user_space_msr_mask = cap->args[0]; r = 0; break; +#ifdef CONFIG_X86_SGX_KVM + case KVM_CAP_SGX_ATTRIBUTE: { + unsigned long allowed_attributes = 0; + + r = sgx_set_attribute(&allowed_attributes, cap->args[0]); + if (r) + break; + + /* KVM only supports the PROVISIONKEY privileged attribute. */ + if ((allowed_attributes & SGX_ATTR_PROVISIONKEY) && + !(allowed_attributes & ~SGX_ATTR_PROVISIONKEY)) + kvm->arch.sgx_provisioning_allowed = true; + else + r = -EINVAL; + break; + } +#endif default: r = -EINVAL; break; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 3af8b0164f1e..4e0ebd66368f 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1061,6 +1061,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_X86_USER_SPACE_MSR 188 #define KVM_CAP_X86_MSR_FILTER 189 #define KVM_CAP_ENFORCE_PV_FEATURE_CPUID 190 +#define KVM_CAP_SGX_ATTRIBUTE 196 #define KVM_CAP_ARM_CPU_FEATURE 555 -- Gitee From 90772c1249e2e5fa19753dc3025fb88ccb150d5e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 20 Apr 2021 18:08:50 -0700 Subject: [PATCH 026/674] KVM: x86: Fix implicit enum conversion goof in scattered reverse CPUID code mainline inclusion from mainline-5.13 commit 462f8ddebccbb8a364b154008212052d515ac6b1 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK CVE: NA Intel-SIG: commit 462f8ddebccb KVM: x86: Fix implicit enum conversion goof in scattered reverse CPUID code. Backport for SGX virtualization support -------------------------------- Take "enum kvm_only_cpuid_leafs" in scattered specific CPUID helpers (which is obvious in hindsight), and use "unsigned int" for leafs that can be the kernel's standard "enum cpuid_leaf" or the aforementioned KVM-only variant. Loss of the enum params is a bit disapponting, but gcc obviously isn't providing any extra sanity checks, and the various BUILD_BUG_ON() assertions ensure the input is in range. This fixes implicit enum conversions that are detected by clang-11: arch/x86/kvm/cpuid.c:499:29: warning: implicit conversion from enumeration type 'enum kvm_only_cpuid_leafs' to different enumeration type 'enum cpuid_leafs' [-Wenum-conversion] kvm_cpu_cap_init_scattered(CPUID_12_EAX, ~~~~~~~~~~~~~~~~~~~~~~~~~~ ^~~~~~~~~~~~ arch/x86/kvm/cpuid.c:837:31: warning: implicit conversion from enumeration type 'enum kvm_only_cpuid_leafs' to different enumeration type 'enum cpuid_leafs' [-Wenum-conversion] cpuid_entry_override(entry, CPUID_12_EAX); ~~~~~~~~~~~~~~~~~~~~ ^~~~~~~~~~~~ 2 warnings generated. Fixes: 4e66c0cb79b7 ("KVM: x86: Add support for reverse CPUID lookup of scattered features") Cc: Kai Huang Signed-off-by: Sean Christopherson Message-Id: <20210421010850.3009718-1-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Zhiquan Li --- arch/x86/kvm/cpuid.c | 5 +++-- arch/x86/kvm/cpuid.h | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 72e7db3a0ad1..911c6efe60aa 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -348,7 +348,7 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, } /* Mask kvm_cpu_caps for @leaf with the raw CPUID capabilities of this CPU. */ -static __always_inline void __kvm_cpu_cap_mask(enum cpuid_leafs leaf) +static __always_inline void __kvm_cpu_cap_mask(unsigned int leaf) { const struct cpuid_reg cpuid = x86_feature_cpuid(leaf * 32); struct kvm_cpuid_entry2 entry; @@ -361,7 +361,8 @@ static __always_inline void __kvm_cpu_cap_mask(enum cpuid_leafs leaf) kvm_cpu_caps[leaf] &= *__cpuid_entry_get_reg(&entry, cpuid.reg); } -static __always_inline void kvm_cpu_cap_init_scattered(enum cpuid_leafs leaf, u32 mask) +static __always_inline +void kvm_cpu_cap_init_scattered(enum kvm_only_cpuid_leafs leaf, u32 mask) { /* Use kvm_cpu_cap_mask for non-scattered leafs. */ BUILD_BUG_ON(leaf < NCAPINTS); diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index fbef5b730805..9fbc0de7c337 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -218,7 +218,7 @@ static __always_inline void cpuid_entry_change(struct kvm_cpuid_entry2 *entry, } static __always_inline void cpuid_entry_override(struct kvm_cpuid_entry2 *entry, - enum cpuid_leafs leaf) + unsigned int leaf) { u32 *reg = cpuid_entry_get_reg(entry, leaf * 32); -- Gitee From ef2f0cf07504e7ae045d5117f20271fc12181baf Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Thu, 8 Apr 2021 12:29:24 +0300 Subject: [PATCH 027/674] x86/sgx: Do not update sgx_nr_free_pages in sgx_setup_epc_section() mainline inclusion from mainline-5.13 commit ae40aaf6bdbf0354a75b8284a0de453fcf5f4d32 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK CVE: NA Intel-SIG: commit ae40aaf6bdbf x86/sgx: Do not update sgx_nr_free_pages in sgx_setup_epc_section(). Backport for SGX virtualization support -------------------------------- The commit in Fixes: changed the SGX EPC page sanitization to end up in sgx_free_epc_page() which puts clean and sanitized pages on the free list. This was done for the reason that it is best to keep the logic to assign available-for-use EPC pages to the correct NUMA lists in a single location. sgx_nr_free_pages is also incremented by sgx_free_epc_pages() but those pages which are being added there per EPC section do not belong to the free list yet because they haven't been sanitized yet - they land on the dirty list first and the sanitization happens later when ksgxd starts massaging them. So remove that addition there and have sgx_free_epc_page() do that solely. [ bp: Sanitize commit message too. ] Fixes: 51ab30eb2ad4 ("x86/sgx: Replace section->init_laundry_list with sgx_dirty_page_list") Signed-off-by: Jarkko Sakkinen Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210408092924.7032-1-jarkko@kernel.org Signed-off-by: Fan Du Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/main.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 92cb11dffd4c..ad904747419e 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -656,7 +656,6 @@ static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size, list_add_tail(§ion->pages[i].list, &sgx_dirty_page_list); } - sgx_nr_free_pages += nr_pages; return true; } -- Gitee From b2386478f855d09c726379aecc85a95bcbd461af Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Tue, 15 Jun 2021 22:16:39 +1200 Subject: [PATCH 028/674] x86/sgx: Add missing xa_destroy() when virtual EPC is destroyed mainline inclusion from mainline-5.13 commit 4692bc775d2180a937335ccba0edce557103d44a category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK CVE: NA Intel-SIG: commit 4692bc775d21 x86/sgx: Add missing xa_destroy() when virtual EPC is destroyed. Backport for SGX virtualization support -------------------------------- xa_destroy() needs to be called to destroy a virtual EPC's page array before calling kfree() to free the virtual EPC. Currently it is not called so add the missing xa_destroy(). Fixes: 540745ddbc70 ("x86/sgx: Introduce virtual EPC for use by KVM guests") Signed-off-by: Kai Huang Signed-off-by: Borislav Petkov Acked-by: Dave Hansen Tested-by: Yang Zhong Link: https://lkml.kernel.org/r/20210615101639.291929-1-kai.huang@intel.com Signed-off-by: Fan Du Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/virt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c index 7d221eac716a..8f7dd1dc4189 100644 --- a/arch/x86/kernel/cpu/sgx/virt.c +++ b/arch/x86/kernel/cpu/sgx/virt.c @@ -212,6 +212,7 @@ static int sgx_vepc_release(struct inode *inode, struct file *file) list_splice_tail(&secs_pages, &zombie_secs_pages); mutex_unlock(&zombie_secs_pages_lock); + xa_destroy(&vepc->page_array); kfree(vepc); return 0; -- Gitee From 254084d399e4a2d6a1b1fb082568c77d2b8c7c42 Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Tue, 23 Mar 2021 13:42:21 +0000 Subject: [PATCH 029/674] i915_vma: Rename vma_lookup to i915_vma_lookup mainline inclusion from mainline-5.13 commit 547be6a479fd19fe1071d2cf76ed574fbff13481 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK CVE: NA Intel-SIG: commit 547be6a479fd i915_vma: Rename vma_lookup to i915_vma_lookup. Backport for SGX virtualization support -------------------------------- Use i915 prefix to avoid name collision with future vma_lookup() in mm. Signed-off-by: Liam R. Howlett Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20210323134208.3077275-1-Liam.Howlett@Oracle.com Signed-off-by: Zhiquan Li --- drivers/gpu/drm/i915/i915_vma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c index 50a86fd89d00..9aa4a6ce9fbf 100644 --- a/drivers/gpu/drm/i915/i915_vma.c +++ b/drivers/gpu/drm/i915/i915_vma.c @@ -230,7 +230,7 @@ vma_create(struct drm_i915_gem_object *obj, } static struct i915_vma * -vma_lookup(struct drm_i915_gem_object *obj, +i915_vma_lookup(struct drm_i915_gem_object *obj, struct i915_address_space *vm, const struct i915_ggtt_view *view) { @@ -278,7 +278,7 @@ i915_vma_instance(struct drm_i915_gem_object *obj, GEM_BUG_ON(!atomic_read(&vm->open)); spin_lock(&obj->vma.lock); - vma = vma_lookup(obj, vm, view); + vma = i915_vma_lookup(obj, vm, view); spin_unlock(&obj->vma.lock); /* vma_create() will resolve the race if another creates the vma */ -- Gitee From 24b6b038317e9d646e4d8ff17915a7454ec287a3 Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Mon, 28 Jun 2021 19:38:50 -0700 Subject: [PATCH 030/674] mm: add vma_lookup(), update find_vma_intersection() comments mainline inclusion from mainline-5.14 commit ce6d42f2e4a2d98898419743b037a95661e3ac9d category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK CVE: NA Intel-SIG: commit ce6d42f2e4a2 mm: add vma_lookup(), update find_vma_intersection() comments. Backport for SGX virtualization support -------------------------------- Patch series "mm: Add vma_lookup()", v2. Many places in the kernel use find_vma() to get a vma and then check the start address of the vma to ensure the next vma was not returned. Other places use the find_vma_intersection() call with add, addr + 1 as the range; looking for just the vma at a specific address. The third use of find_vma() is by developers who do not know that the function starts searching at the provided address upwards for the next vma. This results in a bug that is often overlooked for a long time. Adding the new vma_lookup() function will allow for cleaner code by removing the find_vma() calls which check limits, making find_vma_intersection() calls of a single address to be shorter, and potentially reduce the incorrect uses of find_vma(). This patch (of 22): Many places in the kernel use find_vma() to get a vma and then check the start address of the vma to ensure the next vma was not returned. Other places use the find_vma_intersection() call with add, addr + 1 as the range; looking for just the vma at a specific address. The third use of find_vma() is by developers who do not know that the function starts searching at the provided address upwards for the next vma. This results in a bug that is often overlooked for a long time. Adding the new vma_lookup() function will allow for cleaner code by removing the find_vma() calls which check limits, making find_vma_intersection() calls of a single address to be shorter, and potentially reduce the incorrect uses of find_vma(). Also change find_vma_intersection() comments and declaration to be of the correct length and add kernel documentation style comment. Link: https://lkml.kernel.org/r/20210521174745.2219620-1-Liam.Howlett@Oracle.com Link: https://lkml.kernel.org/r/20210521174745.2219620-2-Liam.Howlett@Oracle.com Signed-off-by: Liam R. Howlett Reviewed-by: Laurent Dufour Acked-by: David Hildenbrand Acked-by: Davidlohr Bueso Cc: David Miller Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Fan Du Signed-off-by: Zhiquan Li --- include/linux/mm.h | 36 ++++++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index a886f48b6a0e..b0ddfac0486b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2712,17 +2712,45 @@ extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long add extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, struct vm_area_struct **pprev); -/* Look up the first VMA which intersects the interval start_addr..end_addr-1, - NULL if none. Assume start_addr < end_addr. */ -static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr) +/** + * find_vma_intersection() - Look up the first VMA which intersects the interval + * @mm: The process address space. + * @start_addr: The inclusive start user address. + * @end_addr: The exclusive end user address. + * + * Returns: The first VMA within the provided range, %NULL otherwise. Assumes + * start_addr < end_addr. + */ +static inline +struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, + unsigned long start_addr, + unsigned long end_addr) { - struct vm_area_struct * vma = find_vma(mm,start_addr); + struct vm_area_struct *vma = find_vma(mm, start_addr); if (vma && end_addr <= vma->vm_start) vma = NULL; return vma; } +/** + * vma_lookup() - Find a VMA at a specific address + * @mm: The process address space. + * @addr: The user address. + * + * Return: The vm_area_struct at the given address, %NULL otherwise. + */ +static inline +struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr) +{ + struct vm_area_struct *vma = find_vma(mm, addr); + + if (vma && addr < vma->vm_start) + vma = NULL; + + return vma; +} + static inline unsigned long vm_start_gap(struct vm_area_struct *vma) { unsigned long vm_start = vma->vm_start; -- Gitee From daeb457c98273f7cb1a1eb1dd9a2eb2473b817c2 Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Mon, 28 Jun 2021 19:39:14 -0700 Subject: [PATCH 031/674] x86/sgx: use vma_lookup() in sgx_encl_find() mainline inclusion from mainline-5.14 commit 9ce2c3fc0be6e7d0bb2236a33bbb7a0f1943bd81 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK CVE: NA Intel-SIG: commit 9ce2c3fc0be6 x86/sgx: use vma_lookup() in sgx_encl_find(). Backport for SGX virtualization support -------------------------------- Use vma_lookup() to find the VMA at a specific address. As vma_lookup() will return NULL if the address is not within any VMA, the start address no longer needs to be validated. Link: https://lkml.kernel.org/r/20210521174745.2219620-10-Liam.Howlett@Oracle.com Signed-off-by: Liam R. Howlett Reviewed-by: Laurent Dufour Acked-by: David Hildenbrand Acked-by: Davidlohr Bueso Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Fan Du Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/encl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h index 6e74f85b6264..fec43ca65065 100644 --- a/arch/x86/kernel/cpu/sgx/encl.h +++ b/arch/x86/kernel/cpu/sgx/encl.h @@ -91,8 +91,8 @@ static inline int sgx_encl_find(struct mm_struct *mm, unsigned long addr, { struct vm_area_struct *result; - result = find_vma(mm, addr); - if (!result || result->vm_ops != &sgx_vm_ops || addr < result->vm_start) + result = vma_lookup(mm, addr); + if (!result || result->vm_ops != &sgx_vm_ops) return -EINVAL; *vma = result; -- Gitee From 75c51ab6a7038468a77ba92da865408e4b562391 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 21 Oct 2021 16:11:54 -0400 Subject: [PATCH 032/674] x86/sgx/virt: extract sgx_vepc_remove_page mainline inclusion from mainline-5.14 commit fd5128e622d7834bb3f7ee23c2bbea8db63cebaf category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK CVE: NA Intel-SIG: commit fd5128e622d7 x86/sgx/virt: extract sgx_vepc_remove_page. Backport for SGX virtualization support -------------------------------- For bare-metal SGX on real hardware, the hardware provides guarantees SGX state at reboot. For instance, all pages start out uninitialized. The vepc driver provides a similar guarantee today for freshly-opened vepc instances, but guests such as Windows expect all pages to be in uninitialized state on startup, including after every guest reboot. One way to do this is to simply close and reopen the /dev/sgx_vepc file descriptor and re-mmap the virtual EPC. However, this is problematic because it prevents sandboxing the userspace (for example forbidding open() after the guest starts; this is doable with heavy use of SCM_RIGHTS file descriptor passing). In order to implement this, we will need a ioctl that performs EREMOVE on all pages mapped by a /dev/sgx_vepc file descriptor: other possibilities, such as closing and reopening the device, are racy. Start the implementation by creating a separate function with just the __eremove wrapper. Reviewed-by: Jarkko Sakkinen Reviewed-by: Dave Hansen Signed-off-by: Paolo Bonzini Signed-off-by: Dave Hansen Link: https://lkml.kernel.org/r/20211021201155.1523989-2-pbonzini@redhat.com Signed-off-by: Fan Du Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/virt.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c index 8f7dd1dc4189..09674c824ae9 100644 --- a/arch/x86/kernel/cpu/sgx/virt.c +++ b/arch/x86/kernel/cpu/sgx/virt.c @@ -111,10 +111,8 @@ static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma) return 0; } -static int sgx_vepc_free_page(struct sgx_epc_page *epc_page) +static int sgx_vepc_remove_page(struct sgx_epc_page *epc_page) { - int ret; - /* * Take a previously guest-owned EPC page and return it to the * general EPC page pool. @@ -124,7 +122,12 @@ static int sgx_vepc_free_page(struct sgx_epc_page *epc_page) * case that a guest properly EREMOVE'd this page, a superfluous * EREMOVE is harmless. */ - ret = __eremove(sgx_get_epc_virt_addr(epc_page)); + return __eremove(sgx_get_epc_virt_addr(epc_page)); +} + +static int sgx_vepc_free_page(struct sgx_epc_page *epc_page) +{ + int ret = sgx_vepc_remove_page(epc_page); if (ret) { /* * Only SGX_CHILD_PRESENT is expected, which is because of @@ -144,7 +147,6 @@ static int sgx_vepc_free_page(struct sgx_epc_page *epc_page) } sgx_free_epc_page(epc_page); - return 0; } -- Gitee From 1c4d239038fb183fe2cba97798a0fed4d4601519 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 21 Oct 2021 16:11:55 -0400 Subject: [PATCH 033/674] x86/sgx/virt: implement SGX_IOC_VEPC_REMOVE ioctl mainline inclusion from mainline-5.16 commit ae095b16fc652f459e6c16a256834985c85ecc4d category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK CVE: NA Intel-SIG: commit ae095b16fc65 x86/sgx/virt: implement SGX_IOC_VEPC_REMOVE ioctl. Backport for SGX virtualization support -------------------------------- For bare-metal SGX on real hardware, the hardware provides guarantees SGX state at reboot. For instance, all pages start out uninitialized. The vepc driver provides a similar guarantee today for freshly-opened vepc instances, but guests such as Windows expect all pages to be in uninitialized state on startup, including after every guest reboot. Some userspace implementations of virtual SGX would rather avoid having to close and reopen the /dev/sgx_vepc file descriptor and re-mmap the virtual EPC. For example, they could sandbox themselves after the guest starts and forbid further calls to open(), in order to mitigate exploits from untrusted guests. Therefore, add a ioctl that does this with EREMOVE. Userspace can invoke the ioctl to bring its vEPC pages back to uninitialized state. There is a possibility that some pages fail to be removed if they are SECS pages, and the child and SECS pages could be in separate vEPC regions. Therefore, the ioctl returns the number of EREMOVE failures, telling userspace to try the ioctl again after it's done with all vEPC regions. A more verbose description of the correct usage and the possible error conditions is documented in sgx.rst. Reviewed-by: Jarkko Sakkinen Reviewed-by: Dave Hansen Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Dave Hansen Link: https://lkml.kernel.org/r/20211021201155.1523989-3-pbonzini@redhat.com Signed-off-by: Fan Du Signed-off-by: Zhiquan Li --- Documentation/x86/sgx.rst | 35 ++++++++++++++++++++++ arch/x86/include/uapi/asm/sgx.h | 2 ++ arch/x86/kernel/cpu/sgx/virt.c | 53 +++++++++++++++++++++++++++++++++ 3 files changed, 90 insertions(+) diff --git a/Documentation/x86/sgx.rst b/Documentation/x86/sgx.rst index dd0ac96ff9ef..a608f667fb95 100644 --- a/Documentation/x86/sgx.rst +++ b/Documentation/x86/sgx.rst @@ -250,3 +250,38 @@ user wants to deploy SGX applications both on the host and in guests on the same machine, the user should reserve enough EPC (by taking out total virtual EPC size of all SGX VMs from the physical EPC size) for host SGX applications so they can run with acceptable performance. + +Architectural behavior is to restore all EPC pages to an uninitialized +state also after a guest reboot. Because this state can be reached only +through the privileged ``ENCLS[EREMOVE]`` instruction, ``/dev/sgx_vepc`` +provides the ``SGX_IOC_VEPC_REMOVE_ALL`` ioctl to execute the instruction +on all pages in the virtual EPC. + +``EREMOVE`` can fail for three reasons. Userspace must pay attention +to expected failures and handle them as follows: + +1. Page removal will always fail when any thread is running in the + enclave to which the page belongs. In this case the ioctl will + return ``EBUSY`` independent of whether it has successfully removed + some pages; userspace can avoid these failures by preventing execution + of any vcpu which maps the virtual EPC. + +2. Page removal will cause a general protection fault if two calls to + ``EREMOVE`` happen concurrently for pages that refer to the same + "SECS" metadata pages. This can happen if there are concurrent + invocations to ``SGX_IOC_VEPC_REMOVE_ALL``, or if a ``/dev/sgx_vepc`` + file descriptor in the guest is closed at the same time as + ``SGX_IOC_VEPC_REMOVE_ALL``; it will also be reported as ``EBUSY``. + This can be avoided in userspace by serializing calls to the ioctl() + and to close(), but in general it should not be a problem. + +3. Finally, page removal will fail for SECS metadata pages which still + have child pages. Child pages can be removed by executing + ``SGX_IOC_VEPC_REMOVE_ALL`` on all ``/dev/sgx_vepc`` file descriptors + mapped into the guest. This means that the ioctl() must be called + twice: an initial set of calls to remove child pages and a subsequent + set of calls to remove SECS pages. The second set of calls is only + required for those mappings that returned a nonzero value from the + first call. It indicates a bug in the kernel or the userspace client + if any of the second round of ``SGX_IOC_VEPC_REMOVE_ALL`` calls has + a return code other than 0. diff --git a/arch/x86/include/uapi/asm/sgx.h b/arch/x86/include/uapi/asm/sgx.h index 791e45334a4a..c815a6fec9aa 100644 --- a/arch/x86/include/uapi/asm/sgx.h +++ b/arch/x86/include/uapi/asm/sgx.h @@ -27,6 +27,8 @@ enum sgx_page_flags { _IOW(SGX_MAGIC, 0x02, struct sgx_enclave_init) #define SGX_IOC_ENCLAVE_PROVISION \ _IOW(SGX_MAGIC, 0x03, struct sgx_enclave_provision) +#define SGX_IOC_VEPC_REMOVE_ALL \ + _IO(SGX_MAGIC, 0x04) /** * struct sgx_enclave_create - parameter structure for the diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c index 09674c824ae9..b4f9a50de776 100644 --- a/arch/x86/kernel/cpu/sgx/virt.c +++ b/arch/x86/kernel/cpu/sgx/virt.c @@ -150,6 +150,41 @@ static int sgx_vepc_free_page(struct sgx_epc_page *epc_page) return 0; } +static long sgx_vepc_remove_all(struct sgx_vepc *vepc) +{ + struct sgx_epc_page *entry; + unsigned long index; + long failures = 0; + + xa_for_each(&vepc->page_array, index, entry) { + int ret = sgx_vepc_remove_page(entry); + if (ret) { + if (ret == SGX_CHILD_PRESENT) { + /* The page is a SECS, userspace will retry. */ + failures++; + } else { + /* + * Report errors due to #GP or SGX_ENCLAVE_ACT; do not + * WARN, as userspace can induce said failures by + * calling the ioctl concurrently on multiple vEPCs or + * while one or more CPUs is running the enclave. Only + * a #PF on EREMOVE indicates a kernel/hardware issue. + */ + WARN_ON_ONCE(encls_faulted(ret) && + ENCLS_TRAPNR(ret) != X86_TRAP_GP); + return -EBUSY; + } + } + cond_resched(); + } + + /* + * Return the number of SECS pages that failed to be removed, so + * userspace knows that it has to retry. + */ + return failures; +} + static int sgx_vepc_release(struct inode *inode, struct file *file) { struct sgx_vepc *vepc = file->private_data; @@ -235,9 +270,27 @@ static int sgx_vepc_open(struct inode *inode, struct file *file) return 0; } +static long sgx_vepc_ioctl(struct file *file, + unsigned int cmd, unsigned long arg) +{ + struct sgx_vepc *vepc = file->private_data; + + switch (cmd) { + case SGX_IOC_VEPC_REMOVE_ALL: + if (arg) + return -EINVAL; + return sgx_vepc_remove_all(vepc); + + default: + return -ENOTTY; + } +} + static const struct file_operations sgx_vepc_fops = { .owner = THIS_MODULE, .open = sgx_vepc_open, + .unlocked_ioctl = sgx_vepc_ioctl, + .compat_ioctl = sgx_vepc_ioctl, .release = sgx_vepc_release, .mmap = sgx_vepc_mmap, }; -- Gitee From e08294e44e67f41007227e510df79a2a7967b3b8 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Mon, 15 Nov 2021 11:29:04 -0800 Subject: [PATCH 034/674] x86/sgx: Fix free page accounting mainline inclusion from mainline-5.16 commit ac5d272a0ad0419f52e08c91953356e32b075af7 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK CVE: NA Intel-SIG: commit ac5d272a0ad0 x86/sgx: Fix free page accounting Backport for SGX virtualization support -------------------------------- The SGX driver maintains a single global free page counter, sgx_nr_free_pages, that reflects the number of free pages available across all NUMA nodes. Correspondingly, a list of free pages is associated with each NUMA node and sgx_nr_free_pages is updated every time a page is added or removed from any of the free page lists. The main usage of sgx_nr_free_pages is by the reclaimer that runs when it (sgx_nr_free_pages) goes below a watermark to ensure that there are always some free pages available to, for example, support efficient page faults. With sgx_nr_free_pages accessed and modified from a few places it is essential to ensure that these accesses are done safely but this is not the case. sgx_nr_free_pages is read without any protection and updated with inconsistent protection by any one of the spin locks associated with the individual NUMA nodes. For example: CPU_A CPU_B ----- ----- spin_lock(&nodeA->lock); spin_lock(&nodeB->lock); ... ... sgx_nr_free_pages--; /* NOT SAFE */ sgx_nr_free_pages--; spin_unlock(&nodeA->lock); spin_unlock(&nodeB->lock); Since sgx_nr_free_pages may be protected by different spin locks while being modified from different CPUs, the following scenario is possible: CPU_A CPU_B ----- ----- {sgx_nr_free_pages = 100} spin_lock(&nodeA->lock); spin_lock(&nodeB->lock); sgx_nr_free_pages--; sgx_nr_free_pages--; /* LOAD sgx_nr_free_pages = 100 */ /* LOAD sgx_nr_free_pages = 100 */ /* sgx_nr_free_pages-- */ /* sgx_nr_free_pages-- */ /* STORE sgx_nr_free_pages = 99 */ /* STORE sgx_nr_free_pages = 99 */ spin_unlock(&nodeA->lock); spin_unlock(&nodeB->lock); In the above scenario, sgx_nr_free_pages is decremented from two CPUs but instead of sgx_nr_free_pages ending with a value that is two less than it started with, it was only decremented by one while the number of free pages were actually reduced by two. The consequence of sgx_nr_free_pages not being protected is that its value may not accurately reflect the actual number of free pages on the system, impacting the availability of free pages in support of many flows. The problematic scenario is when the reclaimer does not run because it believes there to be sufficient free pages while any attempt to allocate a page fails because there are no free pages available. In the SGX driver the reclaimer's watermark is only 32 pages so after encountering the above example scenario 32 times a user space hang is possible when there are no more free pages because of repeated page faults caused by no free pages made available. The following flow was encountered: asm_exc_page_fault ... sgx_vma_fault() sgx_encl_load_page() sgx_encl_eldu() // Encrypted page needs to be loaded from backing // storage into newly allocated SGX memory page sgx_alloc_epc_page() // Allocate a page of SGX memory __sgx_alloc_epc_page() // Fails, no free SGX memory ... if (sgx_should_reclaim(SGX_NR_LOW_PAGES)) // Wake reclaimer wake_up(&ksgxd_waitq); return -EBUSY; // Return -EBUSY giving reclaimer time to run return -EBUSY; return -EBUSY; return VM_FAULT_NOPAGE; The reclaimer is triggered in above flow with the following code: static bool sgx_should_reclaim(unsigned long watermark) { return sgx_nr_free_pages < watermark && !list_empty(&sgx_active_page_list); } In the problematic scenario there were no free pages available yet the value of sgx_nr_free_pages was above the watermark. The allocation of SGX memory thus always failed because of a lack of free pages while no free pages were made available because the reclaimer is never started because of sgx_nr_free_pages' incorrect value. The consequence was that user space kept encountering VM_FAULT_NOPAGE that caused the same address to be accessed repeatedly with the same result. Change the global free page counter to an atomic type that ensures simultaneous updates are done safely. While doing so, move the updating of the variable outside of the spin lock critical section to which it does not belong. Cc: stable@vger.kernel.org Fixes: 901ddbb9ecf5 ("x86/sgx: Add a basic NUMA allocation scheme to sgx_alloc_epc_page()") Suggested-by: Dave Hansen Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Reviewed-by: Tony Luck Acked-by: Jarkko Sakkinen Link: https://lkml.kernel.org/r/a95a40743bbd3f795b465f30922dde7f1ea9e0eb.1637004094.git.reinette.chatre@intel.com Signed-off-by: Fan Du Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/main.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index ad904747419e..379eeefa744a 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -28,8 +28,7 @@ static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq); static LIST_HEAD(sgx_active_page_list); static DEFINE_SPINLOCK(sgx_reclaimer_lock); -/* The free page list lock protected variables prepend the lock. */ -static unsigned long sgx_nr_free_pages; +static atomic_long_t sgx_nr_free_pages = ATOMIC_LONG_INIT(0); /* Nodes with one or more EPC sections. */ static nodemask_t sgx_numa_mask; @@ -403,14 +402,15 @@ static void sgx_reclaim_pages(void) spin_lock(&node->lock); list_add_tail(&epc_page->list, &node->free_page_list); - sgx_nr_free_pages++; spin_unlock(&node->lock); + atomic_long_inc(&sgx_nr_free_pages); } } static bool sgx_should_reclaim(unsigned long watermark) { - return sgx_nr_free_pages < watermark && !list_empty(&sgx_active_page_list); + return atomic_long_read(&sgx_nr_free_pages) < watermark && + !list_empty(&sgx_active_page_list); } static int ksgxd(void *p) @@ -471,9 +471,9 @@ static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid) page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list); list_del_init(&page->list); - sgx_nr_free_pages--; spin_unlock(&node->lock); + atomic_long_dec(&sgx_nr_free_pages); return page; } @@ -625,9 +625,9 @@ void sgx_free_epc_page(struct sgx_epc_page *page) spin_lock(&node->lock); list_add_tail(&page->list, &node->free_page_list); - sgx_nr_free_pages++; spin_unlock(&node->lock); + atomic_long_inc(&sgx_nr_free_pages); } static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size, -- Gitee From 7e7b20e483355faeda1cef2a88e8809d8657f891 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 8 Feb 2022 10:48:07 -0800 Subject: [PATCH 035/674] x86/sgx: Silence softlockup detection when releasing large enclaves mainline inclusion from mainline-5.17 commit 8795359e35bc33bf86b6d0765aa7f37431db3b9c category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK CVE: NA Intel-SIG: commit 8795359e35bc x86/sgx: Silence softlockup detection when releasing large enclaves. Backport for SGX virtualization support -------------------------------- Vijay reported that the "unclobbered_vdso_oversubscribed" selftest triggers the softlockup detector. Actual SGX systems have 128GB of enclave memory or more. The "unclobbered_vdso_oversubscribed" selftest creates one enclave which consumes all of the enclave memory on the system. Tearing down such a large enclave takes around a minute, most of it in the loop where the EREMOVE instruction is applied to each individual 4k enclave page. Spending one minute in a loop triggers the softlockup detector. Add a cond_resched() to give other tasks a chance to run and placate the softlockup detector. Cc: stable@vger.kernel.org Fixes: 1728ab54b4be ("x86/sgx: Add a page reclaimer") Reported-by: Vijay Dhanraj Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Acked-by: Dave Hansen Tested-by: Jarkko Sakkinen (kselftest as sanity check) Link: https://lkml.kernel.org/r/ced01cac1e75f900251b0a4ae1150aa8ebd295ec.1644345232.git.reinette.chatre@intel.com Signed-off-by: Fan Du Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/encl.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 0c9f6cdd34a4..9a1a93ed2562 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -544,6 +544,8 @@ void sgx_encl_release(struct kref *ref) } kfree(entry); + /* Invoke scheduler to prevent soft lockups. */ + cond_resched(); } xa_destroy(&encl->page_array); -- Gitee From b2d2fe399208c58fcf31fd46a59cd53e9cd975ce Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Sun, 27 Dec 2020 12:11:16 +0200 Subject: [PATCH 036/674] intel_idle: add SnowRidge C-state table mainline inclusion from mainline-v5.11-rc2 commit 9cf93f056f783f986c19f40d5304d1bcffa0fc0d category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5BECY CVE: NA Intel-SIG: commit 9cf93f056f78 intel_idle: add SnowRidge C-state table. Backport intel_idle driver -------------------------------- Add C-state table for the SnowRidge SoC which is found on Intel Jacobsville platforms. The following has been changed. 1. C1E latency changed from 10us to 15us. It was measured using the open source "wult" tool (the "nic" method, 15us is the 99.99th percentile). 2. C1E power break even changed from 20us to 25us, which may result in less C1E residency in some workloads. 3. C6 latency changed from 50us to 130us. Measured the same way as C1E. The C6 C-state is supported only by some SnowRidge revisions, so add a C-state table commentary about this. On SnowRidge, C6 support is enumerated via the usual mechanism: "mwait" leaf of the "cpuid" instruction. The 'intel_idle' driver does check this leaf, so even though C6 is present in the table, the driver will only use it if the CPU does support it. Signed-off-by: Artem Bityutskiy Signed-off-by: Rafael J. Wysocki Signed-off-by: yingbao jia Signed-off-by: Jun Tian --- drivers/idle/intel_idle.c | 41 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index d79335506ecd..28f93b9aa51b 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -963,6 +963,39 @@ static struct cpuidle_state dnv_cstates[] __initdata = { .enter = NULL } }; +/* + * Note, depending on HW and FW revision, SnowRidge SoC may or may not support + * C6, and this is indicated in the CPUID mwait leaf. + */ +static struct cpuidle_state snr_cstates[] __initdata = { + { + .name = "C1", + .desc = "MWAIT 0x00", + .flags = MWAIT2flg(0x00), + .exit_latency = 2, + .target_residency = 2, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C1E", + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .exit_latency = 15, + .target_residency = 25, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C6", + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 130, + .target_residency = 500, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .enter = NULL } +}; + static const struct idle_cpu idle_cpu_nehalem __initconst = { .state_table = nehalem_cstates, .auto_demotion_disable_flags = NHM_C1_AUTO_DEMOTE | NHM_C3_AUTO_DEMOTE, @@ -1084,6 +1117,12 @@ static const struct idle_cpu idle_cpu_dnv __initconst = { .use_acpi = true, }; +static const struct idle_cpu idle_cpu_snr __initconst = { + .state_table = snr_cstates, + .disable_promotion_to_c1e = true, + .use_acpi = true, +}; + static const struct x86_cpu_id intel_idle_ids[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(NEHALEM_EP, &idle_cpu_nhx), X86_MATCH_INTEL_FAM6_MODEL(NEHALEM, &idle_cpu_nehalem), @@ -1122,7 +1161,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT, &idle_cpu_bxt), X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_PLUS, &idle_cpu_bxt), X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_D, &idle_cpu_dnv), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &idle_cpu_dnv), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &idle_cpu_snr), {} }; -- Gitee From 5f8214310181dabd9120b6a1c57d40bab63c8d38 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Fri, 15 Jan 2021 15:56:46 -0800 Subject: [PATCH 037/674] intel_idle: remove definition of DEBUG mainline inclusion from mainline-v5.12-rc2 commit 651bc5816c39e57833fea4478c8ecfb72ad47e44 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5BECY CVE: NA Intel-SIG: commit 651bc5816c39 intel_idle: remove definition of DEBUG. Backport intel_idle driver -------------------------------- Defining DEBUG should only be done in development. So remove DEBUG. Signed-off-by: Tom Rix Signed-off-by: Rafael J. Wysocki Signed-off-by: yingbao jia Signed-off-by: Jun Tian --- drivers/idle/intel_idle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 28f93b9aa51b..3273360f30f7 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -37,7 +37,7 @@ */ /* un-comment DEBUG to enable pr_debug() statements */ -#define DEBUG +/* #define DEBUG */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -- Gitee From 0fdfc2a57e517ae371d8520f31549e41aa7d9e96 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Mon, 8 Mar 2021 16:31:46 +0200 Subject: [PATCH 038/674] intel_idle: update ICX C6 data mainline inclusion from mainline-v5.13-rc1 commit d484b8bfc6fa71a088e4ac85d9ce11aa0385867e category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5BECY CVE: NA Intel-SIG: commit d484b8bfc6fa intel_idle: update ICX C6 data. Backport intel_idle driver -------------------------------- Change IceLake Xeon C6 latency from 128 us to 170 us. The latency was measured with the "wult" tool and corresponds to the 99.99th percentile when measuring with the "nic" method. Note, the 128 us figure correspond to the median latency, but in intel_idle we use the "worst case" latency figure instead. C6 target residency was increased from 384 us to 600 us, which may result in less C6 residency in some workloads. This value was tested and compared to values 384, and 1000. Value 600 is a reasonable tradeoff between power and performance. Signed-off-by: Artem Bityutskiy Acked-by: Zhang Rui Signed-off-by: Rafael J. Wysocki Signed-off-by: yingbao jia Signed-off-by: Jun Tian --- drivers/idle/intel_idle.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 3273360f30f7..6cac0b748efa 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -744,8 +744,8 @@ static struct cpuidle_state icx_cstates[] __initdata = { .name = "C6", .desc = "MWAIT 0x20", .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 128, - .target_residency = 384, + .exit_latency = 170, + .target_residency = 600, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { -- Gitee From 442af217b8e8c37793a7a7928bf88c4f93fa1aed Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Wed, 7 Apr 2021 09:10:28 +0300 Subject: [PATCH 039/674] intel_idle: add Iclelake-D support mainline inclusion from mainline-v5.13-rc1 commit 22141d5f411895bb1b0df2a6b05f702e11e63918 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5BECY CVE: NA Intel-SIG: commit 22141d5f4118 intel_idle: add Iclelake-D support. Backport intel_idle driver -------------------------------- This patch adds Icelake Xeon D support to the intel_idle driver. Since Icelake D and Icelake SP C-state characteristics the same, we use Icelake SP C-states table for Icelake D as well. Signed-off-by: Artem Bityutskiy Acked-by: Chen Yu Signed-off-by: Rafael J. Wysocki Signed-off-by: yingbao jia Signed-off-by: Jun Tian --- drivers/idle/intel_idle.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 6cac0b748efa..ec1b9d306ba6 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1156,6 +1156,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE, &idle_cpu_skl), X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &idle_cpu_skx), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &idle_cpu_icx), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &idle_cpu_icx), X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &idle_cpu_knl), X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &idle_cpu_knl), X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT, &idle_cpu_bxt), -- Gitee From d9030dcd45e39d16269fc298871802e0bd3222b5 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Fri, 28 May 2021 11:20:54 +0800 Subject: [PATCH 040/674] intel_idle: Adjust the SKX C6 parameters if PC6 is disabled mainline inclusion from mainline-v5.14-rc1 commit 64233338499126c5c31e07165735ab5441c7e45a category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5BECY CVE: NA Intel-SIG: commit 642333384991 intel_idle: Adjust the SKX C6 parameters if PC6 is disabled. Backport intel_idle driver -------------------------------- Because cpuidle assumes worst-case C-state parameters, PC6 parameters are used for describing C6, which is worst-case for requesting CC6. When PC6 is enabled, this is appropriate. But if PC6 is disabled in the BIOS, the exit latency and target residency should be adjusted accordingly. Exit latency: Previously the C6 exit latency was measured as the PC6 exit latency. With PC6 disabled, the C6 exit latency should be the one of CC6. Target residency: With PC6 disabled, the idle duration within [CC6, PC6) would make the idle governor choose C1E over C6. This would cause low energy-efficiency. We should lower the bar to request C6 when PC6 is disabled. To fill this gap, check if PC6 is disabled in the BIOS in the MSR_PKG_CST_CONFIG_CONTROL(0xe2) register. If so, use the CC6 exit latency for C6 and set target_residency to 3 times of the new exit latency. [This is consistent with how intel_idle driver uses _CST to calculate the target_residency.] As a result, the OS would be more likely to choose C6 over C1E when PC6 is disabled, which is reasonable, because if C6 is enabled, it implies that the user cares about energy, so choosing C6 more frequently makes sense. The new CC6 exit latency of 92us was measured with wult[1] on SKX via NIC wakeup as the 99.99th percentile. Also CLX and CPX both have the same CPU model number as SkX, but their CC6 exit latencies are similar to the SKX one, 96us and 89us respectively, so reuse the SKX value for them. There is a concern that it might be better to use a more generic approach instead of optimizing every platform. However, if the required code complexity and different PC6 bit interpretation on different platforms are taken into account, tuning the code per platform seems to be an acceptable tradeoff. Link: https://intel.github.io/wult/ # [1] Suggested-by: Len Brown Signed-off-by: Chen Yu Reviewed-by: Artem Bityutskiy [ rjw: Subject and changelog edits ] Signed-off-by: Rafael J. Wysocki Signed-off-by: yingbao jia Signed-off-by: Jun Tian --- drivers/idle/intel_idle.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index ec1b9d306ba6..e6c543b5ee1d 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1484,6 +1484,36 @@ static void __init sklh_idle_state_table_update(void) skl_cstates[6].flags |= CPUIDLE_FLAG_UNUSABLE; /* C9-SKL */ } +/** + * skx_idle_state_table_update - Adjust the Sky Lake/Cascade Lake + * idle states table. + */ +static void __init skx_idle_state_table_update(void) +{ + unsigned long long msr; + + rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr); + + /* + * 000b: C0/C1 (no package C-state support) + * 001b: C2 + * 010b: C6 (non-retention) + * 011b: C6 (retention) + * 111b: No Package C state limits. + */ + if ((msr & 0x7) < 2) { + /* + * Uses the CC6 + PC0 latency and 3 times of + * latency for target_residency if the PC6 + * is disabled in BIOS. This is consistent + * with how intel_idle driver uses _CST + * to set the target_residency. + */ + skx_cstates[2].exit_latency = 92; + skx_cstates[2].target_residency = 276; + } +} + static bool __init intel_idle_verify_cstate(unsigned int mwait_hint) { unsigned int mwait_cstate = MWAIT_HINT2CSTATE(mwait_hint) + 1; @@ -1515,6 +1545,9 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv) case INTEL_FAM6_SKYLAKE: sklh_idle_state_table_update(); break; + case INTEL_FAM6_SKYLAKE_X: + skx_idle_state_table_update(); + break; } for (cstate = 0; cstate < CPUIDLE_STATE_MAX; ++cstate) { -- Gitee From 9b77001c0ae72da913c2c624575d73dcdd692bdb Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Fri, 17 Sep 2021 10:20:22 +0300 Subject: [PATCH 041/674] intel_idle: enable interrupts before C1 on Xeons mainline inclusion from mainline-v5.16-rc1 commit c227233ad64c77e57db738ab0e46439db71822a3 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5BECY CVE: NA Intel-SIG: commit c227233ad64c intel_idle: enable interrupts before C1 on Xeons. Backport intel_idle driver -------------------------------- Enable local interrupts before requesting C1 on the last two generations of Intel Xeon platforms: Sky Lake, Cascade Lake, Cooper Lake, Ice Lake. This decreases average C1 interrupt latency by about 5-10%, as measured with the 'wult' tool. The '->enter()' function of the driver enters C-states with local interrupts disabled by executing the 'monitor' and 'mwait' pair of instructions. If an interrupt happens, the CPU exits the C-state and continues executing instructions after 'mwait'. It does not jump to the interrupt handler, because local interrupts are disabled. The cpuidle subsystem enables interrupts a bit later, after doing some housekeeping. With this patch, we enable local interrupts before requesting C1. In this case, if the CPU wakes up because of an interrupt, it will jump to the interrupt handler right away. The cpuidle housekeeping will be done after the pending interrupt(s) are handled. Enabling interrupts before entering a C-state has measurable impact for faster C-states, like C1. Deeper, but slower C-states like C6 do not really benefit from this sort of change, because their latency is a lot higher comparing to the delay added by cpuidle housekeeping. This change was also tested with cyclictest and dbench. In case of Ice Lake, the average cyclictest latency decreased by 5.1%, and the average 'dbench' throughput increased by about 0.8%. Both tests were run for 4 hours with only C1 enabled (all other idle states, including 'POLL', were disabled). CPU frequency was pinned to HFM, and uncore frequency was pinned to the maximum value. The other platforms had similar single-digit percentage improvements. It is worth noting that this patch affects 'cpuidle' statistics a tiny bit. Before this patch, C1 residency did not include the interrupt handling time, but with this patch, it will include it. This is similar to what happens in case of the 'POLL' state, which also runs with interrupts enabled. Suggested-by: Len Brown Signed-off-by: Artem Bityutskiy Signed-off-by: Rafael J. Wysocki Signed-off-by: yingbao jia Signed-off-by: Jun Tian --- drivers/idle/intel_idle.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index e6c543b5ee1d..0b66e25c0e2d 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -88,6 +88,12 @@ static struct cpuidle_state *cpuidle_state_table __initdata; static unsigned int mwait_substates __initdata; +/* + * Enable interrupts before entering the C-state. On some platforms and for + * some C-states, this may measurably decrease interrupt latency. + */ +#define CPUIDLE_FLAG_IRQ_ENABLE BIT(14) + /* * Enable this state by default even if the ACPI _CST does not list it. */ @@ -127,6 +133,9 @@ static __cpuidle int intel_idle(struct cpuidle_device *dev, unsigned long eax = flg2MWAIT(state->flags); unsigned long ecx = 1; /* break on interrupt flag */ + if (state->flags & CPUIDLE_FLAG_IRQ_ENABLE) + local_irq_enable(); + mwait_idle_with_hints(eax, ecx); return index; @@ -698,7 +707,7 @@ static struct cpuidle_state skx_cstates[] __initdata = { { .name = "C1", .desc = "MWAIT 0x00", - .flags = MWAIT2flg(0x00), + .flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_IRQ_ENABLE, .exit_latency = 2, .target_residency = 2, .enter = &intel_idle, @@ -727,7 +736,7 @@ static struct cpuidle_state icx_cstates[] __initdata = { { .name = "C1", .desc = "MWAIT 0x00", - .flags = MWAIT2flg(0x00), + .flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_IRQ_ENABLE, .exit_latency = 1, .target_residency = 1, .enter = &intel_idle, -- Gitee From 2a0225ab4315ef488b15607607a55cbf308a1e0a Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Wed, 2 Mar 2022 10:15:58 +0200 Subject: [PATCH 042/674] intel_idle: add SPR support mainline inclusion from mainline-v5.18-rc1 commit 9edf3c0ffef0ec1bed8300315852b5c6a0997130 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5BECY CVE: NA Intel-SIG: commit 9edf3c0ffef0 intel_idle: add SPR support. Backport intel_idle driver -------------------------------- Add Sapphire Rapids Xeon support. Up until very recently, the C1 and C1E C-states were independent, but this has changed in some new chips, including Sapphire Rapids Xeon (SPR). In these chips the C1 and C1E states cannot be enabled at the same time. The "C1E promotion" bit in 'MSR_IA32_POWER_CTL' also has its semantics changed a bit. Here are the C1, C1E, and "C1E promotion" bit rules on Xeons before SPR. 1. If C1E promotion bit is disabled. a. C1 requests end up with C1 C-state. b. C1E requests end up with C1E C-state. 2. If C1E promotion bit is enabled. a. C1 requests end up with C1E C-state. b. C1E requests end up with C1E C-state. Here are the C1, C1E, and "C1E promotion" bit rules on Sapphire Rapids Xeon. 1. If C1E promotion bit is disabled. a. C1 requests end up with C1 C-state. b. C1E requests end up with C1 C-state. 2. If C1E promotion bit is enabled. a. C1 requests end up with C1E C-state. b. C1E requests end up with C1E C-state. Before SPR Xeon, the 'intel_idle' driver was disabling C1E promotion and was exposing C1 and C1E as independent C-states. But on SPR, C1 and C1E cannot be enabled at the same time. This patch adds both C1 and C1E states. However, C1E is marked as with the "CPUIDLE_FLAG_UNUSABLE" flag, which means that in won't be registered by default. The C1E promotion bit will be cleared, which means that by default only C1 and C6 will be registered on SPR. The next patch will add an option for enabling C1E and disabling C1 on SPR. Signed-off-by: Artem Bityutskiy Signed-off-by: Rafael J. Wysocki Signed-off-by: yingbao jia Signed-off-by: Jun Tian --- drivers/idle/intel_idle.c | 47 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 0b66e25c0e2d..1c7c25909e54 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -761,6 +761,46 @@ static struct cpuidle_state icx_cstates[] __initdata = { .enter = NULL } }; +/* + * On Sapphire Rapids Xeon C1 has to be disabled if C1E is enabled, and vice + * versa. On SPR C1E is enabled only if "C1E promotion" bit is set in + * MSR_IA32_POWER_CTL. But in this case there effectively no C1, because C1 + * requests are promoted to C1E. If the "C1E promotion" bit is cleared, then + * both C1 and C1E requests end up with C1, so there is effectively no C1E. + * + * By default we enable C1 and disable C1E by marking it with + * 'CPUIDLE_FLAG_UNUSABLE'. + */ +static struct cpuidle_state spr_cstates[] __initdata = { + { + .name = "C1", + .desc = "MWAIT 0x00", + .flags = MWAIT2flg(0x00), + .exit_latency = 1, + .target_residency = 1, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C1E", + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE | \ + CPUIDLE_FLAG_UNUSABLE, + .exit_latency = 2, + .target_residency = 4, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C6", + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 290, + .target_residency = 800, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .enter = NULL } +}; + static struct cpuidle_state atom_cstates[] __initdata = { { .name = "C1E", @@ -1104,6 +1144,12 @@ static const struct idle_cpu idle_cpu_icx __initconst = { .use_acpi = true, }; +static const struct idle_cpu idle_cpu_spr __initconst = { + .state_table = spr_cstates, + .disable_promotion_to_c1e = true, + .use_acpi = true, +}; + static const struct idle_cpu idle_cpu_avn __initconst = { .state_table = avn_cstates, .disable_promotion_to_c1e = true, @@ -1166,6 +1212,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &idle_cpu_skx), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &idle_cpu_icx), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &idle_cpu_icx), + X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &idle_cpu_spr), X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &idle_cpu_knl), X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &idle_cpu_knl), X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT, &idle_cpu_bxt), -- Gitee From fecfa8c2b8f7e0fafb27357c19741d6f1024d0f3 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Wed, 2 Mar 2022 10:15:59 +0200 Subject: [PATCH 043/674] intel_idle: add 'preferred_cstates' module argument mainline inclusion from mainline-v5.18-rc1 commit da0e58c038e60e7e65d30813ebdfe91687aa8a24 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5BECY CVE: NA Intel-SIG: commit da0e58c038e6 intel_idle: add 'preferred_cstates' module argument. Backport intel_idle driver -------------------------------- On Sapphire Rapids Xeon (SPR) the C1 and C1E states are basically mutually exclusive - only one of them can be enabled. By default, 'intel_idle' driver enables C1 and disables C1E. However, some users prefer to use C1E instead of C1, because it saves more energy. This patch adds a new module parameter ('preferred_cstates') for enabling C1E and disabling C1. Here is the idea behind it. 1. This option has effect only for "mutually exclusive" C-states like C1 and C1E on SPR. 2. It does not have any effect on independent C-states, which do not require other C-states to be disabled (most states on most platforms as of today). 3. For mutually exclusive C-states, the 'intel_idle' driver always has a reasonable default, such as enabling C1 on SPR by default. On other platforms, the default may be different. 4. Users can override the default using the 'preferred_cstates' parameter. 5. The parameter accepts the preferred C-states bit-mask, similarly to the existing 'states_off' parameter. 6. This parameter is not limited to C1/C1E, and leaves room for supporting other mutually exclusive C-states, if they come in the future. Today 'intel_idle' can only be compiled-in, which means that on SPR, in order to disable C1 and enable C1E, users should boot with the following kernel argument: intel_idle.preferred_cstates=4 Signed-off-by: Artem Bityutskiy Signed-off-by: Rafael J. Wysocki Signed-off-by: yingbao jia Signed-off-by: Jun Tian --- drivers/idle/intel_idle.c | 46 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 1c7c25909e54..b2688c326522 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -64,6 +64,7 @@ static struct cpuidle_driver intel_idle_driver = { /* intel_idle.max_cstate=0 disables driver */ static int max_cstate = CPUIDLE_STATE_MAX - 1; static unsigned int disabled_states_mask; +static unsigned int preferred_states_mask; static struct cpuidle_device __percpu *intel_idle_cpuidle_devices; @@ -1400,6 +1401,8 @@ static inline void intel_idle_init_cstates_acpi(struct cpuidle_driver *drv) { } static inline bool intel_idle_off_by_default(u32 mwait_hint) { return false; } #endif /* !CONFIG_ACPI_PROCESSOR_CSTATE */ +static void c1e_promotion_enable(void); + /** * ivt_idle_state_table_update - Tune the idle states table for Ivy Town. * @@ -1570,6 +1573,26 @@ static void __init skx_idle_state_table_update(void) } } +/** + * spr_idle_state_table_update - Adjust Sapphire Rapids idle states table. + */ +static void __init spr_idle_state_table_update(void) +{ + /* Check if user prefers C1E over C1. */ + if (preferred_states_mask & BIT(2)) { + if (preferred_states_mask & BIT(1)) + /* Both can't be enabled, stick to the defaults. */ + return; + + spr_cstates[0].flags |= CPUIDLE_FLAG_UNUSABLE; + spr_cstates[1].flags &= ~CPUIDLE_FLAG_UNUSABLE; + + /* Enable C1E using the "C1E promotion" bit. */ + c1e_promotion_enable(); + disable_promotion_to_c1e = false; + } +} + static bool __init intel_idle_verify_cstate(unsigned int mwait_hint) { unsigned int mwait_cstate = MWAIT_HINT2CSTATE(mwait_hint) + 1; @@ -1604,6 +1627,9 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv) case INTEL_FAM6_SKYLAKE_X: skx_idle_state_table_update(); break; + case INTEL_FAM6_SAPPHIRERAPIDS_X: + spr_idle_state_table_update(); + break; } for (cstate = 0; cstate < CPUIDLE_STATE_MAX; ++cstate) { @@ -1676,6 +1702,15 @@ static void auto_demotion_disable(void) wrmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr_bits); } +static void c1e_promotion_enable(void) +{ + unsigned long long msr_bits; + + rdmsrl(MSR_IA32_POWER_CTL, msr_bits); + msr_bits |= 0x2; + wrmsrl(MSR_IA32_POWER_CTL, msr_bits); +} + static void c1e_promotion_disable(void) { unsigned long long msr_bits; @@ -1845,3 +1880,14 @@ module_param(max_cstate, int, 0444); */ module_param_named(states_off, disabled_states_mask, uint, 0444); MODULE_PARM_DESC(states_off, "Mask of disabled idle states"); +/* + * Some platforms come with mutually exclusive C-states, so that if one is + * enabled, the other C-states must not be used. Example: C1 and C1E on + * Sapphire Rapids platform. This parameter allows for selecting the + * preferred C-states among the groups of mutually exclusive C-states - the + * selected C-states will be registered, the other C-states from the mutually + * exclusive group won't be registered. If the platform has no mutually + * exclusive C-states, this parameter has no effect. + */ +module_param_named(preferred_cstates, preferred_states_mask, uint, 0444); +MODULE_PARM_DESC(preferred_cstates, "Mask of preferred idle states"); -- Gitee From 5e98d44d9f6b0f2b43886a483fbce9ba3cc0a402 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Wed, 2 Mar 2022 10:16:00 +0200 Subject: [PATCH 044/674] intel_idle: add core C6 optimization for SPR mainline inclusion from mainline-v5.18-rc1 commit 3a9cf77b60dc9839b6674943bb7c9dcd524b6294 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5BECY CVE: NA Intel-SIG: commit 3a9cf77b60dc intel_idle: add core C6 optimization for SPR. Backport intel_idle driver -------------------------------- Add a Sapphire Rapids Xeon C6 optimization, similar to what we have for Sky Lake Xeon: if package C6 is disabled, adjust C6 exit latency and target residency to match core C6 values, instead of using the default package C6 values. Signed-off-by: Artem Bityutskiy Signed-off-by: Rafael J. Wysocki Signed-off-by: yingbao jia Signed-off-by: Jun Tian --- drivers/idle/intel_idle.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index b2688c326522..e385ddf15b32 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1578,6 +1578,8 @@ static void __init skx_idle_state_table_update(void) */ static void __init spr_idle_state_table_update(void) { + unsigned long long msr; + /* Check if user prefers C1E over C1. */ if (preferred_states_mask & BIT(2)) { if (preferred_states_mask & BIT(1)) @@ -1591,6 +1593,19 @@ static void __init spr_idle_state_table_update(void) c1e_promotion_enable(); disable_promotion_to_c1e = false; } + + /* + * By default, the C6 state assumes the worst-case scenario of package + * C6. However, if PC6 is disabled, we update the numbers to match + * core C6. + */ + rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr); + + /* Limit value 2 and above allow for PC6. */ + if ((msr & 0x7) < 2) { + spr_cstates[2].exit_latency = 190; + spr_cstates[2].target_residency = 600; + } } static bool __init intel_idle_verify_cstate(unsigned int mwait_hint) -- Gitee From 2b213f7240392736d1280bff47b7f8a7dde70f39 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 15 Mar 2022 20:35:47 +0100 Subject: [PATCH 045/674] cpuidle: intel_idle: Update intel_idle() kerneldoc comment mainline inclusion from mainline-v5.18-rc1 commit a335b1e6bb29300d3bc6749763a4298627e594ba category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5BECY CVE: NA Intel-SIG: commit a335b1e6bb29 cpuidle: intel_idle: Update intel_idle() kerneldoc comment. Backport intel_idle driver -------------------------------- Commit bf9282dc26e7 ("cpuidle: Make CPUIDLE_FLAG_TLB_FLUSHED generic") moved the leave_mm() call away from intel_idle(), but it didn't update its kerneldoc comment accordingly, so do that now. Fixes: bf9282dc26e7 ("cpuidle: Make CPUIDLE_FLAG_TLB_FLUSHED generic") Signed-off-by: Rafael J. Wysocki Signed-off-by: yingbao jia Signed-off-by: Jun Tian --- drivers/idle/intel_idle.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index e385ddf15b32..4ba4ab974dbe 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -122,9 +122,6 @@ static unsigned int mwait_substates __initdata; * If the local APIC timer is not known to be reliable in the target idle state, * enable one-shot tick broadcasting for the target CPU before executing MWAIT. * - * Optionally call leave_mm() for the target CPU upfront to avoid wakeups due to - * flushing user TLBs. - * * Must be called under local_irq_disable(). */ static __cpuidle int intel_idle(struct cpuidle_device *dev, -- Gitee From e724de09b2346397db17e008b27352ca9d794c09 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 15 Mar 2022 20:36:42 +0100 Subject: [PATCH 046/674] cpuidle: intel_idle: Drop redundant backslash at line end mainline inclusion from mainline-v5.18-rc1 commit 03eb65224e5711e7a2f34b500d44866b322a249a category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5BECY CVE: NA Intel-SIG: commit 03eb65224e57 cpuidle: intel_idle: Drop redundant backslash at line end. Backport intel_idle driver -------------------------------- Drop a redundant backslash character at the end of a line in the spr_cstates[] definition. Signed-off-by: Rafael J. Wysocki Acked-by: Artem Bityutskiy Signed-off-by: yingbao jia Signed-off-by: Jun Tian --- drivers/idle/intel_idle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 4ba4ab974dbe..b7640cfe0020 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -781,7 +781,7 @@ static struct cpuidle_state spr_cstates[] __initdata = { { .name = "C1E", .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE | \ + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE | CPUIDLE_FLAG_UNUSABLE, .exit_latency = 2, .target_residency = 4, -- Gitee From 950d18c9e2a1398787f9327a29ed0eb9fb7db3a8 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Wed, 27 Apr 2022 09:08:52 +0300 Subject: [PATCH 047/674] intel_idle: Fix the 'preferred_cstates' module parameter mainline inclusion from mainline-v5.18-rc5 commit 39c184a6a9a7a99950b321d55fe713175cf1d404 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5BECY CVE: NA Intel-SIG: commit 39c184a6a9a7 intel_idle: Fix the 'preferred_cstates' module parameter. Backport for intel_idle driver -------------------------------- Problem description. When user boots kernel up with the 'intel_idle.preferred_cstates=4' option, we enable C1E and disable C1 states on Sapphire Rapids Xeon (SPR). In order for C1E to work on SPR, we have to enable the C1E promotion bit on all CPUs. However, we enable it only on one CPU. Fix description. The 'intel_idle' driver already has the infrastructure for disabling C1E promotion on every CPU. This patch uses the same infrastructure for enabling C1E promotion on every CPU. It changes the boolean 'disable_promotion_to_c1e' variable to a tri-state 'c1e_promotion' variable. Tested on a 2-socket SPR system. I verified the following combinations: * C1E promotion enabled and disabled in BIOS. * Booted with and without the 'intel_idle.preferred_cstates=4' kernel argument. In all 4 cases C1E promotion was correctly set on all CPUs. Also tested on an old Broadwell system, just to make sure it does not cause a regression. C1E promotion was correctly disabled on that system, both C1 and C1E were exposed (as expected). Fixes: da0e58c038e6 ("intel_idle: add 'preferred_cstates' module argument") Reported-by: Jan Beulich Signed-off-by: Artem Bityutskiy [ rjw: Minor changelog edits ] Signed-off-by: Rafael J. Wysocki Signed-off-by: yingbao jia Signed-off-by: Jun Tian --- drivers/idle/intel_idle.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index b7640cfe0020..cf5ed4c1d02c 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -69,7 +69,12 @@ static unsigned int preferred_states_mask; static struct cpuidle_device __percpu *intel_idle_cpuidle_devices; static unsigned long auto_demotion_disable_flags; -static bool disable_promotion_to_c1e; + +static enum { + C1E_PROMOTION_PRESERVE, + C1E_PROMOTION_ENABLE, + C1E_PROMOTION_DISABLE +} c1e_promotion = C1E_PROMOTION_PRESERVE; struct idle_cpu { struct cpuidle_state *state_table; @@ -1398,8 +1403,6 @@ static inline void intel_idle_init_cstates_acpi(struct cpuidle_driver *drv) { } static inline bool intel_idle_off_by_default(u32 mwait_hint) { return false; } #endif /* !CONFIG_ACPI_PROCESSOR_CSTATE */ -static void c1e_promotion_enable(void); - /** * ivt_idle_state_table_update - Tune the idle states table for Ivy Town. * @@ -1587,8 +1590,7 @@ static void __init spr_idle_state_table_update(void) spr_cstates[1].flags &= ~CPUIDLE_FLAG_UNUSABLE; /* Enable C1E using the "C1E promotion" bit. */ - c1e_promotion_enable(); - disable_promotion_to_c1e = false; + c1e_promotion = C1E_PROMOTION_ENABLE; } /* @@ -1754,7 +1756,9 @@ static int intel_idle_cpu_init(unsigned int cpu) if (auto_demotion_disable_flags) auto_demotion_disable(); - if (disable_promotion_to_c1e) + if (c1e_promotion == C1E_PROMOTION_ENABLE) + c1e_promotion_enable(); + else if (c1e_promotion == C1E_PROMOTION_DISABLE) c1e_promotion_disable(); return 0; @@ -1833,7 +1837,8 @@ static int __init intel_idle_init(void) if (icpu) { cpuidle_state_table = icpu->state_table; auto_demotion_disable_flags = icpu->auto_demotion_disable_flags; - disable_promotion_to_c1e = icpu->disable_promotion_to_c1e; + if (icpu->disable_promotion_to_c1e) + c1e_promotion = C1E_PROMOTION_DISABLE; if (icpu->use_acpi || force_use_acpi) intel_idle_acpi_cst_extract(); } else if (!intel_idle_acpi_cst_extract()) { -- Gitee From ca838b2fe275e9acf0aaf8d31c1ab6bd96609c86 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Wed, 27 Apr 2022 09:08:53 +0300 Subject: [PATCH 048/674] intel_idle: Fix SPR C6 optimization mainline inclusion from mainline-v5.18-rc5 commit 7eac3bd38d18cd3317756649921b8264ddfee692 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5BECY CVE: NA Intel-SIG: commit 7eac3bd38d18 intel_idle: Fix SPR C6 optimization. Backport for intel_idle driver -------------------------------- The Sapphire Rapids (SPR) C6 optimization was added to the end of the 'spr_idle_state_table_update()' function. However, the function has a 'return' which may happen before the optimization has a chance to run. And this may prevent the optimization from happening. This is an unlikely scenario, but possible if user boots with, say, the 'intel_idle.preferred_cstates=6' kernel boot option. This patch fixes the issue by eliminating the problematic 'return' statement. Fixes: 3a9cf77b60dc ("intel_idle: add core C6 optimization for SPR") Suggested-by: Jan Beulich Reported-by: Jan Beulich Signed-off-by: Artem Bityutskiy [ rjw: Minor changelog edits ] Signed-off-by: Rafael J. Wysocki Signed-off-by: yingbao jia Signed-off-by: Jun Tian --- drivers/idle/intel_idle.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index cf5ed4c1d02c..47551ab73ca8 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1581,11 +1581,9 @@ static void __init spr_idle_state_table_update(void) unsigned long long msr; /* Check if user prefers C1E over C1. */ - if (preferred_states_mask & BIT(2)) { - if (preferred_states_mask & BIT(1)) - /* Both can't be enabled, stick to the defaults. */ - return; - + if ((preferred_states_mask & BIT(2)) && + !(preferred_states_mask & BIT(1))) { + /* Disable C1 and enable C1E. */ spr_cstates[0].flags |= CPUIDLE_FLAG_UNUSABLE; spr_cstates[1].flags &= ~CPUIDLE_FLAG_UNUSABLE; -- Gitee From 0ce855fdbd1fbc84b9902e78eb9468834723b77c Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Wed, 28 Oct 2020 18:44:45 -0700 Subject: [PATCH 049/674] PCI: Add defines for Designated Vendor-Specific Extended Capability mainline inclusion from mainline-v5.11-rc1 commit 1dc2da5cd51f648de6d1df87e2bc6ea13f72f19c category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596K9 CVE: NA Intel-SIG: commit 1dc2da5cd51f PCI: Add defines for Designated Vendor-Specific Extended Capability. Backport for intel PMT (Platform Monitoring Technology) support -------------------------------- Add PCIe Designated Vendor-Specific Extended Capability (DVSEC) and defines for the header offsets. Defined in PCIe r5.0, sec 7.9.6. Signed-off-by: David E. Box Acked-by: Bjorn Helgaas Reviewed-by: Andy Shevchenko Signed-off-by: Lee Jones Signed-off-by: yingbao jia Signed-off-by: Jun Tian --- include/uapi/linux/pci_regs.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 7e0d526dd96f..9e257fe9efa5 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -729,6 +729,7 @@ #define PCI_EXT_CAP_ID_DPC 0x1D /* Downstream Port Containment */ #define PCI_EXT_CAP_ID_L1SS 0x1E /* L1 PM Substates */ #define PCI_EXT_CAP_ID_PTM 0x1F /* Precision Time Measurement */ +#define PCI_EXT_CAP_ID_DVSEC 0x23 /* Designated Vendor-Specific */ #define PCI_EXT_CAP_ID_DLF 0x25 /* Data Link Feature */ #define PCI_EXT_CAP_ID_PL_16GT 0x26 /* Physical Layer 16.0 GT/s */ #define PCI_EXT_CAP_ID_MAX PCI_EXT_CAP_ID_PL_16GT @@ -1079,6 +1080,10 @@ #define PCI_L1SS_CTL1_LTR_L12_TH_SCALE 0xe0000000 /* LTR_L1.2_THRESHOLD_Scale */ #define PCI_L1SS_CTL2 0x0c /* Control 2 Register */ +/* Designated Vendor-Specific (DVSEC, PCI_EXT_CAP_ID_DVSEC) */ +#define PCI_DVSEC_HEADER1 0x4 /* Designated Vendor-Specific Header1 */ +#define PCI_DVSEC_HEADER2 0x8 /* Designated Vendor-Specific Header2 */ + /* Data Link Feature */ #define PCI_DLF_CAP 0x04 /* Capabilities Register */ #define PCI_DLF_EXCHANGE_ENABLE 0x80000000 /* Data Link Feature Exchange Enable */ -- Gitee From e615c7102016697dfa1ee9c6940d09ae75c8a11e Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Wed, 28 Oct 2020 18:55:33 -0700 Subject: [PATCH 050/674] mfd: Intel Platform Monitoring Technology support mainline inclusion from mainline-v5.11-rc1 commit 4f8217d5b0ca8ace78a27dc371b87697eedc421d category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596K9 CVE: NA Intel-SIG: commit 4f8217d5b0ca mfd: Intel Platform Monitoring Technology support. Backport for intel PMT (Platform Monitoring Technology) support -------------------------------- Intel Platform Monitoring Technology (PMT) is an architecture for enumerating and accessing hardware monitoring facilities. PMT supports multiple types of monitoring capabilities. This driver creates platform devices for each type so that they may be managed by capability specific drivers (to be introduced). Capabilities are discovered using PCIe DVSEC ids. Support is included for the 3 current capability types, Telemetry, Watcher, and Crashlog. The features are available on new Intel platforms starting from Tiger Lake for which support is added. This patch adds support for Tiger Lake (TGL), Alder Lake (ADL), and Out-of-Band Management Services Module (OOBMSM). Also add a quirk mechanism for several early hardware differences and bugs. For Tiger Lake and Alder Lake, do not support Watcher and Crashlog capabilities since they will not be compatible with future product. Also, fix use a quirk to fix the discovery table offset. Co-developed-by: Alexander Duyck Signed-off-by: Alexander Duyck Signed-off-by: David E. Box Reviewed-by: Andy Shevchenko Reviewed-by: Hans de Goede Signed-off-by: Lee Jones Signed-off-by: yingbao jia Signed-off-by: Jun Tian --- MAINTAINERS | 5 + drivers/mfd/Kconfig | 10 ++ drivers/mfd/Makefile | 1 + drivers/mfd/intel_pmt.c | 223 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 239 insertions(+) create mode 100644 drivers/mfd/intel_pmt.c diff --git a/MAINTAINERS b/MAINTAINERS index 23a23bd94c00..cdffd5192e09 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9038,6 +9038,11 @@ F: drivers/mfd/intel_soc_pmic* F: include/linux/mfd/intel_msic.h F: include/linux/mfd/intel_soc_pmic* +INTEL PMT DRIVER +M: "David E. Box" +S: Maintained +F: drivers/mfd/intel_pmt.c + INTEL PRO/WIRELESS 2100, 2200BG, 2915ABG NETWORK CONNECTION SUPPORT M: Stanislav Yakovlev L: linux-wireless@vger.kernel.org diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 15680c3c9279..3f9f84f9f288 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -699,6 +699,16 @@ config MFD_INTEL_PMC_BXT Register and P-unit access. In addition this creates devices for iTCO watchdog and telemetry that are part of the PMC. +config MFD_INTEL_PMT + tristate "Intel Platform Monitoring Technology (PMT) support" + depends on PCI + select MFD_CORE + help + The Intel Platform Monitoring Technology (PMT) is an interface that + provides access to hardware monitor registers. This driver supports + Telemetry, Watcher, and Crashlog PMT capabilities/devices for + platforms starting from Tiger Lake. + config MFD_IPAQ_MICRO bool "Atmel Micro ASIC (iPAQ h3100/h3600/h3700) Support" depends on SA1100_H3100 || SA1100_H3600 diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index fb1df45a301e..ce8f1c0583d5 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -216,6 +216,7 @@ obj-$(CONFIG_MFD_INTEL_LPSS_PCI) += intel-lpss-pci.o obj-$(CONFIG_MFD_INTEL_LPSS_ACPI) += intel-lpss-acpi.o obj-$(CONFIG_MFD_INTEL_MSIC) += intel_msic.o obj-$(CONFIG_MFD_INTEL_PMC_BXT) += intel_pmc_bxt.o +obj-$(CONFIG_MFD_INTEL_PMT) += intel_pmt.o obj-$(CONFIG_MFD_PALMAS) += palmas.o obj-$(CONFIG_MFD_VIPERBOARD) += viperboard.o obj-$(CONFIG_MFD_RC5T583) += rc5t583.o rc5t583-irq.o diff --git a/drivers/mfd/intel_pmt.c b/drivers/mfd/intel_pmt.c new file mode 100644 index 000000000000..744b230cdcca --- /dev/null +++ b/drivers/mfd/intel_pmt.c @@ -0,0 +1,223 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel Platform Monitoring Technology PMT driver + * + * Copyright (c) 2020, Intel Corporation. + * All Rights Reserved. + * + * Author: David E. Box + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Intel DVSEC capability vendor space offsets */ +#define INTEL_DVSEC_ENTRIES 0xA +#define INTEL_DVSEC_SIZE 0xB +#define INTEL_DVSEC_TABLE 0xC +#define INTEL_DVSEC_TABLE_BAR(x) ((x) & GENMASK(2, 0)) +#define INTEL_DVSEC_TABLE_OFFSET(x) ((x) & GENMASK(31, 3)) +#define INTEL_DVSEC_ENTRY_SIZE 4 + +/* PMT capabilities */ +#define DVSEC_INTEL_ID_TELEMETRY 2 +#define DVSEC_INTEL_ID_WATCHER 3 +#define DVSEC_INTEL_ID_CRASHLOG 4 + +struct intel_dvsec_header { + u16 length; + u16 id; + u8 num_entries; + u8 entry_size; + u8 tbir; + u32 offset; +}; + +enum pmt_quirks { + /* Watcher capability not supported */ + PMT_QUIRK_NO_WATCHER = BIT(0), + + /* Crashlog capability not supported */ + PMT_QUIRK_NO_CRASHLOG = BIT(1), + + /* Use shift instead of mask to read discovery table offset */ + PMT_QUIRK_TABLE_SHIFT = BIT(2), +}; + +struct pmt_platform_info { + unsigned long quirks; +}; + +static const struct pmt_platform_info tgl_info = { + .quirks = PMT_QUIRK_NO_WATCHER | PMT_QUIRK_NO_CRASHLOG | + PMT_QUIRK_TABLE_SHIFT, +}; + +static int pmt_add_dev(struct pci_dev *pdev, struct intel_dvsec_header *header, + unsigned long quirks) +{ + struct device *dev = &pdev->dev; + struct resource *res, *tmp; + struct mfd_cell *cell; + const char *name; + int count = header->num_entries; + int size = header->entry_size; + int id = header->id; + int i; + + switch (id) { + case DVSEC_INTEL_ID_TELEMETRY: + name = "pmt_telemetry"; + break; + case DVSEC_INTEL_ID_WATCHER: + if (quirks & PMT_QUIRK_NO_WATCHER) { + dev_info(dev, "Watcher not supported\n"); + return 0; + } + name = "pmt_watcher"; + break; + case DVSEC_INTEL_ID_CRASHLOG: + if (quirks & PMT_QUIRK_NO_CRASHLOG) { + dev_info(dev, "Crashlog not supported\n"); + return 0; + } + name = "pmt_crashlog"; + break; + default: + dev_err(dev, "Unrecognized PMT capability: %d\n", id); + return -EINVAL; + } + + if (!header->num_entries || !header->entry_size) { + dev_err(dev, "Invalid count or size for %s header\n", name); + return -EINVAL; + } + + cell = devm_kzalloc(dev, sizeof(*cell), GFP_KERNEL); + if (!cell) + return -ENOMEM; + + res = devm_kcalloc(dev, count, sizeof(*res), GFP_KERNEL); + if (!res) + return -ENOMEM; + + if (quirks & PMT_QUIRK_TABLE_SHIFT) + header->offset >>= 3; + + /* + * The PMT DVSEC contains the starting offset and count for a block of + * discovery tables, each providing access to monitoring facilities for + * a section of the device. Create a resource list of these tables to + * provide to the driver. + */ + for (i = 0, tmp = res; i < count; i++, tmp++) { + tmp->start = pdev->resource[header->tbir].start + + header->offset + i * (size << 2); + tmp->end = tmp->start + (size << 2) - 1; + tmp->flags = IORESOURCE_MEM; + } + + cell->resources = res; + cell->num_resources = count; + cell->name = name; + + return devm_mfd_add_devices(dev, PLATFORM_DEVID_AUTO, cell, 1, NULL, 0, + NULL); +} + +static int pmt_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + struct pmt_platform_info *info; + unsigned long quirks = 0; + bool found_devices = false; + int ret, pos = 0; + + ret = pcim_enable_device(pdev); + if (ret) + return ret; + + info = (struct pmt_platform_info *)id->driver_data; + + if (info) + quirks = info->quirks; + + do { + struct intel_dvsec_header header; + u32 table; + u16 vid; + + pos = pci_find_next_ext_capability(pdev, pos, PCI_EXT_CAP_ID_DVSEC); + if (!pos) + break; + + pci_read_config_word(pdev, pos + PCI_DVSEC_HEADER1, &vid); + if (vid != PCI_VENDOR_ID_INTEL) + continue; + + pci_read_config_word(pdev, pos + PCI_DVSEC_HEADER2, + &header.id); + pci_read_config_byte(pdev, pos + INTEL_DVSEC_ENTRIES, + &header.num_entries); + pci_read_config_byte(pdev, pos + INTEL_DVSEC_SIZE, + &header.entry_size); + pci_read_config_dword(pdev, pos + INTEL_DVSEC_TABLE, + &table); + + header.tbir = INTEL_DVSEC_TABLE_BAR(table); + header.offset = INTEL_DVSEC_TABLE_OFFSET(table); + + ret = pmt_add_dev(pdev, &header, quirks); + if (ret) { + dev_warn(&pdev->dev, + "Failed to add device for DVSEC id %d\n", + header.id); + continue; + } + + found_devices = true; + } while (true); + + if (!found_devices) + return -ENODEV; + + pm_runtime_put(&pdev->dev); + pm_runtime_allow(&pdev->dev); + + return 0; +} + +static void pmt_pci_remove(struct pci_dev *pdev) +{ + pm_runtime_forbid(&pdev->dev); + pm_runtime_get_sync(&pdev->dev); +} + +#define PCI_DEVICE_ID_INTEL_PMT_ADL 0x467d +#define PCI_DEVICE_ID_INTEL_PMT_OOBMSM 0x09a7 +#define PCI_DEVICE_ID_INTEL_PMT_TGL 0x9a0d +static const struct pci_device_id pmt_pci_ids[] = { + { PCI_DEVICE_DATA(INTEL, PMT_ADL, &tgl_info) }, + { PCI_DEVICE_DATA(INTEL, PMT_OOBMSM, NULL) }, + { PCI_DEVICE_DATA(INTEL, PMT_TGL, &tgl_info) }, + { } +}; +MODULE_DEVICE_TABLE(pci, pmt_pci_ids); + +static struct pci_driver pmt_pci_driver = { + .name = "intel-pmt", + .id_table = pmt_pci_ids, + .probe = pmt_pci_probe, + .remove = pmt_pci_remove, +}; +module_pci_driver(pmt_pci_driver); + +MODULE_AUTHOR("David E. Box "); +MODULE_DESCRIPTION("Intel Platform Monitoring Technology PMT driver"); +MODULE_LICENSE("GPL v2"); -- Gitee From 71bb316092365fe81085377dce3e97d2a16669d5 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 28 Oct 2020 18:55:34 -0700 Subject: [PATCH 051/674] platform/x86: Intel PMT class driver mainline inclusion from mainline-v5.11-rc1 commit e2729113ce66d8d21f729b41bc3ed3feaf1acf69 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596K9 CVE: NA Intel-SIG: commit e2729113ce66 platform/x86: Intel PMT class driver. Backport for intel PMT (Platform Monitoring Technology) support -------------------------------- Intel Platform Monitoring Technology is meant to provide a common way to access telemetry and system metrics. Register mappings are not provided by the driver. Instead, a GUID is read from a header for each endpoint. The GUID identifies the device and is to be used with an XML, provided by the vendor, to discover the available set of metrics and their register mapping. This allows firmware updates to modify the register space without needing to update the driver every time with new mappings. Firmware writes a new GUID in this case to specify the new mapping. Software tools with access to the associated XML file can then interpret the changes. The module manages access to all Intel PMT endpoints on a system, independent of the device exporting them. It creates an intel_pmt class to manage the devices. For each telemetry endpoint, sysfs files provide GUID and size information as well as a pointer to the parent device the telemetry came from. Software may discover the association between endpoints and devices by iterating through the list in sysfs, or by looking for the existence of the class folder under the device of interest. A binary sysfs attribute of the same name allows software to then read or map the telemetry space for direct access. Signed-off-by: Alexander Duyck Signed-off-by: David E. Box Reviewed-by: Hans de Goede Signed-off-by: Lee Jones Signed-off-by: yingbao jia Signed-off-by: Jun Tian --- .../ABI/testing/sysfs-class-intel_pmt | 54 ++++ MAINTAINERS | 1 + drivers/platform/x86/Kconfig | 12 + drivers/platform/x86/Makefile | 1 + drivers/platform/x86/intel_pmt_class.c | 297 ++++++++++++++++++ drivers/platform/x86/intel_pmt_class.h | 52 +++ 6 files changed, 417 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-class-intel_pmt create mode 100644 drivers/platform/x86/intel_pmt_class.c create mode 100644 drivers/platform/x86/intel_pmt_class.h diff --git a/Documentation/ABI/testing/sysfs-class-intel_pmt b/Documentation/ABI/testing/sysfs-class-intel_pmt new file mode 100644 index 000000000000..926b5cf95fd1 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-class-intel_pmt @@ -0,0 +1,54 @@ +What: /sys/class/intel_pmt/ +Date: October 2020 +KernelVersion: 5.10 +Contact: David Box +Description: + The intel_pmt/ class directory contains information for + devices that expose hardware telemetry using Intel Platform + Monitoring Technology (PMT) + +What: /sys/class/intel_pmt/telem +Date: October 2020 +KernelVersion: 5.10 +Contact: David Box +Description: + The telem directory contains files describing an instance of + a PMT telemetry device that exposes hardware telemetry. Each + telem directory has an associated telem file. This file + may be opened and mapped or read to access the telemetry space + of the device. The register layout of the telemetry space is + determined from an XML file that matches the PCI device id and + GUID for the device. + +What: /sys/class/intel_pmt/telem/telem +Date: October 2020 +KernelVersion: 5.10 +Contact: David Box +Description: + (RO) The telemetry data for this telemetry device. This file + may be mapped or read to obtain the data. + +What: /sys/class/intel_pmt/telem/guid +Date: October 2020 +KernelVersion: 5.10 +Contact: David Box +Description: + (RO) The GUID for this telemetry device. The GUID identifies + the version of the XML file for the parent device that is to + be used to get the register layout. + +What: /sys/class/intel_pmt/telem/size +Date: October 2020 +KernelVersion: 5.10 +Contact: David Box +Description: + (RO) The size of telemetry region in bytes that corresponds to + the mapping size for the telem file. + +What: /sys/class/intel_pmt/telem/offset +Date: October 2020 +KernelVersion: 5.10 +Contact: David Box +Description: + (RO) The offset of telemetry region in bytes that corresponds to + the mapping for the telem file. diff --git a/MAINTAINERS b/MAINTAINERS index cdffd5192e09..466b1c599848 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9042,6 +9042,7 @@ INTEL PMT DRIVER M: "David E. Box" S: Maintained F: drivers/mfd/intel_pmt.c +F: drivers/platform/x86/intel_pmt_* INTEL PRO/WIRELESS 2100, 2200BG, 2915ABG NETWORK CONNECTION SUPPORT M: Stanislav Yakovlev diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig index a1858689d6e1..9118f76ed3e9 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -1362,6 +1362,18 @@ config INTEL_PMC_CORE - LTR Ignore - MPHY/PLL gating status (Sunrisepoint PCH only) +config INTEL_PMT_CLASS + tristate "Intel Platform Monitoring Technology (PMT) Class driver" + help + The Intel Platform Monitoring Technology (PMT) class driver provides + the basic sysfs interface and file hierarchy uses by PMT devices. + + For more information, see: + + + To compile this driver as a module, choose M here: the module + will be called intel_pmt_class. + config INTEL_PUNIT_IPC tristate "Intel P-Unit IPC Driver" help diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile index 5f823f7eff45..f4b1f87f2401 100644 --- a/drivers/platform/x86/Makefile +++ b/drivers/platform/x86/Makefile @@ -140,6 +140,7 @@ obj-$(CONFIG_INTEL_MFLD_THERMAL) += intel_mid_thermal.o obj-$(CONFIG_INTEL_MID_POWER_BUTTON) += intel_mid_powerbtn.o obj-$(CONFIG_INTEL_MRFLD_PWRBTN) += intel_mrfld_pwrbtn.o obj-$(CONFIG_INTEL_PMC_CORE) += intel_pmc_core.o intel_pmc_core_pltdrv.o +obj-$(CONFIG_INTEL_PMT_CLASS) += intel_pmt_class.o obj-$(CONFIG_INTEL_PUNIT_IPC) += intel_punit_ipc.o obj-$(CONFIG_INTEL_SCU_IPC) += intel_scu_ipc.o obj-$(CONFIG_INTEL_SCU_PCI) += intel_scu_pcidrv.o diff --git a/drivers/platform/x86/intel_pmt_class.c b/drivers/platform/x86/intel_pmt_class.c new file mode 100644 index 000000000000..aa88dc23bbde --- /dev/null +++ b/drivers/platform/x86/intel_pmt_class.c @@ -0,0 +1,297 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel Platform Monitory Technology Telemetry driver + * + * Copyright (c) 2020, Intel Corporation. + * All Rights Reserved. + * + * Author: "Alexander Duyck" + */ + +#include +#include +#include +#include + +#include "intel_pmt_class.h" + +#define PMT_XA_START 0 +#define PMT_XA_MAX INT_MAX +#define PMT_XA_LIMIT XA_LIMIT(PMT_XA_START, PMT_XA_MAX) + +/* + * sysfs + */ +static ssize_t +intel_pmt_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, loff_t off, + size_t count) +{ + struct intel_pmt_entry *entry = container_of(attr, + struct intel_pmt_entry, + pmt_bin_attr); + + if (off < 0) + return -EINVAL; + + if (off >= entry->size) + return 0; + + if (count > entry->size - off) + count = entry->size - off; + + memcpy_fromio(buf, entry->base + off, count); + + return count; +} + +static int +intel_pmt_mmap(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, struct vm_area_struct *vma) +{ + struct intel_pmt_entry *entry = container_of(attr, + struct intel_pmt_entry, + pmt_bin_attr); + unsigned long vsize = vma->vm_end - vma->vm_start; + struct device *dev = kobj_to_dev(kobj); + unsigned long phys = entry->base_addr; + unsigned long pfn = PFN_DOWN(phys); + unsigned long psize; + + if (vma->vm_flags & (VM_WRITE | VM_MAYWRITE)) + return -EROFS; + + psize = (PFN_UP(entry->base_addr + entry->size) - pfn) * PAGE_SIZE; + if (vsize > psize) { + dev_err(dev, "Requested mmap size is too large\n"); + return -EINVAL; + } + + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + if (io_remap_pfn_range(vma, vma->vm_start, pfn, + vsize, vma->vm_page_prot)) + return -EAGAIN; + + return 0; +} + +static ssize_t +guid_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct intel_pmt_entry *entry = dev_get_drvdata(dev); + + return sprintf(buf, "0x%x\n", entry->guid); +} +static DEVICE_ATTR_RO(guid); + +static ssize_t size_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct intel_pmt_entry *entry = dev_get_drvdata(dev); + + return sprintf(buf, "%zu\n", entry->size); +} +static DEVICE_ATTR_RO(size); + +static ssize_t +offset_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct intel_pmt_entry *entry = dev_get_drvdata(dev); + + return sprintf(buf, "%lu\n", offset_in_page(entry->base_addr)); +} +static DEVICE_ATTR_RO(offset); + +static struct attribute *intel_pmt_attrs[] = { + &dev_attr_guid.attr, + &dev_attr_size.attr, + &dev_attr_offset.attr, + NULL +}; +ATTRIBUTE_GROUPS(intel_pmt); + +static struct class intel_pmt_class = { + .name = "intel_pmt", + .owner = THIS_MODULE, + .dev_groups = intel_pmt_groups, +}; + +static int intel_pmt_populate_entry(struct intel_pmt_entry *entry, + struct intel_pmt_header *header, + struct device *dev, + struct resource *disc_res) +{ + struct pci_dev *pci_dev = to_pci_dev(dev->parent); + u8 bir; + + /* + * The base offset should always be 8 byte aligned. + * + * For non-local access types the lower 3 bits of base offset + * contains the index of the base address register where the + * telemetry can be found. + */ + bir = GET_BIR(header->base_offset); + + /* Local access and BARID only for now */ + switch (header->access_type) { + case ACCESS_LOCAL: + if (bir) { + dev_err(dev, + "Unsupported BAR index %d for access type %d\n", + bir, header->access_type); + return -EINVAL; + } + /* + * For access_type LOCAL, the base address is as follows: + * base address = end of discovery region + base offset + */ + entry->base_addr = disc_res->end + 1 + header->base_offset; + break; + case ACCESS_BARID: + /* + * If another BAR was specified then the base offset + * represents the offset within that BAR. SO retrieve the + * address from the parent PCI device and add offset. + */ + entry->base_addr = pci_resource_start(pci_dev, bir) + + GET_ADDRESS(header->base_offset); + break; + default: + dev_err(dev, "Unsupported access type %d\n", + header->access_type); + return -EINVAL; + } + + entry->guid = header->guid; + entry->size = header->size; + + return 0; +} + +static int intel_pmt_dev_register(struct intel_pmt_entry *entry, + struct intel_pmt_namespace *ns, + struct device *parent) +{ + struct resource res; + struct device *dev; + int ret; + + ret = xa_alloc(ns->xa, &entry->devid, entry, PMT_XA_LIMIT, GFP_KERNEL); + if (ret) + return ret; + + dev = device_create(&intel_pmt_class, parent, MKDEV(0, 0), entry, + "%s%d", ns->name, entry->devid); + + if (IS_ERR(dev)) { + dev_err(parent, "Could not create %s%d device node\n", + ns->name, entry->devid); + ret = PTR_ERR(dev); + goto fail_dev_create; + } + + entry->kobj = &dev->kobj; + + if (ns->attr_grp) { + ret = sysfs_create_group(entry->kobj, ns->attr_grp); + if (ret) + goto fail_sysfs; + } + + /* if size is 0 assume no data buffer, so no file needed */ + if (!entry->size) + return 0; + + res.start = entry->base_addr; + res.end = res.start + entry->size - 1; + res.flags = IORESOURCE_MEM; + + entry->base = devm_ioremap_resource(dev, &res); + if (IS_ERR(entry->base)) { + ret = PTR_ERR(entry->base); + goto fail_ioremap; + } + + sysfs_bin_attr_init(&entry->pmt_bin_attr); + entry->pmt_bin_attr.attr.name = ns->name; + entry->pmt_bin_attr.attr.mode = 0440; + entry->pmt_bin_attr.mmap = intel_pmt_mmap; + entry->pmt_bin_attr.read = intel_pmt_read; + entry->pmt_bin_attr.size = entry->size; + + ret = sysfs_create_bin_file(&dev->kobj, &entry->pmt_bin_attr); + if (!ret) + return 0; + +fail_ioremap: + sysfs_remove_group(entry->kobj, ns->attr_grp); +fail_sysfs: + device_unregister(dev); +fail_dev_create: + xa_erase(ns->xa, entry->devid); + + return ret; +} + +int intel_pmt_dev_create(struct intel_pmt_entry *entry, + struct intel_pmt_namespace *ns, + struct platform_device *pdev, int idx) +{ + struct intel_pmt_header header; + struct resource *disc_res; + int ret = -ENODEV; + + disc_res = platform_get_resource(pdev, IORESOURCE_MEM, idx); + if (!disc_res) + return ret; + + entry->disc_table = devm_platform_ioremap_resource(pdev, idx); + if (IS_ERR(entry->disc_table)) + return PTR_ERR(entry->disc_table); + + ret = ns->pmt_header_decode(entry, &header, &pdev->dev); + if (ret) + return ret; + + ret = intel_pmt_populate_entry(entry, &header, &pdev->dev, disc_res); + if (ret) + return ret; + + return intel_pmt_dev_register(entry, ns, &pdev->dev); + +} +EXPORT_SYMBOL_GPL(intel_pmt_dev_create); + +void intel_pmt_dev_destroy(struct intel_pmt_entry *entry, + struct intel_pmt_namespace *ns) +{ + struct device *dev = kobj_to_dev(entry->kobj); + + if (entry->size) + sysfs_remove_bin_file(entry->kobj, &entry->pmt_bin_attr); + + if (ns->attr_grp) + sysfs_remove_group(entry->kobj, ns->attr_grp); + + device_unregister(dev); + xa_erase(ns->xa, entry->devid); +} +EXPORT_SYMBOL_GPL(intel_pmt_dev_destroy); + +static int __init pmt_class_init(void) +{ + return class_register(&intel_pmt_class); +} + +static void __exit pmt_class_exit(void) +{ + class_unregister(&intel_pmt_class); +} + +module_init(pmt_class_init); +module_exit(pmt_class_exit); + +MODULE_AUTHOR("Alexander Duyck "); +MODULE_DESCRIPTION("Intel PMT Class driver"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/platform/x86/intel_pmt_class.h b/drivers/platform/x86/intel_pmt_class.h new file mode 100644 index 000000000000..de8f8139ba31 --- /dev/null +++ b/drivers/platform/x86/intel_pmt_class.h @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _INTEL_PMT_CLASS_H +#define _INTEL_PMT_CLASS_H + +#include +#include +#include +#include +#include +#include + +/* PMT access types */ +#define ACCESS_BARID 2 +#define ACCESS_LOCAL 3 + +/* PMT discovery base address/offset register layout */ +#define GET_BIR(v) ((v) & GENMASK(2, 0)) +#define GET_ADDRESS(v) ((v) & GENMASK(31, 3)) + +struct intel_pmt_entry { + struct bin_attribute pmt_bin_attr; + struct kobject *kobj; + void __iomem *disc_table; + void __iomem *base; + unsigned long base_addr; + size_t size; + u32 guid; + int devid; +}; + +struct intel_pmt_header { + u32 base_offset; + u32 size; + u32 guid; + u8 access_type; +}; + +struct intel_pmt_namespace { + const char *name; + struct xarray *xa; + const struct attribute_group *attr_grp; + int (*pmt_header_decode)(struct intel_pmt_entry *entry, + struct intel_pmt_header *header, + struct device *dev); +}; + +int intel_pmt_dev_create(struct intel_pmt_entry *entry, + struct intel_pmt_namespace *ns, + struct platform_device *pdev, int idx); +void intel_pmt_dev_destroy(struct intel_pmt_entry *entry, + struct intel_pmt_namespace *ns); +#endif -- Gitee From 0d50cb6937086b9a8aa6e24a6ff5ff4a981b23c5 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 28 Oct 2020 18:55:35 -0700 Subject: [PATCH 052/674] platform/x86: Intel PMT Telemetry capability driver mainline inclusion from mainline-v5.11-rc1 commit 68fe8e6e2c4b04e2733d77834f55a4a0e172b770 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596K9 CVE: NA Intel-SIG: commit 68fe8e6e2c4b platform/x86: Intel PMT Telemetry capability driver. Backport for intel PMT (Platform Monitoring Technology) support -------------------------------- PMT Telemetry is a capability of the Intel Platform Monitoring Technology. The Telemetry capability provides access to device telemetry metrics that provide hardware performance data to users from read-only register spaces. With this driver present the intel_pmt directory can be populated with telem devices. These devices will contain the standard intel_pmt sysfs data and a "telem" binary sysfs attribute which can be used to access the telemetry data. Also create a PCI device id list for early telemetry hardware that require workarounds for known issues. Signed-off-by: Alexander Duyck Co-developed-by: David E. Box Signed-off-by: David E. Box Reviewed-by: Andy Shevchenko Reviewed-by: Hans de Goede Signed-off-by: Lee Jones Signed-off-by: yingbao jia Signed-off-by: Jun Tian --- drivers/platform/x86/Kconfig | 11 ++ drivers/platform/x86/Makefile | 1 + drivers/platform/x86/intel_pmt_telemetry.c | 160 +++++++++++++++++++++ 3 files changed, 172 insertions(+) create mode 100644 drivers/platform/x86/intel_pmt_telemetry.c diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig index 9118f76ed3e9..ac93c807eb9a 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -1374,6 +1374,17 @@ config INTEL_PMT_CLASS To compile this driver as a module, choose M here: the module will be called intel_pmt_class. +config INTEL_PMT_TELEMETRY + tristate "Intel Platform Monitoring Technology (PMT) Telemetry driver" + select INTEL_PMT_CLASS + help + The Intel Platform Monitory Technology (PMT) Telemetry driver provides + access to hardware telemetry metrics on devices that support the + feature. + + To compile this driver as a module, choose M here: the module + will be called intel_pmt_telemetry. + config INTEL_PUNIT_IPC tristate "Intel P-Unit IPC Driver" help diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile index f4b1f87f2401..6a7b61f59ea8 100644 --- a/drivers/platform/x86/Makefile +++ b/drivers/platform/x86/Makefile @@ -141,6 +141,7 @@ obj-$(CONFIG_INTEL_MID_POWER_BUTTON) += intel_mid_powerbtn.o obj-$(CONFIG_INTEL_MRFLD_PWRBTN) += intel_mrfld_pwrbtn.o obj-$(CONFIG_INTEL_PMC_CORE) += intel_pmc_core.o intel_pmc_core_pltdrv.o obj-$(CONFIG_INTEL_PMT_CLASS) += intel_pmt_class.o +obj-$(CONFIG_INTEL_PMT_TELEMETRY) += intel_pmt_telemetry.o obj-$(CONFIG_INTEL_PUNIT_IPC) += intel_punit_ipc.o obj-$(CONFIG_INTEL_SCU_IPC) += intel_scu_ipc.o obj-$(CONFIG_INTEL_SCU_PCI) += intel_scu_pcidrv.o diff --git a/drivers/platform/x86/intel_pmt_telemetry.c b/drivers/platform/x86/intel_pmt_telemetry.c new file mode 100644 index 000000000000..f8a87614efa4 --- /dev/null +++ b/drivers/platform/x86/intel_pmt_telemetry.c @@ -0,0 +1,160 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel Platform Monitory Technology Telemetry driver + * + * Copyright (c) 2020, Intel Corporation. + * All Rights Reserved. + * + * Author: "David E. Box" + */ + +#include +#include +#include +#include +#include +#include + +#include "intel_pmt_class.h" + +#define TELEM_DEV_NAME "pmt_telemetry" + +#define TELEM_SIZE_OFFSET 0x0 +#define TELEM_GUID_OFFSET 0x4 +#define TELEM_BASE_OFFSET 0x8 +#define TELEM_ACCESS(v) ((v) & GENMASK(3, 0)) +/* size is in bytes */ +#define TELEM_SIZE(v) (((v) & GENMASK(27, 12)) >> 10) + +/* Used by client hardware to identify a fixed telemetry entry*/ +#define TELEM_CLIENT_FIXED_BLOCK_GUID 0x10000000 + +struct pmt_telem_priv { + int num_entries; + struct intel_pmt_entry entry[]; +}; + +/* + * Early implementations of PMT on client platforms have some + * differences from the server platforms (which use the Out Of Band + * Management Services Module OOBMSM). This list tracks those + * platforms as needed to handle those differences. Newer client + * platforms are expected to be fully compatible with server. + */ +static const struct pci_device_id pmt_telem_early_client_pci_ids[] = { + { PCI_VDEVICE(INTEL, 0x9a0d) }, /* TGL */ + { PCI_VDEVICE(INTEL, 0x467d) }, /* ADL */ + { } +}; + +static bool intel_pmt_is_early_client_hw(struct device *dev) +{ + struct pci_dev *parent = to_pci_dev(dev->parent); + + return !!pci_match_id(pmt_telem_early_client_pci_ids, parent); +} + +static bool pmt_telem_region_overlaps(struct intel_pmt_entry *entry, + struct device *dev) +{ + u32 guid = readl(entry->disc_table + TELEM_GUID_OFFSET); + + if (guid != TELEM_CLIENT_FIXED_BLOCK_GUID) + return false; + + return intel_pmt_is_early_client_hw(dev); +} + +static int pmt_telem_header_decode(struct intel_pmt_entry *entry, + struct intel_pmt_header *header, + struct device *dev) +{ + void __iomem *disc_table = entry->disc_table; + + if (pmt_telem_region_overlaps(entry, dev)) + return 1; + + header->access_type = TELEM_ACCESS(readl(disc_table)); + header->guid = readl(disc_table + TELEM_GUID_OFFSET); + header->base_offset = readl(disc_table + TELEM_BASE_OFFSET); + + /* Size is measured in DWORDS, but accessor returns bytes */ + header->size = TELEM_SIZE(readl(disc_table)); + + return 0; +} + +static DEFINE_XARRAY_ALLOC(telem_array); +static struct intel_pmt_namespace pmt_telem_ns = { + .name = "telem", + .xa = &telem_array, + .pmt_header_decode = pmt_telem_header_decode, +}; + +static int pmt_telem_remove(struct platform_device *pdev) +{ + struct pmt_telem_priv *priv = platform_get_drvdata(pdev); + int i; + + for (i = 0; i < priv->num_entries; i++) + intel_pmt_dev_destroy(&priv->entry[i], &pmt_telem_ns); + + return 0; +} + +static int pmt_telem_probe(struct platform_device *pdev) +{ + struct pmt_telem_priv *priv; + size_t size; + int i, ret; + + size = struct_size(priv, entry, pdev->num_resources); + priv = devm_kzalloc(&pdev->dev, size, GFP_KERNEL); + if (!priv) + return -ENOMEM; + + platform_set_drvdata(pdev, priv); + + for (i = 0; i < pdev->num_resources; i++) { + struct intel_pmt_entry *entry = &priv->entry[i]; + + ret = intel_pmt_dev_create(entry, &pmt_telem_ns, pdev, i); + if (ret < 0) + goto abort_probe; + if (ret) + continue; + + priv->num_entries++; + } + + return 0; +abort_probe: + pmt_telem_remove(pdev); + return ret; +} + +static struct platform_driver pmt_telem_driver = { + .driver = { + .name = TELEM_DEV_NAME, + }, + .remove = pmt_telem_remove, + .probe = pmt_telem_probe, +}; + +static int __init pmt_telem_init(void) +{ + return platform_driver_register(&pmt_telem_driver); +} +module_init(pmt_telem_init); + +static void __exit pmt_telem_exit(void) +{ + platform_driver_unregister(&pmt_telem_driver); + xa_destroy(&telem_array); +} +module_exit(pmt_telem_exit); + +MODULE_AUTHOR("David E. Box "); +MODULE_DESCRIPTION("Intel PMT Telemetry driver"); +MODULE_ALIAS("platform:" TELEM_DEV_NAME); +MODULE_LICENSE("GPL v2"); -- Gitee From 140652ff28bd44545c96b7a23da8f9fd101f8b33 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 28 Oct 2020 18:55:36 -0700 Subject: [PATCH 053/674] platform/x86: Intel PMT Crashlog capability driver mainline inclusion from mainline-v5.11-rc1 commit 5ef9998c96b0c99c49c202054586967e609286d2 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596K9 CVE: NA Intel-SIG: commit 5ef9998c96b0 platform/x86: Intel PMT Crashlog capability driver. Backport for intel PMT (Platform Monitoring Technology) support -------------------------------- Add support for the Intel Platform Monitoring Technology crashlog interface. This interface provides a few sysfs values to allow for controlling the crashlog telemetry interface as well as a character driver to allow for mapping the crashlog memory region so that it can be accessed after a crashlog has been recorded. This driver is meant to only support the server version of the crashlog which is identified as crash_type 1 with a version of zero. Currently no other types are supported. Signed-off-by: Alexander Duyck Signed-off-by: David E. Box Reviewed-by: Hans de Goede Signed-off-by: Lee Jones Signed-off-by: yingbao jia Signed-off-by: Jun Tian --- .../ABI/testing/sysfs-class-intel_pmt | 65 ++++ drivers/platform/x86/Kconfig | 11 + drivers/platform/x86/Makefile | 1 + drivers/platform/x86/intel_pmt_crashlog.c | 328 ++++++++++++++++++ 4 files changed, 405 insertions(+) create mode 100644 drivers/platform/x86/intel_pmt_crashlog.c diff --git a/Documentation/ABI/testing/sysfs-class-intel_pmt b/Documentation/ABI/testing/sysfs-class-intel_pmt index 926b5cf95fd1..ed4c886a21b1 100644 --- a/Documentation/ABI/testing/sysfs-class-intel_pmt +++ b/Documentation/ABI/testing/sysfs-class-intel_pmt @@ -52,3 +52,68 @@ Contact: David Box Description: (RO) The offset of telemetry region in bytes that corresponds to the mapping for the telem file. + +What: /sys/class/intel_pmt/crashlog +Date: October 2020 +KernelVersion: 5.10 +Contact: Alexander Duyck +Description: + The crashlog directory contains files for configuring an + instance of a PMT crashlog device that can perform crash data + recording. Each crashlog device has an associated crashlog + file. This file can be opened and mapped or read to access the + resulting crashlog buffer. The register layout for the buffer + can be determined from an XML file of specified GUID for the + parent device. + +What: /sys/class/intel_pmt/crashlog/crashlog +Date: October 2020 +KernelVersion: 5.10 +Contact: David Box +Description: + (RO) The crashlog buffer for this crashlog device. This file + may be mapped or read to obtain the data. + +What: /sys/class/intel_pmt/crashlog/guid +Date: October 2020 +KernelVersion: 5.10 +Contact: Alexander Duyck +Description: + (RO) The GUID for this crashlog device. The GUID identifies the + version of the XML file for the parent device that should be + used to determine the register layout. + +What: /sys/class/intel_pmt/crashlog/size +Date: October 2020 +KernelVersion: 5.10 +Contact: Alexander Duyck +Description: + (RO) The length of the result buffer in bytes that corresponds + to the size for the crashlog buffer. + +What: /sys/class/intel_pmt/crashlog/offset +Date: October 2020 +KernelVersion: 5.10 +Contact: Alexander Duyck +Description: + (RO) The offset of the buffer in bytes that corresponds + to the mapping for the crashlog device. + +What: /sys/class/intel_pmt/crashlog/enable +Date: October 2020 +KernelVersion: 5.10 +Contact: Alexander Duyck +Description: + (RW) Boolean value controlling if the crashlog functionality + is enabled for the crashlog device. + +What: /sys/class/intel_pmt/crashlog/trigger +Date: October 2020 +KernelVersion: 5.10 +Contact: Alexander Duyck +Description: + (RW) Boolean value controlling the triggering of the crashlog + device node. When read it provides data on if the crashlog has + been triggered. When written to it can be used to either clear + the current trigger by writing false, or to trigger a new + event if the trigger is not currently set. diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig index ac93c807eb9a..da36a682d0cf 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -1385,6 +1385,17 @@ config INTEL_PMT_TELEMETRY To compile this driver as a module, choose M here: the module will be called intel_pmt_telemetry. +config INTEL_PMT_CRASHLOG + tristate "Intel Platform Monitoring Technology (PMT) Crashlog driver" + select INTEL_PMT_CLASS + help + The Intel Platform Monitoring Technology (PMT) crashlog driver provides + access to hardware crashlog capabilities on devices that support the + feature. + + To compile this driver as a module, choose M here: the module + will be called intel_pmt_crashlog. + config INTEL_PUNIT_IPC tristate "Intel P-Unit IPC Driver" help diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile index 6a7b61f59ea8..ca82c1344977 100644 --- a/drivers/platform/x86/Makefile +++ b/drivers/platform/x86/Makefile @@ -142,6 +142,7 @@ obj-$(CONFIG_INTEL_MRFLD_PWRBTN) += intel_mrfld_pwrbtn.o obj-$(CONFIG_INTEL_PMC_CORE) += intel_pmc_core.o intel_pmc_core_pltdrv.o obj-$(CONFIG_INTEL_PMT_CLASS) += intel_pmt_class.o obj-$(CONFIG_INTEL_PMT_TELEMETRY) += intel_pmt_telemetry.o +obj-$(CONFIG_INTEL_PMT_CRASHLOG) += intel_pmt_crashlog.o obj-$(CONFIG_INTEL_PUNIT_IPC) += intel_punit_ipc.o obj-$(CONFIG_INTEL_SCU_IPC) += intel_scu_ipc.o obj-$(CONFIG_INTEL_SCU_PCI) += intel_scu_pcidrv.o diff --git a/drivers/platform/x86/intel_pmt_crashlog.c b/drivers/platform/x86/intel_pmt_crashlog.c new file mode 100644 index 000000000000..97dd749c8290 --- /dev/null +++ b/drivers/platform/x86/intel_pmt_crashlog.c @@ -0,0 +1,328 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel Platform Monitoring Technology Crashlog driver + * + * Copyright (c) 2020, Intel Corporation. + * All Rights Reserved. + * + * Author: "Alexander Duyck" + */ + +#include +#include +#include +#include +#include +#include + +#include "intel_pmt_class.h" + +#define DRV_NAME "pmt_crashlog" + +/* Crashlog discovery header types */ +#define CRASH_TYPE_OOBMSM 1 + +/* Control Flags */ +#define CRASHLOG_FLAG_DISABLE BIT(27) + +/* + * Bits 28 and 29 control the state of bit 31. + * + * Bit 28 will clear bit 31, if set, allowing a new crashlog to be captured. + * Bit 29 will immediately trigger a crashlog to be generated, setting bit 31. + * Bit 30 is read-only and reserved as 0. + * Bit 31 is the read-only status with a 1 indicating log is complete. + */ +#define CRASHLOG_FLAG_TRIGGER_CLEAR BIT(28) +#define CRASHLOG_FLAG_TRIGGER_EXECUTE BIT(29) +#define CRASHLOG_FLAG_TRIGGER_COMPLETE BIT(31) +#define CRASHLOG_FLAG_TRIGGER_MASK GENMASK(31, 28) + +/* Crashlog Discovery Header */ +#define CONTROL_OFFSET 0x0 +#define GUID_OFFSET 0x4 +#define BASE_OFFSET 0x8 +#define SIZE_OFFSET 0xC +#define GET_ACCESS(v) ((v) & GENMASK(3, 0)) +#define GET_TYPE(v) (((v) & GENMASK(7, 4)) >> 4) +#define GET_VERSION(v) (((v) & GENMASK(19, 16)) >> 16) +/* size is in bytes */ +#define GET_SIZE(v) ((v) * sizeof(u32)) + +struct crashlog_entry { + /* entry must be first member of struct */ + struct intel_pmt_entry entry; + struct mutex control_mutex; +}; + +struct pmt_crashlog_priv { + int num_entries; + struct crashlog_entry entry[]; +}; + +/* + * I/O + */ +static bool pmt_crashlog_complete(struct intel_pmt_entry *entry) +{ + u32 control = readl(entry->disc_table + CONTROL_OFFSET); + + /* return current value of the crashlog complete flag */ + return !!(control & CRASHLOG_FLAG_TRIGGER_COMPLETE); +} + +static bool pmt_crashlog_disabled(struct intel_pmt_entry *entry) +{ + u32 control = readl(entry->disc_table + CONTROL_OFFSET); + + /* return current value of the crashlog disabled flag */ + return !!(control & CRASHLOG_FLAG_DISABLE); +} + +static bool pmt_crashlog_supported(struct intel_pmt_entry *entry) +{ + u32 discovery_header = readl(entry->disc_table + CONTROL_OFFSET); + u32 crash_type, version; + + crash_type = GET_TYPE(discovery_header); + version = GET_VERSION(discovery_header); + + /* + * Currently we only recognize OOBMSM version 0 devices. + * We can ignore all other crashlog devices in the system. + */ + return crash_type == CRASH_TYPE_OOBMSM && version == 0; +} + +static void pmt_crashlog_set_disable(struct intel_pmt_entry *entry, + bool disable) +{ + u32 control = readl(entry->disc_table + CONTROL_OFFSET); + + /* clear trigger bits so we are only modifying disable flag */ + control &= ~CRASHLOG_FLAG_TRIGGER_MASK; + + if (disable) + control |= CRASHLOG_FLAG_DISABLE; + else + control &= ~CRASHLOG_FLAG_DISABLE; + + writel(control, entry->disc_table + CONTROL_OFFSET); +} + +static void pmt_crashlog_set_clear(struct intel_pmt_entry *entry) +{ + u32 control = readl(entry->disc_table + CONTROL_OFFSET); + + control &= ~CRASHLOG_FLAG_TRIGGER_MASK; + control |= CRASHLOG_FLAG_TRIGGER_CLEAR; + + writel(control, entry->disc_table + CONTROL_OFFSET); +} + +static void pmt_crashlog_set_execute(struct intel_pmt_entry *entry) +{ + u32 control = readl(entry->disc_table + CONTROL_OFFSET); + + control &= ~CRASHLOG_FLAG_TRIGGER_MASK; + control |= CRASHLOG_FLAG_TRIGGER_EXECUTE; + + writel(control, entry->disc_table + CONTROL_OFFSET); +} + +/* + * sysfs + */ +static ssize_t +enable_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct intel_pmt_entry *entry = dev_get_drvdata(dev); + int enabled = !pmt_crashlog_disabled(entry); + + return sprintf(buf, "%d\n", enabled); +} + +static ssize_t +enable_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct crashlog_entry *entry; + bool enabled; + int result; + + entry = dev_get_drvdata(dev); + + result = kstrtobool(buf, &enabled); + if (result) + return result; + + mutex_lock(&entry->control_mutex); + pmt_crashlog_set_disable(&entry->entry, !enabled); + mutex_unlock(&entry->control_mutex); + + return count; +} +static DEVICE_ATTR_RW(enable); + +static ssize_t +trigger_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct intel_pmt_entry *entry; + int trigger; + + entry = dev_get_drvdata(dev); + trigger = pmt_crashlog_complete(entry); + + return sprintf(buf, "%d\n", trigger); +} + +static ssize_t +trigger_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct crashlog_entry *entry; + bool trigger; + int result; + + entry = dev_get_drvdata(dev); + + result = kstrtobool(buf, &trigger); + if (result) + return result; + + mutex_lock(&entry->control_mutex); + + if (!trigger) { + pmt_crashlog_set_clear(&entry->entry); + } else if (pmt_crashlog_complete(&entry->entry)) { + /* we cannot trigger a new crash if one is still pending */ + result = -EEXIST; + goto err; + } else if (pmt_crashlog_disabled(&entry->entry)) { + /* if device is currently disabled, return busy */ + result = -EBUSY; + goto err; + } else { + pmt_crashlog_set_execute(&entry->entry); + } + + result = count; +err: + mutex_unlock(&entry->control_mutex); + return result; +} +static DEVICE_ATTR_RW(trigger); + +static struct attribute *pmt_crashlog_attrs[] = { + &dev_attr_enable.attr, + &dev_attr_trigger.attr, + NULL +}; + +static struct attribute_group pmt_crashlog_group = { + .attrs = pmt_crashlog_attrs, +}; + +static int pmt_crashlog_header_decode(struct intel_pmt_entry *entry, + struct intel_pmt_header *header, + struct device *dev) +{ + void __iomem *disc_table = entry->disc_table; + struct crashlog_entry *crashlog; + + if (!pmt_crashlog_supported(entry)) + return 1; + + /* initialize control mutex */ + crashlog = container_of(entry, struct crashlog_entry, entry); + mutex_init(&crashlog->control_mutex); + + header->access_type = GET_ACCESS(readl(disc_table)); + header->guid = readl(disc_table + GUID_OFFSET); + header->base_offset = readl(disc_table + BASE_OFFSET); + + /* Size is measured in DWORDS, but accessor returns bytes */ + header->size = GET_SIZE(readl(disc_table + SIZE_OFFSET)); + + return 0; +} + +static DEFINE_XARRAY_ALLOC(crashlog_array); +static struct intel_pmt_namespace pmt_crashlog_ns = { + .name = "crashlog", + .xa = &crashlog_array, + .attr_grp = &pmt_crashlog_group, + .pmt_header_decode = pmt_crashlog_header_decode, +}; + +/* + * initialization + */ +static int pmt_crashlog_remove(struct platform_device *pdev) +{ + struct pmt_crashlog_priv *priv = platform_get_drvdata(pdev); + int i; + + for (i = 0; i < priv->num_entries; i++) + intel_pmt_dev_destroy(&priv->entry[i].entry, &pmt_crashlog_ns); + + return 0; +} + +static int pmt_crashlog_probe(struct platform_device *pdev) +{ + struct pmt_crashlog_priv *priv; + size_t size; + int i, ret; + + size = struct_size(priv, entry, pdev->num_resources); + priv = devm_kzalloc(&pdev->dev, size, GFP_KERNEL); + if (!priv) + return -ENOMEM; + + platform_set_drvdata(pdev, priv); + + for (i = 0; i < pdev->num_resources; i++) { + struct intel_pmt_entry *entry = &priv->entry[i].entry; + + ret = intel_pmt_dev_create(entry, &pmt_crashlog_ns, pdev, i); + if (ret < 0) + goto abort_probe; + if (ret) + continue; + + priv->num_entries++; + } + + return 0; +abort_probe: + pmt_crashlog_remove(pdev); + return ret; +} + +static struct platform_driver pmt_crashlog_driver = { + .driver = { + .name = DRV_NAME, + }, + .remove = pmt_crashlog_remove, + .probe = pmt_crashlog_probe, +}; + +static int __init pmt_crashlog_init(void) +{ + return platform_driver_register(&pmt_crashlog_driver); +} + +static void __exit pmt_crashlog_exit(void) +{ + platform_driver_unregister(&pmt_crashlog_driver); + xa_destroy(&crashlog_array); +} + +module_init(pmt_crashlog_init); +module_exit(pmt_crashlog_exit); + +MODULE_AUTHOR("Alexander Duyck "); +MODULE_DESCRIPTION("Intel PMT Crashlog driver"); +MODULE_ALIAS("platform:" DRV_NAME); +MODULE_LICENSE("GPL v2"); -- Gitee From c0a64500da64cf47e8d6847eff107059b39d702a Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 17 Nov 2020 10:22:51 +0300 Subject: [PATCH 054/674] platform/x86: pmt: Fix a potential Oops on error in probe mainline inclusion from mainline-v5.11-rc1 commit d3d73d25e0d9bc43fd2a6f4b4e58ff182e55b217 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596K9 CVE: NA Intel-SIG: commit d3d73d25e0d9 platform/x86: pmt: Fix a potential Oops on error in probe. Backport for intel PMT (Platform Monitoring Technology) support -------------------------------- The "ns->attr_grp" pointer can be NULL so this error handling code needs to check for that to avoid an Oops. Fixes: e2729113ce66 ("platform/x86: Intel PMT class driver") Signed-off-by: Dan Carpenter Reviewed-by: David E. Box Link: https://lore.kernel.org/r/20201117072251.GC1111239@mwanda Signed-off-by: Hans de Goede Signed-off-by: yingbao jia Signed-off-by: Jun Tian --- drivers/platform/x86/intel_pmt_class.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/platform/x86/intel_pmt_class.c b/drivers/platform/x86/intel_pmt_class.c index aa88dc23bbde..c8939fba4509 100644 --- a/drivers/platform/x86/intel_pmt_class.c +++ b/drivers/platform/x86/intel_pmt_class.c @@ -225,7 +225,8 @@ static int intel_pmt_dev_register(struct intel_pmt_entry *entry, return 0; fail_ioremap: - sysfs_remove_group(entry->kobj, ns->attr_grp); + if (ns->attr_grp) + sysfs_remove_group(entry->kobj, ns->attr_grp); fail_sysfs: device_unregister(dev); fail_dev_create: -- Gitee From ebd22f42906106451221f9ec6f628fe191cf9826 Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Tue, 26 Jan 2021 12:55:06 -0800 Subject: [PATCH 055/674] platform/x86: intel_pmt: Make INTEL_PMT_CLASS non-user-selectable mainline inclusion from mainline-v5.12-rc2 commit 35d8a973fe4d38afee944db636c3d2b1df3741a7 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596K9 CVE: NA Intel-SIG: commit 35d8a973fe4d platform/x86: intel_pmt: Make INTEL_PMT_CLASS non-user-selectable. Backport for intel PMT (Platform Monitoring Technology) support -------------------------------- Fix error in Kconfig that exposed INTEL_PMT_CLASS as a user selectable option. It is already selected by INTEL_PMT_TELEMETRY and INTEL_PMT_CRASHLOG which are user selectable. Fixes: e2729113ce66 ("platform/x86: Intel PMT class driver") Signed-off-by: David E. Box Link: https://lore.kernel.org/r/20210126205508.30907-1-david.e.box@linux.intel.com Signed-off-by: Hans de Goede Signed-off-by: yingbao jia Signed-off-by: Jun Tian --- drivers/platform/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig index da36a682d0cf..15c8dd312af5 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -1363,7 +1363,7 @@ config INTEL_PMC_CORE - MPHY/PLL gating status (Sunrisepoint PCH only) config INTEL_PMT_CLASS - tristate "Intel Platform Monitoring Technology (PMT) Class driver" + tristate help The Intel Platform Monitoring Technology (PMT) class driver provides the basic sysfs interface and file hierarchy uses by PMT devices. -- Gitee From 40bf0eee7dfb6c7eb5adbb2d6a1494ba28c6fdca Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Tue, 26 Jan 2021 12:55:07 -0800 Subject: [PATCH 056/674] platform/x86: intel_pmt_telemetry: Add dependency on MFD_INTEL_PMT mainline inclusion from mainline-v5.12-rc2 commit f3f6da5014dea3cc005b36948abe3664b5d1f7d3 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596K9 CVE: NA Intel-SIG: commit f3f6da5014de platform/x86: intel_pmt_telemetry: Add dependency on MFD_INTEL_PMT. Backport for intel PMT (Platform Monitoring Technology) support -------------------------------- All devices that expose Intel Platform Monitoring Technology (PMT) telemetry are currently owned by the intel_pmt MFD driver. Therefore make the telemetry driver depend on the MFD driver for build. Fixes: 68fe8e6e2c4b ("platform/x86: Intel PMT Telemetry capability driver") Signed-off-by: David E. Box Link: https://lore.kernel.org/r/20210126205508.30907-2-david.e.box@linux.intel.com Signed-off-by: Hans de Goede Signed-off-by: yingbao jia Signed-off-by: Jun Tian --- drivers/platform/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig index 15c8dd312af5..06484c5c5b8c 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -1376,6 +1376,7 @@ config INTEL_PMT_CLASS config INTEL_PMT_TELEMETRY tristate "Intel Platform Monitoring Technology (PMT) Telemetry driver" + depends on MFD_INTEL_PMT select INTEL_PMT_CLASS help The Intel Platform Monitory Technology (PMT) Telemetry driver provides -- Gitee From e44c89b41641116e22bbb153e86b3cece9b0bfdf Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Tue, 26 Jan 2021 12:55:08 -0800 Subject: [PATCH 057/674] platform/x86: intel_pmt_crashlog: Add dependency on MFD_INTEL_PMT mainline inclusion from mainline-v5.12-rc2 commit fdd3feb37e36bec2ad75d76f8ac4d0273c5c0a91 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596K9 CVE: NA Intel-SIG: commit fdd3feb37e36 platform/x86: intel_pmt_crashlog: Add dependency on MFD_INTEL_PMT. Backport for intel PMT (Platform Monitoring Technology) support -------------------------------- All devices that expose Intel Platform Monitoring Technology (PMT) crashlog are currently owned by the intel_pmt MFD driver. Therefore make the crashlog driver depend on the MFD driver for build. Fixes: 5ef9998c96b0 ("platform/x86: Intel PMT Crashlog capability driver") Signed-off-by: David E. Box Link: https://lore.kernel.org/r/20210126205508.30907-3-david.e.box@linux.intel.com Signed-off-by: Hans de Goede Signed-off-by: yingbao jia Signed-off-by: Jun Tian --- drivers/platform/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig index 06484c5c5b8c..a24783aa52ea 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -1388,6 +1388,7 @@ config INTEL_PMT_TELEMETRY config INTEL_PMT_CRASHLOG tristate "Intel Platform Monitoring Technology (PMT) Crashlog driver" + depends on MFD_INTEL_PMT select INTEL_PMT_CLASS help The Intel Platform Monitoring Technology (PMT) crashlog driver provides -- Gitee From fac7d1185d749969a83168368b5a0867b2d0f81c Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Wed, 24 Feb 2021 12:10:04 -0800 Subject: [PATCH 058/674] mfd: intel_pmt: Fix nuisance messages and handling of disabled capabilities mainline inclusion from mainline-v5.13-rc1 commit a1a5c1c3df282dc122508a17500317266ef19e46 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596K9 CVE: NA Intel-SIG: commit a1a5c1c3df28 mfd: intel_pmt: Fix nuisance messages and handling of disabled capabilities. Backport for intel PMT (Platform Monitoring Technology) support -------------------------------- Some products will be available that have PMT capabilities that are not supported. Remove the warnings in this instance to avoid nuisance messages and confusion. Also return an error code for capabilities that are disabled by quirk to prevent them from keeping the driver loaded if only disabled capabilities are found. Fixes: 4f8217d5b0ca ("mfd: Intel Platform Monitoring Technology support") Signed-off-by: David E. Box Reviewed-by: Hans de Goede Signed-off-by: Lee Jones Signed-off-by: yingbao jia Signed-off-by: Jun Tian --- drivers/mfd/intel_pmt.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/drivers/mfd/intel_pmt.c b/drivers/mfd/intel_pmt.c index 744b230cdcca..65da2b17a204 100644 --- a/drivers/mfd/intel_pmt.c +++ b/drivers/mfd/intel_pmt.c @@ -79,19 +79,18 @@ static int pmt_add_dev(struct pci_dev *pdev, struct intel_dvsec_header *header, case DVSEC_INTEL_ID_WATCHER: if (quirks & PMT_QUIRK_NO_WATCHER) { dev_info(dev, "Watcher not supported\n"); - return 0; + return -EINVAL; } name = "pmt_watcher"; break; case DVSEC_INTEL_ID_CRASHLOG: if (quirks & PMT_QUIRK_NO_CRASHLOG) { dev_info(dev, "Crashlog not supported\n"); - return 0; + return -EINVAL; } name = "pmt_crashlog"; break; default: - dev_err(dev, "Unrecognized PMT capability: %d\n", id); return -EINVAL; } @@ -174,12 +173,8 @@ static int pmt_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) header.offset = INTEL_DVSEC_TABLE_OFFSET(table); ret = pmt_add_dev(pdev, &header, quirks); - if (ret) { - dev_warn(&pdev->dev, - "Failed to add device for DVSEC id %d\n", - header.id); + if (ret) continue; - } found_devices = true; } while (true); -- Gitee From ad0c48149868e76d0c3c4cc023ce3ed2efda63d1 Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Wed, 24 Feb 2021 12:10:05 -0800 Subject: [PATCH 059/674] mfd: intel_pmt: Add support for DG1 mainline inclusion from mainline-v5.13-rc1 commit aa47ad3f853ae72c32b7e46dfc8bc2c8dc2dbad7 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596K9 CVE: NA Intel-SIG: commit aa47ad3f853a mfd: intel_pmt: Add support for DG1. Backport for intel PMT (Platform Monitoring Technology) support -------------------------------- Adds PMT Telemetry aggregator support for the DG1 graphics PCIe card. The device does not have the DVSEC region in its PCI config space so hard code the discovery table data in the driver. Also requires a fix for DG1 in the Telemetry driver for how the ACCESS_TYPE field is used. Signed-off-by: David E. Box Reviewed-by: Hans de Goede Signed-off-by: Lee Jones Signed-off-by: yingbao jia Signed-off-by: Jun Tian --- drivers/mfd/intel_pmt.c | 101 +++++++++++++++------ drivers/platform/x86/intel_pmt_class.c | 46 ++++++++++ drivers/platform/x86/intel_pmt_class.h | 1 + drivers/platform/x86/intel_pmt_telemetry.c | 20 ---- 4 files changed, 119 insertions(+), 49 deletions(-) diff --git a/drivers/mfd/intel_pmt.c b/drivers/mfd/intel_pmt.c index 65da2b17a204..dd7eb614c28e 100644 --- a/drivers/mfd/intel_pmt.c +++ b/drivers/mfd/intel_pmt.c @@ -49,10 +49,14 @@ enum pmt_quirks { /* Use shift instead of mask to read discovery table offset */ PMT_QUIRK_TABLE_SHIFT = BIT(2), + + /* DVSEC not present (provided in driver data) */ + PMT_QUIRK_NO_DVSEC = BIT(3), }; struct pmt_platform_info { unsigned long quirks; + struct intel_dvsec_header **capabilities; }; static const struct pmt_platform_info tgl_info = { @@ -60,6 +64,26 @@ static const struct pmt_platform_info tgl_info = { PMT_QUIRK_TABLE_SHIFT, }; +/* DG1 Platform with DVSEC quirk*/ +static struct intel_dvsec_header dg1_telemetry = { + .length = 0x10, + .id = 2, + .num_entries = 1, + .entry_size = 3, + .tbir = 0, + .offset = 0x466000, +}; + +static struct intel_dvsec_header *dg1_capabilities[] = { + &dg1_telemetry, + NULL +}; + +static const struct pmt_platform_info dg1_info = { + .quirks = PMT_QUIRK_NO_DVSEC, + .capabilities = dg1_capabilities, +}; + static int pmt_add_dev(struct pci_dev *pdev, struct intel_dvsec_header *header, unsigned long quirks) { @@ -147,37 +171,54 @@ static int pmt_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (info) quirks = info->quirks; - do { - struct intel_dvsec_header header; - u32 table; - u16 vid; + if (info && (info->quirks & PMT_QUIRK_NO_DVSEC)) { + struct intel_dvsec_header **header; - pos = pci_find_next_ext_capability(pdev, pos, PCI_EXT_CAP_ID_DVSEC); - if (!pos) - break; + header = info->capabilities; + while (*header) { + ret = pmt_add_dev(pdev, *header, quirks); + if (ret) + dev_warn(&pdev->dev, + "Failed to add device for DVSEC id %d\n", + (*header)->id); + else + found_devices = true; - pci_read_config_word(pdev, pos + PCI_DVSEC_HEADER1, &vid); - if (vid != PCI_VENDOR_ID_INTEL) - continue; - - pci_read_config_word(pdev, pos + PCI_DVSEC_HEADER2, - &header.id); - pci_read_config_byte(pdev, pos + INTEL_DVSEC_ENTRIES, - &header.num_entries); - pci_read_config_byte(pdev, pos + INTEL_DVSEC_SIZE, - &header.entry_size); - pci_read_config_dword(pdev, pos + INTEL_DVSEC_TABLE, - &table); - - header.tbir = INTEL_DVSEC_TABLE_BAR(table); - header.offset = INTEL_DVSEC_TABLE_OFFSET(table); - - ret = pmt_add_dev(pdev, &header, quirks); - if (ret) - continue; - - found_devices = true; - } while (true); + ++header; + } + } else { + do { + struct intel_dvsec_header header; + u32 table; + u16 vid; + + pos = pci_find_next_ext_capability(pdev, pos, PCI_EXT_CAP_ID_DVSEC); + if (!pos) + break; + + pci_read_config_word(pdev, pos + PCI_DVSEC_HEADER1, &vid); + if (vid != PCI_VENDOR_ID_INTEL) + continue; + + pci_read_config_word(pdev, pos + PCI_DVSEC_HEADER2, + &header.id); + pci_read_config_byte(pdev, pos + INTEL_DVSEC_ENTRIES, + &header.num_entries); + pci_read_config_byte(pdev, pos + INTEL_DVSEC_SIZE, + &header.entry_size); + pci_read_config_dword(pdev, pos + INTEL_DVSEC_TABLE, + &table); + + header.tbir = INTEL_DVSEC_TABLE_BAR(table); + header.offset = INTEL_DVSEC_TABLE_OFFSET(table); + + ret = pmt_add_dev(pdev, &header, quirks); + if (ret) + continue; + + found_devices = true; + } while (true); + } if (!found_devices) return -ENODEV; @@ -195,10 +236,12 @@ static void pmt_pci_remove(struct pci_dev *pdev) } #define PCI_DEVICE_ID_INTEL_PMT_ADL 0x467d +#define PCI_DEVICE_ID_INTEL_PMT_DG1 0x490e #define PCI_DEVICE_ID_INTEL_PMT_OOBMSM 0x09a7 #define PCI_DEVICE_ID_INTEL_PMT_TGL 0x9a0d static const struct pci_device_id pmt_pci_ids[] = { { PCI_DEVICE_DATA(INTEL, PMT_ADL, &tgl_info) }, + { PCI_DEVICE_DATA(INTEL, PMT_DG1, &dg1_info) }, { PCI_DEVICE_DATA(INTEL, PMT_OOBMSM, NULL) }, { PCI_DEVICE_DATA(INTEL, PMT_TGL, &tgl_info) }, { } diff --git a/drivers/platform/x86/intel_pmt_class.c b/drivers/platform/x86/intel_pmt_class.c index c8939fba4509..228e21f1ce5c 100644 --- a/drivers/platform/x86/intel_pmt_class.c +++ b/drivers/platform/x86/intel_pmt_class.c @@ -19,6 +19,28 @@ #define PMT_XA_MAX INT_MAX #define PMT_XA_LIMIT XA_LIMIT(PMT_XA_START, PMT_XA_MAX) +/* + * Early implementations of PMT on client platforms have some + * differences from the server platforms (which use the Out Of Band + * Management Services Module OOBMSM). This list tracks those + * platforms as needed to handle those differences. Newer client + * platforms are expected to be fully compatible with server. + */ +static const struct pci_device_id pmt_telem_early_client_pci_ids[] = { + { PCI_VDEVICE(INTEL, 0x467d) }, /* ADL */ + { PCI_VDEVICE(INTEL, 0x490e) }, /* DG1 */ + { PCI_VDEVICE(INTEL, 0x9a0d) }, /* TGL */ + { } +}; + +bool intel_pmt_is_early_client_hw(struct device *dev) +{ + struct pci_dev *parent = to_pci_dev(dev->parent); + + return !!pci_match_id(pmt_telem_early_client_pci_ids, parent); +} +EXPORT_SYMBOL_GPL(intel_pmt_is_early_client_hw); + /* * sysfs */ @@ -147,6 +169,30 @@ static int intel_pmt_populate_entry(struct intel_pmt_entry *entry, * base address = end of discovery region + base offset */ entry->base_addr = disc_res->end + 1 + header->base_offset; + + /* + * Some hardware use a different calculation for the base address + * when access_type == ACCESS_LOCAL. On the these systems + * ACCCESS_LOCAL refers to an address in the same BAR as the + * header but at a fixed offset. But as the header address was + * supplied to the driver, we don't know which BAR it was in. + * So search for the bar whose range includes the header address. + */ + if (intel_pmt_is_early_client_hw(dev)) { + int i; + + entry->base_addr = 0; + for (i = 0; i < 6; i++) + if (disc_res->start >= pci_resource_start(pci_dev, i) && + (disc_res->start <= pci_resource_end(pci_dev, i))) { + entry->base_addr = pci_resource_start(pci_dev, i) + + header->base_offset; + break; + } + if (!entry->base_addr) + return -EINVAL; + } + break; case ACCESS_BARID: /* diff --git a/drivers/platform/x86/intel_pmt_class.h b/drivers/platform/x86/intel_pmt_class.h index de8f8139ba31..1337019c2873 100644 --- a/drivers/platform/x86/intel_pmt_class.h +++ b/drivers/platform/x86/intel_pmt_class.h @@ -44,6 +44,7 @@ struct intel_pmt_namespace { struct device *dev); }; +bool intel_pmt_is_early_client_hw(struct device *dev); int intel_pmt_dev_create(struct intel_pmt_entry *entry, struct intel_pmt_namespace *ns, struct platform_device *pdev, int idx); diff --git a/drivers/platform/x86/intel_pmt_telemetry.c b/drivers/platform/x86/intel_pmt_telemetry.c index f8a87614efa4..9b95ef050457 100644 --- a/drivers/platform/x86/intel_pmt_telemetry.c +++ b/drivers/platform/x86/intel_pmt_telemetry.c @@ -34,26 +34,6 @@ struct pmt_telem_priv { struct intel_pmt_entry entry[]; }; -/* - * Early implementations of PMT on client platforms have some - * differences from the server platforms (which use the Out Of Band - * Management Services Module OOBMSM). This list tracks those - * platforms as needed to handle those differences. Newer client - * platforms are expected to be fully compatible with server. - */ -static const struct pci_device_id pmt_telem_early_client_pci_ids[] = { - { PCI_VDEVICE(INTEL, 0x9a0d) }, /* TGL */ - { PCI_VDEVICE(INTEL, 0x467d) }, /* ADL */ - { } -}; - -static bool intel_pmt_is_early_client_hw(struct device *dev) -{ - struct pci_dev *parent = to_pci_dev(dev->parent); - - return !!pci_match_id(pmt_telem_early_client_pci_ids, parent); -} - static bool pmt_telem_region_overlaps(struct intel_pmt_entry *entry, struct device *dev) { -- Gitee From 9a2f0041975b5654471cd8f58c5446b67d14e099 Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Tue, 16 Mar 2021 19:44:54 -0700 Subject: [PATCH 060/674] platform/x86: intel_pmt_class: Initial resource to 0 mainline inclusion from mainline-v5.13-rc1 commit 501bb68a66cfc0bc2a2458483400cb49daca974f category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596K9 CVE: NA Intel-SIG: commit 501bb68a66cf platform/x86: intel_pmt_class: Initial resource to 0. Backport for intel PMT (Platform Monitoring Technology) support -------------------------------- Initialize the struct resource in intel_pmt_dev_register to zero to avoid a fault should the char *name field be non-zero. Signed-off-by: David E. Box Link: https://lore.kernel.org/r/20210317024455.3071477-1-david.e.box@linux.intel.com Signed-off-by: Hans de Goede Signed-off-by: yingbao jia Signed-off-by: Jun Tian --- drivers/platform/x86/intel_pmt_class.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/intel_pmt_class.c b/drivers/platform/x86/intel_pmt_class.c index 228e21f1ce5c..c86ff15b1ed5 100644 --- a/drivers/platform/x86/intel_pmt_class.c +++ b/drivers/platform/x86/intel_pmt_class.c @@ -219,7 +219,7 @@ static int intel_pmt_dev_register(struct intel_pmt_entry *entry, struct intel_pmt_namespace *ns, struct device *parent) { - struct resource res; + struct resource res = {0}; struct device *dev; int ret; -- Gitee From 38603f41c16a1086ed099a84fa05815a9e872a0f Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Tue, 16 Mar 2021 19:44:55 -0700 Subject: [PATCH 061/674] platform/x86: intel_pmt_crashlog: Fix incorrect macros mainline inclusion from mainline-v5.12-rc5 commit 10c931cdfe64ebc38a15a485dd794915044f2111 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596K9 CVE: NA Intel-SIG: commit 10c931cdfe64 platform/x86: intel_pmt_crashlog: Fix incorrect macros. Backport for intel PMT (Platform Monitoring Technology) support -------------------------------- Fixes off-by-one bugs in the macro assignments for the crashlog control bits. Was initially tested on emulation but bug revealed after testing on silicon. Fixes: 5ef9998c96b0 ("platform/x86: Intel PMT Crashlog capability driver") Signed-off-by: David E. Box Link: https://lore.kernel.org/r/20210317024455.3071477-2-david.e.box@linux.intel.com Signed-off-by: Hans de Goede Signed-off-by: yingbao jia Signed-off-by: Jun Tian --- drivers/platform/x86/intel_pmt_crashlog.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/platform/x86/intel_pmt_crashlog.c b/drivers/platform/x86/intel_pmt_crashlog.c index 97dd749c8290..92d315a16cfd 100644 --- a/drivers/platform/x86/intel_pmt_crashlog.c +++ b/drivers/platform/x86/intel_pmt_crashlog.c @@ -23,18 +23,17 @@ #define CRASH_TYPE_OOBMSM 1 /* Control Flags */ -#define CRASHLOG_FLAG_DISABLE BIT(27) +#define CRASHLOG_FLAG_DISABLE BIT(28) /* - * Bits 28 and 29 control the state of bit 31. + * Bits 29 and 30 control the state of bit 31. * - * Bit 28 will clear bit 31, if set, allowing a new crashlog to be captured. - * Bit 29 will immediately trigger a crashlog to be generated, setting bit 31. - * Bit 30 is read-only and reserved as 0. + * Bit 29 will clear bit 31, if set, allowing a new crashlog to be captured. + * Bit 30 will immediately trigger a crashlog to be generated, setting bit 31. * Bit 31 is the read-only status with a 1 indicating log is complete. */ -#define CRASHLOG_FLAG_TRIGGER_CLEAR BIT(28) -#define CRASHLOG_FLAG_TRIGGER_EXECUTE BIT(29) +#define CRASHLOG_FLAG_TRIGGER_CLEAR BIT(29) +#define CRASHLOG_FLAG_TRIGGER_EXECUTE BIT(30) #define CRASHLOG_FLAG_TRIGGER_COMPLETE BIT(31) #define CRASHLOG_FLAG_TRIGGER_MASK GENMASK(31, 28) -- Gitee From e69b71e05fbbf874203483132d39e7c72dfba6e5 Mon Sep 17 00:00:00 2001 From: Rikard Falkeborn Date: Sat, 5 Jun 2021 22:38:05 +0200 Subject: [PATCH 062/674] platform/x86: intel_pmt_crashlog: Constify static attribute_group struct mainline inclusion from mainline-v5.14-rc1 commit d24023e375704860c6c8b91c3af3034669aa1bc5 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596K9 CVE: NA Intel-SIG: commit d24023e37570 platform/x86: intel_pmt_crashlog: Constify static attribute_group struct. Backport for intel PMT (Platform Monitoring Technology) support -------------------------------- The only use of pmt_crashlog_group is to assign its address to the attr_grp field in the intel_pmt_namespace struct, which is a pointer to const attribute_group. Make it const to allow the compiler to put it in read-only memory. Signed-off-by: Rikard Falkeborn Link: https://lore.kernel.org/r/20210605203807.60547-3-rikard.falkeborn@gmail.com Signed-off-by: Hans de Goede Signed-off-by: yingbao jia Signed-off-by: Jun Tian --- drivers/platform/x86/intel_pmt_crashlog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/intel_pmt_crashlog.c b/drivers/platform/x86/intel_pmt_crashlog.c index 92d315a16cfd..56963ceb6345 100644 --- a/drivers/platform/x86/intel_pmt_crashlog.c +++ b/drivers/platform/x86/intel_pmt_crashlog.c @@ -218,7 +218,7 @@ static struct attribute *pmt_crashlog_attrs[] = { NULL }; -static struct attribute_group pmt_crashlog_group = { +static const struct attribute_group pmt_crashlog_group = { .attrs = pmt_crashlog_attrs, }; -- Gitee From 21db19cf4be15474deadabe6160688a0371e9f7b Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Tue, 17 Aug 2021 15:40:17 -0700 Subject: [PATCH 063/674] platform/x86: intel_pmt_telemetry: Ignore zero sized entries mainline inclusion from mainline-v5.15-rc1 commit ef195e8a7f43924b9979b2cd81ac7fa54f24bb3c category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596K9 CVE: NA Intel-SIG: commit ef195e8a7f43 platform/x86: intel_pmt_telemetry: Ignore zero sized entries. Backport for intel PMT (Platform Monitoring Technology) support -------------------------------- Some devices may expose non-functioning entries that are reserved for future use. These entries have zero size. Ignore them during probe. Signed-off-by: David E. Box Link: https://lore.kernel.org/r/20210817224018.1013192-5-david.e.box@linux.intel.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede Signed-off-by: yingbao jia Signed-off-by: Jun Tian --- drivers/platform/x86/intel_pmt_telemetry.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/platform/x86/intel_pmt_telemetry.c b/drivers/platform/x86/intel_pmt_telemetry.c index 9b95ef050457..9f845e70a1f8 100644 --- a/drivers/platform/x86/intel_pmt_telemetry.c +++ b/drivers/platform/x86/intel_pmt_telemetry.c @@ -61,6 +61,14 @@ static int pmt_telem_header_decode(struct intel_pmt_entry *entry, /* Size is measured in DWORDS, but accessor returns bytes */ header->size = TELEM_SIZE(readl(disc_table)); + /* + * Some devices may expose non-functioning entries that are + * reserved for future use. They have zero size. Do not fail + * probe for these. Just ignore them. + */ + if (header->size == 0) + return 1; + return 0; } -- Gitee From b778ba5c868584a960628901482978560de43fd4 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 7 Jan 2021 13:23:34 +0100 Subject: [PATCH 064/674] x86/mce: Get rid of mcheck_intel_therm_init() mainline inclusion from mainline-5.12 commit 4f432e8bb15b352da72525144da025a46695968f category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5DSOL Intel_SIG: commit 4f432e8bb15b x86/mce: Get rid of mcheck_intel_therm_init(). Backport for Intel HFI (Hardware Feedback Interface) support ------------------------------------- Move the APIC_LVTTHMR read which needs to happen on the BSP, to intel_init_thermal(). One less boot dependency. No functional changes. Signed-off-by: Borislav Petkov Tested-by: Srinivas Pandruvada Link: https://lkml.kernel.org/r/20210201142704.12495-2-bp@alien8.de Signed-off-by: yingbao jia Signed-off-by: Jun Tian --- arch/x86/include/asm/mce.h | 6 ------ arch/x86/kernel/cpu/mce/core.c | 1 - arch/x86/kernel/cpu/mce/therm_throt.c | 15 ++++----------- 3 files changed, 4 insertions(+), 18 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 9b5ff423e939..4feed45f0438 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -299,12 +299,6 @@ extern int (*platform_thermal_package_notify)(__u64 msr_val); * callback has rate control */ extern bool (*platform_thermal_package_rate_control)(void); -#ifdef CONFIG_X86_THERMAL_VECTOR -extern void mcheck_intel_therm_init(void); -#else -static inline void mcheck_intel_therm_init(void) { } -#endif - /* * Used by APEI to report memory error via /dev/mcelog */ diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 0ced8b250090..ed49a4abd20e 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -2231,7 +2231,6 @@ __setup("mce", mcheck_enable); int __init mcheck_init(void) { - mcheck_intel_therm_init(); mce_register_decode_chain(&early_nb); mce_register_decode_chain(&mce_uc_nb); mce_register_decode_chain(&mce_default_nb); diff --git a/arch/x86/kernel/cpu/mce/therm_throt.c b/arch/x86/kernel/cpu/mce/therm_throt.c index a7cd2d203ced..5b15d7cef1d1 100644 --- a/arch/x86/kernel/cpu/mce/therm_throt.c +++ b/arch/x86/kernel/cpu/mce/therm_throt.c @@ -633,17 +633,6 @@ static int intel_thermal_supported(struct cpuinfo_x86 *c) return 1; } -void __init mcheck_intel_therm_init(void) -{ - /* - * This function is only called on boot CPU. Save the init thermal - * LVT value on BSP and use that value to restore APs' thermal LVT - * entry BIOS programmed later - */ - if (intel_thermal_supported(&boot_cpu_data)) - lvtthmr_init = apic_read(APIC_LVTTHMR); -} - void intel_init_thermal(struct cpuinfo_x86 *c) { unsigned int cpu = smp_processor_id(); @@ -653,6 +642,10 @@ void intel_init_thermal(struct cpuinfo_x86 *c) if (!intel_thermal_supported(c)) return; + /* On the BSP? */ + if (c == &boot_cpu_data) + lvtthmr_init = apic_read(APIC_LVTTHMR); + /* * First check if its enabled already, in which case there might * be some SMM goo which handles it, so we can't even put a handler -- Gitee From 3042f430fab61ead14942ce4a39be12834f08a78 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 7 Jan 2021 13:29:05 +0100 Subject: [PATCH 065/674] thermal: Move therm_throt there from x86/mce mainline inclusion from mainline-5.12 commit 9223d0dccb8f8523754122f68316dd1a4f39f7f8 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5DSOL Intel_SIG: commit 9223d0dccb8f thermal: Move therm_throt there from x86/mce. Backport for Intel HFI (Hardware Feedback Interface) support ------------------------------------- This functionality has nothing to do with MCE, move it to the thermal framework and untangle it from MCE. Requested-by: Peter Zijlstra Signed-off-by: Borislav Petkov Reviewed-by: Srinivas Pandruvada Tested-by: Srinivas Pandruvada Link: https://lkml.kernel.org/r/20210202121003.GD18075@zn.tnic Signed-off-by: yingbao jia Signed-off-by: Jun Tian --- arch/x86/Kconfig | 4 --- arch/x86/include/asm/mce.h | 16 ---------- arch/x86/include/asm/thermal.h | 13 ++++++++ arch/x86/kernel/cpu/intel.c | 3 ++ arch/x86/kernel/cpu/mce/Makefile | 2 -- arch/x86/kernel/cpu/mce/intel.c | 1 - arch/x86/kernel/irq.c | 21 ++++++++++++ drivers/thermal/intel/Kconfig | 4 +++ drivers/thermal/intel/Makefile | 1 + .../thermal/intel}/therm_throt.c | 32 ++++++------------- drivers/thermal/intel/thermal_interrupt.h | 15 +++++++++ drivers/thermal/intel/x86_pkg_temp_thermal.c | 4 +-- 12 files changed, 68 insertions(+), 48 deletions(-) create mode 100644 arch/x86/include/asm/thermal.h rename {arch/x86/kernel/cpu/mce => drivers/thermal/intel}/therm_throt.c (97%) create mode 100644 drivers/thermal/intel/thermal_interrupt.h diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 040fb77366dd..16683d531a00 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1164,10 +1164,6 @@ config X86_MCE_INJECT If you don't know what a machine check is and you don't do kernel QA it is safe to say n. -config X86_THERMAL_VECTOR - def_bool y - depends on X86_MCE_INTEL - source "arch/x86/events/Kconfig" config X86_LEGACY_VM86 diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 4feed45f0438..28c112695e92 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -283,22 +283,6 @@ extern void (*mce_threshold_vector)(void); /* Deferred error interrupt handler */ extern void (*deferred_error_int_vector)(void); -/* - * Thermal handler - */ - -void intel_init_thermal(struct cpuinfo_x86 *c); - -/* Interrupt Handler for core thermal thresholds */ -extern int (*platform_thermal_notify)(__u64 msr_val); - -/* Interrupt Handler for package thermal thresholds */ -extern int (*platform_thermal_package_notify)(__u64 msr_val); - -/* Callback support of rate control, return true, if - * callback has rate control */ -extern bool (*platform_thermal_package_rate_control)(void); - /* * Used by APEI to report memory error via /dev/mcelog */ diff --git a/arch/x86/include/asm/thermal.h b/arch/x86/include/asm/thermal.h new file mode 100644 index 000000000000..ddbdefd5b94f --- /dev/null +++ b/arch/x86/include/asm/thermal.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_THERMAL_H +#define _ASM_X86_THERMAL_H + +#ifdef CONFIG_X86_THERMAL_VECTOR +void intel_init_thermal(struct cpuinfo_x86 *c); +bool x86_thermal_enabled(void); +void intel_thermal_interrupt(void); +#else +static inline void intel_init_thermal(struct cpuinfo_x86 *c) { } +#endif + +#endif /* _ASM_X86_THERMAL_H */ diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 816fdbec795a..0e422a544835 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -24,6 +24,7 @@ #include #include #include +#include #ifdef CONFIG_X86_64 #include @@ -719,6 +720,8 @@ static void init_intel(struct cpuinfo_x86 *c) tsx_disable(); split_lock_init(); + + intel_init_thermal(c); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/mce/Makefile b/arch/x86/kernel/cpu/mce/Makefile index 9f020c994154..015856abdbb1 100644 --- a/arch/x86/kernel/cpu/mce/Makefile +++ b/arch/x86/kernel/cpu/mce/Makefile @@ -9,8 +9,6 @@ obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o mce-inject-y := inject.o obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o -obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o - obj-$(CONFIG_ACPI_APEI) += apei.o obj-$(CONFIG_X86_MCELOG_LEGACY) += dev-mcelog.o diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index 24d45a1e214c..1adf67e0fcba 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -514,7 +514,6 @@ static void intel_ppin_init(struct cpuinfo_x86 *c) void mce_intel_feature_init(struct cpuinfo_x86 *c) { - intel_init_thermal(c); intel_init_cmci(); intel_init_lmce(); intel_ppin_init(c); diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index ce904c89c6c7..cb23373bffa8 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -21,6 +21,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -376,3 +377,23 @@ void fixup_irqs(void) } } #endif + +#ifdef CONFIG_X86_THERMAL_VECTOR +static void smp_thermal_vector(void) +{ + if (x86_thermal_enabled()) + intel_thermal_interrupt(); + else + pr_err("CPU%d: Unexpected LVT thermal interrupt!\n", + smp_processor_id()); +} + +DEFINE_IDTENTRY_SYSVEC(sysvec_thermal) +{ + trace_thermal_apic_entry(THERMAL_APIC_VECTOR); + inc_irq_stat(irq_thermal_count); + smp_thermal_vector(); + trace_thermal_apic_exit(THERMAL_APIC_VECTOR); + ack_APIC_irq(); +} +#endif diff --git a/drivers/thermal/intel/Kconfig b/drivers/thermal/intel/Kconfig index 8025b21f43fa..ce4f59213c7a 100644 --- a/drivers/thermal/intel/Kconfig +++ b/drivers/thermal/intel/Kconfig @@ -8,6 +8,10 @@ config INTEL_POWERCLAMP enforce idle time which results in more package C-state residency. The user interface is exposed via generic thermal framework. +config X86_THERMAL_VECTOR + def_bool y + depends on X86 && CPU_SUP_INTEL && X86_LOCAL_APIC + config X86_PKG_TEMP_THERMAL tristate "X86 package temperature thermal driver" depends on X86_THERMAL_VECTOR diff --git a/drivers/thermal/intel/Makefile b/drivers/thermal/intel/Makefile index 0d9736ced5d4..ff2ad30ef397 100644 --- a/drivers/thermal/intel/Makefile +++ b/drivers/thermal/intel/Makefile @@ -10,3 +10,4 @@ obj-$(CONFIG_INTEL_QUARK_DTS_THERMAL) += intel_quark_dts_thermal.o obj-$(CONFIG_INT340X_THERMAL) += int340x_thermal/ obj-$(CONFIG_INTEL_BXT_PMIC_THERMAL) += intel_bxt_pmic_thermal.o obj-$(CONFIG_INTEL_PCH_THERMAL) += intel_pch_thermal.o +obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o diff --git a/arch/x86/kernel/cpu/mce/therm_throt.c b/drivers/thermal/intel/therm_throt.c similarity index 97% rename from arch/x86/kernel/cpu/mce/therm_throt.c rename to drivers/thermal/intel/therm_throt.c index 5b15d7cef1d1..f8e882592ba5 100644 --- a/arch/x86/kernel/cpu/mce/therm_throt.c +++ b/drivers/thermal/intel/therm_throt.c @@ -26,13 +26,13 @@ #include #include +#include #include #include -#include +#include #include -#include -#include "internal.h" +#include "thermal_interrupt.h" /* How long to wait between reporting thermal events */ #define CHECK_INTERVAL (300 * HZ) @@ -570,7 +570,7 @@ static void notify_thresholds(__u64 msr_val) } /* Thermal transition interrupt handler */ -static void intel_thermal_interrupt(void) +void intel_thermal_interrupt(void) { __u64 msr_val; @@ -606,23 +606,6 @@ static void intel_thermal_interrupt(void) } } -static void unexpected_thermal_interrupt(void) -{ - pr_err("CPU%d: Unexpected LVT thermal interrupt!\n", - smp_processor_id()); -} - -static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt; - -DEFINE_IDTENTRY_SYSVEC(sysvec_thermal) -{ - trace_thermal_apic_entry(THERMAL_APIC_VECTOR); - inc_irq_stat(irq_thermal_count); - smp_thermal_vector(); - trace_thermal_apic_exit(THERMAL_APIC_VECTOR); - ack_APIC_irq(); -} - /* Thermal monitoring depends on APIC, ACPI and clock modulation */ static int intel_thermal_supported(struct cpuinfo_x86 *c) { @@ -633,6 +616,11 @@ static int intel_thermal_supported(struct cpuinfo_x86 *c) return 1; } +bool x86_thermal_enabled(void) +{ + return atomic_read(&therm_throt_en); +} + void intel_init_thermal(struct cpuinfo_x86 *c) { unsigned int cpu = smp_processor_id(); @@ -719,8 +707,6 @@ void intel_init_thermal(struct cpuinfo_x86 *c) | PACKAGE_THERM_INT_HIGH_ENABLE), h); } - smp_thermal_vector = intel_thermal_interrupt; - rdmsr(MSR_IA32_MISC_ENABLE, l, h); wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); diff --git a/drivers/thermal/intel/thermal_interrupt.h b/drivers/thermal/intel/thermal_interrupt.h new file mode 100644 index 000000000000..53f427bb58dc --- /dev/null +++ b/drivers/thermal/intel/thermal_interrupt.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _INTEL_THERMAL_INTERRUPT_H +#define _INTEL_THERMAL_INTERRUPT_H + +/* Interrupt Handler for package thermal thresholds */ +extern int (*platform_thermal_package_notify)(__u64 msr_val); + +/* Interrupt Handler for core thermal thresholds */ +extern int (*platform_thermal_notify)(__u64 msr_val); + +/* Callback support of rate control, return true, if + * callback has rate control */ +extern bool (*platform_thermal_package_rate_control)(void); + +#endif /* _INTEL_THERMAL_INTERRUPT_H */ diff --git a/drivers/thermal/intel/x86_pkg_temp_thermal.c b/drivers/thermal/intel/x86_pkg_temp_thermal.c index 4f5d97329ee3..e734929b2cd0 100644 --- a/drivers/thermal/intel/x86_pkg_temp_thermal.c +++ b/drivers/thermal/intel/x86_pkg_temp_thermal.c @@ -17,9 +17,9 @@ #include #include #include -#include -#include +#include +#include "thermal_interrupt.h" /* * Rate control delay: Idea is to introduce denounce effect * This should be long enough to avoid reduce events, when -- Gitee From 967320d3fae4cafac795c7b457eff40953eb6d94 Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Thu, 27 Jan 2022 11:34:49 -0800 Subject: [PATCH 066/674] x86/cpu: Add definitions for the Intel Hardware Feedback Interface mainline inclusion from mainline-5.18 commit 7b8f40b3de75c971a4e5f9308b06deb59118dbac category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5DSOL Intel_SIG: commit 7b8f40b3de75 x86/cpu: Add definitions for the Intel Hardware Feedback Interface. Backport for Intel HFI (Hardware Feedback Interface) support ------------------------------------- Add the CPUID feature bit and the model-specific registers needed to identify and configure the Intel Hardware Feedback Interface. Acked-by: Borislav Petkov Signed-off-by: Ricardo Neri Signed-off-by: Rafael J. Wysocki Signed-off-by: yingbao jia Signed-off-by: Jun Tian --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/msr-index.h | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 4da419226377..9e61ccbac3c0 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -321,6 +321,7 @@ #define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */ #define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. Preference */ #define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */ +#define X86_FEATURE_HFI (14*32+19) /* Hardware Feedback Interface */ /* AMD SVM Feature Identification, CPUID level 0x8000000a (EDX), word 15 */ #define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 2b0af5eb5131..2f0ca77d24bc 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -706,12 +706,14 @@ #define PACKAGE_THERM_STATUS_PROCHOT (1 << 0) #define PACKAGE_THERM_STATUS_POWER_LIMIT (1 << 10) +#define PACKAGE_THERM_STATUS_HFI_UPDATED (1 << 26) #define MSR_IA32_PACKAGE_THERM_INTERRUPT 0x000001b2 #define PACKAGE_THERM_INT_HIGH_ENABLE (1 << 0) #define PACKAGE_THERM_INT_LOW_ENABLE (1 << 1) #define PACKAGE_THERM_INT_PLN_ENABLE (1 << 24) +#define PACKAGE_THERM_INT_HFI_ENABLE (1 << 25) /* Thermal Thresholds Support */ #define THERM_INT_THRESHOLD0_ENABLE (1 << 15) @@ -956,4 +958,8 @@ #define MSR_VM_IGNNE 0xc0010115 #define MSR_VM_HSAVE_PA 0xc0010117 +/* Hardware Feedback Interface */ +#define MSR_IA32_HW_FEEDBACK_PTR 0x17d0 +#define MSR_IA32_HW_FEEDBACK_CONFIG 0x17d1 + #endif /* _ASM_X86_MSR_INDEX_H */ -- Gitee From cd073e4711bf1e7a723c5d951c65395b44bde852 Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Thu, 27 Jan 2022 11:34:50 -0800 Subject: [PATCH 067/674] thermal: intel: hfi: Minimally initialize the Hardware Feedback Interface mainline inclusion from mainline-5.18 commit 1cb19cabeb0e187b6c244d0da73d27f7432c40dc category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5DSOL Intel_SIG: commit 1cb19cabeb0e thermal: intel: hfi: Minimally initialize the Hardware Feedback Interface. Backport for Intel HFI (Hardware Feedback Interface) support ------------------------------------- The Intel Hardware Feedback Interface provides guidance to the operating system about the performance and energy efficiency capabilities of each CPU in the system. Capabilities are numbers between 0 and 255 where a higher number represents a higher capability. For each CPU, energy efficiency and performance are reported as separate capabilities. Hardware computes these capabilities based on the operating conditions of the system such as power and thermal limits. These capabilities are shared with the operating system in a table resident in memory. Each package in the system has its own HFI instance. Every logical CPU in the package is represented in the table. More than one logical CPUs may be represented in a single table entry. When the hardware updates the table, it generates a package-level thermal interrupt. The size and format of the HFI table depend on the supported features and can only be determined at runtime. To minimally initialize the HFI, parse its features and allocate one instance per package of a data structure with the necessary parameters to read and navigate a local copy (i.e., owned by the driver) of individual HFI tables. A subsequent changeset will provide per-CPU initialization and interrupt handling. Reviewed-by: Len Brown Co-developed by: Aubrey Li Signed-off-by: Aubrey Li Signed-off-by: Ricardo Neri Signed-off-by: Rafael J. Wysocki Signed-off-by: yingbao jia Signed-off-by: Jun Tian --- drivers/thermal/intel/Kconfig | 12 ++ drivers/thermal/intel/Makefile | 1 + drivers/thermal/intel/intel_hfi.c | 180 ++++++++++++++++++++++++++++ drivers/thermal/intel/intel_hfi.h | 11 ++ drivers/thermal/intel/therm_throt.c | 3 + 5 files changed, 207 insertions(+) create mode 100644 drivers/thermal/intel/intel_hfi.c create mode 100644 drivers/thermal/intel/intel_hfi.h diff --git a/drivers/thermal/intel/Kconfig b/drivers/thermal/intel/Kconfig index ce4f59213c7a..26aea864aaca 100644 --- a/drivers/thermal/intel/Kconfig +++ b/drivers/thermal/intel/Kconfig @@ -79,3 +79,15 @@ config INTEL_PCH_THERMAL Enable this to support thermal reporting on certain intel PCHs. Thermal reporting device will provide temperature reading, programmable trip points and other information. + +config INTEL_HFI_THERMAL + bool "Intel Hardware Feedback Interface" + depends on CPU_SUP_INTEL + depends on X86_THERMAL_VECTOR + help + Select this option to enable the Hardware Feedback Interface. If + selected, hardware provides guidance to the operating system on + the performance and energy efficiency capabilities of each CPU. + These capabilities may change as a result of changes in the operating + conditions of the system such power and thermal limits. If selected, + the kernel relays updates in CPUs' capabilities to userspace. diff --git a/drivers/thermal/intel/Makefile b/drivers/thermal/intel/Makefile index ff2ad30ef397..ece3888f30af 100644 --- a/drivers/thermal/intel/Makefile +++ b/drivers/thermal/intel/Makefile @@ -11,3 +11,4 @@ obj-$(CONFIG_INT340X_THERMAL) += int340x_thermal/ obj-$(CONFIG_INTEL_BXT_PMIC_THERMAL) += intel_bxt_pmic_thermal.o obj-$(CONFIG_INTEL_PCH_THERMAL) += intel_pch_thermal.o obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o +obj-$(CONFIG_INTEL_HFI_THERMAL) += intel_hfi.o diff --git a/drivers/thermal/intel/intel_hfi.c b/drivers/thermal/intel/intel_hfi.c new file mode 100644 index 000000000000..9ea8256cf9ae --- /dev/null +++ b/drivers/thermal/intel/intel_hfi.c @@ -0,0 +1,180 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Hardware Feedback Interface Driver + * + * Copyright (c) 2021, Intel Corporation. + * + * Authors: Aubrey Li + * Ricardo Neri + * + * + * The Hardware Feedback Interface provides a performance and energy efficiency + * capability information for each CPU in the system. Depending on the processor + * model, hardware may periodically update these capabilities as a result of + * changes in the operating conditions (e.g., power limits or thermal + * constraints). On other processor models, there is a single HFI update + * at boot. + * + * This file provides functionality to process HFI updates and relay these + * updates to userspace. + */ + +#define pr_fmt(fmt) "intel-hfi: " fmt + +#include +#include +#include +#include +#include +#include + +#include "intel_hfi.h" + +/* CPUID detection and enumeration definitions for HFI */ + +#define CPUID_HFI_LEAF 6 + +union hfi_capabilities { + struct { + u8 performance:1; + u8 energy_efficiency:1; + u8 __reserved:6; + } split; + u8 bits; +}; + +union cpuid6_edx { + struct { + union hfi_capabilities capabilities; + u32 table_pages:4; + u32 __reserved:4; + s32 index:16; + } split; + u32 full; +}; + +/** + * struct hfi_cpu_data - HFI capabilities per CPU + * @perf_cap: Performance capability + * @ee_cap: Energy efficiency capability + * + * Capabilities of a logical processor in the HFI table. These capabilities are + * unitless. + */ +struct hfi_cpu_data { + u8 perf_cap; + u8 ee_cap; +} __packed; + +/** + * struct hfi_hdr - Header of the HFI table + * @perf_updated: Hardware updated performance capabilities + * @ee_updated: Hardware updated energy efficiency capabilities + * + * Properties of the data in an HFI table. + */ +struct hfi_hdr { + u8 perf_updated; + u8 ee_updated; +} __packed; + +/** + * struct hfi_instance - Representation of an HFI instance (i.e., a table) + * @local_table: Base of the local copy of the HFI table + * @timestamp: Timestamp of the last update of the local table. + * Located at the base of the local table. + * @hdr: Base address of the header of the local table + * @data: Base address of the data of the local table + * + * A set of parameters to parse and navigate a specific HFI table. + */ +struct hfi_instance { + union { + void *local_table; + u64 *timestamp; + }; + void *hdr; + void *data; +}; + +/** + * struct hfi_features - Supported HFI features + * @nr_table_pages: Size of the HFI table in 4KB pages + * @cpu_stride: Stride size to locate the capability data of a logical + * processor within the table (i.e., row stride) + * @hdr_size: Size of the table header + * + * Parameters and supported features that are common to all HFI instances + */ +struct hfi_features { + unsigned int nr_table_pages; + unsigned int cpu_stride; + unsigned int hdr_size; +}; + +static int max_hfi_instances; +static struct hfi_instance *hfi_instances; + +static struct hfi_features hfi_features; + +static __init int hfi_parse_features(void) +{ + unsigned int nr_capabilities; + union cpuid6_edx edx; + + if (!boot_cpu_has(X86_FEATURE_HFI)) + return -ENODEV; + + /* + * If we are here we know that CPUID_HFI_LEAF exists. Parse the + * supported capabilities and the size of the HFI table. + */ + edx.full = cpuid_edx(CPUID_HFI_LEAF); + + if (!edx.split.capabilities.split.performance) { + pr_debug("Performance reporting not supported! Not using HFI\n"); + return -ENODEV; + } + + /* + * The number of supported capabilities determines the number of + * columns in the HFI table. Exclude the reserved bits. + */ + edx.split.capabilities.split.__reserved = 0; + nr_capabilities = hweight8(edx.split.capabilities.bits); + + /* The number of 4KB pages required by the table */ + hfi_features.nr_table_pages = edx.split.table_pages + 1; + + /* + * The header contains change indications for each supported feature. + * The size of the table header is rounded up to be a multiple of 8 + * bytes. + */ + hfi_features.hdr_size = DIV_ROUND_UP(nr_capabilities, 8) * 8; + + /* + * Data of each logical processor is also rounded up to be a multiple + * of 8 bytes. + */ + hfi_features.cpu_stride = DIV_ROUND_UP(nr_capabilities, 8) * 8; + + return 0; +} + +void __init intel_hfi_init(void) +{ + if (hfi_parse_features()) + return; + + /* There is one HFI instance per die/package. */ + max_hfi_instances = topology_max_packages() * + topology_max_die_per_package(); + + /* + * This allocation may fail. CPU hotplug callbacks must check + * for a null pointer. + */ + hfi_instances = kcalloc(max_hfi_instances, sizeof(*hfi_instances), + GFP_KERNEL); +} diff --git a/drivers/thermal/intel/intel_hfi.h b/drivers/thermal/intel/intel_hfi.h new file mode 100644 index 000000000000..05f748b48a4e --- /dev/null +++ b/drivers/thermal/intel/intel_hfi.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _INTEL_HFI_H +#define _INTEL_HFI_H + +#if defined(CONFIG_INTEL_HFI_THERMAL) +void __init intel_hfi_init(void); +#else +static inline void intel_hfi_init(void) { } +#endif /* CONFIG_INTEL_HFI_THERMAL */ + +#endif /* _INTEL_HFI_H */ diff --git a/drivers/thermal/intel/therm_throt.c b/drivers/thermal/intel/therm_throt.c index f8e882592ba5..9d26e35106a7 100644 --- a/drivers/thermal/intel/therm_throt.c +++ b/drivers/thermal/intel/therm_throt.c @@ -32,6 +32,7 @@ #include #include +#include "intel_hfi.h" #include "thermal_interrupt.h" /* How long to wait between reporting thermal events */ @@ -509,6 +510,8 @@ static __init int thermal_throttle_init_device(void) if (!atomic_read(&therm_throt_en)) return 0; + intel_hfi_init(); + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/therm:online", thermal_throttle_online, thermal_throttle_offline); -- Gitee From 55affa7a8345f4b8d74407b00c0223bf7183f62a Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Thu, 27 Jan 2022 11:34:51 -0800 Subject: [PATCH 068/674] thermal: intel: hfi: Handle CPU hotplug events mainline inclusion from mainline-5.18 commit 2d74e6319abe278981e79166b6c2d0c3ed39b1ae category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5DSOL Intel_SIG: commit 2d74e6319abe thermal: intel: hfi: Handle CPU hotplug events. Backport for Intel HFI (Hardware Feedback Interface) support ------------------------------------- All CPUs in a package are represented in an HFI table. There exists an HFI table per package. Thus, CPUs in a package need to coordinate to initialize and access the table. Do such coordination during CPU hotplug. Use the first CPU to come online in a package to initialize the HFI instance and the data structure representing it. Other CPUs in the same package need only to register or unregister themselves in that data structure. The HFI depends on both the package-level thermal management and the local APIC thermal local vector. Thus, to ensure that a CPU coming online has an associated HFI instance when the hardware issues an HFI event, enable the HFI only after having enabled the local APIC thermal vector. The thermal throttle driver takes care of the needed package-level initialization. Reviewed-by: Len Brown Signed-off-by: Ricardo Neri Signed-off-by: Rafael J. Wysocki Signed-off-by: yingbao jia Signed-off-by: Jun Tian --- drivers/thermal/intel/intel_hfi.c | 205 ++++++++++++++++++++++++++++ drivers/thermal/intel/intel_hfi.h | 4 + drivers/thermal/intel/therm_throt.c | 9 ++ 3 files changed, 218 insertions(+) diff --git a/drivers/thermal/intel/intel_hfi.c b/drivers/thermal/intel/intel_hfi.c index 9ea8256cf9ae..8cac7ee6a20f 100644 --- a/drivers/thermal/intel/intel_hfi.c +++ b/drivers/thermal/intel/intel_hfi.c @@ -23,13 +23,23 @@ #include #include +#include +#include +#include +#include +#include #include #include #include #include +#include + #include "intel_hfi.h" +/* Hardware Feedback Interface MSR configuration bits */ +#define HW_FEEDBACK_PTR_VALID_BIT BIT(0) + /* CPUID detection and enumeration definitions for HFI */ #define CPUID_HFI_LEAF 6 @@ -85,6 +95,8 @@ struct hfi_hdr { * Located at the base of the local table. * @hdr: Base address of the header of the local table * @data: Base address of the data of the local table + * @cpus: CPUs represented in this HFI table instance + * @hw_table: Pointer to the HFI table of this instance * * A set of parameters to parse and navigate a specific HFI table. */ @@ -95,6 +107,8 @@ struct hfi_instance { }; void *hdr; void *data; + cpumask_var_t cpus; + void *hw_table; }; /** @@ -112,10 +126,179 @@ struct hfi_features { unsigned int hdr_size; }; +/** + * struct hfi_cpu_info - Per-CPU attributes to consume HFI data + * @index: Row of this CPU in its HFI table + * @hfi_instance: Attributes of the HFI table to which this CPU belongs + * + * Parameters to link a logical processor to an HFI table and a row within it. + */ +struct hfi_cpu_info { + s16 index; + struct hfi_instance *hfi_instance; +}; + +static DEFINE_PER_CPU(struct hfi_cpu_info, hfi_cpu_info) = { .index = -1 }; + static int max_hfi_instances; static struct hfi_instance *hfi_instances; static struct hfi_features hfi_features; +static DEFINE_MUTEX(hfi_instance_lock); + +static void init_hfi_cpu_index(struct hfi_cpu_info *info) +{ + union cpuid6_edx edx; + + /* Do not re-read @cpu's index if it has already been initialized. */ + if (info->index > -1) + return; + + edx.full = cpuid_edx(CPUID_HFI_LEAF); + info->index = edx.split.index; +} + +/* + * The format of the HFI table depends on the number of capabilities that the + * hardware supports. Keep a data structure to navigate the table. + */ +static void init_hfi_instance(struct hfi_instance *hfi_instance) +{ + /* The HFI header is below the time-stamp. */ + hfi_instance->hdr = hfi_instance->local_table + + sizeof(*hfi_instance->timestamp); + + /* The HFI data starts below the header. */ + hfi_instance->data = hfi_instance->hdr + hfi_features.hdr_size; +} + +/** + * intel_hfi_online() - Enable HFI on @cpu + * @cpu: CPU in which the HFI will be enabled + * + * Enable the HFI to be used in @cpu. The HFI is enabled at the die/package + * level. The first CPU in the die/package to come online does the full HFI + * initialization. Subsequent CPUs will just link themselves to the HFI + * instance of their die/package. + * + * This function is called before enabling the thermal vector in the local APIC + * in order to ensure that @cpu has an associated HFI instance when it receives + * an HFI event. + */ +void intel_hfi_online(unsigned int cpu) +{ + struct hfi_instance *hfi_instance; + struct hfi_cpu_info *info; + phys_addr_t hw_table_pa; + u64 msr_val; + u16 die_id; + + /* Nothing to do if hfi_instances are missing. */ + if (!hfi_instances) + return; + + /* + * Link @cpu to the HFI instance of its package/die. It does not + * matter whether the instance has been initialized. + */ + info = &per_cpu(hfi_cpu_info, cpu); + die_id = topology_logical_die_id(cpu); + hfi_instance = info->hfi_instance; + if (!hfi_instance) { + if (die_id < 0 || die_id >= max_hfi_instances) + return; + + hfi_instance = &hfi_instances[die_id]; + info->hfi_instance = hfi_instance; + } + + init_hfi_cpu_index(info); + + /* + * Now check if the HFI instance of the package/die of @cpu has been + * initialized (by checking its header). In such case, all we have to + * do is to add @cpu to this instance's cpumask. + */ + mutex_lock(&hfi_instance_lock); + if (hfi_instance->hdr) { + cpumask_set_cpu(cpu, hfi_instance->cpus); + goto unlock; + } + + /* + * Hardware is programmed with the physical address of the first page + * frame of the table. Hence, the allocated memory must be page-aligned. + */ + hfi_instance->hw_table = alloc_pages_exact(hfi_features.nr_table_pages, + GFP_KERNEL | __GFP_ZERO); + if (!hfi_instance->hw_table) + goto unlock; + + hw_table_pa = virt_to_phys(hfi_instance->hw_table); + + /* + * Allocate memory to keep a local copy of the table that + * hardware generates. + */ + hfi_instance->local_table = kzalloc(hfi_features.nr_table_pages << PAGE_SHIFT, + GFP_KERNEL); + if (!hfi_instance->local_table) + goto free_hw_table; + + /* + * Program the address of the feedback table of this die/package. On + * some processors, hardware remembers the old address of the HFI table + * even after having been reprogrammed and re-enabled. Thus, do not free + * the pages allocated for the table or reprogram the hardware with a + * new base address. Namely, program the hardware only once. + */ + msr_val = hw_table_pa | HW_FEEDBACK_PTR_VALID_BIT; + wrmsrl(MSR_IA32_HW_FEEDBACK_PTR, msr_val); + + init_hfi_instance(hfi_instance); + + cpumask_set_cpu(cpu, hfi_instance->cpus); + +unlock: + mutex_unlock(&hfi_instance_lock); + return; + +free_hw_table: + free_pages_exact(hfi_instance->hw_table, hfi_features.nr_table_pages); + goto unlock; +} + +/** + * intel_hfi_offline() - Disable HFI on @cpu + * @cpu: CPU in which the HFI will be disabled + * + * Remove @cpu from those covered by its HFI instance. + * + * On some processors, hardware remembers previous programming settings even + * after being reprogrammed. Thus, keep HFI enabled even if all CPUs in the + * die/package of @cpu are offline. See note in intel_hfi_online(). + */ +void intel_hfi_offline(unsigned int cpu) +{ + struct hfi_cpu_info *info = &per_cpu(hfi_cpu_info, cpu); + struct hfi_instance *hfi_instance; + + /* + * Check if @cpu as an associated, initialized (i.e., with a non-NULL + * header). Also, HFI instances are only initialized if X86_FEATURE_HFI + * is present. + */ + hfi_instance = info->hfi_instance; + if (!hfi_instance) + return; + + if (!hfi_instance->hdr) + return; + + mutex_lock(&hfi_instance_lock); + cpumask_clear_cpu(cpu, hfi_instance->cpus); + mutex_unlock(&hfi_instance_lock); +} static __init int hfi_parse_features(void) { @@ -164,6 +347,9 @@ static __init int hfi_parse_features(void) void __init intel_hfi_init(void) { + struct hfi_instance *hfi_instance; + int i, j; + if (hfi_parse_features()) return; @@ -177,4 +363,23 @@ void __init intel_hfi_init(void) */ hfi_instances = kcalloc(max_hfi_instances, sizeof(*hfi_instances), GFP_KERNEL); + if (!hfi_instances) + return; + + for (i = 0; i < max_hfi_instances; i++) { + hfi_instance = &hfi_instances[i]; + if (!zalloc_cpumask_var(&hfi_instance->cpus, GFP_KERNEL)) + goto err_nomem; + } + + return; + +err_nomem: + for (j = 0; j < i; ++j) { + hfi_instance = &hfi_instances[j]; + free_cpumask_var(hfi_instance->cpus); + } + + kfree(hfi_instances); + hfi_instances = NULL; } diff --git a/drivers/thermal/intel/intel_hfi.h b/drivers/thermal/intel/intel_hfi.h index 05f748b48a4e..56c6b2d75202 100644 --- a/drivers/thermal/intel/intel_hfi.h +++ b/drivers/thermal/intel/intel_hfi.h @@ -4,8 +4,12 @@ #if defined(CONFIG_INTEL_HFI_THERMAL) void __init intel_hfi_init(void); +void intel_hfi_online(unsigned int cpu); +void intel_hfi_offline(unsigned int cpu); #else static inline void intel_hfi_init(void) { } +static inline void intel_hfi_online(unsigned int cpu) { } +static inline void intel_hfi_offline(unsigned int cpu) { } #endif /* CONFIG_INTEL_HFI_THERMAL */ #endif /* _INTEL_HFI_H */ diff --git a/drivers/thermal/intel/therm_throt.c b/drivers/thermal/intel/therm_throt.c index 9d26e35106a7..e4f95cf06b82 100644 --- a/drivers/thermal/intel/therm_throt.c +++ b/drivers/thermal/intel/therm_throt.c @@ -476,6 +476,13 @@ static int thermal_throttle_online(unsigned int cpu) INIT_DELAYED_WORK(&state->package_throttle.therm_work, throttle_active_work); INIT_DELAYED_WORK(&state->core_throttle.therm_work, throttle_active_work); + /* + * The first CPU coming online will enable the HFI. Usually this causes + * hardware to issue an HFI thermal interrupt. Such interrupt will reach + * the CPU once we enable the thermal vector in the local APIC. + */ + intel_hfi_online(cpu); + /* Unmask the thermal vector after the above workqueues are initialized. */ l = apic_read(APIC_LVTTHMR); apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); @@ -493,6 +500,8 @@ static int thermal_throttle_offline(unsigned int cpu) l = apic_read(APIC_LVTTHMR); apic_write(APIC_LVTTHMR, l | APIC_LVT_MASKED); + intel_hfi_offline(cpu); + cancel_delayed_work_sync(&state->package_throttle.therm_work); cancel_delayed_work_sync(&state->core_throttle.therm_work); -- Gitee From 02091d14394d14819544c301e8ec0c6103efad89 Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Thu, 27 Jan 2022 11:34:48 -0800 Subject: [PATCH 069/674] x86/Documentation: Describe the Intel Hardware Feedback Interface mainline inclusion from mainline-5.18 commit 4a960e8941bd59fe20f8f774de371f40f222a0c7 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5DSOL Intel_SIG: commit 4a960e8941bd x86/Documentation: Describe the Intel Hardware Feedback Interface. Backport for Intel HFI (Hardware Feedback Interface) support ------------------------------------- Start a documentation file to describe the purpose and operation of Intel's Hardware Feedback Interface. Describe how this interface is used in Linux to relay performance and energy efficiency updates to userspace. Reviewed-by: Len Brown Suggested-by: Srinivas Pandruvada Signed-off-by: Ricardo Neri Signed-off-by: Rafael J. Wysocki Signed-off-by: yingbao jia Signed-off-by: Jun Tian --- Documentation/x86/index.rst | 1 + Documentation/x86/intel-hfi.rst | 72 +++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+) create mode 100644 Documentation/x86/intel-hfi.rst diff --git a/Documentation/x86/index.rst b/Documentation/x86/index.rst index 647a570d4931..6832df92f084 100644 --- a/Documentation/x86/index.rst +++ b/Documentation/x86/index.rst @@ -21,6 +21,7 @@ x86-specific Documentation tlb mtrr pat + intel-hfi intel-iommu intel_txt amd-memory-encryption diff --git a/Documentation/x86/intel-hfi.rst b/Documentation/x86/intel-hfi.rst new file mode 100644 index 000000000000..49dea58ea4fb --- /dev/null +++ b/Documentation/x86/intel-hfi.rst @@ -0,0 +1,72 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================================================ +Hardware-Feedback Interface for scheduling on Intel Hardware +============================================================ + +Overview +-------- + +Intel has described the Hardware Feedback Interface (HFI) in the Intel 64 and +IA-32 Architectures Software Developer's Manual (Intel SDM) Volume 3 Section +14.6 [1]_. + +The HFI gives the operating system a performance and energy efficiency +capability data for each CPU in the system. Linux can use the information from +the HFI to influence task placement decisions. + +The Hardware Feedback Interface +------------------------------- + +The Hardware Feedback Interface provides to the operating system information +about the performance and energy efficiency of each CPU in the system. Each +capability is given as a unit-less quantity in the range [0-255]. Higher values +indicate higher capability. Energy efficiency and performance are reported in +separate capabilities. Even though on some systems these two metrics may be +related, they are specified as independent capabilities in the Intel SDM. + +These capabilities may change at runtime as a result of changes in the +operating conditions of the system or the action of external factors. The rate +at which these capabilities are updated is specific to each processor model. On +some models, capabilities are set at boot time and never change. On others, +capabilities may change every tens of milliseconds. For instance, a remote +mechanism may be used to lower Thermal Design Power. Such change can be +reflected in the HFI. Likewise, if the system needs to be throttled due to +excessive heat, the HFI may reflect reduced performance on specific CPUs. + +The kernel or a userspace policy daemon can use these capabilities to modify +task placement decisions. For instance, if either the performance or energy +capabilities of a given logical processor becomes zero, it is an indication that +the hardware recommends to the operating system to not schedule any tasks on +that processor for performance or energy efficiency reasons, respectively. + +Implementation details for Linux +-------------------------------- + +The infrastructure to handle thermal event interrupts has two parts. In the +Local Vector Table of a CPU's local APIC, there exists a register for the +Thermal Monitor Register. This register controls how interrupts are delivered +to a CPU when the thermal monitor generates and interrupt. Further details +can be found in the Intel SDM Vol. 3 Section 10.5 [1]_. + +The thermal monitor may generate interrupts per CPU or per package. The HFI +generates package-level interrupts. This monitor is configured and initialized +via a set of machine-specific registers. Specifically, the HFI interrupt and +status are controlled via designated bits in the IA32_PACKAGE_THERM_INTERRUPT +and IA32_PACKAGE_THERM_STATUS registers, respectively. There exists one HFI +table per package. Further details can be found in the Intel SDM Vol. 3 +Section 14.9 [1]_. + +The hardware issues an HFI interrupt after updating the HFI table and is ready +for the operating system to consume it. CPUs receive such interrupt via the +thermal entry in the Local APIC's Local Vector Table. + +When servicing such interrupt, the HFI driver parses the updated table and +relays the update to userspace using the thermal notification framework. Given +that there may be many HFI updates every second, the updates relayed to +userspace are throttled at a rate of CONFIG_HZ jiffies. + +References +---------- + +.. [1] https://www.intel.com/sdm -- Gitee From 07dc6e0b04d8eca2963e138452d5b24af8f82b28 Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Thu, 27 Jan 2022 11:34:52 -0800 Subject: [PATCH 070/674] thermal: intel: hfi: Enable notification interrupt mainline inclusion from mainline-5.18 commit ab09b0744a9944cbdc0ac9a5cb00bef72adf79d5 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5DSOL Intel_SIG: commit ab09b0744a99 thermal: intel: hfi: Enable notification interrupt. Backport for Intel HFI (Hardware Feedback Interface) support ------------------------------------- When hardware wants to inform the operating system about updates in the HFI table, it issues a package-level thermal event interrupt. For this, hardware has new interrupt and status bits in the IA32_PACKAGE_THERM_ INTERRUPT and IA32_PACKAGE_THERM_STATUS registers. The existing thermal throttle driver already handles thermal event interrupts: it initializes the thermal vector of the local APIC as well as per-CPU and package-level interrupt reporting. It also provides routines to service such interrupts. Extend its functionality to also handle HFI interrupts. The frequency of the thermal HFI interrupt is specific to each processor model. On some processors, a single interrupt happens as soon as the HFI is enabled and hardware will never update HFI capabilities afterwards. On other processors, thermal and power constraints may cause thermal HFI interrupts every tens of milliseconds. To not overwhelm consumers of the HFI data, use delayed work to throttle the rate at which HFI updates are processed. Use a dedicated workqueue to not overload system_wq if hardware issues many HFI updates. Reviewed-by: Len Brown Signed-off-by: Ricardo Neri Signed-off-by: Rafael J. Wysocki Signed-off-by: yingbao jia Signed-off-by: Jun Tian --- drivers/thermal/intel/intel_hfi.c | 110 ++++++++++++++++++++++++++++ drivers/thermal/intel/intel_hfi.h | 2 + drivers/thermal/intel/therm_throt.c | 10 +++ 3 files changed, 122 insertions(+) diff --git a/drivers/thermal/intel/intel_hfi.c b/drivers/thermal/intel/intel_hfi.c index 8cac7ee6a20f..ebaf767bc83d 100644 --- a/drivers/thermal/intel/intel_hfi.c +++ b/drivers/thermal/intel/intel_hfi.c @@ -26,19 +26,27 @@ #include #include #include +#include #include #include #include #include #include +#include +#include #include +#include #include #include "intel_hfi.h" +#define THERM_STATUS_CLEAR_PKG_MASK (BIT(1) | BIT(3) | BIT(5) | BIT(7) | \ + BIT(9) | BIT(11) | BIT(26)) + /* Hardware Feedback Interface MSR configuration bits */ #define HW_FEEDBACK_PTR_VALID_BIT BIT(0) +#define HW_FEEDBACK_CONFIG_HFI_ENABLE_BIT BIT(0) /* CPUID detection and enumeration definitions for HFI */ @@ -97,6 +105,9 @@ struct hfi_hdr { * @data: Base address of the data of the local table * @cpus: CPUs represented in this HFI table instance * @hw_table: Pointer to the HFI table of this instance + * @update_work: Delayed work to process HFI updates + * @table_lock: Lock to protect acceses to the table of this instance + * @event_lock: Lock to process HFI interrupts * * A set of parameters to parse and navigate a specific HFI table. */ @@ -109,6 +120,9 @@ struct hfi_instance { void *data; cpumask_var_t cpus; void *hw_table; + struct delayed_work update_work; + raw_spinlock_t table_lock; + raw_spinlock_t event_lock; }; /** @@ -146,6 +160,86 @@ static struct hfi_instance *hfi_instances; static struct hfi_features hfi_features; static DEFINE_MUTEX(hfi_instance_lock); +static struct workqueue_struct *hfi_updates_wq; +#define HFI_UPDATE_INTERVAL HZ + +static void hfi_update_work_fn(struct work_struct *work) +{ + struct hfi_instance *hfi_instance; + + hfi_instance = container_of(to_delayed_work(work), struct hfi_instance, + update_work); + if (!hfi_instance) + return; + + /* TODO: Consume update here. */ +} + +void intel_hfi_process_event(__u64 pkg_therm_status_msr_val) +{ + struct hfi_instance *hfi_instance; + int cpu = smp_processor_id(); + struct hfi_cpu_info *info; + u64 new_timestamp; + + if (!pkg_therm_status_msr_val) + return; + + info = &per_cpu(hfi_cpu_info, cpu); + if (!info) + return; + + /* + * A CPU is linked to its HFI instance before the thermal vector in the + * local APIC is unmasked. Hence, info->hfi_instance cannot be NULL + * when receiving an HFI event. + */ + hfi_instance = info->hfi_instance; + if (unlikely(!hfi_instance)) { + pr_debug("Received event on CPU %d but instance was null", cpu); + return; + } + + /* + * On most systems, all CPUs in the package receive a package-level + * thermal interrupt when there is an HFI update. It is sufficient to + * let a single CPU to acknowledge the update and queue work to + * process it. The remaining CPUs can resume their work. + */ + if (!raw_spin_trylock(&hfi_instance->event_lock)) + return; + + /* Skip duplicated updates. */ + new_timestamp = *(u64 *)hfi_instance->hw_table; + if (*hfi_instance->timestamp == new_timestamp) { + raw_spin_unlock(&hfi_instance->event_lock); + return; + } + + raw_spin_lock(&hfi_instance->table_lock); + + /* + * Copy the updated table into our local copy. This includes the new + * timestamp. + */ + memcpy(hfi_instance->local_table, hfi_instance->hw_table, + hfi_features.nr_table_pages << PAGE_SHIFT); + + raw_spin_unlock(&hfi_instance->table_lock); + raw_spin_unlock(&hfi_instance->event_lock); + + /* + * Let hardware know that we are done reading the HFI table and it is + * free to update it again. + */ + pkg_therm_status_msr_val &= THERM_STATUS_CLEAR_PKG_MASK & + ~PACKAGE_THERM_STATUS_HFI_UPDATED; + wrmsrl(MSR_IA32_PACKAGE_THERM_STATUS, pkg_therm_status_msr_val); + + queue_delayed_work(hfi_updates_wq, &hfi_instance->update_work, + HFI_UPDATE_INTERVAL); +} + static void init_hfi_cpu_index(struct hfi_cpu_info *info) { union cpuid6_edx edx; @@ -257,8 +351,20 @@ void intel_hfi_online(unsigned int cpu) init_hfi_instance(hfi_instance); + INIT_DELAYED_WORK(&hfi_instance->update_work, hfi_update_work_fn); + raw_spin_lock_init(&hfi_instance->table_lock); + raw_spin_lock_init(&hfi_instance->event_lock); + cpumask_set_cpu(cpu, hfi_instance->cpus); + /* + * Enable the hardware feedback interface and never disable it. See + * comment on programming the address of the table. + */ + rdmsrl(MSR_IA32_HW_FEEDBACK_CONFIG, msr_val); + msr_val |= HW_FEEDBACK_CONFIG_HFI_ENABLE_BIT; + wrmsrl(MSR_IA32_HW_FEEDBACK_CONFIG, msr_val); + unlock: mutex_unlock(&hfi_instance_lock); return; @@ -372,6 +478,10 @@ void __init intel_hfi_init(void) goto err_nomem; } + hfi_updates_wq = create_singlethread_workqueue("hfi-updates"); + if (!hfi_updates_wq) + goto err_nomem; + return; err_nomem: diff --git a/drivers/thermal/intel/intel_hfi.h b/drivers/thermal/intel/intel_hfi.h index 56c6b2d75202..325aa78b745c 100644 --- a/drivers/thermal/intel/intel_hfi.h +++ b/drivers/thermal/intel/intel_hfi.h @@ -6,10 +6,12 @@ void __init intel_hfi_init(void); void intel_hfi_online(unsigned int cpu); void intel_hfi_offline(unsigned int cpu); +void intel_hfi_process_event(__u64 pkg_therm_status_msr_val); #else static inline void intel_hfi_init(void) { } static inline void intel_hfi_online(unsigned int cpu) { } static inline void intel_hfi_offline(unsigned int cpu) { } +static inline void intel_hfi_process_event(__u64 pkg_therm_status_msr_val) { } #endif /* CONFIG_INTEL_HFI_THERMAL */ #endif /* _INTEL_HFI_H */ diff --git a/drivers/thermal/intel/therm_throt.c b/drivers/thermal/intel/therm_throt.c index e4f95cf06b82..8cc35a9f3862 100644 --- a/drivers/thermal/intel/therm_throt.c +++ b/drivers/thermal/intel/therm_throt.c @@ -615,6 +615,10 @@ void intel_thermal_interrupt(void) PACKAGE_THERM_STATUS_POWER_LIMIT, POWER_LIMIT_EVENT, PACKAGE_LEVEL); + + if (this_cpu_has(X86_FEATURE_HFI)) + intel_hfi_process_event(msr_val & + PACKAGE_THERM_STATUS_HFI_UPDATED); } } @@ -717,6 +721,12 @@ void intel_init_thermal(struct cpuinfo_x86 *c) wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l | (PACKAGE_THERM_INT_LOW_ENABLE | PACKAGE_THERM_INT_HIGH_ENABLE), h); + + if (cpu_has(c, X86_FEATURE_HFI)) { + rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); + wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, + l | PACKAGE_THERM_INT_HFI_ENABLE, h); + } } rdmsr(MSR_IA32_MISC_ENABLE, l, h); -- Gitee From b0b8a4822c7764d153f1fc03ba3b7d37c0cba077 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Thu, 27 Jan 2022 11:34:54 -0800 Subject: [PATCH 071/674] thermal: intel: hfi: Notify user space for HFI events mainline inclusion from mainline-5.18 commit bd30cdfd9bd73b68e4977ce7c5540aa7b14c25cd category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5DSOL Intel_SIG: commit bd30cdfd9bd7 thermal: intel: hfi: Notify user space for HFI events. Backport for Intel HFI (Hardware Feedback Interface) support ------------------------------------- When the hardware issues an HFI event, relay a notification to user space. This allows user space to respond by reading performance and efficiency of each CPU and take appropriate action. For example, when the performance and efficiency of a CPU is 0, user space can either offline the CPU or inject idle. Also, if user space notices a downward trend in performance, it may proactively adjust power limits to avoid future situations in which performance drops to 0. To avoid excessive notifications, the rate is limited by one HZ per event. To limit the netlink message size, send parameters for up to 16 CPUs in a single message. If there are more than 16 CPUs, issue as many messages as needed to notify the status of all CPUs. In the HFI specification, both performance and efficiency capabilities are defined in the [0, 255] range. The existing implementations of HFI hardware do not scale the maximum values to 255. Since userspace cares about capability values that are either 0 or show a downward/upward trend, this fact does not matter much. Relative changes in capabilities are enough. To comply with the thermal netlink ABI, scale both performance and efficiency capabilities to the [0, 1023] interval. Reviewed-by: Len Brown Signed-off-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki Signed-off-by: yingbao jia Signed-off-by: Jun Tian --- drivers/thermal/intel/Kconfig | 1 + drivers/thermal/intel/intel_hfi.c | 75 ++++++++++++++++++++++++++++++- 2 files changed, 75 insertions(+), 1 deletion(-) diff --git a/drivers/thermal/intel/Kconfig b/drivers/thermal/intel/Kconfig index 26aea864aaca..510d36326dfd 100644 --- a/drivers/thermal/intel/Kconfig +++ b/drivers/thermal/intel/Kconfig @@ -84,6 +84,7 @@ config INTEL_HFI_THERMAL bool "Intel Hardware Feedback Interface" depends on CPU_SUP_INTEL depends on X86_THERMAL_VECTOR + select THERMAL_NETLINK help Select this option to enable the Hardware Feedback Interface. If selected, hardware provides guidance to the operating system on diff --git a/drivers/thermal/intel/intel_hfi.c b/drivers/thermal/intel/intel_hfi.c index ebaf767bc83d..158a2b7c6b6f 100644 --- a/drivers/thermal/intel/intel_hfi.c +++ b/drivers/thermal/intel/intel_hfi.c @@ -39,6 +39,7 @@ #include +#include "../thermal_core.h" #include "intel_hfi.h" #define THERM_STATUS_CLEAR_PKG_MASK (BIT(1) | BIT(3) | BIT(5) | BIT(7) | \ @@ -162,6 +163,78 @@ static DEFINE_MUTEX(hfi_instance_lock); static struct workqueue_struct *hfi_updates_wq; #define HFI_UPDATE_INTERVAL HZ +#define HFI_MAX_THERM_NOTIFY_COUNT 16 + +static void get_hfi_caps(struct hfi_instance *hfi_instance, + struct thermal_genl_cpu_caps *cpu_caps) +{ + int cpu, i = 0; + + raw_spin_lock_irq(&hfi_instance->table_lock); + for_each_cpu(cpu, hfi_instance->cpus) { + struct hfi_cpu_data *caps; + s16 index; + + index = per_cpu(hfi_cpu_info, cpu).index; + caps = hfi_instance->data + index * hfi_features.cpu_stride; + cpu_caps[i].cpu = cpu; + + /* + * Scale performance and energy efficiency to + * the [0, 1023] interval that thermal netlink uses. + */ + cpu_caps[i].performance = caps->perf_cap << 2; + cpu_caps[i].efficiency = caps->ee_cap << 2; + + ++i; + } + raw_spin_unlock_irq(&hfi_instance->table_lock); +} + +/* + * Call update_capabilities() when there are changes in the HFI table. + */ +static void update_capabilities(struct hfi_instance *hfi_instance) +{ + struct thermal_genl_cpu_caps *cpu_caps; + int i = 0, cpu_count; + + /* CPUs may come online/offline while processing an HFI update. */ + mutex_lock(&hfi_instance_lock); + + cpu_count = cpumask_weight(hfi_instance->cpus); + + /* No CPUs to report in this hfi_instance. */ + if (!cpu_count) + goto out; + + cpu_caps = kcalloc(cpu_count, sizeof(*cpu_caps), GFP_KERNEL); + if (!cpu_caps) + goto out; + + get_hfi_caps(hfi_instance, cpu_caps); + + if (cpu_count < HFI_MAX_THERM_NOTIFY_COUNT) + goto last_cmd; + + /* Process complete chunks of HFI_MAX_THERM_NOTIFY_COUNT capabilities. */ + for (i = 0; + (i + HFI_MAX_THERM_NOTIFY_COUNT) <= cpu_count; + i += HFI_MAX_THERM_NOTIFY_COUNT) + thermal_genl_cpu_capability_event(HFI_MAX_THERM_NOTIFY_COUNT, + &cpu_caps[i]); + + cpu_count = cpu_count - i; + +last_cmd: + /* Process the remaining capabilities if any. */ + if (cpu_count) + thermal_genl_cpu_capability_event(cpu_count, &cpu_caps[i]); + + kfree(cpu_caps); +out: + mutex_unlock(&hfi_instance_lock); +} static void hfi_update_work_fn(struct work_struct *work) { @@ -172,7 +245,7 @@ static void hfi_update_work_fn(struct work_struct *work) if (!hfi_instance) return; - /* TODO: Consume update here. */ + update_capabilities(hfi_instance); } void intel_hfi_process_event(__u64 pkg_therm_status_msr_val) -- Gitee From 9c1fe5896c82aa7193bdfe10b45845bcb11210bb Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 8 Feb 2022 16:15:46 -0800 Subject: [PATCH 072/674] thermal: intel: hfi: INTEL_HFI_THERMAL depends on NET mainline inclusion from mainline-5.18 commit c95aa2bab974394809edea28690f6504a15791b6 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5DSOL Intel_SIG: commit c95aa2bab974 thermal: intel: hfi: INTEL_HFI_THERMAL depends on NET. Backport for Intel HFI (Hardware Feedback Interface) support ------------------------------------- THERMAL_NETLINK depends on NET and since 'select' does not follow any dependency chain, INTEL_HFI_THERMAL also should depend on NET. Fix one Kconfig warning and 48 subsequent build errors: WARNING: unmet direct dependencies detected for THERMAL_NETLINK Depends on [n]: THERMAL [=y] && NET [=n] Selected by [y]: - INTEL_HFI_THERMAL [=y] && THERMAL [=y] && (X86 [=y] || X86_INTEL_QUARK [=n] || COMPILE_TEST [=y]) && CPU_SUP_INTEL [=y] && X86_THERMAL_VECTOR [=y] Fixes: bd30cdfd9bd7 ("thermal: intel: hfi: Notify user space for HFI events") Signed-off-by: Randy Dunlap Reviewed-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki Signed-off-by: yingbao jia Signed-off-by: Jun Tian --- drivers/thermal/intel/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/thermal/intel/Kconfig b/drivers/thermal/intel/Kconfig index 510d36326dfd..876869d57313 100644 --- a/drivers/thermal/intel/Kconfig +++ b/drivers/thermal/intel/Kconfig @@ -82,6 +82,7 @@ config INTEL_PCH_THERMAL config INTEL_HFI_THERMAL bool "Intel Hardware Feedback Interface" + depends on NET depends on CPU_SUP_INTEL depends on X86_THERMAL_VECTOR select THERMAL_NETLINK -- Gitee From d1b287f6c142193c6dc5a5abfe2fe59e2f03cee6 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Thu, 27 Jan 2022 11:34:53 -0800 Subject: [PATCH 073/674] thermal: netlink: Add a new event to notify CPU capabilities change mainline inclusion from mainline-5.18 commit e4b1eb24ce5a696ef7229f9926ff34d7502f0582 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5DSOL Intel_SIG: commit e4b1eb24ce5a thermal: netlink: Add a new event to notify CPU capabilities change. Backport for Intel HFI (Hardware Feedback Interface) support ------------------------------------- Add a new netlink event to notify change in CPU capabilities in terms of performance and efficiency. Firmware may change CPU capabilities as a result of thermal events in the system or to account for changes in the TDP (thermal design power) level. This notification type will allow user space to avoid running workloads on certain CPUs or proactively adjust power limits to avoid future events. The netlink message consists of a nested attribute (THERMAL_GENL_ATTR_CPU_CAPABILITY) with three attributes: * THERMAL_GENL_ATTR_CPU_CAPABILITY_ID (type u32): -- logical CPU number * THERMAL_GENL_ATTR_CPU_CAPABILITY_PERFORMANCE (type u32): -- Scaled performance from 0-1023 * THERMAL_GENL_ATTR_CPU_CAPABILITY_EFFICIENCY (type u32): -- Scaled efficiency from 0-1023 Reviewed-by: Len Brown Signed-off-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki Signed-off-by: yingbao jia Signed-off-by: Jun Tian --- drivers/thermal/thermal_netlink.c | 53 +++++++++++++++++++++++++++++++ drivers/thermal/thermal_netlink.h | 14 ++++++++ include/uapi/linux/thermal.h | 6 +++- 3 files changed, 72 insertions(+), 1 deletion(-) diff --git a/drivers/thermal/thermal_netlink.c b/drivers/thermal/thermal_netlink.c index 41c8d47805c4..dc535831b660 100644 --- a/drivers/thermal/thermal_netlink.c +++ b/drivers/thermal/thermal_netlink.c @@ -43,6 +43,11 @@ static const struct nla_policy thermal_genl_policy[THERMAL_GENL_ATTR_MAX + 1] = [THERMAL_GENL_ATTR_CDEV_MAX_STATE] = { .type = NLA_U32 }, [THERMAL_GENL_ATTR_CDEV_NAME] = { .type = NLA_STRING, .len = THERMAL_NAME_LENGTH }, + /* CPU capabilities */ + [THERMAL_GENL_ATTR_CPU_CAPABILITY] = { .type = NLA_NESTED }, + [THERMAL_GENL_ATTR_CPU_CAPABILITY_ID] = { .type = NLA_U32 }, + [THERMAL_GENL_ATTR_CPU_CAPABILITY_PERFORMANCE] = { .type = NLA_U32 }, + [THERMAL_GENL_ATTR_CPU_CAPABILITY_EFFICIENCY] = { .type = NLA_U32 }, }; struct param { @@ -58,6 +63,8 @@ struct param { int temp; int cdev_state; int cdev_max_state; + struct thermal_genl_cpu_caps *cpu_capabilities; + int cpu_capabilities_count; }; typedef int (*cb_t)(struct param *); @@ -189,6 +196,42 @@ static int thermal_genl_event_gov_change(struct param *p) return 0; } +static int thermal_genl_event_cpu_capability_change(struct param *p) +{ + struct thermal_genl_cpu_caps *cpu_cap = p->cpu_capabilities; + struct sk_buff *msg = p->msg; + struct nlattr *start_cap; + int i; + + start_cap = nla_nest_start(msg, THERMAL_GENL_ATTR_CPU_CAPABILITY); + if (!start_cap) + return -EMSGSIZE; + + for (i = 0; i < p->cpu_capabilities_count; ++i) { + if (nla_put_u32(msg, THERMAL_GENL_ATTR_CPU_CAPABILITY_ID, + cpu_cap->cpu)) + goto out_cancel_nest; + + if (nla_put_u32(msg, THERMAL_GENL_ATTR_CPU_CAPABILITY_PERFORMANCE, + cpu_cap->performance)) + goto out_cancel_nest; + + if (nla_put_u32(msg, THERMAL_GENL_ATTR_CPU_CAPABILITY_EFFICIENCY, + cpu_cap->efficiency)) + goto out_cancel_nest; + + ++cpu_cap; + } + + nla_nest_end(msg, start_cap); + + return 0; +out_cancel_nest: + nla_nest_cancel(msg, start_cap); + + return -EMSGSIZE; +} + int thermal_genl_event_tz_delete(struct param *p) __attribute__((alias("thermal_genl_event_tz"))); @@ -218,6 +261,7 @@ static cb_t event_cb[] = { [THERMAL_GENL_EVENT_CDEV_DELETE] = thermal_genl_event_cdev_delete, [THERMAL_GENL_EVENT_CDEV_STATE_UPDATE] = thermal_genl_event_cdev_state_update, [THERMAL_GENL_EVENT_TZ_GOV_CHANGE] = thermal_genl_event_gov_change, + [THERMAL_GENL_EVENT_CPU_CAPABILITY_CHANGE] = thermal_genl_event_cpu_capability_change, }; /* @@ -355,6 +399,15 @@ int thermal_notify_tz_gov_change(int tz_id, const char *name) return thermal_genl_send_event(THERMAL_GENL_EVENT_TZ_GOV_CHANGE, &p); } +int thermal_genl_cpu_capability_event(int count, + struct thermal_genl_cpu_caps *caps) +{ + struct param p = { .cpu_capabilities_count = count, .cpu_capabilities = caps }; + + return thermal_genl_send_event(THERMAL_GENL_EVENT_CPU_CAPABILITY_CHANGE, &p); +} +EXPORT_SYMBOL_GPL(thermal_genl_cpu_capability_event); + /*************************** Command encoding ********************************/ static int __thermal_genl_cmd_tz_get_id(struct thermal_zone_device *tz, diff --git a/drivers/thermal/thermal_netlink.h b/drivers/thermal/thermal_netlink.h index 828d1dddfa98..7439b245e41e 100644 --- a/drivers/thermal/thermal_netlink.h +++ b/drivers/thermal/thermal_netlink.h @@ -4,6 +4,12 @@ * Author: Daniel Lezcano */ +struct thermal_genl_cpu_caps { + int cpu; + int performance; + int efficiency; +}; + /* Netlink notification function */ #ifdef CONFIG_THERMAL_NETLINK int __init thermal_netlink_init(void); @@ -23,6 +29,8 @@ int thermal_notify_cdev_add(int cdev_id, const char *name, int max_state); int thermal_notify_cdev_delete(int cdev_id); int thermal_notify_tz_gov_change(int tz_id, const char *name); int thermal_genl_sampling_temp(int id, int temp); +int thermal_genl_cpu_capability_event(int count, + struct thermal_genl_cpu_caps *caps); #else static inline int thermal_netlink_init(void) { @@ -101,4 +109,10 @@ static inline int thermal_genl_sampling_temp(int id, int temp) { return 0; } + +static inline int thermal_genl_cpu_capability_event(int count, struct cpu_capability *caps) +{ + return 0; +} + #endif /* CONFIG_THERMAL_NETLINK */ diff --git a/include/uapi/linux/thermal.h b/include/uapi/linux/thermal.h index c105054cbb57..06a8cea36fe3 100644 --- a/include/uapi/linux/thermal.h +++ b/include/uapi/linux/thermal.h @@ -44,7 +44,10 @@ enum thermal_genl_attr { THERMAL_GENL_ATTR_CDEV_MAX_STATE, THERMAL_GENL_ATTR_CDEV_NAME, THERMAL_GENL_ATTR_GOV_NAME, - + THERMAL_GENL_ATTR_CPU_CAPABILITY, + THERMAL_GENL_ATTR_CPU_CAPABILITY_ID, + THERMAL_GENL_ATTR_CPU_CAPABILITY_PERFORMANCE, + THERMAL_GENL_ATTR_CPU_CAPABILITY_EFFICIENCY, __THERMAL_GENL_ATTR_MAX, }; #define THERMAL_GENL_ATTR_MAX (__THERMAL_GENL_ATTR_MAX - 1) @@ -71,6 +74,7 @@ enum thermal_genl_event { THERMAL_GENL_EVENT_CDEV_DELETE, /* Cdev unbound */ THERMAL_GENL_EVENT_CDEV_STATE_UPDATE, /* Cdev state updated */ THERMAL_GENL_EVENT_TZ_GOV_CHANGE, /* Governor policy changed */ + THERMAL_GENL_EVENT_CPU_CAPABILITY_CHANGE, /* CPU capability changed */ __THERMAL_GENL_EVENT_MAX, }; #define THERMAL_GENL_EVENT_MAX (__THERMAL_GENL_EVENT_MAX - 1) -- Gitee From ddbef6c0c3a2bd531aa7bfe545d6eb0ad6b23f76 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Mon, 7 Feb 2022 09:38:29 -0700 Subject: [PATCH 074/674] thermal: netlink: Fix parameter type of thermal_genl_cpu_capability_event() stub mainline inclusion mainline-5.18 commit 345be4275cad454ae7e25884369a9c6c25e56279 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5DSOL Intel_SIG: commit 345be4275cad thermal: netlink: Fix parameter type of thermal_genl_cpu_capability_event() stub. Backport for Intel HFI (Hardware Feedback Interface) support ------------------------------------- When building with CONFIG_THERMAL_NETLINK=n, there is a spew of warnings along the lines of: In file included from drivers/thermal/thermal_core.c:27: In file included from drivers/thermal/thermal_core.h:15: drivers/thermal/thermal_netlink.h:113:71: warning: declaration of 'struct cpu_capability' will not be visible outside of this function [-Wvisibility] static inline int thermal_genl_cpu_capability_event(int count, struct cpu_capability *caps) ^ 1 warning generated. 'struct cpu_capability' is not forward declared anywhere in the header. As it turns out, this should really be 'struct thermal_genl_cpu_caps', which silences the warning and makes the parameter types of the stub match the full function. Fixes: e4b1eb24ce5a ("thermal: netlink: Add a new event to notify CPU capabilities change") Reported-by: Stephen Rothwell Signed-off-by: Nathan Chancellor Reviewed-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki Signed-off-by: yingbao jia Signed-off-by: Jun Tian --- drivers/thermal/thermal_netlink.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/thermal/thermal_netlink.h b/drivers/thermal/thermal_netlink.h index 7439b245e41e..fd4188470d3f 100644 --- a/drivers/thermal/thermal_netlink.h +++ b/drivers/thermal/thermal_netlink.h @@ -110,7 +110,7 @@ static inline int thermal_genl_sampling_temp(int id, int temp) return 0; } -static inline int thermal_genl_cpu_capability_event(int count, struct cpu_capability *caps) +static inline int thermal_genl_cpu_capability_event(int count, struct thermal_genl_cpu_caps *caps) { return 0; } -- Gitee From c200f744e7896c73d9a8417c3bcf29d32eba90f4 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Tue, 7 Dec 2021 21:17:34 +0800 Subject: [PATCH 075/674] powercap: intel_rapl: support new layout of Psys PowerLimit Register on SPR mainline inclusion mainline-5.17 commit 931da6a0de5d620425af4425344259e6ff46b654 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5BED0 Intel-SIG: commit 931da6a0de5d powercap: intel_rapl: support new layout of Psys PowerLimit Register on SPR. Backport for SPR RAPL Psys support. ------------------------------------- On Sapphire Rapids, the layout of the Psys domain Power Limit Register is different from from what it was before. Enhance the code to support the new Psys PL register layout. Signed-off-by: Zhang Rui Reported-and-tested-by: Alkattan Dana [ rjw: Subject and changelog edits ] Signed-off-by: Rafael J. Wysocki Signed-off-by: yingbao jia Signed-off-by: Jun Tian --- drivers/powercap/intel_rapl_common.c | 61 +++++++++++++++++++++++++++- include/linux/intel_rapl.h | 6 +++ 2 files changed, 65 insertions(+), 2 deletions(-) diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index c9e57237d778..a994c45ca42e 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -61,6 +61,20 @@ #define PERF_STATUS_THROTTLE_TIME_MASK 0xffffffff #define PP_POLICY_MASK 0x1F +/* + * SPR has different layout for Psys Domain PowerLimit registers. + * There are 17 bits of PL1 and PL2 instead of 15 bits. + * The Enable bits and TimeWindow bits are also shifted as a result. + */ +#define PSYS_POWER_LIMIT1_MASK 0x1FFFF +#define PSYS_POWER_LIMIT1_ENABLE BIT(17) + +#define PSYS_POWER_LIMIT2_MASK (0x1FFFFULL<<32) +#define PSYS_POWER_LIMIT2_ENABLE BIT_ULL(49) + +#define PSYS_TIME_WINDOW1_MASK (0x7FULL<<19) +#define PSYS_TIME_WINDOW2_MASK (0x7FULL<<51) + /* Non HW constants */ #define RAPL_PRIMITIVE_DERIVED BIT(1) /* not from raw data */ #define RAPL_PRIMITIVE_DUMMY BIT(2) @@ -97,6 +111,7 @@ struct rapl_defaults { bool to_raw); unsigned int dram_domain_energy_unit; unsigned int psys_domain_energy_unit; + bool spr_psys_bits; }; static struct rapl_defaults *rapl_defaults; @@ -669,12 +684,51 @@ static struct rapl_primitive_info rpi[] = { RAPL_DOMAIN_REG_PERF, TIME_UNIT, 0), PRIMITIVE_INFO_INIT(PRIORITY_LEVEL, PP_POLICY_MASK, 0, RAPL_DOMAIN_REG_POLICY, ARBITRARY_UNIT, 0), + PRIMITIVE_INFO_INIT(PSYS_POWER_LIMIT1, PSYS_POWER_LIMIT1_MASK, 0, + RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0), + PRIMITIVE_INFO_INIT(PSYS_POWER_LIMIT2, PSYS_POWER_LIMIT2_MASK, 32, + RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0), + PRIMITIVE_INFO_INIT(PSYS_PL1_ENABLE, PSYS_POWER_LIMIT1_ENABLE, 17, + RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), + PRIMITIVE_INFO_INIT(PSYS_PL2_ENABLE, PSYS_POWER_LIMIT2_ENABLE, 49, + RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), + PRIMITIVE_INFO_INIT(PSYS_TIME_WINDOW1, PSYS_TIME_WINDOW1_MASK, 19, + RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0), + PRIMITIVE_INFO_INIT(PSYS_TIME_WINDOW2, PSYS_TIME_WINDOW2_MASK, 51, + RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0), /* non-hardware */ PRIMITIVE_INFO_INIT(AVERAGE_POWER, 0, 0, 0, POWER_UNIT, RAPL_PRIMITIVE_DERIVED), {NULL, 0, 0, 0}, }; +static enum rapl_primitives +prim_fixups(struct rapl_domain *rd, enum rapl_primitives prim) +{ + if (!rapl_defaults->spr_psys_bits) + return prim; + + if (rd->id != RAPL_DOMAIN_PLATFORM) + return prim; + + switch (prim) { + case POWER_LIMIT1: + return PSYS_POWER_LIMIT1; + case POWER_LIMIT2: + return PSYS_POWER_LIMIT2; + case PL1_ENABLE: + return PSYS_PL1_ENABLE; + case PL2_ENABLE: + return PSYS_PL2_ENABLE; + case TIME_WINDOW1: + return PSYS_TIME_WINDOW1; + case TIME_WINDOW2: + return PSYS_TIME_WINDOW2; + default: + return prim; + } +} + /* Read primitive data based on its related struct rapl_primitive_info. * if xlate flag is set, return translated data based on data units, i.e. * time, energy, and power. @@ -692,7 +746,8 @@ static int rapl_read_data_raw(struct rapl_domain *rd, enum rapl_primitives prim, bool xlate, u64 *data) { u64 value; - struct rapl_primitive_info *rp = &rpi[prim]; + enum rapl_primitives prim_fixed = prim_fixups(rd, prim); + struct rapl_primitive_info *rp = &rpi[prim_fixed]; struct reg_action ra; int cpu; @@ -738,7 +793,8 @@ static int rapl_write_data_raw(struct rapl_domain *rd, enum rapl_primitives prim, unsigned long long value) { - struct rapl_primitive_info *rp = &rpi[prim]; + enum rapl_primitives prim_fixed = prim_fixups(rd, prim); + struct rapl_primitive_info *rp = &rpi[prim_fixed]; int cpu; u64 bits; struct reg_action ra; @@ -981,6 +1037,7 @@ static const struct rapl_defaults rapl_defaults_spr_server = { .compute_time_window = rapl_compute_time_window_core, .dram_domain_energy_unit = 15300, .psys_domain_energy_unit = 1000000000, + .spr_psys_bits = true, }; static const struct rapl_defaults rapl_defaults_byt = { diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h index 50b8398ffd21..acf72c018142 100644 --- a/include/linux/intel_rapl.h +++ b/include/linux/intel_rapl.h @@ -58,6 +58,12 @@ enum rapl_primitives { THROTTLED_TIME, PRIORITY_LEVEL, + PSYS_POWER_LIMIT1, + PSYS_POWER_LIMIT2, + PSYS_PL1_ENABLE, + PSYS_PL2_ENABLE, + PSYS_TIME_WINDOW1, + PSYS_TIME_WINDOW2, /* below are not raw primitive data */ AVERAGE_POWER, NR_RAPL_PRIMITIVES, -- Gitee From 198e436bbfda02c58308cdacc94f66dae432ad9f Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Mon, 11 Jul 2022 22:07:02 +0800 Subject: [PATCH 076/674] vt: drop old FONT ioctls stable inclusion from stable-v5.10.127 commit 3acb7dc242ca25eb258493b513ef2f4b0f2a9ad1 category: bugfix bugzilla: 187166 https://gitee.com/src-openeuler/kernel/issues/I5GKVX CVE: CVE-2021-33656 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?h=linux-5.10.y&id=3acb7dc242ca25eb258493b513ef2f4b0f2a9ad1 -------------------------------- commit ff2047fb755d4415ec3c70ac799889371151796d upstream. Drop support for these ioctls: * PIO_FONT, PIO_FONTX * GIO_FONT, GIO_FONTX * PIO_FONTRESET As was demonstrated by commit 90bfdeef83f1 (tty: make FONTX ioctl use the tty pointer they were actually passed), these ioctls are not used from userspace, as: 1) they used to be broken (set up font on current console, not the open one) and racy (before the commit above) 2) KDFONTOP ioctl is used for years instead Note that PIO_FONTRESET is defunct on most systems as VGA_CONSOLE is set on them for ages. That turns on BROKEN_GRAPHICS_PROGRAMS which makes PIO_FONTRESET just return an error. We are removing KD_FONT_FLAG_OLD here as it was used only by these removed ioctls. kd.h header exists both in kernel and uapi headers, so we can remove the kernel one completely. Everyone includeing kd.h will now automatically get the uapi one. There are now unused definitions of the ioctl numbers and "struct consolefontdesc" in kd.h, but as it is a uapi header, I am not touching these. Signed-off-by: Jiri Slaby Link: https://lore.kernel.org/r/20210105120239.28031-8-jslaby@suse.cz Cc: guodaxing Signed-off-by: Greg Kroah-Hartman Reviewed-by: Xie XiuQi Reviewed-by: Xiu Jianfeng Signed-off-by: Zheng Zengkai --- drivers/tty/vt/vt.c | 39 +--------- drivers/tty/vt/vt_ioctl.c | 151 -------------------------------------- include/linux/kd.h | 8 -- 3 files changed, 3 insertions(+), 195 deletions(-) delete mode 100644 include/linux/kd.h diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c index a7ee1171eeb3..0a6336d54a65 100644 --- a/drivers/tty/vt/vt.c +++ b/drivers/tty/vt/vt.c @@ -4625,16 +4625,8 @@ static int con_font_get(struct vc_data *vc, struct console_font_op *op) if (op->data && font.charcount > op->charcount) rc = -ENOSPC; - if (!(op->flags & KD_FONT_FLAG_OLD)) { - if (font.width > op->width || font.height > op->height) - rc = -ENOSPC; - } else { - if (font.width != 8) - rc = -EIO; - else if ((op->height && font.height > op->height) || - font.height > 32) - rc = -ENOSPC; - } + if (font.width > op->width || font.height > op->height) + rc = -ENOSPC; if (rc) goto out; @@ -4662,7 +4654,7 @@ static int con_font_set(struct vc_data *vc, struct console_font_op *op) return -EINVAL; if (op->charcount > 512) return -EINVAL; - if (op->width <= 0 || op->width > 32 || op->height > 32) + if (op->width <= 0 || op->width > 32 || !op->height || op->height > 32) return -EINVAL; size = (op->width+7)/8 * 32 * op->charcount; if (size > max_font_size) @@ -4672,31 +4664,6 @@ static int con_font_set(struct vc_data *vc, struct console_font_op *op) if (IS_ERR(font.data)) return PTR_ERR(font.data); - if (!op->height) { /* Need to guess font height [compat] */ - int h, i; - u8 *charmap = font.data; - - /* - * If from KDFONTOP ioctl, don't allow things which can be done - * in userland,so that we can get rid of this soon - */ - if (!(op->flags & KD_FONT_FLAG_OLD)) { - kfree(font.data); - return -EINVAL; - } - - for (h = 32; h > 0; h--) - for (i = 0; i < op->charcount; i++) - if (charmap[32*i+h-1]) - goto nonzero; - - kfree(font.data); - return -EINVAL; - - nonzero: - op->height = h; - } - font.charcount = op->charcount; font.width = op->width; font.height = op->height; diff --git a/drivers/tty/vt/vt_ioctl.c b/drivers/tty/vt/vt_ioctl.c index a9c6ea8986af..b10b86e2c17e 100644 --- a/drivers/tty/vt/vt_ioctl.c +++ b/drivers/tty/vt/vt_ioctl.c @@ -486,70 +486,6 @@ static int vt_k_ioctl(struct tty_struct *tty, unsigned int cmd, return 0; } -static inline int do_fontx_ioctl(struct vc_data *vc, int cmd, - struct consolefontdesc __user *user_cfd, - struct console_font_op *op) -{ - struct consolefontdesc cfdarg; - int i; - - if (copy_from_user(&cfdarg, user_cfd, sizeof(struct consolefontdesc))) - return -EFAULT; - - switch (cmd) { - case PIO_FONTX: - op->op = KD_FONT_OP_SET; - op->flags = KD_FONT_FLAG_OLD; - op->width = 8; - op->height = cfdarg.charheight; - op->charcount = cfdarg.charcount; - op->data = cfdarg.chardata; - return con_font_op(vc, op); - - case GIO_FONTX: - op->op = KD_FONT_OP_GET; - op->flags = KD_FONT_FLAG_OLD; - op->width = 8; - op->height = cfdarg.charheight; - op->charcount = cfdarg.charcount; - op->data = cfdarg.chardata; - i = con_font_op(vc, op); - if (i) - return i; - cfdarg.charheight = op->height; - cfdarg.charcount = op->charcount; - if (copy_to_user(user_cfd, &cfdarg, sizeof(struct consolefontdesc))) - return -EFAULT; - return 0; - } - return -EINVAL; -} - -static int vt_io_fontreset(struct vc_data *vc, struct console_font_op *op) -{ - int ret; - - if (__is_defined(BROKEN_GRAPHICS_PROGRAMS)) { - /* - * With BROKEN_GRAPHICS_PROGRAMS defined, the default font is - * not saved. - */ - return -ENOSYS; - } - - op->op = KD_FONT_OP_SET_DEFAULT; - op->data = NULL; - ret = con_font_op(vc, op); - if (ret) - return ret; - - console_lock(); - con_set_default_unimap(vc); - console_unlock(); - - return 0; -} - static inline int do_unimap_ioctl(int cmd, struct unimapdesc __user *user_ud, bool perm, struct vc_data *vc) { @@ -574,29 +510,7 @@ static inline int do_unimap_ioctl(int cmd, struct unimapdesc __user *user_ud, static int vt_io_ioctl(struct vc_data *vc, unsigned int cmd, void __user *up, bool perm) { - struct console_font_op op; /* used in multiple places here */ - switch (cmd) { - case PIO_FONT: - if (!perm) - return -EPERM; - op.op = KD_FONT_OP_SET; - op.flags = KD_FONT_FLAG_OLD | KD_FONT_FLAG_DONT_RECALC; /* Compatibility */ - op.width = 8; - op.height = 0; - op.charcount = 256; - op.data = up; - return con_font_op(vc, &op); - - case GIO_FONT: - op.op = KD_FONT_OP_GET; - op.flags = KD_FONT_FLAG_OLD; - op.width = 8; - op.height = 32; - op.charcount = 256; - op.data = up; - return con_font_op(vc, &op); - case PIO_CMAP: if (!perm) return -EPERM; @@ -605,20 +519,6 @@ static int vt_io_ioctl(struct vc_data *vc, unsigned int cmd, void __user *up, case GIO_CMAP: return con_get_cmap(up); - case PIO_FONTX: - if (!perm) - return -EPERM; - - fallthrough; - case GIO_FONTX: - return do_fontx_ioctl(vc, cmd, up, &op); - - case PIO_FONTRESET: - if (!perm) - return -EPERM; - - return vt_io_fontreset(vc, &op); - case PIO_SCRNMAP: if (!perm) return -EPERM; @@ -1099,54 +999,6 @@ void vc_SAK(struct work_struct *work) #ifdef CONFIG_COMPAT -struct compat_consolefontdesc { - unsigned short charcount; /* characters in font (256 or 512) */ - unsigned short charheight; /* scan lines per character (1-32) */ - compat_caddr_t chardata; /* font data in expanded form */ -}; - -static inline int -compat_fontx_ioctl(struct vc_data *vc, int cmd, - struct compat_consolefontdesc __user *user_cfd, - int perm, struct console_font_op *op) -{ - struct compat_consolefontdesc cfdarg; - int i; - - if (copy_from_user(&cfdarg, user_cfd, sizeof(struct compat_consolefontdesc))) - return -EFAULT; - - switch (cmd) { - case PIO_FONTX: - if (!perm) - return -EPERM; - op->op = KD_FONT_OP_SET; - op->flags = KD_FONT_FLAG_OLD; - op->width = 8; - op->height = cfdarg.charheight; - op->charcount = cfdarg.charcount; - op->data = compat_ptr(cfdarg.chardata); - return con_font_op(vc, op); - - case GIO_FONTX: - op->op = KD_FONT_OP_GET; - op->flags = KD_FONT_FLAG_OLD; - op->width = 8; - op->height = cfdarg.charheight; - op->charcount = cfdarg.charcount; - op->data = compat_ptr(cfdarg.chardata); - i = con_font_op(vc, op); - if (i) - return i; - cfdarg.charheight = op->height; - cfdarg.charcount = op->charcount; - if (copy_to_user(user_cfd, &cfdarg, sizeof(struct compat_consolefontdesc))) - return -EFAULT; - return 0; - } - return -EINVAL; -} - struct compat_console_font_op { compat_uint_t op; /* operation code KD_FONT_OP_* */ compat_uint_t flags; /* KD_FONT_FLAG_* */ @@ -1223,9 +1075,6 @@ long vt_compat_ioctl(struct tty_struct *tty, /* * these need special handlers for incompatible data structures */ - case PIO_FONTX: - case GIO_FONTX: - return compat_fontx_ioctl(vc, cmd, up, perm, &op); case KDFONTOP: return compat_kdfontop_ioctl(up, perm, &op, vc); diff --git a/include/linux/kd.h b/include/linux/kd.h deleted file mode 100644 index b130a18f860f..000000000000 --- a/include/linux/kd.h +++ /dev/null @@ -1,8 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_KD_H -#define _LINUX_KD_H - -#include - -#define KD_FONT_FLAG_OLD 0x80000000 /* Invoked via old interface [compat] */ -#endif /* _LINUX_KD_H */ -- Gitee From 039bd396a88b93edd0008cab8f0b2e597ec072cc Mon Sep 17 00:00:00 2001 From: He Sheng Date: Fri, 6 May 2022 09:18:15 +0800 Subject: [PATCH 077/674] sw64: remove unused header files Sunway inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I56OLG -------------------------------- These header files have became unused for a long time. Signed-off-by: He Sheng Signed-off-by: Gu Zitao --- arch/sw_64/include/asm/agp.h | 19 ---- arch/sw_64/include/asm/asm-prototypes.h | 1 - arch/sw_64/include/asm/compiler.h | 7 -- arch/sw_64/include/asm/console.h | 11 --- arch/sw_64/include/asm/elf.h | 6 +- arch/sw_64/include/asm/floppy.h | 116 ------------------------ arch/sw_64/include/asm/special_insns.h | 20 ---- arch/sw_64/include/uapi/asm/console.h | 51 ----------- 8 files changed, 2 insertions(+), 229 deletions(-) delete mode 100644 arch/sw_64/include/asm/agp.h delete mode 100644 arch/sw_64/include/asm/compiler.h delete mode 100644 arch/sw_64/include/asm/console.h delete mode 100644 arch/sw_64/include/asm/floppy.h delete mode 100644 arch/sw_64/include/asm/special_insns.h delete mode 100644 arch/sw_64/include/uapi/asm/console.h diff --git a/arch/sw_64/include/asm/agp.h b/arch/sw_64/include/asm/agp.h deleted file mode 100644 index e9d16888910e..000000000000 --- a/arch/sw_64/include/asm/agp.h +++ /dev/null @@ -1,19 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_SW64_AGP_H -#define _ASM_SW64_AGP_H 1 - -#include - -/* dummy for now */ - -#define map_page_into_agp(page) -#define unmap_page_from_agp(page) -#define flush_agp_cache() mb() - -/* GATT allocation. Returns/accepts GATT kernel virtual address. */ -#define alloc_gatt_pages(order) \ - ((char *)__get_free_pages(GFP_KERNEL, (order))) -#define free_gatt_pages(table, order) \ - free_pages((unsigned long)(table), (order)) - -#endif diff --git a/arch/sw_64/include/asm/asm-prototypes.h b/arch/sw_64/include/asm/asm-prototypes.h index 21f4f494d74d..15bad8ef6883 100644 --- a/arch/sw_64/include/asm/asm-prototypes.h +++ b/arch/sw_64/include/asm/asm-prototypes.h @@ -4,7 +4,6 @@ #include #include -#include #include #include #include diff --git a/arch/sw_64/include/asm/compiler.h b/arch/sw_64/include/asm/compiler.h deleted file mode 100644 index 9a80aa6a0ba8..000000000000 --- a/arch/sw_64/include/asm/compiler.h +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_SW64_COMPILER_H -#define _ASM_SW64_COMPILER_H - -#include - -#endif /* _ASM_SW64_COMPILER_H */ diff --git a/arch/sw_64/include/asm/console.h b/arch/sw_64/include/asm/console.h deleted file mode 100644 index 0c01cb740bce..000000000000 --- a/arch/sw_64/include/asm/console.h +++ /dev/null @@ -1,11 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_SW64_CONSOLE_H -#define _ASM_SW64_CONSOLE_H - -#include -#ifndef __ASSEMBLY__ -struct crb_struct; -extern int callback_init_done; -extern void callback_init(void); -#endif /* __ASSEMBLY__ */ -#endif /* _ASM_SW64_CONSOLE_H */ diff --git a/arch/sw_64/include/asm/elf.h b/arch/sw_64/include/asm/elf.h index 150629b0b615..abecc5653eba 100644 --- a/arch/sw_64/include/asm/elf.h +++ b/arch/sw_64/include/asm/elf.h @@ -3,7 +3,6 @@ #define _ASM_SW64_ELF_H #ifdef __KERNEL__ #include -#include #endif /* Special values for the st_other field in the symbol table. */ @@ -141,11 +140,10 @@ extern int dump_elf_task_fp(elf_fpreg_t *dest, struct task_struct *task); /* * This yields a mask that user programs can use to figure out what - * instruction set this CPU supports. This is trivial on SW-64, - * but not so on other machines. + * instruction set this CPU supports. */ -#define ELF_HWCAP (~amask(-1)) +#define ELF_HWCAP 0 /* * This yields a string that ld.so will use to load implementation diff --git a/arch/sw_64/include/asm/floppy.h b/arch/sw_64/include/asm/floppy.h deleted file mode 100644 index f4646d99d80c..000000000000 --- a/arch/sw_64/include/asm/floppy.h +++ /dev/null @@ -1,116 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Architecture specific parts of the Floppy driver - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (C) 1995 - */ -#ifndef _ASM_SW64_FLOPPY_H -#define _ASM_SW64_FLOPPY_H - -#define fd_inb(port) inb_p(port) -#define fd_outb(value, port) outb_p(value, port) - -#define fd_enable_dma() enable_dma(FLOPPY_DMA) -#define fd_disable_dma() disable_dma(FLOPPY_DMA) -#define fd_request_dma() request_dma(FLOPPY_DMA, "floppy") -#define fd_free_dma() free_dma(FLOPPY_DMA) -#define fd_clear_dma_ff() clear_dma_ff(FLOPPY_DMA) -#define fd_set_dma_mode(mode) set_dma_mode(FLOPPY_DMA, mode) -#define fd_set_dma_addr(addr) set_dma_addr(FLOPPY_DMA, virt_to_bus(addr)) -#define fd_set_dma_count(count) set_dma_count(FLOPPY_DMA, count) -#define fd_enable_irq() enable_irq(FLOPPY_IRQ) -#define fd_disable_irq() disable_irq(FLOPPY_IRQ) -#define fd_cacheflush(addr, size) /* nothing */ -#define fd_request_irq() \ - request_irq(FLOPPY_IRQ, floppy_interrupt, 0, "floppy", NULL) -#define fd_free_irq() free_irq(FLOPPY_IRQ, NULL) - -#ifdef CONFIG_PCI - -#include - -#define fd_dma_setup(addr, size, mode, io) \ - sw64_fd_dma_setup(addr, size, mode, io) - -static inline int -sw64_fd_dma_setup(char *addr, unsigned long size, int mode, int io) -{ - static unsigned long prev_size; - static dma_addr_t bus_addr; - static char *prev_addr; - static int prev_dir; - int dir; - - dir = (mode != DMA_MODE_READ) ? PCI_DMA_FROMDEVICE : PCI_DMA_TODEVICE; - - if (bus_addr - && (addr != prev_addr || size != prev_size || dir != prev_dir)) { - /* different from last time -- unmap prev */ - bus_addr = 0; - } - - if (!bus_addr) /* need to map it */ - bus_addr = virt_to_bus(addr); - - /* remember this one as prev */ - prev_addr = addr; - prev_size = size; - prev_dir = dir; - - fd_clear_dma_ff(); - fd_cacheflush(addr, size); - fd_set_dma_mode(mode); - set_dma_addr(FLOPPY_DMA, bus_addr); - fd_set_dma_count(size); - virtual_dma_port = io; - fd_enable_dma(); - - return 0; -} - -#endif /* CONFIG_PCI */ - -inline void virtual_dma_init(void) -{ - /* Nothing to do on an sw64 */ -} - -static int FDC1 = 0x3f0; -static int FDC2 = -1; - -/* - * Again, the CMOS information doesn't work on the sw64.. - */ -#define FLOPPY0_TYPE 6 -#define FLOPPY1_TYPE 0 - -#define N_FDC 2 -#define N_DRIVE 8 - -/* - * Most sw64s have no problems with floppy DMA crossing 64k borders, - * except for certain ones, like XL and RUFFIAN. - * - * However, the test is simple and fast, and this *is* floppy, after all, - * so we do it for all platforms, just to make sure. - * - * This is advantageous in other circumstances as well, as in moving - * about the PCI DMA windows and forcing the floppy to start doing - * scatter-gather when it never had before, and there *is* a problem - * on that platform... ;-} - */ - -static inline unsigned long CROSS_64KB(void *a, unsigned long s) -{ - unsigned long p = (unsigned long)a; - - return ((p + s - 1) ^ p) & ~0xffffUL; -} - -#define EXTRA_FLOPPY_PARAMS - -#endif /* __ASM_SW64_FLOPPY_H */ diff --git a/arch/sw_64/include/asm/special_insns.h b/arch/sw_64/include/asm/special_insns.h deleted file mode 100644 index 7f5a52b20444..000000000000 --- a/arch/sw_64/include/asm/special_insns.h +++ /dev/null @@ -1,20 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_SW64_SPECIAL_INSNS_H -#define _ASM_SW64_SPECIAL_INSNS_H - -enum amask_enum { - AMASK_BWX = (1UL << 0), - AMASK_FIX = (1UL << 1), - AMASK_CIX = (1UL << 2), - AMASK_MAX = (1UL << 8), - AMASK_PRECISE_TRAP = (1UL << 9), -}; - -#define amask(mask) \ -({ \ - unsigned long __amask, __input = (mask); \ - __asm__ ("mov %1, %0" : "=r"(__amask) : "rI"(__input)); \ - __amask; \ -}) - -#endif /* _ASM_SW64_SPECIAL_INSNS_H */ diff --git a/arch/sw_64/include/uapi/asm/console.h b/arch/sw_64/include/uapi/asm/console.h deleted file mode 100644 index a40cd7aeb31f..000000000000 --- a/arch/sw_64/include/uapi/asm/console.h +++ /dev/null @@ -1,51 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _UAPI_ASM_SW64_CONSOLE_H -#define _UAPI_ASM_SW64_CONSOLE_H - -/* - * Console callback routine numbers - */ -#define CCB_GETC 0x01 -#define CCB_PUTS 0x02 -#define CCB_RESET_TERM 0x03 -#define CCB_SET_TERM_INT 0x04 -#define CCB_SET_TERM_CTL 0x05 -#define CCB_PROCESS_KEYCODE 0x06 -#define CCB_OPEN_CONSOLE 0x07 -#define CCB_CLOSE_CONSOLE 0x08 - -#define CCB_OPEN 0x10 -#define CCB_CLOSE 0x11 -#define CCB_IOCTL 0x12 -#define CCB_READ 0x13 -#define CCB_WRITE 0x14 - -#define CCB_SET_ENV 0x20 -#define CCB_RESET_ENV 0x21 -#define CCB_GET_ENV 0x22 -#define CCB_SAVE_ENV 0x23 - -#define CCB_PSWITCH 0x30 -#define CCB_BIOS_EMUL 0x32 - -/* - * Environment variable numbers - */ -#define ENV_AUTO_ACTION 0x01 -#define ENV_BOOT_DEV 0x02 -#define ENV_BOOTDEF_DEV 0x03 -#define ENV_BOOTED_DEV 0x04 -#define ENV_BOOT_FILE 0x05 -#define ENV_BOOTED_FILE 0x06 -#define ENV_BOOT_OSFLAGS 0x07 -#define ENV_BOOTED_OSFLAGS 0x08 -#define ENV_BOOT_RESET 0x09 -#define ENV_DUMP_DEV 0x0A -#define ENV_ENABLE_AUDIT 0x0B -#define ENV_LICENSE 0x0C -#define ENV_CHAR_SET 0x0D -#define ENV_LANGUAGE 0x0E -#define ENV_TTY_DEV 0x0F - - -#endif /* _UAPI_ASM_SW64_CONSOLE_H */ -- Gitee From 65d722507cd65bbf42f9c00d92c7016bc92968bf Mon Sep 17 00:00:00 2001 From: He Sheng Date: Fri, 6 May 2022 09:28:36 +0800 Subject: [PATCH 078/674] sw64: uapi: generate some uapi headers from generic ones Sunway inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I56OLG -------------------------------- These uapi headers are proved to be the same with generic version. Herein we remove the arch-specific statfs.h and types.h to end support for old system. Signed-off-by: He Sheng Signed-off-by: Gu Zitao --- arch/sw_64/include/uapi/asm/Kbuild | 1 + arch/sw_64/include/uapi/asm/ipcbuf.h | 7 ---- arch/sw_64/include/uapi/asm/kvm_para.h | 7 ---- arch/sw_64/include/uapi/asm/msgbuf.h | 28 ---------------- arch/sw_64/include/uapi/asm/poll.h | 7 ---- arch/sw_64/include/uapi/asm/posix_types.h | 18 ----------- arch/sw_64/include/uapi/asm/sembuf.h | 23 ------------- arch/sw_64/include/uapi/asm/shmbuf.h | 39 ----------------------- arch/sw_64/include/uapi/asm/statfs.h | 9 ------ arch/sw_64/include/uapi/asm/types.h | 28 ---------------- 10 files changed, 1 insertion(+), 166 deletions(-) delete mode 100644 arch/sw_64/include/uapi/asm/ipcbuf.h delete mode 100644 arch/sw_64/include/uapi/asm/kvm_para.h delete mode 100644 arch/sw_64/include/uapi/asm/msgbuf.h delete mode 100644 arch/sw_64/include/uapi/asm/poll.h delete mode 100644 arch/sw_64/include/uapi/asm/posix_types.h delete mode 100644 arch/sw_64/include/uapi/asm/sembuf.h delete mode 100644 arch/sw_64/include/uapi/asm/shmbuf.h delete mode 100644 arch/sw_64/include/uapi/asm/statfs.h delete mode 100644 arch/sw_64/include/uapi/asm/types.h diff --git a/arch/sw_64/include/uapi/asm/Kbuild b/arch/sw_64/include/uapi/asm/Kbuild index a01bfb9600ec..15700040f138 100644 --- a/arch/sw_64/include/uapi/asm/Kbuild +++ b/arch/sw_64/include/uapi/asm/Kbuild @@ -1,4 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 # UAPI Header export list +generic-y += kvm_para.h generated-y += unistd_64.h diff --git a/arch/sw_64/include/uapi/asm/ipcbuf.h b/arch/sw_64/include/uapi/asm/ipcbuf.h deleted file mode 100644 index 553cdb37052d..000000000000 --- a/arch/sw_64/include/uapi/asm/ipcbuf.h +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _UAPI_ASM_SW64_IPCBUF_H -#define _UAPI_ASM_SW64_IPCBUF_H - -#include - -#endif diff --git a/arch/sw_64/include/uapi/asm/kvm_para.h b/arch/sw_64/include/uapi/asm/kvm_para.h deleted file mode 100644 index 3c0f9fa712ab..000000000000 --- a/arch/sw_64/include/uapi/asm/kvm_para.h +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _UAPI_ASM_SW64_KVM_PARA_H -#define _UAPI_ASM_SW64_KVM_PARA_H - -#include - -#endif diff --git a/arch/sw_64/include/uapi/asm/msgbuf.h b/arch/sw_64/include/uapi/asm/msgbuf.h deleted file mode 100644 index b938df3664a0..000000000000 --- a/arch/sw_64/include/uapi/asm/msgbuf.h +++ /dev/null @@ -1,28 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _UAPI_ASM_SW64_MSGBUF_H -#define _UAPI_ASM_SW64_MSGBUF_H - -/* - * The msqid64_ds structure for sw64 architecture. - * Note extra padding because this structure is passed back and forth - * between kernel and user space. - * - * Pad space is left for: - * - 2 miscellaneous 64-bit values - */ - -struct msqid64_ds { - struct ipc64_perm msg_perm; - long msg_stime; /* last msgsnd time */ - long msg_rtime; /* last msgrcv time */ - long msg_ctime; /* last change time */ - unsigned long msg_cbytes; /* current number of bytes on queue */ - unsigned long msg_qnum; /* number of messages in queue */ - unsigned long msg_qbytes; /* max number of bytes on queue */ - __kernel_pid_t msg_lspid; /* pid of last msgsnd */ - __kernel_pid_t msg_lrpid; /* last receive pid */ - unsigned long __unused1; - unsigned long __unused2; -}; - -#endif /* _UAPI_ASM_SW64_MSGBUF_H */ diff --git a/arch/sw_64/include/uapi/asm/poll.h b/arch/sw_64/include/uapi/asm/poll.h deleted file mode 100644 index 114d0344e377..000000000000 --- a/arch/sw_64/include/uapi/asm/poll.h +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _UAPI_ASM_SW64_POLL_H -#define _UAPI_ASM_SW64_POLL_H - -#include - -#endif diff --git a/arch/sw_64/include/uapi/asm/posix_types.h b/arch/sw_64/include/uapi/asm/posix_types.h deleted file mode 100644 index 182741aaa06e..000000000000 --- a/arch/sw_64/include/uapi/asm/posix_types.h +++ /dev/null @@ -1,18 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _UAPI_ASM_SW64_POSIX_TYPES_H -#define _UAPI_ASM_SW64_POSIX_TYPES_H - -/* - * This file is generally used by user-level software, so you need to - * be a little careful about namespace pollution etc. Also, we cannot - * assume GCC is being used. - */ - -typedef unsigned long __kernel_ino_t; -#define __kernel_ino_t __kernel_ino_t - -typedef unsigned long __kernel_sigset_t; /* at least 32 bits */ - -#include - -#endif /* _UAPI_ASM_SW64_POSIX_TYPES_H */ diff --git a/arch/sw_64/include/uapi/asm/sembuf.h b/arch/sw_64/include/uapi/asm/sembuf.h deleted file mode 100644 index 08b0876e739c..000000000000 --- a/arch/sw_64/include/uapi/asm/sembuf.h +++ /dev/null @@ -1,23 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _UAPI_ASM_SW64_SEMBUF_H -#define _UAPI_ASM_SW64_SEMBUF_H - -/* - * The semid64_ds structure for sw64 architecture. - * Note extra padding because this structure is passed back and forth - * between kernel and user space. - * - * Pad space is left for: - * - 2 miscellaneous 64-bit values - */ - -struct semid64_ds { - struct ipc64_perm sem_perm; /* permissions .. see ipc.h */ - long sem_otime; /* last semop time */ - long sem_ctime; /* last change time */ - unsigned long sem_nsems; /* no. of semaphores in array */ - unsigned long __unused1; - unsigned long __unused2; -}; - -#endif /* _UAPI_ASM_SW64_SEMBUF_H */ diff --git a/arch/sw_64/include/uapi/asm/shmbuf.h b/arch/sw_64/include/uapi/asm/shmbuf.h deleted file mode 100644 index 4572337bee02..000000000000 --- a/arch/sw_64/include/uapi/asm/shmbuf.h +++ /dev/null @@ -1,39 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _UAPI_ASM_SW64_SHMBUF_H -#define _UAPI_ASM_SW64_SHMBUF_H - -/* - * The shmid64_ds structure for sw64 architecture. - * Note extra padding because this structure is passed back and forth - * between kernel and user space. - * - * Pad space is left for: - * - 2 miscellaneous 64-bit values - */ - -struct shmid64_ds { - struct ipc64_perm shm_perm; /* operation perms */ - size_t shm_segsz; /* size of segment (bytes) */ - long shm_atime; /* last attach time */ - long shm_dtime; /* last detach time */ - long shm_ctime; /* last change time */ - __kernel_pid_t shm_cpid; /* pid of creator */ - __kernel_pid_t shm_lpid; /* pid of last operator */ - unsigned long shm_nattch; /* no. of current attaches */ - unsigned long __unused1; - unsigned long __unused2; -}; - -struct shminfo64 { - unsigned long shmmax; - unsigned long shmmin; - unsigned long shmmni; - unsigned long shmseg; - unsigned long shmall; - unsigned long __unused1; - unsigned long __unused2; - unsigned long __unused3; - unsigned long __unused4; -}; - -#endif /* _UAPI_ASM_SW64_SHMBUF_H */ diff --git a/arch/sw_64/include/uapi/asm/statfs.h b/arch/sw_64/include/uapi/asm/statfs.h deleted file mode 100644 index b92d719238d1..000000000000 --- a/arch/sw_64/include/uapi/asm/statfs.h +++ /dev/null @@ -1,9 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _UAPI_ASM_SW64_STATFS_H -#define _UAPI_ASM_SW64_STATFS_H - -#include - -#include - -#endif diff --git a/arch/sw_64/include/uapi/asm/types.h b/arch/sw_64/include/uapi/asm/types.h deleted file mode 100644 index 750b5181c3de..000000000000 --- a/arch/sw_64/include/uapi/asm/types.h +++ /dev/null @@ -1,28 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _UAPI_ASM_SW64_TYPES_H -#define _UAPI_ASM_SW64_TYPES_H - -/* - * This file is never included by application software unless - * explicitly requested (e.g., via linux/types.h) in which case the - * application is Linux specific so (user-) name space pollution is - * not a major issue. However, for interoperability, libraries still - * need to be careful to avoid a name clashes. - */ - -/* - * This is here because we used to use l64 for sw64 and we don't want - * to impact user mode with our change to ll64 in the kernel. - * - * However, some user programs are fine with this. They can - * flag __SANE_USERSPACE_TYPES__ to get int-ll64.h here. - */ -#ifndef __KERNEL__ -#ifndef __SANE_USERSPACE_TYPES__ -#include -#else -#include -#endif /* __SANE_USERSPACE_TYPES__ */ -#endif /* __KERNEL__ */ - -#endif /* _UAPI_ASM_SW64_TYPES_H */ -- Gitee From e2c89aaa6b99586c72d55024e4c42c0a331864f6 Mon Sep 17 00:00:00 2001 From: He Sheng Date: Fri, 6 May 2022 09:30:28 +0800 Subject: [PATCH 079/674] sw64: uapi: include generic param.h Sunway inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I56OLG -------------------------------- Most macro definitions are the same with generic version. Signed-off-by: He Sheng Signed-off-by: Gu Zitao --- arch/sw_64/include/uapi/asm/param.h | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/arch/sw_64/include/uapi/asm/param.h b/arch/sw_64/include/uapi/asm/param.h index 16c4934c937e..d38e8202dd97 100644 --- a/arch/sw_64/include/uapi/asm/param.h +++ b/arch/sw_64/include/uapi/asm/param.h @@ -2,15 +2,8 @@ #ifndef _UAPI_ASM_SW64_PARAM_H #define _UAPI_ASM_SW64_PARAM_H -#define HZ 100 - #define EXEC_PAGESIZE 8192 -#ifndef NOGROUP -#define NOGROUP (-1) -#endif - -#define MAXHOSTNAMELEN 64 /* max length of hostname */ - +#include #endif /* _UAPI_ASM_SW64_PARAM_H */ -- Gitee From 92f19473b28196492cbfd8153f8d48f2297a562a Mon Sep 17 00:00:00 2001 From: He Sheng Date: Fri, 6 May 2022 09:57:27 +0800 Subject: [PATCH 080/674] sw64: kapi: remove unused header-y from Kbuild Sunway inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I56OLG -------------------------------- Signed-off-by: He Sheng Signed-off-by: Gu Zitao --- arch/sw_64/include/asm/Kbuild | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/arch/sw_64/include/asm/Kbuild b/arch/sw_64/include/asm/Kbuild index e276ba366e68..a94a978edf3e 100644 --- a/arch/sw_64/include/asm/Kbuild +++ b/arch/sw_64/include/asm/Kbuild @@ -1,23 +1,12 @@ # SPDX-License-Identifier: GPL-2.0 -header-y += compiler.h -header-y += console.h -header-y += fpu.h -header-y += gentrap.h -header-y += hmcall.h -header-y += reg.h -header-y += regdef.h -header-y += sysinfo.h -header-y += page.h -header-y += elf.h -generated-y += syscall_table.h +generic-y += clkdev.h generic-y += export.h generic-y += kvm_types.h -generic-y += rwsem.h - +generic-y += mcs_spinlock.h generic-y += qrwlock.h generic-y += qspinlock.h -generic-y += mcs_spinlock.h -generic-y += clkdev.h +generic-y += rwsem.h generic-y += scatterlist.h generic-y += user.h +generated-y += syscall_table.h -- Gitee From 927e8e16d604582a095d7c0ae9714f8050f0205d Mon Sep 17 00:00:00 2001 From: He Sheng Date: Fri, 6 May 2022 10:00:14 +0800 Subject: [PATCH 081/674] sw64: kapi: generate some kapi headers from generic ones Sunway inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I56OLG -------------------------------- These kapi headers are proved to be the same with generic version. Signed-off-by: He Sheng Signed-off-by: Gu Zitao --- arch/sw_64/include/asm/Kbuild | 7 +- arch/sw_64/include/asm/div64.h | 7 -- arch/sw_64/include/asm/emergency-restart.h | 7 -- arch/sw_64/include/asm/exec.h | 7 -- arch/sw_64/include/asm/irq_regs.h | 7 -- arch/sw_64/include/asm/kmap_types.h | 15 --- arch/sw_64/include/asm/local.h | 125 --------------------- arch/sw_64/include/asm/local64.h | 7 -- arch/sw_64/include/asm/param.h | 11 -- arch/sw_64/include/asm/parport.h | 19 ---- arch/sw_64/include/asm/preempt.h | 7 -- arch/sw_64/include/asm/seccomp.h | 15 --- arch/sw_64/include/asm/sections.h | 8 -- arch/sw_64/include/asm/segment.h | 7 -- arch/sw_64/include/asm/serial.h | 16 --- arch/sw_64/include/asm/shmparam.h | 7 -- arch/sw_64/include/asm/trace_clock.h | 10 -- arch/sw_64/include/asm/types.h | 7 -- arch/sw_64/include/asm/unaligned.h | 12 -- 19 files changed, 6 insertions(+), 295 deletions(-) delete mode 100644 arch/sw_64/include/asm/div64.h delete mode 100644 arch/sw_64/include/asm/emergency-restart.h delete mode 100644 arch/sw_64/include/asm/exec.h delete mode 100644 arch/sw_64/include/asm/irq_regs.h delete mode 100644 arch/sw_64/include/asm/kmap_types.h delete mode 100644 arch/sw_64/include/asm/local.h delete mode 100644 arch/sw_64/include/asm/local64.h delete mode 100644 arch/sw_64/include/asm/param.h delete mode 100644 arch/sw_64/include/asm/parport.h delete mode 100644 arch/sw_64/include/asm/preempt.h delete mode 100644 arch/sw_64/include/asm/seccomp.h delete mode 100644 arch/sw_64/include/asm/sections.h delete mode 100644 arch/sw_64/include/asm/segment.h delete mode 100644 arch/sw_64/include/asm/serial.h delete mode 100644 arch/sw_64/include/asm/shmparam.h delete mode 100644 arch/sw_64/include/asm/trace_clock.h delete mode 100644 arch/sw_64/include/asm/types.h delete mode 100644 arch/sw_64/include/asm/unaligned.h diff --git a/arch/sw_64/include/asm/Kbuild b/arch/sw_64/include/asm/Kbuild index a94a978edf3e..d08f0b08918e 100644 --- a/arch/sw_64/include/asm/Kbuild +++ b/arch/sw_64/include/asm/Kbuild @@ -3,10 +3,15 @@ generic-y += clkdev.h generic-y += export.h generic-y += kvm_types.h +generic-y += local64.h generic-y += mcs_spinlock.h +generic-y += param.h generic-y += qrwlock.h generic-y += qspinlock.h generic-y += rwsem.h -generic-y += scatterlist.h +generic-y += seccomp.h +generic-y += segment.h +generic-y += types.h generic-y += user.h + generated-y += syscall_table.h diff --git a/arch/sw_64/include/asm/div64.h b/arch/sw_64/include/asm/div64.h deleted file mode 100644 index 306581407ba5..000000000000 --- a/arch/sw_64/include/asm/div64.h +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_SW64_DIV64_H -#define _ASM_SW64_DIV64_H - -#include - -#endif diff --git a/arch/sw_64/include/asm/emergency-restart.h b/arch/sw_64/include/asm/emergency-restart.h deleted file mode 100644 index fabb33ebf0eb..000000000000 --- a/arch/sw_64/include/asm/emergency-restart.h +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_SW64_EMERGENCY_RESTART_H -#define _ASM_SW64_EMERGENCY_RESTART_H - -#include - -#endif /* _ASM_SW64_EMERGENCY_RESTART_H */ diff --git a/arch/sw_64/include/asm/exec.h b/arch/sw_64/include/asm/exec.h deleted file mode 100644 index 4a9cb71c5c47..000000000000 --- a/arch/sw_64/include/asm/exec.h +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_SW64_EXEC_H -#define _ASM_SW64_EXEC_H - -#define arch_align_stack(x) (x) - -#endif /* _ASM_SW64_EXEC_H */ diff --git a/arch/sw_64/include/asm/irq_regs.h b/arch/sw_64/include/asm/irq_regs.h deleted file mode 100644 index bba48f36a40f..000000000000 --- a/arch/sw_64/include/asm/irq_regs.h +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_SW64_IRQ_REGS_H -#define _ASM_SW64_IRQ_REGS_H - -#include - -#endif diff --git a/arch/sw_64/include/asm/kmap_types.h b/arch/sw_64/include/asm/kmap_types.h deleted file mode 100644 index 8e86b08dee94..000000000000 --- a/arch/sw_64/include/asm/kmap_types.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_SW64_KMAP_TYPES_H -#define _ASM_SW64_KMAP_TYPES_H - -/* Dummy header just to define km_type. */ - -#ifdef CONFIG_DEBUG_HIGHMEM -#define __WITH_KM_FENCE -#endif - -#include - -#undef __WITH_KM_FENCE - -#endif diff --git a/arch/sw_64/include/asm/local.h b/arch/sw_64/include/asm/local.h deleted file mode 100644 index 9144600f641d..000000000000 --- a/arch/sw_64/include/asm/local.h +++ /dev/null @@ -1,125 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_SW64_LOCAL_H -#define _ASM_SW64_LOCAL_H - -#include -#include - -typedef struct { - atomic_long_t a; -} local_t; - -#define LOCAL_INIT(i) { ATOMIC_LONG_INIT(i) } -#define local_read(l) atomic_long_read(&(l)->a) -#define local_set(l, i) atomic_long_set(&(l)->a, (i)) -#define local_inc(l) atomic_long_inc(&(l)->a) -#define local_dec(l) atomic_long_dec(&(l)->a) -#define local_add(i, l) atomic_long_add((i), (&(l)->a)) -#define local_sub(i, l) atomic_long_sub((i), (&(l)->a)) - -static inline long local_add_return(long i, local_t *l) -{ - long temp1, temp2, result, addr; - - __asm__ __volatile__( -#ifdef CONFIG_LOCK_MEMB - " memb\n" -#endif - " ldi %4, %2\n" - "1: lldl %0, 0(%4)\n" - " ldi %1, 1\n" - " wr_f %1\n" - " addl %0, %5, %3\n" - " addl %0, %5, %0\n" -#ifdef CONFIG_LOCK_FIXUP - " memb\n" -#endif - " lstl %0, 0(%4)\n" - " rd_f %0\n" - " beq %0, 2f\n" - ".subsection 2\n" - "2: br 1b\n" - ".previous" - : "=&r" (temp1), "=&r" (temp2), "=m" (l->a.counter), - "=&r" (result), "=&r" (addr) - : "Ir" (i), "m" (l->a.counter) : "memory"); - return result; -} - -static inline long local_sub_return(long i, local_t *l) -{ - long temp1, temp2, result, addr; - - __asm__ __volatile__( -#ifdef CONFIG_LOCK_MEMB - " memb\n" -#endif - " ldi %4, %2\n" - "1: lldl %0, 0(%4)\n" - " ldi %1, 1\n" - " wr_f %1\n" - " subl %0, %5, %3\n" - " subl %0, %5, %0\n" -#ifdef CONFIG_LOCK_FIXUP - " memb\n" -#endif - " lstl %0, 0(%4)\n" - " rd_f %0\n" - " beq %0, 2f\n" - ".subsection 2\n" - "2: br 1b\n" - ".previous" - : "=&r" (temp1), "=&r" (temp2), "=m" (l->a.counter), - "=&r" (result), "=&r" (addr) - : "Ir" (i), "m" (l->a.counter) : "memory"); - return result; -} - -#define local_cmpxchg(l, o, n) \ - (cmpxchg_local(&((l)->a.counter), (o), (n))) -#define local_xchg(l, n) (xchg_local(&((l)->a.counter), (n))) - -/** - * local_add_unless - add unless the number is a given value - * @l: pointer of type local_t - * @a: the amount to add to l... - * @u: ...unless l is equal to u. - * - * Atomically adds @a to @l, so long as it was not @u. - * Returns non-zero if @l was not @u, and zero otherwise. - */ -#define local_add_unless(l, a, u) \ -({ \ - long c, old; \ - c = local_read(l); \ - for (;;) { \ - if (unlikely(c == (u))) \ - break; \ - old = local_cmpxchg((l), c, c + (a)); \ - if (likely(old == c)) \ - break; \ - c = old; \ - } \ - c != (u); \ -}) -#define local_inc_not_zero(l) local_add_unless((l), 1, 0) - -#define local_add_negative(a, l) (local_add_return((a), (l)) < 0) - -#define local_dec_return(l) local_sub_return(1, (l)) - -#define local_inc_return(l) local_add_return(1, (l)) - -#define local_sub_and_test(i, l) (local_sub_return((i), (l)) == 0) - -#define local_inc_and_test(l) (local_add_return(1, (l)) == 0) - -#define local_dec_and_test(l) (local_sub_return(1, (l)) == 0) - -/* Verify if faster than atomic ops */ -#define __local_inc(l) ((l)->a.counter++) -#define __local_dec(l) ((l)->a.counter++) -#define __local_add(i, l) ((l)->a.counter += (i)) -#define __local_sub(i, l) ((l)->a.counter -= (i)) - -#endif /* _ASM_SW64_LOCAL_H */ diff --git a/arch/sw_64/include/asm/local64.h b/arch/sw_64/include/asm/local64.h deleted file mode 100644 index 4278133cd8fa..000000000000 --- a/arch/sw_64/include/asm/local64.h +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_SW64_LOCAL64_H -#define _ASM_SW64_LOCAL64_H - -#include - -#endif diff --git a/arch/sw_64/include/asm/param.h b/arch/sw_64/include/asm/param.h deleted file mode 100644 index 49c5d03a3370..000000000000 --- a/arch/sw_64/include/asm/param.h +++ /dev/null @@ -1,11 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_SW64_PARAM_H -#define _ASM_SW64_PARAM_H - -#include - -#undef HZ -#define HZ CONFIG_HZ -#define USER_HZ 100 -#define CLOCKS_PER_SEC USER_HZ /* frequency at which times() counts */ -#endif /* _ASM_SW64_PARAM_H */ diff --git a/arch/sw_64/include/asm/parport.h b/arch/sw_64/include/asm/parport.h deleted file mode 100644 index 82b9a219b797..000000000000 --- a/arch/sw_64/include/asm/parport.h +++ /dev/null @@ -1,19 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * parport.h: platform-specific PC-style parport initialisation - * - * Copyright (C) 1999, 2000 Tim Waugh - * - * This file should only be included by drivers/parport/parport_pc.c. - */ - -#ifndef _ASM_SW64_PARPORT_H -#define _ASM_SW64_PARPORT_H - -static int parport_pc_find_isa_ports(int autoirq, int autodma); -static int parport_pc_find_nonpci_ports(int autoirq, int autodma) -{ - return parport_pc_find_isa_ports(autoirq, autodma); -} - -#endif /* !(_ASM_SW64_PARPORT_H) */ diff --git a/arch/sw_64/include/asm/preempt.h b/arch/sw_64/include/asm/preempt.h deleted file mode 100644 index dc6643a43766..000000000000 --- a/arch/sw_64/include/asm/preempt.h +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_SW64_PREEMPT_H -#define _ASM_SW64_PREEMPT_H - -#include - -#endif /* _ASM_SW64_PREEMPT_H */ diff --git a/arch/sw_64/include/asm/seccomp.h b/arch/sw_64/include/asm/seccomp.h deleted file mode 100644 index db2f298862c3..000000000000 --- a/arch/sw_64/include/asm/seccomp.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * arch/sw_64/include/asm/seccomp.h - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ -#ifndef _ASM_SW64_SECCOMP_H -#define _ASM_SW64_SECCOMP_H - -#include -#include - -#endif /* _ASM_SW64_SECCOMP_H */ diff --git a/arch/sw_64/include/asm/sections.h b/arch/sw_64/include/asm/sections.h deleted file mode 100644 index 37dab4fde720..000000000000 --- a/arch/sw_64/include/asm/sections.h +++ /dev/null @@ -1,8 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_SW64_SECTIONS_H -#define _ASM_SW64_SECTIONS_H - -/* nothing to see, move along */ -#include - -#endif diff --git a/arch/sw_64/include/asm/segment.h b/arch/sw_64/include/asm/segment.h deleted file mode 100644 index dc90357765e5..000000000000 --- a/arch/sw_64/include/asm/segment.h +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_SW64_SEGMENT_H -#define _ASM_SW64_SEGMENT_H - -/* Only here because we have some old header files that expect it.. */ - -#endif diff --git a/arch/sw_64/include/asm/serial.h b/arch/sw_64/include/asm/serial.h deleted file mode 100644 index 059e603642b9..000000000000 --- a/arch/sw_64/include/asm/serial.h +++ /dev/null @@ -1,16 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_SW64_SERIAL_H -#define _ASM_SW64_SERIAL_H - -#define BASE_BAUD (1843200 / 16) - -/* Standard COM flags (except for COM4, because of the 8514 problem) */ -#ifdef CONFIG_SERIAL_8250_DETECT_IRQ -#define STD_COM_FLAGS (UPF_BOOT_AUTOCONF | UPF_SKIP_TEST | UPF_AUTO_IRQ) -#define STD_COM4_FLAGS (UPF_BOOT_AUTOCONF | UPF_AUTO_IRQ) -#else -#define STD_COM_FLAGS (UPF_BOOT_AUTOCONF | UPF_SKIP_TEST) -#define STD_COM4_FLAGS UPF_BOOT_AUTOCONF -#endif - -#endif /* _ASM_SW64_SERIAL_H */ diff --git a/arch/sw_64/include/asm/shmparam.h b/arch/sw_64/include/asm/shmparam.h deleted file mode 100644 index 15f71533b1ed..000000000000 --- a/arch/sw_64/include/asm/shmparam.h +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_SW64_SHMPARAM_H -#define _ASM_SW64_SHMPARAM_H - -#define SHMLBA PAGE_SIZE /* attach addr a multiple of this */ - -#endif /* _ASM_SW64_SHMPARAM_H */ diff --git a/arch/sw_64/include/asm/trace_clock.h b/arch/sw_64/include/asm/trace_clock.h deleted file mode 100644 index 57324215a837..000000000000 --- a/arch/sw_64/include/asm/trace_clock.h +++ /dev/null @@ -1,10 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_SW64_TRACE_CLOCK_H -#define _ASM_SW64_TRACE_CLOCK_H - -#include -#include - -#define ARCH_TRACE_CLOCKS - -#endif /* _ASM_SW64_TRACE_CLOCK_H */ diff --git a/arch/sw_64/include/asm/types.h b/arch/sw_64/include/asm/types.h deleted file mode 100644 index 37d626269a02..000000000000 --- a/arch/sw_64/include/asm/types.h +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_SW64_TYPES_H -#define _ASM_SW64_TYPES_H - -#include - -#endif /* _ASM_SW64_TYPES_H */ diff --git a/arch/sw_64/include/asm/unaligned.h b/arch/sw_64/include/asm/unaligned.h deleted file mode 100644 index 91fdff923ce5..000000000000 --- a/arch/sw_64/include/asm/unaligned.h +++ /dev/null @@ -1,12 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_SW64_UNALIGNED_H -#define _ASM_SW64_UNALIGNED_H - -#include -#include -#include - -#define get_unaligned __get_unaligned_le -#define put_unaligned __put_unaligned_le - -#endif /* _ASM_SW64_UNALIGNED_H */ -- Gitee From 6519c5195084b7326aa8a26584bb2e2b4d82f19c Mon Sep 17 00:00:00 2001 From: He Sheng Date: Fri, 6 May 2022 10:05:54 +0800 Subject: [PATCH 082/674] sw64: move ucontext.h to uapi Sunway inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I56OLG -------------------------------- It's an important user interface which should be put in uapi. Signed-off-by: He Sheng Signed-off-by: Gu Zitao --- arch/sw_64/include/{ => uapi}/asm/ucontext.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) rename arch/sw_64/include/{ => uapi}/asm/ucontext.h (56%) diff --git a/arch/sw_64/include/asm/ucontext.h b/arch/sw_64/include/uapi/asm/ucontext.h similarity index 56% rename from arch/sw_64/include/asm/ucontext.h rename to arch/sw_64/include/uapi/asm/ucontext.h index d40eebe988ef..c5d6e24e3e5f 100644 --- a/arch/sw_64/include/asm/ucontext.h +++ b/arch/sw_64/include/uapi/asm/ucontext.h @@ -1,6 +1,6 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_SW64_UCONTEXT_H -#define _ASM_SW64_UCONTEXT_H +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_ASM_SW64_UCONTEXT_H +#define _UAPI_ASM_SW64_UCONTEXT_H struct ucontext { unsigned long uc_flags; @@ -11,4 +11,4 @@ struct ucontext { sigset_t uc_sigmask; /* mask last for extensibility */ }; -#endif /* _ASM_SW64_UCONTEXT_H */ +#endif /* _UAPI_ASM_SW64_UCONTEXT_H */ -- Gitee From dc6f31f83df3a0ff2a9da93ffb61aee3e58ae852 Mon Sep 17 00:00:00 2001 From: He Sheng Date: Fri, 6 May 2022 10:30:09 +0800 Subject: [PATCH 083/674] sw64: kapi: remove redudant SMP_CACHE_BYTES Sunway inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I56OLG -------------------------------- SMP_CACHE_BYTES is also defined in linux/cache.h, so remove the replicated one. Signed-off-by: He Sheng Signed-off-by: Gu Zitao --- arch/sw_64/include/asm/cache.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/sw_64/include/asm/cache.h b/arch/sw_64/include/asm/cache.h index a59a74110884..1dca2e2e04a4 100644 --- a/arch/sw_64/include/asm/cache.h +++ b/arch/sw_64/include/asm/cache.h @@ -5,9 +5,7 @@ #ifndef _ASM_SW64_CACHE_H #define _ASM_SW64_CACHE_H -#define L1_CACHE_BYTES 128 #define L1_CACHE_SHIFT 7 - -#define SMP_CACHE_BYTES L1_CACHE_BYTES +#define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT) #endif -- Gitee From cf28d11eff5d9edbb5330eed2e2bb3e75e4bc066 Mon Sep 17 00:00:00 2001 From: He Sheng Date: Fri, 6 May 2022 10:45:56 +0800 Subject: [PATCH 084/674] sw64: kapi: remove unimplemented IPLs Sunway inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I56OLG -------------------------------- There are only two IPLs so far on sw64. Signed-off-by: He Sheng Signed-off-by: Gu Zitao --- arch/sw_64/include/asm/irqflags.h | 8 -------- 1 file changed, 8 deletions(-) diff --git a/arch/sw_64/include/asm/irqflags.h b/arch/sw_64/include/asm/irqflags.h index 6101b6ad2e99..b4440f25a51d 100644 --- a/arch/sw_64/include/asm/irqflags.h +++ b/arch/sw_64/include/asm/irqflags.h @@ -5,14 +5,6 @@ #include #define IPL_MIN 0 -#define IPL_SW0 1 -#define IPL_SW1 2 -#define IPL_DEV0 3 -#define IPL_DEV1 4 -#define IPL_TIMER 5 -#define IPL_PERF 6 -#define IPL_POWERFAIL 6 -#define IPL_MCHECK 7 #define IPL_MAX 7 #define getipl() (rdps() & 7) -- Gitee From 211af39aab2e322ad14fd7727977946b29fff3ba Mon Sep 17 00:00:00 2001 From: He Sheng Date: Fri, 6 May 2022 11:09:36 +0800 Subject: [PATCH 085/674] sw64: kapi: include generic modules.h Sunway inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I56OLG -------------------------------- This patch removes replicated macros which have been defined in generic version. Signed-off-by: He Sheng Signed-off-by: Gu Zitao --- arch/sw_64/include/asm/module.h | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/arch/sw_64/include/asm/module.h b/arch/sw_64/include/asm/module.h index 55e6e333585f..d1663aab4097 100644 --- a/arch/sw_64/include/asm/module.h +++ b/arch/sw_64/include/asm/module.h @@ -2,18 +2,12 @@ #ifndef _ASM_SW64_MODULE_H #define _ASM_SW64_MODULE_H +#include + struct mod_arch_specific { unsigned int gotsecindex; }; -#define Elf_Sym Elf64_Sym -#define Elf_Shdr Elf64_Shdr -#define Elf_Ehdr Elf64_Ehdr -#define Elf_Phdr Elf64_Phdr -#define Elf_Dyn Elf64_Dyn -#define Elf_Rel Elf64_Rel -#define Elf_Rela Elf64_Rela - #define ARCH_SHF_SMALL SHF_SW64_GPREL #ifdef MODULE -- Gitee From fda97bdcd80c5dbced89f64ad340c18987fa39fb Mon Sep 17 00:00:00 2001 From: He Sheng Date: Fri, 6 May 2022 14:15:52 +0800 Subject: [PATCH 086/674] sw64: remove VGA_HOSE things Sunway inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I56OLG -------------------------------- SW64 has nothing to do with CONFIG_VGA_HOSE. Signed-off-by: He Sheng Signed-off-by: Gu Zitao --- arch/sw_64/Kconfig | 8 -------- arch/sw_64/include/asm/vga.h | 31 ------------------------------- 2 files changed, 39 deletions(-) diff --git a/arch/sw_64/Kconfig b/arch/sw_64/Kconfig index 472a916fd93e..8c5e87cb9b84 100644 --- a/arch/sw_64/Kconfig +++ b/arch/sw_64/Kconfig @@ -896,12 +896,4 @@ source "drivers/idle/Kconfig" endmenu -# DUMMY_CONSOLE may be defined in drivers/video/console/Kconfig -# but we also need it if VGA_HOSE is set -config DUMMY_CONSOLE - bool - depends on VGA_HOSE - default y - - source "arch/sw_64/kvm/Kconfig" diff --git a/arch/sw_64/include/asm/vga.h b/arch/sw_64/include/asm/vga.h index 28adb8b8b7f1..7268b5cd4bd8 100644 --- a/arch/sw_64/include/asm/vga.h +++ b/arch/sw_64/include/asm/vga.h @@ -49,37 +49,6 @@ extern void scr_memcpyw(u16 *d, const u16 *s, unsigned int count); #define vga_readb(a) readb((u8 __iomem *)(a)) #define vga_writeb(v, a) writeb(v, (u8 __iomem *)(a)) -#ifdef CONFIG_VGA_HOSE -#include -#include - -extern struct pci_controller *pci_vga_hose; - -#define __is_port_vga(a) \ - (((a) >= 0x3b0) && ((a) < 0x3e0) && \ - ((a) != 0x3b3) && ((a) != 0x3d3)) - -#define __is_mem_vga(a) \ - (((a) >= 0xa0000) && ((a) <= 0xc0000)) - -#define FIXUP_IOADDR_VGA(a) do { \ - if (pci_vga_hose && __is_port_vga(a)) \ - (a) += pci_vga_hose->io_space->start; \ -} while (0) - -#define FIXUP_MEMADDR_VGA(a) do { \ - if (pci_vga_hose && __is_mem_vga(a)) \ - (a) += pci_vga_hose->mem_space->start; \ -} while (0) - -#else /* CONFIG_VGA_HOSE */ -#define pci_vga_hose 0 -#define __is_port_vga(a) 0 -#define __is_mem_vga(a) 0 -#define FIXUP_IOADDR_VGA(a) -#define FIXUP_MEMADDR_VGA(a) -#endif /* CONFIG_VGA_HOSE */ - #define VGA_MAP_MEM(x, s) ((unsigned long)ioremap(x, s)) #endif -- Gitee From cfc38dc4ccccb5fd218eeb560bde21a42f381b91 Mon Sep 17 00:00:00 2001 From: He Sheng Date: Sat, 7 May 2022 15:00:06 +0800 Subject: [PATCH 087/674] sw64: clean up unused pci iounmap operation Sunway inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I56OLG -------------------------------- For sw64, IO address is mapped by direct mapping without pages. It doesn't have to do anything to unmap PCI MMIO. The result is that __is_mmio helper can be removed. Signed-off-by: He Sheng Signed-off-by: Gu Zitao --- arch/sw_64/include/asm/io.h | 8 -------- arch/sw_64/kernel/pci.c | 4 ---- 2 files changed, 12 deletions(-) diff --git a/arch/sw_64/include/asm/io.h b/arch/sw_64/include/asm/io.h index 6796c64f94ae..cb6c7c1f4a6b 100644 --- a/arch/sw_64/include/asm/io.h +++ b/arch/sw_64/include/asm/io.h @@ -218,14 +218,6 @@ static inline int __is_ioaddr(unsigned long addr) #define __is_ioaddr(a) __is_ioaddr((unsigned long)(a)) -static inline int __is_mmio(const volatile void __iomem *xaddr) -{ - unsigned long addr = (unsigned long)xaddr; - - return (addr & 0x100000000UL) == 0; -} - - #define ioread16be(p) be16_to_cpu(ioread16(p)) #define ioread32be(p) be32_to_cpu(ioread32(p)) diff --git a/arch/sw_64/kernel/pci.c b/arch/sw_64/kernel/pci.c index 44264e3da18f..77bbd1b68176 100644 --- a/arch/sw_64/kernel/pci.c +++ b/arch/sw_64/kernel/pci.c @@ -358,12 +358,8 @@ asmlinkage long sys_pciconfig_iobase(long which, unsigned long bus, unsigned lon return -EOPNOTSUPP; } -/* Destroy an __iomem token. Not copied from lib/iomap.c. */ - void pci_iounmap(struct pci_dev *dev, void __iomem *addr) { - if (__is_mmio(addr)) - iounmap(addr); } EXPORT_SYMBOL(pci_iounmap); -- Gitee From 751e450b08b35f491121fa7f977f1234d1366a12 Mon Sep 17 00:00:00 2001 From: He Sheng Date: Tue, 10 May 2022 13:50:33 +0800 Subject: [PATCH 088/674] sw64: kapi: use generic vga.h Sunway inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I56OLG -------------------------------- The old arch-specific vga.h takes no effect now, so use the generic version. As a result, the arch-specific scr_memcpyw interface and __is_ioaddr helper can be removed too. Signed-off-by: He Sheng Signed-off-by: Gu Zitao --- arch/sw_64/include/asm/io.h | 8 ------ arch/sw_64/include/asm/vga.h | 54 ------------------------------------ arch/sw_64/lib/iomap.c | 37 ------------------------ 3 files changed, 99 deletions(-) delete mode 100644 arch/sw_64/include/asm/vga.h diff --git a/arch/sw_64/include/asm/io.h b/arch/sw_64/include/asm/io.h index cb6c7c1f4a6b..e8e80c761467 100644 --- a/arch/sw_64/include/asm/io.h +++ b/arch/sw_64/include/asm/io.h @@ -211,14 +211,6 @@ static inline void __iounmap(volatile void __iomem *addr) #define iounmap __iounmap -static inline int __is_ioaddr(unsigned long addr) -{ - return addr >= (PAGE_OFFSET | IO_BASE); -} - -#define __is_ioaddr(a) __is_ioaddr((unsigned long)(a)) - - #define ioread16be(p) be16_to_cpu(ioread16(p)) #define ioread32be(p) be32_to_cpu(ioread32(p)) #define iowrite16be(v, p) iowrite16(cpu_to_be16(v), (p)) diff --git a/arch/sw_64/include/asm/vga.h b/arch/sw_64/include/asm/vga.h deleted file mode 100644 index 7268b5cd4bd8..000000000000 --- a/arch/sw_64/include/asm/vga.h +++ /dev/null @@ -1,54 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Access to VGA videoram - * - * (c) 1998 Martin Mares - */ - -#ifndef _ASM_SW64_VGA_H -#define _ASM_SW64_VGA_H - -#include - -#define VT_BUF_HAVE_RW -#define VT_BUF_HAVE_MEMSETW -#define VT_BUF_HAVE_MEMCPYW - -static inline void scr_writew(u16 val, volatile u16 *addr) -{ - if (__is_ioaddr(addr)) - __raw_writew(val, (volatile u16 __iomem *) addr); - else - *addr = val; -} - -static inline u16 scr_readw(volatile const u16 *addr) -{ - if (__is_ioaddr(addr)) - return __raw_readw((volatile const u16 __iomem *) addr); - else - return *addr; -} - -static inline void scr_memsetw(u16 *s, u16 c, unsigned int count) -{ - if (__is_ioaddr(s)) - memsetw_io((u16 __iomem *) s, c, count); - else - memsetw(s, c, count); -} - -/* Do not trust that the usage will be correct; analyze the arguments. */ -extern void scr_memcpyw(u16 *d, const u16 *s, unsigned int count); - -/* - * ??? These are currently only used for downloading character sets. As - * such, they don't need memory barriers. Is this all they are intended - * to be used for? - */ -#define vga_readb(a) readb((u8 __iomem *)(a)) -#define vga_writeb(v, a) writeb(v, (u8 __iomem *)(a)) - -#define VGA_MAP_MEM(x, s) ((unsigned long)ioremap(x, s)) - -#endif diff --git a/arch/sw_64/lib/iomap.c b/arch/sw_64/lib/iomap.c index 39e3d5498ae6..6fb96a7d7119 100644 --- a/arch/sw_64/lib/iomap.c +++ b/arch/sw_64/lib/iomap.c @@ -457,43 +457,6 @@ void _memset_c_io(volatile void __iomem *to, unsigned long c, long count) } EXPORT_SYMBOL(_memset_c_io); -/* - * A version of memcpy used by the vga console routines to move data around - * arbitrarily between screen and main memory. - */ - -void -scr_memcpyw(u16 *d, const u16 *s, unsigned int count) -{ - const u16 __iomem *ios = (const u16 __iomem *) s; - u16 __iomem *iod = (u16 __iomem *) d; - int s_isio = __is_ioaddr(s); - int d_isio = __is_ioaddr(d); - u16 tmp; - - if (s_isio) { - if (d_isio) { - /* - * FIXME: Should handle unaligned ops and - * operation widening. - */ - - count /= 2; - while (count--) { - tmp = __raw_readw(ios++); - __raw_writew(tmp, iod++); - } - } else - memcpy_fromio(d, ios, count); - } else { - if (d_isio) - memcpy_toio(iod, s, count); - else - memcpy(d, s, count); - } -} -EXPORT_SYMBOL(scr_memcpyw); - void __iomem *ioport_map(unsigned long port, unsigned int size) { return ioportmap(port); -- Gitee From b5601cff08accb45befdd523b35337c96550c547 Mon Sep 17 00:00:00 2001 From: He Sheng Date: Tue, 10 May 2022 14:21:21 +0800 Subject: [PATCH 089/674] sw64: remove unused IO_CONCAT Sunway inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I56OLG -------------------------------- IO interface of chipset is wrapped as a hook in sw64_platform_ops. Signed-off-by: He Sheng Signed-off-by: Gu Zitao --- arch/sw_64/include/asm/io.h | 7 ------- 1 file changed, 7 deletions(-) diff --git a/arch/sw_64/include/asm/io.h b/arch/sw_64/include/asm/io.h index e8e80c761467..3138665133e1 100644 --- a/arch/sw_64/include/asm/io.h +++ b/arch/sw_64/include/asm/io.h @@ -64,13 +64,6 @@ static inline void * __deprecated bus_to_virt(unsigned long address) } #define isa_bus_to_virt bus_to_virt -/* - * There are different chipsets to interface the sw64 CPUs to the world. - */ - -#define IO_CONCAT(a, b) _IO_CONCAT(a, b) -#define _IO_CONCAT(a, b) a ## _ ## b - #include /* -- Gitee From a75d3cc4a8a08692bdfd9037c0ea97bbd8cdcd9c Mon Sep 17 00:00:00 2001 From: He Sheng Date: Tue, 10 May 2022 15:34:03 +0800 Subject: [PATCH 090/674] sw64: do not include sw64io.h in io.h Sunway inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I56OLG -------------------------------- We consider io.h as a basic header which should be decoupled with platforms and chipsets, so it's better to remove sw64io.h from io.h, and do something necessary to make it build ok. Signed-off-by: He Sheng Signed-off-by: Gu Zitao --- arch/sw_64/chip/chip3/i2c-lib.c | 2 ++ arch/sw_64/include/asm/io.h | 10 ---------- arch/sw_64/include/asm/irq_impl.h | 2 ++ arch/sw_64/include/asm/sw64_init.h | 1 + arch/sw_64/kernel/pci_impl.h | 2 ++ arch/sw_64/kernel/proto.h | 1 + arch/sw_64/lib/iomap.c | 3 ++- 7 files changed, 10 insertions(+), 11 deletions(-) diff --git a/arch/sw_64/chip/chip3/i2c-lib.c b/arch/sw_64/chip/chip3/i2c-lib.c index ddf0a187ab5a..39b815022e69 100644 --- a/arch/sw_64/chip/chip3/i2c-lib.c +++ b/arch/sw_64/chip/chip3/i2c-lib.c @@ -19,6 +19,8 @@ #include #include +#include + #define CPLD_BUSNR 2 #ifndef _I2C_DEBUG_FLAG_ diff --git a/arch/sw_64/include/asm/io.h b/arch/sw_64/include/asm/io.h index 3138665133e1..fc11cb82f5b5 100644 --- a/arch/sw_64/include/asm/io.h +++ b/arch/sw_64/include/asm/io.h @@ -64,8 +64,6 @@ static inline void * __deprecated bus_to_virt(unsigned long address) } #define isa_bus_to_virt bus_to_virt -#include - /* * Generic IO read/write. These perform native-endian accesses. */ @@ -177,14 +175,6 @@ extern void outb(u8 b, unsigned long port); extern void outw(u16 b, unsigned long port); extern void outl(u32 b, unsigned long port); -/* - * Mapping from port numbers to __iomem space is pretty easy. - */ -static inline void __iomem *ioportmap(unsigned long addr) -{ - return sw64_platform->ioportmap(addr); -} - static inline void __iomem *__ioremap(phys_addr_t addr, size_t size, pgprot_t prot) { diff --git a/arch/sw_64/include/asm/irq_impl.h b/arch/sw_64/include/asm/irq_impl.h index 713267077142..3679793d8b65 100644 --- a/arch/sw_64/include/asm/irq_impl.h +++ b/arch/sw_64/include/asm/irq_impl.h @@ -11,6 +11,8 @@ #include #include +#include + #define SW64_PCIE0_INT_BASE 17 #define SW64_PCIE0_MSI_BASE 21 diff --git a/arch/sw_64/include/asm/sw64_init.h b/arch/sw_64/include/asm/sw64_init.h index 15842d22e5ba..2d9140605d0b 100644 --- a/arch/sw_64/include/asm/sw64_init.h +++ b/arch/sw_64/include/asm/sw64_init.h @@ -5,6 +5,7 @@ #include #include +#include struct sw64_early_init_ops { void (*setup_core_start)(struct cpumask *cpumask); diff --git a/arch/sw_64/kernel/pci_impl.h b/arch/sw_64/kernel/pci_impl.h index 8e541f28f4ce..6025145cb1c5 100644 --- a/arch/sw_64/kernel/pci_impl.h +++ b/arch/sw_64/kernel/pci_impl.h @@ -6,6 +6,8 @@ #ifndef _SW64_KERNEL_PCI_IMPL_H #define _SW64_KERNEL_PCI_IMPL_H +#include + struct pci_dev; struct pci_controller; diff --git a/arch/sw_64/kernel/proto.h b/arch/sw_64/kernel/proto.h index 1a729a8f21c3..9c99baaaf1d0 100644 --- a/arch/sw_64/kernel/proto.h +++ b/arch/sw_64/kernel/proto.h @@ -5,6 +5,7 @@ #include #include #include +#include /* ptrace.c */ extern int ptrace_set_bpt(struct task_struct *child); diff --git a/arch/sw_64/lib/iomap.c b/arch/sw_64/lib/iomap.c index 6fb96a7d7119..3a8d879ef070 100644 --- a/arch/sw_64/lib/iomap.c +++ b/arch/sw_64/lib/iomap.c @@ -6,6 +6,7 @@ #include #include +#include /* * Here comes the sw64 implementation of the IOMAP interfaces. @@ -459,7 +460,7 @@ EXPORT_SYMBOL(_memset_c_io); void __iomem *ioport_map(unsigned long port, unsigned int size) { - return ioportmap(port); + return sw64_platform->ioportmap(port); } EXPORT_SYMBOL(ioport_map); -- Gitee From 5e50a69788ed5bb9a512b098a3e1c7620c705e47 Mon Sep 17 00:00:00 2001 From: Zhou Qihang Date: Wed, 11 May 2022 10:53:06 +0800 Subject: [PATCH 091/674] sw64: iommu: work around iova mapping on pci bars Sunway inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5G5T3 -------------------------------- Since the RCs of SW64 chipset do not support Peer-to-Peer DMA, we used to return -EINVAL error on iova mampping on PCI BARs. However, it will cause driver loading errors when QEMU calls vfio module to passthrough PCI devices with this kind of BARs, shown as follows: > qemu-system-sw64: VFIO_MAP_DMA: -22 > qemu-system-sw64: vfio_dma_map(0x121c28020, 0x8800c0000000, 0x40000, 0x4165f21b2000) = -22 (Invalid argument) > qemu: hardware error: vfio: DMA mapping failed, unable to continue To fix this problem, this patch will return success on the iova mapping of PCI BARs that pretends to support this feature. Signed-off-by: Zhou Qihang Signed-off-by: Gu Zitao --- drivers/iommu/sw64/sunway_iommu.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/sw64/sunway_iommu.c b/drivers/iommu/sw64/sunway_iommu.c index 8b851e0a0c20..5797adf8fbc5 100644 --- a/drivers/iommu/sw64/sunway_iommu.c +++ b/drivers/iommu/sw64/sunway_iommu.c @@ -1509,6 +1509,9 @@ sunway_iommu_iova_to_phys(struct iommu_domain *dom, dma_addr_t iova) struct sunway_iommu_domain *sdomain = to_sunway_domain(dom); unsigned long paddr, grn; + if (iova > SW64_BAR_ADDRESS) + return iova; + paddr = fetch_pte(sdomain, iova, PTE_LEVEL2_VAL); if ((paddr & SW64_IOMMU_ENTRY_VALID) == 0) @@ -1544,7 +1547,7 @@ sunway_iommu_map(struct iommu_domain *dom, unsigned long iova, * to avoid VFIO trying to map pci config space. */ if (iova > SW64_BAR_ADDRESS) - return -EINVAL; + return 0; mutex_lock(&sdomain->api_lock); ret = sunway_iommu_map_page(sdomain, iova, paddr, page_size); -- Gitee From fb2723595a7bee3a69596b60ce8038d5cee85384 Mon Sep 17 00:00:00 2001 From: Zhao Yihan Date: Wed, 11 May 2022 15:13:59 +0800 Subject: [PATCH 092/674] sw64: map address by OR operation in __va() Sunway inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I56OLG -------------------------------- Signed-off-by: Zhao Yihan Signed-off-by: Gu Zitao --- arch/sw_64/include/asm/page.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/sw_64/include/asm/page.h b/arch/sw_64/include/asm/page.h index 6e17d5e437c5..363785c9f082 100644 --- a/arch/sw_64/include/asm/page.h +++ b/arch/sw_64/include/asm/page.h @@ -46,7 +46,7 @@ extern unsigned long __phys_addr(unsigned long); #endif #define __pa(x) __phys_addr((unsigned long)(x)) -#define __va(x) ((void *)((unsigned long) (x) + PAGE_OFFSET)) +#define __va(x) ((void *)((unsigned long) (x) | PAGE_OFFSET)) #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) #define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) -- Gitee From 447e5cc179c5a24a1e99eb103737523f8b549137 Mon Sep 17 00:00:00 2001 From: He Sheng Date: Thu, 12 May 2022 11:10:01 +0800 Subject: [PATCH 093/674] sw64: read host IO registers with rdio64 hmcall Sunway inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I56OLG -------------------------------- The hmcall rdio64 can access IOR directly with its PA. Signed-off-by: He Sheng Signed-off-by: Gu Zitao --- arch/sw_64/chip/chip3/chip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/sw_64/chip/chip3/chip.c b/arch/sw_64/chip/chip3/chip.c index 4d2f99cc6402..704588d7c3e1 100644 --- a/arch/sw_64/chip/chip3/chip.c +++ b/arch/sw_64/chip/chip3/chip.c @@ -54,7 +54,7 @@ static struct clocksource clocksource_longtime = { static u64 read_vtime(struct clocksource *cs) { u64 result; - unsigned long vtime_addr = PAGE_OFFSET | IO_BASE | LONG_TIME; + unsigned long vtime_addr = IO_BASE | LONG_TIME; result = rdio64(vtime_addr); return result; -- Gitee From 96259d7012ceb2b857d2f53c9921498365bd212e Mon Sep 17 00:00:00 2001 From: He Sheng Date: Thu, 12 May 2022 11:22:07 +0800 Subject: [PATCH 094/674] sw64: map logical address with __va() Sunway inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5G5YB -------------------------------- Although kmem and IO space are accessed with direct mapping on sw64, it's not a standard method to or PAGE_OFFSET and access it directly everywhere. This patch maps PA to VA with __va(). Besides, we change type of some variables to pointer to make the code clearer. Signed-off-by: He Sheng Signed-off-by: Gu Zitao --- arch/sw_64/chip/chip3/chip.c | 4 +- arch/sw_64/chip/chip3/i2c-lib.c | 24 +++++------ arch/sw_64/include/asm/pci.h | 4 +- arch/sw_64/include/asm/sw64io.h | 56 +++++++++++++------------- arch/sw_64/kernel/dup_print.c | 10 +++-- arch/sw_64/kernel/pci.c | 5 +-- arch/sw_64/kvm/kvm-sw64.c | 11 +++-- arch/sw_64/platform/platform_xuelang.c | 2 +- 8 files changed, 59 insertions(+), 57 deletions(-) diff --git a/arch/sw_64/chip/chip3/chip.c b/arch/sw_64/chip/chip3/chip.c index 704588d7c3e1..acd25fe99669 100644 --- a/arch/sw_64/chip/chip3/chip.c +++ b/arch/sw_64/chip/chip3/chip.c @@ -480,8 +480,8 @@ static void chip3_hose_init(struct pci_controller *hose) hose->dense_mem_base = pci_io_base; hose->dense_io_base = pci_io_base | PCI_LEGACY_IO; - hose->ep_config_space_base = PAGE_OFFSET | pci_io_base | PCI_EP_CFG; - hose->rc_config_space_base = PAGE_OFFSET | pci_io_base | PCI_RC_CFG; + hose->ep_config_space_base = __va(pci_io_base | PCI_EP_CFG); + hose->rc_config_space_base = __va(pci_io_base | PCI_RC_CFG); hose->mem_space->start = pci_io_base + PCI_32BIT_MEMIO; hose->mem_space->end = hose->mem_space->start + PCI_32BIT_MEMIO_SIZE - 1; diff --git a/arch/sw_64/chip/chip3/i2c-lib.c b/arch/sw_64/chip/chip3/i2c-lib.c index 39b815022e69..b3dcdcd32735 100644 --- a/arch/sw_64/chip/chip3/i2c-lib.c +++ b/arch/sw_64/chip/chip3/i2c-lib.c @@ -96,7 +96,7 @@ enum i2c_bus_operation { I2C_BUS_WRITE, }; -static uint64_t m_i2c_base_address; +static void __iomem *m_i2c_base_address; /* * This function get I2Cx controller base address @@ -104,18 +104,18 @@ static uint64_t m_i2c_base_address; * @param i2c_controller_index Bus Number of I2C controller. * @return I2C BAR. */ -uint64_t get_i2c_bar_addr(uint8_t i2c_controller_index) +void __iomem *get_i2c_bar_addr(uint8_t i2c_controller_index) { - uint64_t base_addr = 0; - - if (i2c_controller_index == 0) - base_addr = PAGE_OFFSET | IO_BASE | IIC0_BASE; - else if (i2c_controller_index == 1) - base_addr = PAGE_OFFSET | IO_BASE | IIC1_BASE; - else if (i2c_controller_index == 2) - base_addr = PAGE_OFFSET | IO_BASE | IIC2_BASE; - - return base_addr; + switch (i2c_controller_index) { + case 0: + return __va(IO_BASE | IIC0_BASE); + case 1: + return __va(IO_BASE | IIC1_BASE); + case 2: + return __va(IO_BASE | IIC2_BASE); + default: + return NULL; + } } void write_cpu_i2c_controller(uint64_t offset, uint32_t data) diff --git a/arch/sw_64/include/asm/pci.h b/arch/sw_64/include/asm/pci.h index ed875e0c3162..a90f80152470 100644 --- a/arch/sw_64/include/asm/pci.h +++ b/arch/sw_64/include/asm/pci.h @@ -34,8 +34,8 @@ struct pci_controller { unsigned long dense_io_base; /* This one's for the kernel only. It's in KSEG somewhere. */ - unsigned long ep_config_space_base; - unsigned long rc_config_space_base; + void __iomem *ep_config_space_base; + void __iomem *rc_config_space_base; unsigned long index; unsigned long node; diff --git a/arch/sw_64/include/asm/sw64io.h b/arch/sw_64/include/asm/sw64io.h index 7c032070acf0..e66aba66932b 100644 --- a/arch/sw_64/include/asm/sw64io.h +++ b/arch/sw_64/include/asm/sw64io.h @@ -11,20 +11,20 @@ extern void setup_chip_clocksource(void); #endif #define MK_RC_CFG(nid, idx) \ - (PAGE_OFFSET | SW64_PCI_IO_BASE((nid), (idx)) | PCI_RC_CFG) + (SW64_PCI_IO_BASE((nid), (idx)) | PCI_RC_CFG) #define MK_PIU_IOR0(nid, idx) \ - (PAGE_OFFSET | SW64_PCI_IO_BASE((nid), (idx)) | PCI_IOR0_BASE) + (SW64_PCI_IO_BASE((nid), (idx)) | PCI_IOR0_BASE) #define MK_PIU_IOR1(nid, idx) \ - (PAGE_OFFSET | SW64_PCI_IO_BASE((nid), (idx)) | PCI_IOR1_BASE) + (SW64_PCI_IO_BASE((nid), (idx)) | PCI_IOR1_BASE) static inline unsigned int -read_rc_conf(unsigned long node, unsigned long rc_index, - unsigned int conf_offset) +read_rc_conf(unsigned long node, unsigned long rc, + unsigned int offset) { - unsigned long addr; + void __iomem *addr; unsigned int value; - addr = MK_RC_CFG(node, rc_index) | conf_offset; + addr = __va(MK_RC_CFG(node, rc) | offset); value = *(volatile unsigned int *)addr; mb(); @@ -32,24 +32,24 @@ read_rc_conf(unsigned long node, unsigned long rc_index, } static inline void -write_rc_conf(unsigned long node, unsigned long rc_index, - unsigned int conf_offset, unsigned int data) +write_rc_conf(unsigned long node, unsigned long rc, + unsigned int offset, unsigned int data) { - unsigned long addr; + void __iomem *addr; - addr = MK_RC_CFG(node, rc_index) | conf_offset; + addr = __va(MK_RC_CFG(node, rc) | offset); *(unsigned int *)addr = data; mb(); } static inline unsigned long -read_piu_ior0(unsigned long node, unsigned long rc_index, +read_piu_ior0(unsigned long node, unsigned long rc, unsigned int reg) { - unsigned long addr; + void __iomem *addr; unsigned long value; - addr = MK_PIU_IOR0(node, rc_index) + reg; + addr = __va(MK_PIU_IOR0(node, rc) + reg); value = *(volatile unsigned long __iomem *)addr; mb(); @@ -57,23 +57,24 @@ read_piu_ior0(unsigned long node, unsigned long rc_index, } static inline void -write_piu_ior0(unsigned long node, unsigned long rc_index, +write_piu_ior0(unsigned long node, unsigned long rc, unsigned int reg, unsigned long data) { - unsigned long addr; + void __iomem *addr; - addr = MK_PIU_IOR0(node, rc_index) + reg; + addr = __va(MK_PIU_IOR0(node, rc) + reg); *(unsigned long __iomem *)addr = data; mb(); } static inline unsigned long -read_piu_ior1(unsigned long node, unsigned long rc_index, +read_piu_ior1(unsigned long node, unsigned long rc, unsigned int reg) { - unsigned long addr, value; + void __iomem *addr; + unsigned long value; - addr = MK_PIU_IOR1(node, rc_index) + reg; + addr = __va(MK_PIU_IOR1(node, rc) + reg); value = *(volatile unsigned long __iomem *)addr; mb(); @@ -81,12 +82,12 @@ read_piu_ior1(unsigned long node, unsigned long rc_index, } static inline void -write_piu_ior1(unsigned long node, unsigned long rc_index, +write_piu_ior1(unsigned long node, unsigned long rc, unsigned int reg, unsigned long data) { - unsigned long addr; + void __iomem *addr; - addr = MK_PIU_IOR1(node, rc_index) + reg; + addr = __va(MK_PIU_IOR1(node, rc) + reg); *(volatile unsigned long __iomem *)addr = data; mb(); } @@ -94,9 +95,10 @@ write_piu_ior1(unsigned long node, unsigned long rc_index, static inline unsigned long sw64_io_read(unsigned long node, unsigned long reg) { - unsigned long addr, value; + void __iomem *addr; + unsigned long value; - addr = PAGE_OFFSET | SW64_IO_BASE(node) | reg; + addr = __va(SW64_IO_BASE(node) | reg); value = *(volatile unsigned long __iomem *)addr; mb(); @@ -106,9 +108,9 @@ sw64_io_read(unsigned long node, unsigned long reg) static inline void sw64_io_write(unsigned long node, unsigned long reg, unsigned long data) { - unsigned long addr; + void __iomem *addr; - addr = PAGE_OFFSET | SW64_IO_BASE(node) | reg; + addr = __va(SW64_IO_BASE(node) | reg); *(volatile unsigned long __iomem *)addr = data; mb(); } diff --git a/arch/sw_64/kernel/dup_print.c b/arch/sw_64/kernel/dup_print.c index 1aa7710b5092..02639f40a4bc 100644 --- a/arch/sw_64/kernel/dup_print.c +++ b/arch/sw_64/kernel/dup_print.c @@ -4,6 +4,7 @@ #include #include +#include #ifdef CONFIG_SW64_RRK @@ -18,7 +19,7 @@ unsigned long sw64_printk_offset; * For output the kernel message on the console * with full-system emulator. */ -#define QEMU_PRINTF_BUFF_BASE (IO_BASE | MCU_BASE | 0x40000UL | PAGE_OFFSET) +#define QEMU_PRINTF_BUFF_BASE (IO_BASE | MCU_BASE | 0x40000UL) int sw64_printk(const char *fmt, va_list args) { @@ -38,9 +39,10 @@ int sw64_printk(const char *fmt, va_list args) } else { printed_len += vscnprintf(sw64_printk_buf, 1024, fmt, args); if (is_in_emul()) { - unsigned long write_addr = QEMU_PRINTF_BUFF_BASE; - *(unsigned long *)write_addr = (unsigned long)((((unsigned long)sw64_printk_buf) & 0xffffffffUL) - | ((unsigned long)printed_len << 32)); + void __iomem *addr = __va(QEMU_PRINTF_BUFF_BASE); + u64 data = ((u64)sw64_printk_buf & 0xffffffffUL) + | ((u64)printed_len << 32); + *(u64 *)addr = data; } } sw64_printk_offset += printed_len; diff --git a/arch/sw_64/kernel/pci.c b/arch/sw_64/kernel/pci.c index 77bbd1b68176..401ee0b781be 100644 --- a/arch/sw_64/kernel/pci.c +++ b/arch/sw_64/kernel/pci.c @@ -398,7 +398,7 @@ int sw6_pcie_read_rc_cfg(struct pci_bus *bus, unsigned int devfn, { u32 data; struct pci_controller *hose = bus->sysdata; - void __iomem *cfg_iobase = (void *)hose->rc_config_space_base; + void __iomem *cfg_iobase = hose->rc_config_space_base; if (IS_ENABLED(CONFIG_PCI_DEBUG)) pr_debug("rc read addr:%px bus %d, devfn %#x, where %#x size=%d\t", @@ -549,9 +549,8 @@ static void __iomem *sw6_pcie_map_bus(struct pci_bus *bus, return NULL; relbus = (bus->number << 24) | (devfn << 16) | where; - relbus |= PCI_EP_CFG; - cfg_iobase = (void *)(hose->ep_config_space_base | relbus); + cfg_iobase = hose->ep_config_space_base + relbus; if (IS_ENABLED(CONFIG_PCI_DEBUG)) pr_debug("addr:%px bus %d, devfn %d, where %d\n", diff --git a/arch/sw_64/kvm/kvm-sw64.c b/arch/sw_64/kvm/kvm-sw64.c index d651d26a957a..839ee83d57d5 100644 --- a/arch/sw_64/kvm/kvm-sw64.c +++ b/arch/sw_64/kvm/kvm-sw64.c @@ -311,7 +311,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, kvm->arch.host_phys_addr = (u64)addr; kvm->arch.size = round_up(mem->memory_size, 8<<20); - memset((void *)(PAGE_OFFSET + addr), 0, 0x2000000); + memset(__va(addr), 0, 0x2000000); return 0; } @@ -344,7 +344,7 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) memset(&vcpu->arch.irqs_pending, 0, sizeof(vcpu->arch.irqs_pending)); if (vcpu->vcpu_id == 0) - memset((void *)(PAGE_OFFSET + addr), 0, 0x2000000); + memset(__va(addr), 0, 0x2000000); return 0; } @@ -432,18 +432,17 @@ void _debug_printk_vcpu(struct kvm_vcpu *vcpu) { unsigned long pc = vcpu->arch.regs.pc; unsigned long offset = vcpu->kvm->arch.host_phys_addr; - unsigned long pc_phys = PAGE_OFFSET | ((pc & 0x7fffffffUL) + offset); + unsigned int *pc_phys = __va((pc & 0x7fffffffUL) + offset); unsigned int insn; int opc, ra, disp16; - insn = *(unsigned int *)pc_phys; - + insn = *pc_phys; opc = (insn >> 26) & 0x3f; ra = (insn >> 21) & 0x1f; disp16 = insn & 0xffff; if (opc == 0x06 && disp16 == 0x1000) /* RD_F */ - pr_info("vcpu exit: pc = %#lx (%#lx), insn[%x] : rd_f r%d [%#lx]\n", + pr_info("vcpu exit: pc = %#lx (%px), insn[%x] : rd_f r%d [%#lx]\n", pc, pc_phys, insn, ra, vcpu_get_reg(vcpu, ra)); } diff --git a/arch/sw_64/platform/platform_xuelang.c b/arch/sw_64/platform/platform_xuelang.c index f0e33c664b0e..63a4b163e43e 100644 --- a/arch/sw_64/platform/platform_xuelang.c +++ b/arch/sw_64/platform/platform_xuelang.c @@ -54,7 +54,7 @@ static inline void __iomem *xuelang_ioportmap(unsigned long addr) addr = addr | io_offset; } - return (void __iomem *)(addr | PAGE_OFFSET); + return __va(addr); } struct sw64_platform_ops xuelang_ops = { -- Gitee From 53003aee7c6a44a565254b702fe73a0c44fa244b Mon Sep 17 00:00:00 2001 From: He Sheng Date: Thu, 12 May 2022 14:07:00 +0800 Subject: [PATCH 095/674] sw64: access IO space with readX/writeX Sunway inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5G5YB -------------------------------- It's more standard to access IO registers by calling readX/writeX interfaces. In addition, the code is easier to maintain. Signed-off-by: He Sheng Signed-off-by: Gu Zitao --- arch/sw_64/chip/chip3/i2c-lib.c | 15 +++++-------- arch/sw_64/include/asm/sw64io.h | 37 ++++++++------------------------- 2 files changed, 14 insertions(+), 38 deletions(-) diff --git a/arch/sw_64/chip/chip3/i2c-lib.c b/arch/sw_64/chip/chip3/i2c-lib.c index b3dcdcd32735..e70f0f0c9a56 100644 --- a/arch/sw_64/chip/chip3/i2c-lib.c +++ b/arch/sw_64/chip/chip3/i2c-lib.c @@ -118,19 +118,14 @@ void __iomem *get_i2c_bar_addr(uint8_t i2c_controller_index) } } -void write_cpu_i2c_controller(uint64_t offset, uint32_t data) +static inline void write_cpu_i2c_controller(uint64_t offset, uint32_t data) { - mb(); - *(volatile uint32_t *)(m_i2c_base_address + offset) = data; + writel(data, m_i2c_base_address + offset); } -uint32_t read_cpu_i2c_controller(uint64_t offset) +static inline uint32_t read_cpu_i2c_controller(uint64_t offset) { - uint32_t data; - - data = *(volatile uint32_t *)(m_i2c_base_address + offset); - mb(); - return data; + return readl(m_i2c_base_address + offset); } static int poll_for_status_set0(uint16_t status_bit) @@ -241,7 +236,7 @@ static int i2c_read(uint8_t reg_offset, uint8_t *buffer, uint32_t length) write_cpu_i2c_controller(DW_IC_DATA_CMD, DW_IC_CMD); if (poll_for_status_set0(DW_IC_STATUS_RFNE) == 0) - buffer[i] = *(uint8_t *) (m_i2c_base_address + DW_IC_DATA_CMD); + buffer[i] = readb(m_i2c_base_address + DW_IC_DATA_CMD); else pr_err("Read timeout line %d.\n", __LINE__); } diff --git a/arch/sw_64/include/asm/sw64io.h b/arch/sw_64/include/asm/sw64io.h index e66aba66932b..7d79a5b75090 100644 --- a/arch/sw_64/include/asm/sw64io.h +++ b/arch/sw_64/include/asm/sw64io.h @@ -2,6 +2,7 @@ #ifndef _ASM_SW64_SW64IO_H #define _ASM_SW64_SW64IO_H +#include #include extern void setup_chip_clocksource(void); @@ -22,13 +23,9 @@ read_rc_conf(unsigned long node, unsigned long rc, unsigned int offset) { void __iomem *addr; - unsigned int value; addr = __va(MK_RC_CFG(node, rc) | offset); - value = *(volatile unsigned int *)addr; - mb(); - - return value; + return readl(addr); } static inline void @@ -38,8 +35,7 @@ write_rc_conf(unsigned long node, unsigned long rc, void __iomem *addr; addr = __va(MK_RC_CFG(node, rc) | offset); - *(unsigned int *)addr = data; - mb(); + writel(data, addr); } static inline unsigned long @@ -47,13 +43,9 @@ read_piu_ior0(unsigned long node, unsigned long rc, unsigned int reg) { void __iomem *addr; - unsigned long value; addr = __va(MK_PIU_IOR0(node, rc) + reg); - value = *(volatile unsigned long __iomem *)addr; - mb(); - - return value; + return readq(addr); } static inline void @@ -63,8 +55,7 @@ write_piu_ior0(unsigned long node, unsigned long rc, void __iomem *addr; addr = __va(MK_PIU_IOR0(node, rc) + reg); - *(unsigned long __iomem *)addr = data; - mb(); + writeq(data, addr); } static inline unsigned long @@ -72,13 +63,9 @@ read_piu_ior1(unsigned long node, unsigned long rc, unsigned int reg) { void __iomem *addr; - unsigned long value; addr = __va(MK_PIU_IOR1(node, rc) + reg); - value = *(volatile unsigned long __iomem *)addr; - mb(); - - return value; + return readq(addr); } static inline void @@ -88,21 +75,16 @@ write_piu_ior1(unsigned long node, unsigned long rc, void __iomem *addr; addr = __va(MK_PIU_IOR1(node, rc) + reg); - *(volatile unsigned long __iomem *)addr = data; - mb(); + writeq(data, addr); } static inline unsigned long sw64_io_read(unsigned long node, unsigned long reg) { void __iomem *addr; - unsigned long value; addr = __va(SW64_IO_BASE(node) | reg); - value = *(volatile unsigned long __iomem *)addr; - mb(); - - return value; + return readq(addr); } static inline void @@ -111,7 +93,6 @@ sw64_io_write(unsigned long node, unsigned long reg, unsigned long data) void __iomem *addr; addr = __va(SW64_IO_BASE(node) | reg); - *(volatile unsigned long __iomem *)addr = data; - mb(); + writeq(data, addr); } #endif -- Gitee From 687ba666ce3a7f7c55eaa7b325d0ad928559798c Mon Sep 17 00:00:00 2001 From: He Sheng Date: Thu, 21 Apr 2022 17:13:53 +0800 Subject: [PATCH 096/674] sw64: add fpu state save/restore interfaces Sunway inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I56OLG -------------------------------- Signed-off-by: He Sheng Signed-off-by: Gu Zitao --- arch/sw_64/kernel/Makefile | 2 +- arch/sw_64/kernel/fpu.S | 106 +++++++++++++++++++++++++++++++++++++ 2 files changed, 107 insertions(+), 1 deletion(-) create mode 100644 arch/sw_64/kernel/fpu.S diff --git a/arch/sw_64/kernel/Makefile b/arch/sw_64/kernel/Makefile index f6a2813b0466..293b360626f7 100644 --- a/arch/sw_64/kernel/Makefile +++ b/arch/sw_64/kernel/Makefile @@ -13,7 +13,7 @@ CFLAGS_REMOVE_insn.o = -pg CFLAGS_REMOVE_printk.o = -pg endif -obj-y := entry.o traps.o process.o sys_sw64.o irq.o \ +obj-y := entry.o fpu.o traps.o process.o sys_sw64.o irq.o \ irq_sw64.o signal.o setup.o ptrace.o time.o \ systbls.o dup_print.o tc.o \ insn.o early_init.o topology.o cacheinfo.o \ diff --git a/arch/sw_64/kernel/fpu.S b/arch/sw_64/kernel/fpu.S new file mode 100644 index 000000000000..25dcf4740525 --- /dev/null +++ b/arch/sw_64/kernel/fpu.S @@ -0,0 +1,106 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include +#include +#include +#include + + .text + .set noat +ENTRY(__fpstate_save) + /* a0: prev task */ + ldi a0, TASK_THREAD(a0) + ldi t0, THREAD_CTX_FP(a0) + vstd $f0, CTX_FP_F0(t0) + vstd $f1, CTX_FP_F1(t0) + vstd $f2, CTX_FP_F2(t0) + vstd $f3, CTX_FP_F3(t0) + vstd $f4, CTX_FP_F4(t0) + vstd $f5, CTX_FP_F5(t0) + vstd $f6, CTX_FP_F6(t0) + vstd $f7, CTX_FP_F7(t0) + vstd $f8, CTX_FP_F8(t0) + vstd $f9, CTX_FP_F9(t0) + vstd $f10, CTX_FP_F10(t0) + vstd $f11, CTX_FP_F11(t0) + vstd $f12, CTX_FP_F12(t0) + vstd $f13, CTX_FP_F13(t0) + vstd $f14, CTX_FP_F14(t0) + vstd $f15, CTX_FP_F15(t0) + vstd $f16, CTX_FP_F16(t0) + vstd $f17, CTX_FP_F17(t0) + vstd $f18, CTX_FP_F18(t0) + vstd $f19, CTX_FP_F19(t0) + vstd $f20, CTX_FP_F20(t0) + vstd $f21, CTX_FP_F21(t0) + vstd $f22, CTX_FP_F22(t0) + vstd $f23, CTX_FP_F23(t0) + vstd $f24, CTX_FP_F24(t0) + vstd $f25, CTX_FP_F25(t0) + vstd $f26, CTX_FP_F26(t0) + vstd $f27, CTX_FP_F27(t0) + rfpcr $f0 + vstd $f28, CTX_FP_F28(t0) + vstd $f29, CTX_FP_F29(t0) + vstd $f30, CTX_FP_F30(t0) + fstd $f0, THREAD_FPCR(a0) + vldd $f0, CTX_FP_F0(t0) + ret +END(__fpstate_save) + +ENTRY(__fpstate_restore) + /* a0: next task */ + ldi a0, TASK_THREAD(a0) + fldd $f0, THREAD_FPCR(a0) + wfpcr $f0 + fimovd $f0, t1 + and t1, 0x3, t1 + beq t1, $setfpec_0 + subl t1, 0x1, t1 + beq t1, $setfpec_1 + subl t1, 0x1, t1 + beq t1, $setfpec_2 + setfpec3 + br $setfpec_over +$setfpec_0: + setfpec0 + br $setfpec_over +$setfpec_1: + setfpec1 + br $setfpec_over +$setfpec_2: + setfpec2 +$setfpec_over: + ldi t0, THREAD_CTX_FP(a0) + vldd $f0, CTX_FP_F0(t0) + vldd $f1, CTX_FP_F1(t0) + vldd $f2, CTX_FP_F2(t0) + vldd $f3, CTX_FP_F3(t0) + vldd $f4, CTX_FP_F4(t0) + vldd $f5, CTX_FP_F5(t0) + vldd $f6, CTX_FP_F6(t0) + vldd $f7, CTX_FP_F7(t0) + vldd $f8, CTX_FP_F8(t0) + vldd $f9, CTX_FP_F9(t0) + vldd $f10, CTX_FP_F10(t0) + vldd $f11, CTX_FP_F11(t0) + vldd $f12, CTX_FP_F12(t0) + vldd $f13, CTX_FP_F13(t0) + vldd $f14, CTX_FP_F14(t0) + vldd $f15, CTX_FP_F15(t0) + vldd $f16, CTX_FP_F16(t0) + vldd $f17, CTX_FP_F17(t0) + vldd $f18, CTX_FP_F18(t0) + vldd $f19, CTX_FP_F19(t0) + vldd $f20, CTX_FP_F20(t0) + vldd $f21, CTX_FP_F21(t0) + vldd $f22, CTX_FP_F22(t0) + vldd $f23, CTX_FP_F23(t0) + vldd $f24, CTX_FP_F24(t0) + vldd $f25, CTX_FP_F25(t0) + vldd $f26, CTX_FP_F26(t0) + vldd $f27, CTX_FP_F27(t0) + vldd $f28, CTX_FP_F28(t0) + vldd $f29, CTX_FP_F29(t0) + vldd $f30, CTX_FP_F30(t0) + ret +END(__fpstate_restore) -- Gitee From fa1cadd2b145b88c6ffb9fd8cd540967b5f7c067 Mon Sep 17 00:00:00 2001 From: He Sheng Date: Fri, 22 Apr 2022 09:29:43 +0800 Subject: [PATCH 097/674] sw64: switch to generic fork like system calls Sunway inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5GF7A -------------------------------- We put thread's callee-saved registers in thread_struct, and put exception callee-saved registers in pt_regs. For context switch, it switches fpu state first, and then switches integer registers in __switch_to. Because callee-saved registers are always saved and restored, it will cause a little overhead. As a result, the special sw64 fork like system calls are removed. Signed-off-by: He Sheng Signed-off-by: Gu Zitao --- arch/sw_64/include/asm/processor.h | 3 + arch/sw_64/include/asm/switch_to.h | 37 +++++- arch/sw_64/include/uapi/asm/ptrace.h | 15 ++- arch/sw_64/kernel/asm-offsets.c | 16 +++ arch/sw_64/kernel/entry.S | 173 +++++++++++++++---------- arch/sw_64/kernel/process.c | 21 ++- arch/sw_64/kernel/syscalls/syscall.tbl | 6 +- 7 files changed, 175 insertions(+), 96 deletions(-) diff --git a/arch/sw_64/include/asm/processor.h b/arch/sw_64/include/asm/processor.h index 645c33a596ff..67d3b4368987 100644 --- a/arch/sw_64/include/asm/processor.h +++ b/arch/sw_64/include/asm/processor.h @@ -78,6 +78,9 @@ struct context_fpregs { struct thread_struct { struct context_fpregs ctx_fp; unsigned long fpcr; + /* Callee-saved registers */ + unsigned long ra; + unsigned long s[7]; /* s0 ~ s6 */ }; #define INIT_THREAD { } diff --git a/arch/sw_64/include/asm/switch_to.h b/arch/sw_64/include/asm/switch_to.h index 22045b247557..d503fc59390f 100644 --- a/arch/sw_64/include/asm/switch_to.h +++ b/arch/sw_64/include/asm/switch_to.h @@ -2,12 +2,41 @@ #ifndef _ASM_SW64_SWITCH_TO_H #define _ASM_SW64_SWITCH_TO_H -struct task_struct; -extern struct task_struct *__switch_to(unsigned long, struct task_struct *); +#include + +extern void __fpstate_save(struct task_struct *save_to); +extern void __fpstate_restore(struct task_struct *restore_from); +extern struct task_struct *__switch_to(unsigned long pcb, + struct task_struct *prev, struct task_struct *next); extern void restore_da_match_after_sched(void); -#define switch_to(P, N, L) \ + +static inline void fpstate_save(struct task_struct *task) +{ + if (likely(!(task->flags & PF_KTHREAD))) + __fpstate_save(task); +} + +static inline void fpstate_restore(struct task_struct *task) +{ + if (likely(!(task->flags & PF_KTHREAD))) + __fpstate_restore(task); +} + +static inline void __switch_to_aux(struct task_struct *prev, + struct task_struct *next) +{ + fpstate_save(prev); + fpstate_restore(next); +} + + +#define switch_to(prev, next, last) \ do { \ - (L) = __switch_to(virt_to_phys(&task_thread_info(N)->pcb), (P));\ + struct task_struct *__prev = (prev); \ + struct task_struct *__next = (next); \ + __u64 __nextpcb = virt_to_phys(&task_thread_info(__next)->pcb); \ + __switch_to_aux(__prev, __next); \ + (last) = __switch_to(__nextpcb, __prev, __next); \ check_mmu_context(); \ } while (0) diff --git a/arch/sw_64/include/uapi/asm/ptrace.h b/arch/sw_64/include/uapi/asm/ptrace.h index 96cb10891bea..4549e8d4bf57 100644 --- a/arch/sw_64/include/uapi/asm/ptrace.h +++ b/arch/sw_64/include/uapi/asm/ptrace.h @@ -9,12 +9,7 @@ * * NOTE! I want to minimize the overhead of system calls, so this * struct has as little information as possible. I does not have - * - * - floating point regs: the kernel doesn't change those - * - r9-15: saved by the C compiler - * - * This makes "fork()" and "exec()" a bit more complex, but should - * give us low system call latency. + * floating point regs, because the kernel doesn't change those */ struct pt_regs { @@ -27,6 +22,14 @@ struct pt_regs { unsigned long r6; unsigned long r7; unsigned long r8; + unsigned long r9; + unsigned long r10; + unsigned long r11; + unsigned long r12; + unsigned long r13; + unsigned long r14; + unsigned long r15; + /* r16 ~ r18 saved by hmcode */ unsigned long r19; unsigned long r20; unsigned long r21; diff --git a/arch/sw_64/kernel/asm-offsets.c b/arch/sw_64/kernel/asm-offsets.c index bea12d2d96fe..349bc5c4e2f1 100644 --- a/arch/sw_64/kernel/asm-offsets.c +++ b/arch/sw_64/kernel/asm-offsets.c @@ -72,6 +72,13 @@ void foo(void) DEFINE(PT_REGS_R6, offsetof(struct pt_regs, r6)); DEFINE(PT_REGS_R7, offsetof(struct pt_regs, r7)); DEFINE(PT_REGS_R8, offsetof(struct pt_regs, r8)); + DEFINE(PT_REGS_R9, offsetof(struct pt_regs, r9)); + DEFINE(PT_REGS_R10, offsetof(struct pt_regs, r10)); + DEFINE(PT_REGS_R11, offsetof(struct pt_regs, r11)); + DEFINE(PT_REGS_R12, offsetof(struct pt_regs, r12)); + DEFINE(PT_REGS_R13, offsetof(struct pt_regs, r13)); + DEFINE(PT_REGS_R14, offsetof(struct pt_regs, r14)); + DEFINE(PT_REGS_R15, offsetof(struct pt_regs, r15)); DEFINE(PT_REGS_R19, offsetof(struct pt_regs, r19)); DEFINE(PT_REGS_R20, offsetof(struct pt_regs, r20)); DEFINE(PT_REGS_R21, offsetof(struct pt_regs, r21)); @@ -258,4 +265,13 @@ void foo(void) DEFINE(CTX_FP_F29, offsetof(struct context_fpregs, f29)); DEFINE(CTX_FP_F30, offsetof(struct context_fpregs, f30)); BLANK(); + OFFSET(TASK_THREAD_RA, task_struct, thread.ra); + OFFSET(TASK_THREAD_S0, task_struct, thread.s[0]); + OFFSET(TASK_THREAD_S1, task_struct, thread.s[1]); + OFFSET(TASK_THREAD_S2, task_struct, thread.s[2]); + OFFSET(TASK_THREAD_S3, task_struct, thread.s[3]); + OFFSET(TASK_THREAD_S4, task_struct, thread.s[4]); + OFFSET(TASK_THREAD_S5, task_struct, thread.s[5]); + OFFSET(TASK_THREAD_S6, task_struct, thread.s[6]); + BLANK(); } diff --git a/arch/sw_64/kernel/entry.S b/arch/sw_64/kernel/entry.S index 6c40d2015439..ac62461aa9dc 100644 --- a/arch/sw_64/kernel/entry.S +++ b/arch/sw_64/kernel/entry.S @@ -21,52 +21,84 @@ * the hmcode-provided values are available to the signal handler. */ -#define SAVE_ALL \ - ldi $sp, -PT_REGS_PS($sp); \ - stl $0, PT_REGS_R0($sp); \ - stl $1, PT_REGS_R1($sp); \ - stl $2, PT_REGS_R2($sp); \ - stl $3, PT_REGS_R3($sp); \ - stl $4, PT_REGS_R4($sp); \ - stl $28, PT_REGS_R28($sp); \ - stl $5, PT_REGS_R5($sp); \ - stl $6, PT_REGS_R6($sp); \ - stl $7, PT_REGS_R7($sp); \ - stl $8, PT_REGS_R8($sp); \ - stl $19, PT_REGS_R19($sp); \ - stl $20, PT_REGS_R20($sp); \ - stl $21, PT_REGS_R21($sp); \ - stl $22, PT_REGS_R22($sp); \ - stl $23, PT_REGS_R23($sp); \ - stl $24, PT_REGS_R24($sp); \ - stl $25, PT_REGS_R25($sp); \ - stl $26, PT_REGS_R26($sp); \ - stl $27, PT_REGS_R27($sp); \ - stl $16, PT_REGS_TRAP_A0($sp); \ - stl $17, PT_REGS_TRAP_A1($sp); \ + .macro SAVE_COMMON_REGS + ldi $sp, -PT_REGS_PS($sp) + stl $0, PT_REGS_R0($sp) + stl $1, PT_REGS_R1($sp) + stl $2, PT_REGS_R2($sp) + stl $3, PT_REGS_R3($sp) + stl $4, PT_REGS_R4($sp) + stl $28, PT_REGS_R28($sp) + stl $5, PT_REGS_R5($sp) + stl $6, PT_REGS_R6($sp) + stl $7, PT_REGS_R7($sp) + stl $8, PT_REGS_R8($sp) + stl $19, PT_REGS_R19($sp) + stl $20, PT_REGS_R20($sp) + stl $21, PT_REGS_R21($sp) + stl $22, PT_REGS_R22($sp) + stl $23, PT_REGS_R23($sp) + stl $24, PT_REGS_R24($sp) + stl $25, PT_REGS_R25($sp) + stl $26, PT_REGS_R26($sp) + stl $27, PT_REGS_R27($sp) + stl $16, PT_REGS_TRAP_A0($sp) + stl $17, PT_REGS_TRAP_A1($sp) stl $18, PT_REGS_TRAP_A2($sp) + .endm -#define RESTORE_ALL \ - ldl $0, PT_REGS_R0($sp); \ - ldl $1, PT_REGS_R1($sp); \ - ldl $2, PT_REGS_R2($sp); \ - ldl $3, PT_REGS_R3($sp); \ - ldl $4, PT_REGS_R4($sp); \ - ldl $5, PT_REGS_R5($sp); \ - ldl $6, PT_REGS_R6($sp); \ - ldl $7, PT_REGS_R7($sp); \ - ldl $8, PT_REGS_R8($sp); \ - ldl $19, PT_REGS_R19($sp); \ - ldl $20, PT_REGS_R20($sp); \ - ldl $21, PT_REGS_R21($sp); \ - ldl $22, PT_REGS_R22($sp); \ - ldl $23, PT_REGS_R23($sp); \ - ldl $24, PT_REGS_R24($sp); \ - ldl $25, PT_REGS_R25($sp); \ - ldl $26, PT_REGS_R26($sp); \ - ldl $27, PT_REGS_R27($sp); \ - ldl $28, PT_REGS_R28($sp); \ + .macro RESTORE_COMMON_REGS + ldl $0, PT_REGS_R0($sp) + ldl $1, PT_REGS_R1($sp) + ldl $2, PT_REGS_R2($sp) + ldl $3, PT_REGS_R3($sp) + ldl $4, PT_REGS_R4($sp) + ldl $5, PT_REGS_R5($sp) + ldl $6, PT_REGS_R6($sp) + ldl $7, PT_REGS_R7($sp) + ldl $8, PT_REGS_R8($sp) + ldl $19, PT_REGS_R19($sp) + ldl $20, PT_REGS_R20($sp) + ldl $21, PT_REGS_R21($sp) + ldl $22, PT_REGS_R22($sp) + ldl $23, PT_REGS_R23($sp) + ldl $24, PT_REGS_R24($sp) + ldl $25, PT_REGS_R25($sp) + ldl $26, PT_REGS_R26($sp) + ldl $27, PT_REGS_R27($sp) + ldl $28, PT_REGS_R28($sp) ldi $sp, PT_REGS_PS($sp) + .endm + + .macro SAVE_CALLEE_REGS + stl $9, PT_REGS_R9($sp) + stl $10, PT_REGS_R10($sp) + stl $11, PT_REGS_R11($sp) + stl $12, PT_REGS_R12($sp) + stl $13, PT_REGS_R13($sp) + stl $14, PT_REGS_R14($sp) + stl $15, PT_REGS_R15($sp) + .endm + + .macro RESTORE_CALLEE_REGS + ldl $9, PT_REGS_R9($sp) + ldl $10, PT_REGS_R10($sp) + ldl $11, PT_REGS_R11($sp) + ldl $12, PT_REGS_R12($sp) + ldl $13, PT_REGS_R13($sp) + ldl $14, PT_REGS_R14($sp) + ldl $15, PT_REGS_R15($sp) + .endm + + .macro SAVE_ALL + SAVE_COMMON_REGS + SAVE_CALLEE_REGS + .endm + + .macro RESTORE_ALL + RESTORE_CALLEE_REGS + RESTORE_COMMON_REGS + .endm /* * Non-syscall kernel entry points. @@ -591,19 +623,42 @@ $setfpec_over: .end undo_switch_stack /* - * The meat of the context switch code. + * Integer register context switch + * The callee-saved registers must be saved and restored. + * + * a0: physical address of next task's pcb, used by hmcode + * a1: previous task_struct (must be preserved across the switch) + * a2: next task_struct + * + * The value of a1 must be preserved by this function, as that's how + * arguments are passed to schedule_tail. */ - .align 4 .globl __switch_to .ent __switch_to __switch_to: .prologue 0 - bsr $1, do_switch_stack + /* Save context into prev->thread */ + stl $26, TASK_THREAD_RA($17) + stl $9, TASK_THREAD_S0($17) + stl $10, TASK_THREAD_S1($17) + stl $11, TASK_THREAD_S2($17) + stl $12, TASK_THREAD_S3($17) + stl $13, TASK_THREAD_S4($17) + stl $14, TASK_THREAD_S5($17) + stl $15, TASK_THREAD_S6($17) + /* Restore context from next->thread */ + ldl $26, TASK_THREAD_RA($18) + ldl $9, TASK_THREAD_S0($18) + ldl $10, TASK_THREAD_S1($18) + ldl $11, TASK_THREAD_S2($18) + ldl $12, TASK_THREAD_S3($18) + ldl $13, TASK_THREAD_S4($18) + ldl $14, TASK_THREAD_S5($18) + ldl $15, TASK_THREAD_S6($18) sys_call HMC_swpctx ldi $8, 0x3fff bic $sp, $8, $8 - bsr $1, undo_switch_stack mov $17, $0 ret .end __switch_to @@ -637,30 +692,6 @@ ret_from_kernel_thread: br $31, ret_to_user .end ret_from_kernel_thread -/* - * Special system calls. Most of these are special in that they either - * have to play switch_stack games or in some way use the pt_regs struct. - */ - -.macro fork_like name - .align 4 - .globl sw64_\name - .ent sw64_\name -sw64_\name: - .prologue 0 - bsr $1, do_switch_stack - call $26, sys_\name - ldl $26, SWITCH_STACK_RA($sp) - ldi $sp, SWITCH_STACK_SIZE($sp) - ret - .end sw64_\name - .endm - -fork_like fork -fork_like vfork -fork_like clone -fork_like clone3 - .align 4 .globl sys_sigreturn .ent sys_sigreturn diff --git a/arch/sw_64/kernel/process.c b/arch/sw_64/kernel/process.c index 4192d50f5b0e..c6e87874af19 100644 --- a/arch/sw_64/kernel/process.c +++ b/arch/sw_64/kernel/process.c @@ -11,6 +11,7 @@ #include #include +#include #include "proto.h" @@ -158,19 +159,18 @@ copy_thread(unsigned long clone_flags, unsigned long usp, struct thread_info *childti = task_thread_info(p); struct pt_regs *childregs = task_pt_regs(p); struct pt_regs *regs = current_pt_regs(); - struct switch_stack *childstack, *stack; - childstack = ((struct switch_stack *) childregs) - 1; - childti->pcb.ksp = (unsigned long) childstack; + childti->pcb.ksp = (unsigned long) childregs; childti->pcb.flags = 7; /* set FEN, clear everything else */ + __fpstate_save(current); + p->thread = current->thread; if (unlikely(p->flags & PF_KTHREAD)) { /* kernel thread */ - memset(childstack, 0, - sizeof(struct switch_stack) + sizeof(struct pt_regs)); - childstack->r26 = (unsigned long) ret_from_kernel_thread; - childstack->r9 = usp; /* function */ - childstack->r10 = kthread_arg; + memset(childregs, 0, sizeof(struct pt_regs)); + p->thread.ra = (unsigned long) ret_from_kernel_thread; + p->thread.s[0] = usp; /* function */ + p->thread.s[1] = kthread_arg; childti->pcb.usp = 0; return 0; } @@ -189,10 +189,7 @@ copy_thread(unsigned long clone_flags, unsigned long usp, *childregs = *regs; childregs->r0 = 0; childregs->r19 = 0; - stack = ((struct switch_stack *) regs) - 1; - *childstack = *stack; - p->thread = current->thread; - childstack->r26 = (unsigned long) ret_from_fork; + p->thread.ra = (unsigned long) ret_from_fork; return 0; } diff --git a/arch/sw_64/kernel/syscalls/syscall.tbl b/arch/sw_64/kernel/syscalls/syscall.tbl index b9b93d70124d..42a179422b6b 100644 --- a/arch/sw_64/kernel/syscalls/syscall.tbl +++ b/arch/sw_64/kernel/syscalls/syscall.tbl @@ -73,7 +73,7 @@ 63 common getpgrp sys_getpgrp #64 is unused #65 is unused -66 common vfork sw64_vfork +66 common vfork sys_vfork 67 common stat sys_newstat 68 common lstat sys_newlstat #69 is unused @@ -289,7 +289,7 @@ 279 common fsmount sys_fsmount 280 common fspick sys_fspick 281 common pidfd_open sys_pidfd_open -282 common clone3 sw64_clone3 +282 common clone3 sys_clone3 283 common close_range sys_close_range 284 common openat2 sys_openat2 285 common pidfd_getfd sys_pidfd_getfd @@ -319,7 +319,7 @@ 309 common get_kernel_syms sys_ni_syscall 310 common syslog sys_syslog 311 common reboot sys_reboot -312 common clone sw64_clone +312 common clone sys_clone 313 common uselib sys_uselib 314 common mlock sys_mlock 315 common munlock sys_munlock -- Gitee From cf8d42cc0e6dc5fe55895a233aa95418a157145c Mon Sep 17 00:00:00 2001 From: He Sheng Date: Sun, 24 Apr 2022 09:11:52 +0800 Subject: [PATCH 098/674] sw64: remove r9_r15 argument of dik_show_regs and die_if_kernel Sunway inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5GF7A -------------------------------- Signed-off-by: He Sheng Signed-off-by: Gu Zitao --- arch/sw_64/kernel/process.c | 2 +- arch/sw_64/kernel/proto.h | 4 ++-- arch/sw_64/kernel/ptrace.c | 2 +- arch/sw_64/kernel/traps.c | 27 ++++++++++++--------------- arch/sw_64/mm/fault.c | 8 ++++---- arch/sw_64/mm/init.c | 2 -- 6 files changed, 20 insertions(+), 25 deletions(-) diff --git a/arch/sw_64/kernel/process.c b/arch/sw_64/kernel/process.c index c6e87874af19..9a6812851b15 100644 --- a/arch/sw_64/kernel/process.c +++ b/arch/sw_64/kernel/process.c @@ -110,7 +110,7 @@ void show_regs(struct pt_regs *regs) { show_regs_print_info(KERN_DEFAULT); - dik_show_regs(regs, NULL); + dik_show_regs(regs); } /* diff --git a/arch/sw_64/kernel/proto.h b/arch/sw_64/kernel/proto.h index 9c99baaaf1d0..189074f8bd5c 100644 --- a/arch/sw_64/kernel/proto.h +++ b/arch/sw_64/kernel/proto.h @@ -12,8 +12,8 @@ extern int ptrace_set_bpt(struct task_struct *child); extern int ptrace_cancel_bpt(struct task_struct *child); /* traps.c */ -extern void dik_show_regs(struct pt_regs *regs, unsigned long *r9_15); -extern void die_if_kernel(char *str, struct pt_regs *regs, long err, unsigned long *r9_15); +extern void dik_show_regs(struct pt_regs *regs); +extern void die_if_kernel(char *str, struct pt_regs *regs, long err); /* timer.c */ extern void setup_timer(void); diff --git a/arch/sw_64/kernel/ptrace.c b/arch/sw_64/kernel/ptrace.c index b06c98e9944b..3b29ff7114f7 100644 --- a/arch/sw_64/kernel/ptrace.c +++ b/arch/sw_64/kernel/ptrace.c @@ -521,7 +521,7 @@ int do_match(unsigned long address, unsigned long mmcsr, long cause, struct pt_r case MMCSR__DA_MATCH: case MMCSR__DV_MATCH: case MMCSR__DAV_MATCH: - dik_show_regs(regs, (unsigned long *)regs-15); + dik_show_regs(regs); if (!(current->ptrace & PT_PTRACED)) { printk(" pid %d %s not be ptraced, return\n", current->pid, current->comm); diff --git a/arch/sw_64/kernel/traps.c b/arch/sw_64/kernel/traps.c index 99cee58e886d..b7c4e45df87b 100644 --- a/arch/sw_64/kernel/traps.c +++ b/arch/sw_64/kernel/traps.c @@ -22,8 +22,7 @@ #include "proto.h" -void -dik_show_regs(struct pt_regs *regs, unsigned long *r9_15) +void dik_show_regs(struct pt_regs *regs) { printk("pc = [<%016lx>] ra = [<%016lx>] ps = %04lx %s\n", regs->pc, regs->r26, regs->ps, print_tainted()); @@ -36,13 +35,12 @@ dik_show_regs(struct pt_regs *regs, unsigned long *r9_15) printk("t5 = %016lx t6 = %016lx t7 = %016lx\n", regs->r6, regs->r7, regs->r8); - if (r9_15) { - printk("s0 = %016lx s1 = %016lx s2 = %016lx\n", - r9_15[9], r9_15[10], r9_15[11]); - printk("s3 = %016lx s4 = %016lx s5 = %016lx\n", - r9_15[12], r9_15[13], r9_15[14]); - printk("s6 = %016lx\n", r9_15[15]); - } + printk("s0 = %016lx s1 = %016lx s2 = %016lx\n", + regs->r9, regs->r10, regs->r11); + printk("s3 = %016lx s4 = %016lx s5 = %016lx\n", + regs->r12, regs->r13, regs->r14); + printk("s6 = %016lx\n", + regs->r15); printk("a0 = %016lx a1 = %016lx a2 = %016lx\n", regs->r16, regs->r17, regs->r18); @@ -117,8 +115,7 @@ void show_stack(struct task_struct *task, unsigned long *sp, const char *loglvl) dik_show_trace(sp, loglvl); } -void -die_if_kernel(char *str, struct pt_regs *regs, long err, unsigned long *r9_15) +void die_if_kernel(char *str, struct pt_regs *regs, long err) { if (regs->ps & 8) return; @@ -126,7 +123,7 @@ die_if_kernel(char *str, struct pt_regs *regs, long err, unsigned long *r9_15) printk("CPU %d ", hard_smp_processor_id()); #endif printk("%s(%d): %s %ld\n", current->comm, task_pid_nr(current), str, err); - dik_show_regs(regs, r9_15); + dik_show_regs(regs); add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE); dik_show_trace((unsigned long *)(regs+1), KERN_DEFAULT); dik_show_code((unsigned int *)regs->pc); @@ -178,7 +175,7 @@ do_entArith(unsigned long summary, unsigned long write_mask, if (si_code == 0) return; } - die_if_kernel("Arithmetic fault", regs, 0, NULL); + die_if_kernel("Arithmetic fault", regs, 0); force_sig_fault(SIGFPE, si_code, (void __user *)regs->pc, 0); } @@ -205,7 +202,7 @@ do_entIF(unsigned long inst_type, struct pt_regs *regs) return; } die_if_kernel((type == 1 ? "Kernel Bug" : "Instruction fault"), - regs, type, NULL); + regs, type); } switch (type) { @@ -297,7 +294,7 @@ do_entIF(unsigned long inst_type, struct pt_regs *regs) return; } if ((regs->ps & ~IPL_MAX) == 0) - die_if_kernel("Instruction fault", regs, type, NULL); + die_if_kernel("Instruction fault", regs, type); break; case 3: /* FEN fault */ diff --git a/arch/sw_64/mm/fault.c b/arch/sw_64/mm/fault.c index b580450893ba..9562bc3ba37f 100644 --- a/arch/sw_64/mm/fault.c +++ b/arch/sw_64/mm/fault.c @@ -31,8 +31,8 @@ static inline int notify_page_fault(struct pt_regs *regs, unsigned long mmcsr) } #endif -extern void die_if_kernel(char *, struct pt_regs *, long, unsigned long *); -extern void dik_show_regs(struct pt_regs *regs, unsigned long *r9_15); +extern void die_if_kernel(char *, struct pt_regs *, long); +extern void dik_show_regs(struct pt_regs *regs); void show_all_vma(void) { @@ -305,7 +305,7 @@ do_page_fault(unsigned long address, unsigned long mmcsr, */ pr_alert("Unable to handle kernel paging request at virtual address %016lx\n", address); - die_if_kernel("Oops", regs, cause, (unsigned long *)regs - 16); + die_if_kernel("Oops", regs, cause); do_exit(SIGKILL); /* @@ -336,7 +336,7 @@ do_page_fault(unsigned long address, unsigned long mmcsr, if (unlikely(segv_debug_enabled)) { pr_info("fault: want to send_segv: pid %d, cause = %#lx, mmcsr = %#lx, address = %#lx, pc %#lx\n", current->pid, cause, mmcsr, address, regs->pc); - dik_show_regs(regs, (unsigned long *)regs-16); + dik_show_regs(regs); show_all_vma(); } diff --git a/arch/sw_64/mm/init.c b/arch/sw_64/mm/init.c index 7fcd3d834ba5..3593e4b13319 100644 --- a/arch/sw_64/mm/init.c +++ b/arch/sw_64/mm/init.c @@ -13,8 +13,6 @@ #include -extern void die_if_kernel(char *, struct pt_regs *, long); - struct mem_desc_t mem_desc; #ifndef CONFIG_NUMA struct numa_node_desc_t numa_nodes_desc[1]; -- Gitee From 9edfcf89f2df44eda3a312e0c958914f391a14cf Mon Sep 17 00:00:00 2001 From: He Sheng Date: Sun, 24 Apr 2022 09:52:46 +0800 Subject: [PATCH 099/674] sw64: remove switch_stack from entMM and entSys Sunway inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5GF7A -------------------------------- This patch removes switch_stack and its usages in mm fault entry and system call entry. Note that ptrace accesses registers with struct pt_regs only since strace gave up a switch_stack trick. Signed-off-by: He Sheng Signed-off-by: Gu Zitao --- arch/sw_64/kernel/entry.S | 42 +++----------------------------------- arch/sw_64/kernel/ptrace.c | 12 ++--------- 2 files changed, 5 insertions(+), 49 deletions(-) diff --git a/arch/sw_64/kernel/entry.S b/arch/sw_64/kernel/entry.S index ac62461aa9dc..f843acd88249 100644 --- a/arch/sw_64/kernel/entry.S +++ b/arch/sw_64/kernel/entry.S @@ -133,31 +133,11 @@ entArith: .ent entMM entMM: SAVE_ALL -/* save $9 - $15 so the inline exception code can manipulate them. */ - subl $sp, SWITCH_STACK_RA, $sp - stl $9, SWITCH_STACK_R9($sp) - stl $10, SWITCH_STACK_R10($sp) - stl $11, SWITCH_STACK_R11($sp) - stl $12, SWITCH_STACK_R12($sp) - stl $13, SWITCH_STACK_R13($sp) - stl $14, SWITCH_STACK_R14($sp) - stl $15, SWITCH_STACK_R15($sp) - addl $sp, SWITCH_STACK_RA, $19 -/* handle the fault */ ldi $8, 0x3fff + ldi $26, ret_from_sys_call bic $sp, $8, $8 - call $26, do_page_fault -/* reload the registers after the exception code played. */ - ldl $9, SWITCH_STACK_R9($sp) - ldl $10, SWITCH_STACK_R10($sp) - ldl $11, SWITCH_STACK_R11($sp) - ldl $12, SWITCH_STACK_R12($sp) - ldl $13, SWITCH_STACK_R13($sp) - ldl $14, SWITCH_STACK_R14($sp) - ldl $15, SWITCH_STACK_R15($sp) - addl $sp, SWITCH_STACK_RA, $sp -/* finish up the syscall as normal. */ - br ret_from_sys_call + mov $sp, $19 + call $31, do_page_fault .end entMM .align 4 @@ -400,9 +380,7 @@ $work_resched: $work_notifysig: mov $sp, $16 - bsr $1, do_switch_stack call $26, do_work_pending - bsr $1, undo_switch_stack br restore_all .end work_pending @@ -416,14 +394,9 @@ $work_notifysig: .ent strace strace: /* set up signal stack, call syscall_trace */ - bsr $1, do_switch_stack mov $0, $9 mov $19, $10 call $26, syscall_trace_enter - mov $9, $18 - mov $10, $19 - bsr $1, undo_switch_stack - blt $0, $syscall_trace_failed /* get the system call number and the arguments back.. */ @@ -452,10 +425,7 @@ ret_from_straced: stl $31, PT_REGS_R19($sp) /* a3=0 => no error */ $strace_success: stl $0, PT_REGS_R0($sp) /* save return value */ - - bsr $1, do_switch_stack call $26, syscall_trace_leave - bsr $1, undo_switch_stack br $31, ret_from_sys_call .align 3 @@ -470,25 +440,19 @@ $strace_error: stl $0, PT_REGS_R0($sp) stl $1, PT_REGS_R19($sp) /* a3 for return */ - bsr $1, do_switch_stack mov $18, $9 /* save old syscall number */ mov $19, $10 /* save old a3 */ call $26, syscall_trace_leave mov $9, $18 mov $10, $19 - bsr $1, undo_switch_stack mov $31, $26 /* tell "ret_from_sys_call" we can restart */ br ret_from_sys_call $syscall_trace_failed: - bsr $1, do_switch_stack - mov $18, $9 - mov $19, $10 call $26, syscall_trace_leave mov $9, $18 mov $10, $19 - bsr $1, undo_switch_stack mov $31, $26 /* tell "ret_from_sys_call" we can restart */ br ret_from_sys_call .end strace diff --git a/arch/sw_64/kernel/ptrace.c b/arch/sw_64/kernel/ptrace.c index 3b29ff7114f7..097590b22a5a 100644 --- a/arch/sw_64/kernel/ptrace.c +++ b/arch/sw_64/kernel/ptrace.c @@ -33,10 +33,6 @@ * | frame generated by SAVE_ALL | | * | | v * +================================+ - * | | ^ - * | frame saved by do_switch_stack | | struct switch_stack - * | | v - * +================================+ */ /* @@ -59,18 +55,14 @@ enum { #define PT_REG(reg) \ (PAGE_SIZE * 2 - sizeof(struct pt_regs) + offsetof(struct pt_regs, reg)) -#define SW_REG(reg) \ - (PAGE_SIZE * 2 - sizeof(struct pt_regs) - sizeof(struct switch_stack) \ - + offsetof(struct switch_stack, reg)) - #define FP_REG(fp_regno, vector_regno) \ (fp_regno * 32 + vector_regno * 8) static int regoff[] = { PT_REG(r0), PT_REG(r1), PT_REG(r2), PT_REG(r3), PT_REG(r4), PT_REG(r5), PT_REG(r6), PT_REG(r7), - PT_REG(r8), SW_REG(r9), SW_REG(r10), SW_REG(r11), - SW_REG(r12), SW_REG(r13), SW_REG(r14), SW_REG(r15), + PT_REG(r8), PT_REG(r9), PT_REG(r10), PT_REG(r11), + PT_REG(r12), PT_REG(r13), PT_REG(r14), PT_REG(r15), PT_REG(r16), PT_REG(r17), PT_REG(r18), PT_REG(r19), PT_REG(r20), PT_REG(r21), PT_REG(r22), PT_REG(r23), PT_REG(r24), PT_REG(r25), PT_REG(r26), PT_REG(r27), -- Gitee From 40f13ed00ba08ca82c764d04f78fdf08d7583263 Mon Sep 17 00:00:00 2001 From: He Sheng Date: Sun, 24 Apr 2022 12:01:24 +0800 Subject: [PATCH 100/674] sw64: remove switch_stack from signal handling Sunway inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5GF7A -------------------------------- The new pt_regs struct holds r9 ~ r15, and it's sufficient to setup and restore sigcontext. Take care that fpregs state of current task should be saved and restored explicitly after switch_stack removed. Signed-off-by: He Sheng Signed-off-by: Gu Zitao --- arch/sw_64/kernel/entry.S | 8 ++------ arch/sw_64/kernel/signal.c | 38 ++++++++++++++++++-------------------- 2 files changed, 20 insertions(+), 26 deletions(-) diff --git a/arch/sw_64/kernel/entry.S b/arch/sw_64/kernel/entry.S index f843acd88249..730346754ecc 100644 --- a/arch/sw_64/kernel/entry.S +++ b/arch/sw_64/kernel/entry.S @@ -663,12 +663,10 @@ sys_sigreturn: .prologue 0 ldi $9, ret_from_straced cmpult $26, $9, $9 - ldi $sp, -SWITCH_STACK_SIZE($sp) call $26, do_sigreturn bne $9, 1f call $26, syscall_trace_leave -1: br $1, undo_switch_stack - br ret_from_sys_call +1: br ret_from_sys_call .end sys_sigreturn .align 4 @@ -678,12 +676,10 @@ sys_rt_sigreturn: .prologue 0 ldi $9, ret_from_straced cmpult $26, $9, $9 - ldi $sp, -SWITCH_STACK_SIZE($sp) call $26, do_rt_sigreturn bne $9, 1f call $26, syscall_trace_leave -1: br $1, undo_switch_stack - br ret_from_sys_call +1: br ret_from_sys_call .end sys_rt_sigreturn .align 4 diff --git a/arch/sw_64/kernel/signal.c b/arch/sw_64/kernel/signal.c index dd0d8ff42420..887326812624 100644 --- a/arch/sw_64/kernel/signal.c +++ b/arch/sw_64/kernel/signal.c @@ -14,6 +14,7 @@ #include #include +#include #include "proto.h" @@ -22,8 +23,6 @@ #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) -asmlinkage void ret_from_sys_call(void); - SYSCALL_DEFINE2(odd_sigprocmask, int, how, unsigned long, newmask) { sigset_t oldmask; @@ -64,13 +63,10 @@ static long restore_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs) { unsigned long usp; - struct switch_stack *sw = (struct switch_stack *)regs - 1; long err = __get_user(regs->pc, &sc->sc_pc); current->restart_block.fn = do_no_restart_syscall; - sw->r26 = (unsigned long) ret_from_sys_call; - err |= __get_user(regs->r0, sc->sc_regs+0); err |= __get_user(regs->r1, sc->sc_regs+1); err |= __get_user(regs->r2, sc->sc_regs+2); @@ -80,13 +76,13 @@ restore_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs) err |= __get_user(regs->r6, sc->sc_regs+6); err |= __get_user(regs->r7, sc->sc_regs+7); err |= __get_user(regs->r8, sc->sc_regs+8); - err |= __get_user(sw->r9, sc->sc_regs+9); - err |= __get_user(sw->r10, sc->sc_regs+10); - err |= __get_user(sw->r11, sc->sc_regs+11); - err |= __get_user(sw->r12, sc->sc_regs+12); - err |= __get_user(sw->r13, sc->sc_regs+13); - err |= __get_user(sw->r14, sc->sc_regs+14); - err |= __get_user(sw->r15, sc->sc_regs+15); + err |= __get_user(regs->r9, sc->sc_regs+9); + err |= __get_user(regs->r10, sc->sc_regs+10); + err |= __get_user(regs->r11, sc->sc_regs+11); + err |= __get_user(regs->r12, sc->sc_regs+12); + err |= __get_user(regs->r13, sc->sc_regs+13); + err |= __get_user(regs->r14, sc->sc_regs+14); + err |= __get_user(regs->r15, sc->sc_regs+15); err |= __get_user(regs->r16, sc->sc_regs+16); err |= __get_user(regs->r17, sc->sc_regs+17); err |= __get_user(regs->r18, sc->sc_regs+18); @@ -107,6 +103,8 @@ restore_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs) err |= __copy_from_user(¤t->thread.ctx_fp, &sc->sc_fpregs, sizeof(struct context_fpregs)); err |= __get_user(current->thread.fpcr, &sc->sc_fpcr); + if (likely(!err)) + __fpstate_restore(current); return err; } @@ -191,7 +189,6 @@ static long setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned long mask, unsigned long sp) { - struct switch_stack *sw = (struct switch_stack *)regs - 1; long err = 0; err |= __put_user(on_sig_stack((unsigned long)sc), &sc->sc_onstack); @@ -208,13 +205,13 @@ setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, err |= __put_user(regs->r6, sc->sc_regs+6); err |= __put_user(regs->r7, sc->sc_regs+7); err |= __put_user(regs->r8, sc->sc_regs+8); - err |= __put_user(sw->r9, sc->sc_regs+9); - err |= __put_user(sw->r10, sc->sc_regs+10); - err |= __put_user(sw->r11, sc->sc_regs+11); - err |= __put_user(sw->r12, sc->sc_regs+12); - err |= __put_user(sw->r13, sc->sc_regs+13); - err |= __put_user(sw->r14, sc->sc_regs+14); - err |= __put_user(sw->r15, sc->sc_regs+15); + err |= __put_user(regs->r9, sc->sc_regs+9); + err |= __put_user(regs->r10, sc->sc_regs+10); + err |= __put_user(regs->r11, sc->sc_regs+11); + err |= __put_user(regs->r12, sc->sc_regs+12); + err |= __put_user(regs->r13, sc->sc_regs+13); + err |= __put_user(regs->r14, sc->sc_regs+14); + err |= __put_user(regs->r15, sc->sc_regs+15); err |= __put_user(regs->r16, sc->sc_regs+16); err |= __put_user(regs->r17, sc->sc_regs+17); err |= __put_user(regs->r18, sc->sc_regs+18); @@ -232,6 +229,7 @@ setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, err |= __put_user(sp, sc->sc_regs+30); err |= __put_user(0, sc->sc_regs+31); /* simd-fp */ + __fpstate_save(current); err |= __copy_to_user(&sc->sc_fpregs, ¤t->thread.ctx_fp, sizeof(struct context_fpregs)); err |= __put_user(current->thread.fpcr, &sc->sc_fpcr); -- Gitee From ba8c73ce31bd4d5375b0d005b1f89d46c73477dc Mon Sep 17 00:00:00 2001 From: He Sheng Date: Sun, 24 Apr 2022 16:51:34 +0800 Subject: [PATCH 101/674] sw64: dump callee-saved registers from pt_regs Sunway inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5GF7A -------------------------------- Signed-off-by: He Sheng Signed-off-by: Gu Zitao --- arch/sw_64/kernel/process.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/arch/sw_64/kernel/process.c b/arch/sw_64/kernel/process.c index 9a6812851b15..4e7a1418af30 100644 --- a/arch/sw_64/kernel/process.c +++ b/arch/sw_64/kernel/process.c @@ -199,9 +199,6 @@ copy_thread(unsigned long clone_flags, unsigned long usp, void dump_elf_thread(elf_greg_t *dest, struct pt_regs *pt, struct thread_info *ti) { - /* switch stack follows right below pt_regs: */ - struct switch_stack *sw = ((struct switch_stack *) pt) - 1; - dest[0] = pt->r0; dest[1] = pt->r1; dest[2] = pt->r2; @@ -211,13 +208,13 @@ dump_elf_thread(elf_greg_t *dest, struct pt_regs *pt, struct thread_info *ti) dest[6] = pt->r6; dest[7] = pt->r7; dest[8] = pt->r8; - dest[9] = sw->r9; - dest[10] = sw->r10; - dest[11] = sw->r11; - dest[12] = sw->r12; - dest[13] = sw->r13; - dest[14] = sw->r14; - dest[15] = sw->r15; + dest[9] = pt->r9; + dest[10] = pt->r10; + dest[11] = pt->r11; + dest[12] = pt->r12; + dest[13] = pt->r13; + dest[14] = pt->r14; + dest[15] = pt->r15; dest[16] = pt->r16; dest[17] = pt->r17; dest[18] = pt->r18; -- Gitee From c4c31c88e37cf8ad3a6797dba141c0ed162b5e93 Mon Sep 17 00:00:00 2001 From: He Sheng Date: Sun, 24 Apr 2022 16:51:45 +0800 Subject: [PATCH 102/674] sw64: get blocked thread's frame pointer from thread_struct Sunway inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5GF7A -------------------------------- The thread_struct holds all provided the thread blocked through a call to schedule(). $15 is the frame pointer in schedule() and it is saved at thread_struct.s[6]. With frame pointer, it can return saved PC of a blocked thread which assumes that the saved return address is the first long in the frame. Signed-off-by: He Sheng Signed-off-by: Gu Zitao --- arch/sw_64/kernel/process.c | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/arch/sw_64/kernel/process.c b/arch/sw_64/kernel/process.c index 4e7a1418af30..1f093c5a5235 100644 --- a/arch/sw_64/kernel/process.c +++ b/arch/sw_64/kernel/process.c @@ -257,13 +257,6 @@ dump_elf_task_fp(elf_fpreg_t *dest, struct task_struct *task) EXPORT_SYMBOL(dump_elf_task_fp); /* - * Return saved PC of a blocked thread. This assumes the frame - * pointer is the 6th saved long on the kernel stack and that the - * saved return address is the first long in the frame. This all - * holds provided the thread blocked through a call to schedule() ($15 - * is the frame pointer in schedule() and $15 is saved at offset 48 by - * entry.S:do_switch_stack). - * * Under heavy swap load I've seen this lose in an ugly way. So do * some extra sanity checking on the ranges we expect these pointers * to be in so that we can fail gracefully. This is just for ps after @@ -273,14 +266,14 @@ EXPORT_SYMBOL(dump_elf_task_fp); unsigned long thread_saved_pc(struct task_struct *t) { - unsigned long base = (unsigned long)task_stack_page(t); - unsigned long fp, sp = task_thread_info(t)->pcb.ksp; + unsigned long top, fp, sp; - if (sp > base && sp+6*8 < base + 16*1024) { - fp = ((unsigned long *)sp)[6]; - if (fp > sp && fp < base + 16*1024) - return *(unsigned long *)fp; - } + top = (unsigned long)task_stack_page(t) + 2 * PAGE_SIZE; + sp = task_thread_info(t)->pcb.ksp; + fp = t->thread.s[6]; + + if (fp > sp && fp < top) + return *(unsigned long *)fp; return 0; } @@ -289,7 +282,7 @@ unsigned long get_wchan(struct task_struct *p) { unsigned long schedule_frame; - unsigned long pc, base, sp; + unsigned long pc, top, sp; if (!p || p == current || p->state == TASK_RUNNING) return 0; @@ -305,10 +298,10 @@ get_wchan(struct task_struct *p) pc = thread_saved_pc(p); if (in_sched_functions(pc)) { - base = (unsigned long)task_stack_page(p); + top = (unsigned long)task_stack_page(p) + 2 * PAGE_SIZE; sp = task_thread_info(p)->pcb.ksp; - schedule_frame = ((unsigned long *)sp)[6]; - if (schedule_frame > sp && schedule_frame < base + 16*1024) + schedule_frame = p->thread.s[6]; + if (schedule_frame > sp && schedule_frame < top) return ((unsigned long *)schedule_frame)[12]; } return pc; -- Gitee From 72ed1e6e2775b868bb84d55f3c13ff8cd204d1d8 Mon Sep 17 00:00:00 2001 From: He Sheng Date: Mon, 25 Apr 2022 12:57:28 +0800 Subject: [PATCH 103/674] sw64: remove switch_stack and allregs from entUna Sunway inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5GF7A -------------------------------- It's simple to maintain unified register layout for user-mode and kernel-mode unaligned accesses. With the new struct pt_regs, we are able to make the exception handler cleaner. After that, the unused struct allregs can be cleaned up. Signed-off-by: He Sheng Signed-off-by: Gu Zitao --- arch/sw_64/kernel/asm-offsets.c | 41 ------------ arch/sw_64/kernel/entry.S | 109 +++++--------------------------- arch/sw_64/kernel/traps.c | 76 +++++----------------- 3 files changed, 33 insertions(+), 193 deletions(-) diff --git a/arch/sw_64/kernel/asm-offsets.c b/arch/sw_64/kernel/asm-offsets.c index 349bc5c4e2f1..0e8a814f47ce 100644 --- a/arch/sw_64/kernel/asm-offsets.c +++ b/arch/sw_64/kernel/asm-offsets.c @@ -111,47 +111,6 @@ void foo(void) DEFINE(SWITCH_STACK_RA, offsetof(struct switch_stack, r26)); BLANK(); - DEFINE(ALLREGS_SIZE, sizeof(struct allregs)); - DEFINE(ALLREGS_R0, offsetof(struct allregs, regs[0])); - DEFINE(ALLREGS_R1, offsetof(struct allregs, regs[1])); - DEFINE(ALLREGS_R2, offsetof(struct allregs, regs[2])); - DEFINE(ALLREGS_R3, offsetof(struct allregs, regs[3])); - DEFINE(ALLREGS_R4, offsetof(struct allregs, regs[4])); - DEFINE(ALLREGS_R5, offsetof(struct allregs, regs[5])); - DEFINE(ALLREGS_R6, offsetof(struct allregs, regs[6])); - DEFINE(ALLREGS_R7, offsetof(struct allregs, regs[7])); - DEFINE(ALLREGS_R8, offsetof(struct allregs, regs[8])); - DEFINE(ALLREGS_R9, offsetof(struct allregs, regs[9])); - DEFINE(ALLREGS_R10, offsetof(struct allregs, regs[10])); - DEFINE(ALLREGS_R11, offsetof(struct allregs, regs[11])); - DEFINE(ALLREGS_R12, offsetof(struct allregs, regs[12])); - DEFINE(ALLREGS_R13, offsetof(struct allregs, regs[13])); - DEFINE(ALLREGS_R14, offsetof(struct allregs, regs[14])); - DEFINE(ALLREGS_R15, offsetof(struct allregs, regs[15])); - DEFINE(ALLREGS_R16, offsetof(struct allregs, regs[16])); - DEFINE(ALLREGS_R17, offsetof(struct allregs, regs[17])); - DEFINE(ALLREGS_R18, offsetof(struct allregs, regs[18])); - DEFINE(ALLREGS_R19, offsetof(struct allregs, regs[19])); - DEFINE(ALLREGS_R20, offsetof(struct allregs, regs[20])); - DEFINE(ALLREGS_R21, offsetof(struct allregs, regs[21])); - DEFINE(ALLREGS_R22, offsetof(struct allregs, regs[22])); - DEFINE(ALLREGS_R23, offsetof(struct allregs, regs[23])); - DEFINE(ALLREGS_R24, offsetof(struct allregs, regs[24])); - DEFINE(ALLREGS_R25, offsetof(struct allregs, regs[25])); - DEFINE(ALLREGS_R26, offsetof(struct allregs, regs[26])); - DEFINE(ALLREGS_R27, offsetof(struct allregs, regs[27])); - DEFINE(ALLREGS_R28, offsetof(struct allregs, regs[28])); - DEFINE(ALLREGS_R29, offsetof(struct allregs, regs[29])); - DEFINE(ALLREGS_R30, offsetof(struct allregs, regs[30])); - DEFINE(ALLREGS_R31, offsetof(struct allregs, regs[31])); - DEFINE(ALLREGS_PS, offsetof(struct allregs, ps)); - DEFINE(ALLREGS_PC, offsetof(struct allregs, pc)); - DEFINE(ALLREGS_GP, offsetof(struct allregs, gp)); - DEFINE(ALLREGS_A0, offsetof(struct allregs, a0)); - DEFINE(ALLREGS_A1, offsetof(struct allregs, a1)); - DEFINE(ALLREGS_A2, offsetof(struct allregs, a2)); - BLANK(); - DEFINE(KVM_REGS_SIZE, sizeof(struct kvm_regs)); DEFINE(KVM_REGS_R0, offsetof(struct kvm_regs, r0)); DEFINE(KVM_REGS_R1, offsetof(struct kvm_regs, r1)); diff --git a/arch/sw_64/kernel/entry.S b/arch/sw_64/kernel/entry.S index 730346754ecc..f54b02c1b3a9 100644 --- a/arch/sw_64/kernel/entry.S +++ b/arch/sw_64/kernel/entry.S @@ -152,109 +152,32 @@ entIF: call $31, do_entIF .end entIF +/* + * Handle unalignment exception. + * We don't handle the "gp" register correctly, but if we fault on a + * gp-register unaligned load/store, something is _very_ wrong in the + * kernel anyway. + */ .align 4 .globl entUna .ent entUna entUna: - ldi $sp, -ALLREGS_PS($sp) - stl $0, ALLREGS_R0($sp) - ldl $0, ALLREGS_PS($sp) /* get PS */ - stl $1, ALLREGS_R1($sp) - stl $2, ALLREGS_R2($sp) - stl $3, ALLREGS_R3($sp) - and $0, 8, $0 /* user mode? */ - stl $4, ALLREGS_R4($sp) - bne $0, entUnaUser /* yup -> do user-level unaligned fault */ - stl $5, ALLREGS_R5($sp) - stl $6, ALLREGS_R6($sp) - stl $7, ALLREGS_R7($sp) - stl $8, ALLREGS_R8($sp) - stl $9, ALLREGS_R9($sp) - stl $10, ALLREGS_R10($sp) - stl $11, ALLREGS_R11($sp) - stl $12, ALLREGS_R12($sp) - stl $13, ALLREGS_R13($sp) - stl $14, ALLREGS_R14($sp) - stl $15, ALLREGS_R15($sp) - /* 16-18 HMCODE-saved */ - stl $19, ALLREGS_R19($sp) - stl $20, ALLREGS_R20($sp) - stl $21, ALLREGS_R21($sp) - stl $22, ALLREGS_R22($sp) - stl $23, ALLREGS_R23($sp) - stl $24, ALLREGS_R24($sp) - stl $25, ALLREGS_R25($sp) - stl $26, ALLREGS_R26($sp) - stl $27, ALLREGS_R27($sp) - stl $28, ALLREGS_R28($sp) - mov $sp, $19 - stl $gp, ALLREGS_R29($sp) + SAVE_ALL ldi $8, 0x3fff - stl $31, ALLREGS_R31($sp) bic $sp, $8, $8 + mov $sp, $19 + ldl $0, PT_REGS_PS($sp) + and $0, 8, $0 /* user mode ? */ + beq $0, 1f + ldi $26, ret_from_sys_call + call $31, do_entUnaUser /* return to ret_from_syscall */ +1: ldl $9, PT_REGS_GP($sp) call $26, do_entUna - ldl $0, ALLREGS_R0($sp) - ldl $1, ALLREGS_R1($sp) - ldl $2, ALLREGS_R2($sp) - ldl $3, ALLREGS_R3($sp) - ldl $4, ALLREGS_R4($sp) - ldl $5, ALLREGS_R5($sp) - ldl $6, ALLREGS_R6($sp) - ldl $7, ALLREGS_R7($sp) - ldl $8, ALLREGS_R8($sp) - ldl $9, ALLREGS_R9($sp) - ldl $10, ALLREGS_R10($sp) - ldl $11, ALLREGS_R11($sp) - ldl $12, ALLREGS_R12($sp) - ldl $13, ALLREGS_R13($sp) - ldl $14, ALLREGS_R14($sp) - ldl $15, ALLREGS_R15($sp) - /* 16-18 HMCODE-saved */ - ldl $19, ALLREGS_R19($sp) - ldl $20, ALLREGS_R20($sp) - ldl $21, ALLREGS_R21($sp) - ldl $22, ALLREGS_R22($sp) - ldl $23, ALLREGS_R23($sp) - ldl $24, ALLREGS_R24($sp) - ldl $25, ALLREGS_R25($sp) - ldl $26, ALLREGS_R26($sp) - ldl $27, ALLREGS_R27($sp) - ldl $28, ALLREGS_R28($sp) - ldl $gp, ALLREGS_R29($sp) - ldi $sp, ALLREGS_PS($sp) + stl $9, PT_REGS_GP($sp) + RESTORE_ALL sys_call HMC_rti .end entUna - .align 4 - .ent entUnaUser -entUnaUser: - ldl $0, ALLREGS_R0($sp) /* restore original $0 */ - ldi $sp, ALLREGS_PS($sp) /* pop entUna's stack frame */ - SAVE_ALL /* setup normal kernel stack */ - ldi $sp, -SWITCH_STACK_RA($sp) - stl $9, SWITCH_STACK_R9($sp) - stl $10, SWITCH_STACK_R10($sp) - stl $11, SWITCH_STACK_R11($sp) - stl $12, SWITCH_STACK_R12($sp) - stl $13, SWITCH_STACK_R13($sp) - stl $14, SWITCH_STACK_R14($sp) - stl $15, SWITCH_STACK_R15($sp) - ldi $8, 0x3fff - addl $sp, SWITCH_STACK_RA, $19 - bic $sp, $8, $8 - call $26, do_entUnaUser - ldl $9, SWITCH_STACK_R9($sp) - ldl $10, SWITCH_STACK_R10($sp) - ldl $11, SWITCH_STACK_R11($sp) - ldl $12, SWITCH_STACK_R12($sp) - ldl $13, SWITCH_STACK_R13($sp) - ldl $14, SWITCH_STACK_R14($sp) - ldl $15, SWITCH_STACK_R15($sp) - ldi $sp, SWITCH_STACK_RA($sp) - br ret_from_sys_call - .end entUnaUser - - /* * The system call entry point is special. Most importantly, it looks * like a function call to userspace as far as clobbered registers. We diff --git a/arch/sw_64/kernel/traps.c b/arch/sw_64/kernel/traps.c index b7c4e45df87b..6cd3ade905d1 100644 --- a/arch/sw_64/kernel/traps.c +++ b/arch/sw_64/kernel/traps.c @@ -321,41 +321,35 @@ do_entIF(unsigned long inst_type, struct pt_regs *regs) force_sig_fault(SIGILL, ILL_ILLOPC, (void __user *)regs->pc, 0); } -/* - * entUna has a different register layout to be reasonably simple. It - * needs access to all the integer registers (the kernel doesn't use - * fp-regs), and it needs to have them in order for simpler access. - * - * Due to the non-standard register layout (and because we don't want - * to handle floating-point regs), user-mode unaligned accesses are - * handled separately by do_entUnaUser below. - * - * Oh, btw, we don't handle the "gp" register correctly, but if we fault - * on a gp-register unaligned load/store, something is _very_ wrong - * in the kernel anyway.. - */ -struct allregs { - unsigned long regs[32]; - unsigned long ps, pc, gp, a0, a1, a2; -}; - struct unaligned_stat { unsigned long count, va, pc; } unaligned[2]; /* Macro for exception fixup code to access integer registers. */ -#define una_reg(r) (_regs[(r) >= 16 && (r) <= 18 ? (r) + 19 : (r)]) +#define R(x) ((size_t) &((struct pt_regs *)0)->x) + +static int regoffsets[32] = { + R(r0), R(r1), R(r2), R(r3), R(r4), R(r5), R(r6), R(r7), R(r8), + R(r9), R(r10), R(r11), R(r12), R(r13), R(r14), R(r15), + R(r16), R(r17), R(r18), + R(r19), R(r20), R(r21), R(r22), R(r23), R(r24), R(r25), R(r26), + R(r27), R(r28), R(gp), + 0, 0 +}; + +#undef R + +#define una_reg(r) (*(unsigned long *)((char *)regs + regoffsets[r])) asmlinkage void do_entUna(void *va, unsigned long opcode, unsigned long reg, - struct allregs *regs) + struct pt_regs *regs) { long error; unsigned long tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; unsigned long pc = regs->pc - 4; - unsigned long *_regs = regs->regs; const struct exception_table_entry *fixup; unaligned[0].count++; @@ -566,29 +560,7 @@ do_entUna(void *va, unsigned long opcode, unsigned long reg, printk("%s(%d): unhandled unaligned exception\n", current->comm, task_pid_nr(current)); - printk("pc = [<%016lx>] ra = [<%016lx>] ps = %04lx\n", - pc, una_reg(26), regs->ps); - printk("r0 = %016lx r1 = %016lx r2 = %016lx\n", - una_reg(0), una_reg(1), una_reg(2)); - printk("r3 = %016lx r4 = %016lx r5 = %016lx\n", - una_reg(3), una_reg(4), una_reg(5)); - printk("r6 = %016lx r7 = %016lx r8 = %016lx\n", - una_reg(6), una_reg(7), una_reg(8)); - printk("r9 = %016lx r10= %016lx r11= %016lx\n", - una_reg(9), una_reg(10), una_reg(11)); - printk("r12= %016lx r13= %016lx r14= %016lx\n", - una_reg(12), una_reg(13), una_reg(14)); - printk("r15= %016lx\n", una_reg(15)); - printk("r16= %016lx r17= %016lx r18= %016lx\n", - una_reg(16), una_reg(17), una_reg(18)); - printk("r19= %016lx r20= %016lx r21= %016lx\n", - una_reg(19), una_reg(20), una_reg(21)); - printk("r22= %016lx r23= %016lx r24= %016lx\n", - una_reg(22), una_reg(23), una_reg(24)); - printk("r25= %016lx r27= %016lx r28= %016lx\n", - una_reg(25), una_reg(27), una_reg(28)); - printk("gp = %016lx sp = %p\n", regs->gp, regs+1); - + dik_show_regs(regs); dik_show_code((unsigned int *)pc); dik_show_trace((unsigned long *)(regs+1), KERN_DEFAULT); @@ -668,20 +640,6 @@ s_reg_to_mem(unsigned long s_reg) 1L << 0x2c | 1L << 0x2d | /* stw stl */ \ 1L << 0x0d | 1L << 0x0e) /* sth stb */ -#define R(x) ((size_t) &((struct pt_regs *)0)->x) - -static int unauser_reg_offsets[32] = { - R(r0), R(r1), R(r2), R(r3), R(r4), R(r5), R(r6), R(r7), R(r8), - /* r9 ... r15 are stored in front of regs. */ - -56, -48, -40, -32, -24, -16, -8, - R(r16), R(r17), R(r18), - R(r19), R(r20), R(r21), R(r22), R(r23), R(r24), R(r25), R(r26), - R(r27), R(r28), R(gp), - 0, 0 -}; - -#undef R - asmlinkage void do_entUnaUser(void __user *va, unsigned long opcode, unsigned long reg, struct pt_regs *regs) @@ -734,7 +692,7 @@ do_entUnaUser(void __user *va, unsigned long opcode, /* it's an integer load/store */ if (reg < 30) { reg_addr = (unsigned long *) - ((char *)regs + unauser_reg_offsets[reg]); + ((char *)regs + regoffsets[reg]); } else if (reg == 30) { /* usp in HMCODE regs */ fake_reg = rdusp(); -- Gitee From b936d36d7ef16056e8b64dda3a0110a39a543bab Mon Sep 17 00:00:00 2001 From: He Sheng Date: Mon, 25 Apr 2022 15:09:11 +0800 Subject: [PATCH 104/674] sw64: remove switch_stack from __sw64_vcpu_run Sunway inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5GF7A -------------------------------- Since r9~r15 have been added to struct pt_regs, it's okay to save and restore host callee-saved integer registers with pt_regs. Signed-off-by: He Sheng Signed-off-by: Gu Zitao --- arch/sw_64/kvm/entry.S | 57 +++++++++++++++--------------------------- 1 file changed, 20 insertions(+), 37 deletions(-) diff --git a/arch/sw_64/kvm/entry.S b/arch/sw_64/kvm/entry.S index 76ebdda920cb..1e089da5378e 100644 --- a/arch/sw_64/kvm/entry.S +++ b/arch/sw_64/kvm/entry.S @@ -34,20 +34,15 @@ ENTRY(__sw64_vcpu_run) /* r18 = hcall args */ /* save host pt_regs to current kernel stack */ ldi sp, -PT_REGS_SIZE(sp) - - stl $8, PT_REGS_R8(sp) + stl $9, PT_REGS_R9(sp) + stl $10, PT_REGS_R10(sp) + stl $11, PT_REGS_R11(sp) + stl $12, PT_REGS_R12(sp) + stl $13, PT_REGS_R13(sp) + stl $14, PT_REGS_R14(sp) + stl $15, PT_REGS_R15(sp) stl $26, PT_REGS_R26(sp) - /* save host switch stack to current kernel stack */ - ldi sp, -SWITCH_STACK_SIZE(sp) - stl $9, SWITCH_STACK_R9(sp) - stl $10, SWITCH_STACK_R10(sp) - stl $11, SWITCH_STACK_R11(sp) - stl $12, SWITCH_STACK_R12(sp) - stl $13, SWITCH_STACK_R13(sp) - stl $14, SWITCH_STACK_R14(sp) - stl $15, SWITCH_STACK_R15(sp) - /* restore guest switch stack from guest kvm_regs struct */ ldl $0, KVM_REGS_R0($17) ldl $1, KVM_REGS_R1($17) @@ -203,23 +198,19 @@ $g_setfpec_over: stl $27, KVM_REGS_R27($17) stl $28, KVM_REGS_R28($17) - /* restore host switch stack from host sp */ - ldl $9, SWITCH_STACK_R9(sp) - ldl $10, SWITCH_STACK_R10(sp) - ldl $11, SWITCH_STACK_R11(sp) - ldl $12, SWITCH_STACK_R12(sp) - ldl $13, SWITCH_STACK_R13(sp) - ldl $14, SWITCH_STACK_R14(sp) - ldl $15, SWITCH_STACK_R15(sp) - - ldi sp, SWITCH_STACK_SIZE(sp) - /* restore host regs from host sp */ - ldl $8, PT_REGS_R8(sp) + ldl $9, PT_REGS_R9(sp) + ldl $10, PT_REGS_R10(sp) + ldl $11, PT_REGS_R11(sp) + ldl $12, PT_REGS_R12(sp) + ldl $13, PT_REGS_R13(sp) + ldl $14, PT_REGS_R14(sp) + ldl $15, PT_REGS_R15(sp) ldl $26, PT_REGS_R26(sp) - ldi sp, PT_REGS_SIZE(sp) + ldi $8, 0x3fff + bic sp, $8, $8 /* restore host fpregs */ ldl $1, TI_TASK($8) ldi $1, TASK_THREAD($1) @@ -261,25 +252,17 @@ $setfpec_over: /* Hmcode will setup in */ /* restore $16 $17 $18, do interrupt trick */ - ldi sp, -(HOST_INT_SIZE + PT_REGS_SIZE + SWITCH_STACK_SIZE)(sp) + ldi sp, -(HOST_INT_SIZE + PT_REGS_SIZE)(sp) ldl $16, HOST_INT_R16(sp) ldl $17, HOST_INT_R17(sp) ldl $18, HOST_INT_R18(sp) - ldi sp, (HOST_INT_SIZE + PT_REGS_SIZE + SWITCH_STACK_SIZE)(sp) + ldi sp, (HOST_INT_SIZE + PT_REGS_SIZE)(sp) - ldi $8, 0x3fff - bic sp, $8, $8 ldi $19, -PT_REGS_SIZE(sp) - - ldi $26, ret_from_do_entInt_noregs - call $31, do_entInt - - /* ret($0) indicate hcall number */ -ret_from_do_entInt_noregs: + call $26, do_entInt ldl $26, VCPU_RET_RA(sp) ldl $0, VCPU_RET_R0(sp) - - /* restore r16 - r19 */ $ret_to: + /* ret($0) indicate hcall number */ ldi sp, VCPU_RET_SIZE(sp) /* pop stack */ ret -- Gitee From b9b8f159d7d859a2a6d1fcfcfa49987af59f0c87 Mon Sep 17 00:00:00 2001 From: He Sheng Date: Mon, 25 Apr 2022 13:39:10 +0800 Subject: [PATCH 105/674] sw64: remove other struct switch_stack things Sunway inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5GF7A -------------------------------- Based on the previous series of patches, the do_switch_stack, undo_switch_stack and struct switch_stack become unused. Herein they are cleaned up. Signed-off-by: He Sheng Signed-off-by: Gu Zitao --- arch/sw_64/include/uapi/asm/ptrace.h | 15 ---- arch/sw_64/kernel/asm-offsets.c | 11 --- arch/sw_64/kernel/entry.S | 129 --------------------------- arch/sw_64/kernel/traps.c | 5 +- 4 files changed, 2 insertions(+), 158 deletions(-) diff --git a/arch/sw_64/include/uapi/asm/ptrace.h b/arch/sw_64/include/uapi/asm/ptrace.h index 4549e8d4bf57..deb7e4096c93 100644 --- a/arch/sw_64/include/uapi/asm/ptrace.h +++ b/arch/sw_64/include/uapi/asm/ptrace.h @@ -54,21 +54,6 @@ struct pt_regs { unsigned long r18; }; -/* - * This is the extended stack used by signal handlers and the context - * switcher: it's pushed after the normal "struct pt_regs". - */ -struct switch_stack { - unsigned long r9; - unsigned long r10; - unsigned long r11; - unsigned long r12; - unsigned long r13; - unsigned long r14; - unsigned long r15; - unsigned long r26; -}; - #define PTRACE_GETREGS 12 /* get general purpose registers */ #define PTRACE_SETREGS 13 /* set general purpose registers */ #define PTRACE_GETFPREGS 14 /* get floating-point registers */ diff --git a/arch/sw_64/kernel/asm-offsets.c b/arch/sw_64/kernel/asm-offsets.c index 0e8a814f47ce..5dc59346996f 100644 --- a/arch/sw_64/kernel/asm-offsets.c +++ b/arch/sw_64/kernel/asm-offsets.c @@ -100,17 +100,6 @@ void foo(void) DEFINE(PT_REGS_R18, offsetof(struct pt_regs, r18)); BLANK(); - DEFINE(SWITCH_STACK_SIZE, sizeof(struct switch_stack)); - DEFINE(SWITCH_STACK_R9, offsetof(struct switch_stack, r9)); - DEFINE(SWITCH_STACK_R10, offsetof(struct switch_stack, r10)); - DEFINE(SWITCH_STACK_R11, offsetof(struct switch_stack, r11)); - DEFINE(SWITCH_STACK_R12, offsetof(struct switch_stack, r12)); - DEFINE(SWITCH_STACK_R13, offsetof(struct switch_stack, r13)); - DEFINE(SWITCH_STACK_R14, offsetof(struct switch_stack, r14)); - DEFINE(SWITCH_STACK_R15, offsetof(struct switch_stack, r15)); - DEFINE(SWITCH_STACK_RA, offsetof(struct switch_stack, r26)); - BLANK(); - DEFINE(KVM_REGS_SIZE, sizeof(struct kvm_regs)); DEFINE(KVM_REGS_R0, offsetof(struct kvm_regs, r0)); DEFINE(KVM_REGS_R1, offsetof(struct kvm_regs, r1)); diff --git a/arch/sw_64/kernel/entry.S b/arch/sw_64/kernel/entry.S index f54b02c1b3a9..977c774ad799 100644 --- a/arch/sw_64/kernel/entry.S +++ b/arch/sw_64/kernel/entry.S @@ -380,135 +380,6 @@ $syscall_trace_failed: br ret_from_sys_call .end strace - .align 4 - .ent do_switch_stack -do_switch_stack: - ldi $sp, -SWITCH_STACK_SIZE($sp) - flds $f31, 0($sp) /* fillde hint */ - stl $9, SWITCH_STACK_R9($sp) - stl $10, SWITCH_STACK_R10($sp) - stl $11, SWITCH_STACK_R11($sp) - stl $12, SWITCH_STACK_R12($sp) - stl $13, SWITCH_STACK_R13($sp) - stl $14, SWITCH_STACK_R14($sp) - stl $15, SWITCH_STACK_R15($sp) - stl $26, SWITCH_STACK_RA($sp) - // SIMD-FP - ldl $9, TI_TASK($8) - ldi $9, TASK_THREAD($9) - ldi $10, THREAD_CTX_FP($9) - vstd $f0, CTX_FP_F0($10) - vstd $f1, CTX_FP_F1($10) - vstd $f2, CTX_FP_F2($10) - vstd $f3, CTX_FP_F3($10) - vstd $f4, CTX_FP_F4($10) - vstd $f5, CTX_FP_F5($10) - vstd $f6, CTX_FP_F6($10) - vstd $f7, CTX_FP_F7($10) - vstd $f8, CTX_FP_F8($10) - vstd $f9, CTX_FP_F9($10) - vstd $f10, CTX_FP_F10($10) - vstd $f11, CTX_FP_F11($10) - vstd $f12, CTX_FP_F12($10) - vstd $f13, CTX_FP_F13($10) - vstd $f14, CTX_FP_F14($10) - vstd $f15, CTX_FP_F15($10) - vstd $f16, CTX_FP_F16($10) - vstd $f17, CTX_FP_F17($10) - vstd $f18, CTX_FP_F18($10) - vstd $f19, CTX_FP_F19($10) - vstd $f20, CTX_FP_F20($10) - vstd $f21, CTX_FP_F21($10) - vstd $f22, CTX_FP_F22($10) - vstd $f23, CTX_FP_F23($10) - vstd $f24, CTX_FP_F24($10) - vstd $f25, CTX_FP_F25($10) - vstd $f26, CTX_FP_F26($10) - vstd $f27, CTX_FP_F27($10) - rfpcr $f0 - vstd $f28, CTX_FP_F28($10) - vstd $f29, CTX_FP_F29($10) - vstd $f30, CTX_FP_F30($10) - fstd $f0, THREAD_FPCR($9) - vldd $f0, CTX_FP_F0($10) - ldl $9, SWITCH_STACK_R9($sp) - ldl $10, SWITCH_STACK_R10($sp) - ret $31, ($1), 1 - .end do_switch_stack - - .align 4 - .ent undo_switch_stack -undo_switch_stack: -#ifdef CONFIG_SUBARCH_C3B - fillcs 0($sp) /* prefetch */ -#endif - ldl $11, SWITCH_STACK_R11($sp) - ldl $12, SWITCH_STACK_R12($sp) - ldl $13, SWITCH_STACK_R13($sp) - ldl $14, SWITCH_STACK_R14($sp) - ldl $15, SWITCH_STACK_R15($sp) - ldl $26, SWITCH_STACK_RA($sp) - // SIMD-FP - ldl $9, TI_TASK($8) - ldi $9, TASK_THREAD($9) - fldd $f0, THREAD_FPCR($9) - wfpcr $f0 - fimovd $f0, $10 - and $10, 0x3, $10 - beq $10, $setfpec_0 - subl $10, 0x1, $10 - beq $10, $setfpec_1 - subl $10, 0x1, $10 - beq $10, $setfpec_2 - setfpec3 - br $setfpec_over -$setfpec_0: - setfpec0 - br $setfpec_over -$setfpec_1: - setfpec1 - br $setfpec_over -$setfpec_2: - setfpec2 -$setfpec_over: - ldi $10, THREAD_CTX_FP($9) - vldd $f0, CTX_FP_F0($10) - vldd $f1, CTX_FP_F1($10) - vldd $f2, CTX_FP_F2($10) - vldd $f3, CTX_FP_F3($10) - vldd $f4, CTX_FP_F4($10) - vldd $f5, CTX_FP_F5($10) - vldd $f6, CTX_FP_F6($10) - vldd $f7, CTX_FP_F7($10) - vldd $f8, CTX_FP_F8($10) - vldd $f9, CTX_FP_F9($10) - vldd $f10, CTX_FP_F10($10) - vldd $f11, CTX_FP_F11($10) - vldd $f12, CTX_FP_F12($10) - vldd $f13, CTX_FP_F13($10) - vldd $f14, CTX_FP_F14($10) - vldd $f15, CTX_FP_F15($10) - vldd $f16, CTX_FP_F16($10) - vldd $f17, CTX_FP_F17($10) - vldd $f18, CTX_FP_F18($10) - vldd $f19, CTX_FP_F19($10) - vldd $f20, CTX_FP_F20($10) - vldd $f21, CTX_FP_F21($10) - vldd $f22, CTX_FP_F22($10) - vldd $f23, CTX_FP_F23($10) - vldd $f24, CTX_FP_F24($10) - vldd $f25, CTX_FP_F25($10) - vldd $f26, CTX_FP_F26($10) - vldd $f27, CTX_FP_F27($10) - vldd $f28, CTX_FP_F28($10) - vldd $f29, CTX_FP_F29($10) - vldd $f30, CTX_FP_F30($10) - ldl $9, SWITCH_STACK_R9($sp) - ldl $10, SWITCH_STACK_R10($sp) - ldi $sp, SWITCH_STACK_SIZE($sp) - ret $31, ($1), 1 - .end undo_switch_stack - /* * Integer register context switch * The callee-saved registers must be saved and restored. diff --git a/arch/sw_64/kernel/traps.c b/arch/sw_64/kernel/traps.c index 6cd3ade905d1..19fbf50b6ebc 100644 --- a/arch/sw_64/kernel/traps.c +++ b/arch/sw_64/kernel/traps.c @@ -300,9 +300,8 @@ do_entIF(unsigned long inst_type, struct pt_regs *regs) case 3: /* FEN fault */ /* * Irritating users can call HMC_clrfen to disable the - * FPU for the process. The kernel will then trap in - * do_switch_stack and undo_switch_stack when we try - * to save and restore the FP registers. + * FPU for the process. The kernel will then trap to + * save and restore the FP registers. * Given that GCC by default generates code that uses the * FP registers, HMC_clrfen is not useful except for DoS -- Gitee From ef588f6bce82bdeface18c414af7ffe0a97a1773 Mon Sep 17 00:00:00 2001 From: He Sheng Date: Thu, 28 Apr 2022 09:32:24 +0800 Subject: [PATCH 106/674] sw64: access pt_regs with regoffsets where appropriate Sunway inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5GF7A -------------------------------- The struct pt_regs of sw64 is a bit weird because r16~r18, r29 and r30 are saved at odd location by hmcode, which makes it complicated to reference them with regno. To improve maintainability, we map these regno(s) in an unified way. Signed-off-by: He Sheng Signed-off-by: Gu Zitao --- arch/sw_64/include/asm/extable.h | 4 ++++ arch/sw_64/include/asm/ptrace.h | 3 +++ arch/sw_64/kernel/ptrace.c | 27 +++++++++++++-------------- arch/sw_64/kernel/traps.c | 31 +++++++------------------------ arch/sw_64/mm/fault.c | 6 +----- 5 files changed, 28 insertions(+), 43 deletions(-) diff --git a/arch/sw_64/include/asm/extable.h b/arch/sw_64/include/asm/extable.h index 12b50b68a0d2..ae753772a45a 100644 --- a/arch/sw_64/include/asm/extable.h +++ b/arch/sw_64/include/asm/extable.h @@ -52,4 +52,8 @@ struct exception_table_entry { (b)->fixup.unit = (tmp).fixup.unit; \ } while (0) +/* Macro for exception fixup code to access integer registers. */ +extern short regoffsets[]; +#define map_regs(r) (*(unsigned long *)((char *)regs + regoffsets[r])) + #endif diff --git a/arch/sw_64/include/asm/ptrace.h b/arch/sw_64/include/asm/ptrace.h index 74349a05b9e4..0f0de9a3f8ce 100644 --- a/arch/sw_64/include/asm/ptrace.h +++ b/arch/sw_64/include/asm/ptrace.h @@ -28,6 +28,9 @@ #define force_successful_syscall_return() (current_pt_regs()->r0 = 0) #define MAX_REG_OFFSET (offsetof(struct pt_regs, r18)) + +extern short regoffsets[]; + /** * regs_get_register() - get register value from its offset * @regs: pt_regs from which register value is gotten diff --git a/arch/sw_64/kernel/ptrace.c b/arch/sw_64/kernel/ptrace.c index 097590b22a5a..ce41d89a54cb 100644 --- a/arch/sw_64/kernel/ptrace.c +++ b/arch/sw_64/kernel/ptrace.c @@ -9,6 +9,7 @@ #include #include +#include #include "proto.h" @@ -52,23 +53,21 @@ enum { REG_GP = 29 }; -#define PT_REG(reg) \ - (PAGE_SIZE * 2 - sizeof(struct pt_regs) + offsetof(struct pt_regs, reg)) - #define FP_REG(fp_regno, vector_regno) \ (fp_regno * 32 + vector_regno * 8) -static int regoff[] = { - PT_REG(r0), PT_REG(r1), PT_REG(r2), PT_REG(r3), - PT_REG(r4), PT_REG(r5), PT_REG(r6), PT_REG(r7), - PT_REG(r8), PT_REG(r9), PT_REG(r10), PT_REG(r11), - PT_REG(r12), PT_REG(r13), PT_REG(r14), PT_REG(r15), - PT_REG(r16), PT_REG(r17), PT_REG(r18), PT_REG(r19), - PT_REG(r20), PT_REG(r21), PT_REG(r22), PT_REG(r23), - PT_REG(r24), PT_REG(r25), PT_REG(r26), PT_REG(r27), - PT_REG(r28), PT_REG(gp), -1, -1 +#define R(x) ((size_t) &((struct pt_regs *)0)->x) + +short regoffsets[32] = { + R(r0), R(r1), R(r2), R(r3), R(r4), R(r5), R(r6), R(r7), R(r8), + R(r9), R(r10), R(r11), R(r12), R(r13), R(r14), R(r15), + R(r16), R(r17), R(r18), + R(r19), R(r20), R(r21), R(r22), R(r23), R(r24), R(r25), R(r26), + R(r27), R(r28), R(gp), 0, 0 }; +#undef R + #define PCB_OFF(var) offsetof(struct pcb_struct, var) static int pcboff[] = { @@ -104,7 +103,7 @@ get_reg_addr(struct task_struct *task, unsigned long regno) addr = (void *)task_thread_info(task) + pcboff[regno]; break; case REG_BASE ... REG_END: - addr = (void *)task_thread_info(task) + regoff[regno]; + addr = (void *)task_pt_regs(task) + regoffsets[regno]; break; case FPREG_BASE ... FPREG_END: fp_regno = regno - FPREG_BASE; @@ -128,7 +127,7 @@ get_reg_addr(struct task_struct *task, unsigned long regno) addr = (void *)&task->thread.fpcr; break; case PC: - addr = (void *)task_thread_info(task) + PT_REG(pc); + addr = (void *)task_pt_regs(task) + PT_REGS_PC; break; default: addr = &zero; diff --git a/arch/sw_64/kernel/traps.c b/arch/sw_64/kernel/traps.c index 19fbf50b6ebc..a61c851967a9 100644 --- a/arch/sw_64/kernel/traps.c +++ b/arch/sw_64/kernel/traps.c @@ -325,23 +325,6 @@ struct unaligned_stat { } unaligned[2]; -/* Macro for exception fixup code to access integer registers. */ -#define R(x) ((size_t) &((struct pt_regs *)0)->x) - -static int regoffsets[32] = { - R(r0), R(r1), R(r2), R(r3), R(r4), R(r5), R(r6), R(r7), R(r8), - R(r9), R(r10), R(r11), R(r12), R(r13), R(r14), R(r15), - R(r16), R(r17), R(r18), - R(r19), R(r20), R(r21), R(r22), R(r23), R(r24), R(r25), R(r26), - R(r27), R(r28), R(gp), - 0, 0 -}; - -#undef R - -#define una_reg(r) (*(unsigned long *)((char *)regs + regoffsets[r])) - - asmlinkage void do_entUna(void *va, unsigned long opcode, unsigned long reg, struct pt_regs *regs) @@ -380,7 +363,7 @@ do_entUna(void *va, unsigned long opcode, unsigned long reg, if (error) goto got_exception; - una_reg(reg) = tmp1 | tmp2; + map_regs(reg) = tmp1 | tmp2; return; case 0x22: @@ -401,7 +384,7 @@ do_entUna(void *va, unsigned long opcode, unsigned long reg, if (error) goto got_exception; - una_reg(reg) = (int)(tmp1 | tmp2); + map_regs(reg) = (int)(tmp1 | tmp2); return; case 0x23: /* ldl */ @@ -422,7 +405,7 @@ do_entUna(void *va, unsigned long opcode, unsigned long reg, if (error) goto got_exception; - una_reg(reg) = tmp1 | tmp2; + map_regs(reg) = tmp1 | tmp2; return; case 0x29: /* sth */ @@ -440,7 +423,7 @@ do_entUna(void *va, unsigned long opcode, unsigned long reg, ".previous" : "=r"(error), "=&r"(tmp1), "=&r"(tmp2), "=&r"(tmp3), "=&r"(tmp4) - : "r"(va), "r"(una_reg(reg)), "0"(0)); + : "r"(va), "r"(map_regs(reg)), "0"(0)); if (error) goto got_exception; @@ -472,7 +455,7 @@ do_entUna(void *va, unsigned long opcode, unsigned long reg, ".previous" : "=r"(error), "=&r"(tmp1), "=&r"(tmp2), "=&r"(tmp3), "=&r"(tmp4) - : "r"(va), "r"(una_reg(reg)), "0"(0)); + : "r"(va), "r"(map_regs(reg)), "0"(0)); if (error) goto got_exception; @@ -524,7 +507,7 @@ do_entUna(void *va, unsigned long opcode, unsigned long reg, ".previous" : "=r"(error), "=&r"(tmp1), "=&r"(tmp2), "=&r"(tmp3), "=&r"(tmp4), "=&r"(tmp5), "=&r"(tmp6), "=&r"(tmp7), "=&r"(tmp8) - : "r"(va), "r"(una_reg(reg)), "0"(0)); + : "r"(va), "r"(map_regs(reg)), "0"(0)); if (error) goto got_exception; @@ -543,7 +526,7 @@ do_entUna(void *va, unsigned long opcode, unsigned long reg, if (fixup != 0) { unsigned long newpc; - newpc = fixup_exception(una_reg, fixup, pc); + newpc = fixup_exception(map_regs, fixup, pc); printk("Forwarding unaligned exception at %lx (%lx)\n", pc, newpc); diff --git a/arch/sw_64/mm/fault.c b/arch/sw_64/mm/fault.c index 9562bc3ba37f..9b0ad9eb5a3a 100644 --- a/arch/sw_64/mm/fault.c +++ b/arch/sw_64/mm/fault.c @@ -107,10 +107,6 @@ __load_new_mm_context(struct mm_struct *next_mm) * modify them. */ -/* Macro for exception fixup code to access integer registers. */ -#define dpf_reg(r) \ - (((unsigned long *)regs)[(r) <= 8 ? (r) : (r) <= 15 ? (r)-16 : \ - (r) <= 18 ? (r)+10 : (r)-10]) unsigned long show_va_to_pa(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd = NULL; @@ -294,7 +290,7 @@ do_page_fault(unsigned long address, unsigned long mmcsr, if (fixup != 0) { unsigned long newpc; - newpc = fixup_exception(dpf_reg, fixup, regs->pc); + newpc = fixup_exception(map_regs, fixup, regs->pc); regs->pc = newpc; return; } -- Gitee From ca223ff216cdab7bb985b24221f92e1495ce4bc7 Mon Sep 17 00:00:00 2001 From: He Sheng Date: Sat, 14 May 2022 11:07:13 +0800 Subject: [PATCH 107/674] sw64: move struct pt_regs to kapi ptrace.h Sunway inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5GF7A -------------------------------- This struct should be invisible to user space, and we will define new regs struct for user in uapi later. Signed-off-by: He Sheng Signed-off-by: Gu Zitao --- arch/sw_64/include/asm/ptrace.h | 47 +++++++++++++++++++++++++ arch/sw_64/include/uapi/asm/ptrace.h | 52 ---------------------------- 2 files changed, 47 insertions(+), 52 deletions(-) diff --git a/arch/sw_64/include/asm/ptrace.h b/arch/sw_64/include/asm/ptrace.h index 0f0de9a3f8ce..85f7a47fdee3 100644 --- a/arch/sw_64/include/asm/ptrace.h +++ b/arch/sw_64/include/asm/ptrace.h @@ -9,6 +9,53 @@ #include #include +/* + * This struct defines the way the registers are stored on the + * kernel stack during a system call or other kernel entry + */ + +struct pt_regs { + unsigned long r0; + unsigned long r1; + unsigned long r2; + unsigned long r3; + unsigned long r4; + unsigned long r5; + unsigned long r6; + unsigned long r7; + unsigned long r8; + unsigned long r9; + unsigned long r10; + unsigned long r11; + unsigned long r12; + unsigned long r13; + unsigned long r14; + unsigned long r15; + /* r16 ~ r18 saved by hmcode */ + unsigned long r19; + unsigned long r20; + unsigned long r21; + unsigned long r22; + unsigned long r23; + unsigned long r24; + unsigned long r25; + unsigned long r26; + unsigned long r27; + unsigned long r28; + unsigned long hae; +/* JRP - These are the values provided to a0-a2 by HMcode */ + unsigned long trap_a0; + unsigned long trap_a1; + unsigned long trap_a2; +/* These are saved by HMcode: */ + unsigned long ps; + unsigned long pc; + unsigned long gp; + unsigned long r16; + unsigned long r17; + unsigned long r18; +}; + #define arch_has_single_step() (1) #define user_mode(regs) (((regs)->ps & 8) != 0) #define instruction_pointer(regs) ((regs)->pc) diff --git a/arch/sw_64/include/uapi/asm/ptrace.h b/arch/sw_64/include/uapi/asm/ptrace.h index deb7e4096c93..ac7ce6b6510b 100644 --- a/arch/sw_64/include/uapi/asm/ptrace.h +++ b/arch/sw_64/include/uapi/asm/ptrace.h @@ -2,58 +2,6 @@ #ifndef _UAPI_ASM_SW64_PTRACE_H #define _UAPI_ASM_SW64_PTRACE_H - -/* - * This struct defines the way the registers are stored on the - * kernel stack during a system call or other kernel entry - * - * NOTE! I want to minimize the overhead of system calls, so this - * struct has as little information as possible. I does not have - * floating point regs, because the kernel doesn't change those - */ - -struct pt_regs { - unsigned long r0; - unsigned long r1; - unsigned long r2; - unsigned long r3; - unsigned long r4; - unsigned long r5; - unsigned long r6; - unsigned long r7; - unsigned long r8; - unsigned long r9; - unsigned long r10; - unsigned long r11; - unsigned long r12; - unsigned long r13; - unsigned long r14; - unsigned long r15; - /* r16 ~ r18 saved by hmcode */ - unsigned long r19; - unsigned long r20; - unsigned long r21; - unsigned long r22; - unsigned long r23; - unsigned long r24; - unsigned long r25; - unsigned long r26; - unsigned long r27; - unsigned long r28; - unsigned long hae; -/* JRP - These are the values provided to a0-a2 by HMcode */ - unsigned long trap_a0; - unsigned long trap_a1; - unsigned long trap_a2; -/* These are saved by HMcode: */ - unsigned long ps; - unsigned long pc; - unsigned long gp; - unsigned long r16; - unsigned long r17; - unsigned long r18; -}; - #define PTRACE_GETREGS 12 /* get general purpose registers */ #define PTRACE_SETREGS 13 /* set general purpose registers */ #define PTRACE_GETFPREGS 14 /* get floating-point registers */ -- Gitee From 2ac1d37a4635c4fd4fb9a98889bbeb7bfe9aef87 Mon Sep 17 00:00:00 2001 From: He Sheng Date: Mon, 16 May 2022 10:31:57 +0800 Subject: [PATCH 108/674] sw64: avoid copying thread_struct twice Sunway inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5GF7A -------------------------------- The thread_struct has been copied in dup_task_struct() by generic arch_dup_task_struct(), and it's unnecessary to be copied again in copy_thread(). This patch adds arch-specific implementation of arch_dup_task_struct() to reduce data copying. It does not have to save fpu state for a kernel thread, so call fpstate_save() instead of __fpstate_save() here to save fpu state on demand. Signed-off-by: He Sheng Signed-off-by: Gu Zitao --- arch/sw_64/kernel/process.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/sw_64/kernel/process.c b/arch/sw_64/kernel/process.c index 1f093c5a5235..6b5f0f3a8345 100644 --- a/arch/sw_64/kernel/process.c +++ b/arch/sw_64/kernel/process.c @@ -144,6 +144,13 @@ release_thread(struct task_struct *dead_task) { } +int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) +{ + fpstate_save(src); + *dst = *src; + return 0; +} + /* * Copy architecture-specific thread state */ @@ -162,8 +169,6 @@ copy_thread(unsigned long clone_flags, unsigned long usp, childti->pcb.ksp = (unsigned long) childregs; childti->pcb.flags = 7; /* set FEN, clear everything else */ - __fpstate_save(current); - p->thread = current->thread; if (unlikely(p->flags & PF_KTHREAD)) { /* kernel thread */ -- Gitee From cb3b05c7af85291bc8fa870f542a6176ee6b126a Mon Sep 17 00:00:00 2001 From: He Chuyue Date: Mon, 16 May 2022 16:34:52 +0800 Subject: [PATCH 109/674] sw64: simplify __phys_addr and __virt_addr_valid Sunway inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I56OLG -------------------------------- In these methods, the local variable `y` is redundant and can be removed. Signed-off-by: He Chuyue Signed-off-by: Gu Zitao --- arch/sw_64/mm/physaddr.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/arch/sw_64/mm/physaddr.c b/arch/sw_64/mm/physaddr.c index fbb489ae4db5..26769f0bf7bf 100644 --- a/arch/sw_64/mm/physaddr.c +++ b/arch/sw_64/mm/physaddr.c @@ -5,34 +5,30 @@ unsigned long __phys_addr(unsigned long x) { - unsigned long y = x; - - if (y >= __START_KERNEL_map) { - y -= __START_KERNEL_map; - VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE); + if (x >= __START_KERNEL_map) { + x -= __START_KERNEL_map; + VIRTUAL_BUG_ON(x >= KERNEL_IMAGE_SIZE); } else { - VIRTUAL_BUG_ON(y < PAGE_OFFSET); - y -= PAGE_OFFSET; - VIRTUAL_BUG_ON(!phys_addr_valid(y)); + VIRTUAL_BUG_ON(x < PAGE_OFFSET); + x -= PAGE_OFFSET; + VIRTUAL_BUG_ON(!phys_addr_valid(x)); } - return y; + return x; } EXPORT_SYMBOL(__phys_addr); bool __virt_addr_valid(unsigned long x) { - unsigned long y = x; - - if (y >= __START_KERNEL_map) { - y -= __START_KERNEL_map; - if (y >= KERNEL_IMAGE_SIZE) + if (x >= __START_KERNEL_map) { + x -= __START_KERNEL_map; + if (x >= KERNEL_IMAGE_SIZE) return false; } else { - if (y < PAGE_OFFSET) + if (x < PAGE_OFFSET) return false; - y -= PAGE_OFFSET; + x -= PAGE_OFFSET; } - return pfn_valid(y >> PAGE_SHIFT); + return pfn_valid(x >> PAGE_SHIFT); } EXPORT_SYMBOL(__virt_addr_valid); -- Gitee From 5c841330a078634752deca6aaf3fd749f092a0eb Mon Sep 17 00:00:00 2001 From: He Sheng Date: Tue, 17 May 2022 15:05:15 +0800 Subject: [PATCH 110/674] sw64: simplify pgtable helpers Sunway inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I56OLG -------------------------------- This patch: - Remove unused macros. - Rename some variables and macros to make them shorter. - Add pfn_to_virt and virt_to_pfn to make code cleaner. - Map vaddr by __va(). - Redefine mk_pte with pfn_to_pte. - Fix some wrong usages, like pgd_offset Signed-off-by: He Sheng Signed-off-by: Gu Zitao --- arch/sw_64/include/asm/mmu_context.h | 8 +-- arch/sw_64/include/asm/mmzone.h | 23 ------- arch/sw_64/include/asm/page.h | 3 + arch/sw_64/include/asm/pgalloc.h | 2 +- arch/sw_64/include/asm/pgtable.h | 91 +++++++++------------------- arch/sw_64/mm/fault.c | 6 +- arch/sw_64/mm/init.c | 2 +- 7 files changed, 40 insertions(+), 95 deletions(-) diff --git a/arch/sw_64/include/asm/mmu_context.h b/arch/sw_64/include/asm/mmu_context.h index e3d7ae7c873e..d6cd01d55712 100644 --- a/arch/sw_64/include/asm/mmu_context.h +++ b/arch/sw_64/include/asm/mmu_context.h @@ -131,7 +131,7 @@ switch_mm(struct mm_struct *prev_mm, struct mm_struct *next_mm, * Always update the PCB PTBR. If next is kernel thread, it must * update PTBR. If next is user process, it's ok to update PTBR. */ - task_thread_info(next)->pcb.ptbr = (__pa(next_mm->pgd)) >> PAGE_SHIFT; + task_thread_info(next)->pcb.ptbr = virt_to_pfn(next_mm->pgd); load_asn_ptbr(task_thread_info(next)->pcb.asn, task_thread_info(next)->pcb.ptbr); } @@ -170,8 +170,7 @@ static inline int init_new_context(struct task_struct *tsk, for_each_possible_cpu(i) mm->context.asid[i] = 0; if (tsk != current) - task_thread_info(tsk)->pcb.ptbr - = (__pa(mm->pgd)) >> PAGE_SHIFT; + task_thread_info(tsk)->pcb.ptbr = virt_to_pfn(mm->pgd); return 0; } @@ -183,8 +182,7 @@ static inline void destroy_context(struct mm_struct *mm) static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { - task_thread_info(tsk)->pcb.ptbr - = (__pa(mm->pgd)) >> PAGE_SHIFT; + task_thread_info(tsk)->pcb.ptbr = virt_to_pfn(mm->pgd); } static inline int arch_dup_mmap(struct mm_struct *oldmm, diff --git a/arch/sw_64/include/asm/mmzone.h b/arch/sw_64/include/asm/mmzone.h index 924e33f6d326..4219015c5cfc 100644 --- a/arch/sw_64/include/asm/mmzone.h +++ b/arch/sw_64/include/asm/mmzone.h @@ -18,29 +18,6 @@ extern pg_data_t *node_data[]; extern int pa_to_nid(unsigned long pa); extern int pfn_valid(unsigned long pfn); -#define mk_pte(page, pgprot) \ -({ \ - pte_t pte; \ - unsigned long pfn; \ - \ - pfn = page_to_pfn(page) << _PTE_FLAGS_BITS; \ - pte_val(pte) = pfn | pgprot_val(pgprot); \ - \ - pte; \ -}) - -#define pte_page(x) \ -({ \ - unsigned long kvirt; \ - struct page *__xx; \ - \ - kvirt = (unsigned long)__va(pte_val(x) >> (_PTE_FLAGS_BITS-PAGE_SHIFT));\ - __xx = virt_to_page(kvirt); \ - \ - __xx; \ -}) - -#define page_to_pa(page) (page_to_pfn(page) << PAGE_SHIFT) #define pfn_to_nid(pfn) pa_to_nid(((u64)(pfn) << PAGE_SHIFT)) #endif /* CONFIG_DISCONTIGMEM */ diff --git a/arch/sw_64/include/asm/page.h b/arch/sw_64/include/asm/page.h index 363785c9f082..dc6a89c37231 100644 --- a/arch/sw_64/include/asm/page.h +++ b/arch/sw_64/include/asm/page.h @@ -50,6 +50,9 @@ extern unsigned long __phys_addr(unsigned long); #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) #define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) +#define virt_to_pfn(vaddr) (PHYS_PFN(__pa(vaddr))) +#define pfn_to_virt(pfn) (__va(PFN_PHYS(pfn))) + #ifdef CONFIG_FLATMEM #define pfn_valid(pfn) ((pfn) < max_mapnr) #endif /* CONFIG_FLATMEM */ diff --git a/arch/sw_64/include/asm/pgalloc.h b/arch/sw_64/include/asm/pgalloc.h index 3cfdcbef7ef8..9572b4709ff4 100644 --- a/arch/sw_64/include/asm/pgalloc.h +++ b/arch/sw_64/include/asm/pgalloc.h @@ -15,7 +15,7 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, pgtable_t pte) { - pmd_set(pmd, (pte_t *)(page_to_pa(pte) + PAGE_OFFSET)); + pmd_set(pmd, (pte_t *)__va(page_to_pa(pte))); } #define pmd_pgtable(pmd) pmd_page(pmd) diff --git a/arch/sw_64/include/asm/pgtable.h b/arch/sw_64/include/asm/pgtable.h index 590f15508e28..67637b4ddf1d 100644 --- a/arch/sw_64/include/asm/pgtable.h +++ b/arch/sw_64/include/asm/pgtable.h @@ -119,9 +119,8 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, #define __ACCESS_BITS (_PAGE_ACCESSED | _PAGE_KRE | _PAGE_URE) -#define _PFN_MASK 0xFFFFFFFFF0000000UL -#define _PFN_BITS 36 -#define _PTE_FLAGS_BITS (64 - _PFN_BITS) +#define _PFN_SHIFT 28 +#define _PFN_MASK ((-1UL) << _PFN_SHIFT) #define _PAGE_TABLE (_PAGE_VALID | __DIRTY_BITS | __ACCESS_BITS) #define _PAGE_CHG_MASK (_PFN_MASK | __DIRTY_BITS | __ACCESS_BITS | _PAGE_SPECIAL) @@ -181,53 +180,19 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, extern struct page *empty_zero_page; #define ZERO_PAGE(vaddr) (empty_zero_page) -/* number of bits that fit into a memory pointer */ -#define BITS_PER_PTR (8 * sizeof(unsigned long)) - -/* to align the pointer to a pointer address */ -#define PTR_MASK (~(sizeof(void *) - 1)) - -/* sizeof(void*)==1<> (PAGE_SHIFT - SIZEOF_PTR_LOG2) & PTR_MASK & ~PAGE_MASK) - -#define PHYS_TWIDDLE(pfn) (pfn) - -/* - * Conversion functions: convert a page and protection to a page entry, - * and a page entry and page directory to the page they refer to. - */ -#define page_to_pa(page) (page_to_pfn(page) << PAGE_SHIFT) - -#define pmd_pfn(pmd) (pmd_val(pmd) >> _PTE_FLAGS_BITS) -#define pte_pfn(pte) (pte_val(pte) >> _PTE_FLAGS_BITS) -#ifndef CONFIG_DISCONTIGMEM -#define pte_page(pte) pfn_to_page(pte_pfn(pte)) -#define mk_pte(page, pgprot) \ -({ \ - pte_t pte; \ - \ - pte_val(pte) = (page_to_pfn(page) << _PTE_FLAGS_BITS) | pgprot_val(pgprot); \ - pte; \ -}) -#endif - -static inline pte_t pfn_pte(unsigned long physpfn, pgprot_t pgprot) +static inline pte_t pfn_pte(unsigned long pfn, pgprot_t prot) { pte_t pte; - pte_val(pte) = (PHYS_TWIDDLE(physpfn) << _PTE_FLAGS_BITS) | pgprot_val(pgprot); + pte_val(pte) = (pfn << _PFN_SHIFT) | pgprot_val(prot); return pte; } -static inline pmd_t pfn_pmd(unsigned long physpfn, pgprot_t pgprot) +static inline pmd_t pfn_pmd(unsigned long pfn, pgprot_t prot) { pmd_t pmd; - pmd_val(pmd) = (PHYS_TWIDDLE(physpfn) << _PTE_FLAGS_BITS) | pgprot_val(pgprot); + pmd_val(pmd) = (pfn << _PFN_SHIFT) | pgprot_val(prot); return pmd; } @@ -245,37 +210,48 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) static inline void pmd_set(pmd_t *pmdp, pte_t *ptep) { - pmd_val(*pmdp) = _PAGE_TABLE | (__pa(ptep) << (_PTE_FLAGS_BITS - PAGE_SHIFT)); + pmd_val(*pmdp) = _PAGE_TABLE | (virt_to_pfn(ptep) << _PFN_SHIFT); } static inline void pud_set(pud_t *pudp, pmd_t *pmdp) { - pud_val(*pudp) = _PAGE_TABLE | (__pa(pmdp) << (_PTE_FLAGS_BITS - PAGE_SHIFT)); + pud_val(*pudp) = _PAGE_TABLE | (virt_to_pfn(pmdp) << _PFN_SHIFT); } static inline void p4d_set(p4d_t *p4dp, pud_t *pudp) { - p4d_val(*p4dp) = _PAGE_TABLE | (__pa(pudp) << (_PTE_FLAGS_BITS - PAGE_SHIFT)); + p4d_val(*p4dp) = _PAGE_TABLE | (virt_to_pfn(pudp) << _PFN_SHIFT); } -static inline unsigned long -pmd_page_vaddr(pmd_t pmd) +static inline unsigned long pmd_page_vaddr(pmd_t pmd) { - return ((pmd_val(pmd) & _PFN_MASK) >> (_PTE_FLAGS_BITS-PAGE_SHIFT)) + PAGE_OFFSET; + return (unsigned long)pfn_to_virt(pmd_val(pmd) >> _PFN_SHIFT); } -#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> _PTE_FLAGS_BITS)) -#define pud_page(pud) (pfn_to_page(pud_val(pud) >> _PTE_FLAGS_BITS)) -#define p4d_page(p4d) (pfn_to_page(p4d_val(p4d) >> _PTE_FLAGS_BITS)) +/* + * Conversion functions: convert a page and protection to a page entry, + * and a page entry and page directory to the page they refer to. + */ +#define page_to_pa(page) (page_to_pfn(page) << PAGE_SHIFT) + +#define pmd_pfn(pmd) (pmd_val(pmd) >> _PFN_SHIFT) +#define pte_pfn(pte) (pte_val(pte) >> _PFN_SHIFT) + +#define pte_page(pte) pfn_to_page(pte_pfn(pte)) +#define mk_pte(page, prot) pfn_pte(page_to_pfn(page), prot) + +#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> _PFN_SHIFT)) +#define pud_page(pud) (pfn_to_page(pud_val(pud) >> _PFN_SHIFT)) +#define p4d_page(p4d) (pfn_to_page(p4d_val(p4d) >> _PFN_SHIFT)) static inline pud_t *p4d_pgtable(p4d_t p4d) { - return (pud_t *)(PAGE_OFFSET + ((p4d_val(p4d) & _PFN_MASK) >> (_PTE_FLAGS_BITS-PAGE_SHIFT))); + return (pud_t *)pfn_to_virt(p4d_val(p4d) >> _PFN_SHIFT); } static inline pmd_t *pud_pgtable(pud_t pud) { - return (pmd_t *)(PAGE_OFFSET + ((pud_val(pud) & _PFN_MASK) >> (_PTE_FLAGS_BITS-PAGE_SHIFT))); + return (pmd_t *)pfn_to_virt(pud_val(pud) >> _PFN_SHIFT); } static inline int pte_none(pte_t pte) @@ -566,7 +542,7 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, set_bit(_PAGE_BIT_FOW, (unsigned long *)pmdp); } -#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot)) +#define mk_pmd(page, prot) pfn_pmd(page_to_pfn(page), (prot)) #define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS extern int pmdp_set_access_flags(struct vm_area_struct *vma, @@ -586,15 +562,6 @@ extern int pmdp_clear_flush_young(struct vm_area_struct *vma, extern void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp); -#define PAGE_DIR_OFFSET(tsk, address) pgd_offset((tsk), (address)) - -/* to find an entry in a kernel page-table-directory */ -#define pgd_offset_k(address) pgd_offset(&init_mm, (address)) - -/* to find an entry in a page-table-directory. */ -#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1)) -#define pgd_offset(mm, address) ((mm)->pgd+pgd_index(address)) - extern pgd_t swapper_pg_dir[1024]; /* diff --git a/arch/sw_64/mm/fault.c b/arch/sw_64/mm/fault.c index 9b0ad9eb5a3a..d596fc50772d 100644 --- a/arch/sw_64/mm/fault.c +++ b/arch/sw_64/mm/fault.c @@ -80,7 +80,7 @@ __load_new_mm_context(struct mm_struct *next_mm) pcb = ¤t_thread_info()->pcb; pcb->asn = mmc & HARDWARE_ASN_MASK; - pcb->ptbr = ((unsigned long) next_mm->pgd - PAGE_OFFSET) >> PAGE_SHIFT; + pcb->ptbr = virt_to_pfn(next_mm->pgd); __reload_thread(pcb); } @@ -122,7 +122,7 @@ unsigned long show_va_to_pa(struct mm_struct *mm, unsigned long addr) pr_debug("addr = %#lx, pgd = %#lx\n", addr, pgd_val(*pgd)); goto out; } - p4d = pgd_offset(pgd, addr); + p4d = p4d_offset(pgd, addr); if (p4d_none(*p4d)) { ret = 0; pr_debug("addr = %#lx, pgd = %#lx, p4d = %#lx\n", @@ -146,7 +146,7 @@ unsigned long show_va_to_pa(struct mm_struct *mm, unsigned long addr) } pte = pte_offset_map(pmd, addr); if (pte_present(*pte)) { - ret = ((unsigned long)__va(((pte_val(*pte) >> 32)) << PAGE_SHIFT)); + ret = (unsigned long)pfn_to_virt(pte_val(*pte) >> _PFN_SHIFT); pr_debug("addr = %#lx, pgd = %#lx, pud = %#lx, pmd = %#lx, pte = %#lx, ret = %#lx\n", addr, *(unsigned long *)pgd, *(unsigned long *)pud, *(unsigned long *)pmd, *(unsigned long *)pte, ret); diff --git a/arch/sw_64/mm/init.c b/arch/sw_64/mm/init.c index 3593e4b13319..cd3c0551b8f8 100644 --- a/arch/sw_64/mm/init.c +++ b/arch/sw_64/mm/init.c @@ -87,7 +87,7 @@ switch_to_system_map(void) * the last slot of the L1 page table. */ memset(swapper_pg_dir, 0, PAGE_SIZE); - newptbr = __pa(swapper_pg_dir) >> PAGE_SHIFT; + newptbr = virt_to_pfn(swapper_pg_dir); /* Also set up the real kernel PCB while we're at it. */ init_thread_info.pcb.ptbr = newptbr; -- Gitee From 9e08dc63c00d48adde62976841bcacf25409b88b Mon Sep 17 00:00:00 2001 From: He Chuyue Date: Tue, 17 May 2022 09:43:30 +0800 Subject: [PATCH 111/674] sw64: check integrity for dtb passed by BIOS Sunway inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I56OLG -------------------------------- If kernel image is so large that it corrupts dtb, system will break down early to give a message. Signed-off-by: He Chuyue Signed-off-by: Gu Zitao --- arch/sw_64/kernel/setup.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/sw_64/kernel/setup.c b/arch/sw_64/kernel/setup.c index ca19445ac883..faae7cb3c5a1 100644 --- a/arch/sw_64/kernel/setup.c +++ b/arch/sw_64/kernel/setup.c @@ -565,8 +565,14 @@ static void __init setup_machine_fdt(void) /* Give a chance to select kernel builtin DTB firstly */ if (IS_ENABLED(CONFIG_SW64_BUILTIN_DTB)) dt_virt = (void *)__dtb_start; - else + else { dt_virt = (void *)sunway_boot_params->dtb_start; + if (dt_virt < (void *)__bss_stop) { + pr_emerg("BUG: DTB has been corrupted by kernel image!\n"); + while (true) + cpu_relax(); + } + } phys_addr = __phys_addr((unsigned long)dt_virt); if (!phys_addr_valid(phys_addr) || -- Gitee From 8d384996f080b31d8e7c0f3337952bf32921aa3a Mon Sep 17 00:00:00 2001 From: He Sheng Date: Tue, 17 May 2022 17:32:12 +0800 Subject: [PATCH 112/674] sw64: remove discontiguous memory support Sunway inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I56OLG -------------------------------- Since sparse memory support works fine so far, discontiguous memory support can be removed to avoid confusion. Signed-off-by: He Sheng Signed-off-by: Gu Zitao --- arch/sw_64/Kconfig | 9 -------- arch/sw_64/include/asm/mmzone.h | 7 ------- arch/sw_64/include/asm/pgtable.h | 7 ------- arch/sw_64/kernel/Makefile | 3 +-- arch/sw_64/kernel/core.c | 35 -------------------------------- arch/sw_64/mm/init.c | 12 ----------- 6 files changed, 1 insertion(+), 72 deletions(-) delete mode 100644 arch/sw_64/kernel/core.c diff --git a/arch/sw_64/Kconfig b/arch/sw_64/Kconfig index 8c5e87cb9b84..2d7721acb529 100644 --- a/arch/sw_64/Kconfig +++ b/arch/sw_64/Kconfig @@ -649,15 +649,6 @@ config ARCH_SPARSEMEM_ENABLE depends on SMP select SPARSEMEM_VMEMMAP_ENABLE -config ARCH_DISCONTIGMEM_ENABLE - bool "Discontiguous Memory Support" - depends on SMP - help - Say Y to support efficient handling of discontiguous physical memory, - for architectures which are either NUMA (Non-Uniform Memory Access) - or have huge holes in the physical address space for other reasons. - See for more. - config NUMA bool "NUMA Support" depends on SMP && !FLATMEM diff --git a/arch/sw_64/include/asm/mmzone.h b/arch/sw_64/include/asm/mmzone.h index 4219015c5cfc..3849d26e3890 100644 --- a/arch/sw_64/include/asm/mmzone.h +++ b/arch/sw_64/include/asm/mmzone.h @@ -14,11 +14,4 @@ extern pg_data_t *node_data[]; #define NODE_DATA(nid) (node_data[(nid)]) #endif -#ifdef CONFIG_DISCONTIGMEM -extern int pa_to_nid(unsigned long pa); -extern int pfn_valid(unsigned long pfn); - -#define pfn_to_nid(pfn) pa_to_nid(((u64)(pfn) << PAGE_SHIFT)) -#endif /* CONFIG_DISCONTIGMEM */ - #endif /* _ASM_SW64_MMZONE_H */ diff --git a/arch/sw_64/include/asm/pgtable.h b/arch/sw_64/include/asm/pgtable.h index 67637b4ddf1d..76c782baf242 100644 --- a/arch/sw_64/include/asm/pgtable.h +++ b/arch/sw_64/include/asm/pgtable.h @@ -596,14 +596,7 @@ extern pgd_t swapper_pg_dir[1024]; #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#if defined(CONFIG_FLATMEM) #define kern_addr_valid(addr) (1) -#elif defined(CONFIG_DISCONTIGMEM) -/* XXX: FIXME -- wli */ -#define kern_addr_valid(kaddr) (0) -#elif defined(CONFIG_SPARSEMEM) -#define kern_addr_valid(addr) (1) -#endif #define pte_ERROR(e) \ pr_err("%s: %d: bad pte %016lx.\n", __FILE__, __LINE__, pte_val(e)) diff --git a/arch/sw_64/kernel/Makefile b/arch/sw_64/kernel/Makefile index 293b360626f7..c1e461e0ac56 100644 --- a/arch/sw_64/kernel/Makefile +++ b/arch/sw_64/kernel/Makefile @@ -15,7 +15,7 @@ endif obj-y := entry.o fpu.o traps.o process.o sys_sw64.o irq.o \ irq_sw64.o signal.o setup.o ptrace.o time.o \ - systbls.o dup_print.o tc.o \ + systbls.o dup_print.o tc.o timer.o \ insn.o early_init.o topology.o cacheinfo.o \ vdso.o vdso/ @@ -43,7 +43,6 @@ obj-y += kvm_cma.o endif # Core logic support -obj-$(CONFIG_SW64) += core.o timer.o obj-$(CONFIG_SW64_CPUFREQ) += platform.o clock.o obj-$(CONFIG_SW64_CPUAUTOPLUG) += cpuautoplug.o diff --git a/arch/sw_64/kernel/core.c b/arch/sw_64/kernel/core.c deleted file mode 100644 index e26b3a5faab2..000000000000 --- a/arch/sw_64/kernel/core.c +++ /dev/null @@ -1,35 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include - -#ifdef CONFIG_DISCONTIGMEM -#ifdef CONFIG_NUMA -int pa_to_nid(unsigned long pa) -{ - int i = 0; - phys_addr_t pfn_base, pfn_size, pfn; - - pfn = pa >> PAGE_SHIFT; - for (i = 0; i < MAX_NUMNODES; i++) { - if (!NODE_DATA(i)) - continue; - - pfn_base = NODE_DATA(i)->node_start_pfn; - pfn_size = NODE_DATA(i)->node_spanned_pages; - - if (pfn >= pfn_base && pfn < pfn_base + pfn_size) - return i; - } - - pr_err("%s: pa %#lx does not belong to any node, return node 0\n", __func__, pa); - return 0; -} -EXPORT_SYMBOL(pa_to_nid); -#else /* !CONFIG_NUMA */ -int pa_to_nid(unsigned long pa) -{ - return 0; -} -EXPORT_SYMBOL(pa_to_nid); -#endif /* CONFIG_NUMA */ -#endif /* CONFIG_DISCONTIGMEM */ diff --git a/arch/sw_64/mm/init.c b/arch/sw_64/mm/init.c index cd3c0551b8f8..16d3da7beebe 100644 --- a/arch/sw_64/mm/init.c +++ b/arch/sw_64/mm/init.c @@ -244,18 +244,6 @@ void vmemmap_free(unsigned long start, unsigned long end, } #endif -#ifdef CONFIG_DISCONTIGMEM -int pfn_valid(unsigned long pfn) -{ - phys_addr_t addr = pfn << PAGE_SHIFT; - - if ((addr >> PAGE_SHIFT) != pfn) - return 0; - return memblock_is_map_memory(addr); -} -EXPORT_SYMBOL(pfn_valid); -#endif - #ifdef CONFIG_HAVE_MEMBLOCK #ifndef MIN_MEMBLOCK_ADDR #define MIN_MEMBLOCK_ADDR __pa(PAGE_OFFSET) -- Gitee From d351c1bdf7563b31109bf0a01feb015233a01f2d Mon Sep 17 00:00:00 2001 From: Zhu Donghong Date: Wed, 18 May 2022 15:23:55 +0800 Subject: [PATCH 113/674] irqchip: add sw64 chip3 builtin LPC interrupt controller driver Sunway inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5G7AK -------------------------------- This adds an irqchip driver for the secondary interrupt controllers based on sw64 chip3 cpu builtin LPC. Signed-off-by: Zhu Donghong Signed-off-by: Gu Zitao --- drivers/irqchip/Kconfig | 7 ++ drivers/irqchip/Makefile | 1 + drivers/irqchip/irq-lpc.c | 137 +++++++++++++++++++++++++++++++++ drivers/mfd/lpc_sunway_chip3.c | 130 ------------------------------- 4 files changed, 145 insertions(+), 130 deletions(-) create mode 100644 drivers/irqchip/irq-lpc.c diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index 375ba3b30b72..5bf6cf60999b 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -9,6 +9,13 @@ config SW64_INTC The INTC controls devices interrupts and connects them to each core's local interrupt controller. +config SW64_CHIP3_LPC + bool "SW64 Chip3 Buildin LPC Interrupt Controller" + depends on SW64_CHIP3 + help + This enables support for the LPC interrupt controller bultin in + on chip3 series. + config IRQCHIP def_bool y depends on OF_IRQ diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile index 4c78b0f64e6c..78eb12ab4d4c 100644 --- a/drivers/irqchip/Makefile +++ b/drivers/irqchip/Makefile @@ -28,6 +28,7 @@ obj-$(CONFIG_ARCH_SUNXI) += irq-sunxi-nmi.o obj-$(CONFIG_ARCH_SPEAR3XX) += spear-shirq.o obj-$(CONFIG_ARM_GIC) += irq-gic.o irq-gic-common.o obj-$(CONFIG_SW64_INTC) += irq-intc-v1.o +obj-$(CONFIG_SW64_CHIP3_LPC) += irq-lpc.o obj-$(CONFIG_ARM_GIC_PM) += irq-gic-pm.o obj-$(CONFIG_ARCH_REALVIEW) += irq-gic-realview.o obj-$(CONFIG_ARM_GIC_V2M) += irq-gic-v2m.o diff --git a/drivers/irqchip/irq-lpc.c b/drivers/irqchip/irq-lpc.c new file mode 100644 index 000000000000..1cbf87478242 --- /dev/null +++ b/drivers/irqchip/irq-lpc.c @@ -0,0 +1,137 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include + +#define LPC_NR_IRQS 16 +#define LPC_IRQ 0x4 +#define LPC_IRQ_MASK 0x8 + +struct lpc_intc_data { + struct irq_domain *domain; + struct irq_chip_generic *gc; +}; + +static void lpc_irq_mask_ack(struct irq_data *data) +{ + struct irq_chip_generic *gc = irq_data_get_irq_chip_data(data); + struct irq_chip_type *ct = irq_data_get_chip_type(data); + unsigned int mask = data->mask; + + irq_gc_lock(gc); + *ct->mask_cache |= mask; + irq_reg_writel(gc, *ct->mask_cache, ct->regs.mask); + irq_reg_writel(gc, mask, ct->regs.ack); + irq_gc_unlock(gc); +} + +static void lpc_irq_handler(struct irq_desc *desc) +{ + struct lpc_intc_data *b = irq_desc_get_handler_data(desc); + struct irq_chip *chip = irq_desc_get_chip(desc); + unsigned int irq; + u32 status; + + chained_irq_enter(chip, desc); + + status = irq_reg_readl(b->gc, LPC_IRQ); + + if (status == 0) { + raw_spin_lock(&desc->lock); + handle_bad_irq(desc); + raw_spin_unlock(&desc->lock); + goto out; + } + + while (status) { + irq = __ffs(status); + status &= ~BIT(irq); + generic_handle_irq(irq_find_mapping(b->domain, irq)); + } + +out: + chained_irq_exit(chip, desc); +} + +static int __init lpc_intc_of_init(struct device_node *np, + struct device_node *parent) +{ + unsigned int set = IRQ_NOPROBE | IRQ_LEVEL; + struct lpc_intc_data *data; + struct irq_chip_type *ct; + int parent_irq, ret; + void __iomem *base; + int hwirq = 0; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + base = of_iomap(np, 0); + if (!base) { + pr_err("failed to remap lpc intc registers\n"); + ret = -ENOMEM; + goto out_free; + } + + parent_irq = irq_of_parse_and_map(np, 0); + if (!parent_irq) { + pr_err("failed to find parent interrupt\n"); + ret = -EINVAL; + goto out_unmap; + } + + data->domain = irq_domain_add_linear(np, LPC_NR_IRQS, + &irq_generic_chip_ops, NULL); + if (!data->domain) { + ret = -ENOMEM; + goto out_unmap; + } + + /* Allocate a single Generic IRQ chip for this node */ + ret = irq_alloc_domain_generic_chips(data->domain, 16, 1, np->name, + handle_level_irq, 0, set, + IRQ_GC_INIT_MASK_CACHE); + if (ret) { + pr_err("failed to allocate generic irq chip\n"); + goto out_free_domain; + } + + /* Set the IRQ chaining logic */ + irq_set_chained_handler_and_data(parent_irq, + lpc_irq_handler, data); + + data->gc = irq_get_domain_generic_chip(data->domain, 0); + data->gc->reg_base = base; + data->gc->private = data; + + ct = data->gc->chip_types; + + ct->regs.ack = LPC_IRQ; + ct->regs.mask = LPC_IRQ_MASK; + ct->chip.irq_mask = irq_gc_mask_set_bit; + ct->chip.irq_unmask = irq_gc_mask_clr_bit; + ct->chip.irq_ack = irq_gc_ack_set_bit; + ct->chip.irq_mask_ack = lpc_irq_mask_ack; + + for (hwirq = 0 ; hwirq < 16 ; hwirq++) + irq_create_mapping(data->domain, hwirq); + + /* Enable LPC interrupts */ + writel(0xffffebdd, base + LPC_IRQ_MASK); + + return 0; + +out_free_domain: + irq_domain_remove(data->domain); +out_unmap: + iounmap(base); +out_free: + kfree(data); + return ret; +} +IRQCHIP_DECLARE(sw_lpc_intc, "sw64,lpc_intc", lpc_intc_of_init); diff --git a/drivers/mfd/lpc_sunway_chip3.c b/drivers/mfd/lpc_sunway_chip3.c index 793b557195c2..878aff87c992 100644 --- a/drivers/mfd/lpc_sunway_chip3.c +++ b/drivers/mfd/lpc_sunway_chip3.c @@ -75,53 +75,16 @@ enum { LPC_DMA_SWRST = 0x70, }; -enum { - LPC_IRQ0 = 0, /* 8254 Timer */ - LPC_IRQ1, /* Keyboard */ - LPC_IRQ2, /* Reserved */ - LPC_IRQ3, /* UART */ - LPC_IRQ4, /* UART */ - LPC_IRQ5, /* LPC Parallel Port2 */ - LPC_IRQ6, /* FDC-Floppy Disk Controller */ - LPC_IRQ7, /* LPT-Parallel Port1 */ - LPC_NR_IRQS, - LPC_IRQ8, /* RTC */ - LPC_IRQ9, /* Undefined */ - LPC_IRQ10, /* Undefined */ - LPC_IRQ11, /* Undefined */ - LPC_IRQ12, /* Mouse */ - LPC_IRQ13, /* Undefined */ - LPC_IRQ14, /* Undefined */ - LPC_IRQ15, /* Undefined */ -}; - struct lpc_chip3_adapter { void __iomem *hst_regs; struct device *dev; int irq; - struct irq_chip_generic *gc; unsigned int features; }; static struct resource superio_chip3_resources[] = { { .flags = IORESOURCE_IO, - }, { - .start = LPC_IRQ1, - .flags = IORESOURCE_IRQ, - .name = "i8042_kbd_irq", - }, { - .start = LPC_IRQ12, - .flags = IORESOURCE_IRQ, - .name = "i8042_aux_irq", - }, { - .start = LPC_IRQ5, - .flags = IORESOURCE_IRQ, - .name = "uart0_irq", - }, { - .start = LPC_IRQ4, - .flags = IORESOURCE_IRQ, - .name = "uart1_irq", } }; @@ -218,75 +181,9 @@ static void lpc_fw_flash_init(struct platform_device *pdev, } -static u32 lpc_do_irq(struct lpc_chip3_adapter *lpc_adapter) -{ - u32 irq_status = readl_relaxed(lpc_adapter->hst_regs + LPC_IRQ); - u32 ret = irq_status; - - DBG_LPC("%s irq_status=%#x\n", __func__, irq_status); - while (irq_status) { - int hwirq = fls(irq_status) - 1; - - generic_handle_irq(hwirq); - irq_status &= ~BIT(hwirq); - } - - lpc_writel(lpc_adapter->hst_regs, LPC_IRQ, ret); - return 1; -} - -static void lpc_irq_handler_mfd(struct irq_desc *desc) -{ - unsigned int irq = irq_desc_get_irq(desc); - struct lpc_chip3_adapter *lpc_adapter = irq_get_handler_data(irq); - u32 worked = 0; - - DBG_LPC("enter %s line:%d\n", __func__, __LINE__); - - worked = lpc_do_irq(lpc_adapter); - if (worked == IRQ_HANDLED) - dev_dbg(lpc_adapter->dev, "LPC irq handled.\n"); - - DBG_LPC("leave %s line:%d\n", __func__, __LINE__); -} - -static void lpc_unmask_interrupt_all(struct lpc_chip3_adapter *lpc_adapter) -{ - lpc_writel(lpc_adapter->hst_regs, LPC_IRQ_MASK, 0); -} - -static void lpc_irq_mask_ack(struct irq_data *d) -{ - struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); - struct irq_chip_type *ct = irq_data_get_chip_type(d); - u32 mask = d->mask; - - irq_gc_lock(gc); - *ct->mask_cache |= mask; - irq_reg_writel(gc, *ct->mask_cache, ct->regs.mask); - irq_reg_writel(gc, mask, ct->regs.ack); - irq_gc_unlock(gc); -} - -static void lpc_enable_irqs(struct lpc_chip3_adapter *lpc_adapter) -{ - int interrupt = 0; - - lpc_unmask_interrupt_all(lpc_adapter); - - interrupt = lpc_readl(lpc_adapter->hst_regs, LPC_IRQ); - - lpc_writel(lpc_adapter->hst_regs, LPC_CTL, 0x1600); - interrupt = lpc_readl(lpc_adapter->hst_regs, LPC_IRQ); -} - static int lpc_chip3_probe(struct platform_device *pdev) { int ret; - int num_ct = 1; - int irq_base; - struct irq_chip_generic *gc; - struct irq_chip_type *ct; struct lpc_chip3_adapter *lpc_adapter; struct resource *mem; @@ -312,32 +209,6 @@ static int lpc_chip3_probe(struct platform_device *pdev) lpc_adapter->dev = &pdev->dev; lpc_adapter->features = 0; - lpc_adapter->irq = platform_get_irq(pdev, 0); - if (lpc_adapter->irq < 0) { - dev_err(&pdev->dev, "no irq resource?\n"); - return lpc_adapter->irq; /* -ENXIO */ - } - - irq_base = LPC_IRQ0; - gc = irq_alloc_generic_chip("LPC_CHIP3", num_ct, irq_base, - lpc_adapter->hst_regs, handle_level_irq); - - ct = gc->chip_types; - ct->regs.mask = LPC_IRQ_MASK; - ct->regs.ack = LPC_IRQ; - ct->chip.irq_mask = irq_gc_mask_set_bit; - ct->chip.irq_unmask = irq_gc_mask_clr_bit; - ct->chip.irq_ack = irq_gc_ack_set_bit; - ct->chip.irq_mask_ack = lpc_irq_mask_ack; - irq_setup_generic_chip(gc, IRQ_MSK(LPC_NR_IRQS), 0, 0, - IRQ_NOPROBE | IRQ_LEVEL); - - lpc_adapter->gc = gc; - - irq_set_handler_data(lpc_adapter->irq, lpc_adapter); - irq_set_chained_handler(lpc_adapter->irq, - (irq_flow_handler_t) lpc_irq_handler_mfd); - lpc_enable(lpc_adapter); lpc_mem_flash_init(pdev, lpc_adapter); @@ -350,7 +221,6 @@ static int lpc_chip3_probe(struct platform_device *pdev) goto out_dev; dev_info(lpc_adapter->dev, "probe succeed !\n"); - lpc_enable_irqs(lpc_adapter); return ret; -- Gitee From f920164998e475ee8dc6a3dc2be9d406c380acf3 Mon Sep 17 00:00:00 2001 From: Zhu Donghong Date: Wed, 18 May 2022 15:24:50 +0800 Subject: [PATCH 114/674] sw64: add builtin LPC interrupt controller to chip3.dts Sunway inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5G7AK -------------------------------- This adds corresponding devicetree binding for LPC interrupt controllers on chip3 series cpu. Signed-off-by: Zhu Donghong Signed-off-by: Gu Zitao --- arch/sw_64/boot/dts/chip3.dts | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/arch/sw_64/boot/dts/chip3.dts b/arch/sw_64/boot/dts/chip3.dts index be2e91ee3279..082506393ac9 100644 --- a/arch/sw_64/boot/dts/chip3.dts +++ b/arch/sw_64/boot/dts/chip3.dts @@ -32,12 +32,20 @@ spiclk: spiclk { }; - intc: interrupt-controller{ + intc: interrupt-controller { compatible = "sw64,sw6_irq_controller"; interrupt-controller; #interrupt-cells = <1>; }; + lpc_intc: interrupt-controller@0x8037 { + compatible = "sw64,lpc_intc"; + reg = <0x8037 0x40000000 0x0 0x8000>; + interrupt-controller; + #interrupt-cells = <1>; + interrupt-parent = <&intc>; + interrupts = <2>; + }; uart: serial0@8033 { #address-cells = <2>; @@ -176,10 +184,8 @@ partition@0 { lpc: lpc@0x8037 { #address-cells = <2>; #size-cells = <2>; - compatible = "sw,sw6b_lpc"; + compatible = "sunway,chip3_lpc"; reg = <0x8037 0x40000000 0x0 0x8000>; - interrupt-parent=<&intc>; - interrupts = <2>; status = "okay"; }; @@ -202,6 +208,8 @@ ipmi-bt@0x8037 { device_type = "ipmi"; compatible = "ipmi-bt"; reg = <0x8037 0x100000e4 0x0 0x10>; + interrupt-parent=<&lpc_intc>; + interrupts = <10>; reg-size = <1>; reg-spacing = <1>; reg-shift = <0>; -- Gitee From 6422b5c0dbd7b5602f449ccec702266cdde2b34e Mon Sep 17 00:00:00 2001 From: He Chuyue Date: Thu, 19 May 2022 09:06:17 +0800 Subject: [PATCH 115/674] sw64: fix some structs related to pt_regs Sunway inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GF7A -------------------------------- Because of previous modifications to struct pt_regs, this patch fixes struct pt_regs_offset, dbg_reg_def and perf_event_sw64_regs to keep them consistent. Signed-off-by: He Chuyue Signed-off-by: Gu Zitao --- arch/sw_64/include/uapi/asm/perf_regs.h | 7 +++++++ arch/sw_64/kernel/kgdb.c | 14 +++++++------- arch/sw_64/kernel/ptrace.c | 7 +++++++ 3 files changed, 21 insertions(+), 7 deletions(-) diff --git a/arch/sw_64/include/uapi/asm/perf_regs.h b/arch/sw_64/include/uapi/asm/perf_regs.h index 426ae642fcc8..1378a7397951 100644 --- a/arch/sw_64/include/uapi/asm/perf_regs.h +++ b/arch/sw_64/include/uapi/asm/perf_regs.h @@ -13,6 +13,13 @@ enum perf_event_sw64_regs { PERF_REG_SW64_R6, PERF_REG_SW64_R7, PERF_REG_SW64_R8, + PERF_REG_SW64_R9, + PERF_REG_SW64_R10, + PERF_REG_SW64_R11, + PERF_REG_SW64_R12, + PERF_REG_SW64_R13, + PERF_REG_SW64_R14, + PERF_REG_SW64_R15, PERF_REG_SW64_R19, PERF_REG_SW64_R20, PERF_REG_SW64_R21, diff --git a/arch/sw_64/kernel/kgdb.c b/arch/sw_64/kernel/kgdb.c index 491f287eede9..ac2f397f1609 100644 --- a/arch/sw_64/kernel/kgdb.c +++ b/arch/sw_64/kernel/kgdb.c @@ -34,13 +34,13 @@ struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = { { "r7", 8, offsetof(struct pt_regs, r7)}, { "r8", 8, offsetof(struct pt_regs, r8)}, - { "r9", 8, -1 }, - { "r10", 8, -1 }, - { "r11", 8, -1 }, - { "r12", 8, -1 }, - { "r13", 8, -1 }, - { "r14", 8, -1 }, - { "r15", 8, -1 }, + { "r9", 8, offsetof(struct pt_regs, r9)}, + { "r10", 8, offsetof(struct pt_regs, r10)}, + { "r11", 8, offsetof(struct pt_regs, r11)}, + { "r12", 8, offsetof(struct pt_regs, r12)}, + { "r13", 8, offsetof(struct pt_regs, r13)}, + { "r14", 8, offsetof(struct pt_regs, r14)}, + { "r15", 8, offsetof(struct pt_regs, r15)}, { "r16", 8, offsetof(struct pt_regs, r16)}, { "r17", 8, offsetof(struct pt_regs, r17)}, diff --git a/arch/sw_64/kernel/ptrace.c b/arch/sw_64/kernel/ptrace.c index ce41d89a54cb..94809804e23e 100644 --- a/arch/sw_64/kernel/ptrace.c +++ b/arch/sw_64/kernel/ptrace.c @@ -619,6 +619,13 @@ static const struct pt_regs_offset regoffset_table[] = { REG_OFFSET_NAME(r6), REG_OFFSET_NAME(r7), REG_OFFSET_NAME(r8), + REG_OFFSET_NAME(r9), + REG_OFFSET_NAME(r10), + REG_OFFSET_NAME(r11), + REG_OFFSET_NAME(r12), + REG_OFFSET_NAME(r13), + REG_OFFSET_NAME(r14), + REG_OFFSET_NAME(r15), REG_OFFSET_NAME(r19), REG_OFFSET_NAME(r20), REG_OFFSET_NAME(r21), -- Gitee From 9fc89ea369a1953ee9ed8839a43df772eead479d Mon Sep 17 00:00:00 2001 From: He Chuyue Date: Tue, 24 May 2022 16:03:40 +0800 Subject: [PATCH 116/674] sw64: perf: add exclude_user and exclude_kernel support Sunway inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I56X48 -------------------------------- Although sw64 have no per-counter usr/os/guest/host bits, we can distinguish user and kernel by sample mode. Signed-off-by: He Chuyue Signed-off-by: Gu Zitao --- arch/sw_64/include/asm/wrperfmon.h | 4 ++++ arch/sw_64/kernel/perf_event.c | 20 +++++++++++++++----- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/arch/sw_64/include/asm/wrperfmon.h b/arch/sw_64/include/asm/wrperfmon.h index eaa6735b5a25..098702573bfc 100644 --- a/arch/sw_64/include/asm/wrperfmon.h +++ b/arch/sw_64/include/asm/wrperfmon.h @@ -38,6 +38,10 @@ #define PC1_MIN 0x0 #define PC1_MAX 0x37 +#define SW64_PERFCTRL_KM 2 +#define SW64_PERFCTRL_UM 3 +#define SW64_PERFCTRL_AM 4 + /* pc0 events */ #define PC0_INSTRUCTIONS 0x0 #define PC0_BRANCH_INSTRUCTIONS 0x3 diff --git a/arch/sw_64/kernel/perf_event.c b/arch/sw_64/kernel/perf_event.c index d2975e17f666..0d49224ba058 100644 --- a/arch/sw_64/kernel/perf_event.c +++ b/arch/sw_64/kernel/perf_event.c @@ -457,8 +457,8 @@ static void sw64_pmu_start(struct perf_event *event, int flags) hwc->state = 0; - /* counting in all modes, for both counters */ - wrperfmon(PERFMON_CMD_PM, 4); + /* counting in selected modes, for both counters */ + wrperfmon(PERFMON_CMD_PM, hwc->config_base); if (hwc->idx == PERFMON_PC0) { wrperfmon(PERFMON_CMD_EVENT_PC0, hwc->event_base); wrperfmon(PERFMON_CMD_ENABLE, PERFMON_ENABLE_ARGS_PC0); @@ -519,9 +519,12 @@ static int __hw_perf_event_init(struct perf_event *event) const struct sw64_perf_event *event_type; - /* SW64 do not have per-counter usr/os/guest/host bits */ - if (event->attr.exclude_user || event->attr.exclude_kernel || - event->attr.exclude_hv || event->attr.exclude_idle || + /* + * SW64 does not have per-counter usr/os/guest/host bits, + * we can distinguish exclude_user and exclude_kernel by + * sample mode. + */ + if (event->attr.exclude_hv || event->attr.exclude_idle || event->attr.exclude_host || event->attr.exclude_guest) return -EINVAL; @@ -553,6 +556,13 @@ static int __hw_perf_event_init(struct perf_event *event) hwc->event_base = attr->config & 0xff; /* event selector */ } + hwc->config_base = SW64_PERFCTRL_AM; + + if (attr->exclude_user) + hwc->config_base = SW64_PERFCTRL_KM; + if (attr->exclude_kernel) + hwc->config_base = SW64_PERFCTRL_UM; + hwc->config = attr->config; if (!is_sampling_event(event)) -- Gitee From b9e3841445914e43ecf42975f2d8a8f3c8239e25 Mon Sep 17 00:00:00 2001 From: Hang Xiaoqian Date: Wed, 25 May 2022 10:54:25 +0800 Subject: [PATCH 117/674] sw64: change the value of physical_id in /proc/cpuinfo Sunway inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I56OLG -------------------------------- The value of physical_id in /proc/cpuinfo should be the socket_id of the processor. Signed-off-by: Hang Xiaoqian Signed-off-by: Gu Zitao --- arch/sw_64/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/sw_64/kernel/setup.c b/arch/sw_64/kernel/setup.c index faae7cb3c5a1..4d39db5fb1ef 100644 --- a/arch/sw_64/kernel/setup.c +++ b/arch/sw_64/kernel/setup.c @@ -907,7 +907,7 @@ show_cpuinfo(struct seq_file *f, void *slot) "physical id\t: %d\n" "bogomips\t: %lu.%02lu\n", cpu_freq, cpu_data[i].tcache.size >> 10, - cpu_to_rcid(i), + cpu_topology[i].package_id, loops_per_jiffy / (500000/HZ), (loops_per_jiffy / (5000/HZ)) % 100); -- Gitee From 06ed664fca7dcb8963d9868472a4cbbb149edb27 Mon Sep 17 00:00:00 2001 From: Gu Zitao Date: Tue, 24 May 2022 15:28:27 +0800 Subject: [PATCH 118/674] sw64: add ARCH_TRACEHOOK and regset support Sunway inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5G7NL -------------------------------- By adding TRACEHOOK support, we can access register sets via PTRACE_GETREGSET and PTRACE_SETREGSET request. The user-visible regset struct user_pt_regs and user_fpsimd_state are added in this patch, and they can be accessed with NT_PRSTATUS and NT_PRFPREG respectively. Besides, PTRACE_{GET,SET}REGS and PTRACE_{GET,SET}FPREGS requests which are implemented in error have never been used on sw64, so we remove them completely. Signed-off-by: Gu Zitao Signed-off-by: Gu Zitao --- arch/sw_64/Kconfig | 1 + arch/sw_64/include/uapi/asm/ptrace.h | 18 ++- arch/sw_64/kernel/ptrace.c | 194 +++++++++++++-------------- 3 files changed, 111 insertions(+), 102 deletions(-) diff --git a/arch/sw_64/Kconfig b/arch/sw_64/Kconfig index 2d7721acb529..0e32fc7e1f9a 100644 --- a/arch/sw_64/Kconfig +++ b/arch/sw_64/Kconfig @@ -68,6 +68,7 @@ config SW64 select ARCH_HAS_SG_CHAIN select IRQ_FORCED_THREADING select GENERIC_IRQ_MIGRATION if SMP + select HAVE_ARCH_TRACEHOOK select HAVE_FUNCTION_TRACER select HAVE_DYNAMIC_FTRACE select HAVE_FTRACE_MCOUNT_RECORD diff --git a/arch/sw_64/include/uapi/asm/ptrace.h b/arch/sw_64/include/uapi/asm/ptrace.h index ac7ce6b6510b..1169d87c4b9c 100644 --- a/arch/sw_64/include/uapi/asm/ptrace.h +++ b/arch/sw_64/include/uapi/asm/ptrace.h @@ -2,10 +2,20 @@ #ifndef _UAPI_ASM_SW64_PTRACE_H #define _UAPI_ASM_SW64_PTRACE_H -#define PTRACE_GETREGS 12 /* get general purpose registers */ -#define PTRACE_SETREGS 13 /* set general purpose registers */ -#define PTRACE_GETFPREGS 14 /* get floating-point registers */ -#define PTRACE_SETFPREGS 15 /* set floating-point registers */ +/* + * User structures for general purpose, floating point and debug registers. + */ +struct user_pt_regs { + __u64 regs[31]; + __u64 pc; + __u64 pstate; +}; + +struct user_fpsimd_state { + __u64 vregs[124]; + __u64 fpcr; +}; + /* PTRACE_ATTACH is 16 */ /* PTRACE_DETACH is 17 */ diff --git a/arch/sw_64/kernel/ptrace.c b/arch/sw_64/kernel/ptrace.c index 94809804e23e..94fba3569781 100644 --- a/arch/sw_64/kernel/ptrace.c +++ b/arch/sw_64/kernel/ptrace.c @@ -7,6 +7,8 @@ #include #include +#include +#include #include #include @@ -270,123 +272,131 @@ void ptrace_disable(struct task_struct *child) user_disable_single_step(child); } -int ptrace_getregs(struct task_struct *child, __s64 __user *data) +static int gpr_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) { - int ret, retval = 0; - int i; - unsigned long regval; + struct pt_regs *regs; + struct user_pt_regs uregs; + int i, ret; - if (!access_ok(data, sizeof(long) * 33)) - return -EIO; + regs = task_pt_regs(target); + for (i = 0; i < 30; i++) + uregs.regs[i] = *(__u64 *)((void *)regs + regoffsets[i]); + + uregs.regs[30] = task_thread_info(target)->pcb.usp; + uregs.pc = regs->pc; + uregs.pstate = regs->ps; + + ret = membuf_write(&to, &uregs, sizeof(uregs)); - /* r0-r15 */ - for (i = 0; i < 16; i++) { - regval = get_reg(child, i); - retval |= __put_user((long)regval, data + i); - } - /* r19-r28 */ - for (i = 19; i < 29; i++) { - regval = get_reg(child, i); - retval |= __put_user((long)regval, data + i - 3); - } - /*SP, PS ,PC,GP*/ - retval |= __put_user((long)(get_reg(child, REG_SP)), data + EF_SP); - retval |= __put_user((long)(get_reg(child, REG_PS)), data + EF_PS); - retval |= __put_user((long)(get_reg(child, REG_PC)), data + EF_PC); - retval |= __put_user((long)(get_reg(child, REG_GP)), data + EF_GP); - /* r16-r18 */ - retval |= __put_user((long)(get_reg(child, 16)), data + EF_A0); - retval |= __put_user((long)(get_reg(child, 17)), data + EF_A1); - retval |= __put_user((long)(get_reg(child, 18)), data + EF_A2); - - ret = retval ? -EIO : 0; return ret; } -int ptrace_setregs(struct task_struct *child, __s64 __user *data) +static int gpr_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) { - int ret, retval = 0; - int i; - unsigned long regval; + struct pt_regs *regs; + struct user_pt_regs uregs; + int i, ret; - if (!access_ok(data, sizeof(long) * 33)) - return -EIO; + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &uregs, 0, sizeof(uregs)); + if (ret) + return ret; + + regs = task_pt_regs(target); + for (i = 0; i < 30; i++) + *(__u64 *)((void *)regs + regoffsets[i]) = uregs.regs[i]; + + task_thread_info(target)->pcb.usp = uregs.regs[30]; + regs->pc = uregs.pc; + regs->ps = uregs.pstate; - /* r0-r15 */ - for (i = 0; i < 16; i++) { - retval |= __get_user(regval, data + i); - ret = put_reg(child, i, regval); - } - /* r19-r28 */ - for (i = 19; i < 29; i++) { - retval |= __get_user(regval, data + i - 3); - ret = put_reg(child, i, regval); - } - /*SP, PS ,PC,GP*/ - retval |= __get_user(regval, data + EF_SP); - ret = put_reg(child, REG_SP, regval); - retval |= __get_user(regval, data + EF_PS); - ret = put_reg(child, REG_PS, regval); - retval |= __get_user(regval, data + EF_PC); - ret = put_reg(child, REG_PC, regval); - retval |= __get_user(regval, data + EF_GP); - ret = put_reg(child, REG_GP, regval); - /* r16-r18 */ - retval |= __get_user(regval, data + EF_A0); - ret = put_reg(child, 16, regval); - retval |= __get_user(regval, data + EF_A1); - ret = put_reg(child, 17, regval); - retval |= __get_user(regval, data + EF_A2); - ret = put_reg(child, 18, regval); - - ret = retval ? -EIO : 0; return 0; } -int ptrace_getfpregs(struct task_struct *child, __s64 __user *data) +static int fpr_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) { - int ret, retval = 0; - int i; - unsigned long regval; + int ret; + struct user_fpsimd_state uregs; - if (!access_ok(data, sizeof(long) * 32)) - return -EIO; + memcpy(uregs.vregs, &target->thread.ctx_fp, + sizeof(struct context_fpregs)); - /* fp0-fp31 */ - for (i = 0; i < 32; i++) { - regval = get_reg(child, REG_F0 + i); - retval |= __put_user((long)regval, data + i); - } + uregs.fpcr = target->thread.fpcr; - ret = retval ? -EIO : 0; - return 0; + ret = membuf_write(&to, &uregs, sizeof(uregs)); + + return ret; } -int ptrace_setfpregs(struct task_struct *child, __s64 __user *data) +static int fpr_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) { - int ret, retval = 0; - int i; - unsigned long regval; + int ret; + struct user_fpsimd_state uregs; - if (!access_ok(data, sizeof(long) * 32)) - return -EIO; + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &uregs, 0, sizeof(uregs)); - /* fp0-fp31 */ - for (i = 0; i < 32; i++) { - retval |= __get_user(regval, data + i); - ret = put_reg(child, REG_F0 + i, regval); - } + if (ret) + return ret; + + memcpy(&target->thread.ctx_fp, uregs.vregs, + sizeof(struct context_fpregs)); + + target->thread.fpcr = uregs.fpcr; return ret; } +enum sw64_regset { + REGSET_GPR, + REGSET_FPR, +}; + +static const struct user_regset sw64_regsets[] = { + [REGSET_GPR] = { + .core_note_type = NT_PRSTATUS, + .n = ELF_NGREG, + .size = sizeof(elf_greg_t), + .align = sizeof(elf_greg_t), + .regset_get = gpr_get, + .set = gpr_set + }, + [REGSET_FPR] = { + .core_note_type = NT_PRFPREG, + .n = sizeof(struct user_fpsimd_state) / sizeof(u64), + .size = sizeof(u64), + .align = sizeof(u64), + .regset_get = fpr_get, + .set = fpr_set + }, +}; + +static const struct user_regset_view user_sw64_view = { + .name = "sw64", .e_machine = EM_SW64, + .regsets = sw64_regsets, .n = ARRAY_SIZE(sw64_regsets) +}; + +const struct user_regset_view *task_user_regset_view(struct task_struct *task) +{ + return &user_sw64_view; +} + long arch_ptrace(struct task_struct *child, long request, unsigned long addr, unsigned long data) { unsigned long tmp; size_t copied; long ret; - void __user *datavp = (void __user *) data; switch (request) { /* When I and D space are separate, these will need to be fixed. */ @@ -416,18 +426,6 @@ long arch_ptrace(struct task_struct *child, long request, case PTRACE_POKEUSR: /* write the specified register */ ret = put_reg(child, addr, data); break; - case PTRACE_GETREGS: - ret = ptrace_getregs(child, datavp); - break; - case PTRACE_SETREGS: - ret = ptrace_setregs(child, datavp); - break; - case PTRACE_GETFPREGS: - ret = ptrace_getfpregs(child, datavp); - break; - case PTRACE_SETFPREGS: - ret = ptrace_setfpregs(child, datavp); - break; default: ret = ptrace_request(child, request, addr, data); break; -- Gitee From a9839e292f57d2e4f65e82e73d278fff9adff87e Mon Sep 17 00:00:00 2001 From: He Sheng Date: Thu, 26 May 2022 11:00:48 +0800 Subject: [PATCH 119/674] sw64: merge user_fpsimd_state into thread_struct Sunway inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5G7NL -------------------------------- The user_fpsimd_state struct has been defined to hold fpu state which is also held in thread_struct. To reuse this struct, this patch makes a little change to user_fpsimd_state, then replace struct context_fpregs and fpcr of thread_struct. This patch also moves definition of task_pt_regs and mm_segment_t to appropriate headers to avoid compile errors. Signed-off-by: He Sheng Signed-off-by: Gu Zitao --- arch/sw_64/include/asm/processor.h | 45 +-------- arch/sw_64/include/asm/ptrace.h | 4 - arch/sw_64/include/asm/thread_info.h | 5 + arch/sw_64/include/uapi/asm/ptrace.h | 8 +- arch/sw_64/kernel/asm-offsets.c | 67 +++++++------- arch/sw_64/kernel/fpu.S | 134 +++++++++++++-------------- arch/sw_64/kernel/process.c | 2 +- arch/sw_64/kernel/ptrace.c | 51 +++------- arch/sw_64/kernel/signal.c | 13 +-- arch/sw_64/kvm/entry.S | 40 ++++---- 10 files changed, 155 insertions(+), 214 deletions(-) diff --git a/arch/sw_64/include/asm/processor.h b/arch/sw_64/include/asm/processor.h index 67d3b4368987..08e8cc8e5428 100644 --- a/arch/sw_64/include/asm/processor.h +++ b/arch/sw_64/include/asm/processor.h @@ -9,6 +9,10 @@ #define _ASM_SW64_PROCESSOR_H #include /* for ADDR_LIMIT_32BIT */ +#include + +#define task_pt_regs(task) \ + ((struct pt_regs *) (task_stack_page(task) + 2 * PAGE_SIZE) - 1) /* * Returns current instruction pointer ("program counter"). @@ -37,47 +41,8 @@ #define TASK_UNMAPPED_BASE \ ((current->personality & ADDR_LIMIT_32BIT) ? 0x40000000 : UNMAPPED_BASE) -typedef struct { - unsigned long seg; -} mm_segment_t; - -struct context_fpregs { - unsigned long f0[4]; - unsigned long f1[4]; - unsigned long f2[4]; - unsigned long f3[4]; - unsigned long f4[4]; - unsigned long f5[4]; - unsigned long f6[4]; - unsigned long f7[4]; - unsigned long f8[4]; - unsigned long f9[4]; - unsigned long f10[4]; - unsigned long f11[4]; - unsigned long f12[4]; - unsigned long f13[4]; - unsigned long f14[4]; - unsigned long f15[4]; - unsigned long f16[4]; - unsigned long f17[4]; - unsigned long f18[4]; - unsigned long f19[4]; - unsigned long f20[4]; - unsigned long f21[4]; - unsigned long f22[4]; - unsigned long f23[4]; - unsigned long f24[4]; - unsigned long f25[4]; - unsigned long f26[4]; - unsigned long f27[4]; - unsigned long f28[4]; - unsigned long f29[4]; - unsigned long f30[4]; -} __aligned(32); /* 256 bits aligned for simd */ - struct thread_struct { - struct context_fpregs ctx_fp; - unsigned long fpcr; + struct user_fpsimd_state fpstate; /* Callee-saved registers */ unsigned long ra; unsigned long s[7]; /* s0 ~ s6 */ diff --git a/arch/sw_64/include/asm/ptrace.h b/arch/sw_64/include/asm/ptrace.h index 85f7a47fdee3..ac9943015663 100644 --- a/arch/sw_64/include/asm/ptrace.h +++ b/arch/sw_64/include/asm/ptrace.h @@ -3,10 +3,8 @@ #define _ASM_SW64_PTRACE_H #include -#include #include #include -#include #include /* @@ -65,8 +63,6 @@ struct pt_regs { #define kernel_stack_pointer(regs) (((regs->ps) >> 4) & (TASK_SIZE - 1)) #define instruction_pointer_set(regs, val) ((regs)->pc = val) -#define task_pt_regs(task) \ - ((struct pt_regs *) (task_stack_page(task) + 2 * PAGE_SIZE) - 1) #define current_pt_regs() \ ((struct pt_regs *) ((char *)current_thread_info() + 2 * PAGE_SIZE) - 1) diff --git a/arch/sw_64/include/asm/thread_info.h b/arch/sw_64/include/asm/thread_info.h index cffb09fc6262..33b95f815448 100644 --- a/arch/sw_64/include/asm/thread_info.h +++ b/arch/sw_64/include/asm/thread_info.h @@ -9,6 +9,11 @@ #include #include +typedef struct { + unsigned long seg; +} mm_segment_t; + + struct pcb_struct { unsigned long ksp; unsigned long usp; diff --git a/arch/sw_64/include/uapi/asm/ptrace.h b/arch/sw_64/include/uapi/asm/ptrace.h index 1169d87c4b9c..a45b10b8914a 100644 --- a/arch/sw_64/include/uapi/asm/ptrace.h +++ b/arch/sw_64/include/uapi/asm/ptrace.h @@ -11,9 +11,15 @@ struct user_pt_regs { __u64 pstate; }; +/* 256 bits aligned for simd */ +struct fpreg { + __u64 v[4] __attribute__((aligned(32))); +}; + struct user_fpsimd_state { - __u64 vregs[124]; + struct fpreg fp[31]; __u64 fpcr; + __u64 __reserved[3]; }; /* PTRACE_ATTACH is 16 */ diff --git a/arch/sw_64/kernel/asm-offsets.c b/arch/sw_64/kernel/asm-offsets.c index 5dc59346996f..0c7e1e26eb05 100644 --- a/arch/sw_64/kernel/asm-offsets.c +++ b/arch/sw_64/kernel/asm-offsets.c @@ -178,40 +178,39 @@ void foo(void) DEFINE(HOST_INT_R16, offsetof(struct host_int_args, r16)); BLANK(); - DEFINE(TASK_THREAD, offsetof(struct task_struct, thread)); - DEFINE(THREAD_CTX_FP, offsetof(struct thread_struct, ctx_fp)); - DEFINE(THREAD_FPCR, offsetof(struct thread_struct, fpcr)); - DEFINE(CTX_FP_F0, offsetof(struct context_fpregs, f0)); - DEFINE(CTX_FP_F1, offsetof(struct context_fpregs, f1)); - DEFINE(CTX_FP_F2, offsetof(struct context_fpregs, f2)); - DEFINE(CTX_FP_F3, offsetof(struct context_fpregs, f3)); - DEFINE(CTX_FP_F4, offsetof(struct context_fpregs, f4)); - DEFINE(CTX_FP_F5, offsetof(struct context_fpregs, f5)); - DEFINE(CTX_FP_F6, offsetof(struct context_fpregs, f6)); - DEFINE(CTX_FP_F7, offsetof(struct context_fpregs, f7)); - DEFINE(CTX_FP_F8, offsetof(struct context_fpregs, f8)); - DEFINE(CTX_FP_F9, offsetof(struct context_fpregs, f9)); - DEFINE(CTX_FP_F10, offsetof(struct context_fpregs, f10)); - DEFINE(CTX_FP_F11, offsetof(struct context_fpregs, f11)); - DEFINE(CTX_FP_F12, offsetof(struct context_fpregs, f12)); - DEFINE(CTX_FP_F13, offsetof(struct context_fpregs, f13)); - DEFINE(CTX_FP_F14, offsetof(struct context_fpregs, f14)); - DEFINE(CTX_FP_F15, offsetof(struct context_fpregs, f15)); - DEFINE(CTX_FP_F16, offsetof(struct context_fpregs, f16)); - DEFINE(CTX_FP_F17, offsetof(struct context_fpregs, f17)); - DEFINE(CTX_FP_F18, offsetof(struct context_fpregs, f18)); - DEFINE(CTX_FP_F19, offsetof(struct context_fpregs, f19)); - DEFINE(CTX_FP_F20, offsetof(struct context_fpregs, f20)); - DEFINE(CTX_FP_F21, offsetof(struct context_fpregs, f21)); - DEFINE(CTX_FP_F22, offsetof(struct context_fpregs, f22)); - DEFINE(CTX_FP_F23, offsetof(struct context_fpregs, f23)); - DEFINE(CTX_FP_F24, offsetof(struct context_fpregs, f24)); - DEFINE(CTX_FP_F25, offsetof(struct context_fpregs, f25)); - DEFINE(CTX_FP_F26, offsetof(struct context_fpregs, f26)); - DEFINE(CTX_FP_F27, offsetof(struct context_fpregs, f27)); - DEFINE(CTX_FP_F28, offsetof(struct context_fpregs, f28)); - DEFINE(CTX_FP_F29, offsetof(struct context_fpregs, f29)); - DEFINE(CTX_FP_F30, offsetof(struct context_fpregs, f30)); + OFFSET(TASK_THREAD, task_struct, thread); + OFFSET(TASK_THREAD_F0, task_struct, thread.fpstate.fp[0]); + OFFSET(TASK_THREAD_F1, task_struct, thread.fpstate.fp[1]); + OFFSET(TASK_THREAD_F2, task_struct, thread.fpstate.fp[2]); + OFFSET(TASK_THREAD_F3, task_struct, thread.fpstate.fp[3]); + OFFSET(TASK_THREAD_F4, task_struct, thread.fpstate.fp[4]); + OFFSET(TASK_THREAD_F5, task_struct, thread.fpstate.fp[5]); + OFFSET(TASK_THREAD_F6, task_struct, thread.fpstate.fp[6]); + OFFSET(TASK_THREAD_F7, task_struct, thread.fpstate.fp[7]); + OFFSET(TASK_THREAD_F8, task_struct, thread.fpstate.fp[8]); + OFFSET(TASK_THREAD_F9, task_struct, thread.fpstate.fp[9]); + OFFSET(TASK_THREAD_F10, task_struct, thread.fpstate.fp[10]); + OFFSET(TASK_THREAD_F11, task_struct, thread.fpstate.fp[11]); + OFFSET(TASK_THREAD_F12, task_struct, thread.fpstate.fp[12]); + OFFSET(TASK_THREAD_F13, task_struct, thread.fpstate.fp[13]); + OFFSET(TASK_THREAD_F14, task_struct, thread.fpstate.fp[14]); + OFFSET(TASK_THREAD_F15, task_struct, thread.fpstate.fp[15]); + OFFSET(TASK_THREAD_F16, task_struct, thread.fpstate.fp[16]); + OFFSET(TASK_THREAD_F17, task_struct, thread.fpstate.fp[17]); + OFFSET(TASK_THREAD_F18, task_struct, thread.fpstate.fp[18]); + OFFSET(TASK_THREAD_F19, task_struct, thread.fpstate.fp[19]); + OFFSET(TASK_THREAD_F20, task_struct, thread.fpstate.fp[20]); + OFFSET(TASK_THREAD_F21, task_struct, thread.fpstate.fp[21]); + OFFSET(TASK_THREAD_F22, task_struct, thread.fpstate.fp[22]); + OFFSET(TASK_THREAD_F23, task_struct, thread.fpstate.fp[23]); + OFFSET(TASK_THREAD_F24, task_struct, thread.fpstate.fp[24]); + OFFSET(TASK_THREAD_F25, task_struct, thread.fpstate.fp[25]); + OFFSET(TASK_THREAD_F26, task_struct, thread.fpstate.fp[26]); + OFFSET(TASK_THREAD_F27, task_struct, thread.fpstate.fp[27]); + OFFSET(TASK_THREAD_F28, task_struct, thread.fpstate.fp[28]); + OFFSET(TASK_THREAD_F29, task_struct, thread.fpstate.fp[29]); + OFFSET(TASK_THREAD_F30, task_struct, thread.fpstate.fp[30]); + OFFSET(TASK_THREAD_FPCR, task_struct, thread.fpstate.fpcr); BLANK(); OFFSET(TASK_THREAD_RA, task_struct, thread.ra); OFFSET(TASK_THREAD_S0, task_struct, thread.s[0]); diff --git a/arch/sw_64/kernel/fpu.S b/arch/sw_64/kernel/fpu.S index 25dcf4740525..3cb3bfab08e8 100644 --- a/arch/sw_64/kernel/fpu.S +++ b/arch/sw_64/kernel/fpu.S @@ -8,49 +8,46 @@ .set noat ENTRY(__fpstate_save) /* a0: prev task */ - ldi a0, TASK_THREAD(a0) - ldi t0, THREAD_CTX_FP(a0) - vstd $f0, CTX_FP_F0(t0) - vstd $f1, CTX_FP_F1(t0) - vstd $f2, CTX_FP_F2(t0) - vstd $f3, CTX_FP_F3(t0) - vstd $f4, CTX_FP_F4(t0) - vstd $f5, CTX_FP_F5(t0) - vstd $f6, CTX_FP_F6(t0) - vstd $f7, CTX_FP_F7(t0) - vstd $f8, CTX_FP_F8(t0) - vstd $f9, CTX_FP_F9(t0) - vstd $f10, CTX_FP_F10(t0) - vstd $f11, CTX_FP_F11(t0) - vstd $f12, CTX_FP_F12(t0) - vstd $f13, CTX_FP_F13(t0) - vstd $f14, CTX_FP_F14(t0) - vstd $f15, CTX_FP_F15(t0) - vstd $f16, CTX_FP_F16(t0) - vstd $f17, CTX_FP_F17(t0) - vstd $f18, CTX_FP_F18(t0) - vstd $f19, CTX_FP_F19(t0) - vstd $f20, CTX_FP_F20(t0) - vstd $f21, CTX_FP_F21(t0) - vstd $f22, CTX_FP_F22(t0) - vstd $f23, CTX_FP_F23(t0) - vstd $f24, CTX_FP_F24(t0) - vstd $f25, CTX_FP_F25(t0) - vstd $f26, CTX_FP_F26(t0) - vstd $f27, CTX_FP_F27(t0) + vstd $f0, TASK_THREAD_F0(a0) + vstd $f1, TASK_THREAD_F1(a0) + vstd $f2, TASK_THREAD_F2(a0) + vstd $f3, TASK_THREAD_F3(a0) + vstd $f4, TASK_THREAD_F4(a0) + vstd $f5, TASK_THREAD_F5(a0) + vstd $f6, TASK_THREAD_F6(a0) + vstd $f7, TASK_THREAD_F7(a0) + vstd $f8, TASK_THREAD_F8(a0) + vstd $f9, TASK_THREAD_F9(a0) + vstd $f10, TASK_THREAD_F10(a0) + vstd $f11, TASK_THREAD_F11(a0) + vstd $f12, TASK_THREAD_F12(a0) + vstd $f13, TASK_THREAD_F13(a0) + vstd $f14, TASK_THREAD_F14(a0) + vstd $f15, TASK_THREAD_F15(a0) + vstd $f16, TASK_THREAD_F16(a0) + vstd $f17, TASK_THREAD_F17(a0) + vstd $f18, TASK_THREAD_F18(a0) + vstd $f19, TASK_THREAD_F19(a0) + vstd $f20, TASK_THREAD_F20(a0) + vstd $f21, TASK_THREAD_F21(a0) + vstd $f22, TASK_THREAD_F22(a0) + vstd $f23, TASK_THREAD_F23(a0) + vstd $f24, TASK_THREAD_F24(a0) + vstd $f25, TASK_THREAD_F25(a0) + vstd $f26, TASK_THREAD_F26(a0) + vstd $f27, TASK_THREAD_F27(a0) rfpcr $f0 - vstd $f28, CTX_FP_F28(t0) - vstd $f29, CTX_FP_F29(t0) - vstd $f30, CTX_FP_F30(t0) - fstd $f0, THREAD_FPCR(a0) - vldd $f0, CTX_FP_F0(t0) + vstd $f28, TASK_THREAD_F28(a0) + vstd $f29, TASK_THREAD_F29(a0) + vstd $f30, TASK_THREAD_F30(a0) + fstd $f0, TASK_THREAD_FPCR(a0) + vldd $f0, TASK_THREAD_F0(a0) ret END(__fpstate_save) ENTRY(__fpstate_restore) /* a0: next task */ - ldi a0, TASK_THREAD(a0) - fldd $f0, THREAD_FPCR(a0) + fldd $f0, TASK_THREAD_FPCR(a0) wfpcr $f0 fimovd $f0, t1 and t1, 0x3, t1 @@ -70,37 +67,36 @@ $setfpec_1: $setfpec_2: setfpec2 $setfpec_over: - ldi t0, THREAD_CTX_FP(a0) - vldd $f0, CTX_FP_F0(t0) - vldd $f1, CTX_FP_F1(t0) - vldd $f2, CTX_FP_F2(t0) - vldd $f3, CTX_FP_F3(t0) - vldd $f4, CTX_FP_F4(t0) - vldd $f5, CTX_FP_F5(t0) - vldd $f6, CTX_FP_F6(t0) - vldd $f7, CTX_FP_F7(t0) - vldd $f8, CTX_FP_F8(t0) - vldd $f9, CTX_FP_F9(t0) - vldd $f10, CTX_FP_F10(t0) - vldd $f11, CTX_FP_F11(t0) - vldd $f12, CTX_FP_F12(t0) - vldd $f13, CTX_FP_F13(t0) - vldd $f14, CTX_FP_F14(t0) - vldd $f15, CTX_FP_F15(t0) - vldd $f16, CTX_FP_F16(t0) - vldd $f17, CTX_FP_F17(t0) - vldd $f18, CTX_FP_F18(t0) - vldd $f19, CTX_FP_F19(t0) - vldd $f20, CTX_FP_F20(t0) - vldd $f21, CTX_FP_F21(t0) - vldd $f22, CTX_FP_F22(t0) - vldd $f23, CTX_FP_F23(t0) - vldd $f24, CTX_FP_F24(t0) - vldd $f25, CTX_FP_F25(t0) - vldd $f26, CTX_FP_F26(t0) - vldd $f27, CTX_FP_F27(t0) - vldd $f28, CTX_FP_F28(t0) - vldd $f29, CTX_FP_F29(t0) - vldd $f30, CTX_FP_F30(t0) + vldd $f0, TASK_THREAD_F0(a0) + vldd $f1, TASK_THREAD_F1(a0) + vldd $f2, TASK_THREAD_F2(a0) + vldd $f3, TASK_THREAD_F3(a0) + vldd $f4, TASK_THREAD_F4(a0) + vldd $f5, TASK_THREAD_F5(a0) + vldd $f6, TASK_THREAD_F6(a0) + vldd $f7, TASK_THREAD_F7(a0) + vldd $f8, TASK_THREAD_F8(a0) + vldd $f9, TASK_THREAD_F9(a0) + vldd $f10, TASK_THREAD_F10(a0) + vldd $f11, TASK_THREAD_F11(a0) + vldd $f12, TASK_THREAD_F12(a0) + vldd $f13, TASK_THREAD_F13(a0) + vldd $f14, TASK_THREAD_F14(a0) + vldd $f15, TASK_THREAD_F15(a0) + vldd $f16, TASK_THREAD_F16(a0) + vldd $f17, TASK_THREAD_F17(a0) + vldd $f18, TASK_THREAD_F18(a0) + vldd $f19, TASK_THREAD_F19(a0) + vldd $f20, TASK_THREAD_F20(a0) + vldd $f21, TASK_THREAD_F21(a0) + vldd $f22, TASK_THREAD_F22(a0) + vldd $f23, TASK_THREAD_F23(a0) + vldd $f24, TASK_THREAD_F24(a0) + vldd $f25, TASK_THREAD_F25(a0) + vldd $f26, TASK_THREAD_F26(a0) + vldd $f27, TASK_THREAD_F27(a0) + vldd $f28, TASK_THREAD_F28(a0) + vldd $f29, TASK_THREAD_F29(a0) + vldd $f30, TASK_THREAD_F30(a0) ret END(__fpstate_restore) diff --git a/arch/sw_64/kernel/process.c b/arch/sw_64/kernel/process.c index 6b5f0f3a8345..308675d29c04 100644 --- a/arch/sw_64/kernel/process.c +++ b/arch/sw_64/kernel/process.c @@ -256,7 +256,7 @@ EXPORT_SYMBOL(dump_elf_task); int dump_elf_task_fp(elf_fpreg_t *dest, struct task_struct *task) { - memcpy(dest, &task->thread.ctx_fp, 32 * 8); + memcpy(dest, &task->thread.fpstate, 32 * 8); return 1; } EXPORT_SYMBOL(dump_elf_task_fp); diff --git a/arch/sw_64/kernel/ptrace.c b/arch/sw_64/kernel/ptrace.c index 94fba3569781..bdbd0d97a130 100644 --- a/arch/sw_64/kernel/ptrace.c +++ b/arch/sw_64/kernel/ptrace.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -55,9 +56,6 @@ enum { REG_GP = 29 }; -#define FP_REG(fp_regno, vector_regno) \ - (fp_regno * 32 + vector_regno * 8) - #define R(x) ((size_t) &((struct pt_regs *)0)->x) short regoffsets[32] = { @@ -91,8 +89,8 @@ static unsigned long zero; static unsigned long * get_reg_addr(struct task_struct *task, unsigned long regno) { - unsigned long *addr; - int fp_regno, vector_regno; + void *addr; + int fno, vno; switch (regno) { case USP: @@ -108,9 +106,8 @@ get_reg_addr(struct task_struct *task, unsigned long regno) addr = (void *)task_pt_regs(task) + regoffsets[regno]; break; case FPREG_BASE ... FPREG_END: - fp_regno = regno - FPREG_BASE; - vector_regno = 0; - addr = (void *)((unsigned long)&task->thread.ctx_fp + FP_REG(fp_regno, vector_regno)); + fno = regno - FPREG_BASE; + addr = &task->thread.fpstate.fp[fno].v[0]; break; case VECREG_BASE ... VECREG_END: /* @@ -121,12 +118,12 @@ get_reg_addr(struct task_struct *task, unsigned long regno) addr = &zero; break; } - fp_regno = (regno - VECREG_BASE) & 0x1f; - vector_regno = 1 + ((regno - VECREG_BASE) >> 5); - addr = (void *)((unsigned long)&task->thread.ctx_fp + FP_REG(fp_regno, vector_regno)); + fno = (regno - VECREG_BASE) & 0x1f; + vno = 1 + ((regno - VECREG_BASE) >> 5); + addr = &task->thread.fpstate.fp[fno].v[vno]; break; case FPCR: - addr = (void *)&task->thread.fpcr; + addr = &task->thread.fpstate.fpcr; break; case PC: addr = (void *)task_pt_regs(task) + PT_REGS_PC; @@ -322,17 +319,9 @@ static int fpr_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { - int ret; - struct user_fpsimd_state uregs; - - memcpy(uregs.vregs, &target->thread.ctx_fp, - sizeof(struct context_fpregs)); - - uregs.fpcr = target->thread.fpcr; - - ret = membuf_write(&to, &uregs, sizeof(uregs)); - return ret; + return membuf_write(&to, &target->thread.fpstate, + sizeof(struct user_fpsimd_state)); } static int fpr_set(struct task_struct *target, @@ -340,21 +329,9 @@ static int fpr_set(struct task_struct *target, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - int ret; - struct user_fpsimd_state uregs; - - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &uregs, 0, sizeof(uregs)); - - if (ret) - return ret; - - memcpy(&target->thread.ctx_fp, uregs.vregs, - sizeof(struct context_fpregs)); - - target->thread.fpcr = uregs.fpcr; - - return ret; + return user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.fpstate, 0, + sizeof(struct user_fpsimd_state)); } enum sw64_regset { diff --git a/arch/sw_64/kernel/signal.c b/arch/sw_64/kernel/signal.c index 887326812624..6a6203ccb04f 100644 --- a/arch/sw_64/kernel/signal.c +++ b/arch/sw_64/kernel/signal.c @@ -100,9 +100,10 @@ restore_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs) err |= __get_user(usp, sc->sc_regs+30); wrusp(usp); /* simd-fp */ - err |= __copy_from_user(¤t->thread.ctx_fp, - &sc->sc_fpregs, sizeof(struct context_fpregs)); - err |= __get_user(current->thread.fpcr, &sc->sc_fpcr); + err |= __copy_from_user(¤t->thread.fpstate, &sc->sc_fpregs, + offsetof(struct user_fpsimd_state, fpcr)); + err |= __get_user(current->thread.fpstate.fpcr, &sc->sc_fpcr); + if (likely(!err)) __fpstate_restore(current); @@ -230,9 +231,9 @@ setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, err |= __put_user(0, sc->sc_regs+31); /* simd-fp */ __fpstate_save(current); - err |= __copy_to_user(&sc->sc_fpregs, - ¤t->thread.ctx_fp, sizeof(struct context_fpregs)); - err |= __put_user(current->thread.fpcr, &sc->sc_fpcr); + err |= __copy_to_user(&sc->sc_fpregs, ¤t->thread.fpstate, + offsetof(struct user_fpsimd_state, fpcr)); + err |= __put_user(current->thread.fpstate.fpcr, &sc->sc_fpcr); err |= __put_user(regs->trap_a0, &sc->sc_traparg_a0); err |= __put_user(regs->trap_a1, &sc->sc_traparg_a1); diff --git a/arch/sw_64/kvm/entry.S b/arch/sw_64/kvm/entry.S index 1e089da5378e..0c02b68ee7d0 100644 --- a/arch/sw_64/kvm/entry.S +++ b/arch/sw_64/kvm/entry.S @@ -15,18 +15,16 @@ ENTRY(__sw64_vcpu_run) /* save host fpregs */ ldl $1, TI_TASK($8) - ldi $1, TASK_THREAD($1) rfpcr $f0 - fstd $f0, THREAD_FPCR($1) - ldi $1, THREAD_CTX_FP($1) - vstd $f2, CTX_FP_F2($1) - vstd $f3, CTX_FP_F3($1) - vstd $f4, CTX_FP_F4($1) - vstd $f5, CTX_FP_F5($1) - vstd $f6, CTX_FP_F6($1) - vstd $f7, CTX_FP_F7($1) - vstd $f8, CTX_FP_F8($1) - vstd $f9, CTX_FP_F9($1) + fstd $f0, TASK_THREAD_FPCR($1) + vstd $f2, TASK_THREAD_F2($1) + vstd $f3, TASK_THREAD_F3($1) + vstd $f4, TASK_THREAD_F4($1) + vstd $f5, TASK_THREAD_F5($1) + vstd $f6, TASK_THREAD_F6($1) + vstd $f7, TASK_THREAD_F7($1) + vstd $f8, TASK_THREAD_F8($1) + vstd $f9, TASK_THREAD_F9($1) ldi sp, -VCPU_RET_SIZE(sp) /* r16 = guest kvm_vcpu_arch.vcb struct pointer */ @@ -213,8 +211,7 @@ $g_setfpec_over: bic sp, $8, $8 /* restore host fpregs */ ldl $1, TI_TASK($8) - ldi $1, TASK_THREAD($1) - fldd $f0, THREAD_FPCR($1) + fldd $f0, TASK_THREAD_FPCR($1) wfpcr $f0 fimovd $f0, $2 and $2, 0x3, $2 @@ -234,15 +231,14 @@ $setfpec_1: $setfpec_2: setfpec2 $setfpec_over: - ldi $1, THREAD_CTX_FP($1) - vldd $f2, CTX_FP_F2($1) - vldd $f3, CTX_FP_F3($1) - vldd $f4, CTX_FP_F4($1) - vldd $f5, CTX_FP_F5($1) - vldd $f6, CTX_FP_F6($1) - vldd $f7, CTX_FP_F7($1) - vldd $f8, CTX_FP_F8($1) - vldd $f9, CTX_FP_F9($1) + vldd $f2, TASK_THREAD_F2($1) + vldd $f3, TASK_THREAD_F3($1) + vldd $f4, TASK_THREAD_F4($1) + vldd $f5, TASK_THREAD_F5($1) + vldd $f6, TASK_THREAD_F6($1) + vldd $f7, TASK_THREAD_F7($1) + vldd $f8, TASK_THREAD_F8($1) + vldd $f9, TASK_THREAD_F9($1) /* if $0 > 0, handle hcall */ bgt $0, $ret_to -- Gitee From 6a2490239e1f86c8009ab192e2361aa4f911da2e Mon Sep 17 00:00:00 2001 From: He Sheng Date: Fri, 27 May 2022 10:01:54 +0800 Subject: [PATCH 120/674] sw64: fix ptrace.h with types.h and NOT __ASSEMBLY__ Sunway inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I56OLG -------------------------------- It may report compile errors on some user apps if linux/types.h is nowhere to be included. And a __ASSEMBLY__ guard makes it work if included by assembly codes. Signed-off-by: He Sheng Signed-off-by: Gu Zitao --- arch/sw_64/include/uapi/asm/ptrace.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/sw_64/include/uapi/asm/ptrace.h b/arch/sw_64/include/uapi/asm/ptrace.h index a45b10b8914a..80bad067fc15 100644 --- a/arch/sw_64/include/uapi/asm/ptrace.h +++ b/arch/sw_64/include/uapi/asm/ptrace.h @@ -2,6 +2,9 @@ #ifndef _UAPI_ASM_SW64_PTRACE_H #define _UAPI_ASM_SW64_PTRACE_H +#include + +#ifndef __ASSEMBLY__ /* * User structures for general purpose, floating point and debug registers. */ @@ -21,6 +24,7 @@ struct user_fpsimd_state { __u64 fpcr; __u64 __reserved[3]; }; +#endif /* PTRACE_ATTACH is 16 */ /* PTRACE_DETACH is 17 */ -- Gitee From 9941c328d6527b408479f70e782a40c706c277d1 Mon Sep 17 00:00:00 2001 From: He Sheng Date: Fri, 27 May 2022 10:09:44 +0800 Subject: [PATCH 121/674] sw64: rewrite elf core copy interfaces Sunway inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I56OLG -------------------------------- For gpreg, simplify dump method to make it easier to maintain. For fpreg, redefine elf_fpregset_t with struct user_fpsimd_state and add a arch-specific dump_fpu() method. Signed-off-by: He Sheng Signed-off-by: Gu Zitao --- arch/sw_64/include/asm/elf.h | 34 +++++-------------- arch/sw_64/kernel/process.c | 66 ++++++++---------------------------- 2 files changed, 23 insertions(+), 77 deletions(-) diff --git a/arch/sw_64/include/asm/elf.h b/arch/sw_64/include/asm/elf.h index abecc5653eba..8c858cff5573 100644 --- a/arch/sw_64/include/asm/elf.h +++ b/arch/sw_64/include/asm/elf.h @@ -55,23 +55,18 @@ #define EF_SW64_32BIT 1 /* All addresses are below 2GB */ /* - * ELF register definitions.. - */ - -/* - * The legacy version of makes gregset_t 46 entries long. - * I have no idea why that is so. For now, we just leave it at 33 - * (32 general regs + processor status word). + * ELF register definitions. + * + * For now, we just leave it at 33 (32 general regs + processor status word). */ #define ELF_NGREG 33 -#define ELF_NFPREG 32 - typedef unsigned long elf_greg_t; typedef elf_greg_t elf_gregset_t[ELF_NGREG]; -typedef double elf_fpreg_t; -typedef elf_fpreg_t elf_fpregset_t[ELF_NFPREG]; +/* Same with user_fpsimd_state */ +#include +typedef struct user_fpsimd_state elf_fpregset_t; /* * This is used to ensure we don't load something for the wrong architecture. @@ -121,22 +116,9 @@ extern int arch_setup_additional_pages(struct linux_binprm *bprm, #ifdef __KERNEL__ struct pt_regs; -struct thread_info; struct task_struct; -extern void dump_elf_thread(elf_greg_t *dest, struct pt_regs *pt, - struct thread_info *ti); -#define ELF_CORE_COPY_REGS(DEST, REGS) \ - dump_elf_thread(DEST, REGS, current_thread_info()); - -/* Similar, but for a thread other than current. */ - -extern int dump_elf_task(elf_greg_t *dest, struct task_struct *task); -#define ELF_CORE_COPY_TASK_REGS(TASK, DEST) dump_elf_task(*(DEST), TASK) - -/* Similar, but for the FP registers. */ - -extern int dump_elf_task_fp(elf_fpreg_t *dest, struct task_struct *task); -#define ELF_CORE_COPY_FPREGS(TASK, DEST) dump_elf_task_fp(*(DEST), TASK) +extern void sw64_elf_core_copy_regs(elf_greg_t *dest, struct pt_regs *pt); +#define ELF_CORE_COPY_REGS(DEST, REGS) sw64_elf_core_copy_regs(DEST, REGS); /* * This yields a mask that user programs can use to figure out what diff --git a/arch/sw_64/kernel/process.c b/arch/sw_64/kernel/process.c index 308675d29c04..7a7578d530c6 100644 --- a/arch/sw_64/kernel/process.c +++ b/arch/sw_64/kernel/process.c @@ -200,66 +200,30 @@ copy_thread(unsigned long clone_flags, unsigned long usp, /* * Fill in the user structure for a ELF core dump. + * @regs: should be signal_pt_regs() or task_pt_reg(task) */ -void -dump_elf_thread(elf_greg_t *dest, struct pt_regs *pt, struct thread_info *ti) +void sw64_elf_core_copy_regs(elf_greg_t *dest, struct pt_regs *regs) { - dest[0] = pt->r0; - dest[1] = pt->r1; - dest[2] = pt->r2; - dest[3] = pt->r3; - dest[4] = pt->r4; - dest[5] = pt->r5; - dest[6] = pt->r6; - dest[7] = pt->r7; - dest[8] = pt->r8; - dest[9] = pt->r9; - dest[10] = pt->r10; - dest[11] = pt->r11; - dest[12] = pt->r12; - dest[13] = pt->r13; - dest[14] = pt->r14; - dest[15] = pt->r15; - dest[16] = pt->r16; - dest[17] = pt->r17; - dest[18] = pt->r18; - dest[19] = pt->r19; - dest[20] = pt->r20; - dest[21] = pt->r21; - dest[22] = pt->r22; - dest[23] = pt->r23; - dest[24] = pt->r24; - dest[25] = pt->r25; - dest[26] = pt->r26; - dest[27] = pt->r27; - dest[28] = pt->r28; - dest[29] = pt->gp; - dest[30] = ti == current_thread_info() ? rdusp() : ti->pcb.usp; - dest[31] = pt->pc; + int i; + struct thread_info *ti; - /* Once upon a time this was the PS value. Which is stupid - * since that is always 8 for usermode. Usurped for the more - * useful value of the thread's UNIQUE field. - */ - dest[32] = ti->pcb.unique; -} -EXPORT_SYMBOL(dump_elf_thread); + ti = (void *)((__u64)regs & ~(THREAD_SIZE - 1)); -int -dump_elf_task(elf_greg_t *dest, struct task_struct *task) -{ - dump_elf_thread(dest, task_pt_regs(task), task_thread_info(task)); - return 1; + for (i = 0; i < 30; i++) + dest[i] = *(__u64 *)((void *)regs + regoffsets[i]); + dest[30] = ti == current_thread_info() ? rdusp() : ti->pcb.usp; + dest[31] = regs->pc; + dest[32] = ti->pcb.unique; } -EXPORT_SYMBOL(dump_elf_task); +EXPORT_SYMBOL(sw64_elf_core_copy_regs); -int -dump_elf_task_fp(elf_fpreg_t *dest, struct task_struct *task) +/* Fill in the fpu structure for a core dump. */ +int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fpu) { - memcpy(dest, &task->thread.fpstate, 32 * 8); + memcpy(fpu, ¤t->thread.fpstate, sizeof(*fpu)); return 1; } -EXPORT_SYMBOL(dump_elf_task_fp); +EXPORT_SYMBOL(dump_fpu); /* * Under heavy swap load I've seen this lose in an ugly way. So do -- Gitee From 4c651be01d70a2c0e3c6ac2c2821eac775f85aaa Mon Sep 17 00:00:00 2001 From: Wu Liliu Date: Tue, 31 May 2022 09:48:40 +0800 Subject: [PATCH 122/674] sw64: perf: add fp based stack trace support Sunway inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I56X48 -------------------------------- It used to obtain return address of function by scanning entire stack which lead to inaccurate result. This patch adds fp based stack trace support to improve efficiency and accuracy. Because toolchain is not ready yet, we take CONFIG_FRAME_POINTER to activate this support, and it should be disabled by default. Signed-off-by: Wu Liliu Signed-off-by: Gu Zitao --- arch/sw_64/include/asm/stacktrace.h | 72 +++++++++++++++++++++++++ arch/sw_64/kernel/perf_event.c | 66 +++++++++++++++++++---- arch/sw_64/kernel/stacktrace.c | 83 +++++++++++++++++++++++++++++ 3 files changed, 210 insertions(+), 11 deletions(-) create mode 100644 arch/sw_64/include/asm/stacktrace.h diff --git a/arch/sw_64/include/asm/stacktrace.h b/arch/sw_64/include/asm/stacktrace.h new file mode 100644 index 000000000000..813aa5e7a91d --- /dev/null +++ b/arch/sw_64/include/asm/stacktrace.h @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _ASM_SW64_STACKTRACE_H +#define _ASM_SW64_STACKTRACE_H + +#include +#include +#include +#include +#include + +struct stackframe { + unsigned long pc; + unsigned long fp; +}; + +enum stack_type { + STACK_TYPE_UNKNOWN, + STACK_TYPE_TASK, +}; + +struct stack_info { + unsigned long low; + unsigned long high; + enum stack_type type; +}; + +/* The form of the top of the frame on the stack */ +struct stack_frame { + unsigned long return_address; + struct stack_frame *next_frame; +}; + +extern int unwind_frame(struct task_struct *tsk, struct stackframe *frame); +extern void walk_stackframe(struct task_struct *tsk, struct stackframe *frame, + int (*fn)(struct stackframe *, void *), void *data); + +static inline bool on_task_stack(struct task_struct *tsk, unsigned long sp, + struct stack_info *info) +{ + unsigned long low = (unsigned long)task_stack_page(tsk); + unsigned long high = low + THREAD_SIZE; + + if (sp < low || sp >= high) + return false; + + if (info) { + info->low = low; + info->high = high; + info->type = STACK_TYPE_TASK; + } + + return true; +} + +/* + * We can only safely access per-cpu stacks from current in a non-preemptible + * context. + */ +static inline bool on_accessible_stack(struct task_struct *tsk, + unsigned long sp, + struct stack_info *info) +{ + if (on_task_stack(tsk, sp, info)) + return true; + if (tsk != current || preemptible()) + return false; + + return false; +} + +#endif /* _ASM_SW64_STACKTRACE_H */ diff --git a/arch/sw_64/kernel/perf_event.c b/arch/sw_64/kernel/perf_event.c index 0d49224ba058..70f1f2781016 100644 --- a/arch/sw_64/kernel/perf_event.c +++ b/arch/sw_64/kernel/perf_event.c @@ -6,6 +6,7 @@ */ #include +#include /* For tracking PMCs and the hw events they monitor on each CPU. */ struct cpu_hw_events { @@ -247,10 +248,9 @@ static const struct sw64_perf_event *core3_map_cache_event(u64 config) */ static bool core3_raw_event_valid(u64 config) { - if ((config >= (PC0_RAW_BASE + PC0_MIN) && config <= (PC0_RAW_BASE + PC0_MAX)) || - (config >= (PC1_RAW_BASE + PC1_MIN) && config <= (PC1_RAW_BASE + PC1_MAX))) { + if ((config >= PC0_RAW_BASE && config <= (PC0_RAW_BASE + PC0_MAX)) || + (config >= PC1_RAW_BASE && config <= (PC1_RAW_BASE + PC1_MAX))) return true; - } pr_info("sw64 pmu: invalid raw event config %#llx\n", config); return false; @@ -697,6 +697,36 @@ bool valid_dy_addr(unsigned long addr) return ret; } +#ifdef CONFIG_FRAME_POINTER +void perf_callchain_user(struct perf_callchain_entry_ctx *entry, + struct pt_regs *regs) +{ + + struct stack_frame frame; + unsigned long __user *fp; + int err; + + perf_callchain_store(entry, regs->pc); + + fp = (unsigned long __user *)regs->r15; + + while (entry->nr < entry->max_stack && (unsigned long)fp < current->mm->start_stack) { + if (!access_ok(fp, sizeof(frame))) + break; + + pagefault_disable(); + err = __copy_from_user_inatomic(&frame, fp, sizeof(frame)); + pagefault_enable(); + + if (err) + break; + + if (valid_utext_addr(frame.return_address) || valid_dy_addr(frame.return_address)) + perf_callchain_store(entry, frame.return_address); + fp = (void __user *)frame.next_frame; + } +} +#else /* !CONFIG_FRAME_POINTER */ void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { @@ -709,30 +739,44 @@ void perf_callchain_user(struct perf_callchain_entry_ctx *entry, while (entry->nr < entry->max_stack && usp < current->mm->start_stack) { if (!access_ok(usp, 8)) break; + pagefault_disable(); err = __get_user(user_addr, (unsigned long *)usp); pagefault_enable(); + if (err) break; + if (valid_utext_addr(user_addr) || valid_dy_addr(user_addr)) perf_callchain_store(entry, user_addr); usp = usp + 8; } } +#endif/* CONFIG_FRAME_POINTER */ + +/* + * Gets called by walk_stackframe() for every stackframe. This will be called + * whist unwinding the stackframe and is like a subroutine return so we use + * the PC. + */ +static int callchain_trace(struct stackframe *frame, void *data) +{ + struct perf_callchain_entry_ctx *entry = data; + + perf_callchain_store(entry, frame->pc); + + return 0; +} void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { - unsigned long *sp = (unsigned long *)current_thread_info()->pcb.ksp; - unsigned long addr; + struct stackframe frame; - perf_callchain_store(entry, regs->pc); + frame.fp = regs->r15; + frame.pc = regs->pc; - while (!kstack_end(sp) && entry->nr < entry->max_stack) { - addr = *sp++; - if (__kernel_text_address(addr)) - perf_callchain_store(entry, addr); - } + walk_stackframe(current, &frame, callchain_trace, entry); } /* diff --git a/arch/sw_64/kernel/stacktrace.c b/arch/sw_64/kernel/stacktrace.c index 41cdff5b4941..2671331717ba 100644 --- a/arch/sw_64/kernel/stacktrace.c +++ b/arch/sw_64/kernel/stacktrace.c @@ -8,7 +8,90 @@ #include #include #include +#include +#include +#include +/* + * sw_64 PCS assigns the frame pointer to r15. + * + * A simple function prologue looks like this: + * ldi sp,-xx(sp) + * stl ra,0(sp) + * stl fp,8(sp) + * mov sp,fp + * + * A simple function epilogue looks like this: + * mov fp,sp + * ldl ra,0(sp) + * ldl fp,8(sp) + * ldi sp,+xx(sp) + */ + +#ifdef CONFIG_FRAME_POINTER + +int unwind_frame(struct task_struct *tsk, struct stackframe *frame) +{ + unsigned long fp = frame->fp; + + if (fp & 0x7) + return -EINVAL; + + if (!tsk) + tsk = current; + + if (!on_accessible_stack(tsk, fp, NULL)) + return -EINVAL; + + frame->pc = READ_ONCE_NOCHECK(*(unsigned long *)(fp)); + frame->fp = READ_ONCE_NOCHECK(*(unsigned long *)(fp + 8)); + + /* + * Frames created upon entry from user have NULL FP and PC values, so + * don't bother reporting these. Frames created by __noreturn functions + * might have a valid FP even if PC is bogus, so only terminate where + * both are NULL. + */ + if (!frame->fp && !frame->pc) + return -EINVAL; + + return 0; +} +EXPORT_SYMBOL_GPL(unwind_frame); + +void walk_stackframe(struct task_struct *tsk, struct stackframe *frame, + int (*fn)(struct stackframe *, void *), void *data) +{ + while (1) { + int ret; + + if (fn(frame, data)) + break; + ret = unwind_frame(tsk, frame); + if (ret < 0) + break; + } +} +EXPORT_SYMBOL_GPL(walk_stackframe); + +#else /* !CONFIG_FRAME_POINTER */ +void walk_stackframe(struct task_struct *tsk, struct stackframe *frame, + int (*fn)(struct stackframe *, void *), void *data) +{ + unsigned long *sp = (unsigned long *)current_thread_info()->pcb.ksp; + unsigned long addr; + struct perf_callchain_entry_ctx *entry = data; + + perf_callchain_store(entry, frame->pc); + while (!kstack_end(sp) && entry->nr < entry->max_stack) { + addr = *sp++; + if (__kernel_text_address(addr)) + perf_callchain_store(entry, addr); + } +} +EXPORT_SYMBOL_GPL(walk_stackframe); + +#endif/* CONFIG_FRAME_POINTER */ /* * Save stack-backtrace addresses into a stack_trace buffer. -- Gitee From 42518209c039862f32068aeee497621aaac776ed Mon Sep 17 00:00:00 2001 From: Xu Chenjiao Date: Tue, 31 May 2022 14:11:10 +0800 Subject: [PATCH 123/674] sw64: fix compile errors for NOT chip3 Sunway inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I56QAM -------------------------------- IO register DEVINT_WKEN, DEVINTWK_INTEN, PME_ENABLE_INTD_CORE0 and AER_ENABLE_INTD_CORE0 are only defined in chip3. This will lead to compile errors if other chip is selected. To avoid these errors, we define default set_devint_wken() and set_pcieport_service_irq() as weak symbols. Signed-off-by: Xu Chenjiao Signed-off-by: Gu Zitao --- arch/sw_64/chip/chip3/chip.c | 16 ++++++++++++++++ arch/sw_64/kernel/pci.c | 15 ++++----------- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/arch/sw_64/chip/chip3/chip.c b/arch/sw_64/chip/chip3/chip.c index acd25fe99669..b73afc9c62ac 100644 --- a/arch/sw_64/chip/chip3/chip.c +++ b/arch/sw_64/chip/chip3/chip.c @@ -90,6 +90,22 @@ void setup_chip_clocksource(void) #endif } +void set_devint_wken(int node) +{ + unsigned long val; + + /* enable INTD wakeup */ + val = 0x80; + sw64_io_write(node, DEVINT_WKEN, val); + sw64_io_write(node, DEVINTWK_INTEN, val); +} + +void set_pcieport_service_irq(int node, int index) +{ + write_piu_ior0(node, index, PMEINTCONFIG, PME_ENABLE_INTD_CORE0); + write_piu_ior0(node, index, AERERRINTCONFIG, AER_ENABLE_INTD_CORE0); +} + static int chip3_get_cpu_nums(void) { unsigned long trkmode; diff --git a/arch/sw_64/kernel/pci.c b/arch/sw_64/kernel/pci.c index 401ee0b781be..a004ea9fde7f 100644 --- a/arch/sw_64/kernel/pci.c +++ b/arch/sw_64/kernel/pci.c @@ -600,15 +600,7 @@ sw64_init_host(unsigned long node, unsigned long index) } } -static void set_devint_wken(int node) -{ - unsigned long val; - - /* enable INTD wakeup */ - val = 0x80; - sw64_io_write(node, DEVINT_WKEN, val); - sw64_io_write(node, DEVINTWK_INTEN, val); -} +void __weak set_devint_wken(int node) {} void __init sw64_init_arch(void) { @@ -651,6 +643,8 @@ void __init sw64_init_arch(void) } } +void __weak set_pcieport_service_irq(int node, int index) {} + static void __init sw64_init_intx(struct pci_controller *hose) { unsigned long int_conf, node, val_node; @@ -679,8 +673,7 @@ static void __init sw64_init_intx(struct pci_controller *hose) if (sw64_chip_init->pci_init.set_intx) sw64_chip_init->pci_init.set_intx(node, index, int_conf); - write_piu_ior0(node, index, PMEINTCONFIG, PME_ENABLE_INTD_CORE0); - write_piu_ior0(node, index, AERERRINTCONFIG, AER_ENABLE_INTD_CORE0); + set_pcieport_service_irq(node, index); } void __init sw64_init_irq(void) -- Gitee From afe00ca0c338c3a0828cc835f249e4b11906b632 Mon Sep 17 00:00:00 2001 From: Xiong Aifei Date: Mon, 6 Jun 2022 15:24:44 +0800 Subject: [PATCH 124/674] sw64: gpu: correct low-level mmio memset/memcpy direct calls Sunway inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GDKC -------------------------------- Driver codes of the direct calls, via the SIMD-optimized memset and memcpy functions, may raise DFAULT when using RX580 or R7 Series graphics card on sw64, so work around 'memset' references to '_memset_c_io' and 'memcpy' to 'memcpy_fromio'. Signed-off-by: Xiong Aifei Signed-off-by: Gu Zitao --- drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 12 ++++++++++++ drivers/gpu/drm/radeon/vce_v1_0.c | 5 +++++ 2 files changed, 17 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c index c36258d56b44..851d64e83166 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c @@ -1354,7 +1354,11 @@ static int gfx_v8_0_mec_init(struct amdgpu_device *adev) return r; } +#if IS_ENABLED(CONFIG_SW64) + _memset_c_io(hpd, 0, mec_hpd_size); +#else memset(hpd, 0, mec_hpd_size); +#endif amdgpu_bo_kunmap(adev->gfx.mec.hpd_eop_obj); amdgpu_bo_unreserve(adev->gfx.mec.hpd_eop_obj); @@ -4649,7 +4653,11 @@ static int gfx_v8_0_kiq_init_queue(struct amdgpu_ring *ring) vi_srbm_select(adev, 0, 0, 0, 0); mutex_unlock(&adev->srbm_mutex); } else { +#if IS_ENABLED(CONFIG_SW64) + _memset_c_io((void *)mqd, 0, sizeof(struct vi_mqd_allocation)); +#else memset((void *)mqd, 0, sizeof(struct vi_mqd_allocation)); +#endif ((struct vi_mqd_allocation *)mqd)->dynamic_cu_mask = 0xFFFFFFFF; ((struct vi_mqd_allocation *)mqd)->dynamic_rb_mask = 0xFFFFFFFF; mutex_lock(&adev->srbm_mutex); @@ -4660,7 +4668,11 @@ static int gfx_v8_0_kiq_init_queue(struct amdgpu_ring *ring) mutex_unlock(&adev->srbm_mutex); if (adev->gfx.mec.mqd_backup[mqd_idx]) +#if IS_ENABLED(CONFIG_SW64) + memcpy_fromio(adev->gfx.mec.mqd_backup[mqd_idx], mqd, sizeof(struct vi_mqd_allocation)); +#else memcpy(adev->gfx.mec.mqd_backup[mqd_idx], mqd, sizeof(struct vi_mqd_allocation)); +#endif } return 0; diff --git a/drivers/gpu/drm/radeon/vce_v1_0.c b/drivers/gpu/drm/radeon/vce_v1_0.c index bd75bbcf5bf6..fbd8a5d9a691 100644 --- a/drivers/gpu/drm/radeon/vce_v1_0.c +++ b/drivers/gpu/drm/radeon/vce_v1_0.c @@ -193,8 +193,13 @@ int vce_v1_0_load_fw(struct radeon_device *rdev, uint32_t *data) data[3] = sign->val[i].nonce[3]; data[4] = cpu_to_le32(le32_to_cpu(sign->len) + 64); +#if IS_ENABLED(CONFIG_SW64) + memset_io(&data[5], 0, 44); + memcpy_toio(&data[16], &sign[1], rdev->vce_fw->size - sizeof(*sign)); +#else memset(&data[5], 0, 44); memcpy(&data[16], &sign[1], rdev->vce_fw->size - sizeof(*sign)); +#endif data += (le32_to_cpu(sign->len) + 64) / 4; data[0] = sign->val[i].sigval[0]; -- Gitee From 852e6489f5d399a75ad6e755e60e9312a2e4300e Mon Sep 17 00:00:00 2001 From: Xu Chenjiao Date: Mon, 6 Jun 2022 16:50:04 +0800 Subject: [PATCH 125/674] sw64: pcie: fix lack of PME and AER interrupt service routines Sunway inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I56XUF -------------------------------- On SW64 platform, we used to enable PME and AER interrupt by default, however, there is a risk that no body cares the interrupt routine when CONFIG_PCIE_PME=n or CONFIG_PCIEAER=n. So let's fix this problem. Signed-off-by: Xu Chenjiao Signed-off-by: Gu Zitao --- arch/sw_64/chip/chip3/chip.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/arch/sw_64/chip/chip3/chip.c b/arch/sw_64/chip/chip3/chip.c index b73afc9c62ac..2103d93a53a2 100644 --- a/arch/sw_64/chip/chip3/chip.c +++ b/arch/sw_64/chip/chip3/chip.c @@ -102,8 +102,11 @@ void set_devint_wken(int node) void set_pcieport_service_irq(int node, int index) { - write_piu_ior0(node, index, PMEINTCONFIG, PME_ENABLE_INTD_CORE0); - write_piu_ior0(node, index, AERERRINTCONFIG, AER_ENABLE_INTD_CORE0); + if (IS_ENABLED(CONFIG_PCIE_PME)) + write_piu_ior0(node, index, PMEINTCONFIG, PME_ENABLE_INTD_CORE0); + + if (IS_ENABLED(CONFIG_PCIEAER)) + write_piu_ior0(node, index, AERERRINTCONFIG, AER_ENABLE_INTD_CORE0); } static int chip3_get_cpu_nums(void) @@ -438,7 +441,7 @@ extern struct pci_controller *hose_head, **hose_tail; static void sw6_handle_intx(unsigned int offset) { struct pci_controller *hose; - unsigned long value, pme_value, aer_value; + unsigned long value; hose = hose_head; for (hose = hose_head; hose; hose = hose->next) { @@ -451,15 +454,20 @@ static void sw6_handle_intx(unsigned int offset) write_piu_ior0(hose->node, hose->index, INTACONFIG + (offset << 7), value); } - pme_value = read_piu_ior0(hose->node, hose->index, PMEINTCONFIG); - aer_value = read_piu_ior0(hose->node, hose->index, AERERRINTCONFIG); - if ((pme_value >> 63) || (aer_value >> 63)) { - handle_irq(hose->service_irq); + if (IS_ENABLED(CONFIG_PCIE_PME)) { + value = read_piu_ior0(hose->node, hose->index, PMEINTCONFIG); + if (value >> 63) { + handle_irq(hose->service_irq); + write_piu_ior0(hose->node, hose->index, PMEINTCONFIG, value); + } + } - if (pme_value >> 63) - write_piu_ior0(hose->node, hose->index, PMEINTCONFIG, pme_value); - if (aer_value >> 63) - write_piu_ior0(hose->node, hose->index, AERERRINTCONFIG, aer_value); + if (IS_ENABLED(CONFIG_PCIEAER)) { + value = read_piu_ior0(hose->node, hose->index, AERERRINTCONFIG); + if (value >> 63) { + handle_irq(hose->service_irq); + write_piu_ior0(hose->node, hose->index, AERERRINTCONFIG, value); + } } if (hose->iommu_enable) { -- Gitee From e41f34b561b8485031080a07c2e19f22dceb06da Mon Sep 17 00:00:00 2001 From: Zheng Chongzhen Date: Wed, 15 Jun 2022 16:54:25 +0800 Subject: [PATCH 126/674] sw64: fix dma features for zx200 Sunway inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I56OSP -------------------------------- The device controllers of ZX200 chipset are all the 32-bit DMA capable device, so we add pci quirk function hook to set dev.bus_dma_limit to DMA_BIT_MASK(32) for all devices on the chipset, besides when CONFIG_SUNWAY_IOMMU=n, all these devices will use swiotlb dma_ops. Signed-off-by: Zheng Chongzhen Signed-off-by: Gu Zitao --- arch/sw_64/kernel/pci.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/sw_64/kernel/pci.c b/arch/sw_64/kernel/pci.c index a004ea9fde7f..7cc8b7d2d43b 100644 --- a/arch/sw_64/kernel/pci.c +++ b/arch/sw_64/kernel/pci.c @@ -691,3 +691,16 @@ sw64_init_pci(void) { common_init_pci(); } + +static int setup_bus_dma_cb(struct pci_dev *pdev, void *data) +{ + pdev->dev.bus_dma_limit = DMA_BIT_MASK(32); + return 0; +} + +static void fix_bus_dma_limit(struct pci_dev *dev) +{ + pci_walk_bus(dev->subordinate, setup_bus_dma_cb, NULL); + pr_info("Set zx200 bus_dma_limit to 32-bit\n"); +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ZHAOXIN, 0x071f, fix_bus_dma_limit); -- Gitee From aadce49276fe3cb5d65f9bd932522f5ac069564b Mon Sep 17 00:00:00 2001 From: Zheng Chongzhen Date: Wed, 15 Jun 2022 17:40:59 +0800 Subject: [PATCH 127/674] sw64: iommu: fix iommu interrupt handler Sunway inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I56OSP -------------------------------- When porting ZX200 chipset driver on SW64 platform, we meet an IOMMU exception show as follows: iommu_interrupt, iommu_status = 0xc8014000dffe0000, devid 0xa00, dva 0xdffe0000, 1Unable to handle kernel paging request at virtual address 0000000000000040 CPU 0 swapper/0(0): Oops 0 pc = [] ra = [] ps = 0001 Not tainted pc is at iommu_interrupt+0x140/0x3e0 ra is at iommu_interrupt+0xe4/0x3e0 v0 = 0000000000000051 t0 = c8014000dffe0000 t1 = 0000000000000000 t2 = 0000000000000000 t3 = 0000000000000000 t4 = 0000000000000001 t5 = 0000000000000001 t6 = 0000000000000000 t7 = ffffffff82948000 s0 = fff00003ffff0400 s1 = 0000000000000001 s2 = 0000000000000a00 s3 = 0000000000000a00 s4 = 00000000dffe0000 s5 = fff0000100680e80 s6 = ffffffff8294ba70 a0 = 0000000000000001 a1 = 0000000000000001 a2 = ffffffff8294b790 a3 = ffffffff8294b7a8 a4 = 0000000000000000 a5 = ffffffff82c5fb7a t8 = 0000000000000001 t9 = fffffffffffcac48 t10 = 0000000000000000 t11= 0000000000000000 pv = ffffffff809f4f10 at = ffffffff82bff6c0 gp = ffffffff82c1f510 sp = (____ptrval____) The root cause is that the device which raises iommu exception is not in the device list, then reference a null sdev will cause a page fualt. To work around this problem, we apply this patch by just clearing IOMMUEXCPT_STATUS and then go on. BTW, why the device raise IOMMU exception is not a valid device ID, it's a puzzling problem. Signed-off-by: Zheng Chongzhen Signed-off-by: Gu Zitao --- drivers/iommu/sw64/sunway_iommu.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/sw64/sunway_iommu.c b/drivers/iommu/sw64/sunway_iommu.c index 5797adf8fbc5..b6c8f1272d28 100644 --- a/drivers/iommu/sw64/sunway_iommu.c +++ b/drivers/iommu/sw64/sunway_iommu.c @@ -648,10 +648,21 @@ irqreturn_t iommu_interrupt(int irq, void *dev) type = (iommu_status >> 59) & 0x7; devid = (iommu_status >> 37) & 0xffff; dva = iommu_status & 0xffffffff; - sdev = search_dev_data(devid); - sdomain = sdev->domain; pr_info("%s, iommu_status = %#lx, devid %#lx, dva %#lx, ", __func__, iommu_status, devid, dva); + + sdev = search_dev_data(devid); + if (sdev == NULL) { + pr_info("no such dev!!!\n"); + + iommu_status &= ~(1UL << 62); + write_piu_ior0(hose->node, hose->index, + IOMMUEXCPT_STATUS, iommu_status); + + return IRQ_HANDLED; + } + + sdomain = sdev->domain; switch (type) { case DTE_LEVEL1: pr_info("invalid level1 dte, addr:%#lx, val:%#lx\n", @@ -674,7 +685,6 @@ irqreturn_t iommu_interrupt(int irq, void *dev) fetch_pte(sdomain, dva, PTE_LEVEL2_VAL)); iommu_status &= ~(1UL << 62); - iommu_status = iommu_status | (1UL << 63); write_piu_ior0(hose->node, hose->index, IOMMUEXCPT_STATUS, iommu_status); break; -- Gitee From c1bf5440faf937aa402e5c08090537948dd99d60 Mon Sep 17 00:00:00 2001 From: Min Fanlei Date: Tue, 7 Jun 2022 15:12:25 +0800 Subject: [PATCH 128/674] sw64: kvm: fix incorrect page_ref_count() call Sunway inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I56WV8 -------------------------------- The last page_ref_count() call will cause kernel panic if kvm memory pool is at the end of DRAM. This patch reorders the checks to avoid illegal atomic_read operation. Signed-off-by: Min Fanlei Signed-off-by: Gu Zitao --- arch/sw_64/kernel/setup.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/sw_64/kernel/setup.c b/arch/sw_64/kernel/setup.c index 4d39db5fb1ef..e20e215dd08a 100644 --- a/arch/sw_64/kernel/setup.c +++ b/arch/sw_64/kernel/setup.c @@ -1026,8 +1026,7 @@ static int __init sw64_kvm_pool_init(void) end_page = pfn_to_page((kvm_mem_base + kvm_mem_size - 1) >> PAGE_SHIFT); p = base_page; - while (page_ref_count(p) == 0 && - (unsigned long)p <= (unsigned long)end_page) { + while (p <= end_page && page_ref_count(p) == 0) { set_page_count(p, 1); page_mapcount_reset(p); SetPageReserved(p); -- Gitee From 3f5352084a8cc53cc8d555dcb20357138a37e276 Mon Sep 17 00:00:00 2001 From: Yang Qiang Date: Wed, 8 Jun 2022 09:35:37 +0800 Subject: [PATCH 129/674] sw64: dtb: check address validity with physical address Sunway inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GE5X -------------------------------- Due to sw64 ABI sepecification, a large kernel image may override the data structures of UEFI BIOS. To solve this problem, we have expanded UEFI BIOS to 1GB to make sure that runtime service code and data structures reside in the high address below 1GB which may be beyond ktext map. So fix it. Signed-off-by: Yang Qiang Signed-off-by: Gu Zitao --- arch/sw_64/kernel/setup.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/sw_64/kernel/setup.c b/arch/sw_64/kernel/setup.c index e20e215dd08a..85d172c32239 100644 --- a/arch/sw_64/kernel/setup.c +++ b/arch/sw_64/kernel/setup.c @@ -560,22 +560,20 @@ static void __init setup_machine_fdt(void) #ifdef CONFIG_USE_OF void *dt_virt; const char *name; - unsigned long phys_addr; /* Give a chance to select kernel builtin DTB firstly */ if (IS_ENABLED(CONFIG_SW64_BUILTIN_DTB)) dt_virt = (void *)__dtb_start; else { dt_virt = (void *)sunway_boot_params->dtb_start; - if (dt_virt < (void *)__bss_stop) { + if (virt_to_phys(dt_virt) < virt_to_phys(__bss_stop)) { pr_emerg("BUG: DTB has been corrupted by kernel image!\n"); while (true) cpu_relax(); } } - phys_addr = __phys_addr((unsigned long)dt_virt); - if (!phys_addr_valid(phys_addr) || + if (!phys_addr_valid(virt_to_phys(dt_virt)) || !early_init_dt_scan(dt_virt)) { pr_crit("\n" "Error: invalid device tree blob at virtual address %px\n" -- Gitee From b9c01ee05c5fb17024d11bc058214f336718af3b Mon Sep 17 00:00:00 2001 From: He Chuyue Date: Wed, 8 Jun 2022 09:38:24 +0800 Subject: [PATCH 130/674] sw64: perf: fix the number of supported raw events Sunway inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I56X48 -------------------------------- According to CORE3B core software interface manual, the performance counter PC_1 on sw64 supports event index up to 0x3d. Now fix it, and remove some unused macros from wrperfmon.h. Signed-off-by: He Chuyue Signed-off-by: Gu Zitao --- arch/sw_64/include/asm/wrperfmon.h | 4 +--- arch/sw_64/kernel/perf_event.c | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/sw_64/include/asm/wrperfmon.h b/arch/sw_64/include/asm/wrperfmon.h index 098702573bfc..15f7f6beb07c 100644 --- a/arch/sw_64/include/asm/wrperfmon.h +++ b/arch/sw_64/include/asm/wrperfmon.h @@ -33,10 +33,8 @@ #define PC0_RAW_BASE 0x0 #define PC1_RAW_BASE 0x100 -#define PC0_MIN 0x0 #define PC0_MAX 0xF -#define PC1_MIN 0x0 -#define PC1_MAX 0x37 +#define PC1_MAX 0x3D #define SW64_PERFCTRL_KM 2 #define SW64_PERFCTRL_UM 3 diff --git a/arch/sw_64/kernel/perf_event.c b/arch/sw_64/kernel/perf_event.c index 70f1f2781016..e9aae53a56f6 100644 --- a/arch/sw_64/kernel/perf_event.c +++ b/arch/sw_64/kernel/perf_event.c @@ -244,7 +244,7 @@ static const struct sw64_perf_event *core3_map_cache_event(u64 config) /* * r0xx for counter0, r1yy for counter1. - * According to the datasheet, 00 <= xx <= 0F, 00 <= yy <= 37 + * According to the datasheet, 00 <= xx <= 0F, 00 <= yy <= 3D */ static bool core3_raw_event_valid(u64 config) { -- Gitee From de13548b9a24e73d988b43fbad57f32cfb4adfd1 Mon Sep 17 00:00:00 2001 From: Zhu Donghong Date: Wed, 8 Jun 2022 09:49:30 +0800 Subject: [PATCH 131/674] drivers/irqchip: add sw64 interrupt controller support Sunway inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5GDHC -------------------------------- Add a driver for the SW64 implementation of the CPU interrupt controller(INTC). In fact, the INTC is just a software simulated module to connect global interrupt sources to the local interrupt controller on each core. In addition, we rename irq-lpc.c and SW64_CHIP3_LPC config. Signed-off-by: Zhu Donghong Signed-off-by: Gu Zitao --- arch/sw_64/Kconfig | 8 +- arch/sw_64/chip/chip3/Makefile | 1 - drivers/irqchip/Kconfig | 33 +++--- drivers/irqchip/Makefile | 4 +- drivers/irqchip/irq-intc-v1.c | 104 ------------------ .../irqchip/irq-sw64-intc-v2.c | 64 ++++++++++- .../{irq-lpc.c => irq-sw64-lpc-intc.c} | 0 7 files changed, 84 insertions(+), 130 deletions(-) delete mode 100644 drivers/irqchip/irq-intc-v1.c rename arch/sw_64/chip/chip3/irq_chip.c => drivers/irqchip/irq-sw64-intc-v2.c (59%) rename drivers/irqchip/{irq-lpc.c => irq-sw64-lpc-intc.c} (100%) diff --git a/arch/sw_64/Kconfig b/arch/sw_64/Kconfig index 0e32fc7e1f9a..ec6e583a5d9a 100644 --- a/arch/sw_64/Kconfig +++ b/arch/sw_64/Kconfig @@ -234,6 +234,7 @@ config PLATFORM_XUELANG depends on SW64_CHIP3 select SPARSE_IRQ select SYS_HAS_EARLY_PRINTK + select SW64_INTC_V2 help Sunway chip3 board chipset @@ -736,15 +737,10 @@ endmenu menu "Boot options" -config SW64_IRQ_CHIP - bool - config USE_OF bool "Flattened Device Tree support" - select GENERIC_IRQ_CHIP - select IRQ_DOMAIN - select SW64_IRQ_CHIP select OF + select IRQ_DOMAIN help Include support for flattened device tree machine descriptions. diff --git a/arch/sw_64/chip/chip3/Makefile b/arch/sw_64/chip/chip3/Makefile index 2b7b5790003f..ba0ab3f67f98 100644 --- a/arch/sw_64/chip/chip3/Makefile +++ b/arch/sw_64/chip/chip3/Makefile @@ -4,5 +4,4 @@ obj-y := chip.o i2c-lib.o obj-$(CONFIG_PCI) += pci-quirks.o obj-$(CONFIG_PCI_MSI) += msi.o vt_msi.o -obj-$(CONFIG_SW64_IRQ_CHIP) += irq_chip.o obj-$(CONFIG_CPUFREQ_DEBUGFS) += cpufreq_debugfs.o diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index 5bf6cf60999b..dd35895c92f3 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -1,21 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only menu "IRQ chip support" -config SW64_INTC - bool "SW64 Platform-Level Interrupt Controller" - depends on ACPI && SW64 - help - This enables support for the INTC chip found in SW systems. - The INTC controls devices interrupts and connects them to each - core's local interrupt controller. - -config SW64_CHIP3_LPC - bool "SW64 Chip3 Buildin LPC Interrupt Controller" - depends on SW64_CHIP3 - help - This enables support for the LPC interrupt controller bultin in - on chip3 series. - config IRQCHIP def_bool y depends on OF_IRQ @@ -26,6 +11,24 @@ config ARM_GIC select GENERIC_IRQ_MULTI_HANDLER select GENERIC_IRQ_EFFECTIVE_AFF_MASK +config SW64_INTC_V2 + bool "SW64 Interrupt Controller V2" + depends on SW64_CHIP3 + default y + select GENERIC_IRQ_CHIP + select IRQ_DOMAIN + help + This enables support for the INTC chip found in SW CHIP3 systems. + The INTC controls devices interrupts and connects them to each + core's local interrupt controller. + +config SW64_LPC_INTC + bool "SW64 cpu builtin LPC Interrupt Controller" + depends on SW64_INTC_V2 + help + Say yes here to add support for the SW64 cpu builtin LPC + IRQ controller. + config ARM_GIC_PM bool depends on PM diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile index 78eb12ab4d4c..14a022c074ce 100644 --- a/drivers/irqchip/Makefile +++ b/drivers/irqchip/Makefile @@ -27,8 +27,8 @@ obj-$(CONFIG_ARCH_SUNXI) += irq-sun4i.o obj-$(CONFIG_ARCH_SUNXI) += irq-sunxi-nmi.o obj-$(CONFIG_ARCH_SPEAR3XX) += spear-shirq.o obj-$(CONFIG_ARM_GIC) += irq-gic.o irq-gic-common.o -obj-$(CONFIG_SW64_INTC) += irq-intc-v1.o -obj-$(CONFIG_SW64_CHIP3_LPC) += irq-lpc.o +obj-$(CONFIG_SW64_INTC_V2) += irq-sw64-intc-v2.o +obj-$(CONFIG_SW64_LPC_INTC) += irq-sw64-lpc-intc.o obj-$(CONFIG_ARM_GIC_PM) += irq-gic-pm.o obj-$(CONFIG_ARCH_REALVIEW) += irq-gic-realview.o obj-$(CONFIG_ARM_GIC_V2M) += irq-gic-v2m.o diff --git a/drivers/irqchip/irq-intc-v1.c b/drivers/irqchip/irq-intc-v1.c deleted file mode 100644 index 4519e96526fb..000000000000 --- a/drivers/irqchip/irq-intc-v1.c +++ /dev/null @@ -1,104 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -static void fake_irq_mask(struct irq_data *data) -{ -} - -static void fake_irq_unmask(struct irq_data *data) -{ -} - -static struct irq_chip onchip_intc = { - .name = "SW fake Intc", - .irq_mask = fake_irq_mask, - .irq_unmask = fake_irq_unmask, -}; - -static int sw_intc_domain_map(struct irq_domain *d, unsigned int irq, - irq_hw_number_t hw) -{ - irq_set_chip_and_handler(irq, &onchip_intc, handle_level_irq); - irq_set_status_flags(irq, IRQ_LEVEL); - return 0; -} - -static const struct irq_domain_ops intc_irq_domain_ops = { - .xlate = irq_domain_xlate_onecell, - .map = sw_intc_domain_map, -}; - -#ifdef CONFIG_ACPI - -static int __init -intc_parse_madt(union acpi_subtable_headers *header, - const unsigned long end) -{ - struct acpi_madt_io_sapic *its_entry; - static struct irq_domain *root_domain; - int intc_irqs = 8, irq_base = NR_IRQS_LEGACY; - irq_hw_number_t hwirq_base = 0; - int irq_start = -1; - - its_entry = (struct acpi_madt_io_sapic *)header; - - intc_irqs -= hwirq_base; /* calculate # of irqs to allocate */ - - irq_base = irq_alloc_descs(irq_start, 16, intc_irqs, - numa_node_id()); - if (irq_base < 0) { - WARN(1, "Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n", - irq_start); - irq_base = irq_start; - } - - root_domain = irq_domain_add_legacy(NULL, intc_irqs, irq_base, - hwirq_base, &intc_irq_domain_ops, NULL); - - if (!root_domain) - pr_err("Failed to create irqdomain"); - - irq_set_default_host(root_domain); - - sw64_io_write(0, MCU_DVC_INT_EN, 0xff); - - return 0; -} - -static int __init acpi_intc_init(void) -{ - int count = 0; - - count = acpi_table_parse_madt(ACPI_MADT_TYPE_IO_SAPIC, - intc_parse_madt, 0); - - if (count <= 0) { - pr_err("No valid intc entries exist\n"); - return -EINVAL; - } - return 0; -} -#else -static int __init acpi_intc_init(void) -{ - return 0; -} -#endif - -static int __init intc_init(void) -{ - acpi_intc_init(); - - return 0; -} -subsys_initcall(intc_init); diff --git a/arch/sw_64/chip/chip3/irq_chip.c b/drivers/irqchip/irq-sw64-intc-v2.c similarity index 59% rename from arch/sw_64/chip/chip3/irq_chip.c rename to drivers/irqchip/irq-sw64-intc-v2.c index 24dfa1e1a898..8640c4aa9506 100644 --- a/arch/sw_64/chip/chip3/irq_chip.c +++ b/drivers/irqchip/irq-sw64-intc-v2.c @@ -1,8 +1,10 @@ // SPDX-License-Identifier: GPL-2.0 #include #include - -#include +#include +#include +#include +#include static void fake_irq_mask(struct irq_data *data) { @@ -32,6 +34,64 @@ static const struct irq_domain_ops sw64_intc_domain_ops = { .map = sw64_intc_domain_map, }; +static int __init +intc_parse_madt(union acpi_subtable_headers *header, + const unsigned long end) +{ + struct acpi_madt_io_sapic *its_entry; + static struct irq_domain *root_domain; + int intc_irqs = 8, irq_base = NR_IRQS_LEGACY; + irq_hw_number_t hwirq_base = 0; + int irq_start = -1; + + its_entry = (struct acpi_madt_io_sapic *)header; + + intc_irqs -= hwirq_base; /* calculate # of irqs to allocate */ + + irq_base = irq_alloc_descs(irq_start, 16, intc_irqs, + numa_node_id()); + if (irq_base < 0) { + WARN(1, "Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n", + irq_start); + irq_base = irq_start; + } + + root_domain = irq_domain_add_legacy(NULL, intc_irqs, irq_base, + hwirq_base, &sw64_intc_domain_ops, NULL); + + if (!root_domain) + pr_err("Failed to create irqdomain"); + + irq_set_default_host(root_domain); + + sw64_io_write(0, MCU_DVC_INT_EN, 0xff); + + return 0; +} + +static int __init acpi_intc_init(void) +{ + int count = 0; + + count = acpi_table_parse_madt(ACPI_MADT_TYPE_IO_SAPIC, + intc_parse_madt, 0); + + if (count <= 0) { + pr_err("No valid intc entries exist\n"); + return -EINVAL; + } + return 0; +} + +static int __init intc_init(void) +{ + acpi_intc_init(); + + return 0; +} + +subsys_initcall(intc_init); + static struct irq_domain *root_domain; static int __init diff --git a/drivers/irqchip/irq-lpc.c b/drivers/irqchip/irq-sw64-lpc-intc.c similarity index 100% rename from drivers/irqchip/irq-lpc.c rename to drivers/irqchip/irq-sw64-lpc-intc.c -- Gitee From 5384a3a3e7337b4953636b75c681ef0b0083ac0c Mon Sep 17 00:00:00 2001 From: Mao Minkai Date: Thu, 23 Jun 2022 14:11:05 +0800 Subject: [PATCH 132/674] sw64: fix __csum_and_copy when dest is not 8-byte aligned Sunway inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I56W9F -------------------------------- When destination is not 8-byte aligned, csum_partial_cfu_dest_unaligned() should be used to avoid kernel unaligned exception. Fixes: 2fcadd2861b4 ("sw64: optimize ip checksum calculation") Signed-off-by: Mao Minkai Signed-off-by: Gu Zitao --- arch/sw_64/lib/csum_partial_copy.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/sw_64/lib/csum_partial_copy.c b/arch/sw_64/lib/csum_partial_copy.c index 441ae5575de5..5e5274e82b2b 100644 --- a/arch/sw_64/lib/csum_partial_copy.c +++ b/arch/sw_64/lib/csum_partial_copy.c @@ -128,9 +128,9 @@ static __wsum __csum_and_copy(const void __user *src, void *dst, int len) (const unsigned long __user *) src, (unsigned long *) dst, len-8); } else { - checksum = csum_partial_cfu_dest_aligned( + checksum = csum_partial_cfu_dest_unaligned( (const unsigned long __user *) src, - (unsigned long *) dst, len-8); + (unsigned long *) dst, doff, len-8); } return (__force __wsum)from64to16(checksum); } -- Gitee From 12edb9430a3d902a0e8edb8c3d1489efda8205ad Mon Sep 17 00:00:00 2001 From: Xiong Aifei Date: Wed, 15 Jun 2022 16:36:00 +0800 Subject: [PATCH 133/674] sw64: gpu: replace '_memset_c_io' by 'memset_io' Sunway inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GDKC -------------------------------- Signed-off-by: Xiong Aifei Signed-off-by: Gu Zitao --- drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 4 ++-- drivers/gpu/drm/radeon/radeon_vce.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c index 851d64e83166..28c4e1fe5cd4 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c @@ -1355,7 +1355,7 @@ static int gfx_v8_0_mec_init(struct amdgpu_device *adev) } #if IS_ENABLED(CONFIG_SW64) - _memset_c_io(hpd, 0, mec_hpd_size); + memset_io(hpd, 0, mec_hpd_size); #else memset(hpd, 0, mec_hpd_size); #endif @@ -4654,7 +4654,7 @@ static int gfx_v8_0_kiq_init_queue(struct amdgpu_ring *ring) mutex_unlock(&adev->srbm_mutex); } else { #if IS_ENABLED(CONFIG_SW64) - _memset_c_io((void *)mqd, 0, sizeof(struct vi_mqd_allocation)); + memset_io((void *)mqd, 0, sizeof(struct vi_mqd_allocation)); #else memset((void *)mqd, 0, sizeof(struct vi_mqd_allocation)); #endif diff --git a/drivers/gpu/drm/radeon/radeon_vce.c b/drivers/gpu/drm/radeon/radeon_vce.c index 68cc5a347d3b..b9680d38d924 100644 --- a/drivers/gpu/drm/radeon/radeon_vce.c +++ b/drivers/gpu/drm/radeon/radeon_vce.c @@ -240,7 +240,7 @@ int radeon_vce_resume(struct radeon_device *rdev) } #ifdef __sw_64__ - _memset_c_io(cpu_addr, 0, radeon_bo_size(rdev->vce.vcpu_bo)); + memset_io(cpu_addr, 0, radeon_bo_size(rdev->vce.vcpu_bo)); #else memset(cpu_addr, 0, radeon_bo_size(rdev->vce.vcpu_bo)); #endif -- Gitee From fd62817c1d7a175e76ea65dbaeaa8bea72bfd012 Mon Sep 17 00:00:00 2001 From: He Chuyue Date: Tue, 14 Jun 2022 16:40:12 +0800 Subject: [PATCH 134/674] sw64: perf: fix raw event count Sunway inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I56X48 -------------------------------- To read raw event count, perf stat calls sw64_pmu_start() to start the event and then calls sw64_perf_event_set_period() to set a new period to sample over. It used to initialize hwc.prev_count and PMC with different values and this will result in error sample values. To fix this problem, initialize them to 0 consistently. Signed-off-by: He Chuyue Signed-off-by: Gu Zitao --- arch/sw_64/kernel/perf_event.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/arch/sw_64/kernel/perf_event.c b/arch/sw_64/kernel/perf_event.c index e9aae53a56f6..52ec34e33269 100644 --- a/arch/sw_64/kernel/perf_event.c +++ b/arch/sw_64/kernel/perf_event.c @@ -297,31 +297,33 @@ static int sw64_perf_event_set_period(struct perf_event *event, { long left = local64_read(&hwc->period_left); long period = hwc->sample_period; - int ret = 0; + int overflow = 0; + unsigned long value; if (unlikely(left <= -period)) { left = period; local64_set(&hwc->period_left, left); hwc->last_period = period; - ret = 1; + overflow = 1; } if (unlikely(left <= 0)) { left += period; local64_set(&hwc->period_left, left); hwc->last_period = period; - ret = 1; + overflow = 1; } if (left > (long)sw64_pmu->pmc_max_period) left = sw64_pmu->pmc_max_period; - local64_set(&hwc->prev_count, (unsigned long)(-left)); - sw64_write_pmc(idx, (unsigned long)(sw64_pmu->pmc_max_period - left)); + value = sw64_pmu->pmc_max_period - left; + local64_set(&hwc->prev_count, value); + sw64_write_pmc(idx, value); perf_event_update_userpage(event); - return ret; + return overflow; } /* -- Gitee From 248fac94e07c792a921d7f76c47f87c22b414ff6 Mon Sep 17 00:00:00 2001 From: Lu Feifei Date: Wed, 15 Jun 2022 14:59:23 +0800 Subject: [PATCH 135/674] sw64: add memhotplug support for guest os Sunway inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I56WV8 -------------------------------- This patch introduces memory-hotplug support for guest os, and it should set CONFIG_KVM_MEMHOTLPUG=y on host to enable this feature. Currently, only 1GB memory-hotplug granularity is supported, and multiple granularity support will be implemented in the future. Signed-off-by: Lu Feifei Signed-off-by: Gu Zitao --- arch/sw_64/chip/chip3/chip.c | 5 + arch/sw_64/include/asm/hcall.h | 1 + arch/sw_64/include/asm/irq_impl.h | 1 + arch/sw_64/include/asm/kvm_asm.h | 3 + arch/sw_64/include/asm/kvm_host.h | 11 +- arch/sw_64/include/asm/memory.h | 1 + arch/sw_64/kvm/Kconfig | 7 + arch/sw_64/kvm/handle_exit.c | 5 + arch/sw_64/kvm/kvm-sw64.c | 113 ++++++++++++- arch/sw_64/mm/init.c | 9 ++ drivers/misc/Kconfig | 8 + drivers/misc/Makefile | 1 + drivers/misc/sunway-ged.c | 253 ++++++++++++++++++++++++++++++ 13 files changed, 410 insertions(+), 8 deletions(-) create mode 100644 drivers/misc/sunway-ged.c diff --git a/arch/sw_64/chip/chip3/chip.c b/arch/sw_64/chip/chip3/chip.c index 2103d93a53a2..02b369b2b37b 100644 --- a/arch/sw_64/chip/chip3/chip.c +++ b/arch/sw_64/chip/chip3/chip.c @@ -701,6 +701,11 @@ void handle_chip_irq(unsigned long type, unsigned long vector, handle_irq(type); set_irq_regs(old_regs); return; + case INT_VT_HOTPLUG: + old_regs = set_irq_regs(regs); + handle_irq(type); + set_irq_regs(old_regs); + return; case INT_PC0: perf_irq(PERFMON_PC0, regs); return; diff --git a/arch/sw_64/include/asm/hcall.h b/arch/sw_64/include/asm/hcall.h index 8117752b657e..b5438b477c87 100644 --- a/arch/sw_64/include/asm/hcall.h +++ b/arch/sw_64/include/asm/hcall.h @@ -18,6 +18,7 @@ enum HCALL_TYPE { HCALL_SWNET = 20, /* guest request swnet service */ HCALL_SWNET_IRQ = 21, /* guest request swnet intr */ HCALL_FATAL_ERROR = 22, /* guest fatal error, issued by hmcode */ + HCALL_MEMHOTPLUG = 23, /* guest memory hotplug event */ NR_HCALL }; diff --git a/arch/sw_64/include/asm/irq_impl.h b/arch/sw_64/include/asm/irq_impl.h index 3679793d8b65..b568efef6994 100644 --- a/arch/sw_64/include/asm/irq_impl.h +++ b/arch/sw_64/include/asm/irq_impl.h @@ -32,6 +32,7 @@ enum sw64_irq_type { INT_RTC = 9, INT_FAULT = 10, INT_VT_SERIAL = 12, + INT_VT_HOTPLUG = 13, INT_DEV = 17, INT_NMI = 18, INT_LEGACY = 31, diff --git a/arch/sw_64/include/asm/kvm_asm.h b/arch/sw_64/include/asm/kvm_asm.h index 4b851682188c..7e2c92ed4574 100644 --- a/arch/sw_64/include/asm/kvm_asm.h +++ b/arch/sw_64/include/asm/kvm_asm.h @@ -11,4 +11,7 @@ #define SW64_KVM_EXIT_RESTART 17 #define SW64_KVM_EXIT_FATAL_ERROR 22 +#ifdef CONFIG_KVM_MEMHOTPLUG +#define SW64_KVM_EXIT_MEMHOTPLUG 23 +#endif #endif /* _ASM_SW64_KVM_ASM_H */ diff --git a/arch/sw_64/include/asm/kvm_host.h b/arch/sw_64/include/asm/kvm_host.h index e4ebb993153c..6d292c086347 100644 --- a/arch/sw_64/include/asm/kvm_host.h +++ b/arch/sw_64/include/asm/kvm_host.h @@ -29,7 +29,7 @@ #include #define KVM_MAX_VCPUS 64 -#define KVM_USER_MEM_SLOTS 512 +#define KVM_USER_MEM_SLOTS 64 #define KVM_HALT_POLL_NS_DEFAULT 0 #define KVM_IRQCHIP_NUM_PINS 256 @@ -42,12 +42,16 @@ #define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE) struct kvm_arch_memory_slot { - + unsigned long host_phys_addr; + bool valid; }; struct kvm_arch { unsigned long host_phys_addr; unsigned long size; + + /* segment table */ + unsigned long *seg_pgd; }; @@ -100,6 +104,9 @@ struct kvm_vcpu_stat { u64 halt_poll_invalid; }; +#ifdef CONFIG_KVM_MEMHOTPLUG +void vcpu_mem_hotplug(struct kvm_vcpu *vcpu, unsigned long start_addr); +#endif int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, int exception_index, struct hcall_args *hargs); void vcpu_send_ipi(struct kvm_vcpu *vcpu, int target_vcpuid); diff --git a/arch/sw_64/include/asm/memory.h b/arch/sw_64/include/asm/memory.h index d3191165c7b5..b2b7492ae477 100644 --- a/arch/sw_64/include/asm/memory.h +++ b/arch/sw_64/include/asm/memory.h @@ -6,6 +6,7 @@ #include #endif +#define MIN_MEMORY_BLOCK_SIZE_VM_MEMHP (1UL << 30) #define NODE0_START (_TEXT_START - __START_KERNEL_map) #define MAX_PHYSMEM_BITS 48 diff --git a/arch/sw_64/kvm/Kconfig b/arch/sw_64/kvm/Kconfig index 230ac526911c..85323b48f564 100644 --- a/arch/sw_64/kvm/Kconfig +++ b/arch/sw_64/kvm/Kconfig @@ -42,6 +42,13 @@ config KVM_SW64_HOST Provides host support for SW64 processors. To compile this as a module, choose M here. +config KVM_MEMHOTPLUG + bool "Memory hotplug support for guest" + depends on KVM + help + Provides memory hotplug support for SW64 guest. + + source "drivers/vhost/Kconfig" endif # VIRTUALIZATION diff --git a/arch/sw_64/kvm/handle_exit.c b/arch/sw_64/kvm/handle_exit.c index 0d6806051fc7..5016bc0eddc2 100644 --- a/arch/sw_64/kvm/handle_exit.c +++ b/arch/sw_64/kvm/handle_exit.c @@ -34,6 +34,11 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, case SW64_KVM_EXIT_IPI: vcpu_send_ipi(vcpu, hargs->arg0); return 1; +#ifdef CONFIG_KVM_MEMHOTPLUG + case SW64_KVM_EXIT_MEMHOTPLUG: + vcpu_mem_hotplug(vcpu, hargs->arg0); + return 1; +#endif case SW64_KVM_EXIT_FATAL_ERROR: printk("Guest fatal error: Reason=[%lx], EXC_PC=[%lx], DVA=[%lx]", hargs->arg0, hargs->arg1, hargs->arg2); vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; diff --git a/arch/sw_64/kvm/kvm-sw64.c b/arch/sw_64/kvm/kvm-sw64.c index 839ee83d57d5..af29d0ca8e7f 100644 --- a/arch/sw_64/kvm/kvm-sw64.c +++ b/arch/sw_64/kvm/kvm-sw64.c @@ -56,10 +56,18 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq extern int __sw64_vcpu_run(struct vcpucb *vcb, struct kvm_regs *regs, struct hcall_args *args); -static unsigned long get_vpcr(unsigned long machine_mem_offset, unsigned long memory_size, unsigned long vpn) +#ifdef CONFIG_KVM_MEMHOTPLUG +static u64 get_vpcr_memhp(u64 seg_base, u64 vpn) { - return (machine_mem_offset >> 23) | ((memory_size >> 23) << 16) | ((vpn & HARDWARE_VPN_MASK) << 44); + return seg_base | ((vpn & HARDWARE_VPN_MASK) << 44); } +#else +static u64 get_vpcr(u64 hpa_base, u64 mem_size, u64 vpn) +{ + return (hpa_base >> 23) | ((mem_size >> 23) << 16) + | ((vpn & HARDWARE_VPN_MASK) << 44); +} +#endif static unsigned long __get_new_vpn_context(struct kvm_vcpu *vcpu, long cpu) { @@ -212,12 +220,38 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { +#ifdef CONFIG_KVM_MEMHOTPLUG + unsigned long *seg_pgd; + + if (kvm->arch.seg_pgd != NULL) { + kvm_err("kvm_arch already initialized?\n"); + return -EINVAL; + } + + seg_pgd = alloc_pages_exact(PAGE_SIZE, GFP_KERNEL | __GFP_ZERO); + if (!seg_pgd) + return -ENOMEM; + + kvm->arch.seg_pgd = seg_pgd; +#endif + return 0; } void kvm_arch_destroy_vm(struct kvm *kvm) { int i; +#ifdef CONFIG_KVM_MEMHOTPLUG + void *seg_pgd = NULL; + + if (kvm->arch.seg_pgd) { + seg_pgd = READ_ONCE(kvm->arch.seg_pgd); + kvm->arch.seg_pgd = NULL; + } + + if (seg_pgd) + free_pages_exact(seg_pgd, PAGE_SIZE); +#endif for (i = 0; i < KVM_MAX_VCPUS; ++i) { if (kvm->vcpus[i]) { @@ -227,7 +261,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm) } atomic_set(&kvm->online_vcpus, 0); - } long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) @@ -241,6 +274,22 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, return 0; } +#ifdef CONFIG_KVM_MEMHOTPLUG +static void setup_segment_table(struct kvm *kvm, + struct kvm_memory_slot *memslot, unsigned long addr, size_t size) +{ + unsigned long *seg_pgd = kvm->arch.seg_pgd; + unsigned int num_of_entry = size >> 30; + unsigned long base_hpa = addr >> 30; + int i; + + for (i = 0; i < num_of_entry; i++) { + *seg_pgd = base_hpa + i; + seg_pgd++; + } +} +#endif + int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, const struct kvm_userspace_memory_region *mem, @@ -253,8 +302,15 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, unsigned long ret; size_t size; - if (change == KVM_MR_FLAGS_ONLY) + if (change == KVM_MR_FLAGS_ONLY || change == KVM_MR_DELETE) + return 0; + +#ifndef CONFIG_KVM_MEMHOTPLUG + if (mem->guest_phys_addr) { + pr_info("%s, No KVM MEMHOTPLUG support!\n", __func__); return 0; + } +#endif if (test_bit(IO_MARK_BIT, &(mem->guest_phys_addr))) return 0; @@ -276,7 +332,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, if (!vm_file) { info = kzalloc(sizeof(struct vmem_info), GFP_KERNEL); - size = round_up(mem->memory_size, 8<<20); + size = round_up(mem->memory_size, 8 << 20); addr = gen_pool_alloc(sw64_kvm_pool, size); if (!addr) return -ENOMEM; @@ -291,6 +347,18 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, if (!vma) return -ENOMEM; +#ifdef CONFIG_KVM_MEMHOTPLUG + if (memslot->base_gfn == 0x0UL) { + setup_segment_table(kvm, memslot, addr, size); + kvm->arch.host_phys_addr = (u64)addr; + memslot->arch.host_phys_addr = addr; + } else { + /* used for memory hotplug */ + memslot->arch.host_phys_addr = addr; + memslot->arch.valid = false; + } +#endif + info->start = addr; info->size = size; vma->vm_private_data = (void *) info; @@ -308,8 +376,11 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, pr_info("guest phys addr = %#lx, size = %#lx\n", addr, vma->vm_end - vma->vm_start); + +#ifndef CONFIG_KVM_MEMHOTPLUG kvm->arch.host_phys_addr = (u64)addr; - kvm->arch.size = round_up(mem->memory_size, 8<<20); + kvm->arch.size = round_up(mem->memory_size, 8 << 20); +#endif memset(__va(addr), 0, 0x2000000); @@ -463,8 +534,14 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) /* Set guest vcb */ /* vpn will update later when vcpu is running */ if (vcpu->arch.vcb.vpcr == 0) { +#ifndef CONFIG_KVM_MEMHOTPLUG vcpu->arch.vcb.vpcr = get_vpcr(vcpu->kvm->arch.host_phys_addr, vcpu->kvm->arch.size, 0); +#else + unsigned long seg_base = virt_to_phys(vcpu->kvm->arch.seg_pgd); + + vcpu->arch.vcb.vpcr = get_vpcr_memhp(seg_base, 0); +#endif vcpu->arch.vcb.upcr = 0x7; } @@ -640,6 +717,30 @@ int kvm_dev_ioctl_check_extension(long ext) return r; } +#ifdef CONFIG_KVM_MEMHOTPLUG +void vcpu_mem_hotplug(struct kvm_vcpu *vcpu, unsigned long start_addr) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_memory_slot *slot; + unsigned long start_pfn = start_addr >> PAGE_SHIFT; + + kvm_for_each_memslot(slot, kvm_memslots(kvm)) { + if (start_pfn == slot->base_gfn) { + unsigned long *seg_pgd; + unsigned long num_of_entry = slot->npages >> 17; + unsigned long base_hpa = slot->arch.host_phys_addr; + int i; + + seg_pgd = kvm->arch.seg_pgd + (start_pfn >> 17); + for (i = 0; i < num_of_entry; i++) { + *seg_pgd = (base_hpa >> 30) + i; + seg_pgd++; + } + } + } +} +#endif + void vcpu_send_ipi(struct kvm_vcpu *vcpu, int target_vcpuid) { struct kvm_vcpu *target_vcpu = kvm_get_vcpu(vcpu->kvm, target_vcpuid); diff --git a/arch/sw_64/mm/init.c b/arch/sw_64/mm/init.c index 16d3da7beebe..82f2414ef7f7 100644 --- a/arch/sw_64/mm/init.c +++ b/arch/sw_64/mm/init.c @@ -10,6 +10,7 @@ #include #include #include +#include #include @@ -33,6 +34,14 @@ static pud_t vmalloc_pud[1024] __attribute__((__aligned__(PAGE_SIZE))); static phys_addr_t mem_start; static phys_addr_t mem_size_limit; +unsigned long memory_block_size_bytes(void) +{ + if (is_in_guest()) + return MIN_MEMORY_BLOCK_SIZE_VM_MEMHP; + else + return MIN_MEMORY_BLOCK_SIZE; +} + static int __init setup_mem_size(char *p) { char *oldp; diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index fafa8b0d8099..140716083ab8 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -351,6 +351,14 @@ config HMC6352 This driver provides support for the Honeywell HMC6352 compass, providing configuration and heading data via sysfs. +config SUNWAY_GED + tristate "sunway generic device driver for memhotplug" + depends on SW64 + depends on MEMORY_HOTPLUG + help + This driver provides support for sunway generic device driver for + memhotplug, providing configuration and heading data via sysfs. + config DS1682 tristate "Dallas DS1682 Total Elapsed Time Recorder with Alarm" depends on I2C diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile index d23231e73330..3615763234a6 100644 --- a/drivers/misc/Makefile +++ b/drivers/misc/Makefile @@ -34,6 +34,7 @@ obj-$(CONFIG_SENSORS_TSL2550) += tsl2550.o obj-$(CONFIG_DS1682) += ds1682.o obj-$(CONFIG_C2PORT) += c2port/ obj-$(CONFIG_HMC6352) += hmc6352.o +obj-$(CONFIG_SUNWAY_GED) += sunway-ged.o obj-y += eeprom/ obj-y += cb710/ obj-$(CONFIG_VMWARE_BALLOON) += vmw_balloon.o diff --git a/drivers/misc/sunway-ged.c b/drivers/misc/sunway-ged.c new file mode 100644 index 000000000000..b4e4ca315852 --- /dev/null +++ b/drivers/misc/sunway-ged.c @@ -0,0 +1,253 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* Generic Event Device for ACPI. */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define OFFSET_START_ADDR 0 +#define OFFSET_LENGTH 8 +#define OFFSET_STATUS 16 +#define OFFSET_SLOT 24 + +/* Memory hotplug event */ +#define SUNWAY_MEMHOTPLUG_ADD 0x1 +#define SUNWAY_MEMHOTPLUG_REMOVE 0x2 + +struct sunway_memory_device { + struct sunway_ged_device *device; + unsigned int state; /* State of the memory device */ + struct list_head list; + + u64 start_addr; /* Memory Range start physical addr */ + u64 length; /* Memory Range length */ + u64 slot; /* Memory Range slot */ + unsigned int enabled:1; +}; + +struct sunway_ged_device { + struct device *dev; + void __iomem *membase; + void *driver_data; + spinlock_t lock; + struct list_head dev_list; +}; + +static int sunway_memory_enable_device(struct sunway_memory_device *mem_device) +{ + int num_enabled = 0; + int result = 0; + + if (mem_device->enabled) { /* just sanity check...*/ + num_enabled++; + goto out; + } + + /* + * If the memory block size is zero, please ignore it. + * Don't try to do the following memory hotplug flowchart. + */ + if (!mem_device->length) + goto out; + + lock_device_hotplug(); + /* suppose node = 0, fix me! */ + result = __add_memory(0, mem_device->start_addr, mem_device->length); + unlock_device_hotplug(); + /* + * If the memory block has been used by the kernel, add_memory() + * returns -EEXIST. If add_memory() returns the other error, it + * means that this memory block is not used by the kernel. + */ + if (result && result != -EEXIST) + goto out; + + mem_device->enabled = 1; + + /* + * Add num_enable even if add_memory() returns -EEXIST, so the + * device is bound to this driver. + */ + num_enabled++; +out: + if (!num_enabled) { + dev_err(mem_device->device->dev, "add_memory failed\n"); + return -EINVAL; + } + + return 0; +} + +static int sunway_memory_get_meminfo(struct sunway_memory_device *mem_device) +{ + struct sunway_ged_device *geddev; + + if (!mem_device) + return -EINVAL; + + if (mem_device->enabled) + return 0; + + geddev = mem_device->device; + + mem_device->start_addr = readq(geddev->membase + OFFSET_START_ADDR); + mem_device->length = readq(geddev->membase + OFFSET_LENGTH); + + return 0; +} + +static void sunway_memory_device_remove(struct sunway_ged_device *device) +{ + struct sunway_memory_device *mem_dev, *n; + unsigned long start_addr, length, slot; + + if (!device) + return; + + start_addr = readq(device->membase + OFFSET_START_ADDR); + length = readq(device->membase + OFFSET_LENGTH); + slot = readq(device->membase + OFFSET_SLOT); + + list_for_each_entry_safe(mem_dev, n, &device->dev_list, list) { + if (!mem_dev->enabled) + continue; + + if ((start_addr == mem_dev->start_addr) && + (length == mem_dev->length)) { + /* suppose node = 0, fix me! */ + remove_memory(0, start_addr, length); + list_del(&mem_dev->list); + kfree(mem_dev); + } + } + + writeq(slot, device->membase + OFFSET_SLOT); +} + +static int sunway_memory_device_add(struct sunway_ged_device *device) +{ + struct sunway_memory_device *mem_device; + int result; + + if (!device) + return -EINVAL; + + mem_device = kzalloc(sizeof(struct sunway_memory_device), GFP_KERNEL); + if (!mem_device) + return -ENOMEM; + + INIT_LIST_HEAD(&mem_device->list); + mem_device->device = device; + + /* Get the range from the IO */ + mem_device->start_addr = readq(device->membase + OFFSET_START_ADDR); + mem_device->length = readq(device->membase + OFFSET_LENGTH); + mem_device->slot = readq(device->membase + OFFSET_SLOT); + + result = sunway_memory_enable_device(mem_device); + if (result) { + dev_err(device->dev, "sunway_memory_enable_device() error\n"); + sunway_memory_device_remove(device); + + return result; + } + + list_add_tail(&mem_device->list, &device->dev_list); + dev_dbg(device->dev, "Memory device configured\n"); + + hcall(HCALL_MEMHOTPLUG, mem_device->start_addr, 0, 0); + + return 1; +} + +static irqreturn_t sunwayged_ist(int irq, void *data) +{ + struct sunway_ged_device *sunwayged_dev = data; + unsigned int status; + + status = readl(sunwayged_dev->membase + OFFSET_STATUS); + + /* through IO status to add or remove memory device */ + if (status & SUNWAY_MEMHOTPLUG_ADD) + sunway_memory_device_add(sunwayged_dev); + + if (status & SUNWAY_MEMHOTPLUG_REMOVE) + sunway_memory_device_remove(sunwayged_dev); + + return IRQ_HANDLED; +} + +static irqreturn_t sunwayged_irq_handler(int irq, void *data) +{ + return IRQ_WAKE_THREAD; +} + +static int sunwayged_probe(struct platform_device *pdev) +{ + struct resource *regs = platform_get_resource(pdev, IORESOURCE_MEM, 0); + int irq = platform_get_irq(pdev, 0); + struct sunway_ged_device *geddev; + struct device *dev; + int irqflags; + + if (!regs) { + dev_err(dev, "no registers defined\n"); + return -EINVAL; + } + + geddev = devm_kzalloc(&pdev->dev, sizeof(*geddev), GFP_KERNEL); + if (!geddev) + return -ENOMEM; + + spin_lock_init(&geddev->lock); + geddev->membase = devm_ioremap(&pdev->dev, + regs->start, resource_size(regs)); + if (!geddev->membase) + return -ENOMEM; + + INIT_LIST_HEAD(&geddev->dev_list); + geddev->dev = &pdev->dev; + irqflags = IRQF_SHARED; + + if (request_threaded_irq(irq, sunwayged_irq_handler, sunwayged_ist, + irqflags, "SUNWAY:Ged", geddev)) { + dev_err(dev, "failed to setup event handler for irq %u\n", irq); + + return -EINVAL; + } + + platform_set_drvdata(pdev, geddev); + + return 0; +} + +static int sunwayged_remove(struct platform_device *pdev) +{ + return 0; +} + +static const struct of_device_id sunwayged_of_match[] = { + {.compatible = "sw6,sunway-ged", }, + { } +}; +MODULE_DEVICE_TABLE(of, sunwayged_of_match); + +static struct platform_driver sunwayged_platform_driver = { + .driver = { + .name = "sunway-ged", + .of_match_table = sunwayged_of_match, + }, + .probe = sunwayged_probe, + .remove = sunwayged_remove, +}; +module_platform_driver(sunwayged_platform_driver); + +MODULE_AUTHOR("Lu Feifei"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Sunway ged driver"); -- Gitee From 2d3b928eac96fa601080afba9282cac711edbcef Mon Sep 17 00:00:00 2001 From: Lu Feifei Date: Wed, 15 Jun 2022 14:59:51 +0800 Subject: [PATCH 136/674] sw64: add a misc device to chip_vt.dts for memory-hotplug Sunway inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I56WV8 -------------------------------- This is an emulated device used to communicate with the memory-hotplug device on the qemu side, so the device is added to device tree for guest os. Signed-off-by: Lu Feifei Signed-off-by: Gu Zitao --- arch/sw_64/boot/dts/chip_vt.dts | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/arch/sw_64/boot/dts/chip_vt.dts b/arch/sw_64/boot/dts/chip_vt.dts index f0bcf1db1d08..abad29dee97e 100644 --- a/arch/sw_64/boot/dts/chip_vt.dts +++ b/arch/sw_64/boot/dts/chip_vt.dts @@ -34,5 +34,17 @@ uart: serial0@8801 { clock-frequency = <24000000>; status = "okay"; }; + misc: misc0@8036 { + #address-cells = <2>; + #size-cells = <2>; + compatible = "sw6,sunway-ged"; + reg = <0x8036 0x0 0x0 0x20>; + interrupt-parent=<&intc>; + interrupts = <13>; + reg-shift = <0>; + reg-io-width = <8>; + clock-frequency = <24000000>; + status = "okay"; + }; }; }; -- Gitee From edb81706ec0dc4bc13c81b7ce08febe09bf2a471 Mon Sep 17 00:00:00 2001 From: Yang Qiang Date: Mon, 20 Jun 2022 14:23:56 +0800 Subject: [PATCH 137/674] sw64: pci: fix maximum bus number for pci scan Sunway inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GDR1 -------------------------------- When PCI devices enable SRIOV function, the previous policy that uses pci_find_bus to locate PCI bus from a given domain and bus number may return false PCI bus number, so this patch fixes this patch. Signed-off-by: Yang Qiang Signed-off-by: Gu Zitao --- arch/sw_64/chip/chip3/chip.c | 13 +++++++------ arch/sw_64/kernel/pci.c | 18 +++++++++--------- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/arch/sw_64/chip/chip3/chip.c b/arch/sw_64/chip/chip3/chip.c index 02b369b2b37b..84ca7ffcb2ef 100644 --- a/arch/sw_64/chip/chip3/chip.c +++ b/arch/sw_64/chip/chip3/chip.c @@ -178,18 +178,20 @@ int chip_pcie_configure(struct pci_controller *hose) struct pci_bus *bus, *top; struct list_head *next; unsigned int max_read_size, smallest_max_payload; - int max_payloadsize, iov_bus = 0; + int max_payloadsize; unsigned long rc_index, node; unsigned long piuconfig0, value; unsigned int pcie_caps_offset; unsigned int rc_conf_value; u16 devctl, new_values; bool rc_ari_disabled = false, found = false; + unsigned char bus_max_num; node = hose->node; rc_index = hose->index; smallest_max_payload = read_rc_conf(node, rc_index, RC_EXP_DEVCAP); smallest_max_payload &= PCI_EXP_DEVCAP_PAYLOAD; + bus_max_num = hose->busn_space->start; top = hose->bus; bus = top; @@ -200,6 +202,7 @@ int chip_pcie_configure(struct pci_controller *hose) /* end of this bus, go up or finish */ if (bus == top) break; + next = bus->self->bus_list.next; bus = bus->self->bus; continue; @@ -224,10 +227,8 @@ int chip_pcie_configure(struct pci_controller *hose) } } -#ifdef CONFIG_PCI_IOV - if (dev->is_physfn) - iov_bus += dev->sriov->max_VF_buses - dev->bus->number; -#endif + if (bus->busn_res.end > bus_max_num) + bus_max_num = bus->busn_res.end; /* Query device PCIe capability register */ pcie_caps_offset = dev->pcie_cap; @@ -306,7 +307,7 @@ int chip_pcie_configure(struct pci_controller *hose) pci_write_config_word(dev, pcie_caps_offset + PCI_EXP_DEVCTL, devctl); } - return iov_bus; + return bus_max_num; } static int chip3_check_pci_vt_linkup(unsigned long node, unsigned long index) diff --git a/arch/sw_64/kernel/pci.c b/arch/sw_64/kernel/pci.c index 7cc8b7d2d43b..fcc6e0f02a93 100644 --- a/arch/sw_64/kernel/pci.c +++ b/arch/sw_64/kernel/pci.c @@ -221,7 +221,7 @@ void __init common_init_pci(void) struct pci_bus *bus; unsigned int init_busnr; int need_domain_info = 0; - int ret, iov_bus; + int ret; unsigned long offset; /* Scan all of the recorded PCI controllers. */ @@ -257,20 +257,20 @@ void __init common_init_pci(void) bus = hose->bus = bridge->bus; hose->need_domain_info = need_domain_info; - while (pci_find_bus(pci_domain_nr(bus), last_bus)) - last_bus++; if (is_in_host()) - iov_bus = chip_pcie_configure(hose); - last_bus += iov_bus; + last_bus = chip_pcie_configure(hose); + else + while (pci_find_bus(pci_domain_nr(bus), last_bus)) + last_bus++; - hose->last_busno = hose->busn_space->end = last_bus - 1; + hose->last_busno = hose->busn_space->end = last_bus; init_busnr = read_rc_conf(hose->node, hose->index, RC_PRIMARY_BUS); init_busnr &= ~(0xff << 16); - init_busnr |= (last_bus - 1) << 16; + init_busnr |= last_bus << 16; write_rc_conf(hose->node, hose->index, RC_PRIMARY_BUS, init_busnr); - pci_bus_update_busn_res_end(bus, last_bus - 1); - + pci_bus_update_busn_res_end(bus, last_bus); + last_bus++; } pcibios_claim_console_setup(); -- Gitee From b6bc5e6d316eb6e70a88ca730d2e099a8bad1bf7 Mon Sep 17 00:00:00 2001 From: Hang Xiaoqian Date: Mon, 20 Jun 2022 16:54:38 +0800 Subject: [PATCH 138/674] sw64: remove unaligned count Sunway inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GFNY -------------------------------- In the case of highly concurrent unaligned processing, unaligned count may cause severe cache thrashing. However, unaligned count is not required, so remove it to reduce cache thrashing. Signed-off-by: Hang Xiaoqian Signed-off-by: Gu Zitao --- arch/sw_64/kernel/Makefile | 2 +- arch/sw_64/kernel/traps.c | 13 -------- arch/sw_64/kernel/unaligned.c | 56 ----------------------------------- 3 files changed, 1 insertion(+), 70 deletions(-) delete mode 100644 arch/sw_64/kernel/unaligned.c diff --git a/arch/sw_64/kernel/Makefile b/arch/sw_64/kernel/Makefile index c1e461e0ac56..94b63d6a286b 100644 --- a/arch/sw_64/kernel/Makefile +++ b/arch/sw_64/kernel/Makefile @@ -31,7 +31,7 @@ obj-$(CONFIG_HIBERNATION) += hibernate_asm.o hibernate.o obj-$(CONFIG_AUDIT) += audit.o obj-$(CONFIG_PCI) += pci_common.o obj-$(CONFIG_RELOCATABLE) += relocate.o -obj-$(CONFIG_DEBUG_FS) += unaligned.o segvdbg.o +obj-$(CONFIG_DEBUG_FS) += segvdbg.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o ifndef CONFIG_PCI diff --git a/arch/sw_64/kernel/traps.c b/arch/sw_64/kernel/traps.c index a61c851967a9..d656eca5f961 100644 --- a/arch/sw_64/kernel/traps.c +++ b/arch/sw_64/kernel/traps.c @@ -320,11 +320,6 @@ do_entIF(unsigned long inst_type, struct pt_regs *regs) force_sig_fault(SIGILL, ILL_ILLOPC, (void __user *)regs->pc, 0); } -struct unaligned_stat { - unsigned long count, va, pc; -} unaligned[2]; - - asmlinkage void do_entUna(void *va, unsigned long opcode, unsigned long reg, struct pt_regs *regs) @@ -334,10 +329,6 @@ do_entUna(void *va, unsigned long opcode, unsigned long reg, unsigned long pc = regs->pc - 4; const struct exception_table_entry *fixup; - unaligned[0].count++; - unaligned[0].va = (unsigned long) va; - unaligned[0].pc = pc; - /* * We don't want to use the generic get/put unaligned macros as * we want to trap exceptions. Only if we actually get an @@ -666,10 +657,6 @@ do_entUnaUser(void __user *va, unsigned long opcode, if ((unsigned long)va >= TASK_SIZE) goto give_sigsegv; - ++unaligned[1].count; - unaligned[1].va = (unsigned long)va; - unaligned[1].pc = regs->pc - 4; - if ((1L << opcode) & OP_INT_MASK) { /* it's an integer load/store */ if (reg < 30) { diff --git a/arch/sw_64/kernel/unaligned.c b/arch/sw_64/kernel/unaligned.c deleted file mode 100644 index a1bbdab4a266..000000000000 --- a/arch/sw_64/kernel/unaligned.c +++ /dev/null @@ -1,56 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -/* - * Copyright (C) 2020 Mao Minkai - * Author: Mao Minkai - * - * This code is taken from arch/mips/kernel/segment.c - * Copyright (C) 2013 Imagination Technologies Ltd. - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - */ - -#include -#include - -static int show_unaligned(struct seq_file *sf, void *v) -{ - extern struct unaligned_stat { - unsigned long count, va, pc; - } unaligned[2]; - - seq_printf(sf, "kernel unaligned acc\t: %ld (pc=%lx, va=%lx)\n", unaligned[0].count, unaligned[0].pc, unaligned[0].va); - seq_printf(sf, "user unaligned acc\t: %ld (pc=%lx, va=%lx)\n", unaligned[1].count, unaligned[1].pc, unaligned[1].va); - - return 0; -} - -static int unaligned_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_unaligned, NULL); -} - -static const struct file_operations unaligned_fops = { - .open = unaligned_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int __init unaligned_info(void) -{ - struct dentry *unaligned; - - if (!sw64_debugfs_dir) - return -ENODEV; - - unaligned = debugfs_create_file("unaligned", S_IRUGO, - sw64_debugfs_dir, NULL, - &unaligned_fops); - if (!unaligned) - return -ENOMEM; - return 0; -} -device_initcall(unaligned_info); -- Gitee From 993a93c8c4a7aa088c0157bddb10fda388fcdccb Mon Sep 17 00:00:00 2001 From: Mao Minkai Date: Mon, 20 Jun 2022 16:58:09 +0800 Subject: [PATCH 139/674] sw64: fix simd version of memset Sunway inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GFOO -------------------------------- In commit 20b900c4b7fb ("sw64: optimize simd version of memcpy and memset"), _nc instructions are used to improve performance, but the position of memb instruction in memset is wrong. Fix it. Fixes: 20b900c4b7fb ("sw64: optimize simd version of memcpy and memset") Signed-off-by: Mao Minkai Signed-off-by: Gu Zitao --- arch/sw_64/lib/deep-memset.S | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/sw_64/lib/deep-memset.S b/arch/sw_64/lib/deep-memset.S index ed2171c56d4d..7fbd529c72a8 100644 --- a/arch/sw_64/lib/deep-memset.S +++ b/arch/sw_64/lib/deep-memset.S @@ -99,12 +99,11 @@ $mod32_aligned: .align 5 $mod32_loop_nc: subl $18, 64, $18 - blt $18, $mod32_tail + blt $18, $mod32_tail_memb vstd_nc $f10, 0($16) vstd_nc $f10, 32($16) addl $16, 64, $16 br $31, $mod32_loop_nc - memb # required for _nc store instructions .align 5 $mod32_loop: @@ -115,6 +114,8 @@ $mod32_loop: addl $16, 64, $16 br $31, $mod32_loop +$mod32_tail_memb: + memb # required for _nc store instructions $mod32_tail: vldd $f10, 0($4) addl $sp, 64, $sp -- Gitee From e73f0bffb26c59b5b3ff024b4194a3c6fe34a8dd Mon Sep 17 00:00:00 2001 From: Zhu Donghong Date: Tue, 21 Jun 2022 14:47:33 +0800 Subject: [PATCH 140/674] sw64: deliver a hot reset to Root Complex with plugin JMicron 585 card Sunway inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GFPA -------------------------------- On SW64 platform, JMicron 585 SATA card directly connected to Root Complex may raise DMA failure when reboot, so we force a hot reset to Root Complex to fix can not access JMicron 585 SATA card. Signed-off-by: Zhu Donghong Signed-off-by: Gu Zitao --- arch/sw_64/platform/platform_xuelang.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/arch/sw_64/platform/platform_xuelang.c b/arch/sw_64/platform/platform_xuelang.c index 63a4b163e43e..ae8179b53b4c 100644 --- a/arch/sw_64/platform/platform_xuelang.c +++ b/arch/sw_64/platform/platform_xuelang.c @@ -26,9 +26,25 @@ extern void cpld_write(uint8_t slave_addr, uint8_t reg, uint8_t data); static void xuelang_kill_arch(int mode) { + struct pci_dev *pdev; + struct pci_controller *hose; + int val; + if (is_in_host()) { switch (mode) { case LINUX_REBOOT_CMD_RESTART: + pdev = pci_get_device(PCI_VENDOR_ID_JMICRON, + 0x0585, NULL); + if (pdev) { + hose = (struct pci_controller *)pdev->sysdata; + val = read_rc_conf(hose->node, hose->index, + RC_PORT_LINK_CTL); + write_rc_conf(hose->node, hose->index, + RC_PORT_LINK_CTL, val | 0x8); + write_rc_conf(hose->node, hose->index, + RC_PORT_LINK_CTL, val); + } + cpld_write(0x64, 0x00, 0xc3); mb(); break; -- Gitee From 9e0b719ccc16ce561eb8247e87f8a15f00ffaccf Mon Sep 17 00:00:00 2001 From: He Sheng Date: Wed, 21 Jul 2021 13:36:27 +0800 Subject: [PATCH 141/674] sw64: rename debugfs dir sw_64 to sw64 Sunway inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I56OLG -------------------------------- Signed-off-by: He Sheng Signed-off-by: Gu Zitao --- arch/sw_64/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/sw_64/kernel/setup.c b/arch/sw_64/kernel/setup.c index 85d172c32239..0e93643539d3 100644 --- a/arch/sw_64/kernel/setup.c +++ b/arch/sw_64/kernel/setup.c @@ -976,7 +976,7 @@ static int __init debugfs_sw64(void) { struct dentry *d; - d = debugfs_create_dir("sw_64", NULL); + d = debugfs_create_dir("sw64", NULL); if (!d) return -ENOMEM; sw64_debugfs_dir = d; -- Gitee From 0115f679cddf65c65822b61208be17a141a72cfc Mon Sep 17 00:00:00 2001 From: Wu Liliu Date: Wed, 15 Jun 2022 11:00:09 +0800 Subject: [PATCH 142/674] sw64: reimplement show_stack() method Sunway inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GFQ0 -------------------------------- It used to show stack information by SP, that means it has nothing to do with task, which is unexpected. In order to show stack of task, we improve the implementation of show_stack(). However, the original task struct does not have sp, so we add it into thread_struct and do something necessary in __switch_to. Meanwhile, walk_stackframe() can be reused, so refactor related codes for better support. Signed-off-by: Wu Liliu Signed-off-by: Gu Zitao --- arch/sw_64/include/asm/processor.h | 1 + arch/sw_64/include/asm/stacktrace.h | 4 +- arch/sw_64/kernel/asm-offsets.c | 1 + arch/sw_64/kernel/entry.S | 6 +++ arch/sw_64/kernel/perf_event.c | 12 ++--- arch/sw_64/kernel/process.c | 1 + arch/sw_64/kernel/stacktrace.c | 78 +++++++++++++++++++++++------ arch/sw_64/kernel/traps.c | 58 ++++----------------- 8 files changed, 85 insertions(+), 76 deletions(-) diff --git a/arch/sw_64/include/asm/processor.h b/arch/sw_64/include/asm/processor.h index 08e8cc8e5428..886f28635dd4 100644 --- a/arch/sw_64/include/asm/processor.h +++ b/arch/sw_64/include/asm/processor.h @@ -45,6 +45,7 @@ struct thread_struct { struct user_fpsimd_state fpstate; /* Callee-saved registers */ unsigned long ra; + unsigned long sp; unsigned long s[7]; /* s0 ~ s6 */ }; #define INIT_THREAD { } diff --git a/arch/sw_64/include/asm/stacktrace.h b/arch/sw_64/include/asm/stacktrace.h index 813aa5e7a91d..ed691a72573b 100644 --- a/arch/sw_64/include/asm/stacktrace.h +++ b/arch/sw_64/include/asm/stacktrace.h @@ -32,8 +32,8 @@ struct stack_frame { }; extern int unwind_frame(struct task_struct *tsk, struct stackframe *frame); -extern void walk_stackframe(struct task_struct *tsk, struct stackframe *frame, - int (*fn)(struct stackframe *, void *), void *data); +extern void walk_stackframe(struct task_struct *tsk, struct pt_regs *regs, + int (*fn)(unsigned long, void *), void *data); static inline bool on_task_stack(struct task_struct *tsk, unsigned long sp, struct stack_info *info) diff --git a/arch/sw_64/kernel/asm-offsets.c b/arch/sw_64/kernel/asm-offsets.c index 0c7e1e26eb05..9e6c338a5edd 100644 --- a/arch/sw_64/kernel/asm-offsets.c +++ b/arch/sw_64/kernel/asm-offsets.c @@ -213,6 +213,7 @@ void foo(void) OFFSET(TASK_THREAD_FPCR, task_struct, thread.fpstate.fpcr); BLANK(); OFFSET(TASK_THREAD_RA, task_struct, thread.ra); + OFFSET(TASK_THREAD_SP, task_struct, thread.sp); OFFSET(TASK_THREAD_S0, task_struct, thread.s[0]); OFFSET(TASK_THREAD_S1, task_struct, thread.s[1]); OFFSET(TASK_THREAD_S2, task_struct, thread.s[2]); diff --git a/arch/sw_64/kernel/entry.S b/arch/sw_64/kernel/entry.S index 977c774ad799..f79c9a6ddf36 100644 --- a/arch/sw_64/kernel/entry.S +++ b/arch/sw_64/kernel/entry.S @@ -398,6 +398,7 @@ __switch_to: .prologue 0 /* Save context into prev->thread */ stl $26, TASK_THREAD_RA($17) + stl $30, TASK_THREAD_SP($17) stl $9, TASK_THREAD_S0($17) stl $10, TASK_THREAD_S1($17) stl $11, TASK_THREAD_S2($17) @@ -415,6 +416,11 @@ __switch_to: ldl $14, TASK_THREAD_S5($18) ldl $15, TASK_THREAD_S6($18) sys_call HMC_swpctx + /* + * SP has been saved and restored by HMC_swpctx, + * and restore it again here for future expansion. + */ + ldl $30, TASK_THREAD_SP($18) ldi $8, 0x3fff bic $sp, $8, $8 mov $17, $0 diff --git a/arch/sw_64/kernel/perf_event.c b/arch/sw_64/kernel/perf_event.c index 52ec34e33269..6e344239917b 100644 --- a/arch/sw_64/kernel/perf_event.c +++ b/arch/sw_64/kernel/perf_event.c @@ -761,24 +761,18 @@ void perf_callchain_user(struct perf_callchain_entry_ctx *entry, * whist unwinding the stackframe and is like a subroutine return so we use * the PC. */ -static int callchain_trace(struct stackframe *frame, void *data) +static int callchain_trace(unsigned long pc, void *data) { struct perf_callchain_entry_ctx *entry = data; - perf_callchain_store(entry, frame->pc); - + perf_callchain_store(entry, pc); return 0; } void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { - struct stackframe frame; - - frame.fp = regs->r15; - frame.pc = regs->pc; - - walk_stackframe(current, &frame, callchain_trace, entry); + walk_stackframe(NULL, regs, callchain_trace, entry); } /* diff --git a/arch/sw_64/kernel/process.c b/arch/sw_64/kernel/process.c index 7a7578d530c6..da721d4266ee 100644 --- a/arch/sw_64/kernel/process.c +++ b/arch/sw_64/kernel/process.c @@ -169,6 +169,7 @@ copy_thread(unsigned long clone_flags, unsigned long usp, childti->pcb.ksp = (unsigned long) childregs; childti->pcb.flags = 7; /* set FEN, clear everything else */ + p->thread.sp = (unsigned long) childregs; if (unlikely(p->flags & PF_KTHREAD)) { /* kernel thread */ diff --git a/arch/sw_64/kernel/stacktrace.c b/arch/sw_64/kernel/stacktrace.c index 2671331717ba..4e9acf99aaab 100644 --- a/arch/sw_64/kernel/stacktrace.c +++ b/arch/sw_64/kernel/stacktrace.c @@ -10,6 +10,8 @@ #include #include #include +#include + #include /* @@ -59,40 +61,84 @@ int unwind_frame(struct task_struct *tsk, struct stackframe *frame) } EXPORT_SYMBOL_GPL(unwind_frame); -void walk_stackframe(struct task_struct *tsk, struct stackframe *frame, - int (*fn)(struct stackframe *, void *), void *data) +void walk_stackframe(struct task_struct *tsk, struct pt_regs *regs, + int (*fn)(unsigned long, void *), void *data) { + unsigned long pc, fp; + + struct stackframe frame; + + if (regs) { + pc = regs->pc; + fp = regs->r15; + } else if (tsk == current || tsk == NULL) { + fp = (unsigned long)__builtin_frame_address(0); + pc = (unsigned long)walk_stackframe; + } else { + fp = tsk->thread.s[6]; + pc = tsk->thread.ra; + } + + if (!__kernel_text_address(pc) || fn(pc, data)) + return; + + frame.pc = pc; + frame.fp = fp; while (1) { int ret; - - if (fn(frame, data)) - break; - ret = unwind_frame(tsk, frame); + ret = unwind_frame(tsk, &frame); if (ret < 0) break; + + if (fn(frame.pc, data)) + break; } } EXPORT_SYMBOL_GPL(walk_stackframe); #else /* !CONFIG_FRAME_POINTER */ -void walk_stackframe(struct task_struct *tsk, struct stackframe *frame, - int (*fn)(struct stackframe *, void *), void *data) +void walk_stackframe(struct task_struct *tsk, struct pt_regs *regs, + int (*fn)(unsigned long, void *), void *data) { - unsigned long *sp = (unsigned long *)current_thread_info()->pcb.ksp; - unsigned long addr; - struct perf_callchain_entry_ctx *entry = data; + unsigned long *ksp; + unsigned long sp, pc; + + if (regs) { + sp = (unsigned long)(regs+1); + pc = regs->pc; + } else if (tsk == current || tsk == NULL) { + register unsigned long current_sp __asm__ ("$30"); + sp = current_sp; + pc = (unsigned long)walk_stackframe; + } else { + sp = tsk->thread.sp; + pc = tsk->thread.ra; + } - perf_callchain_store(entry, frame->pc); - while (!kstack_end(sp) && entry->nr < entry->max_stack) { - addr = *sp++; - if (__kernel_text_address(addr)) - perf_callchain_store(entry, addr); + ksp = (unsigned long *)sp; + + while (!kstack_end(ksp)) { + if (__kernel_text_address(pc) && fn(pc, data)) + break; + pc = (*ksp++) - 0x4; } } EXPORT_SYMBOL_GPL(walk_stackframe); #endif/* CONFIG_FRAME_POINTER */ +static int print_address_trace(unsigned long pc, void *data) +{ + print_ip_sym((const char *)data, pc); + return 0; +} + +void show_stack(struct task_struct *task, unsigned long *sp, const char *loglvl) +{ + pr_info("Trace:\n"); + walk_stackframe(task, NULL, print_address_trace, (void *)loglvl); +} + /* * Save stack-backtrace addresses into a stack_trace buffer. */ diff --git a/arch/sw_64/kernel/traps.c b/arch/sw_64/kernel/traps.c index d656eca5f961..4e95cab13daa 100644 --- a/arch/sw_64/kernel/traps.c +++ b/arch/sw_64/kernel/traps.c @@ -12,13 +12,20 @@ #include #include #include +#include #include +#include +#include +#include #include #include #include #include #include +#include +#include +#include #include "proto.h" @@ -68,53 +75,6 @@ dik_show_code(unsigned int *pc) printk("\n"); } -static void -dik_show_trace(unsigned long *sp, const char *loglvl) -{ - long i = 0; - unsigned long tmp; - - printk("%sTrace:\n", loglvl); - while (0x1ff8 & (unsigned long)sp) { - tmp = *sp; - sp++; - if (!__kernel_text_address(tmp)) - continue; - printk("%s[<%lx>] %pSR\n", loglvl, tmp, (void *)tmp); - if (i > 40) { - printk("%s ...", loglvl); - break; - } - } - printk("\n"); -} - -static int kstack_depth_to_print = 24; - -void show_stack(struct task_struct *task, unsigned long *sp, const char *loglvl) -{ - unsigned long *stack; - int i; - - /* - * debugging aid: "show_stack(NULL, NULL, KERN_EMERG);" prints the - * back trace for this cpu. - */ - if (sp == NULL) - sp = (unsigned long *)&sp; - - stack = sp; - for (i = 0; i < kstack_depth_to_print; i++) { - if (((long) stack & (THREAD_SIZE-1)) == 0) - break; - if (i && ((i % 4) == 0)) - printk("%s ", loglvl); - printk("%016lx ", *stack++); - } - printk("\n"); - dik_show_trace(sp, loglvl); -} - void die_if_kernel(char *str, struct pt_regs *regs, long err) { if (regs->ps & 8) @@ -125,7 +85,7 @@ void die_if_kernel(char *str, struct pt_regs *regs, long err) printk("%s(%d): %s %ld\n", current->comm, task_pid_nr(current), str, err); dik_show_regs(regs); add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE); - dik_show_trace((unsigned long *)(regs+1), KERN_DEFAULT); + show_stack(current, NULL, KERN_EMERG); dik_show_code((unsigned int *)regs->pc); if (test_and_set_thread_flag(TIF_DIE_IF_KERNEL)) { @@ -535,7 +495,7 @@ do_entUna(void *va, unsigned long opcode, unsigned long reg, dik_show_regs(regs); dik_show_code((unsigned int *)pc); - dik_show_trace((unsigned long *)(regs+1), KERN_DEFAULT); + show_stack(current, NULL, KERN_EMERG); if (test_and_set_thread_flag(TIF_DIE_IF_KERNEL)) { printk("die_if_kernel recursion detected.\n"); -- Gitee From 8cc109c4a51ea3c343c15b47eb62d99d0adae211 Mon Sep 17 00:00:00 2001 From: Wu Liliu Date: Tue, 21 Jun 2022 14:19:58 +0800 Subject: [PATCH 143/674] sw64: reimplement get_wchan() Sunway inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GFUD -------------------------------- The get_wchan() always return 0 because `fp > sp` is false. This patch reimplements it entirely and fixes this error. Signed-off-by: Wu Liliu Signed-off-by: Gu Zitao --- arch/sw_64/kernel/process.c | 51 ---------------------------------- arch/sw_64/kernel/stacktrace.c | 22 +++++++++++++++ 2 files changed, 22 insertions(+), 51 deletions(-) diff --git a/arch/sw_64/kernel/process.c b/arch/sw_64/kernel/process.c index da721d4266ee..a75ae20205f3 100644 --- a/arch/sw_64/kernel/process.c +++ b/arch/sw_64/kernel/process.c @@ -226,57 +226,6 @@ int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fpu) } EXPORT_SYMBOL(dump_fpu); -/* - * Under heavy swap load I've seen this lose in an ugly way. So do - * some extra sanity checking on the ranges we expect these pointers - * to be in so that we can fail gracefully. This is just for ps after - * all. -- r~ - */ - -unsigned long -thread_saved_pc(struct task_struct *t) -{ - unsigned long top, fp, sp; - - top = (unsigned long)task_stack_page(t) + 2 * PAGE_SIZE; - sp = task_thread_info(t)->pcb.ksp; - fp = t->thread.s[6]; - - if (fp > sp && fp < top) - return *(unsigned long *)fp; - - return 0; -} - -unsigned long -get_wchan(struct task_struct *p) -{ - unsigned long schedule_frame; - unsigned long pc, top, sp; - - if (!p || p == current || p->state == TASK_RUNNING) - return 0; - /* - * This one depends on the frame size of schedule(). Do a - * "disass schedule" in gdb to find the frame size. Also, the - * code assumes that sleep_on() follows immediately after - * interruptible_sleep_on() and that add_timer() follows - * immediately after interruptible_sleep(). Ugly, isn't it? - * Maybe adding a wchan field to task_struct would be better, - * after all... - */ - - pc = thread_saved_pc(p); - if (in_sched_functions(pc)) { - top = (unsigned long)task_stack_page(p) + 2 * PAGE_SIZE; - sp = task_thread_info(p)->pcb.ksp; - schedule_frame = p->thread.s[6]; - if (schedule_frame > sp && schedule_frame < top) - return ((unsigned long *)schedule_frame)[12]; - } - return pc; -} - unsigned long arch_randomize_brk(struct mm_struct *mm) { return randomize_page(mm->brk, 0x02000000); diff --git a/arch/sw_64/kernel/stacktrace.c b/arch/sw_64/kernel/stacktrace.c index 4e9acf99aaab..b51b1500c736 100644 --- a/arch/sw_64/kernel/stacktrace.c +++ b/arch/sw_64/kernel/stacktrace.c @@ -172,3 +172,25 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) trace->entries[trace->nr_entries++] = ULONG_MAX; } EXPORT_SYMBOL_GPL(save_stack_trace_tsk); + +static int save_pc(unsigned long pc, void *data) +{ + unsigned long *p = data; + *p = 0; + + if (!in_sched_functions(pc)) + *p = pc; + + return *p; +} + +unsigned long get_wchan(struct task_struct *tsk) +{ + unsigned long pc; + + if (!tsk || tsk == current || tsk->state == TASK_RUNNING) + return 0; + walk_stackframe(tsk, NULL, save_pc, &pc); + + return pc; +} -- Gitee From a3266d177538c7304143e30f9c1c9de6cda945d4 Mon Sep 17 00:00:00 2001 From: Wu Liliu Date: Fri, 24 Jun 2022 14:12:33 +0800 Subject: [PATCH 144/674] sw64: reimplement save_stack_trace() Sunway inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GFQQ -------------------------------- It used to save stack-backtrace addresses by SP, which is inaccurate. In order to implement this function accurately, we provide two ways to support it. Signed-off-by: Wu Liliu Signed-off-by: Gu Zitao --- arch/sw_64/kernel/stacktrace.c | 62 ++++++++++++++++++++++------------ 1 file changed, 40 insertions(+), 22 deletions(-) diff --git a/arch/sw_64/kernel/stacktrace.c b/arch/sw_64/kernel/stacktrace.c index b51b1500c736..7b5ddc78bd6d 100644 --- a/arch/sw_64/kernel/stacktrace.c +++ b/arch/sw_64/kernel/stacktrace.c @@ -139,40 +139,58 @@ void show_stack(struct task_struct *task, unsigned long *sp, const char *loglvl) walk_stackframe(task, NULL, print_address_trace, (void *)loglvl); } +#ifdef CONFIG_STACKTRACE /* * Save stack-backtrace addresses into a stack_trace buffer. */ -void save_stack_trace(struct stack_trace *trace) +struct stack_trace_data { + struct stack_trace *trace; + unsigned int nosched; +}; + +int save_trace(unsigned long pc, void *d) { - save_stack_trace_tsk(current, trace); -} -EXPORT_SYMBOL_GPL(save_stack_trace); + struct stack_trace_data *data = d; + struct stack_trace *trace = data->trace; + + if (data->nosched && in_sched_functions(pc)) + return 0; + if (trace->skip > 0) { + trace->skip--; + return 0; + } + trace->entries[trace->nr_entries++] = pc; + return (trace->nr_entries >= trace->max_entries); +} -void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) +static void __save_stack_trace(struct task_struct *tsk, + struct stack_trace *trace, unsigned int nosched) { - unsigned long *sp = (unsigned long *)task_thread_info(tsk)->pcb.ksp; - unsigned long addr; - - WARN_ON(trace->nr_entries || !trace->max_entries); - - while (!kstack_end(sp)) { - addr = *sp++; - if (__kernel_text_address(addr) && - !in_sched_functions(addr)) { - if (trace->skip > 0) - trace->skip--; - else - trace->entries[trace->nr_entries++] = addr; - if (trace->nr_entries >= trace->max_entries) - break; - } - } + struct stack_trace_data data; + + data.trace = trace; + data.nosched = nosched; + + walk_stackframe(tsk, NULL, save_trace, &data); + if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; } + +void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) +{ + __save_stack_trace(tsk, trace, 1); +} EXPORT_SYMBOL_GPL(save_stack_trace_tsk); +void save_stack_trace(struct stack_trace *trace) +{ + __save_stack_trace(current, trace, 0); +} +EXPORT_SYMBOL_GPL(save_stack_trace); +#endif + static int save_pc(unsigned long pc, void *data) { unsigned long *p = data; -- Gitee From d14ec0e7e888b5ab16142e0a71da4d99183587b6 Mon Sep 17 00:00:00 2001 From: Wang Yuanheng Date: Mon, 27 Jun 2022 09:02:01 +0800 Subject: [PATCH 145/674] sw64: kvm: enable binding_vcpu debug dynamically Sunway inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5GFR5 -------------------------------- Add a bool debugfs file /sys/kernel/debug/sw_64/bind_vcpu, you can echo 1/Y to enable bind vcpu, or echo 0/N to disable it. Determin which node to bind the core according to the physical address assigned to the guest. Signed-off-by: Wang Yuanheng Signed-off-by: Gu Zitao --- arch/sw_64/kernel/Makefile | 2 +- arch/sw_64/kernel/bindvcpu.c | 29 +++++++++++++++++++++++++++++ arch/sw_64/kvm/kvm-sw64.c | 13 ++++++++++++- 3 files changed, 42 insertions(+), 2 deletions(-) create mode 100644 arch/sw_64/kernel/bindvcpu.c diff --git a/arch/sw_64/kernel/Makefile b/arch/sw_64/kernel/Makefile index 94b63d6a286b..d4dc9e175d67 100644 --- a/arch/sw_64/kernel/Makefile +++ b/arch/sw_64/kernel/Makefile @@ -31,7 +31,7 @@ obj-$(CONFIG_HIBERNATION) += hibernate_asm.o hibernate.o obj-$(CONFIG_AUDIT) += audit.o obj-$(CONFIG_PCI) += pci_common.o obj-$(CONFIG_RELOCATABLE) += relocate.o -obj-$(CONFIG_DEBUG_FS) += segvdbg.o +obj-$(CONFIG_DEBUG_FS) += segvdbg.o bindvcpu.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o ifndef CONFIG_PCI diff --git a/arch/sw_64/kernel/bindvcpu.c b/arch/sw_64/kernel/bindvcpu.c new file mode 100644 index 000000000000..611c395c144b --- /dev/null +++ b/arch/sw_64/kernel/bindvcpu.c @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2022 Wang Yuanheng + * Author: Wang Yuanheng + * + */ + +#include +#include +#include +#include +#include + +extern bool bind_vcpu_enabled; + +static int __init bind_vcpu_init(void) +{ + struct dentry *bindvcpu; + + if (!sw64_debugfs_dir) + return -ENODEV; + + bindvcpu = debugfs_create_bool("bind_vcpu", 0644, + sw64_debugfs_dir, &bind_vcpu_enabled); + if (!bindvcpu) + return -ENOMEM; + return 0; +} +late_initcall(bind_vcpu_init); diff --git a/arch/sw_64/kvm/kvm-sw64.c b/arch/sw_64/kvm/kvm-sw64.c index af29d0ca8e7f..de81f7efe01a 100644 --- a/arch/sw_64/kvm/kvm-sw64.c +++ b/arch/sw_64/kvm/kvm-sw64.c @@ -12,7 +12,7 @@ #include #include #include - +#include #include #include @@ -21,6 +21,7 @@ bool set_msi_flag; unsigned long sw64_kvm_last_vpn[NR_CPUS]; +__read_mostly bool bind_vcpu_enabled; #define cpu_last_vpn(cpuid) sw64_kvm_last_vpn[cpuid] #ifdef CONFIG_SUBARCH_C3B @@ -537,6 +538,16 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) #ifndef CONFIG_KVM_MEMHOTPLUG vcpu->arch.vcb.vpcr = get_vpcr(vcpu->kvm->arch.host_phys_addr, vcpu->kvm->arch.size, 0); + + if (unlikely(bind_vcpu_enabled)) { + int nid; + unsigned long end; + + end = vcpu->kvm->arch.host_phys_addr + vcpu->kvm->arch.size; + nid = pfn_to_nid(PHYS_PFN(vcpu->kvm->arch.host_phys_addr)); + if (pfn_to_nid(PHYS_PFN(end)) == nid) + set_cpus_allowed_ptr(vcpu->arch.tsk, node_to_cpumask_map[nid]); + } #else unsigned long seg_base = virt_to_phys(vcpu->kvm->arch.seg_pgd); -- Gitee From 0d22be5a4f5fb31a45fb4c55dde5037bfba3c572 Mon Sep 17 00:00:00 2001 From: Gu Zitao Date: Tue, 28 Jun 2022 08:51:26 +0800 Subject: [PATCH 146/674] sw64: rename CONFIG_HAVE_GENERIC_GUP to CONFIG_HAVE_FAST_GUP Sunway inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5GFRP -------------------------------- This patch is a backport of commit 67a929e097b7 ("mm: rename CONFIG_HAVE_GENERIC_GUP to CONFIG_HAVE_FAST_GUP"). In order to use the generic GUP, let's do the same. Signed-off-by: Gu Zitao Signed-off-by: Gu Zitao --- arch/sw_64/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/sw_64/Kconfig b/arch/sw_64/Kconfig index ec6e583a5d9a..cf2f6f00708c 100644 --- a/arch/sw_64/Kconfig +++ b/arch/sw_64/Kconfig @@ -7,7 +7,7 @@ config SW64 select HAVE_OPROFILE select HAVE_PCSPKR_PLATFORM select HAVE_PERF_EVENTS - select HAVE_GENERIC_GUP + select HAVE_FAST_GUP select GENERIC_CLOCKEVENTS select GENERIC_IRQ_PROBE select GENERIC_IRQ_LEGACY -- Gitee From ce6455155eeacae6557fe0142e69a6c288ac970e Mon Sep 17 00:00:00 2001 From: Zhou Xuemei Date: Thu, 7 Jul 2022 17:19:10 +0800 Subject: [PATCH 147/674] sw64: fix floating point register corruption Sunway inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GFS3 -------------------------------- When csum_partial_copy_from_user is called in an interrupt, __copy_from_user will modify floating point register f10-f15 without restore register value. This will cause the value of the userspace register to be corrupted. Use memcpy() instead when called from kernel space. Signed-off-by: Zhou Xuemei Signed-off-by: Gu Zitao --- arch/sw_64/lib/csum_partial_copy.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/sw_64/lib/csum_partial_copy.c b/arch/sw_64/lib/csum_partial_copy.c index 5e5274e82b2b..742dd63cdb70 100644 --- a/arch/sw_64/lib/csum_partial_copy.c +++ b/arch/sw_64/lib/csum_partial_copy.c @@ -61,7 +61,11 @@ csum_partial_cfu_dest_aligned(const unsigned long __user *src, unsigned long checksum = ~0U; int err = 0; - err = __copy_from_user(dst, src, len+8); + if (likely(!uaccess_kernel())) + err = __copy_from_user(dst, src, len + 8); + else + memcpy(dst, src, len + 8); + while (len > 0) { word = *dst; checksum += word; @@ -89,7 +93,10 @@ csum_partial_cfu_dest_unaligned(const unsigned long __user *src, unsigned long checksum = ~0U; int err = 0; - err = __copy_from_user(dst, src, len+8); + if (likely(!uaccess_kernel())) + err = __copy_from_user(dst, src, len + 8); + else + memcpy(dst, src, len + 8); dst = (unsigned long *)((unsigned long)dst & (~7UL)); word = *dst; -- Gitee From 510d96b0f5a1edbd9ed1e9efbfde87930a3588b3 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 12 Jul 2022 22:51:43 +0800 Subject: [PATCH 148/674] blk-throttle: fix io hung due to configuration updates hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I58VE5 CVE: NA -------------------------------- If new configuration is submitted while a bio is throttled, then new waiting time is recaculated regardless that the bio might aready wait for some time: tg_conf_updated throtl_start_new_slice tg_update_disptime throtl_schedule_next_dispatch Then io hung can be triggered by always submmiting new configuration before the throttled bio is dispatched. Fix the problem by respecting the time that throttled bio aready waited. In order to do that, instead of start new slice in tg_conf_updated(), just update 'bytes_disp' and 'io_disp' based on the new configuration. Signed-off-by: Yu Kuai Reviewed-by: Jason Yan Signed-off-by: Zheng Zengkai --- block/blk-throttle.c | 79 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 66 insertions(+), 13 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 95a13da1f343..0427c9c63e81 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1421,7 +1421,57 @@ static int tg_print_conf_uint(struct seq_file *sf, void *v) return 0; } -static void tg_conf_updated(struct throtl_grp *tg, bool global) +static u64 throtl_update_bytes_disp(u64 dispatched, u64 new_limit, + u64 old_limit) +{ + if (new_limit == old_limit) + return dispatched; + + if (!dispatched) + return 0; + + /* + * In the case that multiply will overflow, just return 0. It will only + * let bios to be dispatched earlier. + */ + if (div64_u64(U64_MAX, dispatched) < new_limit) + return 0; + + dispatched *= new_limit; + return div64_u64(dispatched, old_limit); +} + +static u32 throtl_update_io_disp(u32 dispatched, u32 new_limit, u32 old_limit) +{ + if (new_limit == old_limit) + return dispatched; + + if (!dispatched) + return 0; + /* + * In the case that multiply will overflow, just return 0. It will only + * let bios to be dispatched earlier. + */ + if (UINT_MAX / dispatched < new_limit) + return 0; + + dispatched *= new_limit; + return dispatched / old_limit; +} + +static void throtl_update_slice(struct throtl_grp *tg, u64 *old_limits) +{ + tg->bytes_disp[READ] = throtl_update_bytes_disp(tg->bytes_disp[READ], + tg_bps_limit(tg, READ), old_limits[0]); + tg->bytes_disp[WRITE] = throtl_update_bytes_disp(tg->bytes_disp[WRITE], + tg_bps_limit(tg, WRITE), old_limits[1]); + tg->io_disp[READ] = throtl_update_io_disp(tg->io_disp[READ], + tg_iops_limit(tg, READ), (u32)old_limits[2]); + tg->io_disp[WRITE] = throtl_update_io_disp(tg->io_disp[WRITE], + tg_iops_limit(tg, WRITE), (u32)old_limits[3]); +} + +static void tg_conf_updated(struct throtl_grp *tg, u64 *old_limits, bool global) { struct throtl_service_queue *sq = &tg->service_queue; struct cgroup_subsys_state *pos_css; @@ -1460,16 +1510,7 @@ static void tg_conf_updated(struct throtl_grp *tg, bool global) parent_tg->latency_target); } - /* - * We're already holding queue_lock and know @tg is valid. Let's - * apply the new config directly. - * - * Restart the slices for both READ and WRITES. It might happen - * that a group's limit are dropped suddenly and we don't want to - * account recently dispatched IO with new low rate. - */ - throtl_start_new_slice(tg, READ); - throtl_start_new_slice(tg, WRITE); + throtl_update_slice(tg, old_limits); if (tg->flags & THROTL_TG_PENDING) { tg_update_disptime(tg); @@ -1502,6 +1543,14 @@ static inline int throtl_restart_syscall_when_busy(int errno) return ret; } +static void tg_get_limits(struct throtl_grp *tg, u64 *limits) +{ + limits[0] = tg_bps_limit(tg, READ); + limits[1] = tg_bps_limit(tg, WRITE); + limits[2] = tg_iops_limit(tg, READ); + limits[3] = tg_iops_limit(tg, WRITE); +} + static ssize_t tg_set_conf(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off, bool is_u64) { @@ -1510,6 +1559,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, struct throtl_grp *tg; int ret; u64 v; + u64 old_limits[4]; ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); if (ret) @@ -1526,13 +1576,14 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, v = U64_MAX; tg = blkg_to_tg(ctx.blkg); + tg_get_limits(tg, old_limits); if (is_u64) *(u64 *)((void *)tg + of_cft(of)->private) = v; else *(unsigned int *)((void *)tg + of_cft(of)->private) = v; - tg_conf_updated(tg, false); + tg_conf_updated(tg, old_limits, false); ret = 0; out_finish: blkg_conf_finish(&ctx); @@ -1703,6 +1754,7 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of, struct blkg_conf_ctx ctx; struct throtl_grp *tg; u64 v[4]; + u64 old_limits[4]; unsigned long idle_time; unsigned long latency_time; int ret; @@ -1721,6 +1773,7 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of, v[1] = tg->bps_conf[WRITE][index]; v[2] = tg->iops_conf[READ][index]; v[3] = tg->iops_conf[WRITE][index]; + tg_get_limits(tg, old_limits); idle_time = tg->idletime_threshold_conf; latency_time = tg->latency_target_conf; @@ -1807,7 +1860,7 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of, tg->td->limit_index = LIMIT_LOW; } else tg->td->limit_index = LIMIT_MAX; - tg_conf_updated(tg, index == LIMIT_LOW && + tg_conf_updated(tg, old_limits, index == LIMIT_LOW && tg->td->limit_valid[LIMIT_LOW]); ret = 0; out_finish: -- Gitee From badcb143e9e6bf8f66c06e08be3fb92a3abd1f11 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 12 Jul 2022 22:51:44 +0800 Subject: [PATCH 149/674] block: update nsecs[] in part_stat_show() and diskstats_show() hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I57S8D CVE: NA -------------------------------- commit 7ec2ec682568 ("block: update io_ticks when io hang") fixed that %util will be zero for iostat when io is hanged, however, avgqu-sz is still zero while it represents the number of io that are hunged. On the other hand, for some slow device, if an io is started before and done after diskstats is read, the avgqu-sz will be miscalculated. To fix the problem, update 'nsecs[]' when part_stat_show() or diskstats_show() is called. In order to do that, add 'stat_time' in struct hd_struct and 'rq_stat_time' in struct request to record the time. And during iteration, update 'nsecs[]' for each inflight request. Signed-off-by: Yu Kuai Reviewed-by: Jason Yan Signed-off-by: Zheng Zengkai --- block/blk-core.c | 15 ++++++++-- block/blk-mq.c | 45 ++++++++++++++++++++++++++++++ block/blk-mq.h | 2 ++ block/genhd.c | 61 ++++++++++++++++++++++++++--------------- block/partitions/core.c | 2 ++ include/linux/blkdev.h | 3 +- include/linux/genhd.h | 2 ++ 7 files changed, 105 insertions(+), 25 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index bbd3d4560458..f5f419dd8ab6 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1292,13 +1292,24 @@ void blk_account_io_done(struct request *req, u64 now) !(req->rq_flags & RQF_FLUSH_SEQ)) { const int sgrp = op_stat_group(req_op(req)); struct hd_struct *part; + u64 stat_time; part_stat_lock(); part = req->part; - update_io_ticks(part, jiffies, true); part_stat_inc(part, ios[sgrp]); - part_stat_add(part, nsecs[sgrp], now - req->start_time_ns); + stat_time = READ_ONCE(req->stat_time_ns); + /* + * This might fail if 'req->stat_time_ns' is updated + * in blk_mq_check_inflight_with_stat(). + */ + if (likely(cmpxchg64(&req->stat_time_ns, stat_time, now) + == stat_time)) { + u64 duation = stat_time ? now - stat_time : + now - req->start_time_ns; + + part_stat_add(req->part, nsecs[sgrp], duation); + } part_stat_unlock(); hd_struct_put(part); diff --git a/block/blk-mq.c b/block/blk-mq.c index 83193e44aada..2cf387697a41 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -102,6 +102,50 @@ struct mq_inflight { unsigned int inflight[2]; }; +static bool blk_mq_check_inflight_with_stat(struct blk_mq_hw_ctx *hctx, + struct request *rq, void *priv, + bool reserved) +{ + struct mq_inflight *mi = priv; + + if ((!mi->part->partno || rq->part == mi->part) && + blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) { + u64 stat_time; + + mi->inflight[rq_data_dir(rq)]++; + if (!rq->part) + return true; + + stat_time = READ_ONCE(rq->stat_time_ns); + /* + * This might fail if 'req->stat_time_ns' is updated in + * blk_account_io_done(). + */ + if (likely(cmpxchg64(&rq->stat_time_ns, stat_time, + rq->part->stat_time) == stat_time)) { + int sgrp = op_stat_group(req_op(rq)); + u64 duation = stat_time ? + rq->part->stat_time - stat_time : + rq->part->stat_time - rq->start_time_ns; + + part_stat_add(rq->part, nsecs[sgrp], duation); + } + } + + return true; +} + +unsigned int blk_mq_in_flight_with_stat(struct request_queue *q, + struct hd_struct *part) +{ + struct mq_inflight mi = { .part = part }; + + blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight_with_stat, &mi); + + return mi.inflight[0] + mi.inflight[1]; +} + + static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq, void *priv, bool reserved) @@ -328,6 +372,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, rq->start_time_ns = ktime_get_ns(); else rq->start_time_ns = 0; + rq->stat_time_ns = 0; rq->io_start_time_ns = 0; rq->stats_sectors = 0; rq->nr_phys_segments = 0; diff --git a/block/blk-mq.h b/block/blk-mq.h index bb58c68b8274..6897b4c09b7c 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -187,6 +187,8 @@ static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx) unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part); void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, unsigned int inflight[2]); +unsigned int blk_mq_in_flight_with_stat(struct request_queue *q, + struct hd_struct *part); static inline void blk_mq_put_dispatch_budget(struct request_queue *q) { diff --git a/block/genhd.c b/block/genhd.c index f94152e99876..16ad881172d0 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1293,25 +1293,52 @@ ssize_t part_size_show(struct device *dev, (unsigned long long)part_nr_sects_read(p)); } +static void part_set_stat_time(struct hd_struct *hd) +{ + u64 now = ktime_get_ns(); + +again: + hd->stat_time = now; + if (hd->partno) { + hd = &part_to_disk(hd)->part0; + goto again; + } +} + +static void part_get_stat_info(struct hd_struct *hd, struct disk_stats *stat, + unsigned int *inflight) +{ + struct request_queue *q = part_to_disk(hd)->queue; + + if (queue_is_mq(q)) { + part_stat_lock(); + spin_lock(&hd->bd_stat_lock); + part_set_stat_time(hd); + *inflight = blk_mq_in_flight_with_stat(q, hd); + spin_unlock(&hd->bd_stat_lock); + part_stat_unlock(); + } else { + *inflight = part_in_flight(hd); + } + + if (*inflight) { + part_stat_lock(); + update_io_ticks(hd, jiffies, true); + part_stat_unlock(); + } + + part_stat_read_all(hd, stat); +} + ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, char *buf) { struct hd_struct *p = dev_to_part(dev); - struct request_queue *q = part_to_disk(p)->queue; struct disk_stats stat; unsigned int inflight; - if (queue_is_mq(q)) - inflight = blk_mq_in_flight(q, p); - else - inflight = part_in_flight(p); + part_get_stat_info(p, &stat, &inflight); - if (inflight) { - part_stat_lock(); - update_io_ticks(p, jiffies, true); - part_stat_unlock(); - } - part_stat_read_all(p, &stat); return sprintf(buf, "%8lu %8lu %8llu %8u " "%8lu %8lu %8llu %8u " @@ -1628,17 +1655,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0); while ((hd = disk_part_iter_next(&piter))) { - if (queue_is_mq(gp->queue)) - inflight = blk_mq_in_flight(gp->queue, hd); - else - inflight = part_in_flight(hd); - - if (inflight) { - part_stat_lock(); - update_io_ticks(hd, jiffies, true); - part_stat_unlock(); - } - part_stat_read_all(hd, &stat); + part_get_stat_info(hd, &stat, &inflight); seq_printf(seqf, "%4d %7d %s " "%lu %lu %lu %u " "%lu %lu %lu %u " diff --git a/block/partitions/core.c b/block/partitions/core.c index 569b0ca9f6e1..92c723c19bb0 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -415,6 +415,8 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, p->nr_sects = len; p->partno = partno; p->read_only = get_disk_ro(disk) | test_bit(partno, disk->user_ro_bitmap); + p->stat_time = 0; + spin_lock_init(&p->bd_stat_lock); if (info) { struct partition_meta_info *pinfo; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6a4b2a01a462..46feee6bd45c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -207,7 +207,8 @@ struct request { u64 start_time_ns; /* Time that I/O was submitted to the device. */ u64 io_start_time_ns; - + /* Time that I/O was counted in part_get_stat_info(). */ + u64 stat_time_ns; #ifdef CONFIG_BLK_WBT unsigned short wbt_flags; #endif diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 09da27361620..07122c79210c 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -63,6 +63,8 @@ struct hd_struct { seqcount_t nr_sects_seq; #endif unsigned long stamp; + spinlock_t bd_stat_lock; + u64 stat_time; struct disk_stats __percpu *dkstats; struct percpu_ref ref; -- Gitee From 5c250d556e1021b554c712056bc00ca9871d06f9 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 12 Jul 2022 22:51:45 +0800 Subject: [PATCH 150/674] blk-mq: fix kabi broken in struct request hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I57S8D CVE: NA -------------------------------- Since there are no reserved fields, declare a wrapper to fix kabi broken. Signed-off-by: Yu Kuai Reviewed-by: Jason Yan Signed-off-by: Zheng Zengkai --- block/blk-core.c | 7 ++++--- block/blk-mq.c | 12 +++++++----- include/linux/blk-mq.h | 13 +++++++++++-- include/linux/blkdev.h | 2 -- 4 files changed, 22 insertions(+), 12 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index f5f419dd8ab6..8b93366c76e4 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1293,17 +1293,18 @@ void blk_account_io_done(struct request *req, u64 now) const int sgrp = op_stat_group(req_op(req)); struct hd_struct *part; u64 stat_time; + struct request_wrapper *rq_wrapper = request_to_wrapper(req); part_stat_lock(); part = req->part; update_io_ticks(part, jiffies, true); part_stat_inc(part, ios[sgrp]); - stat_time = READ_ONCE(req->stat_time_ns); + stat_time = READ_ONCE(rq_wrapper->stat_time_ns); /* - * This might fail if 'req->stat_time_ns' is updated + * This might fail if 'stat_time_ns' is updated * in blk_mq_check_inflight_with_stat(). */ - if (likely(cmpxchg64(&req->stat_time_ns, stat_time, now) + if (likely(cmpxchg64(&rq_wrapper->stat_time_ns, stat_time, now) == stat_time)) { u64 duation = stat_time ? now - stat_time : now - req->start_time_ns; diff --git a/block/blk-mq.c b/block/blk-mq.c index 2cf387697a41..76ff229e7fb2 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -111,17 +111,19 @@ static bool blk_mq_check_inflight_with_stat(struct blk_mq_hw_ctx *hctx, if ((!mi->part->partno || rq->part == mi->part) && blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) { u64 stat_time; + struct request_wrapper *rq_wrapper; mi->inflight[rq_data_dir(rq)]++; if (!rq->part) return true; - stat_time = READ_ONCE(rq->stat_time_ns); + rq_wrapper = request_to_wrapper(rq); + stat_time = READ_ONCE(rq_wrapper->stat_time_ns); /* - * This might fail if 'req->stat_time_ns' is updated in + * This might fail if 'stat_time_ns' is updated in * blk_account_io_done(). */ - if (likely(cmpxchg64(&rq->stat_time_ns, stat_time, + if (likely(cmpxchg64(&rq_wrapper->stat_time_ns, stat_time, rq->part->stat_time) == stat_time)) { int sgrp = op_stat_group(req_op(rq)); u64 duation = stat_time ? @@ -368,11 +370,11 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, #ifdef CONFIG_BLK_RQ_ALLOC_TIME rq->alloc_time_ns = alloc_time_ns; #endif + request_to_wrapper(rq)->stat_time_ns = 0; if (blk_mq_need_time_stamp(rq)) rq->start_time_ns = ktime_get_ns(); else rq->start_time_ns = 0; - rq->stat_time_ns = 0; rq->io_start_time_ns = 0; rq->stats_sectors = 0; rq->nr_phys_segments = 0; @@ -2555,7 +2557,7 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, * rq_size is the size of the request plus driver payload, rounded * to the cacheline size */ - rq_size = round_up(sizeof(struct request) + set->cmd_size, + rq_size = round_up(sizeof(struct request_wrapper) + set->cmd_size, cache_line_size()); left = rq_size * depth; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index c9210fb70e4d..ac83257972a0 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -304,6 +304,15 @@ struct blk_mq_queue_data { KABI_RESERVE(1) }; +struct request_wrapper { + struct request rq; + + /* Time that I/O was counted in part_get_stat_info(). */ + u64 stat_time_ns; +}; + +#define request_to_wrapper(_rq) container_of(_rq, struct request_wrapper, rq) + typedef bool (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *, bool); typedef bool (busy_tag_iter_fn)(struct request *, void *, bool); @@ -595,7 +604,7 @@ static inline bool blk_should_fake_timeout(struct request_queue *q) */ static inline struct request *blk_mq_rq_from_pdu(void *pdu) { - return pdu - sizeof(struct request); + return pdu - sizeof(struct request_wrapper); } /** @@ -609,7 +618,7 @@ static inline struct request *blk_mq_rq_from_pdu(void *pdu) */ static inline void *blk_mq_rq_to_pdu(struct request *rq) { - return rq + 1; + return request_to_wrapper(rq) + 1; } static inline struct blk_mq_hw_ctx *queue_hctx(struct request_queue *q, int id) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 46feee6bd45c..49540ce9e325 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -207,8 +207,6 @@ struct request { u64 start_time_ns; /* Time that I/O was submitted to the device. */ u64 io_start_time_ns; - /* Time that I/O was counted in part_get_stat_info(). */ - u64 stat_time_ns; #ifdef CONFIG_BLK_WBT unsigned short wbt_flags; #endif -- Gitee From 47fea27be7e37652e5fb324c26e1f976aef7f83c Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 12 Jul 2022 22:51:46 +0800 Subject: [PATCH 151/674] block: fix kabi broken in struct hd_struct hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I57S8D CVE: NA -------------------------------- Use reserved fields to fix kabi broken for field 'stat_time'. However, for the field 'bd_stat_lock', spinlock_t can be up to 64 bytes, thus reserved fields is not enough. And struct 'hd_struct' is internal of other sutrct, thus declare a wrapper is infeasible. In order to fix kabi broken for 'bd_stat_lock', use 'dev->mutex' instead. Signed-off-by: Yu Kuai Reviewed-by: Jason Yan Signed-off-by: Zheng Zengkai --- block/genhd.c | 4 ++-- block/partitions/core.c | 1 - include/linux/genhd.h | 4 +--- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 16ad881172d0..cc114dd0265b 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1312,10 +1312,10 @@ static void part_get_stat_info(struct hd_struct *hd, struct disk_stats *stat, if (queue_is_mq(q)) { part_stat_lock(); - spin_lock(&hd->bd_stat_lock); + mutex_lock(&part_to_dev(hd)->mutex); part_set_stat_time(hd); *inflight = blk_mq_in_flight_with_stat(q, hd); - spin_unlock(&hd->bd_stat_lock); + mutex_unlock(&part_to_dev(hd)->mutex); part_stat_unlock(); } else { *inflight = part_in_flight(hd); diff --git a/block/partitions/core.c b/block/partitions/core.c index 92c723c19bb0..8f32f3cd0ede 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -416,7 +416,6 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, p->partno = partno; p->read_only = get_disk_ro(disk) | test_bit(partno, disk->user_ro_bitmap); p->stat_time = 0; - spin_lock_init(&p->bd_stat_lock); if (info) { struct partition_meta_info *pinfo; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 07122c79210c..05927a1c6b5b 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -63,8 +63,6 @@ struct hd_struct { seqcount_t nr_sects_seq; #endif unsigned long stamp; - spinlock_t bd_stat_lock; - u64 stat_time; struct disk_stats __percpu *dkstats; struct percpu_ref ref; @@ -78,7 +76,7 @@ struct hd_struct { #endif struct rcu_work rcu_work; - KABI_RESERVE(1) + KABI_USE(1, u64 stat_time) KABI_RESERVE(2) KABI_RESERVE(3) KABI_RESERVE(4) -- Gitee From 8d1b294dcbd95a1a301245f11ee6fbdd9b23f019 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 12 Jul 2022 22:51:47 +0800 Subject: [PATCH 152/674] block: fix sleeping function called from invalid context in part_get_stat_info() hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I57S8D CVE: NA -------------------------------- part_get_stat_info() call mutex_lock() inside part_stat_lock(), which is wrong because part_stat_lock() disables preempt. Fix the problem by hold mutex first. Signed-off-by: Yu Kuai Reviewed-by: Jason Yan Signed-off-by: Zheng Zengkai --- block/genhd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index cc114dd0265b..fcd6210417bc 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1311,12 +1311,12 @@ static void part_get_stat_info(struct hd_struct *hd, struct disk_stats *stat, struct request_queue *q = part_to_disk(hd)->queue; if (queue_is_mq(q)) { - part_stat_lock(); mutex_lock(&part_to_dev(hd)->mutex); + part_stat_lock(); part_set_stat_time(hd); *inflight = blk_mq_in_flight_with_stat(q, hd); - mutex_unlock(&part_to_dev(hd)->mutex); part_stat_unlock(); + mutex_unlock(&part_to_dev(hd)->mutex); } else { *inflight = part_in_flight(hd); } -- Gitee From ae7eb31ec422f1a1c644c7c3a73feb11d8657b99 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 12 Jul 2022 22:51:48 +0800 Subject: [PATCH 153/674] block: fix that iostat can show huge wait time hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I57S8D CVE: NA -------------------------------- There might be a problem that iostat can read less 'nsecs' than the last time. 1) io is started after 'hd->stat_time' is set. 2) following concurrent scenario: t1 t2 blk_mq_end_request time1 -> before hd->stat_time blk_account_io_done part_get_stat_info part_set_stat_time hd->stat_time = time2 -> time1 < time2 blk_mq_in_flight_with_stat blk_mq_check_inflight_with_stat cmpxchg64() -> set stat_time_ns to time2 cmpxchg64() -> set stat_time to time1 duation = time1 - time2; -> time1 < time2 part_stat_add(xx, nsecs, duation) -> problematic 3) Similar concurrent scenario the other way around. Fix the problem by don't add 'duation' if the calculation might underflow. Signed-off-by: Yu Kuai Reviewed-by: Jason Yan Signed-off-by: Zheng Zengkai --- block/blk-core.c | 3 ++- block/blk-mq.c | 12 ++++++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 8b93366c76e4..715b61c239ea 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1304,7 +1304,8 @@ void blk_account_io_done(struct request *req, u64 now) * This might fail if 'stat_time_ns' is updated * in blk_mq_check_inflight_with_stat(). */ - if (likely(cmpxchg64(&rq_wrapper->stat_time_ns, stat_time, now) + if (likely(now > stat_time && + cmpxchg64(&rq_wrapper->stat_time_ns, stat_time, now) == stat_time)) { u64 duation = stat_time ? now - stat_time : now - req->start_time_ns; diff --git a/block/blk-mq.c b/block/blk-mq.c index 76ff229e7fb2..5031c4fc4412 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -117,14 +117,22 @@ static bool blk_mq_check_inflight_with_stat(struct blk_mq_hw_ctx *hctx, if (!rq->part) return true; + /* + * If the request is started after 'part->stat_time' is set, + * don't update 'nsces' here. + */ + if (rq->part->stat_time <= rq->start_time_ns) + return true; + rq_wrapper = request_to_wrapper(rq); stat_time = READ_ONCE(rq_wrapper->stat_time_ns); /* * This might fail if 'stat_time_ns' is updated in * blk_account_io_done(). */ - if (likely(cmpxchg64(&rq_wrapper->stat_time_ns, stat_time, - rq->part->stat_time) == stat_time)) { + if (likely(rq->part->stat_time > stat_time && + cmpxchg64(&rq_wrapper->stat_time_ns, stat_time, + rq->part->stat_time) == stat_time)) { int sgrp = op_stat_group(req_op(rq)); u64 duation = stat_time ? rq->part->stat_time - stat_time : -- Gitee From 2ed9d17cc07fcbbdfe37df9169164ebd8c547b00 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 12 Jul 2022 22:51:49 +0800 Subject: [PATCH 154/674] block: don't use cmpxchg64() on 32-bit platform hulk inclusion category: bugfix bugzilla: 186921 CVE: NA -------------------------------- Some 32-bit platform doesn't support cmpxchg64(), using it in generic code will cause compile error. Fixes: 4c8f034bf1e6 ("[Huawei] block: update nsecs[] in part_stat_show() and diskstats_show()") Signed-off-by: Yu Kuai Reviewed-by: Jason Yan Signed-off-by: Zheng Zengkai --- block/blk-core.c | 6 ++++++ block/blk-mq.c | 3 ++- block/blk-mq.h | 2 ++ block/genhd.c | 8 ++++++-- 4 files changed, 16 insertions(+), 3 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 715b61c239ea..109fb2750453 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1292,13 +1292,16 @@ void blk_account_io_done(struct request *req, u64 now) !(req->rq_flags & RQF_FLUSH_SEQ)) { const int sgrp = op_stat_group(req_op(req)); struct hd_struct *part; +#ifdef CONFIG_64BIT u64 stat_time; struct request_wrapper *rq_wrapper = request_to_wrapper(req); +#endif part_stat_lock(); part = req->part; update_io_ticks(part, jiffies, true); part_stat_inc(part, ios[sgrp]); +#ifdef CONFIG_64BIT stat_time = READ_ONCE(rq_wrapper->stat_time_ns); /* * This might fail if 'stat_time_ns' is updated @@ -1312,6 +1315,9 @@ void blk_account_io_done(struct request *req, u64 now) part_stat_add(req->part, nsecs[sgrp], duation); } +#else + part_stat_add(part, nsecs[sgrp], now - req->start_time_ns); +#endif part_stat_unlock(); hd_struct_put(part); diff --git a/block/blk-mq.c b/block/blk-mq.c index 5031c4fc4412..1941ffc4db85 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -102,6 +102,7 @@ struct mq_inflight { unsigned int inflight[2]; }; +#ifdef CONFIG_64BIT static bool blk_mq_check_inflight_with_stat(struct blk_mq_hw_ctx *hctx, struct request *rq, void *priv, bool reserved) @@ -154,7 +155,7 @@ unsigned int blk_mq_in_flight_with_stat(struct request_queue *q, return mi.inflight[0] + mi.inflight[1]; } - +#endif static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq, void *priv, diff --git a/block/blk-mq.h b/block/blk-mq.h index 6897b4c09b7c..ad2d74f887f2 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -187,8 +187,10 @@ static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx) unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part); void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, unsigned int inflight[2]); +#ifdef CONFIG_64BIT unsigned int blk_mq_in_flight_with_stat(struct request_queue *q, struct hd_struct *part); +#endif static inline void blk_mq_put_dispatch_budget(struct request_queue *q) { diff --git a/block/genhd.c b/block/genhd.c index fcd6210417bc..8b37fcfa10d1 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1293,6 +1293,7 @@ ssize_t part_size_show(struct device *dev, (unsigned long long)part_nr_sects_read(p)); } +#ifdef CONFIG_64BIT static void part_set_stat_time(struct hd_struct *hd) { u64 now = ktime_get_ns(); @@ -1304,12 +1305,13 @@ static void part_set_stat_time(struct hd_struct *hd) goto again; } } +#endif static void part_get_stat_info(struct hd_struct *hd, struct disk_stats *stat, unsigned int *inflight) { +#ifdef CONFIG_64BIT struct request_queue *q = part_to_disk(hd)->queue; - if (queue_is_mq(q)) { mutex_lock(&part_to_dev(hd)->mutex); part_stat_lock(); @@ -1320,7 +1322,9 @@ static void part_get_stat_info(struct hd_struct *hd, struct disk_stats *stat, } else { *inflight = part_in_flight(hd); } - +#else + *inflight = part_in_flight(hd); +#endif if (*inflight) { part_stat_lock(); update_io_ticks(hd, jiffies, true); -- Gitee From 499ecade21fc377c04ec66ed7f0505f1ca74d755 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 12 Jul 2022 22:51:50 +0800 Subject: [PATCH 155/674] mm/filemap: fix that first page is not mark accessed in filemap_read() hulk inclusion category: bugfix bugzilla: 186896, https://gitee.com/src-openeuler/kernel/issues/I5FRAP CVE: NA -------------------------------- In filemap_read(), first page will not mark accessed if previous page is equal to the current page: if (iocb->ki_pos >> PAGE_SHIFT != ra->prev_pos >> PAGE_SHIFT)) folio_mark_accessed(fbatch.folios[0]); However, 'prev_pos' is set to 'ki_pos + copied' during last read, which means 'prev_pos' can be equal to 'ki_pos' in this read, thus previous page can be miscaculated. Fix the problem by setting 'prev_pos' to the start offset of last read, so that 'prev_pos >> PAGE_SHIFT' will be previous page as expected. Fixes: 06c0444290ce ("mm/filemap.c: generic_file_buffered_read() now uses find_get_pages_contig") Signed-off-by: Yu Kuai Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- mm/filemap.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 3958fc3280d8..9653c1e0faef 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2549,10 +2549,11 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, flush_dcache_page(pages[i]); copied = copy_page_to_iter(pages[i], offset, bytes, iter); - - written += copied; - iocb->ki_pos += copied; - ra->prev_pos = iocb->ki_pos; + if (copied) { + ra->prev_pos = iocb->ki_pos; + written += copied; + iocb->ki_pos += copied; + } if (copied < bytes) { error = -EFAULT; -- Gitee From ac90783f66b8154e56a3779206a18a1c5ee6d020 Mon Sep 17 00:00:00 2001 From: Li Huafei Date: Tue, 12 Jul 2022 22:51:51 +0800 Subject: [PATCH 156/674] livepatch/ppc64: Fix several compilation errors in unwind_frame() hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5CJ7X -------------------------------- Fix the spelling error of 'CONFIG_ FUNCTION_GRAPH_TRACE' and two compilation errors: - 'ftrace_idx' is not defined - 'struct stackframe' does not have a member named 'ip' Fixes: ac81d625f3da ("livepatch/ppc64: Implement livepatch without ftrace for ppc64be") Signed-off-by: Li Huafei Reviewed-by: Xu Kuohai Signed-off-by: Zheng Zengkai --- arch/powerpc/kernel/livepatch_64.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/livepatch_64.c b/arch/powerpc/kernel/livepatch_64.c index f008b3beb001..2487fab569a7 100644 --- a/arch/powerpc/kernel/livepatch_64.c +++ b/arch/powerpc/kernel/livepatch_64.c @@ -250,6 +250,9 @@ static int unwind_frame(struct task_struct *tsk, struct stackframe *frame) { unsigned long *stack; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + int ftrace_idx = 0; +#endif if (!validate_sp(frame->sp, tsk, STACK_FRAME_OVERHEAD)) return -1; @@ -279,12 +282,12 @@ static int unwind_frame(struct task_struct *tsk, struct stackframe *frame) frame->sp = stack[0]; frame->pc = stack[STACK_FRAME_LR_SAVE]; -#ifdef CONFIG_FUNCTION_GRAPH_TRACE +#ifdef CONFIG_FUNCTION_GRAPH_TRACER /* * IMHO these tests do not belong in * arch-dependent code, they are generic. */ - frame->pc = ftrace_graph_ret_addr(tsk, &ftrace_idx, frame->ip, stack); + frame->pc = ftrace_graph_ret_addr(tsk, &ftrace_idx, frame->pc, stack); #endif return 0; -- Gitee From 27d852d29cd2f760d33cc4aa2244de3f6ddf5ebd Mon Sep 17 00:00:00 2001 From: Li Huafei Date: Tue, 12 Jul 2022 22:51:52 +0800 Subject: [PATCH 157/674] livepatch/ppc64: Fix the stack check for exception frames hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5CJ7X -------------------------------- For exception frames, we need to check NIP in addition to PC. When a task is interrupted, NIP is the interrupt return address. The function where the NIP is located is also on the stack. Fixes: ec2244b5adcf ("livepatch/ppc64: only check stack top") Fixes: 2a7c3db6e6a8 ("livepatch/powerpc64: Add arch_klp_module_check_calltrace") Signed-off-by: Li Huafei Reviewed-by: Xu Kuohai Signed-off-by: Zheng Zengkai --- arch/powerpc/kernel/livepatch_64.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/kernel/livepatch_64.c b/arch/powerpc/kernel/livepatch_64.c index 2487fab569a7..acd3658d37a3 100644 --- a/arch/powerpc/kernel/livepatch_64.c +++ b/arch/powerpc/kernel/livepatch_64.c @@ -326,9 +326,14 @@ static int klp_check_jump_func(struct stackframe *frame, void *data) struct walk_stackframe_args *args = data; struct klp_func_list *check_funcs = args->check_funcs; - if (!check_func_list(check_funcs, &args->ret, frame->pc)) { + /* check the PC first */ + if (!check_func_list(check_funcs, &args->ret, frame->pc)) return args->ret; - } + + /* check NIP when the exception stack switching */ + if (frame->nip && !check_func_list(check_funcs, &args->ret, frame->nip)) + return args->ret; + return 0; } @@ -430,11 +435,19 @@ static int check_module_calltrace(struct stackframe *frame, void *data) { struct walk_stackframe_args *args = data; - if (within_module_core(frame->pc, args->mod)) { - pr_err("module %s is in use!\n", args->mod->name); - return (args->ret = -EBUSY); - } + /* check the PC first */ + if (within_module_core(frame->pc, args->mod)) + goto err_out; + + /* check NIP when the exception stack switching */ + if (frame->nip && within_module_core(frame->nip, args->mod)) + goto err_out; + return 0; + +err_out: + pr_err("module %s is in use!\n", args->mod->name); + return (args->ret = -EBUSY); } int arch_klp_module_check_calltrace(void *data) -- Gitee From 7a64eb335c3b048001238d9b36bf2bedef1d5b62 Mon Sep 17 00:00:00 2001 From: Li Huafei Date: Tue, 12 Jul 2022 22:51:53 +0800 Subject: [PATCH 158/674] livepatch/ppc32: Fix the stack check for exception frames hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5CJ7X -------------------------------- Currently, the stack check of ppc32 does not consider exception frames, and the interrupted functions are omitted. Similar to ppc64, special processing of exception frames should be added. Because the stack frame structure of ppc32 and ppc64 is the same, we can reuse the unwind_frame() code of ppc64. Then during the stack check, the exception frames need to check the NIP in addition to the PC, which is the function that is interrupted. Fixes: e22fb775aaf6 ("livepatch/ppc32: Support livepatch without ftrace") Fixes: 1daef7b0cae5 ("livepatch/powerpc32: Add arch_klp_module_check_calltrace") Signed-off-by: Li Huafei Reviewed-by: Xu Kuohai Signed-off-by: Zheng Zengkai --- arch/powerpc/include/asm/livepatch.h | 7 ++++ arch/powerpc/kernel/livepatch.c | 47 ++++++++++++++++++++++++ arch/powerpc/kernel/livepatch_32.c | 47 +++++++++++------------- arch/powerpc/kernel/livepatch_64.c | 55 +--------------------------- 4 files changed, 76 insertions(+), 80 deletions(-) diff --git a/arch/powerpc/include/asm/livepatch.h b/arch/powerpc/include/asm/livepatch.h index bafbfaba190f..39dcfc3c28ce 100644 --- a/arch/powerpc/include/asm/livepatch.h +++ b/arch/powerpc/include/asm/livepatch.h @@ -118,6 +118,12 @@ struct arch_klp_data { #endif /* CONFIG_PPC64 */ +struct stackframe { + unsigned long sp; + unsigned long pc; + unsigned long nip; +}; + #ifdef PPC64_ELF_ABI_v1 struct klp_func_node; void arch_klp_set_brk_func(struct klp_func_node *func_node, void *new_func); @@ -127,6 +133,7 @@ int arch_klp_add_breakpoint(struct arch_klp_data *arch_data, void *old_func); void arch_klp_remove_breakpoint(struct arch_klp_data *arch_data, void *old_func); long arch_klp_save_old_code(struct arch_klp_data *arch_data, void *old_func); int arch_klp_module_check_calltrace(void *data); +int klp_unwind_frame(struct task_struct *tsk, struct stackframe *frame); #endif /* CONFIG_LIVEPATCH_FTRACE */ diff --git a/arch/powerpc/kernel/livepatch.c b/arch/powerpc/kernel/livepatch.c index b8afcc7b9939..d568e8c8b16b 100644 --- a/arch/powerpc/kernel/livepatch.c +++ b/arch/powerpc/kernel/livepatch.c @@ -21,6 +21,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include #include #include #include @@ -65,3 +66,49 @@ int klp_brk_handler(struct pt_regs *regs) return 1; } + +int klp_unwind_frame(struct task_struct *tsk, struct stackframe *frame) +{ + unsigned long *stack; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + int ftrace_idx = 0; +#endif + + if (!validate_sp(frame->sp, tsk, STACK_FRAME_OVERHEAD)) + return -1; + + if (frame->nip != 0) + frame->nip = 0; + + stack = (unsigned long *)frame->sp; + + /* + * When switching to the exception stack, + * we save the NIP in pt_regs + * + * See if this is an exception frame. + * We look for the "regshere" marker in the current frame. + */ + if (validate_sp(frame->sp, tsk, STACK_INT_FRAME_SIZE) + && stack[STACK_FRAME_MARKER] == STACK_FRAME_REGS_MARKER) { + struct pt_regs *regs = (struct pt_regs *) + (frame->sp + STACK_FRAME_OVERHEAD); + frame->nip = regs->nip; + pr_debug("--- interrupt: task = %d/%s, trap %lx at NIP=x%lx/%pS, LR=0x%lx/%pS\n", + tsk->pid, tsk->comm, regs->trap, + regs->nip, (void *)regs->nip, + regs->link, (void *)regs->link); + } + + frame->sp = stack[0]; + frame->pc = stack[STACK_FRAME_LR_SAVE]; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + /* + * IMHO these tests do not belong in + * arch-dependent code, they are generic. + */ + frame->pc = ftrace_graph_ret_addr(tsk, &ftrace_idx, frame->pc, stack); +#endif + + return 0; +} diff --git a/arch/powerpc/kernel/livepatch_32.c b/arch/powerpc/kernel/livepatch_32.c index 603f1d61cc23..8f53386e7cf8 100644 --- a/arch/powerpc/kernel/livepatch_32.c +++ b/arch/powerpc/kernel/livepatch_32.c @@ -62,11 +62,6 @@ struct klp_func_list { int force; }; -struct stackframe { - unsigned long sp; - unsigned long pc; -}; - struct walk_stackframe_args { int enable; struct klp_func_list *check_funcs; @@ -226,20 +221,6 @@ static int klp_check_activeness_func(struct klp_patch *patch, int enable, return 0; } -static int unwind_frame(struct task_struct *tsk, struct stackframe *frame) -{ - - unsigned long *stack; - - if (!validate_sp(frame->sp, tsk, STACK_FRAME_OVERHEAD)) - return -1; - - stack = (unsigned long *)frame->sp; - frame->sp = stack[0]; - frame->pc = stack[STACK_FRAME_LR_SAVE]; - return 0; -} - void notrace klp_walk_stackframe(struct stackframe *frame, int (*fn)(struct stackframe *, void *), struct task_struct *tsk, void *data) @@ -249,7 +230,7 @@ void notrace klp_walk_stackframe(struct stackframe *frame, if (fn(frame, data)) break; - ret = unwind_frame(tsk, frame); + ret = klp_unwind_frame(tsk, frame); if (ret < 0) break; } @@ -273,9 +254,14 @@ static int klp_check_jump_func(struct stackframe *frame, void *data) struct walk_stackframe_args *args = data; struct klp_func_list *check_funcs = args->check_funcs; - if (!check_func_list(check_funcs, &args->ret, frame->pc)) { + /* check the PC first */ + if (!check_func_list(check_funcs, &args->ret, frame->pc)) return args->ret; - } + + /* check NIP when the exception stack switching */ + if (frame->nip && !check_func_list(check_funcs, &args->ret, frame->nip)) + return args->ret; + return 0; } @@ -334,6 +320,7 @@ static int do_check_calltrace(struct walk_stackframe_args *args, frame.sp = (unsigned long)stack; frame.pc = stack[STACK_FRAME_LR_SAVE]; + frame.nip = 0; klp_walk_stackframe(&frame, fn, t, args); if (args->ret) { pr_info("PID: %d Comm: %.20s\n", t->pid, t->comm); @@ -372,11 +359,19 @@ static int check_module_calltrace(struct stackframe *frame, void *data) { struct walk_stackframe_args *args = data; - if (within_module_core(frame->pc, args->mod)) { - pr_err("module %s is in use!\n", args->mod->name); - return (args->ret = -EBUSY); - } + /* check the PC first */ + if (within_module_core(frame->pc, args->mod)) + goto err_out; + + /* check NIP when the exception stack switching */ + if (frame->nip && within_module_core(frame->nip, args->mod)) + goto err_out; + return 0; + +err_out: + pr_err("module %s is in use!\n", args->mod->name); + return (args->ret = -EBUSY); } int arch_klp_module_check_calltrace(void *data) diff --git a/arch/powerpc/kernel/livepatch_64.c b/arch/powerpc/kernel/livepatch_64.c index acd3658d37a3..cbb5e02cccff 100644 --- a/arch/powerpc/kernel/livepatch_64.c +++ b/arch/powerpc/kernel/livepatch_64.c @@ -67,12 +67,6 @@ struct klp_func_list { int force; }; -struct stackframe { - unsigned long sp; - unsigned long pc; - unsigned long nip; -}; - struct walk_stackframe_args { int enable; struct klp_func_list *check_funcs; @@ -246,53 +240,6 @@ static int klp_check_activeness_func(struct klp_patch *patch, int enable, return 0; } -static int unwind_frame(struct task_struct *tsk, struct stackframe *frame) -{ - - unsigned long *stack; -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - int ftrace_idx = 0; -#endif - - if (!validate_sp(frame->sp, tsk, STACK_FRAME_OVERHEAD)) - return -1; - - if (frame->nip != 0) - frame->nip = 0; - - stack = (unsigned long *)frame->sp; - - /* - * When switching to the exception stack, - * we save the NIP in pt_regs - * - * See if this is an exception frame. - * We look for the "regshere" marker in the current frame. - */ - if (validate_sp(frame->sp, tsk, STACK_INT_FRAME_SIZE) - && stack[STACK_FRAME_MARKER] == STACK_FRAME_REGS_MARKER) { - struct pt_regs *regs = (struct pt_regs *) - (frame->sp + STACK_FRAME_OVERHEAD); - frame->nip = regs->nip; - pr_debug("--- interrupt: task = %d/%s, trap %lx at NIP=x%lx/%pS, LR=0x%lx/%pS\n", - tsk->pid, tsk->comm, regs->trap, - regs->nip, (void *)regs->nip, - regs->link, (void *)regs->link); - } - - frame->sp = stack[0]; - frame->pc = stack[STACK_FRAME_LR_SAVE]; -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - /* - * IMHO these tests do not belong in - * arch-dependent code, they are generic. - */ - frame->pc = ftrace_graph_ret_addr(tsk, &ftrace_idx, frame->pc, stack); -#endif - - return 0; -} - static void notrace klp_walk_stackframe(struct stackframe *frame, int (*fn)(struct stackframe *, void *), struct task_struct *tsk, void *data) @@ -302,7 +249,7 @@ static void notrace klp_walk_stackframe(struct stackframe *frame, if (fn(frame, data)) break; - ret = unwind_frame(tsk, frame); + ret = klp_unwind_frame(tsk, frame); if (ret < 0) break; } -- Gitee From bf06a577f02ef3a1a74e138211be5639e0227178 Mon Sep 17 00:00:00 2001 From: Guo Mengqi Date: Tue, 12 Jul 2022 22:51:54 +0800 Subject: [PATCH 159/674] Revert "iommu: handle page response timeout" hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5EOOG CVE: NA -------------------------------- This reverts commit da76349ca8776aa7f8b186010005fb563fb163bb. However, the iommu_fault_param and iommu_fault_event changes are reserved to avoid KABI change. Signed-off-by: Guo Mengqi Reviewed-by: Weilong Chen Signed-off-by: Zheng Zengkai --- drivers/iommu/iommu.c | 55 ------------------------------------------- include/linux/iommu.h | 4 ++-- 2 files changed, 2 insertions(+), 57 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 9116c93945d0..97953fa27630 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1084,39 +1084,6 @@ int iommu_group_unregister_notifier(struct iommu_group *group, } EXPORT_SYMBOL_GPL(iommu_group_unregister_notifier); -static void iommu_dev_fault_timer_fn(struct timer_list *t) -{ - struct iommu_fault_param *fparam = from_timer(fparam, t, timer); - struct iommu_fault_event *evt; - struct iommu_fault_page_request *prm; - - u64 now; - - now = get_jiffies_64(); - - /* The goal is to ensure driver or guest page fault handler(via vfio) - * send page response on time. Otherwise, limited queue resources - * may be occupied by some irresponsive guests or drivers. - * When per device pending fault list is not empty, we periodically checks - * if any anticipated page response time has expired. - * - * TODO: - * We could do the following if response time expires: - * 1. send page response code FAILURE to all pending PRQ - * 2. inform device driver or vfio - * 3. drain in-flight page requests and responses for this device - * 4. clear pending fault list such that driver can unregister fault - * handler(otherwise blocked when pending faults are present). - */ - list_for_each_entry(evt, &fparam->faults, list) { - prm = &evt->fault.prm; - if (time_after64(now, evt->expire)) - pr_err("Page response time expired!, pasid %d gid %d exp %llu now %llu\n", - prm->pasid, prm->grpid, evt->expire, now); - } - mod_timer(t, now + prq_timeout); -} - /** * iommu_register_device_fault_handler() - Register a device fault handler * @dev: the device @@ -1164,9 +1131,6 @@ int iommu_register_device_fault_handler(struct device *dev, mutex_init(¶m->fault_param->lock); INIT_LIST_HEAD(¶m->fault_param->faults); - if (prq_timeout) - timer_setup(¶m->fault_param->timer, iommu_dev_fault_timer_fn, - TIMER_DEFERRABLE); done_unlock: mutex_unlock(¶m->lock); @@ -1306,9 +1270,7 @@ int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt) struct dev_iommu *param = dev->iommu; struct iommu_fault_event *evt_pending = NULL; struct iommu_fault_param *fparam; - struct timer_list *tmr; int ret = 0; - u64 exp; if (!param || !evt || WARN_ON_ONCE(!iommu_fault_valid(&evt->fault))) return -EINVAL; @@ -1329,17 +1291,7 @@ int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt) ret = -ENOMEM; goto done_unlock; } - /* Keep track of response expiration time */ - exp = get_jiffies_64() + prq_timeout; - evt_pending->expire = exp; mutex_lock(&fparam->lock); - if (list_empty(&fparam->faults)) { - /* First pending event, start timer */ - tmr = &fparam->timer; - WARN_ON(timer_pending(tmr)); - mod_timer(tmr, exp); - } - list_add_tail(&evt_pending->list, &fparam->faults); mutex_unlock(&fparam->lock); } @@ -1417,13 +1369,6 @@ int iommu_page_response(struct device *dev, break; } - /* stop response timer if no more pending request */ - if (list_empty(¶m->fault_param->faults) && - timer_pending(¶m->fault_param->timer)) { - pr_debug("no pending PRQ, stop timer\n"); - del_timer(¶m->fault_param->timer); - } - done_unlock: mutex_unlock(¶m->fault_param->lock); return ret; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 8baf5ed66a84..092384b71ab2 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -393,7 +393,7 @@ struct iommu_device { struct iommu_fault_event { struct iommu_fault fault; struct list_head list; - u64 expire; + _KABI_DEPRECATE(u64, expire); }; /** @@ -408,7 +408,7 @@ struct iommu_fault_param { iommu_dev_fault_handler_t handler; void *data; struct list_head faults; - struct timer_list timer; + _KABI_DEPRECATE(struct timer_list, timer); struct mutex lock; }; -- Gitee From 3e8545091ac1afe772c6a79e8b015bad7284129f Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Tue, 12 Jul 2022 22:51:55 +0800 Subject: [PATCH 160/674] fs: introduce a wrapper uuid_to_fsid() mainline inclusion from mainline-5.13-rc1 commit 9591c3a34f7722bd77f42c98d76fd5a5bad465f0 category: feature bugzilla: 187162, https://gitee.com/openeuler/kernel/issues/I5FYKV CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=9591c3a34f7722bd77f42c98d76fd5a5bad465f0 ------------------------------------------------- Some filesystem's use a digest of their uuid for f_fsid. Create a simple wrapper for this open coded folding. Filesystems that have a non null uuid but use the block device number for f_fsid may also consider using this helper. [JK: Added missing asm/byteorder.h include] Link: https://lore.kernel.org/r/20210322173944.449469-2-amir73il@gmail.com Acked-by: Damien Le Moal Reviewed-by: Christian Brauner Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Signed-off-by: Baokun Li Reviewed-by: chengzhihao Reviewed-by: Zhang Yi Signed-off-by: Zheng Zengkai --- fs/ext2/super.c | 5 +---- fs/ext4/super.c | 5 +---- fs/zonefs/super.c | 5 +---- include/linux/statfs.h | 8 ++++++++ 4 files changed, 11 insertions(+), 12 deletions(-) diff --git a/fs/ext2/super.c b/fs/ext2/super.c index b6314d3c6a87..74e110289413 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1403,7 +1403,6 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf) struct super_block *sb = dentry->d_sb; struct ext2_sb_info *sbi = EXT2_SB(sb); struct ext2_super_block *es = sbi->s_es; - u64 fsid; spin_lock(&sbi->s_lock); @@ -1457,9 +1456,7 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf) buf->f_ffree = ext2_count_free_inodes(sb); es->s_free_inodes_count = cpu_to_le32(buf->f_ffree); buf->f_namelen = EXT2_NAME_LEN; - fsid = le64_to_cpup((void *)es->s_uuid) ^ - le64_to_cpup((void *)es->s_uuid + sizeof(u64)); - buf->f_fsid = u64_to_fsid(fsid); + buf->f_fsid = uuid_to_fsid(es->s_uuid); spin_unlock(&sbi->s_lock); return 0; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 5e992775fc30..26c938360895 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -6246,7 +6246,6 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; ext4_fsblk_t overhead = 0, resv_blocks; - u64 fsid; s64 bfree; resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters)); @@ -6267,9 +6266,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_files = le32_to_cpu(es->s_inodes_count); buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); buf->f_namelen = EXT4_NAME_LEN; - fsid = le64_to_cpup((void *)es->s_uuid) ^ - le64_to_cpup((void *)es->s_uuid + sizeof(u64)); - buf->f_fsid = u64_to_fsid(fsid); + buf->f_fsid = uuid_to_fsid(es->s_uuid); #ifdef CONFIG_QUOTA if (ext4_test_inode_flag(dentry->d_inode, EXT4_INODE_PROJINHERIT) && diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index e60759d8bb5f..2d1f5d4d12e0 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -1169,7 +1169,6 @@ static int zonefs_statfs(struct dentry *dentry, struct kstatfs *buf) struct super_block *sb = dentry->d_sb; struct zonefs_sb_info *sbi = ZONEFS_SB(sb); enum zonefs_ztype t; - u64 fsid; buf->f_type = ZONEFS_MAGIC; buf->f_bsize = sb->s_blocksize; @@ -1192,9 +1191,7 @@ static int zonefs_statfs(struct dentry *dentry, struct kstatfs *buf) spin_unlock(&sbi->s_lock); - fsid = le64_to_cpup((void *)sbi->s_uuid.b) ^ - le64_to_cpup((void *)sbi->s_uuid.b + sizeof(u64)); - buf->f_fsid = u64_to_fsid(fsid); + buf->f_fsid = uuid_to_fsid(sbi->s_uuid.b); return 0; } diff --git a/include/linux/statfs.h b/include/linux/statfs.h index 20f695b90aab..02c862686ea3 100644 --- a/include/linux/statfs.h +++ b/include/linux/statfs.h @@ -4,6 +4,7 @@ #include #include +#include struct kstatfs { long f_type; @@ -50,4 +51,11 @@ static inline __kernel_fsid_t u64_to_fsid(u64 v) return (__kernel_fsid_t){.val = {(u32)v, (u32)(v>>32)}}; } +/* Fold 16 bytes uuid to 64 bit fsid */ +static inline __kernel_fsid_t uuid_to_fsid(__u8 *uuid) +{ + return u64_to_fsid(le64_to_cpup((void *)uuid) ^ + le64_to_cpup((void *)(uuid + sizeof(u64)))); +} + #endif -- Gitee From a3fbc9d494df45ee9316918c4286bbe594faf6e4 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Tue, 12 Jul 2022 22:51:56 +0800 Subject: [PATCH 161/674] shmem: allow reporting fanotify events with file handles on tmpfs mainline inclusion from mainline-5.13-rc1 commit 59cda49ecf6c9a32fae4942420701b6e087204f6 category: feature bugzilla: 187162, https://gitee.com/openeuler/kernel/issues/I5FYKV CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=59cda49ecf6c9a32fae4942420701b6e087204f6 ------------------------------------------------- Since kernel v5.1, fanotify_init(2) supports the flag FAN_REPORT_FID for identifying objects using file handle and fsid in events. fanotify_mark(2) fails with -ENODEV when trying to set a mark on filesystems that report null f_fsid in stasfs(2). Use the digest of uuid as f_fsid for tmpfs to uniquely identify tmpfs objects as best as possible and allow setting an fanotify mark that reports events with file handles on tmpfs. Link: https://lore.kernel.org/r/20210322173944.449469-3-amir73il@gmail.com Acked-by: Hugh Dickins Reviewed-by: Christian Brauner Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Signed-off-by: Baokun Li Reviewed-by: chengzhihao Reviewed-by: Zhang Yi Signed-off-by: Zheng Zengkai --- mm/shmem.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/shmem.c b/mm/shmem.c index 9df016296347..201086030e72 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2869,6 +2869,9 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_ffree = sbinfo->free_inodes; } /* else leave those fields 0 like simple_statfs */ + + buf->f_fsid = uuid_to_fsid(dentry->d_sb->s_uuid.b); + return 0; } -- Gitee From 756a3804723a3dac3018968ad9e0cec674418709 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Tue, 12 Jul 2022 22:51:57 +0800 Subject: [PATCH 162/674] mm/filemap: fix UAF in find_lock_entries maillist inclusion category: bugfix bugzilla: 186821, https://gitee.com/openeuler/kernel/issues/I5G69G Reference: https://lore.kernel.org/all/20220707020938.2122198-1-liushixin2@huawei.com/ -------------------------------- Release refcount after xas_set to fix UAF which may cause panic like this: page:ffffea000491fa40 refcount:1 mapcount:0 mapping:0000000000000000 index:0x1 pfn:0x1247e9 head:ffffea000491fa00 order:3 compound_mapcount:0 compound_pincount:0 memcg:ffff888104f91091 flags: 0x2fffff80010200(slab|head|node=0|zone=2|lastcpupid=0x1fffff) ... page dumped because: VM_BUG_ON_PAGE(PageTail(page)) ------------[ cut here ]------------ kernel BUG at include/linux/page-flags.h:632! invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC KASAN CPU: 1 PID: 7642 Comm: sh Not tainted 5.15.51-dirty #26 ... Call Trace: __invalidate_mapping_pages+0xe7/0x540 drop_pagecache_sb+0x159/0x320 iterate_supers+0x120/0x240 drop_caches_sysctl_handler+0xaa/0xe0 proc_sys_call_handler+0x2b4/0x480 new_sync_write+0x3d6/0x5c0 vfs_write+0x446/0x7a0 ksys_write+0x105/0x210 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f52b5733130 ... This problem has been fixed on mainline by patch 6b24ca4a1a8d ("mm: Use multi-index entries in the page cache") since it deletes the related code. Fixes: 5c211ba29deb ("mm: add and use find_lock_entries") Signed-off-by: Liu Shixin Acked-by: Matthew Wilcox (Oracle) Signed-off-by: Greg Kroah-Hartman Conflicts: mm/filemap.c Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- mm/filemap.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 9653c1e0faef..ebae261f9df9 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1951,7 +1951,11 @@ unsigned find_lock_entries(struct address_space *mapping, pgoff_t start, rcu_read_lock(); while ((page = find_get_entry(&xas, end, XA_PRESENT))) { + unsigned long next_idx = xas.xa_index + 1; + if (!xa_is_value(page)) { + if (PageTransHuge(page)) + next_idx = page->index + thp_nr_pages(page); if (page->index < start) goto put; VM_BUG_ON_PAGE(page->index != xas.xa_index, page); @@ -1973,13 +1977,11 @@ unsigned find_lock_entries(struct address_space *mapping, pgoff_t start, put: put_page(page); next: - if (!xa_is_value(page) && PageTransHuge(page)) { - unsigned int nr_pages = thp_nr_pages(page); - + if (next_idx != xas.xa_index + 1) { /* Final THP may cross MAX_LFS_FILESIZE on 32-bit */ - xas_set(&xas, page->index + nr_pages); - if (xas.xa_index < nr_pages) + if (next_idx < xas.xa_index) break; + xas_set(&xas, next_idx); } } rcu_read_unlock(); -- Gitee From 2e8eb7a44f09ec9a3da21113ad2e13efcd07e7ba Mon Sep 17 00:00:00 2001 From: Luo Meng Date: Tue, 12 Jul 2022 22:51:58 +0800 Subject: [PATCH 163/674] tmpfs: fix undefined-behaviour in shmem_reconfigure() mainline inclusion from mainline-v5.19-rc3 commit d14f5efadd846dbde561bd734318de6a9e6b26e6 category: bugfix bugzilla: 186471 https://gitee.com/openeuler/kernel/issues/I5DRKR CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d14f5efadd846dbde561bd734318de6a9e6b26e6 ------------------------------------------------------------------------------- When shmem_reconfigure() calls __percpu_counter_compare(), the second parameter is unsigned long long. But in the definition of __percpu_counter_compare(), the second parameter is s64. So when __percpu_counter_compare() executes abs(count - rhs), UBSAN shows the following warning: ================================================================================ UBSAN: Undefined behaviour in lib/percpu_counter.c:209:6 signed integer overflow: 0 - -9223372036854775808 cannot be represented in type 'long long int' CPU: 1 PID: 9636 Comm: syz-executor.2 Tainted: G ---------r- - 4.18.0 #2 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 Call Trace: __dump_stack home/install/linux-rh-3-10/lib/dump_stack.c:77 [inline] dump_stack+0x125/0x1ae home/install/linux-rh-3-10/lib/dump_stack.c:117 ubsan_epilogue+0xe/0x81 home/install/linux-rh-3-10/lib/ubsan.c:159 handle_overflow+0x19d/0x1ec home/install/linux-rh-3-10/lib/ubsan.c:190 __percpu_counter_compare+0x124/0x140 home/install/linux-rh-3-10/lib/percpu_counter.c:209 percpu_counter_compare home/install/linux-rh-3-10/./include/linux/percpu_counter.h:50 [inline] shmem_remount_fs+0x1ce/0x6b0 home/install/linux-rh-3-10/mm/shmem.c:3530 do_remount_sb+0x11b/0x530 home/install/linux-rh-3-10/fs/super.c:888 do_remount home/install/linux-rh-3-10/fs/namespace.c:2344 [inline] do_mount+0xf8d/0x26b0 home/install/linux-rh-3-10/fs/namespace.c:2844 ksys_mount+0xad/0x120 home/install/linux-rh-3-10/fs/namespace.c:3075 __do_sys_mount home/install/linux-rh-3-10/fs/namespace.c:3089 [inline] __se_sys_mount home/install/linux-rh-3-10/fs/namespace.c:3086 [inline] __x64_sys_mount+0xbf/0x160 home/install/linux-rh-3-10/fs/namespace.c:3086 do_syscall_64+0xca/0x5c0 home/install/linux-rh-3-10/arch/x86/entry/common.c:298 entry_SYSCALL_64_after_hwframe+0x6a/0xdf RIP: 0033:0x46b5e9 Code: 5d db fa ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 2b db fa ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007f54d5f22c68 EFLAGS: 00000246 ORIG_RAX: 00000000000000a5 RAX: ffffffffffffffda RBX: 000000000077bf60 RCX: 000000000046b5e9 RDX: 0000000000000000 RSI: 0000000020000000 RDI: 0000000000000000 RBP: 000000000077bf60 R08: 0000000020000140 R09: 0000000000000000 R10: 00000000026740a4 R11: 0000000000000246 R12: 0000000000000000 R13: 00007ffd1fb1592f R14: 00007f54d5f239c0 R15: 000000000077bf6c ================================================================================ [akpm@linux-foundation.org: tweak error message text] Link: https://lkml.kernel.org/r/20220513025225.2678727-1-luomeng12@huawei.com Signed-off-by: Luo Meng Cc: Hugh Dickins Cc: Yu Kuai Signed-off-by: Andrew Morton Conflicts: mm/shmem.c Signed-off-by: ZhaoLong Wang Reviewed-by: Zhang Yi Signed-off-by: Zheng Zengkai --- mm/shmem.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/shmem.c b/mm/shmem.c index 201086030e72..f043682ba567 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3553,6 +3553,10 @@ static int shmem_reconfigure(struct fs_context *fc) spin_lock(&sbinfo->stat_lock); inodes = sbinfo->max_inodes - sbinfo->free_inodes; + if (ctx->blocks > S64_MAX) { + err = "Number of blocks too large"; + goto out; + } if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) { if (!sbinfo->max_blocks) { err = "Cannot retroactively limit size"; -- Gitee From 03c87a1c896c4b7c8797a8af2bbc020998e31c6d Mon Sep 17 00:00:00 2001 From: ZhaoLong Wang Date: Tue, 12 Jul 2022 22:51:59 +0800 Subject: [PATCH 164/674] tmpfs: fix the issue that the mount and remount results are inconsistent. hulk inclusion category: bugfix bugzilla: 187184, https://gitee.com/openeuler/kernel/issues/I5G9EK CVE: NA -------------------------------- An undefined-behavior issue has not been completely fixed since commit a70846a97e7b("tmpfs: fix undefined-behaviour in shmem_reconfigure()"). In the commit, check in the shmem_reconfigure() is added in remount process to avoid the Ubsan problem. However, the check is not added to the mount process. It causes inconsistent results between mount and remount. The operations to reproduce the problem in user mode as follows: If nr_blocks is set to 0x8000000000000000, the mounting is successful. # mount tmpfs /dev/shm/ -t tmpfs -o nr_blocks=0x8000000000000000 However, when -o remount is used, the mount fails because of the check in the shmem_reconfigure() # mount tmpfs /dev/shm/ -t tmpfs -o remount,nr_blocks=0x8000000000000000 mount: /dev/shm: mount point not mounted or bad option. Therefore, add checks in the shmem_parse_one() function and remove the check in shmem_reconfigure() to avoid this problem. Signed-off-by: ZhaoLong Wang Reviewed-by: Zhang Yi Signed-off-by: Zheng Zengkai --- mm/shmem.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index f043682ba567..ad2d68150ed2 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3432,7 +3432,7 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) break; case Opt_nr_blocks: ctx->blocks = memparse(param->string, &rest); - if (*rest) + if (*rest || ctx->blocks > S64_MAX) goto bad_value; ctx->seen |= SHMEM_SEEN_BLOCKS; break; @@ -3553,10 +3553,7 @@ static int shmem_reconfigure(struct fs_context *fc) spin_lock(&sbinfo->stat_lock); inodes = sbinfo->max_inodes - sbinfo->free_inodes; - if (ctx->blocks > S64_MAX) { - err = "Number of blocks too large"; - goto out; - } + if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) { if (!sbinfo->max_blocks) { err = "Cannot retroactively limit size"; -- Gitee From a5b6f7c60d6566217f8090e3b9c2da6af5c00105 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 12 Jul 2022 22:52:00 +0800 Subject: [PATCH 165/674] xen-netfront: restore __skb_queue_tail() positioning in xennet_get_responses() stable inclusion from stable-v5.10.129 commit 547b7c640df545a344358ede93e491a89194cdfa category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5GBF2?from=project-issue CVE: CVE-2022-33743 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?h=linux-5.10.y&id=547b7c640df545a344358ede93e491a89194cdfa -------------------------------- commit f63c2c2032c2e3caad9add3b82cc6e91c376fd26 upstream. The commit referenced below moved the invocation past the "next" label, without any explanation. In fact this allows misbehaving backends undue control over the domain the frontend runs in, as earlier detected errors require the skb to not be freed (it may be retained for later processing via xennet_move_rx_slot(), or it may simply be unsafe to have it freed). This is CVE-2022-33743 / XSA-405. Fixes: 6c5aa6fc4def ("xen networking: add basic XDP support for xen-netfront") Signed-off-by: Jan Beulich Reviewed-by: Juergen Gross Signed-off-by: Juergen Gross Signed-off-by: Greg Kroah-Hartman Signed-off-by: Ziyang Xuan Reviewed-by: Yue Haibing Reviewed-by: Xiu Jianfeng Reviewed-by: Wei Yongjun Signed-off-by: Zheng Zengkai --- drivers/net/xen-netfront.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index 1a69b5246133..a7ebf17f75a3 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -1057,8 +1057,10 @@ static int xennet_get_responses(struct netfront_queue *queue, } } rcu_read_unlock(); -next: + __skb_queue_tail(list, skb); + +next: if (!(rx->flags & XEN_NETRXF_more_data)) break; -- Gitee From 8da2810de868dd9d42b61a16cbff500f405479cf Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 12 Jul 2022 22:52:01 +0800 Subject: [PATCH 166/674] netfilter: nf_tables: stricter validation of element data mainline inclusion from mainline-v5.19-rc6 commit 7e6bc1f6cabcd30aba0b11219d8e01b952eacbb6 category: bugfix bugzilla: 187147, https://gitee.com/src-openeuler/kernel/issues/I5GCQH CVE: CVE-2022-34918 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=7e6bc1f6cabcd30aba0b11219d8e01b952eacbb6 -------------------------------- Make sure element data type and length do not mismatch the one specified by the set declaration. Fixes: 7d7402642eaf ("netfilter: nf_tables: variable sized set element keys / data") Reported-by: Hugues ANGUELKOV Signed-off-by: Pablo Neira Ayuso Signed-off-by: Lu Wei Reviewed-by: Yue Haibing Reviewed-by: Xiu Jianfeng Reviewed-by: Wei Yongjun Signed-off-by: Zheng Zengkai --- net/netfilter/nf_tables_api.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index ea162e36e0e4..560a93aad5b6 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4880,13 +4880,20 @@ static int nft_setelem_parse_data(struct nft_ctx *ctx, struct nft_set *set, struct nft_data *data, struct nlattr *attr) { + u32 dtype; int err; err = nft_data_init(ctx, data, NFT_DATA_VALUE_MAXLEN, desc, attr); if (err < 0) return err; - if (desc->type != NFT_DATA_VERDICT && desc->len != set->dlen) { + if (set->dtype == NFT_DATA_VERDICT) + dtype = NFT_DATA_VERDICT; + else + dtype = NFT_DATA_VALUE; + + if (dtype != desc->type || + set->dlen != desc->len) { nft_data_release(data, desc->type); return -EINVAL; } -- Gitee From 1938d44f19da38bd4d28bb89aaee5a75763d9ffd Mon Sep 17 00:00:00 2001 From: Roger Pau Monne Date: Tue, 12 Jul 2022 22:52:02 +0800 Subject: [PATCH 167/674] xen/blkfront: fix leaking data in shared pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from stable-v5.10.129 commit cfea428030be836d79a7690968232bb7fa4410f1 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5GLXT CVE: CVE-2022-26365 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?h=linux-5.10.y&id=cfea428030be836d79a7690968232bb7fa4410f1 -------------------------------- commit 2f446ffe9d737e9a844b97887919c4fda18246e7 upstream. When allocating pages to be used for shared communication with the backend always zero them, this avoids leaking unintended data present on the pages. This is CVE-2022-26365, part of XSA-403. Signed-off-by: Roger Pau Monné Reviewed-by: Jan Beulich Reviewed-by: Juergen Gross Signed-off-by: Juergen Gross Signed-off-by: Greg Kroah-Hartman Signed-off-by: ChenXiaoSong Reviewed-by: Xiu Jianfeng Reviewed-by: Jason Yan Signed-off-by: Zheng Zengkai --- drivers/block/xen-blkfront.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 47d4bb23d6f3..fffb7c3118b1 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -311,7 +311,7 @@ static int fill_grant_buffer(struct blkfront_ring_info *rinfo, int num) goto out_of_memory; if (info->feature_persistent) { - granted_page = alloc_page(GFP_NOIO); + granted_page = alloc_page(GFP_NOIO | __GFP_ZERO); if (!granted_page) { kfree(gnt_list_entry); goto out_of_memory; @@ -1753,7 +1753,7 @@ static int setup_blkring(struct xenbus_device *dev, for (i = 0; i < info->nr_ring_pages; i++) rinfo->ring_ref[i] = GRANT_INVALID_REF; - sring = alloc_pages_exact(ring_size, GFP_NOIO); + sring = alloc_pages_exact(ring_size, GFP_NOIO | __GFP_ZERO); if (!sring) { xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring"); return -ENOMEM; @@ -2293,7 +2293,8 @@ static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo) BUG_ON(!list_empty(&rinfo->indirect_pages)); for (i = 0; i < num; i++) { - struct page *indirect_page = alloc_page(GFP_KERNEL); + struct page *indirect_page = alloc_page(GFP_KERNEL | + __GFP_ZERO); if (!indirect_page) goto out_of_memory; list_add(&indirect_page->lru, &rinfo->indirect_pages); -- Gitee From a6baeccd14246eed67ffaafa52d3902d1c9fa9bf Mon Sep 17 00:00:00 2001 From: Roger Pau Monne Date: Tue, 12 Jul 2022 22:52:03 +0800 Subject: [PATCH 168/674] xen/netfront: fix leaking data in shared pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from stable-v5.10.129 commit 728d68bfe68d92eae1407b8a9edc7817d6227404 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5GLYP CVE: CVE-2022-33740 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?h=linux-5.10.y&id=728d68bfe68d92eae1407b8a9edc7817d6227404 -------------------------------- commit 307c8de2b02344805ebead3440d8feed28f2f010 upstream. When allocating pages to be used for shared communication with the backend always zero them, this avoids leaking unintended data present on the pages. This is CVE-2022-33740, part of XSA-403. Signed-off-by: Roger Pau Monné Reviewed-by: Jan Beulich Reviewed-by: Juergen Gross Signed-off-by: Juergen Gross Signed-off-by: Greg Kroah-Hartman Signed-off-by: ChenXiaoSong Reviewed-by: Xiu Jianfeng Reviewed-by: Jason Yan Signed-off-by: Zheng Zengkai --- drivers/net/xen-netfront.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index a7ebf17f75a3..881bc68e5402 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -273,7 +273,8 @@ static struct sk_buff *xennet_alloc_one_rx_buffer(struct netfront_queue *queue) if (unlikely(!skb)) return NULL; - page = page_pool_dev_alloc_pages(queue->page_pool); + page = page_pool_alloc_pages(queue->page_pool, + GFP_ATOMIC | __GFP_NOWARN | __GFP_ZERO); if (unlikely(!page)) { kfree_skb(skb); return NULL; -- Gitee From 5d6fe8fb20024bec55c269088b085a3706a0be16 Mon Sep 17 00:00:00 2001 From: Roger Pau Monne Date: Tue, 12 Jul 2022 22:52:04 +0800 Subject: [PATCH 169/674] xen/netfront: force data bouncing when backend is untrusted MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from stable-v5.10.129 commit 4923217af5742a796821272ee03f8d6de15c0cca category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5GLZZ CVE: CVE-2022-33741 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?h=linux-5.10.y&id=4923217af5742a796821272ee03f8d6de15c0cca -------------------------------- commit 4491001c2e0fa69efbb748c96ec96b100a5cdb7e upstream. Bounce all data on the skbs to be transmitted into zeroed pages if the backend is untrusted. This avoids leaking data present in the pages shared with the backend but not part of the skb fragments. This requires introducing a new helper in order to allocate skbs with a size multiple of XEN_PAGE_SIZE so we don't leak contiguous data on the granted pages. Reporting whether the backend is to be trusted can be done using a module parameter, or from the xenstore frontend path as set by the toolstack when adding the device. This is CVE-2022-33741, part of XSA-403. Signed-off-by: Roger Pau Monné Reviewed-by: Juergen Gross Signed-off-by: Juergen Gross Signed-off-by: Greg Kroah-Hartman Signed-off-by: ChenXiaoSong Reviewed-by: Xiu Jianfeng Reviewed-by: Jason Yan Signed-off-by: Zheng Zengkai --- drivers/net/xen-netfront.c | 49 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 47 insertions(+), 2 deletions(-) diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index 881bc68e5402..569f3c8e7b75 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -66,6 +66,10 @@ module_param_named(max_queues, xennet_max_queues, uint, 0644); MODULE_PARM_DESC(max_queues, "Maximum number of queues per virtual interface"); +static bool __read_mostly xennet_trusted = true; +module_param_named(trusted, xennet_trusted, bool, 0644); +MODULE_PARM_DESC(trusted, "Is the backend trusted"); + #define XENNET_TIMEOUT (5 * HZ) static const struct ethtool_ops xennet_ethtool_ops; @@ -175,6 +179,9 @@ struct netfront_info { /* Is device behaving sane? */ bool broken; + /* Should skbs be bounced into a zeroed buffer? */ + bool bounce; + atomic_t rx_gso_checksum_fixup; }; @@ -670,6 +677,33 @@ static int xennet_xdp_xmit(struct net_device *dev, int n, return n - drops; } +struct sk_buff *bounce_skb(const struct sk_buff *skb) +{ + unsigned int headerlen = skb_headroom(skb); + /* Align size to allocate full pages and avoid contiguous data leaks */ + unsigned int size = ALIGN(skb_end_offset(skb) + skb->data_len, + XEN_PAGE_SIZE); + struct sk_buff *n = alloc_skb(size, GFP_ATOMIC | __GFP_ZERO); + + if (!n) + return NULL; + + if (!IS_ALIGNED((uintptr_t)n->head, XEN_PAGE_SIZE)) { + WARN_ONCE(1, "misaligned skb allocated\n"); + kfree_skb(n); + return NULL; + } + + /* Set the data pointer */ + skb_reserve(n, headerlen); + /* Set the tail pointer and length */ + skb_put(n, skb->len); + + BUG_ON(skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len)); + + skb_copy_header(n, skb); + return n; +} #define MAX_XEN_SKB_FRAGS (65536 / XEN_PAGE_SIZE + 1) @@ -723,9 +757,13 @@ static netdev_tx_t xennet_start_xmit(struct sk_buff *skb, struct net_device *dev /* The first req should be at least ETH_HLEN size or the packet will be * dropped by netback. + * + * If the backend is not trusted bounce all data to zeroed pages to + * avoid exposing contiguous data on the granted page not belonging to + * the skb. */ - if (unlikely(PAGE_SIZE - offset < ETH_HLEN)) { - nskb = skb_copy(skb, GFP_ATOMIC); + if (np->bounce || unlikely(PAGE_SIZE - offset < ETH_HLEN)) { + nskb = bounce_skb(skb); if (!nskb) goto drop; dev_consume_skb_any(skb); @@ -2251,6 +2289,10 @@ static int talk_to_netback(struct xenbus_device *dev, info->netdev->irq = 0; + /* Check if backend is trusted. */ + info->bounce = !xennet_trusted || + !xenbus_read_unsigned(dev->nodename, "trusted", 1); + /* Check if backend supports multiple queues */ max_queues = xenbus_read_unsigned(info->xbdev->otherend, "multi-queue-max-queues", 1); @@ -2417,6 +2459,9 @@ static int xennet_connect(struct net_device *dev) return err; if (np->netback_has_xdp_headroom) pr_info("backend supports XDP headroom\n"); + if (np->bounce) + dev_info(&np->xbdev->dev, + "bouncing transmitted data to zeroed pages\n"); /* talk_to_netback() sets the correct number of queues */ num_queues = dev->real_num_tx_queues; -- Gitee From 52c7b5227826f6cbb5b2c3eb0ea19484dfa8d53d Mon Sep 17 00:00:00 2001 From: Roger Pau Monne Date: Tue, 12 Jul 2022 22:52:05 +0800 Subject: [PATCH 170/674] xen/blkfront: force data bouncing when backend is untrusted MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from stable-v5.10.129 commit cbbd2d2531539212ff090aecbea9877c996e6ce6 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5GM0S CVE: CVE-2022-33742 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?h=linux-5.10.y&id=cbbd2d2531539212ff090aecbea9877c996e6ce6 -------------------------------- commit 2400617da7eebf9167d71a46122828bc479d64c9 upstream. Split the current bounce buffering logic used with persistent grants into it's own option, and allow enabling it independently of persistent grants. This allows to reuse the same code paths to perform the bounce buffering required to avoid leaking contiguous data in shared pages not part of the request fragments. Reporting whether the backend is to be trusted can be done using a module parameter, or from the xenstore frontend path as set by the toolstack when adding the device. This is CVE-2022-33742, part of XSA-403. Signed-off-by: Roger Pau Monné Reviewed-by: Juergen Gross Signed-off-by: Juergen Gross Signed-off-by: Greg Kroah-Hartman Signed-off-by: ChenXiaoSong Reviewed-by: Xiu Jianfeng Reviewed-by: Jason Yan Signed-off-by: Zheng Zengkai --- drivers/block/xen-blkfront.c | 49 +++++++++++++++++++++++++----------- 1 file changed, 34 insertions(+), 15 deletions(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index fffb7c3118b1..abbb68b6d9bd 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -151,6 +151,10 @@ static unsigned int xen_blkif_max_ring_order; module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, 0444); MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the shared ring"); +static bool __read_mostly xen_blkif_trusted = true; +module_param_named(trusted, xen_blkif_trusted, bool, 0644); +MODULE_PARM_DESC(trusted, "Is the backend trusted"); + #define BLK_RING_SIZE(info) \ __CONST_RING_SIZE(blkif, XEN_PAGE_SIZE * (info)->nr_ring_pages) @@ -208,6 +212,7 @@ struct blkfront_info unsigned int feature_discard:1; unsigned int feature_secdiscard:1; unsigned int feature_persistent:1; + unsigned int bounce:1; unsigned int discard_granularity; unsigned int discard_alignment; /* Number of 4KB segments handled */ @@ -310,7 +315,7 @@ static int fill_grant_buffer(struct blkfront_ring_info *rinfo, int num) if (!gnt_list_entry) goto out_of_memory; - if (info->feature_persistent) { + if (info->bounce) { granted_page = alloc_page(GFP_NOIO | __GFP_ZERO); if (!granted_page) { kfree(gnt_list_entry); @@ -330,7 +335,7 @@ static int fill_grant_buffer(struct blkfront_ring_info *rinfo, int num) list_for_each_entry_safe(gnt_list_entry, n, &rinfo->grants, node) { list_del(&gnt_list_entry->node); - if (info->feature_persistent) + if (info->bounce) __free_page(gnt_list_entry->page); kfree(gnt_list_entry); i--; @@ -376,7 +381,7 @@ static struct grant *get_grant(grant_ref_t *gref_head, /* Assign a gref to this page */ gnt_list_entry->gref = gnttab_claim_grant_reference(gref_head); BUG_ON(gnt_list_entry->gref == -ENOSPC); - if (info->feature_persistent) + if (info->bounce) grant_foreign_access(gnt_list_entry, info); else { /* Grant access to the GFN passed by the caller */ @@ -400,7 +405,7 @@ static struct grant *get_indirect_grant(grant_ref_t *gref_head, /* Assign a gref to this page */ gnt_list_entry->gref = gnttab_claim_grant_reference(gref_head); BUG_ON(gnt_list_entry->gref == -ENOSPC); - if (!info->feature_persistent) { + if (!info->bounce) { struct page *indirect_page; /* Fetch a pre-allocated page to use for indirect grefs */ @@ -715,7 +720,7 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri .grant_idx = 0, .segments = NULL, .rinfo = rinfo, - .need_copy = rq_data_dir(req) && info->feature_persistent, + .need_copy = rq_data_dir(req) && info->bounce, }; /* @@ -1035,11 +1040,12 @@ static void xlvbd_flush(struct blkfront_info *info) { blk_queue_write_cache(info->rq, info->feature_flush ? true : false, info->feature_fua ? true : false); - pr_info("blkfront: %s: %s %s %s %s %s\n", + pr_info("blkfront: %s: %s %s %s %s %s %s %s\n", info->gd->disk_name, flush_info(info), "persistent grants:", info->feature_persistent ? "enabled;" : "disabled;", "indirect descriptors:", - info->max_indirect_segments ? "enabled;" : "disabled;"); + info->max_indirect_segments ? "enabled;" : "disabled;", + "bounce buffer:", info->bounce ? "enabled" : "disabled;"); } static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset) @@ -1273,7 +1279,7 @@ static void blkif_free_ring(struct blkfront_ring_info *rinfo) if (!list_empty(&rinfo->indirect_pages)) { struct page *indirect_page, *n; - BUG_ON(info->feature_persistent); + BUG_ON(info->bounce); list_for_each_entry_safe(indirect_page, n, &rinfo->indirect_pages, lru) { list_del(&indirect_page->lru); __free_page(indirect_page); @@ -1290,7 +1296,7 @@ static void blkif_free_ring(struct blkfront_ring_info *rinfo) 0, 0UL); rinfo->persistent_gnts_c--; } - if (info->feature_persistent) + if (info->bounce) __free_page(persistent_gnt->page); kfree(persistent_gnt); } @@ -1311,7 +1317,7 @@ static void blkif_free_ring(struct blkfront_ring_info *rinfo) for (j = 0; j < segs; j++) { persistent_gnt = rinfo->shadow[i].grants_used[j]; gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); - if (info->feature_persistent) + if (info->bounce) __free_page(persistent_gnt->page); kfree(persistent_gnt); } @@ -1501,7 +1507,7 @@ static int blkif_completion(unsigned long *id, data.s = s; num_sg = s->num_sg; - if (bret->operation == BLKIF_OP_READ && info->feature_persistent) { + if (bret->operation == BLKIF_OP_READ && info->bounce) { for_each_sg(s->sg, sg, num_sg, i) { BUG_ON(sg->offset + sg->length > PAGE_SIZE); @@ -1560,7 +1566,7 @@ static int blkif_completion(unsigned long *id, * Add the used indirect page back to the list of * available pages for indirect grefs. */ - if (!info->feature_persistent) { + if (!info->bounce) { indirect_page = s->indirect_grants[i]->page; list_add(&indirect_page->lru, &rinfo->indirect_pages); } @@ -1857,6 +1863,10 @@ static int talk_to_blkback(struct xenbus_device *dev, if (!info) return -ENODEV; + /* Check if backend is trusted. */ + info->bounce = !xen_blkif_trusted || + !xenbus_read_unsigned(dev->nodename, "trusted", 1); + max_page_order = xenbus_read_unsigned(info->xbdev->otherend, "max-ring-page-order", 0); ring_page_order = min(xen_blkif_max_ring_order, max_page_order); @@ -2283,10 +2293,10 @@ static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo) if (err) goto out_of_memory; - if (!info->feature_persistent && info->max_indirect_segments) { + if (!info->bounce && info->max_indirect_segments) { /* - * We are using indirect descriptors but not persistent - * grants, we need to allocate a set of pages that can be + * We are using indirect descriptors but don't have a bounce + * buffer, we need to allocate a set of pages that can be * used for mapping indirect grefs */ int num = INDIRECT_GREFS(grants) * BLK_RING_SIZE(info); @@ -2387,6 +2397,8 @@ static void blkfront_gather_backend_features(struct blkfront_info *info) info->feature_persistent = !!xenbus_read_unsigned(info->xbdev->otherend, "feature-persistent", 0); + if (info->feature_persistent) + info->bounce = true; indirect_segments = xenbus_read_unsigned(info->xbdev->otherend, "feature-max-indirect-segments", 0); @@ -2760,6 +2772,13 @@ static void blkfront_delay_work(struct work_struct *work) struct blkfront_info *info; bool need_schedule_work = false; + /* + * Note that when using bounce buffers but not persistent grants + * there's no need to run blkfront_delay_work because grants are + * revoked in blkif_completion or else an error is reported and the + * connection is closed. + */ + mutex_lock(&blkfront_mutex); list_for_each_entry(info, &info_list, info_list) { -- Gitee From d78aa4d9b77b77832dbe9fb1f03ac0054d5eaca8 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Mon, 22 Mar 2021 13:53:23 +0000 Subject: [PATCH 171/674] x86/cpufeatures: Enumerate #DB for bus lock detection mainline inclusion from mainline-v5.12-rc4 commit f21d4d3b97a8603567e5d4250bd75e8ebbd520af category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5G10C CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/ commit/?id=f21d4d3b97a8603567e5d4250bd75e8ebbd520af Intel-SIG: commit f21d4d3 x86/cpufeatures: Enumerate #DB for bus lock detection -------------------------------- A bus lock is acquired through either a split locked access to writeback (WB) memory or any locked access to non-WB memory. This is typically >1000 cycles slower than an atomic operation within a cache line. It also disrupts performance on other cores. Some CPUs have the ability to notify the kernel by a #DB trap after a user instruction acquires a bus lock and is executed. This allows the kernel to enforce user application throttling or mitigation. Both breakpoint and bus lock can trigger the #DB trap in the same instruction and the ordering of handling them is the kernel #DB handler's choice. The CPU feature flag to be shown in /proc/cpuinfo will be "bus_lock_detect". Signed-off-by: Fenghua Yu Signed-off-by: Thomas Gleixner Reviewed-by: Tony Luck Link: https://lore.kernel.org/r/20210322135325.682257-2-fenghua.yu@intel.com (cherry picked from commit f21d4d3b97a8603567e5d4250bd75e8ebbd520af) Signed-off-by: Ethan Zhao --- arch/x86/include/asm/cpufeatures.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 4da419226377..a0de1f6b8e62 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -353,6 +353,7 @@ #define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */ #define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */ #define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */ +#define X86_FEATURE_BUS_LOCK_DETECT (16*32+24) /* Bus Lock detect */ #define X86_FEATURE_CLDEMOTE (16*32+25) /* CLDEMOTE instruction */ #define X86_FEATURE_MOVDIRI (16*32+27) /* MOVDIRI instruction */ #define X86_FEATURE_MOVDIR64B (16*32+28) /* MOVDIR64B instruction */ -- Gitee From 3f9272839c21f5b6ff3e912c45fc1b0d605c7cd6 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Mon, 22 Mar 2021 13:53:24 +0000 Subject: [PATCH 172/674] x86/traps: Handle #DB for bus lock mainline inclusion from mainline-v5.12-rc4 commit ebb1064e7c2e90b56e4d40ab154ef9796060a1c3 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5G10C CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/ commit/?id=ebb1064e7c2e90b56e4d40ab154ef9796060a1c3 Intel-SIG: commit ebb1064 x86/traps: Handle #DB for bus lock -------------------------------- Bus locks degrade performance for the whole system, not just for the CPU that requested the bus lock. Two CPU features "#AC for split lock" and "#DB for bus lock" provide hooks so that the operating system may choose one of several mitigation strategies. bus lock feature to cover additional situations with new options to mitigate. split_lock_detect= #AC for split lock #DB for bus lock off Do nothing Do nothing warn Kernel OOPs Warn once per task and Warn once per task and and continues to run. disable future checking When both features are supported, warn in #AC fatal Kernel OOPs Send SIGBUS to user. Send SIGBUS to user When both features are supported, fatal in #AC ratelimit:N Do nothing Limit bus lock rate to N per second in the current non-root user. Default option is "warn". Hardware only generates #DB for bus lock detect when CPL>0 to avoid nested #DB from multiple bus locks while the first #DB is being handled. So no need to handle #DB for bus lock detected in the kernel. while #AC for split lock is enabled by split lock detection bit 29 in TEST_CTRL MSR. Both breakpoint and bus lock in the same instruction can trigger one #DB. The bus lock is handled before the breakpoint in the #DB handler. Delivery of #DB for bus lock in userspace clears DR6[11], which is set by the #DB handler right after reading DR6. Signed-off-by: Fenghua Yu Signed-off-by: Thomas Gleixner Reviewed-by: Tony Luck Link: https://lore.kernel.org/r/20210322135325.682257-3-fenghua.yu@intel.com (cherry picked from commit ebb1064e7c2e90b56e4d40ab154ef9796060a1c3) Signed-off-by: Ethan Zhao --- arch/x86/include/asm/cpu.h | 7 +- arch/x86/include/asm/msr-index.h | 1 + arch/x86/include/uapi/asm/debugreg.h | 1 + arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/cpu/intel.c | 113 ++++++++++++++++++++++----- arch/x86/kernel/traps.c | 4 + 6 files changed, 106 insertions(+), 22 deletions(-) diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index da78ccbd493b..0d7fc0e2bfc9 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -41,12 +41,13 @@ unsigned int x86_family(unsigned int sig); unsigned int x86_model(unsigned int sig); unsigned int x86_stepping(unsigned int sig); #ifdef CONFIG_CPU_SUP_INTEL -extern void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c); +extern void __init sld_setup(struct cpuinfo_x86 *c); extern void switch_to_sld(unsigned long tifn); extern bool handle_user_split_lock(struct pt_regs *regs, long error_code); extern bool handle_guest_split_lock(unsigned long ip); +extern void handle_bus_lock(struct pt_regs *regs); #else -static inline void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c) {} +static inline void __init sld_setup(struct cpuinfo_x86 *c) {} static inline void switch_to_sld(unsigned long tifn) {} static inline bool handle_user_split_lock(struct pt_regs *regs, long error_code) { @@ -57,6 +58,8 @@ static inline bool handle_guest_split_lock(unsigned long ip) { return false; } + +static inline void handle_bus_lock(struct pt_regs *regs) {} #endif #ifdef CONFIG_IA32_FEAT_CTL void init_ia32_feat_ctl(struct cpuinfo_x86 *c); diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 2b0af5eb5131..e2103a35b45a 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -289,6 +289,7 @@ #define DEBUGCTLMSR_LBR (1UL << 0) /* last branch recording */ #define DEBUGCTLMSR_BTF_SHIFT 1 #define DEBUGCTLMSR_BTF (1UL << 1) /* single-step on branches */ +#define DEBUGCTLMSR_BUS_LOCK_DETECT (1UL << 2) #define DEBUGCTLMSR_TR (1UL << 6) #define DEBUGCTLMSR_BTS (1UL << 7) #define DEBUGCTLMSR_BTINT (1UL << 8) diff --git a/arch/x86/include/uapi/asm/debugreg.h b/arch/x86/include/uapi/asm/debugreg.h index d95d080b30e3..0007ba077c0c 100644 --- a/arch/x86/include/uapi/asm/debugreg.h +++ b/arch/x86/include/uapi/asm/debugreg.h @@ -24,6 +24,7 @@ #define DR_TRAP3 (0x8) /* db3 */ #define DR_TRAP_BITS (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3) +#define DR_BUS_LOCK (0x800) /* bus_lock */ #define DR_STEP (0x4000) /* single-step */ #define DR_SWITCH (0x8000) /* task switch */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 4917c2698ac1..b06254f8643c 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1374,7 +1374,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) cpu_set_bug_bits(c); - cpu_set_core_cap_bits(c); + sld_setup(c); fpu__init_system(c); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 816fdbec795a..dfda4f42ec9b 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -43,9 +43,9 @@ enum split_lock_detect_state { }; /* - * Default to sld_off because most systems do not support split lock detection - * split_lock_setup() will switch this to sld_warn on systems that support - * split lock detect, unless there is a command line override. + * Default to sld_off because most systems do not support split lock detection. + * sld_state_setup() will switch this to sld_warn on systems that support + * split lock/bus lock detect, unless there is a command line override. */ static enum split_lock_detect_state sld_state __ro_after_init = sld_off; static u64 msr_test_ctrl_cache __ro_after_init; @@ -602,6 +602,7 @@ static void init_intel_misc_features(struct cpuinfo_x86 *c) } static void split_lock_init(void); +static void bus_lock_init(void); static void init_intel(struct cpuinfo_x86 *c) { @@ -719,6 +720,9 @@ static void init_intel(struct cpuinfo_x86 *c) tsx_disable(); split_lock_init(); + bus_lock_init(); + + intel_init_thermal(c); } #ifdef CONFIG_X86_32 @@ -1017,16 +1021,15 @@ static bool split_lock_verify_msr(bool on) return ctrl == tmp; } -static void __init split_lock_setup(void) +static void __init sld_state_setup(void) { enum split_lock_detect_state state = sld_warn; char arg[20]; int i, ret; - if (!split_lock_verify_msr(false)) { - pr_info("MSR access failed: Disabled\n"); + if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) && + !boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) return; - } ret = cmdline_find_option(boot_command_line, "split_lock_detect", arg, sizeof(arg)); @@ -1038,17 +1041,14 @@ static void __init split_lock_setup(void) } } } + sld_state = state; +} - switch (state) { - case sld_off: - pr_info("disabled\n"); +static void __init __split_lock_setup(void) +{ + if (!split_lock_verify_msr(false)) { + pr_info("MSR access failed: Disabled\n"); return; - case sld_warn: - pr_info("warning about user-space split_locks\n"); - break; - case sld_fatal: - pr_info("sending SIGBUS on user-space split_locks\n"); - break; } rdmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache); @@ -1058,7 +1058,9 @@ static void __init split_lock_setup(void) return; } - sld_state = state; + /* Restore the MSR to its cached value. */ + wrmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache); + setup_force_cpu_cap(X86_FEATURE_SPLIT_LOCK_DETECT); } @@ -1115,6 +1117,29 @@ bool handle_guest_split_lock(unsigned long ip) } EXPORT_SYMBOL_GPL(handle_guest_split_lock); +static void bus_lock_init(void) +{ + u64 val; + + /* + * Warn and fatal are handled by #AC for split lock if #AC for + * split lock is supported. + */ + if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) || + (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) && + (sld_state == sld_warn || sld_state == sld_fatal)) || + sld_state == sld_off) + return; + + /* + * Enable #DB for bus lock. All bus locks are handled in #DB except + * split locks are handled in #AC in the fatal case. + */ + rdmsrl(MSR_IA32_DEBUGCTLMSR, val); + val |= DEBUGCTLMSR_BUS_LOCK_DETECT; + wrmsrl(MSR_IA32_DEBUGCTLMSR, val); +} + bool handle_user_split_lock(struct pt_regs *regs, long error_code) { if ((regs->flags & X86_EFLAGS_AC) || sld_state == sld_fatal) @@ -1123,6 +1148,21 @@ bool handle_user_split_lock(struct pt_regs *regs, long error_code) return true; } +void handle_bus_lock(struct pt_regs *regs) +{ + switch (sld_state) { + case sld_off: + break; + case sld_warn: + pr_warn_ratelimited("#DB: %s/%d took a bus_lock trap at address: 0x%lx\n", + current->comm, current->pid, regs->ip); + break; + case sld_fatal: + force_sig_fault(SIGBUS, BUS_ADRALN, NULL); + break; + } +} + /* * This function is called only when switching between tasks with * different split-lock detection modes. It sets the MSR for the @@ -1163,7 +1203,7 @@ static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = { {} }; -void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c) +static void __init split_lock_setup(struct cpuinfo_x86 *c) { const struct x86_cpu_id *m; u64 ia32_core_caps; @@ -1190,5 +1230,40 @@ void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c) } cpu_model_supports_sld = true; - split_lock_setup(); + __split_lock_setup(); +} + +static void sld_state_show(void) +{ + if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) && + !boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) + return; + + switch (sld_state) { + case sld_off: + pr_info("disabled\n"); + break; + case sld_warn: + if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) + pr_info("#AC: crashing the kernel on kernel split_locks and warning on user-space split_locks\n"); + else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) + pr_info("#DB: warning on user-space bus_locks\n"); + break; + case sld_fatal: + if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) { + pr_info("#AC: crashing the kernel on kernel split_locks and sending SIGBUS on user-space split_locks\n"); + } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) { + pr_info("#DB: sending SIGBUS on user-space bus_locks%s\n", + boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) ? + " from non-WB" : ""); + } + break; + } +} + +void __init sld_setup(struct cpuinfo_x86 *c) +{ + split_lock_setup(c); + sld_state_setup(); + sld_state_show(); } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 696ec85164e6..68fa31d334e3 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -1023,6 +1023,10 @@ static __always_inline void exc_debug_user(struct pt_regs *regs, goto out_irq; } + /* #DB for bus lock can only be triggered from userspace. */ + if (dr6 & DR_BUS_LOCK) + handle_bus_lock(regs); + /* Add the virtual_dr6 bits for signals. */ dr6 |= current->thread.virtual_dr6; if (dr6 & (DR_STEP | DR_TRAP_BITS) || icebp) -- Gitee From 82b4f86723f1bef98f128d4bd8b2c75d57260557 Mon Sep 17 00:00:00 2001 From: Kyung Min Park Date: Thu, 16 Jun 2022 09:21:39 +0300 Subject: [PATCH 173/674] Intel: 5G ISA: x86: Enumerate AVX512 FP16 CPUID feature flag mainline inclusion from mainline-5.11 commit e1b35da5e624f8b09d2e98845c2e4c84b179d9a4 category: feature feature: SPR New instructions bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596EH CVE: N/A Intel-SIG: commit e1b35da5e6 x86: Enumerate AVX512 FP16 CPUID feature flag Backport for SPR core 5G ISA support. ------------------------------------- Enumerate AVX512 Half-precision floating point (FP16) CPUID feature flag. Compared with using FP32, using FP16 cut the number of bits required for storage in half, reducing the exponent from 8 bits to 5, and the mantissa from 23 bits to 10. Using FP16 also enables developers to train and run inference on deep learning models fast when all precision or magnitude (FP32) is not needed. A processor supports AVX512 FP16 if CPUID.(EAX=7,ECX=0):EDX[bit 23] is present. The AVX512 FP16 requires AVX512BW feature be implemented since the instructions for manipulating 32bit masks are associated with AVX512BW. The only in-kernel usage of this is kvm passthrough. The CPU feature flag is shown as "avx512_fp16" in /proc/cpuinfo. Signed-off-by: Kyung Min Park kyung.min.park@intel.com Acked-by: Dave Hansen dave.hansen@intel.com Reviewed-by: Tony Luck tony.luck@intel.com Message-Id: 20201208033441.28207-2-kyung.min.park@intel.com Acked-by: Borislav Petkov bp@suse.de Signed-off-by: Paolo Bonzini pbonzini@redhat.com Signed-off-by: Luming Yu luming.yu@intel.com --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/kernel/cpu/cpuid-deps.c | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 4da419226377..81f0c811604e 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -376,6 +376,7 @@ #define X86_FEATURE_TSXLDTRK (18*32+16) /* TSX Suspend Load Address Tracking */ #define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */ #define X86_FEATURE_ARCH_LBR (18*32+19) /* Intel ARCH LBR */ +#define X86_FEATURE_AVX512_FP16 (18*32+23) /* AVX512 FP16 */ #define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */ #define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */ #define X86_FEATURE_FLUSH_L1D (18*32+28) /* Flush L1D cache */ diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index d502241995a3..42af31b64c2c 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -69,6 +69,7 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_CQM_MBM_TOTAL, X86_FEATURE_CQM_LLC }, { X86_FEATURE_CQM_MBM_LOCAL, X86_FEATURE_CQM_LLC }, { X86_FEATURE_AVX512_BF16, X86_FEATURE_AVX512VL }, + { X86_FEATURE_AVX512_FP16, X86_FEATURE_AVX512BW }, { X86_FEATURE_ENQCMD, X86_FEATURE_XSAVES }, { X86_FEATURE_PER_THREAD_MBA, X86_FEATURE_MBA }, {} -- Gitee From de9dc9cae0c6df31ec992234773ce452c977834e Mon Sep 17 00:00:00 2001 From: Kyung Min Park Date: Thu, 16 Jun 2022 14:56:12 +0300 Subject: [PATCH 174/674] Intel: AVX VNNI: x86: Enumerate AVX Vector Neural Network instructions mainline inclusion from mainline-5.11 commit b85a0425d8056f3bd8d0a94ecdddf2a39d32a801 category: feature feature: SPR New instructions bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596EH CVE: N/A Intel-SIG: commit b85a0425d80 x86: Enumerate AVX Vector Neural Network instructions Backport for SPR core AVX VNNI support. ---------------------------- Add AVX version of the Vector Neural Network (VNNI) Instructions. A processor supports AVX VNNI instructions if CPUID.0x07.0x1:EAX[4] is present. The following instructions are available when this feature is present. 1. VPDPBUS: Multiply and Add Unsigned and Signed Bytes 2. VPDPBUSDS: Multiply and Add Unsigned and Signed Bytes with Saturation 3. VPDPWSSD: Multiply and Add Signed Word Integers 4. VPDPWSSDS: Multiply and Add Signed Integers with Saturation The only in-kernel usage of this is kvm passthrough. The CPU feature flag is shown as "avx_vnni" in /proc/cpuinfo. This instruction is currently documented in the latest "extensions" manual (ISE). It will appear in the "main" manual (SDM) in the future. Signed-off-by: Kyung Min Park Signed-off-by: Yang Zhong Reviewed-by: Tony Luck Message-Id: <20210105004909.42000-2-yang.zhong@intel.com> Acked-by: Borislav Petkov Signed-off-by: Paolo Bonzini Signed-off-by: Luming Yu --- arch/x86/include/asm/cpufeatures.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 81f0c811604e..51e57b620461 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -293,6 +293,7 @@ #define X86_FEATURE_PER_THREAD_MBA (11*32+ 7) /* "" Per-thread Memory Bandwidth Allocation */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ +#define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ #define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */ /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ -- Gitee From 58276802a77ed31f164b4a97d2204a7d6ee2c4fe Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Mon, 22 Mar 2021 13:53:25 +0000 Subject: [PATCH 175/674] Documentation/admin-guide: Change doc for split_lock_detect parameter mainline inclusion from mainline-v5.12-rc4 commit ebca17707e38f2050b188d837bd4646b29a1b0c2 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5G10C CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/ commit/?id=ebca17707e38f2050b188d837bd4646b29a1b0c2 Intel-SIG: commit ebca177 Documentation/admin-guide: Change doc for split_lock_ detect parameter -------------------------------- Since #DB for bus lock detect changes the split_lock_detect parameter, update the documentation for the changes. Signed-off-by: Fenghua Yu Signed-off-by: Thomas Gleixner Reviewed-by: Tony Luck Acked-by: Randy Dunlap Link: https://lore.kernel.org/r/20210322135325.682257-4-fenghua.yu@intel.com (cherry picked from commit ebca17707e38f2050b188d837bd4646b29a1b0c2) Signed-off-by: Ethan Zhao --- .../admin-guide/kernel-parameters.txt | 22 ++++++++++++++----- 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 2b04cf8fbab4..fbbe09ac2f86 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5275,27 +5275,37 @@ spia_peddr= split_lock_detect= - [X86] Enable split lock detection + [X86] Enable split lock detection or bus lock detection When enabled (and if hardware support is present), atomic instructions that access data across cache line - boundaries will result in an alignment check exception. + boundaries will result in an alignment check exception + for split lock detection or a debug exception for + bus lock detection. off - not enabled - warn - the kernel will emit rate limited warnings + warn - the kernel will emit rate-limited warnings about applications triggering the #AC - exception. This mode is the default on CPUs - that supports split lock detection. + exception or the #DB exception. This mode is + the default on CPUs that support split lock + detection or bus lock detection. Default + behavior is by #AC if both features are + enabled in hardware. fatal - the kernel will send SIGBUS to applications - that trigger the #AC exception. + that trigger the #AC exception or the #DB + exception. Default behavior is by #AC if + both features are enabled in hardware. If an #AC exception is hit in the kernel or in firmware (i.e. not while executing in user mode) the kernel will oops in either "warn" or "fatal" mode. + #DB exception for bus lock is triggered only when + CPL > 0. + srbds= [X86,INTEL] Control the Special Register Buffer Data Sampling (SRBDS) mitigation. -- Gitee From 9fbda1057252b584a3b24a507740aac38fe41c7d Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Mon, 19 Apr 2021 21:49:55 +0000 Subject: [PATCH 176/674] Documentation/x86: Add buslock.rst mainline inclusion from mainline-v5.12-rc4 commit 1897907cca5aa22cdfcdb7fb8f0644a6add0877d category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5G10C CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/ commit/?id=1897907cca5aa22cdfcdb7fb8f0644a6add0877d Intel-SIG: commit 1897907 Documentation/x86: Add buslock.rst -------------------------------- Add buslock.rst to explain bus lock problem and how to detect and handle it. [ tglx: Included it into index.rst and added the missing include ... ] Signed-off-by: Fenghua Yu Signed-off-by: Thomas Gleixner Reviewed-by: Tony Luck Link: https://lore.kernel.org/r/20210419214958.4035512-2-fenghua.yu@intel.com (cherry picked from commit 1897907cca5aa22cdfcdb7fb8f0644a6add0877d) Signed-off-by: Ethan Zhao --- Documentation/x86/buslock.rst | 104 ++++++++++++++++++++++++++++++++++ Documentation/x86/index.rst | 1 + 2 files changed, 105 insertions(+) create mode 100644 Documentation/x86/buslock.rst diff --git a/Documentation/x86/buslock.rst b/Documentation/x86/buslock.rst new file mode 100644 index 000000000000..159ff6ba830e --- /dev/null +++ b/Documentation/x86/buslock.rst @@ -0,0 +1,104 @@ +.. SPDX-License-Identifier: GPL-2.0 + +.. include:: + +=============================== +Bus lock detection and handling +=============================== + +:Copyright: |copy| 2021 Intel Corporation +:Authors: - Fenghua Yu + - Tony Luck + +Problem +======= + +A split lock is any atomic operation whose operand crosses two cache lines. +Since the operand spans two cache lines and the operation must be atomic, +the system locks the bus while the CPU accesses the two cache lines. + +A bus lock is acquired through either split locked access to writeback (WB) +memory or any locked access to non-WB memory. This is typically thousands of +cycles slower than an atomic operation within a cache line. It also disrupts +performance on other cores and brings the whole system to its knees. + +Detection +========= + +Intel processors may support either or both of the following hardware +mechanisms to detect split locks and bus locks. + +#AC exception for split lock detection +-------------------------------------- + +Beginning with the Tremont Atom CPU split lock operations may raise an +Alignment Check (#AC) exception when a split lock operation is attemped. + +#DB exception for bus lock detection +------------------------------------ + +Some CPUs have the ability to notify the kernel by an #DB trap after a user +instruction acquires a bus lock and is executed. This allows the kernel to +terminate the application or to enforce throttling. + +Software handling +================= + +The kernel #AC and #DB handlers handle bus lock based on the kernel +parameter "split_lock_detect". Here is a summary of different options: + ++------------------+----------------------------+-----------------------+ +|split_lock_detect=|#AC for split lock |#DB for bus lock | ++------------------+----------------------------+-----------------------+ +|off |Do nothing |Do nothing | ++------------------+----------------------------+-----------------------+ +|warn |Kernel OOPs |Warn once per task and | +|(default) |Warn once per task and |and continues to run. | +| |disable future checking | | +| |When both features are | | +| |supported, warn in #AC | | ++------------------+----------------------------+-----------------------+ +|fatal |Kernel OOPs |Send SIGBUS to user. | +| |Send SIGBUS to user | | +| |When both features are | | +| |supported, fatal in #AC | | ++------------------+----------------------------+-----------------------+ + +Usages +====== + +Detecting and handling bus lock may find usages in various areas: + +It is critical for real time system designers who build consolidated real +time systems. These systems run hard real time code on some cores and run +"untrusted" user processes on other cores. The hard real time cannot afford +to have any bus lock from the untrusted processes to hurt real time +performance. To date the designers have been unable to deploy these +solutions as they have no way to prevent the "untrusted" user code from +generating split lock and bus lock to block the hard real time code to +access memory during bus locking. + +It's also useful for general computing to prevent guests or user +applications from slowing down the overall system by executing instructions +with bus lock. + + +Guidance +======== +off +--- + +Disable checking for split lock and bus lock. This option can be useful if +there are legacy applications that trigger these events at a low rate so +that mitigation is not needed. + +warn +---- + +A warning is emitted when a bus lock is detected which allows to identify +the offending application. This is the default behavior. + +fatal +----- + +In this case, the bus lock is not tolerated and the process is killed. diff --git a/Documentation/x86/index.rst b/Documentation/x86/index.rst index 647a570d4931..295a59cbaa69 100644 --- a/Documentation/x86/index.rst +++ b/Documentation/x86/index.rst @@ -29,6 +29,7 @@ x86-specific Documentation microcode resctrl_ui tsx_async_abort + buslock usb-legacy-support i386/index x86_64/index -- Gitee From 25f4c3c5f204a006fbcc003054c3cc08932b6e0c Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Mon, 19 Apr 2021 21:49:57 +0000 Subject: [PATCH 177/674] Documentation/admin-guide: Add bus lock ratelimit mainline inclusion from mainline-v5.12-rc4 commit 9d839c280b64817345c2fa462c0027a9bd742361 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5G10C CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/ commit/?id=9d839c280b64817345c2fa462c0027a9bd742361 Intel-SIG: commit 9d839c Documentation/admin-guide: Add bus lock ratelimit -------------------------------- Since bus lock rate limit changes the split_lock_detect parameter, update the documentation for the change. Signed-off-by: Fenghua Yu Signed-off-by: Thomas Gleixner Reviewed-by: Tony Luck Link: https://lore.kernel.org/r/20210419214958.4035512-4-fenghua.yu@intel.com (cherry picked from commit 9d839c280b64817345c2fa462c0027a9bd742361) Signed-off-by: Ethan Zhao --- Documentation/admin-guide/kernel-parameters.txt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index fbbe09ac2f86..4241caac8b83 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5298,6 +5298,14 @@ exception. Default behavior is by #AC if both features are enabled in hardware. + ratelimit:N - + Set system wide rate limit to N bus locks + per second for bus lock detection. + 0 < N <= 1000. + + N/A for split lock detection. + + If an #AC exception is hit in the kernel or in firmware (i.e. not while executing in user mode) the kernel will oops in either "warn" or "fatal" -- Gitee From afc7742cc862c618ea92366a49b9437e51ad387a Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Mon, 19 Apr 2021 21:49:56 +0000 Subject: [PATCH 178/674] x86/bus_lock: Set rate limit for bus lock mainline inclusion from mainline-v5.12-rc4 commit ef4ae6e4413159d2329a172c12e9274e2cb0a3a8 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5G10C CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/ commit/?id=ef4ae6e4413159d2329a172c12e9274e2cb0a3a8 Intel-SIG: commit ef4ae6e x86/bus_lock: Set rate limit for bus lock_ -------------------------------- A bus lock can be thousands of cycles slower than atomic operation within one cache line. It also disrupts performance on other cores. Malicious users can generate multiple bus locks to degrade the whole system performance. The current mitigation is to kill the offending process, but for certain scenarios it's desired to identify and throttle the offending application. Add a system wide rate limit for bus locks. When the system detects bus locks at a rate higher than N/sec (where N can be set by the kernel boot argument in the range [1..1000]) any task triggering a bus lock will be forced to sleep for at least 20ms until the overall system rate of bus locks drops below the threshold. Signed-off-by: Fenghua Yu Signed-off-by: Thomas Gleixner Reviewed-by: Tony Luck Link: https://lore.kernel.org/r/20210419214958.4035512-3-fenghua.yu@intel.com (cherry picked from commit ef4ae6e4413159d2329a172c12e9274e2cb0a3a8) Signed-off-by: Ethan Zhao --- arch/x86/kernel/cpu/intel.c | 42 +++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index dfda4f42ec9b..1780db8192a0 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -40,6 +41,7 @@ enum split_lock_detect_state { sld_off = 0, sld_warn, sld_fatal, + sld_ratelimit, }; /* @@ -996,13 +998,30 @@ static const struct { { "off", sld_off }, { "warn", sld_warn }, { "fatal", sld_fatal }, + { "ratelimit:", sld_ratelimit }, }; +static struct ratelimit_state bld_ratelimit; + static inline bool match_option(const char *arg, int arglen, const char *opt) { - int len = strlen(opt); + int len = strlen(opt), ratelimit; + + if (strncmp(arg, opt, len)) + return false; + + /* + * Min ratelimit is 1 bus lock/sec. + * Max ratelimit is 1000 bus locks/sec. + */ + if (sscanf(arg, "ratelimit:%d", &ratelimit) == 1 && + ratelimit > 0 && ratelimit <= 1000) { + ratelimit_state_init(&bld_ratelimit, HZ, ratelimit); + ratelimit_set_flags(&bld_ratelimit, RATELIMIT_MSG_ON_RELEASE); + return true; + } - return len == arglen && !strncmp(arg, opt, len); + return len == arglen; } static bool split_lock_verify_msr(bool on) @@ -1081,6 +1100,15 @@ static void sld_update_msr(bool on) static void split_lock_init(void) { + /* + * #DB for bus lock handles ratelimit and #AC for split lock is + * disabled. + */ + if (sld_state == sld_ratelimit) { + split_lock_verify_msr(false); + return; + } + if (cpu_model_supports_sld) split_lock_verify_msr(sld_state != sld_off); } @@ -1153,6 +1181,12 @@ void handle_bus_lock(struct pt_regs *regs) switch (sld_state) { case sld_off: break; + case sld_ratelimit: + /* Enforce no more than bld_ratelimit bus locks/sec. */ + while (!__ratelimit(&bld_ratelimit)) + msleep(20); + /* Warn on the bus lock. */ + fallthrough; case sld_warn: pr_warn_ratelimited("#DB: %s/%d took a bus_lock trap at address: 0x%lx\n", current->comm, current->pid, regs->ip); @@ -1258,6 +1292,10 @@ static void sld_state_show(void) " from non-WB" : ""); } break; + case sld_ratelimit: + if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) + pr_info("#DB: setting system wide bus lock rate limit to %u/sec\n", bld_ratelimit.burst); + break; } } -- Gitee From a7a9a47d38a5e6d0e32e9167e51986b645502532 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Thu, 10 Mar 2022 12:48:53 -0800 Subject: [PATCH 179/674] x86/split_lock: Make life miserable for split lockers mainline inclusion from mainline-v5.18-rc4 commit b041b525dab95352fbd666b14dc73ab898df465f category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5G10C CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/ commit/?id=b041b525dab95352fbd666b14dc73ab898df465f Intel-SIG: commit b041b52 x86/split_lock: Make life miserable for split lockers -------------------------------- In https://lore.kernel.org/all/87y22uujkm.ffs@tglx/ Thomas said: Its's simply wishful thinking that stuff gets fixed because of a WARN_ONCE(). This has never worked. The only thing which works is to make stuff fail hard or slow it down in a way which makes it annoying enough to users to complain. He was talking about WBINVD. But it made me think about how we use the split lock detection feature in Linux. Existing code has three options for applications: 1) Don't enable split lock detection (allow arbitrary split locks) 2) Warn once when a process uses split lock, but let the process keep running with split lock detection disabled 3) Kill process that use split locks Option 2 falls into the "wishful thinking" territory that Thomas warns does nothing. But option 3 might not be viable in a situation with legacy applications that need to run. Hence make option 2 much stricter to "slow it down in a way which makes it annoying". Primary reason for this change is to provide better quality of service to the rest of the applications running on the system. Internal testing shows that even with many processes splitting locks, performance for the rest of the system is much more responsive. The new "warn" mode operates like this. When an application tries to execute a bus lock the #AC handler. 1) Delays (interruptibly) 10 ms before moving to next step. 2) Blocks (interruptibly) until it can get the semaphore If interrupted, just return. Assume the signal will either kill the task, or direct execution away from the instruction that is trying to get the bus lock. 3) Disables split lock detection for the current core 4) Schedules a work queue to re-enable split lock detect in 2 jiffies 5) Returns The work queue that re-enables split lock detection also releases the semaphore. There is a corner case where a CPU may be taken offline while split lock detection is disabled. A CPU hotplug handler handles this case. Old behaviour was to only print the split lock warning on the first occurrence of a split lock from a task. Preserve that by adding a flag to the task structure that suppresses subsequent split lock messages from that task. Signed-off-by: Tony Luck Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20220310204854.31752-2-tony.luck@intel.com (cherry picked from commit b041b525dab95352fbd666b14dc73ab898df465f) Signed-off-by: Ethan Zhao --- arch/x86/kernel/cpu/intel.c | 63 +++++++++++++++++++++++++++++++------ include/linux/sched.h | 3 ++ kernel/fork.c | 5 +++ 3 files changed, 61 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 1780db8192a0..87f07093df53 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -7,10 +7,13 @@ #include #include #include +#include #include #include #include +#include #include +#include #include #include @@ -1003,6 +1006,8 @@ static const struct { static struct ratelimit_state bld_ratelimit; +static DEFINE_SEMAPHORE(buslock_sem); + static inline bool match_option(const char *arg, int arglen, const char *opt) { int len = strlen(opt), ratelimit; @@ -1113,18 +1118,52 @@ static void split_lock_init(void) split_lock_verify_msr(sld_state != sld_off); } +static void __split_lock_reenable(struct work_struct *work) +{ + sld_update_msr(true); + up(&buslock_sem); +} + +/* + * If a CPU goes offline with pending delayed work to re-enable split lock + * detection then the delayed work will be executed on some other CPU. That + * handles releasing the buslock_sem, but because it executes on a + * different CPU probably won't re-enable split lock detection. This is a + * problem on HT systems since the sibling CPU on the same core may then be + * left running with split lock detection disabled. + * + * Unconditionally re-enable detection here. + */ +static int splitlock_cpu_offline(unsigned int cpu) +{ + sld_update_msr(true); + + return 0; +} + +static DECLARE_DELAYED_WORK(split_lock_reenable, __split_lock_reenable); + static void split_lock_warn(unsigned long ip) { - pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n", - current->comm, current->pid, ip); + int cpu; - /* - * Disable the split lock detection for this task so it can make - * progress and set TIF_SLD so the detection is re-enabled via - * switch_to_sld() when the task is scheduled out. - */ + if (!current->reported_split_lock) + pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n", + current->comm, current->pid, ip); + current->reported_split_lock = 1; + + /* misery factor #1, sleep 10ms before trying to execute split lock */ + if (msleep_interruptible(10) > 0) + return; + /* Misery factor #2, only allow one buslocked disabled core at a time */ + if (down_interruptible(&buslock_sem) == -EINTR) + return; + cpu = get_cpu(); + schedule_delayed_work_on(cpu, &split_lock_reenable, 2); + + /* Disable split lock detection on this CPU to make progress */ sld_update_msr(false); - set_tsk_thread_flag(current, TIF_SLD); + put_cpu(); } bool handle_guest_split_lock(unsigned long ip) @@ -1278,10 +1317,14 @@ static void sld_state_show(void) pr_info("disabled\n"); break; case sld_warn: - if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) + if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) { pr_info("#AC: crashing the kernel on kernel split_locks and warning on user-space split_locks\n"); - else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) + if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, + "x86/splitlock", NULL, splitlock_cpu_offline) < 0) + pr_warn("No splitlock CPU offline handler\n"); + } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) { pr_info("#DB: warning on user-space bus_locks\n"); + } break; case sld_fatal: if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) { diff --git a/include/linux/sched.h b/include/linux/sched.h index 47f462040f4d..764b2a927ef9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -865,6 +865,9 @@ struct task_struct { /* Stalled due to lack of memory */ unsigned in_memstall:1; #endif +#ifdef CONFIG_CPU_SUP_INTEL + unsigned reported_split_lock:1; +#endif unsigned long atomic_flags; /* Flags requiring atomic access. */ diff --git a/kernel/fork.c b/kernel/fork.c index 0fb86b65ae60..7a2395ade1d4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -946,6 +946,11 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) #ifdef CONFIG_MEMCG tsk->active_memcg = NULL; #endif + +#ifdef CONFIG_CPU_SUP_INTEL + tsk->reported_split_lock = 0; +#endif + return tsk; free_stack: -- Gitee From ba77b6ebe736ec84ef21efb653f06297a6e0d989 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Wed, 19 May 2021 10:51:43 +0200 Subject: [PATCH 180/674] docs: virt: api.rst: fix a pointer to SGX documentation mainline inclusion from mainline-5.14 commit 0a5fab9f085880dbd7f9b0055c74936ca8b64fc1 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK CVE: NA Intel-SIG: commit 0a5fab9f0858 docs: virt: api.rst: fix a pointer to SGX documentation. Backport for SGX virtualization support -------------------------------- The document which describes the SGX kernel architecture was added at commit 3fa97bf00126 ("Documentation/x86: Document SGX kernel architecture") but the reference at virt/kvm/api.rst is pointing to some non-existing document. Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/138c24633c6e4edf862a2b4d77033c603fc10406.1621413933.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet Signed-off-by: Zhiquan Li --- Documentation/virt/kvm/api.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 671bbd90f15b..d542bd99a9a4 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6218,7 +6218,7 @@ system fingerprint. To prevent userspace from circumventing such restrictions by running an enclave in a VM, KVM prevents access to privileged attributes by default. -See Documentation/x86/sgx/2.Kernel-internals.rst for more details. +See Documentation/x86/sgx.rst for more details. 8. Other capabilities. ====================== -- Gitee From b7d30662370210e8b0a42ba93f9dc42df80b8b6d Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 26 Oct 2021 15:00:44 -0700 Subject: [PATCH 181/674] x86/sgx: Add new sgx_epc_page flag bit to mark free pages mainline inclusion from mainline-5.17 commit d6d261bded8a57aed4faa12d08a5b193418d3aa4 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZFM CVE: NA Intel-SIG: commit d6d261bded8a x86/sgx: Add new sgx_epc_page flag bit to mark free pages. Backport for SGX MCA recovery co-existence support -------------------------------- SGX EPC pages go through the following life cycle: DIRTY ---> FREE ---> IN-USE --\ ^ | \-----------------/ Recovery action for poison for a DIRTY or FREE page is simple. Just make sure never to allocate the page. IN-USE pages need some extra handling. Add a new flag bit SGX_EPC_PAGE_IS_FREE that is set when a page is added to a free list and cleared when the page is allocated. Notes: 1) These transitions are made while holding the node->lock so that future code that checks the flags while holding the node->lock can be sure that if the SGX_EPC_PAGE_IS_FREE bit is set, then the page is on the free list. 2) Initially while the pages are on the dirty list the SGX_EPC_PAGE_IS_FREE bit is cleared. Signed-off-by: Tony Luck Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Tested-by: Reinette Chatre Link: https://lkml.kernel.org/r/20211026220050.697075-2-tony.luck@intel.com Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/main.c | 2 ++ arch/x86/kernel/cpu/sgx/sgx.h | 3 +++ 2 files changed, 5 insertions(+) diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 379eeefa744a..e62cc1d04cd7 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -471,6 +471,7 @@ static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid) page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list); list_del_init(&page->list); + page->flags = 0; spin_unlock(&node->lock); atomic_long_dec(&sgx_nr_free_pages); @@ -625,6 +626,7 @@ void sgx_free_epc_page(struct sgx_epc_page *page) spin_lock(&node->lock); list_add_tail(&page->list, &node->free_page_list); + page->flags = SGX_EPC_PAGE_IS_FREE; spin_unlock(&node->lock); atomic_long_inc(&sgx_nr_free_pages); diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h index 4628acec0009..5906471156c5 100644 --- a/arch/x86/kernel/cpu/sgx/sgx.h +++ b/arch/x86/kernel/cpu/sgx/sgx.h @@ -26,6 +26,9 @@ /* Pages, which are being tracked by the page reclaimer. */ #define SGX_EPC_PAGE_RECLAIMER_TRACKED BIT(0) +/* Pages on free list */ +#define SGX_EPC_PAGE_IS_FREE BIT(1) + struct sgx_epc_page { unsigned int section; unsigned int flags; -- Gitee From e87c3256335b30b3fa238c359d8481ebcb03d1f4 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 26 Oct 2021 15:00:45 -0700 Subject: [PATCH 182/674] x86/sgx: Add infrastructure to identify SGX EPC pages mainline inclusion from mainline-5.17 commit 40e0e7843e23d164625b9031514f5672f8758bf4 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZFM CVE: NA Intel-SIG: commit 40e0e7843e23 x86/sgx: Add infrastructure to identify SGX EPC pages. Backport for SGX MCA recovery co-existence support -------------------------------- X86 machine check architecture reports a physical address when there is a memory error. Handling that error requires a method to determine whether the physical address reported is in any of the areas reserved for EPC pages by BIOS. SGX EPC pages do not have Linux "struct page" associated with them. Keep track of the mapping from ranges of EPC pages to the sections that contain them using an xarray. N.B. adds CONFIG_XARRAY_MULTI to the SGX dependecies. So "select" that in arch/x86/Kconfig for X86/SGX. Create a function arch_is_platform_page() that simply reports whether an address is an EPC page for use elsewhere in the kernel. The ACPI error injection code needs this function and is typically built as a module, so export it. Note that arch_is_platform_page() will be slower than other similar "what type is this page" functions that can simply check bits in the "struct page". If there is some future performance critical user of this function it may need to be implemented in a more efficient way. Note also that the current implementation of xarray allocates a few hundred kilobytes for this usage on a system with 4GB of SGX EPC memory configured. This isn't ideal, but worth it for the code simplicity. Signed-off-by: Tony Luck Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Tested-by: Reinette Chatre Link: https://lkml.kernel.org/r/20211026220050.697075-3-tony.luck@intel.com Signed-off-by: Zhiquan Li --- arch/x86/Kconfig | 1 + arch/x86/kernel/cpu/sgx/main.c | 9 +++++++++ 2 files changed, 10 insertions(+) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 040fb77366dd..aa217be83f58 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1967,6 +1967,7 @@ config X86_SGX select SRCU select MMU_NOTIFIER select NUMA_KEEP_MEMINFO if NUMA + select XARRAY_MULTI help Intel(R) Software Guard eXtensions (SGX) is a set of CPU instructions that can be used by applications to set aside private regions of code diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index e62cc1d04cd7..9e51d3510732 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -20,6 +20,7 @@ struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS]; static int sgx_nr_epc_sections; static struct task_struct *ksgxd_tsk; static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq); +static DEFINE_XARRAY(sgx_epc_address_space); /* * These variables are part of the state of the reclaimer, and must be accessed @@ -650,6 +651,8 @@ static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size, } section->phys_addr = phys_addr; + xa_store_range(&sgx_epc_address_space, section->phys_addr, + phys_addr + size - 1, section, GFP_KERNEL); for (i = 0; i < nr_pages; i++) { section->pages[i].section = index; @@ -661,6 +664,12 @@ static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size, return true; } +bool arch_is_platform_page(u64 paddr) +{ + return !!xa_load(&sgx_epc_address_space, paddr); +} +EXPORT_SYMBOL_GPL(arch_is_platform_page); + /** * A section metric is concatenated in a way that @low bits 12-31 define the * bits 12-31 of the metric and @high bits 0-19 define the bits 32-51 of the -- Gitee From dc1b46e0f80bc54c0bd12eed9824214cb718f70c Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 26 Oct 2021 15:00:46 -0700 Subject: [PATCH 183/674] x86/sgx: Initial poison handling for dirty and free pages mainline inclusion from mainline-5.17 commit 992801ae92431761b3d8ec88abd5793d154d34ac category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZFM CVE: NA Intel-SIG: commit 992801ae9243 x86/sgx: Initial poison handling for dirty and free pages. Backport for SGX MCA recovery co-existence support -------------------------------- A memory controller patrol scrubber can report poison in a page that isn't currently being used. Add "poison" field in the sgx_epc_page that can be set for an sgx_epc_page. Check for it: 1) When sanitizing dirty pages 2) When freeing epc pages Poison is a new field separated from flags to avoid having to make all updates to flags atomic, or integrate poison state changes into some other locking scheme to protect flags (Currently just sgx_reclaimer_lock which protects the SGX_EPC_PAGE_RECLAIMER_TRACKED bit in page->flags). In both cases place the poisoned page on a per-node list of poisoned epc pages to make sure it will not be reallocated. Signed-off-by: Tony Luck Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Tested-by: Reinette Chatre Link: https://lkml.kernel.org/r/20211026220050.697075-4-tony.luck@intel.com Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/main.c | 26 +++++++++++++++++++++++++- arch/x86/kernel/cpu/sgx/sgx.h | 4 +++- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 9e51d3510732..2e9ac0454608 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -61,6 +61,24 @@ static void __sgx_sanitize_pages(struct list_head *dirty_page_list) page = list_first_entry(dirty_page_list, struct sgx_epc_page, list); + /* + * Checking page->poison without holding the node->lock + * is racy, but losing the race (i.e. poison is set just + * after the check) just means __eremove() will be uselessly + * called for a page that sgx_free_epc_page() will put onto + * the node->sgx_poison_page_list later. + */ + if (page->poison) { + struct sgx_epc_section *section = &sgx_epc_sections[page->section]; + struct sgx_numa_node *node = section->node; + + spin_lock(&node->lock); + list_move(&page->list, &node->sgx_poison_page_list); + spin_unlock(&node->lock); + + continue; + } + ret = __eremove(sgx_get_epc_virt_addr(page)); if (!ret) { /* @@ -626,7 +644,11 @@ void sgx_free_epc_page(struct sgx_epc_page *page) spin_lock(&node->lock); - list_add_tail(&page->list, &node->free_page_list); + page->owner = NULL; + if (page->poison) + list_add(&page->list, &node->sgx_poison_page_list); + else + list_add_tail(&page->list, &node->free_page_list); page->flags = SGX_EPC_PAGE_IS_FREE; spin_unlock(&node->lock); @@ -658,6 +680,7 @@ static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size, section->pages[i].section = index; section->pages[i].flags = 0; section->pages[i].owner = NULL; + section->pages[i].poison = 0; list_add_tail(§ion->pages[i].list, &sgx_dirty_page_list); } @@ -724,6 +747,7 @@ static bool __init sgx_page_cache_init(void) if (!node_isset(nid, sgx_numa_mask)) { spin_lock_init(&sgx_numa_nodes[nid].lock); INIT_LIST_HEAD(&sgx_numa_nodes[nid].free_page_list); + INIT_LIST_HEAD(&sgx_numa_nodes[nid].sgx_poison_page_list); node_set(nid, sgx_numa_mask); } diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h index 5906471156c5..9ec3136c7800 100644 --- a/arch/x86/kernel/cpu/sgx/sgx.h +++ b/arch/x86/kernel/cpu/sgx/sgx.h @@ -31,7 +31,8 @@ struct sgx_epc_page { unsigned int section; - unsigned int flags; + u16 flags; + u16 poison; struct sgx_encl_page *owner; struct list_head list; }; @@ -42,6 +43,7 @@ struct sgx_epc_page { */ struct sgx_numa_node { struct list_head free_page_list; + struct list_head sgx_poison_page_list; spinlock_t lock; }; -- Gitee From 4fccae8c26f2571bfdde980c8b58af647e7ce008 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 26 Oct 2021 15:00:47 -0700 Subject: [PATCH 184/674] x86/sgx: Add SGX infrastructure to recover from poison mainline inclusion from mainline-5.17 commit a495cbdffa30558b34f3c95555cecc4fd9688039 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZFM CVE: NA Intel-SIG: commit a495cbdffa30 x86/sgx: Add SGX infrastructure to recover from poison. Backport for SGX MCA recovery co-existence support -------------------------------- Provide a recovery function sgx_memory_failure(). If the poison was consumed synchronously then send a SIGBUS. Note that the virtual address of the access is not included with the SIGBUS as is the case for poison outside of SGX enclaves. This doesn't matter as addresses of code/data inside an enclave is of little to no use to code executing outside the (now dead) enclave. Poison found in a free page results in the page being moved from the free list to the per-node poison page list. Signed-off-by: Tony Luck Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Tested-by: Reinette Chatre Link: https://lkml.kernel.org/r/20211026220050.697075-5-tony.luck@intel.com Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/main.c | 76 ++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 2e9ac0454608..4036f50fc42c 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -693,6 +693,82 @@ bool arch_is_platform_page(u64 paddr) } EXPORT_SYMBOL_GPL(arch_is_platform_page); +static struct sgx_epc_page *sgx_paddr_to_page(u64 paddr) +{ + struct sgx_epc_section *section; + + section = xa_load(&sgx_epc_address_space, paddr); + if (!section) + return NULL; + + return §ion->pages[PFN_DOWN(paddr - section->phys_addr)]; +} + +/* + * Called in process context to handle a hardware reported + * error in an SGX EPC page. + * If the MF_ACTION_REQUIRED bit is set in flags, then the + * context is the task that consumed the poison data. Otherwise + * this is called from a kernel thread unrelated to the page. + */ +int arch_memory_failure(unsigned long pfn, int flags) +{ + struct sgx_epc_page *page = sgx_paddr_to_page(pfn << PAGE_SHIFT); + struct sgx_epc_section *section; + struct sgx_numa_node *node; + + /* + * mm/memory-failure.c calls this routine for all errors + * where there isn't a "struct page" for the address. But that + * includes other address ranges besides SGX. + */ + if (!page) + return -ENXIO; + + /* + * If poison was consumed synchronously. Send a SIGBUS to + * the task. Hardware has already exited the SGX enclave and + * will not allow re-entry to an enclave that has a memory + * error. The signal may help the task understand why the + * enclave is broken. + */ + if (flags & MF_ACTION_REQUIRED) + force_sig(SIGBUS); + + section = &sgx_epc_sections[page->section]; + node = section->node; + + spin_lock(&node->lock); + + /* Already poisoned? Nothing more to do */ + if (page->poison) + goto out; + + page->poison = 1; + + /* + * If the page is on a free list, move it to the per-node + * poison page list. + */ + if (page->flags & SGX_EPC_PAGE_IS_FREE) { + list_move(&page->list, &node->sgx_poison_page_list); + goto out; + } + + /* + * TBD: Add additional plumbing to enable pre-emptive + * action for asynchronous poison notification. Until + * then just hope that the poison: + * a) is not accessed - sgx_free_epc_page() will deal with it + * when the user gives it back + * b) results in a recoverable machine check rather than + * a fatal one + */ +out: + spin_unlock(&node->lock); + return 0; +} + /** * A section metric is concatenated in a way that @low bits 12-31 define the * bits 12-31 of the metric and @high bits 0-19 define the bits 32-51 of the -- Gitee From 5bb9e11f0474d1573ac7c25b0a2139e7ddebf2ab Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 26 Oct 2021 15:00:48 -0700 Subject: [PATCH 185/674] x86/sgx: Hook arch_memory_failure() into mainline code mainline inclusion from mainline-5.17 commit 03b122da74b22fbe7cd98184fa5657a9ce13970c category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZFM CVE: NA Intel-SIG: commit 03b122da74b2 x86/sgx: Hook arch_memory_failure() into mainline code. Backport for SGX MCA recovery co-existence support -------------------------------- Add a call inside memory_failure() to call the arch specific code to check if the address is an SGX EPC page and handle it. Note the SGX EPC pages do not have a "struct page" entry, so the hook goes in at the same point as the device mapping hook. Pull the call to acquire the mutex earlier so the SGX errors are also protected. Make set_mce_nospec() skip SGX pages when trying to adjust the 1:1 map. Signed-off-by: Tony Luck Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Reviewed-by: Naoya Horiguchi Tested-by: Reinette Chatre Link: https://lkml.kernel.org/r/20211026220050.697075-6-tony.luck@intel.com Signed-off-by: Zhiquan Li --- arch/x86/include/asm/processor.h | 8 ++++++++ arch/x86/include/asm/set_memory.h | 4 ++++ include/linux/mm.h | 13 +++++++++++++ mm/memory-failure.c | 19 +++++++++++++------ 4 files changed, 38 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index d428d611a43a..a3d0152a4361 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -858,4 +858,12 @@ enum mds_mitigations { MDS_MITIGATION_VMWERV, }; +#ifdef CONFIG_X86_SGX +int arch_memory_failure(unsigned long pfn, int flags); +#define arch_memory_failure arch_memory_failure + +bool arch_is_platform_page(u64 paddr); +#define arch_is_platform_page arch_is_platform_page +#endif + #endif /* _ASM_X86_PROCESSOR_H */ diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index 5948218f35c5..ef47246d22f5 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -2,6 +2,7 @@ #ifndef _ASM_X86_SET_MEMORY_H #define _ASM_X86_SET_MEMORY_H +#include #include #include @@ -97,6 +98,9 @@ static inline int set_mce_nospec(unsigned long pfn, bool unmap) unsigned long decoy_addr; int rc; + /* SGX pages are not in the 1:1 map */ + if (arch_is_platform_page(pfn << PAGE_SHIFT)) + return 0; /* * We would like to just call: * set_memory_XX((unsigned long)pfn_to_kaddr(pfn), 1); diff --git a/include/linux/mm.h b/include/linux/mm.h index b0ddfac0486b..25891b581bf4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -425,6 +425,19 @@ extern unsigned int kobjsize(const void *objp); /* VMA basic access permission flags */ #define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) +#ifndef arch_memory_failure +static inline int arch_memory_failure(unsigned long pfn, int flags) +{ + return -ENXIO; +} +#endif + +#ifndef arch_is_platform_page +static inline bool arch_is_platform_page(u64 paddr) +{ + return false; +} +#endif /* * Special vmas that are non-mergable, non-mlock()able. diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 0519f20d2b57..bfa6d1478a75 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1370,21 +1370,28 @@ int memory_failure(unsigned long pfn, int flags) if (!sysctl_memory_failure_recovery) panic("Memory failure on page %lx", pfn); + mutex_lock(&mf_mutex); + p = pfn_to_online_page(pfn); if (!p) { + res = arch_memory_failure(pfn, flags); + if (res == 0) + goto unlock_mutex; + if (pfn_valid(pfn)) { pgmap = get_dev_pagemap(pfn, NULL); - if (pgmap) - return memory_failure_dev_pagemap(pfn, flags, - pgmap); + if (pgmap) { + res = memory_failure_dev_pagemap(pfn, flags, + pgmap); + goto unlock_mutex; + } } pr_err("Memory failure: %#lx: memory outside kernel control\n", pfn); - return -ENXIO; + res = -ENXIO; + goto unlock_mutex; } - mutex_lock(&mf_mutex); - try_again: if (PageHuge(p)) { res = memory_failure_hugetlb(pfn, flags); -- Gitee From 3f72e50d38c1c8ea9f6021f539348f2ba1d3a905 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 26 Oct 2021 15:00:49 -0700 Subject: [PATCH 186/674] x86/sgx: Add hook to error injection address validation mainline inclusion from mainline-5.17 commit c6acb1e7bf4656b9434335c72b8245cc84575fde category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZFM CVE: NA Intel-SIG: commit c6acb1e7bf46 x86/sgx: Add hook to error injection address validation. Backport for SGX MCA recovery co-existence support -------------------------------- SGX reserved memory does not appear in the standard address maps. Add hook to call into the SGX code to check if an address is located in SGX memory. There are other challenges in injecting errors into SGX. Update the documentation with a sequence of operations to inject. Signed-off-by: Tony Luck Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Tested-by: Reinette Chatre Link: https://lkml.kernel.org/r/20211026220050.697075-7-tony.luck@intel.com Signed-off-by: Zhiquan Li --- .../firmware-guide/acpi/apei/einj.rst | 19 +++++++++++++++++++ drivers/acpi/apei/einj.c | 3 ++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/Documentation/firmware-guide/acpi/apei/einj.rst b/Documentation/firmware-guide/acpi/apei/einj.rst index e588bccf5158..2fa989c0a12d 100644 --- a/Documentation/firmware-guide/acpi/apei/einj.rst +++ b/Documentation/firmware-guide/acpi/apei/einj.rst @@ -181,5 +181,24 @@ You should see something like this in dmesg:: [22715.834759] EDAC sbridge MC3: PROCESSOR 0:306e7 TIME 1422553404 SOCKET 0 APIC 0 [22716.616173] EDAC MC3: 1 CE memory read error on CPU_SrcID#0_Channel#0_DIMM#0 (channel:0 slot:0 page:0x12345 offset:0x0 grain:32 syndrome:0x0 - area:DRAM err_code:0001:0090 socket:0 channel_mask:1 rank:0) +Special notes for injection into SGX enclaves: + +There may be a separate BIOS setup option to enable SGX injection. + +The injection process consists of setting some special memory controller +trigger that will inject the error on the next write to the target +address. But the h/w prevents any software outside of an SGX enclave +from accessing enclave pages (even BIOS SMM mode). + +The following sequence can be used: + 1) Determine physical address of enclave page + 2) Use "notrigger=1" mode to inject (this will setup + the injection address, but will not actually inject) + 3) Enter the enclave + 4) Store data to the virtual address matching physical address from step 1 + 5) Execute CLFLUSH for that virtual address + 6) Spin delay for 250ms + 7) Read from the virtual address. This will trigger the error + For more information about EINJ, please refer to ACPI specification version 4.0, section 17.5 and ACPI 5.0, section 18.6. diff --git a/drivers/acpi/apei/einj.c b/drivers/acpi/apei/einj.c index 133156759551..4384269ad159 100644 --- a/drivers/acpi/apei/einj.c +++ b/drivers/acpi/apei/einj.c @@ -544,7 +544,8 @@ static int einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2, ((region_intersects(base_addr, size, IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE) != REGION_INTERSECTS) && (region_intersects(base_addr, size, IORESOURCE_MEM, IORES_DESC_PERSISTENT_MEMORY) - != REGION_INTERSECTS))) + != REGION_INTERSECTS) && + !arch_is_platform_page(base_addr))) return -EINVAL; inject: -- Gitee From b8151303ac4d44e8eb3862aed5557837837be9af Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 26 Oct 2021 15:00:50 -0700 Subject: [PATCH 187/674] x86/sgx: Add check for SGX pages to ghes_do_memory_failure() mainline inclusion from mainline-5.17 commit 3ad6fd77a2d62e8f4465b429b65805eaf88e1b9e category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZFM CVE: NA Intel-SIG: commit 3ad6fd77a2d6 x86/sgx: Add check for SGX pages to ghes_do_memory_failure(). Backport for SGX MCA recovery co-existence support -------------------------------- SGX EPC pages do not have a "struct page" associated with them so the pfn_valid() sanity check fails and results in a warning message to the console. Add an additional check to skip the warning if the address of the error is in an SGX EPC page. Signed-off-by: Tony Luck Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Tested-by: Reinette Chatre Link: https://lkml.kernel.org/r/20211026220050.697075-8-tony.luck@intel.com Signed-off-by: Zhiquan Li --- drivers/acpi/apei/ghes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 9ac1e45dbba0..9c38c2cdd2fd 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -452,7 +452,7 @@ static bool ghes_do_memory_failure(u64 physical_addr, int flags) return false; pfn = PHYS_PFN(physical_addr); - if (!pfn_valid(pfn)) { + if (!pfn_valid(pfn) && !arch_is_platform_page(physical_addr)) { pr_warn_ratelimited(FW_WARN GHES_PFX "Invalid address in generic error data: %#llx\n", physical_addr); -- Gitee From 2f7813cc2de07c4516a117c3f80ccd928520aef4 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Thu, 14 Jul 2022 21:56:58 +0800 Subject: [PATCH 188/674] gfs2: Check for active reservation in gfs2_release stable inclusion from stable-v5.10.111 commit 3f53715fd55c4616fc25ac0e45ca7331d8fdf7f1 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=3f53715fd55c4616fc25ac0e45ca7331d8fdf7f1 -------------------------------- [ Upstream commit 0ec9b9ea4f83303bfd8f052a3d8b2bd179b002e1 ] In gfs2_release, check if the inode has an active reservation to avoid unnecessary lock taking. Signed-off-by: Andreas Gruenbacher Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- fs/gfs2/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index cfd9d03f604f..59318b1eaa60 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -716,10 +716,10 @@ static int gfs2_release(struct inode *inode, struct file *file) kfree(file->private_data); file->private_data = NULL; - if (file->f_mode & FMODE_WRITE) { + if (gfs2_rs_active(&ip->i_res)) gfs2_rs_delete(ip, &inode->i_writecount); + if (file->f_mode & FMODE_WRITE) gfs2_qa_put(ip); - } return 0; } -- Gitee From 9d7d0fdf041465c6390a608c28d07ff15ee3096a Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Thu, 14 Jul 2022 21:56:59 +0800 Subject: [PATCH 189/674] gfs2: Fix gfs2_release for non-writers regression stable inclusion from stable-v5.10.111 commit f349d7f9ee6d6c9cf9edadb0e1c5e9de6eda7466 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f349d7f9ee6d6c9cf9edadb0e1c5e9de6eda7466 -------------------------------- [ Upstream commit d3add1a9519dcacd6e644ecac741c56cf18b67f5 ] When a file is opened for writing, the vfs code (do_dentry_open) calls get_write_access for the inode, thus incrementing the inode's write count. That writer normally then creates a multi-block reservation for the inode (i_res) that can be re-used by other writers, which speeds up writes for applications that stupidly loop on open/write/close. When the writes are all done, the multi-block reservation should be deleted when the file is closed by the last "writer." Commit 0ec9b9ea4f83 broke that concept when it moved the call to gfs2_rs_delete before the check for FMODE_WRITE. Non-writers have no business removing the multi-block reservations of writers. In fact, if someone opens and closes the file for RO while a writer has a multi-block reservation, the RO closer will delete the reservation midway through the write, and this results in: kernel BUG at fs/gfs2/rgrp.c:677! (or thereabouts) which is: BUG_ON(rs->rs_requested); from function gfs2_rs_deltree. This patch moves the check back inside the check for FMODE_WRITE. Fixes: 0ec9b9ea4f83 ("gfs2: Check for active reservation in gfs2_release") Cc: stable@vger.kernel.org # v5.12+ Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- fs/gfs2/file.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 59318b1eaa60..7bd7581aa682 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -716,10 +716,11 @@ static int gfs2_release(struct inode *inode, struct file *file) kfree(file->private_data); file->private_data = NULL; - if (gfs2_rs_active(&ip->i_res)) - gfs2_rs_delete(ip, &inode->i_writecount); - if (file->f_mode & FMODE_WRITE) + if (file->f_mode & FMODE_WRITE) { + if (gfs2_rs_active(&ip->i_res)) + gfs2_rs_delete(ip, &inode->i_writecount); gfs2_qa_put(ip); + } return 0; } -- Gitee From 63fc4fbc7733aae50b5ba196913d9960aa253594 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Thu, 14 Jul 2022 21:57:00 +0800 Subject: [PATCH 190/674] gfs2: gfs2_setattr_size error path fix stable inclusion from stable-v5.10.111 commit 0777fe98a44c3434a817817d3fb1f17ae855f3f2 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=0777fe98a44c3434a817817d3fb1f17ae855f3f2 -------------------------------- [ Upstream commit 7336905a89f19173bf9301cd50a24421162f417c ] When gfs2_setattr_size() fails, it calls gfs2_rs_delete(ip, NULL) to get rid of any reservations the inode may have. Instead, it should pass in the inode's write count as the second parameter to allow gfs2_rs_delete() to figure out if the inode has any writers left. In a next step, there are two instances of gfs2_rs_delete(ip, NULL) left where we know that there can be no other users of the inode. Replace those with gfs2_rs_deltree(&ip->i_res) to avoid the unnecessary write count check. With that, gfs2_rs_delete() is only called with the inode's actual write count, so get rid of the second parameter. Fixes: a097dc7e24cb ("GFS2: Make rgrp reservations part of the gfs2_inode structure") Signed-off-by: Andreas Gruenbacher Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- fs/gfs2/bmap.c | 2 +- fs/gfs2/file.c | 2 +- fs/gfs2/inode.c | 2 +- fs/gfs2/rgrp.c | 7 ++++--- fs/gfs2/rgrp.h | 2 +- fs/gfs2/super.c | 2 +- 6 files changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index b34c02985d9d..6c047570d6a9 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -2200,7 +2200,7 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize) ret = do_shrink(inode, newsize); out: - gfs2_rs_delete(ip, NULL); + gfs2_rs_delete(ip); gfs2_qa_put(ip); return ret; } diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 7bd7581aa682..2e6f622ed428 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -718,7 +718,7 @@ static int gfs2_release(struct inode *inode, struct file *file) if (file->f_mode & FMODE_WRITE) { if (gfs2_rs_active(&ip->i_res)) - gfs2_rs_delete(ip, &inode->i_writecount); + gfs2_rs_delete(ip); gfs2_qa_put(ip); } return 0; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 65ae4fc28ede..74a6b0800e05 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -811,7 +811,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (free_vfs_inode) /* else evict will do the put for us */ gfs2_glock_put(ip->i_gl); } - gfs2_rs_delete(ip, NULL); + gfs2_rs_deltree(&ip->i_res); gfs2_qa_put(ip); fail_free_acls: posix_acl_release(default_acl); diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index eb775e93de97..dc55b029afaa 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -664,13 +664,14 @@ void gfs2_rs_deltree(struct gfs2_blkreserv *rs) /** * gfs2_rs_delete - delete a multi-block reservation * @ip: The inode for this reservation - * @wcount: The inode's write count, or NULL * */ -void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount) +void gfs2_rs_delete(struct gfs2_inode *ip) { + struct inode *inode = &ip->i_inode; + down_write(&ip->i_rw_mutex); - if ((wcount == NULL) || (atomic_read(wcount) <= 1)) + if (atomic_read(&inode->i_writecount) <= 1) gfs2_rs_deltree(&ip->i_res); up_write(&ip->i_rw_mutex); } diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index 9a587ada51ed..2d3c150c55bd 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -45,7 +45,7 @@ extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n, bool dinode, u64 *generation); extern void gfs2_rs_deltree(struct gfs2_blkreserv *rs); -extern void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount); +extern void gfs2_rs_delete(struct gfs2_inode *ip); extern void __gfs2_free_blocks(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd, u64 bstart, u32 blen, int meta); extern void gfs2_free_meta(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd, diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index d2b7ecbd1b15..d14b98aa1c3e 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1434,7 +1434,7 @@ static void gfs2_evict_inode(struct inode *inode) truncate_inode_pages_final(&inode->i_data); if (ip->i_qadata) gfs2_assert_warn(sdp, ip->i_qadata->qa_ref == 0); - gfs2_rs_delete(ip, NULL); + gfs2_rs_deltree(&ip->i_res); gfs2_ordered_del_inode(ip); clear_inode(inode); gfs2_dir_hash_inval(ip); -- Gitee From f182174d1f67f5cc7227d28aa1f52a8b1c96e1b7 Mon Sep 17 00:00:00 2001 From: Jiasheng Jiang Date: Thu, 14 Jul 2022 21:57:01 +0800 Subject: [PATCH 191/674] rtc: wm8350: Handle error for wm8350_register_irq stable inclusion from stable-v5.10.111 commit 2e52a294700bec7867450e912e5bf91e3503288e category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=2e52a294700bec7867450e912e5bf91e3503288e -------------------------------- [ Upstream commit 43f0269b6b89c1eec4ef83c48035608f4dcdd886 ] As the potential failure of the wm8350_register_irq(), it should be better to check it and return error if fails. Also, it need not free 'wm_rtc->rtc' since it will be freed automatically. Fixes: 077eaf5b40ec ("rtc: rtc-wm8350: add support for WM8350 RTC") Signed-off-by: Jiasheng Jiang Acked-by: Charles Keepax Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20220303085030.291793-1-jiasheng@iscas.ac.cn Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/rtc/rtc-wm8350.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/rtc/rtc-wm8350.c b/drivers/rtc/rtc-wm8350.c index 2018614f258f..6eaa9321c074 100644 --- a/drivers/rtc/rtc-wm8350.c +++ b/drivers/rtc/rtc-wm8350.c @@ -432,14 +432,21 @@ static int wm8350_rtc_probe(struct platform_device *pdev) return ret; } - wm8350_register_irq(wm8350, WM8350_IRQ_RTC_SEC, + ret = wm8350_register_irq(wm8350, WM8350_IRQ_RTC_SEC, wm8350_rtc_update_handler, 0, "RTC Seconds", wm8350); + if (ret) + return ret; + wm8350_mask_irq(wm8350, WM8350_IRQ_RTC_SEC); - wm8350_register_irq(wm8350, WM8350_IRQ_RTC_ALM, + ret = wm8350_register_irq(wm8350, WM8350_IRQ_RTC_ALM, wm8350_rtc_alarm_handler, 0, "RTC Alarm", wm8350); + if (ret) { + wm8350_free_irq(wm8350, WM8350_IRQ_RTC_SEC, wm8350); + return ret; + } return 0; } -- Gitee From 162443c3431f1eb02174f822c00f5548b86628ab Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Thu, 14 Jul 2022 21:57:02 +0800 Subject: [PATCH 192/674] KVM: x86/svm: Clear reserved bits written to PerfEvtSeln MSRs stable inclusion from stable-v5.10.111 commit 66b0fa6b22187caf4d789bdac8dcca49940628b0 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=66b0fa6b22187caf4d789bdac8dcca49940628b0 -------------------------------- [ Upstream commit 9b026073db2f1ad0e4d8b61c83316c8497981037 ] AMD EPYC CPUs never raise a #GP for a WRMSR to a PerfEvtSeln MSR. Some reserved bits are cleared, and some are not. Specifically, on Zen3/Milan, bits 19 and 42 are not cleared. When emulating such a WRMSR, KVM should not synthesize a #GP, regardless of which bits are set. However, undocumented bits should not be passed through to the hardware MSR. So, rather than checking for reserved bits and synthesizing a #GP, just clear the reserved bits. This may seem pedantic, but since KVM currently does not support the "Host/Guest Only" bits (41:40), it is necessary to clear these bits rather than synthesizing #GP, because some popular guests (e.g Linux) will set the "Host Only" bit even on CPUs that don't support EFER.SVME, and they don't expect a #GP. For example, root@Ubuntu1804:~# perf stat -e r26 -a sleep 1 Performance counter stats for 'system wide': 0 r26 1.001070977 seconds time elapsed Feb 23 03:59:58 Ubuntu1804 kernel: [ 405.379957] unchecked MSR access error: WRMSR to 0xc0010200 (tried to write 0x0000020000130026) at rIP: 0xffffffff9b276a28 (native_write_msr+0x8/0x30) Feb 23 03:59:58 Ubuntu1804 kernel: [ 405.379958] Call Trace: Feb 23 03:59:58 Ubuntu1804 kernel: [ 405.379963] amd_pmu_disable_event+0x27/0x90 Fixes: ca724305a2b0 ("KVM: x86/vPMU: Implement AMD vPMU code for KVM") Reported-by: Lotus Fenn Signed-off-by: Jim Mattson Reviewed-by: Like Xu Reviewed-by: David Dunn Message-Id: <20220226234131.2167175-1-jmattson@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- arch/x86/kvm/svm/pmu.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 4e7093bcb64b..0e9c2322d398 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -253,12 +253,10 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) /* MSR_EVNTSELn */ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL); if (pmc) { - if (data == pmc->eventsel) - return 0; - if (!(data & pmu->reserved_bits)) { + data &= ~pmu->reserved_bits; + if (data != pmc->eventsel) reprogram_gp_counter(pmc, data); - return 0; - } + return 0; } return 1; -- Gitee From ab5144b67e9eecff089754b281fd6faa168cb4d4 Mon Sep 17 00:00:00 2001 From: Hou Wenlong Date: Thu, 14 Jul 2022 21:57:03 +0800 Subject: [PATCH 193/674] KVM: x86/emulator: Emulate RDPID only if it is enabled in guest stable inclusion from stable-v5.10.111 commit a24479c5e9f44c2a799ca2178b1ba9fe1290706e category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=a24479c5e9f44c2a799ca2178b1ba9fe1290706e -------------------------------- [ Upstream commit a836839cbfe60dc434c5476a7429cf2bae36415d ] When RDTSCP is supported but RDPID is not supported in host, RDPID emulation is available. However, __kvm_get_msr() would only fail when RDTSCP/RDPID both are disabled in guest, so the emulator wouldn't inject a #UD when RDPID is disabled but RDTSCP is enabled in guest. Fixes: fb6d4d340e05 ("KVM: x86: emulate RDPID") Signed-off-by: Hou Wenlong Message-Id: <1dfd46ae5b76d3ed87bde3154d51c64ea64c99c1.1646226788.git.houwenlong.hwl@antgroup.com> Signed-off-by: Paolo Bonzini Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- arch/x86/kvm/emulate.c | 4 +++- arch/x86/kvm/kvm_emulate.h | 1 + arch/x86/kvm/x86.c | 6 ++++++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a63df19ef4da..71e1a2d39f21 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3611,8 +3611,10 @@ static int em_rdpid(struct x86_emulate_ctxt *ctxt) { u64 tsc_aux = 0; - if (ctxt->ops->get_msr(ctxt, MSR_TSC_AUX, &tsc_aux)) + if (!ctxt->ops->guest_has_rdpid(ctxt)) return emulate_ud(ctxt); + + ctxt->ops->get_msr(ctxt, MSR_TSC_AUX, &tsc_aux); ctxt->dst.val = tsc_aux; return X86EMUL_CONTINUE; } diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index 7d5be04dc661..aeed6da60e0c 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -225,6 +225,7 @@ struct x86_emulate_ops { bool (*guest_has_long_mode)(struct x86_emulate_ctxt *ctxt); bool (*guest_has_movbe)(struct x86_emulate_ctxt *ctxt); bool (*guest_has_fxsr)(struct x86_emulate_ctxt *ctxt); + bool (*guest_has_rdpid)(struct x86_emulate_ctxt *ctxt); void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4e28a6d6dd1a..1c45d9bafe0a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6961,6 +6961,11 @@ static bool emulator_guest_has_fxsr(struct x86_emulate_ctxt *ctxt) return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_FXSR); } +static bool emulator_guest_has_rdpid(struct x86_emulate_ctxt *ctxt) +{ + return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_RDPID); +} + static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg) { return kvm_register_read(emul_to_vcpu(ctxt), reg); @@ -7044,6 +7049,7 @@ static const struct x86_emulate_ops emulate_ops = { .guest_has_long_mode = emulator_guest_has_long_mode, .guest_has_movbe = emulator_guest_has_movbe, .guest_has_fxsr = emulator_guest_has_fxsr, + .guest_has_rdpid = emulator_guest_has_rdpid, .set_nmi_mask = emulator_set_nmi_mask, .get_hflags = emulator_get_hflags, .set_hflags = emulator_set_hflags, -- Gitee From c96ad29ecadf11244fb87bf9da9df16da9234034 Mon Sep 17 00:00:00 2001 From: Anisse Astier Date: Thu, 14 Jul 2022 21:57:04 +0800 Subject: [PATCH 194/674] drm: Add orientation quirk for GPD Win Max stable inclusion from stable-v5.10.111 commit 850c4351e89555bb89ddb6c7fee5bed7cfdcfcb2 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=850c4351e89555bb89ddb6c7fee5bed7cfdcfcb2 -------------------------------- [ Upstream commit 0b464ca3e0dd3cec65f28bc6d396d82f19080f69 ] Panel is 800x1280, but mounted on a laptop form factor, sideways. Signed-off-by: Anisse Astier Reviewed-by: Hans de Goede Signed-off-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/20211229222200.53128-3-anisse@astier.eu Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/gpu/drm/drm_panel_orientation_quirks.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/drm_panel_orientation_quirks.c b/drivers/gpu/drm/drm_panel_orientation_quirks.c index 448c2f2d803a..f5ab891731d0 100644 --- a/drivers/gpu/drm/drm_panel_orientation_quirks.c +++ b/drivers/gpu/drm/drm_panel_orientation_quirks.c @@ -166,6 +166,12 @@ static const struct dmi_system_id orientation_data[] = { DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "MicroPC"), }, .driver_data = (void *)&lcd720x1280_rightside_up, + }, { /* GPD Win Max */ + .matches = { + DMI_EXACT_MATCH(DMI_SYS_VENDOR, "GPD"), + DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "G1619-01"), + }, + .driver_data = (void *)&lcd800x1280_rightside_up, }, { /* * GPD Pocket, note that the the DMI data is less generic then * it seems, devices with a board-vendor of "AMI Corporation" -- Gitee From 6f858fc398ef1d2ac68aa3a39b8f3941e2200486 Mon Sep 17 00:00:00 2001 From: Zekun Shen Date: Thu, 14 Jul 2022 21:57:05 +0800 Subject: [PATCH 195/674] ath5k: fix OOB in ath5k_eeprom_read_pcal_info_5111 stable inclusion from stable-v5.10.111 commit 9d7d83d0399e23d66fd431b553842a84ac10398f category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=9d7d83d0399e23d66fd431b553842a84ac10398f -------------------------------- [ Upstream commit 564d4eceb97eaf381dd6ef6470b06377bb50c95a ] The bug was found during fuzzing. Stacktrace locates it in ath5k_eeprom_convert_pcal_info_5111. When none of the curve is selected in the loop, idx can go up to AR5K_EEPROM_N_PD_CURVES. The line makes pd out of bound. pd = &chinfo[pier].pd_curves[idx]; There are many OOB writes using pd later in the code. So I added a sanity check for idx. Checks for other loops involving AR5K_EEPROM_N_PD_CURVES are not needed as the loop index is not used outside the loops. The patch is NOT tested with real device. The following is the fuzzing report BUG: KASAN: slab-out-of-bounds in ath5k_eeprom_read_pcal_info_5111+0x126a/0x1390 [ath5k] Write of size 1 at addr ffff8880174a4d60 by task modprobe/214 CPU: 0 PID: 214 Comm: modprobe Not tainted 5.6.0 #1 Call Trace: dump_stack+0x76/0xa0 print_address_description.constprop.0+0x16/0x200 ? ath5k_eeprom_read_pcal_info_5111+0x126a/0x1390 [ath5k] ? ath5k_eeprom_read_pcal_info_5111+0x126a/0x1390 [ath5k] __kasan_report.cold+0x37/0x7c ? ath5k_eeprom_read_pcal_info_5111+0x126a/0x1390 [ath5k] kasan_report+0xe/0x20 ath5k_eeprom_read_pcal_info_5111+0x126a/0x1390 [ath5k] ? apic_timer_interrupt+0xa/0x20 ? ath5k_eeprom_init_11a_pcal_freq+0xbc0/0xbc0 [ath5k] ? ath5k_pci_eeprom_read+0x228/0x3c0 [ath5k] ath5k_eeprom_init+0x2513/0x6290 [ath5k] ? ath5k_eeprom_init_11a_pcal_freq+0xbc0/0xbc0 [ath5k] ? usleep_range+0xb8/0x100 ? apic_timer_interrupt+0xa/0x20 ? ath5k_eeprom_read_pcal_info_2413+0x2f20/0x2f20 [ath5k] ath5k_hw_init+0xb60/0x1970 [ath5k] ath5k_init_ah+0x6fe/0x2530 [ath5k] ? kasprintf+0xa6/0xe0 ? ath5k_stop+0x140/0x140 [ath5k] ? _dev_notice+0xf6/0xf6 ? apic_timer_interrupt+0xa/0x20 ath5k_pci_probe.cold+0x29a/0x3d6 [ath5k] ? ath5k_pci_eeprom_read+0x3c0/0x3c0 [ath5k] ? mutex_lock+0x89/0xd0 ? ath5k_pci_eeprom_read+0x3c0/0x3c0 [ath5k] local_pci_probe+0xd3/0x160 pci_device_probe+0x23f/0x3e0 ? pci_device_remove+0x280/0x280 ? pci_device_remove+0x280/0x280 really_probe+0x209/0x5d0 Reported-by: Brendan Dolan-Gavitt Signed-off-by: Zekun Shen Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/YckvDdj3mtCkDRIt@a-10-27-26-18.dynapool.vpn.nyu.edu Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/net/wireless/ath/ath5k/eeprom.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/wireless/ath/ath5k/eeprom.c b/drivers/net/wireless/ath/ath5k/eeprom.c index 1fbc2c19848f..d444b3d70ba2 100644 --- a/drivers/net/wireless/ath/ath5k/eeprom.c +++ b/drivers/net/wireless/ath/ath5k/eeprom.c @@ -746,6 +746,9 @@ ath5k_eeprom_convert_pcal_info_5111(struct ath5k_hw *ah, int mode, } } + if (idx == AR5K_EEPROM_N_PD_CURVES) + goto err_out; + ee->ee_pd_gains[mode] = 1; pd = &chinfo[pier].pd_curves[idx]; -- Gitee From f82957609b5e7de8fa5f4090a5b147efc72f98e0 Mon Sep 17 00:00:00 2001 From: Dale Zhao Date: Thu, 14 Jul 2022 21:57:06 +0800 Subject: [PATCH 196/674] drm/amd/display: Add signal type check when verify stream backends same stable inclusion from stable-v5.10.111 commit 85313d9bc7bd1f20d4ff92023fe731dadf0c778f category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=85313d9bc7bd1f20d4ff92023fe731dadf0c778f -------------------------------- [ Upstream commit 047db281c026de5971cedb5bb486aa29bd16a39d ] [Why] For allow eDP hot-plug feature, the stream signal may change to VIRTUAL when plug-out and back to eDP when plug-in. OS will still setPathMode with same timing for each plugging, but eDP gets no stream update as we don't check signal type changing back as keeping it VIRTUAL. It's also unsafe for future cases that stream signal is switched with same timing. [How] Check stream signal type change include previous HDMI signal case. Reviewed-by: Aric Cyr Acked-by: Wayne Lin Signed-off-by: Dale Zhao Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/gpu/drm/amd/display/dc/core/dc_resource.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_resource.c b/drivers/gpu/drm/amd/display/dc/core/dc_resource.c index 5f4cdb05c4db..5c5ccbad9658 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_resource.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_resource.c @@ -1674,6 +1674,9 @@ static bool are_stream_backends_same( if (is_timing_changed(stream_a, stream_b)) return false; + if (stream_a->signal != stream_b->signal) + return false; + if (stream_a->dpms_off != stream_b->dpms_off) return false; -- Gitee From 1e958ba009c6fe88dff92244c3a73c7243948b01 Mon Sep 17 00:00:00 2001 From: Xin Xiong Date: Thu, 14 Jul 2022 21:57:07 +0800 Subject: [PATCH 197/674] drm/amd/amdgpu/amdgpu_cs: fix refcount leak of a dma_fence obj MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from stable-v5.10.111 commit 927beb05aaa429c883cc0ec6adc48964b187e291 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=927beb05aaa429c883cc0ec6adc48964b187e291 -------------------------------- [ Upstream commit dfced44f122c500004a48ecc8db516bb6a295a1b ] This issue takes place in an error path in amdgpu_cs_fence_to_handle_ioctl(). When `info->in.what` falls into default case, the function simply returns -EINVAL, forgetting to decrement the reference count of a dma_fence obj, which is bumped earlier by amdgpu_cs_get_fence(). This may result in reference count leaks. Fix it by decreasing the refcount of specific object before returning the error code. Reviewed-by: Christian König Signed-off-by: Xin Xiong Signed-off-by: Xin Tan Signed-off-by: Alex Deucher Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index 12598a4b5c78..867fcee6b0d3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -1484,6 +1484,7 @@ int amdgpu_cs_fence_to_handle_ioctl(struct drm_device *dev, void *data, return 0; default: + dma_fence_put(fence); return -EINVAL; } } -- Gitee From 137ee1fdc2747db7d385bc7ae93fd1572ca3d852 Mon Sep 17 00:00:00 2001 From: Wayne Chang Date: Thu, 14 Jul 2022 21:57:08 +0800 Subject: [PATCH 198/674] usb: gadget: tegra-xudc: Do not program SPARAM stable inclusion from stable-v5.10.111 commit 07971b818e18b8b61245be270acc861386f7479d category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=07971b818e18b8b61245be270acc861386f7479d -------------------------------- [ Upstream commit 62fb61580eb48fc890b7bc9fb5fd263367baeca8 ] According to the Tegra Technical Reference Manual, SPARAM is a read-only register and should not be programmed in the driver. The change removes the wrong SPARAM usage. Signed-off-by: Wayne Chang Link: https://lore.kernel.org/r/20220107090443.149021-1-waynec@nvidia.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/usb/gadget/udc/tegra-xudc.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/drivers/usb/gadget/udc/tegra-xudc.c b/drivers/usb/gadget/udc/tegra-xudc.c index 57ee72fead45..1dd0d50c6b56 100644 --- a/drivers/usb/gadget/udc/tegra-xudc.c +++ b/drivers/usb/gadget/udc/tegra-xudc.c @@ -32,9 +32,6 @@ #include /* XUSB_DEV registers */ -#define SPARAM 0x000 -#define SPARAM_ERSTMAX_MASK GENMASK(20, 16) -#define SPARAM_ERSTMAX(x) (((x) << 16) & SPARAM_ERSTMAX_MASK) #define DB 0x004 #define DB_TARGET_MASK GENMASK(15, 8) #define DB_TARGET(x) (((x) << 8) & DB_TARGET_MASK) @@ -3295,11 +3292,6 @@ static void tegra_xudc_init_event_ring(struct tegra_xudc *xudc) unsigned int i; u32 val; - val = xudc_readl(xudc, SPARAM); - val &= ~(SPARAM_ERSTMAX_MASK); - val |= SPARAM_ERSTMAX(XUDC_NR_EVENT_RINGS); - xudc_writel(xudc, val, SPARAM); - for (i = 0; i < ARRAY_SIZE(xudc->event_ring); i++) { memset(xudc->event_ring[i], 0, XUDC_EVENT_RING_SIZE * sizeof(*xudc->event_ring[i])); -- Gitee From 7b7537b8342690fc80888b2028bc7fa5c0f7587f Mon Sep 17 00:00:00 2001 From: Wayne Chang Date: Thu, 14 Jul 2022 21:57:09 +0800 Subject: [PATCH 199/674] usb: gadget: tegra-xudc: Fix control endpoint's definitions stable inclusion from stable-v5.10.111 commit 9ea17b9f1dd0dd9f458df9460b36d5c7160eb3f1 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=9ea17b9f1dd0dd9f458df9460b36d5c7160eb3f1 -------------------------------- [ Upstream commit 7bd42fb95eb4f98495ccadf467ad15124208ec49 ] According to the Tegra Technical Reference Manual, the seq_num field of control endpoint is not [31:24] but [31:27]. Bit 24 is reserved and bit 26 is splitxstate. The change fixes the wrong control endpoint's definitions. Signed-off-by: Wayne Chang Link: https://lore.kernel.org/r/20220107091349.149798-1-waynec@nvidia.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/usb/gadget/udc/tegra-xudc.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/usb/gadget/udc/tegra-xudc.c b/drivers/usb/gadget/udc/tegra-xudc.c index 1dd0d50c6b56..de178bf264c2 100644 --- a/drivers/usb/gadget/udc/tegra-xudc.c +++ b/drivers/usb/gadget/udc/tegra-xudc.c @@ -272,8 +272,10 @@ BUILD_EP_CONTEXT_RW(deq_hi, deq_hi, 0, 0xffffffff) BUILD_EP_CONTEXT_RW(avg_trb_len, tx_info, 0, 0xffff) BUILD_EP_CONTEXT_RW(max_esit_payload, tx_info, 16, 0xffff) BUILD_EP_CONTEXT_RW(edtla, rsvd[0], 0, 0xffffff) -BUILD_EP_CONTEXT_RW(seq_num, rsvd[0], 24, 0xff) +BUILD_EP_CONTEXT_RW(rsvd, rsvd[0], 24, 0x1) BUILD_EP_CONTEXT_RW(partial_td, rsvd[0], 25, 0x1) +BUILD_EP_CONTEXT_RW(splitxstate, rsvd[0], 26, 0x1) +BUILD_EP_CONTEXT_RW(seq_num, rsvd[0], 27, 0x1f) BUILD_EP_CONTEXT_RW(cerrcnt, rsvd[1], 18, 0x3) BUILD_EP_CONTEXT_RW(data_offset, rsvd[2], 0, 0x1ffff) BUILD_EP_CONTEXT_RW(numtrbs, rsvd[2], 22, 0x1f) @@ -1554,6 +1556,9 @@ static int __tegra_xudc_ep_set_halt(struct tegra_xudc_ep *ep, bool halt) ep_reload(xudc, ep->index); ep_ctx_write_state(ep->context, EP_STATE_RUNNING); + ep_ctx_write_rsvd(ep->context, 0); + ep_ctx_write_partial_td(ep->context, 0); + ep_ctx_write_splitxstate(ep->context, 0); ep_ctx_write_seq_num(ep->context, 0); ep_reload(xudc, ep->index); @@ -2809,7 +2814,10 @@ static void tegra_xudc_reset(struct tegra_xudc *xudc) xudc->setup_seq_num = 0; xudc->queued_setup_packet = false; - ep_ctx_write_seq_num(ep0->context, xudc->setup_seq_num); + ep_ctx_write_rsvd(ep0->context, 0); + ep_ctx_write_partial_td(ep0->context, 0); + ep_ctx_write_splitxstate(ep0->context, 0); + ep_ctx_write_seq_num(ep0->context, 0); deq_ptr = trb_virt_to_phys(ep0, &ep0->transfer_ring[ep0->deq_ptr]); -- Gitee From 7a81c512ceac75cc4f8e13effe8fcf1f5489843b Mon Sep 17 00:00:00 2001 From: Yang Guang Date: Thu, 14 Jul 2022 21:57:10 +0800 Subject: [PATCH 200/674] ptp: replace snprintf with sysfs_emit stable inclusion from stable-v5.10.111 commit 02e2ee8619849f7ac1555d250698fd900ba95d2d category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=02e2ee8619849f7ac1555d250698fd900ba95d2d -------------------------------- [ Upstream commit e2cf07654efb0fd7bbcb475c6f74be7b5755a8fd ] coccinelle report: ./drivers/ptp/ptp_sysfs.c:17:8-16: WARNING: use scnprintf or sprintf ./drivers/ptp/ptp_sysfs.c:390:8-16: WARNING: use scnprintf or sprintf Use sysfs_emit instead of scnprintf or sprintf makes more sense. Reported-by: Zeal Robot Signed-off-by: Yang Guang Signed-off-by: David Yang Acked-by: Richard Cochran Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/ptp/ptp_sysfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/ptp/ptp_sysfs.c b/drivers/ptp/ptp_sysfs.c index be076a91e20e..8cd59e848163 100644 --- a/drivers/ptp/ptp_sysfs.c +++ b/drivers/ptp/ptp_sysfs.c @@ -13,7 +13,7 @@ static ssize_t clock_name_show(struct device *dev, struct device_attribute *attr, char *page) { struct ptp_clock *ptp = dev_get_drvdata(dev); - return snprintf(page, PAGE_SIZE-1, "%s\n", ptp->info->name); + return sysfs_emit(page, "%s\n", ptp->info->name); } static DEVICE_ATTR_RO(clock_name); @@ -227,7 +227,7 @@ static ssize_t ptp_pin_show(struct device *dev, struct device_attribute *attr, mutex_unlock(&ptp->pincfg_mux); - return snprintf(page, PAGE_SIZE, "%u %u\n", func, chan); + return sysfs_emit(page, "%u %u\n", func, chan); } static ssize_t ptp_pin_store(struct device *dev, struct device_attribute *attr, -- Gitee From 52033721eb81fbd50f491b514b080ce6ee5826c2 Mon Sep 17 00:00:00 2001 From: Maxim Kiselev Date: Thu, 14 Jul 2022 21:57:11 +0800 Subject: [PATCH 201/674] powerpc: dts: t104xrdb: fix phy type for FMAN 4/5 stable inclusion from stable-v5.10.111 commit e4d2d72013564231842f87fe8fea5d3e4edecedc category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=e4d2d72013564231842f87fe8fea5d3e4edecedc -------------------------------- [ Upstream commit 17846485dff91acce1ad47b508b633dffc32e838 ] T1040RDB has two RTL8211E-VB phys which requires setting of internal delays for correct work. Changing the phy-connection-type property to `rgmii-id` will fix this issue. Signed-off-by: Maxim Kiselev Reviewed-by: Maxim Kochetkov Reviewed-by: Vladimir Oltean Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211230151123.1258321-1-bigunclemax@gmail.com Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- arch/powerpc/boot/dts/fsl/t104xrdb.dtsi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/boot/dts/fsl/t104xrdb.dtsi b/arch/powerpc/boot/dts/fsl/t104xrdb.dtsi index 099a598c74c0..bfe1ed5be337 100644 --- a/arch/powerpc/boot/dts/fsl/t104xrdb.dtsi +++ b/arch/powerpc/boot/dts/fsl/t104xrdb.dtsi @@ -139,12 +139,12 @@ pca9546@77 { fman@400000 { ethernet@e6000 { phy-handle = <&phy_rgmii_0>; - phy-connection-type = "rgmii"; + phy-connection-type = "rgmii-id"; }; ethernet@e8000 { phy-handle = <&phy_rgmii_1>; - phy-connection-type = "rgmii"; + phy-connection-type = "rgmii-id"; }; mdio0: mdio@fc000 { -- Gitee From bd1eb5982d2ce144afeaac39413bddcee4a37753 Mon Sep 17 00:00:00 2001 From: Venkateswara Naralasetty Date: Thu, 14 Jul 2022 21:57:12 +0800 Subject: [PATCH 202/674] ath11k: fix kernel panic during unload/load ath11k modules stable inclusion from stable-v5.10.111 commit c6a815f5abdf324108799829dd19ea62fef4bf95 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=c6a815f5abdf324108799829dd19ea62fef4bf95 -------------------------------- [ Upstream commit 22b59cb965f79ee1accf83172441c9ca0ecb632a ] Call netif_napi_del() from ath11k_ahb_free_ext_irq() to fix the following kernel panic when unload/load ath11k modules for few iterations. [ 971.201365] Unable to handle kernel paging request at virtual address 6d97a208 [ 971.204227] pgd = 594c2919 [ 971.211478] [6d97a208] *pgd=00000000 [ 971.214120] Internal error: Oops: 5 [#1] PREEMPT SMP ARM [ 971.412024] CPU: 2 PID: 4435 Comm: insmod Not tainted 5.4.89 #0 [ 971.434256] Hardware name: Generic DT based system [ 971.440165] PC is at napi_by_id+0x10/0x40 [ 971.445019] LR is at netif_napi_add+0x160/0x1dc [ 971.743127] (napi_by_id) from [<807d89a0>] (netif_napi_add+0x160/0x1dc) [ 971.751295] (netif_napi_add) from [<7f1209ac>] (ath11k_ahb_config_irq+0xf8/0x414 [ath11k_ahb]) [ 971.759164] (ath11k_ahb_config_irq [ath11k_ahb]) from [<7f12135c>] (ath11k_ahb_probe+0x40c/0x51c [ath11k_ahb]) [ 971.768567] (ath11k_ahb_probe [ath11k_ahb]) from [<80666864>] (platform_drv_probe+0x48/0x94) [ 971.779670] (platform_drv_probe) from [<80664718>] (really_probe+0x1c8/0x450) [ 971.789389] (really_probe) from [<80664cc4>] (driver_probe_device+0x15c/0x1b8) [ 971.797547] (driver_probe_device) from [<80664f60>] (device_driver_attach+0x44/0x60) [ 971.805795] (device_driver_attach) from [<806650a0>] (__driver_attach+0x124/0x140) [ 971.814822] (__driver_attach) from [<80662adc>] (bus_for_each_dev+0x58/0xa4) [ 971.823328] (bus_for_each_dev) from [<80663a2c>] (bus_add_driver+0xf0/0x1e8) [ 971.831662] (bus_add_driver) from [<806658a4>] (driver_register+0xa8/0xf0) [ 971.839822] (driver_register) from [<8030269c>] (do_one_initcall+0x78/0x1ac) [ 971.847638] (do_one_initcall) from [<80392524>] (do_init_module+0x54/0x200) [ 971.855968] (do_init_module) from [<803945b0>] (load_module+0x1e30/0x1ffc) [ 971.864126] (load_module) from [<803948b0>] (sys_init_module+0x134/0x17c) [ 971.871852] (sys_init_module) from [<80301000>] (ret_fast_syscall+0x0/0x50) Tested-on: IPQ8074 hw2.0 AHB WLAN.HK.2.6.0.1-00760-QCAHKSWPL_SILICONZ-1 Signed-off-by: Venkateswara Naralasetty Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/1642583973-21599-1-git-send-email-quic_vnaralas@quicinc.com Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/net/wireless/ath/ath11k/ahb.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/wireless/ath/ath11k/ahb.c b/drivers/net/wireless/ath/ath11k/ahb.c index 9ff6e6853314..190bc5712e96 100644 --- a/drivers/net/wireless/ath/ath11k/ahb.c +++ b/drivers/net/wireless/ath/ath11k/ahb.c @@ -366,6 +366,8 @@ static void ath11k_ahb_free_ext_irq(struct ath11k_base *ab) for (j = 0; j < irq_grp->num_irq; j++) free_irq(ab->irq_num[irq_grp->irqs[j]], irq_grp); + + netif_napi_del(&irq_grp->napi); } } -- Gitee From 7658d7c5aaa3e536aa38714731f06d780041a006 Mon Sep 17 00:00:00 2001 From: Kalle Valo Date: Thu, 14 Jul 2022 21:57:13 +0800 Subject: [PATCH 203/674] ath11k: mhi: use mhi_sync_power_up() stable inclusion from stable-v5.10.111 commit 339bd0b55ecdd0f7f341e9357c4cfde799de9418 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=339bd0b55ecdd0f7f341e9357c4cfde799de9418 -------------------------------- [ Upstream commit 3df6d74aedfdca919cca475d15dfdbc8b05c9e5d ] If amss.bin was missing ath11k would crash during 'rmmod ath11k_pci'. The reason for that was that we were using mhi_async_power_up() which does not check any errors. But mhi_sync_power_up() on the other hand does check for errors so let's use that to fix the crash. I was not able to find a reason why an async version was used. ath11k_mhi_start() (which enables state ATH11K_MHI_POWER_ON) is called from ath11k_hif_power_up(), which can sleep. So sync version should be safe to use here. [ 145.569731] general protection fault, probably for non-canonical address 0xdffffc0000000000: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC KASAN PTI [ 145.569789] KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007] [ 145.569843] CPU: 2 PID: 1628 Comm: rmmod Kdump: loaded Tainted: G W 5.16.0-wt-ath+ #567 [ 145.569898] Hardware name: Intel(R) Client Systems NUC8i7HVK/NUC8i7HVB, BIOS HNKBLi70.86A.0067.2021.0528.1339 05/28/2021 [ 145.569956] RIP: 0010:ath11k_hal_srng_access_begin+0xb5/0x2b0 [ath11k] [ 145.570028] Code: df 48 89 fa 48 c1 ea 03 80 3c 02 00 0f 85 ec 01 00 00 48 8b ab a8 00 00 00 48 b8 00 00 00 00 00 fc ff df 48 89 ea 48 c1 ea 03 <0f> b6 14 02 48 89 e8 83 e0 07 83 c0 03 45 85 ed 75 48 38 d0 7c 08 [ 145.570089] RSP: 0018:ffffc900025d7ac0 EFLAGS: 00010246 [ 145.570144] RAX: dffffc0000000000 RBX: ffff88814fca2dd8 RCX: 1ffffffff50cb455 [ 145.570196] RDX: 0000000000000000 RSI: ffff88814fca2dd8 RDI: ffff88814fca2e80 [ 145.570252] RBP: 0000000000000000 R08: 0000000000000000 R09: ffffffffa8659497 [ 145.570329] R10: fffffbfff50cb292 R11: 0000000000000001 R12: ffff88814fca0000 [ 145.570410] R13: 0000000000000000 R14: ffff88814fca2798 R15: ffff88814fca2dd8 [ 145.570465] FS: 00007fa399988540(0000) GS:ffff888233e00000(0000) knlGS:0000000000000000 [ 145.570519] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 145.570571] CR2: 00007fa399b51421 CR3: 0000000137898002 CR4: 00000000003706e0 [ 145.570623] Call Trace: [ 145.570675] [ 145.570727] ? ath11k_ce_tx_process_cb+0x34b/0x860 [ath11k] [ 145.570797] ath11k_ce_tx_process_cb+0x356/0x860 [ath11k] [ 145.570864] ? tasklet_init+0x150/0x150 [ 145.570919] ? ath11k_ce_alloc_pipes+0x280/0x280 [ath11k] [ 145.570986] ? tasklet_clear_sched+0x42/0xe0 [ 145.571042] ? tasklet_kill+0xe9/0x1b0 [ 145.571095] ? tasklet_clear_sched+0xe0/0xe0 [ 145.571148] ? irq_has_action+0x120/0x120 [ 145.571202] ath11k_ce_cleanup_pipes+0x45a/0x580 [ath11k] [ 145.571270] ? ath11k_pci_stop+0x10e/0x170 [ath11k_pci] [ 145.571345] ath11k_core_stop+0x8a/0xc0 [ath11k] [ 145.571434] ath11k_core_deinit+0x9e/0x150 [ath11k] [ 145.571499] ath11k_pci_remove+0xd2/0x260 [ath11k_pci] [ 145.571553] pci_device_remove+0x9a/0x1c0 [ 145.571605] __device_release_driver+0x332/0x660 [ 145.571659] driver_detach+0x1e7/0x2c0 [ 145.571712] bus_remove_driver+0xe2/0x2d0 [ 145.571772] pci_unregister_driver+0x21/0x250 [ 145.571826] __do_sys_delete_module+0x30a/0x4b0 [ 145.571879] ? free_module+0xac0/0xac0 [ 145.571933] ? lockdep_hardirqs_on_prepare.part.0+0x18c/0x370 [ 145.571986] ? syscall_enter_from_user_mode+0x1d/0x50 [ 145.572039] ? lockdep_hardirqs_on+0x79/0x100 [ 145.572097] do_syscall_64+0x3b/0x90 [ 145.572153] entry_SYSCALL_64_after_hwframe+0x44/0xae Tested-on: WCN6855 hw2.0 PCI WLAN.HSP.1.1-03003-QCAHSPSWPL_V1_V2_SILICONZ_LITE-2 Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20220127090117.2024-2-kvalo@kernel.org Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/net/wireless/ath/ath11k/mhi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath11k/mhi.c b/drivers/net/wireless/ath/ath11k/mhi.c index aded9a719d51..84db9e55c3e7 100644 --- a/drivers/net/wireless/ath/ath11k/mhi.c +++ b/drivers/net/wireless/ath/ath11k/mhi.c @@ -402,7 +402,7 @@ static int ath11k_mhi_set_state(struct ath11k_pci *ab_pci, ret = 0; break; case ATH11K_MHI_POWER_ON: - ret = mhi_async_power_up(ab_pci->mhi_ctrl); + ret = mhi_sync_power_up(ab_pci->mhi_ctrl); break; case ATH11K_MHI_POWER_OFF: mhi_power_down(ab_pci->mhi_ctrl, true); -- Gitee From 4d9dfcd10e53a8fc7e5e6be9a66291b2421f2a3a Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 14 Jul 2022 21:57:14 +0800 Subject: [PATCH 204/674] bpf: Make dst_port field in struct bpf_sock 16-bit wide stable inclusion from stable-v5.10.111 commit 995f517888687c0730bc3d2dbca424c27350eaa7 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=995f517888687c0730bc3d2dbca424c27350eaa7 -------------------------------- [ Upstream commit 4421a582718ab81608d8486734c18083b822390d ] Menglong Dong reports that the documentation for the dst_port field in struct bpf_sock is inaccurate and confusing. From the BPF program PoV, the field is a zero-padded 16-bit integer in network byte order. The value appears to the BPF user as if laid out in memory as so: offsetof(struct bpf_sock, dst_port) + 0 + 8 +16 0x00 +24 0x00 32-, 16-, and 8-bit wide loads from the field are all allowed, but only if the offset into the field is 0. 32-bit wide loads from dst_port are especially confusing. The loaded value, after converting to host byte order with bpf_ntohl(dst_port), contains the port number in the upper 16-bits. Remove the confusion by splitting the field into two 16-bit fields. For backward compatibility, allow 32-bit wide loads from offsetof(struct bpf_sock, dst_port). While at it, allow loads 8-bit loads at offset [0] and [1] from dst_port. Reported-by: Menglong Dong Signed-off-by: Jakub Sitnicki Link: https://lore.kernel.org/r/20220130115518.213259-2-jakub@cloudflare.com Signed-off-by: Alexei Starovoitov Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- include/uapi/linux/bpf.h | 3 ++- net/core/filter.c | 10 +++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6bf4d010222e..8fae845d80e2 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4195,7 +4195,8 @@ struct bpf_sock { __u32 src_ip4; __u32 src_ip6[4]; __u32 src_port; /* host byte order */ - __u32 dst_port; /* network byte order */ + __be16 dst_port; /* network byte order */ + __u16 :16; /* zero padding */ __u32 dst_ip4; __u32 dst_ip6[4]; __u32 state; diff --git a/net/core/filter.c b/net/core/filter.c index 4e06f7331914..6b619ef0dee3 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7789,6 +7789,7 @@ bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); + int field_size; if (off < 0 || off >= sizeof(struct bpf_sock)) return false; @@ -7800,7 +7801,6 @@ bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, case offsetof(struct bpf_sock, family): case offsetof(struct bpf_sock, type): case offsetof(struct bpf_sock, protocol): - case offsetof(struct bpf_sock, dst_port): case offsetof(struct bpf_sock, src_port): case offsetof(struct bpf_sock, rx_queue_mapping): case bpf_ctx_range(struct bpf_sock, src_ip4): @@ -7809,6 +7809,14 @@ bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]): bpf_ctx_record_field_size(info, size_default); return bpf_ctx_narrow_access_ok(off, size, size_default); + case bpf_ctx_range(struct bpf_sock, dst_port): + field_size = size == size_default ? + size_default : sizeof_field(struct bpf_sock, dst_port); + bpf_ctx_record_field_size(info, field_size); + return bpf_ctx_narrow_access_ok(off, size, field_size); + case offsetofend(struct bpf_sock, dst_port) ... + offsetof(struct bpf_sock, dst_ip4) - 1: + return false; } return size == size_default; -- Gitee From ea8d0294b99d34ab6037a80c39e78eaef333e7c6 Mon Sep 17 00:00:00 2001 From: Yang Guang Date: Thu, 14 Jul 2022 21:57:15 +0800 Subject: [PATCH 205/674] scsi: mvsas: Replace snprintf() with sysfs_emit() stable inclusion from stable-v5.10.111 commit ed7db959203e3ef06f7b59dfb504d392842312f3 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=ed7db959203e3ef06f7b59dfb504d392842312f3 -------------------------------- [ Upstream commit 0ad3867b0f13e45cfee5a1298bfd40eef096116c ] coccinelle report: ./drivers/scsi/mvsas/mv_init.c:699:8-16: WARNING: use scnprintf or sprintf ./drivers/scsi/mvsas/mv_init.c:747:8-16: WARNING: use scnprintf or sprintf Use sysfs_emit() instead of scnprintf() or sprintf(). Link: https://lore.kernel.org/r/c1711f7cf251730a8ceb5bdfc313bf85662b3395.1643182948.git.yang.guang5@zte.com.cn Reported-by: Zeal Robot Signed-off-by: Yang Guang Signed-off-by: David Yang Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/scsi/mvsas/mv_init.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/mvsas/mv_init.c b/drivers/scsi/mvsas/mv_init.c index b03c0f35d7b0..0cfea7b2ab13 100644 --- a/drivers/scsi/mvsas/mv_init.c +++ b/drivers/scsi/mvsas/mv_init.c @@ -697,7 +697,7 @@ static ssize_t mvs_show_driver_version(struct device *cdev, struct device_attribute *attr, char *buffer) { - return snprintf(buffer, PAGE_SIZE, "%s\n", DRV_VERSION); + return sysfs_emit(buffer, "%s\n", DRV_VERSION); } static DEVICE_ATTR(driver_version, @@ -749,7 +749,7 @@ mvs_store_interrupt_coalescing(struct device *cdev, static ssize_t mvs_show_interrupt_coalescing(struct device *cdev, struct device_attribute *attr, char *buffer) { - return snprintf(buffer, PAGE_SIZE, "%d\n", interrupt_coalescing); + return sysfs_emit(buffer, "%d\n", interrupt_coalescing); } static DEVICE_ATTR(interrupt_coalescing, -- Gitee From 469563ccf55711bafcb482164c800826a801ab1f Mon Sep 17 00:00:00 2001 From: Yang Guang Date: Thu, 14 Jul 2022 21:57:16 +0800 Subject: [PATCH 206/674] scsi: bfa: Replace snprintf() with sysfs_emit() stable inclusion from stable-v5.10.111 commit de9505936c47df0c1cbee112a48c883322135b54 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=de9505936c47df0c1cbee112a48c883322135b54 -------------------------------- [ Upstream commit 2245ea91fd3a04cafbe2f54911432a8657528c3b ] coccinelle report: ./drivers/scsi/bfa/bfad_attr.c:908:8-16: WARNING: use scnprintf or sprintf ./drivers/scsi/bfa/bfad_attr.c:860:8-16: WARNING: use scnprintf or sprintf ./drivers/scsi/bfa/bfad_attr.c:888:8-16: WARNING: use scnprintf or sprintf ./drivers/scsi/bfa/bfad_attr.c:853:8-16: WARNING: use scnprintf or sprintf ./drivers/scsi/bfa/bfad_attr.c:808:8-16: WARNING: use scnprintf or sprintf ./drivers/scsi/bfa/bfad_attr.c:728:8-16: WARNING: use scnprintf or sprintf ./drivers/scsi/bfa/bfad_attr.c:822:8-16: WARNING: use scnprintf or sprintf ./drivers/scsi/bfa/bfad_attr.c:927:9-17: WARNING: use scnprintf or sprintf ./drivers/scsi/bfa/bfad_attr.c:900:8-16: WARNING: use scnprintf or sprintf ./drivers/scsi/bfa/bfad_attr.c:874:8-16: WARNING: use scnprintf or sprintf ./drivers/scsi/bfa/bfad_attr.c:714:8-16: WARNING: use scnprintf or sprintf ./drivers/scsi/bfa/bfad_attr.c:839:8-16: WARNING: use scnprintf or sprintf Use sysfs_emit() instead of scnprintf() or sprintf(). Link: https://lore.kernel.org/r/def83ff75faec64ba592b867a8499b1367bae303.1643181468.git.yang.guang5@zte.com.cn Reported-by: Zeal Robot Signed-off-by: Yang Guang Signed-off-by: David Yang Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/scsi/bfa/bfad_attr.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/scsi/bfa/bfad_attr.c b/drivers/scsi/bfa/bfad_attr.c index 5ae1e3f78910..e049cdb3c286 100644 --- a/drivers/scsi/bfa/bfad_attr.c +++ b/drivers/scsi/bfa/bfad_attr.c @@ -711,7 +711,7 @@ bfad_im_serial_num_show(struct device *dev, struct device_attribute *attr, char serial_num[BFA_ADAPTER_SERIAL_NUM_LEN]; bfa_get_adapter_serial_num(&bfad->bfa, serial_num); - return snprintf(buf, PAGE_SIZE, "%s\n", serial_num); + return sysfs_emit(buf, "%s\n", serial_num); } static ssize_t @@ -725,7 +725,7 @@ bfad_im_model_show(struct device *dev, struct device_attribute *attr, char model[BFA_ADAPTER_MODEL_NAME_LEN]; bfa_get_adapter_model(&bfad->bfa, model); - return snprintf(buf, PAGE_SIZE, "%s\n", model); + return sysfs_emit(buf, "%s\n", model); } static ssize_t @@ -805,7 +805,7 @@ bfad_im_model_desc_show(struct device *dev, struct device_attribute *attr, snprintf(model_descr, BFA_ADAPTER_MODEL_DESCR_LEN, "Invalid Model"); - return snprintf(buf, PAGE_SIZE, "%s\n", model_descr); + return sysfs_emit(buf, "%s\n", model_descr); } static ssize_t @@ -819,7 +819,7 @@ bfad_im_node_name_show(struct device *dev, struct device_attribute *attr, u64 nwwn; nwwn = bfa_fcs_lport_get_nwwn(port->fcs_port); - return snprintf(buf, PAGE_SIZE, "0x%llx\n", cpu_to_be64(nwwn)); + return sysfs_emit(buf, "0x%llx\n", cpu_to_be64(nwwn)); } static ssize_t @@ -836,7 +836,7 @@ bfad_im_symbolic_name_show(struct device *dev, struct device_attribute *attr, bfa_fcs_lport_get_attr(&bfad->bfa_fcs.fabric.bport, &port_attr); strlcpy(symname, port_attr.port_cfg.sym_name.symname, BFA_SYMNAME_MAXLEN); - return snprintf(buf, PAGE_SIZE, "%s\n", symname); + return sysfs_emit(buf, "%s\n", symname); } static ssize_t @@ -850,14 +850,14 @@ bfad_im_hw_version_show(struct device *dev, struct device_attribute *attr, char hw_ver[BFA_VERSION_LEN]; bfa_get_pci_chip_rev(&bfad->bfa, hw_ver); - return snprintf(buf, PAGE_SIZE, "%s\n", hw_ver); + return sysfs_emit(buf, "%s\n", hw_ver); } static ssize_t bfad_im_drv_version_show(struct device *dev, struct device_attribute *attr, char *buf) { - return snprintf(buf, PAGE_SIZE, "%s\n", BFAD_DRIVER_VERSION); + return sysfs_emit(buf, "%s\n", BFAD_DRIVER_VERSION); } static ssize_t @@ -871,7 +871,7 @@ bfad_im_optionrom_version_show(struct device *dev, char optrom_ver[BFA_VERSION_LEN]; bfa_get_adapter_optrom_ver(&bfad->bfa, optrom_ver); - return snprintf(buf, PAGE_SIZE, "%s\n", optrom_ver); + return sysfs_emit(buf, "%s\n", optrom_ver); } static ssize_t @@ -885,7 +885,7 @@ bfad_im_fw_version_show(struct device *dev, struct device_attribute *attr, char fw_ver[BFA_VERSION_LEN]; bfa_get_adapter_fw_ver(&bfad->bfa, fw_ver); - return snprintf(buf, PAGE_SIZE, "%s\n", fw_ver); + return sysfs_emit(buf, "%s\n", fw_ver); } static ssize_t @@ -897,7 +897,7 @@ bfad_im_num_of_ports_show(struct device *dev, struct device_attribute *attr, (struct bfad_im_port_s *) shost->hostdata[0]; struct bfad_s *bfad = im_port->bfad; - return snprintf(buf, PAGE_SIZE, "%d\n", + return sysfs_emit(buf, "%d\n", bfa_get_nports(&bfad->bfa)); } @@ -905,7 +905,7 @@ static ssize_t bfad_im_drv_name_show(struct device *dev, struct device_attribute *attr, char *buf) { - return snprintf(buf, PAGE_SIZE, "%s\n", BFAD_DRIVER_NAME); + return sysfs_emit(buf, "%s\n", BFAD_DRIVER_NAME); } static ssize_t @@ -924,14 +924,14 @@ bfad_im_num_of_discovered_ports_show(struct device *dev, rports = kcalloc(nrports, sizeof(struct bfa_rport_qualifier_s), GFP_ATOMIC); if (rports == NULL) - return snprintf(buf, PAGE_SIZE, "Failed\n"); + return sysfs_emit(buf, "Failed\n"); spin_lock_irqsave(&bfad->bfad_lock, flags); bfa_fcs_lport_get_rport_quals(port->fcs_port, rports, &nrports); spin_unlock_irqrestore(&bfad->bfad_lock, flags); kfree(rports); - return snprintf(buf, PAGE_SIZE, "%d\n", nrports); + return sysfs_emit(buf, "%d\n", nrports); } static DEVICE_ATTR(serial_number, S_IRUGO, -- Gitee From f6cd84e06dbb2c9600dc4813556e3c7713fe786a Mon Sep 17 00:00:00 2001 From: Evgeny Boger Date: Thu, 14 Jul 2022 21:57:17 +0800 Subject: [PATCH 207/674] power: supply: axp20x_battery: properly report current when discharging stable inclusion from stable-v5.10.111 commit b42b6d0ec358432ef13d1e3ca4d33cd837af6f8a category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=b42b6d0ec358432ef13d1e3ca4d33cd837af6f8a -------------------------------- [ Upstream commit d4f408cdcd26921c1268cb8dcbe8ffb6faf837f3 ] As stated in [1], negative current values are used for discharging batteries. AXP PMICs internally have two different ADC channels for shunt current measurement: one used during charging and one during discharging. The values reported by these ADCs are unsigned. While the driver properly selects ADC channel to get the data from, it doesn't apply negative sign when reporting discharging current. [1] Documentation/ABI/testing/sysfs-class-power Signed-off-by: Evgeny Boger Acked-by: Chen-Yu Tsai Signed-off-by: Sebastian Reichel Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/power/supply/axp20x_battery.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/power/supply/axp20x_battery.c b/drivers/power/supply/axp20x_battery.c index e84b6e4da14a..9fda98b950ba 100644 --- a/drivers/power/supply/axp20x_battery.c +++ b/drivers/power/supply/axp20x_battery.c @@ -185,7 +185,6 @@ static int axp20x_battery_get_prop(struct power_supply *psy, union power_supply_propval *val) { struct axp20x_batt_ps *axp20x_batt = power_supply_get_drvdata(psy); - struct iio_channel *chan; int ret = 0, reg, val1; switch (psp) { @@ -265,12 +264,12 @@ static int axp20x_battery_get_prop(struct power_supply *psy, if (ret) return ret; - if (reg & AXP20X_PWR_STATUS_BAT_CHARGING) - chan = axp20x_batt->batt_chrg_i; - else - chan = axp20x_batt->batt_dischrg_i; - - ret = iio_read_channel_processed(chan, &val->intval); + if (reg & AXP20X_PWR_STATUS_BAT_CHARGING) { + ret = iio_read_channel_processed(axp20x_batt->batt_chrg_i, &val->intval); + } else { + ret = iio_read_channel_processed(axp20x_batt->batt_dischrg_i, &val1); + val->intval = -val1; + } if (ret) return ret; -- Gitee From 585eb591b678419b3df7177cb4ea29c2152ed982 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Thu, 14 Jul 2022 21:57:18 +0800 Subject: [PATCH 208/674] mt76: dma: initialize skip_unmap in mt76_dma_rx_fill stable inclusion from stable-v5.10.111 commit 9a56e2b271bc65116d1e44454b244b18d289a26b category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=9a56e2b271bc65116d1e44454b244b18d289a26b -------------------------------- [ Upstream commit 577298ec55dfc8b9aece54520f0258c3f93a6573 ] Even if it is only a false-positive since skip_buf0/skip_buf1 are only used in mt76_dma_tx_cleanup_idx routine, initialize skip_unmap in mt76_dma_rx_fill in order to fix the following UBSAN report: [ 13.924906] UBSAN: invalid-load in linux-5.15.0/drivers/net/wireless/mediatek/mt76/dma.c:162:13 [ 13.924909] load of value 225 is not a valid value for type '_Bool' [ 13.924912] CPU: 9 PID: 672 Comm: systemd-udevd Not tainted 5.15.0-18-generic #18-Ubuntu [ 13.924914] Hardware name: LENOVO 21A0000CMX/21A0000CMX, BIOS R1MET43W (1.13 ) 11/05/2021 [ 13.924915] Call Trace: [ 13.924917] [ 13.924920] show_stack+0x52/0x58 [ 13.924925] dump_stack_lvl+0x4a/0x5f [ 13.924931] dump_stack+0x10/0x12 [ 13.924932] ubsan_epilogue+0x9/0x45 [ 13.924934] __ubsan_handle_load_invalid_value.cold+0x44/0x49 [ 13.924935] ? __iommu_dma_map+0x84/0xf0 [ 13.924939] mt76_dma_add_buf.constprop.0.cold+0x23/0x85 [mt76] [ 13.924949] mt76_dma_rx_fill.isra.0+0x102/0x1f0 [mt76] [ 13.924954] mt76_dma_init+0xc9/0x150 [mt76] [ 13.924959] ? mt7921_dma_enable+0x110/0x110 [mt7921e] [ 13.924966] mt7921_dma_init+0x1e3/0x260 [mt7921e] [ 13.924970] mt7921_register_device+0x29d/0x510 [mt7921e] [ 13.924975] mt7921_pci_probe.part.0+0x17f/0x1b0 [mt7921e] [ 13.924980] mt7921_pci_probe+0x43/0x60 [mt7921e] [ 13.924984] local_pci_probe+0x4b/0x90 [ 13.924987] pci_device_probe+0x115/0x1f0 [ 13.924989] really_probe+0x21e/0x420 [ 13.924992] __driver_probe_device+0x115/0x190 [ 13.924994] driver_probe_device+0x23/0xc0 [ 13.924996] __driver_attach+0xbd/0x1d0 [ 13.924998] ? __device_attach_driver+0x110/0x110 [ 13.924999] bus_for_each_dev+0x7e/0xc0 [ 13.925001] driver_attach+0x1e/0x20 [ 13.925003] bus_add_driver+0x135/0x200 [ 13.925005] driver_register+0x95/0xf0 [ 13.925008] ? 0xffffffffc0766000 [ 13.925010] __pci_register_driver+0x68/0x70 [ 13.925011] mt7921_pci_driver_init+0x23/0x1000 [mt7921e] [ 13.925015] do_one_initcall+0x48/0x1d0 [ 13.925019] ? kmem_cache_alloc_trace+0x19e/0x2e0 [ 13.925022] do_init_module+0x62/0x280 [ 13.925025] load_module+0xac9/0xbb0 [ 13.925027] __do_sys_finit_module+0xbf/0x120 [ 13.925029] __x64_sys_finit_module+0x18/0x20 [ 13.925030] do_syscall_64+0x5c/0xc0 [ 13.925033] ? do_syscall_64+0x69/0xc0 [ 13.925034] ? sysvec_reschedule_ipi+0x78/0xe0 [ 13.925036] ? asm_sysvec_reschedule_ipi+0xa/0x20 [ 13.925039] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 13.925040] RIP: 0033:0x7fbf2b90f94d [ 13.925045] RSP: 002b:00007ffe2ec7e5d8 EFLAGS: 00000246 ORIG_RAX: 0000000000000139 [ 13.925047] RAX: ffffffffffffffda RBX: 000056106b0634e0 RCX: 00007fbf2b90f94d [ 13.925048] RDX: 0000000000000000 RSI: 00007fbf2baa3441 RDI: 0000000000000013 [ 13.925049] RBP: 0000000000020000 R08: 0000000000000000 R09: 0000000000000002 [ 13.925050] R10: 0000000000000013 R11: 0000000000000246 R12: 00007fbf2baa3441 [ 13.925051] R13: 000056106b062620 R14: 000056106b0610c0 R15: 000056106b0640d0 [ 13.925053] Signed-off-by: Lorenzo Bianconi Signed-off-by: Felix Fietkau Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/net/wireless/mediatek/mt76/dma.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/mediatek/mt76/dma.c b/drivers/net/wireless/mediatek/mt76/dma.c index 0fdfead45c77..f01b455783b2 100644 --- a/drivers/net/wireless/mediatek/mt76/dma.c +++ b/drivers/net/wireless/mediatek/mt76/dma.c @@ -455,6 +455,7 @@ mt76_dma_rx_fill(struct mt76_dev *dev, struct mt76_queue *q) qbuf.addr = addr + offset; qbuf.len = len - offset; + qbuf.skip_unmap = false; mt76_dma_add_buf(dev, q, &qbuf, 1, 0, buf, NULL); frames++; } -- Gitee From 80972fc714726cd3abe83b6d88099ef435f5332a Mon Sep 17 00:00:00 2001 From: Avraham Stern Date: Thu, 14 Jul 2022 21:57:19 +0800 Subject: [PATCH 209/674] cfg80211: don't add non transmitted BSS to 6GHz scanned channels stable inclusion from stable-v5.10.111 commit 26a1e4739e442da11607f0a757cb22a653e595ec category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=26a1e4739e442da11607f0a757cb22a653e595ec -------------------------------- [ Upstream commit 5666ee154f4696c011dfa8544aaf5591b6b87515 ] When adding 6GHz channels to scan request based on reported co-located APs, don't add channels that have only APs with "non-transmitted" BSSes if they only match the wildcard SSID since they will be found by probing the "transmitted" BSS. Signed-off-by: Avraham Stern Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20220202104617.f6ddf099f934.I231e55885d3644f292d00dfe0f42653269f2559e@changeid Signed-off-by: Johannes Berg Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- net/wireless/scan.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/net/wireless/scan.c b/net/wireless/scan.c index fd614a5a00b4..c1b2655682a8 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -702,8 +702,12 @@ static bool cfg80211_find_ssid_match(struct cfg80211_colocated_ap *ap, for (i = 0; i < request->n_ssids; i++) { /* wildcard ssid in the scan request */ - if (!request->ssids[i].ssid_len) + if (!request->ssids[i].ssid_len) { + if (ap->multi_bss && !ap->transmitted_bssid) + continue; + return true; + } if (ap->ssid_len && ap->ssid_len == request->ssids[i].ssid_len) { @@ -830,6 +834,9 @@ static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev) !cfg80211_find_ssid_match(ap, request)) continue; + if (!request->n_ssids && ap->multi_bss && !ap->transmitted_bssid) + continue; + cfg80211_scan_req_add_chan(request, chan, true); memcpy(scan_6ghz_params->bssid, ap->bssid, ETH_ALEN); scan_6ghz_params->short_ssid = ap->short_ssid; -- Gitee From e2e71220c8cafedebdbd20c69f91c7f71e56332c Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 14 Jul 2022 21:57:20 +0800 Subject: [PATCH 210/674] libbpf: Fix build issue with llvm-readelf stable inclusion from stable-v5.10.111 commit 5baf92a2c46c543ddcc2e3b1a96cd67a10f7e7fd category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=5baf92a2c46c543ddcc2e3b1a96cd67a10f7e7fd -------------------------------- [ Upstream commit 0908a66ad1124c1634c33847ac662106f7f2c198 ] There are cases where clang compiler is packaged in a way readelf is a symbolic link to llvm-readelf. In such cases, llvm-readelf will be used instead of default binutils readelf, and the following error will appear during libbpf build: # Warning: Num of global symbols in # /home/yhs/work/bpf-next/tools/testing/selftests/bpf/tools/build/libbpf/sharedobjs/libbpf-in.o (367) # does NOT match with num of versioned symbols in # /home/yhs/work/bpf-next/tools/testing/selftests/bpf/tools/build/libbpf/libbpf.so libbpf.map (383). # Please make sure all LIBBPF_API symbols are versioned in libbpf.map. # --- /home/yhs/work/bpf-next/tools/testing/selftests/bpf/tools/build/libbpf/libbpf_global_syms.tmp ... # +++ /home/yhs/work/bpf-next/tools/testing/selftests/bpf/tools/build/libbpf/libbpf_versioned_syms.tmp ... # @@ -324,6 +324,22 @@ # btf__str_by_offset # btf__type_by_id # btf__type_cnt # +LIBBPF_0.0.1 # +LIBBPF_0.0.2 # +LIBBPF_0.0.3 # +LIBBPF_0.0.4 # +LIBBPF_0.0.5 # +LIBBPF_0.0.6 # +LIBBPF_0.0.7 # +LIBBPF_0.0.8 # +LIBBPF_0.0.9 # +LIBBPF_0.1.0 # +LIBBPF_0.2.0 # +LIBBPF_0.3.0 # +LIBBPF_0.4.0 # +LIBBPF_0.5.0 # +LIBBPF_0.6.0 # +LIBBPF_0.7.0 # libbpf_attach_type_by_name # libbpf_find_kernel_btf # libbpf_find_vmlinux_btf_id # make[2]: *** [Makefile:184: check_abi] Error 1 # make[1]: *** [Makefile:140: all] Error 2 The above failure is due to different printouts for some ABS versioned symbols. For example, with the same libbpf.so, $ /bin/readelf --dyn-syms --wide tools/lib/bpf/libbpf.so | grep "LIBBPF" | grep ABS 134: 0000000000000000 0 OBJECT GLOBAL DEFAULT ABS LIBBPF_0.5.0 202: 0000000000000000 0 OBJECT GLOBAL DEFAULT ABS LIBBPF_0.6.0 ... $ /opt/llvm/bin/readelf --dyn-syms --wide tools/lib/bpf/libbpf.so | grep "LIBBPF" | grep ABS 134: 0000000000000000 0 OBJECT GLOBAL DEFAULT ABS LIBBPF_0.5.0@@LIBBPF_0.5.0 202: 0000000000000000 0 OBJECT GLOBAL DEFAULT ABS LIBBPF_0.6.0@@LIBBPF_0.6.0 ... The binutils readelf doesn't print out the symbol LIBBPF_* version and llvm-readelf does. Such a difference caused libbpf build failure with llvm-readelf. The proposed fix filters out all ABS symbols as they are not part of the comparison. This works for both binutils readelf and llvm-readelf. Reported-by: Delyan Kratunov Signed-off-by: Yonghong Song Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220204214355.502108-1-yhs@fb.com Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- tools/lib/bpf/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index 154b75fc1373..f2a353bba25f 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -147,7 +147,7 @@ GLOBAL_SYM_COUNT = $(shell readelf -s --wide $(BPF_IN_SHARED) | \ sort -u | wc -l) VERSIONED_SYM_COUNT = $(shell readelf --dyn-syms --wide $(OUTPUT)libbpf.so | \ sed 's/\[.*\]//' | \ - awk '/GLOBAL/ && /DEFAULT/ && !/UND/ {print $$NF}' | \ + awk '/GLOBAL/ && /DEFAULT/ && !/UND|ABS/ {print $$NF}' | \ grep -Eo '[^ ]+@LIBBPF_' | cut -d@ -f1 | sort -u | wc -l) CMD_TARGETS = $(LIB_TARGET) $(PC_FILE) @@ -216,7 +216,7 @@ check_abi: $(OUTPUT)libbpf.so $(VERSION_SCRIPT) sort -u > $(OUTPUT)libbpf_global_syms.tmp; \ readelf --dyn-syms --wide $(OUTPUT)libbpf.so | \ sed 's/\[.*\]//' | \ - awk '/GLOBAL/ && /DEFAULT/ && !/UND/ {print $$NF}'| \ + awk '/GLOBAL/ && /DEFAULT/ && !/UND|ABS/ {print $$NF}'| \ grep -Eo '[^ ]+@LIBBPF_' | cut -d@ -f1 | \ sort -u > $(OUTPUT)libbpf_versioned_syms.tmp; \ diff -u $(OUTPUT)libbpf_global_syms.tmp \ -- Gitee From c04876da0c5e08d43a22fd7a60669284ac253472 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 14 Jul 2022 21:57:21 +0800 Subject: [PATCH 211/674] ipv6: make mc_forwarding atomic stable inclusion from stable-v5.10.111 commit fb5ac62fbe16dcc273e90bec99ce637ff2bd60c3 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=fb5ac62fbe16dcc273e90bec99ce637ff2bd60c3 -------------------------------- [ Upstream commit 145c7a793838add5e004e7d49a67654dc7eba147 ] This fixes minor data-races in ip6_mc_input() and batadv_mcast_mla_rtr_flags_softif_get_ipv6() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- include/linux/ipv6.h | 2 +- net/batman-adv/multicast.c | 2 +- net/ipv6/addrconf.c | 4 ++-- net/ipv6/ip6_input.c | 2 +- net/ipv6/ip6mr.c | 8 ++++---- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 0e6f5d2785e9..f4e1790a5eed 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -51,7 +51,7 @@ struct ipv6_devconf { __s32 use_optimistic; #endif #ifdef CONFIG_IPV6_MROUTE - __s32 mc_forwarding; + atomic_t mc_forwarding; #endif __s32 disable_ipv6; __s32 drop_unicast_in_l2_multicast; diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index 139894ca788b..c8a341cd652c 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -136,7 +136,7 @@ static u8 batadv_mcast_mla_rtr_flags_softif_get_ipv6(struct net_device *dev) { struct inet6_dev *in6_dev = __in6_dev_get(dev); - if (in6_dev && in6_dev->cnf.mc_forwarding) + if (in6_dev && atomic_read(&in6_dev->cnf.mc_forwarding)) return BATADV_NO_FLAGS; else return BATADV_MCAST_WANT_NO_RTR6; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 1064edea8841..4e24f3f7595c 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -546,7 +546,7 @@ static int inet6_netconf_fill_devconf(struct sk_buff *skb, int ifindex, #ifdef CONFIG_IPV6_MROUTE if ((all || type == NETCONFA_MC_FORWARDING) && nla_put_s32(skb, NETCONFA_MC_FORWARDING, - devconf->mc_forwarding) < 0) + atomic_read(&devconf->mc_forwarding)) < 0) goto nla_put_failure; #endif if ((all || type == NETCONFA_PROXY_NEIGH) && @@ -5519,7 +5519,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_USE_OPTIMISTIC] = cnf->use_optimistic; #endif #ifdef CONFIG_IPV6_MROUTE - array[DEVCONF_MC_FORWARDING] = cnf->mc_forwarding; + array[DEVCONF_MC_FORWARDING] = atomic_read(&cnf->mc_forwarding); #endif array[DEVCONF_DISABLE_IPV6] = cnf->disable_ipv6; array[DEVCONF_ACCEPT_DAD] = cnf->accept_dad; diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 06d60662717d..15ea3d082534 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -509,7 +509,7 @@ int ip6_mc_input(struct sk_buff *skb) /* * IPv6 multicast router mode is now supported ;) */ - if (dev_net(skb->dev)->ipv6.devconf_all->mc_forwarding && + if (atomic_read(&dev_net(skb->dev)->ipv6.devconf_all->mc_forwarding) && !(ipv6_addr_type(&hdr->daddr) & (IPV6_ADDR_LOOPBACK|IPV6_ADDR_LINKLOCAL)) && likely(!(IP6CB(skb)->flags & IP6SKB_FORWARDED))) { diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 41cb348a7c3c..5f0ac47acc74 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -740,7 +740,7 @@ static int mif6_delete(struct mr_table *mrt, int vifi, int notify, in6_dev = __in6_dev_get(dev); if (in6_dev) { - in6_dev->cnf.mc_forwarding--; + atomic_dec(&in6_dev->cnf.mc_forwarding); inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, dev->ifindex, &in6_dev->cnf); @@ -908,7 +908,7 @@ static int mif6_add(struct net *net, struct mr_table *mrt, in6_dev = __in6_dev_get(dev); if (in6_dev) { - in6_dev->cnf.mc_forwarding++; + atomic_inc(&in6_dev->cnf.mc_forwarding); inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, dev->ifindex, &in6_dev->cnf); @@ -1558,7 +1558,7 @@ static int ip6mr_sk_init(struct mr_table *mrt, struct sock *sk) } else { rcu_assign_pointer(mrt->mroute_sk, sk); sock_set_flag(sk, SOCK_RCU_FREE); - net->ipv6.devconf_all->mc_forwarding++; + atomic_inc(&net->ipv6.devconf_all->mc_forwarding); } write_unlock_bh(&mrt_lock); @@ -1591,7 +1591,7 @@ int ip6mr_sk_done(struct sock *sk) * so the RCU grace period before sk freeing * is guaranteed by sk_destruct() */ - net->ipv6.devconf_all->mc_forwarding--; + atomic_dec(&net->ipv6.devconf_all->mc_forwarding); write_unlock_bh(&mrt_lock); inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, -- Gitee From 1191f146f6e5da5a9762a94facb310329c31205f Mon Sep 17 00:00:00 2001 From: Zheng Zengkai Date: Thu, 14 Jul 2022 21:57:22 +0800 Subject: [PATCH 212/674] ipv6: fix kabi for mc_forwarding in struct ipv6_devconf hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z CVE: NA --------------------------------------- Making mc_forwarding atomic breaks the KABI of struct ipv6_devconf. This patch uses KABI_REPLACE to fix it. Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- include/linux/ipv6.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index f4e1790a5eed..210402b26a20 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -51,7 +51,7 @@ struct ipv6_devconf { __s32 use_optimistic; #endif #ifdef CONFIG_IPV6_MROUTE - atomic_t mc_forwarding; + KABI_REPLACE(__s32 mc_forwarding, atomic_t mc_forwarding) #endif __s32 disable_ipv6; __s32 drop_unicast_in_l2_multicast; -- Gitee From 4c98b9576c915d4d0f420f2db10f34c13a7c3d8b Mon Sep 17 00:00:00 2001 From: Sourabh Jain Date: Thu, 14 Jul 2022 21:57:23 +0800 Subject: [PATCH 213/674] powerpc: Set crashkernel offset to mid of RMA region stable inclusion from stable-v5.10.111 commit ea21eaea7f5f368d710acb3e4dd51d14410d5b49 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=ea21eaea7f5f368d710acb3e4dd51d14410d5b49 -------------------------------- [ Upstream commit 7c5ed82b800d8615cdda00729e7b62e5899f0b13 ] On large config LPARs (having 192 and more cores), Linux fails to boot due to insufficient memory in the first memblock. It is due to the memory reservation for the crash kernel which starts at 128MB offset of the first memblock. This memory reservation for the crash kernel doesn't leave enough space in the first memblock to accommodate other essential system resources. The crash kernel start address was set to 128MB offset by default to ensure that the crash kernel get some memory below the RMA region which is used to be of size 256MB. But given that the RMA region size can be 512MB or more, setting the crash kernel offset to mid of RMA size will leave enough space for the kernel to allocate memory for other system resources. Since the above crash kernel offset change is only applicable to the LPAR platform, the LPAR feature detection is pushed before the crash kernel reservation. The rest of LPAR specific initialization will still be done during pseries_probe_fw_features as usual. This patch is dependent on changes to paca allocation for boot CPU. It expect boot CPU to discover 1T segment support which is introduced by the patch posted here: https://lists.ozlabs.org/pipermail/linuxppc-dev/2022-January/239175.html Reported-by: Abdul haleem Signed-off-by: Sourabh Jain Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220204085601.107257-1-sourabhjain@linux.ibm.com Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- arch/powerpc/kernel/rtas.c | 6 ++++++ arch/powerpc/kexec/core.c | 15 +++++++++++---- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index cccb32cf0e08..cf421eb7f90d 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -1296,6 +1296,12 @@ int __init early_init_dt_scan_rtas(unsigned long node, entryp = of_get_flat_dt_prop(node, "linux,rtas-entry", NULL); sizep = of_get_flat_dt_prop(node, "rtas-size", NULL); +#ifdef CONFIG_PPC64 + /* need this feature to decide the crashkernel offset */ + if (of_get_flat_dt_prop(node, "ibm,hypertas-functions", NULL)) + powerpc_firmware_features |= FW_FEATURE_LPAR; +#endif + if (basep && entryp && sizep) { rtas.base = *basep; rtas.entry = *entryp; diff --git a/arch/powerpc/kexec/core.c b/arch/powerpc/kexec/core.c index 56da5eb2b923..80c79cb5010c 100644 --- a/arch/powerpc/kexec/core.c +++ b/arch/powerpc/kexec/core.c @@ -147,11 +147,18 @@ void __init reserve_crashkernel(void) if (!crashk_res.start) { #ifdef CONFIG_PPC64 /* - * On 64bit we split the RMO in half but cap it at half of - * a small SLB (128MB) since the crash kernel needs to place - * itself and some stacks to be in the first segment. + * On the LPAR platform place the crash kernel to mid of + * RMA size (512MB or more) to ensure the crash kernel + * gets enough space to place itself and some stack to be + * in the first segment. At the same time normal kernel + * also get enough space to allocate memory for essential + * system resource in the first segment. Keep the crash + * kernel starts at 128MB offset on other platforms. */ - crashk_res.start = min(0x8000000ULL, (ppc64_rma_size / 2)); + if (firmware_has_feature(FW_FEATURE_LPAR)) + crashk_res.start = ppc64_rma_size / 2; + else + crashk_res.start = min(0x8000000ULL, (ppc64_rma_size / 2)); #else crashk_res.start = KDUMP_KERNELBASE; #endif -- Gitee From 613db224006696c7a210a0f850f37d8b4ad88fad Mon Sep 17 00:00:00 2001 From: Rajneesh Bhardwaj Date: Thu, 14 Jul 2022 21:57:24 +0800 Subject: [PATCH 214/674] drm/amdgpu: Fix recursive locking warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from stable-v5.10.111 commit 6694b8643bde4b940f6b410507960793b922a77d category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=6694b8643bde4b940f6b410507960793b922a77d -------------------------------- [ Upstream commit 447c7997b62a5115ba4da846dcdee4fc12298a6a ] Noticed the below warning while running a pytorch workload on vega10 GPUs. Change to trylock to avoid conflicts with already held reservation locks. [ +0.000003] WARNING: possible recursive locking detected [ +0.000003] 5.13.0-kfd-rajneesh #1030 Not tainted [ +0.000004] -------------------------------------------- [ +0.000002] python/4822 is trying to acquire lock: [ +0.000004] ffff932cd9a259f8 (reservation_ww_class_mutex){+.+.}-{3:3}, at: amdgpu_bo_release_notify+0xc4/0x160 [amdgpu] [ +0.000203] but task is already holding lock: [ +0.000003] ffff932cbb7181f8 (reservation_ww_class_mutex){+.+.}-{3:3}, at: ttm_eu_reserve_buffers+0x270/0x470 [ttm] [ +0.000017] other info that might help us debug this: [ +0.000002] Possible unsafe locking scenario: [ +0.000003] CPU0 [ +0.000002] ---- [ +0.000002] lock(reservation_ww_class_mutex); [ +0.000004] lock(reservation_ww_class_mutex); [ +0.000003] *** DEADLOCK *** [ +0.000002] May be due to missing lock nesting notation [ +0.000003] 7 locks held by python/4822: [ +0.000003] #0: ffff932c4ac028d0 (&process->mutex){+.+.}-{3:3}, at: kfd_ioctl_map_memory_to_gpu+0x10b/0x320 [amdgpu] [ +0.000232] #1: ffff932c55e830a8 (&info->lock#2){+.+.}-{3:3}, at: amdgpu_amdkfd_gpuvm_map_memory_to_gpu+0x64/0xf60 [amdgpu] [ +0.000241] #2: ffff932cc45b5e68 (&(*mem)->lock){+.+.}-{3:3}, at: amdgpu_amdkfd_gpuvm_map_memory_to_gpu+0xdf/0xf60 [amdgpu] [ +0.000236] #3: ffffb2b35606fd28 (reservation_ww_class_acquire){+.+.}-{0:0}, at: amdgpu_amdkfd_gpuvm_map_memory_to_gpu+0x232/0xf60 [amdgpu] [ +0.000235] #4: ffff932cbb7181f8 (reservation_ww_class_mutex){+.+.}-{3:3}, at: ttm_eu_reserve_buffers+0x270/0x470 [ttm] [ +0.000015] #5: ffffffffc045f700 (*(sspp++)){....}-{0:0}, at: drm_dev_enter+0x5/0xa0 [drm] [ +0.000038] #6: ffff932c52da7078 (&vm->eviction_lock){+.+.}-{3:3}, at: amdgpu_vm_bo_update_mapping+0xd5/0x4f0 [amdgpu] [ +0.000195] stack backtrace: [ +0.000003] CPU: 11 PID: 4822 Comm: python Not tainted 5.13.0-kfd-rajneesh #1030 [ +0.000005] Hardware name: GIGABYTE MZ01-CE0-00/MZ01-CE0-00, BIOS F02 08/29/2018 [ +0.000003] Call Trace: [ +0.000003] dump_stack+0x6d/0x89 [ +0.000010] __lock_acquire+0xb93/0x1a90 [ +0.000009] lock_acquire+0x25d/0x2d0 [ +0.000005] ? amdgpu_bo_release_notify+0xc4/0x160 [amdgpu] [ +0.000184] ? lock_is_held_type+0xa2/0x110 [ +0.000006] ? amdgpu_bo_release_notify+0xc4/0x160 [amdgpu] [ +0.000184] __ww_mutex_lock.constprop.17+0xca/0x1060 [ +0.000007] ? amdgpu_bo_release_notify+0xc4/0x160 [amdgpu] [ +0.000183] ? lock_release+0x13f/0x270 [ +0.000005] ? lock_is_held_type+0xa2/0x110 [ +0.000006] ? amdgpu_bo_release_notify+0xc4/0x160 [amdgpu] [ +0.000183] amdgpu_bo_release_notify+0xc4/0x160 [amdgpu] [ +0.000185] ttm_bo_release+0x4c6/0x580 [ttm] [ +0.000010] amdgpu_bo_unref+0x1a/0x30 [amdgpu] [ +0.000183] amdgpu_vm_free_table+0x76/0xa0 [amdgpu] [ +0.000189] amdgpu_vm_free_pts+0xb8/0xf0 [amdgpu] [ +0.000189] amdgpu_vm_update_ptes+0x411/0x770 [amdgpu] [ +0.000191] amdgpu_vm_bo_update_mapping+0x324/0x4f0 [amdgpu] [ +0.000191] amdgpu_vm_bo_update+0x251/0x610 [amdgpu] [ +0.000191] update_gpuvm_pte+0xcc/0x290 [amdgpu] [ +0.000229] ? amdgpu_vm_bo_map+0xd7/0x130 [amdgpu] [ +0.000190] amdgpu_amdkfd_gpuvm_map_memory_to_gpu+0x912/0xf60 [amdgpu] [ +0.000234] kfd_ioctl_map_memory_to_gpu+0x182/0x320 [amdgpu] [ +0.000218] kfd_ioctl+0x2b9/0x600 [amdgpu] [ +0.000216] ? kfd_ioctl_unmap_memory_from_gpu+0x270/0x270 [amdgpu] [ +0.000216] ? lock_release+0x13f/0x270 [ +0.000006] ? __fget_files+0x107/0x1e0 [ +0.000007] __x64_sys_ioctl+0x8b/0xd0 [ +0.000007] do_syscall_64+0x36/0x70 [ +0.000004] entry_SYSCALL_64_after_hwframe+0x44/0xae [ +0.000007] RIP: 0033:0x7fbff90a7317 [ +0.000004] Code: b3 66 90 48 8b 05 71 4b 2d 00 64 c7 00 26 00 00 00 48 c7 c0 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 b8 10 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 41 4b 2d 00 f7 d8 64 89 01 48 [ +0.000005] RSP: 002b:00007fbe301fe648 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 [ +0.000006] RAX: ffffffffffffffda RBX: 00007fbcc402d820 RCX: 00007fbff90a7317 [ +0.000003] RDX: 00007fbe301fe690 RSI: 00000000c0184b18 RDI: 0000000000000004 [ +0.000003] RBP: 00007fbe301fe690 R08: 0000000000000000 R09: 00007fbcc402d880 [ +0.000003] R10: 0000000002001000 R11: 0000000000000246 R12: 00000000c0184b18 [ +0.000003] R13: 0000000000000004 R14: 00007fbf689593a0 R15: 00007fbcc402d820 Cc: Christian König Cc: Felix Kuehling Cc: Alex Deucher Reviewed-by: Christian König Reviewed-by: Felix Kuehling Signed-off-by: Rajneesh Bhardwaj Signed-off-by: Alex Deucher Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index ad9863b84f1f..f615ecc06a22 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -1338,7 +1338,8 @@ void amdgpu_bo_release_notify(struct ttm_buffer_object *bo) !(abo->flags & AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE)) return; - dma_resv_lock(bo->base.resv, NULL); + if (WARN_ON_ONCE(!dma_resv_trylock(bo->base.resv))) + return; r = amdgpu_fill_buffer(abo, AMDGPU_POISON, bo->base.resv, &fence); if (!WARN_ON(r)) { -- Gitee From b23bcbc093a2826e1b387a3d9f9971581225a4ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Thu, 14 Jul 2022 21:57:25 +0800 Subject: [PATCH 215/674] PCI: aardvark: Fix support for MSI interrupts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from stable-v5.10.111 commit e07e420a00564863b45f88e3bd26860e803d87fe category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=e07e420a00564863b45f88e3bd26860e803d87fe -------------------------------- [ Upstream commit b0b0b8b897f8e12b2368e868bd7cdc5742d5c5a9 ] Aardvark hardware supports Multi-MSI and MSI_FLAG_MULTI_PCI_MSI is already set for the MSI chip. But when allocating MSI interrupt numbers for Multi-MSI, the numbers need to be properly aligned, otherwise endpoint devices send MSI interrupt with incorrect numbers. Fix this issue by using function bitmap_find_free_region() instead of bitmap_find_next_zero_area(). To ensure that aligned MSI interrupt numbers are used by endpoint devices, we cannot use Linux virtual irq numbers (as they are random and not properly aligned). Instead we need to use the aligned hwirq numbers. This change fixes receiving MSI interrupts on Armada 3720 boards and allows using NVMe disks which use Multi-MSI feature with 3 interrupts. Without this NVMe disks freeze booting as linux nvme-core.c is waiting 60s for an interrupt. Link: https://lore.kernel.org/r/20220110015018.26359-4-kabel@kernel.org Signed-off-by: Pali Rohár Signed-off-by: Marek Behún Signed-off-by: Lorenzo Pieralisi Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/pci/controller/pci-aardvark.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index 49ff8bf10c74..af051fb88699 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -1186,7 +1186,7 @@ static void advk_msi_irq_compose_msi_msg(struct irq_data *data, msg->address_lo = lower_32_bits(msi_msg); msg->address_hi = upper_32_bits(msi_msg); - msg->data = data->irq; + msg->data = data->hwirq; } static int advk_msi_set_affinity(struct irq_data *irq_data, @@ -1203,15 +1203,11 @@ static int advk_msi_irq_domain_alloc(struct irq_domain *domain, int hwirq, i; mutex_lock(&pcie->msi_used_lock); - hwirq = bitmap_find_next_zero_area(pcie->msi_used, MSI_IRQ_NUM, - 0, nr_irqs, 0); - if (hwirq >= MSI_IRQ_NUM) { - mutex_unlock(&pcie->msi_used_lock); - return -ENOSPC; - } - - bitmap_set(pcie->msi_used, hwirq, nr_irqs); + hwirq = bitmap_find_free_region(pcie->msi_used, MSI_IRQ_NUM, + order_base_2(nr_irqs)); mutex_unlock(&pcie->msi_used_lock); + if (hwirq < 0) + return -ENOSPC; for (i = 0; i < nr_irqs; i++) irq_domain_set_info(domain, virq + i, hwirq + i, @@ -1229,7 +1225,7 @@ static void advk_msi_irq_domain_free(struct irq_domain *domain, struct advk_pcie *pcie = domain->host_data; mutex_lock(&pcie->msi_used_lock); - bitmap_clear(pcie->msi_used, d->hwirq, nr_irqs); + bitmap_release_region(pcie->msi_used, d->hwirq, order_base_2(nr_irqs)); mutex_unlock(&pcie->msi_used_lock); } -- Gitee From 3d4a08e07a387572ebfb4d0c4093b1825e2185d0 Mon Sep 17 00:00:00 2001 From: Zhou Guanghui Date: Thu, 14 Jul 2022 21:57:26 +0800 Subject: [PATCH 216/674] iommu/arm-smmu-v3: fix event handling soft lockup stable inclusion from stable-v5.10.111 commit 0b9cf0b599258dfbc8f3b4f21a0ef1faec8c455e category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=0b9cf0b599258dfbc8f3b4f21a0ef1faec8c455e -------------------------------- [ Upstream commit 30de2b541af98179780054836b48825fcfba4408 ] During event processing, events are read from the event queue one by one until the queue is empty.If the master device continuously requests address access at the same time and the SMMU generates events, the cyclic processing of the event takes a long time and softlockup warnings may be reported. arm-smmu-v3 arm-smmu-v3.34.auto: event 0x0a received: arm-smmu-v3 arm-smmu-v3.34.auto: 0x00007f220000280a arm-smmu-v3 arm-smmu-v3.34.auto: 0x000010000000007e arm-smmu-v3 arm-smmu-v3.34.auto: 0x00000000034e8670 watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [irq/268-arm-smm:247] Call trace: _dev_info+0x7c/0xa0 arm_smmu_evtq_thread+0x1c0/0x230 irq_thread_fn+0x30/0x80 irq_thread+0x128/0x210 kthread+0x134/0x138 ret_from_fork+0x10/0x1c Kernel panic - not syncing: softlockup: hung tasks Fix this by calling cond_resched() after the event information is printed. Signed-off-by: Zhou Guanghui Link: https://lore.kernel.org/r/20220119070754.26528-1-zhouguanghui1@huawei.com Signed-off-by: Will Deacon Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index e463fd31d268..b22d0187ea8a 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1852,6 +1852,7 @@ static irqreturn_t arm_smmu_evtq_thread(int irq, void *dev) dev_info(smmu->dev, "\t0x%016llx\n", (unsigned long long)evt[i]); + cond_resched(); } /* -- Gitee From 3968a04694e7cb84c5c8183ef9ee6a11c1fff9b4 Mon Sep 17 00:00:00 2001 From: Neal Liu Date: Thu, 14 Jul 2022 21:57:27 +0800 Subject: [PATCH 217/674] usb: ehci: add pci device support for Aspeed platforms stable inclusion from stable-v5.10.111 commit 4820847e8bc2a7c964854a72f5d5d48c800d39c9 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=4820847e8bc2a7c964854a72f5d5d48c800d39c9 -------------------------------- [ Upstream commit c3c9cee592828528fd228b01d312c7526c584a42 ] Enable Aspeed quirks in commit 7f2d73788d90 ("usb: ehci: handshake CMD_RUN instead of STS_HALT") to support Aspeed ehci-pci device. Acked-by: Alan Stern Signed-off-by: Neal Liu Link: https://lore.kernel.org/r/20220208101657.76459-1-neal_liu@aspeedtech.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Conflicts: drivers/usb/host/ehci-pci.c Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/usb/host/ehci-pci.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/usb/host/ehci-pci.c b/drivers/usb/host/ehci-pci.c index a5e27deda83a..d0d07d30e912 100644 --- a/drivers/usb/host/ehci-pci.c +++ b/drivers/usb/host/ehci-pci.c @@ -21,6 +21,9 @@ static const char hcd_name[] = "ehci-pci"; /* defined here to avoid adding to pci_ids.h for single instance use */ #define PCI_DEVICE_ID_INTEL_CE4100_USB 0x2e70 +#define PCI_VENDOR_ID_ASPEED 0x1a03 +#define PCI_DEVICE_ID_ASPEED_EHCI 0x2603 + /*-------------------------------------------------------------------------*/ #define PCI_DEVICE_ID_INTEL_QUARK_X1000_SOC 0x0939 static inline bool is_intel_quark_x1000(struct pci_dev *pdev) @@ -226,6 +229,12 @@ static int ehci_pci_setup(struct usb_hcd *hcd) if (pdev->device == 0x3104 && (pdev->revision & 0xf0) == 0x90) ehci->zx_wakeup_clear = 1; break; + case PCI_VENDOR_ID_ASPEED: + if (pdev->device == PCI_DEVICE_ID_ASPEED_EHCI) { + ehci_info(ehci, "applying Aspeed HC workaround\n"); + ehci->is_aspeed = 1; + } + break; } /* optional debug port, normally in the first BAR */ -- Gitee From 914f2581374890a670257478d533594d449d0b73 Mon Sep 17 00:00:00 2001 From: Hou Zhiqiang Date: Thu, 14 Jul 2022 21:57:28 +0800 Subject: [PATCH 218/674] PCI: endpoint: Fix alignment fault error in copy tests stable inclusion from stable-v5.10.111 commit b02a1a65023f46b970486be38f7d209761084a7c category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=b02a1a65023f46b970486be38f7d209761084a7c -------------------------------- [ Upstream commit 829cc0e2ea2d61fb6c54bc3f8a17f86c56e11864 ] The copy test uses the memcpy() to copy data between IO memory spaces. This can trigger an alignment fault error (pasted the error logs below) because memcpy() may use unaligned accesses on a mapped memory that is just IO, which does not support unaligned memory accesses. Fix it by using the correct memcpy API to copy from/to IO memory. Alignment fault error logs: Unable to handle kernel paging request at virtual address ffff8000101cd3c1 Mem abort info: ESR = 0x96000021 EC = 0x25: DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 FSC = 0x21: alignment fault Data abort info: ISV = 0, ISS = 0x00000021 CM = 0, WnR = 0 swapper pgtable: 4k pages, 48-bit VAs, pgdp=0000000081773000 [ffff8000101cd3c1] pgd=1000000082410003, p4d=1000000082410003, pud=1000000082411003, pmd=1000000082412003, pte=0068004000001f13 Internal error: Oops: 96000021 [#1] PREEMPT SMP Modules linked in: CPU: 0 PID: 6 Comm: kworker/0:0H Not tainted 5.15.0-rc1-next-20210914-dirty #2 Hardware name: LS1012A RDB Board (DT) Workqueue: kpcitest pci_epf_test_cmd_handler pstate: 80000005 (Nzcv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : __memcpy+0x168/0x230 lr : pci_epf_test_cmd_handler+0x6f0/0xa68 sp : ffff80001003bce0 x29: ffff80001003bce0 x28: ffff800010135000 x27: ffff8000101e5000 x26: ffff8000101cd000 x25: ffff6cda941cf6c8 x24: 0000000000000000 x23: ffff6cda863f2000 x22: ffff6cda9096c800 x21: ffff800010135000 x20: ffff6cda941cf680 x19: ffffaf39fd999000 x18: 0000000000000000 x17: 0000000000000000 x16: 0000000000000000 x15: ffffaf39fd2b6000 x14: 0000000000000000 x13: 15f5c8fa2f984d57 x12: 604d132b60275454 x11: 065cee5e5fb428b6 x10: aae662eb17d0cf3e x9 : 1d97c9a1b4ddef37 x8 : 7541b65edebf928c x7 : e71937c4fc595de0 x6 : b8a0e09562430d1c x5 : ffff8000101e5401 x4 : ffff8000101cd401 x3 : ffff8000101e5380 x2 : fffffffffffffff1 x1 : ffff8000101cd3c0 x0 : ffff8000101e5000 Call trace: __memcpy+0x168/0x230 process_one_work+0x1ec/0x370 worker_thread+0x44/0x478 kthread+0x154/0x160 ret_from_fork+0x10/0x20 Code: a984346c a9c4342c f1010042 54fffee8 (a97c3c8e) ---[ end trace 568c28c7b6336335 ]--- Link: https://lore.kernel.org/r/20211217094708.28678-1-Zhiqiang.Hou@nxp.com Signed-off-by: Hou Zhiqiang Signed-off-by: Lorenzo Pieralisi Reviewed-by: Kishon Vijay Abraham I Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/pci/endpoint/functions/pci-epf-test.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/pci/endpoint/functions/pci-epf-test.c b/drivers/pci/endpoint/functions/pci-epf-test.c index d41570715dc7..b861840e867c 100644 --- a/drivers/pci/endpoint/functions/pci-epf-test.c +++ b/drivers/pci/endpoint/functions/pci-epf-test.c @@ -285,7 +285,17 @@ static int pci_epf_test_copy(struct pci_epf_test *epf_test) if (ret) dev_err(dev, "Data transfer failed\n"); } else { - memcpy(dst_addr, src_addr, reg->size); + void *buf; + + buf = kzalloc(reg->size, GFP_KERNEL); + if (!buf) { + ret = -ENOMEM; + goto err_map_addr; + } + + memcpy_fromio(buf, src_addr, reg->size); + memcpy_toio(dst_addr, buf, reg->size); + kfree(buf); } ktime_get_ts64(&end); pci_epf_test_print_rate("COPY", reg->size, &start, &end, use_dma); -- Gitee From b65364a8d1b914b201a6f278545459603a53086e Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 14 Jul 2022 21:57:29 +0800 Subject: [PATCH 219/674] tcp: Don't acquire inet_listen_hashbucket::lock with disabled BH. stable inclusion from stable-v5.10.111 commit b1b27b0e8d489b40607127e26af70e5571bf9201 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=b1b27b0e8d489b40607127e26af70e5571bf9201 -------------------------------- [ Upstream commit 4f9bf2a2f5aacf988e6d5e56b961ba45c5a25248 ] Commit 9652dc2eb9e40 ("tcp: relax listening_hash operations") removed the need to disable bottom half while acquiring listening_hash.lock. There are still two callers left which disable bottom half before the lock is acquired. On PREEMPT_RT the softirqs are preemptible and local_bh_disable() acts as a lock to ensure that resources, that are protected by disabling bottom halves, remain protected. This leads to a circular locking dependency if the lock acquired with disabled bottom halves is also acquired with enabled bottom halves followed by disabling bottom halves. This is the reverse locking order. It has been observed with inet_listen_hashbucket::lock: local_bh_disable() + spin_lock(&ilb->lock): inet_listen() inet_csk_listen_start() sk->sk_prot->hash() := inet_hash() local_bh_disable() __inet_hash() spin_lock(&ilb->lock); acquire(&ilb->lock); Reverse order: spin_lock(&ilb2->lock) + local_bh_disable(): tcp_seq_next() listening_get_next() spin_lock(&ilb2->lock); acquire(&ilb2->lock); tcp4_seq_show() get_tcp4_sock() sock_i_ino() read_lock_bh(&sk->sk_callback_lock); acquire(softirq_ctrl) // <---- whoops acquire(&sk->sk_callback_lock) Drop local_bh_disable() around __inet_hash() which acquires listening_hash->lock. Split inet_unhash() and acquire the listen_hashbucket lock without disabling bottom halves; the inet_ehash lock with disabled bottom halves. Reported-by: Mike Galbraith Signed-off-by: Sebastian Andrzej Siewior Link: https://lkml.kernel.org/r/12d6f9879a97cd56c09fb53dee343cbb14f7f1f7.camel@gmx.de Link: https://lkml.kernel.org/r/X9CheYjuXWc75Spa@hirez.programming.kicks-ass.net Link: https://lore.kernel.org/r/YgQOebeZ10eNx1W6@linutronix.de Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- net/ipv4/inet_hashtables.c | 53 ++++++++++++++++++++++--------------- net/ipv6/inet6_hashtables.c | 5 +--- 2 files changed, 33 insertions(+), 25 deletions(-) diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index cf178d8eea81..2bb9ded807ee 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -637,7 +637,9 @@ int __inet_hash(struct sock *sk, struct sock *osk) int err = 0; if (sk->sk_state != TCP_LISTEN) { + local_bh_disable(); inet_ehash_nolisten(sk, osk, NULL); + local_bh_enable(); return 0; } WARN_ON(!sk_unhashed(sk)); @@ -669,45 +671,54 @@ int inet_hash(struct sock *sk) { int err = 0; - if (sk->sk_state != TCP_CLOSE) { - local_bh_disable(); + if (sk->sk_state != TCP_CLOSE) err = __inet_hash(sk, NULL); - local_bh_enable(); - } return err; } EXPORT_SYMBOL_GPL(inet_hash); -void inet_unhash(struct sock *sk) +static void __inet_unhash(struct sock *sk, struct inet_listen_hashbucket *ilb) { - struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; - struct inet_listen_hashbucket *ilb = NULL; - spinlock_t *lock; - if (sk_unhashed(sk)) return; - if (sk->sk_state == TCP_LISTEN) { - ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; - lock = &ilb->lock; - } else { - lock = inet_ehash_lockp(hashinfo, sk->sk_hash); - } - spin_lock_bh(lock); - if (sk_unhashed(sk)) - goto unlock; - if (rcu_access_pointer(sk->sk_reuseport_cb)) reuseport_detach_sock(sk); if (ilb) { + struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; + inet_unhash2(hashinfo, sk); ilb->count--; } __sk_nulls_del_node_init_rcu(sk); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); -unlock: - spin_unlock_bh(lock); +} + +void inet_unhash(struct sock *sk) +{ + struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; + + if (sk_unhashed(sk)) + return; + + if (sk->sk_state == TCP_LISTEN) { + struct inet_listen_hashbucket *ilb; + + ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; + /* Don't disable bottom halves while acquiring the lock to + * avoid circular locking dependency on PREEMPT_RT. + */ + spin_lock(&ilb->lock); + __inet_unhash(sk, ilb); + spin_unlock(&ilb->lock); + } else { + spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash); + + spin_lock_bh(lock); + __inet_unhash(sk, NULL); + spin_unlock_bh(lock); + } } EXPORT_SYMBOL_GPL(inet_unhash); diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index c9e7ecc7afd3..40203255ed88 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -333,11 +333,8 @@ int inet6_hash(struct sock *sk) { int err = 0; - if (sk->sk_state != TCP_CLOSE) { - local_bh_disable(); + if (sk->sk_state != TCP_CLOSE) err = __inet_hash(sk, NULL); - local_bh_enable(); - } return err; } -- Gitee From 833bfdd23b887a59b51b080b94fcd82becf90520 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Thu, 14 Jul 2022 21:57:30 +0800 Subject: [PATCH 220/674] PCI: pciehp: Add Qualcomm quirk for Command Completed erratum stable inclusion from stable-v5.10.111 commit c66cc0404367fe0bd3e1e458ff9a238078fd3a39 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=c66cc0404367fe0bd3e1e458ff9a238078fd3a39 -------------------------------- [ Upstream commit 9f72d4757cbe4d1ed669192f6d23817c9e437c4b ] The Qualcomm PCI bridge device (Device ID 0x0110) found in chipsets such as SM8450 does not set the Command Completed bit unless writes to the Slot Command register change "Control" bits. This results in timeouts like below: pcieport 0001:00:00.0: pciehp: Timeout on hotplug command 0x03c0 (issued 2020 msec ago) Add the device to the Command Completed quirk to mark commands "completed" immediately unless they change the "Control" bits. Link: https://lore.kernel.org/r/20220210145003.135907-1-manivannan.sadhasivam@linaro.org Signed-off-by: Manivannan Sadhasivam Signed-off-by: Bjorn Helgaas Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/pci/hotplug/pciehp_hpc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c index c4ce4c4e9f7a..a2639135f5d3 100644 --- a/drivers/pci/hotplug/pciehp_hpc.c +++ b/drivers/pci/hotplug/pciehp_hpc.c @@ -1116,6 +1116,8 @@ static void quirk_cmd_compl(struct pci_dev *pdev) } DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, PCI_CLASS_BRIDGE_PCI, 8, quirk_cmd_compl); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_QCOM, 0x0110, + PCI_CLASS_BRIDGE_PCI, 8, quirk_cmd_compl); DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_QCOM, 0x0400, PCI_CLASS_BRIDGE_PCI, 8, quirk_cmd_compl); DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_QCOM, 0x0401, -- Gitee From ec47a8c660e3a3a5c2ad81da50929accae89824b Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 14 Jul 2022 21:57:31 +0800 Subject: [PATCH 221/674] power: supply: axp288-charger: Set Vhold to 4.4V stable inclusion from stable-v5.10.111 commit 9538563d31a2ea2f5621ce46982beb1ac109ef0c category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=9538563d31a2ea2f5621ce46982beb1ac109ef0c -------------------------------- [ Upstream commit 5ac121b81b4051e7fc83d5b3456a5e499d5bd147 ] The AXP288's recommended and factory default Vhold value (minimum input voltage below which the input current draw will be reduced) is 4.4V. This lines up with other charger IC's such as the TI bq2419x/bq2429x series which use 4.36V or 4.44V. For some reason some BIOS-es initialize Vhold to 4.6V or even 4.7V which combined with the typical voltage drop over typically low wire gauge micro-USB cables leads to the input-current getting capped below 1A (with a 2A capable dedicated charger) based on Vhold. This leads to slow charging, or even to the device slowly discharging if the device is in heavy use. As the Linux AXP288 drivers use the builtin BC1.2 charger detection and send the input-current-limit according to the detected charger there really is no reason not to use the recommended 4.4V Vhold. Set Vhold to 4.4V to fix the slow charging issue on various devices. There is one exception, the special-case of the HP X2 2-in-1s which combine this BC1.2 capable PMIC with a Type-C port and a 5V/3A factory provided charger with a Type-C plug which does not do BC1.2. These have their input-current-limit hardcoded to 3A (like under Windows) and use a higher Vhold on purpose to limit the current when used with other chargers. To avoid touching Vhold on these HP X2 laptops the code setting Vhold is added to an else branch of the if checking for these models. Note this also fixes the sofar unused VBUS_ISPOUT_VHOLD_SET_MASK define, which was wrong. Signed-off-by: Hans de Goede Signed-off-by: Sebastian Reichel Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/power/supply/axp288_charger.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/power/supply/axp288_charger.c b/drivers/power/supply/axp288_charger.c index a4df1ea92386..f65bf7b295c5 100644 --- a/drivers/power/supply/axp288_charger.c +++ b/drivers/power/supply/axp288_charger.c @@ -41,11 +41,11 @@ #define VBUS_ISPOUT_CUR_LIM_1500MA 0x1 /* 1500mA */ #define VBUS_ISPOUT_CUR_LIM_2000MA 0x2 /* 2000mA */ #define VBUS_ISPOUT_CUR_NO_LIM 0x3 /* 2500mA */ -#define VBUS_ISPOUT_VHOLD_SET_MASK 0x31 +#define VBUS_ISPOUT_VHOLD_SET_MASK 0x38 #define VBUS_ISPOUT_VHOLD_SET_BIT_POS 0x3 #define VBUS_ISPOUT_VHOLD_SET_OFFSET 4000 /* 4000mV */ #define VBUS_ISPOUT_VHOLD_SET_LSB_RES 100 /* 100mV */ -#define VBUS_ISPOUT_VHOLD_SET_4300MV 0x3 /* 4300mV */ +#define VBUS_ISPOUT_VHOLD_SET_4400MV 0x4 /* 4400mV */ #define VBUS_ISPOUT_VBUS_PATH_DIS BIT(7) #define CHRG_CCCV_CC_MASK 0xf /* 4 bits */ @@ -744,6 +744,16 @@ static int charger_init_hw_regs(struct axp288_chrg_info *info) ret = axp288_charger_vbus_path_select(info, true); if (ret < 0) return ret; + } else { + /* Set Vhold to the factory default / recommended 4.4V */ + val = VBUS_ISPOUT_VHOLD_SET_4400MV << VBUS_ISPOUT_VHOLD_SET_BIT_POS; + ret = regmap_update_bits(info->regmap, AXP20X_VBUS_IPSOUT_MGMT, + VBUS_ISPOUT_VHOLD_SET_MASK, val); + if (ret < 0) { + dev_err(&info->pdev->dev, "register(%x) write error(%d)\n", + AXP20X_VBUS_IPSOUT_MGMT, ret); + return ret; + } } /* Read current charge voltage and current limit */ -- Gitee From 235f9ee8bf43d7bc04431d50052be91379d25769 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Thu, 14 Jul 2022 21:57:32 +0800 Subject: [PATCH 222/674] iwlwifi: mvm: Correctly set fragmented EBS stable inclusion from stable-v5.10.111 commit bae03957e8ca7bde1b0a7743cde6826317803ce8 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=bae03957e8ca7bde1b0a7743cde6826317803ce8 -------------------------------- [ Upstream commit d8d4dd26b9e0469baf5017f0544d852fd4e3fb6d ] Currently, fragmented EBS was set for a channel only if the 'hb_type' was set to fragmented or balanced scan. However, 'hb_type' is set only in case of CDB, and thus fragmented EBS is never set for a channel for non-CDB devices. Fix it. Signed-off-by: Ilan Peer Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20220204122220.a6165ac9b9d5.I654eafa62fd647030ae6d4f07f32c96c3171decb@changeid Signed-off-by: Luca Coelho Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/net/wireless/intel/iwlwifi/mvm/scan.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c index 46255d2c555b..17b992526694 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c @@ -1706,7 +1706,10 @@ static u8 iwl_mvm_scan_umac_chan_flags_v2(struct iwl_mvm *mvm, IWL_SCAN_CHANNEL_FLAG_CACHE_ADD; /* set fragmented ebs for fragmented scan on HB channels */ - if (iwl_mvm_is_scan_fragmented(params->hb_type)) + if ((!iwl_mvm_is_cdb_supported(mvm) && + iwl_mvm_is_scan_fragmented(params->type)) || + (iwl_mvm_is_cdb_supported(mvm) && + iwl_mvm_is_scan_fragmented(params->hb_type))) flags |= IWL_SCAN_CHANNEL_FLAG_EBS_FRAG; return flags; -- Gitee From e415a37234e790e416dd930c8fba1268914dfdef Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 14 Jul 2022 21:57:33 +0800 Subject: [PATCH 223/674] ipv4: Invalidate neighbour for broadcast address upon address addition stable inclusion from stable-v5.10.111 commit f655b724b4407ec4284b52c6e957d3d344eb159a category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f655b724b4407ec4284b52c6e957d3d344eb159a -------------------------------- [ Upstream commit 0c51e12e218f20b7d976158fdc18019627326f7a ] In case user space sends a packet destined to a broadcast address when a matching broadcast route is not configured, the kernel will create a unicast neighbour entry that will never be resolved [1]. When the broadcast route is configured, the unicast neighbour entry will not be invalidated and continue to linger, resulting in packets being dropped. Solve this by invalidating unresolved neighbour entries for broadcast addresses after routes for these addresses are internally configured by the kernel. This allows the kernel to create a broadcast neighbour entry following the next route lookup. Another possible solution that is more generic but also more complex is to have the ARP code register a listener to the FIB notification chain and invalidate matching neighbour entries upon the addition of broadcast routes. It is also possible to wave off the issue as a user space problem, but it seems a bit excessive to expect user space to be that intimately familiar with the inner workings of the FIB/neighbour kernel code. [1] https://lore.kernel.org/netdev/55a04a8f-56f3-f73c-2aea-2195923f09d1@huawei.com/ Reported-by: Wang Hai Signed-off-by: Ido Schimmel Tested-by: Wang Hai Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- include/net/arp.h | 1 + net/ipv4/arp.c | 9 +++++++-- net/ipv4/fib_frontend.c | 5 ++++- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/include/net/arp.h b/include/net/arp.h index 4950191f6b2b..4a23a97195f3 100644 --- a/include/net/arp.h +++ b/include/net/arp.h @@ -71,6 +71,7 @@ void arp_send(int type, int ptype, __be32 dest_ip, const unsigned char *src_hw, const unsigned char *th); int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir); void arp_ifdown(struct net_device *dev); +int arp_invalidate(struct net_device *dev, __be32 ip, bool force); struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 922dd73e5740..83a47998c4b1 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1116,13 +1116,18 @@ static int arp_req_get(struct arpreq *r, struct net_device *dev) return err; } -static int arp_invalidate(struct net_device *dev, __be32 ip) +int arp_invalidate(struct net_device *dev, __be32 ip, bool force) { struct neighbour *neigh = neigh_lookup(&arp_tbl, &ip, dev); int err = -ENXIO; struct neigh_table *tbl = &arp_tbl; if (neigh) { + if ((neigh->nud_state & NUD_VALID) && !force) { + neigh_release(neigh); + return 0; + } + if (neigh->nud_state & ~NUD_NOARP) err = neigh_update(neigh, NULL, NUD_FAILED, NEIGH_UPDATE_F_OVERRIDE| @@ -1169,7 +1174,7 @@ static int arp_req_delete(struct net *net, struct arpreq *r, if (!dev) return -EINVAL; } - return arp_invalidate(dev, ip); + return arp_invalidate(dev, ip, true); } /* diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 917ea953dfad..0df4594b49c7 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1112,9 +1112,11 @@ void fib_add_ifaddr(struct in_ifaddr *ifa) return; /* Add broadcast address, if it is explicitly assigned. */ - if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF)) + if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF)) { fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim, 0); + arp_invalidate(dev, ifa->ifa_broadcast, false); + } if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) && (prefix != addr || ifa->ifa_prefixlen < 32)) { @@ -1130,6 +1132,7 @@ void fib_add_ifaddr(struct in_ifaddr *ifa) prim, 0); fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask, 32, prim, 0); + arp_invalidate(dev, prefix | ~mask, false); } } } -- Gitee From 8ec846488847926241fa34983380d41baece54b6 Mon Sep 17 00:00:00 2001 From: Jordy Zomer Date: Thu, 14 Jul 2022 21:57:34 +0800 Subject: [PATCH 224/674] dm ioctl: prevent potential spectre v1 gadget stable inclusion from stable-v5.10.111 commit 71c8df33fd777c7628f6fbc09b14e84806c55914 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=71c8df33fd777c7628f6fbc09b14e84806c55914 -------------------------------- [ Upstream commit cd9c88da171a62c4b0f1c70e50c75845969fbc18 ] It appears like cmd could be a Spectre v1 gadget as it's supplied by a user and used as an array index. Prevent the contents of kernel memory from being leaked to userspace via speculative execution by using array_index_nospec. Signed-off-by: Jordy Zomer Signed-off-by: Mike Snitzer Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/md/dm-ioctl.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 1ca65b434f1f..b839705654d4 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -17,6 +17,7 @@ #include #include #include +#include #include @@ -1696,6 +1697,7 @@ static ioctl_fn lookup_ioctl(unsigned int cmd, int *ioctl_flags) if (unlikely(cmd >= ARRAY_SIZE(_ioctls))) return NULL; + cmd = array_index_nospec(cmd, ARRAY_SIZE(_ioctls)); *ioctl_flags = _ioctls[cmd].flags; return _ioctls[cmd].fn; } -- Gitee From 9314c8a29370a888602502fb120cdd4f9433da58 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 14 Jul 2022 21:57:35 +0800 Subject: [PATCH 225/674] drm/amdkfd: make CRAT table missing message informational only stable inclusion from stable-v5.10.111 commit fe4b6d5a0dd7701454981ea8a7e2332a2970ef81 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=fe4b6d5a0dd7701454981ea8a7e2332a2970ef81 -------------------------------- [ Upstream commit 9dff13f9edf755a15f6507874185a3290c1ae8bb ] The driver has a fallback so make the message informational rather than a warning. The driver has a fallback if the Component Resource Association Table (CRAT) is missing, so make this informational now. Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/1906 Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c index 31d793ee0836..86b4dadf772e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c @@ -784,7 +784,7 @@ int kfd_create_crat_image_acpi(void **crat_image, size_t *size) /* Fetch the CRAT table from ACPI */ status = acpi_get_table(CRAT_SIGNATURE, 0, &crat_table); if (status == AE_NOT_FOUND) { - pr_warn("CRAT table not found\n"); + pr_info("CRAT table not found\n"); return -ENODATA; } else if (ACPI_FAILURE(status)) { const char *err = acpi_format_exception(status); -- Gitee From 5b544929258d39d92fe674fc239ccdf50ad347a8 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 14 Jul 2022 21:57:36 +0800 Subject: [PATCH 226/674] scsi: pm8001: Fix pm80xx_pci_mem_copy() interface stable inclusion from stable-v5.10.111 commit ef969095c44245b85a940e9a0ceaaf8d2f035d87 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=ef969095c44245b85a940e9a0ceaaf8d2f035d87 -------------------------------- [ Upstream commit 3762d8f6edcdb03994c919f9487fd6d336c06561 ] The declaration of the local variable destination1 in pm80xx_pci_mem_copy() as a pointer to a u32 results in the sparse warning: warning: incorrect type in assignment (different base types) expected unsigned int [usertype] got restricted __le32 [usertype] Furthermore, the destination" argument of pm80xx_pci_mem_copy() is wrongly declared with the const attribute. Fix both problems by changing the type of the "destination" argument to "__le32 *" and use this argument directly inside the pm80xx_pci_mem_copy() function, thus removing the need for the destination1 local variable. Link: https://lore.kernel.org/r/20220220031810.738362-6-damien.lemoal@opensource.wdc.com Reviewed-by: Jack Wang Signed-off-by: Damien Le Moal Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/scsi/pm8001/pm80xx_hwi.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/scsi/pm8001/pm80xx_hwi.c b/drivers/scsi/pm8001/pm80xx_hwi.c index 2b3ee963e375..677ddc7bfb47 100644 --- a/drivers/scsi/pm8001/pm80xx_hwi.c +++ b/drivers/scsi/pm8001/pm80xx_hwi.c @@ -66,18 +66,16 @@ int pm80xx_bar4_shift(struct pm8001_hba_info *pm8001_ha, u32 shift_value) } static void pm80xx_pci_mem_copy(struct pm8001_hba_info *pm8001_ha, u32 soffset, - const void *destination, + __le32 *destination, u32 dw_count, u32 bus_base_number) { u32 index, value, offset; - u32 *destination1; - destination1 = (u32 *)destination; - for (index = 0; index < dw_count; index += 4, destination1++) { + for (index = 0; index < dw_count; index += 4, destination++) { offset = (soffset + index); if (offset < (64 * 1024)) { value = pm8001_cr32(pm8001_ha, bus_base_number, offset); - *destination1 = cpu_to_le32(value); + *destination = cpu_to_le32(value); } } return; -- Gitee From 9f47bf4e2f1810ebfabb8d291e439b41a23bad1d Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 14 Jul 2022 21:57:37 +0800 Subject: [PATCH 227/674] scsi: pm8001: Fix pm8001_mpi_task_abort_resp() stable inclusion from stable-v5.10.111 commit 3bd9a28798ca8f19e78fca1890ad08404e53c8fe category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=3bd9a28798ca8f19e78fca1890ad08404e53c8fe -------------------------------- [ Upstream commit 7e6b7e740addcea450041b5be8e42f0a4ceece0f ] The call to pm8001_ccb_task_free() at the end of pm8001_mpi_task_abort_resp() already frees the ccb tag. So when the device NCQ_ABORT_ALL_FLAG is set, the tag should not be freed again. Also change the hardcoded 0xBFFFFFFF value to ~NCQ_ABORT_ALL_FLAG as it ought to be. Link: https://lore.kernel.org/r/20220220031810.738362-19-damien.lemoal@opensource.wdc.com Reviewed-by: Jack Wang Signed-off-by: Damien Le Moal Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/scsi/pm8001/pm8001_hwi.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/pm8001/pm8001_hwi.c b/drivers/scsi/pm8001/pm8001_hwi.c index a8219a42c786..3a3bd0c13b5a 100644 --- a/drivers/scsi/pm8001/pm8001_hwi.c +++ b/drivers/scsi/pm8001/pm8001_hwi.c @@ -3669,12 +3669,11 @@ int pm8001_mpi_task_abort_resp(struct pm8001_hba_info *pm8001_ha, void *piomb) mb(); if (pm8001_dev->id & NCQ_ABORT_ALL_FLAG) { - pm8001_tag_free(pm8001_ha, tag); sas_free_task(t); - /* clear the flag */ - pm8001_dev->id &= 0xBFFFFFFF; - } else + pm8001_dev->id &= ~NCQ_ABORT_ALL_FLAG; + } else { t->task_done(t); + } return 0; } -- Gitee From dd9e52a11360b63880cb9f3078c7eac85e42fe23 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 14 Jul 2022 21:57:38 +0800 Subject: [PATCH 228/674] scsi: pm8001: Fix task leak in pm8001_send_abort_all() stable inclusion from stable-v5.10.111 commit 2051044d7901f1a9d7be95d0d32e53b88e9548f7 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=2051044d7901f1a9d7be95d0d32e53b88e9548f7 -------------------------------- [ Upstream commit f90a74892f3acf0cdec5844e90fc8686ca13e7d7 ] In pm8001_send_abort_all(), make sure to free the allocated sas task if pm8001_tag_alloc() or pm8001_mpi_build_cmd() fail. Link: https://lore.kernel.org/r/20220220031810.738362-21-damien.lemoal@opensource.wdc.com Reviewed-by: John Garry Signed-off-by: Damien Le Moal Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/scsi/pm8001/pm8001_hwi.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/pm8001/pm8001_hwi.c b/drivers/scsi/pm8001/pm8001_hwi.c index 3a3bd0c13b5a..c5edf6c425b5 100644 --- a/drivers/scsi/pm8001/pm8001_hwi.c +++ b/drivers/scsi/pm8001/pm8001_hwi.c @@ -1711,7 +1711,6 @@ static void pm8001_send_abort_all(struct pm8001_hba_info *pm8001_ha, } task = sas_alloc_slow_task(GFP_ATOMIC); - if (!task) { pm8001_dbg(pm8001_ha, FAIL, "cannot allocate task\n"); return; @@ -1720,8 +1719,10 @@ static void pm8001_send_abort_all(struct pm8001_hba_info *pm8001_ha, task->task_done = pm8001_task_done; res = pm8001_tag_alloc(pm8001_ha, &ccb_tag); - if (res) + if (res) { + sas_free_task(task); return; + } ccb = &pm8001_ha->ccb_info[ccb_tag]; ccb->device = pm8001_ha_dev; @@ -1738,8 +1739,10 @@ static void pm8001_send_abort_all(struct pm8001_hba_info *pm8001_ha, ret = pm8001_mpi_build_cmd(pm8001_ha, circularQ, opc, &task_abort, sizeof(task_abort), 0); - if (ret) + if (ret) { + sas_free_task(task); pm8001_tag_free(pm8001_ha, ccb_tag); + } } -- Gitee From 00bde281afd7caefecb49cd0da8c13f24532ad2e Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 14 Jul 2022 21:57:39 +0800 Subject: [PATCH 229/674] scsi: pm8001: Fix tag leaks on error stable inclusion from stable-v5.10.111 commit a0bb65eadbf942024226241d9d99fed17168940b category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=a0bb65eadbf942024226241d9d99fed17168940b -------------------------------- [ Upstream commit 4c8f04b1905cd4b776d0b720463c091545478ef7 ] In pm8001_chip_set_dev_state_req(), pm8001_chip_fw_flash_update_req(), pm80xx_chip_phy_ctl_req() and pm8001_chip_reg_dev_req() add missing calls to pm8001_tag_free() to free the allocated tag when pm8001_mpi_build_cmd() fails. Similarly, in pm8001_exec_internal_task_abort(), if the chip ->task_abort method fails, the tag allocated for the abort request task must be freed. Add the missing call to pm8001_tag_free(). Link: https://lore.kernel.org/r/20220220031810.738362-22-damien.lemoal@opensource.wdc.com Reviewed-by: John Garry Signed-off-by: Damien Le Moal Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/scsi/pm8001/pm8001_hwi.c | 9 +++++++++ drivers/scsi/pm8001/pm8001_sas.c | 2 +- drivers/scsi/pm8001/pm80xx_hwi.c | 9 +++++++-- 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/pm8001/pm8001_hwi.c b/drivers/scsi/pm8001/pm8001_hwi.c index c5edf6c425b5..2182c977498d 100644 --- a/drivers/scsi/pm8001/pm8001_hwi.c +++ b/drivers/scsi/pm8001/pm8001_hwi.c @@ -4444,6 +4444,9 @@ static int pm8001_chip_reg_dev_req(struct pm8001_hba_info *pm8001_ha, SAS_ADDR_SIZE); rc = pm8001_mpi_build_cmd(pm8001_ha, circularQ, opc, &payload, sizeof(payload), 0); + if (rc) + pm8001_tag_free(pm8001_ha, tag); + return rc; } @@ -4856,6 +4859,9 @@ pm8001_chip_fw_flash_update_req(struct pm8001_hba_info *pm8001_ha, ccb->ccb_tag = tag; rc = pm8001_chip_fw_flash_update_build(pm8001_ha, &flash_update_info, tag); + if (rc) + pm8001_tag_free(pm8001_ha, tag); + return rc; } @@ -4960,6 +4966,9 @@ pm8001_chip_set_dev_state_req(struct pm8001_hba_info *pm8001_ha, payload.nds = cpu_to_le32(state); rc = pm8001_mpi_build_cmd(pm8001_ha, circularQ, opc, &payload, sizeof(payload), 0); + if (rc) + pm8001_tag_free(pm8001_ha, tag); + return rc; } diff --git a/drivers/scsi/pm8001/pm8001_sas.c b/drivers/scsi/pm8001/pm8001_sas.c index ffafbc137a18..f77061396871 100644 --- a/drivers/scsi/pm8001/pm8001_sas.c +++ b/drivers/scsi/pm8001/pm8001_sas.c @@ -831,10 +831,10 @@ pm8001_exec_internal_task_abort(struct pm8001_hba_info *pm8001_ha, res = PM8001_CHIP_DISP->task_abort(pm8001_ha, pm8001_dev, flag, task_tag, ccb_tag); - if (res) { del_timer(&task->slow_task->timer); pm8001_dbg(pm8001_ha, FAIL, "Executing internal task failed\n"); + pm8001_tag_free(pm8001_ha, ccb_tag); goto ex_err; } wait_for_completion(&task->slow_task->completion); diff --git a/drivers/scsi/pm8001/pm80xx_hwi.c b/drivers/scsi/pm8001/pm80xx_hwi.c index 677ddc7bfb47..a66a06914913 100644 --- a/drivers/scsi/pm8001/pm80xx_hwi.c +++ b/drivers/scsi/pm8001/pm80xx_hwi.c @@ -4856,8 +4856,13 @@ static int pm80xx_chip_phy_ctl_req(struct pm8001_hba_info *pm8001_ha, payload.tag = cpu_to_le32(tag); payload.phyop_phyid = cpu_to_le32(((phy_op & 0xFF) << 8) | (phyId & 0xFF)); - return pm8001_mpi_build_cmd(pm8001_ha, circularQ, opc, &payload, - sizeof(payload), 0); + + rc = pm8001_mpi_build_cmd(pm8001_ha, circularQ, opc, &payload, + sizeof(payload), 0); + if (rc) + pm8001_tag_free(pm8001_ha, tag); + + return rc; } static u32 pm80xx_chip_is_our_interrupt(struct pm8001_hba_info *pm8001_ha) -- Gitee From 61f8f11694b71e2100fad9a9fb2d9b9e57828788 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 14 Jul 2022 21:57:40 +0800 Subject: [PATCH 230/674] scsi: pm8001: Fix memory leak in pm8001_chip_fw_flash_update_req() stable inclusion from stable-v5.10.111 commit d83574666bac4b1462e90df393fbed6c5f57d1a3 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=d83574666bac4b1462e90df393fbed6c5f57d1a3 -------------------------------- [ Upstream commit f792a3629f4c4aa4c3703d66b43ce1edcc3ec09a ] In pm8001_chip_fw_flash_update_build(), if pm8001_chip_fw_flash_update_build() fails, the struct fw_control_ex allocated must be freed. Link: https://lore.kernel.org/r/20220220031810.738362-23-damien.lemoal@opensource.wdc.com Reviewed-by: Jack Wang Signed-off-by: Damien Le Moal Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/scsi/pm8001/pm8001_hwi.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/pm8001/pm8001_hwi.c b/drivers/scsi/pm8001/pm8001_hwi.c index 2182c977498d..73e6f5d17ca7 100644 --- a/drivers/scsi/pm8001/pm8001_hwi.c +++ b/drivers/scsi/pm8001/pm8001_hwi.c @@ -4859,8 +4859,10 @@ pm8001_chip_fw_flash_update_req(struct pm8001_hba_info *pm8001_ha, ccb->ccb_tag = tag; rc = pm8001_chip_fw_flash_update_build(pm8001_ha, &flash_update_info, tag); - if (rc) + if (rc) { + kfree(fw_control_context); pm8001_tag_free(pm8001_ha, tag); + } return rc; } -- Gitee From 6a5fbd015bac01548cd295ba59f50b88f66a87e6 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Thu, 14 Jul 2022 21:57:41 +0800 Subject: [PATCH 231/674] mt76: mt7615: Fix assigning negative values to unsigned variable stable inclusion from stable-v5.10.111 commit 91ee8a14efb6ced3db850729cf72730aad2ae596 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=91ee8a14efb6ced3db850729cf72730aad2ae596 -------------------------------- [ Upstream commit 9273ffcc9a11942bd586bb42584337ef3962b692 ] Smatch reports the following: drivers/net/wireless/mediatek/mt76/mt7615/mac.c:1865 mt7615_mac_adjust_sensitivity() warn: assigning (-110) to unsigned variable 'def_th' drivers/net/wireless/mediatek/mt76/mt7615/mac.c:1865 mt7615_mac_adjust_sensitivity() warn: assigning (-98) to unsigned variable 'def_th' Reported-by: Abaci Robot Signed-off-by: Yang Li Signed-off-by: Felix Fietkau Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/net/wireless/mediatek/mt76/mt7615/mac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/mac.c b/drivers/net/wireless/mediatek/mt76/mt7615/mac.c index 424be103093c..1465a92ea3fc 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7615/mac.c +++ b/drivers/net/wireless/mediatek/mt76/mt7615/mac.c @@ -1626,7 +1626,7 @@ mt7615_mac_adjust_sensitivity(struct mt7615_phy *phy, struct mt7615_dev *dev = phy->dev; int false_cca = ofdm ? phy->false_cca_ofdm : phy->false_cca_cck; bool ext_phy = phy != &dev->phy; - u16 def_th = ofdm ? -98 : -110; + s16 def_th = ofdm ? -98 : -110; bool update = false; s8 *sensitivity; int signal; -- Gitee From 40f234f823b4369a7e0f0858e774193ed387032f Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 14 Jul 2022 21:57:42 +0800 Subject: [PATCH 232/674] scsi: aha152x: Fix aha152x_setup() __setup handler return value stable inclusion from stable-v5.10.111 commit f49ffaa85d2c13134d4e91896198084edace387c category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f49ffaa85d2c13134d4e91896198084edace387c -------------------------------- [ Upstream commit cc8294ec4738d25e2bb2d71f7d82a9bf7f4a157b ] __setup() handlers should return 1 if the command line option is handled and 0 if not (or maybe never return 0; doing so just pollutes init's environment with strings that are not init arguments/parameters). Return 1 from aha152x_setup() to indicate that the boot option has been handled. Link: lore.kernel.org/r/64644a2f-4a20-bab3-1e15-3b2cdd0defe3@omprussia.ru Link: https://lore.kernel.org/r/20220223000623.5920-1-rdunlap@infradead.org Cc: "Juergen E. Fischer" Cc: "James E.J. Bottomley" Cc: "Martin K. Petersen" Reported-by: Igor Zhbanov Signed-off-by: Randy Dunlap Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/scsi/aha152x.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/aha152x.c b/drivers/scsi/aha152x.c index d8e19afa7a14..c6607c4686bb 100644 --- a/drivers/scsi/aha152x.c +++ b/drivers/scsi/aha152x.c @@ -3367,13 +3367,11 @@ static int __init aha152x_setup(char *str) setup[setup_count].synchronous = ints[0] >= 6 ? ints[6] : 1; setup[setup_count].delay = ints[0] >= 7 ? ints[7] : DELAY_DEFAULT; setup[setup_count].ext_trans = ints[0] >= 8 ? ints[8] : 0; - if (ints[0] > 8) { /*}*/ + if (ints[0] > 8) printk(KERN_NOTICE "aha152x: usage: aha152x=[,[," "[,[,[,[,[,]]]]]]]\n"); - } else { + else setup_count++; - return 0; - } return 1; } -- Gitee From 946188f65fa4a553bbeba5179aa4c01c35e691f9 Mon Sep 17 00:00:00 2001 From: Qi Liu Date: Thu, 14 Jul 2022 21:57:43 +0800 Subject: [PATCH 233/674] scsi: hisi_sas: Free irq vectors in order for v3 HW stable inclusion from stable-v5.10.111 commit 224903cc60d045576393c3b16907742f23e6c740 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=224903cc60d045576393c3b16907742f23e6c740 -------------------------------- [ Upstream commit 554fb72ee34f4732c7f694f56c3c6e67790352a0 ] If the driver probe fails to request the channel IRQ or fatal IRQ, the driver will free the IRQ vectors before freeing the IRQs in free_irq(), and this will cause a kernel BUG like this: ------------[ cut here ]------------ kernel BUG at drivers/pci/msi.c:369! Internal error: Oops - BUG: 0 [#1] PREEMPT SMP Call trace: free_msi_irqs+0x118/0x13c pci_disable_msi+0xfc/0x120 pci_free_irq_vectors+0x24/0x3c hisi_sas_v3_probe+0x360/0x9d0 [hisi_sas_v3_hw] local_pci_probe+0x44/0xb0 work_for_cpu_fn+0x20/0x34 process_one_work+0x1d0/0x340 worker_thread+0x2e0/0x460 kthread+0x180/0x190 ret_from_fork+0x10/0x20 ---[ end trace b88990335b610c11 ]--- So we use devm_add_action() to control the order in which we free the vectors. Link: https://lore.kernel.org/r/1645703489-87194-4-git-send-email-john.garry@huawei.com Signed-off-by: Qi Liu Signed-off-by: John Garry Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Conflicts: drivers/scsi/hisi_sas/hisi_sas_v3_hw.c Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/scsi/hisi_sas/hisi_sas_v3_hw.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c index 3ecc61eb7214..fd5bdb0afa71 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c +++ b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c @@ -2389,17 +2389,25 @@ static irqreturn_t cq_interrupt_v3_hw(int irq_no, void *p) return IRQ_WAKE_THREAD; } +static void hisi_sas_v3_free_vectors(void *data) +{ + struct pci_dev *pdev = data; + + pci_free_irq_vectors(pdev); +} + static int interrupt_preinit_v3_hw(struct hisi_hba *hisi_hba) { int vectors; int max_msi = HISI_SAS_MSI_COUNT_V3_HW, min_msi; struct Scsi_Host *shost = hisi_hba->shost; + struct pci_dev *pdev = hisi_hba->pci_dev; struct irq_affinity desc = { .pre_vectors = BASE_VECTORS_V3_HW, }; min_msi = MIN_AFFINE_VECTORS_V3_HW; - vectors = pci_alloc_irq_vectors_affinity(hisi_hba->pci_dev, + vectors = pci_alloc_irq_vectors_affinity(pdev, min_msi, max_msi, PCI_IRQ_MSI | PCI_IRQ_AFFINITY, @@ -2411,6 +2419,7 @@ static int interrupt_preinit_v3_hw(struct hisi_hba *hisi_hba) hisi_hba->cq_nvecs = vectors - BASE_VECTORS_V3_HW; shost->nr_hw_queues = hisi_hba->cq_nvecs; + devm_add_action(&pdev->dev, hisi_sas_v3_free_vectors, pdev); return 0; } @@ -4808,7 +4817,7 @@ hisi_sas_v3_probe(struct pci_dev *pdev, const struct pci_device_id *id) dev_err(dev, "%d hw queues\n", shost->nr_hw_queues); rc = scsi_add_host(shost, dev); if (rc) - goto err_out_free_irq_vectors; + goto err_out_debugfs; rc = sas_register_ha(sha); if (rc) @@ -4839,8 +4848,6 @@ hisi_sas_v3_probe(struct pci_dev *pdev, const struct pci_device_id *id) sas_unregister_ha(sha); err_out_register_ha: scsi_remove_host(shost); -err_out_free_irq_vectors: - pci_free_irq_vectors(pdev); err_out_debugfs: debugfs_exit_v3_hw(hisi_hba); err_out_ha: @@ -4868,7 +4875,6 @@ hisi_sas_v3_destroy_irqs(struct pci_dev *pdev, struct hisi_hba *hisi_hba) devm_free_irq(&pdev->dev, pci_irq_vector(pdev, nr), cq); } - pci_free_irq_vectors(pdev); } static void hisi_sas_v3_remove(struct pci_dev *pdev) -- Gitee From 4942cbd3edb29a2d8b62ae33b52411bfbd40e4ee Mon Sep 17 00:00:00 2001 From: Dust Li Date: Thu, 14 Jul 2022 21:57:44 +0800 Subject: [PATCH 234/674] net/smc: correct settings of RMB window update limit stable inclusion from stable-v5.10.111 commit f2565cb40e9b0b71337e9bffddd2bf57c117461b category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f2565cb40e9b0b71337e9bffddd2bf57c117461b -------------------------------- [ Upstream commit 6bf536eb5c8ca011d1ff57b5c5f7c57ceac06a37 ] rmbe_update_limit is used to limit announcing receive window updating too frequently. RFC7609 request a minimal increase in the window size of 10% of the receive buffer space. But current implementation used: min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2) and SOCK_MIN_SNDBUF / 2 == 2304 Bytes, which is almost always less then 10% of the receive buffer space. This causes the receiver always sending CDC message to update its consumer cursor when it consumes more then 2K of data. And as a result, we may encounter something like "TCP silly window syndrome" when sending 2.5~8K message. This patch fixes this using max(rmbe_size / 10, SOCK_MIN_SNDBUF / 2). With this patch and SMC autocorking enabled, qperf 2K/4K/8K tcp_bw test shows 45%/75%/40% increase in throughput respectively. Signed-off-by: Dust Li Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- net/smc/smc_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index d69aac6c1fce..ef2fd28999ba 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1426,7 +1426,7 @@ static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize, */ static inline int smc_rmb_wnd_update_limit(int rmbe_size) { - return min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2); + return max_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2); } /* map an rmb buf to a link */ -- Gitee From 9a8d7db5e318d4c0b54d6ae2f85631d7ef4577c2 Mon Sep 17 00:00:00 2001 From: Hangyu Hua Date: Thu, 14 Jul 2022 21:57:45 +0800 Subject: [PATCH 235/674] mips: ralink: fix a refcount leak in ill_acc_of_setup() stable inclusion from stable-v5.10.111 commit 142ae7d4f21524acfe073e5a3da5667aa85eb970 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=142ae7d4f21524acfe073e5a3da5667aa85eb970 -------------------------------- [ Upstream commit 4a0a1436053b17e50b7c88858fb0824326641793 ] of_node_put(np) needs to be called when pdev == NULL. Signed-off-by: Hangyu Hua Signed-off-by: Thomas Bogendoerfer Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- arch/mips/ralink/ill_acc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/mips/ralink/ill_acc.c b/arch/mips/ralink/ill_acc.c index bdf53807d7c2..bea857c9da8b 100644 --- a/arch/mips/ralink/ill_acc.c +++ b/arch/mips/ralink/ill_acc.c @@ -61,6 +61,7 @@ static int __init ill_acc_of_setup(void) pdev = of_find_device_by_node(np); if (!pdev) { pr_err("%pOFn: failed to lookup pdev\n", np); + of_node_put(np); return -EINVAL; } -- Gitee From cc2e15dd0e8dc6be0e750dec0551f9fca3405329 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Thu, 14 Jul 2022 21:57:46 +0800 Subject: [PATCH 236/674] macvtap: advertise link netns via netlink MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from stable-v5.10.111 commit 797b4ea9515eb01ec9b79f9319abc5ac4fc7087d category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=797b4ea9515eb01ec9b79f9319abc5ac4fc7087d -------------------------------- [ Upstream commit a02192151b7dbf855084c38dca380d77c7658353 ] Assign rtnl_link_ops->get_link_net() callback so that IFLA_LINK_NETNSID is added to rtnetlink messages. This fixes iproute2 which otherwise resolved the link interface to an interface in the wrong namespace. Test commands: ip netns add nst ip link add dummy0 type dummy ip link add link macvtap0 link dummy0 type macvtap ip link set macvtap0 netns nst ip -netns nst link show macvtap0 Before: 10: macvtap0@gre0: mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 500 link/ether 5e:8f:ae:1d:60:50 brd ff:ff:ff:ff:ff:ff After: 10: macvtap0@if2: mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 500 link/ether 5e:8f:ae:1d:60:50 brd ff:ff:ff:ff:ff:ff link-netnsid 0 Reported-by: Leonardo Mörlein Signed-off-by: Sven Eckelmann Link: https://lore.kernel.org/r/20220228003240.1337426-1-sven@narfation.org Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/net/macvtap.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index 694e2f5dbbe5..39801c31e507 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -133,11 +133,17 @@ static void macvtap_setup(struct net_device *dev) dev->tx_queue_len = TUN_READQ_SIZE; } +static struct net *macvtap_link_net(const struct net_device *dev) +{ + return dev_net(macvlan_dev_real_dev(dev)); +} + static struct rtnl_link_ops macvtap_link_ops __read_mostly = { .kind = "macvtap", .setup = macvtap_setup, .newlink = macvtap_newlink, .dellink = macvtap_dellink, + .get_link_net = macvtap_link_net, .priv_size = sizeof(struct macvtap_dev), }; -- Gitee From 3e3f5567fbfb97b05316cbda2254ad8ab174b56e Mon Sep 17 00:00:00 2001 From: Harold Huang Date: Thu, 14 Jul 2022 21:57:47 +0800 Subject: [PATCH 237/674] tuntap: add sanity checks about msg_controllen in sendmsg stable inclusion from stable-v5.10.111 commit 647b35aaf454271d68067d4bf05154095fe63f93 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=647b35aaf454271d68067d4bf05154095fe63f93 -------------------------------- [ Upstream commit 74a335a07a17d131b9263bfdbdcb5e40673ca9ca ] In patch [1], tun_msg_ctl was added to allow pass batched xdp buffers to tun_sendmsg. Although we donot use msg_controllen in this path, we should check msg_controllen to make sure the caller pass a valid msg_ctl. [1]: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=fe8dd45bb7556246c6b76277b1ba4296c91c2505 Reported-by: Eric Dumazet Suggested-by: Jason Wang Signed-off-by: Harold Huang Acked-by: Jason Wang Link: https://lore.kernel.org/r/20220303022441.383865-1-baymaxhuang@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/net/tap.c | 3 ++- drivers/net/tun.c | 3 ++- drivers/vhost/net.c | 1 + 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/tap.c b/drivers/net/tap.c index f549d3a8e59c..8f7bb15206e9 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -1202,7 +1202,8 @@ static int tap_sendmsg(struct socket *sock, struct msghdr *m, struct xdp_buff *xdp; int i; - if (ctl && (ctl->type == TUN_MSG_PTR)) { + if (m->msg_controllen == sizeof(struct tun_msg_ctl) && + ctl && ctl->type == TUN_MSG_PTR) { for (i = 0; i < ctl->num; i++) { xdp = &((struct xdp_buff *)ctl->ptr)[i]; tap_get_user_xdp(q, xdp); diff --git a/drivers/net/tun.c b/drivers/net/tun.c index ab64f3d007c1..aef966a9dae2 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -2499,7 +2499,8 @@ static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) if (!tun) return -EBADFD; - if (ctl && (ctl->type == TUN_MSG_PTR)) { + if (m->msg_controllen == sizeof(struct tun_msg_ctl) && + ctl && ctl->type == TUN_MSG_PTR) { struct tun_page tpage; int n = ctl->num; int flush = 0; diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 379bae56040a..eba91da505e0 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -472,6 +472,7 @@ static void vhost_tx_batch(struct vhost_net *net, goto signal_used; msghdr->msg_control = &ctl; + msghdr->msg_controllen = sizeof(ctl); err = sock->ops->sendmsg(sock, msghdr, 0); if (unlikely(err < 0)) { vq_err(&nvq->vq, "Fail to batch sending packets\n"); -- Gitee From 886811ae87388dcbdd558e33cae1cd3d16632ce2 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 14 Jul 2022 21:57:48 +0800 Subject: [PATCH 238/674] Bluetooth: Fix not checking for valid hdev on bt_dev_{info,warn,err,dbg} stable inclusion from stable-v5.10.111 commit f9b183f1332ad0ed23317a980064d53d8028a280 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f9b183f1332ad0ed23317a980064d53d8028a280 -------------------------------- [ Upstream commit 9b392e0e0b6d026da5a62bb79a08f32e27af858e ] This fixes attemting to print hdev->name directly which causes them to print an error: kernel: read_version:367: (efault): sock 000000006a3008f2 Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- include/net/bluetooth/bluetooth.h | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 9125effbf448..3fecc4a411a1 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -180,19 +180,21 @@ void bt_err_ratelimited(const char *fmt, ...); #define BT_DBG(fmt, ...) pr_debug(fmt "\n", ##__VA_ARGS__) #endif +#define bt_dev_name(hdev) ((hdev) ? (hdev)->name : "null") + #define bt_dev_info(hdev, fmt, ...) \ - BT_INFO("%s: " fmt, (hdev)->name, ##__VA_ARGS__) + BT_INFO("%s: " fmt, bt_dev_name(hdev), ##__VA_ARGS__) #define bt_dev_warn(hdev, fmt, ...) \ - BT_WARN("%s: " fmt, (hdev)->name, ##__VA_ARGS__) + BT_WARN("%s: " fmt, bt_dev_name(hdev), ##__VA_ARGS__) #define bt_dev_err(hdev, fmt, ...) \ - BT_ERR("%s: " fmt, (hdev)->name, ##__VA_ARGS__) + BT_ERR("%s: " fmt, bt_dev_name(hdev), ##__VA_ARGS__) #define bt_dev_dbg(hdev, fmt, ...) \ - BT_DBG("%s: " fmt, (hdev)->name, ##__VA_ARGS__) + BT_DBG("%s: " fmt, bt_dev_name(hdev), ##__VA_ARGS__) #define bt_dev_warn_ratelimited(hdev, fmt, ...) \ - bt_warn_ratelimited("%s: " fmt, (hdev)->name, ##__VA_ARGS__) + bt_warn_ratelimited("%s: " fmt, bt_dev_name(hdev), ##__VA_ARGS__) #define bt_dev_err_ratelimited(hdev, fmt, ...) \ - bt_err_ratelimited("%s: " fmt, (hdev)->name, ##__VA_ARGS__) + bt_err_ratelimited("%s: " fmt, bt_dev_name(hdev), ##__VA_ARGS__) /* Connection and socket states */ enum { -- Gitee From 20542a2d7ee0f75e490fa1df9b101b2e11b4e3d4 Mon Sep 17 00:00:00 2001 From: "Minghao Chi (CGEL ZTE)" Date: Thu, 14 Jul 2022 21:57:49 +0800 Subject: [PATCH 239/674] Bluetooth: use memset avoid memory leaks stable inclusion from stable-v5.10.111 commit 9567d54e70ff58c2695c2cc2e53c86c67551d3e6 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=9567d54e70ff58c2695c2cc2e53c86c67551d3e6 -------------------------------- [ Upstream commit d3715b2333e9a21692ba16ef8645eda584a9515d ] Use memset to initialize structs to prevent memory leaks in l2cap_ecred_connect Reported-by: Zeal Robot Signed-off-by: Minghao Chi (CGEL ZTE) Signed-off-by: Marcel Holtmann Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- net/bluetooth/l2cap_core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 0ddbc415ce15..012c1a0abda8 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -1438,6 +1438,7 @@ static void l2cap_ecred_connect(struct l2cap_chan *chan) l2cap_ecred_init(chan, 0); + memset(&data, 0, sizeof(data)); data.pdu.req.psm = chan->psm; data.pdu.req.mtu = cpu_to_le16(chan->imtu); data.pdu.req.mps = cpu_to_le16(chan->mps); -- Gitee From 5452a9496e8206fe5885bcb1d1dd116f1e9b5293 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Thu, 14 Jul 2022 21:57:50 +0800 Subject: [PATCH 240/674] bnxt_en: Eliminate unintended link toggle during FW reset stable inclusion from stable-v5.10.111 commit 79cfc0052f3916238421330c2a70474ee2df3567 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=79cfc0052f3916238421330c2a70474ee2df3567 -------------------------------- [ Upstream commit 7c492a2530c1f05441da541307c2534230dfd59b ] If the flow control settings have been changed, a subsequent FW reset may cause the ethernet link to toggle unnecessarily. This link toggle will increase the down time by a few seconds. The problem is caused by bnxt_update_phy_setting() detecting a false mismatch in the flow control settings between the stored software settings and the current FW settings after the FW reset. This mismatch is caused by the AUTONEG bit added to link_info->req_flow_ctrl in an inconsistent way in bnxt_set_pauseparam() in autoneg mode. The AUTONEG bit should not be added to link_info->req_flow_ctrl. Reviewed-by: Colin Winegarden Reviewed-by: Pavan Chebbi Signed-off-by: Michael Chan Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index 66f9d9b3de0a..d90b7b85c052 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -2049,9 +2049,7 @@ static int bnxt_set_pauseparam(struct net_device *dev, } link_info->autoneg |= BNXT_AUTONEG_FLOW_CTRL; - if (bp->hwrm_spec_code >= 0x10201) - link_info->req_flow_ctrl = - PORT_PHY_CFG_REQ_AUTO_PAUSE_AUTONEG_PAUSE; + link_info->req_flow_ctrl = 0; } else { /* when transition from auto pause to force pause, * force a link change -- Gitee From a3f2447127a7dd0c33d4feaa0b17afee2660c78e Mon Sep 17 00:00:00 2001 From: Li Chen Date: Thu, 14 Jul 2022 21:57:51 +0800 Subject: [PATCH 241/674] PCI: endpoint: Fix misused goto label stable inclusion from stable-v5.10.111 commit 7c657c0694ff690e361a13ce41c36b9dfb433ec8 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=7c657c0694ff690e361a13ce41c36b9dfb433ec8 -------------------------------- [ Upstream commit bf8d87c076f55b8b4dfdb6bc6c6b6dc0c2ccb487 ] Fix a misused goto label jump since that can result in a memory leak. Link: https://lore.kernel.org/r/17e7b9b9ee6.c6d9c6a02564.4545388417402742326@zohomail.com Signed-off-by: Li Chen Signed-off-by: Lorenzo Pieralisi Acked-by: Kishon Vijay Abraham I Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/pci/endpoint/functions/pci-epf-test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/endpoint/functions/pci-epf-test.c b/drivers/pci/endpoint/functions/pci-epf-test.c index b861840e867c..262b2c4c70c9 100644 --- a/drivers/pci/endpoint/functions/pci-epf-test.c +++ b/drivers/pci/endpoint/functions/pci-epf-test.c @@ -451,7 +451,7 @@ static int pci_epf_test_write(struct pci_epf_test *epf_test) if (!epf_test->dma_supported) { dev_err(dev, "Cannot transfer data using DMA\n"); ret = -EINVAL; - goto err_map_addr; + goto err_dma_map; } src_phys_addr = dma_map_single(dma_dev, buf, reg->size, -- Gitee From 671a69e4246fff6155e9b73eb2dc06bef77a840c Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 14 Jul 2022 21:57:52 +0800 Subject: [PATCH 242/674] MIPS: fix fortify panic when copying asm exception handlers stable inclusion from stable-v5.10.111 commit fd416c3f5a4c0e85abb443bd31b49e03d52ef07b category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=fd416c3f5a4c0e85abb443bd31b49e03d52ef07b -------------------------------- [ Upstream commit d17b66417308996e7e64b270a3c7f3c1fbd4cfc8 ] With KCFLAGS="-O3", I was able to trigger a fortify-source memcpy() overflow panic on set_vi_srs_handler(). Although O3 level is not supported in the mainline, under some conditions that may've happened with any optimization settings, it's just a matter of inlining luck. The panic itself is correct, more precisely, 50/50 false-positive and not at the same time. From the one side, no real overflow happens. Exception handler defined in asm just gets copied to some reserved places in the memory. But the reason behind is that C code refers to that exception handler declares it as `char`, i.e. something of 1 byte length. It's obvious that the asm function itself is way more than 1 byte, so fortify logics thought we are going to past the symbol declared. The standard way to refer to asm symbols from C code which is not supposed to be called from C is to declare them as `extern const u8[]`. This is fully correct from any point of view, as any code itself is just a bunch of bytes (including 0 as it is for syms like _stext/_etext/etc.), and the exact size is not known at the moment of compilation. Adjust the type of the except_vec_vi_*() and related variables. Make set_handler() take `const` as a second argument to avoid cast-away warnings and give a little more room for optimization. Signed-off-by: Alexander Lobakin Signed-off-by: Thomas Bogendoerfer Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- arch/mips/include/asm/setup.h | 2 +- arch/mips/kernel/traps.c | 22 +++++++++++----------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/mips/include/asm/setup.h b/arch/mips/include/asm/setup.h index bb36a400203d..8c56b862fd9c 100644 --- a/arch/mips/include/asm/setup.h +++ b/arch/mips/include/asm/setup.h @@ -16,7 +16,7 @@ static inline void setup_8250_early_printk_port(unsigned long base, unsigned int reg_shift, unsigned int timeout) {} #endif -extern void set_handler(unsigned long offset, void *addr, unsigned long len); +void set_handler(unsigned long offset, const void *addr, unsigned long len); extern void set_uncached_handler(unsigned long offset, void *addr, unsigned long len); typedef void (*vi_handler_t)(void); diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index e0352958e2f7..b1fe4518bd22 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -2097,19 +2097,19 @@ static void *set_vi_srs_handler(int n, vi_handler_t addr, int srs) * If no shadow set is selected then use the default handler * that does normal register saving and standard interrupt exit */ - extern char except_vec_vi, except_vec_vi_lui; - extern char except_vec_vi_ori, except_vec_vi_end; - extern char rollback_except_vec_vi; - char *vec_start = using_rollback_handler() ? - &rollback_except_vec_vi : &except_vec_vi; + extern const u8 except_vec_vi[], except_vec_vi_lui[]; + extern const u8 except_vec_vi_ori[], except_vec_vi_end[]; + extern const u8 rollback_except_vec_vi[]; + const u8 *vec_start = using_rollback_handler() ? + rollback_except_vec_vi : except_vec_vi; #if defined(CONFIG_CPU_MICROMIPS) || defined(CONFIG_CPU_BIG_ENDIAN) - const int lui_offset = &except_vec_vi_lui - vec_start + 2; - const int ori_offset = &except_vec_vi_ori - vec_start + 2; + const int lui_offset = except_vec_vi_lui - vec_start + 2; + const int ori_offset = except_vec_vi_ori - vec_start + 2; #else - const int lui_offset = &except_vec_vi_lui - vec_start; - const int ori_offset = &except_vec_vi_ori - vec_start; + const int lui_offset = except_vec_vi_lui - vec_start; + const int ori_offset = except_vec_vi_ori - vec_start; #endif - const int handler_len = &except_vec_vi_end - vec_start; + const int handler_len = except_vec_vi_end - vec_start; if (handler_len > VECTORSPACING) { /* @@ -2317,7 +2317,7 @@ void per_cpu_trap_init(bool is_boot_cpu) } /* Install CPU exception handler */ -void set_handler(unsigned long offset, void *addr, unsigned long size) +void set_handler(unsigned long offset, const void *addr, unsigned long size) { #ifdef CONFIG_CPU_MICROMIPS memcpy((void *)(ebase + offset), ((unsigned char *)addr - 1), size); -- Gitee From 5365fa404c658fbd709aaf776f9617fc1170143c Mon Sep 17 00:00:00 2001 From: Hangyu Hua Date: Thu, 14 Jul 2022 21:57:53 +0800 Subject: [PATCH 243/674] powerpc/secvar: fix refcount leak in format_show() stable inclusion from stable-v5.10.111 commit 02222bf4f0a27f6eba66d1f597cdb5daadd51829 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=02222bf4f0a27f6eba66d1f597cdb5daadd51829 -------------------------------- [ Upstream commit d601fd24e6964967f115f036a840f4f28488f63f ] Refcount leak will happen when format_show returns failure in multiple cases. Unified management of of_node_put can fix this problem. Signed-off-by: Hangyu Hua Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220302021959.10959-1-hbh25y@gmail.com Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- arch/powerpc/kernel/secvar-sysfs.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/secvar-sysfs.c b/arch/powerpc/kernel/secvar-sysfs.c index a0a78aba2083..1ee4640a2641 100644 --- a/arch/powerpc/kernel/secvar-sysfs.c +++ b/arch/powerpc/kernel/secvar-sysfs.c @@ -26,15 +26,18 @@ static ssize_t format_show(struct kobject *kobj, struct kobj_attribute *attr, const char *format; node = of_find_compatible_node(NULL, NULL, "ibm,secvar-backend"); - if (!of_device_is_available(node)) - return -ENODEV; + if (!of_device_is_available(node)) { + rc = -ENODEV; + goto out; + } rc = of_property_read_string(node, "format", &format); if (rc) - return rc; + goto out; rc = sprintf(buf, "%s\n", format); +out: of_node_put(node); return rc; -- Gitee From 17e3fdb4a43980671111eda19b77fe70610bdcfa Mon Sep 17 00:00:00 2001 From: Jianglei Nie Date: Thu, 14 Jul 2022 21:57:54 +0800 Subject: [PATCH 244/674] scsi: libfc: Fix use after free in fc_exch_abts_resp() stable inclusion from stable-v5.10.111 commit 1d7effe5fff9d28e45e18ac3a564067c7ddfe898 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=1d7effe5fff9d28e45e18ac3a564067c7ddfe898 -------------------------------- [ Upstream commit 271add11994ba1a334859069367e04d2be2ebdd4 ] fc_exch_release(ep) will decrease the ep's reference count. When the reference count reaches zero, it is freed. But ep is still used in the following code, which will lead to a use after free. Return after the fc_exch_release() call to avoid use after free. Link: https://lore.kernel.org/r/20220303015115.459778-1-niejianglei2021@163.com Reviewed-by: Hannes Reinecke Signed-off-by: Jianglei Nie Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/scsi/libfc/fc_exch.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/scsi/libfc/fc_exch.c b/drivers/scsi/libfc/fc_exch.c index a50f1eef0e0c..4261380af97b 100644 --- a/drivers/scsi/libfc/fc_exch.c +++ b/drivers/scsi/libfc/fc_exch.c @@ -1702,6 +1702,7 @@ static void fc_exch_abts_resp(struct fc_exch *ep, struct fc_frame *fp) if (cancel_delayed_work_sync(&ep->timeout_work)) { FC_EXCH_DBG(ep, "Exchange timer canceled due to ABTS response\n"); fc_exch_release(ep); /* release from pending timer hold */ + return; } spin_lock_bh(&ep->ex_lock); -- Gitee From 6e87a4cc684895159cdcbf5f624d506fe38fccbd Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Thu, 14 Jul 2022 21:57:55 +0800 Subject: [PATCH 245/674] can: isotp: set default value for N_As to 50 micro seconds stable inclusion from stable-v5.10.111 commit 74c4d50255519a1fd53dd791239f1442f1ae291b category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=74c4d50255519a1fd53dd791239f1442f1ae291b -------------------------------- [ Upstream commit 530e0d46c61314c59ecfdb8d3bcb87edbc0f85d3 ] The N_As value describes the time a CAN frame needs on the wire when transmitted by the CAN controller. Even very short CAN FD frames need arround 100 usecs (bitrate 1Mbit/s, data bitrate 8Mbit/s). Having N_As to be zero (the former default) leads to 'no CAN frame separation' when STmin is set to zero by the receiving node. This 'burst mode' should not be enabled by default as it could potentially dump a high number of CAN frames into the netdev queue from the soft hrtimer context. This does not affect the system stability but is just not nice and cooperative. With this N_As/frame_txtime value the 'burst mode' is disabled by default. As user space applications usually do not set the frame_txtime element of struct can_isotp_options the new in-kernel default is very likely overwritten with zero when the sockopt() CAN_ISOTP_OPTS is invoked. To make sure that a N_As value of zero is only set intentional the value '0' is now interpreted as 'do not change the current value'. When a frame_txtime of zero is required for testing purposes this CAN_ISOTP_FRAME_TXTIME_ZERO u32 value has to be set in frame_txtime. Link: https://lore.kernel.org/all/20220309120416.83514-2-socketcan@hartkopp.net Signed-off-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- include/uapi/linux/can/isotp.h | 28 ++++++++++++++++++++++------ net/can/isotp.c | 12 +++++++++++- 2 files changed, 33 insertions(+), 7 deletions(-) diff --git a/include/uapi/linux/can/isotp.h b/include/uapi/linux/can/isotp.h index c55935b64ccc..590f8aea2b6d 100644 --- a/include/uapi/linux/can/isotp.h +++ b/include/uapi/linux/can/isotp.h @@ -137,20 +137,16 @@ struct can_isotp_ll_options { #define CAN_ISOTP_WAIT_TX_DONE 0x400 /* wait for tx completion */ #define CAN_ISOTP_SF_BROADCAST 0x800 /* 1-to-N functional addressing */ -/* default values */ +/* protocol machine default values */ #define CAN_ISOTP_DEFAULT_FLAGS 0 #define CAN_ISOTP_DEFAULT_EXT_ADDRESS 0x00 #define CAN_ISOTP_DEFAULT_PAD_CONTENT 0xCC /* prevent bit-stuffing */ -#define CAN_ISOTP_DEFAULT_FRAME_TXTIME 0 +#define CAN_ISOTP_DEFAULT_FRAME_TXTIME 50000 /* 50 micro seconds */ #define CAN_ISOTP_DEFAULT_RECV_BS 0 #define CAN_ISOTP_DEFAULT_RECV_STMIN 0x00 #define CAN_ISOTP_DEFAULT_RECV_WFTMAX 0 -#define CAN_ISOTP_DEFAULT_LL_MTU CAN_MTU -#define CAN_ISOTP_DEFAULT_LL_TX_DL CAN_MAX_DLEN -#define CAN_ISOTP_DEFAULT_LL_TX_FLAGS 0 - /* * Remark on CAN_ISOTP_DEFAULT_RECV_* values: * @@ -162,4 +158,24 @@ struct can_isotp_ll_options { * consistency and copied directly into the flow control (FC) frame. */ +/* link layer default values => make use of Classical CAN frames */ + +#define CAN_ISOTP_DEFAULT_LL_MTU CAN_MTU +#define CAN_ISOTP_DEFAULT_LL_TX_DL CAN_MAX_DLEN +#define CAN_ISOTP_DEFAULT_LL_TX_FLAGS 0 + +/* + * The CAN_ISOTP_DEFAULT_FRAME_TXTIME has become a non-zero value as + * it only makes sense for isotp implementation tests to run without + * a N_As value. As user space applications usually do not set the + * frame_txtime element of struct can_isotp_options the new in-kernel + * default is very likely overwritten with zero when the sockopt() + * CAN_ISOTP_OPTS is invoked. + * To make sure that a N_As value of zero is only set intentional the + * value '0' is now interpreted as 'do not change the current value'. + * When a frame_txtime of zero is required for testing purposes this + * CAN_ISOTP_FRAME_TXTIME_ZERO u32 value has to be set in frame_txtime. + */ +#define CAN_ISOTP_FRAME_TXTIME_ZERO 0xFFFFFFFF + #endif /* !_UAPI_CAN_ISOTP_H */ diff --git a/net/can/isotp.c b/net/can/isotp.c index 63e6e8923200..9a4a9c5a9f24 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -141,6 +141,7 @@ struct isotp_sock { struct can_isotp_options opt; struct can_isotp_fc_options rxfc, txfc; struct can_isotp_ll_options ll; + u32 frame_txtime; u32 force_tx_stmin; u32 force_rx_stmin; struct tpcon rx, tx; @@ -360,7 +361,7 @@ static int isotp_rcv_fc(struct isotp_sock *so, struct canfd_frame *cf, int ae) so->tx_gap = ktime_set(0, 0); /* add transmission time for CAN frame N_As */ - so->tx_gap = ktime_add_ns(so->tx_gap, so->opt.frame_txtime); + so->tx_gap = ktime_add_ns(so->tx_gap, so->frame_txtime); /* add waiting time for consecutive frames N_Cs */ if (so->opt.flags & CAN_ISOTP_FORCE_TXSTMIN) so->tx_gap = ktime_add_ns(so->tx_gap, @@ -1245,6 +1246,14 @@ static int isotp_setsockopt_locked(struct socket *sock, int level, int optname, /* no separate rx_ext_address is given => use ext_address */ if (!(so->opt.flags & CAN_ISOTP_RX_EXT_ADDR)) so->opt.rx_ext_address = so->opt.ext_address; + + /* check for frame_txtime changes (0 => no changes) */ + if (so->opt.frame_txtime) { + if (so->opt.frame_txtime == CAN_ISOTP_FRAME_TXTIME_ZERO) + so->frame_txtime = 0; + else + so->frame_txtime = so->opt.frame_txtime; + } break; case CAN_ISOTP_RECV_FC: @@ -1446,6 +1455,7 @@ static int isotp_init(struct sock *sk) so->opt.rxpad_content = CAN_ISOTP_DEFAULT_PAD_CONTENT; so->opt.txpad_content = CAN_ISOTP_DEFAULT_PAD_CONTENT; so->opt.frame_txtime = CAN_ISOTP_DEFAULT_FRAME_TXTIME; + so->frame_txtime = CAN_ISOTP_DEFAULT_FRAME_TXTIME; so->rxfc.bs = CAN_ISOTP_DEFAULT_RECV_BS; so->rxfc.stmin = CAN_ISOTP_DEFAULT_RECV_STMIN; so->rxfc.wftmax = CAN_ISOTP_DEFAULT_RECV_WFTMAX; -- Gitee From c487d7a7fe5d0aeaef5b967b1c7421e5fd90c5be Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 14 Jul 2022 21:57:56 +0800 Subject: [PATCH 246/674] net: account alternate interface name memory stable inclusion from stable-v5.10.111 commit 423e7107f61ff6dfd4b380efc68c5acfc2546e4c category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=423e7107f61ff6dfd4b380efc68c5acfc2546e4c -------------------------------- [ Upstream commit 5d26cff5bdbebdf98ba48217c078ff102536f134 ] George reports that altnames can eat up kernel memory. We should charge that memory appropriately. Reported-by: George Shuklin Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- net/core/rtnetlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 59284e51a2b4..857b1db8189a 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3621,7 +3621,7 @@ static int rtnl_alt_ifname(int cmd, struct net_device *dev, struct nlattr *attr, if (err) return err; - alt_ifname = nla_strdup(attr, GFP_KERNEL); + alt_ifname = nla_strdup(attr, GFP_KERNEL_ACCOUNT); if (!alt_ifname) return -ENOMEM; -- Gitee From c005766ff8a26fa0ba8d69aad93cbc882c5dd363 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 14 Jul 2022 21:57:57 +0800 Subject: [PATCH 247/674] net: limit altnames to 64k total stable inclusion from stable-v5.10.111 commit 278b652f0ad9d34158d4834c6b00b95a808fd230 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=278b652f0ad9d34158d4834c6b00b95a808fd230 -------------------------------- [ Upstream commit 155fb43b70b5fce341347a77d1af2765d1e8fbb8 ] Property list (altname is a link "property") is wrapped in a nlattr. nlattrs length is 16bit so practically speaking the list of properties can't be longer than that, otherwise user space would have to interpret broken netlink messages. Prevent the problem from occurring by checking the length of the property list before adding new entries. Reported-by: George Shuklin Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- net/core/rtnetlink.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 857b1db8189a..3c9c2d6e3b92 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3615,12 +3615,23 @@ static int rtnl_alt_ifname(int cmd, struct net_device *dev, struct nlattr *attr, bool *changed, struct netlink_ext_ack *extack) { char *alt_ifname; + size_t size; int err; err = nla_validate(attr, attr->nla_len, IFLA_MAX, ifla_policy, extack); if (err) return err; + if (cmd == RTM_NEWLINKPROP) { + size = rtnl_prop_list_size(dev); + size += nla_total_size(ALTIFNAMSIZ); + if (size >= U16_MAX) { + NL_SET_ERR_MSG(extack, + "effective property list too long"); + return -EINVAL; + } + } + alt_ifname = nla_strdup(attr, GFP_KERNEL_ACCOUNT); if (!alt_ifname) return -ENOMEM; -- Gitee From 3a10d8db1b6e0c5fe78342c61f77ee074911942a Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Thu, 14 Jul 2022 21:57:58 +0800 Subject: [PATCH 248/674] net: sfp: add 2500base-X quirk for Lantech SFP module stable inclusion from stable-v5.10.111 commit a4dd3e9e5ae84c2757d67933828c5d16a1e76769 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=a4dd3e9e5ae84c2757d67933828c5d16a1e76769 -------------------------------- [ Upstream commit 00eec9fe4f3b9588b4bfa8ef9dd0aae96407d5d7 ] The Lantech 8330-262D-E module is 2500base-X capable, but it reports the nominal bitrate as 2500MBd instead of 3125MBd. Add a quirk for the module. The following in an EEPROM dump of such a SFP with the serial number redacted: 00: 03 04 07 00 00 00 01 20 40 0c 05 01 19 00 00 00 ???...? @????... 10: 1e 0f 00 00 4c 61 6e 74 65 63 68 20 20 20 20 20 ??..Lantech 20: 20 20 20 20 00 00 00 00 38 33 33 30 2d 32 36 32 ....8330-262 30: 44 2d 45 20 20 20 20 20 56 31 2e 30 03 52 00 cb D-E V1.0?R.? 40: 00 1a 00 00 46 43 XX XX XX XX XX XX XX XX XX XX .?..FCXXXXXXXXXX 50: 20 20 20 20 32 32 30 32 31 34 20 20 68 b0 01 98 220214 h??? 60: 45 58 54 52 45 4d 45 4c 59 20 43 4f 4d 50 41 54 EXTREMELY COMPAT 70: 49 42 4c 45 20 20 20 20 20 20 20 20 20 20 20 20 IBLE Signed-off-by: Michael Walle Link: https://lore.kernel.org/r/20220312205014.4154907-1-michael@walle.cc Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/net/phy/sfp-bus.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/phy/sfp-bus.c b/drivers/net/phy/sfp-bus.c index a05d8372669c..850915a37f4c 100644 --- a/drivers/net/phy/sfp-bus.c +++ b/drivers/net/phy/sfp-bus.c @@ -74,6 +74,12 @@ static const struct sfp_quirk sfp_quirks[] = { .vendor = "HUAWEI", .part = "MA5671A", .modes = sfp_quirk_2500basex, + }, { + // Lantech 8330-262D-E can operate at 2500base-X, but + // incorrectly report 2500MBd NRZ in their EEPROM + .vendor = "Lantech", + .part = "8330-262D-E", + .modes = sfp_quirk_2500basex, }, { .vendor = "UBNT", .part = "UF-INSTANT", -- Gitee From acb549eedca10706368d9a9d8cdaeb4f02e3d508 Mon Sep 17 00:00:00 2001 From: "H. Nikolaus Schaller" Date: Thu, 14 Jul 2022 21:57:59 +0800 Subject: [PATCH 249/674] usb: dwc3: omap: fix "unbalanced disables for smps10_out1" on omap5evm stable inclusion from stable-v5.10.111 commit f6b9550f53674226838076139ebfc1818f5bbada category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f6b9550f53674226838076139ebfc1818f5bbada -------------------------------- [ Upstream commit ac01df343e5a6c6bcead2ed421af1fde30f73e7e ] Usually, the vbus_regulator (smps10 on omap5evm) boots up disabled. Hence calling regulator_disable() indirectly through dwc3_omap_set_mailbox() during probe leads to: [ 10.332764] WARNING: CPU: 0 PID: 1628 at drivers/regulator/core.c:2853 _regulator_disable+0x40/0x164 [ 10.351919] unbalanced disables for smps10_out1 [ 10.361298] Modules linked in: dwc3_omap(+) clk_twl6040 at24 gpio_twl6040 palmas_gpadc palmas_pwrbutton industrialio snd_soc_omap_mcbsp(+) snd_soc_ti_sdma display_connector ti_tpd12s015 drm leds_gpio drm_panel_orientation_quirks ip_tables x_tables ipv6 autofs4 [ 10.387818] CPU: 0 PID: 1628 Comm: systemd-udevd Not tainted 5.17.0-rc1-letux-lpae+ #8139 [ 10.405129] Hardware name: Generic OMAP5 (Flattened Device Tree) [ 10.411455] unwind_backtrace from show_stack+0x10/0x14 [ 10.416970] show_stack from dump_stack_lvl+0x40/0x4c [ 10.422313] dump_stack_lvl from __warn+0xb8/0x170 [ 10.427377] __warn from warn_slowpath_fmt+0x70/0x9c [ 10.432595] warn_slowpath_fmt from _regulator_disable+0x40/0x164 [ 10.439037] _regulator_disable from regulator_disable+0x30/0x64 [ 10.445382] regulator_disable from dwc3_omap_set_mailbox+0x8c/0xf0 [dwc3_omap] [ 10.453116] dwc3_omap_set_mailbox [dwc3_omap] from dwc3_omap_probe+0x2b8/0x394 [dwc3_omap] [ 10.467021] dwc3_omap_probe [dwc3_omap] from platform_probe+0x58/0xa8 [ 10.481762] platform_probe from really_probe+0x168/0x2fc [ 10.481782] really_probe from __driver_probe_device+0xc4/0xd8 [ 10.481782] __driver_probe_device from driver_probe_device+0x24/0xa4 [ 10.503762] driver_probe_device from __driver_attach+0xc4/0xd8 [ 10.510018] __driver_attach from bus_for_each_dev+0x64/0xa0 [ 10.516001] bus_for_each_dev from bus_add_driver+0x148/0x1a4 [ 10.524880] bus_add_driver from driver_register+0xb4/0xf8 [ 10.530678] driver_register from do_one_initcall+0x90/0x1c4 [ 10.536661] do_one_initcall from do_init_module+0x4c/0x200 [ 10.536683] do_init_module from load_module+0x13dc/0x1910 [ 10.551159] load_module from sys_finit_module+0xc8/0xd8 [ 10.561319] sys_finit_module from __sys_trace_return+0x0/0x18 [ 10.561336] Exception stack(0xc344bfa8 to 0xc344bff0) [ 10.561341] bfa0: b6fb5778 b6fab8d8 00000007 b6ecfbb8 00000000 b6ed0398 [ 10.561341] bfc0: b6fb5778 b6fab8d8 855c0500 0000017b 00020000 b6f9a3cc 00000000 b6fb5778 [ 10.595500] bfe0: bede18f8 bede18e8 b6ec9aeb b6dda1c2 [ 10.601345] ---[ end trace 0000000000000000 ]--- Fix this unnecessary warning by checking if the regulator is enabled. Signed-off-by: H. Nikolaus Schaller Link: https://lore.kernel.org/r/af3b750dc2265d875deaabcf5f80098c9645da45.1646744616.git.hns@goldelico.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/usb/dwc3/dwc3-omap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/dwc3/dwc3-omap.c b/drivers/usb/dwc3/dwc3-omap.c index e196673f5c64..efaf0db595f4 100644 --- a/drivers/usb/dwc3/dwc3-omap.c +++ b/drivers/usb/dwc3/dwc3-omap.c @@ -242,7 +242,7 @@ static void dwc3_omap_set_mailbox(struct dwc3_omap *omap, break; case OMAP_DWC3_ID_FLOAT: - if (omap->vbus_reg) + if (omap->vbus_reg && regulator_is_enabled(omap->vbus_reg)) regulator_disable(omap->vbus_reg); val = dwc3_omap_read_utmi_ctrl(omap); val |= USBOTGSS_UTMI_OTG_CTRL_IDDIG; -- Gitee From 66ea27791e760aaa9d23ddba0b92fa76b550cf9d Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Thu, 14 Jul 2022 21:58:00 +0800 Subject: [PATCH 250/674] xtensa: fix DTC warning unit_address_format stable inclusion from stable-v5.10.111 commit 61e25021e67a0ffb34e56147590d5f1193d2eb0e category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=61e25021e67a0ffb34e56147590d5f1193d2eb0e -------------------------------- [ Upstream commit e85d29ba4b24f68e7a78cb85c55e754362eeb2de ] DTC issues the following warnings when building xtfpga device trees: /soc/flash@00000000/partition@0x0: unit name should not have leading "0x" /soc/flash@00000000/partition@0x6000000: unit name should not have leading "0x" /soc/flash@00000000/partition@0x6800000: unit name should not have leading "0x" /soc/flash@00000000/partition@0x7fe0000: unit name should not have leading "0x" Drop leading 0x from flash partition unit names. Signed-off-by: Max Filippov Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- arch/xtensa/boot/dts/xtfpga-flash-128m.dtsi | 8 ++++---- arch/xtensa/boot/dts/xtfpga-flash-16m.dtsi | 8 ++++---- arch/xtensa/boot/dts/xtfpga-flash-4m.dtsi | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/xtensa/boot/dts/xtfpga-flash-128m.dtsi b/arch/xtensa/boot/dts/xtfpga-flash-128m.dtsi index 9bf8bad1dd18..c33932568aa7 100644 --- a/arch/xtensa/boot/dts/xtfpga-flash-128m.dtsi +++ b/arch/xtensa/boot/dts/xtfpga-flash-128m.dtsi @@ -8,19 +8,19 @@ flash: flash@00000000 { reg = <0x00000000 0x08000000>; bank-width = <2>; device-width = <2>; - partition@0x0 { + partition@0 { label = "data"; reg = <0x00000000 0x06000000>; }; - partition@0x6000000 { + partition@6000000 { label = "boot loader area"; reg = <0x06000000 0x00800000>; }; - partition@0x6800000 { + partition@6800000 { label = "kernel image"; reg = <0x06800000 0x017e0000>; }; - partition@0x7fe0000 { + partition@7fe0000 { label = "boot environment"; reg = <0x07fe0000 0x00020000>; }; diff --git a/arch/xtensa/boot/dts/xtfpga-flash-16m.dtsi b/arch/xtensa/boot/dts/xtfpga-flash-16m.dtsi index 40c2f81f7cb6..7bde2ab2d6fb 100644 --- a/arch/xtensa/boot/dts/xtfpga-flash-16m.dtsi +++ b/arch/xtensa/boot/dts/xtfpga-flash-16m.dtsi @@ -8,19 +8,19 @@ flash: flash@08000000 { reg = <0x08000000 0x01000000>; bank-width = <2>; device-width = <2>; - partition@0x0 { + partition@0 { label = "boot loader area"; reg = <0x00000000 0x00400000>; }; - partition@0x400000 { + partition@400000 { label = "kernel image"; reg = <0x00400000 0x00600000>; }; - partition@0xa00000 { + partition@a00000 { label = "data"; reg = <0x00a00000 0x005e0000>; }; - partition@0xfe0000 { + partition@fe0000 { label = "boot environment"; reg = <0x00fe0000 0x00020000>; }; diff --git a/arch/xtensa/boot/dts/xtfpga-flash-4m.dtsi b/arch/xtensa/boot/dts/xtfpga-flash-4m.dtsi index fb8d3a9f33c2..0655b868749a 100644 --- a/arch/xtensa/boot/dts/xtfpga-flash-4m.dtsi +++ b/arch/xtensa/boot/dts/xtfpga-flash-4m.dtsi @@ -8,11 +8,11 @@ flash: flash@08000000 { reg = <0x08000000 0x00400000>; bank-width = <2>; device-width = <2>; - partition@0x0 { + partition@0 { label = "boot loader area"; reg = <0x00000000 0x003f0000>; }; - partition@0x3f0000 { + partition@3f0000 { label = "boot environment"; reg = <0x003f0000 0x00010000>; }; -- Gitee From bdb5b2272a8b6e6ee859ee66859761f7aa171ec1 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 14 Jul 2022 21:58:01 +0800 Subject: [PATCH 251/674] MIPS: ingenic: correct unit node address stable inclusion from stable-v5.10.111 commit 789621df196313e0b67ec1c96dbcb02bb77d35c8 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=789621df196313e0b67ec1c96dbcb02bb77d35c8 -------------------------------- [ Upstream commit 8931ddd8d6a55fcefb20f44a38ba42bb746f0b62 ] Unit node addresses should not have leading 0x: Warning (unit_address_format): /nemc@13410000/efuse@d0/eth-mac-addr@0x22: unit name should not have leading "0x" Signed-off-by: Krzysztof Kozlowski Reviewed-by: Paul Cercueil Signed-off-by: Thomas Bogendoerfer Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- arch/mips/boot/dts/ingenic/jz4780.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/boot/dts/ingenic/jz4780.dtsi b/arch/mips/boot/dts/ingenic/jz4780.dtsi index dfb5a7e1bb21..830e5dd3550e 100644 --- a/arch/mips/boot/dts/ingenic/jz4780.dtsi +++ b/arch/mips/boot/dts/ingenic/jz4780.dtsi @@ -429,7 +429,7 @@ efuse: efuse@d0 { #address-cells = <1>; #size-cells = <1>; - eth0_addr: eth-mac-addr@0x22 { + eth0_addr: eth-mac-addr@22 { reg = <0x22 0x6>; }; }; -- Gitee From 5a11d67ebcc2f039779d277218b2d1dbb5d03e32 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 14 Jul 2022 21:58:02 +0800 Subject: [PATCH 252/674] Bluetooth: Fix use after free in hci_send_acl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from stable-v5.10.111 commit 2cc803804ec9a296b3156855d6c8c4ca1c6b84be category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=2cc803804ec9a296b3156855d6c8c4ca1c6b84be -------------------------------- [ Upstream commit f63d24baff787e13b723d86fe036f84bdbc35045 ] This fixes the following trace caused by receiving HCI_EV_DISCONN_PHY_LINK_COMPLETE which does call hci_conn_del without first checking if conn->type is in fact AMP_LINK and in case it is do properly cleanup upper layers with hci_disconn_cfm: ================================================================== BUG: KASAN: use-after-free in hci_send_acl+0xaba/0xc50 Read of size 8 at addr ffff88800e404818 by task bluetoothd/142 CPU: 0 PID: 142 Comm: bluetoothd Not tainted 5.17.0-rc5-00006-gda4022eeac1a #7 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack_lvl+0x45/0x59 print_address_description.constprop.0+0x1f/0x150 kasan_report.cold+0x7f/0x11b hci_send_acl+0xaba/0xc50 l2cap_do_send+0x23f/0x3d0 l2cap_chan_send+0xc06/0x2cc0 l2cap_sock_sendmsg+0x201/0x2b0 sock_sendmsg+0xdc/0x110 sock_write_iter+0x20f/0x370 do_iter_readv_writev+0x343/0x690 do_iter_write+0x132/0x640 vfs_writev+0x198/0x570 do_writev+0x202/0x280 do_syscall_64+0x38/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae RSP: 002b:00007ffce8a099b8 EFLAGS: 00000246 ORIG_RAX: 0000000000000014 Code: 0f 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b8 0f 1f 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 14 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 89 54 24 1c 48 89 74 24 10 RDX: 0000000000000001 RSI: 00007ffce8a099e0 RDI: 0000000000000015 RAX: ffffffffffffffda RBX: 00007ffce8a099e0 RCX: 00007f788fc3cf77 R10: 00007ffce8af7080 R11: 0000000000000246 R12: 000055e4ccf75580 RBP: 0000000000000015 R08: 0000000000000002 R09: 0000000000000001 R13: 000055e4ccf754a0 R14: 000055e4ccf75cd0 R15: 000055e4ccf4a6b0 Allocated by task 45: kasan_save_stack+0x1e/0x40 __kasan_kmalloc+0x81/0xa0 hci_chan_create+0x9a/0x2f0 l2cap_conn_add.part.0+0x1a/0xdc0 l2cap_connect_cfm+0x236/0x1000 le_conn_complete_evt+0x15a7/0x1db0 hci_le_conn_complete_evt+0x226/0x2c0 hci_le_meta_evt+0x247/0x450 hci_event_packet+0x61b/0xe90 hci_rx_work+0x4d5/0xc50 process_one_work+0x8fb/0x15a0 worker_thread+0x576/0x1240 kthread+0x29d/0x340 ret_from_fork+0x1f/0x30 Freed by task 45: kasan_save_stack+0x1e/0x40 kasan_set_track+0x21/0x30 kasan_set_free_info+0x20/0x30 __kasan_slab_free+0xfb/0x130 kfree+0xac/0x350 hci_conn_cleanup+0x101/0x6a0 hci_conn_del+0x27e/0x6c0 hci_disconn_phylink_complete_evt+0xe0/0x120 hci_event_packet+0x812/0xe90 hci_rx_work+0x4d5/0xc50 process_one_work+0x8fb/0x15a0 worker_thread+0x576/0x1240 kthread+0x29d/0x340 ret_from_fork+0x1f/0x30 The buggy address belongs to the object at ffff88800c0f0500 The buggy address is located 24 bytes inside of which belongs to the cache kmalloc-128 of size 128 The buggy address belongs to the page: 128-byte region [ffff88800c0f0500, ffff88800c0f0580) flags: 0x100000000000200(slab|node=0|zone=1) page:00000000fe45cd86 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0xc0f0 raw: 0000000000000000 0000000080100010 00000001ffffffff 0000000000000000 raw: 0100000000000200 ffffea00003a2c80 dead000000000004 ffff8880078418c0 page dumped because: kasan: bad access detected ffff88800c0f0400: 00 00 00 00 00 00 00 00 00 00 00 00 00 fc fc fc Memory state around the buggy address: >ffff88800c0f0500: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88800c0f0480: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff88800c0f0580: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ^ ================================================================== ffff88800c0f0600: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb Reported-by: Sönke Huster Tested-by: Sönke Huster Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- net/bluetooth/hci_event.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 72b4127360c7..e926e80d9731 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -5061,8 +5061,9 @@ static void hci_disconn_phylink_complete_evt(struct hci_dev *hdev, hci_dev_lock(hdev); hcon = hci_conn_hash_lookup_handle(hdev, ev->phy_handle); - if (hcon) { + if (hcon && hcon->type == AMP_LINK) { hcon->state = BT_CLOSED; + hci_disconn_cfm(hcon, ev->reason); hci_conn_del(hcon); } -- Gitee From 066e7ecd0ac543cba8299d889b39801aea45a3f1 Mon Sep 17 00:00:00 2001 From: Wang Yufen Date: Thu, 14 Jul 2022 21:58:03 +0800 Subject: [PATCH 253/674] netlabel: fix out-of-bounds memory accesses stable inclusion from stable-v5.10.111 commit d745512d54fd79d58e227f6f583b84e6111204a8 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=d745512d54fd79d58e227f6f583b84e6111204a8 -------------------------------- [ Upstream commit f22881de730ebd472e15bcc2c0d1d46e36a87b9c ] In calipso_map_cat_ntoh(), in the for loop, if the return value of netlbl_bitmap_walk() is equal to (net_clen_bits - 1), when netlbl_bitmap_walk() is called next time, out-of-bounds memory accesses of bitmap[byte_offset] occurs. The bug was found during fuzzing. The following is the fuzzing report BUG: KASAN: slab-out-of-bounds in netlbl_bitmap_walk+0x3c/0xd0 Read of size 1 at addr ffffff8107bf6f70 by task err_OH/252 CPU: 7 PID: 252 Comm: err_OH Not tainted 5.17.0-rc7+ #17 Hardware name: linux,dummy-virt (DT) Call trace: dump_backtrace+0x21c/0x230 show_stack+0x1c/0x60 dump_stack_lvl+0x64/0x7c print_address_description.constprop.0+0x70/0x2d0 __kasan_report+0x158/0x16c kasan_report+0x74/0x120 __asan_load1+0x80/0xa0 netlbl_bitmap_walk+0x3c/0xd0 calipso_opt_getattr+0x1a8/0x230 calipso_sock_getattr+0x218/0x340 calipso_sock_getattr+0x44/0x60 netlbl_sock_getattr+0x44/0x80 selinux_netlbl_socket_setsockopt+0x138/0x170 selinux_socket_setsockopt+0x4c/0x60 security_socket_setsockopt+0x4c/0x90 __sys_setsockopt+0xbc/0x2b0 __arm64_sys_setsockopt+0x6c/0x84 invoke_syscall+0x64/0x190 el0_svc_common.constprop.0+0x88/0x200 do_el0_svc+0x88/0xa0 el0_svc+0x128/0x1b0 el0t_64_sync_handler+0x9c/0x120 el0t_64_sync+0x16c/0x170 Reported-by: Hulk Robot Signed-off-by: Wang Yufen Acked-by: Paul Moore Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- net/netlabel/netlabel_kapi.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c index 5e1239cef000..91b35b7c80d8 100644 --- a/net/netlabel/netlabel_kapi.c +++ b/net/netlabel/netlabel_kapi.c @@ -885,6 +885,8 @@ int netlbl_bitmap_walk(const unsigned char *bitmap, u32 bitmap_len, unsigned char bitmask; unsigned char byte; + if (offset >= bitmap_len) + return -1; byte_offset = offset / 8; byte = bitmap[byte_offset]; bit_spot = offset; -- Gitee From 58766443dd921a435c707a59129fbe75dfeecb20 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Thu, 14 Jul 2022 21:58:04 +0800 Subject: [PATCH 254/674] ceph: fix memory leak in ceph_readdir when note_last_dentry returns error stable inclusion from stable-v5.10.111 commit f4429786129648a8f4bb1e5faa143c4478cc5c4a category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f4429786129648a8f4bb1e5faa143c4478cc5c4a -------------------------------- [ Upstream commit f639d9867eea647005dc824e0e24f39ffc50d4e4 ] Reset the last_readdir at the same time, and add a comment explaining why we don't free last_readdir when dir_emit returns false. Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- fs/ceph/dir.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index f63c1a090139..1fddb9cd3e88 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -478,8 +478,11 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) 2 : (fpos_off(rde->offset) + 1); err = note_last_dentry(dfi, rde->name, rde->name_len, next_offset); - if (err) + if (err) { + ceph_mdsc_put_request(dfi->last_readdir); + dfi->last_readdir = NULL; return err; + } } else if (req->r_reply_info.dir_end) { dfi->next_offset = 2; /* keep last name */ @@ -520,6 +523,12 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) if (!dir_emit(ctx, rde->name, rde->name_len, ceph_present_ino(inode->i_sb, le64_to_cpu(rde->inode.in->ino)), le32_to_cpu(rde->inode.in->mode) >> 12)) { + /* + * NOTE: Here no need to put the 'dfi->last_readdir', + * because when dir_emit stops us it's most likely + * doesn't have enough memory, etc. So for next readdir + * it will continue. + */ dout("filldir stopping us...\n"); return 0; } -- Gitee From ca9570a8e3d291fabc8687f8ea0e8fe78c920a53 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 14 Jul 2022 21:58:05 +0800 Subject: [PATCH 255/674] init/main.c: return 1 from handled __setup() functions stable inclusion from stable-v5.10.111 commit 8d9efd4434e3479ea624b8e65e57e39a3ea91126 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=8d9efd4434e3479ea624b8e65e57e39a3ea91126 -------------------------------- [ Upstream commit f9a40b0890658330c83c95511f9d6b396610defc ] initcall_blacklist() should return 1 to indicate that it handled its cmdline arguments. set_debug_rodata() should return 1 to indicate that it handled its cmdline arguments. Print a warning if the option string is invalid. This prevents these strings from being added to the 'init' program's environment as they are not init arguments/parameters. Link: https://lkml.kernel.org/r/20220221050901.23985-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Reported-by: Igor Zhbanov Cc: Ingo Molnar Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- init/main.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/init/main.c b/init/main.c index 11c96633c3f7..41a9ce782acc 100644 --- a/init/main.c +++ b/init/main.c @@ -1110,7 +1110,7 @@ static int __init initcall_blacklist(char *str) } } while (str_entry); - return 0; + return 1; } static bool __init_or_module initcall_blacklisted(initcall_t fn) @@ -1373,7 +1373,9 @@ static noinline void __init kernel_init_freeable(void); bool rodata_enabled __ro_after_init = true; static int __init set_debug_rodata(char *str) { - return strtobool(str, &rodata_enabled); + if (strtobool(str, &rodata_enabled)) + pr_warn("Invalid option string for rodata: '%s'\n", str); + return 1; } __setup("rodata=", set_debug_rodata); #endif -- Gitee From 0c134c80d758766c4aa1de45cd00dfcfa508b0ef Mon Sep 17 00:00:00 2001 From: Qinghua Jin Date: Thu, 14 Jul 2022 21:58:06 +0800 Subject: [PATCH 256/674] minix: fix bug when opening a file with O_DIRECT stable inclusion from stable-v5.10.111 commit c9cf6baabf780b60fcfd0fe8357f053184bb876e category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=c9cf6baabf780b60fcfd0fe8357f053184bb876e -------------------------------- [ Upstream commit 9ce3c0d26c42d279b6c378a03cd6a61d828f19ca ] Testcase: 1. create a minix file system and mount it 2. open a file on the file system with O_RDWR|O_CREAT|O_TRUNC|O_DIRECT 3. open fails with -EINVAL but leaves an empty file behind. All other open() failures don't leave the failed open files behind. It is hard to check the direct_IO op before creating the inode. Just as ext4 and btrfs do, this patch will resolve the issue by allowing to create the file with O_DIRECT but returning error when writing the file. Link: https://lkml.kernel.org/r/20220107133626.413379-1-qhjin.dev@gmail.com Signed-off-by: Qinghua Jin Reported-by: Colin Ian King Reviewed-by: Jan Kara Acked-by: Christian Brauner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- fs/minix/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 34f546404aa1..e938f5b1e4b9 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -446,7 +446,8 @@ static const struct address_space_operations minix_aops = { .writepage = minix_writepage, .write_begin = minix_write_begin, .write_end = generic_write_end, - .bmap = minix_bmap + .bmap = minix_bmap, + .direct_IO = noop_direct_IO }; static const struct inode_operations minix_symlink_inode_operations = { -- Gitee From e74ffd21806dfdf0a79c961786b7380a0a59673c Mon Sep 17 00:00:00 2001 From: Adam Wujek Date: Thu, 14 Jul 2022 21:58:07 +0800 Subject: [PATCH 257/674] clk: si5341: fix reported clk_rate when output divider is 2 stable inclusion from stable-v5.10.111 commit be4ecca95819175c4a5f6f8516edeb7be8711374 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=be4ecca95819175c4a5f6f8516edeb7be8711374 -------------------------------- [ Upstream commit 2a8b539433e111c4de364237627ef219d2f6350a ] SI5341_OUT_CFG_RDIV_FORCE2 shall be checked first to distinguish whether a divider for a given output is set to 2 (SI5341_OUT_CFG_RDIV_FORCE2 is set) or the output is disabled (SI5341_OUT_CFG_RDIV_FORCE2 not set, SI5341_OUT_R_REG is set 0). Before the change, divider set to 2 (SI5341_OUT_R_REG set to 0) was interpreted as output is disabled. Signed-off-by: Adam Wujek Link: https://lore.kernel.org/r/20211203141125.2447520-1-dev_public@wujek.eu Reviewed-by: Robert Hancock Signed-off-by: Stephen Boyd Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/clk/clk-si5341.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/clk/clk-si5341.c b/drivers/clk/clk-si5341.c index 772b48ad0cd7..382a0619a048 100644 --- a/drivers/clk/clk-si5341.c +++ b/drivers/clk/clk-si5341.c @@ -789,6 +789,15 @@ static unsigned long si5341_output_clk_recalc_rate(struct clk_hw *hw, u32 r_divider; u8 r[3]; + err = regmap_read(output->data->regmap, + SI5341_OUT_CONFIG(output), &val); + if (err < 0) + return err; + + /* If SI5341_OUT_CFG_RDIV_FORCE2 is set, r_divider is 2 */ + if (val & SI5341_OUT_CFG_RDIV_FORCE2) + return parent_rate / 2; + err = regmap_bulk_read(output->data->regmap, SI5341_OUT_R_REG(output), r, 3); if (err < 0) @@ -805,13 +814,6 @@ static unsigned long si5341_output_clk_recalc_rate(struct clk_hw *hw, r_divider += 1; r_divider <<= 1; - err = regmap_read(output->data->regmap, - SI5341_OUT_CONFIG(output), &val); - if (err < 0) - return err; - - if (val & SI5341_OUT_CFG_RDIV_FORCE2) - r_divider = 2; return parent_rate / r_divider; } -- Gitee From 7c02091ee1d74add6622c1b4e323d81fe073d88e Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Thu, 14 Jul 2022 21:58:08 +0800 Subject: [PATCH 258/674] staging: vchiq_core: handle NULL result of find_service_by_handle stable inclusion from stable-v5.10.111 commit aa0b7296785312a4bfa8fac0ba8ad78698fd9fcf category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=aa0b7296785312a4bfa8fac0ba8ad78698fd9fcf -------------------------------- [ Upstream commit ca225857faf237234d2fffe5d1919467dfadd822 ] In case of an invalid handle the function find_servive_by_handle returns NULL. So take care of this and avoid a NULL pointer dereference. Reviewed-by: Nicolas Saenz Julienne Signed-off-by: Stefan Wahren Link: https://lore.kernel.org/r/1642968143-19281-18-git-send-email-stefan.wahren@i2se.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- .../staging/vc04_services/interface/vchiq_arm/vchiq_core.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_core.c b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_core.c index 38b10fd5d992..95b91fe45cb3 100644 --- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_core.c +++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_core.c @@ -2280,6 +2280,9 @@ void vchiq_msg_queue_push(unsigned int handle, struct vchiq_header *header) struct vchiq_service *service = find_service_by_handle(handle); int pos; + if (!service) + return; + while (service->msg_queue_write == service->msg_queue_read + VCHIQ_MAX_SLOTS) { if (wait_for_completion_interruptible(&service->msg_queue_pop)) @@ -2299,6 +2302,9 @@ struct vchiq_header *vchiq_msg_hold(unsigned int handle) struct vchiq_header *header; int pos; + if (!service) + return NULL; + if (service->msg_queue_write == service->msg_queue_read) return NULL; -- Gitee From ff34587608dcb43ed859a3889b9fa57d5602165f Mon Sep 17 00:00:00 2001 From: Amjad Ouled-Ameur Date: Thu, 14 Jul 2022 21:58:09 +0800 Subject: [PATCH 259/674] phy: amlogic: meson8b-usb2: Use dev_err_probe() stable inclusion from stable-v5.10.111 commit 8f1d24f85ffd0dd75b4220bc05693b90fc8cb59c category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=8f1d24f85ffd0dd75b4220bc05693b90fc8cb59c -------------------------------- [ Upstream commit 6466ba1898d415b527e1013bd8551a6fdfece94c ] Use the existing dev_err_probe() helper instead of open-coding the same operation. Signed-off-by: Amjad Ouled-Ameur Reported-by: Martin Blumenstingl Reviewed-by: Martin Blumenstingl Acked-by: Neil Armstrong Link: https://lore.kernel.org/r/20220111095255.176141-3-aouledameur@baylibre.com Signed-off-by: Vinod Koul Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/phy/amlogic/phy-meson8b-usb2.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/phy/amlogic/phy-meson8b-usb2.c b/drivers/phy/amlogic/phy-meson8b-usb2.c index 03c061dd5f0d..8f40b9342a97 100644 --- a/drivers/phy/amlogic/phy-meson8b-usb2.c +++ b/drivers/phy/amlogic/phy-meson8b-usb2.c @@ -261,8 +261,9 @@ static int phy_meson8b_usb2_probe(struct platform_device *pdev) return PTR_ERR(priv->clk_usb); priv->reset = devm_reset_control_get_optional_shared(&pdev->dev, NULL); - if (PTR_ERR(priv->reset) == -EPROBE_DEFER) - return PTR_ERR(priv->reset); + if (IS_ERR(priv->reset)) + return dev_err_probe(&pdev->dev, PTR_ERR(priv->reset), + "Failed to get the reset line"); priv->dr_mode = of_usb_get_dr_mode_by_phy(pdev->dev.of_node, -1); if (priv->dr_mode == USB_DR_MODE_UNKNOWN) { -- Gitee From 3d725385b53a2630886a4ae4476fd82d7ae695a8 Mon Sep 17 00:00:00 2001 From: Xiaoke Wang Date: Thu, 14 Jul 2022 21:58:10 +0800 Subject: [PATCH 260/674] staging: wfx: fix an error handling in wfx_init_common() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from stable-v5.10.111 commit 93498c6e775ae91732a8109dba1bdcd324908f84 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=93498c6e775ae91732a8109dba1bdcd324908f84 -------------------------------- [ Upstream commit 60f1d3c92dc1ef1026e5b917a329a7fa947da036 ] One error handler of wfx_init_common() return without calling ieee80211_free_hw(hw), which may result in memory leak. And I add one err label to unify the error handler, which is useful for the subsequent changes. Suggested-by: Jérôme Pouiller Reviewed-by: Dan Carpenter Reviewed-by: Jérôme Pouiller Signed-off-by: Xiaoke Wang Link: https://lore.kernel.org/r/tencent_24A24A3EFF61206ECCC4B94B1C5C1454E108@qq.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/staging/wfx/main.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/staging/wfx/main.c b/drivers/staging/wfx/main.c index e7bc1988124a..d5dacd5583c6 100644 --- a/drivers/staging/wfx/main.c +++ b/drivers/staging/wfx/main.c @@ -309,7 +309,8 @@ struct wfx_dev *wfx_init_common(struct device *dev, wdev->pdata.gpio_wakeup = devm_gpiod_get_optional(dev, "wakeup", GPIOD_OUT_LOW); if (IS_ERR(wdev->pdata.gpio_wakeup)) - return NULL; + goto err; + if (wdev->pdata.gpio_wakeup) gpiod_set_consumer_name(wdev->pdata.gpio_wakeup, "wfx wakeup"); @@ -328,6 +329,10 @@ struct wfx_dev *wfx_init_common(struct device *dev, return NULL; return wdev; + +err: + ieee80211_free_hw(hw); + return NULL; } int wfx_probe(struct wfx_dev *wdev) -- Gitee From 317afd94832fb99d1beb89dcb654f1b2154b30c3 Mon Sep 17 00:00:00 2001 From: Lucas Denefle Date: Thu, 14 Jul 2022 21:58:11 +0800 Subject: [PATCH 261/674] w1: w1_therm: fixes w1_seq for ds28ea00 sensors stable inclusion from stable-v5.10.111 commit 2e16895d06e6e9a0057ecb93f03531c13cc8d595 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=2e16895d06e6e9a0057ecb93f03531c13cc8d595 -------------------------------- [ Upstream commit 41a92a89eee819298f805c40187ad8b02bb53426 ] w1_seq was failing due to several devices responding to the CHAIN_DONE at the same time. Now properly selects the current device in the chain with MATCH_ROM. Also acknowledgment was read twice. Signed-off-by: Lucas Denefle Link: https://lore.kernel.org/r/20220223113558.232750-1-lucas.denefle@converge.io Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/w1/slaves/w1_therm.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/w1/slaves/w1_therm.c b/drivers/w1/slaves/w1_therm.c index 974d02bb3a45..6546d029c7fd 100644 --- a/drivers/w1/slaves/w1_therm.c +++ b/drivers/w1/slaves/w1_therm.c @@ -2092,16 +2092,20 @@ static ssize_t w1_seq_show(struct device *device, if (sl->reg_num.id == reg_num->id) seq = i; + if (w1_reset_bus(sl->master)) + goto error; + + /* Put the device into chain DONE state */ + w1_write_8(sl->master, W1_MATCH_ROM); + w1_write_block(sl->master, (u8 *)&rn, 8); w1_write_8(sl->master, W1_42_CHAIN); w1_write_8(sl->master, W1_42_CHAIN_DONE); w1_write_8(sl->master, W1_42_CHAIN_DONE_INV); - w1_read_block(sl->master, &ack, sizeof(ack)); /* check for acknowledgment */ ack = w1_read_8(sl->master); if (ack != W1_42_SUCCESS_CONFIRM_BYTE) goto error; - } /* Exit from CHAIN state */ -- Gitee From a6d9fbda9f5f5f54277f445a72a2fe53374f35c0 Mon Sep 17 00:00:00 2001 From: Xin Xiong Date: Thu, 14 Jul 2022 21:58:12 +0800 Subject: [PATCH 262/674] NFSv4.2: fix reference count leaks in _nfs42_proc_copy_notify() stable inclusion from stable-v5.10.111 commit 9b9feec97c1fc7dd9bb69f62c4905cddf1801599 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=9b9feec97c1fc7dd9bb69f62c4905cddf1801599 -------------------------------- [ Upstream commit b7f114edd54326f730a754547e7cfb197b5bc132 ] [You don't often get email from xiongx18@fudan.edu.cn. Learn why this is important at http://aka.ms/LearnAboutSenderIdentification.] The reference counting issue happens in two error paths in the function _nfs42_proc_copy_notify(). In both error paths, the function simply returns the error code and forgets to balance the refcount of object `ctx`, bumped by get_nfs_open_context() earlier, which may cause refcount leaks. Fix it by balancing refcount of the `ctx` object before the function returns in both error paths. Signed-off-by: Xin Xiong Signed-off-by: Xiyu Yang Signed-off-by: Xin Tan Signed-off-by: Trond Myklebust Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- fs/nfs/nfs42proc.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 2587b1b8e2ef..dad32b171e67 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -567,8 +567,10 @@ static int _nfs42_proc_copy_notify(struct file *src, struct file *dst, ctx = get_nfs_open_context(nfs_file_open_context(src)); l_ctx = nfs_get_lock_context(ctx); - if (IS_ERR(l_ctx)) - return PTR_ERR(l_ctx); + if (IS_ERR(l_ctx)) { + status = PTR_ERR(l_ctx); + goto out; + } status = nfs4_set_rw_stateid(&args->cna_src_stateid, ctx, l_ctx, FMODE_READ); @@ -576,7 +578,7 @@ static int _nfs42_proc_copy_notify(struct file *src, struct file *dst, if (status) { if (status == -EAGAIN) status = -NFS4ERR_BAD_STATEID; - return status; + goto out; } status = nfs4_call_sync(src_server->client, src_server, &msg, @@ -584,6 +586,7 @@ static int _nfs42_proc_copy_notify(struct file *src, struct file *dst, if (status == -ENOTSUPP) src_server->caps &= ~NFS_CAP_COPY_NOTIFY; +out: put_nfs_open_context(nfs_file_open_context(src)); return status; } -- Gitee From f86e249e18079b72aa7414b102c43919f4d44a73 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 14 Jul 2022 21:58:13 +0800 Subject: [PATCH 263/674] NFSv4: Protect the state recovery thread against direct reclaim stable inclusion from stable-v5.10.111 commit 4a2544ce244bd27825a940824d1084505188695a category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=4a2544ce244bd27825a940824d1084505188695a -------------------------------- [ Upstream commit 3e17898aca293a24dae757a440a50aa63ca29671 ] If memory allocation triggers a direct reclaim from the state recovery thread, then we can deadlock. Use memalloc_nofs_save/restore to ensure that doesn't happen. Signed-off-by: Trond Myklebust Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- fs/nfs/nfs4state.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index cbeec29e9f21..a8fe8f84c5ae 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -49,6 +49,7 @@ #include #include #include +#include #include @@ -2557,9 +2558,17 @@ static void nfs4_layoutreturn_any_run(struct nfs_client *clp) static void nfs4_state_manager(struct nfs_client *clp) { + unsigned int memflags; int status = 0; const char *section = "", *section_sep = ""; + /* + * State recovery can deadlock if the direct reclaim code tries + * start NFS writeback. So ensure memory allocations are all + * GFP_NOFS. + */ + memflags = memalloc_nofs_save(); + /* Ensure exclusive access to NFSv4 state */ do { trace_nfs4_state_mgr(clp); @@ -2654,6 +2663,7 @@ static void nfs4_state_manager(struct nfs_client *clp) clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state); } + memalloc_nofs_restore(memflags); nfs4_end_drain_session(clp); nfs4_clear_state_manager_bit(clp); @@ -2671,6 +2681,7 @@ static void nfs4_state_manager(struct nfs_client *clp) return; if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0) return; + memflags = memalloc_nofs_save(); } while (refcount_read(&clp->cl_count) > 1 && !signalled()); goto out_drain; @@ -2683,6 +2694,7 @@ static void nfs4_state_manager(struct nfs_client *clp) clp->cl_hostname, -status); ssleep(1); out_drain: + memalloc_nofs_restore(memflags); nfs4_end_drain_session(clp); nfs4_clear_state_manager_bit(clp); } -- Gitee From 39f755411208bdd8c0667901ca55b3ba48dff0f5 Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Thu, 14 Jul 2022 21:58:14 +0800 Subject: [PATCH 264/674] xen: delay xen_hvm_init_time_ops() if kdump is boot on vcpu>=32 stable inclusion from stable-v5.10.111 commit a2a0e04f6478e8c1038db64717f3fafd55de1420 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=a2a0e04f6478e8c1038db64717f3fafd55de1420 -------------------------------- [ Upstream commit eed05744322da07dd7e419432dcedf3c2e017179 ] The sched_clock() can be used very early since commit 857baa87b642 ("sched/clock: Enable sched clock early"). In addition, with commit 38669ba205d1 ("x86/xen/time: Output xen sched_clock time from 0"), kdump kernel in Xen HVM guest may panic at very early stage when accessing &__this_cpu_read(xen_vcpu)->time as in below: setup_arch() -> init_hypervisor_platform() -> x86_init.hyper.init_platform = xen_hvm_guest_init() -> xen_hvm_init_time_ops() -> xen_clocksource_read() -> src = &__this_cpu_read(xen_vcpu)->time; This is because Xen HVM supports at most MAX_VIRT_CPUS=32 'vcpu_info' embedded inside 'shared_info' during early stage until xen_vcpu_setup() is used to allocate/relocate 'vcpu_info' for boot cpu at arbitrary address. However, when Xen HVM guest panic on vcpu >= 32, since xen_vcpu_info_reset(0) would set per_cpu(xen_vcpu, cpu) = NULL when vcpu >= 32, xen_clocksource_read() on vcpu >= 32 would panic. This patch calls xen_hvm_init_time_ops() again later in xen_hvm_smp_prepare_boot_cpu() after the 'vcpu_info' for boot vcpu is registered when the boot vcpu is >= 32. This issue can be reproduced on purpose via below command at the guest side when kdump/kexec is enabled: "taskset -c 33 echo c > /proc/sysrq-trigger" The bugfix for PVM is not implemented due to the lack of testing environment. [boris: xen_hvm_init_time_ops() returns on errors instead of jumping to end] Cc: Joe Jin Signed-off-by: Dongli Zhang Reviewed-by: Boris Ostrovsky Link: https://lore.kernel.org/r/20220302164032.14569-3-dongli.zhang@oracle.com Signed-off-by: Boris Ostrovsky Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- arch/x86/xen/smp_hvm.c | 6 ++++++ arch/x86/xen/time.c | 24 +++++++++++++++++++++++- 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/arch/x86/xen/smp_hvm.c b/arch/x86/xen/smp_hvm.c index 6ff3c887e0b9..b70afdff419c 100644 --- a/arch/x86/xen/smp_hvm.c +++ b/arch/x86/xen/smp_hvm.c @@ -19,6 +19,12 @@ static void __init xen_hvm_smp_prepare_boot_cpu(void) */ xen_vcpu_setup(0); + /* + * Called again in case the kernel boots on vcpu >= MAX_VIRT_CPUS. + * Refer to comments in xen_hvm_init_time_ops(). + */ + xen_hvm_init_time_ops(); + /* * The alternative logic (which patches the unlock/lock) runs before * the smp bootup up code is activated. Hence we need to set this up diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 91f5b330dcc6..8183d17e1cf1 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -556,6 +556,11 @@ static void xen_hvm_setup_cpu_clockevents(void) void __init xen_hvm_init_time_ops(void) { + static bool hvm_time_initialized; + + if (hvm_time_initialized) + return; + /* * vector callback is needed otherwise we cannot receive interrupts * on cpu > 0 and at this point we don't know how many cpus are @@ -565,7 +570,22 @@ void __init xen_hvm_init_time_ops(void) return; if (!xen_feature(XENFEAT_hvm_safe_pvclock)) { - pr_info("Xen doesn't support pvclock on HVM, disable pv timer"); + pr_info_once("Xen doesn't support pvclock on HVM, disable pv timer"); + return; + } + + /* + * Only MAX_VIRT_CPUS 'vcpu_info' are embedded inside 'shared_info'. + * The __this_cpu_read(xen_vcpu) is still NULL when Xen HVM guest + * boots on vcpu >= MAX_VIRT_CPUS (e.g., kexec), To access + * __this_cpu_read(xen_vcpu) via xen_clocksource_read() will panic. + * + * The xen_hvm_init_time_ops() should be called again later after + * __this_cpu_read(xen_vcpu) is available. + */ + if (!__this_cpu_read(xen_vcpu)) { + pr_info("Delay xen_init_time_common() as kernel is running on vcpu=%d\n", + xen_vcpu_nr(0)); return; } @@ -577,6 +597,8 @@ void __init xen_hvm_init_time_ops(void) x86_platform.calibrate_tsc = xen_tsc_khz; x86_platform.get_wallclock = xen_get_wallclock; x86_platform.set_wallclock = xen_set_wallclock; + + hvm_time_initialized = true; } #endif -- Gitee From 97bb889524f242fa10776e7d0ce2df8997848cf2 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Thu, 14 Jul 2022 21:58:15 +0800 Subject: [PATCH 265/674] clk: ti: Preserve node in ti_dt_clocks_register() stable inclusion from stable-v5.10.111 commit 1e9b5538cf160788c88fc2e3e74badc65b8a6bac category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=1e9b5538cf160788c88fc2e3e74badc65b8a6bac -------------------------------- [ Upstream commit 80864594ff2ad002e2755daf97d46ff0c86faf1f ] In preparation for making use of the clock-output-names, we want to keep node around in ti_dt_clocks_register(). This change should not needed as a fix currently. Signed-off-by: Tony Lindgren Link: https://lore.kernel.org/r/20220204071449.16762-3-tony@atomide.com Signed-off-by: Stephen Boyd Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/clk/ti/clk.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/clk/ti/clk.c b/drivers/clk/ti/clk.c index 3da33c786d77..29eafab4353e 100644 --- a/drivers/clk/ti/clk.c +++ b/drivers/clk/ti/clk.c @@ -131,7 +131,7 @@ int ti_clk_setup_ll_ops(struct ti_clk_ll_ops *ops) void __init ti_dt_clocks_register(struct ti_dt_clk oclks[]) { struct ti_dt_clk *c; - struct device_node *node, *parent; + struct device_node *node, *parent, *child; struct clk *clk; struct of_phandle_args clkspec; char buf[64]; @@ -171,10 +171,13 @@ void __init ti_dt_clocks_register(struct ti_dt_clk oclks[]) node = of_find_node_by_name(NULL, buf); if (num_args && compat_mode) { parent = node; - node = of_get_child_by_name(parent, "clock"); - if (!node) - node = of_get_child_by_name(parent, "clk"); - of_node_put(parent); + child = of_get_child_by_name(parent, "clock"); + if (!child) + child = of_get_child_by_name(parent, "clk"); + if (child) { + of_node_put(parent); + node = child; + } } clkspec.np = node; -- Gitee From 8c27e3bcd939b56a2b323fbb15e428a0d77a09fc Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Thu, 14 Jul 2022 21:58:16 +0800 Subject: [PATCH 266/674] clk: Enforce that disjoints limits are invalid stable inclusion from stable-v5.10.111 commit e2b2542f7452672ce2511b73ccea234d51dd5827 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=e2b2542f7452672ce2511b73ccea234d51dd5827 -------------------------------- [ Upstream commit 10c46f2ea914202482d19cf80dcc9c321c9ff59b ] If we were to have two users of the same clock, doing something like: clk_set_rate_range(user1, 1000, 2000); clk_set_rate_range(user2, 3000, 4000); The second call would fail with -EINVAL, preventing from getting in a situation where we end up with impossible limits. However, this is never explicitly checked against and enforced, and works by relying on an undocumented behaviour of clk_set_rate(). Indeed, on the first clk_set_rate_range will make sure the current clock rate is within the new range, so it will be between 1000 and 2000Hz. On the second clk_set_rate_range(), it will consider (rightfully), that our current clock is outside of the 3000-4000Hz range, and will call clk_core_set_rate_nolock() to set it to 3000Hz. clk_core_set_rate_nolock() will then call clk_calc_new_rates() that will eventually check that our rate 3000Hz rate is outside the min 3000Hz max 2000Hz range, will bail out, the error will propagate and we'll eventually return -EINVAL. This solely relies on the fact that clk_calc_new_rates(), and in particular clk_core_determine_round_nolock(), won't modify the new rate allowing the error to be reported. That assumption won't be true for all drivers, and most importantly we'll break that assumption in a later patch. It can also be argued that we shouldn't even reach the point where we're calling clk_core_set_rate_nolock(). Let's make an explicit check for disjoints range before we're doing anything. Signed-off-by: Maxime Ripard Link: https://lore.kernel.org/r/20220225143534.405820-4-maxime@cerno.tech Signed-off-by: Stephen Boyd Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/clk/clk.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index 92fc084203b7..2e56cc0a3bce 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -631,6 +631,24 @@ static void clk_core_get_boundaries(struct clk_core *core, *max_rate = min(*max_rate, clk_user->max_rate); } +static bool clk_core_check_boundaries(struct clk_core *core, + unsigned long min_rate, + unsigned long max_rate) +{ + struct clk *user; + + lockdep_assert_held(&prepare_lock); + + if (min_rate > core->max_rate || max_rate < core->min_rate) + return false; + + hlist_for_each_entry(user, &core->clks, clks_node) + if (min_rate > user->max_rate || max_rate < user->min_rate) + return false; + + return true; +} + void clk_hw_set_rate_range(struct clk_hw *hw, unsigned long min_rate, unsigned long max_rate) { @@ -2332,6 +2350,11 @@ int clk_set_rate_range(struct clk *clk, unsigned long min, unsigned long max) clk->min_rate = min; clk->max_rate = max; + if (!clk_core_check_boundaries(clk->core, min, max)) { + ret = -EINVAL; + goto out; + } + rate = clk_core_get_rate_nolock(clk->core); if (rate < min || rate > max) { /* @@ -2360,6 +2383,7 @@ int clk_set_rate_range(struct clk *clk, unsigned long min, unsigned long max) } } +out: if (clk->exclusive_count) clk_core_rate_protect(clk->core); -- Gitee From 2668aa0c0e749b8b4f9404e93d6b3b9550a41508 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 14 Jul 2022 21:58:17 +0800 Subject: [PATCH 267/674] SUNRPC/call_alloc: async tasks mustn't block waiting for memory stable inclusion from stable-v5.10.111 commit f9244d31e05a26fdf458207ec88617bfae04fe8f category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f9244d31e05a26fdf458207ec88617bfae04fe8f -------------------------------- [ Upstream commit c487216bec83b0c5a8803e5c61433d33ad7b104d ] When memory is short, new worker threads cannot be created and we depend on the minimum one rpciod thread to be able to handle everything. So it must not block waiting for memory. mempools are particularly a problem as memory can only be released back to the mempool by an async rpc task running. If all available workqueue threads are waiting on the mempool, no thread is available to return anything. rpc_malloc() can block, and this might cause deadlocks. So check RPC_IS_ASYNC(), rather than RPC_IS_SWAPPER() to determine if blocking is acceptable. Signed-off-by: NeilBrown Signed-off-by: Trond Myklebust Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- net/sunrpc/sched.c | 4 +++- net/sunrpc/xprtrdma/transport.c | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index c045f63d11fa..6e4d476c6324 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -1012,8 +1012,10 @@ int rpc_malloc(struct rpc_task *task) struct rpc_buffer *buf; gfp_t gfp = GFP_NOFS; + if (RPC_IS_ASYNC(task)) + gfp = GFP_NOWAIT | __GFP_NOWARN; if (RPC_IS_SWAPPER(task)) - gfp = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN; + gfp |= __GFP_MEMALLOC; size += sizeof(struct rpc_buffer); if (size <= RPC_BUFFER_MAXSIZE) diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 8e2368a0c2a2..fb7a0ab27899 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -572,8 +572,10 @@ xprt_rdma_allocate(struct rpc_task *task) gfp_t flags; flags = RPCRDMA_DEF_GFP; + if (RPC_IS_ASYNC(task)) + flags = GFP_NOWAIT | __GFP_NOWARN; if (RPC_IS_SWAPPER(task)) - flags = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN; + flags |= __GFP_MEMALLOC; if (!rpcrdma_check_regbuf(r_xprt, req->rl_sendbuf, rqst->rq_callsize, flags)) -- Gitee From 621a0c7c51368d01902e41ab672a2a1960a23d36 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 14 Jul 2022 21:58:18 +0800 Subject: [PATCH 268/674] SUNRPC/xprt: async tasks mustn't block waiting for memory stable inclusion from stable-v5.10.111 commit f4fc47e71e32ac64b33e0d36e2aceeda65d46d4c category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f4fc47e71e32ac64b33e0d36e2aceeda65d46d4c -------------------------------- [ Upstream commit a721035477fb5fb8abc738fbe410b07c12af3dc5 ] When memory is short, new worker threads cannot be created and we depend on the minimum one rpciod thread to be able to handle everything. So it must not block waiting for memory. xprt_dynamic_alloc_slot can block indefinitely. This can tie up all workqueue threads and NFS can deadlock. So when called from a workqueue, set __GFP_NORETRY. The rdma alloc_slot already does not block. However it sets the error to -EAGAIN suggesting this will trigger a sleep. It does not. As we can see in call_reserveresult(), only -ENOMEM causes a sleep. -EAGAIN causes immediate retry. Signed-off-by: NeilBrown Signed-off-by: Trond Myklebust Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- net/sunrpc/xprt.c | 5 ++++- net/sunrpc/xprtrdma/transport.c | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 3ea27bb3f512..cb5acd451489 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1656,12 +1656,15 @@ static bool xprt_throttle_congested(struct rpc_xprt *xprt, struct rpc_task *task static struct rpc_rqst *xprt_dynamic_alloc_slot(struct rpc_xprt *xprt) { struct rpc_rqst *req = ERR_PTR(-EAGAIN); + gfp_t gfp_mask = GFP_KERNEL; if (xprt->num_reqs >= xprt->max_reqs) goto out; ++xprt->num_reqs; spin_unlock(&xprt->reserve_lock); - req = kzalloc(sizeof(struct rpc_rqst), GFP_NOFS); + if (current->flags & PF_WQ_WORKER) + gfp_mask |= __GFP_NORETRY | __GFP_NOWARN; + req = kzalloc(sizeof(*req), gfp_mask); spin_lock(&xprt->reserve_lock); if (req != NULL) goto out; diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index fb7a0ab27899..9cf10cfb85c6 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -519,7 +519,7 @@ xprt_rdma_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task) return; out_sleep: - task->tk_status = -EAGAIN; + task->tk_status = -ENOMEM; xprt_add_backlog(xprt, task); } -- Gitee From be98f7ada5341938e53fbd08c2d1169fd2c57677 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 14 Jul 2022 21:58:19 +0800 Subject: [PATCH 269/674] SUNRPC: remove scheduling boost for "SWAPPER" tasks. stable inclusion from stable-v5.10.111 commit 4b6f122bdfdc048715d0ee630b3b20a1eea8094b category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=4b6f122bdfdc048715d0ee630b3b20a1eea8094b -------------------------------- [ Upstream commit a80a8461868905823609be97f91776a26befe839 ] Currently, tasks marked as "swapper" tasks get put to the front of non-priority rpc_queues, and are sorted earlier than non-swapper tasks on the transport's ->xmit_queue. This is pointless as currently *all* tasks for a mount that has swap enabled on *any* file are marked as "swapper" tasks. So the net result is that the non-priority rpc_queues are reverse-ordered (LIFO). This scheduling boost is not necessary to avoid deadlocks, and hurts fairness, so remove it. If there were a need to expedite some requests, the tk_priority mechanism is a more appropriate tool. Signed-off-by: NeilBrown Signed-off-by: Trond Myklebust Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- net/sunrpc/sched.c | 7 ------- net/sunrpc/xprt.c | 11 ----------- 2 files changed, 18 deletions(-) diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 6e4d476c6324..f0f55fbd1375 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -186,11 +186,6 @@ static void __rpc_add_wait_queue_priority(struct rpc_wait_queue *queue, /* * Add new request to wait queue. - * - * Swapper tasks always get inserted at the head of the queue. - * This should avoid many nasty memory deadlocks and hopefully - * improve overall performance. - * Everyone else gets appended to the queue to ensure proper FIFO behavior. */ static void __rpc_add_wait_queue(struct rpc_wait_queue *queue, struct rpc_task *task, @@ -199,8 +194,6 @@ static void __rpc_add_wait_queue(struct rpc_wait_queue *queue, INIT_LIST_HEAD(&task->u.tk_wait.timer_list); if (RPC_IS_PRIORITY(queue)) __rpc_add_wait_queue_priority(queue, task, queue_priority); - else if (RPC_IS_SWAPPER(task)) - list_add(&task->u.tk_wait.list, &queue->tasks[0]); else list_add_tail(&task->u.tk_wait.list, &queue->tasks[0]); task->tk_waitqueue = queue; diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index cb5acd451489..55b0c2b74933 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1327,17 +1327,6 @@ xprt_request_enqueue_transmit(struct rpc_task *task) INIT_LIST_HEAD(&req->rq_xmit2); goto out; } - } else if (RPC_IS_SWAPPER(task)) { - list_for_each_entry(pos, &xprt->xmit_queue, rq_xmit) { - if (pos->rq_cong || pos->rq_bytes_sent) - continue; - if (RPC_IS_SWAPPER(pos->rq_task)) - continue; - /* Note: req is added _before_ pos */ - list_add_tail(&req->rq_xmit, &pos->rq_xmit); - INIT_LIST_HEAD(&req->rq_xmit2); - goto out; - } } else if (!req->rq_seqno) { list_for_each_entry(pos, &xprt->xmit_queue, rq_xmit) { if (pos->rq_task->tk_owner != task->tk_owner) -- Gitee From eb6fd2d61f35cb3d7e4221dc441466c3283d5547 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 14 Jul 2022 21:58:20 +0800 Subject: [PATCH 270/674] NFS: swap IO handling is slightly different for O_DIRECT IO stable inclusion from stable-v5.10.111 commit d4170a28217a5abd168e18581add641716c14057 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=d4170a28217a5abd168e18581add641716c14057 -------------------------------- [ Upstream commit 64158668ac8b31626a8ce48db4cad08496eb8340 ] 1/ Taking the i_rwsem for swap IO triggers lockdep warnings regarding possible deadlocks with "fs_reclaim". These deadlocks could, I believe, eventuate if a buffered read on the swapfile was attempted. We don't need coherence with the page cache for a swap file, and buffered writes are forbidden anyway. There is no other need for i_rwsem during direct IO. So never take it for swap_rw() 2/ generic_write_checks() explicitly forbids writes to swap, and performs checks that are not needed for swap. So bypass it for swap_rw(). Signed-off-by: NeilBrown Signed-off-by: Trond Myklebust Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- fs/nfs/direct.c | 42 ++++++++++++++++++++++++++++-------------- fs/nfs/file.c | 4 ++-- include/linux/nfs_fs.h | 8 ++++---- 3 files changed, 34 insertions(+), 20 deletions(-) diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 3c0335c15a73..28afc315ec0c 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -172,8 +172,8 @@ ssize_t nfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE); if (iov_iter_rw(iter) == READ) - return nfs_file_direct_read(iocb, iter); - return nfs_file_direct_write(iocb, iter); + return nfs_file_direct_read(iocb, iter, true); + return nfs_file_direct_write(iocb, iter, true); } static void nfs_direct_release_pages(struct page **pages, unsigned int npages) @@ -424,6 +424,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, * nfs_file_direct_read - file direct read operation for NFS files * @iocb: target I/O control block * @iter: vector of user buffers into which to read data + * @swap: flag indicating this is swap IO, not O_DIRECT IO * * We use this function for direct reads instead of calling * generic_file_aio_read() in order to avoid gfar's check to see if @@ -439,7 +440,8 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, * client must read the updated atime from the server back into its * cache. */ -ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter) +ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, + bool swap) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -481,12 +483,14 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter) if (iter_is_iovec(iter)) dreq->flags = NFS_ODIRECT_SHOULD_DIRTY; - nfs_start_io_direct(inode); + if (!swap) + nfs_start_io_direct(inode); NFS_I(inode)->read_io += count; requested = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos); - nfs_end_io_direct(inode); + if (!swap) + nfs_end_io_direct(inode); if (requested > 0) { result = nfs_direct_wait(dreq); @@ -875,6 +879,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, * nfs_file_direct_write - file direct write operation for NFS files * @iocb: target I/O control block * @iter: vector of user buffers from which to write data + * @swap: flag indicating this is swap IO, not O_DIRECT IO * * We use this function for direct writes instead of calling * generic_file_aio_write() in order to avoid taking the inode @@ -891,7 +896,8 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, * Note that O_APPEND is not supported for NFS direct writes, as there * is no atomic O_APPEND write facility in the NFS protocol. */ -ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) +ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter, + bool swap) { ssize_t result, requested; size_t count; @@ -905,7 +911,11 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n", file, iov_iter_count(iter), (long long) iocb->ki_pos); - result = generic_write_checks(iocb, iter); + if (swap) + /* bypass generic checks */ + result = iov_iter_count(iter); + else + result = generic_write_checks(iocb, iter); if (result <= 0) return result; count = result; @@ -936,16 +946,20 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) dreq->iocb = iocb; pnfs_init_ds_commit_info_ops(&dreq->ds_cinfo, inode); - nfs_start_io_direct(inode); + if (swap) { + requested = nfs_direct_write_schedule_iovec(dreq, iter, pos); + } else { + nfs_start_io_direct(inode); - requested = nfs_direct_write_schedule_iovec(dreq, iter, pos); + requested = nfs_direct_write_schedule_iovec(dreq, iter, pos); - if (mapping->nrpages) { - invalidate_inode_pages2_range(mapping, - pos >> PAGE_SHIFT, end); - } + if (mapping->nrpages) { + invalidate_inode_pages2_range(mapping, + pos >> PAGE_SHIFT, end); + } - nfs_end_io_direct(inode); + nfs_end_io_direct(inode); + } if (requested > 0) { result = nfs_direct_wait(dreq); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index f96367a2463e..aff1e65af6bd 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -158,7 +158,7 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to) ssize_t result; if (iocb->ki_flags & IOCB_DIRECT) - return nfs_file_direct_read(iocb, to); + return nfs_file_direct_read(iocb, to, false); dprintk("NFS: read(%pD2, %zu@%lu)\n", iocb->ki_filp, @@ -609,7 +609,7 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) return result; if (iocb->ki_flags & IOCB_DIRECT) - return nfs_file_direct_write(iocb, from); + return nfs_file_direct_write(iocb, from, false); dprintk("NFS: write(%pD2, %zu@%Ld)\n", file, iov_iter_count(from), (long long) iocb->ki_pos); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 1e0a3497bdb4..2a17e0dfd431 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -478,10 +478,10 @@ static inline const struct cred *nfs_file_cred(struct file *file) * linux/fs/nfs/direct.c */ extern ssize_t nfs_direct_IO(struct kiocb *, struct iov_iter *); -extern ssize_t nfs_file_direct_read(struct kiocb *iocb, - struct iov_iter *iter); -extern ssize_t nfs_file_direct_write(struct kiocb *iocb, - struct iov_iter *iter); +ssize_t nfs_file_direct_read(struct kiocb *iocb, + struct iov_iter *iter, bool swap); +ssize_t nfs_file_direct_write(struct kiocb *iocb, + struct iov_iter *iter, bool swap); /* * linux/fs/nfs/dir.c -- Gitee From 9ae11d7acbd8c4b59ebf92f2af95b053fbb55980 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 14 Jul 2022 21:58:21 +0800 Subject: [PATCH 271/674] NFS: swap-out must always use STABLE writes. stable inclusion from stable-v5.10.111 commit b3882e78aa0a924fa6c2c09fe74aaaa2299a34ad category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=b3882e78aa0a924fa6c2c09fe74aaaa2299a34ad -------------------------------- [ Upstream commit c265de257f558a05c1859ee9e3fed04883b9ec0e ] The commit handling code is not safe against memory-pressure deadlocks when writing to swap. In particular, nfs_commitdata_alloc() blocks indefinitely waiting for memory, and this can consume all available workqueue threads. swap-out most likely uses STABLE writes anyway as COND_STABLE indicates that a stable write should be used if the write fits in a single request, and it normally does. However if we ever swap with a small wsize, or gather unusually large numbers of pages for a single write, this might change. For safety, make it explicit in the code that direct writes used for swap must always use FLUSH_STABLE. Signed-off-by: NeilBrown Signed-off-by: Trond Myklebust Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- fs/nfs/direct.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 28afc315ec0c..c220810c61d1 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -793,7 +793,7 @@ static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = { */ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, struct iov_iter *iter, - loff_t pos) + loff_t pos, int ioflags) { struct nfs_pageio_descriptor desc; struct inode *inode = dreq->inode; @@ -801,7 +801,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, size_t requested_bytes = 0; size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE); - nfs_pageio_init_write(&desc, inode, FLUSH_COND_STABLE, false, + nfs_pageio_init_write(&desc, inode, ioflags, false, &nfs_direct_write_completion_ops); desc.pg_dreq = dreq; get_dreq(dreq); @@ -947,11 +947,13 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter, pnfs_init_ds_commit_info_ops(&dreq->ds_cinfo, inode); if (swap) { - requested = nfs_direct_write_schedule_iovec(dreq, iter, pos); + requested = nfs_direct_write_schedule_iovec(dreq, iter, pos, + FLUSH_STABLE); } else { nfs_start_io_direct(inode); - requested = nfs_direct_write_schedule_iovec(dreq, iter, pos); + requested = nfs_direct_write_schedule_iovec(dreq, iter, pos, + FLUSH_COND_STABLE); if (mapping->nrpages) { invalidate_inode_pages2_range(mapping, -- Gitee From bca150089a21cd0d1b209e248559b06513a3ae43 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 14 Jul 2022 21:58:22 +0800 Subject: [PATCH 272/674] x86/Kconfig: Do not allow CONFIG_X86_X32_ABI=y with llvm-objcopy stable inclusion from stable-v5.10.111 commit 9cb90f9ad5975ddfc90d0906b40e9c71c2eca44e category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=9cb90f9ad5975ddfc90d0906b40e9c71c2eca44e -------------------------------- [ Upstream commit aaeed6ecc1253ce1463fa1aca0b70a4ccbc9fa75 ] There are two outstanding issues with CONFIG_X86_X32_ABI and llvm-objcopy, with similar root causes: 1. llvm-objcopy does not properly convert .note.gnu.property when going from x86_64 to x86_x32, resulting in a corrupted section when linking: https://github.com/ClangBuiltLinux/linux/issues/1141 2. llvm-objcopy produces corrupted compressed debug sections when going from x86_64 to x86_x32, also resulting in an error when linking: https://github.com/ClangBuiltLinux/linux/issues/514 After commit 41c5ef31ad71 ("x86/ibt: Base IBT bits"), the .note.gnu.property section is always generated when CONFIG_X86_KERNEL_IBT is enabled, which causes the first issue to become visible with an allmodconfig build: ld.lld: error: arch/x86/entry/vdso/vclock_gettime-x32.o:(.note.gnu.property+0x1c): program property is too short To avoid this error, do not allow CONFIG_X86_X32_ABI to be selected when using llvm-objcopy. If the two issues ever get fixed in llvm-objcopy, this can be turned into a feature check. Signed-off-by: Nathan Chancellor Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220314194842.3452-3-nathan@kernel.org Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- arch/x86/Kconfig | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 040fb77366dd..ec51ccea60c6 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2908,6 +2908,11 @@ config IA32_AOUT config X86_X32 bool "x32 ABI for 64-bit mode" depends on X86_64 + # llvm-objcopy does not convert x86_64 .note.gnu.property or + # compressed debug sections to x86_x32 properly: + # https://github.com/ClangBuiltLinux/linux/issues/514 + # https://github.com/ClangBuiltLinux/linux/issues/1141 + depends on $(success,$(OBJCOPY) --version | head -n1 | grep -qv llvm) help Include code to run binaries for the x32 native 32-bit ABI for 64-bit processors. An x32 process gets access to the -- Gitee From 27f1c36b60209c871375f5cd17d10473c8091d2c Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 14 Jul 2022 21:58:23 +0800 Subject: [PATCH 273/674] serial: samsung_tty: do not unlock port->lock for uart_write_wakeup() stable inclusion from stable-v5.10.111 commit 3309b322171180828564c386443eb5a62eabc862 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=3309b322171180828564c386443eb5a62eabc862 -------------------------------- [ Upstream commit 988c7c00691008ea1daaa1235680a0da49dab4e8 ] The commit c15c3747ee32 (serial: samsung: fix potential soft lockup during uart write) added an unlock of port->lock before uart_write_wakeup() and a lock after it. It was always problematic to write data from tty_ldisc_ops::write_wakeup and it was even documented that way. We fixed the line disciplines to conform to this recently. So if there is still a missed one, we should fix them instead of this workaround. On the top of that, s3c24xx_serial_tx_dma_complete() in this driver still holds the port->lock while calling uart_write_wakeup(). So revert the wrap added by the commit above. Cc: Thomas Abraham Cc: Kyungmin Park Cc: Hyeonkook Kim Signed-off-by: Jiri Slaby Link: https://lore.kernel.org/r/20220308115153.4225-1-jslaby@suse.cz Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/tty/serial/samsung_tty.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/tty/serial/samsung_tty.c b/drivers/tty/serial/samsung_tty.c index 8ae3e03fbd8c..81faead3c4f8 100644 --- a/drivers/tty/serial/samsung_tty.c +++ b/drivers/tty/serial/samsung_tty.c @@ -883,11 +883,8 @@ static irqreturn_t s3c24xx_serial_tx_chars(int irq, void *id) goto out; } - if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS) { - spin_unlock(&port->lock); + if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS) uart_write_wakeup(port); - spin_lock(&port->lock); - } if (uart_circ_empty(xmit)) s3c24xx_serial_stop_tx(port); -- Gitee From 62a0c4b1082844d07606cb6ccfeb90f24f2d17b3 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 14 Jul 2022 21:58:24 +0800 Subject: [PATCH 274/674] virtio_console: eliminate anonymous module_init & module_exit stable inclusion from stable-v5.10.111 commit c69b442125bf009fce26e15aa5616caf8a3673c3 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=c69b442125bf009fce26e15aa5616caf8a3673c3 -------------------------------- [ Upstream commit fefb8a2a941338d871e2d83fbd65fbfa068857bd ] Eliminate anonymous module_init() and module_exit(), which can lead to confusion or ambiguity when reading System.map, crashes/oops/bugs, or an initcall_debug log. Give each of these init and exit functions unique driver-specific names to eliminate the anonymous names. Example 1: (System.map) ffffffff832fc78c t init ffffffff832fc79e t init ffffffff832fc8f8 t init Example 2: (initcall_debug log) calling init+0x0/0x12 @ 1 initcall init+0x0/0x12 returned 0 after 15 usecs calling init+0x0/0x60 @ 1 initcall init+0x0/0x60 returned 0 after 2 usecs calling init+0x0/0x9a @ 1 initcall init+0x0/0x9a returned 0 after 74 usecs Signed-off-by: Randy Dunlap Reviewed-by: Amit Shah Cc: virtualization@lists.linux-foundation.org Cc: Arnd Bergmann Link: https://lore.kernel.org/r/20220316192010.19001-3-rdunlap@infradead.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/char/virtio_console.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index 3dd4deb60adb..6d361420ffe8 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -2239,7 +2239,7 @@ static struct virtio_driver virtio_rproc_serial = { .remove = virtcons_remove, }; -static int __init init(void) +static int __init virtio_console_init(void) { int err; @@ -2276,7 +2276,7 @@ static int __init init(void) return err; } -static void __exit fini(void) +static void __exit virtio_console_fini(void) { reclaim_dma_bufs(); @@ -2286,8 +2286,8 @@ static void __exit fini(void) class_destroy(pdrvdata.class); debugfs_remove_recursive(pdrvdata.debugfs_dir); } -module_init(init); -module_exit(fini); +module_init(virtio_console_init); +module_exit(virtio_console_fini); MODULE_DESCRIPTION("Virtio console driver"); MODULE_LICENSE("GPL"); -- Gitee From a44452c4120ede42648c992e7b649b94c345a1db Mon Sep 17 00:00:00 2001 From: Haimin Zhang Date: Thu, 14 Jul 2022 21:58:25 +0800 Subject: [PATCH 275/674] jfs: prevent NULL deref in diFree stable inclusion from stable-v5.10.111 commit b9c5ac0a15f24d63b20f899072fa6dd8c93af136 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=b9c5ac0a15f24d63b20f899072fa6dd8c93af136 -------------------------------- [ Upstream commit a53046291020ec41e09181396c1e829287b48d47 ] Add validation check for JFS_IP(ipimap)->i_imap to prevent a NULL deref in diFree since diFree uses it without do any validations. When function jfs_mount calls diMount to initialize fileset inode allocation map, it can fail and JFS_IP(ipimap)->i_imap won't be initialized. Then it calls diFreeSpecial to close fileset inode allocation map inode and it will flow into jfs_evict_inode. Function jfs_evict_inode just validates JFS_SBI(inode->i_sb)->ipimap, then calls diFree. diFree use JFS_IP(ipimap)->i_imap directly, then it will cause a NULL deref. Reported-by: TCS Robot Signed-off-by: Haimin Zhang Signed-off-by: Dave Kleikamp Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- fs/jfs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index b0eb9c85eea0..980aa3300f10 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -146,12 +146,13 @@ void jfs_evict_inode(struct inode *inode) dquot_initialize(inode); if (JFS_IP(inode)->fileset == FILESYSTEM_I) { + struct inode *ipimap = JFS_SBI(inode->i_sb)->ipimap; truncate_inode_pages_final(&inode->i_data); if (test_cflag(COMMIT_Freewmap, inode)) jfs_free_zero_link(inode); - if (JFS_SBI(inode->i_sb)->ipimap) + if (ipimap && JFS_IP(ipimap)->i_imap) diFree(inode); /* -- Gitee From 625a6a2735e87773bdca55da632e2bcf686169f6 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 14 Jul 2022 21:58:26 +0800 Subject: [PATCH 276/674] SUNRPC: Fix socket waits for write buffer space stable inclusion from stable-v5.10.111 commit 7a506fabcfe1a58bcdf6dd18fa3fca42635235f0 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=7a506fabcfe1a58bcdf6dd18fa3fca42635235f0 -------------------------------- [ Upstream commit 7496b59f588dd52886fdbac7633608097543a0a5 ] The socket layer requires that we use the socket lock to protect changes to the sock->sk_write_pending field and others. Reported-by: Chuck Lever Signed-off-by: Trond Myklebust Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- net/sunrpc/xprtsock.c | 54 +++++++++++++++++++++++++++++++------------ 1 file changed, 39 insertions(+), 15 deletions(-) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index b7c262f6d555..f57ccf5ae0f2 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -754,12 +754,12 @@ xs_stream_start_connect(struct sock_xprt *transport) /** * xs_nospace - handle transmit was incomplete * @req: pointer to RPC request + * @transport: pointer to struct sock_xprt * */ -static int xs_nospace(struct rpc_rqst *req) +static int xs_nospace(struct rpc_rqst *req, struct sock_xprt *transport) { - struct rpc_xprt *xprt = req->rq_xprt; - struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); + struct rpc_xprt *xprt = &transport->xprt; struct sock *sk = transport->inet; int ret = -EAGAIN; @@ -770,25 +770,49 @@ static int xs_nospace(struct rpc_rqst *req) /* Don't race with disconnect */ if (xprt_connected(xprt)) { + struct socket_wq *wq; + + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + set_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags); + rcu_read_unlock(); + /* wait for more buffer space */ + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); sk->sk_write_pending++; xprt_wait_for_buffer_space(xprt); } else ret = -ENOTCONN; spin_unlock(&xprt->transport_lock); + return ret; +} - /* Race breaker in case memory is freed before above code is called */ - if (ret == -EAGAIN) { - struct socket_wq *wq; +static int xs_sock_nospace(struct rpc_rqst *req) +{ + struct sock_xprt *transport = + container_of(req->rq_xprt, struct sock_xprt, xprt); + struct sock *sk = transport->inet; + int ret = -EAGAIN; - rcu_read_lock(); - wq = rcu_dereference(sk->sk_wq); - set_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags); - rcu_read_unlock(); + lock_sock(sk); + if (!sock_writeable(sk)) + ret = xs_nospace(req, transport); + release_sock(sk); + return ret; +} - sk->sk_write_space(sk); - } +static int xs_stream_nospace(struct rpc_rqst *req) +{ + struct sock_xprt *transport = + container_of(req->rq_xprt, struct sock_xprt, xprt); + struct sock *sk = transport->inet; + int ret = -EAGAIN; + + lock_sock(sk); + if (!sk_stream_memory_free(sk)) + ret = xs_nospace(req, transport); + release_sock(sk); return ret; } @@ -878,7 +902,7 @@ static int xs_local_send_request(struct rpc_rqst *req) case -ENOBUFS: break; case -EAGAIN: - status = xs_nospace(req); + status = xs_stream_nospace(req); break; default: dprintk("RPC: sendmsg returned unrecognized error %d\n", @@ -954,7 +978,7 @@ static int xs_udp_send_request(struct rpc_rqst *req) /* Should we call xs_close() here? */ break; case -EAGAIN: - status = xs_nospace(req); + status = xs_sock_nospace(req); break; case -ENETUNREACH: case -ENOBUFS: @@ -1069,7 +1093,7 @@ static int xs_tcp_send_request(struct rpc_rqst *req) /* Should we call xs_close() here? */ break; case -EAGAIN: - status = xs_nospace(req); + status = xs_stream_nospace(req); break; case -ECONNRESET: case -ECONNREFUSED: -- Gitee From 2549ae58e4f66de9341813d6c98bb5c0966f80f8 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 14 Jul 2022 21:58:27 +0800 Subject: [PATCH 277/674] NFS: nfsiod should not block forever in mempool_alloc() stable inclusion from stable-v5.10.111 commit 34681aeddcfcc0c47d45154f5d39310def87c55f category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=34681aeddcfcc0c47d45154f5d39310def87c55f -------------------------------- [ Upstream commit 515dcdcd48736576c6f5c197814da6f81c60a21e ] The concern is that since nfsiod is sometimes required to kick off a commit, it can get locked up waiting forever in mempool_alloc() instead of failing gracefully and leaving the commit until later. Try to allocate from the slab first, with GFP_KERNEL | __GFP_NORETRY, then fall back to a non-blocking attempt to allocate from the memory pool. Signed-off-by: Trond Myklebust Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- fs/nfs/internal.h | 7 +++++++ fs/nfs/pnfs_nfs.c | 8 ++++++-- fs/nfs/write.c | 24 +++++++++--------------- include/linux/nfs_fs.h | 2 +- 4 files changed, 23 insertions(+), 18 deletions(-) diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 4c7666cd20e1..7009a8dddd45 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -588,6 +588,13 @@ nfs_write_match_verf(const struct nfs_writeverf *verf, !nfs_write_verifier_cmp(&req->wb_verf, &verf->verifier); } +static inline gfp_t nfs_io_gfp_mask(void) +{ + if (current->flags & PF_WQ_WORKER) + return GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; + return GFP_KERNEL; +} + /* unlink.c */ extern struct rpc_task * nfs_async_rename(struct inode *old_dir, struct inode *new_dir, diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 7b9d701bef01..a2ad8bb87e2d 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -419,7 +419,7 @@ static struct nfs_commit_data * pnfs_bucket_fetch_commitdata(struct pnfs_commit_bucket *bucket, struct nfs_commit_info *cinfo) { - struct nfs_commit_data *data = nfs_commitdata_alloc(false); + struct nfs_commit_data *data = nfs_commitdata_alloc(); if (!data) return NULL; @@ -515,7 +515,11 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, unsigned int nreq = 0; if (!list_empty(mds_pages)) { - data = nfs_commitdata_alloc(true); + data = nfs_commitdata_alloc(); + if (!data) { + nfs_retry_commit(mds_pages, NULL, cinfo, -1); + return -ENOMEM; + } data->ds_commit_index = -1; list_splice_init(mds_pages, &data->pages); list_add_tail(&data->list, &list); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index cc926e69ee9b..a97eaf4e813c 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -70,27 +70,17 @@ static mempool_t *nfs_wdata_mempool; static struct kmem_cache *nfs_cdata_cachep; static mempool_t *nfs_commit_mempool; -struct nfs_commit_data *nfs_commitdata_alloc(bool never_fail) +struct nfs_commit_data *nfs_commitdata_alloc(void) { struct nfs_commit_data *p; - if (never_fail) - p = mempool_alloc(nfs_commit_mempool, GFP_NOIO); - else { - /* It is OK to do some reclaim, not no safe to wait - * for anything to be returned to the pool. - * mempool_alloc() cannot handle that particular combination, - * so we need two separate attempts. - */ + p = kmem_cache_zalloc(nfs_cdata_cachep, nfs_io_gfp_mask()); + if (!p) { p = mempool_alloc(nfs_commit_mempool, GFP_NOWAIT); - if (!p) - p = kmem_cache_alloc(nfs_cdata_cachep, GFP_NOIO | - __GFP_NOWARN | __GFP_NORETRY); if (!p) return NULL; + memset(p, 0, sizeof(*p)); } - - memset(p, 0, sizeof(*p)); INIT_LIST_HEAD(&p->pages); return p; } @@ -1800,7 +1790,11 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how, if (list_empty(head)) return 0; - data = nfs_commitdata_alloc(true); + data = nfs_commitdata_alloc(); + if (!data) { + nfs_retry_commit(head, NULL, cinfo, -1); + return -ENOMEM; + } /* Set up the argument struct */ nfs_init_commit(data, head, NULL, cinfo); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 2a17e0dfd431..e39342945a80 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -551,7 +551,7 @@ extern int nfs_wb_all(struct inode *inode); extern int nfs_wb_page(struct inode *inode, struct page *page); extern int nfs_wb_page_cancel(struct inode *inode, struct page* page); extern int nfs_commit_inode(struct inode *, int); -extern struct nfs_commit_data *nfs_commitdata_alloc(bool never_fail); +extern struct nfs_commit_data *nfs_commitdata_alloc(void); extern void nfs_commit_free(struct nfs_commit_data *data); bool nfs_commit_end(struct nfs_mds_commit_info *cinfo); -- Gitee From 0824355fd43f07da43bb4af8ddbfb94e8dd61d88 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 14 Jul 2022 21:58:28 +0800 Subject: [PATCH 278/674] NFS: Avoid writeback threads getting stuck in mempool_alloc() stable inclusion from stable-v5.10.111 commit c74e2f6ecc51bd08bb5b0335477dba954a50592e category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=c74e2f6ecc51bd08bb5b0335477dba954a50592e -------------------------------- [ Upstream commit 0bae835b63c53f86cdc524f5962e39409585b22c ] In a low memory situation, allow the NFS writeback code to fail without getting stuck in infinite loops in mempool_alloc(). Signed-off-by: Trond Myklebust Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- fs/nfs/pagelist.c | 10 +++++----- fs/nfs/write.c | 10 ++++++++-- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 98b9c1ed366e..17fef6eb490c 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -90,10 +90,10 @@ void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos) } } -static inline struct nfs_page * -nfs_page_alloc(void) +static inline struct nfs_page *nfs_page_alloc(void) { - struct nfs_page *p = kmem_cache_zalloc(nfs_page_cachep, GFP_KERNEL); + struct nfs_page *p = + kmem_cache_zalloc(nfs_page_cachep, nfs_io_gfp_mask()); if (p) INIT_LIST_HEAD(&p->wb_list); return p; @@ -901,7 +901,7 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, struct nfs_commit_info cinfo; struct nfs_page_array *pg_array = &hdr->page_array; unsigned int pagecount, pageused; - gfp_t gfp_flags = GFP_KERNEL; + gfp_t gfp_flags = nfs_io_gfp_mask(); pagecount = nfs_page_array_len(mirror->pg_base, mirror->pg_count); pg_array->npages = pagecount; @@ -984,7 +984,7 @@ nfs_pageio_alloc_mirrors(struct nfs_pageio_descriptor *desc, desc->pg_mirrors_dynamic = NULL; if (mirror_count == 1) return desc->pg_mirrors_static; - ret = kmalloc_array(mirror_count, sizeof(*ret), GFP_KERNEL); + ret = kmalloc_array(mirror_count, sizeof(*ret), nfs_io_gfp_mask()); if (ret != NULL) { for (i = 0; i < mirror_count; i++) nfs_pageio_mirror_init(&ret[i], desc->pg_bsize); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index a97eaf4e813c..5d07799513a6 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -94,9 +94,15 @@ EXPORT_SYMBOL_GPL(nfs_commit_free); static struct nfs_pgio_header *nfs_writehdr_alloc(void) { - struct nfs_pgio_header *p = mempool_alloc(nfs_wdata_mempool, GFP_KERNEL); + struct nfs_pgio_header *p; - memset(p, 0, sizeof(*p)); + p = kmem_cache_zalloc(nfs_wdata_cachep, nfs_io_gfp_mask()); + if (!p) { + p = mempool_alloc(nfs_wdata_mempool, GFP_NOWAIT); + if (!p) + return NULL; + memset(p, 0, sizeof(*p)); + } p->rw_mode = FMODE_WRITE; return p; } -- Gitee From 2cfbd97835d65cfe05bf819b34e9c8c93c953f75 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Thu, 14 Jul 2022 21:58:29 +0800 Subject: [PATCH 279/674] parisc: Fix CPU affinity for Lasi, WAX and Dino chips stable inclusion from stable-v5.10.111 commit f7c35220305f273bddc0bdaf1e453b4ca280f145 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f7c35220305f273bddc0bdaf1e453b4ca280f145 -------------------------------- [ Upstream commit 939fc856676c266c3bc347c1c1661872a3725c0f ] Add the missing logic to allow Lasi, WAX and Dino to set the CPU affinity. This fixes IRQ migration to other CPUs when a CPU is shutdown which currently holds the IRQs for one of those chips. Signed-off-by: Helge Deller Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/parisc/dino.c | 41 +++++++++++++++++++++++++++++++++-------- drivers/parisc/gsc.c | 31 +++++++++++++++++++++++++++++++ drivers/parisc/gsc.h | 1 + drivers/parisc/lasi.c | 7 +++---- drivers/parisc/wax.c | 7 +++---- 5 files changed, 71 insertions(+), 16 deletions(-) diff --git a/drivers/parisc/dino.c b/drivers/parisc/dino.c index 952a92504df6..e33036281327 100644 --- a/drivers/parisc/dino.c +++ b/drivers/parisc/dino.c @@ -142,9 +142,8 @@ struct dino_device { struct pci_hba_data hba; /* 'C' inheritance - must be first */ spinlock_t dinosaur_pen; - unsigned long txn_addr; /* EIR addr to generate interrupt */ - u32 txn_data; /* EIR data assign to each dino */ u32 imr; /* IRQ's which are enabled */ + struct gsc_irq gsc_irq; int global_irq[DINO_LOCAL_IRQS]; /* map IMR bit to global irq */ #ifdef DINO_DEBUG unsigned int dino_irr0; /* save most recent IRQ line stat */ @@ -339,14 +338,43 @@ static void dino_unmask_irq(struct irq_data *d) if (tmp & DINO_MASK_IRQ(local_irq)) { DBG(KERN_WARNING "%s(): IRQ asserted! (ILR 0x%x)\n", __func__, tmp); - gsc_writel(dino_dev->txn_data, dino_dev->txn_addr); + gsc_writel(dino_dev->gsc_irq.txn_data, dino_dev->gsc_irq.txn_addr); } } +#ifdef CONFIG_SMP +static int dino_set_affinity_irq(struct irq_data *d, const struct cpumask *dest, + bool force) +{ + struct dino_device *dino_dev = irq_data_get_irq_chip_data(d); + struct cpumask tmask; + int cpu_irq; + u32 eim; + + if (!cpumask_and(&tmask, dest, cpu_online_mask)) + return -EINVAL; + + cpu_irq = cpu_check_affinity(d, &tmask); + if (cpu_irq < 0) + return cpu_irq; + + dino_dev->gsc_irq.txn_addr = txn_affinity_addr(d->irq, cpu_irq); + eim = ((u32) dino_dev->gsc_irq.txn_addr) | dino_dev->gsc_irq.txn_data; + __raw_writel(eim, dino_dev->hba.base_addr+DINO_IAR0); + + irq_data_update_effective_affinity(d, &tmask); + + return IRQ_SET_MASK_OK; +} +#endif + static struct irq_chip dino_interrupt_type = { .name = "GSC-PCI", .irq_unmask = dino_unmask_irq, .irq_mask = dino_mask_irq, +#ifdef CONFIG_SMP + .irq_set_affinity = dino_set_affinity_irq, +#endif }; @@ -806,7 +834,6 @@ static int __init dino_common_init(struct parisc_device *dev, { int status; u32 eim; - struct gsc_irq gsc_irq; struct resource *res; pcibios_register_hba(&dino_dev->hba); @@ -821,10 +848,8 @@ static int __init dino_common_init(struct parisc_device *dev, ** still only has 11 IRQ input lines - just map some of them ** to a different processor. */ - dev->irq = gsc_alloc_irq(&gsc_irq); - dino_dev->txn_addr = gsc_irq.txn_addr; - dino_dev->txn_data = gsc_irq.txn_data; - eim = ((u32) gsc_irq.txn_addr) | gsc_irq.txn_data; + dev->irq = gsc_alloc_irq(&dino_dev->gsc_irq); + eim = ((u32) dino_dev->gsc_irq.txn_addr) | dino_dev->gsc_irq.txn_data; /* ** Dino needs a PA "IRQ" to get a processor's attention. diff --git a/drivers/parisc/gsc.c b/drivers/parisc/gsc.c index ed9371acf37e..ec175ae99873 100644 --- a/drivers/parisc/gsc.c +++ b/drivers/parisc/gsc.c @@ -135,10 +135,41 @@ static void gsc_asic_unmask_irq(struct irq_data *d) */ } +#ifdef CONFIG_SMP +static int gsc_set_affinity_irq(struct irq_data *d, const struct cpumask *dest, + bool force) +{ + struct gsc_asic *gsc_dev = irq_data_get_irq_chip_data(d); + struct cpumask tmask; + int cpu_irq; + + if (!cpumask_and(&tmask, dest, cpu_online_mask)) + return -EINVAL; + + cpu_irq = cpu_check_affinity(d, &tmask); + if (cpu_irq < 0) + return cpu_irq; + + gsc_dev->gsc_irq.txn_addr = txn_affinity_addr(d->irq, cpu_irq); + gsc_dev->eim = ((u32) gsc_dev->gsc_irq.txn_addr) | gsc_dev->gsc_irq.txn_data; + + /* switch IRQ's for devices below LASI/WAX to other CPU */ + gsc_writel(gsc_dev->eim, gsc_dev->hpa + OFFSET_IAR); + + irq_data_update_effective_affinity(d, &tmask); + + return IRQ_SET_MASK_OK; +} +#endif + + static struct irq_chip gsc_asic_interrupt_type = { .name = "GSC-ASIC", .irq_unmask = gsc_asic_unmask_irq, .irq_mask = gsc_asic_mask_irq, +#ifdef CONFIG_SMP + .irq_set_affinity = gsc_set_affinity_irq, +#endif }; int gsc_assign_irq(struct irq_chip *type, void *data) diff --git a/drivers/parisc/gsc.h b/drivers/parisc/gsc.h index 86abad3fa215..73cbd0bb1975 100644 --- a/drivers/parisc/gsc.h +++ b/drivers/parisc/gsc.h @@ -31,6 +31,7 @@ struct gsc_asic { int version; int type; int eim; + struct gsc_irq gsc_irq; int global_irq[32]; }; diff --git a/drivers/parisc/lasi.c b/drivers/parisc/lasi.c index 4e4fd12c2112..6ef621adb63a 100644 --- a/drivers/parisc/lasi.c +++ b/drivers/parisc/lasi.c @@ -163,7 +163,6 @@ static int __init lasi_init_chip(struct parisc_device *dev) { extern void (*chassis_power_off)(void); struct gsc_asic *lasi; - struct gsc_irq gsc_irq; int ret; lasi = kzalloc(sizeof(*lasi), GFP_KERNEL); @@ -185,7 +184,7 @@ static int __init lasi_init_chip(struct parisc_device *dev) lasi_init_irq(lasi); /* the IRQ lasi should use */ - dev->irq = gsc_alloc_irq(&gsc_irq); + dev->irq = gsc_alloc_irq(&lasi->gsc_irq); if (dev->irq < 0) { printk(KERN_ERR "%s(): cannot get GSC irq\n", __func__); @@ -193,9 +192,9 @@ static int __init lasi_init_chip(struct parisc_device *dev) return -EBUSY; } - lasi->eim = ((u32) gsc_irq.txn_addr) | gsc_irq.txn_data; + lasi->eim = ((u32) lasi->gsc_irq.txn_addr) | lasi->gsc_irq.txn_data; - ret = request_irq(gsc_irq.irq, gsc_asic_intr, 0, "lasi", lasi); + ret = request_irq(lasi->gsc_irq.irq, gsc_asic_intr, 0, "lasi", lasi); if (ret < 0) { kfree(lasi); return ret; diff --git a/drivers/parisc/wax.c b/drivers/parisc/wax.c index 5b6df1516235..73a2b01f8d9c 100644 --- a/drivers/parisc/wax.c +++ b/drivers/parisc/wax.c @@ -68,7 +68,6 @@ static int __init wax_init_chip(struct parisc_device *dev) { struct gsc_asic *wax; struct parisc_device *parent; - struct gsc_irq gsc_irq; int ret; wax = kzalloc(sizeof(*wax), GFP_KERNEL); @@ -85,7 +84,7 @@ static int __init wax_init_chip(struct parisc_device *dev) wax_init_irq(wax); /* the IRQ wax should use */ - dev->irq = gsc_claim_irq(&gsc_irq, WAX_GSC_IRQ); + dev->irq = gsc_claim_irq(&wax->gsc_irq, WAX_GSC_IRQ); if (dev->irq < 0) { printk(KERN_ERR "%s(): cannot get GSC irq\n", __func__); @@ -93,9 +92,9 @@ static int __init wax_init_chip(struct parisc_device *dev) return -EBUSY; } - wax->eim = ((u32) gsc_irq.txn_addr) | gsc_irq.txn_data; + wax->eim = ((u32) wax->gsc_irq.txn_addr) | wax->gsc_irq.txn_data; - ret = request_irq(gsc_irq.irq, gsc_asic_intr, 0, "wax", wax); + ret = request_irq(wax->gsc_irq.irq, gsc_asic_intr, 0, "wax", wax); if (ret < 0) { kfree(wax); return ret; -- Gitee From a6f7b8d42aee21dba2a7ad39ace0ebfaf0d76bc9 Mon Sep 17 00:00:00 2001 From: John David Anglin Date: Thu, 14 Jul 2022 21:58:30 +0800 Subject: [PATCH 280/674] parisc: Fix patch code locking and flushing stable inclusion from stable-v5.10.111 commit 1753a49e266d8f0433e60d4b59f12c2b2497646a category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=1753a49e266d8f0433e60d4b59f12c2b2497646a -------------------------------- [ Upstream commit a9fe7fa7d874a536e0540469f314772c054a0323 ] This change fixes the following: 1) The flags variable is not initialized. Always use raw_spin_lock_irqsave and raw_spin_unlock_irqrestore to serialize patching. 2) flush_kernel_vmap_range is primarily intended for DMA flushes. Since __patch_text_multiple is often called with interrupts disabled, it is better to directly call flush_kernel_dcache_range_asm and flush_kernel_icache_range_asm. This avoids an extra call. 3) The final call to flush_icache_range is unnecessary. Signed-off-by: John David Anglin Signed-off-by: Helge Deller Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- arch/parisc/kernel/patch.c | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/arch/parisc/kernel/patch.c b/arch/parisc/kernel/patch.c index 80a0ab372802..e59574f65e64 100644 --- a/arch/parisc/kernel/patch.c +++ b/arch/parisc/kernel/patch.c @@ -40,10 +40,7 @@ static void __kprobes *patch_map(void *addr, int fixmap, unsigned long *flags, *need_unmap = 1; set_fixmap(fixmap, page_to_phys(page)); - if (flags) - raw_spin_lock_irqsave(&patch_lock, *flags); - else - __acquire(&patch_lock); + raw_spin_lock_irqsave(&patch_lock, *flags); return (void *) (__fix_to_virt(fixmap) + (uintaddr & ~PAGE_MASK)); } @@ -52,10 +49,7 @@ static void __kprobes patch_unmap(int fixmap, unsigned long *flags) { clear_fixmap(fixmap); - if (flags) - raw_spin_unlock_irqrestore(&patch_lock, *flags); - else - __release(&patch_lock); + raw_spin_unlock_irqrestore(&patch_lock, *flags); } void __kprobes __patch_text_multiple(void *addr, u32 *insn, unsigned int len) @@ -67,8 +61,9 @@ void __kprobes __patch_text_multiple(void *addr, u32 *insn, unsigned int len) int mapped; /* Make sure we don't have any aliases in cache */ - flush_kernel_vmap_range(addr, len); - flush_icache_range(start, end); + flush_kernel_dcache_range_asm(start, end); + flush_kernel_icache_range_asm(start, end); + flush_tlb_kernel_range(start, end); p = fixmap = patch_map(addr, FIX_TEXT_POKE0, &flags, &mapped); @@ -81,8 +76,10 @@ void __kprobes __patch_text_multiple(void *addr, u32 *insn, unsigned int len) * We're crossing a page boundary, so * need to remap */ - flush_kernel_vmap_range((void *)fixmap, - (p-fixmap) * sizeof(*p)); + flush_kernel_dcache_range_asm((unsigned long)fixmap, + (unsigned long)p); + flush_tlb_kernel_range((unsigned long)fixmap, + (unsigned long)p); if (mapped) patch_unmap(FIX_TEXT_POKE0, &flags); p = fixmap = patch_map(addr, FIX_TEXT_POKE0, &flags, @@ -90,10 +87,10 @@ void __kprobes __patch_text_multiple(void *addr, u32 *insn, unsigned int len) } } - flush_kernel_vmap_range((void *)fixmap, (p-fixmap) * sizeof(*p)); + flush_kernel_dcache_range_asm((unsigned long)fixmap, (unsigned long)p); + flush_tlb_kernel_range((unsigned long)fixmap, (unsigned long)p); if (mapped) patch_unmap(FIX_TEXT_POKE0, &flags); - flush_icache_range(start, end); } void __kprobes __patch_text(void *addr, u32 insn) -- Gitee From b873d75277e79605a2350a4460fafbd851e7482e Mon Sep 17 00:00:00 2001 From: Mauricio Faria de Oliveira Date: Thu, 14 Jul 2022 21:58:31 +0800 Subject: [PATCH 281/674] mm: fix race between MADV_FREE reclaim and blkdev direct IO read stable inclusion from stable-v5.10.111 commit 3c3fbfa6dddb022e91be18902c3785a82aa4867d category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=3c3fbfa6dddb022e91be18902c3785a82aa4867d -------------------------------- commit 6c8e2a256915a223f6289f651d6b926cd7135c9e upstream. Problem: Reviewed-by: Wei Li ======= Userspace might read the zero-page instead of actual data from a direct IO read on a block device if the buffers have been called madvise(MADV_FREE) on earlier (this is discussed below) due to a race between page reclaim on MADV_FREE and blkdev direct IO read. - Race condition: ============== During page reclaim, the MADV_FREE page check in try_to_unmap_one() checks if the page is not dirty, then discards its rmap PTE(s) (vs. remap back if the page is dirty). However, after try_to_unmap_one() returns to shrink_page_list(), it might keep the page _anyway_ if page_ref_freeze() fails (it expects exactly _one_ page reference, from the isolation for page reclaim). Well, blkdev_direct_IO() gets references for all pages, and on READ operations it only sets them dirty _later_. So, if MADV_FREE'd pages (i.e., not dirty) are used as buffers for direct IO read from block devices, and page reclaim happens during __blkdev_direct_IO[_simple]() exactly AFTER bio_iov_iter_get_pages() returns, but BEFORE the pages are set dirty, the situation happens. The direct IO read eventually completes. Now, when userspace reads the buffers, the PTE is no longer there and the page fault handler do_anonymous_page() services that with the zero-page, NOT the data! A synthetic reproducer is provided. - Page faults: =========== If page reclaim happens BEFORE bio_iov_iter_get_pages() the issue doesn't happen, because that faults-in all pages as writeable, so do_anonymous_page() sets up a new page/rmap/PTE, and that is used by direct IO. The userspace reads don't fault as the PTE is there (thus zero-page is not used/setup). But if page reclaim happens AFTER it / BEFORE setting pages dirty, the PTE is no longer there; the subsequent page faults can't help: The data-read from the block device probably won't generate faults due to DMA (no MMU) but even in the case it wouldn't use DMA, that happens on different virtual addresses (not user-mapped addresses) because `struct bio_vec` stores `struct page` to figure addresses out (which are different from user-mapped addresses) for the read. Thus userspace reads (to user-mapped addresses) still fault, then do_anonymous_page() gets another `struct page` that would address/ map to other memory than the `struct page` used by `struct bio_vec` for the read. (The original `struct page` is not available, since it wasn't freed, as page_ref_freeze() failed due to more page refs. And even if it were available, its data cannot be trusted anymore.) Solution: ======== One solution is to check for the expected page reference count in try_to_unmap_one(). There should be one reference from the isolation (that is also checked in shrink_page_list() with page_ref_freeze()) plus one or more references from page mapping(s) (put in discard: label). Further references mean that rmap/PTE cannot be unmapped/nuked. (Note: there might be more than one reference from mapping due to fork()/clone() without CLONE_VM, which use the same `struct page` for references, until the copy-on-write page gets copied.) So, additional page references (e.g., from direct IO read) now prevent the rmap/PTE from being unmapped/dropped; similarly to the page is not freed per shrink_page_list()/page_ref_freeze()). - Races and Barriers: ================== The new check in try_to_unmap_one() should be safe in races with bio_iov_iter_get_pages() in get_user_pages() fast and slow paths, as it's done under the PTE lock. The fast path doesn't take the lock, but it checks if the PTE has changed and if so, it drops the reference and leaves the page for the slow path (which does take that lock). The fast path requires synchronization w/ full memory barrier: it writes the page reference count first then it reads the PTE later, while try_to_unmap() writes PTE first then it reads page refcount. And a second barrier is needed, as the page dirty flag should not be read before the page reference count (as in __remove_mapping()). (This can be a load memory barrier only; no writes are involved.) Call stack/comments: - try_to_unmap_one() - page_vma_mapped_walk() - map_pte() # see pte_offset_map_lock(): pte_offset_map() spin_lock() - ptep_get_and_clear() # write PTE - smp_mb() # (new barrier) GUP fast path - page_ref_count() # (new check) read refcount - page_vma_mapped_walk_done() # see pte_unmap_unlock(): pte_unmap() spin_unlock() - bio_iov_iter_get_pages() - __bio_iov_iter_get_pages() - iov_iter_get_pages() - get_user_pages_fast() - internal_get_user_pages_fast() # fast path - lockless_pages_from_mm() - gup_{pgd,p4d,pud,pmd,pte}_range() ptep = pte_offset_map() # not _lock() pte = ptep_get_lockless(ptep) page = pte_page(pte) try_grab_compound_head(page) # inc refcount # (RMW/barrier # on success) if (pte_val(pte) != pte_val(*ptep)) # read PTE put_compound_head(page) # dec refcount # go slow path # slow path - __gup_longterm_unlocked() - get_user_pages_unlocked() - __get_user_pages_locked() - __get_user_pages() - follow_{page,p4d,pud,pmd}_mask() - follow_page_pte() ptep = pte_offset_map_lock() pte = *ptep page = vm_normal_page(pte) try_grab_page(page) # inc refcount pte_unmap_unlock() - Huge Pages: ========== Regarding transparent hugepages, that logic shouldn't change, as MADV_FREE (aka lazyfree) pages are PageAnon() && !PageSwapBacked() (madvise_free_pte_range() -> mark_page_lazyfree() -> lru_lazyfree_fn()) thus should reach shrink_page_list() -> split_huge_page_to_list() before try_to_unmap[_one](), so it deals with normal pages only. (And in case unlikely/TTU_SPLIT_HUGE_PMD/split_huge_pmd_address() happens, which should not or be rare, the page refcount should be greater than mapcount: the head page is referenced by tail pages. That also prevents checking the head `page` then incorrectly call page_remove_rmap(subpage) for a tail page, that isn't even in the shrink_page_list()'s page_list (an effect of split huge pmd/pmvw), as it might happen today in this unlikely scenario.) MADV_FREE'd buffers: =================== So, back to the "if MADV_FREE pages are used as buffers" note. The case is arguable, and subject to multiple interpretations. The madvise(2) manual page on the MADV_FREE advice value says: 1) 'After a successful MADV_FREE ... data will be lost when the kernel frees the pages.' 2) 'the free operation will be canceled if the caller writes into the page' / 'subsequent writes ... will succeed and then [the] kernel cannot free those dirtied pages' 3) 'If there is no subsequent write, the kernel can free the pages at any time.' Thoughts, questions, considerations... respectively: 1) Since the kernel didn't actually free the page (page_ref_freeze() failed), should the data not have been lost? (on userspace read.) 2) Should writes performed by the direct IO read be able to cancel the free operation? - Should the direct IO read be considered as 'the caller' too, as it's been requested by 'the caller'? - Should the bio technique to dirty pages on return to userspace (bio_check_pages_dirty() is called/used by __blkdev_direct_IO()) be considered in another/special way here? 3) Should an upcoming write from a previously requested direct IO read be considered as a subsequent write, so the kernel should not free the pages? (as it's known at the time of page reclaim.) And lastly: Technically, the last point would seem a reasonable consideration and balance, as the madvise(2) manual page apparently (and fairly) seem to assume that 'writes' are memory access from the userspace process (not explicitly considering writes from the kernel or its corner cases; again, fairly).. plus the kernel fix implementation for the corner case of the largely 'non-atomic write' encompassed by a direct IO read operation, is relatively simple; and it helps. Reproducer: ========== @ test.c (simplified, but works) #define _GNU_SOURCE #include #include #include #include int main() { int fd, i; char *buf; fd = open(DEV, O_RDONLY | O_DIRECT); buf = mmap(NULL, BUF_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); for (i = 0; i < BUF_SIZE; i += PAGE_SIZE) buf[i] = 1; // init to non-zero madvise(buf, BUF_SIZE, MADV_FREE); read(fd, buf, BUF_SIZE); for (i = 0; i < BUF_SIZE; i += PAGE_SIZE) printf("%p: 0x%x\n", &buf[i], buf[i]); return 0; } @ block/fops.c (formerly fs/block_dev.c) +#include ... ... __blkdev_direct_IO[_simple](...) { ... + if (!strcmp(current->comm, "good")) + shrink_all_memory(ULONG_MAX); + ret = bio_iov_iter_get_pages(...); + + if (!strcmp(current->comm, "bad")) + shrink_all_memory(ULONG_MAX); ... } @ shell # NUM_PAGES=4 # PAGE_SIZE=$(getconf PAGE_SIZE) # yes | dd of=test.img bs=${PAGE_SIZE} count=${NUM_PAGES} # DEV=$(losetup -f --show test.img) # gcc -DDEV=\"$DEV\" \ -DBUF_SIZE=$((PAGE_SIZE * NUM_PAGES)) \ -DPAGE_SIZE=${PAGE_SIZE} \ test.c -o test # od -tx1 $DEV 0000000 79 0a 79 0a 79 0a 79 0a 79 0a 79 0a 79 0a 79 0a * 0040000 # mv test good # ./good 0x7f7c10418000: 0x79 0x7f7c10419000: 0x79 0x7f7c1041a000: 0x79 0x7f7c1041b000: 0x79 # mv good bad # ./bad 0x7fa1b8050000: 0x0 0x7fa1b8051000: 0x0 0x7fa1b8052000: 0x0 0x7fa1b8053000: 0x0 Note: the issue is consistent on v5.17-rc3, but it's intermittent with the support of MADV_FREE on v4.5 (60%-70% error; needs swap). [wrap do_direct_IO() in do_blockdev_direct_IO() @ fs/direct-io.c]. - v5.17-rc3: # for i in {1..1000}; do ./good; done \ | cut -d: -f2 | sort | uniq -c 4000 0x79 # mv good bad # for i in {1..1000}; do ./bad; done \ | cut -d: -f2 | sort | uniq -c 4000 0x0 # free | grep Swap Swap: 0 0 0 - v4.5: # for i in {1..1000}; do ./good; done \ | cut -d: -f2 | sort | uniq -c 4000 0x79 # mv good bad # for i in {1..1000}; do ./bad; done \ | cut -d: -f2 | sort | uniq -c 2702 0x0 1298 0x79 # swapoff -av swapoff /swap # for i in {1..1000}; do ./bad; done \ | cut -d: -f2 | sort | uniq -c 4000 0x79 Ceph/TCMalloc: ============= For documentation purposes, the use case driving the analysis/fix is Ceph on Ubuntu 18.04, as the TCMalloc library there still uses MADV_FREE to release unused memory to the system from the mmap'ed page heap (might be committed back/used again; it's not munmap'ed.) - PageHeap::DecommitSpan() -> TCMalloc_SystemRelease() -> madvise() - PageHeap::CommitSpan() -> TCMalloc_SystemCommit() -> do nothing. Note: TCMalloc switched back to MADV_DONTNEED a few commits after the release in Ubuntu 18.04 (google-perftools/gperftools 2.5), so the issue just 'disappeared' on Ceph on later Ubuntu releases but is still present in the kernel, and can be hit by other use cases. The observed issue seems to be the old Ceph bug #22464 [1], where checksum mismatches are observed (and instrumentation with buffer dumps shows zero-pages read from mmap'ed/MADV_FREE'd page ranges). The issue in Ceph was reasonably deemed a kernel bug (comment #50) and mostly worked around with a retry mechanism, but other parts of Ceph could still hit that (rocksdb). Anyway, it's less likely to be hit again as TCMalloc switched out of MADV_FREE by default. (Some kernel versions/reports from the Ceph bug, and relation with the MADV_FREE introduction/changes; TCMalloc versions not checked.) - 4.4 good - 4.5 (madv_free: introduction) - 4.9 bad - 4.10 good? maybe a swapless system - 4.12 (madv_free: no longer free instantly on swapless systems) - 4.13 bad [1] https://tracker.ceph.com/issues/22464 Thanks: ====== Several people contributed to analysis/discussions/tests/reproducers in the first stages when drilling down on ceph/tcmalloc/linux kernel: - Dan Hill - Dan Streetman - Dongdong Tao - Gavin Guo - Gerald Yang - Heitor Alves de Siqueira - Ioanna Alifieraki - Jay Vosburgh - Matthew Ruffell - Ponnuvel Palaniyappan Reviews, suggestions, corrections, comments: - Minchan Kim - Yu Zhao - Huang, Ying - John Hubbard - Christoph Hellwig [mfo@canonical.com: v4] Link: https://lkml.kernel.org/r/20220209202659.183418-1-mfo@canonical.comLink: https://lkml.kernel.org/r/20220131230255.789059-1-mfo@canonical.com Fixes: 802a3a92ad7a ("mm: reclaim MADV_FREE pages") Signed-off-by: Mauricio Faria de Oliveira Reviewed-by: "Huang, Ying" Cc: Minchan Kim Cc: Yu Zhao Cc: Yang Shi Cc: Miaohe Lin Cc: Dan Hill Cc: Dan Streetman Cc: Dongdong Tao Cc: Gavin Guo Cc: Gerald Yang Cc: Heitor Alves de Siqueira Cc: Ioanna Alifieraki Cc: Jay Vosburgh Cc: Matthew Ruffell Cc: Ponnuvel Palaniyappan Cc: Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds [mfo: backport: replace folio/test_flag with page/flag equivalents; real Fixes: 854e9ed09ded ("mm: support madvise(MADV_FREE)") in v4.] Signed-off-by: Mauricio Faria de Oliveira Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai --- mm/rmap.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/mm/rmap.c b/mm/rmap.c index a780862cd226..0dc39cf94345 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1657,7 +1657,30 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, /* MADV_FREE page check */ if (!PageSwapBacked(page)) { - if (!PageDirty(page)) { + int ref_count, map_count; + + /* + * Synchronize with gup_pte_range(): + * - clear PTE; barrier; read refcount + * - inc refcount; barrier; read PTE + */ + smp_mb(); + + ref_count = page_ref_count(page); + map_count = page_mapcount(page); + + /* + * Order reads for page refcount and dirty flag + * (see comments in __remove_mapping()). + */ + smp_rmb(); + + /* + * The only page refs must be one from isolation + * plus the rmap(s) (dropped by discard:). + */ + if (ref_count == 1 + map_count && + !PageDirty(page)) { /* Invalidate as we cleared the pte */ mmu_notifier_invalidate_range(mm, address, address + PAGE_SIZE); -- Gitee From 91aae60ff67df941151687801cb887f9d4199f22 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Thu, 14 Jul 2022 21:58:32 +0800 Subject: [PATCH 282/674] Revert "hv: utils: add PTP_1588_CLOCK to Kconfig to fix build" stable inclusion from stable-v5.10.111 commit 84e5dfc05f37bbb7203af4ec6bd18a17d37abacf category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=84e5dfc05f37bbb7203af4ec6bd18a17d37abacf -------------------------------- This reverts commit c4dc584a2d4c8d74b054f09d67e0a076767bdee5. On Sat, Apr 09, 2022 at 09:07:51AM -0700, Randy Dunlap wrote: >According to https://bugzilla.kernel.org/show_bug.cgi?id=215823, >c4dc584a2d4c8d74b054f09d67e0a076767bdee5 ("hv: utils: add PTP_1588_CLOCK to Kconfig to fix build") >is a problem for 5.10 since CONFIG_PTP_1588_CLOCK_OPTIONAL does not exist in 5.10. >This prevents the hyper-V NIC timestamping from working, so please revert that commit. Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/hv/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig index 210e532ac277..79e5356a737a 100644 --- a/drivers/hv/Kconfig +++ b/drivers/hv/Kconfig @@ -17,7 +17,6 @@ config HYPERV_TIMER config HYPERV_UTILS tristate "Microsoft Hyper-V Utilities driver" depends on HYPERV && CONNECTOR && NLS - depends on PTP_1588_CLOCK_OPTIONAL help Select this option to enable the Hyper-V Utilities. -- Gitee From bff89179ea68b4c16d011072859f787411bd7ed5 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 14 Jul 2022 21:58:33 +0800 Subject: [PATCH 283/674] drm/amdgpu: fix off by one in amdgpu_gfx_kiq_acquire() stable inclusion from stable-v5.10.111 commit 0c122eb3a109356fb2c39f49c1768321248d0f1d category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=0c122eb3a109356fb2c39f49c1768321248d0f1d -------------------------------- [ Upstream commit 1647b54ed55d4d48c7199d439f8834626576cbe9 ] This post-op should be a pre-op so that we do not pass -1 as the bit number to test_bit(). The current code will loop downwards from 63 to -1. After changing to a pre-op, it loops from 63 to 0. Fixes: 71c37505e7ea ("drm/amdgpu/gfx: move more common KIQ code to amdgpu_gfx.c") Signed-off-by: Dan Carpenter Signed-off-by: Alex Deucher Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index 9f9f55a2b257..f84582b70d0e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -263,7 +263,7 @@ static int amdgpu_gfx_kiq_acquire(struct amdgpu_device *adev, * adev->gfx.mec.num_pipe_per_mec * adev->gfx.mec.num_queue_per_pipe; - while (queue_bit-- >= 0) { + while (--queue_bit >= 0) { if (test_bit(queue_bit, adev->gfx.mec.queue_bitmap)) continue; -- Gitee From 909b329794f18d8b24c4945d7d1b35b374698da3 Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Thu, 14 Jul 2022 21:58:34 +0800 Subject: [PATCH 284/674] Drivers: hv: vmbus: Fix potential crash on module unload stable inclusion from stable-v5.10.111 commit cf580d2e3884dbafd6b90269b03a24d661578624 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=cf580d2e3884dbafd6b90269b03a24d661578624 -------------------------------- [ Upstream commit 792f232d57ff28bbd5f9c4abe0466b23d5879dc8 ] The vmbus driver relies on the panic notifier infrastructure to perform some operations when a panic event is detected. Since vmbus can be built as module, it is required that the driver handles both registering and unregistering such panic notifier callback. After commit 74347a99e73a ("x86/Hyper-V: Unload vmbus channel in hv panic callback") though, the panic notifier registration is done unconditionally in the module initialization routine whereas the unregistering procedure is conditionally guarded and executes only if HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE capability is set. This patch fixes that by unconditionally unregistering the panic notifier in the module's exit routine as well. Fixes: 74347a99e73a ("x86/Hyper-V: Unload vmbus channel in hv panic callback") Signed-off-by: Guilherme G. Piccoli Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20220315203535.682306-1-gpiccoli@igalia.com Signed-off-by: Wei Liu Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/hv/vmbus_drv.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 362da2a83b47..b9ac357e465d 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -2673,10 +2673,15 @@ static void __exit vmbus_exit(void) if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) { kmsg_dump_unregister(&hv_kmsg_dumper); unregister_die_notifier(&hyperv_die_block); - atomic_notifier_chain_unregister(&panic_notifier_list, - &hyperv_panic_block); } + /* + * The panic notifier is always registered, hence we should + * also unconditionally unregister it here as well. + */ + atomic_notifier_chain_unregister(&panic_notifier_list, + &hyperv_panic_block); + free_page((unsigned long)hv_panic_page); unregister_sysctl_table(hv_ctl_table_hdr); hv_ctl_table_hdr = NULL; -- Gitee From c10e11a7a9f2b24764288978c863bf15fb4869a7 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Thu, 14 Jul 2022 21:58:35 +0800 Subject: [PATCH 285/674] scsi: zorro7xx: Fix a resource leak in zorro7xx_remove_one() stable inclusion from stable-v5.10.111 commit c5f77b595379b5191316edd365a542f8b1526066 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=c5f77b595379b5191316edd365a542f8b1526066 -------------------------------- [ Upstream commit 16ed828b872d12ccba8f07bcc446ae89ba662f9c ] The error handling path of the probe releases a resource that is not freed in the remove function. In some cases, a ioremap() must be undone. Add the missing iounmap() call in the remove function. Link: https://lore.kernel.org/r/247066a3104d25f9a05de8b3270fc3c848763bcc.1647673264.git.christophe.jaillet@wanadoo.fr Fixes: 45804fbb00ee ("[SCSI] 53c700: Amiga Zorro NCR53c710 SCSI") Reviewed-by: Geert Uytterhoeven Signed-off-by: Christophe JAILLET Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/scsi/zorro7xx.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/scsi/zorro7xx.c b/drivers/scsi/zorro7xx.c index 27b9e2baab1a..7acf9193a9e8 100644 --- a/drivers/scsi/zorro7xx.c +++ b/drivers/scsi/zorro7xx.c @@ -159,6 +159,8 @@ static void zorro7xx_remove_one(struct zorro_dev *z) scsi_remove_host(host); NCR_700_release(host); + if (host->base > 0x01000000) + iounmap(hostdata->base); kfree(hostdata); free_irq(host->irq, host); zorro_release_device(z); -- Gitee From 4052593d0d15a0ec3fe2b3fe381aebad8fd11299 Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Thu, 14 Jul 2022 21:58:37 +0800 Subject: [PATCH 286/674] ice: Clear default forwarding VSI during VSI release stable inclusion from stable-v5.10.111 commit 4be6ed03107b6e245f5ee0d9e273c868fda66154 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=4be6ed03107b6e245f5ee0d9e273c868fda66154 -------------------------------- [ Upstream commit bd8c624c0cd59de0032752ba3001c107bba97f7b ] VSI is set as default forwarding one when promisc mode is set for PF interface, when PF is switched to switchdev mode or when VF driver asks to enable allmulticast or promisc mode for the VF interface (when vf-true-promisc-support priv flag is off). The third case is buggy because in that case VSI associated with VF remains as default one after VF removal. Reproducer: 1. Create VF echo 1 > sys/class/net/ens7f0/device/sriov_numvfs 2. Enable allmulticast or promisc mode on VF ip link set ens7f0v0 allmulticast on ip link set ens7f0v0 promisc on 3. Delete VF echo 0 > sys/class/net/ens7f0/device/sriov_numvfs 4. Try to enable promisc mode on PF ip link set ens7f0 promisc on Although it looks that promisc mode on PF is enabled the opposite is true because ice_vsi_sync_fltr() responsible for IFF_PROMISC handling first checks if any other VSI is set as default forwarding one and if so the function does not do anything. At this point it is not possible to enable promisc mode on PF without re-probe device. To resolve the issue this patch clear default forwarding VSI during ice_vsi_release() when the VSI to be released is the default one. Fixes: 01b5e89aab49 ("ice: Add VF promiscuous support") Signed-off-by: Ivan Vecera Reviewed-by: Michal Swiatkowski Reviewed-by: Maciej Fijalkowski Signed-off-by: Alice Michael Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/net/ethernet/intel/ice/ice_lib.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c index 52ac6cc08e83..ec475353b620 100644 --- a/drivers/net/ethernet/intel/ice/ice_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_lib.c @@ -2667,6 +2667,8 @@ int ice_vsi_release(struct ice_vsi *vsi) } } + if (ice_is_vsi_dflt_vsi(pf->first_sw, vsi)) + ice_clear_dflt_vsi(pf->first_sw); ice_fltr_remove_all(vsi); ice_rm_vsi_lan_cfg(vsi->port_info, vsi->idx); ice_vsi_delete(vsi); -- Gitee From 7aa1bfa2c655fbc9ecb88937a72def428e2a298a Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Thu, 14 Jul 2022 21:58:38 +0800 Subject: [PATCH 287/674] net: ipv4: fix route with nexthop object delete warning stable inclusion from stable-v5.10.111 commit 63ea57478aaa3e06a597081a0f537318fc04e49f category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=63ea57478aaa3e06a597081a0f537318fc04e49f -------------------------------- [ Upstream commit 6bf92d70e690b7ff12b24f4bfff5e5434d019b82 ] FRR folks have hit a kernel warning[1] while deleting routes[2] which is caused by trying to delete a route pointing to a nexthop id without specifying nhid but matching on an interface. That is, a route is found but we hit a warning while matching it. The warning is from fib_info_nh() in include/net/nexthop.h because we run it on a fib_info with nexthop object. The call chain is: inet_rtm_delroute -> fib_table_delete -> fib_nh_match (called with a nexthop fib_info and also with fc_oif set thus calling fib_info_nh on the fib_info and triggering the warning). The fix is to not do any matching in that branch if the fi has a nexthop object because those are managed separately. I.e. we should match when deleting without nh spec and should fail when deleting a nexthop route with old-style nh spec because nexthop objects are managed separately, e.g.: $ ip r show 1.2.3.4/32 1.2.3.4 nhid 12 via 192.168.11.2 dev dummy0 $ ip r del 1.2.3.4/32 $ ip r del 1.2.3.4/32 nhid 12 $ ip r del 1.2.3.4/32 dev dummy0 [1] [ 523.462226] ------------[ cut here ]------------ [ 523.462230] WARNING: CPU: 14 PID: 22893 at include/net/nexthop.h:468 fib_nh_match+0x210/0x460 [ 523.462236] Modules linked in: dummy rpcsec_gss_krb5 xt_socket nf_socket_ipv4 nf_socket_ipv6 ip6table_raw iptable_raw bpf_preload xt_statistic ip_set ip_vs_sh ip_vs_wrr ip_vs_rr ip_vs xt_mark nf_tables xt_nat veth nf_conntrack_netlink nfnetlink xt_addrtype br_netfilter overlay dm_crypt nfsv3 nfs fscache netfs vhost_net vhost vhost_iotlb tap tun xt_CHECKSUM xt_MASQUERADE xt_conntrack 8021q garp mrp ipt_REJECT nf_reject_ipv4 ip6table_mangle ip6table_nat iptable_mangle iptable_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 iptable_filter bridge stp llc rfcomm snd_seq_dummy snd_hrtimer rpcrdma rdma_cm iw_cm ib_cm ib_core ip6table_filter xt_comment ip6_tables vboxnetadp(OE) vboxnetflt(OE) vboxdrv(OE) qrtr bnep binfmt_misc xfs vfat fat squashfs loop nvidia_drm(POE) nvidia_modeset(POE) nvidia_uvm(POE) nvidia(POE) intel_rapl_msr intel_rapl_common snd_hda_codec_realtek snd_hda_codec_generic ledtrig_audio snd_hda_codec_hdmi btusb btrtl iwlmvm uvcvideo btbcm snd_hda_intel edac_mce_amd [ 523.462274] videobuf2_vmalloc videobuf2_memops btintel snd_intel_dspcfg videobuf2_v4l2 snd_intel_sdw_acpi bluetooth snd_usb_audio snd_hda_codec mac80211 snd_usbmidi_lib joydev snd_hda_core videobuf2_common kvm_amd snd_rawmidi snd_hwdep snd_seq videodev ccp snd_seq_device libarc4 ecdh_generic mc snd_pcm kvm iwlwifi snd_timer drm_kms_helper snd cfg80211 cec soundcore irqbypass rapl wmi_bmof i2c_piix4 rfkill k10temp pcspkr acpi_cpufreq nfsd auth_rpcgss nfs_acl lockd grace sunrpc drm zram ip_tables crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel nvme sp5100_tco r8169 nvme_core wmi ipmi_devintf ipmi_msghandler fuse [ 523.462300] CPU: 14 PID: 22893 Comm: ip Tainted: P OE 5.16.18-200.fc35.x86_64 #1 [ 523.462302] Hardware name: Micro-Star International Co., Ltd. MS-7C37/MPG X570 GAMING EDGE WIFI (MS-7C37), BIOS 1.C0 10/29/2020 [ 523.462303] RIP: 0010:fib_nh_match+0x210/0x460 [ 523.462304] Code: 7c 24 20 48 8b b5 90 00 00 00 e8 bb ee f4 ff 48 8b 7c 24 20 41 89 c4 e8 ee eb f4 ff 45 85 e4 0f 85 2e fe ff ff e9 4c ff ff ff <0f> 0b e9 17 ff ff ff 3c 0a 0f 85 61 fe ff ff 48 8b b5 98 00 00 00 [ 523.462306] RSP: 0018:ffffaa53d4d87928 EFLAGS: 00010286 [ 523.462307] RAX: 0000000000000000 RBX: ffffaa53d4d87a90 RCX: ffffaa53d4d87bb0 [ 523.462308] RDX: ffff9e3d2ee6be80 RSI: ffffaa53d4d87a90 RDI: ffffffff920ed380 [ 523.462309] RBP: ffff9e3d2ee6be80 R08: 0000000000000064 R09: 0000000000000000 [ 523.462310] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000031 [ 523.462310] R13: 0000000000000020 R14: 0000000000000000 R15: ffff9e3d331054e0 [ 523.462311] FS: 00007f245517c1c0(0000) GS:ffff9e492ed80000(0000) knlGS:0000000000000000 [ 523.462313] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 523.462313] CR2: 000055e5dfdd8268 CR3: 00000003ef488000 CR4: 0000000000350ee0 [ 523.462315] Call Trace: [ 523.462316] [ 523.462320] fib_table_delete+0x1a9/0x310 [ 523.462323] inet_rtm_delroute+0x93/0x110 [ 523.462325] rtnetlink_rcv_msg+0x133/0x370 [ 523.462327] ? _copy_to_iter+0xb5/0x6f0 [ 523.462330] ? rtnl_calcit.isra.0+0x110/0x110 [ 523.462331] netlink_rcv_skb+0x50/0xf0 [ 523.462334] netlink_unicast+0x211/0x330 [ 523.462336] netlink_sendmsg+0x23f/0x480 [ 523.462338] sock_sendmsg+0x5e/0x60 [ 523.462340] ____sys_sendmsg+0x22c/0x270 [ 523.462341] ? import_iovec+0x17/0x20 [ 523.462343] ? sendmsg_copy_msghdr+0x59/0x90 [ 523.462344] ? __mod_lruvec_page_state+0x85/0x110 [ 523.462348] ___sys_sendmsg+0x81/0xc0 [ 523.462350] ? netlink_seq_start+0x70/0x70 [ 523.462352] ? __dentry_kill+0x13a/0x180 [ 523.462354] ? __fput+0xff/0x250 [ 523.462356] __sys_sendmsg+0x49/0x80 [ 523.462358] do_syscall_64+0x3b/0x90 [ 523.462361] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 523.462364] RIP: 0033:0x7f24552aa337 [ 523.462365] Code: 0e 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b9 0f 1f 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 89 54 24 1c 48 89 74 24 10 [ 523.462366] RSP: 002b:00007fff7f05a838 EFLAGS: 00000246 ORIG_RAX: 000000000000002e [ 523.462368] RAX: ffffffffffffffda RBX: 000000006245bf91 RCX: 00007f24552aa337 [ 523.462368] RDX: 0000000000000000 RSI: 00007fff7f05a8a0 RDI: 0000000000000003 [ 523.462369] RBP: 0000000000000000 R08: 0000000000000001 R09: 0000000000000000 [ 523.462370] R10: 0000000000000008 R11: 0000000000000246 R12: 0000000000000001 [ 523.462370] R13: 00007fff7f05ce08 R14: 0000000000000000 R15: 000055e5dfdd1040 [ 523.462373] [ 523.462374] ---[ end trace ba537bc16f6bf4ed ]--- [2] https://github.com/FRRouting/frr/issues/6412 Fixes: 4c7e8084fd46 ("ipv4: Plumb support for nexthop object in a fib_info") Signed-off-by: Nikolay Aleksandrov Reviewed-by: David Ahern Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- net/ipv4/fib_semantics.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 838a876c168c..c8c7b76c3b2e 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -888,8 +888,13 @@ int fib_nh_match(struct net *net, struct fib_config *cfg, struct fib_info *fi, } if (cfg->fc_oif || cfg->fc_gw_family) { - struct fib_nh *nh = fib_info_nh(fi, 0); + struct fib_nh *nh; + + /* cannot match on nexthop object attributes */ + if (fi->nh) + return 1; + nh = fib_info_nh(fi, 0); if (cfg->fc_encap) { if (fib_encap_match(net, cfg->fc_encap_type, cfg->fc_encap, nh, cfg, extack)) -- Gitee From 1ecc903f99b69361d17f2cc8bcb6c1cd35fcddd7 Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Thu, 14 Jul 2022 21:58:39 +0800 Subject: [PATCH 288/674] net: stmmac: Fix unset max_speed difference between DT and non-DT platforms stable inclusion from stable-v5.10.111 commit 02ab4abe5bbf60f6d0e020a974e0c40e42984a1a category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=02ab4abe5bbf60f6d0e020a974e0c40e42984a1a -------------------------------- [ Upstream commit c21cabb0fd0b54b8b54235fc1ecfe1195a23bcb2 ] In commit 9cbadf094d9d ("net: stmmac: support max-speed device tree property"), when DT platforms don't set "max-speed", max_speed is set to -1; for non-DT platforms, it stays the default 0. Prior to commit eeef2f6b9f6e ("net: stmmac: Start adding phylink support"), the check for a valid max_speed setting was to check if it was greater than zero. This commit got it right, but subsequent patches just checked for non-zero, which is incorrect for DT platforms. In commit 92c3807b9ac3 ("net: stmmac: convert to phylink_get_linkmodes()") the conversion switched completely to checking for non-zero value as a valid value, which caused 1000base-T to stop getting advertised by default. Instead of trying to fix all the checks, simply leave max_speed alone if DT property parsing fails. Fixes: 9cbadf094d9d ("net: stmmac: support max-speed device tree property") Fixes: 92c3807b9ac3 ("net: stmmac: convert to phylink_get_linkmodes()") Signed-off-by: Chen-Yu Tsai Acked-by: Russell King (Oracle) Reviewed-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20220331184832.16316-1-wens@kernel.org Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c index 3183d8826981..b40b962055fa 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c @@ -432,8 +432,7 @@ stmmac_probe_config_dt(struct platform_device *pdev, const char **mac) plat->phylink_node = np; /* Get max speed of operation from device tree */ - if (of_property_read_u32(np, "max-speed", &plat->max_speed)) - plat->max_speed = -1; + of_property_read_u32(np, "max-speed", &plat->max_speed); plat->bus_id = of_alias_get_id(np, "ethernet"); if (plat->bus_id < 0) -- Gitee From ac294abfb19a7df6a86f4b6dd087e1d3cc9bd3cd Mon Sep 17 00:00:00 2001 From: Jiasheng Jiang Date: Thu, 14 Jul 2022 21:58:40 +0800 Subject: [PATCH 289/674] drm/imx: imx-ldb: Check for null pointer after calling kmemdup stable inclusion from stable-v5.10.111 commit 25bc9fd4c8d13036c0656a00ebedc2cff4650a91 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=25bc9fd4c8d13036c0656a00ebedc2cff4650a91 -------------------------------- [ Upstream commit 8027a9ad9b3568c5eb49c968ad6c97f279d76730 ] As the possible failure of the allocation, kmemdup() may return NULL pointer. Therefore, it should be better to check the return value of kmemdup() and return error if fails. Fixes: dc80d7038883 ("drm/imx-ldb: Add support to drm-bridge") Signed-off-by: Jiasheng Jiang Signed-off-by: Philipp Zabel Link: https://lore.kernel.org/r/20220105074729.2363657-1-jiasheng@iscas.ac.cn Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/gpu/drm/imx/imx-ldb.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/imx/imx-ldb.c b/drivers/gpu/drm/imx/imx-ldb.c index 75036aaa0c63..efd13e533726 100644 --- a/drivers/gpu/drm/imx/imx-ldb.c +++ b/drivers/gpu/drm/imx/imx-ldb.c @@ -553,6 +553,8 @@ static int imx_ldb_panel_ddc(struct device *dev, edidp = of_get_property(child, "edid", &edid_len); if (edidp) { channel->edid = kmemdup(edidp, edid_len, GFP_KERNEL); + if (!channel->edid) + return -ENOMEM; } else if (!channel->panel) { /* fallback to display-timings node */ ret = of_get_drm_display_mode(child, -- Gitee From 861e0d903aff3a0c3188dc0c96d70c289b00a0b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Exp=C3=B3sito?= Date: Thu, 14 Jul 2022 21:58:41 +0800 Subject: [PATCH 290/674] drm/imx: Fix memory leak in imx_pd_connector_get_modes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from stable-v5.10.111 commit f8b0ef0a5889890b50482b6593bd8de544736351 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f8b0ef0a5889890b50482b6593bd8de544736351 -------------------------------- [ Upstream commit bce81feb03a20fca7bbdd1c4af16b4e9d5c0e1d3 ] Avoid leaking the display mode variable if of_get_drm_display_mode fails. Fixes: 76ecd9c9fb24 ("drm/imx: parallel-display: check return code from of_get_drm_display_mode()") Addresses-Coverity-ID: 1443943 ("Resource leak") Signed-off-by: José Expósito Signed-off-by: Philipp Zabel Link: https://lore.kernel.org/r/20220108165230.44610-1-jose.exposito89@gmail.com Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/gpu/drm/imx/parallel-display.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/imx/parallel-display.c b/drivers/gpu/drm/imx/parallel-display.c index 605ac8825a59..b61bfa84b6bb 100644 --- a/drivers/gpu/drm/imx/parallel-display.c +++ b/drivers/gpu/drm/imx/parallel-display.c @@ -70,8 +70,10 @@ static int imx_pd_connector_get_modes(struct drm_connector *connector) ret = of_get_drm_display_mode(np, &imxpd->mode, &imxpd->bus_flags, OF_USE_NATIVE_MODE); - if (ret) + if (ret) { + drm_mode_destroy(connector->dev, mode); return ret; + } drm_mode_copy(mode, &imxpd->mode); mode->type |= DRM_MODE_TYPE_DRIVER | DRM_MODE_TYPE_PREFERRED, -- Gitee From d1a5b626a1bf101918afbaf9f70e9316144c8a4f Mon Sep 17 00:00:00 2001 From: Andy Gospodarek Date: Thu, 14 Jul 2022 21:58:42 +0800 Subject: [PATCH 291/674] bnxt_en: reserve space inside receive page for skb_shared_info stable inclusion from stable-v5.10.111 commit 0ac74169ebc34686a6ef2a033473a4913aa3e471 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=0ac74169ebc34686a6ef2a033473a4913aa3e471 -------------------------------- [ Upstream commit facc173cf700e55b2ad249ecbd3a7537f7315691 ] Insufficient space was being reserved in the page used for packet reception, so the interface MTU could be set too large to still have room for the contents of the packet when doing XDP redirect. This resulted in the following message when redirecting a packet between 3520 and 3822 bytes with an MTU of 3822: [311815.561880] XDP_WARN: xdp_update_frame_from_buff(line:200): Driver BUG: missing reserved tailroom Fixes: f18c2b77b2e4 ("bnxt_en: optimized XDP_REDIRECT support") Reviewed-by: Somnath Kotur Reviewed-by: Pavan Chebbi Signed-off-by: Andy Gospodarek Signed-off-by: Michael Chan Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 92f9f7f5240b..34affd1de91d 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -569,7 +569,8 @@ struct nqe_cn { #define BNXT_MAX_MTU 9500 #define BNXT_MAX_PAGE_MODE_MTU \ ((unsigned int)PAGE_SIZE - VLAN_ETH_HLEN - NET_IP_ALIGN - \ - XDP_PACKET_HEADROOM) + XDP_PACKET_HEADROOM - \ + SKB_DATA_ALIGN((unsigned int)sizeof(struct skb_shared_info))) #define BNXT_MIN_PKT_SIZE 52 -- Gitee From 4923c9da128bc0c53564b384ddfb181fbd92d301 Mon Sep 17 00:00:00 2001 From: Martin Habets Date: Thu, 14 Jul 2022 21:58:43 +0800 Subject: [PATCH 292/674] sfc: Do not free an empty page_ring stable inclusion from stable-v5.10.111 commit d8992b393f974b47c4a72b9bacdbfc4110e29eea category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=d8992b393f974b47c4a72b9bacdbfc4110e29eea -------------------------------- [ Upstream commit 458f5d92df4807e2a7c803ed928369129996bf96 ] When the page_ring is not used page_ptr_mask is 0. Do not dereference page_ring[0] in this case. Fixes: 2768935a4660 ("sfc: reuse pages to avoid DMA mapping/unmapping costs") Reported-by: Taehee Yoo Signed-off-by: Martin Habets Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/net/ethernet/sfc/rx_common.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/sfc/rx_common.c b/drivers/net/ethernet/sfc/rx_common.c index e423b17e2a14..2c09afac5beb 100644 --- a/drivers/net/ethernet/sfc/rx_common.c +++ b/drivers/net/ethernet/sfc/rx_common.c @@ -166,6 +166,9 @@ static void efx_fini_rx_recycle_ring(struct efx_rx_queue *rx_queue) struct efx_nic *efx = rx_queue->efx; int i; + if (unlikely(!rx_queue->page_ring)) + return; + /* Unmap and release the pages in the recycle ring. Remove the ring. */ for (i = 0; i <= rx_queue->page_ptr_mask; i++) { struct page *page = rx_queue->page_ring[i]; -- Gitee From 3889021d68a68371300974e5d708fd11640b4723 Mon Sep 17 00:00:00 2001 From: Aharon Landau Date: Thu, 14 Jul 2022 21:58:44 +0800 Subject: [PATCH 293/674] RDMA/mlx5: Don't remove cache MRs when a delay is needed stable inclusion from stable-v5.10.111 commit 3a57babfb6e9871b961e3c36e616af87df4ef1e8 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=3a57babfb6e9871b961e3c36e616af87df4ef1e8 -------------------------------- [ Upstream commit 84c2362fb65d69c721fec0974556378cbb36a62b ] Don't remove MRs from the cache if need to delay the removal. Fixes: b9358bdbc713 ("RDMA/mlx5: Fix locking in MR cache work queue") Link: https://lore.kernel.org/r/c3087a90ff362c8796c7eaa2715128743ce36722.1649062436.git.leonro@nvidia.com Signed-off-by: Aharon Landau Reviewed-by: Shay Drory Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/infiniband/hw/mlx5/mr.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 6cd0cbd4fc9f..d827a4e44c94 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -531,8 +531,10 @@ static void __cache_work_func(struct mlx5_cache_ent *ent) spin_lock_irq(&ent->lock); if (ent->disabled) goto out; - if (need_delay) + if (need_delay) { queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ); + goto out; + } remove_cache_mr_locked(ent); queue_adjust_cache_locked(ent); } -- Gitee From d087ba14ae40e0ffd311707621765bc33ca78a6b Mon Sep 17 00:00:00 2001 From: Niels Dossche Date: Thu, 14 Jul 2022 21:58:45 +0800 Subject: [PATCH 294/674] IB/rdmavt: add lock to call to rvt_error_qp to prevent a race condition stable inclusion from stable-v5.10.111 commit 43c2d7890ecabe527448a6c391fb2d9a5e6bbfe0 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=43c2d7890ecabe527448a6c391fb2d9a5e6bbfe0 -------------------------------- [ Upstream commit 4d809f69695d4e7d1378b3a072fa9aef23123018 ] The documentation of the function rvt_error_qp says both r_lock and s_lock need to be held when calling that function. It also asserts using lockdep that both of those locks are held. However, the commit I referenced in Fixes accidentally makes the call to rvt_error_qp in rvt_ruc_loopback no longer covered by r_lock. This results in the lockdep assertion failing and also possibly in a race condition. Fixes: d757c60eca9b ("IB/rdmavt: Fix concurrency panics in QP post_send and modify to error") Link: https://lore.kernel.org/r/20220228165330.41546-1-dossche.niels@gmail.com Signed-off-by: Niels Dossche Acked-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/infiniband/sw/rdmavt/qp.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 09f0dbf941c0..d8d52a00a1be 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -3241,7 +3241,11 @@ void rvt_ruc_loopback(struct rvt_qp *sqp) spin_lock_irqsave(&sqp->s_lock, flags); rvt_send_complete(sqp, wqe, send_status); if (sqp->ibqp.qp_type == IB_QPT_RC) { - int lastwqe = rvt_error_qp(sqp, IB_WC_WR_FLUSH_ERR); + int lastwqe; + + spin_lock(&sqp->r_lock); + lastwqe = rvt_error_qp(sqp, IB_WC_WR_FLUSH_ERR); + spin_unlock(&sqp->r_lock); sqp->s_flags &= ~RVT_S_BUSY; spin_unlock_irqrestore(&sqp->s_lock, flags); -- Gitee From a18d2070ed2ca064484e6153586083e57b6370d9 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Thu, 14 Jul 2022 21:58:46 +0800 Subject: [PATCH 295/674] dpaa2-ptp: Fix refcount leak in dpaa2_ptp_probe stable inclusion from stable-v5.10.111 commit ebd1e3458dbf6643aa0272da398cb5b537d492b7 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=ebd1e3458dbf6643aa0272da398cb5b537d492b7 -------------------------------- [ Upstream commit 2b04bd4f03bba021959ca339314f6739710f0954 ] This node pointer is returned by of_find_compatible_node() with refcount incremented. Calling of_node_put() to aovid the refcount leak. Fixes: d346c9e86d86 ("dpaa2-ptp: reuse ptp_qoriq driver") Signed-off-by: Miaoqian Lin Link: https://lore.kernel.org/r/20220404125336.13427-1-linmq006@gmail.com Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/net/ethernet/freescale/dpaa2/dpaa2-ptp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-ptp.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-ptp.c index 32b5faa87bb8..208a3459f2e2 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-ptp.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-ptp.c @@ -168,7 +168,7 @@ static int dpaa2_ptp_probe(struct fsl_mc_device *mc_dev) base = of_iomap(node, 0); if (!base) { err = -ENOMEM; - goto err_close; + goto err_put; } err = fsl_mc_allocate_irqs(mc_dev); @@ -212,6 +212,8 @@ static int dpaa2_ptp_probe(struct fsl_mc_device *mc_dev) fsl_mc_free_irqs(mc_dev); err_unmap: iounmap(base); +err_put: + of_node_put(node); err_close: dprtc_close(mc_dev->mc_io, 0, mc_dev->mc_handle); err_free_mcp: -- Gitee From c5bafbbd9e3c4d96de225b74ad46a2102633219a Mon Sep 17 00:00:00 2001 From: Anatolii Gerasymenko Date: Thu, 14 Jul 2022 21:58:47 +0800 Subject: [PATCH 296/674] ice: Set txq_teid to ICE_INVAL_TEID on ring creation stable inclusion from stable-v5.10.111 commit b003fc4913ea79c3c1458879acb474c66240489a category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=b003fc4913ea79c3c1458879acb474c66240489a -------------------------------- [ Upstream commit ccfee1822042b87e5135d33cad8ea353e64612d2 ] When VF is freshly created, but not brought up, ring->txq_teid value is by default set to 0. But 0 is a valid TEID. On some platforms the Root Node of Tx scheduler has a TEID = 0. This can cause issues as shown below. The proper way is to set ring->txq_teid to ICE_INVAL_TEID (0xFFFFFFFF). Testing Hints: echo 1 > /sys/class/net/ens785f0/device/sriov_numvfs ip link set dev ens785f0v0 up ip link set dev ens785f0v0 down If we have freshly created VF and quickly turn it on and off, so there would be no time to reach VIRTCHNL_OP_CONFIG_VSI_QUEUES stage, then VIRTCHNL_OP_DISABLE_QUEUES stage will fail with error: [ 639.531454] disable queue 89 failed 14 [ 639.532233] Failed to disable LAN Tx queues, error: ICE_ERR_AQ_ERROR [ 639.533107] ice 0000:02:00.0: Failed to stop Tx ring 0 on VSI 5 The reason for the fail is that we are trying to send AQ command to delete queue 89, which has never been created and receive an "invalid argument" error from firmware. As this queue has never been created, it's teid and ring->txq_teid have default value 0. ice_dis_vsi_txq has a check against non-existent queues: node = ice_sched_find_node_by_teid(pi->root, q_teids[i]); if (!node) continue; But on some platforms the Root Node of Tx scheduler has a teid = 0. Hence, ice_sched_find_node_by_teid finds a node with teid = 0 (it is pi->root), and we go further to submit an erroneous request to firmware. Fixes: 37bb83901286 ("ice: Move common functions out of ice_main.c part 7/7") Signed-off-by: Anatolii Gerasymenko Reviewed-by: Maciej Fijalkowski Tested-by: Konrad Jankowski Signed-off-by: Alice Michael Signed-off-by: Tony Nguyen Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/net/ethernet/intel/ice/ice_lib.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c index ec475353b620..ea8d868c8f30 100644 --- a/drivers/net/ethernet/intel/ice/ice_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_lib.c @@ -1265,6 +1265,7 @@ static int ice_vsi_alloc_rings(struct ice_vsi *vsi) ring->vsi = vsi; ring->dev = dev; ring->count = vsi->num_tx_desc; + ring->txq_teid = ICE_INVAL_TEID; WRITE_ONCE(vsi->tx_rings[i], ring); } -- Gitee From 9708464859b169e7fbc5ad7cb53c49d137ab4902 Mon Sep 17 00:00:00 2001 From: Anatolii Gerasymenko Date: Thu, 14 Jul 2022 21:58:48 +0800 Subject: [PATCH 297/674] ice: Do not skip not enabled queues in ice_vc_dis_qs_msg stable inclusion from stable-v5.10.111 commit ffce126c952e0b849cf280e0344bc087adc7256c category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=ffce126c952e0b849cf280e0344bc087adc7256c -------------------------------- [ Upstream commit 05ef6813b234db3196f083b91db3963f040b65bb ] Disable check for queue being enabled in ice_vc_dis_qs_msg, because there could be a case when queues were created, but were not enabled. We still need to delete those queues. Normal workflow for VF looks like: Enable path: VIRTCHNL_OP_ADD_ETH_ADDR (opcode 10) VIRTCHNL_OP_CONFIG_VSI_QUEUES (opcode 6) VIRTCHNL_OP_ENABLE_QUEUES (opcode 8) Disable path: VIRTCHNL_OP_DISABLE_QUEUES (opcode 9) VIRTCHNL_OP_DEL_ETH_ADDR (opcode 11) The issue appears only in stress conditions when VF is enabled and disabled very fast. Eventually there will be a case, when queues are created by VIRTCHNL_OP_CONFIG_VSI_QUEUES, but are not enabled by VIRTCHNL_OP_ENABLE_QUEUES. In turn, these queues are not deleted by VIRTCHNL_OP_DISABLE_QUEUES, because there is a check whether queues are enabled in ice_vc_dis_qs_msg. When we bring up the VF again, we will see the "Failed to set LAN Tx queue context" error during VIRTCHNL_OP_CONFIG_VSI_QUEUES step. This happens because old 16 queues were not deleted and VF requests to create 16 more, but ice_sched_get_free_qparent in ice_ena_vsi_txq would fail to find a parent node for first newly requested queue (because all nodes are allocated to 16 old queues). Testing Hints: Just enable and disable VF fast enough, so it would be disabled before reaching VIRTCHNL_OP_ENABLE_QUEUES. while true; do ip link set dev ens785f0v0 up sleep 0.065 # adjust delay value for you machine ip link set dev ens785f0v0 down done Fixes: 77ca27c41705 ("ice: add support for virtchnl_queue_select.[tx|rx]_queues bitmap") Signed-off-by: Anatolii Gerasymenko Tested-by: Konrad Jankowski Signed-off-by: Alice Michael Signed-off-by: Tony Nguyen Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c index 5134342ff70f..a980d337861d 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c @@ -2723,9 +2723,9 @@ static int ice_vc_dis_qs_msg(struct ice_vf *vf, u8 *msg) goto error_param; } - /* Skip queue if not enabled */ if (!test_bit(vf_q_id, vf->txq_ena)) - continue; + dev_dbg(ice_pf_to_dev(vsi->back), "Queue %u on VSI %u is not enabled, but stopping it anyway\n", + vf_q_id, vsi->vsi_num); ice_fill_txq_meta(vsi, ring, &txq_meta); -- Gitee From a78dd66e9269c807cb1e8f792587597928f4ba26 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 14 Jul 2022 21:58:49 +0800 Subject: [PATCH 298/674] ipv6: Fix stats accounting in ip6_pkt_drop stable inclusion from stable-v5.10.111 commit e3dd1202ab2ef81a69dbc20e18945b369dddea81 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=e3dd1202ab2ef81a69dbc20e18945b369dddea81 -------------------------------- [ Upstream commit 1158f79f82d437093aeed87d57df0548bdd68146 ] VRF devices are the loopbacks for VRFs, and a loopback can not be assigned to a VRF. Accordingly, the condition in ip6_pkt_drop should be '||' not '&&'. Fixes: 1d3fd8a10bed ("vrf: Use orig netdev to count Ip6InNoRoutes and a fresh route lookup when sending dest unreach") Reported-by: Pudak, Filip Reported-by: Xiao, Jiguang Signed-off-by: David Ahern Link: https://lore.kernel.org/r/20220404150908.2937-1-dsahern@kernel.org Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- net/ipv6/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index b298b40cf158..a44ad9637e8a 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -4393,7 +4393,7 @@ static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) struct inet6_dev *idev; int type; - if (netif_is_l3_master(skb->dev) && + if (netif_is_l3_master(skb->dev) || dst->dev == net->loopback_dev) idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif)); else -- Gitee From 3fa6cb4bfd2c2a3948be6fc587f7cd3bcdf4eadc Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Thu, 14 Jul 2022 21:58:50 +0800 Subject: [PATCH 299/674] ice: synchronize_rcu() when terminating rings stable inclusion from stable-v5.10.111 commit e54ea8fc51cae0232982f78c1b168d3b89a46ad2 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=e54ea8fc51cae0232982f78c1b168d3b89a46ad2 -------------------------------- [ Upstream commit f9124c68f05ffdb87a47e3ea6d5fae9dad7cb6eb ] Unfortunately, the ice driver doesn't respect the RCU critical section that XSK wakeup is surrounded with. To fix this, add synchronize_rcu() calls to paths that destroy resources that might be in use. This was addressed in other AF_XDP ZC enabled drivers, for reference see for example commit b3873a5be757 ("net/i40e: Fix concurrency issues between config flow and XSK") Fixes: efc2214b6047 ("ice: Add support for XDP") Fixes: 2d4238f55697 ("ice: Add support for AF_XDP") Signed-off-by: Maciej Fijalkowski Tested-by: Shwetha Nagaraju Signed-off-by: Tony Nguyen Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/net/ethernet/intel/ice/ice.h | 2 +- drivers/net/ethernet/intel/ice/ice_main.c | 4 +++- drivers/net/ethernet/intel/ice/ice_xsk.c | 4 +++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h index 6a57b41ddb54..7794703c1359 100644 --- a/drivers/net/ethernet/intel/ice/ice.h +++ b/drivers/net/ethernet/intel/ice/ice.h @@ -498,7 +498,7 @@ static inline struct ice_pf *ice_netdev_to_pf(struct net_device *netdev) static inline bool ice_is_xdp_ena_vsi(struct ice_vsi *vsi) { - return !!vsi->xdp_prog; + return !!READ_ONCE(vsi->xdp_prog); } static inline void ice_set_ring_xdp(struct ice_ring *ring) diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 20c9d55f3adc..eb0625b52e45 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -2475,8 +2475,10 @@ int ice_destroy_xdp_rings(struct ice_vsi *vsi) for (i = 0; i < vsi->num_xdp_txq; i++) if (vsi->xdp_rings[i]) { - if (vsi->xdp_rings[i]->desc) + if (vsi->xdp_rings[i]->desc) { + synchronize_rcu(); ice_free_tx_ring(vsi->xdp_rings[i]); + } kfree_rcu(vsi->xdp_rings[i], rcu); vsi->xdp_rings[i] = NULL; } diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index 9f36f8d7a985..5733526fa245 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -36,8 +36,10 @@ static void ice_qp_reset_stats(struct ice_vsi *vsi, u16 q_idx) static void ice_qp_clean_rings(struct ice_vsi *vsi, u16 q_idx) { ice_clean_tx_ring(vsi->tx_rings[q_idx]); - if (ice_is_xdp_ena_vsi(vsi)) + if (ice_is_xdp_ena_vsi(vsi)) { + synchronize_rcu(); ice_clean_tx_ring(vsi->xdp_rings[q_idx]); + } ice_clean_rx_ring(vsi->rx_rings[q_idx]); } -- Gitee From 25b6995796c162fbc8ddf7d510402f3eb9fe4158 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 14 Jul 2022 21:58:51 +0800 Subject: [PATCH 300/674] net: openvswitch: don't send internal clone attribute to the userspace. stable inclusion from stable-v5.10.111 commit 42ab401d22de3429f600a48acdd4c0ee5662783c category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=42ab401d22de3429f600a48acdd4c0ee5662783c -------------------------------- [ Upstream commit 3f2a3050b4a3e7f32fc0ea3c9b0183090ae00522 ] 'OVS_CLONE_ATTR_EXEC' is an internal attribute that is used for performance optimization inside the kernel. It's added by the kernel while parsing user-provided actions and should not be sent during the flow dump as it's not part of the uAPI. The issue doesn't cause any significant problems to the ovs-vswitchd process, because reported actions are not really used in the application lifecycle and only supposed to be shown to a human via ovs-dpctl flow dump. However, the action list is still incorrect and causes the following error if the user wants to look at the datapath flows: # ovs-dpctl add-dp system@ovs-system # ovs-dpctl add-flow "" "clone(ct(commit),0)" # ovs-dpctl dump-flows , packets:0, bytes:0, used:never, actions:clone(bad length 4, expected -1 for: action0(01 00 00 00), ct(commit),0) With the fix: # ovs-dpctl dump-flows , packets:0, bytes:0, used:never, actions:clone(ct(commit),0) Additionally fixed an incorrect attribute name in the comment. Fixes: b233504033db ("openvswitch: kernel datapath clone action") Signed-off-by: Ilya Maximets Acked-by: Aaron Conole Link: https://lore.kernel.org/r/20220404104150.2865736-1-i.maximets@ovn.org Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- net/openvswitch/actions.c | 2 +- net/openvswitch/flow_netlink.c | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 525c1540f10e..6d8d70021666 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -1044,7 +1044,7 @@ static int clone(struct datapath *dp, struct sk_buff *skb, int rem = nla_len(attr); bool dont_clone_flow_key; - /* The first action is always 'OVS_CLONE_ATTR_ARG'. */ + /* The first action is always 'OVS_CLONE_ATTR_EXEC'. */ clone_arg = nla_data(attr); dont_clone_flow_key = nla_get_u32(clone_arg); actions = nla_next(clone_arg, &rem); diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 8c4bdfa627ca..c41093540b2f 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -3419,7 +3419,9 @@ static int clone_action_to_attr(const struct nlattr *attr, if (!start) return -EMSGSIZE; - err = ovs_nla_put_actions(nla_data(attr), rem, skb); + /* Skipping the OVS_CLONE_ATTR_EXEC that is always the first attribute. */ + attr = nla_next(nla_data(attr), &rem); + err = ovs_nla_put_actions(attr, rem, skb); if (err) nla_nest_cancel(skb, start); -- Gitee From b6cea59998c5c91457499a93d5900efb635f1579 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 14 Jul 2022 21:58:52 +0800 Subject: [PATCH 301/674] net: openvswitch: fix leak of nested actions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from stable-v5.10.111 commit 5ae05b5eb58773cfec307ff88aff4cfd843c4cff category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=5ae05b5eb58773cfec307ff88aff4cfd843c4cff -------------------------------- [ Upstream commit 1f30fb9166d4f15a1aa19449b9da871fe0ed4796 ] While parsing user-provided actions, openvswitch module may dynamically allocate memory and store pointers in the internal copy of the actions. So this memory has to be freed while destroying the actions. Currently there are only two such actions: ct() and set(). However, there are many actions that can hold nested lists of actions and ovs_nla_free_flow_actions() just jumps over them leaking the memory. For example, removal of the flow with the following actions will lead to a leak of the memory allocated by nf_ct_tmpl_alloc(): actions:clone(ct(commit),0) Non-freed set() action may also leak the 'dst' structure for the tunnel info including device references. Under certain conditions with a high rate of flow rotation that may cause significant memory leak problem (2MB per second in reporter's case). The problem is also hard to mitigate, because the user doesn't have direct control over the datapath flows generated by OVS. Fix that by iterating over all the nested actions and freeing everything that needs to be freed recursively. New build time assertion should protect us from this problem if new actions will be added in the future. Unfortunately, openvswitch module doesn't use NLA_F_NESTED, so all attributes has to be explicitly checked. sample() and clone() actions are mixing extra attributes into the user-provided action list. That prevents some code generalization too. Fixes: 34ae932a4036 ("openvswitch: Make tunnel set action attach a metadata dst") Link: https://mail.openvswitch.org/pipermail/ovs-dev/2022-March/392922.html Reported-by: Stéphane Graber Signed-off-by: Ilya Maximets Acked-by: Aaron Conole Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- net/openvswitch/flow_netlink.c | 95 ++++++++++++++++++++++++++++++++-- 1 file changed, 90 insertions(+), 5 deletions(-) diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index c41093540b2f..98a7e6f64ab0 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -2288,6 +2288,62 @@ static struct sw_flow_actions *nla_alloc_flow_actions(int size) return sfa; } +static void ovs_nla_free_nested_actions(const struct nlattr *actions, int len); + +static void ovs_nla_free_check_pkt_len_action(const struct nlattr *action) +{ + const struct nlattr *a; + int rem; + + nla_for_each_nested(a, action, rem) { + switch (nla_type(a)) { + case OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL: + case OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER: + ovs_nla_free_nested_actions(nla_data(a), nla_len(a)); + break; + } + } +} + +static void ovs_nla_free_clone_action(const struct nlattr *action) +{ + const struct nlattr *a = nla_data(action); + int rem = nla_len(action); + + switch (nla_type(a)) { + case OVS_CLONE_ATTR_EXEC: + /* The real list of actions follows this attribute. */ + a = nla_next(a, &rem); + ovs_nla_free_nested_actions(a, rem); + break; + } +} + +static void ovs_nla_free_dec_ttl_action(const struct nlattr *action) +{ + const struct nlattr *a = nla_data(action); + + switch (nla_type(a)) { + case OVS_DEC_TTL_ATTR_ACTION: + ovs_nla_free_nested_actions(nla_data(a), nla_len(a)); + break; + } +} + +static void ovs_nla_free_sample_action(const struct nlattr *action) +{ + const struct nlattr *a = nla_data(action); + int rem = nla_len(action); + + switch (nla_type(a)) { + case OVS_SAMPLE_ATTR_ARG: + /* The real list of actions follows this attribute. */ + a = nla_next(a, &rem); + ovs_nla_free_nested_actions(a, rem); + break; + } +} + static void ovs_nla_free_set_action(const struct nlattr *a) { const struct nlattr *ovs_key = nla_data(a); @@ -2301,25 +2357,54 @@ static void ovs_nla_free_set_action(const struct nlattr *a) } } -void ovs_nla_free_flow_actions(struct sw_flow_actions *sf_acts) +static void ovs_nla_free_nested_actions(const struct nlattr *actions, int len) { const struct nlattr *a; int rem; - if (!sf_acts) + /* Whenever new actions are added, the need to update this + * function should be considered. + */ + BUILD_BUG_ON(OVS_ACTION_ATTR_MAX != 23); + + if (!actions) return; - nla_for_each_attr(a, sf_acts->actions, sf_acts->actions_len, rem) { + nla_for_each_attr(a, actions, len, rem) { switch (nla_type(a)) { - case OVS_ACTION_ATTR_SET: - ovs_nla_free_set_action(a); + case OVS_ACTION_ATTR_CHECK_PKT_LEN: + ovs_nla_free_check_pkt_len_action(a); + break; + + case OVS_ACTION_ATTR_CLONE: + ovs_nla_free_clone_action(a); break; + case OVS_ACTION_ATTR_CT: ovs_ct_free_action(a); break; + + case OVS_ACTION_ATTR_DEC_TTL: + ovs_nla_free_dec_ttl_action(a); + break; + + case OVS_ACTION_ATTR_SAMPLE: + ovs_nla_free_sample_action(a); + break; + + case OVS_ACTION_ATTR_SET: + ovs_nla_free_set_action(a); + break; } } +} + +void ovs_nla_free_flow_actions(struct sw_flow_actions *sf_acts) +{ + if (!sf_acts) + return; + ovs_nla_free_nested_actions(sf_acts->actions, sf_acts->actions_len); kfree(sf_acts); } -- Gitee From 16ee25f37dbf211c272cce1eed4ab51ba8356051 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 14 Jul 2022 21:58:53 +0800 Subject: [PATCH 302/674] rxrpc: fix a race in rxrpc_exit_net() stable inclusion from stable-v5.10.111 commit 08ff0e74fab517dbc44e11b8bc683dd4ecc65950 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=08ff0e74fab517dbc44e11b8bc683dd4ecc65950 -------------------------------- [ Upstream commit 1946014ca3b19be9e485e780e862c375c6f98bad ] Current code can lead to the following race: CPU0 CPU1 rxrpc_exit_net() rxrpc_peer_keepalive_worker() if (rxnet->live) rxnet->live = false; del_timer_sync(&rxnet->peer_keepalive_timer); timer_reduce(&rxnet->peer_keepalive_timer, jiffies + delay); cancel_work_sync(&rxnet->peer_keepalive_work); rxrpc_exit_net() exits while peer_keepalive_timer is still armed, leading to use-after-free. syzbot report was: ODEBUG: free active (active state 0) object type: timer_list hint: rxrpc_peer_keepalive_timeout+0x0/0xb0 WARNING: CPU: 0 PID: 3660 at lib/debugobjects.c:505 debug_print_object+0x16e/0x250 lib/debugobjects.c:505 Modules linked in: CPU: 0 PID: 3660 Comm: kworker/u4:6 Not tainted 5.17.0-syzkaller-13993-g88e6c0207623 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: netns cleanup_net RIP: 0010:debug_print_object+0x16e/0x250 lib/debugobjects.c:505 Code: ff df 48 89 fa 48 c1 ea 03 80 3c 02 00 0f 85 af 00 00 00 48 8b 14 dd 00 1c 26 8a 4c 89 ee 48 c7 c7 00 10 26 8a e8 b1 e7 28 05 <0f> 0b 83 05 15 eb c5 09 01 48 83 c4 18 5b 5d 41 5c 41 5d 41 5e c3 RSP: 0018:ffffc9000353fb00 EFLAGS: 00010082 RAX: 0000000000000000 RBX: 0000000000000003 RCX: 0000000000000000 RDX: ffff888029196140 RSI: ffffffff815efad8 RDI: fffff520006a7f52 RBP: 0000000000000001 R08: 0000000000000000 R09: 0000000000000000 R10: ffffffff815ea4ae R11: 0000000000000000 R12: ffffffff89ce23e0 R13: ffffffff8a2614e0 R14: ffffffff816628c0 R15: dffffc0000000000 FS: 0000000000000000(0000) GS:ffff8880b9c00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fe1f2908924 CR3: 0000000043720000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __debug_check_no_obj_freed lib/debugobjects.c:992 [inline] debug_check_no_obj_freed+0x301/0x420 lib/debugobjects.c:1023 kfree+0xd6/0x310 mm/slab.c:3809 ops_free_list.part.0+0x119/0x370 net/core/net_namespace.c:176 ops_free_list net/core/net_namespace.c:174 [inline] cleanup_net+0x591/0xb00 net/core/net_namespace.c:598 process_one_work+0x996/0x1610 kernel/workqueue.c:2289 worker_thread+0x665/0x1080 kernel/workqueue.c:2436 kthread+0x2e9/0x3a0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:298 Fixes: ace45bec6d77 ("rxrpc: Fix firewall route keepalive") Signed-off-by: Eric Dumazet Cc: David Howells Cc: Marc Dionne Cc: linux-afs@lists.infradead.org Reported-by: syzbot Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- net/rxrpc/net_ns.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c index 25bbc4cc8b13..f15d6942da45 100644 --- a/net/rxrpc/net_ns.c +++ b/net/rxrpc/net_ns.c @@ -113,8 +113,8 @@ static __net_exit void rxrpc_exit_net(struct net *net) struct rxrpc_net *rxnet = rxrpc_net(net); rxnet->live = false; - del_timer_sync(&rxnet->peer_keepalive_timer); cancel_work_sync(&rxnet->peer_keepalive_work); + del_timer_sync(&rxnet->peer_keepalive_timer); rxrpc_destroy_all_calls(rxnet); rxrpc_destroy_all_connections(rxnet); rxrpc_destroy_all_peers(rxnet); -- Gitee From 993871ac488687874fd489ce2e9ed85b4c3ab465 Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Thu, 14 Jul 2022 21:58:54 +0800 Subject: [PATCH 303/674] net: phy: mscc-miim: reject clause 45 register accesses stable inclusion from stable-v5.10.111 commit b7893388bb888cb15c7db0c0f71bd58a4fb5312c category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=b7893388bb888cb15c7db0c0f71bd58a4fb5312c -------------------------------- [ Upstream commit 8d90991e5bf7fdb9f264f5f579d18969913054b7 ] The driver doesn't support clause 45 register access yet, but doesn't check if the access is a c45 one either. This leads to spurious register reads and writes. Add the check. Fixes: 542671fe4d86 ("net: phy: mscc-miim: Add MDIO driver") Signed-off-by: Michael Walle Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/net/mdio/mdio-mscc-miim.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/mdio/mdio-mscc-miim.c b/drivers/net/mdio/mdio-mscc-miim.c index 11f583fd4611..1c9232fca1e2 100644 --- a/drivers/net/mdio/mdio-mscc-miim.c +++ b/drivers/net/mdio/mdio-mscc-miim.c @@ -76,6 +76,9 @@ static int mscc_miim_read(struct mii_bus *bus, int mii_id, int regnum) u32 val; int ret; + if (regnum & MII_ADDR_C45) + return -EOPNOTSUPP; + ret = mscc_miim_wait_pending(bus); if (ret) goto out; @@ -105,6 +108,9 @@ static int mscc_miim_write(struct mii_bus *bus, int mii_id, struct mscc_miim_dev *miim = bus->priv; int ret; + if (regnum & MII_ADDR_C45) + return -EOPNOTSUPP; + ret = mscc_miim_wait_pending(bus); if (ret < 0) goto out; -- Gitee From d44c28285509cde9d8d3ee017e9bab8b7de8bd8f Mon Sep 17 00:00:00 2001 From: Jamie Bainbridge Date: Thu, 14 Jul 2022 21:58:55 +0800 Subject: [PATCH 304/674] qede: confirm skb is allocated before using stable inclusion from stable-v5.10.111 commit 8928239e5e2e460d95b8a0b89f61671625e7ece0 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=8928239e5e2e460d95b8a0b89f61671625e7ece0 -------------------------------- [ Upstream commit 4e910dbe36508654a896d5735b318c0b88172570 ] qede_build_skb() assumes build_skb() always works and goes straight to skb_reserve(). However, build_skb() can fail under memory pressure. This results in a kernel panic because the skb to reserve is NULL. Add a check in case build_skb() failed to allocate and return NULL. The NULL return is handled correctly in callers to qede_build_skb(). Fixes: 8a8633978b842 ("qede: Add build_skb() support.") Signed-off-by: Jamie Bainbridge Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/net/ethernet/qlogic/qede/qede_fp.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/qlogic/qede/qede_fp.c b/drivers/net/ethernet/qlogic/qede/qede_fp.c index 21c906200e79..d210632676d3 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_fp.c +++ b/drivers/net/ethernet/qlogic/qede/qede_fp.c @@ -752,6 +752,9 @@ qede_build_skb(struct qede_rx_queue *rxq, buf = page_address(bd->data) + bd->page_offset; skb = build_skb(buf, rxq->rx_buf_seg_size); + if (unlikely(!skb)) + return NULL; + skb_reserve(skb, pad); skb_put(skb, len); -- Gitee From 8d205a67676cd45e872270911b401c05d4b26981 Mon Sep 17 00:00:00 2001 From: Kamal Dasu Date: Thu, 14 Jul 2022 21:58:56 +0800 Subject: [PATCH 305/674] spi: bcm-qspi: fix MSPI only access with bcm_qspi_exec_mem_op() stable inclusion from stable-v5.10.111 commit 6c17f4ef3c4f662865ac99ad167a8b896ca70d2f category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=6c17f4ef3c4f662865ac99ad167a8b896ca70d2f -------------------------------- [ Upstream commit 2c7d1b281286c46049cd22b43435cecba560edde ] This fixes case where MSPI controller is used to access spi-nor flash and BSPI block is not present. Fixes: 5f195ee7d830 ("spi: bcm-qspi: Implement the spi_mem interface") Signed-off-by: Kamal Dasu Acked-by: Florian Fainelli Link: https://lore.kernel.org/r/20220328142442.7553-1-kdasu.kdev@gmail.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/spi/spi-bcm-qspi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/spi/spi-bcm-qspi.c b/drivers/spi/spi-bcm-qspi.c index 4a80f043b7b1..766b00350e39 100644 --- a/drivers/spi/spi-bcm-qspi.c +++ b/drivers/spi/spi-bcm-qspi.c @@ -1032,7 +1032,7 @@ static int bcm_qspi_exec_mem_op(struct spi_mem *mem, addr = op->addr.val; len = op->data.nbytes; - if (bcm_qspi_bspi_ver_three(qspi) == true) { + if (has_bspi(qspi) && bcm_qspi_bspi_ver_three(qspi) == true) { /* * The address coming into this function is a raw flash offset. * But for BSPI <= V3, we need to convert it to a remapped BSPI @@ -1051,7 +1051,7 @@ static int bcm_qspi_exec_mem_op(struct spi_mem *mem, len < 4) mspi_read = true; - if (mspi_read) + if (!has_bspi(qspi) || mspi_read) return bcm_qspi_mspi_exec_mem_op(spi, op); ret = bcm_qspi_bspi_set_mode(qspi, op, 0); -- Gitee From 22c85679a13dc27646fe9c747598020cf8e07817 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Thu, 14 Jul 2022 21:58:57 +0800 Subject: [PATCH 306/674] bpf: Support dual-stack sockets in bpf_tcp_check_syncookie stable inclusion from stable-v5.10.111 commit 970a6bb72912f2accae52e811a6618e3d4a5b0ff category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=970a6bb72912f2accae52e811a6618e3d4a5b0ff -------------------------------- [ Upstream commit 2e8702cc0cfa1080f29fd64003c00a3e24ac38de ] bpf_tcp_gen_syncookie looks at the IP version in the IP header and validates the address family of the socket. It supports IPv4 packets in AF_INET6 dual-stack sockets. On the other hand, bpf_tcp_check_syncookie looks only at the address family of the socket, ignoring the real IP version in headers, and validates only the packet size. This implementation has some drawbacks: 1. Packets are not validated properly, allowing a BPF program to trick bpf_tcp_check_syncookie into handling an IPv6 packet on an IPv4 socket. 2. Dual-stack sockets fail the checks on IPv4 packets. IPv4 clients end up receiving a SYNACK with the cookie, but the following ACK gets dropped. This patch fixes these issues by changing the checks in bpf_tcp_check_syncookie to match the ones in bpf_tcp_gen_syncookie. IP version from the header is taken into account, and it is validated properly with address family. Fixes: 399040847084 ("bpf: add helper to check for a valid SYN cookie") Signed-off-by: Maxim Mikityanskiy Signed-off-by: Alexei Starovoitov Reviewed-by: Tariq Toukan Acked-by: Arthur Fabre Link: https://lore.kernel.org/bpf/20220406124113.2795730-1-maximmi@nvidia.com Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- net/core/filter.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 6b619ef0dee3..933fdf6e6a90 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6562,24 +6562,33 @@ BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len if (!th->ack || th->rst || th->syn) return -ENOENT; + if (unlikely(iph_len < sizeof(struct iphdr))) + return -EINVAL; + if (tcp_synq_no_recent_overflow(sk)) return -ENOENT; cookie = ntohl(th->ack_seq) - 1; - switch (sk->sk_family) { - case AF_INET: - if (unlikely(iph_len < sizeof(struct iphdr))) + /* Both struct iphdr and struct ipv6hdr have the version field at the + * same offset so we can cast to the shorter header (struct iphdr). + */ + switch (((struct iphdr *)iph)->version) { + case 4: + if (sk->sk_family == AF_INET6 && ipv6_only_sock(sk)) return -EINVAL; ret = __cookie_v4_check((struct iphdr *)iph, th, cookie); break; #if IS_BUILTIN(CONFIG_IPV6) - case AF_INET6: + case 6: if (unlikely(iph_len < sizeof(struct ipv6hdr))) return -EINVAL; + if (sk->sk_family != AF_INET6) + return -EINVAL; + ret = __cookie_v6_check((struct ipv6hdr *)iph, th, cookie); break; #endif /* CONFIG_IPV6 */ -- Gitee From e1119e30c3433839ad8484ebc20b35107c110de8 Mon Sep 17 00:00:00 2001 From: Lv Yunlong Date: Thu, 14 Jul 2022 21:58:58 +0800 Subject: [PATCH 307/674] drbd: Fix five use after free bugs in get_initial_state MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from stable-v5.10.111 commit 594205b4936771a250f9d141e7e0fff21c3dd2d9 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=594205b4936771a250f9d141e7e0fff21c3dd2d9 -------------------------------- [ Upstream commit aadb22ba2f656581b2f733deb3a467c48cc618f6 ] In get_initial_state, it calls notify_initial_state_done(skb,..) if cb->args[5]==1. If genlmsg_put() failed in notify_initial_state_done(), the skb will be freed by nlmsg_free(skb). Then get_initial_state will goto out and the freed skb will be used by return value skb->len, which is a uaf bug. What's worse, the same problem goes even further: skb can also be freed in the notify_*_state_change -> notify_*_state calls below. Thus 4 additional uaf bugs happened. My patch lets the problem callee functions: notify_initial_state_done and notify_*_state_change return an error code if errors happen. So that the error codes could be propagated and the uaf bugs can be avoid. v2 reports a compilation warning. This v3 fixed this warning and built successfully in my local environment with no additional warnings. v2: https://lore.kernel.org/patchwork/patch/1435218/ Fixes: a29728463b254 ("drbd: Backport the "events2" command") Signed-off-by: Lv Yunlong Reviewed-by: Christoph Böhmwalder Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/block/drbd/drbd_int.h | 8 ++--- drivers/block/drbd/drbd_nl.c | 41 ++++++++++++++++---------- drivers/block/drbd/drbd_state.c | 18 +++++------ drivers/block/drbd/drbd_state_change.h | 8 ++--- 4 files changed, 42 insertions(+), 33 deletions(-) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 8f879e5c2f67..60b9ca53c0a3 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1644,22 +1644,22 @@ struct sib_info { }; void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib); -extern void notify_resource_state(struct sk_buff *, +extern int notify_resource_state(struct sk_buff *, unsigned int, struct drbd_resource *, struct resource_info *, enum drbd_notification_type); -extern void notify_device_state(struct sk_buff *, +extern int notify_device_state(struct sk_buff *, unsigned int, struct drbd_device *, struct device_info *, enum drbd_notification_type); -extern void notify_connection_state(struct sk_buff *, +extern int notify_connection_state(struct sk_buff *, unsigned int, struct drbd_connection *, struct connection_info *, enum drbd_notification_type); -extern void notify_peer_device_state(struct sk_buff *, +extern int notify_peer_device_state(struct sk_buff *, unsigned int, struct drbd_peer_device *, struct peer_device_info *, diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index bf7de4c7b96c..f8d0146bf785 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -4614,7 +4614,7 @@ static int nla_put_notification_header(struct sk_buff *msg, return drbd_notification_header_to_skb(msg, &nh, true); } -void notify_resource_state(struct sk_buff *skb, +int notify_resource_state(struct sk_buff *skb, unsigned int seq, struct drbd_resource *resource, struct resource_info *resource_info, @@ -4656,16 +4656,17 @@ void notify_resource_state(struct sk_buff *skb, if (err && err != -ESRCH) goto failed; } - return; + return 0; nla_put_failure: nlmsg_free(skb); failed: drbd_err(resource, "Error %d while broadcasting event. Event seq:%u\n", err, seq); + return err; } -void notify_device_state(struct sk_buff *skb, +int notify_device_state(struct sk_buff *skb, unsigned int seq, struct drbd_device *device, struct device_info *device_info, @@ -4705,16 +4706,17 @@ void notify_device_state(struct sk_buff *skb, if (err && err != -ESRCH) goto failed; } - return; + return 0; nla_put_failure: nlmsg_free(skb); failed: drbd_err(device, "Error %d while broadcasting event. Event seq:%u\n", err, seq); + return err; } -void notify_connection_state(struct sk_buff *skb, +int notify_connection_state(struct sk_buff *skb, unsigned int seq, struct drbd_connection *connection, struct connection_info *connection_info, @@ -4754,16 +4756,17 @@ void notify_connection_state(struct sk_buff *skb, if (err && err != -ESRCH) goto failed; } - return; + return 0; nla_put_failure: nlmsg_free(skb); failed: drbd_err(connection, "Error %d while broadcasting event. Event seq:%u\n", err, seq); + return err; } -void notify_peer_device_state(struct sk_buff *skb, +int notify_peer_device_state(struct sk_buff *skb, unsigned int seq, struct drbd_peer_device *peer_device, struct peer_device_info *peer_device_info, @@ -4804,13 +4807,14 @@ void notify_peer_device_state(struct sk_buff *skb, if (err && err != -ESRCH) goto failed; } - return; + return 0; nla_put_failure: nlmsg_free(skb); failed: drbd_err(peer_device, "Error %d while broadcasting event. Event seq:%u\n", err, seq); + return err; } void notify_helper(enum drbd_notification_type type, @@ -4861,7 +4865,7 @@ void notify_helper(enum drbd_notification_type type, err, seq); } -static void notify_initial_state_done(struct sk_buff *skb, unsigned int seq) +static int notify_initial_state_done(struct sk_buff *skb, unsigned int seq) { struct drbd_genlmsghdr *dh; int err; @@ -4875,11 +4879,12 @@ static void notify_initial_state_done(struct sk_buff *skb, unsigned int seq) if (nla_put_notification_header(skb, NOTIFY_EXISTS)) goto nla_put_failure; genlmsg_end(skb, dh); - return; + return 0; nla_put_failure: nlmsg_free(skb); pr_err("Error %d sending event. Event seq:%u\n", err, seq); + return err; } static void free_state_changes(struct list_head *list) @@ -4906,6 +4911,7 @@ static int get_initial_state(struct sk_buff *skb, struct netlink_callback *cb) unsigned int seq = cb->args[2]; unsigned int n; enum drbd_notification_type flags = 0; + int err = 0; /* There is no need for taking notification_mutex here: it doesn't matter if the initial state events mix with later state chage @@ -4914,32 +4920,32 @@ static int get_initial_state(struct sk_buff *skb, struct netlink_callback *cb) cb->args[5]--; if (cb->args[5] == 1) { - notify_initial_state_done(skb, seq); + err = notify_initial_state_done(skb, seq); goto out; } n = cb->args[4]++; if (cb->args[4] < cb->args[3]) flags |= NOTIFY_CONTINUES; if (n < 1) { - notify_resource_state_change(skb, seq, state_change->resource, + err = notify_resource_state_change(skb, seq, state_change->resource, NOTIFY_EXISTS | flags); goto next; } n--; if (n < state_change->n_connections) { - notify_connection_state_change(skb, seq, &state_change->connections[n], + err = notify_connection_state_change(skb, seq, &state_change->connections[n], NOTIFY_EXISTS | flags); goto next; } n -= state_change->n_connections; if (n < state_change->n_devices) { - notify_device_state_change(skb, seq, &state_change->devices[n], + err = notify_device_state_change(skb, seq, &state_change->devices[n], NOTIFY_EXISTS | flags); goto next; } n -= state_change->n_devices; if (n < state_change->n_devices * state_change->n_connections) { - notify_peer_device_state_change(skb, seq, &state_change->peer_devices[n], + err = notify_peer_device_state_change(skb, seq, &state_change->peer_devices[n], NOTIFY_EXISTS | flags); goto next; } @@ -4954,7 +4960,10 @@ static int get_initial_state(struct sk_buff *skb, struct netlink_callback *cb) cb->args[4] = 0; } out: - return skb->len; + if (err) + return err; + else + return skb->len; } int drbd_adm_get_initial_state(struct sk_buff *skb, struct netlink_callback *cb) diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c index 0067d328f0b5..5fbaea6b77b1 100644 --- a/drivers/block/drbd/drbd_state.c +++ b/drivers/block/drbd/drbd_state.c @@ -1537,7 +1537,7 @@ int drbd_bitmap_io_from_worker(struct drbd_device *device, return rv; } -void notify_resource_state_change(struct sk_buff *skb, +int notify_resource_state_change(struct sk_buff *skb, unsigned int seq, struct drbd_resource_state_change *resource_state_change, enum drbd_notification_type type) @@ -1550,10 +1550,10 @@ void notify_resource_state_change(struct sk_buff *skb, .res_susp_fen = resource_state_change->susp_fen[NEW], }; - notify_resource_state(skb, seq, resource, &resource_info, type); + return notify_resource_state(skb, seq, resource, &resource_info, type); } -void notify_connection_state_change(struct sk_buff *skb, +int notify_connection_state_change(struct sk_buff *skb, unsigned int seq, struct drbd_connection_state_change *connection_state_change, enum drbd_notification_type type) @@ -1564,10 +1564,10 @@ void notify_connection_state_change(struct sk_buff *skb, .conn_role = connection_state_change->peer_role[NEW], }; - notify_connection_state(skb, seq, connection, &connection_info, type); + return notify_connection_state(skb, seq, connection, &connection_info, type); } -void notify_device_state_change(struct sk_buff *skb, +int notify_device_state_change(struct sk_buff *skb, unsigned int seq, struct drbd_device_state_change *device_state_change, enum drbd_notification_type type) @@ -1577,10 +1577,10 @@ void notify_device_state_change(struct sk_buff *skb, .dev_disk_state = device_state_change->disk_state[NEW], }; - notify_device_state(skb, seq, device, &device_info, type); + return notify_device_state(skb, seq, device, &device_info, type); } -void notify_peer_device_state_change(struct sk_buff *skb, +int notify_peer_device_state_change(struct sk_buff *skb, unsigned int seq, struct drbd_peer_device_state_change *p, enum drbd_notification_type type) @@ -1594,7 +1594,7 @@ void notify_peer_device_state_change(struct sk_buff *skb, .peer_resync_susp_dependency = p->resync_susp_dependency[NEW], }; - notify_peer_device_state(skb, seq, peer_device, &peer_device_info, type); + return notify_peer_device_state(skb, seq, peer_device, &peer_device_info, type); } static void broadcast_state_change(struct drbd_state_change *state_change) @@ -1602,7 +1602,7 @@ static void broadcast_state_change(struct drbd_state_change *state_change) struct drbd_resource_state_change *resource_state_change = &state_change->resource[0]; bool resource_state_has_changed; unsigned int n_device, n_connection, n_peer_device, n_peer_devices; - void (*last_func)(struct sk_buff *, unsigned int, void *, + int (*last_func)(struct sk_buff *, unsigned int, void *, enum drbd_notification_type) = NULL; void *last_arg = NULL; diff --git a/drivers/block/drbd/drbd_state_change.h b/drivers/block/drbd/drbd_state_change.h index ba80f612d6ab..d5b0479bc9a6 100644 --- a/drivers/block/drbd/drbd_state_change.h +++ b/drivers/block/drbd/drbd_state_change.h @@ -44,19 +44,19 @@ extern struct drbd_state_change *remember_old_state(struct drbd_resource *, gfp_ extern void copy_old_to_new_state_change(struct drbd_state_change *); extern void forget_state_change(struct drbd_state_change *); -extern void notify_resource_state_change(struct sk_buff *, +extern int notify_resource_state_change(struct sk_buff *, unsigned int, struct drbd_resource_state_change *, enum drbd_notification_type type); -extern void notify_connection_state_change(struct sk_buff *, +extern int notify_connection_state_change(struct sk_buff *, unsigned int, struct drbd_connection_state_change *, enum drbd_notification_type type); -extern void notify_device_state_change(struct sk_buff *, +extern int notify_device_state_change(struct sk_buff *, unsigned int, struct drbd_device_state_change *, enum drbd_notification_type type); -extern void notify_peer_device_state_change(struct sk_buff *, +extern int notify_peer_device_state_change(struct sk_buff *, unsigned int, struct drbd_peer_device_state_change *, enum drbd_notification_type type); -- Gitee From 9cf5e90ac8a67144d7edcee720d7b1419d5d6be1 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 14 Jul 2022 21:58:59 +0800 Subject: [PATCH 308/674] io_uring: don't touch scm_fp_list after queueing skb stable inclusion from stable-v5.10.111 commit aed30a2054060e470db02eaaf352c14dc38aa611 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=aed30a2054060e470db02eaaf352c14dc38aa611 -------------------------------- [ Upstream commit a07211e3001435fe8591b992464cd8d5e3c98c5a ] It's safer to not touch scm_fp_list after we queued an skb to which it was assigned, there might be races lurking if we screw subtle sync guarantees on the io_uring side. Fixes: 6b06314c47e14 ("io_uring: add file set registration") Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- fs/io_uring.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index a1c2c04b8ca0..3da9c6f457ae 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7314,8 +7314,12 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) refcount_add(skb->truesize, &sk->sk_wmem_alloc); skb_queue_head(&sk->sk_receive_queue, skb); - for (i = 0; i < nr_files; i++) - fput(fpl->fp[i]); + for (i = 0; i < nr; i++) { + struct file *file = io_file_from_index(ctx, i + offset); + + if (file) + fput(file); + } } else { kfree_skb(skb); free_uid(fpl->user); -- Gitee From 70391d566bb7f9348fd1b4c3941030f041df0ac5 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 14 Jul 2022 21:59:00 +0800 Subject: [PATCH 309/674] SUNRPC: Handle ENOMEM in call_transmit_status() stable inclusion from stable-v5.10.111 commit 132cbe2f182ab535e0a7fa309d2c73ed6257a246 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=132cbe2f182ab535e0a7fa309d2c73ed6257a246 -------------------------------- [ Upstream commit d3c15033b240767d0287f1c4a529cbbe2d5ded8a ] Both call_transmit() and call_bc_transmit() can now return ENOMEM, so let's make sure that we handle the errors gracefully. Fixes: 0472e4766049 ("SUNRPC: Convert socket page send code to use iov_iter()") Signed-off-by: Trond Myklebust Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- net/sunrpc/clnt.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 84c8a534029c..bae42ada8c10 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -2175,6 +2175,7 @@ call_transmit_status(struct rpc_task *task) * socket just returned a connection error, * then hold onto the transport lock. */ + case -ENOMEM: case -ENOBUFS: rpc_delay(task, HZ>>2); fallthrough; @@ -2258,6 +2259,7 @@ call_bc_transmit_status(struct rpc_task *task) case -ENOTCONN: case -EPIPE: break; + case -ENOMEM: case -ENOBUFS: rpc_delay(task, HZ>>2); fallthrough; -- Gitee From fa0241e9ba3fdb89bcd46fafbcf09bfd60c8d668 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 14 Jul 2022 21:59:01 +0800 Subject: [PATCH 310/674] SUNRPC: Handle low memory situations in call_status() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from stable-v5.10.111 commit 9a45e08636bba2949819cbf4d9ee0b974dd5c7a6 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=9a45e08636bba2949819cbf4d9ee0b974dd5c7a6 -------------------------------- [ Upstream commit 9d82819d5b065348ce623f196bf601028e22ed00 ] We need to handle ENFILE, ENOBUFS, and ENOMEM, because xprt_wake_pending_tasks() can be called with any one of these due to socket creation failures. Fixes: b61d59fffd3e ("SUNRPC: xs_tcp_connect_worker{4,6}: merge common code") Signed-off-by: Trond Myklebust Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- net/sunrpc/clnt.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index bae42ada8c10..c5af31312e0c 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -2342,6 +2342,11 @@ call_status(struct rpc_task *task) case -EPIPE: case -EAGAIN: break; + case -ENFILE: + case -ENOBUFS: + case -ENOMEM: + rpc_delay(task, HZ>>2); + break; case -EIO: /* shutdown or soft timeout */ goto out_exit; -- Gitee From 49fd0817134311af332f0602a09650ff61298933 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 14 Jul 2022 21:59:02 +0800 Subject: [PATCH 311/674] SUNRPC: svc_tcp_sendmsg() should handle errors from xdr_alloc_bvec() stable inclusion from stable-v5.10.111 commit b3c00be2ff8bc7e211fb04ec42facf0e4c876800 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=b3c00be2ff8bc7e211fb04ec42facf0e4c876800 -------------------------------- [ Upstream commit b056fa070814897be32d83b079dbc311375588e7 ] The allocation is done with GFP_KERNEL, but it could still fail in a low memory situation. Fixes: 4a85a6a3320b ("SUNRPC: Handle TCP socket sends with kernel_sendpage() again") Signed-off-by: Trond Myklebust Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- net/sunrpc/svcsock.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index eba1714bf09a..6d5bb8bfed38 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1091,7 +1091,9 @@ static int svc_tcp_sendmsg(struct socket *sock, struct msghdr *msg, int flags, ret; *sentp = 0; - xdr_alloc_bvec(xdr, GFP_KERNEL); + ret = xdr_alloc_bvec(xdr, GFP_KERNEL); + if (ret < 0) + return ret; msg->msg_flags = MSG_MORE; ret = kernel_sendmsg(sock, msg, &rm, 1, rm.iov_len); -- Gitee From cc46da9de5344020d39687c82cfa22561be7a751 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Thu, 14 Jul 2022 21:59:03 +0800 Subject: [PATCH 312/674] iommu/omap: Fix regression in probe for NULL pointer dereference stable inclusion from stable-v5.10.111 commit bd905fed87ce01ac010011bb8f44ed0140116ceb category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=bd905fed87ce01ac010011bb8f44ed0140116ceb -------------------------------- [ Upstream commit 71ff461c3f41f6465434b9e980c01782763e7ad8 ] Commit 3f6634d997db ("iommu: Use right way to retrieve iommu_ops") started triggering a NULL pointer dereference for some omap variants: __iommu_probe_device from probe_iommu_group+0x2c/0x38 probe_iommu_group from bus_for_each_dev+0x74/0xbc bus_for_each_dev from bus_iommu_probe+0x34/0x2e8 bus_iommu_probe from bus_set_iommu+0x80/0xc8 bus_set_iommu from omap_iommu_init+0x88/0xcc omap_iommu_init from do_one_initcall+0x44/0x24 This is caused by omap iommu probe returning 0 instead of ERR_PTR(-ENODEV) as noted by Jason Gunthorpe . Looks like the regression already happened with an earlier commit 6785eb9105e3 ("iommu/omap: Convert to probe/release_device() call-backs") that changed the function return type and missed converting one place. Cc: Drew Fustini Cc: Lu Baolu Cc: Suman Anna Suggested-by: Jason Gunthorpe Fixes: 6785eb9105e3 ("iommu/omap: Convert to probe/release_device() call-backs") Fixes: 3f6634d997db ("iommu: Use right way to retrieve iommu_ops") Signed-off-by: Tony Lindgren Tested-by: Drew Fustini Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20220331062301.24269-1-tony@atomide.com Signed-off-by: Joerg Roedel Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/iommu/omap-iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c index 71f29c0927fc..ff2c692c0db4 100644 --- a/drivers/iommu/omap-iommu.c +++ b/drivers/iommu/omap-iommu.c @@ -1665,7 +1665,7 @@ static struct iommu_device *omap_iommu_probe_device(struct device *dev) num_iommus = of_property_count_elems_of_size(dev->of_node, "iommus", sizeof(phandle)); if (num_iommus < 0) - return 0; + return ERR_PTR(-ENODEV); arch_data = kcalloc(num_iommus + 1, sizeof(*arch_data), GFP_KERNEL); if (!arch_data) -- Gitee From 7495b6a600bcc5dc05096bf25e8608128b9d3330 Mon Sep 17 00:00:00 2001 From: James Clark Date: Thu, 14 Jul 2022 21:59:04 +0800 Subject: [PATCH 313/674] perf: arm-spe: Fix perf report --mem-mode stable inclusion from stable-v5.10.111 commit 65210fac639ea23c36187c4e4bb5a48e68dddff1 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=65210fac639ea23c36187c4e4bb5a48e68dddff1 -------------------------------- [ Upstream commit ffab487052054162b3b6c9c6005777ec6cfcea05 ] Since commit bb30acae4c4dacfa ("perf report: Bail out --mem-mode if mem info is not available") "perf mem report" and "perf report --mem-mode" don't allow opening the file unless one of the events has PERF_SAMPLE_DATA_SRC set. SPE doesn't have this set even though synthetic memory data is generated after it is decoded. Fix this issue by setting DATA_SRC on SPE events. This has no effect on the data collected because the SPE driver doesn't do anything with that flag and doesn't generate samples. Fixes: bb30acae4c4dacfa ("perf report: Bail out --mem-mode if mem info is not available") Signed-off-by: James Clark Tested-by: Leo Yan Acked-by: Namhyung Kim Cc: Alexander Shishkin Cc: German Gomez Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: linux-arm-kernel@lists.infradead.org Cc: Mark Rutland Cc: Mathieu Poirier Cc: Ravi Bangoria Cc: Will Deacon Link: https://lore.kernel.org/r/20220408144056.1955535-1-james.clark@arm.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- tools/perf/arch/arm64/util/arm-spe.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/perf/arch/arm64/util/arm-spe.c b/tools/perf/arch/arm64/util/arm-spe.c index f3aa09647606..3c0d919a73f4 100644 --- a/tools/perf/arch/arm64/util/arm-spe.c +++ b/tools/perf/arch/arm64/util/arm-spe.c @@ -239,6 +239,12 @@ static int arm_spe_recording_options(struct auxtrace_record *itr, arm_spe_set_timestamp(itr, arm_spe_evsel); } + /* + * Set this only so that perf report knows that SPE generates memory info. It has no effect + * on the opening of the event or the SPE data produced. + */ + evsel__set_sample_bit(arm_spe_evsel, DATA_SRC); + /* Add dummy event to keep tracking */ err = parse_events(evlist, "dummy:u", NULL); if (err) -- Gitee From d8cb6b975a4828ba2388e63ca31f58445e393922 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Thu, 14 Jul 2022 21:59:05 +0800 Subject: [PATCH 314/674] perf tools: Fix perf's libperf_print callback stable inclusion from stable-v5.10.111 commit 362ced37690d2069dc8f5b556a19599c9fb0686c category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=362ced37690d2069dc8f5b556a19599c9fb0686c -------------------------------- [ Upstream commit aeee9dc53ce405d2161f9915f553114e94e5b677 ] eprintf() does not expect va_list as the type of the 4th parameter. Use veprintf() because it does. Signed-off-by: Adrian Hunter Fixes: 428dab813a56ce94 ("libperf: Merge libperf_set_print() into libperf_init()") Cc: Jiri Olsa Link: https://lore.kernel.org/r/20220408132625.2451452-1-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- tools/perf/perf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/perf.c b/tools/perf/perf.c index 27f94b0bb874..505e2a2f1872 100644 --- a/tools/perf/perf.c +++ b/tools/perf/perf.c @@ -433,7 +433,7 @@ void pthread__unblock_sigwinch(void) static int libperf_print(enum libperf_print_level level, const char *fmt, va_list ap) { - return eprintf(level, verbose, fmt, ap); + return veprintf(level, verbose, fmt, ap); } int main(int argc, const char **argv) -- Gitee From dd63ae24cebfd4583af37b813ca4170a0a5b323b Mon Sep 17 00:00:00 2001 From: Denis Nikitin Date: Thu, 14 Jul 2022 21:59:06 +0800 Subject: [PATCH 315/674] perf session: Remap buf if there is no space for event stable inclusion from stable-v5.10.111 commit 4604b5738d5b0c160724b72886822c98c35ad316 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=4604b5738d5b0c160724b72886822c98c35ad316 -------------------------------- [ Upstream commit bc21e74d4775f883ae1f542c1f1dc7205b15d925 ] If a perf event doesn't fit into remaining buffer space return NULL to remap buf and fetch the event again. Keep the logic to error out on inadequate input from fuzzing. This fixes perf failing on ChromeOS (with 32b userspace): $ perf report -v -i perf.data ... prefetch_event: head=0x1fffff8 event->header_size=0x30, mmap_size=0x2000000: fuzzed or compressed perf.data? Error: failed to process sample Fixes: 57fc032ad643ffd0 ("perf session: Avoid infinite loop when seeing invalid header.size") Reviewed-by: James Clark Signed-off-by: Denis Nikitin Acked-by: Jiri Olsa Cc: Alexander Shishkin Cc: Alexey Budankov Cc: Namhyung Kim Link: https://lore.kernel.org/r/20220330031130.2152327-1-denik@chromium.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- tools/perf/util/session.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 9dddec19a494..354e1e04a266 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -2056,6 +2056,7 @@ prefetch_event(char *buf, u64 head, size_t mmap_size, bool needs_swap, union perf_event *error) { union perf_event *event; + u16 event_size; /* * Ensure we have enough space remaining to read @@ -2068,15 +2069,23 @@ prefetch_event(char *buf, u64 head, size_t mmap_size, if (needs_swap) perf_event_header__bswap(&event->header); - if (head + event->header.size <= mmap_size) + event_size = event->header.size; + if (head + event_size <= mmap_size) return event; /* We're not fetching the event so swap back again */ if (needs_swap) perf_event_header__bswap(&event->header); - pr_debug("%s: head=%#" PRIx64 " event->header_size=%#x, mmap_size=%#zx:" - " fuzzed or compressed perf.data?\n",__func__, head, event->header.size, mmap_size); + /* Check if the event fits into the next mmapped buf. */ + if (event_size <= mmap_size - head % page_size) { + /* Remap buf and fetch again. */ + return NULL; + } + + /* Invalid input. Event size should never exceed mmap_size. */ + pr_debug("%s: head=%#" PRIx64 " event->header.size=%#x, mmap_size=%#zx:" + " fuzzed or compressed perf.data?\n", __func__, head, event_size, mmap_size); return error; } -- Gitee From a5b625b788104b3bf7c3201c0a75f527be93182c Mon Sep 17 00:00:00 2001 From: Chanho Park Date: Thu, 14 Jul 2022 21:59:07 +0800 Subject: [PATCH 316/674] arm64: Add part number for Arm Cortex-A78AE stable inclusion from stable-v5.10.111 commit 9de98470db6ec5223ab47ec50b21c40f4d08655c category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=9de98470db6ec5223ab47ec50b21c40f4d08655c -------------------------------- commit 83bea32ac7ed37bbda58733de61fc9369513f9f9 upstream. Add the MIDR part number info for the Arm Cortex-A78AE[1] and add it to spectre-BHB affected list[2]. [1]: https://developer.arm.com/Processors/Cortex-A78AE [2]: https://developer.arm.com/Arm%20Security%20Center/Spectre-BHB Cc: Catalin Marinas Cc: Mark Rutland Cc: Will Deacon Cc: James Morse Signed-off-by: Chanho Park Link: https://lore.kernel.org/r/20220407091128.8700-1-chanho61.park@samsung.com Signed-off-by: Will Deacon Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Conflicts: arch/arm64/include/asm/cputype.h Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- arch/arm64/include/asm/cputype.h | 2 ++ arch/arm64/kernel/proton-pack.c | 1 + 2 files changed, 3 insertions(+) diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 2e0066f13ad4..662708c56397 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -76,6 +76,7 @@ #define ARM_CPU_PART_CORTEX_A77 0xD0D #define ARM_CPU_PART_NEOVERSE_V1 0xD40 #define ARM_CPU_PART_CORTEX_A78 0xD41 +#define ARM_CPU_PART_CORTEX_A78AE 0xD42 #define ARM_CPU_PART_CORTEX_X1 0xD44 #define ARM_CPU_PART_CORTEX_A78C 0xD4B #define ARM_CPU_PART_CORTEX_A510 0xD46 @@ -131,6 +132,7 @@ #define MIDR_CORTEX_A77 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A77) #define MIDR_NEOVERSE_V1 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_V1) #define MIDR_CORTEX_A78 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A78) +#define MIDR_CORTEX_A78AE MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A78AE) #define MIDR_CORTEX_X1 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X1) #define MIDR_CORTEX_A78C MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A78C) #define MIDR_CORTEX_A510 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A510) diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index a3fb28e170dd..e807f77737e0 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -861,6 +861,7 @@ u8 spectre_bhb_loop_affected(int scope) if (scope == SCOPE_LOCAL_CPU) { static const struct midr_range spectre_bhb_k32_list[] = { MIDR_ALL_VERSIONS(MIDR_CORTEX_A78), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A78AE), MIDR_ALL_VERSIONS(MIDR_CORTEX_A78C), MIDR_ALL_VERSIONS(MIDR_CORTEX_X1), MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1), -- Gitee From a5207284dd6072c7e9eaa0d823042528e1eb8d4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Thu, 14 Jul 2022 21:59:08 +0800 Subject: [PATCH 317/674] Revert "mmc: sdhci-xenon: fix annoying 1.8V regulator warning" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from stable-v5.10.111 commit 41a519c05beee0ad9a0e53143fe24c6eeb677db6 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=41a519c05beee0ad9a0e53143fe24c6eeb677db6 -------------------------------- commit 7e2646ed47542123168d43916b84b954532e5386 upstream. This reverts commit bb32e1987bc55ce1db400faf47d85891da3c9b9f. Commit 1a3ed0dc3594 ("mmc: sdhci-xenon: fix 1.8v regulator stabilization") contains proper fix for the issue described in commit bb32e1987bc5 ("mmc: sdhci-xenon: fix annoying 1.8V regulator warning"). Fixes: 8d876bf472db ("mmc: sdhci-xenon: wait 5ms after set 1.8V signal enable") Cc: stable@vger.kernel.org # 1a3ed0dc3594 ("mmc: sdhci-xenon: fix 1.8v regulator stabilization") Signed-off-by: Pali Rohár Reviewed-by: Marek Behún Reviewed-by: Marcin Wojtas Link: https://lore.kernel.org/r/20220318141441.32329-1-pali@kernel.org Signed-off-by: Ulf Hansson Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/mmc/host/sdhci-xenon.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/drivers/mmc/host/sdhci-xenon.c b/drivers/mmc/host/sdhci-xenon.c index 0e5234a5ca22..d509198c00c8 100644 --- a/drivers/mmc/host/sdhci-xenon.c +++ b/drivers/mmc/host/sdhci-xenon.c @@ -240,16 +240,6 @@ static void xenon_voltage_switch(struct sdhci_host *host) { /* Wait for 5ms after set 1.8V signal enable bit */ usleep_range(5000, 5500); - - /* - * For some reason the controller's Host Control2 register reports - * the bit representing 1.8V signaling as 0 when read after it was - * written as 1. Subsequent read reports 1. - * - * Since this may cause some issues, do an empty read of the Host - * Control2 register here to circumvent this. - */ - sdhci_readw(host, SDHCI_HOST_CONTROL2); } static const struct sdhci_ops sdhci_xenon_ops = { -- Gitee From 786719f2db6e047c47741f627891993075e7a54d Mon Sep 17 00:00:00 2001 From: Yann Gautier Date: Thu, 14 Jul 2022 21:59:09 +0800 Subject: [PATCH 318/674] mmc: mmci: stm32: correctly check all elements of sg list stable inclusion from stable-v5.10.111 commit 029b4170737f03253dd5cb9a57a657ce0cc2b532 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=029b4170737f03253dd5cb9a57a657ce0cc2b532 -------------------------------- commit 0d319dd5a27183b75d984e3dc495248e59f99334 upstream. Use sg and not data->sg when checking sg list elements. Else only the first element alignment is checked. The last element should be checked the same way, for_each_sg already set sg to sg_next(sg). Fixes: 46b723dd867d ("mmc: mmci: add stm32 sdmmc variant") Cc: stable@vger.kernel.org Signed-off-by: Yann Gautier Link: https://lore.kernel.org/r/20220317111944.116148-2-yann.gautier@foss.st.com Signed-off-by: Ulf Hansson Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/mmc/host/mmci_stm32_sdmmc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/mmc/host/mmci_stm32_sdmmc.c b/drivers/mmc/host/mmci_stm32_sdmmc.c index a75d3dd34d18..4cceb9bab036 100644 --- a/drivers/mmc/host/mmci_stm32_sdmmc.c +++ b/drivers/mmc/host/mmci_stm32_sdmmc.c @@ -62,8 +62,8 @@ static int sdmmc_idma_validate_data(struct mmci_host *host, * excepted the last element which has no constraint on idmasize */ for_each_sg(data->sg, sg, data->sg_len - 1, i) { - if (!IS_ALIGNED(data->sg->offset, sizeof(u32)) || - !IS_ALIGNED(data->sg->length, SDMMC_IDMA_BURST)) { + if (!IS_ALIGNED(sg->offset, sizeof(u32)) || + !IS_ALIGNED(sg->length, SDMMC_IDMA_BURST)) { dev_err(mmc_dev(host->mmc), "unaligned scatterlist: ofst:%x length:%d\n", data->sg->offset, data->sg->length); @@ -71,7 +71,7 @@ static int sdmmc_idma_validate_data(struct mmci_host *host, } } - if (!IS_ALIGNED(data->sg->offset, sizeof(u32))) { + if (!IS_ALIGNED(sg->offset, sizeof(u32))) { dev_err(mmc_dev(host->mmc), "unaligned last scatterlist: ofst:%x length:%d\n", data->sg->offset, data->sg->length); -- Gitee From 54d20fe524573e5de33005a6fc6e362820bee749 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 14 Jul 2022 21:59:10 +0800 Subject: [PATCH 319/674] mmc: renesas_sdhi: don't overwrite TAP settings when HS400 tuning is complete stable inclusion from stable-v5.10.111 commit 8b6f04b4c9d965c04431620d12afb5e729c7eacf category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=8b6f04b4c9d965c04431620d12afb5e729c7eacf -------------------------------- commit 03e59b1e2f56245163b14c69e0a830c24b1a3a47 upstream. When HS400 tuning is complete and HS400 is going to be activated, we have to keep the current number of TAPs and should not overwrite them with a hardcoded value. This was probably a copy&paste mistake when upporting HS400 support from the BSP. Fixes: 26eb2607fa28 ("mmc: renesas_sdhi: add eMMC HS400 mode support") Reported-by: Yoshihiro Shimoda Signed-off-by: Wolfram Sang Reviewed-by: Yoshihiro Shimoda Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20220404114902.12175-1-wsa+renesas@sang-engineering.com Signed-off-by: Ulf Hansson Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/mmc/host/renesas_sdhi_core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mmc/host/renesas_sdhi_core.c b/drivers/mmc/host/renesas_sdhi_core.c index 782879d46ff4..ac01fb518386 100644 --- a/drivers/mmc/host/renesas_sdhi_core.c +++ b/drivers/mmc/host/renesas_sdhi_core.c @@ -390,10 +390,10 @@ static void renesas_sdhi_hs400_complete(struct mmc_host *mmc) SH_MOBILE_SDHI_SCC_TMPPORT2_HS400OSEL) | sd_scc_read32(host, priv, SH_MOBILE_SDHI_SCC_TMPPORT2)); - /* Set the sampling clock selection range of HS400 mode */ sd_scc_write32(host, priv, SH_MOBILE_SDHI_SCC_DTCNTL, SH_MOBILE_SDHI_SCC_DTCNTL_TAPEN | - 0x4 << SH_MOBILE_SDHI_SCC_DTCNTL_TAPNUM_SHIFT); + sd_scc_read32(host, priv, + SH_MOBILE_SDHI_SCC_DTCNTL)); /* Avoid bad TAP */ if (bad_taps & BIT(priv->tap_set)) { -- Gitee From b55f36cef846f393fbea2977d0423c201a4c9e5f Mon Sep 17 00:00:00 2001 From: Guo Xuenan Date: Thu, 14 Jul 2022 21:59:11 +0800 Subject: [PATCH 320/674] lz4: fix LZ4_decompress_safe_partial read out of bound stable inclusion from stable-v5.10.111 commit 6adc01a7aa37445dafe8846faa0610a86029b253 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=6adc01a7aa37445dafe8846faa0610a86029b253 -------------------------------- commit eafc0a02391b7b36617b36c97c4b5d6832cf5e24 upstream. When partialDecoding, it is EOF if we've either filled the output buffer or can't proceed with reading an offset for following match. In some extreme corner cases when compressed data is suitably corrupted, UAF will occur. As reported by KASAN [1], LZ4_decompress_safe_partial may lead to read out of bound problem during decoding. lz4 upstream has fixed it [2] and this issue has been disscussed here [3] before. current decompression routine was ported from lz4 v1.8.3, bumping lib/lz4 to v1.9.+ is certainly a huge work to be done later, so, we'd better fix it first. [1] https://lore.kernel.org/all/000000000000830d1205cf7f0477@google.com/ [2] https://github.com/lz4/lz4/commit/c5d6f8a8be3927c0bec91bcc58667a6cfad244ad# [3] https://lore.kernel.org/all/CC666AE8-4CA4-4951-B6FB-A2EFDE3AC03B@fb.com/ Link: https://lkml.kernel.org/r/20211111105048.2006070-1-guoxuenan@huawei.com Reported-by: syzbot+63d688f1d899c588fb71@syzkaller.appspotmail.com Signed-off-by: Guo Xuenan Reviewed-by: Nick Terrell Acked-by: Gao Xiang Cc: Yann Collet Cc: Chengyang Fan Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- lib/lz4/lz4_decompress.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c index 8a7724a6ce2f..5b6705c4b2d2 100644 --- a/lib/lz4/lz4_decompress.c +++ b/lib/lz4/lz4_decompress.c @@ -271,8 +271,12 @@ static FORCE_INLINE int LZ4_decompress_generic( ip += length; op += length; - /* Necessarily EOF, due to parsing restrictions */ - if (!partialDecoding || (cpy == oend)) + /* Necessarily EOF when !partialDecoding. + * When partialDecoding, it is EOF if we've either + * filled the output buffer or + * can't proceed with reading an offset for following match. + */ + if (!partialDecoding || (cpy == oend) || (ip >= (iend - 2))) break; } else { /* may overwrite up to WILDCOPYLENGTH beyond cpy */ -- Gitee From e4646733d5b0658a060d834a2043dcc314498658 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 14 Jul 2022 21:59:12 +0800 Subject: [PATCH 321/674] mmmremap.c: avoid pointless invalidate_range_start/end on mremap(old_size=0) stable inclusion from stable-v5.10.111 commit 7d659cb1763ff17d1c6ee082fa6feb4267c7a30b category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=7d659cb1763ff17d1c6ee082fa6feb4267c7a30b -------------------------------- commit 01e67e04c28170c47700c2c226d732bbfedb1ad0 upstream. If an mremap() syscall with old_size=0 ends up in move_page_tables(), it will call invalidate_range_start()/invalidate_range_end() unnecessarily, i.e. with an empty range. This causes a WARN in KVM's mmu_notifier. In the past, empty ranges have been diagnosed to be off-by-one bugs, hence the WARNing. Given the low (so far) number of unique reports, the benefits of detecting more buggy callers seem to outweigh the cost of having to fix cases such as this one, where userspace is doing something silly. In this particular case, an early return from move_page_tables() is enough to fix the issue. Link: https://lkml.kernel.org/r/20220329173155.172439-1-pbonzini@redhat.com Reported-by: syzbot+6bde52d89cfdf9f61425@syzkaller.appspotmail.com Signed-off-by: Paolo Bonzini Cc: Sean Christopherson Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- mm/mremap.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/mremap.c b/mm/mremap.c index ecfca97b97ae..2f7f3494a990 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -487,6 +487,9 @@ unsigned long move_page_tables(struct vm_area_struct *vma, pmd_t *old_pmd, *new_pmd; pud_t *old_pud, *new_pud; + if (!len) + return 0; + old_end = old_addr + len; flush_cache_range(vma, old_addr, old_end); -- Gitee From 31a2a59fa5a7d5c8fc20ef0aec8cbd3816e0905b Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 14 Jul 2022 21:59:13 +0800 Subject: [PATCH 322/674] mm/mempolicy: fix mpol_new leak in shared_policy_replace stable inclusion from stable-v5.10.111 commit f7e183b0a7136b6dc9c7b9b2a85a608a8feba894 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f7e183b0a7136b6dc9c7b9b2a85a608a8feba894 -------------------------------- commit 4ad099559b00ac01c3726e5c95dc3108ef47d03e upstream. If mpol_new is allocated but not used in restart loop, mpol_new will be freed via mpol_put before returning to the caller. But refcnt is not initialized yet, so mpol_put could not do the right things and might leak the unused mpol_new. This would happen if mempolicy was updated on the shared shmem file while the sp->lock has been dropped during the memory allocation. This issue could be triggered easily with the below code snippet if there are many processes doing the below work at the same time: shmid = shmget((key_t)5566, 1024 * PAGE_SIZE, 0666|IPC_CREAT); shm = shmat(shmid, 0, 0); loop many times { mbind(shm, 1024 * PAGE_SIZE, MPOL_LOCAL, mask, maxnode, 0); mbind(shm + 128 * PAGE_SIZE, 128 * PAGE_SIZE, MPOL_DEFAULT, mask, maxnode, 0); } Link: https://lkml.kernel.org/r/20220329111416.27954-1-linmiaohe@huawei.com Fixes: 42288fe366c4 ("mm: mempolicy: Convert shared_policy mutex to spinlock") Signed-off-by: Miaohe Lin Acked-by: Michal Hocko Cc: KOSAKI Motohiro Cc: Mel Gorman Cc: [3.8] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- mm/mempolicy.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index ef7eb6a06827..e7fdb3a749b5 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2690,6 +2690,7 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start, mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL); if (!mpol_new) goto err_out; + atomic_set(&mpol_new->refcnt, 1); goto restart; } -- Gitee From eb917904cf24022c23f3e55f39678545e0c45883 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Thu, 14 Jul 2022 21:59:14 +0800 Subject: [PATCH 323/674] x86/pm: Save the MSR validity status at context setup stable inclusion from stable-v5.10.111 commit 8c9e26c890ba130ad98137770d221dac7853540c category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=8c9e26c890ba130ad98137770d221dac7853540c -------------------------------- commit 73924ec4d560257004d5b5116b22a3647661e364 upstream. The mechanism to save/restore MSRs during S3 suspend/resume checks for the MSR validity during suspend, and only restores the MSR if its a valid MSR. This is not optimal, as an invalid MSR will unnecessarily throw an exception for every suspend cycle. The more invalid MSRs, higher the impact will be. Check and save the MSR validity at setup. This ensures that only valid MSRs that are guaranteed to not throw an exception will be attempted during suspend. Fixes: 7a9c2dd08ead ("x86/pm: Introduce quirk framework to save/restore extra MSR registers around suspend/resume") Suggested-by: Dave Hansen Signed-off-by: Pawan Gupta Reviewed-by: Dave Hansen Acked-by: Borislav Petkov Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- arch/x86/power/cpu.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index db1378c6ff26..3b42175673c2 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -40,7 +40,8 @@ static void msr_save_context(struct saved_context *ctxt) struct saved_msr *end = msr + ctxt->saved_msrs.num; while (msr < end) { - msr->valid = !rdmsrl_safe(msr->info.msr_no, &msr->info.reg.q); + if (msr->valid) + rdmsrl(msr->info.msr_no, msr->info.reg.q); msr++; } } @@ -427,8 +428,10 @@ static int msr_build_context(const u32 *msr_id, const int num) } for (i = saved_msrs->num, j = 0; i < total_num; i++, j++) { + u64 dummy; + msr_array[i].info.msr_no = msr_id[j]; - msr_array[i].valid = false; + msr_array[i].valid = !rdmsrl_safe(msr_id[j], &dummy); msr_array[i].info.reg.q = 0; } saved_msrs->num = total_num; -- Gitee From d362730053576be5d068beb2a76a2fae15608616 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Thu, 14 Jul 2022 21:59:15 +0800 Subject: [PATCH 324/674] x86/speculation: Restore speculation related MSRs during S3 resume stable inclusion from stable-v5.10.111 commit fc4bdaed4d4ea4209e65115bd3948a1e4ac51cbb category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=fc4bdaed4d4ea4209e65115bd3948a1e4ac51cbb -------------------------------- commit e2a1256b17b16f9b9adf1b6fea56819e7b68e463 upstream. After resuming from suspend-to-RAM, the MSRs that control CPU's speculative execution behavior are not being restored on the boot CPU. These MSRs are used to mitigate speculative execution vulnerabilities. Not restoring them correctly may leave the CPU vulnerable. Secondary CPU's MSRs are correctly being restored at S3 resume by identify_secondary_cpu(). During S3 resume, restore these MSRs for boot CPU when restoring its processor state. Fixes: 772439717dbf ("x86/bugs/intel: Set proper CPU features and setup RDS") Reported-by: Neelima Krishnan Signed-off-by: Pawan Gupta Tested-by: Neelima Krishnan Acked-by: Borislav Petkov Reviewed-by: Dave Hansen Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- arch/x86/power/cpu.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 3b42175673c2..decebcd8ee1c 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -506,10 +506,24 @@ static int pm_cpu_check(const struct x86_cpu_id *c) return ret; } +static void pm_save_spec_msr(void) +{ + u32 spec_msr_id[] = { + MSR_IA32_SPEC_CTRL, + MSR_IA32_TSX_CTRL, + MSR_TSX_FORCE_ABORT, + MSR_IA32_MCU_OPT_CTRL, + MSR_AMD64_LS_CFG, + }; + + msr_build_context(spec_msr_id, ARRAY_SIZE(spec_msr_id)); +} + static int pm_check_save_msr(void) { dmi_check_system(msr_save_dmi_table); pm_cpu_check(msr_save_cpu_table); + pm_save_spec_msr(); return 0; } -- Gitee From 8fda77c8ca2585f55d6655055152994765478271 Mon Sep 17 00:00:00 2001 From: Ethan Lien Date: Thu, 14 Jul 2022 21:59:16 +0800 Subject: [PATCH 325/674] btrfs: fix qgroup reserve overflow the qgroup limit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from stable-v5.10.111 commit 82ae73ac963cee877ce34f7c31b2b456b516e96c category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=82ae73ac963cee877ce34f7c31b2b456b516e96c -------------------------------- commit b642b52d0b50f4d398cb4293f64992d0eed2e2ce upstream. We use extent_changeset->bytes_changed in qgroup_reserve_data() to record how many bytes we set for EXTENT_QGROUP_RESERVED state. Currently the bytes_changed is set as "unsigned int", and it will overflow if we try to fallocate a range larger than 4GiB. The result is we reserve less bytes and eventually break the qgroup limit. Unlike regular buffered/direct write, which we use one changeset for each ordered extent, which can never be larger than 256M. For fallocate, we use one changeset for the whole range, thus it no longer respects the 256M per extent limit, and caused the problem. The following example test script reproduces the problem: $ cat qgroup-overflow.sh #!/bin/bash DEV=/dev/sdj MNT=/mnt/sdj mkfs.btrfs -f $DEV mount $DEV $MNT # Set qgroup limit to 2GiB. btrfs quota enable $MNT btrfs qgroup limit 2G $MNT # Try to fallocate a 3GiB file. This should fail. echo echo "Try to fallocate a 3GiB file..." fallocate -l 3G $MNT/3G.file # Try to fallocate a 5GiB file. echo echo "Try to fallocate a 5GiB file..." fallocate -l 5G $MNT/5G.file # See we break the qgroup limit. echo sync btrfs qgroup show -r $MNT umount $MNT When running the test: $ ./qgroup-overflow.sh (...) Try to fallocate a 3GiB file... fallocate: fallocate failed: Disk quota exceeded Try to fallocate a 5GiB file... qgroupid         rfer         excl     max_rfer --------         ----         ----     -------- 0/5           5.00GiB      5.00GiB      2.00GiB Since we have no control of how bytes_changed is used, it's better to set it to u64. CC: stable@vger.kernel.org # 4.14+ Reviewed-by: Qu Wenruo Signed-off-by: Ethan Lien Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- fs/btrfs/extent_io.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index f39d02e7f7ef..16f44bc481ab 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -121,7 +121,7 @@ struct extent_buffer { */ struct extent_changeset { /* How many bytes are set/cleared in this operation */ - unsigned int bytes_changed; + u64 bytes_changed; /* Changed ranges */ struct ulist range_changed; -- Gitee From 6ecec32ac172c992aaaeaa8846ab3c96905e9ac9 Mon Sep 17 00:00:00 2001 From: Kaiwen Hu Date: Thu, 14 Jul 2022 21:59:17 +0800 Subject: [PATCH 326/674] btrfs: prevent subvol with swapfile from being deleted stable inclusion from stable-v5.10.111 commit a044bca8ef310bd43e3c683a1b1df66f96fa9e0a category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=a044bca8ef310bd43e3c683a1b1df66f96fa9e0a -------------------------------- commit 60021bd754c6ca0addc6817994f20290a321d8d6 upstream. A subvolume with an active swapfile must not be deleted otherwise it would not be possible to deactivate it. After the subvolume is deleted, we cannot swapoff the swapfile in this deleted subvolume because the path is unreachable. The swapfile is still active and holding references, the filesystem cannot be unmounted. The test looks like this: mkfs.btrfs -f $dev > /dev/null mount $dev $mnt btrfs sub create $mnt/subvol touch $mnt/subvol/swapfile chmod 600 $mnt/subvol/swapfile chattr +C $mnt/subvol/swapfile dd if=/dev/zero of=$mnt/subvol/swapfile bs=1K count=4096 mkswap $mnt/subvol/swapfile swapon $mnt/subvol/swapfile btrfs sub delete $mnt/subvol swapoff $mnt/subvol/swapfile # failed: No such file or directory swapoff --all unmount $mnt # target is busy. To prevent above issue, we simply check that whether the subvolume contains any active swapfile, and stop the deleting process. This behavior is like snapshot ioctl dealing with a swapfile. CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Robbie Ko Reviewed-by: Qu Wenruo Reviewed-by: Filipe Manana Signed-off-by: Kaiwen Hu Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- fs/btrfs/inode.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1d9262a35473..f7f4ac01589b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4023,6 +4023,13 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) dest->root_key.objectid); return -EPERM; } + if (atomic_read(&dest->nr_swapfiles)) { + spin_unlock(&dest->root_item_lock); + btrfs_warn(fs_info, + "attempt to delete subvolume %llu with active swapfile", + root->root_key.objectid); + return -EPERM; + } root_flags = btrfs_root_flags(&dest->root_item); btrfs_set_root_flags(&dest->root_item, root_flags | BTRFS_ROOT_SUBVOL_DEAD); @@ -10215,8 +10222,23 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, * set. We use this counter to prevent snapshots. We must increment it * before walking the extents because we don't want a concurrent * snapshot to run after we've already checked the extents. + * + * It is possible that subvolume is marked for deletion but still not + * removed yet. To prevent this race, we check the root status before + * activating the swapfile. */ + spin_lock(&root->root_item_lock); + if (btrfs_root_dead(root)) { + spin_unlock(&root->root_item_lock); + + btrfs_exclop_finish(fs_info); + btrfs_warn(fs_info, + "cannot activate swapfile because subvolume %llu is being deleted", + root->root_key.objectid); + return -EPERM; + } atomic_inc(&root->nr_swapfiles); + spin_unlock(&root->root_item_lock); isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize); -- Gitee From 46529dec611c6d97453432ea503a0080ae79a783 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Thu, 14 Jul 2022 21:59:18 +0800 Subject: [PATCH 327/674] arm64: patch_text: Fixup last cpu should be master stable inclusion from stable-v5.10.111 commit 8bb41682911f8d3d0874af6646406856290565a9 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=8bb41682911f8d3d0874af6646406856290565a9 -------------------------------- commit 31a099dbd91e69fcab55eef4be15ed7a8c984918 upstream. These patch_text implementations are using stop_machine_cpuslocked infrastructure with atomic cpu_count. The original idea: When the master CPU patch_text, the others should wait for it. But current implementation is using the first CPU as master, which couldn't guarantee the remaining CPUs are waiting. This patch changes the last CPU as the master to solve the potential risk. Fixes: ae16480785de ("arm64: introduce interfaces to hotpatch kernel and module code") Signed-off-by: Guo Ren Signed-off-by: Guo Ren Reviewed-by: Catalin Marinas Reviewed-by: Masami Hiramatsu Cc: Link: https://lore.kernel.org/r/20220407073323.743224-2-guoren@kernel.org Signed-off-by: Will Deacon Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- arch/arm64/kernel/insn.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c index 6c0de2f60ea9..7d4fdf974542 100644 --- a/arch/arm64/kernel/insn.c +++ b/arch/arm64/kernel/insn.c @@ -216,8 +216,8 @@ static int __kprobes aarch64_insn_patch_text_cb(void *arg) int i, ret = 0; struct aarch64_insn_patch *pp = arg; - /* The first CPU becomes master */ - if (atomic_inc_return(&pp->cpu_count) == 1) { + /* The last CPU becomes master */ + if (atomic_inc_return(&pp->cpu_count) == num_online_cpus()) { for (i = 0; ret == 0 && i < pp->insn_cnt; i++) ret = aarch64_insn_patch_text_nosync(pp->text_addrs[i], pp->new_insns[i]); -- Gitee From f71259712007a69998415360475dfe01721c728d Mon Sep 17 00:00:00 2001 From: Douglas Miller Date: Thu, 14 Jul 2022 21:59:19 +0800 Subject: [PATCH 328/674] RDMA/hfi1: Fix use-after-free bug for mm struct stable inclusion from stable-v5.10.111 commit 5f54364ff6cfcd14cddf5441c4a490bb28dd69f7 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=5f54364ff6cfcd14cddf5441c4a490bb28dd69f7 -------------------------------- commit 2bbac98d0930e8161b1957dc0ec99de39ade1b3c upstream. Under certain conditions, such as MPI_Abort, the hfi1 cleanup code may represent the last reference held on the task mm. hfi1_mmu_rb_unregister() then drops the last reference and the mm is freed before the final use in hfi1_release_user_pages(). A new task may allocate the mm structure while it is still being used, resulting in problems. One manifestation is corruption of the mmap_sem counter leading to a hang in down_write(). Another is corruption of an mm struct that is in use by another task. Fixes: 3d2a9d642512 ("IB/hfi1: Ensure correct mm is used at all times") Link: https://lore.kernel.org/r/20220408133523.122165.72975.stgit@awfm-01.cornelisnetworks.com Cc: Signed-off-by: Douglas Miller Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/infiniband/hw/hfi1/mmu_rb.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c index d213f65d4cdd..ed8a96ae61ce 100644 --- a/drivers/infiniband/hw/hfi1/mmu_rb.c +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c @@ -121,6 +121,9 @@ void hfi1_mmu_rb_unregister(struct mmu_rb_handler *handler) unsigned long flags; struct list_head del_list; + /* Prevent freeing of mm until we are completely finished. */ + mmgrab(handler->mn.mm); + /* Unregister first so we don't get any more notifications. */ mmu_notifier_unregister(&handler->mn, handler->mn.mm); @@ -143,6 +146,9 @@ void hfi1_mmu_rb_unregister(struct mmu_rb_handler *handler) do_remove(handler, &del_list); + /* Now the mm may be freed. */ + mmdrop(handler->mn.mm); + kfree(handler); } -- Gitee From b20392ad0b9b08a9940ef5817929710e1637e9cd Mon Sep 17 00:00:00 2001 From: Shreeya Patel Date: Thu, 14 Jul 2022 21:59:20 +0800 Subject: [PATCH 329/674] gpio: Restrict usage of GPIO chip irq members before initialization stable inclusion from stable-v5.10.111 commit 7e88a50704b0c49ad3f2d11e8b963341cf68a89f category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=7e88a50704b0c49ad3f2d11e8b963341cf68a89f -------------------------------- commit 5467801f1fcbdc46bc7298a84dbf3ca1ff2a7320 upstream. GPIO chip irq members are exposed before they could be completely initialized and this leads to race conditions. One such issue was observed for the gc->irq.domain variable which was accessed through the I2C interface in gpiochip_to_irq() before it could be initialized by gpiochip_add_irqchip(). This resulted in Kernel NULL pointer dereference. Following are the logs for reference :- kernel: Call Trace: kernel: gpiod_to_irq+0x53/0x70 kernel: acpi_dev_gpio_irq_get_by+0x113/0x1f0 kernel: i2c_acpi_get_irq+0xc0/0xd0 kernel: i2c_device_probe+0x28a/0x2a0 kernel: really_probe+0xf2/0x460 kernel: RIP: 0010:gpiochip_to_irq+0x47/0xc0 To avoid such scenarios, restrict usage of GPIO chip irq members before they are completely initialized. Signed-off-by: Shreeya Patel Cc: stable@vger.kernel.org Reviewed-by: Andy Shevchenko Reviewed-by: Linus Walleij Signed-off-by: Bartosz Golaszewski Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/gpio/gpiolib.c | 19 +++++++++++++++++++ include/linux/gpio/driver.h | 9 +++++++++ 2 files changed, 28 insertions(+) diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 00526fdd7691..d18078748200 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -1411,6 +1411,16 @@ static int gpiochip_to_irq(struct gpio_chip *gc, unsigned offset) { struct irq_domain *domain = gc->irq.domain; +#ifdef CONFIG_GPIOLIB_IRQCHIP + /* + * Avoid race condition with other code, which tries to lookup + * an IRQ before the irqchip has been properly registered, + * i.e. while gpiochip is still being brought up. + */ + if (!gc->irq.initialized) + return -EPROBE_DEFER; +#endif + if (!gpiochip_irqchip_irq_valid(gc, offset)) return -ENXIO; @@ -1604,6 +1614,15 @@ static int gpiochip_add_irqchip(struct gpio_chip *gc, acpi_gpiochip_request_interrupts(gc); + /* + * Using barrier() here to prevent compiler from reordering + * gc->irq.initialized before initialization of above + * GPIO chip irq members. + */ + barrier(); + + gc->irq.initialized = true; + return 0; } diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 8e144306e262..b216899b4745 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -224,6 +224,15 @@ struct gpio_irq_chip { unsigned long *valid_mask, unsigned int ngpios); + /** + * @initialized: + * + * Flag to track GPIO chip irq member's initialization. + * This flag will make sure GPIO chip irq members are not used + * before they are initialized. + */ + bool initialized; + /** * @valid_mask: * -- Gitee From 2a1074ace4931c8d4355fc8d897faea5e867f35d Mon Sep 17 00:00:00 2001 From: Christian Lamparter Date: Thu, 14 Jul 2022 21:59:21 +0800 Subject: [PATCH 330/674] ata: sata_dwc_460ex: Fix crash due to OOB write stable inclusion from stable-v5.10.111 commit fc629224aa62f23849cae83717932985ac51232d category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=fc629224aa62f23849cae83717932985ac51232d -------------------------------- commit 7aa8104a554713b685db729e66511b93d989dd6a upstream. the driver uses libata's "tag" values from in various arrays. Since the mentioned patch bumped the ATA_TAG_INTERNAL to 32, the value of the SATA_DWC_QCMD_MAX needs to account for that. Otherwise ATA_TAG_INTERNAL usage cause similar crashes like this as reported by Tice Rex on the OpenWrt Forum and reproduced (with symbols) here: | BUG: Kernel NULL pointer dereference at 0x00000000 | Faulting instruction address: 0xc03ed4b8 | Oops: Kernel access of bad area, sig: 11 [#1] | BE PAGE_SIZE=4K PowerPC 44x Platform | CPU: 0 PID: 362 Comm: scsi_eh_1 Not tainted 5.4.163 #0 | NIP: c03ed4b8 LR: c03d27e8 CTR: c03ed36c | REGS: cfa59950 TRAP: 0300 Not tainted (5.4.163) | MSR: 00021000 CR: 42000222 XER: 00000000 | DEAR: 00000000 ESR: 00000000 | GPR00: c03d27e8 cfa59a08 cfa55fe0 00000000 0fa46bc0 [...] | [..] | NIP [c03ed4b8] sata_dwc_qc_issue+0x14c/0x254 | LR [c03d27e8] ata_qc_issue+0x1c8/0x2dc | Call Trace: | [cfa59a08] [c003f4e0] __cancel_work_timer+0x124/0x194 (unreliable) | [cfa59a78] [c03d27e8] ata_qc_issue+0x1c8/0x2dc | [cfa59a98] [c03d2b3c] ata_exec_internal_sg+0x240/0x524 | [cfa59b08] [c03d2e98] ata_exec_internal+0x78/0xe0 | [cfa59b58] [c03d30fc] ata_read_log_page.part.38+0x1dc/0x204 | [cfa59bc8] [c03d324c] ata_identify_page_supported+0x68/0x130 | [...] This is because sata_dwc_dma_xfer_complete() NULLs the dma_pending's next neighbour "chan" (a *dma_chan struct) in this '32' case right here (line ~735): > hsdevp->dma_pending[tag] = SATA_DWC_DMA_PENDING_NONE; Then the next time, a dma gets issued; dma_dwc_xfer_setup() passes the NULL'd hsdevp->chan to the dmaengine_slave_config() which then causes the crash. With this patch, SATA_DWC_QCMD_MAX is now set to ATA_MAX_QUEUE + 1. This avoids the OOB. But please note, there was a worthwhile discussion on what ATA_TAG_INTERNAL and ATA_MAX_QUEUE is. And why there should not be a "fake" 33 command-long queue size. Ideally, the dw driver should account for the ATA_TAG_INTERNAL. In Damien Le Moal's words: "... having looked at the driver, it is a bigger change than just faking a 33rd "tag" that is in fact not a command tag at all." Fixes: 28361c403683c ("libata: add extra internal command") Cc: stable@kernel.org # 4.18+ BugLink: https://github.com/openwrt/openwrt/issues/9505 Signed-off-by: Christian Lamparter Signed-off-by: Damien Le Moal Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/ata/sata_dwc_460ex.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/ata/sata_dwc_460ex.c b/drivers/ata/sata_dwc_460ex.c index 982fe9112532..464260f66870 100644 --- a/drivers/ata/sata_dwc_460ex.c +++ b/drivers/ata/sata_dwc_460ex.c @@ -145,7 +145,11 @@ struct sata_dwc_device { #endif }; -#define SATA_DWC_QCMD_MAX 32 +/* + * Allow one extra special slot for commands and DMA management + * to account for libata internal commands. + */ +#define SATA_DWC_QCMD_MAX (ATA_MAX_QUEUE + 1) struct sata_dwc_device_port { struct sata_dwc_device *hsdev; -- Gitee From 14b1b10261db1b384c4c1e3797f8618cf80cc71b Mon Sep 17 00:00:00 2001 From: Xiaomeng Tong Date: Thu, 14 Jul 2022 21:59:22 +0800 Subject: [PATCH 331/674] perf: qcom_l2_pmu: fix an incorrect NULL check on list iterator stable inclusion from stable-v5.10.111 commit 451214b266e949b219d27faa9f44b434086a2f92 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=451214b266e949b219d27faa9f44b434086a2f92 -------------------------------- commit 2012a9e279013933885983cbe0a5fe828052563b upstream. The bug is here: return cluster; The list iterator value 'cluster' will *always* be set and non-NULL by list_for_each_entry(), so it is incorrect to assume that the iterator value will be NULL if the list is empty or no element is found. To fix the bug, return 'cluster' when found, otherwise return NULL. Cc: stable@vger.kernel.org Fixes: 21bdbb7102ed ("perf: add qcom l2 cache perf events driver") Signed-off-by: Xiaomeng Tong Link: https://lore.kernel.org/r/20220327055733.4070-1-xiam0nd.tong@gmail.com Signed-off-by: Will Deacon Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/perf/qcom_l2_pmu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/perf/qcom_l2_pmu.c b/drivers/perf/qcom_l2_pmu.c index 23a0e008dafa..d58810f53727 100644 --- a/drivers/perf/qcom_l2_pmu.c +++ b/drivers/perf/qcom_l2_pmu.c @@ -739,7 +739,7 @@ static struct cluster_pmu *l2_cache_associate_cpu_with_cluster( { u64 mpidr; int cpu_cluster_id; - struct cluster_pmu *cluster = NULL; + struct cluster_pmu *cluster; /* * This assumes that the cluster_id is in MPIDR[aff1] for @@ -761,10 +761,10 @@ static struct cluster_pmu *l2_cache_associate_cpu_with_cluster( cluster->cluster_id); cpumask_set_cpu(cpu, &cluster->cluster_cpus); *per_cpu_ptr(l2cache_pmu->pmu_cluster, cpu) = cluster; - break; + return cluster; } - return cluster; + return NULL; } static int l2cache_pmu_online_cpu(unsigned int cpu, struct hlist_node *node) -- Gitee From 0591f31b34eb378187203f76d4e94d76e30f8a06 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 14 Jul 2022 21:59:23 +0800 Subject: [PATCH 332/674] irqchip/gic-v3: Fix GICR_CTLR.RWP polling stable inclusion from stable-v5.10.111 commit ff24114bb08d8b90edf2aff0a4fd0689523e6c17 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=ff24114bb08d8b90edf2aff0a4fd0689523e6c17 -------------------------------- commit 0df6664531a12cdd8fc873f0cac0dcb40243d3e9 upstream. It turns out that our polling of RWP is totally wrong when checking for it in the redistributors, as we test the *distributor* bit index, whereas it is a different bit number in the RDs... Oopsie boo. This is embarassing. Not only because it is wrong, but also because it took *8 years* to notice the blunder... Just fix the damn thing. Fixes: 021f653791ad ("irqchip: gic-v3: Initial support for GICv3") Signed-off-by: Marc Zyngier Cc: stable@vger.kernel.org Reviewed-by: Andre Przywara Reviewed-by: Lorenzo Pieralisi Link: https://lore.kernel.org/r/20220315165034.794482-2-maz@kernel.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/irqchip/irq-gic-v3.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index aa8facbdf407..130b19aeb679 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -206,11 +206,11 @@ static inline void __iomem *gic_dist_base(struct irq_data *d) } } -static void gic_do_wait_for_rwp(void __iomem *base) +static void gic_do_wait_for_rwp(void __iomem *base, u32 bit) { u32 count = 1000000; /* 1s! */ - while (readl_relaxed(base + GICD_CTLR) & GICD_CTLR_RWP) { + while (readl_relaxed(base + GICD_CTLR) & bit) { count--; if (!count) { pr_err_ratelimited("RWP timeout, gone fishing\n"); @@ -224,13 +224,13 @@ static void gic_do_wait_for_rwp(void __iomem *base) /* Wait for completion of a distributor change */ static void gic_dist_wait_for_rwp(void) { - gic_do_wait_for_rwp(gic_data.dist_base); + gic_do_wait_for_rwp(gic_data.dist_base, GICD_CTLR_RWP); } /* Wait for completion of a redistributor change */ static void gic_redist_wait_for_rwp(void) { - gic_do_wait_for_rwp(gic_data_rdist_rd_base()); + gic_do_wait_for_rwp(gic_data_rdist_rd_base(), GICR_CTLR_RWP); } #ifdef CONFIG_ARM64 -- Gitee From bf55f83f44824afe860c4772d938a4a64ebd97e6 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 14 Jul 2022 21:59:24 +0800 Subject: [PATCH 333/674] drm/amdgpu/smu10: fix SoC/fclk units in auto mode stable inclusion from stable-v5.10.111 commit 786ae8de3a5e556c12dc548a048715453ccec20d category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=786ae8de3a5e556c12dc548a048715453ccec20d -------------------------------- commit 2f25d8ce09b7ba5d769c132ba3d4eb84a941d2cb upstream. SMU takes clock limits in Mhz units. socclk and fclk were using 10 khz units in some cases. Switch to Mhz units. Fixes higher than required SoC clocks. Fixes: 97cf32996c46d9 ("drm/amd/pm: Removed fixed clock in auto mode DPM") Reviewed-by: Paul Menzel Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu10_hwmgr.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu10_hwmgr.c b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu10_hwmgr.c index e6f40ee9f313..9d97938bd49e 100644 --- a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu10_hwmgr.c +++ b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu10_hwmgr.c @@ -709,13 +709,13 @@ static int smu10_dpm_force_dpm_level(struct pp_hwmgr *hwmgr, smum_send_msg_to_smc_with_parameter(hwmgr, PPSMC_MSG_SetHardMinFclkByFreq, hwmgr->display_config->num_display > 3 ? - data->clock_vol_info.vdd_dep_on_fclk->entries[0].clk : + (data->clock_vol_info.vdd_dep_on_fclk->entries[0].clk / 100) : min_mclk, NULL); smum_send_msg_to_smc_with_parameter(hwmgr, PPSMC_MSG_SetHardMinSocclkByFreq, - data->clock_vol_info.vdd_dep_on_socclk->entries[0].clk, + data->clock_vol_info.vdd_dep_on_socclk->entries[0].clk / 100, NULL); smum_send_msg_to_smc_with_parameter(hwmgr, PPSMC_MSG_SetHardMinVcn, @@ -728,11 +728,11 @@ static int smu10_dpm_force_dpm_level(struct pp_hwmgr *hwmgr, NULL); smum_send_msg_to_smc_with_parameter(hwmgr, PPSMC_MSG_SetSoftMaxFclkByFreq, - data->clock_vol_info.vdd_dep_on_fclk->entries[index_fclk].clk, + data->clock_vol_info.vdd_dep_on_fclk->entries[index_fclk].clk / 100, NULL); smum_send_msg_to_smc_with_parameter(hwmgr, PPSMC_MSG_SetSoftMaxSocclkByFreq, - data->clock_vol_info.vdd_dep_on_socclk->entries[index_socclk].clk, + data->clock_vol_info.vdd_dep_on_socclk->entries[index_socclk].clk / 100, NULL); smum_send_msg_to_smc_with_parameter(hwmgr, PPSMC_MSG_SetSoftMaxVcn, -- Gitee From 2802838b83da289166b5a8a4e385398582fcaeaf Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Thu, 14 Jul 2022 21:59:25 +0800 Subject: [PATCH 334/674] drm/nouveau/pmu: Add missing callbacks for Tegra devices stable inclusion from stable-v5.10.111 commit 326b408e7ec7b1715bcd81d1d0861256eebf8890 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=326b408e7ec7b1715bcd81d1d0861256eebf8890 -------------------------------- commit 38d4e5cf5b08798f093374e53c2f4609d5382dd5 upstream. Fixes a crash booting on those platforms with nouveau. Fixes: 4cdd2450bf73 ("drm/nouveau/pmu/gm200-: use alternate falcon reset sequence") Cc: Ben Skeggs Cc: Karol Herbst Cc: dri-devel@lists.freedesktop.org Cc: nouveau@lists.freedesktop.org Cc: # v5.17+ Signed-off-by: Karol Herbst Reviewed-by: Lyude Paul Link: https://patchwork.freedesktop.org/patch/msgid/20220322124800.2605463-1-kherbst@redhat.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/gpu/drm/nouveau/nvkm/subdev/pmu/gm20b.c | 1 + drivers/gpu/drm/nouveau/nvkm/subdev/pmu/gp102.c | 2 +- drivers/gpu/drm/nouveau/nvkm/subdev/pmu/gp10b.c | 1 + drivers/gpu/drm/nouveau/nvkm/subdev/pmu/priv.h | 1 + 4 files changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/pmu/gm20b.c b/drivers/gpu/drm/nouveau/nvkm/subdev/pmu/gm20b.c index 7938722b4da1..d82529becfdc 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/pmu/gm20b.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/pmu/gm20b.c @@ -216,6 +216,7 @@ gm20b_pmu = { .intr = gt215_pmu_intr, .recv = gm20b_pmu_recv, .initmsg = gm20b_pmu_initmsg, + .reset = gf100_pmu_reset, }; #if IS_ENABLED(CONFIG_ARCH_TEGRA_210_SOC) diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/pmu/gp102.c b/drivers/gpu/drm/nouveau/nvkm/subdev/pmu/gp102.c index 3dfb3e8522f6..9f32982216b6 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/pmu/gp102.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/pmu/gp102.c @@ -23,7 +23,7 @@ */ #include "priv.h" -static void +void gp102_pmu_reset(struct nvkm_pmu *pmu) { struct nvkm_device *device = pmu->subdev.device; diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/pmu/gp10b.c b/drivers/gpu/drm/nouveau/nvkm/subdev/pmu/gp10b.c index 7f5f9d544836..0bd4b32ad863 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/pmu/gp10b.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/pmu/gp10b.c @@ -83,6 +83,7 @@ gp10b_pmu = { .intr = gt215_pmu_intr, .recv = gm20b_pmu_recv, .initmsg = gm20b_pmu_initmsg, + .reset = gp102_pmu_reset, }; #if IS_ENABLED(CONFIG_ARCH_TEGRA_210_SOC) diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/pmu/priv.h b/drivers/gpu/drm/nouveau/nvkm/subdev/pmu/priv.h index b945ec320cd2..80c4cb861d40 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/pmu/priv.h +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/pmu/priv.h @@ -41,6 +41,7 @@ int gt215_pmu_send(struct nvkm_pmu *, u32[2], u32, u32, u32, u32); bool gf100_pmu_enabled(struct nvkm_pmu *); void gf100_pmu_reset(struct nvkm_pmu *); +void gp102_pmu_reset(struct nvkm_pmu *pmu); void gk110_pmu_pgob(struct nvkm_pmu *, bool); -- Gitee From 092160affba0ccae99f68cb2ddc3520cb35ee904 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Thu, 14 Jul 2022 21:59:26 +0800 Subject: [PATCH 335/674] drm/amdkfd: Create file descriptor after client is added to smi_clients list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from stable-v5.10.111 commit 82e43950143cc92534c359f5fa42092c928930e7 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=82e43950143cc92534c359f5fa42092c928930e7 -------------------------------- commit e79a2398e1b2d47060474dca291542368183bc0f upstream. This ensures userspace cannot prematurely clean-up the client before it is fully initialised which has been proven to cause issues in the past. Cc: Felix Kuehling Cc: Alex Deucher Cc: "Christian König" Cc: "Pan, Xinhui" Cc: David Airlie Cc: Daniel Vetter Cc: amd-gfx@lists.freedesktop.org Cc: dri-devel@lists.freedesktop.org Signed-off-by: Lee Jones Reviewed-by: Felix Kuehling Signed-off-by: Felix Kuehling Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 24 +++++++++++++-------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c index 17d1736367ea..bd4caa36ab2e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c @@ -270,15 +270,6 @@ int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd) return ret; } - ret = anon_inode_getfd(kfd_smi_name, &kfd_smi_ev_fops, (void *)client, - O_RDWR); - if (ret < 0) { - kfifo_free(&client->fifo); - kfree(client); - return ret; - } - *fd = ret; - init_waitqueue_head(&client->wait_queue); spin_lock_init(&client->lock); client->events = 0; @@ -288,5 +279,20 @@ int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd) list_add_rcu(&client->list, &dev->smi_clients); spin_unlock(&dev->smi_lock); + ret = anon_inode_getfd(kfd_smi_name, &kfd_smi_ev_fops, (void *)client, + O_RDWR); + if (ret < 0) { + spin_lock(&dev->smi_lock); + list_del_rcu(&client->list); + spin_unlock(&dev->smi_lock); + + synchronize_rcu(); + + kfifo_free(&client->fifo); + kfree(client); + return ret; + } + *fd = ret; + return 0; } -- Gitee From 2695cd91c767577de371da7aadea900d6aba588e Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 14 Jul 2022 21:59:27 +0800 Subject: [PATCH 336/674] perf build: Don't use -ffat-lto-objects in the python feature test when building with clang-13 stable inclusion from stable-v5.10.111 commit 79abc219bafdb27fa1b4e788757c71eed1003d6a category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=79abc219bafdb27fa1b4e788757c71eed1003d6a -------------------------------- commit 3a8a0475861a443f02e3a9b57d044fe2a0a99291 upstream. Using -ffat-lto-objects in the python feature test when building with clang-13 results in: clang-13: error: optimization flag '-ffat-lto-objects' is not supported [-Werror,-Wignored-optimization-argument] error: command '/usr/sbin/clang' failed with exit code 1 cp: cannot stat '/tmp/build/perf/python_ext_build/lib/perf*.so': No such file or directory make[2]: *** [Makefile.perf:639: /tmp/build/perf/python/perf.so] Error 1 Noticed when building on a docker.io/library/archlinux:base container. Cc: Adrian Hunter Cc: Fangrui Song Cc: Florian Fainelli Cc: Ian Rogers Cc: Jiri Olsa Cc: John Keeping Cc: Leo Yan Cc: Michael Petlan Cc: Namhyung Kim Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Sedat Dilek Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- tools/perf/Makefile.config | 3 +++ tools/perf/util/setup.py | 2 ++ 2 files changed, 5 insertions(+) diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index 99266f1da1b2..1e77304d1b42 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -261,6 +261,9 @@ ifdef PYTHON_CONFIG PYTHON_EMBED_LIBADD := $(call grep-libs,$(PYTHON_EMBED_LDOPTS)) -lutil PYTHON_EMBED_CCOPTS := $(shell $(PYTHON_CONFIG_SQ) --includes 2>/dev/null) FLAGS_PYTHON_EMBED := $(PYTHON_EMBED_CCOPTS) $(PYTHON_EMBED_LDOPTS) + ifeq ($(CC_NO_CLANG), 0) + PYTHON_EMBED_CCOPTS := $(filter-out -ffat-lto-objects, $(PYTHON_EMBED_CCOPTS)) + endif endif FEATURE_CHECK_CFLAGS-libpython := $(PYTHON_EMBED_CCOPTS) diff --git a/tools/perf/util/setup.py b/tools/perf/util/setup.py index c5e3e9a68162..78e8b31b3651 100644 --- a/tools/perf/util/setup.py +++ b/tools/perf/util/setup.py @@ -23,6 +23,8 @@ if cc_is_clang: vars[var] = sub("-fstack-protector-strong", "", vars[var]) if not clang_has_option("-fno-semantic-interposition"): vars[var] = sub("-fno-semantic-interposition", "", vars[var]) + if not clang_has_option("-ffat-lto-objects"): + vars[var] = sub("-ffat-lto-objects", "", vars[var]) from distutils.core import setup, Extension -- Gitee From 92fd030ba22b7a1ad73cf9a02c351bd9a7df3438 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 14 Jul 2022 21:59:28 +0800 Subject: [PATCH 337/674] perf python: Fix probing for some clang command line options stable inclusion from stable-v5.10.111 commit 6374faf49e89f4283c6af4f74d5928ef0a657714 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=6374faf49e89f4283c6af4f74d5928ef0a657714 -------------------------------- commit dd6e1fe91cdd52774ca642d1da75b58a86356b56 upstream. The clang compiler complains about some options even without a source file being available, while others require one, so use the simple tools/build/feature/test-hello.c file. Then check for the "is not supported" string in its output, in addition to the "unknown argument" already being looked for. This was noticed when building with clang-13 where -ffat-lto-objects isn't supported and since we were looking just for "unknown argument" and not providing a source code to clang, was mistakenly assumed as being available and not being filtered to set of command line options provided to clang, leading to a build failure. Cc: Adrian Hunter Cc: Fangrui Song Cc: Florian Fainelli Cc: Ian Rogers Cc: Jiri Olsa Cc: John Keeping Cc: Leo Yan Cc: Michael Petlan Cc: Namhyung Kim Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Sedat Dilek Link: http://lore.kernel.org/lkml/ Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- tools/perf/util/setup.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/setup.py b/tools/perf/util/setup.py index 78e8b31b3651..b670469a8124 100644 --- a/tools/perf/util/setup.py +++ b/tools/perf/util/setup.py @@ -1,12 +1,14 @@ -from os import getenv +from os import getenv, path from subprocess import Popen, PIPE from re import sub cc = getenv("CC") cc_is_clang = b"clang version" in Popen([cc.split()[0], "-v"], stderr=PIPE).stderr.readline() +src_feature_tests = getenv('srctree') + '/tools/build/feature' def clang_has_option(option): - return [o for o in Popen([cc, option], stderr=PIPE).stderr.readlines() if b"unknown argument" in o] == [ ] + cc_output = Popen([cc, option, path.join(src_feature_tests, "test-hello.c") ], stderr=PIPE).stderr.readlines() + return [o for o in cc_output if ((b"unknown argument" in o) or (b"is not supported" in o))] == [ ] if cc_is_clang: from distutils.sysconfig import get_config_vars -- Gitee From b7f76fdc7a1dd722887136255282e674efab7bad Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 14 Jul 2022 21:59:29 +0800 Subject: [PATCH 338/674] tools build: Filter out options and warnings not supported by clang stable inclusion from stable-v5.10.111 commit 75c8558d410f6cedc96f6f701f7ced1c8aced6fc category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=75c8558d410f6cedc96f6f701f7ced1c8aced6fc -------------------------------- commit 41caff459a5b956b3e23ba9ca759dd0629ad3dda upstream. These make the feature check fail when using clang, so remove them just like is done in tools/perf/Makefile.config to build perf itself. Adding -Wno-compound-token-split-by-macro to tools/perf/Makefile.config when building with clang is also necessary to avoid these warnings turned into errors (-Werror): CC /tmp/build/perf/util/scripting-engines/trace-event-perl.o In file included from util/scripting-engines/trace-event-perl.c:35: In file included from /usr/lib64/perl5/CORE/perl.h:4085: In file included from /usr/lib64/perl5/CORE/hv.h:659: In file included from /usr/lib64/perl5/CORE/hv_func.h:34: In file included from /usr/lib64/perl5/CORE/sbox32_hash.h:4: /usr/lib64/perl5/CORE/zaphod32_hash.h:150:5: error: '(' and '{' tokens introducing statement expression appear in different macro expansion contexts [-Werror,-Wcompound-token-split-by-macro] ZAPHOD32_SCRAMBLE32(state[0],0x9fade23b); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ /usr/lib64/perl5/CORE/zaphod32_hash.h:80:38: note: expanded from macro 'ZAPHOD32_SCRAMBLE32' #define ZAPHOD32_SCRAMBLE32(v,prime) STMT_START { \ ^~~~~~~~~~ /usr/lib64/perl5/CORE/perl.h:737:29: note: expanded from macro 'STMT_START' # define STMT_START (void)( /* gcc supports "({ STATEMENTS; })" */ ^ /usr/lib64/perl5/CORE/zaphod32_hash.h:150:5: note: '{' token is here ZAPHOD32_SCRAMBLE32(state[0],0x9fade23b); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ /usr/lib64/perl5/CORE/zaphod32_hash.h:80:49: note: expanded from macro 'ZAPHOD32_SCRAMBLE32' #define ZAPHOD32_SCRAMBLE32(v,prime) STMT_START { \ ^ /usr/lib64/perl5/CORE/zaphod32_hash.h:150:5: error: '}' and ')' tokens terminating statement expression appear in different macro expansion contexts [-Werror,-Wcompound-token-split-by-macro] ZAPHOD32_SCRAMBLE32(state[0],0x9fade23b); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ /usr/lib64/perl5/CORE/zaphod32_hash.h:87:41: note: expanded from macro 'ZAPHOD32_SCRAMBLE32' v ^= (v>>23); \ ^ /usr/lib64/perl5/CORE/zaphod32_hash.h:150:5: note: ')' token is here ZAPHOD32_SCRAMBLE32(state[0],0x9fade23b); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ /usr/lib64/perl5/CORE/zaphod32_hash.h:88:3: note: expanded from macro 'ZAPHOD32_SCRAMBLE32' } STMT_END ^~~~~~~~ /usr/lib64/perl5/CORE/perl.h:738:21: note: expanded from macro 'STMT_END' # define STMT_END ) ^ Please refer to the discussion on the Link: tag below, where Nathan clarifies the situation: acme> And then get to the problems at the end of this message, which seem acme> similar to the problem described here: acme> acme> From Nathan Chancellor <> acme> Subject [PATCH] mwifiex: Remove unnecessary braces from HostCmd_SET_SEQ_NO_BSS_INFO acme> acme> https://lkml.org/lkml/2020/9/1/135 acme> acme> So perhaps in this case its better to disable that acme> -Werror,-Wcompound-token-split-by-macro when building with clang? Yes, I think that is probably the best solution. As far as I can tell, at least in this file and context, the warning appears harmless, as the "create a GNU C statement expression from two different macros" is very much intentional, based on the presence of PERL_USE_GCC_BRACE_GROUPS. The warning is fixed in upstream Perl by just avoiding creating GNU C statement expressions using STMT_START and STMT_END: https://github.com/Perl/perl5/issues/18780 https://github.com/Perl/perl5/pull/18984 If I am reading the source code correctly, an alternative to disabling the warning would be specifying -DPERL_GCC_BRACE_GROUPS_FORBIDDEN but it seems like that might end up impacting more than just this site, according to the issue discussion above. Based-on-a-patch-by: Sedat Dilek Tested-by: Sedat Dilek # Debian/Selfmade LLVM-14 (x86-64) Cc: Adrian Hunter Cc: Fangrui Song Cc: Florian Fainelli Cc: Ian Rogers Cc: Jiri Olsa Cc: John Keeping Cc: Leo Yan Cc: Michael Petlan Cc: Namhyung Kim Cc: Nathan Chancellor Cc: Nick Desaulniers Link: http://lore.kernel.org/lkml/YkxWcYzph5pC1EK8@kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- tools/build/feature/Makefile | 7 +++++++ tools/perf/Makefile.config | 3 +++ 2 files changed, 10 insertions(+) diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile index d45ec15ac383..09f47aff9012 100644 --- a/tools/build/feature/Makefile +++ b/tools/build/feature/Makefile @@ -217,6 +217,13 @@ PERL_EMBED_LIBADD = $(call grep-libs,$(PERL_EMBED_LDOPTS)) PERL_EMBED_CCOPTS = `perl -MExtUtils::Embed -e ccopts 2>/dev/null` FLAGS_PERL_EMBED=$(PERL_EMBED_CCOPTS) $(PERL_EMBED_LDOPTS) +ifeq ($(CC_NO_CLANG), 0) + PERL_EMBED_LDOPTS := $(filter-out -specs=%,$(PERL_EMBED_LDOPTS)) + PERL_EMBED_CCOPTS := $(filter-out -flto=auto -ffat-lto-objects, $(PERL_EMBED_CCOPTS)) + PERL_EMBED_CCOPTS := $(filter-out -specs=%,$(PERL_EMBED_CCOPTS)) + FLAGS_PERL_EMBED += -Wno-compound-token-split-by-macro +endif + $(OUTPUT)test-libperl.bin: $(BUILD) $(FLAGS_PERL_EMBED) diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index 1e77304d1b42..78700c7ec7df 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -776,6 +776,9 @@ else LDFLAGS += $(PERL_EMBED_LDFLAGS) EXTLIBS += $(PERL_EMBED_LIBADD) CFLAGS += -DHAVE_LIBPERL_SUPPORT + ifeq ($(CC_NO_CLANG), 0) + CFLAGS += -Wno-compound-token-split-by-macro + endif $(call detected,CONFIG_LIBPERL) endif endif -- Gitee From 1a568a6171d30d1a0d1c51d724b71ed20cfb9013 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 14 Jul 2022 21:59:30 +0800 Subject: [PATCH 339/674] tools build: Use $(shell ) instead of `` to get embedded libperl's ccopts stable inclusion from stable-v5.10.111 commit 40e00885a61f6d31c76f32d05f4c87b3fc272720 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=40e00885a61f6d31c76f32d05f4c87b3fc272720 -------------------------------- commit 541f695cbcb6932c22638b06e0cbe1d56177e2e9 upstream. Just like its done for ldopts and for both in tools/perf/Makefile.config. Using `` to initialize PERL_EMBED_CCOPTS somehow precludes using: $(filter-out SOMETHING_TO_FILTER,$(PERL_EMBED_CCOPTS)) And we need to do it to allow for building with versions of clang where some gcc options selected by distros are not available. Tested-by: Sedat Dilek # Debian/Selfmade LLVM-14 (x86-64) Cc: Adrian Hunter Cc: Fangrui Song Cc: Florian Fainelli Cc: Ian Rogers Cc: Jiri Olsa Cc: John Keeping Cc: Leo Yan Cc: Michael Petlan Cc: Namhyung Kim Cc: Nathan Chancellor Cc: Nick Desaulniers Link: http://lore.kernel.org/lkml/YktYX2OnLtyobRYD@kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- tools/build/feature/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile index 09f47aff9012..db3417ff8c5f 100644 --- a/tools/build/feature/Makefile +++ b/tools/build/feature/Makefile @@ -214,7 +214,7 @@ strip-libs = $(filter-out -l%,$(1)) PERL_EMBED_LDOPTS = $(shell perl -MExtUtils::Embed -e ldopts 2>/dev/null) PERL_EMBED_LDFLAGS = $(call strip-libs,$(PERL_EMBED_LDOPTS)) PERL_EMBED_LIBADD = $(call grep-libs,$(PERL_EMBED_LDOPTS)) -PERL_EMBED_CCOPTS = `perl -MExtUtils::Embed -e ccopts 2>/dev/null` +PERL_EMBED_CCOPTS = $(shell perl -MExtUtils::Embed -e ccopts 2>/dev/null) FLAGS_PERL_EMBED=$(PERL_EMBED_CCOPTS) $(PERL_EMBED_LDOPTS) ifeq ($(CC_NO_CLANG), 0) -- Gitee From 0092f1288694966ce9468fb9f0056ffd733c5b3e Mon Sep 17 00:00:00 2001 From: Vinod Koul Date: Thu, 14 Jul 2022 21:59:31 +0800 Subject: [PATCH 340/674] dmaengine: Revert "dmaengine: shdma: Fix runtime PM imbalance on error" stable inclusion from stable-v5.10.111 commit 03b39bbbec8becbecde5d3f84a352633c8f2f49b category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=03b39bbbec8becbecde5d3f84a352633c8f2f49b -------------------------------- commit d143f939a95696d38ff800ada14402fa50ebbd6c upstream. This reverts commit 455896c53d5b ("dmaengine: shdma: Fix runtime PM imbalance on error") as the patch wrongly reduced the count on error and did not bail out. So drop the count by reverting the patch . Signed-off-by: Vinod Koul Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/dma/sh/shdma-base.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/dma/sh/shdma-base.c b/drivers/dma/sh/shdma-base.c index 19ac95c0098f..7f72b3f4cd1a 100644 --- a/drivers/dma/sh/shdma-base.c +++ b/drivers/dma/sh/shdma-base.c @@ -115,10 +115,8 @@ static dma_cookie_t shdma_tx_submit(struct dma_async_tx_descriptor *tx) ret = pm_runtime_get(schan->dev); spin_unlock_irq(&schan->chan_lock); - if (ret < 0) { + if (ret < 0) dev_err(schan->dev, "%s(): GET = %d\n", __func__, ret); - pm_runtime_put(schan->dev); - } pm_runtime_barrier(schan->dev); -- Gitee From 3e393ad0efe336af9da4abd7ce7b5b858e9b9b0c Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 14 Jul 2022 21:59:32 +0800 Subject: [PATCH 341/674] ubsan: remove CONFIG_UBSAN_OBJECT_SIZE stable inclusion from stable-v5.10.111 commit 58823a9b097cf36dd59c4d463396e15a5ec4efb7 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=58823a9b097cf36dd59c4d463396e15a5ec4efb7 -------------------------------- commit 69d0db01e210e07fe915e5da91b54a867cda040f upstream. The object-size sanitizer is redundant to -Warray-bounds, and inappropriately performs its checks at run-time when all information needed for the evaluation is available at compile-time, making it quite difficult to use: https://bugzilla.kernel.org/show_bug.cgi?id=214861 With -Warray-bounds almost enabled globally, it doesn't make sense to keep this around. Link: https://lkml.kernel.org/r/20211203235346.110809-1-keescook@chromium.org Signed-off-by: Kees Cook Reviewed-by: Marco Elver Cc: Masahiro Yamada Cc: Michal Marek Cc: Nick Desaulniers Cc: Nathan Chancellor Cc: Andrey Ryabinin Cc: "Peter Zijlstra (Intel)" Cc: Stephen Rothwell Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Tadeusz Struk Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- lib/test_ubsan.c | 11 ----------- scripts/Makefile.ubsan | 1 - 2 files changed, 12 deletions(-) diff --git a/lib/test_ubsan.c b/lib/test_ubsan.c index 9ea10adf7a66..b1d0a6ecfe1b 100644 --- a/lib/test_ubsan.c +++ b/lib/test_ubsan.c @@ -89,16 +89,6 @@ static void test_ubsan_misaligned_access(void) *ptr = val; } -static void test_ubsan_object_size_mismatch(void) -{ - /* "((aligned(8)))" helps this not into be misaligned for ptr-access. */ - volatile int val __aligned(8) = 4; - volatile long long *ptr, val2; - - ptr = (long long *)&val; - val2 = *ptr; -} - static const test_ubsan_fp test_ubsan_array[] = { test_ubsan_add_overflow, test_ubsan_sub_overflow, @@ -110,7 +100,6 @@ static const test_ubsan_fp test_ubsan_array[] = { test_ubsan_load_invalid_value, //test_ubsan_null_ptr_deref, /* exclude it because there is a crash */ test_ubsan_misaligned_access, - test_ubsan_object_size_mismatch, }; static int __init test_ubsan_init(void) diff --git a/scripts/Makefile.ubsan b/scripts/Makefile.ubsan index 9716dab06bc7..2156e18391a3 100644 --- a/scripts/Makefile.ubsan +++ b/scripts/Makefile.ubsan @@ -23,7 +23,6 @@ ifdef CONFIG_UBSAN_MISC CFLAGS_UBSAN += $(call cc-option, -fsanitize=integer-divide-by-zero) CFLAGS_UBSAN += $(call cc-option, -fsanitize=unreachable) CFLAGS_UBSAN += $(call cc-option, -fsanitize=signed-integer-overflow) - CFLAGS_UBSAN += $(call cc-option, -fsanitize=object-size) CFLAGS_UBSAN += $(call cc-option, -fsanitize=bool) CFLAGS_UBSAN += $(call cc-option, -fsanitize=enum) endif -- Gitee From b9be96107a16c9aacfbc3d386be05e9b250c68fd Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 14 Jul 2022 21:59:33 +0800 Subject: [PATCH 342/674] mm: don't skip swap entry even if zap_details specified stable inclusion from stable-v5.10.111 commit f089471d1b754cdd386f081f6c62eec414e8e188 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f089471d1b754cdd386f081f6c62eec414e8e188 -------------------------------- commit 5abfd71d936a8aefd9f9ccd299dea7a164a5d455 upstream. Patch series "mm: Rework zap ptes on swap entries", v5. Patch 1 should fix a long standing bug for zap_pte_range() on zap_details usage. The risk is we could have some swap entries skipped while we should have zapped them. Migration entries are not the major concern because file backed memory always zap in the pattern that "first time without page lock, then re-zap with page lock" hence the 2nd zap will always make sure all migration entries are already recovered. However there can be issues with real swap entries got skipped errornoously. There's a reproducer provided in commit message of patch 1 for that. Patch 2-4 are cleanups that are based on patch 1. After the whole patchset applied, we should have a very clean view of zap_pte_range(). Only patch 1 needs to be backported to stable if necessary. This patch (of 4): The "details" pointer shouldn't be the token to decide whether we should skip swap entries. For example, when the callers specified details->zap_mapping==NULL, it means the user wants to zap all the pages (including COWed pages), then we need to look into swap entries because there can be private COWed pages that was swapped out. Skipping some swap entries when details is non-NULL may lead to wrongly leaving some of the swap entries while we should have zapped them. A reproducer of the problem: Reviewed-by: Wei Li ===8<=== #define _GNU_SOURCE /* See feature_test_macros(7) */ #include #include #include #include #include int page_size; int shmem_fd; char *buffer; void main(void) { int ret; char val; page_size = getpagesize(); shmem_fd = memfd_create("test", 0); assert(shmem_fd >= 0); ret = ftruncate(shmem_fd, page_size * 2); assert(ret == 0); buffer = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE, MAP_PRIVATE, shmem_fd, 0); assert(buffer != MAP_FAILED); /* Write private page, swap it out */ buffer[page_size] = 1; madvise(buffer, page_size * 2, MADV_PAGEOUT); /* This should drop private buffer[page_size] already */ ret = ftruncate(shmem_fd, page_size); assert(ret == 0); /* Recover the size */ ret = ftruncate(shmem_fd, page_size * 2); assert(ret == 0); /* Re-read the data, it should be all zero */ val = buffer[page_size]; if (val == 0) printf("Good\n"); else printf("BUG\n"); } ===8<=== We don't need to touch up the pmd path, because pmd never had a issue with swap entries. For example, shmem pmd migration will always be split into pte level, and same to swapping on anonymous. Add another helper should_zap_cows() so that we can also check whether we should zap private mappings when there's no page pointer specified. This patch drops that trick, so we handle swap ptes coherently. Meanwhile we should do the same check upon migration entry, hwpoison entry and genuine swap entries too. To be explicit, we should still remember to keep the private entries if even_cows==false, and always zap them when even_cows==true. The issue seems to exist starting from the initial commit of git. [peterx@redhat.com: comment tweaks] Link: https://lkml.kernel.org/r/20220217060746.71256-2-peterx@redhat.com Link: https://lkml.kernel.org/r/20220217060746.71256-1-peterx@redhat.com Link: https://lkml.kernel.org/r/20220216094810.60572-1-peterx@redhat.com Link: https://lkml.kernel.org/r/20220216094810.60572-2-peterx@redhat.com Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Peter Xu Reviewed-by: John Hubbard Cc: David Hildenbrand Cc: Hugh Dickins Cc: Alistair Popple Cc: Andrea Arcangeli Cc: "Kirill A . Shutemov" Cc: Matthew Wilcox Cc: Vlastimil Babka Cc: Yang Shi Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai --- mm/memory.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 8379f39dd697..6d3a07c821fe 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1204,6 +1204,17 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) return ret; } +/* Whether we should zap all COWed (private) pages too */ +static inline bool should_zap_cows(struct zap_details *details) +{ + /* By default, zap all pages */ + if (!details) + return true; + + /* Or, we zap COWed pages only if the caller wants to */ + return !details->check_mapping; +} + static unsigned long zap_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, @@ -1295,16 +1306,18 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, continue; } - /* If details->check_mapping, we leave swap entries. */ - if (unlikely(details)) - continue; - - if (!non_swap_entry(entry)) + if (!non_swap_entry(entry)) { + /* Genuine swap entry, hence a private anon page */ + if (!should_zap_cows(details)) + continue; rss[MM_SWAPENTS]--; - else if (is_migration_entry(entry)) { + } else if (is_migration_entry(entry)) { struct page *page; page = migration_entry_to_page(entry); + if (details && details->check_mapping && + details->check_mapping != page_rmapping(page)) + continue; rss[mm_counter(page)]--; } if (unlikely(!free_swap_and_cache(entry))) -- Gitee From 44ef621d6cc16f1830a38b570f129309f4193293 Mon Sep 17 00:00:00 2001 From: Sachin Sant Date: Thu, 14 Jul 2022 21:59:34 +0800 Subject: [PATCH 343/674] selftests/cgroup: Fix build on older distros stable inclusion from stable-v5.10.111 commit e74da71e66142784aa086dbaae047901074b4518 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=e74da71e66142784aa086dbaae047901074b4518 -------------------------------- commit c2e46f6b3e3551558d44c4dc518b9667cb0d5f8b upstream. On older distros struct clone_args does not have a cgroup member, leading to build errors: cgroup_util.c: In function 'clone_into_cgroup': cgroup_util.c:343:4: error: 'struct clone_args' has no member named 'cgroup' cgroup_util.c:346:33: error: invalid application of 'sizeof' to incomplete type 'struct clone_args' But the selftests already have a locally defined version of the structure which is up to date, called __clone_args. So use __clone_args which fixes the error. Signed-off-by: Michael Ellerman Signed-off-by: Sachin Sant > Acked-by: Christian Brauner Signed-off-by: Shuah Khan Signed-off-by: Ovidiu Panait Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- tools/testing/selftests/cgroup/cgroup_util.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/cgroup/cgroup_util.c b/tools/testing/selftests/cgroup/cgroup_util.c index 391ab5675290..f61145f98577 100644 --- a/tools/testing/selftests/cgroup/cgroup_util.c +++ b/tools/testing/selftests/cgroup/cgroup_util.c @@ -330,13 +330,13 @@ pid_t clone_into_cgroup(int cgroup_fd) #ifdef CLONE_ARGS_SIZE_VER2 pid_t pid; - struct clone_args args = { + struct __clone_args args = { .flags = CLONE_INTO_CGROUP, .exit_signal = SIGCHLD, .cgroup = cgroup_fd, }; - pid = sys_clone3(&args, sizeof(struct clone_args)); + pid = sys_clone3(&args, sizeof(struct __clone_args)); /* * Verify that this is a genuine test failure: * ENOSYS -> clone3() not available -- Gitee From f6e6c8df509e5f0ce1e554b1a63edeff9a862619 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 14 Jul 2022 21:59:35 +0800 Subject: [PATCH 344/674] selftests: cgroup: Make cg_create() use 0755 for permission instead of 0644 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from stable-v5.10.111 commit 9dd39d2c6572c7be091ecd6aced950bd91175415 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=9dd39d2c6572c7be091ecd6aced950bd91175415 -------------------------------- commit b09c2baa56347ae65795350dfcc633dedb1c2970 upstream. 0644 is an odd perm to create a cgroup which is a directory. Use the regular 0755 instead. This is necessary for euid switching test case. Reviewed-by: Michal Koutný Signed-off-by: Tejun Heo Signed-off-by: Ovidiu Panait Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- tools/testing/selftests/cgroup/cgroup_util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/cgroup/cgroup_util.c b/tools/testing/selftests/cgroup/cgroup_util.c index f61145f98577..962ced827513 100644 --- a/tools/testing/selftests/cgroup/cgroup_util.c +++ b/tools/testing/selftests/cgroup/cgroup_util.c @@ -212,7 +212,7 @@ int cg_find_unified_root(char *root, size_t len) int cg_create(const char *cgroup) { - return mkdir(cgroup, 0644); + return mkdir(cgroup, 0755); } int cg_wait_for_proc_count(const char *cgroup, int count) -- Gitee From 2960182013a289f79bb330f4d92043ea1e32d4e1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 14 Jul 2022 21:59:36 +0800 Subject: [PATCH 345/674] selftests: cgroup: Test open-time credential usage for migration checks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from stable-v5.10.111 commit 637eca44b8f72c5eae6539fa037548c5fe7193a3 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=637eca44b8f72c5eae6539fa037548c5fe7193a3 -------------------------------- commit 613e040e4dc285367bff0f8f75ea59839bc10947 upstream. When a task is writing to an fd opened by a different task, the perm check should use the credentials of the latter task. Add a test for it. Tested-by: Michal Koutný Signed-off-by: Tejun Heo Signed-off-by: Ovidiu Panait Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- tools/testing/selftests/cgroup/test_core.c | 68 ++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/tools/testing/selftests/cgroup/test_core.c b/tools/testing/selftests/cgroup/test_core.c index 3df648c37876..01b766506973 100644 --- a/tools/testing/selftests/cgroup/test_core.c +++ b/tools/testing/selftests/cgroup/test_core.c @@ -674,6 +674,73 @@ static int test_cgcore_thread_migration(const char *root) return ret; } +/* + * cgroup migration permission check should be performed based on the + * credentials at the time of open instead of write. + */ +static int test_cgcore_lesser_euid_open(const char *root) +{ + const uid_t test_euid = 65534; /* usually nobody, any !root is fine */ + int ret = KSFT_FAIL; + char *cg_test_a = NULL, *cg_test_b = NULL; + char *cg_test_a_procs = NULL, *cg_test_b_procs = NULL; + int cg_test_b_procs_fd = -1; + uid_t saved_uid; + + cg_test_a = cg_name(root, "cg_test_a"); + cg_test_b = cg_name(root, "cg_test_b"); + + if (!cg_test_a || !cg_test_b) + goto cleanup; + + cg_test_a_procs = cg_name(cg_test_a, "cgroup.procs"); + cg_test_b_procs = cg_name(cg_test_b, "cgroup.procs"); + + if (!cg_test_a_procs || !cg_test_b_procs) + goto cleanup; + + if (cg_create(cg_test_a) || cg_create(cg_test_b)) + goto cleanup; + + if (cg_enter_current(cg_test_a)) + goto cleanup; + + if (chown(cg_test_a_procs, test_euid, -1) || + chown(cg_test_b_procs, test_euid, -1)) + goto cleanup; + + saved_uid = geteuid(); + if (seteuid(test_euid)) + goto cleanup; + + cg_test_b_procs_fd = open(cg_test_b_procs, O_RDWR); + + if (seteuid(saved_uid)) + goto cleanup; + + if (cg_test_b_procs_fd < 0) + goto cleanup; + + if (write(cg_test_b_procs_fd, "0", 1) >= 0 || errno != EACCES) + goto cleanup; + + ret = KSFT_PASS; + +cleanup: + cg_enter_current(root); + if (cg_test_b_procs_fd >= 0) + close(cg_test_b_procs_fd); + if (cg_test_b) + cg_destroy(cg_test_b); + if (cg_test_a) + cg_destroy(cg_test_a); + free(cg_test_b_procs); + free(cg_test_a_procs); + free(cg_test_b); + free(cg_test_a); + return ret; +} + #define T(x) { x, #x } struct corecg_test { int (*fn)(const char *root); @@ -689,6 +756,7 @@ struct corecg_test { T(test_cgcore_proc_migration), T(test_cgcore_thread_migration), T(test_cgcore_destroy), + T(test_cgcore_lesser_euid_open), }; #undef T -- Gitee From 71df1200c5804db3367e118f0cee906beed1d6d8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 14 Jul 2022 21:59:37 +0800 Subject: [PATCH 346/674] selftests: cgroup: Test open-time cgroup namespace usage for migration checks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from stable-v5.10.111 commit 919823bd6738a5f4388df033a9d6795c82b97cf7 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=919823bd6738a5f4388df033a9d6795c82b97cf7 -------------------------------- commit bf35a7879f1dfb0d050fe779168bcf25c7de66f5 upstream. When a task is writing to an fd opened by a different task, the perm check should use the cgroup namespace of the latter task. Add a test for it. Tested-by: Michal Koutný Signed-off-by: Tejun Heo Signed-off-by: Ovidiu Panait Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- tools/testing/selftests/cgroup/test_core.c | 97 ++++++++++++++++++++++ 1 file changed, 97 insertions(+) diff --git a/tools/testing/selftests/cgroup/test_core.c b/tools/testing/selftests/cgroup/test_core.c index 01b766506973..600123503063 100644 --- a/tools/testing/selftests/cgroup/test_core.c +++ b/tools/testing/selftests/cgroup/test_core.c @@ -1,11 +1,14 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#define _GNU_SOURCE #include +#include #include #include #include #include #include +#include #include #include #include @@ -741,6 +744,99 @@ static int test_cgcore_lesser_euid_open(const char *root) return ret; } +struct lesser_ns_open_thread_arg { + const char *path; + int fd; + int err; +}; + +static int lesser_ns_open_thread_fn(void *arg) +{ + struct lesser_ns_open_thread_arg *targ = arg; + + targ->fd = open(targ->path, O_RDWR); + targ->err = errno; + return 0; +} + +/* + * cgroup migration permission check should be performed based on the cgroup + * namespace at the time of open instead of write. + */ +static int test_cgcore_lesser_ns_open(const char *root) +{ + static char stack[65536]; + const uid_t test_euid = 65534; /* usually nobody, any !root is fine */ + int ret = KSFT_FAIL; + char *cg_test_a = NULL, *cg_test_b = NULL; + char *cg_test_a_procs = NULL, *cg_test_b_procs = NULL; + int cg_test_b_procs_fd = -1; + struct lesser_ns_open_thread_arg targ = { .fd = -1 }; + pid_t pid; + int status; + + cg_test_a = cg_name(root, "cg_test_a"); + cg_test_b = cg_name(root, "cg_test_b"); + + if (!cg_test_a || !cg_test_b) + goto cleanup; + + cg_test_a_procs = cg_name(cg_test_a, "cgroup.procs"); + cg_test_b_procs = cg_name(cg_test_b, "cgroup.procs"); + + if (!cg_test_a_procs || !cg_test_b_procs) + goto cleanup; + + if (cg_create(cg_test_a) || cg_create(cg_test_b)) + goto cleanup; + + if (cg_enter_current(cg_test_b)) + goto cleanup; + + if (chown(cg_test_a_procs, test_euid, -1) || + chown(cg_test_b_procs, test_euid, -1)) + goto cleanup; + + targ.path = cg_test_b_procs; + pid = clone(lesser_ns_open_thread_fn, stack + sizeof(stack), + CLONE_NEWCGROUP | CLONE_FILES | CLONE_VM | SIGCHLD, + &targ); + if (pid < 0) + goto cleanup; + + if (waitpid(pid, &status, 0) < 0) + goto cleanup; + + if (!WIFEXITED(status)) + goto cleanup; + + cg_test_b_procs_fd = targ.fd; + if (cg_test_b_procs_fd < 0) + goto cleanup; + + if (cg_enter_current(cg_test_a)) + goto cleanup; + + if ((status = write(cg_test_b_procs_fd, "0", 1)) >= 0 || errno != ENOENT) + goto cleanup; + + ret = KSFT_PASS; + +cleanup: + cg_enter_current(root); + if (cg_test_b_procs_fd >= 0) + close(cg_test_b_procs_fd); + if (cg_test_b) + cg_destroy(cg_test_b); + if (cg_test_a) + cg_destroy(cg_test_a); + free(cg_test_b_procs); + free(cg_test_a_procs); + free(cg_test_b); + free(cg_test_a); + return ret; +} + #define T(x) { x, #x } struct corecg_test { int (*fn)(const char *root); @@ -757,6 +853,7 @@ struct corecg_test { T(test_cgcore_thread_migration), T(test_cgcore_destroy), T(test_cgcore_lesser_euid_open), + T(test_cgcore_lesser_ns_open), }; #undef T -- Gitee From 00c200254efd233f4a843dd3f6a763b1dfd92940 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 14 Jul 2022 21:59:38 +0800 Subject: [PATCH 347/674] arm64: module: remove (NOLOAD) from linker script stable inclusion from stable-v5.10.111 commit e1f540b752cbc671e52b30184eefa9ffab0bd28e category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=e1f540b752cbc671e52b30184eefa9ffab0bd28e -------------------------------- commit 4013e26670c590944abdab56c4fa797527b74325 upstream. On ELF, (NOLOAD) sets the section type to SHT_NOBITS[1]. It is conceptually inappropriate for .plt and .text.* sections which are always SHT_PROGBITS. In GNU ld, if PLT entries are needed, .plt will be SHT_PROGBITS anyway and (NOLOAD) will be essentially ignored. In ld.lld, since https://reviews.llvm.org/D118840 ("[ELF] Support (TYPE=) to customize the output section type"), ld.lld will report a `section type mismatch` error. Just remove (NOLOAD) to fix the error. [1] https://lld.llvm.org/ELF/linker_script.html As of today, "The section should be marked as not loadable" on https://sourceware.org/binutils/docs/ld/Output-Section-Type.html is outdated for ELF. Tested-by: Nathan Chancellor Reported-by: Nathan Chancellor Signed-off-by: Fangrui Song Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20220218081209.354383-1-maskray@google.com Signed-off-by: Will Deacon [nathan: Fix conflicts due to lack of 1cbdf60bd1b7] Signed-off-by: Nathan Chancellor Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- arch/arm64/include/asm/module.lds.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/module.lds.h b/arch/arm64/include/asm/module.lds.h index 810045628c66..0522337d600a 100644 --- a/arch/arm64/include/asm/module.lds.h +++ b/arch/arm64/include/asm/module.lds.h @@ -1,7 +1,7 @@ #ifdef CONFIG_ARM64_MODULE_PLTS SECTIONS { - .plt 0 (NOLOAD) : { BYTE(0) } - .init.plt 0 (NOLOAD) : { BYTE(0) } - .text.ftrace_trampoline 0 (NOLOAD) : { BYTE(0) } + .plt 0 : { BYTE(0) } + .init.plt 0 : { BYTE(0) } + .text.ftrace_trampoline 0 : { BYTE(0) } } #endif -- Gitee From 13b7a59d21a8b21e731d7e146b2100de1f106be4 Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Thu, 14 Jul 2022 21:59:39 +0800 Subject: [PATCH 348/674] Drivers: hv: vmbus: Replace smp_store_mb() with virt_store_mb() stable inclusion from stable-v5.10.111 commit 000e09462f859d71563824c413717a2d3eda4bca category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=000e09462f859d71563824c413717a2d3eda4bca -------------------------------- commit eaa03d34535872d29004cb5cf77dc9dec1ba9a25 upstream. Following the recommendation in Documentation/memory-barriers.txt for virtual machine guests. Fixes: 8b6a877c060ed ("Drivers: hv: vmbus: Replace the per-CPU channel lists with a global array of channels") Signed-off-by: Andrea Parri (Microsoft) Link: https://lore.kernel.org/r/20220328154457.100872-1-parri.andrea@gmail.com Signed-off-by: Wei Liu Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/hv/channel_mgmt.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index 6476bfe193af..5dbb949b1afd 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -350,7 +350,7 @@ void vmbus_channel_map_relid(struct vmbus_channel *channel) * execute: * * (a) In the "normal (i.e., not resuming from hibernation)" path, - * the full barrier in smp_store_mb() guarantees that the store + * the full barrier in virt_store_mb() guarantees that the store * is propagated to all CPUs before the add_channel_work work * is queued. In turn, add_channel_work is queued before the * channel's ring buffer is allocated/initialized and the @@ -362,14 +362,14 @@ void vmbus_channel_map_relid(struct vmbus_channel *channel) * recv_int_page before retrieving the channel pointer from the * array of channels. * - * (b) In the "resuming from hibernation" path, the smp_store_mb() + * (b) In the "resuming from hibernation" path, the virt_store_mb() * guarantees that the store is propagated to all CPUs before * the VMBus connection is marked as ready for the resume event * (cf. check_ready_for_resume_event()). The interrupt handler * of the VMBus driver and vmbus_chan_sched() can not run before * vmbus_bus_resume() has completed execution (cf. resume_noirq). */ - smp_store_mb( + virt_store_mb( vmbus_connection.channels[channel->offermsg.child_relid], channel); } -- Gitee From 26acbfd810c1e4b9697b9c848ba73cdbf0198019 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Thu, 14 Jul 2022 21:59:40 +0800 Subject: [PATCH 349/674] irqchip/gic, gic-v3: Prevent GSI to SGI translations stable inclusion from stable-v5.10.111 commit 5973f7507a73878653de7f27f4a433c9a8ba2690 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=5973f7507a73878653de7f27f4a433c9a8ba2690 -------------------------------- commit 544808f7e21cb9ccdb8f3aa7de594c05b1419061 upstream. At the moment the GIC IRQ domain translation routine happily converts ACPI table GSI numbers below 16 to GIC SGIs (Software Generated Interrupts aka IPIs). On the Devicetree side we explicitly forbid this translation, actually the function will never return HWIRQs below 16 when using a DT based domain translation. We expect SGIs to be handled in the first part of the function, and any further occurrence should be treated as a firmware bug, so add a check and print to report this explicitly and avoid lengthy debug sessions. Fixes: 64b499d8df40 ("irqchip/gic-v3: Configure SGIs as standard interrupts") Signed-off-by: Andre Przywara Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220404110842.2882446-1-andre.przywara@arm.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- drivers/irqchip/irq-gic-v3.c | 6 ++++++ drivers/irqchip/irq-gic.c | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index 130b19aeb679..64d7811a2b77 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -1560,6 +1560,12 @@ static int gic_irq_domain_translate(struct irq_domain *d, if(fwspec->param_count != 2) return -EINVAL; + if (fwspec->param[0] < 16) { + pr_err(FW_BUG "Illegal GSI%d translation request\n", + fwspec->param[0]); + return -EINVAL; + } + *hwirq = fwspec->param[0]; *type = fwspec->param[1]; diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c index 176f5f06432d..205cbd24ff20 100644 --- a/drivers/irqchip/irq-gic.c +++ b/drivers/irqchip/irq-gic.c @@ -1094,6 +1094,12 @@ static int gic_irq_domain_translate(struct irq_domain *d, if(fwspec->param_count != 2) return -EINVAL; + if (fwspec->param[0] < 16) { + pr_err(FW_BUG "Illegal GSI%d translation request\n", + fwspec->param[0]); + return -EINVAL; + } + *hwirq = fwspec->param[0]; *type = fwspec->param[1]; -- Gitee From e57838ef1deca12650ab27919d6b86bbdd2446c5 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 14 Jul 2022 21:59:41 +0800 Subject: [PATCH 350/674] mm/sparsemem: fix 'mem_section' will never be NULL gcc 12 warning stable inclusion from stable-v5.10.111 commit 5c672073bcca537a0880257f31c6d942388df046 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=5c672073bcca537a0880257f31c6d942388df046 -------------------------------- commit a431dbbc540532b7465eae4fc8b56a85a9fc7d17 upstream. The gcc 12 compiler reports a "'mem_section' will never be NULL" warning on the following code: static inline struct mem_section *__nr_to_section(unsigned long nr) { #ifdef CONFIG_SPARSEMEM_EXTREME if (!mem_section) return NULL; #endif if (!mem_section[SECTION_NR_TO_ROOT(nr)]) return NULL; : It happens with CONFIG_SPARSEMEM_EXTREME off. The mem_section definition is #ifdef CONFIG_SPARSEMEM_EXTREME extern struct mem_section **mem_section; #else extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]; #endif In the !CONFIG_SPARSEMEM_EXTREME case, mem_section is a static 2-dimensional array and so the check "!mem_section[SECTION_NR_TO_ROOT(nr)]" doesn't make sense. Fix this warning by moving the "!mem_section[SECTION_NR_TO_ROOT(nr)]" check up inside the CONFIG_SPARSEMEM_EXTREME block and adding an explicit NR_SECTION_ROOTS check to make sure that there is no out-of-bound array access. Link: https://lkml.kernel.org/r/20220331180246.2746210-1-longman@redhat.com Fixes: 3e347261a80b ("sparsemem extreme implementation") Signed-off-by: Waiman Long Reported-by: Justin Forbes Cc: "Kirill A . Shutemov" Cc: Ingo Molnar Cc: Rafael Aquini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- include/linux/mmzone.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 7e1e09fe4e3b..13c7a048e5ab 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1282,13 +1282,16 @@ static inline unsigned long *section_to_usemap(struct mem_section *ms) static inline struct mem_section *__nr_to_section(unsigned long nr) { + unsigned long root = SECTION_NR_TO_ROOT(nr); + + if (unlikely(root >= NR_SECTION_ROOTS)) + return NULL; + #ifdef CONFIG_SPARSEMEM_EXTREME - if (!mem_section) + if (!mem_section || !mem_section[root]) return NULL; #endif - if (!mem_section[SECTION_NR_TO_ROOT(nr)]) - return NULL; - return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK]; + return &mem_section[root][nr & SECTION_ROOT_MASK]; } extern unsigned long __section_nr(struct mem_section *ms); extern size_t mem_section_usage_size(void); -- Gitee From 8a66ef5ba84e4dd35aa9a52a7c89e78fcd31ceb6 Mon Sep 17 00:00:00 2001 From: Zheng Zengkai Date: Thu, 14 Jul 2022 21:59:42 +0800 Subject: [PATCH 351/674] Revert "powerpc: Fix virt_addr_valid() check" hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z CVE: NA -------------------------------- This reverts commit 446340627ad9171d570513e86ff5337c5d74c113. Revert the old patch and apply the new patch from v5.10.111 LTS. Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- arch/powerpc/include/asm/page.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index 300d4c105a3a..254687258f42 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -132,10 +132,7 @@ static inline bool pfn_valid(unsigned long pfn) #define virt_to_page(kaddr) pfn_to_page(virt_to_pfn(kaddr)) #define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) -#define virt_addr_valid(vaddr) ({ \ - unsigned long _addr = (unsigned long)vaddr; \ - (unsigned long)(_addr) >= PAGE_OFFSET && pfn_valid(virt_to_pfn(_addr)); \ -}) +#define virt_addr_valid(kaddr) pfn_valid(virt_to_pfn(kaddr)) /* * On Book-E parts we need __va to parse the device tree and we can't -- Gitee From 1377e00ff63a005f49784c3aeae0d1737a9fa007 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 14 Jul 2022 21:59:43 +0800 Subject: [PATCH 352/674] powerpc: Fix virt_addr_valid() for 64-bit Book3E & 32-bit stable inclusion from stable-v5.10.111 commit d36febbcd537fcc50284e8b89609632d0146529f category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=d36febbcd537fcc50284e8b89609632d0146529f -------------------------------- commit ffa0b64e3be58519ae472ea29a1a1ad681e32f48 upstream. mpe: On 64-bit Book3E vmalloc space starts at 0x8000000000000000. Because of the way __pa() works we have: __pa(0x8000000000000000) == 0, and therefore virt_to_pfn(0x8000000000000000) == 0, and therefore virt_addr_valid(0x8000000000000000) == true Which is wrong, virt_addr_valid() should be false for vmalloc space. In fact all vmalloc addresses that alias with a valid PFN will return true from virt_addr_valid(). That can cause bugs with hardened usercopy as described below by Kefeng Wang: When running ethtool eth0 on 64-bit Book3E, a BUG occurred: usercopy: Kernel memory exposure attempt detected from SLUB object not in SLUB page?! (offset 0, size 1048)! kernel BUG at mm/usercopy.c:99 ... usercopy_abort+0x64/0xa0 (unreliable) __check_heap_object+0x168/0x190 __check_object_size+0x1a0/0x200 dev_ethtool+0x2494/0x2b20 dev_ioctl+0x5d0/0x770 sock_do_ioctl+0xf0/0x1d0 sock_ioctl+0x3ec/0x5a0 __se_sys_ioctl+0xf0/0x160 system_call_exception+0xfc/0x1f0 system_call_common+0xf8/0x200 The code shows below, data = vzalloc(array_size(gstrings.len, ETH_GSTRING_LEN)); copy_to_user(useraddr, data, gstrings.len * ETH_GSTRING_LEN)) The data is alloced by vmalloc(), virt_addr_valid(ptr) will return true on 64-bit Book3E, which leads to the panic. As commit 4dd7554a6456 ("powerpc/64: Add VIRTUAL_BUG_ON checks for __va and __pa addresses") does, make sure the virt addr above PAGE_OFFSET in the virt_addr_valid() for 64-bit, also add upper limit check to make sure the virt is below high_memory. Meanwhile, for 32-bit PAGE_OFFSET is the virtual address of the start of lowmem, high_memory is the upper low virtual address, the check is suitable for 32-bit, this will fix the issue mentioned in commit 602946ec2f90 ("powerpc: Set max_mapnr correctly") too. On 32-bit there is a similar problem with high memory, that was fixed in commit 602946ec2f90 ("powerpc: Set max_mapnr correctly"), but that commit breaks highmem and needs to be reverted. We can't easily fix __pa(), we have code that relies on its current behaviour. So for now add extra checks to virt_addr_valid(). For 64-bit Book3S the extra checks are not necessary, the combination of virt_to_pfn() and pfn_valid() should yield the correct result, but they are harmless. Signed-off-by: Kefeng Wang Reviewed-by: Christophe Leroy [mpe: Add additional change log detail] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220406145802.538416-1-mpe@ellerman.id.au Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Reviewed-by: Wei Li --- arch/powerpc/include/asm/page.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index 254687258f42..f2c5c26869f1 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -132,7 +132,11 @@ static inline bool pfn_valid(unsigned long pfn) #define virt_to_page(kaddr) pfn_to_page(virt_to_pfn(kaddr)) #define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) -#define virt_addr_valid(kaddr) pfn_valid(virt_to_pfn(kaddr)) +#define virt_addr_valid(vaddr) ({ \ + unsigned long _addr = (unsigned long)vaddr; \ + _addr >= PAGE_OFFSET && _addr < (unsigned long)high_memory && \ + pfn_valid(virt_to_pfn(_addr)); \ +}) /* * On Book-E parts we need __va to parse the device tree and we can't -- Gitee From 70ae4a2480398d82a47cd27b093859a42190ee66 Mon Sep 17 00:00:00 2001 From: Wang Wensheng Date: Tue, 19 Jul 2022 11:36:02 +0800 Subject: [PATCH 353/674] mm/sharepool: Allow share THP to kernel hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5DS9S CVE: NA -------------------------------------------------- This is not used for THP but the user page table is just like THP. The user alloc hugepages via a special driver and its vma is not marked with VM_HUGETLB. This commit allow to share those vma to kernel. Signed-off-by: Wang Wensheng Reviewed-by: Weilong Chen Signed-off-by: Zheng Zengkai --- include/linux/share_pool.h | 1 + mm/share_pool.c | 44 +++++++++++++++++++++++++++++++++----- 2 files changed, 40 insertions(+), 5 deletions(-) diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index 6f294911c6af..d95084b8f624 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -178,6 +178,7 @@ struct sp_walk_data { unsigned long uva_aligned; unsigned long page_size; bool is_hugepage; + bool is_page_type_set; pmd_t *pmd; }; diff --git a/mm/share_pool.c b/mm/share_pool.c index 76088952d0a5..60ad48e238c4 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -2994,9 +2994,40 @@ EXPORT_SYMBOL_GPL(mg_sp_make_share_k2u); static int sp_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk) { + struct page *page; struct sp_walk_data *sp_walk_data = walk->private; + /* + * There exist a scene in DVPP where the pagetable is huge page but its + * vma doesn't record it, something like THP. + * So we cannot make out whether it is a hugepage map until we access the + * pmd here. If mixed size of pages appear, just return an error. + */ + if (pmd_huge(*pmd)) { + if (!sp_walk_data->is_page_type_set) { + sp_walk_data->is_page_type_set = true; + sp_walk_data->is_hugepage = true; + } else if (!sp_walk_data->is_hugepage) + return -EFAULT; + + /* To skip pte level walk */ + walk->action = ACTION_CONTINUE; + + page = pmd_page(*pmd); + get_page(page); + sp_walk_data->pages[sp_walk_data->page_count++] = page; + + return 0; + } + + if (!sp_walk_data->is_page_type_set) { + sp_walk_data->is_page_type_set = true; + sp_walk_data->is_hugepage = false; + } else if (sp_walk_data->is_hugepage) + return -EFAULT; + sp_walk_data->pmd = pmd; + return 0; } @@ -3140,6 +3171,8 @@ static int __sp_walk_page_range(unsigned long uva, unsigned long size, sp_walk.pmd_entry = sp_pmd_entry; } + sp_walk_data->is_page_type_set = false; + sp_walk_data->page_count = 0; sp_walk_data->page_size = page_size; uva_aligned = ALIGN_DOWN(uva, page_size); sp_walk_data->uva_aligned = uva_aligned; @@ -3164,8 +3197,12 @@ static int __sp_walk_page_range(unsigned long uva, unsigned long size, ret = walk_page_range(mm, uva_aligned, uva_aligned + size_aligned, &sp_walk, sp_walk_data); - if (ret) + if (ret) { + while (sp_walk_data->page_count--) + put_page(pages[sp_walk_data->page_count]); kvfree(pages); + sp_walk_data->pages = NULL; + } return ret; } @@ -3201,9 +3238,7 @@ void *sp_make_share_u2k(unsigned long uva, unsigned long size, int pid) int ret = 0; struct mm_struct *mm = current->mm; void *p = ERR_PTR(-ESRCH); - struct sp_walk_data sp_walk_data = { - .page_count = 0, - }; + struct sp_walk_data sp_walk_data; struct vm_struct *area; check_interrupt_context(); @@ -3544,7 +3579,6 @@ int sp_walk_page_range(unsigned long uva, unsigned long size, return -ESRCH; } - sp_walk_data->page_count = 0; down_write(&mm->mmap_lock); if (likely(!mm->core_state)) ret = __sp_walk_page_range(uva, size, mm, sp_walk_data); -- Gitee From 5cc1b791067f40e729f4690bc4957ba1c57aa943 Mon Sep 17 00:00:00 2001 From: Guo Mengqi Date: Tue, 19 Jul 2022 11:36:03 +0800 Subject: [PATCH 354/674] mm/sharepool: use rwsem to protect sp group exit ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5DS9S CVE: NA ------------------------------------------------- Fix following situation: when the last process in a group exits, and a second process tries to add to this group. The second process may get a invalid spg. However the group's use_count is increased by 1, which caused the first process failed to free the group when it exits. And then second process called sp_group_drop --> free_sp_group and cause a double request of rwsem. Signed-off-by: Guo Mengqi Reviewed-by: Weilong Chen Signed-off-by: Yang Yingliang Signed-off-by: Zheng Zengkai --- mm/share_pool.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/mm/share_pool.c b/mm/share_pool.c index 60ad48e238c4..a8266a4a7e79 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -698,20 +698,25 @@ static void free_new_spg_id(bool new, int spg_id) free_sp_group_id(spg_id); } -static void free_sp_group(struct sp_group *spg) +static void free_sp_group_locked(struct sp_group *spg) { fput(spg->file); fput(spg->file_hugetlb); free_spg_stat(spg->id); - down_write(&sp_group_sem); idr_remove(&sp_group_idr, spg->id); - up_write(&sp_group_sem); free_sp_group_id((unsigned int)spg->id); kfree(spg); system_group_count--; WARN(system_group_count < 0, "unexpected group count\n"); } +static void free_sp_group(struct sp_group *spg) +{ + down_write(&sp_group_sem); + free_sp_group_locked(spg); + up_write(&sp_group_sem); +} + static void sp_group_drop(struct sp_group *spg) { if (atomic_dec_and_test(&spg->use_count)) @@ -4473,14 +4478,15 @@ void sp_group_post_exit(struct mm_struct *mm) sp_proc_stat_drop(stat); } - /* lockless traverse */ + down_write(&sp_group_sem); list_for_each_entry_safe(spg_node, tmp, &master->node_list, group_node) { spg = spg_node->spg; /* match with refcount inc in sp_group_add_task */ - sp_group_drop(spg); + if (atomic_dec_and_test(&spg->use_count)) + free_sp_group_locked(spg); kfree(spg_node); } - + up_write(&sp_group_sem); kfree(master); } -- Gitee From 8d06b0971a06c4cd5e6dcc645e71e749e3085856 Mon Sep 17 00:00:00 2001 From: Wang Wensheng Date: Tue, 19 Jul 2022 11:36:04 +0800 Subject: [PATCH 355/674] mm/sharepool: Accept device_id in k2u flags ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5DS9S CVE: NA ------------------------------------------------- We use device_id to select the correct dvpp vspace range when SP_DVPP flag is specified. Signed-off-by: Wang Wensheng Reviewed-by: Weilong Chen Signed-off-by: Zheng Zengkai --- mm/share_pool.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/share_pool.c b/mm/share_pool.c index a8266a4a7e79..087e8f8cbfb3 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -2852,10 +2852,11 @@ static int sp_k2u_prepare(unsigned long kva, unsigned long size, trace_sp_k2u_begin(kc); - if (sp_flags & ~SP_DVPP) { + if (sp_flags & ~SP_FLAG_MASK) { pr_err_ratelimited("k2u sp_flags %lx error\n", sp_flags); return -EINVAL; } + sp_flags &= ~SP_HUGEPAGE; if (!current->mm) { pr_err_ratelimited("k2u: kthread is not allowed\n"); -- Gitee From 5815e08af8ecbfefc844e692ed7bf597ad696253 Mon Sep 17 00:00:00 2001 From: Guo Mengqi Date: Tue, 19 Jul 2022 11:36:05 +0800 Subject: [PATCH 356/674] mm/sharepool: Modify sharepool sp_mmap() page_offset ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5DS9S CVE: NA ----------------------------------- In sp_mmap(), if use offset = va - MMAP_BASE/DVPP_BASE, then normal sp_alloc pgoff may have same value with DVPP pgoff, causing DVPP and sp_alloc mapped to overlapped part of file unexpectedly. To fix the problem, pass VA value as mmap offset, for in this scenario, VA value in one task address space will not be same. Signed-off-by: Guo Mengqi Reviewed-by: Ding Tianhong Signed-off-by: Yang Yingliang Reviewed-by: Weilong Chen Signed-off-by: Zheng Zengkai --- mm/share_pool.c | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/mm/share_pool.c b/mm/share_pool.c index 087e8f8cbfb3..29bbe0732781 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -58,6 +58,11 @@ #define spg_valid(spg) ((spg)->is_alive == true) +/* Use spa va address as mmap offset. This can work because spa_file + * is setup with 64-bit address space. So va shall be well covered. + */ +#define addr_offset(spa) ((spa)->va_start) + #define byte2kb(size) ((size) >> 10) #define byte2mb(size) ((size) >> 20) #define page2kb(page_num) ((page_num) << (PAGE_SHIFT - 10)) @@ -931,22 +936,6 @@ static bool is_device_addr(unsigned long addr) return false; } -static loff_t addr_offset(struct sp_area *spa) -{ - unsigned long addr; - - if (unlikely(!spa)) { - WARN(1, "invalid spa when calculate addr offset\n"); - return 0; - } - addr = spa->va_start; - - if (!is_device_addr(addr)) - return (loff_t)(addr - MMAP_SHARE_POOL_START); - - return (loff_t)(addr - sp_dev_va_start[spa->device_id]); -} - static struct sp_group *create_spg(int spg_id) { int ret; -- Gitee From 022b7bdfb108d3623c3ce8ab6740d18ee22ac2e4 Mon Sep 17 00:00:00 2001 From: Zhou Guanghui Date: Tue, 19 Jul 2022 11:36:06 +0800 Subject: [PATCH 357/674] mm/sharepool: Support read-only memory allocation ascend inclusion category: Bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5DS9S CVE: NA -------------------------------- When the driver uses the shared pool memory to share the memory with the user space, the user space is not allowed to operate this area. This prevents users from damaging sensitive data. When the sp_alloc and k2u processes apply for private memory, read-only memory can be applied for. Signed-off-by: Zhou Guanghui Reviewed-by: Weilong Chen Signed-off-by: Yongqiang Liu Signed-off-by: Zheng Zengkai --- include/linux/share_pool.h | 3 ++- mm/share_pool.c | 18 ++++++++++++++++-- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index d95084b8f624..022e61bb6ce4 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -15,6 +15,7 @@ #define SP_HUGEPAGE_ONLY (1 << 1) #define SP_DVPP (1 << 2) #define SP_SPEC_NODE_ID (1 << 3) +#define SP_PROT_RO (1 << 16) #define DEVICE_ID_BITS 4UL #define DEVICE_ID_MASK ((1UL << DEVICE_ID_BITS) - 1UL) @@ -24,7 +25,7 @@ #define NODE_ID_SHIFT (DEVICE_ID_SHIFT + DEVICE_ID_BITS) #define SP_FLAG_MASK (SP_HUGEPAGE | SP_HUGEPAGE_ONLY | SP_DVPP | \ - SP_SPEC_NODE_ID | \ + SP_SPEC_NODE_ID | SP_PROT_RO | \ (DEVICE_ID_MASK << DEVICE_ID_SHIFT) | \ (NODE_ID_MASK << NODE_ID_SHIFT)) diff --git a/mm/share_pool.c b/mm/share_pool.c index 29bbe0732781..8be3f75d0449 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -2325,6 +2325,9 @@ static int sp_alloc_mmap(struct mm_struct *mm, struct sp_area *spa, if (spg_node) prot = spg_node->prot; + if (ac->sp_flags & SP_PROT_RO) + prot = PROT_READ; + /* when success, mmap_addr == spa->va_start */ mmap_addr = sp_mmap(mm, spa_file(spa), spa, &populate, prot); if (IS_ERR_VALUE(mmap_addr)) { @@ -2349,6 +2352,10 @@ static int sp_alloc_mmap(struct mm_struct *mm, struct sp_area *spa, ret = -EINVAL; goto unmap; } + + if (ac->sp_flags & SP_PROT_RO) + vma->vm_flags &= ~VM_MAYWRITE; + /* clean PTE_RDONLY flags or trigger SMMU event */ if (prot & PROT_WRITE) vma->vm_page_prot = __pgprot(((~PTE_RDONLY) & vma->vm_page_prot.pgprot) | PTE_DIRTY); @@ -2644,6 +2651,9 @@ static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa, goto put_mm; } + if (kc && kc->sp_flags & SP_PROT_RO) + prot = PROT_READ; + ret_addr = sp_mmap(mm, spa_file(spa), spa, &populate, prot); if (IS_ERR_VALUE(ret_addr)) { pr_debug("k2u mmap failed %lx\n", ret_addr); @@ -2656,6 +2666,9 @@ static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa, if (prot & PROT_WRITE) vma->vm_page_prot = __pgprot(((~PTE_RDONLY) & vma->vm_page_prot.pgprot) | PTE_DIRTY); + if (kc && kc->sp_flags & SP_PROT_RO) + vma->vm_flags &= ~VM_MAYWRITE; + if (is_vm_hugetlb_page(vma)) { ret = remap_vmalloc_hugepage_range(vma, (void *)kva, 0); if (ret) { @@ -2707,6 +2720,7 @@ static void *sp_make_share_kva_to_task(unsigned long kva, unsigned long size, un struct sp_area *spa; struct spg_proc_stat *stat; unsigned long prot = PROT_READ | PROT_WRITE; + struct sp_k2u_context kc; down_write(&sp_group_sem); stat = sp_init_process_stat(current, current->mm, spg_none); @@ -2725,8 +2739,8 @@ static void *sp_make_share_kva_to_task(unsigned long kva, unsigned long size, un } spa->kva = kva; - - uva = (void *)sp_remap_kva_to_vma(kva, spa, current->mm, prot, NULL); + kc.sp_flags = sp_flags; + uva = (void *)sp_remap_kva_to_vma(kva, spa, current->mm, prot, &kc); __sp_area_drop(spa); if (IS_ERR(uva)) pr_err("remap k2u to task failed %ld\n", PTR_ERR(uva)); -- Gitee From ce9fcee175901940ba7734c898392de94012270c Mon Sep 17 00:00:00 2001 From: Wang Wensheng Date: Tue, 19 Jul 2022 11:36:07 +0800 Subject: [PATCH 358/674] mm/sharepool: Fix using uninitialized sp_flag hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5DS9S CVE: NA ------------------------------------------------- Add the missing initialization for kc.sp_flag in sp_make_share_kva_to_spg(). Or a random value would be used in sp_remap_kva_to_vma(). Signed-off-by: Wang Wensheng Reviewed-by: Weilong Chen Signed-off-by: Zheng Zengkai --- mm/share_pool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/share_pool.c b/mm/share_pool.c index 8be3f75d0449..edba445d4bbf 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -2781,7 +2781,7 @@ static void *sp_make_share_kva_to_spg(unsigned long kva, unsigned long size, } spa->kva = kva; - + kc.sp_flags = sp_flags; list_for_each_entry(spg_node, &spg->procs, proc_node) { mm = spg_node->master->mm; kc.state = K2U_NORMAL; -- Gitee From d722c6b6843a68aaaaa1984e821e8db003df6d5c Mon Sep 17 00:00:00 2001 From: Yuan Can Date: Tue, 19 Jul 2022 11:36:08 +0800 Subject: [PATCH 359/674] mm/sharepool: Avoid NULL pointer dereference in mg_sp_group_add_task ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5DS9S CVE: NA ------------------------------------------------------ create_spg_node() may fail with NULL pointer returened, and in the out_drop_spg_node path, the NULL pointer will be dereferenced in free_spg_node(). Signed-off-by: Yuan Can Reviewed-by: Weilong Chen Signed-off-by: Zheng Zengkai --- mm/share_pool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/share_pool.c b/mm/share_pool.c index edba445d4bbf..caf3e89b41c4 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -1283,7 +1283,7 @@ int mg_sp_group_add_task(int pid, unsigned long prot, int spg_id) node = create_spg_node(mm, prot, spg); if (unlikely(IS_ERR(node))) { ret = PTR_ERR(node); - goto out_drop_spg_node; + goto out_drop_group; } /* per process statistics initialization */ -- Gitee From 6db68bf84902025dd57fdfe94a5085166adade03 Mon Sep 17 00:00:00 2001 From: Zhou Guanghui Date: Tue, 19 Jul 2022 11:36:09 +0800 Subject: [PATCH 360/674] mm/sharepool: Delete single-group mode hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5DS9S CVE: NA ------------------------------------------------- The single-group mode has no application scenario. Therefore, the related branch is deleted. The boot option "enable_sp_multi_group_mode" does not take effect. Signed-off-by: Zhou Guanghui Reviewed-by: Weilong Chen Signed-off-by: Zheng Zengkai --- mm/share_pool.c | 137 +++++++++--------------------------------------- 1 file changed, 25 insertions(+), 112 deletions(-) diff --git a/mm/share_pool.c b/mm/share_pool.c index caf3e89b41c4..076243713f83 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -67,9 +67,6 @@ #define byte2mb(size) ((size) >> 20) #define page2kb(page_num) ((page_num) << (PAGE_SHIFT - 10)) -#define SINGLE_GROUP_MODE 1 -#define MULTI_GROUP_MODE 2 - #define MAX_GROUP_FOR_SYSTEM 50000 #define MAX_GROUP_FOR_TASK 3000 #define MAX_PROC_PER_GROUP 1024 @@ -98,8 +95,6 @@ int sysctl_share_pool_map_lock_enable; int sysctl_sp_perf_k2u; int sysctl_sp_perf_alloc; -static int share_pool_group_mode = SINGLE_GROUP_MODE; - static int system_group_count; static unsigned int sp_device_number; @@ -1068,12 +1063,6 @@ static int mm_add_group_init(struct mm_struct *mm, struct sp_group *spg) struct sp_group_master *master = mm->sp_group_master; bool exist = false; - if (share_pool_group_mode == SINGLE_GROUP_MODE && master && - master->count == 1) { - pr_err_ratelimited("at most one sp group for a task is allowed in single mode\n"); - return -EEXIST; - } - master = sp_init_group_master_locked(mm, &exist); if (IS_ERR(master)) return PTR_ERR(master); @@ -2211,72 +2200,30 @@ static int sp_alloc_prepare(unsigned long size, unsigned long sp_flags, if (sp_flags & SP_HUGEPAGE_ONLY) sp_flags |= SP_HUGEPAGE; - if (share_pool_group_mode == SINGLE_GROUP_MODE) { - spg = __sp_find_spg(current->pid, SPG_ID_DEFAULT); - if (spg) { - if (spg_id != SPG_ID_DEFAULT && spg->id != spg_id) { - sp_group_drop(spg); - return -ENODEV; - } - - /* up_read will be at the end of sp_alloc */ - down_read(&spg->rw_lock); - if (!spg_valid(spg)) { - up_read(&spg->rw_lock); - sp_group_drop(spg); - pr_err_ratelimited("allocation failed, spg is dead\n"); - return -ENODEV; - } - } else { /* alocation pass through scene */ - if (enable_mdc_default_group) { - int ret = 0; - - ret = sp_group_add_task(current->tgid, spg_id); - if (ret < 0) { - pr_err_ratelimited("add group failed in pass through\n"); - return ret; - } - - spg = __sp_find_spg(current->pid, SPG_ID_DEFAULT); - - /* up_read will be at the end of sp_alloc */ - down_read(&spg->rw_lock); - if (!spg_valid(spg)) { - up_read(&spg->rw_lock); - sp_group_drop(spg); - pr_err_ratelimited("pass through allocation failed, spg is dead\n"); - return -ENODEV; - } - } else { - spg = spg_none; - } + if (spg_id != SPG_ID_DEFAULT) { + spg = __sp_find_spg(current->pid, spg_id); + if (!spg) { + pr_err_ratelimited("allocation failed, can't find group\n"); + return -ENODEV; } - } else { - if (spg_id != SPG_ID_DEFAULT) { - spg = __sp_find_spg(current->pid, spg_id); - if (!spg) { - pr_err_ratelimited("allocation failed, can't find group\n"); - return -ENODEV; - } - /* up_read will be at the end of sp_alloc */ - down_read(&spg->rw_lock); - if (!spg_valid(spg)) { - up_read(&spg->rw_lock); - sp_group_drop(spg); - pr_err_ratelimited("allocation failed, spg is dead\n"); - return -ENODEV; - } + /* up_read will be at the end of sp_alloc */ + down_read(&spg->rw_lock); + if (!spg_valid(spg)) { + up_read(&spg->rw_lock); + sp_group_drop(spg); + pr_err_ratelimited("allocation failed, spg is dead\n"); + return -ENODEV; + } - if (!is_process_in_group(spg, current->mm)) { - up_read(&spg->rw_lock); - sp_group_drop(spg); - pr_err_ratelimited("allocation failed, task not in group\n"); - return -ENODEV; - } - } else { /* alocation pass through scene */ - spg = spg_none; + if (!is_process_in_group(spg, current->mm)) { + up_read(&spg->rw_lock); + sp_group_drop(spg); + pr_err_ratelimited("allocation failed, task not in group\n"); + return -ENODEV; } + } else { /* alocation pass through scene */ + spg = spg_none; } if (sp_flags & SP_HUGEPAGE) { @@ -2892,33 +2839,12 @@ static int sp_k2u_prepare(unsigned long kva, unsigned long size, kc->size_aligned = size_aligned; kc->sp_flags = sp_flags; kc->spg_id = spg_id; - kc->to_task = false; - return 0; -} - -static int sp_check_k2task(struct sp_k2u_context *kc) -{ - int ret = 0; - int spg_id = kc->spg_id; - - if (share_pool_group_mode == SINGLE_GROUP_MODE) { - struct sp_group *spg = get_first_group(current->mm); + if (spg_id == SPG_ID_DEFAULT || spg_id == SPG_ID_NONE) + kc->to_task = true; + else + kc->to_task = false; - if (!spg) { - if (spg_id != SPG_ID_NONE && spg_id != SPG_ID_DEFAULT) - ret = -EINVAL; - else - kc->to_task = true; - } else { - if (spg_id != SPG_ID_DEFAULT && spg_id != spg->id) - ret = -EINVAL; - sp_group_drop(spg); - } - } else { - if (spg_id == SPG_ID_DEFAULT || spg_id == SPG_ID_NONE) - kc->to_task = true; - } - return ret; + return 0; } static void *sp_k2u_finish(void *uva, struct sp_k2u_context *kc) @@ -2963,12 +2889,6 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, if (ret) return ERR_PTR(ret); - ret = sp_check_k2task(&kc); - if (ret) { - uva = ERR_PTR(ret); - goto out; - } - if (kc.to_task) uva = sp_make_share_kva_to_task(kc.kva_aligned, kc.size_aligned, kc.sp_flags); else { @@ -3735,13 +3655,6 @@ static int __init enable_share_k2u_to_group(char *s) } __setup("enable_sp_share_k2u_spg", enable_share_k2u_to_group); -static int __init enable_sp_multi_group_mode(char *s) -{ - share_pool_group_mode = MULTI_GROUP_MODE; - return 1; -} -__setup("enable_sp_multi_group_mode", enable_sp_multi_group_mode); - /*** Statistical and maintenance functions ***/ static void free_process_spg_proc_stat(struct sp_proc_stat *proc_stat) -- Gitee From b31f7f2856d50d7b943d12c43852f28ef630f5ba Mon Sep 17 00:00:00 2001 From: Zhou Guanghui Date: Tue, 19 Jul 2022 11:36:10 +0800 Subject: [PATCH 361/674] mm/sharepool: Create global normal and dvpp mapping hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5DS9S CVE: NA ------------------------------------------------- struct sp_mapping is used to manage the address space of a shared pool. During the initialization of the shared pool, normal address spaces are created to allocate the memory of the current shared pool. Signed-off-by: Zhou Guanghui Reviewed-by: Weilong Chen Signed-off-by: Zheng Zengkai --- include/linux/share_pool.h | 18 +++++++++++++ mm/share_pool.c | 52 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index 022e61bb6ce4..654dc8cc2922 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -101,6 +101,17 @@ struct sp_proc_stat { atomic64_t k2u_size; }; +/* + * address space management + */ +struct sp_mapping { + unsigned long flag; + atomic_t user; + unsigned long start[MAX_DEVID]; + unsigned long end[MAX_DEVID]; + struct rb_root area_root; +}; + /* Processes in the same sp_group can share memory. * Memory layout for share pool: * @@ -142,6 +153,8 @@ struct sp_group { atomic_t use_count; /* protect the group internal elements, except spa_list */ struct rw_semaphore rw_lock; + struct sp_mapping *dvpp; + struct sp_mapping *normal; }; /* a per-process(per mm) struct which manages a sp_group_node list */ @@ -155,6 +168,11 @@ struct sp_group_master { struct list_head node_list; struct mm_struct *mm; struct sp_proc_stat *stat; + /* + * Used to apply for the shared pool memory of the current process. + * For example, sp_alloc non-share memory or k2task. + */ + struct sp_group *local; }; /* diff --git a/mm/share_pool.c b/mm/share_pool.c index 076243713f83..6c70ff72b7af 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -130,6 +130,48 @@ static DECLARE_RWSEM(sp_spg_stat_sem); /* for kthread buff_module_guard_work */ static struct sp_proc_stat kthread_stat; +#define SP_MAPPING_DVPP 0x1 +#define SP_MAPPING_NORMAL 0x2 +static struct sp_mapping *sp_mapping_normal; + +static void sp_mapping_range_init(struct sp_mapping *spm) +{ + int i; + + for (i = 0; i < MAX_DEVID; i++) { + if (spm->flag & SP_MAPPING_NORMAL) { + spm->start[i] = MMAP_SHARE_POOL_START; + spm->end[i] = MMAP_SHARE_POOL_16G_START; + continue; + } + + if (!is_sp_dev_addr_enabled(i)) { + spm->start[i] = MMAP_SHARE_POOL_16G_START + + i * MMAP_SHARE_POOL_16G_START; + spm->end[i] = spm->start[i] + MMAP_SHARE_POOL_16G_START; + } else { + spm->start[i] = sp_dev_va_start[i]; + spm->end[i] = spm->start[i] + sp_dev_va_size[i]; + } + } +} + +static struct sp_mapping *sp_mapping_create(unsigned long flag) +{ + struct sp_mapping *spm; + + spm = kzalloc(sizeof(struct sp_mapping), GFP_KERNEL); + if (!spm) + return ERR_PTR(-ENOMEM); + + spm->flag = flag; + sp_mapping_range_init(spm); + atomic_set(&spm->user, 0); + spm->area_root = RB_ROOT; + + return spm; +} + /* The caller must hold sp_group_sem */ static struct sp_group_master *sp_init_group_master_locked( struct mm_struct *mm, bool *exist) @@ -4432,12 +4474,22 @@ static void __init sp_device_number_detect(void) static int __init share_pool_init(void) { + if (!sp_is_enabled()) + return 0; + /* lockless, as init kthread has no sp operation else */ spg_none = create_spg(GROUP_NONE); /* without free spg_none, not a serious problem */ if (IS_ERR(spg_none) || !spg_none) goto fail; + sp_mapping_normal = sp_mapping_create(SP_MAPPING_NORMAL); + if (IS_ERR(sp_mapping_normal)) { + sp_group_drop(spg_none); + goto fail; + } + atomic_inc(&sp_mapping_normal->user); + sp_device_number_detect(); proc_sharepool_init(); -- Gitee From 55f1816a3b2fd31c9a2ca8ee7c01bcc387f1700e Mon Sep 17 00:00:00 2001 From: Wang Wensheng Date: Tue, 19 Jul 2022 11:36:11 +0800 Subject: [PATCH 362/674] mm/sharepool: Fix kabi borken in sp_group_master hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5DS9S CVE: NA ------------------------------------------------- The sp_group_master structure is used only in sharepool subsys and no other drivers use it. Signed-off-by: Wang Wensheng Reviewed-by: Weilong Chen Signed-off-by: Zheng Zengkai --- include/linux/share_pool.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index 654dc8cc2922..b1b81947fa3f 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -10,6 +10,7 @@ #include #include #include +#include #define SP_HUGEPAGE (1 << 0) #define SP_HUGEPAGE_ONLY (1 << 1) @@ -172,7 +173,7 @@ struct sp_group_master { * Used to apply for the shared pool memory of the current process. * For example, sp_alloc non-share memory or k2task. */ - struct sp_group *local; + KABI_EXTEND(struct sp_group *local) }; /* -- Gitee From 85d9c2fe58f7fc974131b36e7c12bb1ae85edd15 Mon Sep 17 00:00:00 2001 From: Zhou Guanghui Date: Tue, 19 Jul 2022 11:36:12 +0800 Subject: [PATCH 363/674] mm/sharepool: Address space management for sp_group hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5DS9S CVE: NA ------------------------------------------------- Separately manage the normal and dvpp address spaces of the sp_group and set the normal and dvpp address spaces of the corresponding groups when adding a group, sp_alloc, and k2task. Signed-off-by: Zhou Guanghui Reviewed-by: Weilong Chen Signed-off-by: Zheng Zengkai --- include/linux/share_pool.h | 6 + mm/share_pool.c | 296 +++++++++++++++++++++++++++++-------- 2 files changed, 237 insertions(+), 65 deletions(-) diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index b1b81947fa3f..289877dde2fb 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -40,6 +40,8 @@ #define SPG_ID_AUTO_MIN 100000 #define SPG_ID_AUTO_MAX 199999 #define SPG_ID_AUTO 200000 /* generate group id automatically */ +#define SPG_ID_LOCAL_MIN 200001 +#define SPG_ID_LOCAL_MAX 299999 #define MAX_DEVID 8 /* the max num of Da-vinci devices */ @@ -111,6 +113,10 @@ struct sp_mapping { unsigned long start[MAX_DEVID]; unsigned long end[MAX_DEVID]; struct rb_root area_root; + + struct rb_node *free_area_cache; + unsigned long cached_hole_size; + unsigned long cached_vstart; }; /* Processes in the same sp_group can share memory. diff --git a/mm/share_pool.c b/mm/share_pool.c index 6c70ff72b7af..c3634d62e1fa 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -147,8 +147,8 @@ static void sp_mapping_range_init(struct sp_mapping *spm) if (!is_sp_dev_addr_enabled(i)) { spm->start[i] = MMAP_SHARE_POOL_16G_START + - i * MMAP_SHARE_POOL_16G_START; - spm->end[i] = spm->start[i] + MMAP_SHARE_POOL_16G_START; + i * MMAP_SHARE_POOL_16G_SIZE; + spm->end[i] = spm->start[i] + MMAP_SHARE_POOL_16G_SIZE; } else { spm->start[i] = sp_dev_va_start[i]; spm->end[i] = spm->start[i] + sp_dev_va_size[i]; @@ -172,10 +172,91 @@ static struct sp_mapping *sp_mapping_create(unsigned long flag) return spm; } +static void sp_mapping_destroy(struct sp_mapping *spm) +{ + kfree(spm); +} + +static void sp_mapping_attach(struct sp_group *spg, struct sp_mapping *spm) +{ + atomic_inc(&spm->user); + if (spm->flag & SP_MAPPING_DVPP) + spg->dvpp = spm; + else if (spm->flag & SP_MAPPING_NORMAL) + spg->normal = spm; +} + +static void sp_mapping_detach(struct sp_group *spg, struct sp_mapping *spm) +{ + if (spm && atomic_dec_and_test(&spm->user)) + sp_mapping_destroy(spm); +} + +/* + * When you set the address space of a group, the normal address space + * is globally unified. When processing the DVPP address space, consider + * the following situations: + * 1. If a process is added to a non-new group, the DVPP address space + * must have been created. If the local group of the process also + * contains the DVPP address space and they are different, this + * scenario is not allowed to avoid address conflict. + * 2. If the DVPP address space does not exist in the local group of the + * process, attach the local group of the process to the DVPP address + * space of the group. + * 3. Add a new group. If the process has applied for the dvpp address + * space (sp_alloc or k2u), attach the new group to the dvpp address + * space of the current process. + * 4. If the process has not applied for the DVPP address space, attach + * the new group and the local group of the current process to the + * newly created DVPP address space. + * + * the caller must hold sp_group_sem + */ +static int sp_mapping_group_setup(struct mm_struct *mm, struct sp_group *spg) +{ + struct sp_group_master *master = mm->sp_group_master; + struct sp_group *local = master->local; + struct sp_mapping *spm; + + if (!list_empty(&spg->procs)) { + /* 1 */ + if (local->dvpp && local->dvpp != spg->dvpp) { + pr_info_ratelimited("Duplicate address space, id=%d\n", + spg->id); + return 0; + } + + /* 2 */ + if (!local->dvpp) { + sp_mapping_attach(local, spg->dvpp); + sp_mapping_attach(local, spg->normal); + } + } else { + /* 4 */ + if (!local->dvpp) { + spm = sp_mapping_create(SP_MAPPING_DVPP); + if (IS_ERR(spm)) + return PTR_ERR(spm); + sp_mapping_attach(local, spm); + sp_mapping_attach(local, sp_mapping_normal); + } + + /* 3 */ + sp_mapping_attach(spg, local->dvpp); + sp_mapping_attach(spg, sp_mapping_normal); + } + + return 0; +} + +static struct sp_group *create_spg(int spg_id); +static void free_new_spg_id(bool new, int spg_id); /* The caller must hold sp_group_sem */ static struct sp_group_master *sp_init_group_master_locked( struct mm_struct *mm, bool *exist) { + int spg_id; + struct sp_group *spg; struct sp_group_master *master = mm->sp_group_master; if (master) { @@ -187,16 +268,92 @@ static struct sp_group_master *sp_init_group_master_locked( if (master == NULL) return ERR_PTR(-ENOMEM); + spg_id = ida_alloc_range(&sp_group_id_ida, SPG_ID_LOCAL_MIN, + SPG_ID_LOCAL_MAX, GFP_ATOMIC); + if (spg_id < 0) { + kfree(master); + pr_err_ratelimited("generate local group id failed %d\n", spg_id); + return ERR_PTR(spg_id); + } + + spg = create_spg(spg_id); + if (IS_ERR(spg)) { + free_new_spg_id(true, spg_id); + kfree(master); + return (struct sp_group_master *)spg; + } + INIT_LIST_HEAD(&master->node_list); master->count = 0; master->stat = NULL; master->mm = mm; + master->local = spg; mm->sp_group_master = master; *exist = false; return master; } +static inline bool is_local_group(int spg_id) +{ + return spg_id >= SPG_ID_LOCAL_MIN && spg_id <= SPG_ID_LOCAL_MAX; +} + +/* + * If the process is added to a group first, the address space of the local + * group of the process must have been set. If the process is not added to + * a group, directly create or attach the process to the corresponding DVPP + * and normal address space. + */ +static int sp_mapping_group_setup_local(struct mm_struct *mm) +{ + struct sp_group_master *master; + struct sp_mapping *spm; + bool exist = false; + + master = sp_init_group_master_locked(mm, &exist); + if (IS_ERR(master)) + return PTR_ERR(master); + + if (master->local->dvpp) + return 0; + + spm = sp_mapping_create(SP_MAPPING_DVPP); + if (IS_ERR(spm)) + return PTR_ERR(spm); + sp_mapping_attach(master->local, spm); + sp_mapping_attach(master->local, sp_mapping_normal); + + return 0; +} + +static struct sp_group *sp_get_local_group(struct mm_struct *mm) +{ + int ret; + struct sp_group_master *master; + + down_read(&sp_group_sem); + master = mm->sp_group_master; + if (master && master->local) { + atomic_inc(&master->local->use_count); + up_read(&sp_group_sem); + return master->local; + } + up_read(&sp_group_sem); + + down_write(&sp_group_sem); + ret = sp_mapping_group_setup_local(mm); + if (ret) { + up_write(&sp_group_sem); + return ERR_PTR(ret); + } + master = mm->sp_group_master; + atomic_inc(&master->local->use_count); + up_write(&sp_group_sem); + + return master->local; +} + static struct sp_proc_stat *sp_get_proc_stat(struct mm_struct *mm) { struct sp_proc_stat *stat; @@ -580,7 +737,7 @@ static void spa_inc_usage(struct sp_area *spa) case SPA_TYPE_K2TASK: spa_stat.k2u_task_num += 1; spa_stat.k2u_task_size += size; - update_spg_stat_k2u(size, true, spg_none->stat); + update_spg_stat_k2u(size, true, spa->spg->stat); break; case SPA_TYPE_K2SPG: spa_stat.k2u_spg_num += 1; @@ -603,7 +760,7 @@ static void spa_inc_usage(struct sp_area *spa) spa_stat.total_num += 1; spa_stat.total_size += size; - if (spa->spg != spg_none) { + if (!is_local_group(spa->spg->id)) { atomic_inc(&sp_overall_stat.spa_total_num); atomic64_add(size, &sp_overall_stat.spa_total_size); } @@ -626,7 +783,7 @@ static void spa_dec_usage(struct sp_area *spa) case SPA_TYPE_K2TASK: spa_stat.k2u_task_num -= 1; spa_stat.k2u_task_size -= size; - update_spg_stat_k2u(size, false, spg_none->stat); + update_spg_stat_k2u(size, false, spa->spg->stat); break; case SPA_TYPE_K2SPG: spa_stat.k2u_spg_num -= 1; @@ -645,7 +802,7 @@ static void spa_dec_usage(struct sp_area *spa) spa_stat.total_num -= 1; spa_stat.total_size -= size; - if (spa->spg != spg_none) { + if (!is_local_group(spa->spg->id)) { atomic_dec(&sp_overall_stat.spa_total_num); atomic64_sub(spa->real_size, &sp_overall_stat.spa_total_size); } @@ -730,7 +887,8 @@ static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa, static void free_sp_group_id(int spg_id) { /* ida operation is protected by an internal spin_lock */ - if (spg_id >= SPG_ID_AUTO_MIN && spg_id <= SPG_ID_AUTO_MAX) + if ((spg_id >= SPG_ID_AUTO_MIN && spg_id <= SPG_ID_AUTO_MAX) || + (spg_id >= SPG_ID_LOCAL_MIN && spg_id <= SPG_ID_LOCAL_MAX)) ida_free(&sp_group_id_ida, spg_id); } @@ -747,8 +905,11 @@ static void free_sp_group_locked(struct sp_group *spg) free_spg_stat(spg->id); idr_remove(&sp_group_idr, spg->id); free_sp_group_id((unsigned int)spg->id); + sp_mapping_detach(spg, spg->dvpp); + sp_mapping_detach(spg, spg->normal); + if (!is_local_group(spg->id)) + system_group_count--; kfree(spg); - system_group_count--; WARN(system_group_count < 0, "unexpected group count\n"); } @@ -981,7 +1142,8 @@ static struct sp_group *create_spg(int spg_id) struct user_struct *user = NULL; int hsize_log = MAP_HUGE_2MB >> MAP_HUGE_SHIFT; - if (unlikely(system_group_count + 1 == MAX_GROUP_FOR_SYSTEM)) { + if (unlikely(system_group_count + 1 == MAX_GROUP_FOR_SYSTEM && + !is_local_group(spg_id))) { pr_err_ratelimited("reach system max group num\n"); return ERR_PTR(-ENOSPC); } @@ -1028,7 +1190,8 @@ static struct sp_group *create_spg(int spg_id) if (ret < 0) goto out_fput_all; - system_group_count++; + if (!is_local_group(spg_id)) + system_group_count++; return spg; out_fput_all: @@ -1311,6 +1474,10 @@ int mg_sp_group_add_task(int pid, unsigned long prot, int spg_id) if (ret) goto out_drop_group; + ret = sp_mapping_group_setup(mm, spg); + if (ret) + goto out_drop_group; + node = create_spg_node(mm, prot, spg); if (unlikely(IS_ERR(node))) { ret = PTR_ERR(node); @@ -1592,7 +1759,6 @@ static void __insert_sp_area(struct sp_area *spa) /* The sp_area cache globals are protected by sp_area_lock */ static struct rb_node *free_sp_area_cache; -static unsigned long cached_hole_size; static unsigned long cached_vstart; /* affected by SP_DVPP and sp_config_dvpp_range() */ /** @@ -1611,11 +1777,12 @@ static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags, { struct sp_area *spa, *first, *err; struct rb_node *n; - unsigned long vstart = MMAP_SHARE_POOL_START; - unsigned long vend = MMAP_SHARE_POOL_16G_START; + unsigned long vstart; + unsigned long vend; unsigned long addr; unsigned long size_align = ALIGN(size, PMD_SIZE); /* va aligned to 2M */ int device_id, node_id; + struct sp_mapping *mapping; device_id = sp_flags_device_id(flags); node_id = flags & SP_SPEC_NODE_ID ? sp_flags_node_id(flags) : device_id; @@ -1625,17 +1792,13 @@ static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags, return ERR_PTR(-EINVAL); } - if ((flags & SP_DVPP)) { - if (!is_sp_dev_addr_enabled(device_id)) { - vstart = MMAP_SHARE_POOL_16G_START + - device_id * MMAP_SHARE_POOL_16G_SIZE; - vend = vstart + MMAP_SHARE_POOL_16G_SIZE; - } else { - vstart = sp_dev_va_start[device_id]; - vend = vstart + sp_dev_va_size[device_id]; - } - } + if (flags & SP_DVPP) + mapping = spg->dvpp; + else + mapping = spg->normal; + vstart = mapping->start[device_id]; + vend = mapping->end[device_id]; spa = __kmalloc_node(sizeof(struct sp_area), GFP_KERNEL, node_id); if (unlikely(!spa)) return ERR_PTR(-ENOMEM); @@ -1651,18 +1814,18 @@ static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags, * Note that sp_free_area may update free_sp_area_cache * without updating cached_hole_size. */ - if (!free_sp_area_cache || size_align < cached_hole_size || - vstart != cached_vstart) { - cached_hole_size = 0; - free_sp_area_cache = NULL; + if (!mapping->free_area_cache || size_align < mapping->cached_hole_size || + vstart != mapping->cached_vstart) { + mapping->cached_hole_size = 0; + mapping->free_area_cache = NULL; } /* record if we encounter less permissive parameters */ - cached_vstart = vstart; + mapping->cached_vstart = vstart; /* find starting point for our search */ - if (free_sp_area_cache) { - first = rb_entry(free_sp_area_cache, struct sp_area, rb_node); + if (mapping->free_area_cache) { + first = rb_entry(mapping->free_area_cache, struct sp_area, rb_node); addr = first->va_end; if (addr + size_align < addr) { err = ERR_PTR(-EOVERFLOW); @@ -1675,7 +1838,7 @@ static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags, goto error; } - n = sp_area_root.rb_node; + n = mapping->area_root.rb_node; first = NULL; while (n) { @@ -1697,8 +1860,8 @@ static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags, /* from the starting point, traverse areas until a suitable hole is found */ while (addr + size_align > first->va_start && addr + size_align <= vend) { - if (addr + cached_hole_size < first->va_start) - cached_hole_size = first->va_start - addr; + if (addr + mapping->cached_hole_size < first->va_start) + mapping->cached_hole_size = first->va_start - addr; addr = first->va_end; if (addr + size_align < addr) { err = ERR_PTR(-EOVERFLOW); @@ -1736,9 +1899,8 @@ static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags, spa_inc_usage(spa); __insert_sp_area(spa); - free_sp_area_cache = &spa->rb_node; - if (spa->spg != spg_none) - list_add_tail(&spa->link, &spg->spa_list); + mapping->free_area_cache = &spa->rb_node; + list_add_tail(&spa->link, &spg->spa_list); spin_unlock(&sp_area_lock); @@ -1829,8 +1991,7 @@ static void sp_free_area(struct sp_area *spa) pr_debug("clear spa->kva %ld is not valid\n", spa->kva); spa_dec_usage(spa); - if (spa->spg != spg_none) - list_del(&spa->link); + list_del(&spa->link); rb_erase(&spa->rb_node, &sp_area_root); RB_CLEAR_NODE(&spa->rb_node); @@ -1990,7 +2151,7 @@ static void sp_fallocate(struct sp_area *spa) static void sp_free_unmap_fallocate(struct sp_area *spa) { - if (spa->spg != spg_none) { + if (!is_local_group(spa->spg->id)) { down_read(&spa->spg->rw_lock); __sp_free(spa->spg, spa->va_start, spa_size(spa), NULL); sp_fallocate(spa); @@ -2195,7 +2356,6 @@ static void trace_sp_alloc_begin(struct sp_alloc_context *ac) static void trace_sp_alloc_finish(struct sp_alloc_context *ac, unsigned long va) { unsigned long cost; - bool is_pass_through = ac->spg == spg_none ? true : false; if (!sysctl_sp_perf_alloc) return; @@ -2207,7 +2367,8 @@ static void trace_sp_alloc_finish(struct sp_alloc_context *ac, unsigned long va) if (cost >= (unsigned long)sysctl_sp_perf_alloc) { pr_err("Task %s(%d/%d) sp_alloc returns 0x%lx consumes %luus, size is %luKB, size_aligned is %luKB, sp_flags is %lx, pass through is %d\n", current->comm, current->tgid, current->pid, - va, cost, byte2kb(ac->size), byte2kb(ac->size_aligned), ac->sp_flags, is_pass_through); + va, cost, byte2kb(ac->size), byte2kb(ac->size_aligned), ac->sp_flags, + is_local_group(ac->spg->id)); } } @@ -2265,7 +2426,9 @@ static int sp_alloc_prepare(unsigned long size, unsigned long sp_flags, return -ENODEV; } } else { /* alocation pass through scene */ - spg = spg_none; + spg = sp_get_local_group(current->mm); + if (IS_ERR(spg)) + return PTR_ERR(spg); } if (sp_flags & SP_HUGEPAGE) { @@ -2288,7 +2451,7 @@ static int sp_alloc_prepare(unsigned long size, unsigned long sp_flags, static void sp_alloc_unmap(struct mm_struct *mm, struct sp_area *spa, struct sp_group_node *spg_node) { - if (spa->spg != spg_none) + if (!is_local_group(spa->spg->id)) __sp_free(spa->spg, spa->va_start, spa->real_size, mm); } @@ -2353,7 +2516,7 @@ static int sp_alloc_mmap(struct mm_struct *mm, struct sp_area *spa, return ret; unmap: - if (spa->spg != spg_none) + if (!is_local_group(spa->spg->id)) sp_alloc_unmap(list_next_entry(spg_node, proc_node)->master->mm, spa, spg_node); else sp_munmap(mm, spa->va_start, spa->real_size); @@ -2456,7 +2619,7 @@ static int __sp_alloc_mmap_populate(struct mm_struct *mm, struct sp_area *spa, ret = sp_alloc_populate(mm, spa, ac); if (ret) { err: - if (spa->spg != spg_none) + if (!is_local_group(spa->spg->id)) sp_alloc_unmap(list_next_entry(spg_node, proc_node)->master->mm, spa, spg_node); else sp_munmap(mm, spa->va_start, spa->real_size); @@ -2482,7 +2645,7 @@ static int sp_alloc_mmap_populate(struct sp_area *spa, struct mm_struct *mm; struct sp_group_node *spg_node; - if (spa->spg == spg_none) { + if (is_local_group(spa->spg->id)) { ret = __sp_alloc_mmap_populate(current->mm, spa, NULL, ac); } else { /* create mapping for each process in the group */ @@ -2506,10 +2669,9 @@ static void sp_alloc_finish(int result, struct sp_area *spa, struct sp_alloc_context *ac) { struct sp_group *spg = ac->spg; - bool is_pass_through = spg == spg_none ? true : false; - /* match sp_alloc_check_prepare */ - if (!is_pass_through) + /* match sp_alloc_prepare */ + if (!is_local_group(spg->id)) up_read(&spg->rw_lock); if (!result) @@ -2521,9 +2683,7 @@ static void sp_alloc_finish(int result, struct sp_area *spa, trace_sp_alloc_finish(ac, spa->va_start); } - if (!is_pass_through) - sp_group_drop(spg); - + sp_group_drop(spg); sp_dump_stack(); sp_try_to_compact(); } @@ -2705,22 +2865,33 @@ static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa, */ static void *sp_make_share_kva_to_task(unsigned long kva, unsigned long size, unsigned long sp_flags) { + int ret; void *uva; struct sp_area *spa; struct spg_proc_stat *stat; unsigned long prot = PROT_READ | PROT_WRITE; struct sp_k2u_context kc; + struct sp_group *spg; down_write(&sp_group_sem); - stat = sp_init_process_stat(current, current->mm, spg_none); - up_write(&sp_group_sem); + ret = sp_mapping_group_setup_local(current->mm); + if (ret) { + up_write(&sp_group_sem); + pr_err_ratelimited("k2u_task init local mapping failed %d\n", ret); + return ERR_PTR(ret); + } + + spg = current->mm->sp_group_master->local; + stat = sp_init_process_stat(current, current->mm, spg); if (IS_ERR(stat)) { + up_write(&sp_group_sem); pr_err_ratelimited("k2u_task init process stat failed %lx\n", PTR_ERR(stat)); return stat; } + up_write(&sp_group_sem); - spa = sp_alloc_area(size, sp_flags, spg_none, SPA_TYPE_K2TASK, current->tgid); + spa = sp_alloc_area(size, sp_flags, spg, SPA_TYPE_K2TASK, current->tgid); if (IS_ERR(spa)) { pr_err_ratelimited("alloc spa failed in k2u_task (potential no enough virtual memory when -75): %ld\n", PTR_ERR(spa)); @@ -3916,7 +4087,7 @@ static void rb_spa_stat_show(struct seq_file *seq) atomic_inc(&spa->use_count); spin_unlock(&sp_area_lock); - if (spa->spg == spg_none) /* k2u to task */ + if (is_local_group(spa->spg->id)) /* k2u to task */ seq_printf(seq, "%-10s ", "None"); else { down_read(&spa->spg->rw_lock); @@ -4446,6 +4617,9 @@ void sp_group_post_exit(struct mm_struct *mm) kfree(spg_node); } up_write(&sp_group_sem); + + if (master->local) + sp_group_drop(master->local); kfree(master); } @@ -4477,17 +4651,9 @@ static int __init share_pool_init(void) if (!sp_is_enabled()) return 0; - /* lockless, as init kthread has no sp operation else */ - spg_none = create_spg(GROUP_NONE); - /* without free spg_none, not a serious problem */ - if (IS_ERR(spg_none) || !spg_none) - goto fail; - sp_mapping_normal = sp_mapping_create(SP_MAPPING_NORMAL); - if (IS_ERR(sp_mapping_normal)) { - sp_group_drop(spg_none); + if (IS_ERR(sp_mapping_normal)) goto fail; - } atomic_inc(&sp_mapping_normal->user); sp_device_number_detect(); -- Gitee From 561d77b6ceb86ef4b7ec0d0d393657b3a3feea15 Mon Sep 17 00:00:00 2001 From: Zhou Guanghui Date: Tue, 19 Jul 2022 11:36:13 +0800 Subject: [PATCH 364/674] mm/sharepool: Add an interface to obtain an id hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5DS9S CVE: NA ------------------------------------------------- The DVPP address space is per process or per sharing group. During sp_free and unshare, you need to know which address space the current address belongs to. Signed-off-by: Zhou Guanghui Reviewed-by: Weilong Chen Signed-off-by: Zheng Zengkai --- include/linux/share_pool.h | 12 ++++++++++++ mm/share_pool.c | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+) diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index 289877dde2fb..077816a6fb3b 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -284,6 +284,9 @@ extern bool mg_is_sharepool_addr(unsigned long addr); extern int mg_sp_group_add_task(int pid, unsigned long prot, int spg_id); extern int sp_group_add_task(int pid, int spg_id); +extern int sp_id_of_current(void); +extern int mg_sp_id_of_current(void); + extern void sp_area_drop(struct vm_area_struct *vma); extern int sp_group_exit(struct mm_struct *mm); extern void sp_group_post_exit(struct mm_struct *mm); @@ -431,6 +434,15 @@ static inline int mg_sp_unshare(unsigned long va, unsigned long size) return -EPERM; } +static inline int sp_id_of_current(void) +{ + return -EPERM; +} + +static inline int mg_sp_id_of_current(void) +{ + return -EPERM; +} static inline void sp_init_mm(struct mm_struct *mm) { diff --git a/mm/share_pool.c b/mm/share_pool.c index c3634d62e1fa..39ecde90c01b 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -1734,6 +1734,43 @@ int sp_group_del_task(int pid, int spg_id) } EXPORT_SYMBOL_GPL(sp_group_del_task); +int sp_id_of_current(void) +{ + int ret, spg_id; + struct sp_group_master *master; + + if (current->flags & PF_KTHREAD || !current->mm) + return -EINVAL; + + down_read(&sp_group_sem); + master = current->mm->sp_group_master; + if (master && master->local) { + spg_id = master->local->id; + up_read(&sp_group_sem); + return spg_id; + } + up_read(&sp_group_sem); + + down_write(&sp_group_sem); + ret = sp_mapping_group_setup_local(current->mm); + if (ret) { + up_write(&sp_group_sem); + return ret; + } + master = current->mm->sp_group_master; + spg_id = master->local->id; + up_write(&sp_group_sem); + + return spg_id; +} +EXPORT_SYMBOL_GPL(sp_id_of_current); + +int mg_sp_id_of_current(void) +{ + return sp_id_of_current(); +} +EXPORT_SYMBOL_GPL(mg_sp_id_of_current); + /* the caller must hold sp_area_lock */ static void __insert_sp_area(struct sp_area *spa) { -- Gitee From 7de67bbc1aa0cf7b7ca7408505ae5c174fa96abc Mon Sep 17 00:00:00 2001 From: Zhou Guanghui Date: Tue, 19 Jul 2022 11:36:14 +0800 Subject: [PATCH 365/674] mm/sharepool: Release the sp addr based on the id hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5DS9S CVE: NA ------------------------------------------------- The address space of the DVPP is managed by group. When releasing the shared pool memory, you need to find the corresponding address space based on the ID. Signed-off-by: Zhou Guanghui Reviewed-by: Weilong Chen Signed-off-by: Zheng Zengkai --- include/linux/share_pool.h | 12 +- mm/share_pool.c | 235 ++++++++++++++++++------------------- 2 files changed, 122 insertions(+), 125 deletions(-) diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index 077816a6fb3b..8eb964230fd4 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -250,8 +250,8 @@ extern int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns, extern void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id); extern void *mg_sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id); -extern int sp_free(unsigned long addr); -extern int mg_sp_free(unsigned long addr); +extern int sp_free(unsigned long addr, int id); +extern int mg_sp_free(unsigned long addr, int id); extern void *sp_make_share_k2u(unsigned long kva, unsigned long size, unsigned long sp_flags, int pid, int spg_id); @@ -262,7 +262,7 @@ extern void *sp_make_share_u2k(unsigned long uva, unsigned long size, int pid); extern void *mg_sp_make_share_u2k(unsigned long uva, unsigned long size, int pid); extern int sp_unshare(unsigned long va, unsigned long size, int pid, int spg_id); -extern int mg_sp_unshare(unsigned long va, unsigned long size); +extern int mg_sp_unshare(unsigned long va, unsigned long size, int id); extern int sp_walk_page_range(unsigned long uva, unsigned long size, struct task_struct *tsk, struct sp_walk_data *sp_walk_data); @@ -392,12 +392,12 @@ static inline void *mg_sp_alloc(unsigned long size, unsigned long sp_flags, int return NULL; } -static inline int sp_free(unsigned long addr) +static inline int sp_free(unsigned long addr, int id) { return -EPERM; } -static inline int mg_sp_free(unsigned long addr) +static inline int mg_sp_free(unsigned long addr, int id) { return -EPERM; } @@ -429,7 +429,7 @@ static inline int sp_unshare(unsigned long va, unsigned long size, int pid, int return -EPERM; } -static inline int mg_sp_unshare(unsigned long va, unsigned long size) +static inline int mg_sp_unshare(unsigned long va, unsigned long size, int id) { return -EPERM; } diff --git a/mm/share_pool.c b/mm/share_pool.c index 39ecde90c01b..45cdb77a4299 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -642,12 +642,6 @@ static void free_spg_stat(int spg_id) kfree(stat); } -/* - * Group '0' for k2u_task and pass through. No process will be actually - * added to. - */ -static struct sp_group *spg_none; - /* statistics of all sp area, protected by sp_area_lock */ struct sp_spa_stat { unsigned int total_num; @@ -944,26 +938,6 @@ static int get_task(int pid, struct task_struct **task) return 0; } -static struct sp_group *get_first_group(struct mm_struct *mm) -{ - struct sp_group *spg = NULL; - struct sp_group_master *master = mm->sp_group_master; - - if (master && master->count >= 1) { - struct sp_group_node *spg_node = NULL; - - spg_node = list_first_entry(&master->node_list, - struct sp_group_node, group_node); - spg = spg_node->spg; - - /* don't revive a dead group */ - if (!spg || !atomic_inc_not_zero(&spg->use_count)) - spg = NULL; - } - - return spg; -} - /* * the caller must: * 1. hold spg->rw_lock @@ -988,35 +962,27 @@ static struct sp_group *__sp_find_spg_locked(int pid, int spg_id) struct task_struct *tsk = NULL; int ret = 0; - ret = get_task(pid, &tsk); - if (ret) - return NULL; - if (spg_id == SPG_ID_DEFAULT) { - /* - * Once we encounter a concurrency problem here. - * To fix it, we believe get_task_mm() and mmput() is too - * heavy because we just get the pointer of sp_group. - */ + ret = get_task(pid, &tsk); + if (ret) + return NULL; + task_lock(tsk); if (tsk->mm == NULL) spg = NULL; - else - spg = get_first_group(tsk->mm); + else if (tsk->mm->sp_group_master) + spg = tsk->mm->sp_group_master->local; task_unlock(tsk); + + put_task_struct(tsk); } else { spg = idr_find(&sp_group_idr, spg_id); - /* don't revive a dead group */ - if (!spg || !atomic_inc_not_zero(&spg->use_count)) - goto fail; } - put_task_struct(tsk); - return spg; + if (!spg || !atomic_inc_not_zero(&spg->use_count)) + return NULL; -fail: - put_task_struct(tsk); - return NULL; + return spg; } static struct sp_group *__sp_find_spg(int pid, int spg_id) @@ -1772,9 +1738,9 @@ int mg_sp_id_of_current(void) EXPORT_SYMBOL_GPL(mg_sp_id_of_current); /* the caller must hold sp_area_lock */ -static void __insert_sp_area(struct sp_area *spa) +static void __insert_sp_area(struct sp_mapping *spm, struct sp_area *spa) { - struct rb_node **p = &sp_area_root.rb_node; + struct rb_node **p = &spm->area_root.rb_node; struct rb_node *parent = NULL; while (*p) { @@ -1791,13 +1757,9 @@ static void __insert_sp_area(struct sp_area *spa) } rb_link_node(&spa->rb_node, parent, p); - rb_insert_color(&spa->rb_node, &sp_area_root); + rb_insert_color(&spa->rb_node, &spm->area_root); } -/* The sp_area cache globals are protected by sp_area_lock */ -static struct rb_node *free_sp_area_cache; -static unsigned long cached_vstart; /* affected by SP_DVPP and sp_config_dvpp_range() */ - /** * sp_alloc_area() - Allocate a region of VA from the share pool. * @size: the size of VA to allocate. @@ -1845,10 +1807,10 @@ static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags, /* * Invalidate cache if we have more permissive parameters. * cached_hole_size notes the largest hole noticed _below_ - * the sp_area cached in free_sp_area_cache: if size fits + * the sp_area cached in free_area_cache: if size fits * into that hole, we want to scan from vstart to reuse - * the hole instead of allocating above free_sp_area_cache. - * Note that sp_free_area may update free_sp_area_cache + * the hole instead of allocating above free_area_cache. + * Note that sp_free_area may update free_area_cache * without updating cached_hole_size. */ if (!mapping->free_area_cache || size_align < mapping->cached_hole_size || @@ -1935,7 +1897,7 @@ static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags, spa->device_id = device_id; spa_inc_usage(spa); - __insert_sp_area(spa); + __insert_sp_area(mapping, spa); mapping->free_area_cache = &spa->rb_node; list_add_tail(&spa->link, &spg->spa_list); @@ -1950,9 +1912,15 @@ static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags, } /* the caller should hold sp_area_lock */ -static struct sp_area *__find_sp_area_locked(unsigned long addr) +static struct sp_area *__find_sp_area_locked(struct sp_group *spg, + unsigned long addr) { - struct rb_node *n = sp_area_root.rb_node; + struct rb_node *n; + + if (addr >= MMAP_SHARE_POOL_START && addr < MMAP_SHARE_POOL_16G_START) + n = spg->normal->area_root.rb_node; + else + n = spg->dvpp->area_root.rb_node; while (n) { struct sp_area *spa; @@ -1970,12 +1938,12 @@ static struct sp_area *__find_sp_area_locked(unsigned long addr) return NULL; } -static struct sp_area *__find_sp_area(unsigned long addr) +static struct sp_area *__find_sp_area(struct sp_group *spg, unsigned long addr) { struct sp_area *n; spin_lock(&sp_area_lock); - n = __find_sp_area_locked(addr); + n = __find_sp_area_locked(spg, addr); if (n) atomic_inc(&n->use_count); spin_unlock(&sp_area_lock); @@ -2000,22 +1968,30 @@ static bool vmalloc_area_clr_flag(unsigned long kva, unsigned long flags) */ static void sp_free_area(struct sp_area *spa) { + unsigned long addr = spa->va_start; + struct sp_mapping *spm; + lockdep_assert_held(&sp_area_lock); - if (free_sp_area_cache) { + if (addr >= MMAP_SHARE_POOL_START && addr < MMAP_SHARE_POOL_16G_START) + spm = spa->spg->normal; + else + spm = spa->spg->dvpp; + + if (spm->free_area_cache) { struct sp_area *cache; - cache = rb_entry(free_sp_area_cache, struct sp_area, rb_node); + cache = rb_entry(spm->free_area_cache, struct sp_area, rb_node); if (spa->va_start <= cache->va_start) { - free_sp_area_cache = rb_prev(&spa->rb_node); + spm->free_area_cache = rb_prev(&spa->rb_node); /* * the new cache node may be changed to another region, * i.e. from DVPP region to normal region */ - if (free_sp_area_cache) { - cache = rb_entry(free_sp_area_cache, + if (spm->free_area_cache) { + cache = rb_entry(spm->free_area_cache, struct sp_area, rb_node); - cached_vstart = cache->region_vstart; + spm->cached_vstart = cache->region_vstart; } /* * We don't try to update cached_hole_size, @@ -2030,7 +2006,7 @@ static void sp_free_area(struct sp_area *spa) spa_dec_usage(spa); list_del(&spa->link); - rb_erase(&spa->rb_node, &sp_area_root); + rb_erase(&spa->rb_node, &spm->area_root); RB_CLEAR_NODE(&spa->rb_node); kfree(spa); } @@ -2072,7 +2048,7 @@ void sp_area_drop(struct vm_area_struct *vma) * an atomic operation. */ spin_lock(&sp_area_lock); - spa = __find_sp_area_locked(vma->vm_start); + spa = __find_sp_area_locked(vma->vm_mm->sp_group_master->local, vma->vm_start); __sp_area_drop_locked(spa); spin_unlock(&sp_area_lock); } @@ -2204,7 +2180,7 @@ static int sp_check_caller_permission(struct sp_group *spg, struct mm_struct *mm int ret = 0; down_read(&spg->rw_lock); - if (!is_process_in_group(spg, mm)) + if (!is_local_group(spg->id) && !is_process_in_group(spg, mm)) ret = -EPERM; up_read(&spg->rw_lock); return ret; @@ -2217,6 +2193,7 @@ struct sp_free_context { unsigned long addr; struct sp_area *spa; int state; + int spg_id; }; /* when success, __sp_area_drop(spa) should be used */ @@ -2225,10 +2202,18 @@ static int sp_free_get_spa(struct sp_free_context *fc) int ret = 0; unsigned long addr = fc->addr; struct sp_area *spa; + struct sp_group *spg; + + spg = __sp_find_spg(current->tgid, fc->spg_id); + if (!spg) { + pr_debug("sp free get group failed %d\n", fc->spg_id); + return -EINVAL; + } fc->state = FREE_CONT; - spa = __find_sp_area(addr); + spa = __find_sp_area(spg, addr); + sp_group_drop(spg); if (!spa) { pr_debug("sp free invalid input addr %lx\n", addr); return -EINVAL; @@ -2241,46 +2226,37 @@ static int sp_free_get_spa(struct sp_free_context *fc) } fc->spa = spa; - if (spa->spg != spg_none) { - /* - * Access control: an sp addr can only be freed by - * 1. another task in the same spg - * 2. a kthread - * - * a passthrough addr can only be freed by the applier process - */ - if (!current->mm) - goto check_spa; + if (!current->mm) + goto check_spa; - ret = sp_check_caller_permission(spa->spg, current->mm); - if (ret < 0) - goto drop_spa; + ret = sp_check_caller_permission(spa->spg, current->mm); + if (ret < 0) + goto drop_spa; check_spa: - down_write(&spa->spg->rw_lock); - if (!spg_valid(spa->spg)) { - fc->state = FREE_END; - up_write(&spa->spg->rw_lock); - goto drop_spa; - /* we must return success(0) in this situation */ - } - /* the life cycle of spa has a direct relation with sp group */ - if (unlikely(spa->is_dead)) { - up_write(&spa->spg->rw_lock); - pr_err_ratelimited("unexpected double sp free\n"); - dump_stack(); - ret = -EINVAL; - goto drop_spa; - } - spa->is_dead = true; - up_write(&spa->spg->rw_lock); + if (is_local_group(spa->spg->id) && (current->tgid != spa->applier)) { + ret = -EPERM; + goto drop_spa; + } - } else { - if (current->tgid != spa->applier) { - ret = -EPERM; - goto drop_spa; - } + down_write(&spa->spg->rw_lock); + if (!spg_valid(spa->spg)) { + fc->state = FREE_END; + up_write(&spa->spg->rw_lock); + goto drop_spa; + /* we must return success(0) in this situation */ + } + /* the life cycle of spa has a direct relation with sp group */ + if (unlikely(spa->is_dead)) { + up_write(&spa->spg->rw_lock); + pr_err_ratelimited("unexpected double sp free\n"); + dump_stack(); + ret = -EINVAL; + goto drop_spa; } + spa->is_dead = true; + up_write(&spa->spg->rw_lock); + return 0; drop_spa: @@ -2291,21 +2267,26 @@ static int sp_free_get_spa(struct sp_free_context *fc) /** * sp_free() - Free the memory allocated by sp_alloc(). * @addr: the starting VA of the memory. + * @id: Address space identifier, which is used to distinguish the addr. * * Return: * * 0 - success. * * -EINVAL - the memory can't be found or was not allocted by share pool. * * -EPERM - the caller has no permision to free the memory. */ -int sp_free(unsigned long addr) +int sp_free(unsigned long addr, int id) { int ret = 0; struct sp_free_context fc = { .addr = addr, + .spg_id = id, }; check_interrupt_context(); + if (current->flags & PF_KTHREAD) + return -EINVAL; + ret = sp_free_get_spa(&fc); if (ret || fc.state == FREE_END) goto out; @@ -2326,9 +2307,9 @@ int sp_free(unsigned long addr) } EXPORT_SYMBOL_GPL(sp_free); -int mg_sp_free(unsigned long addr) +int mg_sp_free(unsigned long addr, int id) { - return sp_free(addr); + return sp_free(addr, id); } EXPORT_SYMBOL_GPL(mg_sp_free); @@ -2422,6 +2403,11 @@ static int sp_alloc_prepare(unsigned long size, unsigned long sp_flags, if (enable_mdc_default_group) spg_id = mdc_default_group_id; + if (current->flags & PF_KTHREAD) { + pr_err_ratelimited("allocation failed, task is kthread\n"); + return -EINVAL; + } + if (unlikely(!size || (size >> PAGE_SHIFT) > totalram_pages())) { pr_err_ratelimited("allocation failed, invalid size %lu\n", size); return -EINVAL; @@ -2462,7 +2448,7 @@ static int sp_alloc_prepare(unsigned long size, unsigned long sp_flags, pr_err_ratelimited("allocation failed, task not in group\n"); return -ENODEV; } - } else { /* alocation pass through scene */ + } else { /* allocation pass through scene */ spg = sp_get_local_group(current->mm); if (IS_ERR(spg)) return PTR_ERR(spg); @@ -3493,7 +3479,7 @@ EXPORT_SYMBOL_GPL(mg_sp_make_share_u2k); * * This also means we must trust DVPP channel destroy and guard worker code. */ -static int sp_unshare_uva(unsigned long uva, unsigned long size) +static int sp_unshare_uva(unsigned long uva, unsigned long size, int group_id) { int ret = 0; struct mm_struct *mm; @@ -3501,14 +3487,21 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size) unsigned long uva_aligned; unsigned long size_aligned; unsigned int page_size; + struct sp_group *spg; + + spg = __sp_find_spg(current->tgid, group_id); + if (!spg) { + pr_debug("sp unshare find group failed %d\n", group_id); + return -EINVAL; + } /* * at first we guess it's a hugepage addr * we can tolerate at most PMD_SIZE or PAGE_SIZE which is matched in k2u */ - spa = __find_sp_area(ALIGN_DOWN(uva, PMD_SIZE)); + spa = __find_sp_area(spg, ALIGN_DOWN(uva, PMD_SIZE)); if (!spa) { - spa = __find_sp_area(ALIGN_DOWN(uva, PAGE_SIZE)); + spa = __find_sp_area(spg, ALIGN_DOWN(uva, PAGE_SIZE)); if (!spa) { ret = -EINVAL; pr_debug("invalid input uva %lx in unshare uva\n", (unsigned long)uva); @@ -3639,6 +3632,7 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size) out_drop_area: __sp_area_drop(spa); out: + sp_group_drop(spg); return ret; } @@ -3702,9 +3696,12 @@ int sp_unshare(unsigned long va, unsigned long size, int pid, int spg_id) check_interrupt_context(); + if (current->flags & PF_KTHREAD) + return -EINVAL; + if (va < TASK_SIZE) { /* user address */ - ret = sp_unshare_uva(va, size); + ret = sp_unshare_uva(va, size, spg_id); } else if (va >= PAGE_OFFSET) { /* kernel address */ ret = sp_unshare_kva(va, size); @@ -3718,9 +3715,9 @@ int sp_unshare(unsigned long va, unsigned long size, int pid, int spg_id) } EXPORT_SYMBOL_GPL(sp_unshare); -int mg_sp_unshare(unsigned long va, unsigned long size) +int mg_sp_unshare(unsigned long va, unsigned long size, int id) { - return sp_unshare(va, size, 0, 0); + return sp_unshare(va, size, 0, id); } EXPORT_SYMBOL_GPL(mg_sp_unshare); @@ -3880,8 +3877,8 @@ int sp_node_id(struct vm_area_struct *vma) if (!sp_is_enabled()) return node_id; - if (vma) { - spa = __find_sp_area(vma->vm_start); + if (vma && vma->vm_flags & VM_SHARE_POOL) { + spa = __find_sp_area(vma->vm_mm->sp_group_master->local, vma->vm_start); if (spa) { node_id = spa->node_id; __sp_area_drop(spa); @@ -4047,7 +4044,7 @@ static void print_process_prot(struct seq_file *seq, unsigned long prot) seq_puts(seq, "R"); else if (prot == (PROT_READ | PROT_WRITE)) seq_puts(seq, "RW"); - else /* e.g. spg_none */ + else seq_puts(seq, "-"); } @@ -4448,7 +4445,7 @@ vm_fault_t sharepool_no_page(struct mm_struct *mm, int node_id; struct sp_area *spa; - spa = __find_sp_area(vma->vm_start); + spa = __find_sp_area(mm->sp_group_master->local, vma->vm_start); if (!spa) { pr_err("share pool: vma is invalid, not from sp mmap\n"); return ret; -- Gitee From c70421912716bd11b5bd2f18e3f537e7e20935a1 Mon Sep 17 00:00:00 2001 From: Zhou Guanghui Date: Tue, 19 Jul 2022 11:36:15 +0800 Subject: [PATCH 366/674] mm/sharepool: Share pool statistics adaption hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5DS9S CVE: NA ------------------------------------------------- The management of the address space is adjusted, and the statistical data processing of the shared pool needs to be adapted. Signed-off-by: Zhou Guanghui Signed-off-by: Zhang Jian Reviewed-by: Weilong Chen Signed-off-by: Zheng Zengkai --- mm/share_pool.c | 69 ++++++++++++++++++++++++++++--------------------- 1 file changed, 40 insertions(+), 29 deletions(-) diff --git a/mm/share_pool.c b/mm/share_pool.c index 45cdb77a4299..e673b05bda5a 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -699,7 +699,6 @@ struct sp_area { int device_id; }; static DEFINE_SPINLOCK(sp_area_lock); -static struct rb_root sp_area_root = RB_ROOT; static unsigned long spa_size(struct sp_area *spa) { @@ -4106,14 +4105,13 @@ int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns, return 0; } -static void rb_spa_stat_show(struct seq_file *seq) +static void spa_stat_of_mapping_show(struct seq_file *seq, struct sp_mapping *spm) { struct rb_node *node; struct sp_area *spa, *prev = NULL; spin_lock(&sp_area_lock); - - for (node = rb_first(&sp_area_root); node; node = rb_next(node)) { + for (node = rb_first(&spm->area_root); node; node = rb_next(node)) { __sp_area_drop_locked(prev); spa = rb_entry(node, struct sp_area, rb_node); @@ -4121,16 +4119,12 @@ static void rb_spa_stat_show(struct seq_file *seq) atomic_inc(&spa->use_count); spin_unlock(&sp_area_lock); - if (is_local_group(spa->spg->id)) /* k2u to task */ - seq_printf(seq, "%-10s ", "None"); - else { - down_read(&spa->spg->rw_lock); - if (spg_valid(spa->spg)) /* k2u to group */ - seq_printf(seq, "%-10d ", spa->spg->id); - else /* spg is dead */ - seq_printf(seq, "%-10s ", "Dead"); - up_read(&spa->spg->rw_lock); - } + down_read(&spa->spg->rw_lock); + if (spg_valid(spa->spg)) /* k2u to group */ + seq_printf(seq, "%-10d ", spa->spg->id); + else /* spg is dead */ + seq_printf(seq, "%-10s ", "Dead"); + up_read(&spa->spg->rw_lock); seq_printf(seq, "%2s%-14lx %2s%-14lx %-10ld ", "0x", spa->va_start, @@ -4166,6 +4160,30 @@ static void rb_spa_stat_show(struct seq_file *seq) spin_unlock(&sp_area_lock); } +static void spa_normal_stat_show(struct seq_file *seq) +{ + spa_stat_of_mapping_show(seq, sp_mapping_normal); +} + +static int idr_spg_dvpp_stat_show_cb(int id, void *p, void *data) +{ + struct sp_group *spg = p; + struct seq_file *seq = data; + + if (!is_local_group(spg->id) || atomic_read(&spg->dvpp->user) == 1) + spa_stat_of_mapping_show(seq, spg->dvpp); + + return 0; +} + +static void spa_dvpp_stat_show(struct seq_file *seq) +{ + down_read(&sp_group_sem); + idr_for_each(&sp_group_idr, idr_spg_dvpp_stat_show_cb, seq); + up_read(&sp_group_sem); +} + + void spa_overview_show(struct seq_file *seq) { unsigned int total_num, alloc_num, k2u_task_num, k2u_spg_num; @@ -4219,12 +4237,11 @@ static int idr_spg_stat_cb(int id, void *p, void *data) struct sp_spg_stat *s = p; struct seq_file *seq = data; - if (seq != NULL) { - if (id == 0) - seq_puts(seq, "Non Group "); - else - seq_printf(seq, "Group %6d ", id); + if (is_local_group(id) && atomic64_read(&s->size) == 0) + return 0; + if (seq != NULL) { + seq_printf(seq, "Group %6d ", id); seq_printf(seq, "size: %lld KB, spa num: %d, total alloc: %lld KB, normal alloc: %lld KB, huge alloc: %lld KB\n", byte2kb(atomic64_read(&s->size)), atomic_read(&s->spa_num), @@ -4232,11 +4249,7 @@ static int idr_spg_stat_cb(int id, void *p, void *data) byte2kb(atomic64_read(&s->alloc_nsize)), byte2kb(atomic64_read(&s->alloc_hsize))); } else { - if (id == 0) - pr_info("Non Group "); - else - pr_info("Group %6d ", id); - + pr_info("Group %6d ", id); pr_info("size: %lld KB, spa num: %d, total alloc: %lld KB, normal alloc: %lld KB, huge alloc: %lld KB\n", byte2kb(atomic64_read(&s->size)), atomic_read(&s->spa_num), @@ -4280,7 +4293,8 @@ static int spa_stat_show(struct seq_file *seq, void *offset) /* print the file header */ seq_printf(seq, "%-10s %-16s %-16s %-10s %-7s %-5s %-8s %-8s\n", "Group ID", "va_start", "va_end", "Size(KB)", "Type", "Huge", "PID", "Ref"); - rb_spa_stat_show(seq); + spa_normal_stat_show(seq); + spa_dvpp_stat_show(seq); return 0; } @@ -4317,10 +4331,7 @@ static int idr_proc_stat_cb(int id, void *p, void *data) prot = get_process_prot_locked(id, mm); seq_printf(seq, "%-8d ", tgid); - if (id == 0) - seq_printf(seq, "%-8c ", '-'); - else - seq_printf(seq, "%-8d ", id); + seq_printf(seq, "%-8d ", id); seq_printf(seq, "%-9ld %-9ld %-9ld %-10ld %-10ld %-8ld %-7ld %-7ld %-10ld ", get_spg_proc_alloc(spg_proc_stat), get_spg_proc_k2u(spg_proc_stat), -- Gitee From 331c3817722767fafe762073c5bec5e5cd99c5f8 Mon Sep 17 00:00:00 2001 From: Wang Wensheng Date: Tue, 19 Jul 2022 11:36:16 +0800 Subject: [PATCH 367/674] mm/sharepool: Use vm_private_data to store the spa hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5DS9S CVE: NA ------------------------------------------------- When we destroy a vma, we first find the spa depending on the vma->vm_start, during which we should hold the sp_area_lock. While we store the spa in vma, we can get the spa directly. Don't worry if the spa exists or if it's to be freed soon, since we have increaced the refcount for the spa when it's mappend into a vma. Signed-off-by: Wang Wensheng Reviewed-by: Weilong Chen Signed-off-by: Zheng Zengkai --- mm/share_pool.c | 45 ++++++++++++++++----------------------------- 1 file changed, 16 insertions(+), 29 deletions(-) diff --git a/mm/share_pool.c b/mm/share_pool.c index e673b05bda5a..c7b185739b61 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -855,7 +855,7 @@ static inline bool check_aoscore_process(struct task_struct *tsk) static unsigned long sp_mmap(struct mm_struct *mm, struct file *file, struct sp_area *spa, unsigned long *populate, - unsigned long prot); + unsigned long prot, struct vm_area_struct **pvma); static void sp_munmap(struct mm_struct *mm, unsigned long addr, unsigned long size); #define K2U_NORMAL 0 @@ -1504,7 +1504,7 @@ int mg_sp_group_add_task(int pid, unsigned long prot, int spg_id) break; } - addr = sp_mmap(mm, file, spa, &populate, prot); + addr = sp_mmap(mm, file, spa, &populate, prot, NULL); if (IS_ERR_VALUE(addr)) { sp_munmap_task_areas(mm, spg, &spa->link); up_write(&mm->mmap_lock); @@ -2034,8 +2034,6 @@ static void __sp_area_drop(struct sp_area *spa) void sp_area_drop(struct vm_area_struct *vma) { - struct sp_area *spa; - if (!(vma->vm_flags & VM_SHARE_POOL)) return; @@ -2047,8 +2045,7 @@ void sp_area_drop(struct vm_area_struct *vma) * an atomic operation. */ spin_lock(&sp_area_lock); - spa = __find_sp_area_locked(vma->vm_mm->sp_group_master->local, vma->vm_start); - __sp_area_drop_locked(spa); + __sp_area_drop_locked(vma->vm_private_data); spin_unlock(&sp_area_lock); } @@ -2315,7 +2312,7 @@ EXPORT_SYMBOL_GPL(mg_sp_free); /* wrapper of __do_mmap() and the caller must hold down_write(&mm->mmap_lock). */ static unsigned long sp_mmap(struct mm_struct *mm, struct file *file, struct sp_area *spa, unsigned long *populate, - unsigned long prot) + unsigned long prot, struct vm_area_struct **pvma) { unsigned long addr = spa->va_start; unsigned long size = spa_size(spa); @@ -2323,6 +2320,7 @@ static unsigned long sp_mmap(struct mm_struct *mm, struct file *file, MAP_SHARE_POOL; unsigned long vm_flags = VM_NORESERVE | VM_SHARE_POOL | VM_DONTCOPY; unsigned long pgoff = addr_offset(spa) >> PAGE_SHIFT; + struct vm_area_struct *vma; /* Mark the mapped region to be locked. After the MAP_LOCKED is enable, * multiple tasks will preempt resources, causing performance loss. @@ -2338,8 +2336,13 @@ static unsigned long sp_mmap(struct mm_struct *mm, struct file *file, pr_err("do_mmap fails %ld\n", addr); } else { BUG_ON(addr != spa->va_start); + vma = find_vma(mm, addr); + vma->vm_private_data = spa; + if (pvma) + *pvma = vma; } + return addr; } @@ -2484,7 +2487,6 @@ static int sp_alloc_mmap(struct mm_struct *mm, struct sp_area *spa, unsigned long mmap_addr; /* pass through default permission */ unsigned long prot = PROT_READ | PROT_WRITE; - unsigned long sp_addr = spa->va_start; unsigned long populate = 0; struct vm_area_struct *vma; @@ -2503,7 +2505,7 @@ static int sp_alloc_mmap(struct mm_struct *mm, struct sp_area *spa, prot = PROT_READ; /* when success, mmap_addr == spa->va_start */ - mmap_addr = sp_mmap(mm, spa_file(spa), spa, &populate, prot); + mmap_addr = sp_mmap(mm, spa_file(spa), spa, &populate, prot, &vma); if (IS_ERR_VALUE(mmap_addr)) { up_write(&mm->mmap_lock); sp_alloc_unmap(mm, spa, spg_node); @@ -2519,14 +2521,6 @@ static int sp_alloc_mmap(struct mm_struct *mm, struct sp_area *spa, } ac->populate = populate; - vma = find_vma(mm, sp_addr); - if (unlikely(!vma)) { - up_write(&mm->mmap_lock); - WARN(1, "allocation failed, can't find %lx vma\n", sp_addr); - ret = -EINVAL; - goto unmap; - } - if (ac->sp_flags & SP_PROT_RO) vma->vm_flags &= ~VM_MAYWRITE; @@ -2825,15 +2819,12 @@ static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa, if (kc && kc->sp_flags & SP_PROT_RO) prot = PROT_READ; - ret_addr = sp_mmap(mm, spa_file(spa), spa, &populate, prot); + ret_addr = sp_mmap(mm, spa_file(spa), spa, &populate, prot, &vma); if (IS_ERR_VALUE(ret_addr)) { pr_debug("k2u mmap failed %lx\n", ret_addr); goto put_mm; } - BUG_ON(ret_addr != spa->va_start); - vma = find_vma(mm, ret_addr); - BUG_ON(vma == NULL); if (prot & PROT_WRITE) vma->vm_page_prot = __pgprot(((~PTE_RDONLY) & vma->vm_page_prot.pgprot) | PTE_DIRTY); @@ -3876,12 +3867,9 @@ int sp_node_id(struct vm_area_struct *vma) if (!sp_is_enabled()) return node_id; - if (vma && vma->vm_flags & VM_SHARE_POOL) { - spa = __find_sp_area(vma->vm_mm->sp_group_master->local, vma->vm_start); - if (spa) { - node_id = spa->node_id; - __sp_area_drop(spa); - } + if (vma && vma->vm_flags & VM_SHARE_POOL && vma->vm_private_data) { + spa = vma->vm_private_data; + node_id = spa->node_id; } return node_id; @@ -4456,13 +4444,12 @@ vm_fault_t sharepool_no_page(struct mm_struct *mm, int node_id; struct sp_area *spa; - spa = __find_sp_area(mm->sp_group_master->local, vma->vm_start); + spa = vma->vm_private_data; if (!spa) { pr_err("share pool: vma is invalid, not from sp mmap\n"); return ret; } node_id = spa->node_id; - __sp_area_drop(spa); retry: page = find_lock_page(mapping, idx); -- Gitee From 447951e3d1f83b41cb91a3fb332eacf734b6714a Mon Sep 17 00:00:00 2001 From: Wang Wensheng Date: Tue, 19 Jul 2022 11:36:17 +0800 Subject: [PATCH 368/674] mm/sharepool: Unify the memory allocation process hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5DS9S CVE: NA ------------------------------------------------- There are two types of memory allocated from sharepool: passthrough memory for DVPP and shared memory. Currently, we branch to different routines depending on the memory type, both during the allocation and free process. Since we have already create a local group for passthrough memory, with just one step ahead, we could drop the redundant branches in allocation and free process and in all the fallback process when an error occurs. Here is the content of this patch: 1. Add erery process to its local group when initilizing its group_master. 2. Avoid to return the local group in find_sp_group_id_by_pid(). 3. Delete the redundant branches during allocation and free process. Signed-off-by: Wang Wensheng Reviewed-by: Weilong Chen Signed-off-by: Zheng Zengkai --- mm/share_pool.c | 139 ++++++++++++++++++++++++++++++------------------ 1 file changed, 87 insertions(+), 52 deletions(-) diff --git a/mm/share_pool.c b/mm/share_pool.c index c7b185739b61..b627c9347f78 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -249,13 +249,15 @@ static int sp_mapping_group_setup(struct mm_struct *mm, struct sp_group *spg) return 0; } +static void free_sp_group_locked(struct sp_group *spg); +static int local_group_add_task(struct mm_struct *mm, struct sp_group *spg); static struct sp_group *create_spg(int spg_id); static void free_new_spg_id(bool new, int spg_id); /* The caller must hold sp_group_sem */ static struct sp_group_master *sp_init_group_master_locked( struct mm_struct *mm, bool *exist) { - int spg_id; + int spg_id, ret; struct sp_group *spg; struct sp_group_master *master = mm->sp_group_master; @@ -271,16 +273,15 @@ static struct sp_group_master *sp_init_group_master_locked( spg_id = ida_alloc_range(&sp_group_id_ida, SPG_ID_LOCAL_MIN, SPG_ID_LOCAL_MAX, GFP_ATOMIC); if (spg_id < 0) { - kfree(master); pr_err_ratelimited("generate local group id failed %d\n", spg_id); - return ERR_PTR(spg_id); + ret = spg_id; + goto free_master; } spg = create_spg(spg_id); if (IS_ERR(spg)) { - free_new_spg_id(true, spg_id); - kfree(master); - return (struct sp_group_master *)spg; + ret = PTR_ERR(spg); + goto free_spg_id; } INIT_LIST_HEAD(&master->node_list); @@ -290,8 +291,20 @@ static struct sp_group_master *sp_init_group_master_locked( master->local = spg; mm->sp_group_master = master; + ret = local_group_add_task(mm, spg); + if (ret < 0) + goto free_spg; + *exist = false; return master; + +free_spg: + free_sp_group_locked(spg); +free_spg_id: + free_new_spg_id(true, spg_id); +free_master: + kfree(master); + return ERR_PTR(ret); } static inline bool is_local_group(int spg_id) @@ -670,6 +683,8 @@ static struct sp_overall_stat sp_overall_stat; enum spa_type { SPA_TYPE_ALLOC = 1, + /* NOTE: reorganize after the statisical structure is reconstructed. */ + SPA_TYPE_ALLOC_PRIVATE = SPA_TYPE_ALLOC, SPA_TYPE_K2TASK, SPA_TYPE_K2SPG, }; @@ -1037,7 +1052,7 @@ EXPORT_SYMBOL_GPL(sp_group_id_by_pid); */ int mg_sp_group_id_by_pid(int pid, int *spg_ids, int *num) { - int ret = 0; + int ret = 0, real_count; struct sp_group_node *node; struct sp_group_master *master = NULL; struct task_struct *tsk; @@ -1062,18 +1077,28 @@ int mg_sp_group_id_by_pid(int pid, int *spg_ids, int *num) goto out_up_read; } - if (!master->count) { + /* + * There is a local group for each process which is used for + * passthrough allocation. The local group is a internal + * implementation for convenience and is not attempt to bother + * the user. + */ + real_count = master->count - 1; + if (real_count <= 0) { ret = -ENODEV; goto out_up_read; } - if ((unsigned int)*num < master->count) { + if ((unsigned int)*num < real_count) { ret = -E2BIG; goto out_up_read; } - *num = master->count; + *num = real_count; - list_for_each_entry(node, &master->node_list, group_node) + list_for_each_entry(node, &master->node_list, group_node) { + if (is_local_group(node->spg->id)) + continue; *(spg_ids++) = node->spg->id; + } out_up_read: up_read(&sp_group_sem); @@ -1245,7 +1270,7 @@ static int mm_add_group_init(struct mm_struct *mm, struct sp_group *spg) return -EEXIST; } - if (master->count + 1 == MAX_GROUP_FOR_TASK) { + if (master->count == MAX_GROUP_FOR_TASK) { pr_err("task reaches max group num\n"); return -ENOSPC; } @@ -1289,6 +1314,29 @@ static int insert_spg_node(struct sp_group *spg, struct sp_group_node *node) return 0; } +static int local_group_add_task(struct mm_struct *mm, struct sp_group *spg) +{ + struct sp_group_node *node; + struct spg_proc_stat *stat; + + node = create_spg_node(mm, PROT_READ | PROT_WRITE, spg); + if (IS_ERR(node)) + return PTR_ERR(node); + + /* use current just to avoid compile error, rebuild in following patch */ + stat = sp_init_process_stat(current, mm, spg); + if (IS_ERR(stat)) { + free_sp_group_locked(spg); + pr_err_ratelimited("init process stat failed %lx\n", PTR_ERR(stat)); + return PTR_ERR(stat); + } + + insert_spg_node(spg, node); + mmget(mm); + + return 0; +} + /* the caller must down_write(&spg->rw_lock) */ static void delete_spg_node(struct sp_group *spg, struct sp_group_node *node) { @@ -2160,15 +2208,10 @@ static void sp_fallocate(struct sp_area *spa) static void sp_free_unmap_fallocate(struct sp_area *spa) { - if (!is_local_group(spa->spg->id)) { - down_read(&spa->spg->rw_lock); - __sp_free(spa->spg, spa->va_start, spa_size(spa), NULL); - sp_fallocate(spa); - up_read(&spa->spg->rw_lock); - } else { - sp_munmap(current->mm, spa->va_start, spa_size(spa)); - sp_fallocate(spa); - } + down_read(&spa->spg->rw_lock); + __sp_free(spa->spg, spa->va_start, spa_size(spa), NULL); + sp_fallocate(spa); + up_read(&spa->spg->rw_lock); } static int sp_check_caller_permission(struct sp_group *spg, struct mm_struct *mm) @@ -2176,9 +2219,10 @@ static int sp_check_caller_permission(struct sp_group *spg, struct mm_struct *mm int ret = 0; down_read(&spg->rw_lock); - if (!is_local_group(spg->id) && !is_process_in_group(spg, mm)) + if (!is_process_in_group(spg, mm)) ret = -EPERM; up_read(&spg->rw_lock); + return ret; } @@ -2363,6 +2407,7 @@ struct sp_alloc_context { struct timespec64 start; struct timespec64 end; bool have_mbind; + enum spa_type type; }; static void trace_sp_alloc_begin(struct sp_alloc_context *ac) @@ -2450,10 +2495,13 @@ static int sp_alloc_prepare(unsigned long size, unsigned long sp_flags, pr_err_ratelimited("allocation failed, task not in group\n"); return -ENODEV; } + ac->type = SPA_TYPE_ALLOC; } else { /* allocation pass through scene */ spg = sp_get_local_group(current->mm); if (IS_ERR(spg)) return PTR_ERR(spg); + down_read(&spg->rw_lock); + ac->type = SPA_TYPE_ALLOC_PRIVATE; } if (sp_flags & SP_HUGEPAGE) { @@ -2476,8 +2524,7 @@ static int sp_alloc_prepare(unsigned long size, unsigned long sp_flags, static void sp_alloc_unmap(struct mm_struct *mm, struct sp_area *spa, struct sp_group_node *spg_node) { - if (!is_local_group(spa->spg->id)) - __sp_free(spa->spg, spa->va_start, spa->real_size, mm); + __sp_free(spa->spg, spa->va_start, spa->real_size, mm); } static int sp_alloc_mmap(struct mm_struct *mm, struct sp_area *spa, @@ -2532,10 +2579,7 @@ static int sp_alloc_mmap(struct mm_struct *mm, struct sp_area *spa, return ret; unmap: - if (!is_local_group(spa->spg->id)) - sp_alloc_unmap(list_next_entry(spg_node, proc_node)->master->mm, spa, spg_node); - else - sp_munmap(mm, spa->va_start, spa->real_size); + sp_alloc_unmap(list_next_entry(spg_node, proc_node)->master->mm, spa, spg_node); return ret; } @@ -2635,10 +2679,7 @@ static int __sp_alloc_mmap_populate(struct mm_struct *mm, struct sp_area *spa, ret = sp_alloc_populate(mm, spa, ac); if (ret) { err: - if (!is_local_group(spa->spg->id)) - sp_alloc_unmap(list_next_entry(spg_node, proc_node)->master->mm, spa, spg_node); - else - sp_munmap(mm, spa->va_start, spa->real_size); + sp_alloc_unmap(list_next_entry(spg_node, proc_node)->master->mm, spa, spg_node); if (unlikely(fatal_signal_pending(current))) pr_warn_ratelimited("allocation failed, current thread is killed\n"); @@ -2661,34 +2702,30 @@ static int sp_alloc_mmap_populate(struct sp_area *spa, struct mm_struct *mm; struct sp_group_node *spg_node; - if (is_local_group(spa->spg->id)) { - ret = __sp_alloc_mmap_populate(current->mm, spa, NULL, ac); - } else { - /* create mapping for each process in the group */ - list_for_each_entry(spg_node, &spa->spg->procs, proc_node) { - mm = spg_node->master->mm; - mmap_ret = __sp_alloc_mmap_populate(mm, spa, spg_node, ac); - if (mmap_ret) { - if (ac->state != ALLOC_COREDUMP) - return mmap_ret; - ac->state = ALLOC_NORMAL; - continue; - } - ret = mmap_ret; + /* create mapping for each process in the group */ + list_for_each_entry(spg_node, &spa->spg->procs, proc_node) { + mm = spg_node->master->mm; + mmap_ret = __sp_alloc_mmap_populate(mm, spa, spg_node, ac); + if (mmap_ret) { + if (ac->state != ALLOC_COREDUMP) + return mmap_ret; + ac->state = ALLOC_NORMAL; + continue; } + ret = mmap_ret; } + return ret; } /* spa maybe an error pointer, so introduce variable spg */ static void sp_alloc_finish(int result, struct sp_area *spa, - struct sp_alloc_context *ac) + struct sp_alloc_context *ac) { struct sp_group *spg = ac->spg; /* match sp_alloc_prepare */ - if (!is_local_group(spg->id)) - up_read(&spg->rw_lock); + up_read(&spg->rw_lock); if (!result) sp_update_process_stat(current, true, spa); @@ -2728,7 +2765,7 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) try_again: spa = sp_alloc_area(ac.size_aligned, ac.sp_flags, ac.spg, - SPA_TYPE_ALLOC, current->tgid); + ac.type, current->tgid); if (IS_ERR(spa)) { pr_err_ratelimited("alloc spa failed in allocation(potential no enough virtual memory when -75): %ld\n", PTR_ERR(spa)); @@ -4650,8 +4687,6 @@ void sp_group_post_exit(struct mm_struct *mm) } up_write(&sp_group_sem); - if (master->local) - sp_group_drop(master->local); kfree(master); } -- Gitee From 283d8d877377f78b6403e7f95abc7683394390e2 Mon Sep 17 00:00:00 2001 From: Wang Wensheng Date: Tue, 19 Jul 2022 11:36:18 +0800 Subject: [PATCH 369/674] mm/sharepool: Clear the initialization of sp-associated structure for a process hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5DS9S CVE: NA ------------------------------------------------- A few structures must have been created when a process want to get into sharepool subsystem, including allocating sharepool memory, being added into a spg or doing k2u and so on. Currently we create those structures just before we actually need them. For example, we find or create a sp_spa_stat after a successful memory allocation and before updating the statistical structure. The creation of a new structure may fail due to oom and we should then reclaim the memory allocated and revert all the process before. Or we just forget to do that and a potential memory-leak occurs. This design makes it confused when we indeed create a structure and we always worry about potential memory-leak when we changes the code around it. A better solution is to initialize all that structures at the same time when a process join in sharepool subsystem. And in future, we will clear the unnecessary statistical structures. Signed-off-by: Wang Wensheng Reviewed-by: Weilong Chen Signed-off-by: Zheng Zengkai --- include/linux/share_pool.h | 4 - mm/share_pool.c | 278 ++++++++++++++++--------------------- 2 files changed, 116 insertions(+), 166 deletions(-) diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index 8eb964230fd4..51710669b1a7 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -507,10 +507,6 @@ static inline struct sp_proc_stat *sp_get_proc_stat_ref(struct mm_struct *mm) return NULL; } -static inline void sp_proc_stat_drop(struct sp_proc_stat *stat) -{ -} - static inline void spa_overview_show(struct seq_file *seq) { } diff --git a/mm/share_pool.c b/mm/share_pool.c index b627c9347f78..75339e9d130e 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -249,33 +249,22 @@ static int sp_mapping_group_setup(struct mm_struct *mm, struct sp_group *spg) return 0; } -static void free_sp_group_locked(struct sp_group *spg); -static int local_group_add_task(struct mm_struct *mm, struct sp_group *spg); static struct sp_group *create_spg(int spg_id); static void free_new_spg_id(bool new, int spg_id); -/* The caller must hold sp_group_sem */ -static struct sp_group_master *sp_init_group_master_locked( - struct mm_struct *mm, bool *exist) +static void free_sp_group_locked(struct sp_group *spg); +static int local_group_add_task(struct mm_struct *mm, struct sp_group *spg); +static int init_local_group(struct mm_struct *mm) { int spg_id, ret; struct sp_group *spg; + struct sp_mapping *spm; struct sp_group_master *master = mm->sp_group_master; - if (master) { - *exist = true; - return master; - } - - master = kmalloc(sizeof(struct sp_group_master), GFP_KERNEL); - if (master == NULL) - return ERR_PTR(-ENOMEM); - spg_id = ida_alloc_range(&sp_group_id_ida, SPG_ID_LOCAL_MIN, SPG_ID_LOCAL_MAX, GFP_ATOMIC); if (spg_id < 0) { pr_err_ratelimited("generate local group id failed %d\n", spg_id); - ret = spg_id; - goto free_master; + return spg_id; } spg = create_spg(spg_id); @@ -284,60 +273,73 @@ static struct sp_group_master *sp_init_group_master_locked( goto free_spg_id; } - INIT_LIST_HEAD(&master->node_list); - master->count = 0; - master->stat = NULL; - master->mm = mm; master->local = spg; - mm->sp_group_master = master; + spm = sp_mapping_create(SP_MAPPING_DVPP); + if (IS_ERR(spm)) { + ret = PTR_ERR(spm); + goto free_spg; + } + sp_mapping_attach(master->local, spm); + sp_mapping_attach(master->local, sp_mapping_normal); ret = local_group_add_task(mm, spg); if (ret < 0) + /* The spm would be released while destroying the spg*/ goto free_spg; - *exist = false; - return master; + return 0; free_spg: free_sp_group_locked(spg); + master->local = NULL; free_spg_id: free_new_spg_id(true, spg_id); -free_master: - kfree(master); - return ERR_PTR(ret); -} -static inline bool is_local_group(int spg_id) -{ - return spg_id >= SPG_ID_LOCAL_MIN && spg_id <= SPG_ID_LOCAL_MAX; + return ret; } -/* - * If the process is added to a group first, the address space of the local - * group of the process must have been set. If the process is not added to - * a group, directly create or attach the process to the corresponding DVPP - * and normal address space. - */ -static int sp_mapping_group_setup_local(struct mm_struct *mm) +static void sp_proc_stat_drop(struct sp_proc_stat *stat); +static int sp_init_proc_stat(struct mm_struct *mm, struct task_struct *tsk); +/* The caller must hold sp_group_sem */ +static int sp_init_group_master_locked(struct task_struct *tsk, struct mm_struct *mm) { + int ret; struct sp_group_master *master; - struct sp_mapping *spm; - bool exist = false; - - master = sp_init_group_master_locked(mm, &exist); - if (IS_ERR(master)) - return PTR_ERR(master); - if (master->local->dvpp) + if (mm->sp_group_master) return 0; - spm = sp_mapping_create(SP_MAPPING_DVPP); - if (IS_ERR(spm)) - return PTR_ERR(spm); - sp_mapping_attach(master->local, spm); - sp_mapping_attach(master->local, sp_mapping_normal); + master = kmalloc(sizeof(struct sp_group_master), GFP_KERNEL); + if (!master) + return -ENOMEM; + + INIT_LIST_HEAD(&master->node_list); + master->count = 0; + master->mm = mm; + mm->sp_group_master = master; + + ret = sp_init_proc_stat(mm, tsk); + if (ret) + goto free_master; + + ret = init_local_group(mm); + if (ret) + goto put_stat; return 0; + +put_stat: + sp_proc_stat_drop(master->stat); +free_master: + mm->sp_group_master = NULL; + kfree(master); + + return ret; +} + +static inline bool is_local_group(int spg_id) +{ + return spg_id >= SPG_ID_LOCAL_MIN && spg_id <= SPG_ID_LOCAL_MAX; } static struct sp_group *sp_get_local_group(struct mm_struct *mm) @@ -355,7 +357,7 @@ static struct sp_group *sp_get_local_group(struct mm_struct *mm) up_read(&sp_group_sem); down_write(&sp_group_sem); - ret = sp_mapping_group_setup_local(mm); + ret = sp_init_group_master_locked(current, mm); if (ret) { up_write(&sp_group_sem); return ERR_PTR(ret); @@ -403,37 +405,29 @@ static struct sp_proc_stat *create_proc_stat(struct mm_struct *mm, return stat; } -static struct sp_proc_stat *sp_init_proc_stat(struct sp_group_master *master, - struct mm_struct *mm, struct task_struct *tsk) +static int sp_init_proc_stat(struct mm_struct *mm, struct task_struct *tsk) { struct sp_proc_stat *stat; int alloc_id, tgid = tsk->tgid; - - down_write(&sp_proc_stat_sem); - stat = master->stat; - if (stat) { - up_write(&sp_proc_stat_sem); - return stat; - } + struct sp_group_master *master = mm->sp_group_master; stat = create_proc_stat(mm, tsk); - if (IS_ERR(stat)) { - up_write(&sp_proc_stat_sem); - return stat; - } + if (IS_ERR(stat)) + return PTR_ERR(stat); + down_write(&sp_proc_stat_sem); alloc_id = idr_alloc(&sp_proc_stat_idr, stat, tgid, tgid + 1, GFP_KERNEL); if (alloc_id < 0) { up_write(&sp_proc_stat_sem); pr_err_ratelimited("proc stat idr alloc failed %d\n", alloc_id); kfree(stat); - return ERR_PTR(alloc_id); + return alloc_id; } master->stat = stat; up_write(&sp_proc_stat_sem); - return stat; + return 0; } static void update_spg_stat_alloc(unsigned long size, bool inc, @@ -547,18 +541,14 @@ static struct spg_proc_stat *create_spg_proc_stat(int tgid, int spg_id) return stat; } -static struct spg_proc_stat *sp_init_spg_proc_stat( - struct sp_proc_stat *proc_stat, int tgid, struct sp_group *spg) +static struct spg_proc_stat *sp_init_spg_proc_stat(struct sp_proc_stat *proc_stat, + struct sp_group *spg) { struct spg_proc_stat *stat; int spg_id = spg->id; /* visit spg id locklessly */ struct sp_spg_stat *spg_stat = spg->stat; - stat = find_spg_proc_stat(proc_stat, tgid, spg_id); - if (stat) - return stat; - - stat = create_spg_proc_stat(tgid, spg_id); + stat = create_spg_proc_stat(proc_stat->tgid, spg_id); if (IS_ERR(stat)) return stat; @@ -575,31 +565,6 @@ static struct spg_proc_stat *sp_init_spg_proc_stat( return stat; } -/* - * The caller must - * 1. ensure no concurrency problem for task_struct and mm_struct. - * 2. hold sp_group_sem for sp_group_master (pay attention to ABBA deadlock) - */ -static struct spg_proc_stat *sp_init_process_stat(struct task_struct *tsk, - struct mm_struct *mm, struct sp_group *spg) -{ - struct sp_group_master *master; - bool exist; - struct sp_proc_stat *proc_stat; - struct spg_proc_stat *spg_proc_stat; - - master = sp_init_group_master_locked(mm, &exist); - if (IS_ERR(master)) - return (struct spg_proc_stat *)master; - - proc_stat = sp_init_proc_stat(master, mm, tsk); - if (IS_ERR(proc_stat)) - return (struct spg_proc_stat *)proc_stat; - - spg_proc_stat = sp_init_spg_proc_stat(proc_stat, tsk->tgid, spg); - return spg_proc_stat; -} - static struct sp_spg_stat *create_spg_stat(int spg_id) { struct sp_spg_stat *stat; @@ -846,9 +811,9 @@ static void sp_update_process_stat(struct task_struct *tsk, bool inc, enum spa_type type = spa->type; down_write(&sp_group_sem); - stat = sp_init_process_stat(tsk, tsk->mm, spa->spg); + stat = find_spg_proc_stat(tsk->mm->sp_group_master->stat, tsk->tgid, spa->spg->id); up_write(&sp_group_sem); - if (unlikely(IS_ERR(stat))) + if (!stat) return; update_spg_proc_stat(size, inc, stat, type); @@ -1253,26 +1218,27 @@ static void sp_munmap_task_areas(struct mm_struct *mm, struct sp_group *spg, str } /* the caller must hold sp_group_sem */ -static int mm_add_group_init(struct mm_struct *mm, struct sp_group *spg) +static int mm_add_group_init(struct task_struct *tsk, struct mm_struct *mm, + struct sp_group *spg) { - struct sp_group_master *master = mm->sp_group_master; - bool exist = false; - - master = sp_init_group_master_locked(mm, &exist); - if (IS_ERR(master)) - return PTR_ERR(master); - - if (!exist) - return 0; + int ret; + struct sp_group_master *master; - if (is_process_in_group(spg, mm)) { - pr_err_ratelimited("task already in target group, id=%d\n", spg->id); - return -EEXIST; - } + if (!mm->sp_group_master) { + ret = sp_init_group_master_locked(tsk, mm); + if (ret) + return ret; + } else { + if (is_process_in_group(spg, mm)) { + pr_err_ratelimited("task already in target group, id=%d\n", spg->id); + return -EEXIST; + } - if (master->count == MAX_GROUP_FOR_TASK) { - pr_err("task reaches max group num\n"); - return -ENOSPC; + master = mm->sp_group_master; + if (master->count == MAX_GROUP_FOR_TASK) { + pr_err("task reaches max group num\n"); + return -ENOSPC; + } } return 0; @@ -1311,29 +1277,13 @@ static int insert_spg_node(struct sp_group *spg, struct sp_group_node *node) spg->proc_num++; list_add_tail(&node->proc_node, &spg->procs); - return 0; -} - -static int local_group_add_task(struct mm_struct *mm, struct sp_group *spg) -{ - struct sp_group_node *node; - struct spg_proc_stat *stat; - - node = create_spg_node(mm, PROT_READ | PROT_WRITE, spg); - if (IS_ERR(node)) - return PTR_ERR(node); - - /* use current just to avoid compile error, rebuild in following patch */ - stat = sp_init_process_stat(current, mm, spg); - if (IS_ERR(stat)) { - free_sp_group_locked(spg); - pr_err_ratelimited("init process stat failed %lx\n", PTR_ERR(stat)); - return PTR_ERR(stat); - } - - insert_spg_node(spg, node); - mmget(mm); + /* + * The only way where sp_init_spg_proc_stat got failed is that there is no + * memory for sp_spg_stat. We will avoid this failure when we put sp_spg_stat + * into sp_group_node later. + */ + sp_init_spg_proc_stat(node->master->stat, spg); return 0; } @@ -1356,6 +1306,20 @@ static void free_spg_node(struct mm_struct *mm, struct sp_group *spg, kfree(spg_node); } +static int local_group_add_task(struct mm_struct *mm, struct sp_group *spg) +{ + struct sp_group_node *node; + + node = create_spg_node(mm, PROT_READ | PROT_WRITE, spg); + if (IS_ERR(node)) + return PTR_ERR(node); + + insert_spg_node(spg, node); + mmget(mm); + + return 0; +} + /** * sp_group_add_task() - Add a process to an share group (sp_group). * @pid: the pid of the task to be added. @@ -1380,7 +1344,6 @@ int mg_sp_group_add_task(int pid, unsigned long prot, int spg_id) int ret = 0; bool id_newly_generated = false; struct sp_area *spa, *prev = NULL; - struct spg_proc_stat *stat; check_interrupt_context(); @@ -1483,29 +1446,26 @@ int mg_sp_group_add_task(int pid, unsigned long prot, int spg_id) } } - ret = mm_add_group_init(mm, spg); - if (ret) + down_write(&spg->rw_lock); + ret = mm_add_group_init(tsk, mm, spg); + if (ret) { + up_write(&spg->rw_lock); goto out_drop_group; + } ret = sp_mapping_group_setup(mm, spg); - if (ret) + if (ret) { + up_write(&spg->rw_lock); goto out_drop_group; + } node = create_spg_node(mm, prot, spg); if (unlikely(IS_ERR(node))) { + up_write(&spg->rw_lock); ret = PTR_ERR(node); goto out_drop_group; } - /* per process statistics initialization */ - stat = sp_init_process_stat(tsk, mm, spg); - if (IS_ERR(stat)) { - ret = PTR_ERR(stat); - pr_err_ratelimited("init process stat failed %lx\n", PTR_ERR(stat)); - goto out_drop_spg_node; - } - - down_write(&spg->rw_lock); ret = insert_spg_node(spg, node); if (unlikely(ret)) { up_write(&spg->rw_lock); @@ -1757,7 +1717,7 @@ int sp_id_of_current(void) down_read(&sp_group_sem); master = current->mm->sp_group_master; - if (master && master->local) { + if (master) { spg_id = master->local->id; up_read(&sp_group_sem); return spg_id; @@ -1765,7 +1725,7 @@ int sp_id_of_current(void) up_read(&sp_group_sem); down_write(&sp_group_sem); - ret = sp_mapping_group_setup_local(current->mm); + ret = sp_init_group_master_locked(current, current->mm); if (ret) { up_write(&sp_group_sem); return ret; @@ -2924,7 +2884,7 @@ static void *sp_make_share_kva_to_task(unsigned long kva, unsigned long size, un struct sp_group *spg; down_write(&sp_group_sem); - ret = sp_mapping_group_setup_local(current->mm); + ret = sp_init_group_master_locked(current, current->mm); if (ret) { up_write(&sp_group_sem); pr_err_ratelimited("k2u_task init local mapping failed %d\n", ret); @@ -2932,13 +2892,7 @@ static void *sp_make_share_kva_to_task(unsigned long kva, unsigned long size, un } spg = current->mm->sp_group_master->local; - stat = sp_init_process_stat(current, current->mm, spg); - if (IS_ERR(stat)) { - up_write(&sp_group_sem); - pr_err_ratelimited("k2u_task init process stat failed %lx\n", - PTR_ERR(stat)); - return stat; - } + stat = find_spg_proc_stat(current->mm->sp_group_master->stat, current->tgid, spg->id); up_write(&sp_group_sem); spa = sp_alloc_area(size, sp_flags, spg, SPA_TYPE_K2TASK, current->tgid); @@ -3959,7 +3913,7 @@ static void free_sp_proc_stat(struct sp_proc_stat *stat) } /* the caller make sure stat is not NULL */ -void sp_proc_stat_drop(struct sp_proc_stat *stat) +static void sp_proc_stat_drop(struct sp_proc_stat *stat) { if (atomic_dec_and_test(&stat->use_count)) free_sp_proc_stat(stat); -- Gitee From b632bf41c09a6e1c6587db46a1b0d4419f83ee13 Mon Sep 17 00:00:00 2001 From: Wang Wensheng Date: Tue, 19 Jul 2022 11:36:19 +0800 Subject: [PATCH 370/674] mm/sharepool: Update sp_mapping structure hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5DS9S CVE: NA ------------------------------------------------- 1. Add a list for sp_mapping to record all the sp_groups attached to it. 2. Initialize the sp_mapping for local_group when it is created. So when we add a task to a group, we should merge the dvpp mapping of the local group. 3. Every two groups can be merged if and only if at least one of them is empty. Then the empty mapping would be dropped and another mapping would be attached to the two groups. This need to traverse all the groups attached to the mapping. 4. A mapping is considered empty when no spa is allocated from it and its address space is default. Signed-off-by: Wang Wensheng Reviewed-by: Weilong Chen Signed-off-by: Zheng Zengkai --- include/linux/share_pool.h | 9 ++++-- mm/share_pool.c | 65 +++++++++++++++++++++++--------------- 2 files changed, 47 insertions(+), 27 deletions(-) diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index 51710669b1a7..f002370ab5f8 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -117,6 +117,9 @@ struct sp_mapping { struct rb_node *free_area_cache; unsigned long cached_hole_size; unsigned long cached_vstart; + + /* list head for all groups attached to this mapping, dvpp mapping only */ + struct list_head group_head; }; /* Processes in the same sp_group can share memory. @@ -160,8 +163,10 @@ struct sp_group { atomic_t use_count; /* protect the group internal elements, except spa_list */ struct rw_semaphore rw_lock; - struct sp_mapping *dvpp; - struct sp_mapping *normal; + /* list node for dvpp mapping */ + struct list_head mnode; + struct sp_mapping *dvpp; + struct sp_mapping *normal; }; /* a per-process(per mm) struct which manages a sp_group_node list */ diff --git a/mm/share_pool.c b/mm/share_pool.c index 75339e9d130e..e38436b25efc 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -168,6 +168,7 @@ static struct sp_mapping *sp_mapping_create(unsigned long flag) sp_mapping_range_init(spm); atomic_set(&spm->user, 0); spm->area_root = RB_ROOT; + INIT_LIST_HEAD(&spm->group_head); return spm; } @@ -180,18 +181,45 @@ static void sp_mapping_destroy(struct sp_mapping *spm) static void sp_mapping_attach(struct sp_group *spg, struct sp_mapping *spm) { atomic_inc(&spm->user); - if (spm->flag & SP_MAPPING_DVPP) + if (spm->flag & SP_MAPPING_DVPP) { spg->dvpp = spm; - else if (spm->flag & SP_MAPPING_NORMAL) + list_add_tail(&spg->mnode, &spm->group_head); + } else if (spm->flag & SP_MAPPING_NORMAL) spg->normal = spm; } static void sp_mapping_detach(struct sp_group *spg, struct sp_mapping *spm) { - if (spm && atomic_dec_and_test(&spm->user)) + if (!spm) + return; + if (spm->flag & SP_MAPPING_DVPP) + list_del(&spg->mnode); + if (atomic_dec_and_test(&spm->user)) sp_mapping_destroy(spm); } +/* merge old mapping to new, and the old mapping would be destroyed */ +static void sp_mapping_merge(struct sp_mapping *new, struct sp_mapping *old) +{ + struct sp_group *spg, *tmp; + + if (new == old) + return; + + list_for_each_entry_safe(spg, tmp, &old->group_head, mnode) { + list_move_tail(&spg->mnode, &new->group_head); + spg->dvpp = new; + } + + atomic_add(atomic_read(&old->user), &new->user); + sp_mapping_destroy(old); +} + +static bool is_mapping_empty(struct sp_mapping *spm) +{ + return RB_EMPTY_ROOT(&spm->area_root); +} + /* * When you set the address space of a group, the normal address space * is globally unified. When processing the DVPP address space, consider @@ -216,32 +244,18 @@ static int sp_mapping_group_setup(struct mm_struct *mm, struct sp_group *spg) { struct sp_group_master *master = mm->sp_group_master; struct sp_group *local = master->local; - struct sp_mapping *spm; if (!list_empty(&spg->procs)) { - /* 1 */ - if (local->dvpp && local->dvpp != spg->dvpp) { - pr_info_ratelimited("Duplicate address space, id=%d\n", - spg->id); - return 0; - } - - /* 2 */ - if (!local->dvpp) { - sp_mapping_attach(local, spg->dvpp); - sp_mapping_attach(local, spg->normal); + if (is_mapping_empty(local->dvpp)) + sp_mapping_merge(spg->dvpp, local->dvpp); + else if (is_mapping_empty(spg->dvpp)) + sp_mapping_merge(local->dvpp, spg->dvpp); + else { + pr_info_ratelimited("Duplicate address space, id=%d\n", spg->id); + return -EINVAL; } } else { - /* 4 */ - if (!local->dvpp) { - spm = sp_mapping_create(SP_MAPPING_DVPP); - if (IS_ERR(spm)) - return PTR_ERR(spm); - sp_mapping_attach(local, spm); - sp_mapping_attach(local, sp_mapping_normal); - } - - /* 3 */ + /* the mapping of local group is always set */ sp_mapping_attach(spg, local->dvpp); sp_mapping_attach(spg, sp_mapping_normal); } @@ -1121,6 +1135,7 @@ static struct sp_group *create_spg(int spg_id) atomic_set(&spg->use_count, 1); INIT_LIST_HEAD(&spg->procs); INIT_LIST_HEAD(&spg->spa_list); + INIT_LIST_HEAD(&spg->mnode); init_rwsem(&spg->rw_lock); sprintf(name, "sp_group_%d", spg_id); -- Gitee From 8e8888e678ff09fda4691f55954325fc6bb64dd1 Mon Sep 17 00:00:00 2001 From: Wang Wensheng Date: Tue, 19 Jul 2022 11:36:20 +0800 Subject: [PATCH 371/674] mm/sharepool: Introduce SPG_NON_DVPP flag for sp_group_add_task hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5DS9S CVE: NA ------------------------------------------------- When SPG_NOD_DVPP is specified to sp_group_add_task, we don't create a DVPP mapping for the newly created sp_group. And the new group cannot support allocating DVPP memory. Signed-off-by: Wang Wensheng Reviewed-by: Weilong Chen Signed-off-by: Zheng Zengkai --- include/linux/share_pool.h | 7 +++-- mm/share_pool.c | 60 ++++++++++++++++++++------------------ 2 files changed, 36 insertions(+), 31 deletions(-) diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index f002370ab5f8..1911cd35843b 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -43,6 +43,9 @@ #define SPG_ID_LOCAL_MIN 200001 #define SPG_ID_LOCAL_MAX 299999 +#define SPG_FLAG_NON_DVPP (1 << 0) +#define SPG_FLAG_MASK (SPG_FLAG_NON_DVPP) + #define MAX_DEVID 8 /* the max num of Da-vinci devices */ extern int sysctl_share_pool_hugepage_enable; @@ -146,6 +149,7 @@ struct sp_mapping { */ struct sp_group { int id; + unsigned long flag; struct file *file; struct file *file_hugetlb; /* number of process in this group */ @@ -286,9 +290,6 @@ extern bool mg_sp_config_dvpp_range(size_t start, size_t size, int device_id, in extern bool is_sharepool_addr(unsigned long addr); extern bool mg_is_sharepool_addr(unsigned long addr); -extern int mg_sp_group_add_task(int pid, unsigned long prot, int spg_id); -extern int sp_group_add_task(int pid, int spg_id); - extern int sp_id_of_current(void); extern int mg_sp_id_of_current(void); diff --git a/mm/share_pool.c b/mm/share_pool.c index e38436b25efc..8297635c6dac 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -221,31 +221,20 @@ static bool is_mapping_empty(struct sp_mapping *spm) } /* - * When you set the address space of a group, the normal address space - * is globally unified. When processing the DVPP address space, consider - * the following situations: - * 1. If a process is added to a non-new group, the DVPP address space - * must have been created. If the local group of the process also - * contains the DVPP address space and they are different, this - * scenario is not allowed to avoid address conflict. - * 2. If the DVPP address space does not exist in the local group of the - * process, attach the local group of the process to the DVPP address - * space of the group. - * 3. Add a new group. If the process has applied for the dvpp address - * space (sp_alloc or k2u), attach the new group to the dvpp address - * space of the current process. - * 4. If the process has not applied for the DVPP address space, attach - * the new group and the local group of the current process to the - * newly created DVPP address space. - * + * 1. The mappings of local group is set on creating. + * 2. This is used to setup the mapping for groups created during add_task. + * 3. The normal mapping exists for all groups. + * 4. The dvpp mappings for the new group and local group can merge _iff_ at + * least one of the mapping is empty. * the caller must hold sp_group_sem + * NOTE: undo the mergeing when the later process failed. */ static int sp_mapping_group_setup(struct mm_struct *mm, struct sp_group *spg) { struct sp_group_master *master = mm->sp_group_master; struct sp_group *local = master->local; - if (!list_empty(&spg->procs)) { + if (!list_empty(&spg->procs) && !(spg->flag & SPG_FLAG_NON_DVPP)) { if (is_mapping_empty(local->dvpp)) sp_mapping_merge(spg->dvpp, local->dvpp); else if (is_mapping_empty(spg->dvpp)) @@ -255,15 +244,17 @@ static int sp_mapping_group_setup(struct mm_struct *mm, struct sp_group *spg) return -EINVAL; } } else { - /* the mapping of local group is always set */ - sp_mapping_attach(spg, local->dvpp); - sp_mapping_attach(spg, sp_mapping_normal); + if (!(spg->flag & SPG_FLAG_NON_DVPP)) + /* the mapping of local group is always set */ + sp_mapping_attach(spg, local->dvpp); + if (!spg->normal) + sp_mapping_attach(spg, sp_mapping_normal); } return 0; } -static struct sp_group *create_spg(int spg_id); +static struct sp_group *create_spg(int spg_id, unsigned long flag); static void free_new_spg_id(bool new, int spg_id); static void free_sp_group_locked(struct sp_group *spg); static int local_group_add_task(struct mm_struct *mm, struct sp_group *spg); @@ -281,7 +272,7 @@ static int init_local_group(struct mm_struct *mm) return spg_id; } - spg = create_spg(spg_id); + spg = create_spg(spg_id, 0); if (IS_ERR(spg)) { ret = PTR_ERR(spg); goto free_spg_id; @@ -1103,7 +1094,7 @@ static bool is_device_addr(unsigned long addr) return false; } -static struct sp_group *create_spg(int spg_id) +static struct sp_group *create_spg(int spg_id, unsigned long flag) { int ret; struct sp_group *spg; @@ -1117,6 +1108,11 @@ static struct sp_group *create_spg(int spg_id) return ERR_PTR(-ENOSPC); } + if (flag & ~SPG_FLAG_MASK) { + pr_err_ratelimited("invalid flag:%#lx\n", flag); + return ERR_PTR(-EINVAL); + } + spg = kzalloc(sizeof(*spg), GFP_KERNEL); if (spg == NULL) return ERR_PTR(-ENOMEM); @@ -1129,6 +1125,7 @@ static struct sp_group *create_spg(int spg_id) } spg->id = spg_id; + spg->flag = flag; spg->is_alive = true; spg->proc_num = 0; spg->owner = current->group_leader; @@ -1176,14 +1173,14 @@ static struct sp_group *create_spg(int spg_id) } /* the caller must hold sp_group_sem */ -static struct sp_group *find_or_alloc_sp_group(int spg_id) +static struct sp_group *find_or_alloc_sp_group(int spg_id, unsigned long flag) { struct sp_group *spg; spg = __sp_find_spg_locked(current->pid, spg_id); if (!spg) { - spg = create_spg(spg_id); + spg = create_spg(spg_id, flag); } else { down_read(&spg->rw_lock); if (!spg_valid(spg)) { @@ -1336,10 +1333,11 @@ static int local_group_add_task(struct mm_struct *mm, struct sp_group *spg) } /** - * sp_group_add_task() - Add a process to an share group (sp_group). + * mg_sp_group_add_task() - Add a process to an share group (sp_group). * @pid: the pid of the task to be added. * @prot: the prot of task for this spg. * @spg_id: the ID of the sp_group. + * @flag: to give some special message. * * A process can't be added to more than one sp_group in single group mode * and can in multiple group mode. @@ -1352,6 +1350,7 @@ static int local_group_add_task(struct mm_struct *mm, struct sp_group *spg) */ int mg_sp_group_add_task(int pid, unsigned long prot, int spg_id) { + unsigned long flag = 0; struct task_struct *tsk; struct mm_struct *mm; struct sp_group *spg; @@ -1445,7 +1444,7 @@ int mg_sp_group_add_task(int pid, unsigned long prot, int spg_id) goto out_put_task; } - spg = find_or_alloc_sp_group(spg_id); + spg = find_or_alloc_sp_group(spg_id, flag); if (IS_ERR(spg)) { up_write(&sp_group_sem); ret = PTR_ERR(spg); @@ -1818,6 +1817,11 @@ static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags, else mapping = spg->normal; + if (!mapping) { + pr_err_ratelimited("non DVPP spg, id %d\n", spg->id); + return ERR_PTR(-EINVAL); + } + vstart = mapping->start[device_id]; vend = mapping->end[device_id]; spa = __kmalloc_node(sizeof(struct sp_area), GFP_KERNEL, node_id); -- Gitee From 1e645d0e51cde5a3e7f1228249cda8f6bb2c8640 Mon Sep 17 00:00:00 2001 From: Wang Wensheng Date: Tue, 19 Jul 2022 11:36:21 +0800 Subject: [PATCH 372/674] mm/sharepool: Configure the DVPP range for process hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5DS9S CVE: NA ------------------------------------------------- Currently the dvpp range is global for each device. And it is unreasonable after the reconstruction that makes the DVPP mappings private to each process or group. This allows to configure the dvpp range for each process. The dvpp range for each dvpp mapping can only be configured once just as the old version. Signed-off-by: Wang Wensheng Reviewed-by: Weilong Chen Signed-off-by: Zheng Zengkai --- mm/share_pool.c | 66 +++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 58 insertions(+), 8 deletions(-) diff --git a/mm/share_pool.c b/mm/share_pool.c index 8297635c6dac..1c10b136f20f 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -220,6 +220,17 @@ static bool is_mapping_empty(struct sp_mapping *spm) return RB_EMPTY_ROOT(&spm->area_root); } +static bool can_mappings_merge(struct sp_mapping *m1, struct sp_mapping *m2) +{ + int i; + + for (i = 0; i < sp_device_number; i++) + if (m1->start[i] != m2->start[i] || m1->end[i] != m2->end[i]) + return false; + + return true; +} + /* * 1. The mappings of local group is set on creating. * 2. This is used to setup the mapping for groups created during add_task. @@ -235,6 +246,11 @@ static int sp_mapping_group_setup(struct mm_struct *mm, struct sp_group *spg) struct sp_group *local = master->local; if (!list_empty(&spg->procs) && !(spg->flag & SPG_FLAG_NON_DVPP)) { + if (!can_mappings_merge(local->dvpp, spg->dvpp)) { + pr_info_ratelimited("address space conflict, id=%d\n", spg->id); + return -EINVAL; + } + if (is_mapping_empty(local->dvpp)) sp_mapping_merge(spg->dvpp, local->dvpp); else if (is_mapping_empty(spg->dvpp)) @@ -3825,16 +3841,50 @@ EXPORT_SYMBOL_GPL(sp_unregister_notifier); */ bool sp_config_dvpp_range(size_t start, size_t size, int device_id, int pid) { - if (pid < 0 || - size <= 0 || size > MMAP_SHARE_POOL_16G_SIZE || - device_id < 0 || device_id >= sp_device_number || - !is_online_node_id(device_id) || - is_sp_dev_addr_enabled(device_id)) + int ret; + bool err = false; + struct task_struct *tsk; + struct mm_struct *mm; + struct sp_group *spg; + struct sp_mapping *spm; + unsigned long default_start; + + /* NOTE: check the start address */ + if (pid < 0 || size <= 0 || size > MMAP_SHARE_POOL_16G_SIZE || + device_id < 0 || device_id >= sp_device_number || !is_online_node_id(device_id)) return false; - sp_dev_va_start[device_id] = start; - sp_dev_va_size[device_id] = size; - return true; + ret = get_task(pid, &tsk); + if (ret) + return false; + + mm = get_task_mm(tsk->group_leader); + if (!mm) + goto put_task; + + spg = sp_get_local_group(mm); + if (IS_ERR(spg)) + goto put_mm; + + spm = spg->dvpp; + default_start = MMAP_SHARE_POOL_16G_START + device_id * MMAP_SHARE_POOL_16G_SIZE; + /* The dvpp range of each group can be configured only once */ + if (spm->start[device_id] != default_start) + goto put_spg; + + spm->start[device_id] = start; + spm->end[device_id] = start + size; + + err = true; + +put_spg: + sp_group_drop(spg); +put_mm: + mmput(mm); +put_task: + put_task_struct(tsk); + + return err; } EXPORT_SYMBOL_GPL(sp_config_dvpp_range); -- Gitee From 451822b698080b9fd8d72efdcaa6c3220802fae5 Mon Sep 17 00:00:00 2001 From: Wang Wensheng Date: Tue, 19 Jul 2022 11:36:22 +0800 Subject: [PATCH 373/674] mm/sharepool: Don't check the DVPP address space range before merging hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5DS9S CVE: NA ------------------------------------------------- The user doesn't care about the start address of the dvpp range, what is mattered is that the virtual space tagged DVPP located at in a 16G range. So we can safely drop the dvpp address space as long as it's empty during merging process. Signed-off-by: Wang Wensheng Reviewed-by: Weilong Chen Signed-off-by: Zheng Zengkai --- mm/share_pool.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/mm/share_pool.c b/mm/share_pool.c index 1c10b136f20f..b14152acbee8 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -246,16 +246,23 @@ static int sp_mapping_group_setup(struct mm_struct *mm, struct sp_group *spg) struct sp_group *local = master->local; if (!list_empty(&spg->procs) && !(spg->flag & SPG_FLAG_NON_DVPP)) { - if (!can_mappings_merge(local->dvpp, spg->dvpp)) { - pr_info_ratelimited("address space conflict, id=%d\n", spg->id); - return -EINVAL; - } + /* + * Don't return an error when the mappings' address range conflict. + * As long as the mapping is unused, we can drop the empty mapping. + * This may change the address range for the task or group implicitly, + * give a warn for it. + */ + bool is_conflict = !can_mappings_merge(local->dvpp, spg->dvpp); - if (is_mapping_empty(local->dvpp)) + if (is_mapping_empty(local->dvpp)) { sp_mapping_merge(spg->dvpp, local->dvpp); - else if (is_mapping_empty(spg->dvpp)) + if (is_conflict) + pr_warn_ratelimited("task address space conflict, spg_id=%d\n", spg->id); + } else if (is_mapping_empty(spg->dvpp)) { sp_mapping_merge(local->dvpp, spg->dvpp); - else { + if (is_conflict) + pr_warn_ratelimited("group address space conflict, spg_id=%d\n", spg->id); + } else { pr_info_ratelimited("Duplicate address space, id=%d\n", spg->id); return -EINVAL; } -- Gitee From a829370d803a17014c0509be190d926641d34011 Mon Sep 17 00:00:00 2001 From: Wang Wensheng Date: Tue, 19 Jul 2022 11:36:23 +0800 Subject: [PATCH 374/674] mm/sharepool: Add a task_struct parameter for sp_get_local_group() hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5DS9S CVE: NA ------------------------------------------------- sp_get_local_group() could be invoked in kthread, where the current process isn't the process we want. Add a parameter and let the caller to avoid this problem. Signed-off-by: Wang Wensheng Reviewed-by: Weilong Chen Signed-off-by: Zheng Zengkai --- mm/share_pool.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/share_pool.c b/mm/share_pool.c index b14152acbee8..a39bece9af96 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -370,7 +370,7 @@ static inline bool is_local_group(int spg_id) return spg_id >= SPG_ID_LOCAL_MIN && spg_id <= SPG_ID_LOCAL_MAX; } -static struct sp_group *sp_get_local_group(struct mm_struct *mm) +static struct sp_group *sp_get_local_group(struct task_struct *tsk, struct mm_struct *mm) { int ret; struct sp_group_master *master; @@ -385,7 +385,7 @@ static struct sp_group *sp_get_local_group(struct mm_struct *mm) up_read(&sp_group_sem); down_write(&sp_group_sem); - ret = sp_init_group_master_locked(current, mm); + ret = sp_init_group_master_locked(tsk, mm); if (ret) { up_write(&sp_group_sem); return ERR_PTR(ret); @@ -2499,7 +2499,7 @@ static int sp_alloc_prepare(unsigned long size, unsigned long sp_flags, } ac->type = SPA_TYPE_ALLOC; } else { /* allocation pass through scene */ - spg = sp_get_local_group(current->mm); + spg = sp_get_local_group(current, current->mm); if (IS_ERR(spg)) return PTR_ERR(spg); down_read(&spg->rw_lock); @@ -3869,7 +3869,7 @@ bool sp_config_dvpp_range(size_t start, size_t size, int device_id, int pid) if (!mm) goto put_task; - spg = sp_get_local_group(mm); + spg = sp_get_local_group(tsk, mm); if (IS_ERR(spg)) goto put_mm; -- Gitee From ac4045cfdc80d4b27d5746430d9bb6976624a1bd Mon Sep 17 00:00:00 2001 From: Wang Wensheng Date: Tue, 19 Jul 2022 11:36:24 +0800 Subject: [PATCH 375/674] mm/sharepool: Check sp_is_enabled() in all exported interfaces hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5DS9S CVE: NA -------------------------------------------------- We should forbid the usage of sharepool interfaces if sharepool is not enabled. Or undefined behaviour would panic the kernel. Signed-off-by: Wang Wensheng Reviewed-by: Weilong Chen Signed-off-by: Zheng Zengkai --- mm/share_pool.c | 45 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/mm/share_pool.c b/mm/share_pool.c index a39bece9af96..750524f1afc2 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -1015,6 +1015,9 @@ int sp_group_id_by_pid(int pid) struct sp_group *spg; int spg_id = -ENODEV; + if (!sp_is_enabled()) + return -EOPNOTSUPP; + check_interrupt_context(); spg = __sp_find_spg(pid, SPG_ID_DEFAULT); @@ -1050,6 +1053,9 @@ int mg_sp_group_id_by_pid(int pid, int *spg_ids, int *num) struct sp_group_master *master = NULL; struct task_struct *tsk; + if (!sp_is_enabled()) + return -EOPNOTSUPP; + check_interrupt_context(); if (!spg_ids || num <= 0) @@ -1382,6 +1388,9 @@ int mg_sp_group_add_task(int pid, unsigned long prot, int spg_id) bool id_newly_generated = false; struct sp_area *spa, *prev = NULL; + if (!sp_is_enabled()) + return -EOPNOTSUPP; + check_interrupt_context(); /* only allow READ, READ | WRITE */ @@ -1658,6 +1667,9 @@ int mg_sp_group_del_task(int pid, int spg_id) struct mm_struct *mm = NULL; bool is_alive = true; + if (!sp_is_enabled()) + return -EOPNOTSUPP; + if (spg_id < SPG_ID_MIN || spg_id > SPG_ID_AUTO) { pr_err_ratelimited("del from group failed, invalid group id %d\n", spg_id); return -EINVAL; @@ -1749,6 +1761,9 @@ int sp_id_of_current(void) int ret, spg_id; struct sp_group_master *master; + if (!sp_is_enabled()) + return -EOPNOTSUPP; + if (current->flags & PF_KTHREAD || !current->mm) return -EINVAL; @@ -2324,6 +2339,9 @@ int sp_free(unsigned long addr, int id) .spg_id = id, }; + if (!sp_is_enabled()) + return -EOPNOTSUPP; + check_interrupt_context(); if (current->flags & PF_KTHREAD) @@ -2761,6 +2779,9 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) int ret = 0; struct sp_alloc_context ac; + if (!sp_is_enabled()) + return ERR_PTR(-EOPNOTSUPP); + ret = sp_alloc_prepare(size, sp_flags, spg_id, &ac); if (ret) return ERR_PTR(ret); @@ -3142,6 +3163,9 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, int ret; struct sp_k2u_context kc; + if (!sp_is_enabled()) + return ERR_PTR(-EOPNOTSUPP); + check_interrupt_context(); ret = sp_k2u_prepare(kva, size, sp_flags, spg_id, &kc); @@ -3429,6 +3453,9 @@ void *sp_make_share_u2k(unsigned long uva, unsigned long size, int pid) struct sp_walk_data sp_walk_data; struct vm_struct *area; + if (!sp_is_enabled()) + return ERR_PTR(-EOPNOTSUPP); + check_interrupt_context(); if (mm == NULL) { @@ -3717,6 +3744,9 @@ int sp_unshare(unsigned long va, unsigned long size, int pid, int spg_id) { int ret = 0; + if (!sp_is_enabled()) + return -EOPNOTSUPP; + check_interrupt_context(); if (current->flags & PF_KTHREAD) @@ -3762,6 +3792,9 @@ int sp_walk_page_range(unsigned long uva, unsigned long size, struct mm_struct *mm; int ret = 0; + if (!sp_is_enabled()) + return -EOPNOTSUPP; + check_interrupt_context(); if (unlikely(!sp_walk_data)) { @@ -3807,6 +3840,9 @@ EXPORT_SYMBOL_GPL(mg_sp_walk_page_range); */ void sp_walk_page_free(struct sp_walk_data *sp_walk_data) { + if (!sp_is_enabled()) + return; + check_interrupt_context(); if (!sp_walk_data) @@ -3856,6 +3892,9 @@ bool sp_config_dvpp_range(size_t start, size_t size, int device_id, int pid) struct sp_mapping *spm; unsigned long default_start; + if (!sp_is_enabled()) + return false; + /* NOTE: check the start address */ if (pid < 0 || size <= 0 || size > MMAP_SHARE_POOL_16G_SIZE || device_id < 0 || device_id >= sp_device_number || !is_online_node_id(device_id)) @@ -3916,7 +3955,8 @@ static bool is_sp_normal_addr(unsigned long addr) */ bool is_sharepool_addr(unsigned long addr) { - return is_sp_normal_addr(addr) || is_device_addr(addr); + return sp_is_enabled() && + (is_sp_normal_addr(addr) || is_device_addr(addr)); } EXPORT_SYMBOL_GPL(is_sharepool_addr); @@ -4113,6 +4153,9 @@ int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns, unsigned long anon, file, shmem, total_rss, prot; long sp_res, sp_res_nsize, non_sp_res, non_sp_shm; + if (!sp_is_enabled()) + return 0; + if (!mm) return 0; -- Gitee From e906032bf468bd9dc1f95a90feae88abd33a690a Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Tue, 19 Jul 2022 16:30:15 +0800 Subject: [PATCH 376/674] drm/amdkfd: Use drm_priv to pass VM from KFD to amdgpu stable inclusion from stable-v5.10.112 commit f0c31f192f38c76c2ab09469dd1e636984fb9093 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f0c31f192f38c76c2ab09469dd1e636984fb9093 -------------------------------- commit b40a6ab2cf9213923bf8e821ce7fa7f6a0a26990 upstream. amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu needs the drm_priv to allow mmap to access the BO through the corresponding file descriptor. The VM can also be extracted from drm_priv, so drm_priv can replace the vm parameter in the kfd2kgd interface. Signed-off-by: Felix Kuehling Reviewed-by: Philip Yang Signed-off-by: Alex Deucher [This is a partial cherry-pick of the upstream commit.] Signed-off-by: Lee Jones Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 26f8a2138377..1b4c7ced8b92 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1024,11 +1024,15 @@ int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct kgd_dev *kgd, struct dma_fence **ef) { struct amdgpu_device *adev = get_amdgpu_device(kgd); - struct drm_file *drm_priv = filp->private_data; - struct amdgpu_fpriv *drv_priv = drm_priv->driver_priv; - struct amdgpu_vm *avm = &drv_priv->vm; + struct amdgpu_fpriv *drv_priv; + struct amdgpu_vm *avm; int ret; + ret = amdgpu_file_to_fpriv(filp, &drv_priv); + if (ret) + return ret; + avm = &drv_priv->vm; + /* Already a compute VM? */ if (avm->process_info) return -EINVAL; -- Gitee From 9be8a9dfb0c93b3e4dd447e3ce1b1f23f6b7b1ec Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Tue, 19 Jul 2022 16:30:16 +0800 Subject: [PATCH 377/674] cpuidle: PSCI: Move the `has_lpi` check to the beginning of the function stable inclusion from stable-v5.10.112 commit 503934df3108c75bb4f913c6d483df2ae2c3eff0 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=503934df3108c75bb4f913c6d483df2ae2c3eff0 -------------------------------- commit 01f6c7338ce267959975da65d86ba34f44d54220 upstream. Currently the first thing checked is whether the PCSI cpu_suspend function has been initialized. Another change will be overloading `acpi_processor_ffh_lpi_probe` and calling it sooner. So make the `has_lpi` check the first thing checked to prepare for that change. Reviewed-by: Sudeep Holla Signed-off-by: Mario Limonciello Signed-off-by: Rafael J. Wysocki Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- arch/arm64/kernel/cpuidle.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/cpuidle.c b/arch/arm64/kernel/cpuidle.c index b512b5503f6e..d4ff9ae673fa 100644 --- a/arch/arm64/kernel/cpuidle.c +++ b/arch/arm64/kernel/cpuidle.c @@ -54,6 +54,9 @@ static int psci_acpi_cpu_init_idle(unsigned int cpu) struct acpi_lpi_state *lpi; struct acpi_processor *pr = per_cpu(processors, cpu); + if (unlikely(!pr || !pr->flags.has_lpi)) + return -EINVAL; + /* * If the PSCI cpu_suspend function hook has not been initialized * idle states must not be enabled, so bail out @@ -61,9 +64,6 @@ static int psci_acpi_cpu_init_idle(unsigned int cpu) if (!psci_ops.cpu_suspend) return -EOPNOTSUPP; - if (unlikely(!pr || !pr->flags.has_lpi)) - return -EINVAL; - count = pr->power.count - 1; if (count <= 0) return -ENODEV; -- Gitee From a5c4a16563c4cc73b930e407811fa8af048fdf32 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Tue, 19 Jul 2022 16:30:17 +0800 Subject: [PATCH 378/674] ACPI: processor idle: Check for architectural support for LPI stable inclusion from stable-v5.10.112 commit 5d131318bb8765839e45c0e55fd1f57483e62a50 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=5d131318bb8765839e45c0e55fd1f57483e62a50 -------------------------------- commit eb087f305919ee8169ad65665610313e74260463 upstream. When `osc_pc_lpi_support_confirmed` is set through `_OSC` and `_LPI` is populated then the cpuidle driver assumes that LPI is fully functional. However currently the kernel only provides architectural support for LPI on ARM. This leads to high power consumption on X86 platforms that otherwise try to enable LPI. So probe whether or not LPI support is implemented before enabling LPI in the kernel. This is done by overloading `acpi_processor_ffh_lpi_probe` to check whether it returns `-EOPNOTSUPP`. It also means that all future implementations of `acpi_processor_ffh_lpi_probe` will need to follow these semantics as well. Reviewed-by: Sudeep Holla Signed-off-by: Mario Limonciello Signed-off-by: Rafael J. Wysocki Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/acpi/processor_idle.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 8377c3ed10ff..9921b481c7ee 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -1080,6 +1080,11 @@ static int flatten_lpi_states(struct acpi_processor *pr, return 0; } +int __weak acpi_processor_ffh_lpi_probe(unsigned int cpu) +{ + return -EOPNOTSUPP; +} + static int acpi_processor_get_lpi_info(struct acpi_processor *pr) { int ret, i; @@ -1088,6 +1093,11 @@ static int acpi_processor_get_lpi_info(struct acpi_processor *pr) struct acpi_device *d = NULL; struct acpi_lpi_states_array info[2], *tmp, *prev, *curr; + /* make sure our architecture has support */ + ret = acpi_processor_ffh_lpi_probe(pr->id); + if (ret == -EOPNOTSUPP) + return ret; + if (!osc_pc_lpi_support_confirmed) return -EOPNOTSUPP; @@ -1139,11 +1149,6 @@ static int acpi_processor_get_lpi_info(struct acpi_processor *pr) return 0; } -int __weak acpi_processor_ffh_lpi_probe(unsigned int cpu) -{ - return -ENODEV; -} - int __weak acpi_processor_ffh_lpi_enter(struct acpi_lpi_state *lpi) { return -ENODEV; -- Gitee From 1ba702b21c4b2cc9b68fbef75ae931dfb442b098 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 19 Jul 2022 16:30:18 +0800 Subject: [PATCH 379/674] btrfs: remove unused variable in btrfs_{start,write}_dirty_block_groups() stable inclusion from stable-v5.10.112 commit 921fdc45a08456e456ae031dfa2b0996173097db category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=921fdc45a08456e456ae031dfa2b0996173097db -------------------------------- commit 6d4a6b515c39f1f8763093e0f828959b2fbc2f45 upstream. Clang's version of -Wunused-but-set-variable recently gained support for unary operations, which reveals two unused variables: fs/btrfs/block-group.c:2949:6: error: variable 'num_started' set but not used [-Werror,-Wunused-but-set-variable] int num_started = 0; ^ fs/btrfs/block-group.c:3116:6: error: variable 'num_started' set but not used [-Werror,-Wunused-but-set-variable] int num_started = 0; ^ 2 errors generated. These variables appear to be unused from their introduction, so just remove them to silence the warnings. Fixes: c9dc4c657850 ("Btrfs: two stage dirty block group writeout") Fixes: 1bbc621ef284 ("Btrfs: allow block group cache writeout outside critical section in commit") CC: stable@vger.kernel.org # 5.4+ Link: https://github.com/ClangBuiltLinux/linux/issues/1614 Signed-off-by: Nathan Chancellor Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- fs/btrfs/block-group.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index c99e293b50f5..e351f5319950 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2570,7 +2570,6 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans) struct btrfs_path *path = NULL; LIST_HEAD(dirty); struct list_head *io = &cur_trans->io_bgs; - int num_started = 0; int loops = 0; spin_lock(&cur_trans->dirty_bgs_lock); @@ -2636,7 +2635,6 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans) cache->io_ctl.inode = NULL; ret = btrfs_write_out_cache(trans, cache, path); if (ret == 0 && cache->io_ctl.inode) { - num_started++; should_put = 0; /* @@ -2737,7 +2735,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) int should_put; struct btrfs_path *path; struct list_head *io = &cur_trans->io_bgs; - int num_started = 0; path = btrfs_alloc_path(); if (!path) @@ -2795,7 +2792,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) cache->io_ctl.inode = NULL; ret = btrfs_write_out_cache(trans, cache, path); if (ret == 0 && cache->io_ctl.inode) { - num_started++; should_put = 0; list_add_tail(&cache->io_list, io); } else { -- Gitee From 7d3f6a00b77d08b8f777aeca070a39c9fcc433ec Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Tue, 19 Jul 2022 16:30:19 +0800 Subject: [PATCH 380/674] drm/msm: Add missing put_task_struct() in debugfs path stable inclusion from stable-v5.10.112 commit cb66641f81060a5a5d88f85d2aeb73860edc79df category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=cb66641f81060a5a5d88f85d2aeb73860edc79df -------------------------------- [ Upstream commit ac3e4f42d5ec459f701743debd9c1ad2f2247402 ] Fixes: 25faf2f2e065 ("drm/msm: Show process names in gem_describe") Signed-off-by: Rob Clark Link: https://lore.kernel.org/r/20220317184550.227991-1-robdclark@gmail.com Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/gpu/drm/msm/msm_gem.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index 819567e40565..9c05bf6c4551 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -849,6 +849,7 @@ void msm_gem_describe(struct drm_gem_object *obj, struct seq_file *m) get_pid_task(aspace->pid, PIDTYPE_PID); if (task) { comm = kstrdup(task->comm, GFP_KERNEL); + put_task_struct(task); } else { comm = NULL; } -- Gitee From dff5229263233c766939f0ba745b8e342ba70293 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Tue, 19 Jul 2022 16:30:20 +0800 Subject: [PATCH 381/674] memory: atmel-ebi: Fix missing of_node_put in atmel_ebi_probe stable inclusion from stable-v5.10.112 commit 16c628b0c6faa2fab1a75ffce80860f8106cf002 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=16c628b0c6faa2fab1a75ffce80860f8106cf002 -------------------------------- [ Upstream commit 6f296a9665ba5ac68937bf11f96214eb9de81baa ] The device_node pointer is returned by of_parse_phandle() with refcount incremented. We should use of_node_put() on it when done. Fixes: 87108dc78eb8 ("memory: atmel-ebi: Enable the SMC clock if specified") Signed-off-by: Miaoqian Lin Reviewed-by: Claudiu Beznea Link: https://lore.kernel.org/r/20220309110144.22412-1-linmq006@gmail.com Signed-off-by: Krzysztof Kozlowski Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/memory/atmel-ebi.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/drivers/memory/atmel-ebi.c b/drivers/memory/atmel-ebi.c index c267283b01fd..e749dcb3ddea 100644 --- a/drivers/memory/atmel-ebi.c +++ b/drivers/memory/atmel-ebi.c @@ -544,20 +544,27 @@ static int atmel_ebi_probe(struct platform_device *pdev) smc_np = of_parse_phandle(dev->of_node, "atmel,smc", 0); ebi->smc.regmap = syscon_node_to_regmap(smc_np); - if (IS_ERR(ebi->smc.regmap)) - return PTR_ERR(ebi->smc.regmap); + if (IS_ERR(ebi->smc.regmap)) { + ret = PTR_ERR(ebi->smc.regmap); + goto put_node; + } ebi->smc.layout = atmel_hsmc_get_reg_layout(smc_np); - if (IS_ERR(ebi->smc.layout)) - return PTR_ERR(ebi->smc.layout); + if (IS_ERR(ebi->smc.layout)) { + ret = PTR_ERR(ebi->smc.layout); + goto put_node; + } ebi->smc.clk = of_clk_get(smc_np, 0); if (IS_ERR(ebi->smc.clk)) { - if (PTR_ERR(ebi->smc.clk) != -ENOENT) - return PTR_ERR(ebi->smc.clk); + if (PTR_ERR(ebi->smc.clk) != -ENOENT) { + ret = PTR_ERR(ebi->smc.clk); + goto put_node; + } ebi->smc.clk = NULL; } + of_node_put(smc_np); ret = clk_prepare_enable(ebi->smc.clk); if (ret) return ret; @@ -608,6 +615,10 @@ static int atmel_ebi_probe(struct platform_device *pdev) } return of_platform_populate(np, NULL, NULL, dev); + +put_node: + of_node_put(smc_np); + return ret; } static __maybe_unused int atmel_ebi_resume(struct device *dev) -- Gitee From 85ad5f549c32641b3e7a31b359bd84bd55388a15 Mon Sep 17 00:00:00 2001 From: Cristian Marussi Date: Tue, 19 Jul 2022 16:30:21 +0800 Subject: [PATCH 382/674] firmware: arm_scmi: Fix sorting of retrieved clock rates stable inclusion from stable-v5.10.112 commit 5637129712023680c1b165a758c2c4d702bcd199 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=5637129712023680c1b165a758c2c4d702bcd199 -------------------------------- [ Upstream commit 23274739a5b6166f74d8d9cb5243d7bf6b46aab9 ] During SCMI Clock protocol initialization, after having retrieved from the SCMI platform all the available discrete rates for a specific clock, the clock rates array is sorted, unfortunately using a pointer to its end as a base instead of its start, so that sorting does not work. Fix invocation of sort() passing as base a pointer to the start of the retrieved clock rates array. Link: https://lore.kernel.org/r/20220318092813.49283-1-cristian.marussi@arm.com Fixes: dccec73de91d ("firmware: arm_scmi: Keep the discrete clock rates sorted") Signed-off-by: Cristian Marussi Signed-off-by: Sudeep Holla Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/firmware/arm_scmi/clock.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/firmware/arm_scmi/clock.c b/drivers/firmware/arm_scmi/clock.c index 4645677d86f1..a45678cd9b74 100644 --- a/drivers/firmware/arm_scmi/clock.c +++ b/drivers/firmware/arm_scmi/clock.c @@ -202,7 +202,8 @@ scmi_clock_describe_rates_get(const struct scmi_handle *handle, u32 clk_id, if (rate_discrete && rate) { clk->list.num_rates = tot_rate_cnt; - sort(rate, tot_rate_cnt, sizeof(*rate), rate_cmp_func, NULL); + sort(clk->list.rates, tot_rate_cnt, sizeof(*rate), + rate_cmp_func, NULL); } clk->rate_discrete = rate_discrete; -- Gitee From f274e8812df479b375a9cda1d455f35f29353333 Mon Sep 17 00:00:00 2001 From: Kyle Copperfield Date: Tue, 19 Jul 2022 16:30:22 +0800 Subject: [PATCH 383/674] media: rockchip/rga: do proper error checking in probe stable inclusion from stable-v5.10.112 commit af12dd71235cf9f06ed3e98b1c28f55951effa6d category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=af12dd71235cf9f06ed3e98b1c28f55951effa6d -------------------------------- [ Upstream commit 6150f276073a1480030242a7e006a89e161d6cd6 ] The latest fix for probe error handling contained a typo that causes probing to fail with the following message: rockchip-rga: probe of ff680000.rga failed with error -12 This patch fixes the typo. Fixes: e58430e1d4fd (media: rockchip/rga: fix error handling in probe) Reviewed-by: Dragan Simic Signed-off-by: Kyle Copperfield Reviewed-by: Kieran Bingham Reviewed-by: Dan Carpenter Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/media/platform/rockchip/rga/rga.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/platform/rockchip/rga/rga.c b/drivers/media/platform/rockchip/rga/rga.c index 6759091b15e0..d99ea8973b67 100644 --- a/drivers/media/platform/rockchip/rga/rga.c +++ b/drivers/media/platform/rockchip/rga/rga.c @@ -895,7 +895,7 @@ static int rga_probe(struct platform_device *pdev) } rga->dst_mmu_pages = (unsigned int *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 3); - if (rga->dst_mmu_pages) { + if (!rga->dst_mmu_pages) { ret = -ENOMEM; goto free_src_pages; } -- Gitee From d22a8440aff6344482a5c03c289138a750cdd2ff Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 19 Jul 2022 16:30:23 +0800 Subject: [PATCH 384/674] SUNRPC: Fix the svc_deferred_event trace class stable inclusion from stable-v5.10.112 commit 85ee17ca21cf92989e8c923e3ea4514c291e9d38 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=85ee17ca21cf92989e8c923e3ea4514c291e9d38 -------------------------------- [ Upstream commit 4d5004451ab2218eab94a30e1841462c9316ba19 ] Fix a NULL deref crash that occurs when an svc_rqst is deferred while the sunrpc tracing subsystem is enabled. svc_revisit() sets dr->xprt to NULL, so it can't be relied upon in the tracepoint to provide the remote's address. Unfortunately we can't revert the "svc_deferred_class" hunk in commit ece200ddd54b ("sunrpc: Save remote presentation address in svc_xprt for trace events") because there is now a specific check of event format specifiers for unsafe dereferences. The warning that check emits is: event svc_defer_recv has unsafe dereference of argument 1 A "%pISpc" format specifier with a "struct sockaddr *" is indeed flagged by this check. Instead, take the brute-force approach used by the svcrdma_qp_error tracepoint. Convert the dr::addr field into a presentation address in the TP_fast_assign() arm of the trace event, and store that as a string. This fix can be backported to -stable kernels. In the meantime, commit c6ced22997ad ("tracing: Update print fmt check to handle new __get_sockaddr() macro") is now in v5.18, so this wonky fix can be replaced with __sockaddr() and friends properly during the v5.19 merge window. Fixes: ece200ddd54b ("sunrpc: Save remote presentation address in svc_xprt for trace events") Signed-off-by: Chuck Lever Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- include/trace/events/sunrpc.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 9fe6cdcf2222..8220369ee610 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -1873,17 +1873,18 @@ DECLARE_EVENT_CLASS(svc_deferred_event, TP_STRUCT__entry( __field(const void *, dr) __field(u32, xid) - __string(addr, dr->xprt->xpt_remotebuf) + __array(__u8, addr, INET6_ADDRSTRLEN + 10) ), TP_fast_assign( __entry->dr = dr; __entry->xid = be32_to_cpu(*(__be32 *)(dr->args + (dr->xprt_hlen>>2))); - __assign_str(addr, dr->xprt->xpt_remotebuf); + snprintf(__entry->addr, sizeof(__entry->addr) - 1, + "%pISpc", (struct sockaddr *)&dr->addr); ), - TP_printk("addr=%s dr=%p xid=0x%08x", __get_str(addr), __entry->dr, + TP_printk("addr=%s dr=%p xid=0x%08x", __entry->addr, __entry->dr, __entry->xid) ); -- Gitee From 871e0a3789244674e4db0411f059783ab1a0c272 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Tue, 19 Jul 2022 16:30:24 +0800 Subject: [PATCH 385/674] veth: Ensure eth header is in skb's linear part stable inclusion from stable-v5.10.112 commit d67c900f1947d64ba8a64f693504bcaab8d9000c category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=d67c900f1947d64ba8a64f693504bcaab8d9000c -------------------------------- [ Upstream commit 726e2c5929de841fdcef4e2bf995680688ae1b87 ] After feeding a decapsulated packet to a veth device with act_mirred, skb_headlen() may be 0. But veth_xmit() calls __dev_forward_skb(), which expects at least ETH_HLEN byte of linear data (as __dev_forward_skb2() calls eth_type_trans(), which pulls ETH_HLEN bytes unconditionally). Use pskb_may_pull() to ensure veth_xmit() respects this constraint. kernel BUG at include/linux/skbuff.h:2328! RIP: 0010:eth_type_trans+0xcf/0x140 Call Trace: __dev_forward_skb2+0xe3/0x160 veth_xmit+0x6e/0x250 [veth] dev_hard_start_xmit+0xc7/0x200 __dev_queue_xmit+0x47f/0x520 ? skb_ensure_writable+0x85/0xa0 ? skb_mpls_pop+0x98/0x1c0 tcf_mirred_act+0x442/0x47e [act_mirred] tcf_action_exec+0x86/0x140 fl_classify+0x1d8/0x1e0 [cls_flower] ? dma_pte_clear_level+0x129/0x1a0 ? dma_pte_clear_level+0x129/0x1a0 ? prb_fill_curr_block+0x2f/0xc0 ? skb_copy_bits+0x11a/0x220 __tcf_classify+0x58/0x110 tcf_classify_ingress+0x6b/0x140 __netif_receive_skb_core.constprop.0+0x47d/0xfd0 ? __iommu_dma_unmap_swiotlb+0x44/0x90 __netif_receive_skb_one_core+0x3d/0xa0 netif_receive_skb+0x116/0x170 be_process_rx+0x22f/0x330 [be2net] be_poll+0x13c/0x370 [be2net] __napi_poll+0x2a/0x170 net_rx_action+0x22f/0x2f0 __do_softirq+0xca/0x2a8 __irq_exit_rcu+0xc1/0xe0 common_interrupt+0x83/0xa0 Fixes: e314dbdc1c0d ("[NET]: Virtual ethernet device driver.") Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/net/veth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index f7e3eb309a26..5be8ed910553 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -292,7 +292,7 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) rcu_read_lock(); rcv = rcu_dereference(priv->peer); - if (unlikely(!rcv)) { + if (unlikely(!rcv) || !pskb_may_pull(skb, ETH_HLEN)) { kfree_skb(skb); goto drop; } -- Gitee From 959e7afb724417d91eb14dcf2944560c29ffc66a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 19 Jul 2022 16:30:25 +0800 Subject: [PATCH 386/674] gpiolib: acpi: use correct format characters stable inclusion from stable-v5.10.112 commit 9709c8b5cdc88b1adb77acdbfd6e8a3f942d9af5 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=9709c8b5cdc88b1adb77acdbfd6e8a3f942d9af5 -------------------------------- [ Upstream commit 213d266ebfb1621aab79cfe63388facc520a1381 ] When compiling with -Wformat, clang emits the following warning: gpiolib-acpi.c:393:4: warning: format specifies type 'unsigned char' but the argument has type 'int' [-Wformat] pin); ^~~ So warning that '%hhX' is paired with an 'int' is all just completely mindless and wrong. Sadly, I can see a different bogus warning reason why people would want to use '%02hhX'. Again, the *sane* thing from a human perspective is to use '%02X. But if the compiler doesn't do any range analysis at all, it could decide that "Oh, that print format could need up to 8 bytes of space in the result". Using '%02hhX' would cut that down to two. And since we use char ev_name[5]; and currently use "_%c%02hhX" as the format string, even a compiler that doesn't notice that "pin <= 255" test that guards this all will go "OK, that's at most 4 bytes and the final NUL termination, so it's fine". While a compiler - like gcc - that only sees that the original source of the 'pin' value is a 'unsigned short' array, and then doesn't take the "pin <= 255" into account, will warn like this: gpiolib-acpi.c: In function 'acpi_gpiochip_request_interrupt': gpiolib-acpi.c:206:24: warning: '%02X' directive writing between 2 and 4 bytes into a region of size 3 [-Wformat-overflow=] sprintf(ev_name, "_%c%02X", ^~~~ gpiolib-acpi.c:206:20: note: directive argument in the range [0, 65535] because gcc isn't being very good at that argument range analysis either. In other words, the original use of 'hhx' was bogus to begin with, and due to *another* compiler warning being bad, and we had that bad code being written back in 2016 to work around _that_ compiler warning (commit e40a3ae1f794: "gpio: acpi: work around false-positive -Wstring-overflow warning"). Sadly, two different bad compiler warnings together does not make for one good one. It just makes for even more pain. End result: I think the simplest and cleanest option is simply the proposed change which undoes that '%hhX' change for gcc, and replaces it with just using a slightly bigger stack allocation. It's not like a 5-byte allocation is in any way likely to have saved any actual stack, since all the other variables in that function are 'int' or bigger. False-positive compiler warnings really do make people write worse code, and that's a problem. But on a scale of bad code, I feel that extending the buffer trivially is better than adding a pointless cast that literally makes no sense. At least in this case the end result isn't unreadable or buggy. We've had several cases of bad compiler warnings that caused changes that were actually horrendously wrong. Fixes: e40a3ae1f794 ("gpio: acpi: work around false-positive -Wstring-overflow warning") Signed-off-by: Linus Torvalds Signed-off-by: Andy Shevchenko Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/gpio/gpiolib-acpi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpio/gpiolib-acpi.c b/drivers/gpio/gpiolib-acpi.c index 55e4f402ec8b..44ee319da1b3 100644 --- a/drivers/gpio/gpiolib-acpi.c +++ b/drivers/gpio/gpiolib-acpi.c @@ -276,8 +276,8 @@ static acpi_status acpi_gpiochip_alloc_event(struct acpi_resource *ares, pin = agpio->pin_table[0]; if (pin <= 255) { - char ev_name[5]; - sprintf(ev_name, "_%c%02hhX", + char ev_name[8]; + sprintf(ev_name, "_%c%02X", agpio->triggering == ACPI_EDGE_SENSITIVE ? 'E' : 'L', pin); if (ACPI_SUCCESS(acpi_get_handle(handle, ev_name, &evt_handle))) -- Gitee From 90c40a7f629da238f092827e977ded885831507c Mon Sep 17 00:00:00 2001 From: Calvin Johnson Date: Tue, 19 Jul 2022 16:30:26 +0800 Subject: [PATCH 387/674] net: mdio: Alphabetically sort header inclusion stable inclusion from stable-v5.10.112 commit 43e58e119a2b913d3b7576455b0b1f56b569ca0b category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=43e58e119a2b913d3b7576455b0b1f56b569ca0b -------------------------------- [ Upstream commit 1bf343665057312167750509b0c48e8299293ac5 ] Alphabetically sort header inclusion Signed-off-by: Calvin Johnson Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/net/mdio/mdio-bcm-unimac.c | 16 +++++++--------- drivers/net/mdio/mdio-bitbang.c | 4 ++-- drivers/net/mdio/mdio-cavium.c | 2 +- drivers/net/mdio/mdio-gpio.c | 10 +++++----- drivers/net/mdio/mdio-ipq4019.c | 4 ++-- drivers/net/mdio/mdio-ipq8064.c | 4 ++-- drivers/net/mdio/mdio-mscc-miim.c | 8 ++++---- drivers/net/mdio/mdio-mux-bcm-iproc.c | 10 +++++----- drivers/net/mdio/mdio-mux-gpio.c | 8 ++++---- drivers/net/mdio/mdio-mux-mmioreg.c | 6 +++--- drivers/net/mdio/mdio-mux-multiplexer.c | 2 +- drivers/net/mdio/mdio-mux.c | 6 +++--- drivers/net/mdio/mdio-octeon.c | 8 ++++---- drivers/net/mdio/mdio-thunder.c | 10 +++++----- drivers/net/mdio/mdio-xgene.c | 6 +++--- drivers/net/mdio/of_mdio.c | 10 +++++----- 16 files changed, 56 insertions(+), 58 deletions(-) diff --git a/drivers/net/mdio/mdio-bcm-unimac.c b/drivers/net/mdio/mdio-bcm-unimac.c index fbd36891ee64..5d171e7f118d 100644 --- a/drivers/net/mdio/mdio-bcm-unimac.c +++ b/drivers/net/mdio/mdio-bcm-unimac.c @@ -5,20 +5,18 @@ * Copyright (C) 2014-2017 Broadcom */ +#include +#include +#include #include -#include -#include -#include #include -#include -#include -#include - #include -#include #include - +#include +#include #include +#include +#include #define MDIO_CMD 0x00 #define MDIO_START_BUSY (1 << 29) diff --git a/drivers/net/mdio/mdio-bitbang.c b/drivers/net/mdio/mdio-bitbang.c index 5136275c8e73..99588192cc78 100644 --- a/drivers/net/mdio/mdio-bitbang.c +++ b/drivers/net/mdio/mdio-bitbang.c @@ -14,10 +14,10 @@ * Vitaly Bordug */ -#include +#include #include +#include #include -#include #define MDIO_READ 2 #define MDIO_WRITE 1 diff --git a/drivers/net/mdio/mdio-cavium.c b/drivers/net/mdio/mdio-cavium.c index 1afd6fc1a351..95ce274c1be1 100644 --- a/drivers/net/mdio/mdio-cavium.c +++ b/drivers/net/mdio/mdio-cavium.c @@ -4,9 +4,9 @@ */ #include +#include #include #include -#include #include "mdio-cavium.h" diff --git a/drivers/net/mdio/mdio-gpio.c b/drivers/net/mdio/mdio-gpio.c index 1b00235d7dc5..56c8f914f893 100644 --- a/drivers/net/mdio/mdio-gpio.c +++ b/drivers/net/mdio/mdio-gpio.c @@ -17,15 +17,15 @@ * Vitaly Bordug */ -#include -#include +#include #include -#include -#include #include #include -#include +#include #include +#include +#include +#include struct mdio_gpio_info { struct mdiobb_ctrl ctrl; diff --git a/drivers/net/mdio/mdio-ipq4019.c b/drivers/net/mdio/mdio-ipq4019.c index 25c25ea6da66..9cd71d896963 100644 --- a/drivers/net/mdio/mdio-ipq4019.c +++ b/drivers/net/mdio/mdio-ipq4019.c @@ -3,10 +3,10 @@ /* Copyright (c) 2020 Sartura Ltd. */ #include -#include -#include #include #include +#include +#include #include #include #include diff --git a/drivers/net/mdio/mdio-ipq8064.c b/drivers/net/mdio/mdio-ipq8064.c index f0a6bfa61645..49d4e9aa30bb 100644 --- a/drivers/net/mdio/mdio-ipq8064.c +++ b/drivers/net/mdio/mdio-ipq8064.c @@ -7,12 +7,12 @@ #include #include +#include #include -#include #include #include #include -#include +#include /* MII address register definitions */ #define MII_ADDR_REG_ADDR 0x10 diff --git a/drivers/net/mdio/mdio-mscc-miim.c b/drivers/net/mdio/mdio-mscc-miim.c index 1c9232fca1e2..037649bef92e 100644 --- a/drivers/net/mdio/mdio-mscc-miim.c +++ b/drivers/net/mdio/mdio-mscc-miim.c @@ -6,14 +6,14 @@ * Copyright (c) 2017 Microsemi Corporation */ -#include -#include -#include -#include #include #include #include +#include +#include #include +#include +#include #define MSCC_MIIM_REG_STATUS 0x0 #define MSCC_MIIM_STATUS_STAT_PENDING BIT(2) diff --git a/drivers/net/mdio/mdio-mux-bcm-iproc.c b/drivers/net/mdio/mdio-mux-bcm-iproc.c index 42fb5f166136..641cfa41f492 100644 --- a/drivers/net/mdio/mdio-mux-bcm-iproc.c +++ b/drivers/net/mdio/mdio-mux-bcm-iproc.c @@ -3,14 +3,14 @@ * Copyright 2016 Broadcom */ #include -#include +#include #include -#include +#include +#include #include +#include #include -#include -#include -#include +#include #define MDIO_RATE_ADJ_EXT_OFFSET 0x000 #define MDIO_RATE_ADJ_INT_OFFSET 0x004 diff --git a/drivers/net/mdio/mdio-mux-gpio.c b/drivers/net/mdio/mdio-mux-gpio.c index 10a758fdc9e6..3c7f16f06b45 100644 --- a/drivers/net/mdio/mdio-mux-gpio.c +++ b/drivers/net/mdio/mdio-mux-gpio.c @@ -3,13 +3,13 @@ * Copyright (C) 2011, 2012 Cavium, Inc. */ -#include #include -#include +#include +#include #include +#include #include -#include -#include +#include #define DRV_VERSION "1.1" #define DRV_DESCRIPTION "GPIO controlled MDIO bus multiplexer driver" diff --git a/drivers/net/mdio/mdio-mux-mmioreg.c b/drivers/net/mdio/mdio-mux-mmioreg.c index d1a8780e24d8..c02fb2a067ee 100644 --- a/drivers/net/mdio/mdio-mux-mmioreg.c +++ b/drivers/net/mdio/mdio-mux-mmioreg.c @@ -7,13 +7,13 @@ * Copyright 2012 Freescale Semiconductor, Inc. */ -#include #include +#include +#include #include #include -#include #include -#include +#include struct mdio_mux_mmioreg_state { void *mux_handle; diff --git a/drivers/net/mdio/mdio-mux-multiplexer.c b/drivers/net/mdio/mdio-mux-multiplexer.c index d6564381aa3e..527acfc3c045 100644 --- a/drivers/net/mdio/mdio-mux-multiplexer.c +++ b/drivers/net/mdio/mdio-mux-multiplexer.c @@ -4,10 +4,10 @@ * Copyright 2019 NXP */ -#include #include #include #include +#include struct mdio_mux_multiplexer_state { struct mux_control *muxc; diff --git a/drivers/net/mdio/mdio-mux.c b/drivers/net/mdio/mdio-mux.c index ccb3ee704eb1..3dde0c2b3e09 100644 --- a/drivers/net/mdio/mdio-mux.c +++ b/drivers/net/mdio/mdio-mux.c @@ -3,12 +3,12 @@ * Copyright (C) 2011, 2012 Cavium, Inc. */ -#include -#include -#include #include +#include #include +#include #include +#include #define DRV_DESCRIPTION "MDIO bus multiplexer driver" diff --git a/drivers/net/mdio/mdio-octeon.c b/drivers/net/mdio/mdio-octeon.c index 6faf39314ac9..e096e68ac667 100644 --- a/drivers/net/mdio/mdio-octeon.c +++ b/drivers/net/mdio/mdio-octeon.c @@ -3,13 +3,13 @@ * Copyright (C) 2009-2015 Cavium, Inc. */ -#include +#include +#include +#include #include #include -#include -#include #include -#include +#include #include "mdio-cavium.h" diff --git a/drivers/net/mdio/mdio-thunder.c b/drivers/net/mdio/mdio-thunder.c index dd7430c998a2..822d2cdd2f35 100644 --- a/drivers/net/mdio/mdio-thunder.c +++ b/drivers/net/mdio/mdio-thunder.c @@ -3,14 +3,14 @@ * Copyright (C) 2009-2016 Cavium, Inc. */ -#include -#include -#include +#include #include -#include #include -#include +#include +#include +#include #include +#include #include "mdio-cavium.h" diff --git a/drivers/net/mdio/mdio-xgene.c b/drivers/net/mdio/mdio-xgene.c index 461207cdf5d6..7ab4e26db08c 100644 --- a/drivers/net/mdio/mdio-xgene.c +++ b/drivers/net/mdio/mdio-xgene.c @@ -13,11 +13,11 @@ #include #include #include -#include -#include #include -#include +#include +#include #include +#include #include static bool xgene_mdio_status; diff --git a/drivers/net/mdio/of_mdio.c b/drivers/net/mdio/of_mdio.c index 4daf94bb56a5..ea0bf13e8ac3 100644 --- a/drivers/net/mdio/of_mdio.c +++ b/drivers/net/mdio/of_mdio.c @@ -8,17 +8,17 @@ * out of the OpenFirmware device tree and using it to populate an mii_bus. */ -#include #include -#include #include -#include -#include +#include +#include +#include #include #include #include #include -#include +#include +#include #define DEFAULT_GPIO_RESET_DELAY 10 /* in microseconds */ -- Gitee From 7245b48fdf9bfaa2208cd9cc0a52c8ba9a83e9bc Mon Sep 17 00:00:00 2001 From: Vadim Pasternak Date: Tue, 19 Jul 2022 16:30:27 +0800 Subject: [PATCH 388/674] mlxsw: i2c: Fix initialization error flow stable inclusion from stable-v5.10.112 commit 7a7cf84148416d93d5904eb9a7cf5499ab852b84 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=7a7cf84148416d93d5904eb9a7cf5499ab852b84 -------------------------------- [ Upstream commit d452088cdfd5a4ad9d96d847d2273fe958d6339b ] Add mutex_destroy() call in driver initialization error flow. Fixes: 6882b0aee180f ("mlxsw: Introduce support for I2C bus") Signed-off-by: Vadim Pasternak Signed-off-by: Ido Schimmel Link: https://lore.kernel.org/r/20220407070703.2421076-1-idosch@nvidia.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/net/ethernet/mellanox/mlxsw/i2c.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/i2c.c b/drivers/net/ethernet/mellanox/mlxsw/i2c.c index 939b692ffc33..ce843ea91464 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/i2c.c +++ b/drivers/net/ethernet/mellanox/mlxsw/i2c.c @@ -650,6 +650,7 @@ static int mlxsw_i2c_probe(struct i2c_client *client, return 0; errout: + mutex_destroy(&mlxsw_i2c->cmd.lock); i2c_set_clientdata(client, NULL); return err; -- Gitee From b8be64b53ebbf0643cfb6bb46bc207290acbefe3 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 19 Jul 2022 16:30:28 +0800 Subject: [PATCH 389/674] net/sched: fix initialization order when updating chain 0 head stable inclusion from stable-v5.10.112 commit f2cc341fcc42a2507d9c36e94317f5e0834fc5fd category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f2cc341fcc42a2507d9c36e94317f5e0834fc5fd -------------------------------- [ Upstream commit e65812fd22eba32f11abe28cb377cbd64cfb1ba0 ] Currently, when inserting a new filter that needs to sit at the head of chain 0, it will first update the heads pointer on all devices using the (shared) block, and only then complete the initialization of the new element so that it has a "next" element. This can lead to a situation that the chain 0 head is propagated to another CPU before the "next" initialization is done. When this race condition is triggered, packets being matched on that CPU will simply miss all other filters, and will flow through the stack as if there were no other filters installed. If the system is using OVS + TC, such packets will get handled by vswitchd via upcall, which results in much higher latency and reordering. For other applications it may result in packet drops. This is reproducible with a tc only setup, but it varies from system to system. It could be reproduced with a shared block amongst 10 veth tunnels, and an ingress filter mirroring packets to another veth. That's because using the last added veth tunnel to the shared block to do the actual traffic, it makes the race window bigger and easier to trigger. The fix is rather simple, to just initialize the next pointer of the new filter instance (tp) before propagating the head change. The fixes tag is pointing to the original code though this issue should only be observed when using it unlocked. Fixes: 2190d1d0944f ("net: sched: introduce helpers to work with filter chains") Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: Vlad Buslov Reviewed-by: Davide Caratti Link: https://lore.kernel.org/r/b97d5f4eaffeeb9d058155bcab63347527261abf.1649341369.git.marcelo.leitner@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- net/sched/cls_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 9a789a057a74..b8ffb7e4f696 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1656,10 +1656,10 @@ static int tcf_chain_tp_insert(struct tcf_chain *chain, if (chain->flushing) return -EAGAIN; + RCU_INIT_POINTER(tp->next, tcf_chain_tp_prev(chain, chain_info)); if (*chain_info->pprev == chain->filter_chain) tcf_chain0_head_change(chain, tp); tcf_proto_get(tp); - RCU_INIT_POINTER(tp->next, tcf_chain_tp_prev(chain, chain_info)); rcu_assign_pointer(*chain_info->pprev, tp); return 0; -- Gitee From 4578de3d669952439941ad0ba4fc4b50fb1e85ba Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Tue, 19 Jul 2022 16:30:29 +0800 Subject: [PATCH 390/674] net: dsa: felix: suppress -EPROBE_DEFER errors stable inclusion from stable-v5.10.112 commit 2ad9d890d850db6c55ec2d9b5b3cbb7ca5178f3f category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=2ad9d890d850db6c55ec2d9b5b3cbb7ca5178f3f -------------------------------- [ Upstream commit e6934e4048c91502efcb21da92b7ae37cd8fa741 ] The DSA master might not have been probed yet in which case the probe of the felix switch fails with -EPROBE_DEFER: [ 4.435305] mscc_felix 0000:00:00.5: Failed to register DSA switch: -517 It is not an error. Use dev_err_probe() to demote this particular error to a debug message. Fixes: 56051948773e ("net: dsa: ocelot: add driver for Felix switch family") Signed-off-by: Michael Walle Reviewed-by: Vladimir Oltean Link: https://lore.kernel.org/r/20220408101521.281886-1-michael@walle.cc Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/net/dsa/ocelot/felix_vsc9959.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/ocelot/felix_vsc9959.c b/drivers/net/dsa/ocelot/felix_vsc9959.c index cd8d9b0e0edb..c96dfc11aa6f 100644 --- a/drivers/net/dsa/ocelot/felix_vsc9959.c +++ b/drivers/net/dsa/ocelot/felix_vsc9959.c @@ -1466,7 +1466,7 @@ static int felix_pci_probe(struct pci_dev *pdev, err = dsa_register_switch(ds); if (err) { - dev_err(&pdev->dev, "Failed to register DSA switch: %d\n", err); + dev_err_probe(&pdev->dev, err, "Failed to register DSA switch\n"); goto err_register_ds; } -- Gitee From 29da03f5d2a00383d7da4b7a600ac824f4e27f31 Mon Sep 17 00:00:00 2001 From: Dinh Nguyen Date: Tue, 19 Jul 2022 16:30:30 +0800 Subject: [PATCH 391/674] net: ethernet: stmmac: fix altr_tse_pcs function when using a fixed-link stable inclusion from stable-v5.10.112 commit 08d5e3e954537931c8da7428034808d202e98299 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=08d5e3e954537931c8da7428034808d202e98299 -------------------------------- [ Upstream commit a6aaa00324240967272b451bfa772547bd576ee6 ] When using a fixed-link, the altr_tse_pcs driver crashes due to null-pointer dereference as no phy_device is provided to tse_pcs_fix_mac_speed function. Fix this by adding a check for phy_dev before calling the tse_pcs_fix_mac_speed() function. Also clean up the tse_pcs_fix_mac_speed function a bit. There is no need to check for splitter_base and sgmii_adapter_base because the driver will fail if these 2 variables are not derived from the device tree. Fixes: fb3bbdb85989 ("net: ethernet: Add TSE PCS support to dwmac-socfpga") Signed-off-by: Dinh Nguyen Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/net/ethernet/stmicro/stmmac/altr_tse_pcs.c | 8 -------- drivers/net/ethernet/stmicro/stmmac/altr_tse_pcs.h | 4 ++++ drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c | 13 +++++-------- 3 files changed, 9 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/altr_tse_pcs.c b/drivers/net/ethernet/stmicro/stmmac/altr_tse_pcs.c index cd478d2cd871..00f6d347eaf7 100644 --- a/drivers/net/ethernet/stmicro/stmmac/altr_tse_pcs.c +++ b/drivers/net/ethernet/stmicro/stmmac/altr_tse_pcs.c @@ -57,10 +57,6 @@ #define TSE_PCS_USE_SGMII_ENA BIT(0) #define TSE_PCS_IF_USE_SGMII 0x03 -#define SGMII_ADAPTER_CTRL_REG 0x00 -#define SGMII_ADAPTER_DISABLE 0x0001 -#define SGMII_ADAPTER_ENABLE 0x0000 - #define AUTONEGO_LINK_TIMER 20 static int tse_pcs_reset(void __iomem *base, struct tse_pcs *pcs) @@ -202,12 +198,8 @@ void tse_pcs_fix_mac_speed(struct tse_pcs *pcs, struct phy_device *phy_dev, unsigned int speed) { void __iomem *tse_pcs_base = pcs->tse_pcs_base; - void __iomem *sgmii_adapter_base = pcs->sgmii_adapter_base; u32 val; - writew(SGMII_ADAPTER_ENABLE, - sgmii_adapter_base + SGMII_ADAPTER_CTRL_REG); - pcs->autoneg = phy_dev->autoneg; if (phy_dev->autoneg == AUTONEG_ENABLE) { diff --git a/drivers/net/ethernet/stmicro/stmmac/altr_tse_pcs.h b/drivers/net/ethernet/stmicro/stmmac/altr_tse_pcs.h index 442812c0a4bd..694ac25ef426 100644 --- a/drivers/net/ethernet/stmicro/stmmac/altr_tse_pcs.h +++ b/drivers/net/ethernet/stmicro/stmmac/altr_tse_pcs.h @@ -10,6 +10,10 @@ #include #include +#define SGMII_ADAPTER_CTRL_REG 0x00 +#define SGMII_ADAPTER_ENABLE 0x0000 +#define SGMII_ADAPTER_DISABLE 0x0001 + struct tse_pcs { struct device *dev; void __iomem *tse_pcs_base; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c index f37b6d57b2fe..8bb0106cb7ea 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c @@ -18,9 +18,6 @@ #include "altr_tse_pcs.h" -#define SGMII_ADAPTER_CTRL_REG 0x00 -#define SGMII_ADAPTER_DISABLE 0x0001 - #define SYSMGR_EMACGRP_CTRL_PHYSEL_ENUM_GMII_MII 0x0 #define SYSMGR_EMACGRP_CTRL_PHYSEL_ENUM_RGMII 0x1 #define SYSMGR_EMACGRP_CTRL_PHYSEL_ENUM_RMII 0x2 @@ -62,16 +59,14 @@ static void socfpga_dwmac_fix_mac_speed(void *priv, unsigned int speed) { struct socfpga_dwmac *dwmac = (struct socfpga_dwmac *)priv; void __iomem *splitter_base = dwmac->splitter_base; - void __iomem *tse_pcs_base = dwmac->pcs.tse_pcs_base; void __iomem *sgmii_adapter_base = dwmac->pcs.sgmii_adapter_base; struct device *dev = dwmac->dev; struct net_device *ndev = dev_get_drvdata(dev); struct phy_device *phy_dev = ndev->phydev; u32 val; - if ((tse_pcs_base) && (sgmii_adapter_base)) - writew(SGMII_ADAPTER_DISABLE, - sgmii_adapter_base + SGMII_ADAPTER_CTRL_REG); + writew(SGMII_ADAPTER_DISABLE, + sgmii_adapter_base + SGMII_ADAPTER_CTRL_REG); if (splitter_base) { val = readl(splitter_base + EMAC_SPLITTER_CTRL_REG); @@ -93,7 +88,9 @@ static void socfpga_dwmac_fix_mac_speed(void *priv, unsigned int speed) writel(val, splitter_base + EMAC_SPLITTER_CTRL_REG); } - if (tse_pcs_base && sgmii_adapter_base) + writew(SGMII_ADAPTER_ENABLE, + sgmii_adapter_base + SGMII_ADAPTER_CTRL_REG); + if (phy_dev) tse_pcs_fix_mac_speed(&dwmac->pcs, phy_dev, speed); } -- Gitee From 1421e20fee73d26e077b2fc005b15d6f346416cb Mon Sep 17 00:00:00 2001 From: Benedikt Spranger Date: Tue, 19 Jul 2022 16:30:31 +0800 Subject: [PATCH 392/674] net/sched: taprio: Check if socket flags are valid stable inclusion from stable-v5.10.112 commit a44938950e5e12dd676ec2b38701b6d3c4e1a281 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=a44938950e5e12dd676ec2b38701b6d3c4e1a281 -------------------------------- [ Upstream commit e8a64bbaaad1f6548cec5508297bc6d45e8ab69e ] A user may set the SO_TXTIME socket option to ensure a packet is send at a given time. The taprio scheduler has to confirm, that it is allowed to send a packet at that given time, by a check against the packet time schedule. The scheduler drop the packet, if the gates are closed at the given send time. The check, if SO_TXTIME is set, may fail since sk_flags are part of an union and the union is used otherwise. This happen, if a socket is not a full socket, like a request socket for example. Add a check to verify, if the union is used for sk_flags. Fixes: 4cfd5779bd6e ("taprio: Add support for txtime-assist mode") Signed-off-by: Benedikt Spranger Reviewed-by: Kurt Kanzenbach Acked-by: Vinicius Costa Gomes Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- net/sched/sch_taprio.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 806babdd838d..eca525791013 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -427,7 +427,8 @@ static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (unlikely(!child)) return qdisc_drop(skb, sch, to_free); - if (skb->sk && sock_flag(skb->sk, SOCK_TXTIME)) { + /* sk_flags are only safe to use on full sockets. */ + if (skb->sk && sk_fullsock(skb->sk) && sock_flag(skb->sk, SOCK_TXTIME)) { if (!is_valid_interval(skb, sch)) return qdisc_drop(skb, sch, to_free); } else if (TXTIME_ASSIST_IS_ENABLED(q->flags)) { -- Gitee From cf3b55065cc63242aa66207a9a7aa8abe63d6693 Mon Sep 17 00:00:00 2001 From: Rameshkumar Sundaram Date: Tue, 19 Jul 2022 16:30:32 +0800 Subject: [PATCH 393/674] cfg80211: hold bss_lock while updating nontrans_list stable inclusion from stable-v5.10.112 commit 5513f9a0b0684383c1bc120ea37b35ae944a709a category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=5513f9a0b0684383c1bc120ea37b35ae944a709a -------------------------------- [ Upstream commit a5199b5626cd6913cf8776a835bc63d40e0686ad ] Synchronize additions to nontrans_list of transmitting BSS with bss_lock to avoid races. Also when cfg80211_add_nontrans_list() fails __cfg80211_unlink_bss() needs bss_lock to be held (has lockdep assert on bss_lock). So protect the whole block with bss_lock to avoid races and warnings. Found during code review. Fixes: 0b8fb8235be8 ("cfg80211: Parsing of Multiple BSSID information in scanning") Signed-off-by: Rameshkumar Sundaram Link: https://lore.kernel.org/r/1649668071-9370-1-git-send-email-quic_ramess@quicinc.com Signed-off-by: Johannes Berg Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- net/wireless/scan.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/wireless/scan.c b/net/wireless/scan.c index c1b2655682a8..6dc9b7e22b71 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -1968,11 +1968,13 @@ cfg80211_inform_single_bss_data(struct wiphy *wiphy, /* this is a nontransmitting bss, we need to add it to * transmitting bss' list if it is not there */ + spin_lock_bh(&rdev->bss_lock); if (cfg80211_add_nontrans_list(non_tx_data->tx_bss, &res->pub)) { if (__cfg80211_unlink_bss(rdev, res)) rdev->bss_generation++; } + spin_unlock_bh(&rdev->bss_lock); } trace_cfg80211_return_bss(&res->pub); -- Gitee From 8d0ffc27fd8f031c68c7d1fae4ea65819e71f052 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Tue, 19 Jul 2022 16:30:33 +0800 Subject: [PATCH 394/674] drm/msm: Fix range size vs end confusion stable inclusion from stable-v5.10.112 commit 5f78ad93837c61ed62a6f7b10a960c0c9218382c category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=5f78ad93837c61ed62a6f7b10a960c0c9218382c -------------------------------- [ Upstream commit 537fef808be5ea56f6fc06932162550819a3b3c3 ] The fourth param is size, rather than range_end. Note that we could increase the address space size if we had a way to prevent buffers from spanning a 4G split, mostly just to avoid fw bugs with 64b math. Fixes: 84c31ee16f90 ("drm/msm/a6xx: Add support for per-instance pagetables") Signed-off-by: Rob Clark Link: https://lore.kernel.org/r/20220407202836.1211268-1-robdclark@gmail.com Signed-off-by: Rob Clark Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c index 9e09805575db..39563daff4a0 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c @@ -1222,7 +1222,7 @@ a6xx_create_private_address_space(struct msm_gpu *gpu) return ERR_CAST(mmu); return msm_gem_address_space_create(mmu, - "gpu", 0x100000000ULL, 0x1ffffffffULL); + "gpu", 0x100000000ULL, SZ_4G); } static uint32_t a6xx_get_rptr(struct msm_gpu *gpu, struct msm_ringbuffer *ring) -- Gitee From e8dcb50674b91f69cf1c54d27af9ff783b2305c1 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Tue, 19 Jul 2022 16:30:34 +0800 Subject: [PATCH 395/674] drm/msm/dsi: Use connector directly in msm_dsi_manager_connector_init() stable inclusion from stable-v5.10.112 commit 98a7f6c4ada4ca97fcb0d508e097f8597df19d4d category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=98a7f6c4ada4ca97fcb0d508e097f8597df19d4d -------------------------------- [ Upstream commit 47b7de6b88b962ef339a2427a023d2a23d161654 ] The member 'msm_dsi->connector' isn't assigned until msm_dsi_manager_connector_init() returns (see msm_dsi_modeset_init() and how it assigns the return value). Therefore this pointer is going to be NULL here. Let's use 'connector' which is what was intended. Cc: Dmitry Baryshkov Cc: Sean Paul Fixes: 6d5e78406991 ("drm/msm/dsi: Move dsi panel init into modeset init path") Signed-off-by: Stephen Boyd Reviewed-by: Dmitry Baryshkov Patchwork: https://patchwork.freedesktop.org/patch/478693/ Link: https://lore.kernel.org/r/20220318000731.2823718-1-swboyd@chromium.org Signed-off-by: Dmitry Baryshkov Signed-off-by: Rob Clark Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/gpu/drm/msm/dsi/dsi_manager.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/dsi/dsi_manager.c b/drivers/gpu/drm/msm/dsi/dsi_manager.c index 1d28dfba2c9b..fb421ca56b3d 100644 --- a/drivers/gpu/drm/msm/dsi/dsi_manager.c +++ b/drivers/gpu/drm/msm/dsi/dsi_manager.c @@ -644,7 +644,7 @@ struct drm_connector *msm_dsi_manager_connector_init(u8 id) return connector; fail: - connector->funcs->destroy(msm_dsi->connector); + connector->funcs->destroy(connector); return ERR_PTR(ret); } -- Gitee From 0a0cbbcba4acbadb29ef77ce3cdf774688a9f533 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Tue, 19 Jul 2022 16:30:35 +0800 Subject: [PATCH 396/674] net/smc: Fix NULL pointer dereference in smc_pnet_find_ib() stable inclusion from stable-v5.10.112 commit 35b91e49bc80ca944a8679c3b139ddaf2f8eea0f category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=35b91e49bc80ca944a8679c3b139ddaf2f8eea0f -------------------------------- [ Upstream commit d22f4f977236f97e01255a80bca2ea93a8094fc8 ] dev_name() was called with dev.parent as argument but without to NULL-check it before. Solve this by checking the pointer before the call to dev_name(). Fixes: af5f60c7e3d5 ("net/smc: allow PCI IDs as ib device names in the pnet table") Reported-by: syzbot+03e3e228510223dabd34@syzkaller.appspotmail.com Signed-off-by: Karsten Graul Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- net/smc/smc_pnet.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 9007c7e3bae4..30bae60d626c 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -310,8 +310,9 @@ static struct smc_ib_device *smc_pnet_find_ib(char *ib_name) list_for_each_entry(ibdev, &smc_ib_devices.list, list) { if (!strncmp(ibdev->ibdev->name, ib_name, sizeof(ibdev->ibdev->name)) || - !strncmp(dev_name(ibdev->ibdev->dev.parent), ib_name, - IB_DEVICE_NAME_MAX - 1)) { + (ibdev->ibdev->dev.parent && + !strncmp(dev_name(ibdev->ibdev->dev.parent), ib_name, + IB_DEVICE_NAME_MAX - 1))) { goto out; } } -- Gitee From 892570c0513e94daee3c90231e744554fd1ed370 Mon Sep 17 00:00:00 2001 From: Ajish Koshy Date: Tue, 19 Jul 2022 16:30:36 +0800 Subject: [PATCH 397/674] scsi: pm80xx: Mask and unmask upper interrupt vectors 32-63 stable inclusion from stable-v5.10.112 commit e08d26971237d1af8c3ca456f5729f3f4033890b category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=e08d26971237d1af8c3ca456f5729f3f4033890b -------------------------------- [ Upstream commit 294080eacf92a0781e6d43663448a55001ec8c64 ] When upper inbound and outbound queues 32-63 are enabled, we see upper vectors 32-63 in interrupt service routine. We need corresponding registers to handle masking and unmasking of these upper interrupts. To achieve this, we use registers MSGU_ODMR_U(0x34) to mask and MSGU_ODMR_CLR_U(0x3C) to unmask the interrupts. In these registers bit 0-31 represents interrupt vectors 32-63. Link: https://lore.kernel.org/r/20220411064603.668448-2-Ajish.Koshy@microchip.com Fixes: 05c6c029a44d ("scsi: pm80xx: Increase number of supported queues") Reviewed-by: John Garry Acked-by: Jack Wang Signed-off-by: Ajish Koshy Signed-off-by: Viswas G Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/scsi/pm8001/pm80xx_hwi.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/drivers/scsi/pm8001/pm80xx_hwi.c b/drivers/scsi/pm8001/pm80xx_hwi.c index a66a06914913..e1014bb9ee28 100644 --- a/drivers/scsi/pm8001/pm80xx_hwi.c +++ b/drivers/scsi/pm8001/pm80xx_hwi.c @@ -1682,10 +1682,11 @@ static void pm80xx_chip_interrupt_enable(struct pm8001_hba_info *pm8001_ha, u8 vec) { #ifdef PM8001_USE_MSIX - u32 mask; - mask = (u32)(1 << vec); - - pm8001_cw32(pm8001_ha, 0, MSGU_ODMR_CLR, (u32)(mask & 0xFFFFFFFF)); + if (vec < 32) + pm8001_cw32(pm8001_ha, 0, MSGU_ODMR_CLR, 1U << vec); + else + pm8001_cw32(pm8001_ha, 0, MSGU_ODMR_CLR_U, + 1U << (vec - 32)); return; #endif pm80xx_chip_intx_interrupt_enable(pm8001_ha); @@ -1701,12 +1702,15 @@ static void pm80xx_chip_interrupt_disable(struct pm8001_hba_info *pm8001_ha, u8 vec) { #ifdef PM8001_USE_MSIX - u32 mask; - if (vec == 0xFF) - mask = 0xFFFFFFFF; + if (vec == 0xFF) { + /* disable all vectors 0-31, 32-63 */ + pm8001_cw32(pm8001_ha, 0, MSGU_ODMR, 0xFFFFFFFF); + pm8001_cw32(pm8001_ha, 0, MSGU_ODMR_U, 0xFFFFFFFF); + } else if (vec < 32) + pm8001_cw32(pm8001_ha, 0, MSGU_ODMR, 1U << vec); else - mask = (u32)(1 << vec); - pm8001_cw32(pm8001_ha, 0, MSGU_ODMR, (u32)(mask & 0xFFFFFFFF)); + pm8001_cw32(pm8001_ha, 0, MSGU_ODMR_U, + 1U << (vec - 32)); return; #endif pm80xx_chip_intx_interrupt_disable(pm8001_ha); -- Gitee From 6ad5d4746792d911f5dde2f44071fe381bd095ed Mon Sep 17 00:00:00 2001 From: Ajish Koshy Date: Tue, 19 Jul 2022 16:30:37 +0800 Subject: [PATCH 398/674] scsi: pm80xx: Enable upper inbound, outbound queues stable inclusion from stable-v5.10.112 commit da9cf24aa739f48cdbf5b2624eacb4a6c81cf9e9 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=da9cf24aa739f48cdbf5b2624eacb4a6c81cf9e9 -------------------------------- [ Upstream commit bcd8a45223470e00b5f254018174d64a75db4bbe ] Executing driver on servers with more than 32 CPUs were faced with command timeouts. This is because we were not geting completions for commands submitted on IQ32 - IQ63. Set E64Q bit to enable upper inbound and outbound queues 32 to 63 in the MPI main configuration table. Added 500ms delay after successful MPI initialization as mentioned in controller datasheet. Link: https://lore.kernel.org/r/20220411064603.668448-3-Ajish.Koshy@microchip.com Fixes: 05c6c029a44d ("scsi: pm80xx: Increase number of supported queues") Reviewed-by: Damien Le Moal Acked-by: Jack Wang Signed-off-by: Ajish Koshy Signed-off-by: Viswas G Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/scsi/pm8001/pm80xx_hwi.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/scsi/pm8001/pm80xx_hwi.c b/drivers/scsi/pm8001/pm80xx_hwi.c index e1014bb9ee28..51345f02383d 100644 --- a/drivers/scsi/pm8001/pm80xx_hwi.c +++ b/drivers/scsi/pm8001/pm80xx_hwi.c @@ -765,6 +765,10 @@ static void init_default_table_values(struct pm8001_hba_info *pm8001_ha) pm8001_ha->main_cfg_tbl.pm80xx_tbl.pcs_event_log_severity = 0x01; pm8001_ha->main_cfg_tbl.pm80xx_tbl.fatal_err_interrupt = 0x01; + /* Enable higher IQs and OQs, 32 to 63, bit 16 */ + if (pm8001_ha->max_q_num > 32) + pm8001_ha->main_cfg_tbl.pm80xx_tbl.fatal_err_interrupt |= + 1 << 16; /* Disable end to end CRC checking */ pm8001_ha->main_cfg_tbl.pm80xx_tbl.crc_core_dump = (0x1 << 16); @@ -1024,6 +1028,13 @@ static int mpi_init_check(struct pm8001_hba_info *pm8001_ha) if (0x0000 != gst_len_mpistate) return -EBUSY; + /* + * As per controller datasheet, after successful MPI + * initialization minimum 500ms delay is required before + * issuing commands. + */ + msleep(500); + return 0; } -- Gitee From c19ae4e35a05c1b9deb051f21612276f4f5c259f Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Tue, 19 Jul 2022 16:30:38 +0800 Subject: [PATCH 399/674] scsi: iscsi: Stop queueing during ep_disconnect stable inclusion from stable-v5.10.112 commit 17d14456f6262b87f2ce6e749cc52ebdfa90949d category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=17d14456f6262b87f2ce6e749cc52ebdfa90949d -------------------------------- [ Upstream commit 891e2639deae721dc43764a44fa255890dc34313 ] During ep_disconnect we have been doing iscsi_suspend_tx/queue to block new I/O but every driver except cxgbi and iscsi_tcp can still get I/O from __iscsi_conn_send_pdu() if we haven't called iscsi_conn_failure() before ep_disconnect. This could happen if we were terminating the session, and the logout timed out before it was even sent to libiscsi. Fix the issue by adding a helper which reverses the bind_conn call that allows new I/O to be queued. Drivers implementing ep_disconnect can use this to make sure new I/O is not queued to them when handling the disconnect. Link: https://lore.kernel.org/r/20210525181821.7617-3-michael.christie@oracle.com Reviewed-by: Lee Duncan Signed-off-by: Mike Christie Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/infiniband/ulp/iser/iscsi_iser.c | 1 + drivers/scsi/be2iscsi/be_main.c | 1 + drivers/scsi/bnx2i/bnx2i_iscsi.c | 1 + drivers/scsi/cxgbi/cxgb3i/cxgb3i.c | 1 + drivers/scsi/cxgbi/cxgb4i/cxgb4i.c | 1 + drivers/scsi/libiscsi.c | 70 +++++++++++++++++++++--- drivers/scsi/qedi/qedi_iscsi.c | 1 + drivers/scsi/qla4xxx/ql4_os.c | 1 + drivers/scsi/scsi_transport_iscsi.c | 10 +++- include/scsi/libiscsi.h | 1 + include/scsi/scsi_transport_iscsi.h | 1 + 11 files changed, 78 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c index 3690e28cc7ea..8857d8322710 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.c +++ b/drivers/infiniband/ulp/iser/iscsi_iser.c @@ -988,6 +988,7 @@ static struct iscsi_transport iscsi_iser_transport = { /* connection management */ .create_conn = iscsi_iser_conn_create, .bind_conn = iscsi_iser_conn_bind, + .unbind_conn = iscsi_conn_unbind, .destroy_conn = iscsi_conn_teardown, .attr_is_visible = iser_attr_is_visible, .set_param = iscsi_iser_set_param, diff --git a/drivers/scsi/be2iscsi/be_main.c b/drivers/scsi/be2iscsi/be_main.c index 987dc8135a9b..b977e039bb78 100644 --- a/drivers/scsi/be2iscsi/be_main.c +++ b/drivers/scsi/be2iscsi/be_main.c @@ -5810,6 +5810,7 @@ struct iscsi_transport beiscsi_iscsi_transport = { .destroy_session = beiscsi_session_destroy, .create_conn = beiscsi_conn_create, .bind_conn = beiscsi_conn_bind, + .unbind_conn = iscsi_conn_unbind, .destroy_conn = iscsi_conn_teardown, .attr_is_visible = beiscsi_attr_is_visible, .set_iface_param = beiscsi_iface_set_param, diff --git a/drivers/scsi/bnx2i/bnx2i_iscsi.c b/drivers/scsi/bnx2i/bnx2i_iscsi.c index 37f5b719050e..197aca76a69d 100644 --- a/drivers/scsi/bnx2i/bnx2i_iscsi.c +++ b/drivers/scsi/bnx2i/bnx2i_iscsi.c @@ -2276,6 +2276,7 @@ struct iscsi_transport bnx2i_iscsi_transport = { .destroy_session = bnx2i_session_destroy, .create_conn = bnx2i_conn_create, .bind_conn = bnx2i_conn_bind, + .unbind_conn = iscsi_conn_unbind, .destroy_conn = bnx2i_conn_destroy, .attr_is_visible = bnx2i_attr_is_visible, .set_param = iscsi_set_param, diff --git a/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c b/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c index 37d99357120f..edcd3fab6973 100644 --- a/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c +++ b/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c @@ -117,6 +117,7 @@ static struct iscsi_transport cxgb3i_iscsi_transport = { /* connection management */ .create_conn = cxgbi_create_conn, .bind_conn = cxgbi_bind_conn, + .unbind_conn = iscsi_conn_unbind, .destroy_conn = iscsi_tcp_conn_teardown, .start_conn = iscsi_conn_start, .stop_conn = iscsi_conn_stop, diff --git a/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c b/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c index 2c3491528d42..efb3e2b3398e 100644 --- a/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c +++ b/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c @@ -134,6 +134,7 @@ static struct iscsi_transport cxgb4i_iscsi_transport = { /* connection management */ .create_conn = cxgbi_create_conn, .bind_conn = cxgbi_bind_conn, + .unbind_conn = iscsi_conn_unbind, .destroy_conn = iscsi_tcp_conn_teardown, .start_conn = iscsi_conn_start, .stop_conn = iscsi_conn_stop, diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c index 7ef5efd94422..e361856509d5 100644 --- a/drivers/scsi/libiscsi.c +++ b/drivers/scsi/libiscsi.c @@ -1386,23 +1386,32 @@ void iscsi_session_failure(struct iscsi_session *session, } EXPORT_SYMBOL_GPL(iscsi_session_failure); -void iscsi_conn_failure(struct iscsi_conn *conn, enum iscsi_err err) +static bool iscsi_set_conn_failed(struct iscsi_conn *conn) { struct iscsi_session *session = conn->session; - spin_lock_bh(&session->frwd_lock); - if (session->state == ISCSI_STATE_FAILED) { - spin_unlock_bh(&session->frwd_lock); - return; - } + if (session->state == ISCSI_STATE_FAILED) + return false; if (conn->stop_stage == 0) session->state = ISCSI_STATE_FAILED; - spin_unlock_bh(&session->frwd_lock); set_bit(ISCSI_SUSPEND_BIT, &conn->suspend_tx); set_bit(ISCSI_SUSPEND_BIT, &conn->suspend_rx); - iscsi_conn_error_event(conn->cls_conn, err); + return true; +} + +void iscsi_conn_failure(struct iscsi_conn *conn, enum iscsi_err err) +{ + struct iscsi_session *session = conn->session; + bool needs_evt; + + spin_lock_bh(&session->frwd_lock); + needs_evt = iscsi_set_conn_failed(conn); + spin_unlock_bh(&session->frwd_lock); + + if (needs_evt) + iscsi_conn_error_event(conn->cls_conn, err); } EXPORT_SYMBOL_GPL(iscsi_conn_failure); @@ -2178,6 +2187,51 @@ static void iscsi_check_transport_timeouts(struct timer_list *t) spin_unlock(&session->frwd_lock); } +/** + * iscsi_conn_unbind - prevent queueing to conn. + * @cls_conn: iscsi conn ep is bound to. + * @is_active: is the conn in use for boot or is this for EH/termination + * + * This must be called by drivers implementing the ep_disconnect callout. + * It disables queueing to the connection from libiscsi in preparation for + * an ep_disconnect call. + */ +void iscsi_conn_unbind(struct iscsi_cls_conn *cls_conn, bool is_active) +{ + struct iscsi_session *session; + struct iscsi_conn *conn; + + if (!cls_conn) + return; + + conn = cls_conn->dd_data; + session = conn->session; + /* + * Wait for iscsi_eh calls to exit. We don't wait for the tmf to + * complete or timeout. The caller just wants to know what's running + * is everything that needs to be cleaned up, and no cmds will be + * queued. + */ + mutex_lock(&session->eh_mutex); + + iscsi_suspend_queue(conn); + iscsi_suspend_tx(conn); + + spin_lock_bh(&session->frwd_lock); + if (!is_active) { + /* + * if logout timed out before userspace could even send a PDU + * the state might still be in ISCSI_STATE_LOGGED_IN and + * allowing new cmds and TMFs. + */ + if (session->state == ISCSI_STATE_LOGGED_IN) + iscsi_set_conn_failed(conn); + } + spin_unlock_bh(&session->frwd_lock); + mutex_unlock(&session->eh_mutex); +} +EXPORT_SYMBOL_GPL(iscsi_conn_unbind); + static void iscsi_prep_abort_task_pdu(struct iscsi_task *task, struct iscsi_tm *hdr) { diff --git a/drivers/scsi/qedi/qedi_iscsi.c b/drivers/scsi/qedi/qedi_iscsi.c index f51723e2d522..8f8036ae9a56 100644 --- a/drivers/scsi/qedi/qedi_iscsi.c +++ b/drivers/scsi/qedi/qedi_iscsi.c @@ -1428,6 +1428,7 @@ struct iscsi_transport qedi_iscsi_transport = { .destroy_session = qedi_session_destroy, .create_conn = qedi_conn_create, .bind_conn = qedi_conn_bind, + .unbind_conn = iscsi_conn_unbind, .start_conn = qedi_conn_start, .stop_conn = iscsi_conn_stop, .destroy_conn = qedi_conn_destroy, diff --git a/drivers/scsi/qla4xxx/ql4_os.c b/drivers/scsi/qla4xxx/ql4_os.c index 2c23b692e318..a8b8bf118c76 100644 --- a/drivers/scsi/qla4xxx/ql4_os.c +++ b/drivers/scsi/qla4xxx/ql4_os.c @@ -259,6 +259,7 @@ static struct iscsi_transport qla4xxx_iscsi_transport = { .start_conn = qla4xxx_conn_start, .create_conn = qla4xxx_conn_create, .bind_conn = qla4xxx_conn_bind, + .unbind_conn = iscsi_conn_unbind, .stop_conn = iscsi_conn_stop, .destroy_conn = qla4xxx_conn_destroy, .set_param = iscsi_set_param, diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index a5759d0e388a..6490211b55d2 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -2957,7 +2957,7 @@ static int iscsi_if_ep_connect(struct iscsi_transport *transport, } static int iscsi_if_ep_disconnect(struct iscsi_transport *transport, - u64 ep_handle) + u64 ep_handle, bool is_active) { struct iscsi_cls_conn *conn; struct iscsi_endpoint *ep; @@ -2974,6 +2974,8 @@ static int iscsi_if_ep_disconnect(struct iscsi_transport *transport, conn->ep = NULL; mutex_unlock(&conn->ep_mutex); conn->state = ISCSI_CONN_FAILED; + + transport->unbind_conn(conn, is_active); } transport->ep_disconnect(ep); @@ -3005,7 +3007,8 @@ iscsi_if_transport_ep(struct iscsi_transport *transport, break; case ISCSI_UEVENT_TRANSPORT_EP_DISCONNECT: rc = iscsi_if_ep_disconnect(transport, - ev->u.ep_disconnect.ep_handle); + ev->u.ep_disconnect.ep_handle, + false); break; } return rc; @@ -3730,7 +3733,7 @@ iscsi_if_recv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, uint32_t *group) conn = iscsi_conn_lookup(ev->u.b_conn.sid, ev->u.b_conn.cid); if (conn && conn->ep) - iscsi_if_ep_disconnect(transport, conn->ep->id); + iscsi_if_ep_disconnect(transport, conn->ep->id, true); if (!session || !conn) { err = -EINVAL; @@ -4649,6 +4652,7 @@ iscsi_register_transport(struct iscsi_transport *tt) int err; BUG_ON(!tt); + WARN_ON(tt->ep_disconnect && !tt->unbind_conn); priv = iscsi_if_transport_lookup(tt); if (priv) diff --git a/include/scsi/libiscsi.h b/include/scsi/libiscsi.h index b47428d86a4b..881e4762d626 100644 --- a/include/scsi/libiscsi.h +++ b/include/scsi/libiscsi.h @@ -424,6 +424,7 @@ extern int iscsi_conn_start(struct iscsi_cls_conn *); extern void iscsi_conn_stop(struct iscsi_cls_conn *, int); extern int iscsi_conn_bind(struct iscsi_cls_session *, struct iscsi_cls_conn *, int); +extern void iscsi_conn_unbind(struct iscsi_cls_conn *cls_conn, bool is_active); extern void iscsi_conn_failure(struct iscsi_conn *conn, enum iscsi_err err); extern void iscsi_session_failure(struct iscsi_session *session, enum iscsi_err err); diff --git a/include/scsi/scsi_transport_iscsi.h b/include/scsi/scsi_transport_iscsi.h index f28bb20d6271..eb6ed499324d 100644 --- a/include/scsi/scsi_transport_iscsi.h +++ b/include/scsi/scsi_transport_iscsi.h @@ -82,6 +82,7 @@ struct iscsi_transport { void (*destroy_session) (struct iscsi_cls_session *session); struct iscsi_cls_conn *(*create_conn) (struct iscsi_cls_session *sess, uint32_t cid); + void (*unbind_conn) (struct iscsi_cls_conn *conn, bool is_active); int (*bind_conn) (struct iscsi_cls_session *session, struct iscsi_cls_conn *cls_conn, uint64_t transport_eph, int is_leading); -- Gitee From 46a768c953aeaf0744e3543eb51d534c4fde1cfb Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Tue, 19 Jul 2022 16:30:39 +0800 Subject: [PATCH 400/674] scsi: iscsi: Force immediate failure during shutdown stable inclusion from stable-v5.10.112 commit 4029a1e992fc6a0e29ff56c74a9632317fe76022 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=4029a1e992fc6a0e29ff56c74a9632317fe76022 -------------------------------- [ Upstream commit 06c203a5566beecebb1f8838d026de8a61c8df71 ] If the system is not up, we can just fail immediately since iscsid is not going to ever answer our netlink events. We are already setting the recovery_tmo to 0, but by passing stop_conn STOP_CONN_TERM we never will block the session and start the recovery timer, because for that flag userspace will do the unbind and destroy events which would remove the devices and wake up and kill the eh. Since the conn is dead and the system is going dowm this just has us use STOP_CONN_RECOVER with recovery_tmo=0 so we fail immediately. However, if the user has set the recovery_tmo=-1 we let the system hang like they requested since they might have used that setting for specific reasons (one known reason is for buggy cluster software). Link: https://lore.kernel.org/r/20210525181821.7617-5-michael.christie@oracle.com Signed-off-by: Mike Christie Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/scsi/scsi_transport_iscsi.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index 6490211b55d2..f93660142e35 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -2508,11 +2508,17 @@ static void stop_conn_work_fn(struct work_struct *work) session = iscsi_session_lookup(sid); if (session) { if (system_state != SYSTEM_RUNNING) { - session->recovery_tmo = 0; - iscsi_if_stop_conn(conn, STOP_CONN_TERM); - } else { - iscsi_if_stop_conn(conn, STOP_CONN_RECOVER); + /* + * If the user has set up for the session to + * never timeout then hang like they wanted. + * For all other cases fail right away since + * userspace is not going to relogin. + */ + if (session->recovery_tmo > 0) + session->recovery_tmo = 0; } + + iscsi_if_stop_conn(conn, STOP_CONN_RECOVER); } list_del_init(&conn->conn_list_err); -- Gitee From 0be1155784fa93ef8777bab4219fccb588c1ed26 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Tue, 19 Jul 2022 16:30:40 +0800 Subject: [PATCH 401/674] scsi: iscsi: Use system_unbound_wq for destroy_work stable inclusion from stable-v5.10.112 commit 22608545b834b855c0a7fd9f675edc884b7d21a3 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=22608545b834b855c0a7fd9f675edc884b7d21a3 -------------------------------- [ Upstream commit b25b957d2db1585602c2c70fdf4261a5641fe6b7 ] Use the system_unbound_wq for async session destruction. We don't need a dedicated workqueue for async session destruction because: 1. perf does not seem to be an issue since we only allow 1 active work. 2. it does not have deps with other system works and we can run them in parallel with each other. Link: https://lore.kernel.org/r/20210525181821.7617-6-michael.christie@oracle.com Reviewed-by: Lee Duncan Signed-off-by: Mike Christie Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/scsi/scsi_transport_iscsi.c | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index f93660142e35..ed0c7e812445 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -95,8 +95,6 @@ static DECLARE_WORK(stop_conn_work, stop_conn_work_fn); static atomic_t iscsi_session_nr; /* sysfs session id for next new session */ static struct workqueue_struct *iscsi_eh_timer_workq; -static struct workqueue_struct *iscsi_destroy_workq; - static DEFINE_IDA(iscsi_sess_ida); /* * list of registered transports and lock that must @@ -3717,7 +3715,7 @@ iscsi_if_recv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, uint32_t *group) list_del_init(&session->sess_list); spin_unlock_irqrestore(&sesslock, flags); - queue_work(iscsi_destroy_workq, &session->destroy_work); + queue_work(system_unbound_wq, &session->destroy_work); } break; case ISCSI_UEVENT_UNBIND_SESSION: @@ -4813,18 +4811,8 @@ static __init int iscsi_transport_init(void) goto release_nls; } - iscsi_destroy_workq = alloc_workqueue("%s", - WQ_SYSFS | __WQ_LEGACY | WQ_MEM_RECLAIM | WQ_UNBOUND, - 1, "iscsi_destroy"); - if (!iscsi_destroy_workq) { - err = -ENOMEM; - goto destroy_wq; - } - return 0; -destroy_wq: - destroy_workqueue(iscsi_eh_timer_workq); release_nls: netlink_kernel_release(nls); unregister_flashnode_bus: @@ -4846,7 +4834,6 @@ static __init int iscsi_transport_init(void) static void __exit iscsi_transport_exit(void) { - destroy_workqueue(iscsi_destroy_workq); destroy_workqueue(iscsi_eh_timer_workq); netlink_kernel_release(nls); bus_unregister(&iscsi_flashnode_bus); -- Gitee From aa1de7f19c05dba66a6b7a09e7c6984c9bd4c663 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Tue, 19 Jul 2022 16:30:41 +0800 Subject: [PATCH 402/674] scsi: iscsi: Rel ref after iscsi_lookup_endpoint() stable inclusion from stable-v5.10.112 commit 812573896711f3ffb083c374f26f5807efca3b2f category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=812573896711f3ffb083c374f26f5807efca3b2f -------------------------------- [ Upstream commit 9e5fe1700896c85040943fdc0d3fee0dd3e0d36f ] Subsequent commits allow the kernel to do ep_disconnect. In that case we will have to get a proper refcount on the ep so one thread does not delete it from under another. Link: https://lore.kernel.org/r/20210525181821.7617-7-michael.christie@oracle.com Reviewed-by: Lee Duncan Signed-off-by: Mike Christie Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/infiniband/ulp/iser/iscsi_iser.c | 1 + drivers/scsi/be2iscsi/be_iscsi.c | 19 ++++++++++++------ drivers/scsi/bnx2i/bnx2i_iscsi.c | 23 +++++++++++++++------- drivers/scsi/cxgbi/libcxgbi.c | 12 ++++++++---- drivers/scsi/qedi/qedi_iscsi.c | 25 +++++++++++++++++------- drivers/scsi/qla4xxx/ql4_os.c | 1 + drivers/scsi/scsi_transport_iscsi.c | 25 ++++++++++++++++-------- include/scsi/scsi_transport_iscsi.h | 1 + 8 files changed, 75 insertions(+), 32 deletions(-) diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c index 8857d8322710..a16e066989fa 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.c +++ b/drivers/infiniband/ulp/iser/iscsi_iser.c @@ -499,6 +499,7 @@ iscsi_iser_conn_bind(struct iscsi_cls_session *cls_session, iser_conn->iscsi_conn = conn; out: + iscsi_put_endpoint(ep); mutex_unlock(&iser_conn->state_mutex); return error; } diff --git a/drivers/scsi/be2iscsi/be_iscsi.c b/drivers/scsi/be2iscsi/be_iscsi.c index a13c203ef7a9..c4881657a807 100644 --- a/drivers/scsi/be2iscsi/be_iscsi.c +++ b/drivers/scsi/be2iscsi/be_iscsi.c @@ -182,6 +182,7 @@ int beiscsi_conn_bind(struct iscsi_cls_session *cls_session, struct beiscsi_endpoint *beiscsi_ep; struct iscsi_endpoint *ep; uint16_t cri_index; + int rc = 0; ep = iscsi_lookup_endpoint(transport_fd); if (!ep) @@ -189,15 +190,17 @@ int beiscsi_conn_bind(struct iscsi_cls_session *cls_session, beiscsi_ep = ep->dd_data; - if (iscsi_conn_bind(cls_session, cls_conn, is_leading)) - return -EINVAL; + if (iscsi_conn_bind(cls_session, cls_conn, is_leading)) { + rc = -EINVAL; + goto put_ep; + } if (beiscsi_ep->phba != phba) { beiscsi_log(phba, KERN_ERR, BEISCSI_LOG_CONFIG, "BS_%d : beiscsi_ep->hba=%p not equal to phba=%p\n", beiscsi_ep->phba, phba); - - return -EEXIST; + rc = -EEXIST; + goto put_ep; } cri_index = BE_GET_CRI_FROM_CID(beiscsi_ep->ep_cid); if (phba->conn_table[cri_index]) { @@ -209,7 +212,8 @@ int beiscsi_conn_bind(struct iscsi_cls_session *cls_session, beiscsi_ep->ep_cid, beiscsi_conn, phba->conn_table[cri_index]); - return -EINVAL; + rc = -EINVAL; + goto put_ep; } } @@ -226,7 +230,10 @@ int beiscsi_conn_bind(struct iscsi_cls_session *cls_session, "BS_%d : cid %d phba->conn_table[%u]=%p\n", beiscsi_ep->ep_cid, cri_index, beiscsi_conn); phba->conn_table[cri_index] = beiscsi_conn; - return 0; + +put_ep: + iscsi_put_endpoint(ep); + return rc; } static int beiscsi_iface_create_ipv4(struct beiscsi_hba *phba) diff --git a/drivers/scsi/bnx2i/bnx2i_iscsi.c b/drivers/scsi/bnx2i/bnx2i_iscsi.c index 197aca76a69d..8cf2f9a7cfdc 100644 --- a/drivers/scsi/bnx2i/bnx2i_iscsi.c +++ b/drivers/scsi/bnx2i/bnx2i_iscsi.c @@ -1420,17 +1420,23 @@ static int bnx2i_conn_bind(struct iscsi_cls_session *cls_session, * Forcefully terminate all in progress connection recovery at the * earliest, either in bind(), send_pdu(LOGIN), or conn_start() */ - if (bnx2i_adapter_ready(hba)) - return -EIO; + if (bnx2i_adapter_ready(hba)) { + ret_code = -EIO; + goto put_ep; + } bnx2i_ep = ep->dd_data; if ((bnx2i_ep->state == EP_STATE_TCP_FIN_RCVD) || - (bnx2i_ep->state == EP_STATE_TCP_RST_RCVD)) + (bnx2i_ep->state == EP_STATE_TCP_RST_RCVD)) { /* Peer disconnect via' FIN or RST */ - return -EINVAL; + ret_code = -EINVAL; + goto put_ep; + } - if (iscsi_conn_bind(cls_session, cls_conn, is_leading)) - return -EINVAL; + if (iscsi_conn_bind(cls_session, cls_conn, is_leading)) { + ret_code = -EINVAL; + goto put_ep; + } if (bnx2i_ep->hba != hba) { /* Error - TCP connection does not belong to this device @@ -1441,7 +1447,8 @@ static int bnx2i_conn_bind(struct iscsi_cls_session *cls_session, iscsi_conn_printk(KERN_ALERT, cls_conn->dd_data, "belong to hba (%s)\n", hba->netdev->name); - return -EEXIST; + ret_code = -EEXIST; + goto put_ep; } bnx2i_ep->conn = bnx2i_conn; bnx2i_conn->ep = bnx2i_ep; @@ -1458,6 +1465,8 @@ static int bnx2i_conn_bind(struct iscsi_cls_session *cls_session, bnx2i_put_rq_buf(bnx2i_conn, 0); bnx2i_arm_cq_event_coalescing(bnx2i_conn->ep, CNIC_ARM_CQE); +put_ep: + iscsi_put_endpoint(ep); return ret_code; } diff --git a/drivers/scsi/cxgbi/libcxgbi.c b/drivers/scsi/cxgbi/libcxgbi.c index ecb134b4699f..506b561670af 100644 --- a/drivers/scsi/cxgbi/libcxgbi.c +++ b/drivers/scsi/cxgbi/libcxgbi.c @@ -2690,11 +2690,13 @@ int cxgbi_bind_conn(struct iscsi_cls_session *cls_session, err = csk->cdev->csk_ddp_setup_pgidx(csk, csk->tid, ppm->tformat.pgsz_idx_dflt); if (err < 0) - return err; + goto put_ep; err = iscsi_conn_bind(cls_session, cls_conn, is_leading); - if (err) - return -EINVAL; + if (err) { + err = -EINVAL; + goto put_ep; + } /* calculate the tag idx bits needed for this conn based on cmds_max */ cconn->task_idx_bits = (__ilog2_u32(conn->session->cmds_max - 1)) + 1; @@ -2715,7 +2717,9 @@ int cxgbi_bind_conn(struct iscsi_cls_session *cls_session, /* init recv engine */ iscsi_tcp_hdr_recv_prep(tcp_conn); - return 0; +put_ep: + iscsi_put_endpoint(ep); + return err; } EXPORT_SYMBOL_GPL(cxgbi_bind_conn); diff --git a/drivers/scsi/qedi/qedi_iscsi.c b/drivers/scsi/qedi/qedi_iscsi.c index 8f8036ae9a56..5f7e62f19d83 100644 --- a/drivers/scsi/qedi/qedi_iscsi.c +++ b/drivers/scsi/qedi/qedi_iscsi.c @@ -387,6 +387,7 @@ static int qedi_conn_bind(struct iscsi_cls_session *cls_session, struct qedi_ctx *qedi = iscsi_host_priv(shost); struct qedi_endpoint *qedi_ep; struct iscsi_endpoint *ep; + int rc = 0; ep = iscsi_lookup_endpoint(transport_fd); if (!ep) @@ -394,11 +395,16 @@ static int qedi_conn_bind(struct iscsi_cls_session *cls_session, qedi_ep = ep->dd_data; if ((qedi_ep->state == EP_STATE_TCP_FIN_RCVD) || - (qedi_ep->state == EP_STATE_TCP_RST_RCVD)) - return -EINVAL; + (qedi_ep->state == EP_STATE_TCP_RST_RCVD)) { + rc = -EINVAL; + goto put_ep; + } + + if (iscsi_conn_bind(cls_session, cls_conn, is_leading)) { + rc = -EINVAL; + goto put_ep; + } - if (iscsi_conn_bind(cls_session, cls_conn, is_leading)) - return -EINVAL; qedi_ep->conn = qedi_conn; qedi_conn->ep = qedi_ep; @@ -408,13 +414,18 @@ static int qedi_conn_bind(struct iscsi_cls_session *cls_session, qedi_conn->cmd_cleanup_req = 0; qedi_conn->cmd_cleanup_cmpl = 0; - if (qedi_bind_conn_to_iscsi_cid(qedi, qedi_conn)) - return -EINVAL; + if (qedi_bind_conn_to_iscsi_cid(qedi, qedi_conn)) { + rc = -EINVAL; + goto put_ep; + } + spin_lock_init(&qedi_conn->tmf_work_lock); INIT_LIST_HEAD(&qedi_conn->tmf_work_list); init_waitqueue_head(&qedi_conn->wait_queue); - return 0; +put_ep: + iscsi_put_endpoint(ep); + return rc; } static int qedi_iscsi_update_conn(struct qedi_ctx *qedi, diff --git a/drivers/scsi/qla4xxx/ql4_os.c b/drivers/scsi/qla4xxx/ql4_os.c index a8b8bf118c76..8d82d2a83059 100644 --- a/drivers/scsi/qla4xxx/ql4_os.c +++ b/drivers/scsi/qla4xxx/ql4_os.c @@ -3238,6 +3238,7 @@ static int qla4xxx_conn_bind(struct iscsi_cls_session *cls_session, conn = cls_conn->dd_data; qla_conn = conn->dd_data; qla_conn->qla_ep = ep->dd_data; + iscsi_put_endpoint(ep); return 0; } diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index ed0c7e812445..dea0944ebbfa 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -266,9 +266,20 @@ void iscsi_destroy_endpoint(struct iscsi_endpoint *ep) } EXPORT_SYMBOL_GPL(iscsi_destroy_endpoint); +void iscsi_put_endpoint(struct iscsi_endpoint *ep) +{ + put_device(&ep->dev); +} +EXPORT_SYMBOL_GPL(iscsi_put_endpoint); + +/** + * iscsi_lookup_endpoint - get ep from handle + * @handle: endpoint handle + * + * Caller must do a iscsi_put_endpoint. + */ struct iscsi_endpoint *iscsi_lookup_endpoint(u64 handle) { - struct iscsi_endpoint *ep; struct device *dev; dev = class_find_device(&iscsi_endpoint_class, NULL, &handle, @@ -276,13 +287,7 @@ struct iscsi_endpoint *iscsi_lookup_endpoint(u64 handle) if (!dev) return NULL; - ep = iscsi_dev_to_endpoint(dev); - /* - * we can drop this now because the interface will prevent - * removals and lookups from racing. - */ - put_device(dev); - return ep; + return iscsi_dev_to_endpoint(dev); } EXPORT_SYMBOL_GPL(iscsi_lookup_endpoint); @@ -2983,6 +2988,7 @@ static int iscsi_if_ep_disconnect(struct iscsi_transport *transport, } transport->ep_disconnect(ep); + iscsi_put_endpoint(ep); return 0; } @@ -3008,6 +3014,7 @@ iscsi_if_transport_ep(struct iscsi_transport *transport, ev->r.retcode = transport->ep_poll(ep, ev->u.ep_poll.timeout_ms); + iscsi_put_endpoint(ep); break; case ISCSI_UEVENT_TRANSPORT_EP_DISCONNECT: rc = iscsi_if_ep_disconnect(transport, @@ -3691,6 +3698,7 @@ iscsi_if_recv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, uint32_t *group) ev->u.c_bound_session.initial_cmdsn, ev->u.c_bound_session.cmds_max, ev->u.c_bound_session.queue_depth); + iscsi_put_endpoint(ep); break; case ISCSI_UEVENT_DESTROY_SESSION: session = iscsi_session_lookup(ev->u.d_session.sid); @@ -3762,6 +3770,7 @@ iscsi_if_recv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, uint32_t *group) mutex_lock(&conn->ep_mutex); conn->ep = ep; mutex_unlock(&conn->ep_mutex); + iscsi_put_endpoint(ep); } else iscsi_cls_conn_printk(KERN_ERR, conn, "Could not set ep conn " diff --git a/include/scsi/scsi_transport_iscsi.h b/include/scsi/scsi_transport_iscsi.h index eb6ed499324d..09992f019dc9 100644 --- a/include/scsi/scsi_transport_iscsi.h +++ b/include/scsi/scsi_transport_iscsi.h @@ -444,6 +444,7 @@ extern int iscsi_scan_finished(struct Scsi_Host *shost, unsigned long time); extern struct iscsi_endpoint *iscsi_create_endpoint(int dd_size); extern void iscsi_destroy_endpoint(struct iscsi_endpoint *ep); extern struct iscsi_endpoint *iscsi_lookup_endpoint(u64 handle); +extern void iscsi_put_endpoint(struct iscsi_endpoint *ep); extern int iscsi_block_scsi_eh(struct scsi_cmnd *cmd); extern struct iscsi_iface *iscsi_create_iface(struct Scsi_Host *shost, struct iscsi_transport *t, -- Gitee From 3cc519b403fc2f1807a4b6c9acd76fd423494a1a Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Tue, 19 Jul 2022 16:30:42 +0800 Subject: [PATCH 403/674] scsi: iscsi: Fix in-kernel conn failure handling stable inclusion from stable-v5.10.112 commit 46f37a34a53d4c12fa413b7aec7fe7c3e5cece51 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=46f37a34a53d4c12fa413b7aec7fe7c3e5cece51 -------------------------------- [ Upstream commit 23d6fefbb3f6b1cc29794427588b470ed06ff64e ] Commit 0ab710458da1 ("scsi: iscsi: Perform connection failure entirely in kernel space") has the following regressions/bugs that this patch fixes: 1. It can return cmds to upper layers like dm-multipath where that can retry them. After they are successful the fs/app can send new I/O to the same sectors, but we've left the cmds running in FW or in the net layer. We need to be calling ep_disconnect if userspace is not up. This patch only fixes the issue for offload drivers. iscsi_tcp will be fixed in separate commit because it doesn't have a ep_disconnect call. 2. The drivers that implement ep_disconnect expect that it's called before conn_stop. Besides crashes, if the cleanup_task callout is called before ep_disconnect it might free up driver/card resources for session1 then they could be allocated for session2. But because the driver's ep_disconnect is not called it has not cleaned up the firmware so the card is still using the resources for the original cmd. 3. The stop_conn_work_fn can run after userspace has done its recovery and we are happily using the session. We will then end up with various bugs depending on what is going on at the time. We may also run stop_conn_work_fn late after userspace has called stop_conn and ep_disconnect and is now going to call start/bind conn. If stop_conn_work_fn runs after bind but before start, we would leave the conn in a unbound but sort of started state where IO might be allowed even though the drivers have been set in a state where they no longer expect I/O. 4. Returning -EAGAIN in iscsi_if_destroy_conn if we haven't yet run the in kernel stop_conn function is breaking userspace. We should have been doing this for the caller. Link: https://lore.kernel.org/r/20210525181821.7617-8-michael.christie@oracle.com Fixes: 0ab710458da1 ("scsi: iscsi: Perform connection failure entirely in kernel space") Reviewed-by: Lee Duncan Signed-off-by: Mike Christie Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/scsi/scsi_transport_iscsi.c | 471 ++++++++++++++++------------ include/scsi/scsi_transport_iscsi.h | 10 +- 2 files changed, 283 insertions(+), 198 deletions(-) diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index dea0944ebbfa..7246f7b2ff03 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -86,15 +86,11 @@ struct iscsi_internal { struct transport_container session_cont; }; -/* Worker to perform connection failure on unresponsive connections - * completely in kernel space. - */ -static void stop_conn_work_fn(struct work_struct *work); -static DECLARE_WORK(stop_conn_work, stop_conn_work_fn); - static atomic_t iscsi_session_nr; /* sysfs session id for next new session */ static struct workqueue_struct *iscsi_eh_timer_workq; +static struct workqueue_struct *iscsi_conn_cleanup_workq; + static DEFINE_IDA(iscsi_sess_ida); /* * list of registered transports and lock that must @@ -1601,12 +1597,6 @@ static DECLARE_TRANSPORT_CLASS(iscsi_connection_class, static struct sock *nls; static DEFINE_MUTEX(rx_queue_mutex); -/* - * conn_mutex protects the {start,bind,stop,destroy}_conn from racing - * against the kernel stop_connection recovery mechanism - */ -static DEFINE_MUTEX(conn_mutex); - static LIST_HEAD(sesslist); static DEFINE_SPINLOCK(sesslock); static LIST_HEAD(connlist); @@ -2228,6 +2218,123 @@ void iscsi_remove_session(struct iscsi_cls_session *session) } EXPORT_SYMBOL_GPL(iscsi_remove_session); +static void iscsi_stop_conn(struct iscsi_cls_conn *conn, int flag) +{ + ISCSI_DBG_TRANS_CONN(conn, "Stopping conn.\n"); + + switch (flag) { + case STOP_CONN_RECOVER: + conn->state = ISCSI_CONN_FAILED; + break; + case STOP_CONN_TERM: + conn->state = ISCSI_CONN_DOWN; + break; + default: + iscsi_cls_conn_printk(KERN_ERR, conn, "invalid stop flag %d\n", + flag); + return; + } + + conn->transport->stop_conn(conn, flag); + ISCSI_DBG_TRANS_CONN(conn, "Stopping conn done.\n"); +} + +static int iscsi_if_stop_conn(struct iscsi_transport *transport, + struct iscsi_uevent *ev) +{ + int flag = ev->u.stop_conn.flag; + struct iscsi_cls_conn *conn; + + conn = iscsi_conn_lookup(ev->u.stop_conn.sid, ev->u.stop_conn.cid); + if (!conn) + return -EINVAL; + + ISCSI_DBG_TRANS_CONN(conn, "iscsi if conn stop.\n"); + /* + * If this is a termination we have to call stop_conn with that flag + * so the correct states get set. If we haven't run the work yet try to + * avoid the extra run. + */ + if (flag == STOP_CONN_TERM) { + cancel_work_sync(&conn->cleanup_work); + iscsi_stop_conn(conn, flag); + } else { + /* + * Figure out if it was the kernel or userspace initiating this. + */ + if (!test_and_set_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn->flags)) { + iscsi_stop_conn(conn, flag); + } else { + ISCSI_DBG_TRANS_CONN(conn, + "flush kernel conn cleanup.\n"); + flush_work(&conn->cleanup_work); + } + /* + * Only clear for recovery to avoid extra cleanup runs during + * termination. + */ + clear_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn->flags); + } + ISCSI_DBG_TRANS_CONN(conn, "iscsi if conn stop done.\n"); + return 0; +} + +static void iscsi_ep_disconnect(struct iscsi_cls_conn *conn, bool is_active) +{ + struct iscsi_cls_session *session = iscsi_conn_to_session(conn); + struct iscsi_endpoint *ep; + + ISCSI_DBG_TRANS_CONN(conn, "disconnect ep.\n"); + conn->state = ISCSI_CONN_FAILED; + + if (!conn->ep || !session->transport->ep_disconnect) + return; + + ep = conn->ep; + conn->ep = NULL; + + session->transport->unbind_conn(conn, is_active); + session->transport->ep_disconnect(ep); + ISCSI_DBG_TRANS_CONN(conn, "disconnect ep done.\n"); +} + +static void iscsi_cleanup_conn_work_fn(struct work_struct *work) +{ + struct iscsi_cls_conn *conn = container_of(work, struct iscsi_cls_conn, + cleanup_work); + struct iscsi_cls_session *session = iscsi_conn_to_session(conn); + + mutex_lock(&conn->ep_mutex); + /* + * If we are not at least bound there is nothing for us to do. Userspace + * will do a ep_disconnect call if offload is used, but will not be + * doing a stop since there is nothing to clean up, so we have to clear + * the cleanup bit here. + */ + if (conn->state != ISCSI_CONN_BOUND && conn->state != ISCSI_CONN_UP) { + ISCSI_DBG_TRANS_CONN(conn, "Got error while conn is already failed. Ignoring.\n"); + clear_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn->flags); + mutex_unlock(&conn->ep_mutex); + return; + } + + iscsi_ep_disconnect(conn, false); + + if (system_state != SYSTEM_RUNNING) { + /* + * If the user has set up for the session to never timeout + * then hang like they wanted. For all other cases fail right + * away since userspace is not going to relogin. + */ + if (session->recovery_tmo > 0) + session->recovery_tmo = 0; + } + + iscsi_stop_conn(conn, STOP_CONN_RECOVER); + mutex_unlock(&conn->ep_mutex); + ISCSI_DBG_TRANS_CONN(conn, "cleanup done.\n"); +} + void iscsi_free_session(struct iscsi_cls_session *session) { ISCSI_DBG_TRANS_SESSION(session, "Freeing session\n"); @@ -2267,7 +2374,7 @@ iscsi_create_conn(struct iscsi_cls_session *session, int dd_size, uint32_t cid) mutex_init(&conn->ep_mutex); INIT_LIST_HEAD(&conn->conn_list); - INIT_LIST_HEAD(&conn->conn_list_err); + INIT_WORK(&conn->cleanup_work, iscsi_cleanup_conn_work_fn); conn->transport = transport; conn->cid = cid; conn->state = ISCSI_CONN_DOWN; @@ -2324,7 +2431,6 @@ int iscsi_destroy_conn(struct iscsi_cls_conn *conn) spin_lock_irqsave(&connlock, flags); list_del(&conn->conn_list); - list_del(&conn->conn_list_err); spin_unlock_irqrestore(&connlock, flags); transport_unregister_device(&conn->dev); @@ -2451,83 +2557,6 @@ int iscsi_offload_mesg(struct Scsi_Host *shost, } EXPORT_SYMBOL_GPL(iscsi_offload_mesg); -/* - * This can be called without the rx_queue_mutex, if invoked by the kernel - * stop work. But, in that case, it is guaranteed not to race with - * iscsi_destroy by conn_mutex. - */ -static void iscsi_if_stop_conn(struct iscsi_cls_conn *conn, int flag) -{ - /* - * It is important that this path doesn't rely on - * rx_queue_mutex, otherwise, a thread doing allocation on a - * start_session/start_connection could sleep waiting on a - * writeback to a failed iscsi device, that cannot be recovered - * because the lock is held. If we don't hold it here, the - * kernel stop_conn_work_fn has a chance to stop the broken - * session and resolve the allocation. - * - * Still, the user invoked .stop_conn() needs to be serialized - * with stop_conn_work_fn by a private mutex. Not pretty, but - * it works. - */ - mutex_lock(&conn_mutex); - switch (flag) { - case STOP_CONN_RECOVER: - conn->state = ISCSI_CONN_FAILED; - break; - case STOP_CONN_TERM: - conn->state = ISCSI_CONN_DOWN; - break; - default: - iscsi_cls_conn_printk(KERN_ERR, conn, - "invalid stop flag %d\n", flag); - goto unlock; - } - - conn->transport->stop_conn(conn, flag); -unlock: - mutex_unlock(&conn_mutex); -} - -static void stop_conn_work_fn(struct work_struct *work) -{ - struct iscsi_cls_conn *conn, *tmp; - unsigned long flags; - LIST_HEAD(recovery_list); - - spin_lock_irqsave(&connlock, flags); - if (list_empty(&connlist_err)) { - spin_unlock_irqrestore(&connlock, flags); - return; - } - list_splice_init(&connlist_err, &recovery_list); - spin_unlock_irqrestore(&connlock, flags); - - list_for_each_entry_safe(conn, tmp, &recovery_list, conn_list_err) { - uint32_t sid = iscsi_conn_get_sid(conn); - struct iscsi_cls_session *session; - - session = iscsi_session_lookup(sid); - if (session) { - if (system_state != SYSTEM_RUNNING) { - /* - * If the user has set up for the session to - * never timeout then hang like they wanted. - * For all other cases fail right away since - * userspace is not going to relogin. - */ - if (session->recovery_tmo > 0) - session->recovery_tmo = 0; - } - - iscsi_if_stop_conn(conn, STOP_CONN_RECOVER); - } - - list_del_init(&conn->conn_list_err); - } -} - void iscsi_conn_error_event(struct iscsi_cls_conn *conn, enum iscsi_err error) { struct nlmsghdr *nlh; @@ -2535,12 +2564,9 @@ void iscsi_conn_error_event(struct iscsi_cls_conn *conn, enum iscsi_err error) struct iscsi_uevent *ev; struct iscsi_internal *priv; int len = nlmsg_total_size(sizeof(*ev)); - unsigned long flags; - spin_lock_irqsave(&connlock, flags); - list_add(&conn->conn_list_err, &connlist_err); - spin_unlock_irqrestore(&connlock, flags); - queue_work(system_unbound_wq, &stop_conn_work); + if (!test_and_set_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn->flags)) + queue_work(iscsi_conn_cleanup_workq, &conn->cleanup_work); priv = iscsi_if_transport_lookup(conn->transport); if (!priv) @@ -2870,26 +2896,17 @@ static int iscsi_if_destroy_conn(struct iscsi_transport *transport, struct iscsi_uevent *ev) { struct iscsi_cls_conn *conn; - unsigned long flags; conn = iscsi_conn_lookup(ev->u.d_conn.sid, ev->u.d_conn.cid); if (!conn) return -EINVAL; - spin_lock_irqsave(&connlock, flags); - if (!list_empty(&conn->conn_list_err)) { - spin_unlock_irqrestore(&connlock, flags); - return -EAGAIN; - } - spin_unlock_irqrestore(&connlock, flags); - + ISCSI_DBG_TRANS_CONN(conn, "Flushing cleanup during destruction\n"); + flush_work(&conn->cleanup_work); ISCSI_DBG_TRANS_CONN(conn, "Destroying transport conn\n"); - mutex_lock(&conn_mutex); if (transport->destroy_conn) transport->destroy_conn(conn); - mutex_unlock(&conn_mutex); - return 0; } @@ -2966,7 +2983,7 @@ static int iscsi_if_ep_connect(struct iscsi_transport *transport, } static int iscsi_if_ep_disconnect(struct iscsi_transport *transport, - u64 ep_handle, bool is_active) + u64 ep_handle) { struct iscsi_cls_conn *conn; struct iscsi_endpoint *ep; @@ -2977,17 +2994,30 @@ static int iscsi_if_ep_disconnect(struct iscsi_transport *transport, ep = iscsi_lookup_endpoint(ep_handle); if (!ep) return -EINVAL; + conn = ep->conn; - if (conn) { - mutex_lock(&conn->ep_mutex); - conn->ep = NULL; + if (!conn) { + /* + * conn was not even bound yet, so we can't get iscsi conn + * failures yet. + */ + transport->ep_disconnect(ep); + goto put_ep; + } + + mutex_lock(&conn->ep_mutex); + /* Check if this was a conn error and the kernel took ownership */ + if (test_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn->flags)) { + ISCSI_DBG_TRANS_CONN(conn, "flush kernel conn cleanup.\n"); mutex_unlock(&conn->ep_mutex); - conn->state = ISCSI_CONN_FAILED; - transport->unbind_conn(conn, is_active); + flush_work(&conn->cleanup_work); + goto put_ep; } - transport->ep_disconnect(ep); + iscsi_ep_disconnect(conn, false); + mutex_unlock(&conn->ep_mutex); +put_ep: iscsi_put_endpoint(ep); return 0; } @@ -3018,8 +3048,7 @@ iscsi_if_transport_ep(struct iscsi_transport *transport, break; case ISCSI_UEVENT_TRANSPORT_EP_DISCONNECT: rc = iscsi_if_ep_disconnect(transport, - ev->u.ep_disconnect.ep_handle, - false); + ev->u.ep_disconnect.ep_handle); break; } return rc; @@ -3646,18 +3675,129 @@ iscsi_get_host_stats(struct iscsi_transport *transport, struct nlmsghdr *nlh) return err; } +static int iscsi_if_transport_conn(struct iscsi_transport *transport, + struct nlmsghdr *nlh) +{ + struct iscsi_uevent *ev = nlmsg_data(nlh); + struct iscsi_cls_session *session; + struct iscsi_cls_conn *conn = NULL; + struct iscsi_endpoint *ep; + uint32_t pdu_len; + int err = 0; + + switch (nlh->nlmsg_type) { + case ISCSI_UEVENT_CREATE_CONN: + return iscsi_if_create_conn(transport, ev); + case ISCSI_UEVENT_DESTROY_CONN: + return iscsi_if_destroy_conn(transport, ev); + case ISCSI_UEVENT_STOP_CONN: + return iscsi_if_stop_conn(transport, ev); + } + + /* + * The following cmds need to be run under the ep_mutex so in kernel + * conn cleanup (ep_disconnect + unbind and conn) is not done while + * these are running. They also must not run if we have just run a conn + * cleanup because they would set the state in a way that might allow + * IO or send IO themselves. + */ + switch (nlh->nlmsg_type) { + case ISCSI_UEVENT_START_CONN: + conn = iscsi_conn_lookup(ev->u.start_conn.sid, + ev->u.start_conn.cid); + break; + case ISCSI_UEVENT_BIND_CONN: + conn = iscsi_conn_lookup(ev->u.b_conn.sid, ev->u.b_conn.cid); + break; + case ISCSI_UEVENT_SEND_PDU: + conn = iscsi_conn_lookup(ev->u.send_pdu.sid, ev->u.send_pdu.cid); + break; + } + + if (!conn) + return -EINVAL; + + mutex_lock(&conn->ep_mutex); + if (test_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn->flags)) { + mutex_unlock(&conn->ep_mutex); + ev->r.retcode = -ENOTCONN; + return 0; + } + + switch (nlh->nlmsg_type) { + case ISCSI_UEVENT_BIND_CONN: + if (conn->ep) { + /* + * For offload boot support where iscsid is restarted + * during the pivot root stage, the ep will be intact + * here when the new iscsid instance starts up and + * reconnects. + */ + iscsi_ep_disconnect(conn, true); + } + + session = iscsi_session_lookup(ev->u.b_conn.sid); + if (!session) { + err = -EINVAL; + break; + } + + ev->r.retcode = transport->bind_conn(session, conn, + ev->u.b_conn.transport_eph, + ev->u.b_conn.is_leading); + if (!ev->r.retcode) + conn->state = ISCSI_CONN_BOUND; + + if (ev->r.retcode || !transport->ep_connect) + break; + + ep = iscsi_lookup_endpoint(ev->u.b_conn.transport_eph); + if (ep) { + ep->conn = conn; + conn->ep = ep; + iscsi_put_endpoint(ep); + } else { + err = -ENOTCONN; + iscsi_cls_conn_printk(KERN_ERR, conn, + "Could not set ep conn binding\n"); + } + break; + case ISCSI_UEVENT_START_CONN: + ev->r.retcode = transport->start_conn(conn); + if (!ev->r.retcode) + conn->state = ISCSI_CONN_UP; + break; + case ISCSI_UEVENT_SEND_PDU: + pdu_len = nlh->nlmsg_len - sizeof(*nlh) - sizeof(*ev); + + if ((ev->u.send_pdu.hdr_size > pdu_len) || + (ev->u.send_pdu.data_size > (pdu_len - ev->u.send_pdu.hdr_size))) { + err = -EINVAL; + break; + } + + ev->r.retcode = transport->send_pdu(conn, + (struct iscsi_hdr *)((char *)ev + sizeof(*ev)), + (char *)ev + sizeof(*ev) + ev->u.send_pdu.hdr_size, + ev->u.send_pdu.data_size); + break; + default: + err = -ENOSYS; + } + + mutex_unlock(&conn->ep_mutex); + return err; +} static int iscsi_if_recv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, uint32_t *group) { int err = 0; u32 portid; - u32 pdu_len; struct iscsi_uevent *ev = nlmsg_data(nlh); struct iscsi_transport *transport = NULL; struct iscsi_internal *priv; struct iscsi_cls_session *session; - struct iscsi_cls_conn *conn; struct iscsi_endpoint *ep = NULL; if (!netlink_capable(skb, CAP_SYS_ADMIN)) @@ -3734,90 +3874,16 @@ iscsi_if_recv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, uint32_t *group) else err = -EINVAL; break; - case ISCSI_UEVENT_CREATE_CONN: - err = iscsi_if_create_conn(transport, ev); - break; - case ISCSI_UEVENT_DESTROY_CONN: - err = iscsi_if_destroy_conn(transport, ev); - break; - case ISCSI_UEVENT_BIND_CONN: - session = iscsi_session_lookup(ev->u.b_conn.sid); - conn = iscsi_conn_lookup(ev->u.b_conn.sid, ev->u.b_conn.cid); - - if (conn && conn->ep) - iscsi_if_ep_disconnect(transport, conn->ep->id, true); - - if (!session || !conn) { - err = -EINVAL; - break; - } - - mutex_lock(&conn_mutex); - ev->r.retcode = transport->bind_conn(session, conn, - ev->u.b_conn.transport_eph, - ev->u.b_conn.is_leading); - if (!ev->r.retcode) - conn->state = ISCSI_CONN_BOUND; - mutex_unlock(&conn_mutex); - - if (ev->r.retcode || !transport->ep_connect) - break; - - ep = iscsi_lookup_endpoint(ev->u.b_conn.transport_eph); - if (ep) { - ep->conn = conn; - - mutex_lock(&conn->ep_mutex); - conn->ep = ep; - mutex_unlock(&conn->ep_mutex); - iscsi_put_endpoint(ep); - } else - iscsi_cls_conn_printk(KERN_ERR, conn, - "Could not set ep conn " - "binding\n"); - break; case ISCSI_UEVENT_SET_PARAM: err = iscsi_set_param(transport, ev); break; - case ISCSI_UEVENT_START_CONN: - conn = iscsi_conn_lookup(ev->u.start_conn.sid, ev->u.start_conn.cid); - if (conn) { - mutex_lock(&conn_mutex); - ev->r.retcode = transport->start_conn(conn); - if (!ev->r.retcode) - conn->state = ISCSI_CONN_UP; - mutex_unlock(&conn_mutex); - } - else - err = -EINVAL; - break; + case ISCSI_UEVENT_CREATE_CONN: + case ISCSI_UEVENT_DESTROY_CONN: case ISCSI_UEVENT_STOP_CONN: - conn = iscsi_conn_lookup(ev->u.stop_conn.sid, ev->u.stop_conn.cid); - if (conn) - iscsi_if_stop_conn(conn, ev->u.stop_conn.flag); - else - err = -EINVAL; - break; + case ISCSI_UEVENT_START_CONN: + case ISCSI_UEVENT_BIND_CONN: case ISCSI_UEVENT_SEND_PDU: - pdu_len = nlh->nlmsg_len - sizeof(*nlh) - sizeof(*ev); - - if ((ev->u.send_pdu.hdr_size > pdu_len) || - (ev->u.send_pdu.data_size > (pdu_len - ev->u.send_pdu.hdr_size))) { - err = -EINVAL; - break; - } - - conn = iscsi_conn_lookup(ev->u.send_pdu.sid, ev->u.send_pdu.cid); - if (conn) { - mutex_lock(&conn_mutex); - ev->r.retcode = transport->send_pdu(conn, - (struct iscsi_hdr*)((char*)ev + sizeof(*ev)), - (char*)ev + sizeof(*ev) + ev->u.send_pdu.hdr_size, - ev->u.send_pdu.data_size); - mutex_unlock(&conn_mutex); - } - else - err = -EINVAL; + err = iscsi_if_transport_conn(transport, nlh); break; case ISCSI_UEVENT_GET_STATS: err = iscsi_if_get_stats(transport, nlh); @@ -4820,8 +4886,18 @@ static __init int iscsi_transport_init(void) goto release_nls; } + iscsi_conn_cleanup_workq = alloc_workqueue("%s", + WQ_SYSFS | WQ_MEM_RECLAIM | WQ_UNBOUND, 0, + "iscsi_conn_cleanup"); + if (!iscsi_conn_cleanup_workq) { + err = -ENOMEM; + goto destroy_wq; + } + return 0; +destroy_wq: + destroy_workqueue(iscsi_eh_timer_workq); release_nls: netlink_kernel_release(nls); unregister_flashnode_bus: @@ -4843,6 +4919,7 @@ static __init int iscsi_transport_init(void) static void __exit iscsi_transport_exit(void) { + destroy_workqueue(iscsi_conn_cleanup_workq); destroy_workqueue(iscsi_eh_timer_workq); netlink_kernel_release(nls); bus_unregister(&iscsi_flashnode_bus); diff --git a/include/scsi/scsi_transport_iscsi.h b/include/scsi/scsi_transport_iscsi.h index 09992f019dc9..c5d7810fd792 100644 --- a/include/scsi/scsi_transport_iscsi.h +++ b/include/scsi/scsi_transport_iscsi.h @@ -197,15 +197,23 @@ enum iscsi_connection_state { ISCSI_CONN_BOUND, }; +#define ISCSI_CLS_CONN_BIT_CLEANUP 1 + struct iscsi_cls_conn { struct list_head conn_list; /* item in connlist */ - struct list_head conn_list_err; /* item in connlist_err */ void *dd_data; /* LLD private data */ struct iscsi_transport *transport; uint32_t cid; /* connection id */ + /* + * This protects the conn startup and binding/unbinding of the ep to + * the conn. Unbinding includes ep_disconnect and stop_conn. + */ struct mutex ep_mutex; struct iscsi_endpoint *ep; + unsigned long flags; + struct work_struct cleanup_work; + struct device dev; /* sysfs transport/container device */ enum iscsi_connection_state state; }; -- Gitee From 7c342fdea7eb24f53c1525330344bd30c994f50e Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Tue, 19 Jul 2022 16:30:43 +0800 Subject: [PATCH 404/674] scsi: iscsi: Move iscsi_ep_disconnect() stable inclusion from stable-v5.10.112 commit 699bd835c36e095dc80bea21d9a15cd7f7bc8b9f category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=699bd835c36e095dc80bea21d9a15cd7f7bc8b9f -------------------------------- [ Upstream commit c34f95e98d8fb750eefd4f3fe58b4f8b5e89253b ] This patch moves iscsi_ep_disconnect() so it can be called earlier in the next patch. Link: https://lore.kernel.org/r/20220408001314.5014-2-michael.christie@oracle.com Tested-by: Manish Rangankar Reviewed-by: Lee Duncan Reviewed-by: Chris Leech Signed-off-by: Mike Christie Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/scsi/scsi_transport_iscsi.c | 38 ++++++++++++++--------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index 7246f7b2ff03..8f6dc391356d 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -2239,6 +2239,25 @@ static void iscsi_stop_conn(struct iscsi_cls_conn *conn, int flag) ISCSI_DBG_TRANS_CONN(conn, "Stopping conn done.\n"); } +static void iscsi_ep_disconnect(struct iscsi_cls_conn *conn, bool is_active) +{ + struct iscsi_cls_session *session = iscsi_conn_to_session(conn); + struct iscsi_endpoint *ep; + + ISCSI_DBG_TRANS_CONN(conn, "disconnect ep.\n"); + conn->state = ISCSI_CONN_FAILED; + + if (!conn->ep || !session->transport->ep_disconnect) + return; + + ep = conn->ep; + conn->ep = NULL; + + session->transport->unbind_conn(conn, is_active); + session->transport->ep_disconnect(ep); + ISCSI_DBG_TRANS_CONN(conn, "disconnect ep done.\n"); +} + static int iscsi_if_stop_conn(struct iscsi_transport *transport, struct iscsi_uevent *ev) { @@ -2279,25 +2298,6 @@ static int iscsi_if_stop_conn(struct iscsi_transport *transport, return 0; } -static void iscsi_ep_disconnect(struct iscsi_cls_conn *conn, bool is_active) -{ - struct iscsi_cls_session *session = iscsi_conn_to_session(conn); - struct iscsi_endpoint *ep; - - ISCSI_DBG_TRANS_CONN(conn, "disconnect ep.\n"); - conn->state = ISCSI_CONN_FAILED; - - if (!conn->ep || !session->transport->ep_disconnect) - return; - - ep = conn->ep; - conn->ep = NULL; - - session->transport->unbind_conn(conn, is_active); - session->transport->ep_disconnect(ep); - ISCSI_DBG_TRANS_CONN(conn, "disconnect ep done.\n"); -} - static void iscsi_cleanup_conn_work_fn(struct work_struct *work) { struct iscsi_cls_conn *conn = container_of(work, struct iscsi_cls_conn, -- Gitee From 2f4ea143646fb62a066c48ce57ba1f9046fda321 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Tue, 19 Jul 2022 16:30:44 +0800 Subject: [PATCH 405/674] scsi: iscsi: Fix offload conn cleanup when iscsid restarts stable inclusion from stable-v5.10.112 commit 73805795c99ff32154c9e37c4334968c037c0f7f category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=73805795c99ff32154c9e37c4334968c037c0f7f -------------------------------- [ Upstream commit cbd2283aaf47fef4ded4b29124b1ef3beb515f3a ] When userspace restarts during boot or upgrades it won't know about the offload driver's endpoint and connection mappings. iscsid will start by cleaning up the old session by doing a stop_conn call. Later, if we are able to create a new connection, we clean up the old endpoint during the binding stage. The problem is that if we do stop_conn before doing the ep_disconnect call offload, drivers can still be executing I/O. We then might free tasks from the under the card/driver. This moves the ep_disconnect call to before we do the stop_conn call for this case. It will then work and look like a normal recovery/cleanup procedure from the driver's point of view. Link: https://lore.kernel.org/r/20220408001314.5014-3-michael.christie@oracle.com Tested-by: Manish Rangankar Reviewed-by: Lee Duncan Reviewed-by: Chris Leech Signed-off-by: Mike Christie Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/scsi/scsi_transport_iscsi.c | 48 +++++++++++++++++------------ 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index 8f6dc391356d..38abc3a15692 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -2258,6 +2258,23 @@ static void iscsi_ep_disconnect(struct iscsi_cls_conn *conn, bool is_active) ISCSI_DBG_TRANS_CONN(conn, "disconnect ep done.\n"); } +static void iscsi_if_disconnect_bound_ep(struct iscsi_cls_conn *conn, + struct iscsi_endpoint *ep, + bool is_active) +{ + /* Check if this was a conn error and the kernel took ownership */ + if (!test_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn->flags)) { + iscsi_ep_disconnect(conn, is_active); + } else { + ISCSI_DBG_TRANS_CONN(conn, "flush kernel conn cleanup.\n"); + mutex_unlock(&conn->ep_mutex); + + flush_work(&conn->cleanup_work); + + mutex_lock(&conn->ep_mutex); + } +} + static int iscsi_if_stop_conn(struct iscsi_transport *transport, struct iscsi_uevent *ev) { @@ -2278,6 +2295,16 @@ static int iscsi_if_stop_conn(struct iscsi_transport *transport, cancel_work_sync(&conn->cleanup_work); iscsi_stop_conn(conn, flag); } else { + /* + * For offload, when iscsid is restarted it won't know about + * existing endpoints so it can't do a ep_disconnect. We clean + * it up here for userspace. + */ + mutex_lock(&conn->ep_mutex); + if (conn->ep) + iscsi_if_disconnect_bound_ep(conn, conn->ep, true); + mutex_unlock(&conn->ep_mutex); + /* * Figure out if it was the kernel or userspace initiating this. */ @@ -3006,16 +3033,7 @@ static int iscsi_if_ep_disconnect(struct iscsi_transport *transport, } mutex_lock(&conn->ep_mutex); - /* Check if this was a conn error and the kernel took ownership */ - if (test_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn->flags)) { - ISCSI_DBG_TRANS_CONN(conn, "flush kernel conn cleanup.\n"); - mutex_unlock(&conn->ep_mutex); - - flush_work(&conn->cleanup_work); - goto put_ep; - } - - iscsi_ep_disconnect(conn, false); + iscsi_if_disconnect_bound_ep(conn, ep, false); mutex_unlock(&conn->ep_mutex); put_ep: iscsi_put_endpoint(ep); @@ -3726,16 +3744,6 @@ static int iscsi_if_transport_conn(struct iscsi_transport *transport, switch (nlh->nlmsg_type) { case ISCSI_UEVENT_BIND_CONN: - if (conn->ep) { - /* - * For offload boot support where iscsid is restarted - * during the pivot root stage, the ep will be intact - * here when the new iscsid instance starts up and - * reconnects. - */ - iscsi_ep_disconnect(conn, true); - } - session = iscsi_session_lookup(ev->u.b_conn.sid); if (!session) { err = -EINVAL; -- Gitee From bb9e69a1c8fe8138412cdb7d5666c33ece5d9f1c Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Tue, 19 Jul 2022 16:30:45 +0800 Subject: [PATCH 406/674] scsi: iscsi: Fix conn cleanup and stop race during iscsid restart stable inclusion from stable-v5.10.112 commit 45226fac4d316fea68357b307cb46a4aac764ec6 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=45226fac4d316fea68357b307cb46a4aac764ec6 -------------------------------- [ Upstream commit 7c6e99c18167ed89729bf167ccb4a7e3ab3115ba ] If iscsid is doing a stop_conn at the same time the kernel is starting error recovery we can hit a race that allows the cleanup work to run on a valid connection. In the race, iscsi_if_stop_conn sees the cleanup bit set, but it calls flush_work on the clean_work before iscsi_conn_error_event has queued it. The flush then returns before the queueing and so the cleanup_work can run later and disconnect/stop a conn while it's in a connected state. The patch: Commit 0ab710458da1 ("scsi: iscsi: Perform connection failure entirely in kernel space") added the late stop_conn call bug originally, and the patch: Commit 23d6fefbb3f6 ("scsi: iscsi: Fix in-kernel conn failure handling") attempted to fix it but only fixed the normal EH case and left the above race for the iscsid restart case. For the normal EH case we don't hit the race because we only signal userspace to start recovery after we have done the queueing, so the flush will always catch the queued work or see it completed. For iscsid restart cases like boot, we can hit the race because iscsid will call down to the kernel before the kernel has signaled any error, so both code paths can be running at the same time. This adds a lock around the setting of the cleanup bit and queueing so they happen together. Link: https://lore.kernel.org/r/20220408001314.5014-6-michael.christie@oracle.com Fixes: 0ab710458da1 ("scsi: iscsi: Perform connection failure entirely in kernel space") Tested-by: Manish Rangankar Reviewed-by: Lee Duncan Reviewed-by: Chris Leech Signed-off-by: Mike Christie Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/scsi/scsi_transport_iscsi.c | 17 +++++++++++++++++ include/scsi/scsi_transport_iscsi.h | 2 ++ 2 files changed, 19 insertions(+) diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index 38abc3a15692..21cc306a0f11 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -2263,9 +2263,12 @@ static void iscsi_if_disconnect_bound_ep(struct iscsi_cls_conn *conn, bool is_active) { /* Check if this was a conn error and the kernel took ownership */ + spin_lock_irq(&conn->lock); if (!test_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn->flags)) { + spin_unlock_irq(&conn->lock); iscsi_ep_disconnect(conn, is_active); } else { + spin_unlock_irq(&conn->lock); ISCSI_DBG_TRANS_CONN(conn, "flush kernel conn cleanup.\n"); mutex_unlock(&conn->ep_mutex); @@ -2308,9 +2311,12 @@ static int iscsi_if_stop_conn(struct iscsi_transport *transport, /* * Figure out if it was the kernel or userspace initiating this. */ + spin_lock_irq(&conn->lock); if (!test_and_set_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn->flags)) { + spin_unlock_irq(&conn->lock); iscsi_stop_conn(conn, flag); } else { + spin_unlock_irq(&conn->lock); ISCSI_DBG_TRANS_CONN(conn, "flush kernel conn cleanup.\n"); flush_work(&conn->cleanup_work); @@ -2319,7 +2325,9 @@ static int iscsi_if_stop_conn(struct iscsi_transport *transport, * Only clear for recovery to avoid extra cleanup runs during * termination. */ + spin_lock_irq(&conn->lock); clear_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn->flags); + spin_unlock_irq(&conn->lock); } ISCSI_DBG_TRANS_CONN(conn, "iscsi if conn stop done.\n"); return 0; @@ -2340,7 +2348,9 @@ static void iscsi_cleanup_conn_work_fn(struct work_struct *work) */ if (conn->state != ISCSI_CONN_BOUND && conn->state != ISCSI_CONN_UP) { ISCSI_DBG_TRANS_CONN(conn, "Got error while conn is already failed. Ignoring.\n"); + spin_lock_irq(&conn->lock); clear_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn->flags); + spin_unlock_irq(&conn->lock); mutex_unlock(&conn->ep_mutex); return; } @@ -2400,6 +2410,7 @@ iscsi_create_conn(struct iscsi_cls_session *session, int dd_size, uint32_t cid) conn->dd_data = &conn[1]; mutex_init(&conn->ep_mutex); + spin_lock_init(&conn->lock); INIT_LIST_HEAD(&conn->conn_list); INIT_WORK(&conn->cleanup_work, iscsi_cleanup_conn_work_fn); conn->transport = transport; @@ -2591,9 +2602,12 @@ void iscsi_conn_error_event(struct iscsi_cls_conn *conn, enum iscsi_err error) struct iscsi_uevent *ev; struct iscsi_internal *priv; int len = nlmsg_total_size(sizeof(*ev)); + unsigned long flags; + spin_lock_irqsave(&conn->lock, flags); if (!test_and_set_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn->flags)) queue_work(iscsi_conn_cleanup_workq, &conn->cleanup_work); + spin_unlock_irqrestore(&conn->lock, flags); priv = iscsi_if_transport_lookup(conn->transport); if (!priv) @@ -3736,11 +3750,14 @@ static int iscsi_if_transport_conn(struct iscsi_transport *transport, return -EINVAL; mutex_lock(&conn->ep_mutex); + spin_lock_irq(&conn->lock); if (test_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn->flags)) { + spin_unlock_irq(&conn->lock); mutex_unlock(&conn->ep_mutex); ev->r.retcode = -ENOTCONN; return 0; } + spin_unlock_irq(&conn->lock); switch (nlh->nlmsg_type) { case ISCSI_UEVENT_BIND_CONN: diff --git a/include/scsi/scsi_transport_iscsi.h b/include/scsi/scsi_transport_iscsi.h index c5d7810fd792..037c77fb5dc5 100644 --- a/include/scsi/scsi_transport_iscsi.h +++ b/include/scsi/scsi_transport_iscsi.h @@ -211,6 +211,8 @@ struct iscsi_cls_conn { struct mutex ep_mutex; struct iscsi_endpoint *ep; + /* Used when accessing flags and queueing work. */ + spinlock_t lock; unsigned long flags; struct work_struct cleanup_work; -- Gitee From 27ad469e84a31f70057c77002b6a49892c519c2d Mon Sep 17 00:00:00 2001 From: Petr Malat Date: Tue, 19 Jul 2022 16:30:46 +0800 Subject: [PATCH 407/674] sctp: Initialize daddr on peeled off socket stable inclusion from stable-v5.10.112 commit eb8873b324d918eeaa572d08d1d05785f4fb1e0a category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=eb8873b324d918eeaa572d08d1d05785f4fb1e0a -------------------------------- [ Upstream commit 8467dda0c26583547731e7f3ea73fc3856bae3bf ] Function sctp_do_peeloff() wrongly initializes daddr of the original socket instead of the peeled off socket, which makes getpeername() return zeroes instead of the primary address. Initialize the new socket instead. Fixes: d570ee490fb1 ("[SCTP]: Correctly set daddr for IPv6 sockets during peeloff") Signed-off-by: Petr Malat Acked-by: Marcelo Ricardo Leitner Link: https://lore.kernel.org/r/20220409063611.673193-1-oss@malat.biz Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- net/sctp/socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 0a9e2c7d8e5f..e9b4ea3d934f 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -5518,7 +5518,7 @@ int sctp_do_peeloff(struct sock *sk, sctp_assoc_t id, struct socket **sockp) * Set the daddr and initialize id to something more random and also * copy over any ip options. */ - sp->pf->to_sk_daddr(&asoc->peer.primary_addr, sk); + sp->pf->to_sk_daddr(&asoc->peer.primary_addr, sock->sk); sp->pf->copy_ip_options(sk, sock->sk); /* Populate the fields of the newsk from the oldsk and migrate the -- Gitee From 63bb182af29369522607c1173290629e068c3ae0 Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Tue, 19 Jul 2022 16:30:47 +0800 Subject: [PATCH 408/674] testing/selftests/mqueue: Fix mq_perf_tests to free the allocated cpu set stable inclusion from stable-v5.10.112 commit 280f721edc54a782f1dfcec573ee929e92bf186a category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=280f721edc54a782f1dfcec573ee929e92bf186a -------------------------------- [ Upstream commit ce64763c63854b4079f2e036638aa881a1fb3fbc ] The selftest "mqueue/mq_perf_tests.c" use CPU_ALLOC to allocate CPU set. This cpu set is used further in pthread_attr_setaffinity_np and by pthread_create in the code. But in current code, allocated cpu set is not freed. Fix this issue by adding CPU_FREE in the "shutdown" function which is called in most of the error/exit path for the cleanup. There are few error paths which exit without using shutdown. Add a common goto error path with CPU_FREE for these cases. Fixes: 7820b0715b6f ("tools/selftests: add mq_perf_tests") Signed-off-by: Athira Rajeev Signed-off-by: Shuah Khan Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- .../testing/selftests/mqueue/mq_perf_tests.c | 25 +++++++++++++------ 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/mqueue/mq_perf_tests.c b/tools/testing/selftests/mqueue/mq_perf_tests.c index b019e0b8221c..84fda3b49073 100644 --- a/tools/testing/selftests/mqueue/mq_perf_tests.c +++ b/tools/testing/selftests/mqueue/mq_perf_tests.c @@ -180,6 +180,9 @@ void shutdown(int exit_val, char *err_cause, int line_no) if (in_shutdown++) return; + /* Free the cpu_set allocated using CPU_ALLOC in main function */ + CPU_FREE(cpu_set); + for (i = 0; i < num_cpus_to_pin; i++) if (cpu_threads[i]) { pthread_kill(cpu_threads[i], SIGUSR1); @@ -551,6 +554,12 @@ int main(int argc, char *argv[]) perror("sysconf(_SC_NPROCESSORS_ONLN)"); exit(1); } + + if (getuid() != 0) + ksft_exit_skip("Not running as root, but almost all tests " + "require root in order to modify\nsystem settings. " + "Exiting.\n"); + cpus_online = min(MAX_CPUS, sysconf(_SC_NPROCESSORS_ONLN)); cpu_set = CPU_ALLOC(cpus_online); if (cpu_set == NULL) { @@ -589,7 +598,7 @@ int main(int argc, char *argv[]) cpu_set)) { fprintf(stderr, "Any given CPU may " "only be given once.\n"); - exit(1); + goto err_code; } else CPU_SET_S(cpus_to_pin[cpu], cpu_set_size, cpu_set); @@ -607,7 +616,7 @@ int main(int argc, char *argv[]) queue_path = malloc(strlen(option) + 2); if (!queue_path) { perror("malloc()"); - exit(1); + goto err_code; } queue_path[0] = '/'; queue_path[1] = 0; @@ -622,17 +631,12 @@ int main(int argc, char *argv[]) fprintf(stderr, "Must pass at least one CPU to continuous " "mode.\n"); poptPrintUsage(popt_context, stderr, 0); - exit(1); + goto err_code; } else if (!continuous_mode) { num_cpus_to_pin = 1; cpus_to_pin[0] = cpus_online - 1; } - if (getuid() != 0) - ksft_exit_skip("Not running as root, but almost all tests " - "require root in order to modify\nsystem settings. " - "Exiting.\n"); - max_msgs = fopen(MAX_MSGS, "r+"); max_msgsize = fopen(MAX_MSGSIZE, "r+"); if (!max_msgs) @@ -740,4 +744,9 @@ int main(int argc, char *argv[]) sleep(1); } shutdown(0, "", 0); + +err_code: + CPU_FREE(cpu_set); + exit(1); + } -- Gitee From 861b9ebf493ff71b55d558fcb2d05ba75d25e3ca Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 19 Jul 2022 16:30:48 +0800 Subject: [PATCH 409/674] perf tools: Fix misleading add event PMU debug message stable inclusion from stable-v5.10.112 commit bfba9722cf2e801af181a56072f92f924ad7b156 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=bfba9722cf2e801af181a56072f92f924ad7b156 -------------------------------- [ Upstream commit f034fc50d3c7d9385c20d505ab4cf56b8fd18ac7 ] Fix incorrect debug message: Attempting to add event pmu 'intel_pt' with '' that may result in non-fatal errors which always appears with perf record -vv and intel_pt e.g. perf record -vv -e intel_pt//u uname The message is incorrect because there will never be non-fatal errors. Suppress the message if the PMU is 'selectable' i.e. meant to be selected directly as an event. Fixes: 4ac22b484d4c79e8 ("perf parse-events: Make add PMU verbose output clearer") Signed-off-by: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Link: http://lore.kernel.org/lkml/20220411061758.2458417-1-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- tools/perf/util/parse-events.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index d0408320c434..db3c85bb4fd0 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -1446,7 +1446,9 @@ int parse_events_add_pmu(struct parse_events_state *parse_state, bool use_uncore_alias; LIST_HEAD(config_terms); - if (verbose > 1) { + pmu = parse_state->fake_pmu ?: perf_pmu__find(name); + + if (verbose > 1 && !(pmu && pmu->selectable)) { fprintf(stderr, "Attempting to add event pmu '%s' with '", name); if (head_config) { @@ -1459,7 +1461,6 @@ int parse_events_add_pmu(struct parse_events_state *parse_state, fprintf(stderr, "' that may result in non-fatal errors\n"); } - pmu = parse_state->fake_pmu ?: perf_pmu__find(name); if (!pmu) { char *err_str; -- Gitee From 1f92c40221e98e94c159274b21e172ae137de8c1 Mon Sep 17 00:00:00 2001 From: Lin Ma Date: Tue, 19 Jul 2022 16:30:49 +0800 Subject: [PATCH 410/674] nfc: nci: add flush_workqueue to prevent uaf stable inclusion from stable-v5.10.112 commit 67677050cecbe0edfdd81cd508415e9636ba7c65 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=67677050cecbe0edfdd81cd508415e9636ba7c65 -------------------------------- [ Upstream commit ef27324e2cb7bb24542d6cb2571740eefe6b00dc ] Our detector found a concurrent use-after-free bug when detaching an NCI device. The main reason for this bug is the unexpected scheduling between the used delayed mechanism (timer and workqueue). The race can be demonstrated below: Thread-1 Thread-2 | nci_dev_up() | nci_open_device() | __nci_request(nci_reset_req) | nci_send_cmd | queue_work(cmd_work) nci_unregister_device() | nci_close_device() | ... del_timer_sync(cmd_timer)[1] | ... | Worker nci_free_device() | nci_cmd_work() kfree(ndev)[3] | mod_timer(cmd_timer)[2] In short, the cleanup routine thought that the cmd_timer has already been detached by [1] but the mod_timer can re-attach the timer [2], even it is already released [3], resulting in UAF. This UAF is easy to trigger, crash trace by POC is like below [ 66.703713] ================================================================== [ 66.703974] BUG: KASAN: use-after-free in enqueue_timer+0x448/0x490 [ 66.703974] Write of size 8 at addr ffff888009fb7058 by task kworker/u4:1/33 [ 66.703974] [ 66.703974] CPU: 1 PID: 33 Comm: kworker/u4:1 Not tainted 5.18.0-rc2 #5 [ 66.703974] Workqueue: nfc2_nci_cmd_wq nci_cmd_work [ 66.703974] Call Trace: [ 66.703974] [ 66.703974] dump_stack_lvl+0x57/0x7d [ 66.703974] print_report.cold+0x5e/0x5db [ 66.703974] ? enqueue_timer+0x448/0x490 [ 66.703974] kasan_report+0xbe/0x1c0 [ 66.703974] ? enqueue_timer+0x448/0x490 [ 66.703974] enqueue_timer+0x448/0x490 [ 66.703974] __mod_timer+0x5e6/0xb80 [ 66.703974] ? mark_held_locks+0x9e/0xe0 [ 66.703974] ? try_to_del_timer_sync+0xf0/0xf0 [ 66.703974] ? lockdep_hardirqs_on_prepare+0x17b/0x410 [ 66.703974] ? queue_work_on+0x61/0x80 [ 66.703974] ? lockdep_hardirqs_on+0xbf/0x130 [ 66.703974] process_one_work+0x8bb/0x1510 [ 66.703974] ? lockdep_hardirqs_on_prepare+0x410/0x410 [ 66.703974] ? pwq_dec_nr_in_flight+0x230/0x230 [ 66.703974] ? rwlock_bug.part.0+0x90/0x90 [ 66.703974] ? _raw_spin_lock_irq+0x41/0x50 [ 66.703974] worker_thread+0x575/0x1190 [ 66.703974] ? process_one_work+0x1510/0x1510 [ 66.703974] kthread+0x2a0/0x340 [ 66.703974] ? kthread_complete_and_exit+0x20/0x20 [ 66.703974] ret_from_fork+0x22/0x30 [ 66.703974] [ 66.703974] [ 66.703974] Allocated by task 267: [ 66.703974] kasan_save_stack+0x1e/0x40 [ 66.703974] __kasan_kmalloc+0x81/0xa0 [ 66.703974] nci_allocate_device+0xd3/0x390 [ 66.703974] nfcmrvl_nci_register_dev+0x183/0x2c0 [ 66.703974] nfcmrvl_nci_uart_open+0xf2/0x1dd [ 66.703974] nci_uart_tty_ioctl+0x2c3/0x4a0 [ 66.703974] tty_ioctl+0x764/0x1310 [ 66.703974] __x64_sys_ioctl+0x122/0x190 [ 66.703974] do_syscall_64+0x3b/0x90 [ 66.703974] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 66.703974] [ 66.703974] Freed by task 406: [ 66.703974] kasan_save_stack+0x1e/0x40 [ 66.703974] kasan_set_track+0x21/0x30 [ 66.703974] kasan_set_free_info+0x20/0x30 [ 66.703974] __kasan_slab_free+0x108/0x170 [ 66.703974] kfree+0xb0/0x330 [ 66.703974] nfcmrvl_nci_unregister_dev+0x90/0xd0 [ 66.703974] nci_uart_tty_close+0xdf/0x180 [ 66.703974] tty_ldisc_kill+0x73/0x110 [ 66.703974] tty_ldisc_hangup+0x281/0x5b0 [ 66.703974] __tty_hangup.part.0+0x431/0x890 [ 66.703974] tty_release+0x3a8/0xc80 [ 66.703974] __fput+0x1f0/0x8c0 [ 66.703974] task_work_run+0xc9/0x170 [ 66.703974] exit_to_user_mode_prepare+0x194/0x1a0 [ 66.703974] syscall_exit_to_user_mode+0x19/0x50 [ 66.703974] do_syscall_64+0x48/0x90 [ 66.703974] entry_SYSCALL_64_after_hwframe+0x44/0xae To fix the UAF, this patch adds flush_workqueue() to ensure the nci_cmd_work is finished before the following del_timer_sync. This combination will promise the timer is actually detached. Fixes: 6a2968aaf50c ("NFC: basic NCI protocol implementation") Signed-off-by: Lin Ma Reviewed-by: Krzysztof Kozlowski Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- net/nfc/nci/core.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index e38719e2ee58..2cfff70f70e0 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -548,6 +548,10 @@ static int nci_close_device(struct nci_dev *ndev) mutex_lock(&ndev->req_lock); if (!test_and_clear_bit(NCI_UP, &ndev->flags)) { + /* Need to flush the cmd wq in case + * there is a queued/running cmd_work + */ + flush_workqueue(ndev->cmd_wq); del_timer_sync(&ndev->cmd_timer); del_timer_sync(&ndev->data_timer); mutex_unlock(&ndev->req_lock); -- Gitee From 6c99e853acfb83e08bdd1f0718bd2ed0140e1d91 Mon Sep 17 00:00:00 2001 From: Harshit Mogalapalli Date: Tue, 19 Jul 2022 16:30:50 +0800 Subject: [PATCH 411/674] cifs: potential buffer overflow in handling symlinks stable inclusion from stable-v5.10.112 commit 4e166a41180be2f1e66bbb6d46448e80a9a5ec05 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=4e166a41180be2f1e66bbb6d46448e80a9a5ec05 -------------------------------- [ Upstream commit 64c4a37ac04eeb43c42d272f6e6c8c12bfcf4304 ] Smatch printed a warning: arch/x86/crypto/poly1305_glue.c:198 poly1305_update_arch() error: __memcpy() 'dctx->buf' too small (16 vs u32max) It's caused because Smatch marks 'link_len' as untrusted since it comes from sscanf(). Add a check to ensure that 'link_len' is not larger than the size of the 'link_str' buffer. Fixes: c69c1b6eaea1 ("cifs: implement CIFSParseMFSymlink()") Signed-off-by: Harshit Mogalapalli Reviewed-by: Ronnie Sahlberg Signed-off-by: Steve French Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- fs/cifs/link.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 94dab4309fbb..85d30fef98a2 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -97,6 +97,9 @@ parse_mf_symlink(const u8 *buf, unsigned int buf_len, unsigned int *_link_len, if (rc != 1) return -EINVAL; + if (link_len > CIFS_MF_SYMLINK_LINK_MAXLEN) + return -EINVAL; + rc = symlink_hash(link_len, link_str, md5_hash); if (rc) { cifs_dbg(FYI, "%s: MD5 hash failure: %d\n", __func__, rc); -- Gitee From def148dda90b1b8026cc515078cdcae72a4a3742 Mon Sep 17 00:00:00 2001 From: Khazhismel Kumykov Date: Tue, 19 Jul 2022 16:30:51 +0800 Subject: [PATCH 412/674] dm mpath: only use ktime_get_ns() in historical selector stable inclusion from stable-v5.10.112 commit 504c15f07f541f87185113ccd485a814adf31e4f category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=504c15f07f541f87185113ccd485a814adf31e4f -------------------------------- [ Upstream commit ce40426fdc3c92acdba6b5ca74bc7277ffaa6a3d ] Mixing sched_clock() and ktime_get_ns() usage will give bad results. Switch hst_select_path() from using sched_clock() to ktime_get_ns(). Also rename path_service_time()'s 'sched_now' variable to 'now'. Fixes: 2613eab11996 ("dm mpath: add Historical Service Time Path Selector") Signed-off-by: Khazhismel Kumykov Signed-off-by: Mike Snitzer Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/md/dm-historical-service-time.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/md/dm-historical-service-time.c b/drivers/md/dm-historical-service-time.c index 186f91e2752c..06fe43c13ba3 100644 --- a/drivers/md/dm-historical-service-time.c +++ b/drivers/md/dm-historical-service-time.c @@ -429,7 +429,7 @@ static struct dm_path *hst_select_path(struct path_selector *ps, { struct selector *s = ps->context; struct path_info *pi = NULL, *best = NULL; - u64 time_now = sched_clock(); + u64 time_now = ktime_get_ns(); struct dm_path *ret = NULL; unsigned long flags; @@ -470,7 +470,7 @@ static int hst_start_io(struct path_selector *ps, struct dm_path *path, static u64 path_service_time(struct path_info *pi, u64 start_time) { - u64 sched_now = ktime_get_ns(); + u64 now = ktime_get_ns(); /* if a previous disk request has finished after this IO was * sent to the hardware, pretend the submission happened @@ -479,11 +479,11 @@ static u64 path_service_time(struct path_info *pi, u64 start_time) if (time_after64(pi->last_finish, start_time)) start_time = pi->last_finish; - pi->last_finish = sched_now; - if (time_before64(sched_now, start_time)) + pi->last_finish = now; + if (time_before64(now, start_time)) return 0; - return sched_now - start_time; + return now - start_time; } static int hst_end_io(struct path_selector *ps, struct dm_path *path, -- Gitee From 27806a5a06b3e9807856235d29669faf104a209c Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Tue, 19 Jul 2022 16:30:52 +0800 Subject: [PATCH 413/674] net: bcmgenet: Revert "Use stronger register read/writes to assure ordering" stable inclusion from stable-v5.10.112 commit 6f9c06501d288c96ebfb651443dd365b0aca3757 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=6f9c06501d288c96ebfb651443dd365b0aca3757 -------------------------------- [ Upstream commit 2df3fc4a84e917a422935cc5bae18f43f9955d31 ] It turns out after digging deeper into this bug, that it was being triggered by GCC12 failing to call the bcmgenet_enable_dma() routine. Given that a gcc12 fix has been merged [1] and the genet driver now works properly when built with gcc12, this commit should be reverted. [1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105160 https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=aabb9a261ef060cf24fd626713f1d7d9df81aa57 Fixes: 8d3ea3d402db ("net: bcmgenet: Use stronger register read/writes to assure ordering") Signed-off-by: Jeremy Linton Acked-by: Florian Fainelli Link: https://lore.kernel.org/r/20220412210420.1129430-1-jeremy.linton@arm.com Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/net/ethernet/broadcom/genet/bcmgenet.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c index d676e59eb82d..5de37c33a737 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c @@ -76,7 +76,7 @@ static inline void bcmgenet_writel(u32 value, void __iomem *offset) if (IS_ENABLED(CONFIG_MIPS) && IS_ENABLED(CONFIG_CPU_BIG_ENDIAN)) __raw_writel(value, offset); else - writel(value, offset); + writel_relaxed(value, offset); } static inline u32 bcmgenet_readl(void __iomem *offset) @@ -84,7 +84,7 @@ static inline u32 bcmgenet_readl(void __iomem *offset) if (IS_ENABLED(CONFIG_MIPS) && IS_ENABLED(CONFIG_CPU_BIG_ENDIAN)) return __raw_readl(offset); else - return readl(offset); + return readl_relaxed(offset); } static inline void dmadesc_set_length_status(struct bcmgenet_priv *priv, -- Gitee From 5a088293eeb41bc6a6282b491a354a4e6394b80f Mon Sep 17 00:00:00 2001 From: Aurabindo Pillai Date: Tue, 19 Jul 2022 16:30:53 +0800 Subject: [PATCH 414/674] drm/amd: Add USBC connector ID stable inclusion from stable-v5.10.112 commit 9b5d1b3413d784fea12c387c27dbd10e3446e404 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=9b5d1b3413d784fea12c387c27dbd10e3446e404 -------------------------------- [ Upstream commit c5c948aa894a831f96fccd025e47186b1ee41615 ] [Why&How] Add a dedicated AMDGPU specific ID for use with newer ASICs that support USB-C output Signed-off-by: Aurabindo Pillai Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/gpu/drm/amd/amdgpu/ObjectID.h | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/amdgpu/ObjectID.h b/drivers/gpu/drm/amd/amdgpu/ObjectID.h index 5b393622f592..a0f0a17e224f 100644 --- a/drivers/gpu/drm/amd/amdgpu/ObjectID.h +++ b/drivers/gpu/drm/amd/amdgpu/ObjectID.h @@ -119,6 +119,7 @@ #define CONNECTOR_OBJECT_ID_eDP 0x14 #define CONNECTOR_OBJECT_ID_MXM 0x15 #define CONNECTOR_OBJECT_ID_LVDS_eDP 0x16 +#define CONNECTOR_OBJECT_ID_USBC 0x17 /* deleted */ -- Gitee From 3c484dfda822f1c6cf3d9ae7c9edc0814a0647f1 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 19 Jul 2022 16:30:54 +0800 Subject: [PATCH 415/674] btrfs: fix fallocate to use file_modified to update permissions consistently stable inclusion from stable-v5.10.112 commit 217190dc66efdf58e04a5c1f239f330c4c47d81b category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=217190dc66efdf58e04a5c1f239f330c4c47d81b -------------------------------- [ Upstream commit 05fd9564e9faf0f23b4676385e27d9405cef6637 ] Since the initial introduction of (posix) fallocate back at the turn of the century, it has been possible to use this syscall to change the user-visible contents of files. This can happen by extending the file size during a preallocation, or through any of the newer modes (punch, zero range). Because the call can be used to change file contents, we should treat it like we do any other modification to a file -- update the mtime, and drop set[ug]id privileges/capabilities. The VFS function file_modified() does all this for us if pass it a locked inode, so let's make fallocate drop permissions correctly. Reviewed-by: Filipe Manana Signed-off-by: Darrick J. Wong Signed-off-by: David Sterba Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- fs/btrfs/file.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index f59ec55e5feb..416a1b753ff6 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2833,8 +2833,9 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, return ret; } -static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) +static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) { + struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_state *cached_state = NULL; @@ -2866,6 +2867,10 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) goto out_only_mutex; } + ret = file_modified(file); + if (ret) + goto out_only_mutex; + lockstart = round_up(offset, btrfs_inode_sectorsize(BTRFS_I(inode))); lockend = round_down(offset + len, btrfs_inode_sectorsize(BTRFS_I(inode))) - 1; @@ -3301,7 +3306,7 @@ static long btrfs_fallocate(struct file *file, int mode, return -EOPNOTSUPP; if (mode & FALLOC_FL_PUNCH_HOLE) - return btrfs_punch_hole(inode, offset, len); + return btrfs_punch_hole(file, offset, len); /* * Only trigger disk allocation, don't trigger qgroup reserve @@ -3323,6 +3328,10 @@ static long btrfs_fallocate(struct file *file, int mode, goto out; } + ret = file_modified(file); + if (ret) + goto out; + /* * TODO: Move these two operations after we have checked * accurate reserved space, or fallocate can still fail but -- Gitee From 66c9a7dc806a961055c7b9ede42f4d06905a1e1a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 19 Jul 2022 16:30:55 +0800 Subject: [PATCH 416/674] btrfs: do not warn for free space inode in cow_file_range stable inclusion from stable-v5.10.112 commit 76e086ce7b2d71b0d7a1a121c5f53b5fa07c1c8c category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=76e086ce7b2d71b0d7a1a121c5f53b5fa07c1c8c -------------------------------- [ Upstream commit a7d16d9a07bbcb7dcd5214a1bea75c808830bc0d ] This is a long time leftover from when I originally added the free space inode, the point was to catch cases where we weren't honoring the NOCOW flag. However there exists a race with relocation, if we allocate our free space inode in a block group that is about to be relocated, we could trigger the COW path before the relocation has the opportunity to find the extents and delete the free space cache. In production where we have auto-relocation enabled we're seeing this WARN_ON_ONCE() around 5k times in a 2 week period, so not super common but enough that it's at the top of our metrics. We're properly handling the error here, and with us phasing out v1 space cache anyway just drop the WARN_ON_ONCE. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- fs/btrfs/inode.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f7f4ac01589b..4a5248097d7a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -995,7 +995,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode, int ret = 0; if (btrfs_is_free_space_inode(inode)) { - WARN_ON_ONCE(1); ret = -EINVAL; goto out_unlock; } -- Gitee From 4b68fc4d27954159a26562642360a24f6a435678 Mon Sep 17 00:00:00 2001 From: Charlene Liu Date: Tue, 19 Jul 2022 16:30:56 +0800 Subject: [PATCH 417/674] drm/amd/display: fix audio format not updated after edid updated stable inclusion from stable-v5.10.112 commit 756c61c1680f435292b7d06072a8f390f551bca6 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=756c61c1680f435292b7d06072a8f390f551bca6 -------------------------------- [ Upstream commit 5e8a71cf13bc9184fee915b2220be71b4c6cac74 ] [why] for the case edid change only changed audio format. driver still need to update stream. Reviewed-by: Alvin Lee Reviewed-by: Aric Cyr Acked-by: Alex Hung Signed-off-by: Charlene Liu Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/gpu/drm/amd/display/dc/core/dc_resource.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_resource.c b/drivers/gpu/drm/amd/display/dc/core/dc_resource.c index 5c5ccbad9658..1e47afc4ccc1 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_resource.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_resource.c @@ -1701,8 +1701,8 @@ bool dc_is_stream_unchanged( if (old_stream->ignore_msa_timing_param != stream->ignore_msa_timing_param) return false; - // Only Have Audio left to check whether it is same or not. This is a corner case for Tiled sinks - if (old_stream->audio_info.mode_count != stream->audio_info.mode_count) + /*compare audio info*/ + if (memcmp(&old_stream->audio_info, &stream->audio_info, sizeof(stream->audio_info)) != 0) return false; return true; -- Gitee From 4b7c332d3a355b2841bd5c15e5f67be2f23baac1 Mon Sep 17 00:00:00 2001 From: Chiawen Huang Date: Tue, 19 Jul 2022 16:30:57 +0800 Subject: [PATCH 418/674] drm/amd/display: FEC check in timing validation stable inclusion from stable-v5.10.112 commit 6906e05cf3ad2cfe704eda003de59aa0cb89b291 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=6906e05cf3ad2cfe704eda003de59aa0cb89b291 -------------------------------- [ Upstream commit 7d56a154e22ffb3613fdebf83ec34d5225a22993 ] [Why] disable/enable leads FEC mismatch between hw/sw FEC state. [How] check FEC status to fastboot on/off. Reviewed-by: Anthony Koo Acked-by: Alex Hung Signed-off-by: Chiawen Huang Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/gpu/drm/amd/display/dc/core/dc.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c index 93f5229c303e..ac5323596c65 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc.c @@ -1173,6 +1173,10 @@ bool dc_validate_seamless_boot_timing(const struct dc *dc, if (!link->link_enc->funcs->is_dig_enabled(link->link_enc)) return false; + /* Check for FEC status*/ + if (link->link_enc->funcs->fec_is_active(link->link_enc)) + return false; + enc_inst = link->link_enc->funcs->get_dig_frontend(link->link_enc); if (enc_inst == ENGINE_ID_UNKNOWN) -- Gitee From 12565d62e538545a7394254551290145099600ed Mon Sep 17 00:00:00 2001 From: "Leo (Hanghong) Ma" Date: Tue, 19 Jul 2022 16:30:58 +0800 Subject: [PATCH 419/674] drm/amd/display: Update VTEM Infopacket definition stable inclusion from stable-v5.10.112 commit 7fc0610ad8182d05596e671c950bca1f3921d5e0 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=7fc0610ad8182d05596e671c950bca1f3921d5e0 -------------------------------- [ Upstream commit c9fbf6435162ed5fb7201d1d4adf6585c6a8c327 ] [Why & How] The latest HDMI SPEC has updated the VTEM packet structure, so change the VTEM Infopacket defined in the driver side to align with the SPEC. Reviewed-by: Chris Park Acked-by: Alex Hung Signed-off-by: Leo (Hanghong) Ma Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- .../gpu/drm/amd/display/modules/info_packet/info_packet.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/modules/info_packet/info_packet.c b/drivers/gpu/drm/amd/display/modules/info_packet/info_packet.c index 0fdf7a3e96de..96e18050a617 100644 --- a/drivers/gpu/drm/amd/display/modules/info_packet/info_packet.c +++ b/drivers/gpu/drm/amd/display/modules/info_packet/info_packet.c @@ -100,7 +100,8 @@ enum vsc_packet_revision { //PB7 = MD0 #define MASK_VTEM_MD0__VRR_EN 0x01 #define MASK_VTEM_MD0__M_CONST 0x02 -#define MASK_VTEM_MD0__RESERVED2 0x0C +#define MASK_VTEM_MD0__QMS_EN 0x04 +#define MASK_VTEM_MD0__RESERVED2 0x08 #define MASK_VTEM_MD0__FVA_FACTOR_M1 0xF0 //MD1 @@ -109,7 +110,7 @@ enum vsc_packet_revision { //MD2 #define MASK_VTEM_MD2__BASE_REFRESH_RATE_98 0x03 #define MASK_VTEM_MD2__RB 0x04 -#define MASK_VTEM_MD2__RESERVED3 0xF8 +#define MASK_VTEM_MD2__NEXT_TFR 0xF8 //MD3 #define MASK_VTEM_MD3__BASE_REFRESH_RATE_07 0xFF -- Gitee From ec62d273d1dbdea4f1b1ce234925be2accd23128 Mon Sep 17 00:00:00 2001 From: Tushar Patel Date: Tue, 19 Jul 2022 16:30:59 +0800 Subject: [PATCH 420/674] drm/amdkfd: Fix Incorrect VMIDs passed to HWS stable inclusion from stable-v5.10.112 commit d2e0931e6d84d133a78b87b70d32d5776ee2b101 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=d2e0931e6d84d133a78b87b70d32d5776ee2b101 -------------------------------- [ Upstream commit b7dfbd2e601f3fee545bc158feceba4f340fe7cf ] Compute-only GPUs have more than 8 VMIDs allocated to KFD. Fix this by passing correct number of VMIDs to HWS v2: squash in warning fix (Alex) Signed-off-by: Tushar Patel Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 11 +++-------- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index ed13a2f76884..30659c1776e8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -632,7 +632,7 @@ MODULE_PARM_DESC(sched_policy, * Maximum number of processes that HWS can schedule concurrently. The maximum is the * number of VMIDs assigned to the HWS, which is also the default. */ -int hws_max_conc_proc = 8; +int hws_max_conc_proc = -1; module_param(hws_max_conc_proc, int, 0444); MODULE_PARM_DESC(hws_max_conc_proc, "Max # processes HWS can execute concurrently when sched_policy=0 (0 = no concurrency, #VMIDs for KFD = Maximum(default))"); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 84313135c2ea..148e43dee657 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -664,15 +664,10 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, - kfd->vm_info.first_vmid_kfd + 1; /* Verify module parameters regarding mapped process number*/ - if ((hws_max_conc_proc < 0) - || (hws_max_conc_proc > kfd->vm_info.vmid_num_kfd)) { - dev_err(kfd_device, - "hws_max_conc_proc %d must be between 0 and %d, use %d instead\n", - hws_max_conc_proc, kfd->vm_info.vmid_num_kfd, - kfd->vm_info.vmid_num_kfd); + if (hws_max_conc_proc >= 0) + kfd->max_proc_per_quantum = min((u32)hws_max_conc_proc, kfd->vm_info.vmid_num_kfd); + else kfd->max_proc_per_quantum = kfd->vm_info.vmid_num_kfd; - } else - kfd->max_proc_per_quantum = hws_max_conc_proc; /* calculate max size of mqds needed for queues */ size = max_num_of_queues_per_device * -- Gitee From 69ce17cda16b6153a6d8cffe4004f85c00653f08 Mon Sep 17 00:00:00 2001 From: Tianci Yin Date: Tue, 19 Jul 2022 16:31:00 +0800 Subject: [PATCH 421/674] drm/amdgpu/vcn: improve vcn dpg stop procedure stable inclusion from stable-v5.10.112 commit e5afacc826a80cd82ac3f9eb26bc80e45cd14c20 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=e5afacc826a80cd82ac3f9eb26bc80e45cd14c20 -------------------------------- [ Upstream commit 6ea239adc2a712eb318f04f5c29b018ba65ea38a ] Prior to disabling dpg, VCN need unpausing dpg mode, or VCN will hang in S3 resuming. Reviewed-by: James Zhu Signed-off-by: Tianci Yin Signed-off-by: Alex Deucher Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c index 2099f6ebd833..bdb8e596bda6 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c @@ -1429,8 +1429,11 @@ static int vcn_v3_0_start_sriov(struct amdgpu_device *adev) static int vcn_v3_0_stop_dpg_mode(struct amdgpu_device *adev, int inst_idx) { + struct dpg_pause_state state = {.fw_based = VCN_DPG_STATE__UNPAUSE}; uint32_t tmp; + vcn_v3_0_pause_dpg_mode(adev, 0, &state); + /* Wait for power status to be 1 */ SOC15_WAIT_ON_RREG(VCN, inst_idx, mmUVD_POWER_STATUS, 1, UVD_POWER_STATUS__UVD_POWER_STATUS_MASK); -- Gitee From ea649459fc1bff1151b9dbc58fa7a502c2bb20c6 Mon Sep 17 00:00:00 2001 From: QintaoShen Date: Tue, 19 Jul 2022 16:31:01 +0800 Subject: [PATCH 422/674] drm/amdkfd: Check for potential null return of kmalloc_array() stable inclusion from stable-v5.10.112 commit 1d7a5aae884ca727d41c7ed15d4c82fdb67c040c category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=1d7a5aae884ca727d41c7ed15d4c82fdb67c040c -------------------------------- [ Upstream commit ebbb7bb9e80305820dc2328a371c1b35679f2667 ] As the kmalloc_array() may return null, the 'event_waiters[i].wait' would lead to null-pointer dereference. Therefore, it is better to check the return value of kmalloc_array() to avoid this confusion. Signed-off-by: QintaoShen Signed-off-by: Alex Deucher Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/gpu/drm/amd/amdkfd/kfd_events.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c index ba2c2ce0c55a..159be13ef20b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c @@ -531,6 +531,8 @@ static struct kfd_event_waiter *alloc_event_waiters(uint32_t num_events) event_waiters = kmalloc_array(num_events, sizeof(struct kfd_event_waiter), GFP_KERNEL); + if (!event_waiters) + return NULL; for (i = 0; (event_waiters) && (i < num_events) ; i++) { init_wait(&event_waiters[i].wait); -- Gitee From aed10586a103cb44228fcbc35ed89ba073dc160a Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Tue, 19 Jul 2022 16:31:02 +0800 Subject: [PATCH 423/674] Drivers: hv: vmbus: Prevent load re-ordering when reading ring buffer stable inclusion from stable-v5.10.112 commit 43666798059c7cb5c374ce1f79f51e147aa5d419 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=43666798059c7cb5c374ce1f79f51e147aa5d419 -------------------------------- [ Upstream commit b6cae15b5710c8097aad26a2e5e752c323ee5348 ] When reading a packet from a host-to-guest ring buffer, there is no memory barrier between reading the write index (to see if there is a packet to read) and reading the contents of the packet. The Hyper-V host uses store-release when updating the write index to ensure that writes of the packet data are completed first. On the guest side, the processor can reorder and read the packet data before the write index, and sometimes get stale packet data. Getting such stale packet data has been observed in a reproducible case in a VM on ARM64. Fix this by using virt_load_acquire() to read the write index, ensuring that reads of the packet data cannot be reordered before it. Preventing such reordering is logically correct, and with this change, getting stale data can no longer be reproduced. Signed-off-by: Michael Kelley Reviewed-by: Andrea Parri (Microsoft) Link: https://lore.kernel.org/r/1648394710-33480-1-git-send-email-mikelley@microsoft.com Signed-off-by: Wei Liu Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/hv/ring_buffer.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c index 356e22159e83..769851b6e74c 100644 --- a/drivers/hv/ring_buffer.c +++ b/drivers/hv/ring_buffer.c @@ -378,7 +378,16 @@ int hv_ringbuffer_read(struct vmbus_channel *channel, static u32 hv_pkt_iter_avail(const struct hv_ring_buffer_info *rbi) { u32 priv_read_loc = rbi->priv_read_index; - u32 write_loc = READ_ONCE(rbi->ring_buffer->write_index); + u32 write_loc; + + /* + * The Hyper-V host writes the packet data, then uses + * store_release() to update the write_index. Use load_acquire() + * here to prevent loads of the packet data from being re-ordered + * before the read of the write_index and potentially getting + * stale data. + */ + write_loc = virt_load_acquire(&rbi->ring_buffer->write_index); if (write_loc >= priv_read_loc) return write_loc - priv_read_loc; -- Gitee From a10270a6d949ebe9bd6487e4a198c524de8bbb45 Mon Sep 17 00:00:00 2001 From: Xiaoguang Wang Date: Tue, 19 Jul 2022 16:31:03 +0800 Subject: [PATCH 424/674] scsi: target: tcmu: Fix possible page UAF stable inclusion from stable-v5.10.112 commit aec36b98a1bbaa84bfd8299a306e4c12314af626 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=aec36b98a1bbaa84bfd8299a306e4c12314af626 -------------------------------- [ Upstream commit a6968f7a367f128d120447360734344d5a3d5336 ] tcmu_try_get_data_page() looks up pages under cmdr_lock, but it does not take refcount properly and just returns page pointer. When tcmu_try_get_data_page() returns, the returned page may have been freed by tcmu_blocks_release(). We need to get_page() under cmdr_lock to avoid concurrent tcmu_blocks_release(). Link: https://lore.kernel.org/r/20220311132206.24515-1-xiaoguang.wang@linux.alibaba.com Reviewed-by: Bodo Stroesser Signed-off-by: Xiaoguang Wang Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/target/target_core_user.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index c6950f157b99..c283e45ac300 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -1676,6 +1676,7 @@ static struct page *tcmu_try_get_block_page(struct tcmu_dev *udev, uint32_t dbi) mutex_lock(&udev->cmdr_lock); page = tcmu_get_block_page(udev, dbi); if (likely(page)) { + get_page(page); mutex_unlock(&udev->cmdr_lock); return page; } @@ -1714,6 +1715,7 @@ static vm_fault_t tcmu_vma_fault(struct vm_fault *vmf) /* For the vmalloc()ed cmd area pages */ addr = (void *)(unsigned long)info->mem[mi].addr + offset; page = vmalloc_to_page(addr); + get_page(page); } else { uint32_t dbi; @@ -1724,7 +1726,6 @@ static vm_fault_t tcmu_vma_fault(struct vm_fault *vmf) return VM_FAULT_SIGBUS; } - get_page(page); vmf->page = page; return 0; } -- Gitee From 39621a1d6b6040950287449cf56a62ae1e415a41 Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 19 Jul 2022 16:31:04 +0800 Subject: [PATCH 425/674] scsi: lpfc: Fix queue failures when recovering from PCI parity error stable inclusion from stable-v5.10.112 commit b9a110fa755b2a144c677254938d709bdbea36ac category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=b9a110fa755b2a144c677254938d709bdbea36ac -------------------------------- [ Upstream commit df0101197c4d9596682901631f3ee193ed354873 ] When recovering from a pci-parity error the driver is failing to re-create queues, causing recovery to fail. Looking deeper, it was found that the interrupt vector count allocated on the recovery was fewer than the vectors originally allocated. This disparity resulted in CPU map entries with stale information. When the driver tries to re-create the queues, it attempts to use the stale information which indicates an eq/interrupt vector that was no longer created. Fix by clearng the cpup map array before enabling and requesting the IRQs in the lpfc_sli_reset_slot_s4 routine(). Link: https://lore.kernel.org/r/20220317032737.45308-4-jsmart2021@gmail.com Co-developed-by: Justin Tee Signed-off-by: Justin Tee Signed-off-by: James Smart Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/scsi/lpfc/lpfc_init.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c index 1149bfc42fe6..134e4ee5dc48 100644 --- a/drivers/scsi/lpfc/lpfc_init.c +++ b/drivers/scsi/lpfc/lpfc_init.c @@ -13614,6 +13614,8 @@ lpfc_io_slot_reset_s4(struct pci_dev *pdev) psli->sli_flag &= ~LPFC_SLI_ACTIVE; spin_unlock_irq(&phba->hbalock); + /* Init cpu_map array */ + lpfc_cpu_map_array_init(phba); /* Configure and enable interrupt */ intr_mode = lpfc_sli4_enable_intr(phba, phba->intr_mode); if (intr_mode == LPFC_INTR_ERROR) { -- Gitee From 3a7835f442a2fe8b45dd3a87e218a1bbefef5e49 Mon Sep 17 00:00:00 2001 From: Tyrel Datwyler Date: Tue, 19 Jul 2022 16:31:05 +0800 Subject: [PATCH 426/674] scsi: ibmvscsis: Increase INITIAL_SRP_LIMIT to 1024 stable inclusion from stable-v5.10.112 commit d3478709edf2cb84a46f87f7960d555f5797a967 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=d3478709edf2cb84a46f87f7960d555f5797a967 -------------------------------- [ Upstream commit 0bade8e53279157c7cc9dd95d573b7e82223d78a ] The adapter request_limit is hardcoded to be INITIAL_SRP_LIMIT which is currently an arbitrary value of 800. Increase this value to 1024 which better matches the characteristics of the typical IBMi Initiator that supports 32 LUNs and a queue depth of 32. This change also has the secondary benefit of being a power of two as required by the kfifo API. Since, Commit ab9bb6318b09 ("Partially revert "kfifo: fix kfifo_alloc() and kfifo_init()"") the size of IU pool for each target has been rounded down to 512 when attempting to kfifo_init() those pools with the current request_limit size of 800. Link: https://lore.kernel.org/r/20220322194443.678433-1-tyreld@linux.ibm.com Signed-off-by: Tyrel Datwyler Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c b/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c index cc3908c2d2f9..a3431485def8 100644 --- a/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c +++ b/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c @@ -35,7 +35,7 @@ #define IBMVSCSIS_VERSION "v0.2" -#define INITIAL_SRP_LIMIT 800 +#define INITIAL_SRP_LIMIT 1024 #define DEFAULT_MAX_SECTORS 256 #define MAX_TXU 1024 * 1024 -- Gitee From 132b9e782031f5b8a728e1550f8f422704ab36de Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 19 Jul 2022 16:31:06 +0800 Subject: [PATCH 427/674] net: micrel: fix KS8851_MLL Kconfig stable inclusion from stable-v5.10.112 commit 1ff5359afa5ec0dd09fe76183dc4fa24b50e4125 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=1ff5359afa5ec0dd09fe76183dc4fa24b50e4125 -------------------------------- [ Upstream commit c3efcedd272aa6dd5929e20cf902a52ddaa1197a ] KS8851_MLL selects MICREL_PHY, which depends on PTP_1588_CLOCK_OPTIONAL, so make KS8851_MLL also depend on PTP_1588_CLOCK_OPTIONAL since 'select' does not follow any dependency chains. Fixes kconfig warning and build errors: WARNING: unmet direct dependencies detected for MICREL_PHY Depends on [m]: NETDEVICES [=y] && PHYLIB [=y] && PTP_1588_CLOCK_OPTIONAL [=m] Selected by [y]: - KS8851_MLL [=y] && NETDEVICES [=y] && ETHERNET [=y] && NET_VENDOR_MICREL [=y] && HAS_IOMEM [=y] ld: drivers/net/phy/micrel.o: in function `lan8814_ts_info': micrel.c:(.text+0xb35): undefined reference to `ptp_clock_index' ld: drivers/net/phy/micrel.o: in function `lan8814_probe': micrel.c:(.text+0x2586): undefined reference to `ptp_clock_register' Signed-off-by: Randy Dunlap Cc: "David S. Miller" Cc: Jakub Kicinski Cc: Paolo Abeni Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/net/ethernet/micrel/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/micrel/Kconfig b/drivers/net/ethernet/micrel/Kconfig index 42bc014136fe..9ceb7e1fb169 100644 --- a/drivers/net/ethernet/micrel/Kconfig +++ b/drivers/net/ethernet/micrel/Kconfig @@ -37,6 +37,7 @@ config KS8851 config KS8851_MLL tristate "Micrel KS8851 MLL" depends on HAS_IOMEM + depends on PTP_1588_CLOCK_OPTIONAL select MII select CRC32 select EEPROM_93CX6 -- Gitee From f568a8dd74946b9ca81f7091d4e325bec0cb4bc1 Mon Sep 17 00:00:00 2001 From: Christian Lamparter Date: Tue, 19 Jul 2022 16:31:07 +0800 Subject: [PATCH 428/674] ata: libata-core: Disable READ LOG DMA EXT for Samsung 840 EVOs stable inclusion from stable-v5.10.112 commit 150fe861c57c042510c81b24bddf4a9818ceb5a2 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=150fe861c57c042510c81b24bddf4a9818ceb5a2 -------------------------------- [ Upstream commit 5399752299396a3c9df6617f4b3c907d7aa4ded8 ] Samsung' 840 EVO with the latest firmware (EXT0DB6Q) locks up with the a message: "READ LOG DMA EXT failed, trying PIO" during boot. Initially this was discovered because it caused a crash with the sata_dwc_460ex controller on a WD MyBook Live DUO. The reporter "Tice Rex" which has the unique opportunity that he has two Samsung 840 EVO SSD! One with the older firmware "EXT0BB0Q" which booted fine and didn't expose "READ LOG DMA EXT". But the newer/latest firmware "EXT0DB6Q" caused the headaches. BugLink: https://github.com/openwrt/openwrt/issues/9505 Signed-off-by: Christian Lamparter Signed-off-by: Damien Le Moal Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/ata/libata-core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index d2b544bdc7b5..f963a0a7da46 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -3974,6 +3974,9 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = { ATA_HORKAGE_ZERO_AFTER_TRIM, }, { "Crucial_CT*MX100*", "MU01", ATA_HORKAGE_NO_NCQ_TRIM | ATA_HORKAGE_ZERO_AFTER_TRIM, }, + { "Samsung SSD 840 EVO*", NULL, ATA_HORKAGE_NO_NCQ_TRIM | + ATA_HORKAGE_NO_DMA_LOG | + ATA_HORKAGE_ZERO_AFTER_TRIM, }, { "Samsung SSD 840*", NULL, ATA_HORKAGE_NO_NCQ_TRIM | ATA_HORKAGE_ZERO_AFTER_TRIM, }, { "Samsung SSD 850*", NULL, ATA_HORKAGE_NO_NCQ_TRIM | -- Gitee From 40a23576d58afca5e521f319eb61fa2d346f45ae Mon Sep 17 00:00:00 2001 From: Leo Ruan Date: Tue, 19 Jul 2022 16:31:08 +0800 Subject: [PATCH 429/674] gpu: ipu-v3: Fix dev_dbg frequency output stable inclusion from stable-v5.10.112 commit aa8cdedaf7606fc0e7e16cdf67c1d651f2c93b5f category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=aa8cdedaf7606fc0e7e16cdf67c1d651f2c93b5f -------------------------------- [ Upstream commit 070a88fd4a03f921b73a2059e97d55faaa447dab ] This commit corrects the printing of the IPU clock error percentage if it is between -0.1% to -0.9%. For example, if the pixel clock requested is 27.2 MHz but only 27.0 MHz can be achieved the deviation is -0.8%. But the fixed point math had a flaw and calculated error of 0.2%. Before: Clocks: IPU 270000000Hz DI 24716667Hz Needed 27200000Hz IPU clock can give 27000000 with divider 10, error 0.2% Want 27200000Hz IPU 270000000Hz DI 24716667Hz using IPU, 27000000Hz After: Clocks: IPU 270000000Hz DI 24716667Hz Needed 27200000Hz IPU clock can give 27000000 with divider 10, error -0.8% Want 27200000Hz IPU 270000000Hz DI 24716667Hz using IPU, 27000000Hz Signed-off-by: Leo Ruan Signed-off-by: Mark Jonas Reviewed-by: Philipp Zabel Signed-off-by: Philipp Zabel Link: https://lore.kernel.org/r/20220207151411.5009-1-mark.jonas@de.bosch.com Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/gpu/ipu-v3/ipu-di.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/ipu-v3/ipu-di.c b/drivers/gpu/ipu-v3/ipu-di.c index b4a31d506fcc..74eca68891ad 100644 --- a/drivers/gpu/ipu-v3/ipu-di.c +++ b/drivers/gpu/ipu-v3/ipu-di.c @@ -451,8 +451,9 @@ static void ipu_di_config_clock(struct ipu_di *di, error = rate / (sig->mode.pixelclock / 1000); - dev_dbg(di->ipu->dev, " IPU clock can give %lu with divider %u, error %d.%u%%\n", - rate, div, (signed)(error - 1000) / 10, error % 10); + dev_dbg(di->ipu->dev, " IPU clock can give %lu with divider %u, error %c%d.%d%%\n", + rate, div, error < 1000 ? '-' : '+', + abs(error - 1000) / 10, abs(error - 1000) % 10); /* Allow a 1% error */ if (error < 1010 && error >= 990) { -- Gitee From d069824e2bfda41cd7fd8a3116057aec850d8744 Mon Sep 17 00:00:00 2001 From: Jonathan Bakker Date: Tue, 19 Jul 2022 16:31:09 +0800 Subject: [PATCH 430/674] regulator: wm8994: Add an off-on delay for WM8994 variant stable inclusion from stable-v5.10.112 commit 2462faffbfa582698906a6aa5fe677cf35003851 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=2462faffbfa582698906a6aa5fe677cf35003851 -------------------------------- [ Upstream commit 92d96b603738ec4f35cde7198c303ae264dd47cb ] As per Table 130 of the wm8994 datasheet at [1], there is an off-on delay for LDO1 and LDO2. In the wm8958 datasheet [2], I could not find any reference to it. I could not find a wm1811 datasheet to double-check there, but as no one has complained presumably it works without it. This solves the issue on Samsung Aries boards with a wm8994 where register writes fail when the device is powered off and back-on quickly. [1] https://statics.cirrus.com/pubs/proDatasheet/WM8994_Rev4.6.pdf [2] https://statics.cirrus.com/pubs/proDatasheet/WM8958_v3.5.pdf Signed-off-by: Jonathan Bakker Acked-by: Charles Keepax Link: https://lore.kernel.org/r/CY4PR04MB056771CFB80DC447C30D5A31CB1D9@CY4PR04MB0567.namprd04.prod.outlook.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/regulator/wm8994-regulator.c | 42 ++++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/drivers/regulator/wm8994-regulator.c b/drivers/regulator/wm8994-regulator.c index cadea0344486..40befdd9dfa9 100644 --- a/drivers/regulator/wm8994-regulator.c +++ b/drivers/regulator/wm8994-regulator.c @@ -71,6 +71,35 @@ static const struct regulator_ops wm8994_ldo2_ops = { }; static const struct regulator_desc wm8994_ldo_desc[] = { + { + .name = "LDO1", + .id = 1, + .type = REGULATOR_VOLTAGE, + .n_voltages = WM8994_LDO1_MAX_SELECTOR + 1, + .vsel_reg = WM8994_LDO_1, + .vsel_mask = WM8994_LDO1_VSEL_MASK, + .ops = &wm8994_ldo1_ops, + .min_uV = 2400000, + .uV_step = 100000, + .enable_time = 3000, + .off_on_delay = 36000, + .owner = THIS_MODULE, + }, + { + .name = "LDO2", + .id = 2, + .type = REGULATOR_VOLTAGE, + .n_voltages = WM8994_LDO2_MAX_SELECTOR + 1, + .vsel_reg = WM8994_LDO_2, + .vsel_mask = WM8994_LDO2_VSEL_MASK, + .ops = &wm8994_ldo2_ops, + .enable_time = 3000, + .off_on_delay = 36000, + .owner = THIS_MODULE, + }, +}; + +static const struct regulator_desc wm8958_ldo_desc[] = { { .name = "LDO1", .id = 1, @@ -172,9 +201,16 @@ static int wm8994_ldo_probe(struct platform_device *pdev) * regulator core and we need not worry about it on the * error path. */ - ldo->regulator = devm_regulator_register(&pdev->dev, - &wm8994_ldo_desc[id], - &config); + if (ldo->wm8994->type == WM8994) { + ldo->regulator = devm_regulator_register(&pdev->dev, + &wm8994_ldo_desc[id], + &config); + } else { + ldo->regulator = devm_regulator_register(&pdev->dev, + &wm8958_ldo_desc[id], + &config); + } + if (IS_ERR(ldo->regulator)) { ret = PTR_ERR(ldo->regulator); dev_err(wm8994->dev, "Failed to register LDO%d: %d\n", -- Gitee From c43d31e079236b16bef0241ff118474661325aca Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Tue, 19 Jul 2022 16:31:10 +0800 Subject: [PATCH 431/674] arm64: alternatives: mark patch_alternative() as `noinstr` stable inclusion from stable-v5.10.112 commit 98973d2bdd4a9af53cc7ec9560ddbddf01216eba category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=98973d2bdd4a9af53cc7ec9560ddbddf01216eba -------------------------------- [ Upstream commit a2c0b0fbe01419f8f5d1c0b9c581631f34ffce8b ] The alternatives code must be `noinstr` such that it does not patch itself, as the cache invalidation is only performed after all the alternatives have been applied. Mark patch_alternative() as `noinstr`. Mark branch_insn_requires_update() and get_alt_insn() with `__always_inline` since they are both only called through patch_alternative(). Booting a kernel in QEMU TCG with KCSAN=y and ARM64_USE_LSE_ATOMICS=y caused a boot hang: [ 0.241121] CPU: All CPU(s) started at EL2 The alternatives code was patching the atomics in __tsan_read4() from LL/SC atomics to LSE atomics. The following fragment is using LL/SC atomics in the .text section: | <__tsan_unaligned_read4+304>: ldxr x6, [x2] | <__tsan_unaligned_read4+308>: add x6, x6, x5 | <__tsan_unaligned_read4+312>: stxr w7, x6, [x2] | <__tsan_unaligned_read4+316>: cbnz w7, <__tsan_unaligned_read4+304> This LL/SC atomic sequence was to be replaced with LSE atomics. However since the alternatives code was instrumentable, __tsan_read4() was being called after only the first instruction was replaced, which led to the following code in memory: | <__tsan_unaligned_read4+304>: ldadd x5, x6, [x2] | <__tsan_unaligned_read4+308>: add x6, x6, x5 | <__tsan_unaligned_read4+312>: stxr w7, x6, [x2] | <__tsan_unaligned_read4+316>: cbnz w7, <__tsan_unaligned_read4+304> This caused an infinite loop as the `stxr` instruction never completed successfully, so `w7` was always 0. Signed-off-by: Joey Gouly Cc: Mark Rutland Cc: Catalin Marinas Cc: Will Deacon Link: https://lore.kernel.org/r/20220405104733.11476-1-joey.gouly@arm.com Signed-off-by: Will Deacon Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- arch/arm64/kernel/alternative.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/alternative.c b/arch/arm64/kernel/alternative.c index 73039949b5ce..5f8e4c2df53c 100644 --- a/arch/arm64/kernel/alternative.c +++ b/arch/arm64/kernel/alternative.c @@ -41,7 +41,7 @@ bool alternative_is_applied(u16 cpufeature) /* * Check if the target PC is within an alternative block. */ -static bool branch_insn_requires_update(struct alt_instr *alt, unsigned long pc) +static __always_inline bool branch_insn_requires_update(struct alt_instr *alt, unsigned long pc) { unsigned long replptr = (unsigned long)ALT_REPL_PTR(alt); return !(pc >= replptr && pc <= (replptr + alt->alt_len)); @@ -49,7 +49,7 @@ static bool branch_insn_requires_update(struct alt_instr *alt, unsigned long pc) #define align_down(x, a) ((unsigned long)(x) & ~(((unsigned long)(a)) - 1)) -static u32 get_alt_insn(struct alt_instr *alt, __le32 *insnptr, __le32 *altinsnptr) +static __always_inline u32 get_alt_insn(struct alt_instr *alt, __le32 *insnptr, __le32 *altinsnptr) { u32 insn; @@ -94,7 +94,7 @@ static u32 get_alt_insn(struct alt_instr *alt, __le32 *insnptr, __le32 *altinsnp return insn; } -static void patch_alternative(struct alt_instr *alt, +static noinstr void patch_alternative(struct alt_instr *alt, __le32 *origptr, __le32 *updptr, int nr_inst) { __le32 *replptr; -- Gitee From 827abe9996cbedb288fd7c07f53c5233406ca204 Mon Sep 17 00:00:00 2001 From: Steve Capper Date: Tue, 19 Jul 2022 16:31:11 +0800 Subject: [PATCH 432/674] tlb: hugetlb: Add more sizes to tlb_remove_huge_tlb_entry stable inclusion from stable-v5.10.112 commit b643807a735e2d80eec972ad22536dcb66f79c2e category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=b643807a735e2d80eec972ad22536dcb66f79c2e -------------------------------- [ Upstream commit 697a1d44af8ba0477ee729e632f4ade37999249a ] tlb_remove_huge_tlb_entry only considers PMD_SIZE and PUD_SIZE when updating the mmu_gather structure. Unfortunately on arm64 there are two additional huge page sizes that need to be covered: CONT_PTE_SIZE and CONT_PMD_SIZE. Where an end-user attempts to employ contiguous huge pages, a VM_BUG_ON can be experienced due to the fact that the tlb structure hasn't been correctly updated by the relevant tlb_flush_p.._range() call from tlb_remove_huge_tlb_entry. This patch adds inequality logic to the generic implementation of tlb_remove_huge_tlb_entry s.t. CONT_PTE_SIZE and CONT_PMD_SIZE are effectively covered on arm64. Also, as well as ptes, pmds and puds; p4ds are now considered too. Reported-by: David Hildenbrand Suggested-by: Peter Zijlstra (Intel) Cc: Anshuman Khandual Cc: Catalin Marinas Cc: Will Deacon Link: https://lore.kernel.org/linux-mm/811c5c8e-b3a2-85d2-049c-717f17c3a03a@redhat.com/ Signed-off-by: Steve Capper Acked-by: David Hildenbrand Reviewed-by: Anshuman Khandual Reviewed-by: Catalin Marinas Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220330112543.863-1-steve.capper@arm.com Signed-off-by: Will Deacon Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- include/asm-generic/tlb.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 6661ee1cff47..a0c4b99d2899 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -563,10 +563,14 @@ static inline void tlb_flush_p4d_range(struct mmu_gather *tlb, #define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ do { \ unsigned long _sz = huge_page_size(h); \ - if (_sz == PMD_SIZE) \ - tlb_flush_pmd_range(tlb, address, _sz); \ - else if (_sz == PUD_SIZE) \ + if (_sz >= P4D_SIZE) \ + tlb_flush_p4d_range(tlb, address, _sz); \ + else if (_sz >= PUD_SIZE) \ tlb_flush_pud_range(tlb, address, _sz); \ + else if (_sz >= PMD_SIZE) \ + tlb_flush_pmd_range(tlb, address, _sz); \ + else \ + tlb_flush_pte_range(tlb, address, _sz); \ __tlb_remove_tlb_entry(tlb, ptep, address); \ } while (0) -- Gitee From 80804b54f0cdbced28775c066034a95cc3e3129f Mon Sep 17 00:00:00 2001 From: Andy Chiu Date: Tue, 19 Jul 2022 16:31:12 +0800 Subject: [PATCH 433/674] net: axienet: setup mdio unconditionally stable inclusion from stable-v5.10.112 commit 9c12fcf1d864b20bdbb422804b69b2dec936839b category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=9c12fcf1d864b20bdbb422804b69b2dec936839b -------------------------------- [ Upstream commit d1c4f93e3f0a023024a6f022a61528c06cf1daa9 ] The call to axienet_mdio_setup should not depend on whether "phy-node" pressents on the DT. Besides, since `lp->phy_node` is used if PHY is in SGMII or 100Base-X modes, move it into the if statement. And the next patch will remove `lp->phy_node` from driver's private structure and do an of_node_put on it right away after use since it is not used elsewhere. Signed-off-by: Andy Chiu Reviewed-by: Greentime Hu Reviewed-by: Robert Hancock Reviewed-by: Radhey Shyam Pandey Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/net/ethernet/xilinx/xilinx_axienet_main.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c index b8e85ef7ce92..ef20184277ff 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@ -2076,15 +2076,14 @@ static int axienet_probe(struct platform_device *pdev) if (ret) goto cleanup_clk; - lp->phy_node = of_parse_phandle(pdev->dev.of_node, "phy-handle", 0); - if (lp->phy_node) { - ret = axienet_mdio_setup(lp); - if (ret) - dev_warn(&pdev->dev, - "error registering MDIO bus: %d\n", ret); - } + ret = axienet_mdio_setup(lp); + if (ret) + dev_warn(&pdev->dev, + "error registering MDIO bus: %d\n", ret); + if (lp->phy_mode == PHY_INTERFACE_MODE_SGMII || lp->phy_mode == PHY_INTERFACE_MODE_1000BASEX) { + lp->phy_node = of_parse_phandle(pdev->dev.of_node, "phy-handle", 0); if (!lp->phy_node) { dev_err(&pdev->dev, "phy-handle required for 1000BaseX/SGMII\n"); ret = -EINVAL; -- Gitee From b7fdd0885604d2c2d7df33e1fa7801710528c7d0 Mon Sep 17 00:00:00 2001 From: Marcin Kozlowski Date: Tue, 19 Jul 2022 16:31:13 +0800 Subject: [PATCH 434/674] net: usb: aqc111: Fix out-of-bounds accesses in RX fixup stable inclusion from stable-v5.10.112 commit d90df6da50c56ad8b1a132e3cf86b6cdf8f507b7 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=d90df6da50c56ad8b1a132e3cf86b6cdf8f507b7 -------------------------------- [ Upstream commit afb8e246527536848b9b4025b40e613edf776a9d ] aqc111_rx_fixup() contains several out-of-bounds accesses that can be triggered by a malicious (or defective) USB device, in particular: - The metadata array (desc_offset..desc_offset+2*pkt_count) can be out of bounds, causing OOB reads and (on big-endian systems) OOB endianness flips. - A packet can overlap the metadata array, causing a later OOB endianness flip to corrupt data used by a cloned SKB that has already been handed off into the network stack. - A packet SKB can be constructed whose tail is far beyond its end, causing out-of-bounds heap data to be considered part of the SKB's data. Found doing variant analysis. Tested it with another driver (ax88179_178a), since I don't have a aqc111 device to test it, but the code looks very similar. Signed-off-by: Marcin Kozlowski Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/net/usb/aqc111.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/net/usb/aqc111.c b/drivers/net/usb/aqc111.c index 0717c18015c9..c9c409518174 100644 --- a/drivers/net/usb/aqc111.c +++ b/drivers/net/usb/aqc111.c @@ -1102,10 +1102,15 @@ static int aqc111_rx_fixup(struct usbnet *dev, struct sk_buff *skb) if (start_of_descs != desc_offset) goto err; - /* self check desc_offset from header*/ - if (desc_offset >= skb_len) + /* self check desc_offset from header and make sure that the + * bounds of the metadata array are inside the SKB + */ + if (pkt_count * 2 + desc_offset >= skb_len) goto err; + /* Packets must not overlap the metadata array */ + skb_trim(skb, desc_offset); + if (pkt_count == 0) goto err; -- Gitee From 178c28b56c0cf450c4683b59aabe81a164e0099a Mon Sep 17 00:00:00 2001 From: Xiaomeng Tong Date: Tue, 19 Jul 2022 16:31:14 +0800 Subject: [PATCH 435/674] myri10ge: fix an incorrect free for skb in myri10ge_sw_tso stable inclusion from stable-v5.10.112 commit fa5ee7c4232cc9976c0db483b914519639de75dc category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=fa5ee7c4232cc9976c0db483b914519639de75dc -------------------------------- [ Upstream commit b423e54ba965b4469b48e46fd16941f1e1701697 ] All remaining skbs should be released when myri10ge_xmit fails to transmit a packet. Fix it within another skb_list_walk_safe. Signed-off-by: Xiaomeng Tong Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/net/ethernet/myricom/myri10ge/myri10ge.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c index b5c4a64bc025..30763e966633 100644 --- a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c +++ b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c @@ -2901,11 +2901,9 @@ static netdev_tx_t myri10ge_sw_tso(struct sk_buff *skb, status = myri10ge_xmit(curr, dev); if (status != 0) { dev_kfree_skb_any(curr); - if (segs != NULL) { - curr = segs; - segs = next; + skb_list_walk_safe(next, curr, next) { curr->next = NULL; - dev_kfree_skb_any(segs); + dev_kfree_skb_any(curr); } goto drop; } -- Gitee From 403e0c62a0c117a53d7e8635f93b63068b37ad95 Mon Sep 17 00:00:00 2001 From: Martin Leung Date: Tue, 19 Jul 2022 16:31:15 +0800 Subject: [PATCH 436/674] drm/amd/display: Revert FEC check in validation stable inclusion from stable-v5.10.112 commit 34ea097fb63d116d857e6c23b97cb94ff02ccf42 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=34ea097fb63d116d857e6c23b97cb94ff02ccf42 -------------------------------- [ Upstream commit b2075fce104b88b789c15ef1ed2b91dc94198e26 ] why and how: causes failure on install on certain machines Reviewed-by: George Shen Acked-by: Alex Hung Signed-off-by: Martin Leung Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/gpu/drm/amd/display/dc/core/dc.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c index ac5323596c65..93f5229c303e 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc.c @@ -1173,10 +1173,6 @@ bool dc_validate_seamless_boot_timing(const struct dc *dc, if (!link->link_enc->funcs->is_dig_enabled(link->link_enc)) return false; - /* Check for FEC status*/ - if (link->link_enc->funcs->fec_is_active(link->link_enc)) - return false; - enc_inst = link->link_enc->funcs->get_dig_frontend(link->link_enc); if (enc_inst == ENGINE_ID_UNKNOWN) -- Gitee From 91b2c7e8a9b175f7e497dc79208d4f6479a43304 Mon Sep 17 00:00:00 2001 From: Roman Li Date: Tue, 19 Jul 2022 16:31:16 +0800 Subject: [PATCH 437/674] drm/amd/display: Fix allocate_mst_payload assert on resume stable inclusion from stable-v5.10.112 commit 4b44cd5840577479ee6ed387371dd1d1252ca921 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=4b44cd5840577479ee6ed387371dd1d1252ca921 -------------------------------- [ Upstream commit f4346fb3edf7720db3f7f5e1cab1f667cd024280 ] [Why] On resume we do link detection for all non-MST connectors. MST is handled separately. However the condition for telling if connector is on mst branch is not enough for mst hub case. Link detection for mst branch link leads to mst topology reset. That causes assert in dc_link_allocate_mst_payload() [How] Use link type as indicator for mst link. Reviewed-by: Wayne Lin Acked-by: Alex Hung Signed-off-by: Roman Li Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index e828f9414ba2..7bb151283f44 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -2022,7 +2022,8 @@ static int dm_resume(void *handle) * this is the case when traversing through already created * MST connectors, should be skipped */ - if (aconnector->mst_port) + if (aconnector->dc_link && + aconnector->dc_link->type == dc_connection_mst_branch) continue; mutex_lock(&aconnector->hpd_lock); -- Gitee From b1b141e7bc7c97341376d4ccfb36067d643f8737 Mon Sep 17 00:00:00 2001 From: Alexey Galakhov Date: Tue, 19 Jul 2022 16:31:17 +0800 Subject: [PATCH 438/674] scsi: mvsas: Add PCI ID of RocketRaid 2640 stable inclusion from stable-v5.10.112 commit 5b7ce74b6bc8c7e6ee3cb8b22b8c5e3e27943339 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=5b7ce74b6bc8c7e6ee3cb8b22b8c5e3e27943339 -------------------------------- [ Upstream commit 5f2bce1e222028dc1c15f130109a17aa654ae6e8 ] The HighPoint RocketRaid 2640 is a low-cost SAS controller based on Marvell chip. The chip in question was already supported by the kernel, just the PCI ID of this particular board was missing. Link: https://lore.kernel.org/r/20220309212535.402987-1-agalakhov@gmail.com Signed-off-by: Alexey Galakhov Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/scsi/mvsas/mv_init.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/scsi/mvsas/mv_init.c b/drivers/scsi/mvsas/mv_init.c index 0cfea7b2ab13..85ca8421fb86 100644 --- a/drivers/scsi/mvsas/mv_init.c +++ b/drivers/scsi/mvsas/mv_init.c @@ -646,6 +646,7 @@ static struct pci_device_id mvs_pci_table[] = { { PCI_VDEVICE(ARECA, PCI_DEVICE_ID_ARECA_1300), chip_1300 }, { PCI_VDEVICE(ARECA, PCI_DEVICE_ID_ARECA_1320), chip_1320 }, { PCI_VDEVICE(ADAPTEC2, 0x0450), chip_6440 }, + { PCI_VDEVICE(TTI, 0x2640), chip_6440 }, { PCI_VDEVICE(TTI, 0x2710), chip_9480 }, { PCI_VDEVICE(TTI, 0x2720), chip_9480 }, { PCI_VDEVICE(TTI, 0x2721), chip_9480 }, -- Gitee From d0ed685b6c3043cadeccd631164b3042bb1582c7 Mon Sep 17 00:00:00 2001 From: Chandrakanth patil Date: Tue, 19 Jul 2022 16:31:18 +0800 Subject: [PATCH 439/674] scsi: megaraid_sas: Target with invalid LUN ID is deleted during scan stable inclusion from stable-v5.10.112 commit e8cf1e4d953d7022704f57aca4e5ccce6cee14ea category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=e8cf1e4d953d7022704f57aca4e5ccce6cee14ea -------------------------------- [ Upstream commit 56495f295d8e021f77d065b890fc0100e3f9f6d8 ] The megaraid_sas driver supports single LUN for RAID devices. That is LUN 0. All other LUNs are unsupported. When a device scan on a logical target with invalid LUN number is invoked through sysfs, that target ends up getting removed. Add LUN ID validation in the slave destroy function to avoid the target deletion. Link: https://lore.kernel.org/r/20220324094711.48833-1-chandrakanth.patil@broadcom.com Signed-off-by: Chandrakanth patil Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/scsi/megaraid/megaraid_sas.h | 3 +++ drivers/scsi/megaraid/megaraid_sas_base.c | 7 +++++++ 2 files changed, 10 insertions(+) diff --git a/drivers/scsi/megaraid/megaraid_sas.h b/drivers/scsi/megaraid/megaraid_sas.h index 6b8ec57e8bdf..c088a848776e 100644 --- a/drivers/scsi/megaraid/megaraid_sas.h +++ b/drivers/scsi/megaraid/megaraid_sas.h @@ -2554,6 +2554,9 @@ struct megasas_instance_template { #define MEGASAS_IS_LOGICAL(sdev) \ ((sdev->channel < MEGASAS_MAX_PD_CHANNELS) ? 0 : 1) +#define MEGASAS_IS_LUN_VALID(sdev) \ + (((sdev)->lun == 0) ? 1 : 0) + #define MEGASAS_DEV_INDEX(scp) \ (((scp->device->channel % 2) * MEGASAS_MAX_DEV_PER_CHANNEL) + \ scp->device->id) diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c index 1c86b67476f0..718160ca66b3 100644 --- a/drivers/scsi/megaraid/megaraid_sas_base.c +++ b/drivers/scsi/megaraid/megaraid_sas_base.c @@ -2116,6 +2116,9 @@ static int megasas_slave_alloc(struct scsi_device *sdev) goto scan_target; } return -ENXIO; + } else if (!MEGASAS_IS_LUN_VALID(sdev)) { + sdev_printk(KERN_INFO, sdev, "%s: invalid LUN\n", __func__); + return -ENXIO; } scan_target: @@ -2146,6 +2149,10 @@ static void megasas_slave_destroy(struct scsi_device *sdev) instance = megasas_lookup_instance(sdev->host->host_no); if (MEGASAS_IS_LOGICAL(sdev)) { + if (!MEGASAS_IS_LUN_VALID(sdev)) { + sdev_printk(KERN_INFO, sdev, "%s: invalid LUN\n", __func__); + return; + } ld_tgt_id = MEGASAS_TARGET_ID(sdev); instance->ld_tgtid_status[ld_tgt_id] = LD_TARGET_ID_DELETED; if (megasas_dbg_lvl & LD_PD_DEBUG) -- Gitee From f13ab67744d7dd85de295d2fb71fb9285c589b73 Mon Sep 17 00:00:00 2001 From: Duoming Zhou Date: Tue, 19 Jul 2022 16:31:19 +0800 Subject: [PATCH 440/674] drivers: net: slip: fix NPD bug in sl_tx_timeout() stable inclusion from stable-v5.10.112 commit ca24c5e8f0ac3d43ec0cff29e1c861be73aff165 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=ca24c5e8f0ac3d43ec0cff29e1c861be73aff165 -------------------------------- [ Upstream commit ec4eb8a86ade4d22633e1da2a7d85a846b7d1798 ] When a slip driver is detaching, the slip_close() will act to cleanup necessary resources and sl->tty is set to NULL in slip_close(). Meanwhile, the packet we transmit is blocked, sl_tx_timeout() will be called. Although slip_close() and sl_tx_timeout() use sl->lock to synchronize, we don`t judge whether sl->tty equals to NULL in sl_tx_timeout() and the null pointer dereference bug will happen. (Thread 1) | (Thread 2) | slip_close() | spin_lock_bh(&sl->lock) | ... ... | sl->tty = NULL //(1) sl_tx_timeout() | spin_unlock_bh(&sl->lock) spin_lock(&sl->lock); | ... | ... tty_chars_in_buffer(sl->tty)| if (tty->ops->..) //(2) | ... | synchronize_rcu() We set NULL to sl->tty in position (1) and dereference sl->tty in position (2). This patch adds check in sl_tx_timeout(). If sl->tty equals to NULL, sl_tx_timeout() will goto out. Signed-off-by: Duoming Zhou Reviewed-by: Jiri Slaby Link: https://lore.kernel.org/r/20220405132206.55291-1-duoming@zju.edu.cn Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/net/slip/slip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/slip/slip.c b/drivers/net/slip/slip.c index f81fb0b13a94..369bd30fed35 100644 --- a/drivers/net/slip/slip.c +++ b/drivers/net/slip/slip.c @@ -468,7 +468,7 @@ static void sl_tx_timeout(struct net_device *dev, unsigned int txqueue) spin_lock(&sl->lock); if (netif_queue_stopped(dev)) { - if (!netif_running(dev)) + if (!netif_running(dev) || !sl->tty) goto out; /* May be we must check transmitter timeout here ? -- Gitee From de06824ed8a1447dcaa55c1689369b3d58926652 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 19 Jul 2022 16:31:20 +0800 Subject: [PATCH 441/674] perf/imx_ddr: Fix undefined behavior due to shift overflowing the constant MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from stable-v5.10.112 commit 000b3921b4d52399865ea398e5f6c999246437f4 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=000b3921b4d52399865ea398e5f6c999246437f4 -------------------------------- [ Upstream commit d02b4dd84e1a90f7f1444d027c0289bf355b0d5a ] Fix: In file included from :0:0: In function ‘ddr_perf_counter_enable’, inlined from ‘ddr_perf_irq_handler’ at drivers/perf/fsl_imx8_ddr_perf.c:651:2: ././include/linux/compiler_types.h:352:38: error: call to ‘__compiletime_assert_729’ \ declared with attribute error: FIELD_PREP: mask is not constant _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__) ... See https://lore.kernel.org/r/YkwQ6%2BtIH8GQpuct@zn.tnic for the gory details as to why it triggers with older gccs only. Signed-off-by: Borislav Petkov Cc: Frank Li Cc: Will Deacon Cc: Mark Rutland Cc: Shawn Guo Cc: Sascha Hauer Cc: Pengutronix Kernel Team Cc: Fabio Estevam Cc: NXP Linux Team Cc: linux-arm-kernel@lists.infradead.org Acked-by: Will Deacon Link: https://lore.kernel.org/r/20220405151517.29753-10-bp@alien8.de Signed-off-by: Will Deacon Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/perf/fsl_imx8_ddr_perf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/perf/fsl_imx8_ddr_perf.c b/drivers/perf/fsl_imx8_ddr_perf.c index 7f7bc0993670..e09bbf3890c4 100644 --- a/drivers/perf/fsl_imx8_ddr_perf.c +++ b/drivers/perf/fsl_imx8_ddr_perf.c @@ -29,7 +29,7 @@ #define CNTL_OVER_MASK 0xFFFFFFFE #define CNTL_CSV_SHIFT 24 -#define CNTL_CSV_MASK (0xFF << CNTL_CSV_SHIFT) +#define CNTL_CSV_MASK (0xFFU << CNTL_CSV_SHIFT) #define EVENT_CYCLES_ID 0 #define EVENT_CYCLES_COUNTER 0 -- Gitee From 4558f170f76cde124ffd2ac7f038a37f786a40c4 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 19 Jul 2022 16:31:21 +0800 Subject: [PATCH 442/674] mm, page_alloc: fix build_zonerefs_node() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from stable-v5.10.112 commit 192e507ef894b558f51e12ae87e938a959b7dfb6 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=192e507ef894b558f51e12ae87e938a959b7dfb6 -------------------------------- commit e553f62f10d93551eb883eca227ac54d1a4fad84 upstream. Since commit 6aa303defb74 ("mm, vmscan: only allocate and reclaim from zones with pages managed by the buddy allocator") only zones with free memory are included in a built zonelist. This is problematic when e.g. all memory of a zone has been ballooned out when zonelists are being rebuilt. The decision whether to rebuild the zonelists when onlining new memory is done based on populated_zone() returning 0 for the zone the memory will be added to. The new zone is added to the zonelists only, if it has free memory pages (managed_zone() returns a non-zero value) after the memory has been onlined. This implies, that onlining memory will always free the added pages to the allocator immediately, but this is not true in all cases: when e.g. running as a Xen guest the onlined new memory will be added only to the ballooned memory list, it will be freed only when the guest is being ballooned up afterwards. Another problem with using managed_zone() for the decision whether a zone is being added to the zonelists is, that a zone with all memory used will in fact be removed from all zonelists in case the zonelists happen to be rebuilt. Use populated_zone() when building a zonelist as it has been done before that commit. There was a report that QubesOS (based on Xen) is hitting this problem. Xen has switched to use the zone device functionality in kernel 5.9 and QubesOS wants to use memory hotplugging for guests in order to be able to start a guest with minimal memory and expand it as needed. This was the report leading to the patch. Link: https://lkml.kernel.org/r/20220407120637.9035-1-jgross@suse.com Fixes: 6aa303defb74 ("mm, vmscan: only allocate and reclaim from zones with pages managed by the buddy allocator") Signed-off-by: Juergen Gross Reported-by: Marek Marczykowski-Górecki Acked-by: Michal Hocko Acked-by: David Hildenbrand Cc: Marek Marczykowski-Górecki Reviewed-by: Wei Yang Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c81ff36f4121..ec73cca1726c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5895,7 +5895,7 @@ static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs) do { zone_type--; zone = pgdat->node_zones + zone_type; - if (managed_zone(zone)) { + if (populated_zone(zone)) { zoneref_set_zone(zone, &zonerefs[nr_zones++]); check_highest_zone(zone_type); } -- Gitee From 4549c05d8c359e00b5a7d383b0a3c9a3a124b776 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 19 Jul 2022 16:31:22 +0800 Subject: [PATCH 443/674] mm: fix unexpected zeroed page mapping with zram swap stable inclusion from stable-v5.10.112 commit 20ed94f8181a25212e7404e44958e234f407624b category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=20ed94f8181a25212e7404e44958e234f407624b -------------------------------- commit e914d8f00391520ecc4495dd0ca0124538ab7119 upstream. Two processes under CLONE_VM cloning, user process can be corrupted by seeing zeroed page unexpectedly. CPU A CPU B do_swap_page do_swap_page SWP_SYNCHRONOUS_IO path SWP_SYNCHRONOUS_IO path swap_readpage valid data swap_slot_free_notify delete zram entry swap_readpage zeroed(invalid) data pte_lock map the *zero data* to userspace pte_unlock pte_lock if (!pte_same) goto out_nomap; pte_unlock return and next refault will read zeroed data The swap_slot_free_notify is bogus for CLONE_VM case since it doesn't increase the refcount of swap slot at copy_mm so it couldn't catch up whether it's safe or not to discard data from backing device. In the case, only the lock it could rely on to synchronize swap slot freeing is page table lock. Thus, this patch gets rid of the swap_slot_free_notify function. With this patch, CPU A will see correct data. CPU A CPU B do_swap_page do_swap_page SWP_SYNCHRONOUS_IO path SWP_SYNCHRONOUS_IO path swap_readpage original data pte_lock map the original data swap_free swap_range_free bd_disk->fops->swap_slot_free_notify swap_readpage read zeroed data pte_unlock pte_lock if (!pte_same) goto out_nomap; pte_unlock return on next refault will see mapped data by CPU B The concern of the patch would increase memory consumption since it could keep wasted memory with compressed form in zram as well as uncompressed form in address space. However, most of cases of zram uses no readahead and do_swap_page is followed by swap_free so it will free the compressed form from in zram quickly. Link: https://lkml.kernel.org/r/YjTVVxIAsnKAXjTd@google.com Fixes: 0bcac06f27d7 ("mm, swap: skip swapcache for swapin of synchronous device") Reported-by: Ivan Babrou Tested-by: Ivan Babrou Signed-off-by: Minchan Kim Cc: Nitin Gupta Cc: Sergey Senozhatsky Cc: Jens Axboe Cc: David Hildenbrand Cc: [4.14+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- mm/page_io.c | 54 ---------------------------------------------------- 1 file changed, 54 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index 21f3160d39a8..ee28c39e566e 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -69,54 +69,6 @@ void end_swap_bio_write(struct bio *bio) bio_put(bio); } -static void swap_slot_free_notify(struct page *page) -{ - struct swap_info_struct *sis; - struct gendisk *disk; - swp_entry_t entry; - - /* - * There is no guarantee that the page is in swap cache - the software - * suspend code (at least) uses end_swap_bio_read() against a non- - * swapcache page. So we must check PG_swapcache before proceeding with - * this optimization. - */ - if (unlikely(!PageSwapCache(page))) - return; - - sis = page_swap_info(page); - if (data_race(!(sis->flags & SWP_BLKDEV))) - return; - - /* - * The swap subsystem performs lazy swap slot freeing, - * expecting that the page will be swapped out again. - * So we can avoid an unnecessary write if the page - * isn't redirtied. - * This is good for real swap storage because we can - * reduce unnecessary I/O and enhance wear-leveling - * if an SSD is used as the as swap device. - * But if in-memory swap device (eg zram) is used, - * this causes a duplicated copy between uncompressed - * data in VM-owned memory and compressed data in - * zram-owned memory. So let's free zram-owned memory - * and make the VM-owned decompressed page *dirty*, - * so the page should be swapped out somewhere again if - * we again wish to reclaim it. - */ - disk = sis->bdev->bd_disk; - entry.val = page_private(page); - if (disk->fops->swap_slot_free_notify && __swap_count(entry) == 1) { - unsigned long offset; - - offset = swp_offset(entry); - - SetPageDirty(page); - disk->fops->swap_slot_free_notify(sis->bdev, - offset); - } -} - static void end_swap_bio_read(struct bio *bio) { struct page *page = bio_first_page_all(bio); @@ -132,7 +84,6 @@ static void end_swap_bio_read(struct bio *bio) } SetPageUptodate(page); - swap_slot_free_notify(page); out: unlock_page(page); WRITE_ONCE(bio->bi_private, NULL); @@ -411,11 +362,6 @@ int swap_readpage(struct page *page, bool synchronous) if (sis->flags & SWP_SYNCHRONOUS_IO) { ret = bdev_read_page(sis->bdev, swap_page_sector(page), page); if (!ret) { - if (trylock_page(page)) { - swap_slot_free_notify(page); - unlock_page(page); - } - count_vm_event(PSWPIN); goto out; } -- Gitee From cad9c6583ef4ffd342a40b344ce38dc0b5f1de17 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Tue, 19 Jul 2022 16:31:23 +0800 Subject: [PATCH 444/674] mm: kmemleak: take a full lowmem check in kmemleak_*_phys() stable inclusion from stable-v5.10.112 commit 06c348fde545ec90e25de3e5bc4b814bff70ae9f category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=06c348fde545ec90e25de3e5bc4b814bff70ae9f -------------------------------- commit 23c2d497de21f25898fbea70aeb292ab8acc8c94 upstream. The kmemleak_*_phys() apis do not check the address for lowmem's min boundary, while the caller may pass an address below lowmem, which will trigger an oops: # echo scan > /sys/kernel/debug/kmemleak Unable to handle kernel paging request at virtual address ff5fffffffe00000 Oops [#1] Modules linked in: CPU: 2 PID: 134 Comm: bash Not tainted 5.18.0-rc1-next-20220407 #33 Hardware name: riscv-virtio,qemu (DT) epc : scan_block+0x74/0x15c ra : scan_block+0x72/0x15c epc : ffffffff801e5806 ra : ffffffff801e5804 sp : ff200000104abc30 gp : ffffffff815cd4e8 tp : ff60000004cfa340 t0 : 0000000000000200 t1 : 00aaaaaac23954cc t2 : 00000000000003ff s0 : ff200000104abc90 s1 : ffffffff81b0ff28 a0 : 0000000000000000 a1 : ff5fffffffe01000 a2 : ffffffff81b0ff28 a3 : 0000000000000002 a4 : 0000000000000001 a5 : 0000000000000000 a6 : ff200000104abd7c a7 : 0000000000000005 s2 : ff5fffffffe00ff9 s3 : ffffffff815cd998 s4 : ffffffff815d0e90 s5 : ffffffff81b0ff28 s6 : 0000000000000020 s7 : ffffffff815d0eb0 s8 : ffffffffffffffff s9 : ff5fffffffe00000 s10: ff5fffffffe01000 s11: 0000000000000022 t3 : 00ffffffaa17db4c t4 : 000000000000000f t5 : 0000000000000001 t6 : 0000000000000000 status: 0000000000000100 badaddr: ff5fffffffe00000 cause: 000000000000000d scan_gray_list+0x12e/0x1a6 kmemleak_scan+0x2aa/0x57e kmemleak_write+0x32a/0x40c full_proxy_write+0x56/0x82 vfs_write+0xa6/0x2a6 ksys_write+0x6c/0xe2 sys_write+0x22/0x2a ret_from_syscall+0x0/0x2 The callers may not quite know the actual address they pass(e.g. from devicetree). So the kmemleak_*_phys() apis should guarantee the address they finally use is in lowmem range, so check the address for lowmem's min boundary. Link: https://lkml.kernel.org/r/20220413122925.33856-1-patrick.wang.shcn@gmail.com Signed-off-by: Patrick Wang Acked-by: Catalin Marinas Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- mm/kmemleak.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 90eb82299149..118be6533374 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1124,7 +1124,7 @@ EXPORT_SYMBOL(kmemleak_no_scan); void __ref kmemleak_alloc_phys(phys_addr_t phys, size_t size, int min_count, gfp_t gfp) { - if (!IS_ENABLED(CONFIG_HIGHMEM) || PHYS_PFN(phys) < max_low_pfn) + if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn) kmemleak_alloc(__va(phys), size, min_count, gfp); } EXPORT_SYMBOL(kmemleak_alloc_phys); @@ -1138,7 +1138,7 @@ EXPORT_SYMBOL(kmemleak_alloc_phys); */ void __ref kmemleak_free_part_phys(phys_addr_t phys, size_t size) { - if (!IS_ENABLED(CONFIG_HIGHMEM) || PHYS_PFN(phys) < max_low_pfn) + if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn) kmemleak_free_part(__va(phys), size); } EXPORT_SYMBOL(kmemleak_free_part_phys); @@ -1150,7 +1150,7 @@ EXPORT_SYMBOL(kmemleak_free_part_phys); */ void __ref kmemleak_not_leak_phys(phys_addr_t phys) { - if (!IS_ENABLED(CONFIG_HIGHMEM) || PHYS_PFN(phys) < max_low_pfn) + if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn) kmemleak_not_leak(__va(phys)); } EXPORT_SYMBOL(kmemleak_not_leak_phys); @@ -1162,7 +1162,7 @@ EXPORT_SYMBOL(kmemleak_not_leak_phys); */ void __ref kmemleak_ignore_phys(phys_addr_t phys) { - if (!IS_ENABLED(CONFIG_HIGHMEM) || PHYS_PFN(phys) < max_low_pfn) + if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn) kmemleak_ignore(__va(phys)); } EXPORT_SYMBOL(kmemleak_ignore_phys); -- Gitee From 85fc72e3a63a7017a2c50040f40a96baab16577d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 19 Jul 2022 16:31:24 +0800 Subject: [PATCH 445/674] KVM: x86/mmu: Resolve nx_huge_pages when kvm.ko is loaded stable inclusion from stable-v5.10.112 commit 342454231ee5f2c2782f5510cab2e7a968486fef category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=342454231ee5f2c2782f5510cab2e7a968486fef -------------------------------- commit 1d0e84806047f38027d7572adb4702ef7c09b317 upstream. Resolve nx_huge_pages to true/false when kvm.ko is loaded, leaving it as -1 is technically undefined behavior when its value is read out by param_get_bool(), as boolean values are supposed to be '0' or '1'. Alternatively, KVM could define a custom getter for the param, but the auto value doesn't depend on the vendor module in any way, and printing "auto" would be unnecessarily unfriendly to the user. In addition to fixing the undefined behavior, resolving the auto value also fixes the scenario where the auto value resolves to N and no vendor module is loaded. Previously, -1 would result in Y being printed even though KVM would ultimately disable the mitigation. Rename the existing MMU module init/exit helpers to clarify that they're invoked with respect to the vendor module, and add comments to document why KVM has two separate "module init" flows. ========================================================================= UBSAN: invalid-load in kernel/params.c:320:33 load of value 255 is not a valid value for type '_Bool' CPU: 6 PID: 892 Comm: tail Not tainted 5.17.0-rc3+ #799 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 Call Trace: dump_stack_lvl+0x34/0x44 ubsan_epilogue+0x5/0x40 __ubsan_handle_load_invalid_value.cold+0x43/0x48 param_get_bool.cold+0xf/0x14 param_attr_show+0x55/0x80 module_attr_show+0x1c/0x30 sysfs_kf_seq_show+0x93/0xc0 seq_read_iter+0x11c/0x450 new_sync_read+0x11b/0x1a0 vfs_read+0xf0/0x190 ksys_read+0x5f/0xe0 do_syscall_64+0x3b/0xc0 entry_SYSCALL_64_after_hwframe+0x44/0xae ========================================================================= Fixes: b8e8c8303ff2 ("kvm: mmu: ITLB_MULTIHIT mitigation") Cc: stable@vger.kernel.org Reported-by: Bruno Goncalves Reported-by: Jan Stancek Signed-off-by: Sean Christopherson Message-Id: <20220331221359.3912754-1-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- arch/x86/include/asm/kvm_host.h | 5 +++-- arch/x86/kvm/mmu/mmu.c | 20 ++++++++++++++++---- arch/x86/kvm/x86.c | 20 ++++++++++++++++++-- 3 files changed, 37 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 559063fd8907..3d9da4629325 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1360,8 +1360,9 @@ static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm) return -ENOTSUPP; } -int kvm_mmu_module_init(void); -void kvm_mmu_module_exit(void); +void kvm_mmu_x86_module_init(void); +int kvm_mmu_vendor_module_init(void); +void kvm_mmu_vendor_module_exit(void); void kvm_mmu_destroy(struct kvm_vcpu *vcpu); int kvm_mmu_create(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 9506cfcb86be..99ae11011ed4 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5878,12 +5878,24 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp) return 0; } -int kvm_mmu_module_init(void) +/* + * nx_huge_pages needs to be resolved to true/false when kvm.ko is loaded, as + * its default value of -1 is technically undefined behavior for a boolean. + */ +void kvm_mmu_x86_module_init(void) { - int ret = -ENOMEM; - if (nx_huge_pages == -1) __set_nx_huge_pages(get_nx_auto_mode()); +} + +/* + * The bulk of the MMU initialization is deferred until the vendor module is + * loaded as many of the masks/values may be modified by VMX or SVM, i.e. need + * to be reset when a potentially different vendor module is loaded. + */ +int kvm_mmu_vendor_module_init(void) +{ + int ret = -ENOMEM; /* * MMU roles use union aliasing which is, generally speaking, an @@ -5957,7 +5969,7 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu) mmu_free_memory_caches(vcpu); } -void kvm_mmu_module_exit(void) +void kvm_mmu_vendor_module_exit(void) { mmu_destroy_caches(); percpu_counter_destroy(&kvm_total_used_mmu_pages); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1c45d9bafe0a..e43760895eec 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8098,7 +8098,7 @@ int kvm_arch_init(void *opaque) goto out_free_x86_emulator_cache; } - r = kvm_mmu_module_init(); + r = kvm_mmu_vendor_module_init(); if (r) goto out_free_percpu; @@ -8158,7 +8158,7 @@ void kvm_arch_exit(void) cancel_work_sync(&pvclock_gtod_work); #endif kvm_x86_ops.hardware_enable = NULL; - kvm_mmu_module_exit(); + kvm_mmu_vendor_module_exit(); free_percpu(user_return_msrs); kmem_cache_destroy(x86_emulator_cache); kmem_cache_destroy(x86_fpu_cache); @@ -11549,3 +11549,19 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_update_request); + +static int __init kvm_x86_init(void) +{ + kvm_mmu_x86_module_init(); + return 0; +} +module_init(kvm_x86_init); + +static void __exit kvm_x86_exit(void) +{ + /* + * If module_init() is implemented, module_exit() must also be + * implemented to allow module unload. + */ +} +module_exit(kvm_x86_exit); -- Gitee From 27c1f5f2de382058cca4a54c75348162d4d1b901 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 19 Jul 2022 16:31:25 +0800 Subject: [PATCH 446/674] memory: renesas-rpc-if: fix platform-device leak in error path stable inclusion from stable-v5.10.112 commit c089ffc846c85f200db34ad208338f4f81a6d82d category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=c089ffc846c85f200db34ad208338f4f81a6d82d -------------------------------- commit b452dbf24d7d9a990d70118462925f6ee287d135 upstream. Make sure to free the flash platform device in the event that registration fails during probe. Fixes: ca7d8b980b67 ("memory: add Renesas RPC-IF driver") Cc: stable@vger.kernel.org # 5.8 Cc: Sergei Shtylyov Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20220303180632.3194-1-johan@kernel.org Signed-off-by: Krzysztof Kozlowski Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/memory/renesas-rpc-if.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/memory/renesas-rpc-if.c b/drivers/memory/renesas-rpc-if.c index 9019121a80f5..781af51e3f79 100644 --- a/drivers/memory/renesas-rpc-if.c +++ b/drivers/memory/renesas-rpc-if.c @@ -592,6 +592,7 @@ static int rpcif_probe(struct platform_device *pdev) struct platform_device *vdev; struct device_node *flash; const char *name; + int ret; flash = of_get_next_child(pdev->dev.of_node, NULL); if (!flash) { @@ -615,7 +616,14 @@ static int rpcif_probe(struct platform_device *pdev) return -ENOMEM; vdev->dev.parent = &pdev->dev; platform_set_drvdata(pdev, vdev); - return platform_device_add(vdev); + + ret = platform_device_add(vdev); + if (ret) { + platform_device_put(vdev); + return ret; + } + + return 0; } static int rpcif_remove(struct platform_device *pdev) -- Gitee From 2722edfe52ea5b7ed9a6a2f2a818ba5d8f2f518e Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Tue, 19 Jul 2022 16:31:26 +0800 Subject: [PATCH 447/674] gcc-plugins: latent_entropy: use /dev/urandom stable inclusion from stable-v5.10.112 commit cc21ae932656483b07982afcec7e38a5bd6acc0c category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=cc21ae932656483b07982afcec7e38a5bd6acc0c -------------------------------- commit c40160f2998c897231f8454bf797558d30a20375 upstream. While the latent entropy plugin mostly doesn't derive entropy from get_random_const() for measuring the call graph, when __latent_entropy is applied to a constant, then it's initialized statically to output from get_random_const(). In that case, this data is derived from a 64-bit seed, which means a buffer of 512 bits doesn't really have that amount of compile-time entropy. This patch fixes that shortcoming by just buffering chunks of /dev/urandom output and doling it out as requested. At the same time, it's important that we don't break the use of -frandom-seed, for people who want the runtime benefits of the latent entropy plugin, while still having compile-time determinism. In that case, we detect whether gcc's set_random_seed() has been called by making a call to get_random_seed(noinit=true) in the plugin init function, which is called after set_random_seed() is called but before anything that calls get_random_seed(noinit=false), and seeing if it's zero or not. If it's not zero, we're in deterministic mode, and so we just generate numbers with a basic xorshift prng. Note that we don't detect if -frandom-seed is being used using the documented local_tick variable, because it's assigned via: local_tick = (unsigned) tv.tv_sec * 1000 + tv.tv_usec / 1000; which may well overflow and become -1 on its own, and so isn't reliable: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105171 [kees: The 256 byte rnd_buf size was chosen based on average (250), median (64), and std deviation (575) bytes of used entropy for a defconfig x86_64 build] Fixes: 38addce8b600 ("gcc-plugins: Add latent_entropy plugin") Cc: stable@vger.kernel.org Cc: PaX Team Signed-off-by: Jason A. Donenfeld Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220405222815.21155-1-Jason@zx2c4.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- scripts/gcc-plugins/latent_entropy_plugin.c | 44 +++++++++++++-------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/scripts/gcc-plugins/latent_entropy_plugin.c b/scripts/gcc-plugins/latent_entropy_plugin.c index cbe1d6c4b1a5..c84bef1d2895 100644 --- a/scripts/gcc-plugins/latent_entropy_plugin.c +++ b/scripts/gcc-plugins/latent_entropy_plugin.c @@ -86,25 +86,31 @@ static struct plugin_info latent_entropy_plugin_info = { .help = "disable\tturn off latent entropy instrumentation\n", }; -static unsigned HOST_WIDE_INT seed; -/* - * get_random_seed() (this is a GCC function) generates the seed. - * This is a simple random generator without any cryptographic security because - * the entropy doesn't come from here. - */ +static unsigned HOST_WIDE_INT deterministic_seed; +static unsigned HOST_WIDE_INT rnd_buf[32]; +static size_t rnd_idx = ARRAY_SIZE(rnd_buf); +static int urandom_fd = -1; + static unsigned HOST_WIDE_INT get_random_const(void) { - unsigned int i; - unsigned HOST_WIDE_INT ret = 0; - - for (i = 0; i < 8 * sizeof(ret); i++) { - ret = (ret << 1) | (seed & 1); - seed >>= 1; - if (ret & 1) - seed ^= 0xD800000000000000ULL; + if (deterministic_seed) { + unsigned HOST_WIDE_INT w = deterministic_seed; + w ^= w << 13; + w ^= w >> 7; + w ^= w << 17; + deterministic_seed = w; + return deterministic_seed; } - return ret; + if (urandom_fd < 0) { + urandom_fd = open("/dev/urandom", O_RDONLY); + gcc_assert(urandom_fd >= 0); + } + if (rnd_idx >= ARRAY_SIZE(rnd_buf)) { + gcc_assert(read(urandom_fd, rnd_buf, sizeof(rnd_buf)) == sizeof(rnd_buf)); + rnd_idx = 0; + } + return rnd_buf[rnd_idx++]; } static tree tree_get_random_const(tree type) @@ -549,8 +555,6 @@ static void latent_entropy_start_unit(void *gcc_data __unused, tree type, id; int quals; - seed = get_random_seed(false); - if (in_lto_p) return; @@ -585,6 +589,12 @@ __visible int plugin_init(struct plugin_name_args *plugin_info, const struct plugin_argument * const argv = plugin_info->argv; int i; + /* + * Call get_random_seed() with noinit=true, so that this returns + * 0 in the case where no seed has been passed via -frandom-seed. + */ + deterministic_seed = get_random_seed(true); + static const struct ggc_root_tab gt_ggc_r_gt_latent_entropy[] = { { .base = &latent_entropy_decl, -- Gitee From 7fe75e370e687ab849938c7c32f3960cb83eda28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Tue, 19 Jul 2022 16:31:27 +0800 Subject: [PATCH 448/674] ath9k: Properly clear TX status area before reporting to mac80211 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from stable-v5.10.112 commit 0f65cedae5009ad004dde18e66694615adfc7365 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=0f65cedae5009ad004dde18e66694615adfc7365 -------------------------------- commit 037250f0a45cf9ecf5b52d4b9ff8eadeb609c800 upstream. The ath9k driver was not properly clearing the status area in the ieee80211_tx_info struct before reporting TX status to mac80211. Instead, it was manually filling in fields, which meant that fields introduced later were left as-is. Conveniently, mac80211 actually provides a helper to zero out the status area, so use that to make sure we zero everything. The last commit touching the driver function writing the status information seems to have actually been fixing an issue that was also caused by the area being uninitialised; but it only added clearing of a single field instead of the whole struct. That is now redundant, though, so revert that commit and use it as a convenient Fixes tag. Fixes: cc591d77aba1 ("ath9k: Make sure to zero status.tx_time before reporting TX status") Reported-by: Bagas Sanjaya Cc: Signed-off-by: Toke Høiland-Jørgensen Tested-by: Bagas Sanjaya Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20220330164409.16645-1-toke@toke.dk Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/net/wireless/ath/ath9k/xmit.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/ath/ath9k/xmit.c b/drivers/net/wireless/ath/ath9k/xmit.c index 5691bd6eb82c..d9ece2090a1d 100644 --- a/drivers/net/wireless/ath/ath9k/xmit.c +++ b/drivers/net/wireless/ath/ath9k/xmit.c @@ -2512,6 +2512,8 @@ static void ath_tx_rc_status(struct ath_softc *sc, struct ath_buf *bf, struct ath_hw *ah = sc->sc_ah; u8 i, tx_rateindex; + ieee80211_tx_info_clear_status(tx_info); + if (txok) tx_info->status.ack_signal = ts->ts_rssi; @@ -2554,9 +2556,6 @@ static void ath_tx_rc_status(struct ath_softc *sc, struct ath_buf *bf, } tx_info->status.rates[tx_rateindex].count = ts->ts_longretry + 1; - - /* we report airtime in ath_tx_count_airtime(), don't report twice */ - tx_info->status.tx_time = 0; } static void ath_tx_processq(struct ath_softc *sc, struct ath_txq *txq) -- Gitee From 0d73660552f81e9219575f5cf9b39d28aaf13ad6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Tue, 19 Jul 2022 16:31:28 +0800 Subject: [PATCH 449/674] ath9k: Fix usage of driver-private space in tx_info MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from stable-v5.10.112 commit 9b7ec35253c9d23f3471e8ae6e7a956d86bb2a2c category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=9b7ec35253c9d23f3471e8ae6e7a956d86bb2a2c -------------------------------- commit 5a6b06f5927c940fa44026695779c30b7536474c upstream. The ieee80211_tx_info_clear_status() helper also clears the rate counts and the driver-private part of struct ieee80211_tx_info, so using it breaks quite a few other things. So back out of using it, and instead define a ath-internal helper that only clears the area between the status_driver_data and the rates info. Combined with moving the ath_frame_info struct to status_driver_data, this avoids clearing anything we shouldn't be, and so we can keep the existing code for handling the rate information. While fixing this I also noticed that the setting of tx_info->status.rates[tx_rateindex].count on hardware underrun errors was always immediately overridden by the normal setting of the same fields, so rearrange the code so that the underrun detection actually takes effect. The new helper could be generalised to a 'memset_between()' helper, but leave it as a driver-internal helper for now since this needs to go to stable. Cc: stable@vger.kernel.org Reported-by: Peter Seiderer Fixes: 037250f0a45c ("ath9k: Properly clear TX status area before reporting to mac80211") Signed-off-by: Toke Høiland-Jørgensen Reviewed-by: Peter Seiderer Tested-by: Peter Seiderer Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20220404204800.2681133-1-toke@toke.dk Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/net/wireless/ath/ath9k/main.c | 2 +- drivers/net/wireless/ath/ath9k/xmit.c | 30 ++++++++++++++++++--------- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/drivers/net/wireless/ath/ath9k/main.c b/drivers/net/wireless/ath/ath9k/main.c index af367696fd92..ac354dfc5055 100644 --- a/drivers/net/wireless/ath/ath9k/main.c +++ b/drivers/net/wireless/ath/ath9k/main.c @@ -839,7 +839,7 @@ static bool ath9k_txq_list_has_key(struct list_head *txq_list, u32 keyix) continue; txinfo = IEEE80211_SKB_CB(bf->bf_mpdu); - fi = (struct ath_frame_info *)&txinfo->rate_driver_data[0]; + fi = (struct ath_frame_info *)&txinfo->status.status_driver_data[0]; if (fi->keyix == keyix) return true; } diff --git a/drivers/net/wireless/ath/ath9k/xmit.c b/drivers/net/wireless/ath/ath9k/xmit.c index d9ece2090a1d..6555abf02f18 100644 --- a/drivers/net/wireless/ath/ath9k/xmit.c +++ b/drivers/net/wireless/ath/ath9k/xmit.c @@ -141,8 +141,8 @@ static struct ath_frame_info *get_frame_info(struct sk_buff *skb) { struct ieee80211_tx_info *tx_info = IEEE80211_SKB_CB(skb); BUILD_BUG_ON(sizeof(struct ath_frame_info) > - sizeof(tx_info->rate_driver_data)); - return (struct ath_frame_info *) &tx_info->rate_driver_data[0]; + sizeof(tx_info->status.status_driver_data)); + return (struct ath_frame_info *) &tx_info->status.status_driver_data[0]; } static void ath_send_bar(struct ath_atx_tid *tid, u16 seqno) @@ -2501,6 +2501,16 @@ static void ath_tx_complete_buf(struct ath_softc *sc, struct ath_buf *bf, spin_unlock_irqrestore(&sc->tx.txbuflock, flags); } +static void ath_clear_tx_status(struct ieee80211_tx_info *tx_info) +{ + void *ptr = &tx_info->status; + + memset(ptr + sizeof(tx_info->status.rates), 0, + sizeof(tx_info->status) - + sizeof(tx_info->status.rates) - + sizeof(tx_info->status.status_driver_data)); +} + static void ath_tx_rc_status(struct ath_softc *sc, struct ath_buf *bf, struct ath_tx_status *ts, int nframes, int nbad, int txok) @@ -2512,7 +2522,7 @@ static void ath_tx_rc_status(struct ath_softc *sc, struct ath_buf *bf, struct ath_hw *ah = sc->sc_ah; u8 i, tx_rateindex; - ieee80211_tx_info_clear_status(tx_info); + ath_clear_tx_status(tx_info); if (txok) tx_info->status.ack_signal = ts->ts_rssi; @@ -2528,6 +2538,13 @@ static void ath_tx_rc_status(struct ath_softc *sc, struct ath_buf *bf, tx_info->status.ampdu_len = nframes; tx_info->status.ampdu_ack_len = nframes - nbad; + tx_info->status.rates[tx_rateindex].count = ts->ts_longretry + 1; + + for (i = tx_rateindex + 1; i < hw->max_rates; i++) { + tx_info->status.rates[i].count = 0; + tx_info->status.rates[i].idx = -1; + } + if ((ts->ts_status & ATH9K_TXERR_FILT) == 0 && (tx_info->flags & IEEE80211_TX_CTL_NO_ACK) == 0) { /* @@ -2549,13 +2566,6 @@ static void ath_tx_rc_status(struct ath_softc *sc, struct ath_buf *bf, tx_info->status.rates[tx_rateindex].count = hw->max_rate_tries; } - - for (i = tx_rateindex + 1; i < hw->max_rates; i++) { - tx_info->status.rates[i].count = 0; - tx_info->status.rates[i].idx = -1; - } - - tx_info->status.rates[tx_rateindex].count = ts->ts_longretry + 1; } static void ath_tx_processq(struct ath_softc *sc, struct ath_txq *txq) -- Gitee From 0263aca6a297fa8aa203747352714924f2eca0cc Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Tue, 19 Jul 2022 16:31:29 +0800 Subject: [PATCH 450/674] btrfs: fix root ref counts in error handling in btrfs_get_root_ref stable inclusion from stable-v5.10.112 commit 1d2eda18f6ffbd9902594469c6e1a055014eb2ac category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=1d2eda18f6ffbd9902594469c6e1a055014eb2ac -------------------------------- commit 168a2f776b9762f4021421008512dd7ab7474df1 upstream. In btrfs_get_root_ref(), when btrfs_insert_fs_root() fails, btrfs_put_root() can happen for two reasons: - the root already exists in the tree, in that case it returns the reference obtained in btrfs_lookup_fs_root() - another error so the cleanup is done in the fail label Calling btrfs_put_root() unconditionally would lead to double decrement of the root reference possibly freeing it in the second case. Reported-by: TOTE Robot Fixes: bc44d7c4b2b1 ("btrfs: push btrfs_grab_fs_root into btrfs_get_fs_root") CC: stable@vger.kernel.org # 5.10+ Signed-off-by: Jia-Ju Bai Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- fs/btrfs/disk-io.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a5bcad027883..87e55b024ac2 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1596,9 +1596,10 @@ static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info, ret = btrfs_insert_fs_root(fs_info, root); if (ret) { - btrfs_put_root(root); - if (ret == -EEXIST) + if (ret == -EEXIST) { + btrfs_put_root(root); goto again; + } goto fail; } return root; -- Gitee From ee239ac33693cabb8d0dc9c9e140bd046b467d16 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 19 Jul 2022 16:31:30 +0800 Subject: [PATCH 451/674] btrfs: mark resumed async balance as writing stable inclusion from stable-v5.10.112 commit 5e4dd1799883941cd9590415a47d967909c8d2ac category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=5e4dd1799883941cd9590415a47d967909c8d2ac -------------------------------- commit a690e5f2db4d1dca742ce734aaff9f3112d63764 upstream. When btrfs balance is interrupted with umount, the background balance resumes on the next mount. There is a potential deadlock with FS freezing here like as described in commit 26559780b953 ("btrfs: zoned: mark relocation as writing"). Mark the process as sb_writing to avoid it. Reviewed-by: Filipe Manana CC: stable@vger.kernel.org # 4.9+ Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- fs/btrfs/volumes.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e462de991723..366d04763864 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4220,10 +4220,12 @@ static int balance_kthread(void *data) struct btrfs_fs_info *fs_info = data; int ret = 0; + sb_start_write(fs_info->sb); mutex_lock(&fs_info->balance_mutex); if (fs_info->balance_ctl) ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL); mutex_unlock(&fs_info->balance_mutex); + sb_end_write(fs_info->sb); return ret; } -- Gitee From 9fe75bc0d31f056bf8ed165bdbaf08125f2d65f0 Mon Sep 17 00:00:00 2001 From: Tim Crawford Date: Tue, 19 Jul 2022 16:31:31 +0800 Subject: [PATCH 452/674] ALSA: hda/realtek: Add quirk for Clevo PD50PNT stable inclusion from stable-v5.10.112 commit 163e162471308932c2f57f98a9a5fc25a8991247 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=163e162471308932c2f57f98a9a5fc25a8991247 -------------------------------- commit 9eb6f5c388060d8cef3c8b616cc31b765e022359 upstream. Fixes speaker output and headset detection on Clevo PD50PNT. Signed-off-by: Tim Crawford Cc: Link: https://lore.kernel.org/r/20220405182029.27431-1-tcrawford@system76.com Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index b886326ce9b9..ca0b8b9da577 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -2626,6 +2626,7 @@ static const struct snd_pci_quirk alc882_fixup_tbl[] = { SND_PCI_QUIRK(0x1558, 0x65e1, "Clevo PB51[ED][DF]", ALC1220_FIXUP_CLEVO_PB51ED_PINS), SND_PCI_QUIRK(0x1558, 0x65e5, "Clevo PC50D[PRS](?:-D|-G)?", ALC1220_FIXUP_CLEVO_PB51ED_PINS), SND_PCI_QUIRK(0x1558, 0x65f1, "Clevo PC50HS", ALC1220_FIXUP_CLEVO_PB51ED_PINS), + SND_PCI_QUIRK(0x1558, 0x65f5, "Clevo PD50PN[NRT]", ALC1220_FIXUP_CLEVO_PB51ED_PINS), SND_PCI_QUIRK(0x1558, 0x67d1, "Clevo PB71[ER][CDF]", ALC1220_FIXUP_CLEVO_PB51ED_PINS), SND_PCI_QUIRK(0x1558, 0x67e1, "Clevo PB71[DE][CDF]", ALC1220_FIXUP_CLEVO_PB51ED_PINS), SND_PCI_QUIRK(0x1558, 0x67e5, "Clevo PC70D[PRS](?:-D|-G)?", ALC1220_FIXUP_CLEVO_PB51ED_PINS), -- Gitee From 393019e7734575f237cf27208be878e6c32df16b Mon Sep 17 00:00:00 2001 From: Tao Jin Date: Tue, 19 Jul 2022 16:31:32 +0800 Subject: [PATCH 453/674] ALSA: hda/realtek: add quirk for Lenovo Thinkpad X12 speakers stable inclusion from stable-v5.10.112 commit 48d070ca5e7e015b7aa24d355656037863247910 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=48d070ca5e7e015b7aa24d355656037863247910 -------------------------------- commit 264fb03497ec1c7841bba872571bcd11beed57a7 upstream. For this specific device on Lenovo Thinkpad X12 tablet, the verbs were dumped by qemu running a guest OS that init this codec properly. After studying the dump, it turns out that the same quirk used by the other Lenovo devices can be reused. The patch was tested working against the mainline kernel. Cc: Signed-off-by: Tao Jin Link: https://lore.kernel.org/r/CO6PR03MB6241CD73310B37858FE64C85E1E89@CO6PR03MB6241.namprd03.prod.outlook.com Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index ca0b8b9da577..11d653190e6e 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -8995,6 +8995,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x505d, "Thinkpad", ALC298_FIXUP_TPT470_DOCK), SND_PCI_QUIRK(0x17aa, 0x505f, "Thinkpad", ALC298_FIXUP_TPT470_DOCK), SND_PCI_QUIRK(0x17aa, 0x5062, "Thinkpad", ALC298_FIXUP_TPT470_DOCK), + SND_PCI_QUIRK(0x17aa, 0x508b, "Thinkpad X12 Gen 1", ALC287_FIXUP_LEGION_15IMHG05_SPEAKERS), SND_PCI_QUIRK(0x17aa, 0x5109, "Thinkpad", ALC269_FIXUP_LIMIT_INT_MIC_BOOST), SND_PCI_QUIRK(0x17aa, 0x511e, "Thinkpad", ALC298_FIXUP_TPT470_DOCK), SND_PCI_QUIRK(0x17aa, 0x511f, "Thinkpad", ALC298_FIXUP_TPT470_DOCK), -- Gitee From 5f9cced69272c308e918adc6bc33f99691b4ca2e Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Tue, 19 Jul 2022 16:31:33 +0800 Subject: [PATCH 454/674] ALSA: pcm: Test for "silence" field in struct "pcm_format_data" stable inclusion from stable-v5.10.112 commit 912797e54c99a98f0722f21313e13a3938bb6dba category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=912797e54c99a98f0722f21313e13a3938bb6dba -------------------------------- commit 2f7a26abb8241a0208c68d22815aa247c5ddacab upstream. Syzbot reports "KASAN: null-ptr-deref Write in snd_pcm_format_set_silence".[1] It is due to missing validation of the "silence" field of struct "pcm_format_data" in "pcm_formats" array. Add a test for valid "pat" and, if it is not so, return -EINVAL. [1] https://lore.kernel.org/lkml/000000000000d188ef05dc2c7279@google.com/ Reported-and-tested-by: syzbot+205eb15961852c2c5974@syzkaller.appspotmail.com Signed-off-by: Fabio M. De Francesco Cc: Link: https://lore.kernel.org/r/20220409012655.9399-1-fmdefrancesco@gmail.com Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- sound/core/pcm_misc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/core/pcm_misc.c b/sound/core/pcm_misc.c index 257d412eac5d..30f0f96e0000 100644 --- a/sound/core/pcm_misc.c +++ b/sound/core/pcm_misc.c @@ -429,7 +429,7 @@ int snd_pcm_format_set_silence(snd_pcm_format_t format, void *data, unsigned int return 0; width = pcm_formats[(INT)format].phys; /* physical width */ pat = pcm_formats[(INT)format].silence; - if (! width) + if (!width || !pat) return -EINVAL; /* signed or 1 byte data */ if (pcm_formats[(INT)format].signd == 1 || width <= 8) { -- Gitee From bc08c8d7dc05ab89ef8a64a206f12fd80b1b681d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 19 Jul 2022 16:31:34 +0800 Subject: [PATCH 455/674] nl80211: correctly check NL80211_ATTR_REG_ALPHA2 size stable inclusion from stable-v5.10.112 commit 659214603bf26d2699039099cad1bea5b13bcae0 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=659214603bf26d2699039099cad1bea5b13bcae0 -------------------------------- commit 6624bb34b4eb19f715db9908cca00122748765d7 upstream. We need this to be at least two bytes, so we can access alpha2[0] and alpha2[1]. It may be three in case some userspace used NUL-termination since it was NLA_STRING (and we also push it out with NUL-termination). Cc: stable@vger.kernel.org Reported-by: Lee Jones Link: https://lore.kernel.org/r/20220411114201.fd4a31f06541.Ie7ff4be2cf348d8cc28ed0d626fc54becf7ea799@changeid Signed-off-by: Johannes Berg Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- net/wireless/nl80211.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 0df8b9a19952..12f44ad4e0d8 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -475,7 +475,8 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { .len = IEEE80211_MAX_MESH_ID_LEN }, [NL80211_ATTR_MPATH_NEXT_HOP] = NLA_POLICY_ETH_ADDR_COMPAT, - [NL80211_ATTR_REG_ALPHA2] = { .type = NLA_STRING, .len = 2 }, + /* allow 3 for NUL-termination, we used to declare this NLA_STRING */ + [NL80211_ATTR_REG_ALPHA2] = NLA_POLICY_RANGE(NLA_BINARY, 2, 3), [NL80211_ATTR_REG_RULES] = { .type = NLA_NESTED }, [NL80211_ATTR_BSS_CTS_PROT] = { .type = NLA_U8 }, -- Gitee From 87c8fc794e1ad4c982a5f14d61451e4414530e88 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Tue, 19 Jul 2022 16:31:35 +0800 Subject: [PATCH 456/674] ipv6: fix panic when forwarding a pkt with no in6 dev stable inclusion from stable-v5.10.112 commit a263712ba8c9ded25dd9d2d5ced11bcea5b33a3e category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=a263712ba8c9ded25dd9d2d5ced11bcea5b33a3e -------------------------------- commit e3fa461d8b0e185b7da8a101fe94dfe6dd500ac0 upstream. kongweibin reported a kernel panic in ip6_forward() when input interface has no in6 dev associated. The following tc commands were used to reproduce this panic: tc qdisc del dev vxlan100 root tc qdisc add dev vxlan100 root netem corrupt 5% CC: stable@vger.kernel.org Fixes: ccd27f05ae7b ("ipv6: fix 'disable_policy' for fwd packets") Reported-by: kongweibin Signed-off-by: Nicolas Dichtel Reviewed-by: David Ahern Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- net/ipv6/ip6_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 2aa39ce7093d..05e19e5d6514 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -508,7 +508,7 @@ int ip6_forward(struct sk_buff *skb) goto drop; if (!net->ipv6.devconf_all->disable_policy && - !idev->cnf.disable_policy && + (!idev || !idev->cnf.disable_policy) && !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); goto drop; -- Gitee From c8051bbbb4dc679a4d608a04540905776581c0d8 Mon Sep 17 00:00:00 2001 From: Melissa Wen Date: Tue, 19 Jul 2022 16:31:36 +0800 Subject: [PATCH 457/674] drm/amd/display: don't ignore alpha property on pre-multiplied mode stable inclusion from stable-v5.10.112 commit 68ae52efa132601bf11f5a3f11521846f7d0ea76 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=68ae52efa132601bf11f5a3f11521846f7d0ea76 -------------------------------- commit e4f1541caf60fcbe5a59e9d25805c0b5865e546a upstream. "Pre-multiplied" is the default pixel blend mode for KMS/DRM, as documented in supported_modes of drm_plane_create_blend_mode_property(): https://cgit.freedesktop.org/drm/drm-misc/tree/drivers/gpu/drm/drm_blend.c In this mode, both 'pixel alpha' and 'plane alpha' participate in the calculation, as described by the pixel blend mode formula in KMS/DRM documentation: out.rgb = plane_alpha * fg.rgb + (1 - (plane_alpha * fg.alpha)) * bg.rgb Considering the blend config mechanisms we have in the driver so far, the alpha mode that better fits this blend mode is the _PER_PIXEL_ALPHA_COMBINED_GLOBAL_GAIN, where the value for global_gain is the plane alpha (global_alpha). With this change, alpha property stops to be ignored. It also addresses Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/1734 v2: * keep the 8-bit value for global_alpha_value (Nicholas) * correct the logical ordering for combined global gain (Nicholas) * apply to dcn10 too (Nicholas) Signed-off-by: Melissa Wen Tested-by: Rodrigo Siqueira Reviewed-by: Harry Wentland Tested-by: Simon Ser Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- .../drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c | 14 +++++++++----- drivers/gpu/drm/amd/display/dc/dcn20/dcn20_hwseq.c | 14 +++++++++----- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c index 532f6a1145b5..31a13daf4289 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c +++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c @@ -2387,14 +2387,18 @@ void dcn10_update_mpcc(struct dc *dc, struct pipe_ctx *pipe_ctx) &blnd_cfg.black_color); } - if (per_pixel_alpha) - blnd_cfg.alpha_mode = MPCC_ALPHA_BLEND_MODE_PER_PIXEL_ALPHA; - else - blnd_cfg.alpha_mode = MPCC_ALPHA_BLEND_MODE_GLOBAL_ALPHA; - blnd_cfg.overlap_only = false; blnd_cfg.global_gain = 0xff; + if (per_pixel_alpha && pipe_ctx->plane_state->global_alpha) { + blnd_cfg.alpha_mode = MPCC_ALPHA_BLEND_MODE_PER_PIXEL_ALPHA_COMBINED_GLOBAL_GAIN; + blnd_cfg.global_gain = pipe_ctx->plane_state->global_alpha_value; + } else if (per_pixel_alpha) { + blnd_cfg.alpha_mode = MPCC_ALPHA_BLEND_MODE_PER_PIXEL_ALPHA; + } else { + blnd_cfg.alpha_mode = MPCC_ALPHA_BLEND_MODE_GLOBAL_ALPHA; + } + if (pipe_ctx->plane_state->global_alpha) blnd_cfg.global_alpha = pipe_ctx->plane_state->global_alpha_value; else diff --git a/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_hwseq.c b/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_hwseq.c index 79a2b9c785f0..3d778760a3b5 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_hwseq.c @@ -2270,14 +2270,18 @@ void dcn20_update_mpcc(struct dc *dc, struct pipe_ctx *pipe_ctx) pipe_ctx, &blnd_cfg.black_color); } - if (per_pixel_alpha) - blnd_cfg.alpha_mode = MPCC_ALPHA_BLEND_MODE_PER_PIXEL_ALPHA; - else - blnd_cfg.alpha_mode = MPCC_ALPHA_BLEND_MODE_GLOBAL_ALPHA; - blnd_cfg.overlap_only = false; blnd_cfg.global_gain = 0xff; + if (per_pixel_alpha && pipe_ctx->plane_state->global_alpha) { + blnd_cfg.alpha_mode = MPCC_ALPHA_BLEND_MODE_PER_PIXEL_ALPHA_COMBINED_GLOBAL_GAIN; + blnd_cfg.global_gain = pipe_ctx->plane_state->global_alpha_value; + } else if (per_pixel_alpha) { + blnd_cfg.alpha_mode = MPCC_ALPHA_BLEND_MODE_PER_PIXEL_ALPHA; + } else { + blnd_cfg.alpha_mode = MPCC_ALPHA_BLEND_MODE_GLOBAL_ALPHA; + } + if (pipe_ctx->plane_state->global_alpha) blnd_cfg.global_alpha = pipe_ctx->plane_state->global_alpha_value; else -- Gitee From 216b1f0fcc1e079b4b946864436f608a377e61d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomasz=20Mo=C5=84?= Date: Tue, 19 Jul 2022 16:31:37 +0800 Subject: [PATCH 458/674] drm/amdgpu: Enable gfxoff quirk on MacBook Pro MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from stable-v5.10.112 commit 1fcfe37d170ad81d8a5432f49e76c2b2b2918274 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=1fcfe37d170ad81d8a5432f49e76c2b2b2918274 -------------------------------- commit 4593c1b6d159f1e5c35c07a7f125e79e5a864302 upstream. Enabling gfxoff quirk results in perfectly usable graphical user interface on MacBook Pro (15-inch, 2019) with Radeon Pro Vega 20 4 GB. Without the quirk, X server is completely unusable as every few seconds there is gpu reset due to ring gfx timeout. Signed-off-by: Tomasz Moń Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index 8dd7587fea26..c621ebd90031 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -1248,6 +1248,8 @@ static const struct amdgpu_gfxoff_quirk amdgpu_gfxoff_quirk_list[] = { { 0x1002, 0x15dd, 0x103c, 0x83e7, 0xd3 }, /* GFXOFF is unstable on C6 parts with a VBIOS 113-RAVEN-114 */ { 0x1002, 0x15dd, 0x1002, 0x15dd, 0xc6 }, + /* Apple MacBook Pro (15-inch, 2019) Radeon Pro Vega 20 4 GB */ + { 0x1002, 0x69af, 0x106b, 0x019a, 0xc0 }, { 0, 0, 0, 0, 0 }, }; -- Gitee From f98299044e078934160b31fa51fdc758cb8591e5 Mon Sep 17 00:00:00 2001 From: Rei Yamamoto Date: Tue, 19 Jul 2022 16:31:38 +0800 Subject: [PATCH 459/674] genirq/affinity: Consider that CPUs on nodes can be unbalanced stable inclusion from stable-v5.10.112 commit 0275c75955d1cdec07bc6d4f6551dbf253c2aaeb category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=0275c75955d1cdec07bc6d4f6551dbf253c2aaeb -------------------------------- commit 08d835dff916bfe8f45acc7b92c7af6c4081c8a7 upstream. If CPUs on a node are offline at boot time, the number of nodes is different when building affinity masks for present cpus and when building affinity masks for possible cpus. This causes the following problem: In the case that the number of vectors is less than the number of nodes there are cases where bits of masks for present cpus are overwritten when building masks for possible cpus. Fix this by excluding CPUs, which are not part of the current build mask (present/possible). [ tglx: Massaged changelog and added comment ] Fixes: b82592199032 ("genirq/affinity: Spread IRQs to all available NUMA nodes") Signed-off-by: Rei Yamamoto Signed-off-by: Thomas Gleixner Reviewed-by: Ming Lei Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20220331003309.10891-1-yamamoto.rei@jp.fujitsu.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- kernel/irq/affinity.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 4d89ad4fae3b..5fb78addff51 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -269,8 +269,9 @@ static int __irq_build_affinity_masks(unsigned int startvec, */ if (numvecs <= nodes) { for_each_node_mask(n, nodemsk) { - cpumask_or(&masks[curvec].mask, &masks[curvec].mask, - node_to_cpumask[n]); + /* Ensure that only CPUs which are in both masks are set */ + cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); + cpumask_or(&masks[curvec].mask, &masks[curvec].mask, nmsk); if (++curvec == last_affv) curvec = firstvec; } -- Gitee From ed6e6b980f5110a34f15993debb2e1e3d7727b25 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Tue, 19 Jul 2022 16:31:39 +0800 Subject: [PATCH 460/674] tick/nohz: Use WARN_ON_ONCE() to prevent console saturation stable inclusion from stable-v5.10.112 commit 0806f1930562c1522cef061353e3d6a8d78899d0 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=0806f1930562c1522cef061353e3d6a8d78899d0 -------------------------------- commit 40e97e42961f8c6cc7bd5fe67cc18417e02d78f1 upstream. While running some testing on code that happened to allow the variable tick_nohz_full_running to get set but with no "possible" NOHZ cores to back up that setting, this warning triggered: if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) WARN_ON(tick_nohz_full_running); The console was overwhemled with an endless stream of one WARN per tick per core and there was no way to even see what was going on w/o using a serial console to capture it and then trace it back to this. Change it to WARN_ON_ONCE(). Fixes: 08ae95f4fd3b ("nohz_full: Allow the boot CPU to be nohz_full") Signed-off-by: Paul Gortmaker Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20211206145950.10927-3-paul.gortmaker@windriver.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- kernel/time/tick-sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 2d7899700b7a..d0440f5d5b45 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -164,7 +164,7 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) */ if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) { #ifdef CONFIG_NO_HZ_FULL - WARN_ON(tick_nohz_full_running); + WARN_ON_ONCE(tick_nohz_full_running); #endif tick_do_timer_cpu = cpu; } -- Gitee From a615fdf82ac1af66bbc3533faa655348b4f1af16 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 19 Jul 2022 16:31:40 +0800 Subject: [PATCH 461/674] ARM: davinci: da850-evm: Avoid NULL pointer dereference stable inclusion from stable-v5.10.112 commit 0a312ec66a03133d28570f07bc52749ccfef54da category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=0a312ec66a03133d28570f07bc52749ccfef54da -------------------------------- commit 83a1cde5c74bfb44b49cb2a940d044bb2380f4ea upstream. With newer versions of GCC, there is a panic in da850_evm_config_emac() when booting multi_v5_defconfig in QEMU under the palmetto-bmc machine: Unable to handle kernel NULL pointer dereference at virtual address 00000020 pgd = (ptrval) [00000020] *pgd=00000000 Internal error: Oops: 5 [#1] PREEMPT ARM Modules linked in: CPU: 0 PID: 1 Comm: swapper Not tainted 5.15.0 #1 Hardware name: Generic DT based system PC is at da850_evm_config_emac+0x1c/0x120 LR is at do_one_initcall+0x50/0x1e0 The emac_pdata pointer in soc_info is NULL because davinci_soc_info only gets populated on davinci machines but da850_evm_config_emac() is called on all machines via device_initcall(). Move the rmii_en assignment below the machine check so that it is only dereferenced when running on a supported SoC. Fixes: bae105879f2f ("davinci: DA850/OMAP-L138 EVM: implement autodetect of RMII PHY") Signed-off-by: Nathan Chancellor Reviewed-by: Arnd Bergmann Reviewed-by: Bartosz Golaszewski Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/YcS4xVWs6bQlQSPC@archlinux-ax161/ Signed-off-by: Arnd Bergmann Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- arch/arm/mach-davinci/board-da850-evm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/arm/mach-davinci/board-da850-evm.c b/arch/arm/mach-davinci/board-da850-evm.c index 428012687a80..7f7f6bae21c2 100644 --- a/arch/arm/mach-davinci/board-da850-evm.c +++ b/arch/arm/mach-davinci/board-da850-evm.c @@ -1101,11 +1101,13 @@ static int __init da850_evm_config_emac(void) int ret; u32 val; struct davinci_soc_info *soc_info = &davinci_soc_info; - u8 rmii_en = soc_info->emac_pdata->rmii_en; + u8 rmii_en; if (!machine_is_davinci_da850_evm()) return 0; + rmii_en = soc_info->emac_pdata->rmii_en; + cfg_chip3_base = DA8XX_SYSCFG0_VIRT(DA8XX_CFGCHIP3_REG); val = __raw_readl(cfg_chip3_base); -- Gitee From df361a40bff129549f4d1a2a8120108c8c217ef7 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 19 Jul 2022 16:31:41 +0800 Subject: [PATCH 462/674] dm integrity: fix memory corruption when tag_size is less than digest size stable inclusion from stable-v5.10.112 commit cd02b2687d66f0a8e716384de4b9a0671331f1dc category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=cd02b2687d66f0a8e716384de4b9a0671331f1dc -------------------------------- commit 08c1af8f1c13bbf210f1760132f4df24d0ed46d6 upstream. It is possible to set up dm-integrity in such a way that the "tag_size" parameter is less than the actual digest size. In this situation, a part of the digest beyond tag_size is ignored. In this case, dm-integrity would write beyond the end of the ic->recalc_tags array and corrupt memory. The corruption happened in integrity_recalc->integrity_sector_checksum->crypto_shash_final. Fix this corruption by increasing the tags array so that it has enough padding at the end to accomodate the loop in integrity_recalc() being able to write a full digest size for the last member of the tags array. Cc: stable@vger.kernel.org # v4.19+ Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/md/dm-integrity.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index f7471a2642dd..6f085e96c3f3 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -4232,6 +4232,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) } if (ic->internal_hash) { + size_t recalc_tags_size; ic->recalc_wq = alloc_workqueue("dm-integrity-recalc", WQ_MEM_RECLAIM, 1); if (!ic->recalc_wq ) { ti->error = "Cannot allocate workqueue"; @@ -4245,8 +4246,10 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) r = -ENOMEM; goto bad; } - ic->recalc_tags = kvmalloc_array(RECALC_SECTORS >> ic->sb->log2_sectors_per_block, - ic->tag_size, GFP_KERNEL); + recalc_tags_size = (RECALC_SECTORS >> ic->sb->log2_sectors_per_block) * ic->tag_size; + if (crypto_shash_digestsize(ic->internal_hash) > ic->tag_size) + recalc_tags_size += crypto_shash_digestsize(ic->internal_hash) - ic->tag_size; + ic->recalc_tags = kvmalloc(recalc_tags_size, GFP_KERNEL); if (!ic->recalc_tags) { ti->error = "Cannot allocate tags for recalculating"; r = -ENOMEM; -- Gitee From 1b379510924fbac700bfa5e8de9a7c39dee4ea23 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Tue, 19 Jul 2022 16:31:42 +0800 Subject: [PATCH 463/674] smp: Fix offline cpu check in flush_smp_call_function_queue() stable inclusion from stable-v5.10.112 commit 89496d80bf847b49cb1349a7028b8b3c6e595d95 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=89496d80bf847b49cb1349a7028b8b3c6e595d95 -------------------------------- commit 9e949a3886356fe9112c6f6f34a6e23d1d35407f upstream. The check in flush_smp_call_function_queue() for callbacks that are sent to offline CPUs currently checks whether the queue is empty. However, flush_smp_call_function_queue() has just deleted all the callbacks from the queue and moved all the entries into a local list. This checks would only be positive if some callbacks were added in the short time after llist_del_all() was called. This does not seem to be the intention of this check. Change the check to look at the local list to which the entries were moved instead of the queue from which all the callbacks were just removed. Fixes: 8d056c48e4862 ("CPU hotplug, smp: flush any pending IPI callbacks before CPU offline") Signed-off-by: Nadav Amit Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20220319072015.1495036-1-namit@vmware.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/smp.c b/kernel/smp.c index 7cb03edf1735..114776d0d11e 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -376,7 +376,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) /* There shouldn't be any pending callbacks on an offline CPU. */ if (unlikely(warn_cpu_offline && !cpu_online(smp_processor_id()) && - !warned && !llist_empty(head))) { + !warned && entry != NULL)) { warned = true; WARN(1, "IPI on offline CPU %d\n", smp_processor_id()); -- Gitee From 9563c809dc9eee0873cbae6aa662ad56fd498642 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Povi=C5=A1er?= Date: Tue, 19 Jul 2022 16:31:43 +0800 Subject: [PATCH 464/674] i2c: pasemi: Wait for write xfers to finish MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from stable-v5.10.112 commit 84837f43e56fc2594f07a49399667c467de527a3 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=84837f43e56fc2594f07a49399667c467de527a3 -------------------------------- commit bd8963e602c77adc76dbbbfc3417c3cf14fed76b upstream. Wait for completion of write transfers before returning from the driver. At first sight it may seem advantageous to leave write transfers queued for the controller to carry out on its own time, but there's a couple of issues with it: * Driver doesn't check for FIFO space. * The queued writes can complete while the driver is in its I2C read transfer path which means it will get confused by the raising of XEN (the 'transaction ended' signal). This can cause a spurious ENODATA error due to premature reading of the MRXFIFO register. Adding the wait fixes some unreliability issues with the driver. There's some efficiency cost to it (especially with pasemi_smb_waitready doing its polling), but that will be alleviated once the driver receives interrupt support. Fixes: beb58aa39e6e ("i2c: PA Semi SMBus driver") Signed-off-by: Martin Povišer Reviewed-by: Sven Peter Signed-off-by: Wolfram Sang Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/i2c/busses/i2c-pasemi.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/i2c/busses/i2c-pasemi.c b/drivers/i2c/busses/i2c-pasemi.c index 20f2772c0e79..2c909522f0f3 100644 --- a/drivers/i2c/busses/i2c-pasemi.c +++ b/drivers/i2c/busses/i2c-pasemi.c @@ -137,6 +137,12 @@ static int pasemi_i2c_xfer_msg(struct i2c_adapter *adapter, TXFIFO_WR(smbus, msg->buf[msg->len-1] | (stop ? MTXFIFO_STOP : 0)); + + if (stop) { + err = pasemi_smb_waitready(smbus); + if (err) + goto reset_out; + } } return 0; -- Gitee From bff236a574e83e12d5825dd7b1bbdbdc8352144d Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Tue, 19 Jul 2022 16:31:44 +0800 Subject: [PATCH 465/674] timers: Fix warning condition in __run_timers() stable inclusion from stable-v5.10.112 commit 9a5a4d23e24d39784b643e5db2e4c3a97da4a1ac category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=9a5a4d23e24d39784b643e5db2e4c3a97da4a1ac -------------------------------- commit c54bc0fc84214b203f7a0ebfd1bd308ce2abe920 upstream. When the timer base is empty, base::next_expiry is set to base::clk + NEXT_TIMER_MAX_DELTA and base::next_expiry_recalc is false. When no timer is queued until jiffies reaches base::next_expiry value, the warning for not finding any expired timer and base::next_expiry_recalc is false in __run_timers() triggers. To prevent triggering the warning in this valid scenario base::timers_pending needs to be added to the warning condition. Fixes: 31cd0e119d50 ("timers: Recalculate next timer interrupt only when necessary") Reported-by: Johannes Berg Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20220405191732.7438-3-anna-maria@linutronix.de Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- kernel/time/timer.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 351420c23064..f7d3a108e27c 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1738,11 +1738,14 @@ static inline void __run_timers(struct timer_base *base) time_after_eq(jiffies, base->next_expiry)) { levels = collect_expired_timers(base, heads); /* - * The only possible reason for not finding any expired - * timer at this clk is that all matching timers have been - * dequeued. + * The two possible reasons for not finding any expired + * timer at this clk are that all matching timers have been + * dequeued or no timer has been queued since + * base::next_expiry was set to base::clk + + * NEXT_TIMER_MAX_DELTA. */ - WARN_ON_ONCE(!levels && !base->next_expiry_recalc); + WARN_ON_ONCE(!levels && !base->next_expiry_recalc + && base->timers_pending); base->clk++; base->next_expiry = __next_timer_interrupt(base); -- Gitee From d1cd8c77cc229f0a36962b246f61efabc1e9c475 Mon Sep 17 00:00:00 2001 From: Chao Gao Date: Tue, 19 Jul 2022 16:31:45 +0800 Subject: [PATCH 466/674] dma-direct: avoid redundant memory sync for swiotlb stable inclusion from stable-v5.10.112 commit 26f827e095aba4141ffd684276ef54025de27574 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=26f827e095aba4141ffd684276ef54025de27574 -------------------------------- commit 9e02977bfad006af328add9434c8bffa40e053bb upstream. When we looked into FIO performance with swiotlb enabled in VM, we found swiotlb_bounce() is always called one more time than expected for each DMA read request. It turns out that the bounce buffer is copied to original DMA buffer twice after the completion of a DMA request (one is done by in dma_direct_sync_single_for_cpu(), the other by swiotlb_tbl_unmap_single()). But the content in bounce buffer actually doesn't change between the two rounds of copy. So, one round of copy is redundant. Pass DMA_ATTR_SKIP_CPU_SYNC flag to swiotlb_tbl_unmap_single() to skip the memory copy in it. This fix increases FIO 64KB sequential read throughput in a guest with swiotlb=force by 5.6%. Fixes: 55897af63091 ("dma-direct: merge swiotlb_dma_ops into the dma_direct code") Reported-by: Wang Zhaoyang1 Reported-by: Gao Liang Signed-off-by: Chao Gao Reviewed-by: Kevin Tian Signed-off-by: Christoph Hellwig Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- kernel/dma/direct.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h index b98615578737..c9d380318dd8 100644 --- a/kernel/dma/direct.h +++ b/kernel/dma/direct.h @@ -114,6 +114,7 @@ static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, dma_direct_sync_single_for_cpu(dev, addr, size, dir); if (unlikely(is_swiotlb_buffer(phys))) - swiotlb_tbl_unmap_single(dev, phys, size, size, dir, attrs); + swiotlb_tbl_unmap_single(dev, phys, size, size, dir, + attrs | DMA_ATTR_SKIP_CPU_SYNC); } #endif /* _KERNEL_DMA_DIRECT_H */ -- Gitee From ae2fd3a1fbc11fbd574dd44f99a99cac1a639516 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Tue, 19 Jul 2022 16:31:46 +0800 Subject: [PATCH 467/674] scsi: iscsi: Fix endpoint reuse regression stable inclusion from stable-v5.10.112 commit 129db30599bc8b4a7cd0f219feb1e0154802e3d2 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=129db30599bc8b4a7cd0f219feb1e0154802e3d2 -------------------------------- commit 0aadafb5c34403a7cced1a8d61877048dc059f70 upstream. This patch fixes a bug where when using iSCSI offload we can free an endpoint while userspace still thinks it's active. That then causes the endpoint ID to be reused for a new connection's endpoint while userspace still thinks the ID is for the original connection. Userspace will then end up disconnecting a running connection's endpoint or trying to bind to another connection's endpoint. This bug is a regression added in: Commit 23d6fefbb3f6 ("scsi: iscsi: Fix in-kernel conn failure handling") where we added a in kernel ep_disconnect call to fix a bug in: Commit 0ab710458da1 ("scsi: iscsi: Perform connection failure entirely in kernel space") where we would call stop_conn without having done ep_disconnect. This early ep_disconnect call will then free the endpoint and it's ID while userspace still thinks the ID is valid. Fix the early release of the ID by having the in kernel recovery code keep a reference to the endpoint until userspace has called into the kernel to finish cleaning up the endpoint/connection. It requires the previous commit "scsi: iscsi: Release endpoint ID when its freed" which moved the freeing of the ID until when the endpoint is released. Link: https://lore.kernel.org/r/20220408001314.5014-5-michael.christie@oracle.com Fixes: 23d6fefbb3f6 ("scsi: iscsi: Fix in-kernel conn failure handling") Tested-by: Manish Rangankar Reviewed-by: Lee Duncan Reviewed-by: Chris Leech Signed-off-by: Mike Christie Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/scsi/scsi_transport_iscsi.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index 21cc306a0f11..6877a75e1a5d 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -2273,7 +2273,11 @@ static void iscsi_if_disconnect_bound_ep(struct iscsi_cls_conn *conn, mutex_unlock(&conn->ep_mutex); flush_work(&conn->cleanup_work); - + /* + * Userspace is now done with the EP so we can release the ref + * iscsi_cleanup_conn_work_fn took. + */ + iscsi_put_endpoint(ep); mutex_lock(&conn->ep_mutex); } } @@ -2355,6 +2359,12 @@ static void iscsi_cleanup_conn_work_fn(struct work_struct *work) return; } + /* + * Get a ref to the ep, so we don't release its ID until after + * userspace is done referencing it in iscsi_if_disconnect_bound_ep. + */ + if (conn->ep) + get_device(&conn->ep->dev); iscsi_ep_disconnect(conn, false); if (system_state != SYSTEM_RUNNING) { -- Gitee From f8cb0194fef2d021f4c52d2aa65afc942a682f2a Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Tue, 19 Jul 2022 16:31:47 +0800 Subject: [PATCH 468/674] scsi: iscsi: Fix unbound endpoint error handling stable inclusion from stable-v5.10.112 commit 361288633bfa0927b936ed3e1f01b605fb239994 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=361288633bfa0927b936ed3e1f01b605fb239994 -------------------------------- commit 03690d81974535f228e892a14f0d2d44404fe555 upstream. If a driver raises a connection error before the connection is bound, we can leave a cleanup_work queued that can later run and disconnect/stop a connection that is logged in. The problem is that drivers can call iscsi_conn_error_event for endpoints that are connected but not yet bound when something like the network port they are using is brought down. iscsi_cleanup_conn_work_fn will check for this and exit early, but if the cleanup_work is stuck behind other works, it might not get run until after userspace has done ep_disconnect. Because the endpoint is not yet bound there was no way for ep_disconnect to flush the work. The bug of leaving stop_conns queued was added in: Commit 23d6fefbb3f6 ("scsi: iscsi: Fix in-kernel conn failure handling") and: Commit 0ab710458da1 ("scsi: iscsi: Perform connection failure entirely in kernel space") was supposed to fix it, but left this case. This patch moves the conn state check to before we even queue the work so we can avoid queueing. Link: https://lore.kernel.org/r/20220408001314.5014-7-michael.christie@oracle.com Fixes: 0ab710458da1 ("scsi: iscsi: Perform connection failure entirely in kernel space") Tested-by: Manish Rangankar Reviewed-by: Lee Duncan Reviewed-by: Chris Leech Signed-off-by: Mike Christie Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/scsi/scsi_transport_iscsi.c | 65 ++++++++++++++++------------- 1 file changed, 36 insertions(+), 29 deletions(-) diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index 6877a75e1a5d..ef7cd7520e7c 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -2224,10 +2224,10 @@ static void iscsi_stop_conn(struct iscsi_cls_conn *conn, int flag) switch (flag) { case STOP_CONN_RECOVER: - conn->state = ISCSI_CONN_FAILED; + WRITE_ONCE(conn->state, ISCSI_CONN_FAILED); break; case STOP_CONN_TERM: - conn->state = ISCSI_CONN_DOWN; + WRITE_ONCE(conn->state, ISCSI_CONN_DOWN); break; default: iscsi_cls_conn_printk(KERN_ERR, conn, "invalid stop flag %d\n", @@ -2245,7 +2245,7 @@ static void iscsi_ep_disconnect(struct iscsi_cls_conn *conn, bool is_active) struct iscsi_endpoint *ep; ISCSI_DBG_TRANS_CONN(conn, "disconnect ep.\n"); - conn->state = ISCSI_CONN_FAILED; + WRITE_ONCE(conn->state, ISCSI_CONN_FAILED); if (!conn->ep || !session->transport->ep_disconnect) return; @@ -2344,21 +2344,6 @@ static void iscsi_cleanup_conn_work_fn(struct work_struct *work) struct iscsi_cls_session *session = iscsi_conn_to_session(conn); mutex_lock(&conn->ep_mutex); - /* - * If we are not at least bound there is nothing for us to do. Userspace - * will do a ep_disconnect call if offload is used, but will not be - * doing a stop since there is nothing to clean up, so we have to clear - * the cleanup bit here. - */ - if (conn->state != ISCSI_CONN_BOUND && conn->state != ISCSI_CONN_UP) { - ISCSI_DBG_TRANS_CONN(conn, "Got error while conn is already failed. Ignoring.\n"); - spin_lock_irq(&conn->lock); - clear_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn->flags); - spin_unlock_irq(&conn->lock); - mutex_unlock(&conn->ep_mutex); - return; - } - /* * Get a ref to the ep, so we don't release its ID until after * userspace is done referencing it in iscsi_if_disconnect_bound_ep. @@ -2425,7 +2410,7 @@ iscsi_create_conn(struct iscsi_cls_session *session, int dd_size, uint32_t cid) INIT_WORK(&conn->cleanup_work, iscsi_cleanup_conn_work_fn); conn->transport = transport; conn->cid = cid; - conn->state = ISCSI_CONN_DOWN; + WRITE_ONCE(conn->state, ISCSI_CONN_DOWN); /* this is released in the dev's release function */ if (!get_device(&session->dev)) @@ -2613,10 +2598,30 @@ void iscsi_conn_error_event(struct iscsi_cls_conn *conn, enum iscsi_err error) struct iscsi_internal *priv; int len = nlmsg_total_size(sizeof(*ev)); unsigned long flags; + int state; spin_lock_irqsave(&conn->lock, flags); - if (!test_and_set_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn->flags)) - queue_work(iscsi_conn_cleanup_workq, &conn->cleanup_work); + /* + * Userspace will only do a stop call if we are at least bound. And, we + * only need to do the in kernel cleanup if in the UP state so cmds can + * be released to upper layers. If in other states just wait for + * userspace to avoid races that can leave the cleanup_work queued. + */ + state = READ_ONCE(conn->state); + switch (state) { + case ISCSI_CONN_BOUND: + case ISCSI_CONN_UP: + if (!test_and_set_bit(ISCSI_CLS_CONN_BIT_CLEANUP, + &conn->flags)) { + queue_work(iscsi_conn_cleanup_workq, + &conn->cleanup_work); + } + break; + default: + ISCSI_DBG_TRANS_CONN(conn, "Got conn error in state %d\n", + state); + break; + } spin_unlock_irqrestore(&conn->lock, flags); priv = iscsi_if_transport_lookup(conn->transport); @@ -2967,7 +2972,7 @@ iscsi_set_param(struct iscsi_transport *transport, struct iscsi_uevent *ev) char *data = (char*)ev + sizeof(*ev); struct iscsi_cls_conn *conn; struct iscsi_cls_session *session; - int err = 0, value = 0; + int err = 0, value = 0, state; if (ev->u.set_param.len > PAGE_SIZE) return -EINVAL; @@ -2984,8 +2989,8 @@ iscsi_set_param(struct iscsi_transport *transport, struct iscsi_uevent *ev) session->recovery_tmo = value; break; default: - if ((conn->state == ISCSI_CONN_BOUND) || - (conn->state == ISCSI_CONN_UP)) { + state = READ_ONCE(conn->state); + if (state == ISCSI_CONN_BOUND || state == ISCSI_CONN_UP) { err = transport->set_param(conn, ev->u.set_param.param, data, ev->u.set_param.len); } else { @@ -3781,7 +3786,7 @@ static int iscsi_if_transport_conn(struct iscsi_transport *transport, ev->u.b_conn.transport_eph, ev->u.b_conn.is_leading); if (!ev->r.retcode) - conn->state = ISCSI_CONN_BOUND; + WRITE_ONCE(conn->state, ISCSI_CONN_BOUND); if (ev->r.retcode || !transport->ep_connect) break; @@ -3800,7 +3805,8 @@ static int iscsi_if_transport_conn(struct iscsi_transport *transport, case ISCSI_UEVENT_START_CONN: ev->r.retcode = transport->start_conn(conn); if (!ev->r.retcode) - conn->state = ISCSI_CONN_UP; + WRITE_ONCE(conn->state, ISCSI_CONN_UP); + break; case ISCSI_UEVENT_SEND_PDU: pdu_len = nlh->nlmsg_len - sizeof(*nlh) - sizeof(*ev); @@ -4108,10 +4114,11 @@ static ssize_t show_conn_state(struct device *dev, { struct iscsi_cls_conn *conn = iscsi_dev_to_conn(dev->parent); const char *state = "unknown"; + int conn_state = READ_ONCE(conn->state); - if (conn->state >= 0 && - conn->state < ARRAY_SIZE(connection_state_names)) - state = connection_state_names[conn->state]; + if (conn_state >= 0 && + conn_state < ARRAY_SIZE(connection_state_names)) + state = connection_state_names[conn_state]; return sysfs_emit(buf, "%s\n", state); } -- Gitee From 9fa5f8762ef864534ce1e813298462751a3c67a3 Mon Sep 17 00:00:00 2001 From: Zhang Wensheng Date: Tue, 19 Jul 2022 16:31:48 +0800 Subject: [PATCH 469/674] scsi: iscsi: fix kabi broken in struct iscsi_cls_conn hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X CVE: NA ------------------------------- In struct iscsi_cls_conn, "lock", "flags", "cleanup_work" are added and the kabi is broken, fix it by wrapper. Signed-off-by: Zhang Wensheng Signed-off-by: Zheng Zengkai Reviewed-by: Jason Yan --- drivers/scsi/scsi_transport_iscsi.c | 71 +++++++++++++++++------------ include/scsi/scsi_transport_iscsi.h | 16 +++++-- 2 files changed, 55 insertions(+), 32 deletions(-) diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index ef7cd7520e7c..55b4ada390a6 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -2262,17 +2262,18 @@ static void iscsi_if_disconnect_bound_ep(struct iscsi_cls_conn *conn, struct iscsi_endpoint *ep, bool is_active) { + struct iscsi_cls_conn_wrapper *conn_wrapper = conn_to_wrapper(conn); /* Check if this was a conn error and the kernel took ownership */ - spin_lock_irq(&conn->lock); - if (!test_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn->flags)) { - spin_unlock_irq(&conn->lock); + spin_lock_irq(&conn_wrapper->lock); + if (!test_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn_wrapper->flags)) { + spin_unlock_irq(&conn_wrapper->lock); iscsi_ep_disconnect(conn, is_active); } else { - spin_unlock_irq(&conn->lock); + spin_unlock_irq(&conn_wrapper->lock); ISCSI_DBG_TRANS_CONN(conn, "flush kernel conn cleanup.\n"); mutex_unlock(&conn->ep_mutex); - flush_work(&conn->cleanup_work); + flush_work(&conn_wrapper->cleanup_work); /* * Userspace is now done with the EP so we can release the ref * iscsi_cleanup_conn_work_fn took. @@ -2287,11 +2288,13 @@ static int iscsi_if_stop_conn(struct iscsi_transport *transport, { int flag = ev->u.stop_conn.flag; struct iscsi_cls_conn *conn; + struct iscsi_cls_conn_wrapper *conn_wrapper; conn = iscsi_conn_lookup(ev->u.stop_conn.sid, ev->u.stop_conn.cid); if (!conn) return -EINVAL; + conn_wrapper = conn_to_wrapper(conn); ISCSI_DBG_TRANS_CONN(conn, "iscsi if conn stop.\n"); /* * If this is a termination we have to call stop_conn with that flag @@ -2299,7 +2302,7 @@ static int iscsi_if_stop_conn(struct iscsi_transport *transport, * avoid the extra run. */ if (flag == STOP_CONN_TERM) { - cancel_work_sync(&conn->cleanup_work); + cancel_work_sync(&conn_wrapper->cleanup_work); iscsi_stop_conn(conn, flag); } else { /* @@ -2315,23 +2318,23 @@ static int iscsi_if_stop_conn(struct iscsi_transport *transport, /* * Figure out if it was the kernel or userspace initiating this. */ - spin_lock_irq(&conn->lock); - if (!test_and_set_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn->flags)) { - spin_unlock_irq(&conn->lock); + spin_lock_irq(&conn_wrapper->lock); + if (!test_and_set_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn_wrapper->flags)) { + spin_unlock_irq(&conn_wrapper->lock); iscsi_stop_conn(conn, flag); } else { - spin_unlock_irq(&conn->lock); + spin_unlock_irq(&conn_wrapper->lock); ISCSI_DBG_TRANS_CONN(conn, "flush kernel conn cleanup.\n"); - flush_work(&conn->cleanup_work); + flush_work(&conn_wrapper->cleanup_work); } /* * Only clear for recovery to avoid extra cleanup runs during * termination. */ - spin_lock_irq(&conn->lock); - clear_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn->flags); - spin_unlock_irq(&conn->lock); + spin_lock_irq(&conn_wrapper->lock); + clear_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn_wrapper->flags); + spin_unlock_irq(&conn_wrapper->lock); } ISCSI_DBG_TRANS_CONN(conn, "iscsi if conn stop done.\n"); return 0; @@ -2339,8 +2342,10 @@ static int iscsi_if_stop_conn(struct iscsi_transport *transport, static void iscsi_cleanup_conn_work_fn(struct work_struct *work) { - struct iscsi_cls_conn *conn = container_of(work, struct iscsi_cls_conn, + struct iscsi_cls_conn_wrapper *conn_wrapper = + container_of(work, struct iscsi_cls_conn_wrapper, cleanup_work); + struct iscsi_cls_conn *conn = &conn_wrapper->conn; struct iscsi_cls_session *session = iscsi_conn_to_session(conn); mutex_lock(&conn->ep_mutex); @@ -2395,19 +2400,22 @@ iscsi_create_conn(struct iscsi_cls_session *session, int dd_size, uint32_t cid) { struct iscsi_transport *transport = session->transport; struct iscsi_cls_conn *conn; + struct iscsi_cls_conn_wrapper *conn_wrapper; unsigned long flags; int err; - conn = kzalloc(sizeof(*conn) + dd_size, GFP_KERNEL); - if (!conn) + conn_wrapper = kzalloc(sizeof(*conn_wrapper) + dd_size, GFP_KERNEL); + if (!conn_wrapper) return NULL; + + conn = &conn_wrapper->conn; if (dd_size) - conn->dd_data = &conn[1]; + conn->dd_data = &conn_wrapper[1]; mutex_init(&conn->ep_mutex); - spin_lock_init(&conn->lock); + spin_lock_init(&conn_wrapper->lock); INIT_LIST_HEAD(&conn->conn_list); - INIT_WORK(&conn->cleanup_work, iscsi_cleanup_conn_work_fn); + INIT_WORK(&conn_wrapper->cleanup_work, iscsi_cleanup_conn_work_fn); conn->transport = transport; conn->cid = cid; WRITE_ONCE(conn->state, ISCSI_CONN_DOWN); @@ -2597,10 +2605,11 @@ void iscsi_conn_error_event(struct iscsi_cls_conn *conn, enum iscsi_err error) struct iscsi_uevent *ev; struct iscsi_internal *priv; int len = nlmsg_total_size(sizeof(*ev)); + struct iscsi_cls_conn_wrapper *conn_wrapper = conn_to_wrapper(conn); unsigned long flags; int state; - spin_lock_irqsave(&conn->lock, flags); + spin_lock_irqsave(&conn_wrapper->lock, flags); /* * Userspace will only do a stop call if we are at least bound. And, we * only need to do the in kernel cleanup if in the UP state so cmds can @@ -2612,9 +2621,9 @@ void iscsi_conn_error_event(struct iscsi_cls_conn *conn, enum iscsi_err error) case ISCSI_CONN_BOUND: case ISCSI_CONN_UP: if (!test_and_set_bit(ISCSI_CLS_CONN_BIT_CLEANUP, - &conn->flags)) { + &conn_wrapper->flags)) { queue_work(iscsi_conn_cleanup_workq, - &conn->cleanup_work); + &conn_wrapper->cleanup_work); } break; default: @@ -2622,7 +2631,7 @@ void iscsi_conn_error_event(struct iscsi_cls_conn *conn, enum iscsi_err error) state); break; } - spin_unlock_irqrestore(&conn->lock, flags); + spin_unlock_irqrestore(&conn_wrapper->lock, flags); priv = iscsi_if_transport_lookup(conn->transport); if (!priv) @@ -2952,13 +2961,15 @@ static int iscsi_if_destroy_conn(struct iscsi_transport *transport, struct iscsi_uevent *ev) { struct iscsi_cls_conn *conn; + struct iscsi_cls_conn_wrapper *conn_wrapper; conn = iscsi_conn_lookup(ev->u.d_conn.sid, ev->u.d_conn.cid); if (!conn) return -EINVAL; + conn_wrapper = conn_to_wrapper(conn); ISCSI_DBG_TRANS_CONN(conn, "Flushing cleanup during destruction\n"); - flush_work(&conn->cleanup_work); + flush_work(&conn_wrapper->cleanup_work); ISCSI_DBG_TRANS_CONN(conn, "Destroying transport conn\n"); if (transport->destroy_conn) @@ -3728,6 +3739,7 @@ static int iscsi_if_transport_conn(struct iscsi_transport *transport, struct iscsi_uevent *ev = nlmsg_data(nlh); struct iscsi_cls_session *session; struct iscsi_cls_conn *conn = NULL; + struct iscsi_cls_conn_wrapper *conn_wrapper; struct iscsi_endpoint *ep; uint32_t pdu_len; int err = 0; @@ -3764,15 +3776,16 @@ static int iscsi_if_transport_conn(struct iscsi_transport *transport, if (!conn) return -EINVAL; + conn_wrapper = conn_to_wrapper(conn); mutex_lock(&conn->ep_mutex); - spin_lock_irq(&conn->lock); - if (test_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn->flags)) { - spin_unlock_irq(&conn->lock); + spin_lock_irq(&conn_wrapper->lock); + if (test_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn_wrapper->flags)) { + spin_unlock_irq(&conn_wrapper->lock); mutex_unlock(&conn->ep_mutex); ev->r.retcode = -ENOTCONN; return 0; } - spin_unlock_irq(&conn->lock); + spin_unlock_irq(&conn_wrapper->lock); switch (nlh->nlmsg_type) { case ISCSI_UEVENT_BIND_CONN: diff --git a/include/scsi/scsi_transport_iscsi.h b/include/scsi/scsi_transport_iscsi.h index 037c77fb5dc5..6f55690d3986 100644 --- a/include/scsi/scsi_transport_iscsi.h +++ b/include/scsi/scsi_transport_iscsi.h @@ -201,6 +201,7 @@ enum iscsi_connection_state { struct iscsi_cls_conn { struct list_head conn_list; /* item in connlist */ + struct list_head conn_list_err; /* add back for fix kabi broken */ void *dd_data; /* LLD private data */ struct iscsi_transport *transport; uint32_t cid; /* connection id */ @@ -211,15 +212,24 @@ struct iscsi_cls_conn { struct mutex ep_mutex; struct iscsi_endpoint *ep; + struct device dev; /* sysfs transport/container device */ + enum iscsi_connection_state state; +}; +/* + * The wrapper of iscsi_cls_conn to fix kabi while adding members. + */ +struct iscsi_cls_conn_wrapper { + struct iscsi_cls_conn conn; + /* Used when accessing flags and queueing work. */ spinlock_t lock; unsigned long flags; struct work_struct cleanup_work; - - struct device dev; /* sysfs transport/container device */ - enum iscsi_connection_state state; }; +#define conn_to_wrapper(ic_conn) \ + container_of(ic_conn, struct iscsi_cls_conn_wrapper, conn) + #define iscsi_dev_to_conn(_dev) \ container_of(_dev, struct iscsi_cls_conn, dev) -- Gitee From 230035efb98b6edc066f3671e092a25013a131a2 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Tue, 19 Jul 2022 16:31:49 +0800 Subject: [PATCH 470/674] scsi: iscsi: fix kabi broken in struct iscsi_transport hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X CVE: NA ------------------------------- In struct iscsi_transport, "unbind_conn" was added and the kabi is broken. Fix it by using "tgt_dscvr" and caps flag. Signed-off-by: Li Nan Signed-off-by: Zheng Zengkai Reviewed-by: linan Reviewed-by: Jason Yan --- drivers/infiniband/ulp/iser/iscsi_iser.c | 9 +++++-- drivers/scsi/be2iscsi/be_main.c | 9 +++++-- drivers/scsi/bnx2i/bnx2i_iscsi.c | 9 +++++-- drivers/scsi/cxgbi/cxgb3i/cxgb3i.c | 9 +++++-- drivers/scsi/cxgbi/cxgb4i/cxgb4i.c | 9 +++++-- drivers/scsi/qedi/qedi_iscsi.c | 8 ++++-- drivers/scsi/qla4xxx/ql4_os.c | 8 ++++-- drivers/scsi/scsi_transport_iscsi.c | 31 ++++++++++++++++++------ include/scsi/iscsi_if.h | 1 + include/scsi/scsi_transport_iscsi.h | 17 ++++++++++++- 10 files changed, 87 insertions(+), 23 deletions(-) diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c index a16e066989fa..4884b122e413 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.c +++ b/drivers/infiniband/ulp/iser/iscsi_iser.c @@ -979,17 +979,22 @@ static struct scsi_host_template iscsi_iser_sht = { .track_queue_depth = 1, }; +static struct iscsi_transport_expand iscsi_iser_expand = { + .unbind_conn = iscsi_conn_unbind, +}; + static struct iscsi_transport iscsi_iser_transport = { .owner = THIS_MODULE, .name = "iser", - .caps = CAP_RECOVERY_L0 | CAP_MULTI_R2T | CAP_TEXT_NEGO, + .caps = CAP_RECOVERY_L0 | CAP_MULTI_R2T | CAP_TEXT_NEGO + | CAP_OPS_EXPAND, /* session management */ .create_session = iscsi_iser_session_create, .destroy_session = iscsi_iser_session_destroy, /* connection management */ .create_conn = iscsi_iser_conn_create, .bind_conn = iscsi_iser_conn_bind, - .unbind_conn = iscsi_conn_unbind, + .ops_expand = &iscsi_iser_expand, .destroy_conn = iscsi_conn_teardown, .attr_is_visible = iser_attr_is_visible, .set_param = iscsi_iser_set_param, diff --git a/drivers/scsi/be2iscsi/be_main.c b/drivers/scsi/be2iscsi/be_main.c index b977e039bb78..06f697bfc49f 100644 --- a/drivers/scsi/be2iscsi/be_main.c +++ b/drivers/scsi/be2iscsi/be_main.c @@ -5801,16 +5801,21 @@ static struct pci_error_handlers beiscsi_eeh_handlers = { .resume = beiscsi_eeh_resume, }; +struct iscsi_transport_expand beiscsi_iscsi_expand = { + .unbind_conn = iscsi_conn_unbind, +}; + struct iscsi_transport beiscsi_iscsi_transport = { .owner = THIS_MODULE, .name = DRV_NAME, .caps = CAP_RECOVERY_L0 | CAP_HDRDGST | CAP_TEXT_NEGO | - CAP_MULTI_R2T | CAP_DATADGST | CAP_DATA_PATH_OFFLOAD, + CAP_MULTI_R2T | CAP_DATADGST | CAP_DATA_PATH_OFFLOAD | + CAP_OPS_EXPAND, .create_session = beiscsi_session_create, .destroy_session = beiscsi_session_destroy, .create_conn = beiscsi_conn_create, .bind_conn = beiscsi_conn_bind, - .unbind_conn = iscsi_conn_unbind, + .ops_expand = &beiscsi_iscsi_expand, .destroy_conn = iscsi_conn_teardown, .attr_is_visible = beiscsi_attr_is_visible, .set_iface_param = beiscsi_iface_set_param, diff --git a/drivers/scsi/bnx2i/bnx2i_iscsi.c b/drivers/scsi/bnx2i/bnx2i_iscsi.c index 8cf2f9a7cfdc..e13c77a76150 100644 --- a/drivers/scsi/bnx2i/bnx2i_iscsi.c +++ b/drivers/scsi/bnx2i/bnx2i_iscsi.c @@ -2274,18 +2274,23 @@ static struct scsi_host_template bnx2i_host_template = { .track_queue_depth = 1, }; + +static struct iscsi_transport_expand bnx2i_iscsi_expand = { + .unbind_conn = iscsi_conn_unbind, +}; + struct iscsi_transport bnx2i_iscsi_transport = { .owner = THIS_MODULE, .name = "bnx2i", .caps = CAP_RECOVERY_L0 | CAP_HDRDGST | CAP_MULTI_R2T | CAP_DATADGST | CAP_DATA_PATH_OFFLOAD | - CAP_TEXT_NEGO, + CAP_TEXT_NEGO | CAP_OPS_EXPAND, .create_session = bnx2i_session_create, .destroy_session = bnx2i_session_destroy, .create_conn = bnx2i_conn_create, .bind_conn = bnx2i_conn_bind, - .unbind_conn = iscsi_conn_unbind, + .ops_expand = &bnx2i_iscsi_expand, .destroy_conn = bnx2i_conn_destroy, .attr_is_visible = bnx2i_attr_is_visible, .set_param = iscsi_set_param, diff --git a/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c b/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c index edcd3fab6973..65eb6230d390 100644 --- a/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c +++ b/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c @@ -100,13 +100,18 @@ static struct scsi_host_template cxgb3i_host_template = { .track_queue_depth = 1, }; +static struct iscsi_transport_expand cxgb3i_iscsi_expand = { + .unbind_conn = iscsi_conn_unbind, +}; + static struct iscsi_transport cxgb3i_iscsi_transport = { .owner = THIS_MODULE, .name = DRV_MODULE_NAME, /* owner and name should be set already */ .caps = CAP_RECOVERY_L0 | CAP_MULTI_R2T | CAP_HDRDGST | CAP_DATADGST | CAP_DIGEST_OFFLOAD | - CAP_PADDING_OFFLOAD | CAP_TEXT_NEGO, + CAP_PADDING_OFFLOAD | CAP_TEXT_NEGO | + CAP_OPS_EXPAND, .attr_is_visible = cxgbi_attr_is_visible, .get_host_param = cxgbi_get_host_param, .set_host_param = cxgbi_set_host_param, @@ -117,7 +122,7 @@ static struct iscsi_transport cxgb3i_iscsi_transport = { /* connection management */ .create_conn = cxgbi_create_conn, .bind_conn = cxgbi_bind_conn, - .unbind_conn = iscsi_conn_unbind, + .ops_expand = &cxgb3i_iscsi_expand, .destroy_conn = iscsi_tcp_conn_teardown, .start_conn = iscsi_conn_start, .stop_conn = iscsi_conn_stop, diff --git a/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c b/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c index efb3e2b3398e..88f96964c356 100644 --- a/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c +++ b/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c @@ -118,12 +118,17 @@ static struct scsi_host_template cxgb4i_host_template = { .track_queue_depth = 1, }; +static struct iscsi_transport_expand cxgb4i_iscsi_expand = { + .unbind_conn = iscsi_conn_unbind, +}; + static struct iscsi_transport cxgb4i_iscsi_transport = { .owner = THIS_MODULE, .name = DRV_MODULE_NAME, .caps = CAP_RECOVERY_L0 | CAP_MULTI_R2T | CAP_HDRDGST | CAP_DATADGST | CAP_DIGEST_OFFLOAD | - CAP_PADDING_OFFLOAD | CAP_TEXT_NEGO, + CAP_PADDING_OFFLOAD | CAP_TEXT_NEGO | + CAP_OPS_EXPAND, .attr_is_visible = cxgbi_attr_is_visible, .get_host_param = cxgbi_get_host_param, .set_host_param = cxgbi_set_host_param, @@ -134,7 +139,7 @@ static struct iscsi_transport cxgb4i_iscsi_transport = { /* connection management */ .create_conn = cxgbi_create_conn, .bind_conn = cxgbi_bind_conn, - .unbind_conn = iscsi_conn_unbind, + .ops_expand = &cxgb4i_iscsi_expand, .destroy_conn = iscsi_tcp_conn_teardown, .start_conn = iscsi_conn_start, .stop_conn = iscsi_conn_stop, diff --git a/drivers/scsi/qedi/qedi_iscsi.c b/drivers/scsi/qedi/qedi_iscsi.c index 5f7e62f19d83..ba9a22e55e32 100644 --- a/drivers/scsi/qedi/qedi_iscsi.c +++ b/drivers/scsi/qedi/qedi_iscsi.c @@ -1430,16 +1430,20 @@ static void qedi_cleanup_task(struct iscsi_task *task) cmd->scsi_cmd = NULL; } +static struct iscsi_transport_expand qedi_iscsi_expand = { + .unbind_conn = iscsi_conn_unbind, +}; + struct iscsi_transport qedi_iscsi_transport = { .owner = THIS_MODULE, .name = QEDI_MODULE_NAME, .caps = CAP_RECOVERY_L0 | CAP_HDRDGST | CAP_MULTI_R2T | CAP_DATADGST | - CAP_DATA_PATH_OFFLOAD | CAP_TEXT_NEGO, + CAP_DATA_PATH_OFFLOAD | CAP_TEXT_NEGO | CAP_OPS_EXPAND, .create_session = qedi_session_create, .destroy_session = qedi_session_destroy, .create_conn = qedi_conn_create, .bind_conn = qedi_conn_bind, - .unbind_conn = iscsi_conn_unbind, + .ops_expand = &qedi_iscsi_expand, .start_conn = qedi_conn_start, .stop_conn = iscsi_conn_stop, .destroy_conn = qedi_conn_destroy, diff --git a/drivers/scsi/qla4xxx/ql4_os.c b/drivers/scsi/qla4xxx/ql4_os.c index 8d82d2a83059..377d83762099 100644 --- a/drivers/scsi/qla4xxx/ql4_os.c +++ b/drivers/scsi/qla4xxx/ql4_os.c @@ -246,20 +246,24 @@ static struct scsi_host_template qla4xxx_driver_template = { .vendor_id = SCSI_NL_VID_TYPE_PCI | PCI_VENDOR_ID_QLOGIC, }; +static struct iscsi_transport_expand qla4xxx_iscsi_expand = { + .unbind_conn = iscsi_conn_unbind, +}; + static struct iscsi_transport qla4xxx_iscsi_transport = { .owner = THIS_MODULE, .name = DRIVER_NAME, .caps = CAP_TEXT_NEGO | CAP_DATA_PATH_OFFLOAD | CAP_HDRDGST | CAP_DATADGST | CAP_LOGIN_OFFLOAD | - CAP_MULTI_R2T, + CAP_MULTI_R2T | CAP_OPS_EXPAND, .attr_is_visible = qla4_attr_is_visible, .create_session = qla4xxx_session_create, .destroy_session = qla4xxx_session_destroy, .start_conn = qla4xxx_conn_start, .create_conn = qla4xxx_conn_create, .bind_conn = qla4xxx_conn_bind, - .unbind_conn = iscsi_conn_unbind, + .ops_expand = &qla4xxx_iscsi_expand, .stop_conn = iscsi_conn_stop, .destroy_conn = qla4xxx_conn_destroy, .set_param = iscsi_set_param, diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index 55b4ada390a6..e23cb62ab216 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -2253,7 +2253,11 @@ static void iscsi_ep_disconnect(struct iscsi_cls_conn *conn, bool is_active) ep = conn->ep; conn->ep = NULL; - session->transport->unbind_conn(conn, is_active); + if (session->transport->caps & CAP_OPS_EXPAND && + session->transport->ops_expand && + session->transport->ops_expand->unbind_conn) + session->transport->ops_expand->unbind_conn(conn, is_active); + session->transport->ep_disconnect(ep); ISCSI_DBG_TRANS_CONN(conn, "disconnect ep done.\n"); } @@ -3119,10 +3123,19 @@ iscsi_tgt_dscvr(struct iscsi_transport *transport, struct Scsi_Host *shost; struct sockaddr *dst_addr; int err; + int (*tgt_dscvr)(struct Scsi_Host *shost, enum iscsi_tgt_dscvr type, + uint32_t enable, struct sockaddr *dst_addr); - if (!transport->tgt_dscvr) - return -EINVAL; - + if (transport->caps & CAP_OPS_EXPAND) { + if (!transport->ops_expand || !transport->ops_expand->tgt_dscvr) + return -EINVAL; + tgt_dscvr = transport->ops_expand->tgt_dscvr; + } else { + if (!transport->ops_expand) + return -EINVAL; + tgt_dscvr = (int (*)(struct Scsi_Host *, enum iscsi_tgt_dscvr, uint32_t, + struct sockaddr *))(transport->ops_expand); + } shost = scsi_host_lookup(ev->u.tgt_dscvr.host_no); if (!shost) { printk(KERN_ERR "target discovery could not find host no %u\n", @@ -3132,8 +3145,8 @@ iscsi_tgt_dscvr(struct iscsi_transport *transport, dst_addr = (struct sockaddr *)((char*)ev + sizeof(*ev)); - err = transport->tgt_dscvr(shost, ev->u.tgt_dscvr.type, - ev->u.tgt_dscvr.enable, dst_addr); + err = tgt_dscvr(shost, ev->u.tgt_dscvr.type, + ev->u.tgt_dscvr.enable, dst_addr); scsi_host_put(shost); return err; } @@ -4786,8 +4799,10 @@ iscsi_register_transport(struct iscsi_transport *tt) int err; BUG_ON(!tt); - WARN_ON(tt->ep_disconnect && !tt->unbind_conn); - + if (tt->caps & CAP_OPS_EXPAND) { + BUG_ON(!tt->ops_expand); + WARN_ON(tt->ep_disconnect && !tt->ops_expand->unbind_conn); + } priv = iscsi_if_transport_lookup(tt); if (priv) return NULL; diff --git a/include/scsi/iscsi_if.h b/include/scsi/iscsi_if.h index 5225a23f2d0e..5679b9fb2b1e 100644 --- a/include/scsi/iscsi_if.h +++ b/include/scsi/iscsi_if.h @@ -761,6 +761,7 @@ enum iscsi_ping_status_code { and verification */ #define CAP_LOGIN_OFFLOAD 0x4000 /* offload session login */ +#define CAP_OPS_EXPAND 0x8000 /* oiscsi_transport->ops_expand flag */ /* * These flags describes reason of stop_conn() call */ diff --git a/include/scsi/scsi_transport_iscsi.h b/include/scsi/scsi_transport_iscsi.h index 6f55690d3986..f9297176dcb8 100644 --- a/include/scsi/scsi_transport_iscsi.h +++ b/include/scsi/scsi_transport_iscsi.h @@ -29,6 +29,15 @@ struct bsg_job; struct iscsi_bus_flash_session; struct iscsi_bus_flash_conn; +/* + * The expansion of iscsi_transport to fix kabi while adding members. + */ +struct iscsi_transport_expand { + int (*tgt_dscvr)(struct Scsi_Host *shost, enum iscsi_tgt_dscvr type, + uint32_t enable, struct sockaddr *dst_addr); + void (*unbind_conn)(struct iscsi_cls_conn *conn, bool is_active); +}; + /** * struct iscsi_transport - iSCSI Transport template * @@ -82,7 +91,6 @@ struct iscsi_transport { void (*destroy_session) (struct iscsi_cls_session *session); struct iscsi_cls_conn *(*create_conn) (struct iscsi_cls_session *sess, uint32_t cid); - void (*unbind_conn) (struct iscsi_cls_conn *conn, bool is_active); int (*bind_conn) (struct iscsi_cls_session *session, struct iscsi_cls_conn *cls_conn, uint64_t transport_eph, int is_leading); @@ -124,8 +132,15 @@ struct iscsi_transport { int non_blocking); int (*ep_poll) (struct iscsi_endpoint *ep, int timeout_ms); void (*ep_disconnect) (struct iscsi_endpoint *ep); +#ifdef __GENKSYMS__ int (*tgt_dscvr) (struct Scsi_Host *shost, enum iscsi_tgt_dscvr type, uint32_t enable, struct sockaddr *dst_addr); +#else + /* + * onece ops_expand is used, caps must be set to CAP_OPS_EXPAND + */ + struct iscsi_transport_expand *ops_expand; +#endif int (*set_path) (struct Scsi_Host *shost, struct iscsi_path *params); int (*set_iface_param) (struct Scsi_Host *shost, void *data, uint32_t len); -- Gitee From bb0fba32beb4f89107920efd62bdebb367fed0ce Mon Sep 17 00:00:00 2001 From: LeoLiu-oc Date: Tue, 19 Jul 2022 16:51:06 +0800 Subject: [PATCH 471/674] Driver for Zhaoxin CPU core temperature monitoring zhaoxin inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I52DS7 CVE: NA -------------------------------------------- Add support for the temperature sensor inside CPU. Supported are all known variants of the Zhaoxin processors. v1: Fix some character encoding mistaken. Signed-off-by: LeoLiu-oc Signed-off-by: Zheng Zengkai Reviewed-by: Xiongfeng Wang --- drivers/hwmon/Kconfig | 9 + drivers/hwmon/Makefile | 1 + drivers/hwmon/via-cputemp.c | 1 - drivers/hwmon/zhaoxin-cputemp.c | 292 ++++++++++++++++++++++++++++++++ 4 files changed, 302 insertions(+), 1 deletion(-) create mode 100644 drivers/hwmon/zhaoxin-cputemp.c diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig index 45a1a5969d01..f2fe56e6f8bd 100644 --- a/drivers/hwmon/Kconfig +++ b/drivers/hwmon/Kconfig @@ -1899,6 +1899,15 @@ config SENSORS_VIA_CPUTEMP sensor inside your CPU. Supported are all known variants of the VIA C7 and Nano. +config SENSORS_ZHAOXIN_CPUTEMP + tristate "Zhaoxin CPU temperature sensor" + depends on X86 + select HWMON_VID + help + If you say yes here you get support for the temperature + sensor inside your CPU. Supported are all known variants of + the Zhaoxin processors. + config SENSORS_VIA686A tristate "VIA686A" depends on PCI diff --git a/drivers/hwmon/Makefile b/drivers/hwmon/Makefile index c22a5316bd91..95908c478b94 100644 --- a/drivers/hwmon/Makefile +++ b/drivers/hwmon/Makefile @@ -184,6 +184,7 @@ obj-$(CONFIG_SENSORS_TMP421) += tmp421.o obj-$(CONFIG_SENSORS_TMP513) += tmp513.o obj-$(CONFIG_SENSORS_VEXPRESS) += vexpress-hwmon.o obj-$(CONFIG_SENSORS_VIA_CPUTEMP)+= via-cputemp.o +obj-$(CONFIG_SENSORS_ZHAOXIN_CPUTEMP) += zhaoxin-cputemp.o obj-$(CONFIG_SENSORS_VIA686A) += via686a.o obj-$(CONFIG_SENSORS_VT1211) += vt1211.o obj-$(CONFIG_SENSORS_VT8231) += vt8231.o diff --git a/drivers/hwmon/via-cputemp.c b/drivers/hwmon/via-cputemp.c index e5d18dac8ee7..0a5057dbe51a 100644 --- a/drivers/hwmon/via-cputemp.c +++ b/drivers/hwmon/via-cputemp.c @@ -273,7 +273,6 @@ static const struct x86_cpu_id __initconst cputemp_ids[] = { X86_MATCH_VENDOR_FAM_MODEL(CENTAUR, 6, X86_CENTAUR_FAM6_C7_A, NULL), X86_MATCH_VENDOR_FAM_MODEL(CENTAUR, 6, X86_CENTAUR_FAM6_C7_D, NULL), X86_MATCH_VENDOR_FAM_MODEL(CENTAUR, 6, X86_CENTAUR_FAM6_NANO, NULL), - X86_MATCH_VENDOR_FAM_MODEL(CENTAUR, 7, X86_MODEL_ANY, NULL), {} }; MODULE_DEVICE_TABLE(x86cpu, cputemp_ids); diff --git a/drivers/hwmon/zhaoxin-cputemp.c b/drivers/hwmon/zhaoxin-cputemp.c new file mode 100644 index 000000000000..39e729590eba --- /dev/null +++ b/drivers/hwmon/zhaoxin-cputemp.c @@ -0,0 +1,292 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * zhaoxin-cputemp.c - Driver for Zhaoxin CPU core temperature monitoring + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DRVNAME "zhaoxin_cputemp" + +enum { SHOW_TEMP, SHOW_LABEL, SHOW_NAME }; + +/* Functions declaration */ + +struct zhaoxin_cputemp_data { + struct device *hwmon_dev; + const char *name; + u8 vrm; + u32 id; + u32 msr_temp; + u32 msr_vid; +}; + +/* Sysfs stuff */ + +static ssize_t name_show(struct device *dev, struct device_attribute *devattr, + char *buf) +{ + int ret; + struct sensor_device_attribute *attr = to_sensor_dev_attr(devattr); + struct zhaoxin_cputemp_data *data = dev_get_drvdata(dev); + + if (attr->index == SHOW_NAME) + ret = sprintf(buf, "%s\n", data->name); + else /* show label */ + ret = sprintf(buf, "Core %d\n", data->id); + return ret; +} + +static ssize_t temp_show(struct device *dev, struct device_attribute *devattr, char *buf) +{ + struct zhaoxin_cputemp_data *data = dev_get_drvdata(dev); + u32 eax, edx; + int err; + + err = rdmsr_safe_on_cpu(data->id, data->msr_temp, &eax, &edx); + if (err) + return -EAGAIN; + + return sprintf(buf, "%lu\n", ((unsigned long)eax & 0xffffff) * 1000); +} + +static ssize_t cpu0_vid_show(struct device *dev, struct device_attribute *devattr, char *buf) +{ + struct zhaoxin_cputemp_data *data = dev_get_drvdata(dev); + u32 eax, edx; + int err; + + err = rdmsr_safe_on_cpu(data->id, data->msr_vid, &eax, &edx); + if (err) + return -EAGAIN; + + return sprintf(buf, "%d\n", vid_from_reg(~edx & 0x7f, data->vrm)); +} + +static SENSOR_DEVICE_ATTR_RO(temp1_input, temp, SHOW_TEMP); +static SENSOR_DEVICE_ATTR_RO(temp1_label, name, SHOW_LABEL); +static SENSOR_DEVICE_ATTR_RO(name, name, SHOW_NAME); + +static struct attribute *zhaoxin_cputemp_attributes[] = { + &sensor_dev_attr_name.dev_attr.attr, + &sensor_dev_attr_temp1_label.dev_attr.attr, + &sensor_dev_attr_temp1_input.dev_attr.attr, + NULL +}; + +static const struct attribute_group zhaoxin_cputemp_group = { + .attrs = zhaoxin_cputemp_attributes, +}; + +/* Optional attributes */ +static DEVICE_ATTR_RO(cpu0_vid); + +static int zhaoxin_cputemp_probe(struct platform_device *pdev) +{ + struct zhaoxin_cputemp_data *data; + int err; + u32 eax, edx; + + data = devm_kzalloc(&pdev->dev, sizeof(struct zhaoxin_cputemp_data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + data->id = pdev->id; + data->name = "zhaoxin_cputemp"; + data->msr_temp = 0x1423; + + /* test if we can access the TEMPERATURE MSR */ + err = rdmsr_safe_on_cpu(data->id, data->msr_temp, &eax, &edx); + if (err) { + dev_err(&pdev->dev, "Unable to access TEMPERATURE MSR, giving up\n"); + return err; + } + + platform_set_drvdata(pdev, data); + + err = sysfs_create_group(&pdev->dev.kobj, &zhaoxin_cputemp_group); + if (err) + return err; + + if (data->msr_vid) + data->vrm = vid_which_vrm(); + + if (data->vrm) { + err = device_create_file(&pdev->dev, &dev_attr_cpu0_vid); + if (err) + goto exit_remove; + } + + data->hwmon_dev = hwmon_device_register(&pdev->dev); + if (IS_ERR(data->hwmon_dev)) { + err = PTR_ERR(data->hwmon_dev); + dev_err(&pdev->dev, "Class registration failed (%d)\n", err); + goto exit_remove; + } + + return 0; + +exit_remove: + if (data->vrm) + device_remove_file(&pdev->dev, &dev_attr_cpu0_vid); + sysfs_remove_group(&pdev->dev.kobj, &zhaoxin_cputemp_group); + return err; +} + +static int zhaoxin_cputemp_remove(struct platform_device *pdev) +{ + struct zhaoxin_cputemp_data *data = platform_get_drvdata(pdev); + + hwmon_device_unregister(data->hwmon_dev); + if (data->vrm) + device_remove_file(&pdev->dev, &dev_attr_cpu0_vid); + sysfs_remove_group(&pdev->dev.kobj, &zhaoxin_cputemp_group); + return 0; +} + +static struct platform_driver zhaoxin_cputemp_driver = { + .driver = { + .name = DRVNAME, + }, + .probe = zhaoxin_cputemp_probe, + .remove = zhaoxin_cputemp_remove, +}; + +struct pdev_entry { + struct list_head list; + struct platform_device *pdev; + unsigned int cpu; +}; + +static LIST_HEAD(pdev_list); +static DEFINE_MUTEX(pdev_list_mutex); + +static int zhaoxin_cputemp_online(unsigned int cpu) +{ + int err; + struct platform_device *pdev; + struct pdev_entry *pdev_entry; + + pdev = platform_device_alloc(DRVNAME, cpu); + if (!pdev) { + err = -ENOMEM; + pr_err("Device allocation failed\n"); + goto exit; + } + + pdev_entry = kzalloc(sizeof(struct pdev_entry), GFP_KERNEL); + if (!pdev_entry) { + err = -ENOMEM; + goto exit_device_put; + } + + err = platform_device_add(pdev); + if (err) { + pr_err("Device addition failed (%d)\n", err); + goto exit_device_free; + } + + pdev_entry->pdev = pdev; + pdev_entry->cpu = cpu; + mutex_lock(&pdev_list_mutex); + list_add_tail(&pdev_entry->list, &pdev_list); + mutex_unlock(&pdev_list_mutex); + + return 0; + +exit_device_free: + kfree(pdev_entry); +exit_device_put: + platform_device_put(pdev); +exit: + return err; +} + +static int zhaoxin_cputemp_down_prep(unsigned int cpu) +{ + struct pdev_entry *p; + + mutex_lock(&pdev_list_mutex); + list_for_each_entry(p, &pdev_list, list) { + if (p->cpu == cpu) { + platform_device_unregister(p->pdev); + list_del(&p->list); + mutex_unlock(&pdev_list_mutex); + kfree(p); + return 0; + } + } + mutex_unlock(&pdev_list_mutex); + return 0; +} + +static const struct x86_cpu_id __initconst cputemp_ids[] = { + X86_MATCH_VENDOR_FAM_MODEL(CENTAUR, 7, X86_MODEL_ANY, NULL), + X86_MATCH_VENDOR_FAM_MODEL(ZHAOXIN, 7, X86_MODEL_ANY, NULL), + {} +}; +MODULE_DEVICE_TABLE(x86cpu, cputemp_ids); + +static enum cpuhp_state zhaoxin_temp_online; + +static int __init zhaoxin_cputemp_init(void) +{ + int err; + + if (!x86_match_cpu(cputemp_ids)) + return -ENODEV; + + err = platform_driver_register(&zhaoxin_cputemp_driver); + if (err) + goto exit; + + err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "hwmon/zhaoxin:online", + zhaoxin_cputemp_online, zhaoxin_cputemp_down_prep); + if (err < 0) + goto exit_driver_unreg; + zhaoxin_temp_online = err; + +#ifndef CONFIG_HOTPLUG_CPU + if (list_empty(&pdev_list)) { + err = -ENODEV; + goto exit_hp_unreg; + } +#endif + return 0; + +#ifndef CONFIG_HOTPLUG_CPU +exit_hp_unreg: + cpuhp_remove_state_nocalls(zhaoxin_temp_online); +#endif +exit_driver_unreg: + platform_driver_unregister(&zhaoxin_cputemp_driver); +exit: + return err; +} + +static void __exit zhaoxin_cputemp_exit(void) +{ + cpuhp_remove_state(zhaoxin_temp_online); + platform_driver_unregister(&zhaoxin_cputemp_driver); +} + +MODULE_DESCRIPTION("Zhaoxin CPU temperature monitor"); +MODULE_LICENSE("GPL"); + +module_init(zhaoxin_cputemp_init) +module_exit(zhaoxin_cputemp_exit) -- Gitee From d660fbb1aac606323b6623f5bfb9d4b890b3c5fa Mon Sep 17 00:00:00 2001 From: LeoLiuoc Date: Tue, 19 Jul 2022 16:51:07 +0800 Subject: [PATCH 472/674] openeuler_defconfig: Enable SENSORS_ZHAOXIN_CPUTEMP as module by default zhaoxin inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I52DS7 CVE: NA -------------------------------------------- Set CONFIG_SENSORS_ZHAOXIN_CPUTEMP to 'm' by default in openeuler_defconfig. Signed-off-by: LeoLiuoc Signed-off-by: Zheng Zengkai Reviewed-by: Xiongfeng Wang --- arch/x86/configs/openeuler_defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index f9c94b618ad4..3eac70518e6f 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -4053,6 +4053,7 @@ CONFIG_SENSORS_TMP421=m # CONFIG_SENSORS_TMP513 is not set CONFIG_SENSORS_VIA_CPUTEMP=m CONFIG_SENSORS_VIA686A=m +CONFIG_SENSORS_ZHAOXIN_CPUTEMP=m CONFIG_SENSORS_VT1211=m CONFIG_SENSORS_VT8231=m # CONFIG_SENSORS_W83773G is not set -- Gitee From 2dd89ae225574b493ce87324dacc2a5ef0f00dbd Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 19 Jul 2022 16:51:08 +0800 Subject: [PATCH 473/674] nbd: call genl_unregister_family() first in nbd_cleanup() mainline inclusion from mainline-v5.19-rc1 commit 06c4da89c24e7023ea448cadf8e9daf06a0aae6e category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5G01M CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=06c4da89c24e7023ea448cadf8e9daf06a0aae6e -------------------------------- Otherwise there may be race between module removal and the handling of netlink command, which can lead to the oops as shown below: BUG: kernel NULL pointer dereference, address: 0000000000000098 Oops: 0002 [#1] SMP PTI CPU: 1 PID: 31299 Comm: nbd-client Tainted: G E 5.14.0-rc4 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) RIP: 0010:down_write+0x1a/0x50 Call Trace: start_creating+0x89/0x130 debugfs_create_dir+0x1b/0x130 nbd_start_device+0x13d/0x390 [nbd] nbd_genl_connect+0x42f/0x748 [nbd] genl_family_rcv_msg_doit.isra.0+0xec/0x150 genl_rcv_msg+0xe5/0x1e0 netlink_rcv_skb+0x55/0x100 genl_rcv+0x29/0x40 netlink_unicast+0x1a8/0x250 netlink_sendmsg+0x21b/0x430 ____sys_sendmsg+0x2a4/0x2d0 ___sys_sendmsg+0x81/0xc0 __sys_sendmsg+0x62/0xb0 __x64_sys_sendmsg+0x1f/0x30 do_syscall_64+0x3b/0xc0 entry_SYSCALL_64_after_hwframe+0x44/0xae Modules linked in: nbd(E-) Signed-off-by: Hou Tao Signed-off-by: Yu Kuai Reviewed-by: Josef Bacik Link: https://lore.kernel.org/r/20220521073749.3146892-2-yukuai3@huawei.com Signed-off-by: Jens Axboe Reviewed-by: Jason Yan Signed-off-by: Zheng Zengkai --- drivers/block/nbd.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 0ab548c78f24..a12daca0f92a 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -2537,6 +2537,12 @@ static void __exit nbd_cleanup(void) struct nbd_device *nbd; LIST_HEAD(del_list); + /* + * Unregister netlink interface prior to waiting + * for the completion of netlink commands. + */ + genl_unregister_family(&nbd_genl_family); + nbd_dbg_close(); mutex_lock(&nbd_index_mutex); @@ -2552,7 +2558,6 @@ static void __exit nbd_cleanup(void) } idr_destroy(&nbd_index_idr); - genl_unregister_family(&nbd_genl_family); unregister_blkdev(NBD_MAJOR, "nbd"); } -- Gitee From b0452dec11cd872f0547ec6478779e664f2b95fe Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 19 Jul 2022 16:51:09 +0800 Subject: [PATCH 474/674] nbd: fix race between nbd_alloc_config() and module removal mainline inclusion from mainline-v5.19-rc1 commit c55b2b983b0fa012942c3eb16384b2b722caa810 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5G01M CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=c55b2b983b0fa012942c3eb16384b2b722caa810 -------------------------------- When nbd module is being removing, nbd_alloc_config() may be called concurrently by nbd_genl_connect(), although try_module_get() will return false, but nbd_alloc_config() doesn't handle it. The race may lead to the leak of nbd_config and its related resources (e.g, recv_workq) and oops in nbd_read_stat() due to the unload of nbd module as shown below: BUG: kernel NULL pointer dereference, address: 0000000000000040 Oops: 0000 [#1] SMP PTI CPU: 5 PID: 13840 Comm: kworker/u17:33 Not tainted 5.14.0+ #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) Workqueue: knbd16-recv recv_work [nbd] RIP: 0010:nbd_read_stat.cold+0x130/0x1a4 [nbd] Call Trace: recv_work+0x3b/0xb0 [nbd] process_one_work+0x1ed/0x390 worker_thread+0x4a/0x3d0 kthread+0x12a/0x150 ret_from_fork+0x22/0x30 Fixing it by checking the return value of try_module_get() in nbd_alloc_config(). As nbd_alloc_config() may return ERR_PTR(-ENODEV), assign nbd->config only when nbd_alloc_config() succeeds to ensure the value of nbd->config is binary (valid or NULL). Also adding a debug message to check the reference counter of nbd_config during module removal. Signed-off-by: Hou Tao Signed-off-by: Yu Kuai Reviewed-by: Josef Bacik Link: https://lore.kernel.org/r/20220521073749.3146892-3-yukuai3@huawei.com Signed-off-by: Jens Axboe Reviewed-by: Jason Yan Signed-off-by: Zheng Zengkai --- drivers/block/nbd.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index a12daca0f92a..873ce6f71d85 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1530,15 +1530,20 @@ static struct nbd_config *nbd_alloc_config(void) { struct nbd_config *config; + if (!try_module_get(THIS_MODULE)) + return ERR_PTR(-ENODEV); + config = kzalloc(sizeof(struct nbd_config), GFP_NOFS); - if (!config) - return NULL; + if (!config) { + module_put(THIS_MODULE); + return ERR_PTR(-ENOMEM); + } + atomic_set(&config->recv_threads, 0); init_waitqueue_head(&config->recv_wq); init_waitqueue_head(&config->conn_wait); config->blksize = NBD_DEF_BLKSIZE; atomic_set(&config->live_connections, 0); - try_module_get(THIS_MODULE); return config; } @@ -1565,12 +1570,13 @@ static int nbd_open(struct block_device *bdev, fmode_t mode) mutex_unlock(&nbd->config_lock); goto out; } - config = nbd->config = nbd_alloc_config(); - if (!config) { - ret = -ENOMEM; + config = nbd_alloc_config(); + if (IS_ERR(config)) { + ret = PTR_ERR(config); mutex_unlock(&nbd->config_lock); goto out; } + nbd->config = config; refcount_set(&nbd->config_refs, 1); refcount_inc(&nbd->refs); mutex_unlock(&nbd->config_lock); @@ -2005,13 +2011,14 @@ static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info) nbd_put(nbd); return -EINVAL; } - config = nbd->config = nbd_alloc_config(); - if (!nbd->config) { + config = nbd_alloc_config(); + if (IS_ERR(config)) { mutex_unlock(&nbd->config_lock); nbd_put(nbd); printk(KERN_ERR "nbd: couldn't allocate config\n"); - return -ENOMEM; + return PTR_ERR(config); } + nbd->config = config; refcount_set(&nbd->config_refs, 1); set_bit(NBD_RT_BOUND, &config->runtime_flags); @@ -2552,6 +2559,9 @@ static void __exit nbd_cleanup(void) while (!list_empty(&del_list)) { nbd = list_first_entry(&del_list, struct nbd_device, list); list_del_init(&nbd->list); + if (refcount_read(&nbd->config_refs)) + printk(KERN_ERR "nbd: possibly leaking nbd_config (ref %d)\n", + refcount_read(&nbd->config_refs)); if (refcount_read(&nbd->refs) != 1) printk(KERN_ERR "nbd: possibly leaking a device\n"); nbd_put(nbd); -- Gitee From 353a77e50e3931a82be3cdee8c71d7ef29726672 Mon Sep 17 00:00:00 2001 From: Duoming Zhou Date: Tue, 19 Jul 2022 16:51:10 +0800 Subject: [PATCH 475/674] net: rose: fix UAF bugs caused by timer handler stable inclusion from stable-v5.10.129 commit 8f74cb27c2b4872fd14bf046201fa7b36a46885e category: bugfix bugzilla: 187170 https://gitee.com/src-openeuler/kernel/issues/I5FNWN CVE: CVE-2022-2318 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?h=linux-5.10.y&id=8f74cb27c2b4872fd14bf046201fa7b36a46885e -------------------------------- commit 9cc02ede696272c5271a401e4f27c262359bc2f6 upstream. There are UAF bugs in rose_heartbeat_expiry(), rose_timer_expiry() and rose_idletimer_expiry(). The root cause is that del_timer() could not stop the timer handler that is running and the refcount of sock is not managed properly. One of the UAF bugs is shown below: (thread 1) | (thread 2) | rose_bind | rose_connect | rose_start_heartbeat rose_release | (wait a time) case ROSE_STATE_0 | rose_destroy_socket | rose_heartbeat_expiry rose_stop_heartbeat | sock_put(sk) | ... sock_put(sk) // FREE | | bh_lock_sock(sk) // USE The sock is deallocated by sock_put() in rose_release() and then used by bh_lock_sock() in rose_heartbeat_expiry(). Although rose_destroy_socket() calls rose_stop_heartbeat(), it could not stop the timer that is running. The KASAN report triggered by POC is shown below: BUG: KASAN: use-after-free in _raw_spin_lock+0x5a/0x110 Write of size 4 at addr ffff88800ae59098 by task swapper/3/0 ... Call Trace: dump_stack_lvl+0xbf/0xee print_address_description+0x7b/0x440 print_report+0x101/0x230 ? irq_work_single+0xbb/0x140 ? _raw_spin_lock+0x5a/0x110 kasan_report+0xed/0x120 ? _raw_spin_lock+0x5a/0x110 kasan_check_range+0x2bd/0x2e0 _raw_spin_lock+0x5a/0x110 rose_heartbeat_expiry+0x39/0x370 ? rose_start_heartbeat+0xb0/0xb0 call_timer_fn+0x2d/0x1c0 ? rose_start_heartbeat+0xb0/0xb0 expire_timers+0x1f3/0x320 __run_timers+0x3ff/0x4d0 run_timer_softirq+0x41/0x80 __do_softirq+0x233/0x544 irq_exit_rcu+0x41/0xa0 sysvec_apic_timer_interrupt+0x8c/0xb0 asm_sysvec_apic_timer_interrupt+0x1b/0x20 RIP: 0010:default_idle+0xb/0x10 RSP: 0018:ffffc9000012fea0 EFLAGS: 00000202 RAX: 000000000000bcae RBX: ffff888006660f00 RCX: 000000000000bcae RDX: 0000000000000001 RSI: ffffffff843a11c0 RDI: ffffffff843a1180 RBP: dffffc0000000000 R08: dffffc0000000000 R09: ffffed100da36d46 R10: dfffe9100da36d47 R11: ffffffff83cf0950 R12: 0000000000000000 R13: 1ffff11000ccc1e0 R14: ffffffff8542af28 R15: dffffc0000000000 ... Allocated by task 146: __kasan_kmalloc+0xc4/0xf0 sk_prot_alloc+0xdd/0x1a0 sk_alloc+0x2d/0x4e0 rose_create+0x7b/0x330 __sock_create+0x2dd/0x640 __sys_socket+0xc7/0x270 __x64_sys_socket+0x71/0x80 do_syscall_64+0x43/0x90 entry_SYSCALL_64_after_hwframe+0x46/0xb0 Freed by task 152: kasan_set_track+0x4c/0x70 kasan_set_free_info+0x1f/0x40 ____kasan_slab_free+0x124/0x190 kfree+0xd3/0x270 __sk_destruct+0x314/0x460 rose_release+0x2fa/0x3b0 sock_close+0xcb/0x230 __fput+0x2d9/0x650 task_work_run+0xd6/0x160 exit_to_user_mode_loop+0xc7/0xd0 exit_to_user_mode_prepare+0x4e/0x80 syscall_exit_to_user_mode+0x20/0x40 do_syscall_64+0x4f/0x90 entry_SYSCALL_64_after_hwframe+0x46/0xb0 This patch adds refcount of sock when we use functions such as rose_start_heartbeat() and so on to start timer, and decreases the refcount of sock when timer is finished or deleted by functions such as rose_stop_heartbeat() and so on. As a result, the UAF bugs could be mitigated. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Duoming Zhou Tested-by: Duoming Zhou Link: https://lore.kernel.org/r/20220629002640.5693-1-duoming@zju.edu.cn Signed-off-by: Paolo Abeni Signed-off-by: Greg Kroah-Hartman Signed-off-by: Xu Jia Reviewed-by: Wei Yongjun Reviewed-by: Wang Weiyang Signed-off-by: Zheng Zengkai --- net/rose/rose_timer.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/net/rose/rose_timer.c b/net/rose/rose_timer.c index b3138fc2e552..f06ddbed3fed 100644 --- a/net/rose/rose_timer.c +++ b/net/rose/rose_timer.c @@ -31,89 +31,89 @@ static void rose_idletimer_expiry(struct timer_list *); void rose_start_heartbeat(struct sock *sk) { - del_timer(&sk->sk_timer); + sk_stop_timer(sk, &sk->sk_timer); sk->sk_timer.function = rose_heartbeat_expiry; sk->sk_timer.expires = jiffies + 5 * HZ; - add_timer(&sk->sk_timer); + sk_reset_timer(sk, &sk->sk_timer, sk->sk_timer.expires); } void rose_start_t1timer(struct sock *sk) { struct rose_sock *rose = rose_sk(sk); - del_timer(&rose->timer); + sk_stop_timer(sk, &rose->timer); rose->timer.function = rose_timer_expiry; rose->timer.expires = jiffies + rose->t1; - add_timer(&rose->timer); + sk_reset_timer(sk, &rose->timer, rose->timer.expires); } void rose_start_t2timer(struct sock *sk) { struct rose_sock *rose = rose_sk(sk); - del_timer(&rose->timer); + sk_stop_timer(sk, &rose->timer); rose->timer.function = rose_timer_expiry; rose->timer.expires = jiffies + rose->t2; - add_timer(&rose->timer); + sk_reset_timer(sk, &rose->timer, rose->timer.expires); } void rose_start_t3timer(struct sock *sk) { struct rose_sock *rose = rose_sk(sk); - del_timer(&rose->timer); + sk_stop_timer(sk, &rose->timer); rose->timer.function = rose_timer_expiry; rose->timer.expires = jiffies + rose->t3; - add_timer(&rose->timer); + sk_reset_timer(sk, &rose->timer, rose->timer.expires); } void rose_start_hbtimer(struct sock *sk) { struct rose_sock *rose = rose_sk(sk); - del_timer(&rose->timer); + sk_stop_timer(sk, &rose->timer); rose->timer.function = rose_timer_expiry; rose->timer.expires = jiffies + rose->hb; - add_timer(&rose->timer); + sk_reset_timer(sk, &rose->timer, rose->timer.expires); } void rose_start_idletimer(struct sock *sk) { struct rose_sock *rose = rose_sk(sk); - del_timer(&rose->idletimer); + sk_stop_timer(sk, &rose->idletimer); if (rose->idle > 0) { rose->idletimer.function = rose_idletimer_expiry; rose->idletimer.expires = jiffies + rose->idle; - add_timer(&rose->idletimer); + sk_reset_timer(sk, &rose->idletimer, rose->idletimer.expires); } } void rose_stop_heartbeat(struct sock *sk) { - del_timer(&sk->sk_timer); + sk_stop_timer(sk, &sk->sk_timer); } void rose_stop_timer(struct sock *sk) { - del_timer(&rose_sk(sk)->timer); + sk_stop_timer(sk, &rose_sk(sk)->timer); } void rose_stop_idletimer(struct sock *sk) { - del_timer(&rose_sk(sk)->idletimer); + sk_stop_timer(sk, &rose_sk(sk)->idletimer); } static void rose_heartbeat_expiry(struct timer_list *t) @@ -130,6 +130,7 @@ static void rose_heartbeat_expiry(struct timer_list *t) (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_DEAD))) { bh_unlock_sock(sk); rose_destroy_socket(sk); + sock_put(sk); return; } break; @@ -152,6 +153,7 @@ static void rose_heartbeat_expiry(struct timer_list *t) rose_start_heartbeat(sk); bh_unlock_sock(sk); + sock_put(sk); } static void rose_timer_expiry(struct timer_list *t) @@ -181,6 +183,7 @@ static void rose_timer_expiry(struct timer_list *t) break; } bh_unlock_sock(sk); + sock_put(sk); } static void rose_idletimer_expiry(struct timer_list *t) @@ -205,4 +208,5 @@ static void rose_idletimer_expiry(struct timer_list *t) sock_set_flag(sk, SOCK_DEAD); } bh_unlock_sock(sk); + sock_put(sk); } -- Gitee From 75b7fe6665a03a3430bf18ac4daca685b1baf5e6 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 19 Jul 2022 16:51:11 +0800 Subject: [PATCH 476/674] Revert "mm/filemap: fix that first page is not mark accessed in filemap_read()" hulk inclusion category: bugfix bugzilla: 186896, https://gitee.com/src-openeuler/kernel/issues/I5GZC8 CVE: NA -------------------------------- This reverts commit 499ecade21fc377c04ec66ed7f0505f1ca74d755. Prepare to backport solution from mainline. Signed-off-by: Yu Kuai Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- mm/filemap.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index ebae261f9df9..9e209e8a3b0d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2551,11 +2551,10 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, flush_dcache_page(pages[i]); copied = copy_page_to_iter(pages[i], offset, bytes, iter); - if (copied) { - ra->prev_pos = iocb->ki_pos; - written += copied; - iocb->ki_pos += copied; - } + + written += copied; + iocb->ki_pos += copied; + ra->prev_pos = iocb->ki_pos; if (copied < bytes) { error = -EFAULT; -- Gitee From 939325bbad27dc99b8f8cea2791192e5f81b76de Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 19 Jul 2022 16:51:12 +0800 Subject: [PATCH 477/674] filemap: Correct the conditions for marking a folio as accessed mainline inclusion from mainline-5.19-rc4 commit 5ccc944dce3df5fd2fd683a7df4fd49d1068eba2 category: bugfix bugzilla: 186896, https://gitee.com/src-openeuler/kernel/issues/I5GZC8 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=5ccc944dce3df5fd2fd683a7df4fd49d1068eba2 ------------------------------------------------- We had an off-by-one error which meant that we never marked the first page in a read as accessed. This was visible as a slowdown when re-reading a file as pages were being evicted from cache too soon. In reviewing this code, we noticed a second bug where a multi-page folio would be marked as accessed multiple times when doing reads that were less than the size of the folio. Abstract the comparison of whether two file positions are in the same folio into a new function, fixing both of these bugs. Reported-by: Yu Kuai Reviewed-by: Kent Overstreet Signed-off-by: Matthew Wilcox (Oracle) Conflict: folios is not supported yet Signed-off-by: Yu Kuai Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- mm/filemap.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 9e209e8a3b0d..edb94663c5df 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2437,6 +2437,13 @@ static int generic_file_buffered_read_get_pages(struct kiocb *iocb, goto find_page; } +static inline bool pos_same_page(loff_t pos1, loff_t pos2, struct page *page) +{ + unsigned int shift = page_shift(page); + + return (pos1 >> shift == pos2 >> shift); +} + /** * generic_file_buffered_read - generic file read routine * @iocb: the iocb to read @@ -2527,11 +2534,10 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, writably_mapped = mapping_writably_mapped(mapping); /* - * When a sequential read accesses a page several times, only + * When a read accesses a page several times, only * mark it as accessed the first time. */ - if (iocb->ki_pos >> PAGE_SHIFT != - ra->prev_pos >> PAGE_SHIFT) + if (pos_same_page(iocb->ki_pos, ra->prev_pos -1, pages[0])) mark_page_accessed(pages[0]); for (i = 1; i < pg_nr; i++) mark_page_accessed(pages[i]); -- Gitee From 5399ee65708b987d36cb7ad7fbb33e5e8aff126d Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 19 Jul 2022 16:51:13 +0800 Subject: [PATCH 478/674] nbd: don't clear 'NBD_CMD_INFLIGHT' flag if request is not completed mainline inclusion from mainline-v5.19-rc1 commit 2895f1831e911ca87d4efdf43e35eb72a0c7e66e category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5H32C CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=2895f1831e911ca87d4efdf43e35eb72a0c7e66e -------------------------------- Otherwise io will hung because request will only be completed if the cmd has the flag 'NBD_CMD_INFLIGHT'. Fixes: 07175cb1baf4 ("nbd: make sure request completion won't concurrent") Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20220521073749.3146892-4-yukuai3@huawei.com Signed-off-by: Jens Axboe Reviewed-by: Jason Yan Signed-off-by: Zheng Zengkai --- drivers/block/nbd.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 873ce6f71d85..f7bc67e74390 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -395,13 +395,14 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req, if (!mutex_trylock(&cmd->lock)) return BLK_EH_RESET_TIMER; - if (!__test_and_clear_bit(NBD_CMD_INFLIGHT, &cmd->flags)) { + if (!test_bit(NBD_CMD_INFLIGHT, &cmd->flags)) { mutex_unlock(&cmd->lock); return BLK_EH_DONE; } if (!refcount_inc_not_zero(&nbd->config_refs)) { cmd->status = BLK_STS_TIMEOUT; + __clear_bit(NBD_CMD_INFLIGHT, &cmd->flags); mutex_unlock(&cmd->lock); goto done; } @@ -470,6 +471,7 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req, dev_err_ratelimited(nbd_to_dev(nbd), "Connection timed out\n"); set_bit(NBD_RT_TIMEDOUT, &config->runtime_flags); cmd->status = BLK_STS_IOERR; + __clear_bit(NBD_CMD_INFLIGHT, &cmd->flags); mutex_unlock(&cmd->lock); sock_shutdown(nbd); nbd_config_put(nbd); @@ -737,7 +739,7 @@ static struct nbd_cmd *nbd_handle_reply(struct nbd_device *nbd, int index, cmd = blk_mq_rq_to_pdu(req); mutex_lock(&cmd->lock); - if (!__test_and_clear_bit(NBD_CMD_INFLIGHT, &cmd->flags)) { + if (!test_bit(NBD_CMD_INFLIGHT, &cmd->flags)) { dev_err(disk_to_dev(nbd->disk), "Suspicious reply %d (status %u flags %lu)", tag, cmd->status, cmd->flags); ret = -ENOENT; @@ -844,8 +846,16 @@ static void recv_work(struct work_struct *work) } rq = blk_mq_rq_from_pdu(cmd); - if (likely(!blk_should_fake_timeout(rq->q))) - blk_mq_complete_request(rq); + if (likely(!blk_should_fake_timeout(rq->q))) { + bool complete; + + mutex_lock(&cmd->lock); + complete = __test_and_clear_bit(NBD_CMD_INFLIGHT, + &cmd->flags); + mutex_unlock(&cmd->lock); + if (complete) + blk_mq_complete_request(rq); + } percpu_ref_put(&q->q_usage_counter); } -- Gitee From bc5bd4749331fde731378d46148191ac6403843b Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 19 Jul 2022 16:51:14 +0800 Subject: [PATCH 479/674] nbd: fix io hung while disconnecting device mainline inclusion from mainline-v5.19-rc1 commit 09dadb5985023e27d4740ebd17e6fea4640110e5 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5H32C CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=09dadb5985023e27d4740ebd17e6fea4640110e5 -------------------------------- In our tests, "qemu-nbd" triggers a io hung: INFO: task qemu-nbd:11445 blocked for more than 368 seconds. Not tainted 5.18.0-rc3-next-20220422-00003-g2176915513ca #884 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. task:qemu-nbd state:D stack: 0 pid:11445 ppid: 1 flags:0x00000000 Call Trace: __schedule+0x480/0x1050 ? _raw_spin_lock_irqsave+0x3e/0xb0 schedule+0x9c/0x1b0 blk_mq_freeze_queue_wait+0x9d/0xf0 ? ipi_rseq+0x70/0x70 blk_mq_freeze_queue+0x2b/0x40 nbd_add_socket+0x6b/0x270 [nbd] nbd_ioctl+0x383/0x510 [nbd] blkdev_ioctl+0x18e/0x3e0 __x64_sys_ioctl+0xac/0x120 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7fd8ff706577 RSP: 002b:00007fd8fcdfebf8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 0000000040000000 RCX: 00007fd8ff706577 RDX: 000000000000000d RSI: 000000000000ab00 RDI: 000000000000000f RBP: 000000000000000f R08: 000000000000fbe8 R09: 000055fe497c62b0 R10: 00000002aff20000 R11: 0000000000000246 R12: 000000000000006d R13: 0000000000000000 R14: 00007ffe82dc5e70 R15: 00007fd8fcdff9c0 "qemu-ndb -d" will call ioctl 'NBD_DISCONNECT' first, however, following message was found: block nbd0: Send disconnect failed -32 Which indicate that something is wrong with the server. Then, "qemu-nbd -d" will call ioctl 'NBD_CLEAR_SOCK', however ioctl can't clear requests after commit 2516ab1543fd("nbd: only clear the queue on device teardown"). And in the meantime, request can't complete through timeout because nbd_xmit_timeout() will always return 'BLK_EH_RESET_TIMER', which means such request will never be completed in this situation. Now that the flag 'NBD_CMD_INFLIGHT' can make sure requests won't complete multiple times, switch back to call nbd_clear_sock() in nbd_clear_sock_ioctl(), so that inflight requests can be cleared. Signed-off-by: Yu Kuai Reviewed-by: Josef Bacik Link: https://lore.kernel.org/r/20220521073749.3146892-5-yukuai3@huawei.com Signed-off-by: Jens Axboe Reviewed-by: Jason Yan Signed-off-by: Zheng Zengkai --- drivers/block/nbd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index f7bc67e74390..b45f4e5585b5 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1424,7 +1424,7 @@ static int nbd_start_device_ioctl(struct nbd_device *nbd, struct block_device *b static void nbd_clear_sock_ioctl(struct nbd_device *nbd, struct block_device *bdev) { - sock_shutdown(nbd); + nbd_clear_sock(nbd); __invalidate_device(bdev, true); nbd_bdev_reset(bdev); if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF, -- Gitee From 7a36a8c476f698de1351166d7f661e29e0404e3d Mon Sep 17 00:00:00 2001 From: Oleksandr Tyshchenko Date: Tue, 19 Jul 2022 16:51:15 +0800 Subject: [PATCH 480/674] xen/arm: Fix race in RB-tree based P2M accounting mainline inclusion from mainline-v5.19-rc6 commit b75cd218274e01d026dc5240e86fdeb44bbed0c8 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5G5TV CVE: CVE-2022-33744 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b75cd218274e01d026dc5240e86fdeb44bbed0c8 -------------------------------- xen/arm: Fix race in RB-tree based P2M accounting During the PV driver life cycle the mappings are added to the RB-tree by set_foreign_p2m_mapping(), which is called from gnttab_map_refs() and are removed by clear_foreign_p2m_mapping() which is called from gnttab_unmap_refs(). As both functions end up calling __set_phys_to_machine_multi() which updates the RB-tree, this function can be called concurrently. There is already a "p2m_lock" to protect against concurrent accesses, but the problem is that the first read of "phys_to_mach.rb_node" in __set_phys_to_machine_multi() is not covered by it, so this might lead to the incorrect mappings update (removing in our case) in RB-tree. In my environment the related issue happens rarely and only when PV net backend is running, the xen_add_phys_to_mach_entry() claims that it cannot add new pfn <-> mfn mapping to the tree since it is already exists which results in a failure when mapping foreign pages. But there might be other bad consequences related to the non-protected root reads such use-after-free, etc. While at it, also fix the similar usage in __pfn_to_mfn(), so initialize "struct rb_node *n" with the "p2m_lock" held in both functions to avoid possible bad consequences. This is CVE-2022-33744 / XSA-406. Signed-off-by: Oleksandr Tyshchenko Reviewed-by: Stefano Stabellini Signed-off-by: Juergen Gross Signed-off-by: Zhao Wenhui Reviewed-by: Xiu Jianfeng Reviewed-by: Chen Hui Signed-off-by: Zheng Zengkai --- arch/arm/xen/p2m.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/arm/xen/p2m.c b/arch/arm/xen/p2m.c index acb464547a54..4a1991a103ea 100644 --- a/arch/arm/xen/p2m.c +++ b/arch/arm/xen/p2m.c @@ -62,11 +62,12 @@ static int xen_add_phys_to_mach_entry(struct xen_p2m_entry *new) unsigned long __pfn_to_mfn(unsigned long pfn) { - struct rb_node *n = phys_to_mach.rb_node; + struct rb_node *n; struct xen_p2m_entry *entry; unsigned long irqflags; read_lock_irqsave(&p2m_lock, irqflags); + n = phys_to_mach.rb_node; while (n) { entry = rb_entry(n, struct xen_p2m_entry, rbnode_phys); if (entry->pfn <= pfn && @@ -153,10 +154,11 @@ bool __set_phys_to_machine_multi(unsigned long pfn, int rc; unsigned long irqflags; struct xen_p2m_entry *p2m_entry; - struct rb_node *n = phys_to_mach.rb_node; + struct rb_node *n; if (mfn == INVALID_P2M_ENTRY) { write_lock_irqsave(&p2m_lock, irqflags); + n = phys_to_mach.rb_node; while (n) { p2m_entry = rb_entry(n, struct xen_p2m_entry, rbnode_phys); if (p2m_entry->pfn <= pfn && -- Gitee From 5809235f957616012f0a5a38f53a8f155588b457 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 19 Jul 2022 16:51:16 +0800 Subject: [PATCH 481/674] crypto: hisilicon - use dev_driver_string() instead of pci_dev->driver->name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mainline inclusion from mainline-v5.16-rc1 commit 1fbbcffd0ee1 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=1fbbcffd0ee1 ---------------------------------------------------------------------- Replace dev->driver_name() by dev_driver_string() for the corresponding struct device. This is a step toward removing pci_dev->driver. [bhelgaas: split to separate patch] Link: https://lore.kernel.org/r/20211004125935.2300113-8-u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König Signed-off-by: Bjorn Helgaas Signed-off-by: Jiangshui Yang Reviewed-by: Yang Shen Acked-by: Xie XiuQi Signed-off-by: Zheng Zengkai --- drivers/crypto/hisilicon/qm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index 57de083e8967..56e19062ba44 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -3220,7 +3220,7 @@ static int qm_alloc_uacce(struct hisi_qm *qm) }; int ret; - ret = strscpy(interface.name, pdev->driver->name, + ret = strscpy(interface.name, dev_driver_string(&pdev->dev), sizeof(interface.name)); if (ret < 0) return -ENAMETOOLONG; -- Gitee From 6c0fc54930cb6f16f1d7c8b8a2d0ce65778493c4 Mon Sep 17 00:00:00 2001 From: Yang Guang Date: Tue, 19 Jul 2022 16:51:17 +0800 Subject: [PATCH 482/674] crypto: hisilicon/hpre - use swap() to make code cleaner mainline inclusion from mainline-v5.17-rc1 commit 574c833ef3a6 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=574c833ef3a6 ---------------------------------------------------------------------- Use the macro 'swap()' defined in 'include/linux/minmax.h' to avoid opencoding it. Reported-by: Zeal Robot Signed-off-by: Yang Guang Signed-off-by: Herbert Xu Signed-off-by: Jiangshui Yang Reviewed-by: Yang Shen Acked-by: Xie XiuQi Signed-off-by: Zheng Zengkai --- drivers/crypto/hisilicon/hpre/hpre_crypto.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/crypto/hisilicon/hpre/hpre_crypto.c b/drivers/crypto/hisilicon/hpre/hpre_crypto.c index a032c192ef1d..0f1724d355b8 100644 --- a/drivers/crypto/hisilicon/hpre/hpre_crypto.c +++ b/drivers/crypto/hisilicon/hpre/hpre_crypto.c @@ -1177,13 +1177,10 @@ static void hpre_rsa_exit_tfm(struct crypto_akcipher *tfm) static void hpre_key_to_big_end(u8 *data, int len) { int i, j; - u8 tmp; for (i = 0; i < len / 2; i++) { j = len - i - 1; - tmp = data[j]; - data[j] = data[i]; - data[i] = tmp; + swap(data[j], data[i]); } } -- Gitee From f7093db71f3180d4abb378debc5b3206c58e0390 Mon Sep 17 00:00:00 2001 From: Kai Ye Date: Tue, 19 Jul 2022 16:51:18 +0800 Subject: [PATCH 483/674] crypto: hisilicon - modify the value of engine type rate mainline inclusion from mainline-v5.17-rc1 commit 376a5c3cdd7c category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=376a5c3cdd7c ---------------------------------------------------------------------- Modify the value of type rate from new QM spec. Signed-off-by: Kai Ye Signed-off-by: Herbert Xu Signed-off-by: Jiangshui Yang Reviewed-by: Yang Shen Acked-by: Xie XiuQi Signed-off-by: Zheng Zengkai --- drivers/crypto/hisilicon/hpre/hpre_main.c | 2 +- drivers/crypto/hisilicon/sec2/sec_main.c | 2 +- drivers/crypto/hisilicon/zip/zip_main.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/crypto/hisilicon/hpre/hpre_main.c b/drivers/crypto/hisilicon/hpre/hpre_main.c index 65a641396c07..ebfab3e14499 100644 --- a/drivers/crypto/hisilicon/hpre/hpre_main.c +++ b/drivers/crypto/hisilicon/hpre/hpre_main.c @@ -103,7 +103,7 @@ #define HPRE_QM_PM_FLR BIT(11) #define HPRE_QM_SRIOV_FLR BIT(12) -#define HPRE_SHAPER_TYPE_RATE 128 +#define HPRE_SHAPER_TYPE_RATE 640 #define HPRE_VIA_MSI_DSM 1 #define HPRE_SQE_MASK_OFFSET 8 #define HPRE_SQE_MASK_LEN 24 diff --git a/drivers/crypto/hisilicon/sec2/sec_main.c b/drivers/crypto/hisilicon/sec2/sec_main.c index 90551bf38b52..26d3ab1d308b 100644 --- a/drivers/crypto/hisilicon/sec2/sec_main.c +++ b/drivers/crypto/hisilicon/sec2/sec_main.c @@ -105,7 +105,7 @@ #define SEC_SQE_MASK_OFFSET 64 #define SEC_SQE_MASK_LEN 48 -#define SEC_SHAPER_TYPE_RATE 128 +#define SEC_SHAPER_TYPE_RATE 400 struct sec_hw_error { u32 int_msk; diff --git a/drivers/crypto/hisilicon/zip/zip_main.c b/drivers/crypto/hisilicon/zip/zip_main.c index 9c4fb0af5ebd..678f8b58ec42 100644 --- a/drivers/crypto/hisilicon/zip/zip_main.c +++ b/drivers/crypto/hisilicon/zip/zip_main.c @@ -103,8 +103,8 @@ #define HZIP_PREFETCH_ENABLE (~(BIT(26) | BIT(17) | BIT(0))) #define HZIP_SVA_PREFETCH_DISABLE BIT(26) #define HZIP_SVA_DISABLE_READY (BIT(26) | BIT(30)) -#define HZIP_SHAPER_RATE_COMPRESS 252 -#define HZIP_SHAPER_RATE_DECOMPRESS 229 +#define HZIP_SHAPER_RATE_COMPRESS 750 +#define HZIP_SHAPER_RATE_DECOMPRESS 140 #define HZIP_DELAY_1_US 1 #define HZIP_POLL_TIMEOUT_US 1000 -- Gitee From 42b23856b36ff79226eb4e9022e9ed1b1a55e56a Mon Sep 17 00:00:00 2001 From: Kai Ye Date: Tue, 19 Jul 2022 16:51:19 +0800 Subject: [PATCH 484/674] crypto: hisilicon/qm - modify the value of qos initialization mainline inclusion from mainline-v5.17-rc1 commit ecc7169d4f73 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ecc7169d4f73 ---------------------------------------------------------------------- The value of qos should be reset after flr resetting or device resetting. So set the max of qos value for every function. Then update the value of qos when user writing the alg_qos. Signed-off-by: Kai Ye Signed-off-by: Herbert Xu Signed-off-by: Jiangshui Yang Reviewed-by: Yang Shen Acked-by: Xie XiuQi Signed-off-by: Zheng Zengkai --- drivers/crypto/hisilicon/qm.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index 56e19062ba44..fea69f7aadba 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -1257,10 +1257,10 @@ static int qm_set_vft_common(struct hisi_qm *qm, enum vft_type type, static int qm_shaper_init_vft(struct hisi_qm *qm, u32 fun_num) { + u32 qos = qm->factor[fun_num].func_qos; int ret, i; - qm->factor[fun_num].func_qos = QM_QOS_MAX_VAL; - ret = qm_get_shaper_para(QM_QOS_MAX_VAL * QM_QOS_RATE, &qm->factor[fun_num]); + ret = qm_get_shaper_para(qos * QM_QOS_RATE, &qm->factor[fun_num]); if (ret) { dev_err(&qm->pdev->dev, "failed to calculate shaper parameter!\n"); return ret; @@ -5838,13 +5838,15 @@ static int hisi_qp_alloc_memory(struct hisi_qm *qm) static int hisi_qm_memory_init(struct hisi_qm *qm) { struct device *dev = &qm->pdev->dev; - int ret, total_vfs; + int ret, total_func, i; size_t off = 0; - total_vfs = pci_sriov_get_totalvfs(qm->pdev); - qm->factor = kcalloc(total_vfs + 1, sizeof(struct qm_shaper_factor), GFP_KERNEL); + total_func = pci_sriov_get_totalvfs(qm->pdev) + 1; + qm->factor = kcalloc(total_func, sizeof(struct qm_shaper_factor), GFP_KERNEL); if (!qm->factor) return -ENOMEM; + for (i = 0; i < total_func; i++) + qm->factor[i].func_qos = QM_QOS_MAX_VAL; #define QM_INIT_BUF(qm, type, num) do { \ (qm)->type = ((qm)->qdma.va + (off)); \ -- Gitee From 9412be3d72142ad991e79d082a2742305f69fcf3 Mon Sep 17 00:00:00 2001 From: Kai Ye Date: Tue, 19 Jul 2022 16:51:20 +0800 Subject: [PATCH 485/674] crypto: hisilicon/qm - some optimizations of ths qos write process mainline inclusion from mainline-v5.17-rc1 commit 488f30d4b8b3 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=488f30d4b8b3 ---------------------------------------------------------------------- 1. Optimize overly long functions. 2. Fix the format symbol does not match the actual type. 3. Use the PCI_DEVFN to get the function id. Signed-off-by: Kai Ye Signed-off-by: Herbert Xu Signed-off-by: Jiangshui Yang Reviewed-by: Yang Shen Acked-by: Xie XiuQi Signed-off-by: Zheng Zengkai --- drivers/crypto/hisilicon/qm.c | 87 ++++++++++++++++++----------------- 1 file changed, 46 insertions(+), 41 deletions(-) diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index fea69f7aadba..9e578eba295f 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -4352,66 +4352,69 @@ static ssize_t qm_qos_value_init(const char *buf, unsigned long *val) return 0; } +static ssize_t qm_get_qos_value(struct hisi_qm *qm, const char *buf, + unsigned long *val, + unsigned int *fun_index) +{ + char tbuf_bdf[QM_DBG_READ_LEN] = {0}; + char val_buf[QM_QOS_VAL_MAX_LEN] = {0}; + u32 tmp1, device, function; + int ret, bus; + + ret = sscanf(buf, "%s %s", tbuf_bdf, val_buf); + if (ret != QM_QOS_PARAM_NUM) + return -EINVAL; + + ret = qm_qos_value_init(val_buf, val); + if (*val == 0 || *val > QM_QOS_MAX_VAL || ret) { + pci_err(qm->pdev, "input qos value is error, please set 1~1000!\n"); + return -EINVAL; + } + + ret = sscanf(tbuf_bdf, "%u:%x:%u.%u", &tmp1, &bus, &device, &function); + if (ret != QM_QOS_BDF_PARAM_NUM) { + pci_err(qm->pdev, "input pci bdf value is error!\n"); + return -EINVAL; + } + + *fun_index = PCI_DEVFN(device, function); + + return 0; +} + static ssize_t qm_algqos_write(struct file *filp, const char __user *buf, size_t count, loff_t *pos) { struct hisi_qm *qm = filp->private_data; char tbuf[QM_DBG_READ_LEN]; - int tmp1, bus, device, function; - char tbuf_bdf[QM_DBG_READ_LEN] = {0}; - char val_buf[QM_QOS_VAL_MAX_LEN] = {0}; unsigned int fun_index; - unsigned long val = 0; + unsigned long val; int len, ret; if (qm->fun_type == QM_HW_VF) return -EINVAL; - /* Mailbox and reset cannot be operated at the same time */ - if (test_and_set_bit(QM_RESETTING, &qm->misc_ctl)) { - pci_err(qm->pdev, "dev resetting, write alg qos failed!\n"); - return -EAGAIN; - } - - if (*pos != 0) { - ret = 0; - goto err_get_status; - } + if (*pos != 0) + return 0; - if (count >= QM_DBG_READ_LEN) { - ret = -ENOSPC; - goto err_get_status; - } + if (count >= QM_DBG_READ_LEN) + return -ENOSPC; len = simple_write_to_buffer(tbuf, QM_DBG_READ_LEN - 1, pos, buf, count); - if (len < 0) { - ret = len; - goto err_get_status; - } + if (len < 0) + return len; tbuf[len] = '\0'; - ret = sscanf(tbuf, "%s %s", tbuf_bdf, val_buf); - if (ret != QM_QOS_PARAM_NUM) { - ret = -EINVAL; - goto err_get_status; - } - - ret = qm_qos_value_init(val_buf, &val); - if (val == 0 || val > QM_QOS_MAX_VAL || ret) { - pci_err(qm->pdev, "input qos value is error, please set 1~1000!\n"); - ret = -EINVAL; - goto err_get_status; - } + ret = qm_get_qos_value(qm, tbuf, &val, &fun_index); + if (ret) + return ret; - ret = sscanf(tbuf_bdf, "%d:%x:%d.%d", &tmp1, &bus, &device, &function); - if (ret != QM_QOS_BDF_PARAM_NUM) { - pci_err(qm->pdev, "input pci bdf value is error!\n"); - ret = -EINVAL; - goto err_get_status; + /* Mailbox and reset cannot be operated at the same time */ + if (test_and_set_bit(QM_RESETTING, &qm->misc_ctl)) { + pci_err(qm->pdev, "dev resetting, write alg qos failed!\n"); + return -EAGAIN; } - fun_index = device * 8 + function; - ret = qm_pm_get_sync(qm); if (ret) { ret = -EINVAL; @@ -4425,6 +4428,8 @@ static ssize_t qm_algqos_write(struct file *filp, const char __user *buf, goto err_put_sync; } + pci_info(qm->pdev, "the qos value of function%u is set to %lu.\n", + fun_index, val); ret = count; err_put_sync: -- Gitee From 990c7e184747263ca37f0ca88419e62d7c6b4889 Mon Sep 17 00:00:00 2001 From: Kai Ye Date: Tue, 19 Jul 2022 16:51:21 +0800 Subject: [PATCH 486/674] crypto: hisilicon/qm - simplified the calculation of qos shaper parameters mainline inclusion from mainline-v5.17-rc1 commit 13389403fe8a category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=13389403fe8a ---------------------------------------------------------------------- Some optimize for the calculation of qos shaper parameters. and modify the comments. Signed-off-by: Kai Ye Signed-off-by: Herbert Xu Signed-off-by: Jiangshui Yang Reviewed-by: Yang Shen Acked-by: Xie XiuQi Signed-off-by: Zheng Zengkai --- drivers/crypto/hisilicon/qm.c | 84 +++++++++++++++++++++++------------ 1 file changed, 55 insertions(+), 29 deletions(-) diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index 9e578eba295f..46aeb88be4ed 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -505,10 +505,30 @@ static const char * const qp_s[] = { "none", "init", "start", "stop", "close", }; -static const u32 typical_qos_val[QM_QOS_TYPICAL_NUM] = {100, 250, 500, 1000, - 10000, 25000, 50000, 100000}; -static const u32 typical_qos_cbs_s[QM_QOS_TYPICAL_NUM] = {9, 10, 11, 12, 16, - 17, 18, 19}; +struct qm_typical_qos_table { + u32 start; + u32 end; + u32 val; +}; + +/* the qos step is 100 */ +static struct qm_typical_qos_table shaper_cir_s[] = { + {100, 100, 4}, + {200, 200, 3}, + {300, 500, 2}, + {600, 1000, 1}, + {1100, 100000, 0}, +}; + +static struct qm_typical_qos_table shaper_cbs_s[] = { + {100, 200, 9}, + {300, 500, 11}, + {600, 1000, 12}, + {1100, 10000, 16}, + {10100, 25000, 17}, + {25100, 50000, 18}, + {50100, 100000, 19} +}; static bool qm_avail_state(struct hisi_qm *qm, enum qm_state new) { @@ -1119,12 +1139,14 @@ static void qm_init_prefetch(struct hisi_qm *qm) } /* + * acc_shaper_para_calc() Get the IR value by the qos formula, the return value + * is the expected qos calculated. * the formula: * IR = X Mbps if ir = 1 means IR = 100 Mbps, if ir = 10000 means = 10Gbps * - * IR_b * (2 ^ IR_u) * 8 - * IR(Mbps) * 10 ^ -3 = ------------------------- - * Tick * (2 ^ IR_s) + * IR_b * (2 ^ IR_u) * 8000 + * IR(Mbps) = ------------------------- + * Tick * (2 ^ IR_s) */ static u32 acc_shaper_para_calc(u64 cir_b, u64 cir_u, u64 cir_s) { @@ -1134,17 +1156,28 @@ static u32 acc_shaper_para_calc(u64 cir_b, u64 cir_u, u64 cir_s) static u32 acc_shaper_calc_cbs_s(u32 ir) { + int table_size = ARRAY_SIZE(shaper_cbs_s); int i; - if (ir < typical_qos_val[0]) - return QM_SHAPER_MIN_CBS_S; + for (i = 0; i < table_size; i++) { + if (ir >= shaper_cbs_s[i].start && ir <= shaper_cbs_s[i].end) + return shaper_cbs_s[i].val; + } - for (i = 1; i < QM_QOS_TYPICAL_NUM; i++) { - if (ir >= typical_qos_val[i - 1] && ir < typical_qos_val[i]) - return typical_qos_cbs_s[i - 1]; + return QM_SHAPER_MIN_CBS_S; +} + +static u32 acc_shaper_calc_cir_s(u32 ir) +{ + int table_size = ARRAY_SIZE(shaper_cir_s); + int i; + + for (i = 0; i < table_size; i++) { + if (ir >= shaper_cir_s[i].start && ir <= shaper_cir_s[i].end) + return shaper_cir_s[i].val; } - return typical_qos_cbs_s[QM_QOS_TYPICAL_NUM - 1]; + return 0; } static int qm_get_shaper_para(u32 ir, struct qm_shaper_factor *factor) @@ -1153,25 +1186,18 @@ static int qm_get_shaper_para(u32 ir, struct qm_shaper_factor *factor) u32 error_rate; factor->cbs_s = acc_shaper_calc_cbs_s(ir); + cir_s = acc_shaper_calc_cir_s(ir); for (cir_b = QM_QOS_MIN_CIR_B; cir_b <= QM_QOS_MAX_CIR_B; cir_b++) { for (cir_u = 0; cir_u <= QM_QOS_MAX_CIR_U; cir_u++) { - for (cir_s = 0; cir_s <= QM_QOS_MAX_CIR_S; cir_s++) { - /** the formula is changed to: - * IR_b * (2 ^ IR_u) * DIVISOR_CLK - * IR(Mbps) = ------------------------- - * 768 * (2 ^ IR_s) - */ - ir_calc = acc_shaper_para_calc(cir_b, cir_u, - cir_s); - error_rate = QM_QOS_EXPAND_RATE * (u32)abs(ir_calc - ir) / ir; - if (error_rate <= QM_QOS_MIN_ERROR_RATE) { - factor->cir_b = cir_b; - factor->cir_u = cir_u; - factor->cir_s = cir_s; - - return 0; - } + ir_calc = acc_shaper_para_calc(cir_b, cir_u, cir_s); + + error_rate = QM_QOS_EXPAND_RATE * (u32)abs(ir_calc - ir) / ir; + if (error_rate <= QM_QOS_MIN_ERROR_RATE) { + factor->cir_b = cir_b; + factor->cir_u = cir_u; + factor->cir_s = cir_s; + return 0; } } } -- Gitee From deca7e9bccabbce816c9f6b03fe122d8785bcde3 Mon Sep 17 00:00:00 2001 From: Weili Qian Date: Tue, 19 Jul 2022 16:51:22 +0800 Subject: [PATCH 487/674] crypto: hisilicon/qm - fix incorrect return value of hisi_qm_resume() mainline inclusion from mainline-v5.17-rc1 commit 3f9dd4c802b9 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3f9dd4c802b9 ---------------------------------------------------------------------- When hisi_qm_resume() returns 0, it indicates that the device has started successfully. If the device fails to start, hisi_qm_resume() needs to return the actual error code to the caller instead of 0. Fixes: d7ea53395b72 ("crypto: hisilicon - add runtime PM ops") Signed-off-by: Weili Qian Signed-off-by: Herbert Xu Signed-off-by: Jiangshui Yang Reviewed-by: Yang Shen Acked-by: Xie XiuQi Signed-off-by: Zheng Zengkai --- drivers/crypto/hisilicon/qm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index 46aeb88be4ed..23933c8bd88d 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -6170,7 +6170,7 @@ int hisi_qm_resume(struct device *dev) if (ret) pci_err(pdev, "failed to start qm(%d)\n", ret); - return 0; + return ret; } EXPORT_SYMBOL_GPL(hisi_qm_resume); -- Gitee From f25cf6ed884994315fbaada94ad24dac3fb911d2 Mon Sep 17 00:00:00 2001 From: Weili Qian Date: Tue, 19 Jul 2022 16:51:23 +0800 Subject: [PATCH 488/674] crypto: hisilicon/hpre - fix memory leak in hpre_curve25519_src_init() mainline inclusion from mainline-v5.17-rc1 commit 51fa916b81e5 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=51fa916b81e5 ---------------------------------------------------------------------- hpre_curve25519_src_init() allocates memory for 'ptr' before calling memcmp(). If memcmp() returns 0, the function will return '-EINVAL' without freeing memory. Signed-off-by: Weili Qian Signed-off-by: Herbert Xu Signed-off-by: Jiangshui Yang Reviewed-by: Yang Shen Acked-by: Xie XiuQi Signed-off-by: Zheng Zengkai --- drivers/crypto/hisilicon/hpre/hpre_crypto.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/crypto/hisilicon/hpre/hpre_crypto.c b/drivers/crypto/hisilicon/hpre/hpre_crypto.c index 0f1724d355b8..97d54c1465c2 100644 --- a/drivers/crypto/hisilicon/hpre/hpre_crypto.c +++ b/drivers/crypto/hisilicon/hpre/hpre_crypto.c @@ -1862,7 +1862,7 @@ static int hpre_curve25519_src_init(struct hpre_asym_request *hpre_req, */ if (memcmp(ptr, p, ctx->key_sz) == 0) { dev_err(dev, "gx is p!\n"); - return -EINVAL; + goto err; } else if (memcmp(ptr, p, ctx->key_sz) > 0) { hpre_curve25519_src_modulo_p(ptr); } -- Gitee From 53bff6015473369f097159156f2ae9544db4d6b4 Mon Sep 17 00:00:00 2001 From: Weili Qian Date: Tue, 19 Jul 2022 16:51:24 +0800 Subject: [PATCH 489/674] crypto: hisilicon/qm - disable qm clock-gating mainline inclusion from mainline-v5.17-rc1 commit 4cee0700cf1d category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=4cee0700cf1d ---------------------------------------------------------------------- For Kunpeng930, if qm clock-gating is enabled, rate limiter will be inaccurate. Therefore, disable clock-gating before doing task. Signed-off-by: Weili Qian Signed-off-by: Herbert Xu Reviewed-by: Yang Shen Acked-by: Xie XiuQi Signed-off-by: Zheng Zengkai --- drivers/crypto/hisilicon/qm.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index 23933c8bd88d..60ca59d11702 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -126,6 +126,8 @@ #define QM_CQC_VFT 0x1 #define QM_VFT_CFG 0x100060 #define QM_VFT_CFG_OP_ENABLE 0x100054 +#define QM_PM_CTRL 0x100148 +#define QM_IDLE_DISABLE BIT(9) #define QM_VFT_CFG_DATA_L 0x100064 #define QM_VFT_CFG_DATA_H 0x100068 @@ -800,6 +802,19 @@ static void qm_db(struct hisi_qm *qm, u16 qn, u8 cmd, u16 index, u8 priority) qm->ops->qm_db(qm, qn, cmd, index, priority); } +static void qm_disable_clock_gate(struct hisi_qm *qm) +{ + u32 val; + + /* if qm enables clock gating in Kunpeng930, qos will be inaccurate. */ + if (qm->ver < QM_HW_V3) + return; + + val = readl(qm->io_base + QM_PM_CTRL); + val |= QM_IDLE_DISABLE; + writel(val, qm->io_base + QM_PM_CTRL); +} + static int qm_dev_mem_reset(struct hisi_qm *qm) { u32 val; @@ -5947,6 +5962,7 @@ int hisi_qm_init(struct hisi_qm *qm) } if (qm->fun_type == QM_HW_PF) { + qm_disable_clock_gate(qm); ret = qm_dev_mem_reset(qm); if (ret) { dev_err(dev, "failed to reset device memory\n"); @@ -6111,6 +6127,7 @@ static int qm_rebuild_for_resume(struct hisi_qm *qm) qm_cmd_init(qm); hisi_qm_dev_err_init(qm); + qm_disable_clock_gate(qm); ret = qm_dev_mem_reset(qm); if (ret) pci_err(pdev, "failed to reset device memory\n"); -- Gitee From 3753ee5611dcd2a48df0633d0cd83210d3cea803 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Tue, 19 Jul 2022 16:51:25 +0800 Subject: [PATCH 490/674] crypto: hisilicon - cleanup warning in qm_get_qos_value() mainline inclusion from mainline-v5.17-rc1 commit c5d692a2335d category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c5d692a2335d ---------------------------------------------------------------------- Building with clang static analysis returns this warning: qm.c:4382:11: warning: The left operand of '==' is a garbage value if (*val == 0 || *val > QM_QOS_MAX_VAL || ret) { ~~~~ ^ The call to qm_qos_value_init() can return an error without setting *val. So check ret before checking *val. Fixes: 72b010dc33b9 ("crypto: hisilicon/qm - supports writing QoS int the host") Signed-off-by: Tom Rix Signed-off-by: Herbert Xu Signed-off-by: Jiangshui Yang Reviewed-by: Yang Shen Acked-by: Xie XiuQi Signed-off-by: Zheng Zengkai --- drivers/crypto/hisilicon/qm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index 60ca59d11702..4e02bbf4768a 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -4407,7 +4407,7 @@ static ssize_t qm_get_qos_value(struct hisi_qm *qm, const char *buf, return -EINVAL; ret = qm_qos_value_init(val_buf, val); - if (*val == 0 || *val > QM_QOS_MAX_VAL || ret) { + if (ret || *val == 0 || *val > QM_QOS_MAX_VAL) { pci_err(qm->pdev, "input qos value is error, please set 1~1000!\n"); return -EINVAL; } -- Gitee From 00a6d7cd188ea5d2376bd3389e888c32f8c0bf2d Mon Sep 17 00:00:00 2001 From: Kai Ye Date: Tue, 19 Jul 2022 16:51:26 +0800 Subject: [PATCH 491/674] crypto: hisilicon/sec - use the correct print format mainline inclusion from mainline-v5.18-rc1 commit 498382593c7c category: cleanup bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=498382593c7c ---------------------------------------------------------------------- Use the correct print format. Printing an unsigned int value should use %u instead of %d. Signed-off-by: Kai Ye Signed-off-by: Herbert Xu Reviewed-by: Yang Shen Acked-by: Xie XiuQi Signed-off-by: Zheng Zengkai --- drivers/crypto/hisilicon/sec2/sec_crypto.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/crypto/hisilicon/sec2/sec_crypto.c b/drivers/crypto/hisilicon/sec2/sec_crypto.c index 9d0a39f60fb2..a91635c348b5 100644 --- a/drivers/crypto/hisilicon/sec2/sec_crypto.c +++ b/drivers/crypto/hisilicon/sec2/sec_crypto.c @@ -240,7 +240,7 @@ static void sec_req_cb(struct hisi_qp *qp, void *resp) if (unlikely(type != type_supported)) { atomic64_inc(&dfx->err_bd_cnt); - pr_err("err bd type [%d]\n", type); + pr_err("err bd type [%u]\n", type); return; } -- Gitee From c9f4c20d3e3504f8f7f8c8ee38e81303d829e64f Mon Sep 17 00:00:00 2001 From: Kai Ye Date: Tue, 19 Jul 2022 16:51:27 +0800 Subject: [PATCH 492/674] crypto: hisilicon/qm - cleanup warning in qm_vf_read_qos mainline inclusion from mainline-v5.18-rc1 commit 05b3bade290d category: clean up bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=05b3bade290d ---------------------------------------------------------------------- The kernel test rebot report this warning: Uninitialized variable: ret. The code flow may return value of ret directly. This value is an uninitialized variable, here is fix it. Signed-off-by: Kai Ye Signed-off-by: Herbert Xu Reviewed-by: Yang Shen Acked-by: Xie XiuQi Signed-off-by: Zheng Zengkai --- drivers/crypto/hisilicon/qm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index 4e02bbf4768a..bf103c8743d3 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -4308,7 +4308,7 @@ static void qm_vf_get_qos(struct hisi_qm *qm, u32 fun_num) static int qm_vf_read_qos(struct hisi_qm *qm) { int cnt = 0; - int ret; + int ret = -EINVAL; /* reset mailbox qos val */ qm->mb_qos = 0; -- Gitee From 2fef6b05e7b41437e9adbe48d957791c7f2b8664 Mon Sep 17 00:00:00 2001 From: Kai Ye Date: Tue, 19 Jul 2022 16:51:28 +0800 Subject: [PATCH 493/674] crypto: hisilicon/sec - add the register configuration for HW V3 mainline inclusion from mainline-v5.18-rc1 commit aec01cc8d119 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=aec01cc8d119 ---------------------------------------------------------------------- Added the register configuration of the SVA mode for HW V3. Signed-off-by: Kai Ye Signed-off-by: Herbert Xu Reviewed-by: Yang Shen Acked-by: Xie XiuQi Signed-off-by: Zheng Zengkai --- drivers/crypto/hisilicon/sec2/sec_main.c | 51 +++++++++++++++++++----- 1 file changed, 40 insertions(+), 11 deletions(-) diff --git a/drivers/crypto/hisilicon/sec2/sec_main.c b/drivers/crypto/hisilicon/sec2/sec_main.c index 26d3ab1d308b..45d2b27da9ad 100644 --- a/drivers/crypto/hisilicon/sec2/sec_main.c +++ b/drivers/crypto/hisilicon/sec2/sec_main.c @@ -90,6 +90,10 @@ SEC_USER1_WB_DATA_SSV) #define SEC_USER1_SMMU_SVA (SEC_USER1_SMMU_NORMAL | SEC_USER1_SVA_SET) #define SEC_USER1_SMMU_MASK (~SEC_USER1_SVA_SET) +#define SEC_INTERFACE_USER_CTRL0_REG_V3 0x302220 +#define SEC_INTERFACE_USER_CTRL1_REG_V3 0x302224 +#define SEC_USER1_SMMU_NORMAL_V3 (BIT(23) | BIT(17) | BIT(11) | BIT(5)) +#define SEC_USER1_SMMU_MASK_V3 0xFF79E79E #define SEC_CORE_INT_STATUS_M_ECC BIT(2) #define SEC_PREFETCH_CFG 0x301130 @@ -335,6 +339,41 @@ static void sec_set_endian(struct hisi_qm *qm) writel_relaxed(reg, qm->io_base + SEC_CONTROL_REG); } +static void sec_engine_sva_config(struct hisi_qm *qm) +{ + u32 reg; + + if (qm->ver > QM_HW_V2) { + reg = readl_relaxed(qm->io_base + + SEC_INTERFACE_USER_CTRL0_REG_V3); + reg |= SEC_USER0_SMMU_NORMAL; + writel_relaxed(reg, qm->io_base + + SEC_INTERFACE_USER_CTRL0_REG_V3); + + reg = readl_relaxed(qm->io_base + + SEC_INTERFACE_USER_CTRL1_REG_V3); + reg &= SEC_USER1_SMMU_MASK_V3; + reg |= SEC_USER1_SMMU_NORMAL_V3; + writel_relaxed(reg, qm->io_base + + SEC_INTERFACE_USER_CTRL1_REG_V3); + } else { + reg = readl_relaxed(qm->io_base + + SEC_INTERFACE_USER_CTRL0_REG); + reg |= SEC_USER0_SMMU_NORMAL; + writel_relaxed(reg, qm->io_base + + SEC_INTERFACE_USER_CTRL0_REG); + reg = readl_relaxed(qm->io_base + + SEC_INTERFACE_USER_CTRL1_REG); + reg &= SEC_USER1_SMMU_MASK; + if (qm->use_sva) + reg |= SEC_USER1_SMMU_SVA; + else + reg |= SEC_USER1_SMMU_NORMAL; + writel_relaxed(reg, qm->io_base + + SEC_INTERFACE_USER_CTRL1_REG); + } +} + static void sec_open_sva_prefetch(struct hisi_qm *qm) { u32 val; @@ -426,17 +465,7 @@ static int sec_engine_init(struct hisi_qm *qm) reg |= (0x1 << SEC_TRNG_EN_SHIFT); writel_relaxed(reg, qm->io_base + SEC_CONTROL_REG); - reg = readl_relaxed(qm->io_base + SEC_INTERFACE_USER_CTRL0_REG); - reg |= SEC_USER0_SMMU_NORMAL; - writel_relaxed(reg, qm->io_base + SEC_INTERFACE_USER_CTRL0_REG); - - reg = readl_relaxed(qm->io_base + SEC_INTERFACE_USER_CTRL1_REG); - reg &= SEC_USER1_SMMU_MASK; - if (qm->use_sva && qm->ver == QM_HW_V2) - reg |= SEC_USER1_SMMU_SVA; - else - reg |= SEC_USER1_SMMU_NORMAL; - writel_relaxed(reg, qm->io_base + SEC_INTERFACE_USER_CTRL1_REG); + sec_engine_sva_config(qm); writel(SEC_SINGLE_PORT_MAX_TRANS, qm->io_base + AM_CFG_SINGLE_PORT_MAX_TRANS); -- Gitee From abe783433177f68cee95ccf27bb1caf043049740 Mon Sep 17 00:00:00 2001 From: Kai Ye Date: Tue, 19 Jul 2022 16:51:29 +0800 Subject: [PATCH 494/674] crypto: hisilicon/sec - not need to enable sm4 extra mode at HW V3 mainline inclusion from mainline-v5.18-rc1 commit f8a265282644 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f8a265282644 ---------------------------------------------------------------------- It is not need to enable sm4 extra mode in at HW V3. Here is fix it. Signed-off-by: Kai Ye Signed-off-by: Herbert Xu Reviewed-by: Yang Shen Acked-by: Xie XiuQi Signed-off-by: Zheng Zengkai --- drivers/crypto/hisilicon/sec2/sec_main.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/crypto/hisilicon/sec2/sec_main.c b/drivers/crypto/hisilicon/sec2/sec_main.c index 45d2b27da9ad..0b9906ff69e3 100644 --- a/drivers/crypto/hisilicon/sec2/sec_main.c +++ b/drivers/crypto/hisilicon/sec2/sec_main.c @@ -472,9 +472,11 @@ static int sec_engine_init(struct hisi_qm *qm) writel(SEC_SAA_ENABLE, qm->io_base + SEC_SAA_EN_REG); - /* Enable sm4 extra mode, as ctr/ecb */ - writel_relaxed(SEC_BD_ERR_CHK_EN0, - qm->io_base + SEC_BD_ERR_CHK_EN_REG0); + /* HW V2 enable sm4 extra mode, as ctr/ecb */ + if (qm->ver < QM_HW_V3) + writel_relaxed(SEC_BD_ERR_CHK_EN0, + qm->io_base + SEC_BD_ERR_CHK_EN_REG0); + /* Enable sm4 xts mode multiple iv */ writel_relaxed(SEC_BD_ERR_CHK_EN1, qm->io_base + SEC_BD_ERR_CHK_EN_REG1); -- Gitee From 6af5a7bb0f0743e7142bd81e610639570283aa2c Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Tue, 19 Jul 2022 16:51:30 +0800 Subject: [PATCH 495/674] crypto: hisilicon/qm: Move the QM header to include/linux mainline inclusion from mainline-v5.18-rc1 commit ff5812e00d5e category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ff5812e00d5e ---------------------------------------------------------------------- Since we are going to introduce VFIO PCI HiSilicon ACC driver for live migration in subsequent patches, move the ACC QM header file to a common include dir. Acked-by: Zhou Wang Acked-by: Longfang Liu Acked-by: Kai Ye Signed-off-by: Shameer Kolothum Link: https://lore.kernel.org/r/20220308184902.2242-2-shameerali.kolothum.thodi@huawei.com Signed-off-by: Alex Williamson Reviewed-by: Yang Shen Acked-by: Xie XiuQi Signed-off-by: Zheng Zengkai --- drivers/crypto/hisilicon/hpre/hpre.h | 2 +- drivers/crypto/hisilicon/qm.c | 2 +- drivers/crypto/hisilicon/sec2/sec.h | 2 +- drivers/crypto/hisilicon/sgl.c | 2 +- drivers/crypto/hisilicon/zip/zip.h | 2 +- drivers/crypto/hisilicon/qm.h => include/linux/hisi_acc_qm.h | 0 6 files changed, 5 insertions(+), 5 deletions(-) rename drivers/crypto/hisilicon/qm.h => include/linux/hisi_acc_qm.h (100%) diff --git a/drivers/crypto/hisilicon/hpre/hpre.h b/drivers/crypto/hisilicon/hpre/hpre.h index e0b4a1982ee9..9a0558ed82f9 100644 --- a/drivers/crypto/hisilicon/hpre/hpre.h +++ b/drivers/crypto/hisilicon/hpre/hpre.h @@ -4,7 +4,7 @@ #define __HISI_HPRE_H #include -#include "../qm.h" +#include #define HPRE_SQE_SIZE sizeof(struct hpre_sqe) #define HPRE_PF_DEF_Q_NUM 64 diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index bf103c8743d3..f7ddf5659c43 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -15,7 +15,7 @@ #include #include #include -#include "qm.h" +#include /* eq/aeq irq enable */ #define QM_VF_AEQ_INT_SOURCE 0x0 diff --git a/drivers/crypto/hisilicon/sec2/sec.h b/drivers/crypto/hisilicon/sec2/sec.h index d97cf02b1df7..c2e9b01187a7 100644 --- a/drivers/crypto/hisilicon/sec2/sec.h +++ b/drivers/crypto/hisilicon/sec2/sec.h @@ -4,7 +4,7 @@ #ifndef __HISI_SEC_V2_H #define __HISI_SEC_V2_H -#include "../qm.h" +#include #include "sec_crypto.h" /* Algorithm resource per hardware SEC queue */ diff --git a/drivers/crypto/hisilicon/sgl.c b/drivers/crypto/hisilicon/sgl.c index 057273769f26..f7efc02b065f 100644 --- a/drivers/crypto/hisilicon/sgl.c +++ b/drivers/crypto/hisilicon/sgl.c @@ -1,9 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2019 HiSilicon Limited. */ #include +#include #include #include -#include "qm.h" #define HISI_ACC_SGL_SGE_NR_MIN 1 #define HISI_ACC_SGL_NR_MAX 256 diff --git a/drivers/crypto/hisilicon/zip/zip.h b/drivers/crypto/hisilicon/zip/zip.h index 517fdbdff3ea..3dfd3bac5a33 100644 --- a/drivers/crypto/hisilicon/zip/zip.h +++ b/drivers/crypto/hisilicon/zip/zip.h @@ -7,7 +7,7 @@ #define pr_fmt(fmt) "hisi_zip: " fmt #include -#include "../qm.h" +#include enum hisi_zip_error_type { /* negative compression */ diff --git a/drivers/crypto/hisilicon/qm.h b/include/linux/hisi_acc_qm.h similarity index 100% rename from drivers/crypto/hisilicon/qm.h rename to include/linux/hisi_acc_qm.h -- Gitee From e073afaff8c17c31cebf51a1e9d1a0cf253e1ce2 Mon Sep 17 00:00:00 2001 From: Kai Ye Date: Tue, 19 Jul 2022 16:51:31 +0800 Subject: [PATCH 496/674] crypto: hisilicon/qm - support the userspace task resetting mainline inclusion from mainline-v5.16-rc1 commit 8bb765271aded24ca724a39701c6e686234c7020 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=8bb765271aded24ca724a39701c6e686234c7020 ---------------------------------------------------------------------- Allocate an extra memory page for qp in the qp memory initialization. Set a qp error flag in the extra page addr when device resetting. This error flag can be seen in the userspace. This flag can helps users to stop tasks when device resetting. After resetting, this error flag will be reset when this qp is created again. So app should release the old qp and request a new one, and do the task on the new queue again. Signed-off-by: Kai Ye Signed-off-by: Herbert Xu Signed-off-by: Jiangshui Yang Reviewed-by: Yang Shen Acked-by: Xie XiuQi Signed-off-by: Zheng Zengkai --- drivers/crypto/hisilicon/migration/acc_vf_migration.h | 2 +- drivers/crypto/hisilicon/qm.c | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/crypto/hisilicon/migration/acc_vf_migration.h b/drivers/crypto/hisilicon/migration/acc_vf_migration.h index 28aa7e318ca9..2f459eecb8f0 100644 --- a/drivers/crypto/hisilicon/migration/acc_vf_migration.h +++ b/drivers/crypto/hisilicon/migration/acc_vf_migration.h @@ -8,7 +8,7 @@ #include #include -#include "../qm.h" +#include #define VFIO_PCI_OFFSET_SHIFT 40 #define VFIO_PCI_OFFSET_TO_INDEX(off) ((off) >> VFIO_PCI_OFFSET_SHIFT) diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index f7ddf5659c43..7dadf476335f 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -5492,11 +5492,14 @@ static void qm_pf_reset_vf_prepare(struct hisi_qm *qm, atomic_set(&qm->status.flags, QM_STOP); cmd = QM_VF_PREPARE_FAIL; goto err_prepare; + } else { + goto out; } err_prepare: hisi_qm_set_hw_reset(qm, QM_RESET_STOP_TX_OFFSET); hisi_qm_set_hw_reset(qm, QM_RESET_STOP_RX_OFFSET); +out: pci_save_state(pdev); ret = qm->ops->ping_pf(qm, cmd); if (ret) -- Gitee From 4a1bf8c8e36ad8a44f865f8f9ddb889a6f0d1040 Mon Sep 17 00:00:00 2001 From: Longfang Liu Date: Tue, 19 Jul 2022 16:51:32 +0800 Subject: [PATCH 497/674] crypto: hisilicon/qm: Move few definitions to common header mainline inclusion from mainline-v5.18-rc1 commit b4b084d71332 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b4b084d71332 ---------------------------------------------------------------------- Move Doorbell and Mailbox definitions to common header file. Also export QM mailbox functions. This will be useful when we introduce VFIO PCI HiSilicon ACC live migration driver. Signed-off-by: Longfang Liu Acked-by: Zhou Wang Signed-off-by: Shameer Kolothum Link: https://lore.kernel.org/r/20220308184902.2242-3-shameerali.kolothum.thodi@huawei.com Signed-off-by: Alex Williamson Signed-off-by: Jiangshui Yang Reviewed-by: Yang Shen Acked-by: Xie XiuQi Signed-off-by: Zheng Zengkai --- drivers/crypto/hisilicon/qm.c | 58 +++++++++++------------------------ include/linux/hisi_acc_qm.h | 38 +++++++++++++++++++++++ 2 files changed, 56 insertions(+), 40 deletions(-) diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index 7dadf476335f..abeaef09b91c 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -33,23 +33,6 @@ #define QM_ABNORMAL_EVENT_IRQ_VECTOR 3 /* mailbox */ -#define QM_MB_CMD_SQC 0x0 -#define QM_MB_CMD_CQC 0x1 -#define QM_MB_CMD_EQC 0x2 -#define QM_MB_CMD_AEQC 0x3 -#define QM_MB_CMD_SQC_BT 0x4 -#define QM_MB_CMD_CQC_BT 0x5 -#define QM_MB_CMD_SQC_VFT_V2 0x6 -#define QM_MB_CMD_STOP_QP 0x8 -#define QM_MB_CMD_SRC 0xc -#define QM_MB_CMD_DST 0xd - -#define QM_MB_CMD_SEND_BASE 0x300 -#define QM_MB_EVENT_SHIFT 8 -#define QM_MB_BUSY_SHIFT 13 -#define QM_MB_OP_SHIFT 14 -#define QM_MB_CMD_DATA_ADDR_L 0x304 -#define QM_MB_CMD_DATA_ADDR_H 0x308 #define QM_MB_PING_ALL_VFS 0xffff #define QM_MB_CMD_DATA_SHIFT 32 #define QM_MB_CMD_DATA_MASK GENMASK(31, 0) @@ -103,19 +86,12 @@ #define QM_DB_CMD_SHIFT_V1 16 #define QM_DB_INDEX_SHIFT_V1 32 #define QM_DB_PRIORITY_SHIFT_V1 48 -#define QM_DOORBELL_SQ_CQ_BASE_V2 0x1000 -#define QM_DOORBELL_EQ_AEQ_BASE_V2 0x2000 #define QM_QUE_ISO_CFG_V 0x0030 #define QM_PAGE_SIZE 0x0034 #define QM_QUE_ISO_EN 0x100154 #define QM_CAPBILITY 0x100158 #define QM_QP_NUN_MASK GENMASK(10, 0) #define QM_QP_DB_INTERVAL 0x10000 -#define QM_QP_MAX_NUM_SHIFT 11 -#define QM_DB_CMD_SHIFT_V2 12 -#define QM_DB_RAND_SHIFT_V2 16 -#define QM_DB_INDEX_SHIFT_V2 32 -#define QM_DB_PRIORITY_SHIFT_V2 48 #define QM_MEM_START_INIT 0x100040 #define QM_MEM_INIT_DONE 0x100044 @@ -693,7 +669,7 @@ static void qm_mb_pre_init(struct qm_mailbox *mailbox, u8 cmd, } /* return 0 mailbox ready, -ETIMEDOUT hardware timeout */ -static int qm_wait_mb_ready(struct hisi_qm *qm) +int hisi_qm_wait_mb_ready(struct hisi_qm *qm) { u32 val; @@ -701,6 +677,7 @@ static int qm_wait_mb_ready(struct hisi_qm *qm) val, !((val >> QM_MB_BUSY_SHIFT) & 0x1), POLL_PERIOD, POLL_TIMEOUT); } +EXPORT_SYMBOL_GPL(hisi_qm_wait_mb_ready); /* 128 bit should be written to hardware at one time to trigger a mailbox */ static void qm_mb_write(struct hisi_qm *qm, const void *src) @@ -726,14 +703,14 @@ static void qm_mb_write(struct hisi_qm *qm, const void *src) static int qm_mb_nolock(struct hisi_qm *qm, struct qm_mailbox *mailbox) { - if (unlikely(qm_wait_mb_ready(qm))) { + if (unlikely(hisi_qm_wait_mb_ready(qm))) { dev_err(&qm->pdev->dev, "QM mailbox is busy to start!\n"); goto mb_busy; } qm_mb_write(qm, mailbox); - if (unlikely(qm_wait_mb_ready(qm))) { + if (unlikely(hisi_qm_wait_mb_ready(qm))) { dev_err(&qm->pdev->dev, "QM mailbox operation timeout!\n"); goto mb_busy; } @@ -745,8 +722,8 @@ static int qm_mb_nolock(struct hisi_qm *qm, struct qm_mailbox *mailbox) return -EBUSY; } -static int qm_mb(struct hisi_qm *qm, u8 cmd, dma_addr_t dma_addr, u16 queue, - bool op) +int hisi_qm_mb(struct hisi_qm *qm, u8 cmd, dma_addr_t dma_addr, u16 queue, + bool op) { struct qm_mailbox mailbox; int ret; @@ -762,6 +739,7 @@ static int qm_mb(struct hisi_qm *qm, u8 cmd, dma_addr_t dma_addr, u16 queue, return ret; } +EXPORT_SYMBOL_GPL(hisi_qm_mb); static void qm_db_v1(struct hisi_qm *qm, u16 qn, u8 cmd, u16 index, u8 priority) { @@ -1351,7 +1329,7 @@ static int qm_get_vft_v2(struct hisi_qm *qm, u32 *base, u32 *number) u64 sqc_vft; int ret; - ret = qm_mb(qm, QM_MB_CMD_SQC_VFT_V2, 0, 0, 1); + ret = hisi_qm_mb(qm, QM_MB_CMD_SQC_VFT_V2, 0, 0, 1); if (ret) return ret; @@ -1725,12 +1703,12 @@ static int dump_show(struct hisi_qm *qm, void *info, static int qm_dump_sqc_raw(struct hisi_qm *qm, dma_addr_t dma_addr, u16 qp_id) { - return qm_mb(qm, QM_MB_CMD_SQC, dma_addr, qp_id, 1); + return hisi_qm_mb(qm, QM_MB_CMD_SQC, dma_addr, qp_id, 1); } static int qm_dump_cqc_raw(struct hisi_qm *qm, dma_addr_t dma_addr, u16 qp_id) { - return qm_mb(qm, QM_MB_CMD_CQC, dma_addr, qp_id, 1); + return hisi_qm_mb(qm, QM_MB_CMD_CQC, dma_addr, qp_id, 1); } static int qm_sqc_dump(struct hisi_qm *qm, const char *s) @@ -1842,7 +1820,7 @@ static int qm_eqc_aeqc_dump(struct hisi_qm *qm, char *s, size_t size, if (IS_ERR(xeqc)) return PTR_ERR(xeqc); - ret = qm_mb(qm, cmd, xeqc_dma, 0, 1); + ret = hisi_qm_mb(qm, cmd, xeqc_dma, 0, 1); if (ret) goto err_free_ctx; @@ -2495,7 +2473,7 @@ static int qm_ping_pf(struct hisi_qm *qm, u64 cmd) static int qm_stop_qp(struct hisi_qp *qp) { - return qm_mb(qp->qm, QM_MB_CMD_STOP_QP, 0, qp->qp_id, 0); + return hisi_qm_mb(qp->qm, QM_MB_CMD_STOP_QP, 0, qp->qp_id, 0); } static int qm_set_msi(struct hisi_qm *qm, bool set) @@ -2763,7 +2741,7 @@ static int qm_sq_ctx_cfg(struct hisi_qp *qp, int qp_id, u32 pasid) return -ENOMEM; } - ret = qm_mb(qm, QM_MB_CMD_SQC, sqc_dma, qp_id, 0); + ret = hisi_qm_mb(qm, QM_MB_CMD_SQC, sqc_dma, qp_id, 0); dma_unmap_single(dev, sqc_dma, sizeof(struct qm_sqc), DMA_TO_DEVICE); kfree(sqc); @@ -2804,7 +2782,7 @@ static int qm_cq_ctx_cfg(struct hisi_qp *qp, int qp_id, u32 pasid) return -ENOMEM; } - ret = qm_mb(qm, QM_MB_CMD_CQC, cqc_dma, qp_id, 0); + ret = hisi_qm_mb(qm, QM_MB_CMD_CQC, cqc_dma, qp_id, 0); dma_unmap_single(dev, cqc_dma, sizeof(struct qm_cqc), DMA_TO_DEVICE); kfree(cqc); @@ -3664,7 +3642,7 @@ static int qm_eq_ctx_cfg(struct hisi_qm *qm) return -ENOMEM; } - ret = qm_mb(qm, QM_MB_CMD_EQC, eqc_dma, 0, 0); + ret = hisi_qm_mb(qm, QM_MB_CMD_EQC, eqc_dma, 0, 0); dma_unmap_single(dev, eqc_dma, sizeof(struct qm_eqc), DMA_TO_DEVICE); kfree(eqc); @@ -3693,7 +3671,7 @@ static int qm_aeq_ctx_cfg(struct hisi_qm *qm) return -ENOMEM; } - ret = qm_mb(qm, QM_MB_CMD_AEQC, aeqc_dma, 0, 0); + ret = hisi_qm_mb(qm, QM_MB_CMD_AEQC, aeqc_dma, 0, 0); dma_unmap_single(dev, aeqc_dma, sizeof(struct qm_aeqc), DMA_TO_DEVICE); kfree(aeqc); @@ -3732,11 +3710,11 @@ static int __hisi_qm_start(struct hisi_qm *qm) if (ret) return ret; - ret = qm_mb(qm, QM_MB_CMD_SQC_BT, qm->sqc_dma, 0, 0); + ret = hisi_qm_mb(qm, QM_MB_CMD_SQC_BT, qm->sqc_dma, 0, 0); if (ret) return ret; - ret = qm_mb(qm, QM_MB_CMD_CQC_BT, qm->cqc_dma, 0, 0); + ret = hisi_qm_mb(qm, QM_MB_CMD_CQC_BT, qm->cqc_dma, 0, 0); if (ret) return ret; diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index 718687dd7242..0ac0fdb6a6bd 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -34,6 +34,40 @@ #define QM_WUSER_M_CFG_ENABLE 0x1000a8 #define WUSER_M_CFG_ENABLE 0xffffffff +/* mailbox */ +#define QM_MB_CMD_SQC 0x0 +#define QM_MB_CMD_CQC 0x1 +#define QM_MB_CMD_EQC 0x2 +#define QM_MB_CMD_AEQC 0x3 +#define QM_MB_CMD_SQC_BT 0x4 +#define QM_MB_CMD_CQC_BT 0x5 +#define QM_MB_CMD_SQC_VFT_V2 0x6 +#define QM_MB_CMD_STOP_QP 0x8 +#define QM_MB_CMD_SRC 0xc +#define QM_MB_CMD_DST 0xd + +#define QM_MB_CMD_SEND_BASE 0x300 +#define QM_MB_EVENT_SHIFT 8 +#define QM_MB_BUSY_SHIFT 13 +#define QM_MB_OP_SHIFT 14 +#define QM_MB_CMD_DATA_ADDR_L 0x304 +#define QM_MB_CMD_DATA_ADDR_H 0x308 +#define QM_MB_MAX_WAIT_CNT 6000 + +/* doorbell */ +#define QM_DOORBELL_CMD_SQ 0 +#define QM_DOORBELL_CMD_CQ 1 +#define QM_DOORBELL_CMD_EQ 2 +#define QM_DOORBELL_CMD_AEQ 3 + +#define QM_DOORBELL_SQ_CQ_BASE_V2 0x1000 +#define QM_DOORBELL_EQ_AEQ_BASE_V2 0x2000 +#define QM_QP_MAX_NUM_SHIFT 11 +#define QM_DB_CMD_SHIFT_V2 12 +#define QM_DB_RAND_SHIFT_V2 16 +#define QM_DB_INDEX_SHIFT_V2 32 +#define QM_DB_PRIORITY_SHIFT_V2 48 + /* qm cache */ #define QM_CACHE_CTL 0x100050 #define SQC_CACHE_ENABLE BIT(0) @@ -421,6 +455,10 @@ pci_ers_result_t hisi_qm_dev_slot_reset(struct pci_dev *pdev); void hisi_qm_reset_prepare(struct pci_dev *pdev); void hisi_qm_reset_done(struct pci_dev *pdev); +int hisi_qm_wait_mb_ready(struct hisi_qm *qm); +int hisi_qm_mb(struct hisi_qm *qm, u8 cmd, dma_addr_t dma_addr, u16 queue, + bool op); + struct hisi_acc_sgl_pool; struct hisi_acc_hw_sgl *hisi_acc_sg_buf_map_to_hw_sgl(struct device *dev, struct scatterlist *sgl, struct hisi_acc_sgl_pool *pool, -- Gitee From 8fd5a1b7df96e0dec7c9f613cce0c706856c986a Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Tue, 19 Jul 2022 16:51:33 +0800 Subject: [PATCH 498/674] hisi_acc_qm: Move VF PCI device IDs to common header mainline inclusion from mainline-v5.18-rc1 commit fae74feacd2d category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=fae74feacd2d ---------------------------------------------------------------------- Move the PCI Device IDs of HiSilicon ACC VF devices to a common header and also use a uniform naming convention. This will be useful when we introduce the vfio PCI HiSilicon ACC live migration driver in subsequent patches. Cc: Bjorn Helgaas Acked-by: Zhou Wang Acked-by: Longfang Liu Acked-by: Kai Ye Signed-off-by: Shameer Kolothum Acked-by: Bjorn Helgaas # pci_ids.h Link: https://lore.kernel.org/r/20220308184902.2242-4-shameerali.kolothum.thodi@huawei.com Signed-off-by: Alex Williamson Signed-off-by: Jiangshui Yang Reviewed-by: Yang Shen Acked-by: Xie XiuQi Signed-off-by: Zheng Zengkai --- drivers/crypto/hisilicon/hpre/hpre_main.c | 13 ++++++------- drivers/crypto/hisilicon/sec2/sec_main.c | 15 +++++++-------- drivers/crypto/hisilicon/zip/zip_main.c | 11 +++++------ include/linux/pci_ids.h | 3 +++ 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/drivers/crypto/hisilicon/hpre/hpre_main.c b/drivers/crypto/hisilicon/hpre/hpre_main.c index ebfab3e14499..3589d8879b5e 100644 --- a/drivers/crypto/hisilicon/hpre/hpre_main.c +++ b/drivers/crypto/hisilicon/hpre/hpre_main.c @@ -68,8 +68,7 @@ #define HPRE_REG_RD_INTVRL_US 10 #define HPRE_REG_RD_TMOUT_US 1000 #define HPRE_DBGFS_VAL_MAX_LEN 20 -#define HPRE_PCI_DEVICE_ID 0xa258 -#define HPRE_PCI_VF_DEVICE_ID 0xa259 +#define PCI_DEVICE_ID_HUAWEI_HPRE_PF 0xa258 #define HPRE_QM_USR_CFG_MASK GENMASK(31, 1) #define HPRE_QM_AXI_CFG_MASK GENMASK(15, 0) #define HPRE_QM_VFG_AX_MASK GENMASK(7, 0) @@ -111,8 +110,8 @@ static const char hpre_name[] = "hisi_hpre"; static struct dentry *hpre_debugfs_root; static const struct pci_device_id hpre_dev_ids[] = { - { PCI_DEVICE(PCI_VENDOR_ID_HUAWEI, HPRE_PCI_DEVICE_ID) }, - { PCI_DEVICE(PCI_VENDOR_ID_HUAWEI, HPRE_PCI_VF_DEVICE_ID) }, + { PCI_DEVICE(PCI_VENDOR_ID_HUAWEI, PCI_DEVICE_ID_HUAWEI_HPRE_PF) }, + { PCI_DEVICE(PCI_VENDOR_ID_HUAWEI, PCI_DEVICE_ID_HUAWEI_HPRE_VF) }, { 0, } }; @@ -242,7 +241,7 @@ MODULE_PARM_DESC(uacce_mode, UACCE_MODE_DESC); static int pf_q_num_set(const char *val, const struct kernel_param *kp) { - return q_num_set(val, kp, HPRE_PCI_DEVICE_ID); + return q_num_set(val, kp, PCI_DEVICE_ID_HUAWEI_HPRE_PF); } static const struct kernel_param_ops hpre_pf_q_num_ops = { @@ -921,7 +920,7 @@ static int hpre_debugfs_init(struct hisi_qm *qm) qm->debug.sqe_mask_len = HPRE_SQE_MASK_LEN; hisi_qm_debug_init(qm); - if (qm->pdev->device == HPRE_PCI_DEVICE_ID) { + if (qm->pdev->device == PCI_DEVICE_ID_HUAWEI_HPRE_PF) { ret = hpre_ctrl_debug_init(qm); if (ret) goto failed_to_create; @@ -958,7 +957,7 @@ static int hpre_qm_init(struct hisi_qm *qm, struct pci_dev *pdev) qm->sqe_size = HPRE_SQE_SIZE; qm->dev_name = hpre_name; - qm->fun_type = (pdev->device == HPRE_PCI_DEVICE_ID) ? + qm->fun_type = (pdev->device == PCI_DEVICE_ID_HUAWEI_HPRE_PF) ? QM_HW_PF : QM_HW_VF; if (qm->fun_type == QM_HW_PF) { qm->qp_base = HPRE_PF_DEF_Q_BASE; diff --git a/drivers/crypto/hisilicon/sec2/sec_main.c b/drivers/crypto/hisilicon/sec2/sec_main.c index 0b9906ff69e3..85f55d8f04cf 100644 --- a/drivers/crypto/hisilicon/sec2/sec_main.c +++ b/drivers/crypto/hisilicon/sec2/sec_main.c @@ -20,8 +20,7 @@ #define SEC_VF_NUM 63 #define SEC_QUEUE_NUM_V1 4096 -#define SEC_PF_PCI_DEVICE_ID 0xa255 -#define SEC_VF_PCI_DEVICE_ID 0xa256 +#define PCI_DEVICE_ID_HUAWEI_SEC_PF 0xa255 #define SEC_BD_ERR_CHK_EN0 0xEFFFFFFF #define SEC_BD_ERR_CHK_EN1 0x7ffff7fd @@ -229,7 +228,7 @@ static const struct debugfs_reg32 sec_dfx_regs[] = { static int sec_pf_q_num_set(const char *val, const struct kernel_param *kp) { - return q_num_set(val, kp, SEC_PF_PCI_DEVICE_ID); + return q_num_set(val, kp, PCI_DEVICE_ID_HUAWEI_SEC_PF); } static const struct kernel_param_ops sec_pf_q_num_ops = { @@ -317,8 +316,8 @@ module_param_cb(uacce_mode, &sec_uacce_mode_ops, &uacce_mode, 0444); MODULE_PARM_DESC(uacce_mode, UACCE_MODE_DESC); static const struct pci_device_id sec_dev_ids[] = { - { PCI_DEVICE(PCI_VENDOR_ID_HUAWEI, SEC_PF_PCI_DEVICE_ID) }, - { PCI_DEVICE(PCI_VENDOR_ID_HUAWEI, SEC_VF_PCI_DEVICE_ID) }, + { PCI_DEVICE(PCI_VENDOR_ID_HUAWEI, PCI_DEVICE_ID_HUAWEI_SEC_PF) }, + { PCI_DEVICE(PCI_VENDOR_ID_HUAWEI, PCI_DEVICE_ID_HUAWEI_SEC_VF) }, { 0, } }; MODULE_DEVICE_TABLE(pci, sec_dev_ids); @@ -748,7 +747,7 @@ static int sec_core_debug_init(struct hisi_qm *qm) regset->base = qm->io_base; regset->dev = dev; - if (qm->pdev->device == SEC_PF_PCI_DEVICE_ID) + if (qm->pdev->device == PCI_DEVICE_ID_HUAWEI_SEC_PF) debugfs_create_file("regs", 0444, tmp_d, regset, &sec_regs_fops); for (i = 0; i < ARRAY_SIZE(sec_dfx_labels); i++) { @@ -766,7 +765,7 @@ static int sec_debug_init(struct hisi_qm *qm) struct sec_dev *sec = container_of(qm, struct sec_dev, qm); int i; - if (qm->pdev->device == SEC_PF_PCI_DEVICE_ID) { + if (qm->pdev->device == PCI_DEVICE_ID_HUAWEI_SEC_PF) { for (i = SEC_CLEAR_ENABLE; i < SEC_DEBUG_FILE_NUM; i++) { spin_lock_init(&sec->debug.files[i].lock); sec->debug.files[i].index = i; @@ -908,7 +907,7 @@ static int sec_qm_init(struct hisi_qm *qm, struct pci_dev *pdev) qm->sqe_size = SEC_SQE_SIZE; qm->dev_name = sec_name; - qm->fun_type = (pdev->device == SEC_PF_PCI_DEVICE_ID) ? + qm->fun_type = (pdev->device == PCI_DEVICE_ID_HUAWEI_SEC_PF) ? QM_HW_PF : QM_HW_VF; if (qm->fun_type == QM_HW_PF) { qm->qp_base = SEC_PF_DEF_Q_BASE; diff --git a/drivers/crypto/hisilicon/zip/zip_main.c b/drivers/crypto/hisilicon/zip/zip_main.c index 678f8b58ec42..66decfe07282 100644 --- a/drivers/crypto/hisilicon/zip/zip_main.c +++ b/drivers/crypto/hisilicon/zip/zip_main.c @@ -15,8 +15,7 @@ #include #include "zip.h" -#define PCI_DEVICE_ID_ZIP_PF 0xa250 -#define PCI_DEVICE_ID_ZIP_VF 0xa251 +#define PCI_DEVICE_ID_HUAWEI_ZIP_PF 0xa250 #define HZIP_QUEUE_NUM_V1 4096 @@ -246,7 +245,7 @@ MODULE_PARM_DESC(uacce_mode, UACCE_MODE_DESC); static int pf_q_num_set(const char *val, const struct kernel_param *kp) { - return q_num_set(val, kp, PCI_DEVICE_ID_ZIP_PF); + return q_num_set(val, kp, PCI_DEVICE_ID_HUAWEI_ZIP_PF); } static const struct kernel_param_ops pf_q_num_ops = { @@ -268,8 +267,8 @@ module_param_cb(vfs_num, &vfs_num_ops, &vfs_num, 0444); MODULE_PARM_DESC(vfs_num, "Number of VFs to enable(1-63), 0(default)"); static const struct pci_device_id hisi_zip_dev_ids[] = { - { PCI_DEVICE(PCI_VENDOR_ID_HUAWEI, PCI_DEVICE_ID_ZIP_PF) }, - { PCI_DEVICE(PCI_VENDOR_ID_HUAWEI, PCI_DEVICE_ID_ZIP_VF) }, + { PCI_DEVICE(PCI_VENDOR_ID_HUAWEI, PCI_DEVICE_ID_HUAWEI_ZIP_PF) }, + { PCI_DEVICE(PCI_VENDOR_ID_HUAWEI, PCI_DEVICE_ID_HUAWEI_ZIP_VF) }, { 0, } }; MODULE_DEVICE_TABLE(pci, hisi_zip_dev_ids); @@ -838,7 +837,7 @@ static int hisi_zip_qm_init(struct hisi_qm *qm, struct pci_dev *pdev) qm->sqe_size = HZIP_SQE_SIZE; qm->dev_name = hisi_zip_name; - qm->fun_type = (pdev->device == PCI_DEVICE_ID_ZIP_PF) ? + qm->fun_type = (pdev->device == PCI_DEVICE_ID_HUAWEI_ZIP_PF) ? QM_HW_PF : QM_HW_VF; if (qm->fun_type == QM_HW_PF) { qm->qp_base = HZIP_PF_DEF_Q_BASE; diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index a1069c76e9bc..f9792f9128fb 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2570,6 +2570,9 @@ #define PCI_DEVICE_ID_KORENIX_JETCARDF3 0x17ff #define PCI_VENDOR_ID_HUAWEI 0x19e5 +#define PCI_DEVICE_ID_HUAWEI_ZIP_VF 0xa251 +#define PCI_DEVICE_ID_HUAWEI_SEC_VF 0xa256 +#define PCI_DEVICE_ID_HUAWEI_HPRE_VF 0xa259 /* Hisilicon PCIe NP devices */ #define PCIE_DEVICE_ID_HISI_5896 0x5896 -- Gitee From b5592e08022f801ee8ff6db035a69924d7a21d1a Mon Sep 17 00:00:00 2001 From: Longfang Liu Date: Tue, 19 Jul 2022 16:51:34 +0800 Subject: [PATCH 499/674] crypto: hisilicon/qm: Set the VF QM state register mainline inclusion from mainline-v5.18-rc1 commit 1e459b25081d category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=1e459b25081d ---------------------------------------------------------------------- We use VF QM state register to record the status of the QM configuration state. This will be used in the ACC migration driver to determine whether we can safely save and restore the QM data. Signed-off-by: Longfang Liu Acked-by: Zhou Wang Signed-off-by: Shameer Kolothum Link: https://lore.kernel.org/r/20220308184902.2242-8-shameerali.kolothum.thodi@huawei.com Signed-off-by: Alex Williamson Signed-off-by: Jiangshui Yang Reviewed-by: Yang Shen Acked-by: Xie XiuQi Signed-off-by: Zheng Zengkai --- drivers/crypto/hisilicon/qm.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index abeaef09b91c..46918ea35cd6 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -3527,9 +3527,8 @@ void hisi_qm_uninit(struct hisi_qm *qm) dma_free_coherent(dev, qm->qdma.size, qm->qdma.va, qm->qdma.dma); } - up_write(&qm->qps_lock); - hisi_qm_set_state(qm, VF_NOT_READY); + up_write(&qm->qps_lock); qm_irq_unregister(qm); hisi_qm_pci_uninit(qm); @@ -3547,7 +3546,6 @@ EXPORT_SYMBOL_GPL(hisi_qm_uninit); * @number: The number of queues in vft. * * We can allocate multiple queues to a qm by configuring virtual function - * table. We get related configures by this function. Normally, we call this * function in VF driver to get the queue information. * * qm hw v1 does not support this interface. -- Gitee From 86bab9ccc46065903e44971ca0d81bc23593efed Mon Sep 17 00:00:00 2001 From: Hui Tang Date: Tue, 19 Jul 2022 16:51:35 +0800 Subject: [PATCH 500/674] crypto: hisilicon/qm - optimize the barrier operation mainline inclusion from mainline-v5.19-rc1 commit 4cda2f4a0ee6 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=4cda2f4a0ee6 ---------------------------------------------------------------------- A 'dma_wmb' barrier is enough to guarantee previous writes before accessing by acc device in the outer shareable domain. A 'smp_wmb' barrier is enough to guarantee previous writes before accessing by other cpus in the inner shareble domain. Signed-off-by: Hui Tang Signed-off-by: Herbert Xu Signed-off-by: Jiangshui Yang Reviewed-by: Yang Shen Acked-by: Xie XiuQi Signed-off-by: Zheng Zengkai --- drivers/crypto/hisilicon/qm.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index 46918ea35cd6..30e91e328859 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -687,13 +687,13 @@ static void qm_mb_write(struct hisi_qm *qm, const void *src) if (!IS_ENABLED(CONFIG_ARM64)) { memcpy_toio(fun_base, src, 16); - wmb(); + dma_wmb(); return; } asm volatile("ldp %0, %1, %3\n" "stp %0, %1, %2\n" - "dsb sy\n" + "dmb oshst\n" : "=&r" (tmp0), "=&r" (tmp1), "+Q" (*((char __iomem *)fun_base)) @@ -982,7 +982,7 @@ static void qm_set_qp_disable(struct hisi_qp *qp, int offset) *addr = 1; /* make sure setup is completed */ - mb(); + smp_wmb(); } static void qm_disable_qp(struct hisi_qm *qm, u32 qp_id) -- Gitee From 05666491669e9fe8d6bec2854efa00dcc48f01da Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 19 Jul 2022 16:51:36 +0800 Subject: [PATCH 501/674] kernel.h: drop inclusion in bitmap.h mainline inclusion from mainline-v5.13-rc1 commit 08c5188ef40ff82aed559123dc0ab2d2254b1b1c category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=1fbbcffd0ee1 ---------------------------------------------------------------------- The bitmap.h header is used in a lot of code around the kernel. Besides that it includes kernel.h which sometimes makes a loop. The problem here is many unneeded loops that make header hell dependencies. For example, how may you move bitmap_zalloc() from C-file to the header? Currently it's impossible. And bitmap.h here is only the tip of an iceberg. kerne.h is a dump of everything that even has nothing in common at all. We may still have it, but in my new code I prefer to include only the headers that I want to use, without the bulk of unneeded kernel code. Break the loop by introducing align.h, including it in kernel.h and bitmap.h followed by replacing kernel.h with limits.h. Link: https://lkml.kernel.org/r/20210326170347.37441-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Acked-by: Yury Norov Cc: Rasmus Villemoes Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Reviewed-by: Yang Shen Acked-by: Xie XiuQi Signed-off-by: Zheng Zengkai --- include/linux/align.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 include/linux/align.h diff --git a/include/linux/align.h b/include/linux/align.h new file mode 100644 index 000000000000..2b4acec7b95a --- /dev/null +++ b/include/linux/align.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_ALIGN_H +#define _LINUX_ALIGN_H + +#include + +/* @a is a power of 2 value */ +#define ALIGN(x, a) __ALIGN_KERNEL((x), (a)) +#define ALIGN_DOWN(x, a) __ALIGN_KERNEL((x) - ((a) - 1), (a)) +#define __ALIGN_MASK(x, mask) __ALIGN_KERNEL_MASK((x), (mask)) +#define PTR_ALIGN(p, a) ((typeof(p))ALIGN((unsigned long)(p), (a))) +#define PTR_ALIGN_DOWN(p, a) ((typeof(p))ALIGN_DOWN((unsigned long)(p), (a))) +#define IS_ALIGNED(x, a) (((x) & ((typeof(x))(a) - 1)) == 0) + +#endif /* _LINUX_ALIGN_H */ -- Gitee From d8df683a6f2fe0c5ce696f43fb8f31dd6c0b409d Mon Sep 17 00:00:00 2001 From: Kai Ye Date: Tue, 19 Jul 2022 16:51:37 +0800 Subject: [PATCH 502/674] crypto: hisilicon/qm - add register checking for ACC mainline inclusion from mainline-v5.19-rc1 commit f1724d397c60 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f1724d397c60 ---------------------------------------------------------------------- Add register detection function to accelerator. Provided a tool that user can checking differential register through Debugfs. e.g. cd /sys/kernel/debug/hisi_zip//zip_dfx cat diff_regs Signed-off-by: Longfang Liu Signed-off-by: Kai Ye Signed-off-by: Herbert Xu Signed-off-by: Jiangshui Yang Reviewed-by: Yang Shen Acked-by: Xie XiuQi Signed-off-by: Zheng Zengkai --- drivers/crypto/hisilicon/qm.c | 182 +++++++++++++++++++++++++++++++++- include/linux/hisi_acc_qm.h | 14 +++ 2 files changed, 195 insertions(+), 1 deletion(-) diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index 30e91e328859..45f1ded0801f 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -253,7 +253,15 @@ #define QM_QOS_MAX_CIR_U 6 #define QM_QOS_MAX_CIR_S 11 #define QM_QOS_VAL_MAX_LEN 32 - +#define QM_DFX_BASE 0x0100000 +#define QM_DFX_STATE1 0x0104000 +#define QM_DFX_STATE2 0x01040C8 +#define QM_DFX_COMMON 0x0000 +#define QM_DFX_BASE_LEN 0x5A +#define QM_DFX_STATE1_LEN 0x2E +#define QM_DFX_STATE2_LEN 0x11 +#define QM_DFX_COMMON_LEN 0xC3 +#define QM_DFX_REGS_LEN 4UL #define QM_AUTOSUSPEND_DELAY 3000 #define QM_MK_CQC_DW3_V1(hop_num, pg_sz, buf_sz, cqe_sz) \ @@ -467,6 +475,23 @@ static const struct hisi_qm_hw_error qm_hw_error[] = { { /* sentinel */ } }; +/* define the QM's dfx regs region and region length */ +static struct dfx_diff_registers qm_diff_regs[] = { + { + .reg_offset = QM_DFX_BASE, + .reg_len = QM_DFX_BASE_LEN, + }, { + .reg_offset = QM_DFX_STATE1, + .reg_len = QM_DFX_STATE1_LEN, + }, { + .reg_offset = QM_DFX_STATE2, + .reg_len = QM_DFX_STATE2_LEN, + }, { + .reg_offset = QM_DFX_COMMON, + .reg_len = QM_DFX_COMMON_LEN, + }, +}; + static const char * const qm_db_timeout[] = { "sq", "cq", "eq", "aeq", }; @@ -1625,6 +1650,156 @@ static int qm_regs_show(struct seq_file *s, void *unused) DEFINE_SHOW_ATTRIBUTE(qm_regs); +static struct dfx_diff_registers *dfx_regs_init(struct hisi_qm *qm, + const struct dfx_diff_registers *cregs, int reg_len) +{ + struct dfx_diff_registers *diff_regs; + u32 j, base_offset; + int i; + + diff_regs = kcalloc(reg_len, sizeof(*diff_regs), GFP_KERNEL); + if (!diff_regs) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < reg_len; i++) { + if (!cregs[i].reg_len) + continue; + + diff_regs[i].reg_offset = cregs[i].reg_offset; + diff_regs[i].reg_len = cregs[i].reg_len; + diff_regs[i].regs = kcalloc(QM_DFX_REGS_LEN, cregs[i].reg_len, + GFP_KERNEL); + if (!diff_regs[i].regs) + goto alloc_error; + + for (j = 0; j < diff_regs[i].reg_len; j++) { + base_offset = diff_regs[i].reg_offset + + j * QM_DFX_REGS_LEN; + diff_regs[i].regs[j] = readl(qm->io_base + base_offset); + } + } + + return diff_regs; + +alloc_error: + while (i > 0) { + i--; + kfree(diff_regs[i].regs); + } + kfree(diff_regs); + return ERR_PTR(-ENOMEM); +} + +static void dfx_regs_uninit(struct hisi_qm *qm, + struct dfx_diff_registers *dregs, int reg_len) +{ + int i; + + /* Setting the pointer is NULL to prevent double free */ + for (i = 0; i < reg_len; i++) { + kfree(dregs[i].regs); + dregs[i].regs = NULL; + } + kfree(dregs); + dregs = NULL; +} + +/** + * hisi_qm_diff_regs_init() - Allocate memory for registers. + * @qm: device qm handle. + * @dregs: diff registers handle. + * @reg_len: diff registers region length. + */ +int hisi_qm_diff_regs_init(struct hisi_qm *qm, + struct dfx_diff_registers *dregs, int reg_len) +{ + if (!qm || !dregs || reg_len <= 0) + return -EINVAL; + + if (qm->fun_type != QM_HW_PF) + return 0; + + qm->debug.qm_diff_regs = dfx_regs_init(qm, qm_diff_regs, + ARRAY_SIZE(qm_diff_regs)); + if (IS_ERR(qm->debug.qm_diff_regs)) + return PTR_ERR(qm->debug.qm_diff_regs); + + qm->debug.acc_diff_regs = dfx_regs_init(qm, dregs, reg_len); + if (IS_ERR(qm->debug.acc_diff_regs)) { + dfx_regs_uninit(qm, qm->debug.qm_diff_regs, + ARRAY_SIZE(qm_diff_regs)); + return PTR_ERR(qm->debug.acc_diff_regs); + } + + return 0; +} +EXPORT_SYMBOL_GPL(hisi_qm_diff_regs_init); + +/** + * hisi_qm_diff_regs_uninit() - Free memory for registers. + * @qm: device qm handle. + * @reg_len: diff registers region length. + */ +void hisi_qm_diff_regs_uninit(struct hisi_qm *qm, int reg_len) +{ + if (!qm || reg_len <= 0 || qm->fun_type != QM_HW_PF) + return; + + dfx_regs_uninit(qm, qm->debug.acc_diff_regs, reg_len); + dfx_regs_uninit(qm, qm->debug.qm_diff_regs, ARRAY_SIZE(qm_diff_regs)); +} +EXPORT_SYMBOL_GPL(hisi_qm_diff_regs_uninit); + +/** + * hisi_qm_acc_diff_regs_dump() - Dump registers's value. + * @qm: device qm handle. + * @s: Debugfs file handle. + * @dregs: diff registers handle. + * @regs_len: diff registers region length. + */ +void hisi_qm_acc_diff_regs_dump(struct hisi_qm *qm, struct seq_file *s, + struct dfx_diff_registers *dregs, int regs_len) +{ + u32 j, val, base_offset; + int i, ret; + + if (!qm || !s || !dregs || regs_len <= 0) + return; + + ret = hisi_qm_get_dfx_access(qm); + if (ret) + return; + + down_read(&qm->qps_lock); + for (i = 0; i < regs_len; i++) { + if (!dregs[i].reg_len) + continue; + + for (j = 0; j < dregs[i].reg_len; j++) { + base_offset = dregs[i].reg_offset + j * QM_DFX_REGS_LEN; + val = readl(qm->io_base + base_offset); + if (val != dregs[i].regs[j]) + seq_printf(s, "0x%08x = 0x%08x ---> 0x%08x\n", + base_offset, dregs[i].regs[j], val); + } + } + up_read(&qm->qps_lock); + + hisi_qm_put_dfx_access(qm); +} +EXPORT_SYMBOL_GPL(hisi_qm_acc_diff_regs_dump); + +static int qm_diff_regs_show(struct seq_file *s, void *unused) +{ + struct hisi_qm *qm = s->private; + + hisi_qm_acc_diff_regs_dump(qm, s, qm->debug.qm_diff_regs, + ARRAY_SIZE(qm_diff_regs)); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(qm_diff_regs); + static ssize_t qm_cmd_read(struct file *filp, char __user *buffer, size_t count, loff_t *pos) { @@ -4487,6 +4662,7 @@ static void hisi_qm_set_algqos_init(struct hisi_qm *qm) */ void hisi_qm_debug_init(struct hisi_qm *qm) { + struct dfx_diff_registers *qm_regs = qm->debug.qm_diff_regs; struct qm_dfx *dfx = &qm->debug.dfx; struct dentry *qm_d; void *data; @@ -4502,6 +4678,10 @@ void hisi_qm_debug_init(struct hisi_qm *qm) qm_create_debugfs_file(qm, qm->debug.qm_d, i); } + if (qm_regs) + debugfs_create_file("diff_regs", 0444, qm->debug.qm_d, + qm, &qm_diff_regs_fops); + debugfs_create_file("regs", 0444, qm->debug.qm_d, qm, &qm_regs_fops); debugfs_create_file("cmd", 0600, qm->debug.qm_d, qm, &qm_cmd_fops); diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index 0ac0fdb6a6bd..7e318055f76b 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -137,6 +137,12 @@ enum qm_state { QM_STOP, }; +struct dfx_diff_registers { + u32 *regs; + u32 reg_offset; + u32 reg_len; +}; + enum qp_state { QP_INIT = 1, QP_START, @@ -191,6 +197,8 @@ struct qm_debug { struct dentry *debug_root; struct dentry *qm_d; struct debugfs_file files[DEBUG_FILE_NUM]; + struct dfx_diff_registers *qm_diff_regs; + struct dfx_diff_registers *acc_diff_regs; }; struct qm_shaper_factor { @@ -449,6 +457,12 @@ int hisi_qm_sriov_disable(struct pci_dev *pdev, bool is_frozen); int hisi_qm_sriov_configure(struct pci_dev *pdev, int num_vfs); void hisi_qm_dev_err_init(struct hisi_qm *qm); void hisi_qm_dev_err_uninit(struct hisi_qm *qm); +int hisi_qm_diff_regs_init(struct hisi_qm *qm, + struct dfx_diff_registers *dregs, int reg_len); +void hisi_qm_diff_regs_uninit(struct hisi_qm *qm, int reg_len); +void hisi_qm_acc_diff_regs_dump(struct hisi_qm *qm, struct seq_file *s, + struct dfx_diff_registers *dregs, int regs_len); + pci_ers_result_t hisi_qm_dev_err_detected(struct pci_dev *pdev, pci_channel_state_t state); pci_ers_result_t hisi_qm_dev_slot_reset(struct pci_dev *pdev); -- Gitee From 77c39b7bdfb49e70596d03a0bb28416f28a132ae Mon Sep 17 00:00:00 2001 From: Kai Ye Date: Tue, 19 Jul 2022 16:51:38 +0800 Subject: [PATCH 503/674] crypto: hisilicon/hpre - support register checking mainline inclusion from mainline-v5.19-rc1 commit 9210bdaa0d49 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=9210bdaa0d49 ---------------------------------------------------------------------- The value of the register is changed after the task running. A debugfs file node is added to help users to check the change of register values. Signed-off-by: Longfang Liu Signed-off-by: Kai Ye Signed-off-by: Herbert Xu Signed-off-by: Jiangshui Yang Reviewed-by: Yang Shen Acked-by: Xie XiuQi Signed-off-by: Zheng Zengkai --- drivers/crypto/hisilicon/hpre/hpre_main.c | 90 ++++++++++++++++++----- 1 file changed, 72 insertions(+), 18 deletions(-) diff --git a/drivers/crypto/hisilicon/hpre/hpre_main.c b/drivers/crypto/hisilicon/hpre/hpre_main.c index 3589d8879b5e..07222856cc84 100644 --- a/drivers/crypto/hisilicon/hpre/hpre_main.c +++ b/drivers/crypto/hisilicon/hpre/hpre_main.c @@ -107,6 +107,15 @@ #define HPRE_SQE_MASK_OFFSET 8 #define HPRE_SQE_MASK_LEN 24 +#define HPRE_DFX_BASE 0x301000 +#define HPRE_DFX_COMMON1 0x301400 +#define HPRE_DFX_COMMON2 0x301A00 +#define HPRE_DFX_CORE 0x302000 +#define HPRE_DFX_BASE_LEN 0x55 +#define HPRE_DFX_COMMON1_LEN 0x41 +#define HPRE_DFX_COMMON2_LEN 0xE +#define HPRE_DFX_CORE_LEN 0x43 + static const char hpre_name[] = "hisi_hpre"; static struct dentry *hpre_debugfs_root; static const struct pci_device_id hpre_dev_ids[] = { @@ -226,6 +235,53 @@ static const char *hpre_dfx_files[HPRE_DFX_FILE_NUM] = { "invalid_req_cnt" }; +/* define the HPRE's dfx regs region and region length */ +static struct dfx_diff_registers hpre_diff_regs[] = { + { + .reg_offset = HPRE_DFX_BASE, + .reg_len = HPRE_DFX_BASE_LEN, + }, { + .reg_offset = HPRE_DFX_COMMON1, + .reg_len = HPRE_DFX_COMMON1_LEN, + }, { + .reg_offset = HPRE_DFX_COMMON2, + .reg_len = HPRE_DFX_COMMON2_LEN, + }, { + .reg_offset = HPRE_DFX_CORE, + .reg_len = HPRE_DFX_CORE_LEN, + }, +}; + +static int hpre_diff_regs_show(struct seq_file *s, void *unused) +{ + struct hisi_qm *qm = s->private; + + hisi_qm_acc_diff_regs_dump(qm, s, qm->debug.acc_diff_regs, + ARRAY_SIZE(hpre_diff_regs)); + + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(hpre_diff_regs); + +static int hpre_com_regs_show(struct seq_file *s, void *unused) +{ + hisi_qm_regs_dump(s, s->private); + + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(hpre_com_regs); + +static int hpre_cluster_regs_show(struct seq_file *s, void *unused) +{ + hisi_qm_regs_dump(s, s->private); + + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(hpre_cluster_regs); + static const struct kernel_param_ops hpre_uacce_mode_ops = { .set = uacce_mode_set, .get = param_get_int, @@ -779,24 +835,6 @@ static int hpre_debugfs_atomic64_set(void *data, u64 val) DEFINE_DEBUGFS_ATTRIBUTE(hpre_atomic64_ops, hpre_debugfs_atomic64_get, hpre_debugfs_atomic64_set, "%llu\n"); -static int hpre_com_regs_show(struct seq_file *s, void *unused) -{ - hisi_qm_regs_dump(s, s->private); - - return 0; -} - -DEFINE_SHOW_ATTRIBUTE(hpre_com_regs); - -static int hpre_cluster_regs_show(struct seq_file *s, void *unused) -{ - hisi_qm_regs_dump(s, s->private); - - return 0; -} - -DEFINE_SHOW_ATTRIBUTE(hpre_cluster_regs); - static int hpre_create_debugfs_file(struct hisi_qm *qm, struct dentry *dir, enum hpre_ctrl_dbgfs_file type, int indx) { @@ -895,6 +933,7 @@ static int hpre_ctrl_debug_init(struct hisi_qm *qm) static void hpre_dfx_debug_init(struct hisi_qm *qm) { + struct dfx_diff_registers *hpre_regs = qm->debug.acc_diff_regs; struct hpre *hpre = container_of(qm, struct hpre, qm); struct hpre_dfx *dfx = hpre->debug.dfx; struct dentry *parent; @@ -906,6 +945,10 @@ static void hpre_dfx_debug_init(struct hisi_qm *qm) debugfs_create_file(hpre_dfx_files[i], 0644, parent, &dfx[i], &hpre_atomic64_ops); } + + if (qm->fun_type == QM_HW_PF && hpre_regs) + debugfs_create_file("diff_regs", 0444, parent, + qm, &hpre_diff_regs_fops); } static int hpre_debugfs_init(struct hisi_qm *qm) @@ -918,6 +961,13 @@ static int hpre_debugfs_init(struct hisi_qm *qm) qm->debug.sqe_mask_offset = HPRE_SQE_MASK_OFFSET; qm->debug.sqe_mask_len = HPRE_SQE_MASK_LEN; + ret = hisi_qm_diff_regs_init(qm, hpre_diff_regs, + ARRAY_SIZE(hpre_diff_regs)); + if (ret) { + dev_warn(dev, "Failed to init HPRE diff regs!\n"); + goto debugfs_remove; + } + hisi_qm_debug_init(qm); if (qm->pdev->device == PCI_DEVICE_ID_HUAWEI_HPRE_PF) { @@ -931,12 +981,16 @@ static int hpre_debugfs_init(struct hisi_qm *qm) return 0; failed_to_create: + hisi_qm_diff_regs_uninit(qm, ARRAY_SIZE(hpre_diff_regs)); +debugfs_remove: debugfs_remove_recursive(qm->debug.debug_root); return ret; } static void hpre_debugfs_exit(struct hisi_qm *qm) { + hisi_qm_diff_regs_uninit(qm, ARRAY_SIZE(hpre_diff_regs)); + debugfs_remove_recursive(qm->debug.debug_root); } -- Gitee From 0a1b0ca9c4ae8c5c479bfe3b2be89c19a567b584 Mon Sep 17 00:00:00 2001 From: Kai Ye Date: Tue, 19 Jul 2022 16:51:39 +0800 Subject: [PATCH 504/674] crypto: hisilicon/sec - support register checking mainline inclusion from mainline-v5.19-rc1 commit 16175030bb5b category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=16175030bb5b ---------------------------------------------------------------------- The value of the register is changed after the task running. A debugfs file node is added to help users to check the change of register values. Signed-off-by: Longfang Liu Signed-off-by: Kai Ye Signed-off-by: Herbert Xu Signed-off-by: Jiangshui Yang Reviewed-by: Yang Shen Acked-by: Xie XiuQi Signed-off-by: Zheng Zengkai --- drivers/crypto/hisilicon/sec2/sec_main.c | 53 ++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/drivers/crypto/hisilicon/sec2/sec_main.c b/drivers/crypto/hisilicon/sec2/sec_main.c index 85f55d8f04cf..0dab1fde56d1 100644 --- a/drivers/crypto/hisilicon/sec2/sec_main.c +++ b/drivers/crypto/hisilicon/sec2/sec_main.c @@ -110,6 +110,15 @@ #define SEC_SQE_MASK_LEN 48 #define SEC_SHAPER_TYPE_RATE 400 +#define SEC_DFX_BASE 0x301000 +#define SEC_DFX_CORE 0x302100 +#define SEC_DFX_COMMON1 0x301600 +#define SEC_DFX_COMMON2 0x301C00 +#define SEC_DFX_BASE_LEN 0x9D +#define SEC_DFX_CORE_LEN 0x32B +#define SEC_DFX_COMMON1_LEN 0x45 +#define SEC_DFX_COMMON2_LEN 0xBA + struct sec_hw_error { u32 int_msk; const char *msg; @@ -226,6 +235,34 @@ static const struct debugfs_reg32 sec_dfx_regs[] = { {"SEC_BD_SAA8 ", 0x301C40}, }; +/* define the SEC's dfx regs region and region length */ +static struct dfx_diff_registers sec_diff_regs[] = { + { + .reg_offset = SEC_DFX_BASE, + .reg_len = SEC_DFX_BASE_LEN, + }, { + .reg_offset = SEC_DFX_COMMON1, + .reg_len = SEC_DFX_COMMON1_LEN, + }, { + .reg_offset = SEC_DFX_COMMON2, + .reg_len = SEC_DFX_COMMON2_LEN, + }, { + .reg_offset = SEC_DFX_CORE, + .reg_len = SEC_DFX_CORE_LEN, + }, +}; + +static int sec_diff_regs_show(struct seq_file *s, void *unused) +{ + struct hisi_qm *qm = s->private; + + hisi_qm_acc_diff_regs_dump(qm, s, qm->debug.acc_diff_regs, + ARRAY_SIZE(sec_diff_regs)); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(sec_diff_regs); + static int sec_pf_q_num_set(const char *val, const struct kernel_param *kp) { return q_num_set(val, kp, PCI_DEVICE_ID_HUAWEI_SEC_PF); @@ -729,6 +766,7 @@ DEFINE_SHOW_ATTRIBUTE(sec_regs); static int sec_core_debug_init(struct hisi_qm *qm) { + struct dfx_diff_registers *sec_regs = qm->debug.acc_diff_regs; struct sec_dev *sec = container_of(qm, struct sec_dev, qm); struct device *dev = &qm->pdev->dev; struct sec_dfx *dfx = &sec->debug.dfx; @@ -749,6 +787,9 @@ static int sec_core_debug_init(struct hisi_qm *qm) if (qm->pdev->device == PCI_DEVICE_ID_HUAWEI_SEC_PF) debugfs_create_file("regs", 0444, tmp_d, regset, &sec_regs_fops); + if (qm->fun_type == QM_HW_PF && sec_regs) + debugfs_create_file("diff_regs", 0444, tmp_d, + qm, &sec_diff_regs_fops); for (i = 0; i < ARRAY_SIZE(sec_dfx_labels); i++) { atomic64_t *data = (atomic64_t *)((uintptr_t)dfx + @@ -790,6 +831,14 @@ static int sec_debugfs_init(struct hisi_qm *qm) sec_debugfs_root); qm->debug.sqe_mask_offset = SEC_SQE_MASK_OFFSET; qm->debug.sqe_mask_len = SEC_SQE_MASK_LEN; + + ret = hisi_qm_diff_regs_init(qm, sec_diff_regs, + ARRAY_SIZE(sec_diff_regs)); + if (ret) { + dev_warn(dev, "Failed to init SEC diff regs!\n"); + goto debugfs_remove; + } + hisi_qm_debug_init(qm); ret = sec_debug_init(qm); @@ -799,12 +848,16 @@ static int sec_debugfs_init(struct hisi_qm *qm) return 0; failed_to_create: + hisi_qm_diff_regs_uninit(qm, ARRAY_SIZE(sec_diff_regs)); +debugfs_remove: debugfs_remove_recursive(sec_debugfs_root); return ret; } static void sec_debugfs_exit(struct hisi_qm *qm) { + hisi_qm_diff_regs_uninit(qm, ARRAY_SIZE(sec_diff_regs)); + debugfs_remove_recursive(qm->debug.debug_root); } -- Gitee From 16683f6b5cc2566f2660154788901cf40a52d690 Mon Sep 17 00:00:00 2001 From: Kai Ye Date: Tue, 19 Jul 2022 16:51:40 +0800 Subject: [PATCH 505/674] crypto: hisilicon/zip - support register checking mainline inclusion from mainline-v5.19-rc1 commit 9b0c97dfc215 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=9b0c97dfc215 ---------------------------------------------------------------------- The value of the register is changed after the task running. A debugfs file node is added to help users to check the change of register values. Signed-off-by: Longfang Liu Signed-off-by: Kai Ye Signed-off-by: Herbert Xu Signed-off-by: Jiangshui Yang Reviewed-by: Yang Shen Acked-by: Xie XiuQi Signed-off-by: Zheng Zengkai --- drivers/crypto/hisilicon/zip/zip_main.c | 78 ++++++++++++++++++++++--- 1 file changed, 70 insertions(+), 8 deletions(-) diff --git a/drivers/crypto/hisilicon/zip/zip_main.c b/drivers/crypto/hisilicon/zip/zip_main.c index 66decfe07282..ed539755eaf8 100644 --- a/drivers/crypto/hisilicon/zip/zip_main.c +++ b/drivers/crypto/hisilicon/zip/zip_main.c @@ -49,14 +49,18 @@ #define HZIP_QM_IDEL_STATUS 0x3040e4 -#define HZIP_CORE_DEBUG_COMP_0 0x302000 -#define HZIP_CORE_DEBUG_COMP_1 0x303000 -#define HZIP_CORE_DEBUG_DECOMP_0 0x304000 -#define HZIP_CORE_DEBUG_DECOMP_1 0x305000 -#define HZIP_CORE_DEBUG_DECOMP_2 0x306000 -#define HZIP_CORE_DEBUG_DECOMP_3 0x307000 -#define HZIP_CORE_DEBUG_DECOMP_4 0x308000 -#define HZIP_CORE_DEBUG_DECOMP_5 0x309000 +#define HZIP_CORE_DFX_BASE 0x301000 +#define HZIP_CLOCK_GATED_CONTL 0X301004 +#define HZIP_CORE_DFX_COMP_0 0x302000 +#define HZIP_CORE_DFX_COMP_1 0x303000 +#define HZIP_CORE_DFX_DECOMP_0 0x304000 +#define HZIP_CORE_DFX_DECOMP_1 0x305000 +#define HZIP_CORE_DFX_DECOMP_2 0x306000 +#define HZIP_CORE_DFX_DECOMP_3 0x307000 +#define HZIP_CORE_DFX_DECOMP_4 0x308000 +#define HZIP_CORE_DFX_DECOMP_5 0x309000 +#define HZIP_CORE_REGS_BASE_LEN 0xB0 +#define HZIP_CORE_REGS_DFX_LEN 0x28 #define HZIP_CORE_INT_SOURCE 0x3010A0 #define HZIP_CORE_INT_MASK_REG 0x3010A4 @@ -230,6 +234,48 @@ static const struct debugfs_reg32 hzip_dfx_regs[] = { {"HZIP_DECOMP_LZ77_CURR_ST ", 0x9cull}, }; +/* define the ZIP's dfx regs region and region length */ +static struct dfx_diff_registers hzip_diff_regs[] = { + { + .reg_offset = HZIP_CORE_DFX_BASE, + .reg_len = HZIP_CORE_REGS_BASE_LEN, + }, { + .reg_offset = HZIP_CORE_DFX_COMP_0, + .reg_len = HZIP_CORE_REGS_DFX_LEN, + }, { + .reg_offset = HZIP_CORE_DFX_COMP_1, + .reg_len = HZIP_CORE_REGS_DFX_LEN, + }, { + .reg_offset = HZIP_CORE_DFX_DECOMP_0, + .reg_len = HZIP_CORE_REGS_DFX_LEN, + }, { + .reg_offset = HZIP_CORE_DFX_DECOMP_1, + .reg_len = HZIP_CORE_REGS_DFX_LEN, + }, { + .reg_offset = HZIP_CORE_DFX_DECOMP_2, + .reg_len = HZIP_CORE_REGS_DFX_LEN, + }, { + .reg_offset = HZIP_CORE_DFX_DECOMP_3, + .reg_len = HZIP_CORE_REGS_DFX_LEN, + }, { + .reg_offset = HZIP_CORE_DFX_DECOMP_4, + .reg_len = HZIP_CORE_REGS_DFX_LEN, + }, { + .reg_offset = HZIP_CORE_DFX_DECOMP_5, + .reg_len = HZIP_CORE_REGS_DFX_LEN, + }, +}; + +static int hzip_diff_regs_show(struct seq_file *s, void *unused) +{ + struct hisi_qm *qm = s->private; + + hisi_qm_acc_diff_regs_dump(qm, s, qm->debug.acc_diff_regs, + ARRAY_SIZE(hzip_diff_regs)); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(hzip_diff_regs); static const struct kernel_param_ops zip_uacce_mode_ops = { .set = uacce_mode_set, .get = param_get_int, @@ -621,6 +667,7 @@ static int hisi_zip_core_debug_init(struct hisi_qm *qm) static void hisi_zip_dfx_debug_init(struct hisi_qm *qm) { + struct dfx_diff_registers *hzip_regs = qm->debug.acc_diff_regs; struct hisi_zip *zip = container_of(qm, struct hisi_zip, qm); struct hisi_zip_dfx *dfx = &zip->dfx; struct dentry *tmp_dir; @@ -634,6 +681,10 @@ static void hisi_zip_dfx_debug_init(struct hisi_qm *qm) 0644, tmp_dir, data, &zip_atomic64_ops); } + + if (qm->fun_type == QM_HW_PF && hzip_regs) + debugfs_create_file("diff_regs", 0444, tmp_dir, + qm, &hzip_diff_regs_fops); } static int hisi_zip_ctrl_debug_init(struct hisi_qm *qm) @@ -666,6 +717,13 @@ static int hisi_zip_debugfs_init(struct hisi_qm *qm) qm->debug.sqe_mask_offset = HZIP_SQE_MASK_OFFSET; qm->debug.sqe_mask_len = HZIP_SQE_MASK_LEN; qm->debug.debug_root = dev_d; + ret = hisi_qm_diff_regs_init(qm, hzip_diff_regs, + ARRAY_SIZE(hzip_diff_regs)); + if (ret) { + dev_warn(dev, "Failed to init ZIP diff regs!\n"); + goto debugfs_remove; + } + hisi_qm_debug_init(qm); if (qm->fun_type == QM_HW_PF) { @@ -679,6 +737,8 @@ static int hisi_zip_debugfs_init(struct hisi_qm *qm) return 0; failed_to_create: + hisi_qm_diff_regs_uninit(qm, ARRAY_SIZE(hzip_diff_regs)); +debugfs_remove: debugfs_remove_recursive(hzip_debugfs_root); return ret; } @@ -703,6 +763,8 @@ static void hisi_zip_debug_regs_clear(struct hisi_qm *qm) static void hisi_zip_debugfs_exit(struct hisi_qm *qm) { + hisi_qm_diff_regs_uninit(qm, ARRAY_SIZE(hzip_diff_regs)); + debugfs_remove_recursive(qm->debug.debug_root); if (qm->fun_type == QM_HW_PF) { -- Gitee From 64ee1e58dc5b085080d90b0a7b0504240723395a Mon Sep 17 00:00:00 2001 From: Kai Ye Date: Tue, 19 Jul 2022 16:51:41 +0800 Subject: [PATCH 506/674] crypto: hisilicon/qm - add last word dumping for ACC mainline inclusion from mainline-v5.19-rc1 commit a888ccd6c666 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a888ccd6c666 ---------------------------------------------------------------------- Add last word dumping function during acc engines controller reset. The last words are reported to the printed information during the reset. The dmesg information included qm debugging registers and engine debugging registers. It can help to improve debugging capability. Signed-off-by: Kai Ye Signed-off-by: Herbert Xu Signed-off-by: Jiangshui Yang Reviewed-by: Yang Shen Acked-by: Xie XiuQi Signed-off-by: Zheng Zengkai --- drivers/crypto/hisilicon/qm.c | 57 +++++++++++++++++++++++++++++++++++ include/linux/hisi_acc_qm.h | 4 +++ 2 files changed, 61 insertions(+) diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index 45f1ded0801f..e44d0c46705a 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -3604,6 +3604,17 @@ static void hisi_qm_set_state(struct hisi_qm *qm, enum vf_state state) writel(state, qm->io_base + QM_VF_STATE); } +static void qm_last_regs_uninit(struct hisi_qm *qm) +{ + struct qm_debug *debug = &qm->debug; + + if (qm->fun_type == QM_HW_VF || !debug->qm_last_words) + return; + + kfree(debug->qm_last_words); + debug->qm_last_words = NULL; +} + static void hisi_qm_pre_init(struct hisi_qm *qm) { struct pci_dev *pdev = qm->pdev; @@ -3685,6 +3696,8 @@ void hisi_qm_uninit(struct hisi_qm *qm) struct pci_dev *pdev = qm->pdev; struct device *dev = &pdev->dev; + qm_last_regs_uninit(qm); + qm_cmd_uninit(qm); kfree(qm->factor); down_write(&qm->qps_lock); @@ -5364,6 +5377,24 @@ static int qm_controller_reset_done(struct hisi_qm *qm) return 0; } +static void qm_show_last_dfx_regs(struct hisi_qm *qm) +{ + struct qm_debug *debug = &qm->debug; + struct pci_dev *pdev = qm->pdev; + u32 val; + int i; + + if (qm->fun_type == QM_HW_VF || !debug->qm_last_words) + return; + + for (i = 0; i < ARRAY_SIZE(qm_dfx_regs); i++) { + val = readl_relaxed(qm->io_base + qm_dfx_regs[i].offset); + if (debug->qm_last_words[i] != val) + pci_info(pdev, "%s \t= 0x%08x => 0x%08x\n", + qm_dfx_regs[i].name, debug->qm_last_words[i], val); + } +} + static int qm_controller_reset(struct hisi_qm *qm) { struct pci_dev *pdev = qm->pdev; @@ -5379,6 +5410,10 @@ static int qm_controller_reset(struct hisi_qm *qm) return ret; } + qm_show_last_dfx_regs(qm); + if (qm->err_ini->show_last_dfx_regs) + qm->err_ini->show_last_dfx_regs(qm); + ret = qm_soft_reset(qm); if (ret) { pci_err(pdev, "Controller reset failed (%d)\n", ret); @@ -6091,6 +6126,26 @@ static int hisi_qm_memory_init(struct hisi_qm *qm) return ret; } +static void qm_last_regs_init(struct hisi_qm *qm) +{ + int dfx_regs_num = ARRAY_SIZE(qm_dfx_regs); + struct qm_debug *debug = &qm->debug; + int i; + + if (qm->fun_type == QM_HW_VF) + return; + + debug->qm_last_words = kcalloc(dfx_regs_num, sizeof(unsigned int), + GFP_KERNEL); + if (!debug->qm_last_words) + return; + + for (i = 0; i < dfx_regs_num; i++) { + debug->qm_last_words[i] = readl_relaxed(qm->io_base + + qm_dfx_regs[i].offset); + } +} + /** * hisi_qm_init() - Initialize configures about qm. * @qm: The qm needing init. @@ -6143,6 +6198,8 @@ int hisi_qm_init(struct hisi_qm *qm) qm_cmd_init(qm); atomic_set(&qm->status.flags, QM_INIT); + qm_last_regs_init(qm); + return 0; err_alloc_uacce: diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index 7e318055f76b..677ce831949a 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -197,6 +197,9 @@ struct qm_debug { struct dentry *debug_root; struct dentry *qm_d; struct debugfs_file files[DEBUG_FILE_NUM]; + unsigned int *qm_last_words; + /* ACC engines recoreding last regs */ + unsigned int *last_words; struct dfx_diff_registers *qm_diff_regs; struct dfx_diff_registers *acc_diff_regs; }; @@ -252,6 +255,7 @@ struct hisi_qm_err_ini { void (*open_sva_prefetch)(struct hisi_qm *qm); void (*close_sva_prefetch)(struct hisi_qm *qm); void (*log_dev_hw_err)(struct hisi_qm *qm, u32 err_sts); + void (*show_last_dfx_regs)(struct hisi_qm *qm); void (*err_info_init)(struct hisi_qm *qm); }; -- Gitee From 1ad337a645d4ef19149608fadc651f8d0a3fb17b Mon Sep 17 00:00:00 2001 From: Kai Ye Date: Tue, 19 Jul 2022 16:51:42 +0800 Subject: [PATCH 507/674] crypto: hisilicon/sec - support last word dumping mainline inclusion from mainline-v5.19-rc1 commit 8a88d0914529 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=8a88d0914529 ---------------------------------------------------------------------- Add last word dumping function during sec engine controller reset. Signed-off-by: Kai Ye Signed-off-by: Herbert Xu Signed-off-by: Jiangshui Yang Reviewed-by: Yang Shen Acked-by: Xie XiuQi Signed-off-by: Zheng Zengkai --- drivers/crypto/hisilicon/sec2/sec_main.c | 55 +++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) diff --git a/drivers/crypto/hisilicon/sec2/sec_main.c b/drivers/crypto/hisilicon/sec2/sec_main.c index 0dab1fde56d1..eb6474d3f2cc 100644 --- a/drivers/crypto/hisilicon/sec2/sec_main.c +++ b/drivers/crypto/hisilicon/sec2/sec_main.c @@ -861,6 +861,53 @@ static void sec_debugfs_exit(struct hisi_qm *qm) debugfs_remove_recursive(qm->debug.debug_root); } +static int sec_show_last_regs_init(struct hisi_qm *qm) +{ + struct qm_debug *debug = &qm->debug; + int i; + + debug->last_words = kcalloc(ARRAY_SIZE(sec_dfx_regs), + sizeof(unsigned int), GFP_KERNEL); + if (!debug->last_words) + return -ENOMEM; + + for (i = 0; i < ARRAY_SIZE(sec_dfx_regs); i++) + debug->last_words[i] = readl_relaxed(qm->io_base + + sec_dfx_regs[i].offset); + + return 0; +} + +static void sec_show_last_regs_uninit(struct hisi_qm *qm) +{ + struct qm_debug *debug = &qm->debug; + + if (qm->fun_type == QM_HW_VF || !debug->last_words) + return; + + kfree(debug->last_words); + debug->last_words = NULL; +} + +static void sec_show_last_dfx_regs(struct hisi_qm *qm) +{ + struct qm_debug *debug = &qm->debug; + struct pci_dev *pdev = qm->pdev; + u32 val; + int i; + + if (qm->fun_type == QM_HW_VF || !debug->last_words) + return; + + /* dumps last word of the debugging registers during controller reset */ + for (i = 0; i < ARRAY_SIZE(sec_dfx_regs); i++) { + val = readl_relaxed(qm->io_base + sec_dfx_regs[i].offset); + if (val != debug->last_words[i]) + pci_info(pdev, "%s \t= 0x%08x => 0x%08x\n", + sec_dfx_regs[i].name, debug->last_words[i], val); + } +} + static void sec_log_hw_error(struct hisi_qm *qm, u32 err_sts) { const struct sec_hw_error *errs = sec_hw_errors; @@ -927,6 +974,7 @@ static const struct hisi_qm_err_ini sec_err_ini = { .open_axi_master_ooo = sec_open_axi_master_ooo, .open_sva_prefetch = sec_open_sva_prefetch, .close_sva_prefetch = sec_close_sva_prefetch, + .show_last_dfx_regs = sec_show_last_dfx_regs, .err_info_init = sec_err_info_init, }; @@ -945,8 +993,11 @@ static int sec_pf_probe_init(struct sec_dev *sec) sec_open_sva_prefetch(qm); hisi_qm_dev_err_init(qm); sec_debug_regs_clear(qm); + ret = sec_show_last_regs_init(qm); + if (ret) + pci_err(qm->pdev, "Failed to init last word regs!\n"); - return 0; + return ret; } static int sec_qm_init(struct hisi_qm *qm, struct pci_dev *pdev) @@ -1120,6 +1171,7 @@ static int sec_probe(struct pci_dev *pdev, const struct pci_device_id *id) sec_debugfs_exit(qm); hisi_qm_stop(qm, QM_NORMAL); err_probe_uninit: + sec_show_last_regs_uninit(qm); sec_probe_uninit(qm); err_qm_uninit: sec_qm_uninit(qm); @@ -1144,6 +1196,7 @@ static void sec_remove(struct pci_dev *pdev) if (qm->fun_type == QM_HW_PF) sec_debug_regs_clear(qm); + sec_show_last_regs_uninit(qm); sec_probe_uninit(qm); -- Gitee From 5366ae3636ab81eab3d8cac3b7f2ab211b98bf76 Mon Sep 17 00:00:00 2001 From: Kai Ye Date: Tue, 19 Jul 2022 16:51:43 +0800 Subject: [PATCH 508/674] crypto: hisilicon/hpre - support last word dumping mainline inclusion from mainline-v5.19-rc1 commit 42123e81fdba category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=42123e81fdba ---------------------------------------------------------------------- 1. Add some debugging registers. 2. Add last word dumping function during hpre engine controller reset. Signed-off-by: Kai Ye Signed-off-by: Herbert Xu Signed-off-by: Jiangshui Yang Reviewed-by: Yang Shen Acked-by: Xie XiuQi Signed-off-by: Zheng Zengkai --- drivers/crypto/hisilicon/hpre/hpre_main.c | 132 ++++++++++++++++++---- 1 file changed, 112 insertions(+), 20 deletions(-) diff --git a/drivers/crypto/hisilicon/hpre/hpre_main.c b/drivers/crypto/hisilicon/hpre/hpre_main.c index 07222856cc84..8200793aa15c 100644 --- a/drivers/crypto/hisilicon/hpre/hpre_main.c +++ b/drivers/crypto/hisilicon/hpre/hpre_main.c @@ -36,6 +36,12 @@ #define HPRE_DATA_WUSER_CFG 0x301040 #define HPRE_INT_MASK 0x301400 #define HPRE_INT_STATUS 0x301800 +#define HPRE_HAC_INT_MSK 0x301400 +#define HPRE_HAC_RAS_CE_ENB 0x301410 +#define HPRE_HAC_RAS_NFE_ENB 0x301414 +#define HPRE_HAC_RAS_FE_ENB 0x301418 +#define HPRE_HAC_INT_SET 0x301500 +#define HPRE_RNG_TIMEOUT_NUM 0x301A34 #define HPRE_CORE_INT_ENABLE 0 #define HPRE_CORE_INT_DISABLE GENMASK(21, 0) #define HPRE_RDCHN_INI_ST 0x301a00 @@ -201,28 +207,32 @@ static const u64 hpre_cluster_offsets[] = { }; static const struct debugfs_reg32 hpre_cluster_dfx_regs[] = { - {"CORES_EN_STATUS ", HPRE_CORE_EN_OFFSET}, - {"CORES_INI_CFG ", HPRE_CORE_INI_CFG_OFFSET}, - {"CORES_INI_STATUS ", HPRE_CORE_INI_STATUS_OFFSET}, - {"CORES_HTBT_WARN ", HPRE_CORE_HTBT_WARN_OFFSET}, - {"CORES_IS_SCHD ", HPRE_CORE_IS_SCHD_OFFSET}, + {"CORES_EN_STATUS ", HPRE_CORE_EN_OFFSET}, + {"CORES_INI_CFG ", HPRE_CORE_INI_CFG_OFFSET}, + {"CORES_INI_STATUS ", HPRE_CORE_INI_STATUS_OFFSET}, + {"CORES_HTBT_WARN ", HPRE_CORE_HTBT_WARN_OFFSET}, + {"CORES_IS_SCHD ", HPRE_CORE_IS_SCHD_OFFSET}, }; static const struct debugfs_reg32 hpre_com_dfx_regs[] = { - {"READ_CLR_EN ", HPRE_CTRL_CNT_CLR_CE}, - {"AXQOS ", HPRE_VFG_AXQOS}, - {"AWUSR_CFG ", HPRE_AWUSR_FP_CFG}, - {"QM_ARUSR_MCFG1 ", QM_ARUSER_M_CFG_1}, - {"QM_AWUSR_MCFG1 ", QM_AWUSER_M_CFG_1}, - {"BD_ENDIAN ", HPRE_BD_ENDIAN}, - {"ECC_CHECK_CTRL ", HPRE_ECC_BYPASS}, - {"RAS_INT_WIDTH ", HPRE_RAS_WIDTH_CFG}, - {"POISON_BYPASS ", HPRE_POISON_BYPASS}, - {"BD_ARUSER ", HPRE_BD_ARUSR_CFG}, - {"BD_AWUSER ", HPRE_BD_AWUSR_CFG}, - {"DATA_ARUSER ", HPRE_DATA_RUSER_CFG}, - {"DATA_AWUSER ", HPRE_DATA_WUSER_CFG}, - {"INT_STATUS ", HPRE_INT_STATUS}, + {"READ_CLR_EN ", HPRE_CTRL_CNT_CLR_CE}, + {"AXQOS ", HPRE_VFG_AXQOS}, + {"AWUSR_CFG ", HPRE_AWUSR_FP_CFG}, + {"BD_ENDIAN ", HPRE_BD_ENDIAN}, + {"ECC_CHECK_CTRL ", HPRE_ECC_BYPASS}, + {"RAS_INT_WIDTH ", HPRE_RAS_WIDTH_CFG}, + {"POISON_BYPASS ", HPRE_POISON_BYPASS}, + {"BD_ARUSER ", HPRE_BD_ARUSR_CFG}, + {"BD_AWUSER ", HPRE_BD_AWUSR_CFG}, + {"DATA_ARUSER ", HPRE_DATA_RUSER_CFG}, + {"DATA_AWUSER ", HPRE_DATA_WUSER_CFG}, + {"INT_STATUS ", HPRE_INT_STATUS}, + {"INT_MASK ", HPRE_HAC_INT_MSK}, + {"RAS_CE_ENB ", HPRE_HAC_RAS_CE_ENB}, + {"RAS_NFE_ENB ", HPRE_HAC_RAS_NFE_ENB}, + {"RAS_FE_ENB ", HPRE_HAC_RAS_FE_ENB}, + {"INT_SET ", HPRE_HAC_INT_SET}, + {"RNG_TIMEOUT_NUM ", HPRE_RNG_TIMEOUT_NUM}, }; static const char *hpre_dfx_files[HPRE_DFX_FILE_NUM] = { @@ -1023,6 +1033,82 @@ static int hpre_qm_init(struct hisi_qm *qm, struct pci_dev *pdev) return hisi_qm_init(qm); } +static int hpre_show_last_regs_init(struct hisi_qm *qm) +{ + int cluster_dfx_regs_num = ARRAY_SIZE(hpre_cluster_dfx_regs); + int com_dfx_regs_num = ARRAY_SIZE(hpre_com_dfx_regs); + u8 clusters_num = hpre_cluster_num(qm); + struct qm_debug *debug = &qm->debug; + void __iomem *io_base; + int i, j, idx; + + debug->last_words = kcalloc(cluster_dfx_regs_num * clusters_num + + com_dfx_regs_num, sizeof(unsigned int), GFP_KERNEL); + if (!debug->last_words) + return -ENOMEM; + + for (i = 0; i < com_dfx_regs_num; i++) + debug->last_words[i] = readl_relaxed(qm->io_base + + hpre_com_dfx_regs[i].offset); + + for (i = 0; i < clusters_num; i++) { + io_base = qm->io_base + hpre_cluster_offsets[i]; + for (j = 0; j < cluster_dfx_regs_num; j++) { + idx = com_dfx_regs_num + i * cluster_dfx_regs_num + j; + debug->last_words[idx] = readl_relaxed( + io_base + hpre_cluster_dfx_regs[j].offset); + } + } + + return 0; +} + +static void hpre_show_last_regs_uninit(struct hisi_qm *qm) +{ + struct qm_debug *debug = &qm->debug; + + if (qm->fun_type == QM_HW_VF || !debug->last_words) + return; + + kfree(debug->last_words); + debug->last_words = NULL; +} + +static void hpre_show_last_dfx_regs(struct hisi_qm *qm) +{ + int cluster_dfx_regs_num = ARRAY_SIZE(hpre_cluster_dfx_regs); + int com_dfx_regs_num = ARRAY_SIZE(hpre_com_dfx_regs); + u8 clusters_num = hpre_cluster_num(qm); + struct qm_debug *debug = &qm->debug; + struct pci_dev *pdev = qm->pdev; + void __iomem *io_base; + int i, j, idx; + u32 val; + + if (qm->fun_type == QM_HW_VF || !debug->last_words) + return; + + /* dumps last word of the debugging registers during controller reset */ + for (i = 0; i < com_dfx_regs_num; i++) { + val = readl_relaxed(qm->io_base + hpre_com_dfx_regs[i].offset); + if (debug->last_words[i] != val) + pci_info(pdev, "Common_core:%s \t= 0x%08x => 0x%08x\n", + hpre_com_dfx_regs[i].name, debug->last_words[i], val); + } + + for (i = 0; i < clusters_num; i++) { + io_base = qm->io_base + hpre_cluster_offsets[i]; + for (j = 0; j < cluster_dfx_regs_num; j++) { + val = readl_relaxed(io_base + + hpre_cluster_dfx_regs[j].offset); + idx = com_dfx_regs_num + i * cluster_dfx_regs_num + j; + if (debug->last_words[idx] != val) + pci_info(pdev, "cluster-%d:%s \t= 0x%08x => 0x%08x\n", + i, hpre_cluster_dfx_regs[j].name, debug->last_words[idx], val); + } + } +} + static void hpre_log_hw_error(struct hisi_qm *qm, u32 err_sts) { const struct hpre_hw_error *err = hpre_hw_errors; @@ -1081,6 +1167,7 @@ static const struct hisi_qm_err_ini hpre_err_ini = { .open_axi_master_ooo = hpre_open_axi_master_ooo, .open_sva_prefetch = hpre_open_sva_prefetch, .close_sva_prefetch = hpre_close_sva_prefetch, + .show_last_dfx_regs = hpre_show_last_dfx_regs, .err_info_init = hpre_err_info_init, }; @@ -1098,8 +1185,11 @@ static int hpre_pf_probe_init(struct hpre *hpre) qm->err_ini = &hpre_err_ini; qm->err_ini->err_info_init(qm); hisi_qm_dev_err_init(qm); + ret = hpre_show_last_regs_init(qm); + if (ret) + pci_err(qm->pdev, "Failed to init last word regs!\n"); - return 0; + return ret; } static int hpre_probe_init(struct hpre *hpre) @@ -1185,6 +1275,7 @@ static int hpre_probe(struct pci_dev *pdev, const struct pci_device_id *id) hisi_qm_stop(qm, QM_NORMAL); err_with_err_init: + hpre_show_last_regs_uninit(qm); hisi_qm_dev_err_uninit(qm); err_with_qm_init: @@ -1215,6 +1306,7 @@ static void hpre_remove(struct pci_dev *pdev) if (qm->fun_type == QM_HW_PF) { hpre_cnt_regs_clear(qm); qm->debug.curr_qm_qp_num = 0; + hpre_show_last_regs_uninit(qm); hisi_qm_dev_err_uninit(qm); } -- Gitee From 29c3db94b3d218f5370bc932a32c8770c276ecfa Mon Sep 17 00:00:00 2001 From: Kai Ye Date: Tue, 19 Jul 2022 16:51:44 +0800 Subject: [PATCH 509/674] crypto: hisilicon/zip - support last word dumping mainline inclusion from mainline-v5.19-rc1 commit 5bfabd50c6fa category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=5bfabd50c6fa ---------------------------------------------------------------------- 1. Add some debugging registers. 2. Add last word dumping function during zip engine controller reset. Signed-off-by: Kai Ye Signed-off-by: Herbert Xu Signed-off-by: Jiangshui Yang Reviewed-by: Yang Shen Acked-by: Xie XiuQi Signed-off-by: Zheng Zengkai --- drivers/crypto/hisilicon/zip/zip_main.c | 107 +++++++++++++++++++++++- 1 file changed, 106 insertions(+), 1 deletion(-) diff --git a/drivers/crypto/hisilicon/zip/zip_main.c b/drivers/crypto/hisilicon/zip/zip_main.c index ed539755eaf8..727030e1a57e 100644 --- a/drivers/crypto/hisilicon/zip/zip_main.c +++ b/drivers/crypto/hisilicon/zip/zip_main.c @@ -234,6 +234,22 @@ static const struct debugfs_reg32 hzip_dfx_regs[] = { {"HZIP_DECOMP_LZ77_CURR_ST ", 0x9cull}, }; +static const struct debugfs_reg32 hzip_com_dfx_regs[] = { + {"HZIP_CLOCK_GATE_CTRL ", 0x301004}, + {"HZIP_CORE_INT_RAS_CE_ENB ", 0x301160}, + {"HZIP_CORE_INT_RAS_NFE_ENB ", 0x301164}, + {"HZIP_CORE_INT_RAS_FE_ENB ", 0x301168}, + {"HZIP_UNCOM_ERR_RAS_CTRL ", 0x30116C}, +}; + +static const struct debugfs_reg32 hzip_dump_dfx_regs[] = { + {"HZIP_GET_BD_NUM ", 0x00ull}, + {"HZIP_GET_RIGHT_BD ", 0x04ull}, + {"HZIP_GET_ERROR_BD ", 0x08ull}, + {"HZIP_DONE_BD_NUM ", 0x0cull}, + {"HZIP_MAX_DELAY ", 0x20ull}, +}; + /* define the ZIP's dfx regs region and region length */ static struct dfx_diff_registers hzip_diff_regs[] = { { @@ -773,6 +789,87 @@ static void hisi_zip_debugfs_exit(struct hisi_qm *qm) } } +static int hisi_zip_show_last_regs_init(struct hisi_qm *qm) +{ + int core_dfx_regs_num = ARRAY_SIZE(hzip_dump_dfx_regs); + int com_dfx_regs_num = ARRAY_SIZE(hzip_com_dfx_regs); + struct qm_debug *debug = &qm->debug; + void __iomem *io_base; + int i, j, idx; + + debug->last_words = kcalloc(core_dfx_regs_num * HZIP_CORE_NUM + + com_dfx_regs_num, sizeof(unsigned int), GFP_KERNEL); + if (!debug->last_words) + return -ENOMEM; + + for (i = 0; i < com_dfx_regs_num; i++) { + io_base = qm->io_base + hzip_com_dfx_regs[i].offset; + debug->last_words[i] = readl_relaxed(io_base); + } + + for (i = 0; i < HZIP_CORE_NUM; i++) { + io_base = qm->io_base + core_offsets[i]; + for (j = 0; j < core_dfx_regs_num; j++) { + idx = com_dfx_regs_num + i * core_dfx_regs_num + j; + debug->last_words[idx] = readl_relaxed( + io_base + hzip_dump_dfx_regs[j].offset); + } + } + + return 0; +} + +static void hisi_zip_show_last_regs_uninit(struct hisi_qm *qm) +{ + struct qm_debug *debug = &qm->debug; + + if (qm->fun_type == QM_HW_VF || !debug->last_words) + return; + + kfree(debug->last_words); + debug->last_words = NULL; +} + +static void hisi_zip_show_last_dfx_regs(struct hisi_qm *qm) +{ + int core_dfx_regs_num = ARRAY_SIZE(hzip_dump_dfx_regs); + int com_dfx_regs_num = ARRAY_SIZE(hzip_com_dfx_regs); + struct qm_debug *debug = &qm->debug; + char buf[HZIP_BUF_SIZE]; + void __iomem *base; + int i, j, idx; + u32 val; + + if (qm->fun_type == QM_HW_VF || !debug->last_words) + return; + + for (i = 0; i < com_dfx_regs_num; i++) { + val = readl_relaxed(qm->io_base + hzip_com_dfx_regs[i].offset); + if (debug->last_words[i] != val) + pci_info(qm->pdev, "com_dfx: %s \t= 0x%08x => 0x%08x\n", + hzip_com_dfx_regs[i].name, debug->last_words[i], val); + } + + for (i = 0; i < HZIP_CORE_NUM; i++) { + if (i < HZIP_COMP_CORE_NUM) + scnprintf(buf, sizeof(buf), "Comp_core-%d", i); + else + scnprintf(buf, sizeof(buf), "Decomp_core-%d", + i - HZIP_COMP_CORE_NUM); + base = qm->io_base + core_offsets[i]; + + pci_info(qm->pdev, "==>%s:\n", buf); + /* dump last word for dfx regs during control resetting */ + for (j = 0; j < core_dfx_regs_num; j++) { + idx = com_dfx_regs_num + i * core_dfx_regs_num + j; + val = readl_relaxed(base + hzip_dump_dfx_regs[j].offset); + if (debug->last_words[idx] != val) + pci_info(qm->pdev, "%s \t= 0x%08x => 0x%08x\n", + hzip_dump_dfx_regs[j].name, debug->last_words[idx], val); + } + } +} + static void hisi_zip_log_hw_error(struct hisi_qm *qm, u32 err_sts) { const struct hisi_zip_hw_error *err = zip_hw_error; @@ -860,6 +957,7 @@ static const struct hisi_qm_err_ini hisi_zip_err_ini = { .close_axi_master_ooo = hisi_zip_close_axi_master_ooo, .open_sva_prefetch = hisi_zip_open_sva_prefetch, .close_sva_prefetch = hisi_zip_close_sva_prefetch, + .show_last_dfx_regs = hisi_zip_show_last_dfx_regs, .err_info_init = hisi_zip_err_info_init, }; @@ -867,6 +965,7 @@ static int hisi_zip_pf_probe_init(struct hisi_zip *hisi_zip) { struct hisi_qm *qm = &hisi_zip->qm; struct hisi_zip_ctrl *ctrl; + int ret; ctrl = devm_kzalloc(&qm->pdev->dev, sizeof(*ctrl), GFP_KERNEL); if (!ctrl) @@ -882,7 +981,11 @@ static int hisi_zip_pf_probe_init(struct hisi_zip *hisi_zip) hisi_qm_dev_err_init(qm); hisi_zip_debug_regs_clear(qm); - return 0; + ret = hisi_zip_show_last_regs_init(qm); + if (ret) + pci_err(qm->pdev, "Failed to init last word regs!\n"); + + return ret; } static int hisi_zip_qm_init(struct hisi_qm *qm, struct pci_dev *pdev) @@ -1026,6 +1129,7 @@ static int hisi_zip_probe(struct pci_dev *pdev, const struct pci_device_id *id) hisi_qm_stop(qm, QM_NORMAL); err_dev_err_uninit: + hisi_zip_show_last_regs_uninit(qm); hisi_qm_dev_err_uninit(qm); err_qm_uninit: @@ -1047,6 +1151,7 @@ static void hisi_zip_remove(struct pci_dev *pdev) hisi_zip_debugfs_exit(qm); hisi_qm_stop(qm, QM_NORMAL); + hisi_zip_show_last_regs_uninit(qm); hisi_qm_dev_err_uninit(qm); hisi_zip_qm_uninit(qm); } -- Gitee From e4c7bd8e82ef37fccbbb968e75ab58ec1d250a31 Mon Sep 17 00:00:00 2001 From: Yang Shen Date: Tue, 19 Jul 2022 16:51:45 +0800 Subject: [PATCH 510/674] crypto: hisilicon/sgl - align the hardware sgl dma address mainline inclusion from mainline-v5.19-rc1 commit 948e35f13181 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=948e35f13181 ---------------------------------------------------------------------- The hardware needs aligned sgl dma address. So expend the sgl_size to align 64 bytes. Signed-off-by: Yang Shen Signed-off-by: Herbert Xu Signed-off-by: Jiangshui Yang Reviewed-by: Yang Shen Acked-by: Xie XiuQi Signed-off-by: Zheng Zengkai --- drivers/crypto/hisilicon/sgl.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/crypto/hisilicon/sgl.c b/drivers/crypto/hisilicon/sgl.c index f7efc02b065f..2b6f2281cfd6 100644 --- a/drivers/crypto/hisilicon/sgl.c +++ b/drivers/crypto/hisilicon/sgl.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2019 HiSilicon Limited. */ +#include #include #include #include @@ -64,8 +65,9 @@ struct hisi_acc_sgl_pool *hisi_acc_create_sgl_pool(struct device *dev, if (!dev || !count || !sge_nr || sge_nr > HISI_ACC_SGL_SGE_NR_MAX) return ERR_PTR(-EINVAL); - sgl_size = sizeof(struct acc_hw_sge) * sge_nr + - sizeof(struct hisi_acc_hw_sgl); + sgl_size = ALIGN(sizeof(struct acc_hw_sge) * sge_nr + + sizeof(struct hisi_acc_hw_sgl), + HISI_ACC_SGL_ALIGN_SIZE); /* * the pool may allocate a block of memory of size PAGE_SIZE * 2^(MAX_ORDER - 1), -- Gitee From 8061d4c1d05c1e2ec061b9b98940cd18d0f004dd Mon Sep 17 00:00:00 2001 From: Weili Qian Date: Tue, 19 Jul 2022 16:51:46 +0800 Subject: [PATCH 511/674] crypto: hisilicon/qm - remove unused function declaration mainline inclusion from mainline-v5.19-rc1 commit 0b0002315adf category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0b0002315adf ---------------------------------------------------------------------- The 'hisi_qm_get_hw_version' function is unused, so remove the function declaration. Signed-off-by: Weili Qian Signed-off-by: Herbert Xu Signed-off-by: Jiangshui Yang Reviewed-by: Yang Shen Acked-by: Xie XiuQi Signed-off-by: Zheng Zengkai --- include/linux/hisi_acc_qm.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index 677ce831949a..b84092a39429 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -454,7 +454,6 @@ int hisi_qp_send(struct hisi_qp *qp, const void *msg); int hisi_qm_get_free_qp_num(struct hisi_qm *qm); int hisi_qm_get_vft(struct hisi_qm *qm, u32 *base, u32 *number); void hisi_qm_debug_init(struct hisi_qm *qm); -enum qm_hw_ver hisi_qm_get_hw_version(struct pci_dev *pdev); void hisi_qm_debug_regs_clear(struct hisi_qm *qm); int hisi_qm_sriov_enable(struct pci_dev *pdev, int max_vfs); int hisi_qm_sriov_disable(struct pci_dev *pdev, bool is_frozen); -- Gitee From 6362db4b8c498a64eac4f98d897b559c73dd1532 Mon Sep 17 00:00:00 2001 From: Weili Qian Date: Tue, 19 Jul 2022 16:51:47 +0800 Subject: [PATCH 512/674] crypto: hisilicon/qm - set function with static mainline inclusion from mainline-v5.19-rc1 commit fb06eb9727d6 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=fb06eb9727d6 ---------------------------------------------------------------------- These functions 'hisi_qm_create_qp' and 'hisi_qm_set_vft' are not used outside qm.c, so they are marked as static. Signed-off-by: Weili Qian Signed-off-by: Herbert Xu Signed-off-by: Jiangshui Yang Reviewed-by: Yang Shen Acked-by: Xie XiuQi Signed-off-by: Zheng Zengkai --- drivers/crypto/hisilicon/qm.c | 6 ++---- include/linux/hisi_acc_qm.h | 2 -- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index e44d0c46705a..c14c52f17ca9 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -2835,7 +2835,7 @@ static struct hisi_qp *qm_create_qp_nolock(struct hisi_qm *qm, u8 alg_type) * return created qp, -EBUSY if all qps in qm allocated, -ENOMEM if allocating * qp memory fails. */ -struct hisi_qp *hisi_qm_create_qp(struct hisi_qm *qm, u8 alg_type) +static struct hisi_qp *hisi_qm_create_qp(struct hisi_qm *qm, u8 alg_type) { struct hisi_qp *qp; int ret; @@ -2853,7 +2853,6 @@ struct hisi_qp *hisi_qm_create_qp(struct hisi_qm *qm, u8 alg_type) return qp; } -EXPORT_SYMBOL_GPL(hisi_qm_create_qp); /** * hisi_qm_release_qp() - Release a qp back to its qm. @@ -3738,7 +3737,7 @@ EXPORT_SYMBOL_GPL(hisi_qm_uninit); * * qm hw v1 does not support this interface. */ -int hisi_qm_get_vft(struct hisi_qm *qm, u32 *base, u32 *number) +static int hisi_qm_get_vft(struct hisi_qm *qm, u32 *base, u32 *number) { if (!base || !number) return -EINVAL; @@ -3750,7 +3749,6 @@ int hisi_qm_get_vft(struct hisi_qm *qm, u32 *base, u32 *number) return qm->ops->get_vft(qm, base, number); } -EXPORT_SYMBOL_GPL(hisi_qm_get_vft); /** * hisi_qm_set_vft() - Set vft to a qm. diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index b84092a39429..4ac2d64c2f32 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -446,13 +446,11 @@ int hisi_qm_init(struct hisi_qm *qm); void hisi_qm_uninit(struct hisi_qm *qm); int hisi_qm_start(struct hisi_qm *qm); int hisi_qm_stop(struct hisi_qm *qm, enum qm_stop_reason r); -struct hisi_qp *hisi_qm_create_qp(struct hisi_qm *qm, u8 alg_type); int hisi_qm_start_qp(struct hisi_qp *qp, unsigned long arg); int hisi_qm_stop_qp(struct hisi_qp *qp); void hisi_qm_release_qp(struct hisi_qp *qp); int hisi_qp_send(struct hisi_qp *qp, const void *msg); int hisi_qm_get_free_qp_num(struct hisi_qm *qm); -int hisi_qm_get_vft(struct hisi_qm *qm, u32 *base, u32 *number); void hisi_qm_debug_init(struct hisi_qm *qm); void hisi_qm_debug_regs_clear(struct hisi_qm *qm); int hisi_qm_sriov_enable(struct pci_dev *pdev, int max_vfs); -- Gitee From 80a2e816f7a5196f99d029a7387d51962a74a3fe Mon Sep 17 00:00:00 2001 From: Weili Qian Date: Tue, 19 Jul 2022 16:51:48 +0800 Subject: [PATCH 513/674] crypto: hisilicon/qm - replace hisi_qm_release_qp() with hisi_qm_free_qps() mainline inclusion from mainline-v5.19-rc1 commit 7982996c5b08 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=7982996c5b08 ---------------------------------------------------------------------- hisi_qm_free_qps() can release multiple queues in one call, and it is already exported. So, replace hisi_qm_release_qp() with hisi_qm_free_qps() in zip_crypto.c, and do not export hisi_qm_release_qp() outside qm.c. Signed-off-by: Weili Qian Signed-off-by: Herbert Xu Signed-off-by: Jiangshui Yang Reviewed-by: Yang Shen Acked-by: Xie XiuQi Signed-off-by: Zheng Zengkai --- drivers/crypto/hisilicon/qm.c | 3 +-- drivers/crypto/hisilicon/zip/zip_crypto.c | 2 +- include/linux/hisi_acc_qm.h | 1 - 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index c14c52f17ca9..b5aedf7dacdd 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -2860,7 +2860,7 @@ static struct hisi_qp *hisi_qm_create_qp(struct hisi_qm *qm, u8 alg_type) * * This function releases the resource of a qp. */ -void hisi_qm_release_qp(struct hisi_qp *qp) +static void hisi_qm_release_qp(struct hisi_qp *qp) { struct hisi_qm *qm = qp->qm; @@ -2878,7 +2878,6 @@ void hisi_qm_release_qp(struct hisi_qp *qp) qm_pm_put_sync(qm); } -EXPORT_SYMBOL_GPL(hisi_qm_release_qp); static int qm_sq_ctx_cfg(struct hisi_qp *qp, int qp_id, u32 pasid) { diff --git a/drivers/crypto/hisilicon/zip/zip_crypto.c b/drivers/crypto/hisilicon/zip/zip_crypto.c index 9520a4113c81..67869513e48c 100644 --- a/drivers/crypto/hisilicon/zip/zip_crypto.c +++ b/drivers/crypto/hisilicon/zip/zip_crypto.c @@ -521,7 +521,7 @@ static int hisi_zip_start_qp(struct hisi_qp *qp, struct hisi_zip_qp_ctx *ctx, static void hisi_zip_release_qp(struct hisi_zip_qp_ctx *ctx) { hisi_qm_stop_qp(ctx->qp); - hisi_qm_release_qp(ctx->qp); + hisi_qm_free_qps(&ctx->qp, 1); } static const struct hisi_zip_sqe_ops hisi_zip_ops_v1 = { diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index 4ac2d64c2f32..452d50f2c05c 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -448,7 +448,6 @@ int hisi_qm_start(struct hisi_qm *qm); int hisi_qm_stop(struct hisi_qm *qm, enum qm_stop_reason r); int hisi_qm_start_qp(struct hisi_qp *qp, unsigned long arg); int hisi_qm_stop_qp(struct hisi_qp *qp); -void hisi_qm_release_qp(struct hisi_qp *qp); int hisi_qp_send(struct hisi_qp *qp, const void *msg); int hisi_qm_get_free_qp_num(struct hisi_qm *qm); void hisi_qm_debug_init(struct hisi_qm *qm); -- Gitee From 139dd6e79d062c15be9eaa37b0c66c3f257bba18 Mon Sep 17 00:00:00 2001 From: Weili Qian Date: Tue, 19 Jul 2022 16:51:49 +0800 Subject: [PATCH 514/674] crypto: hisilicon/qm - remove hisi_qm_get_free_qp_num() mainline inclusion from mainline-v5.19-rc1 commit b0c42232fce4 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b0c42232fce4 ---------------------------------------------------------------------- hisi_qm_get_free_qp_num() is to get the free queue number on the function. It is a simple function and is only called by hisi_qm_get_available_instances(). This patch modifies to get the free queue directly in hisi_qm_get_available_instances(), and remove hisi_qm_get_free_qp_num(). Signed-off-by: Weili Qian Signed-off-by: Herbert Xu Signed-off-by: Jiangshui Yang Reviewed-by: Yang Shen Acked-by: Xie XiuQi Signed-off-by: Zheng Zengkai --- drivers/crypto/hisilicon/qm.c | 28 +++++++++------------------- include/linux/hisi_acc_qm.h | 1 - 2 files changed, 9 insertions(+), 20 deletions(-) diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index b5aedf7dacdd..f75e14e64c3c 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -3227,9 +3227,17 @@ static void qm_qp_event_notifier(struct hisi_qp *qp) wake_up_interruptible(&qp->uacce_q->wait); } + /* This function returns free number of qp in qm. */ static int hisi_qm_get_available_instances(struct uacce_device *uacce) { - return hisi_qm_get_free_qp_num(uacce->priv); + struct hisi_qm *qm = uacce->priv; + int ret; + + down_read(&qm->qps_lock); + ret = qm->qp_num - qm->qp_in_used; + up_read(&qm->qps_lock); + + return ret; } static void hisi_qm_set_hw_reset(struct hisi_qm *qm, int offset) @@ -3540,24 +3548,6 @@ void hisi_qm_wait_task_finish(struct hisi_qm *qm, struct hisi_qm_list *qm_list) } EXPORT_SYMBOL_GPL(hisi_qm_wait_task_finish); -/** - * hisi_qm_get_free_qp_num() - Get free number of qp in qm. - * @qm: The qm which want to get free qp. - * - * This function return free number of qp in qm. - */ -int hisi_qm_get_free_qp_num(struct hisi_qm *qm) -{ - int ret; - - down_read(&qm->qps_lock); - ret = qm->qp_num - qm->qp_in_used; - up_read(&qm->qps_lock); - - return ret; -} -EXPORT_SYMBOL_GPL(hisi_qm_get_free_qp_num); - static void hisi_qp_memory_uninit(struct hisi_qm *qm, int num) { struct device *dev = &qm->pdev->dev; diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index 452d50f2c05c..7ffdee585153 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -449,7 +449,6 @@ int hisi_qm_stop(struct hisi_qm *qm, enum qm_stop_reason r); int hisi_qm_start_qp(struct hisi_qp *qp, unsigned long arg); int hisi_qm_stop_qp(struct hisi_qp *qp); int hisi_qp_send(struct hisi_qp *qp, const void *msg); -int hisi_qm_get_free_qp_num(struct hisi_qm *qm); void hisi_qm_debug_init(struct hisi_qm *qm); void hisi_qm_debug_regs_clear(struct hisi_qm *qm); int hisi_qm_sriov_enable(struct pci_dev *pdev, int max_vfs); -- Gitee From d6217b97b475f3ebb4c466aee52b89e4b16f9588 Mon Sep 17 00:00:00 2001 From: Kai Ye Date: Tue, 19 Jul 2022 16:51:50 +0800 Subject: [PATCH 515/674] crypto: hisilicon/sec - add sm4 generic selection mainline inclusion from mainline-v5.19-rc1 commit fdbf5e46e7af category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=fdbf5e46e7af ---------------------------------------------------------------------- Add sm4 generic selection for fallback tfm in the Kconfig. Signed-off-by: Kai Ye Signed-off-by: Herbert Xu Signed-off-by: Jiangshui Yang Reviewed-by: Yang Shen Acked-by: Xie XiuQi Signed-off-by: Zheng Zengkai --- drivers/crypto/hisilicon/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/crypto/hisilicon/Kconfig b/drivers/crypto/hisilicon/Kconfig index ae0bd02f5c40..f9f749e96aae 100644 --- a/drivers/crypto/hisilicon/Kconfig +++ b/drivers/crypto/hisilicon/Kconfig @@ -26,6 +26,7 @@ config CRYPTO_DEV_HISI_SEC2 select CRYPTO_SHA1 select CRYPTO_SHA256 select CRYPTO_SHA512 + select CRYPTO_SM4 depends on PCI && PCI_MSI depends on UACCE || UACCE=n depends on ARM64 || (COMPILE_TEST && 64BIT) -- Gitee From da6deb3f3174a3df57e060911f6f7fe933f64194 Mon Sep 17 00:00:00 2001 From: Kai Ye Date: Tue, 19 Jul 2022 16:51:51 +0800 Subject: [PATCH 516/674] crypto: hisilicon/sec - delete the flag CRYPTO_ALG_ALLOCATES_MEMORY mainline inclusion from mainline-v5.19-rc1 commit 2d33f5771b51 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=2d33f5771b51 ---------------------------------------------------------------------- Should not to uses the CRYPTO_ALG_ALLOCATES_MEMORY in SEC2. The SEC2 driver uses the pre-allocated buffers, including the src sgl pool, dst sgl pool and other qp ctx resources. (e.g. IV buffer, mac buffer, key buffer). The SEC2 driver doesn't allocate memory during request processing. The driver only maps software sgl to allocated hardware sgl during I/O. So here is fix it. Signed-off-by: Kai Ye Signed-off-by: Herbert Xu Signed-off-by: Jiangshui Yang Reviewed-by: Yang Shen Acked-by: Xie XiuQi Signed-off-by: Zheng Zengkai --- drivers/crypto/hisilicon/sec2/sec_crypto.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/crypto/hisilicon/sec2/sec_crypto.c b/drivers/crypto/hisilicon/sec2/sec_crypto.c index a91635c348b5..6eebe739893c 100644 --- a/drivers/crypto/hisilicon/sec2/sec_crypto.c +++ b/drivers/crypto/hisilicon/sec2/sec_crypto.c @@ -2113,7 +2113,6 @@ static int sec_skcipher_decrypt(struct skcipher_request *sk_req) .cra_driver_name = "hisi_sec_"sec_cra_name,\ .cra_priority = SEC_PRIORITY,\ .cra_flags = CRYPTO_ALG_ASYNC |\ - CRYPTO_ALG_ALLOCATES_MEMORY |\ CRYPTO_ALG_NEED_FALLBACK,\ .cra_blocksize = blk_size,\ .cra_ctxsize = sizeof(struct sec_ctx),\ @@ -2366,7 +2365,6 @@ static int sec_aead_decrypt(struct aead_request *a_req) .cra_driver_name = "hisi_sec_"sec_cra_name,\ .cra_priority = SEC_PRIORITY,\ .cra_flags = CRYPTO_ALG_ASYNC |\ - CRYPTO_ALG_ALLOCATES_MEMORY |\ CRYPTO_ALG_NEED_FALLBACK,\ .cra_blocksize = blk_size,\ .cra_ctxsize = sizeof(struct sec_ctx),\ -- Gitee From ce5059233c0fda81b7fd2c871b260b73499e0d75 Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Tue, 19 Jul 2022 22:10:05 +0800 Subject: [PATCH 517/674] crypto: hisilicon/sec - don't sleep when in softirq mainline inclusion from mainline-v5.19-rc6 commit 02884a4f12de11f54d4ca67a07dd1f111d96fdbd category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GEVR CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/herbert/cryptodev-2.6.git/commit/?id=68740ab505431f268dc1ee26a54b871e75f0ddaa -------------------------------- When kunpeng920 encryption driver is used to deencrypt and decrypt packets during the softirq, it is not allowed to use mutex lock. The kernel will report the following error: BUG: scheduling while atomic: swapper/57/0/0x00000300 Call trace: dump_backtrace+0x0/0x1e4 show_stack+0x20/0x2c dump_stack+0xd8/0x140 __schedule_bug+0x68/0x80 __schedule+0x728/0x840 schedule+0x50/0xe0 schedule_preempt_disabled+0x18/0x24 __mutex_lock.constprop.0+0x594/0x5dc __mutex_lock_slowpath+0x1c/0x30 mutex_lock+0x50/0x60 sec_request_init+0x8c/0x1a0 [hisi_sec2] sec_process+0x28/0x1ac [hisi_sec2] sec_skcipher_crypto+0xf4/0x1d4 [hisi_sec2] sec_skcipher_encrypt+0x1c/0x30 [hisi_sec2] crypto_skcipher_encrypt+0x2c/0x40 crypto_authenc_encrypt+0xc8/0xfc [authenc] crypto_aead_encrypt+0x2c/0x40 echainiv_encrypt+0x144/0x1a0 [echainiv] crypto_aead_encrypt+0x2c/0x40 esp_output_tail+0x348/0x5c0 [esp4] esp_output+0x120/0x19c [esp4] xfrm_output_one+0x25c/0x4d4 xfrm_output_resume+0x6c/0x1fc xfrm_output+0xac/0x3c0 xfrm4_output+0x64/0x130 ip_build_and_send_pkt+0x158/0x20c tcp_v4_send_synack+0xdc/0x1f0 tcp_conn_request+0x7d0/0x994 tcp_v4_conn_request+0x58/0x6c tcp_v6_conn_request+0xf0/0x100 tcp_rcv_state_process+0x1cc/0xd60 tcp_v4_do_rcv+0x10c/0x250 tcp_v4_rcv+0xfc4/0x10a4 ip_protocol_deliver_rcu+0xf4/0x200 ip_local_deliver_finish+0x58/0x70 ip_local_deliver+0x68/0x120 ip_sublist_rcv_finish+0x70/0x94 ip_list_rcv_finish.constprop.0+0x17c/0x1d0 ip_sublist_rcv+0x40/0xb0 ip_list_rcv+0x140/0x1dc __netif_receive_skb_list_core+0x154/0x28c __netif_receive_skb_list+0x120/0x1a0 netif_receive_skb_list_internal+0xe4/0x1f0 napi_complete_done+0x70/0x1f0 gro_cell_poll+0x9c/0xb0 napi_poll+0xcc/0x264 net_rx_action+0xd4/0x21c __do_softirq+0x130/0x358 irq_exit+0x11c/0x13c __handle_domain_irq+0x88/0xf0 gic_handle_irq+0x78/0x2c0 el1_irq+0xb8/0x140 arch_cpu_idle+0x18/0x40 default_idle_call+0x5c/0x1c0 cpuidle_idle_call+0x174/0x1b0 do_idle+0xc8/0x160 cpu_startup_entry+0x30/0x11c secondary_start_kernel+0x158/0x1e4 softirq: huh, entered softirq 3 NET_RX 0000000093774ee4 with preempt_count 00000100, exited with fffffe00? Fixes: 416d82204df4 ("crypto: hisilicon - add HiSilicon SEC V2 driver") Signed-off-by: Zhengchao Shao Signed-off-by: Herbert Xu Signed-off-by: Zhengchao Shao Reviewed-by: Yue Haibing Reviewed-by: Wei Yongjun Signed-off-by: Zheng Zengkai --- drivers/crypto/hisilicon/sec2/sec.h | 2 +- drivers/crypto/hisilicon/sec2/sec_crypto.c | 20 ++++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/crypto/hisilicon/sec2/sec.h b/drivers/crypto/hisilicon/sec2/sec.h index c2e9b01187a7..a44c8dba3cda 100644 --- a/drivers/crypto/hisilicon/sec2/sec.h +++ b/drivers/crypto/hisilicon/sec2/sec.h @@ -119,7 +119,7 @@ struct sec_qp_ctx { struct idr req_idr; struct sec_alg_res res[QM_Q_DEPTH]; struct sec_ctx *ctx; - struct mutex req_lock; + spinlock_t req_lock; struct list_head backlog; struct hisi_acc_sgl_pool *c_in_pool; struct hisi_acc_sgl_pool *c_out_pool; diff --git a/drivers/crypto/hisilicon/sec2/sec_crypto.c b/drivers/crypto/hisilicon/sec2/sec_crypto.c index 6eebe739893c..71dfa7db6394 100644 --- a/drivers/crypto/hisilicon/sec2/sec_crypto.c +++ b/drivers/crypto/hisilicon/sec2/sec_crypto.c @@ -127,11 +127,11 @@ static int sec_alloc_req_id(struct sec_req *req, struct sec_qp_ctx *qp_ctx) { int req_id; - mutex_lock(&qp_ctx->req_lock); + spin_lock_bh(&qp_ctx->req_lock); req_id = idr_alloc_cyclic(&qp_ctx->req_idr, NULL, 0, QM_Q_DEPTH, GFP_ATOMIC); - mutex_unlock(&qp_ctx->req_lock); + spin_unlock_bh(&qp_ctx->req_lock); if (unlikely(req_id < 0)) { dev_err(req->ctx->dev, "alloc req id fail!\n"); return req_id; @@ -156,9 +156,9 @@ static void sec_free_req_id(struct sec_req *req) qp_ctx->req_list[req_id] = NULL; req->qp_ctx = NULL; - mutex_lock(&qp_ctx->req_lock); + spin_lock_bh(&qp_ctx->req_lock); idr_remove(&qp_ctx->req_idr, req_id); - mutex_unlock(&qp_ctx->req_lock); + spin_unlock_bh(&qp_ctx->req_lock); } static u8 pre_parse_finished_bd(struct bd_status *status, void *resp) @@ -273,7 +273,7 @@ static int sec_bd_send(struct sec_ctx *ctx, struct sec_req *req) !(req->flag & CRYPTO_TFM_REQ_MAY_BACKLOG)) return -EBUSY; - mutex_lock(&qp_ctx->req_lock); + spin_lock_bh(&qp_ctx->req_lock); ret = hisi_qp_send(qp_ctx->qp, &req->sec_sqe); if (ctx->fake_req_limit <= @@ -281,10 +281,10 @@ static int sec_bd_send(struct sec_ctx *ctx, struct sec_req *req) list_add_tail(&req->backlog_head, &qp_ctx->backlog); atomic64_inc(&ctx->sec->debug.dfx.send_cnt); atomic64_inc(&ctx->sec->debug.dfx.send_busy_cnt); - mutex_unlock(&qp_ctx->req_lock); + spin_unlock_bh(&qp_ctx->req_lock); return -EBUSY; } - mutex_unlock(&qp_ctx->req_lock); + spin_unlock_bh(&qp_ctx->req_lock); if (unlikely(ret == -EBUSY)) return -ENOBUFS; @@ -487,7 +487,7 @@ static int sec_create_qp_ctx(struct hisi_qm *qm, struct sec_ctx *ctx, qp->req_cb = sec_req_cb; - mutex_init(&qp_ctx->req_lock); + spin_lock_init(&qp_ctx->req_lock); idr_init(&qp_ctx->req_idr); INIT_LIST_HEAD(&qp_ctx->backlog); @@ -1382,7 +1382,7 @@ static struct sec_req *sec_back_req_clear(struct sec_ctx *ctx, { struct sec_req *backlog_req = NULL; - mutex_lock(&qp_ctx->req_lock); + spin_lock_bh(&qp_ctx->req_lock); if (ctx->fake_req_limit >= atomic_read(&qp_ctx->qp->qp_status.used) && !list_empty(&qp_ctx->backlog)) { @@ -1390,7 +1390,7 @@ static struct sec_req *sec_back_req_clear(struct sec_ctx *ctx, typeof(*backlog_req), backlog_head); list_del(&backlog_req->backlog_head); } - mutex_unlock(&qp_ctx->req_lock); + spin_unlock_bh(&qp_ctx->req_lock); return backlog_req; } -- Gitee From 21f33dad65b601629e5d4b1441aa4b3596c2ef39 Mon Sep 17 00:00:00 2001 From: GUO Zihua Date: Tue, 19 Jul 2022 22:10:06 +0800 Subject: [PATCH 518/674] KEYS: Fix error path return value in pgp_generate_fingerprint hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GRSV CVE: NA -------------------------------- In function pgp_generate_fingerprint, return value is not set correctly shall kmalloc failed, leading to the caller triggering a read out-of-bound while reading ctx.fingerprint or ctx.raw_fingerprint. This patch fixes this issue by setting correct return value on the error path. Fixes: 4006f47d4e21 ("KEYS: PGP data parser") Signed-off-by: GUO Zihua Reviewed-by: Roberto Sassu Signed-off-by: Zheng Zengkai --- crypto/asymmetric_keys/pgp_public_key.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/crypto/asymmetric_keys/pgp_public_key.c b/crypto/asymmetric_keys/pgp_public_key.c index 27b9efeafc4f..33a089797e59 100644 --- a/crypto/asymmetric_keys/pgp_public_key.c +++ b/crypto/asymmetric_keys/pgp_public_key.c @@ -152,8 +152,10 @@ static int pgp_generate_fingerprint(struct pgp_key_data_parse_context *ctx, digest_size = crypto_shash_digestsize(tfm); raw_fingerprint = kmalloc(digest_size, GFP_KERNEL); - if (!raw_fingerprint) + if (!raw_fingerprint) { + ret = -ENOMEM; goto cleanup_hash; + } ret = crypto_shash_final(digest, raw_fingerprint); if (ret < 0) @@ -161,8 +163,10 @@ static int pgp_generate_fingerprint(struct pgp_key_data_parse_context *ctx, ctx->fingerprint_len = digest_size * 2; fingerprint = kmalloc(digest_size * 2 + 1, GFP_KERNEL); - if (!fingerprint) + if (!fingerprint) { + ret = -ENOMEM; goto cleanup_raw_fingerprint; + } offset = digest_size - 8; pr_debug("offset %u/%u\n", offset, digest_size); -- Gitee From a176b3de08939eb678861bfb694b2145cf8c05b8 Mon Sep 17 00:00:00 2001 From: GUO Zihua Date: Tue, 19 Jul 2022 22:10:07 +0800 Subject: [PATCH 519/674] KEYS: Add safe guard against faulty PGP key hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5H4FC CVE: NA -------------------------------- Under normal condition, when there is a user id packet, there will always be a public key packet in the front, meaning ctx.fingerprint will never be NULL. However, if a malicious or faulty PGP key is provided with only user id packet but not public key packet, a read out-of-bound will be triggered during the generation of key description. To make things worse, a NULL pointer deference could be triggered in pgp_key_generate_id(). This patch adds a safe guard which prevents parsing the key further if no public key packet is provided. Fixes: a98cb7a4b757 ("KEYS: Provide PGP key description autogeneration") Signed-off-by: GUO Zihua Reviewed-by: Roberto Sassu Signed-off-by: Zheng Zengkai --- crypto/asymmetric_keys/pgp_public_key.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/crypto/asymmetric_keys/pgp_public_key.c b/crypto/asymmetric_keys/pgp_public_key.c index 33a089797e59..98b1707a0164 100644 --- a/crypto/asymmetric_keys/pgp_public_key.c +++ b/crypto/asymmetric_keys/pgp_public_key.c @@ -315,6 +315,11 @@ static int pgp_key_parse(struct key_preparsed_payload *prep) if (ret < 0) goto error; + if (!ctx.fingerprint) { + ret = -EINVAL; + goto error; + } + if (ctx.user_id && ctx.user_id_len > 0) { /* Propose a description for the key * (user ID without the comment) -- Gitee From 886e6e2458ccd70aee6b17f5b40638cde92aa89e Mon Sep 17 00:00:00 2001 From: GUO Zihua Date: Tue, 19 Jul 2022 22:10:08 +0800 Subject: [PATCH 520/674] KEYS: Fix mistaken sizeof call in pgp_key_generate_id hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5H61R CVE: NA -------------------------------- pgp_key_generate_id() is trying get the size of a flexible length structure, however the sizeof() is called on the pointer itself. Besides, considering it's trying to get the size of a flexible length structure, use struct_size() instead. Fixes: 4006f47d4e21 ("KEYS: PGP data parser") Signed-off-by: GUO Zihua Reviewed-by: Roberto Sassu Signed-off-by: Zheng Zengkai --- crypto/asymmetric_keys/pgp_public_key.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/crypto/asymmetric_keys/pgp_public_key.c b/crypto/asymmetric_keys/pgp_public_key.c index 98b1707a0164..928029a13435 100644 --- a/crypto/asymmetric_keys/pgp_public_key.c +++ b/crypto/asymmetric_keys/pgp_public_key.c @@ -283,7 +283,8 @@ static struct asymmetric_key_ids *pgp_key_generate_id( goto error; kids->id[0] = kid; - kids->id[1] = kmemdup(kid, sizeof(kid) + fingerprint_len, GFP_KERNEL); + kids->id[1] = kmemdup(kid, struct_size(kid, data, fingerprint_len), + GFP_KERNEL); if (!kids->id[1]) goto error; -- Gitee From ad02cf0db6075088d349c2a89d564aa04648395c Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 19 Jul 2022 22:10:09 +0800 Subject: [PATCH 521/674] block: fix mismatch size for flush_rq hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HEZ8 CVE: NA -------------------------------- commit 5c250d556e10 ("blk-mq: fix kabi broken in struct request") intrudoce 'struct request_wrapper' to fix kabi broken in 'struct request', it requires to allocate more size for 'struct request'. However, flush_rq is missed for such adaptation, which will lead to following slab-out-of-bounds: ================================================================== BUG: KASAN: slab-out-of-bounds in sg_init_table+0x23/0x40 Write of size 4096 at addr ffff88812249a148 by task swapper/0/1 Call Trace: dump_stack+0xbe/0xf9 ? sg_init_table+0x23/0x40 print_address_description.constprop.0+0x1e/0x220 ? _raw_spin_lock_irqsave+0x80/0xe0 ? _raw_write_unlock_irqrestore+0x20/0x20 ? blk_alloc_flush_queue+0xd3/0x1a0 ? sg_init_table+0x23/0x40 ? sg_init_table+0x23/0x40 kasan_report.cold+0x67/0x7f ? sg_init_table+0x23/0x40 check_memory_region+0x17c/0x1e0 memset+0x20/0x40 sg_init_table+0x23/0x40 virtblk_init_request+0x3d/0x50 ? virtblk_map_queues+0x40/0x40 blk_mq_realloc_hw_ctxs+0x44d/0xb50 blk_mq_init_allocated_queue+0x20f/0x980 ? blk_set_default_limits+0x1ac/0x1c0 ? blk_alloc_queue+0x3f0/0x410 blk_mq_init_queue_data+0x58/0xa0 virtblk_probe+0x51b/0xee0 ? cache_type_store+0x1a0/0x1a0 ? __sanitizer_cov_trace_switch+0x50/0x90 ? ioread8+0x89/0xa0 virtio_dev_probe+0x449/0x5d0 ? virtio_features_ok.part.0+0xb0/0xb0 really_probe+0x26d/0x8a0 driver_probe_device+0xef/0x280 device_driver_attach+0xaf/0xc0 __driver_attach+0x158/0x280 ? device_driver_attach+0xc0/0xc0 bus_for_each_dev+0x111/0x180 ? subsys_dev_iter_exit+0x20/0x20 ? bus_add_driver+0xb6/0x3e0 ? klist_node_init+0x7c/0xb0 bus_add_driver+0x336/0x3e0 driver_register+0x105/0x1a0 ? nbd_init+0x273/0x273 init+0x69/0xad do_one_initcall+0xcb/0x370 ? initcall_blacklisted+0x1b0/0x1b0 ? parameq+0x110/0x110 ? __kasan_kmalloc.constprop.0+0xc2/0xd0 ? kasan_unpoison_shadow+0x33/0x40 do_initcalls+0x223/0x265 kernel_init_freeable+0x2bb/0x302 ? rest_init+0xea/0xea kernel_init+0x13/0x1f6 ? rest_init+0xea/0xea ret_from_fork+0x22/0x30 Allocated by task 1: kasan_save_stack+0x1b/0x40 __kasan_kmalloc.constprop.0+0xc2/0xd0 blk_alloc_flush_queue+0xd3/0x1a0 blk_mq_realloc_hw_ctxs+0x9fa/0xb50 blk_mq_init_allocated_queue+0x20f/0x980 blk_mq_init_queue_data+0x58/0xa0 virtblk_probe+0x51b/0xee0 virtio_dev_probe+0x449/0x5d0 really_probe+0x26d/0x8a0 driver_probe_device+0xef/0x280 device_driver_attach+0xaf/0xc0 __driver_attach+0x158/0x280 bus_for_each_dev+0x111/0x180 bus_add_driver+0x336/0x3e0 driver_register+0x105/0x1a0 init+0x69/0xad do_one_initcall+0xcb/0x370 do_initcalls+0x223/0x265 kernel_init_freeable+0x2bb/0x302 kernel_init+0x13/0x1f6 ret_from_fork+0x22/0x30 Fixes: 5c250d556e10 ("blk-mq: fix kabi broken in struct request") Signed-off-by: Yu Kuai Reviewed-by: Jason Yan Signed-off-by: Zheng Zengkai --- block/blk-flush.c | 2 +- drivers/scsi/scsi_error.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/block/blk-flush.c b/block/blk-flush.c index 82919829bc4d..71faf07a626f 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -470,7 +470,7 @@ struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size, gfp_t flags) { struct blk_flush_queue *fq; - int rq_sz = sizeof(struct request); + int rq_sz = sizeof(struct request_wrapper); fq = kzalloc_node(sizeof(*fq), flags, node); if (!fq) diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index f11f51e2465f..bcbeadb2d0f0 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -2359,7 +2359,7 @@ scsi_ioctl_reset(struct scsi_device *dev, int __user *arg) return -EIO; error = -EIO; - rq = kzalloc(sizeof(struct request) + sizeof(struct scsi_cmnd) + + rq = kzalloc(sizeof(struct request_wrapper) + sizeof(struct scsi_cmnd) + shost->hostt->cmd_size, GFP_KERNEL); if (!rq) goto out_put_autopm_host; -- Gitee From 22706f326dde3b8d062c738904f28c4c6501f9ad Mon Sep 17 00:00:00 2001 From: Zhang Zekun Date: Tue, 19 Jul 2022 22:10:10 +0800 Subject: [PATCH 522/674] arm64: openeuler_defconfig: enable ACPI_HMAT and HOT_MEMREMOVE hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5E461 CVE: NA ------------------------ Enable ACPI HMAT and memory hot remove feature on arm64 by default. For ACPI_HMAT: ACPI HMAT describe the memory attributes, such as bandwidth and latency details, related to the System Physical Address(SPA) Memory Ranges. HMAT is especially useful when software wants to get some information about a certain special memory's memory attributes, such as PMEM and HBM. For MEMORY_HOT_REMOTE: Add support for memory hot remove feature. Some special memory, such as HBM, can be power comsuming, and will only be used in some aimed senarios. With memory hot remove feature, User can offline the idle memory for energy saving purpose when this special memory is unused. As PMEM and HBM are getting more popular, ACPI_HMAT and memory hot remove feature should be enabled as default. The following configs should be opened with CONFIG_ACPI_HMAT by default: 1.CONFIG_EFI_SOFT_RESERVE=y 2.CONFIG_HMEM_REPORTING=y 3.CONFIG_DEV_DAX_HMEM=m 4.CONFIG_DEV_DAX_HMEM_DEVICES=y Signed-off-by: Zhang Zekun Reviewed-by: Chao Liu Reviewed-by: Kefeng Wang Reviewed-by: Kai Liu Signed-off-by: Zheng Zengkai --- arch/arm64/configs/openeuler_defconfig | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 78a63cbc3db6..d246fd508ef6 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -709,7 +709,12 @@ CONFIG_ACPI_REDUCED_HARDWARE_ONLY=y CONFIG_ACPI_NFIT=m # CONFIG_NFIT_SECURITY_DEBUG is not set CONFIG_ACPI_NUMA=y -# CONFIG_ACPI_HMAT is not set +CONFIG_ACPI_HMAT=y +CONFIG_EFI_SOFT_RESERVE=y +# CONFIG_ZONE_DEVICE is not set +CONFIG_HMEM_REPORTING=y +CONFIG_DEV_DAX_HMEM=m +CONFIG_DEV_DAX_HMEM_DEVICES=y CONFIG_HAVE_ACPI_APEI=y CONFIG_ACPI_APEI=y CONFIG_ACPI_APEI_GHES=y @@ -1046,7 +1051,7 @@ CONFIG_COHERENT_DEVICE=y CONFIG_MEMORY_HOTPLUG=y CONFIG_MEMORY_HOTPLUG_SPARSE=y CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y -# CONFIG_MEMORY_HOTREMOVE is not set +CONFIG_MEMORY_HOTREMOVE=y CONFIG_SPLIT_PTLOCK_CPUS=4 CONFIG_MEMORY_BALLOON=y CONFIG_BALLOON_COMPACTION=y -- Gitee From 4be8584d8d0305e1c6b673777f8a351481e78b73 Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Tue, 19 Jul 2022 22:10:11 +0800 Subject: [PATCH 523/674] ARM: module: Add all unwind tables when load module mainline inclusion from mainline-v5.18-rc1 commit b6f21d14f1ac1261579b691673a0c823275cbaf8 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5H8ET Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b6f21d14f1ac1261579b691673a0c823275cbaf8 -------------------------------- For EABI stack unwinding, when loading .ko module the EXIDX sections will be added to a unwind_table list. However not all EXIDX sections are added because EXIDX sections are searched by hardcoded section names. For functions in other sections such as .ref.text or .kprobes.text, gcc generates seprated EXIDX sections (such as .ARM.exidx.ref.text or .ARM.exidx.kprobes.text). These extra EXIDX sections are not loaded, so when unwinding functions in these sections, we will failed with: unwind: Index not found xxx To fix that, I refactor the code for searching and adding EXIDX sections: - Check section type to search EXIDX tables (0x70000001) instead of strcmp() the hardcoded names. Then find the corresponding text sections by their section names. - Add a unwind_table list in module->arch to save their own unwind_table instead of the fixed-lenth array. - Save .ARM.exidx.init.text section ptr, because it should be cleaned after module init. Now all EXIDX sections of .ko can be added correctly. Signed-off-by: Chen Zhongjin Reviewed-by: Kuohai Xu Signed-off-by: Zheng Zengkai --- arch/arm/include/asm/module.h | 17 ++------ arch/arm/include/asm/unwind.h | 1 + arch/arm/kernel/module.c | 78 ++++++++++++++++++----------------- 3 files changed, 45 insertions(+), 51 deletions(-) diff --git a/arch/arm/include/asm/module.h b/arch/arm/include/asm/module.h index 734b8fe36896..c7c5dc6f3777 100644 --- a/arch/arm/include/asm/module.h +++ b/arch/arm/include/asm/module.h @@ -3,20 +3,10 @@ #define _ASM_ARM_MODULE_H #include - -struct unwind_table; +#include #ifdef CONFIG_ARM_UNWIND -enum { - ARM_SEC_INIT, - ARM_SEC_DEVINIT, - ARM_SEC_CORE, - ARM_SEC_EXIT, - ARM_SEC_DEVEXIT, - ARM_SEC_HOT, - ARM_SEC_UNLIKELY, - ARM_SEC_MAX, -}; +#define ELF_SECTION_UNWIND 0x70000001 #endif #define PLT_ENT_STRIDE L1_CACHE_BYTES @@ -37,7 +27,8 @@ struct mod_plt_sec { struct mod_arch_specific { #ifdef CONFIG_ARM_UNWIND - struct unwind_table *unwind[ARM_SEC_MAX]; + struct list_head unwind_list; + struct unwind_table *init_table; #endif #ifdef CONFIG_ARM_MODULE_PLTS struct mod_plt_sec core; diff --git a/arch/arm/include/asm/unwind.h b/arch/arm/include/asm/unwind.h index 0f8a3439902d..b51f85417f58 100644 --- a/arch/arm/include/asm/unwind.h +++ b/arch/arm/include/asm/unwind.h @@ -24,6 +24,7 @@ struct unwind_idx { struct unwind_table { struct list_head list; + struct list_head mod_list; const struct unwind_idx *start; const struct unwind_idx *origin; const struct unwind_idx *stop; diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c index 1cd09cf38c69..bfe2bc380d38 100644 --- a/arch/arm/kernel/module.c +++ b/arch/arm/kernel/module.c @@ -369,46 +369,40 @@ int module_finalize(const Elf32_Ehdr *hdr, const Elf_Shdr *sechdrs, #ifdef CONFIG_ARM_UNWIND const char *secstrs = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; const Elf_Shdr *sechdrs_end = sechdrs + hdr->e_shnum; - struct mod_unwind_map maps[ARM_SEC_MAX]; - int i; + struct list_head *unwind_list = &mod->arch.unwind_list; - memset(maps, 0, sizeof(maps)); + INIT_LIST_HEAD(unwind_list); + mod->arch.init_table = NULL; for (s = sechdrs; s < sechdrs_end; s++) { const char *secname = secstrs + s->sh_name; + const char *txtname; + const Elf_Shdr *txt_sec; - if (!(s->sh_flags & SHF_ALLOC)) + if (!(s->sh_flags & SHF_ALLOC) || + s->sh_type != ELF_SECTION_UNWIND) continue; - if (strcmp(".ARM.exidx.init.text", secname) == 0) - maps[ARM_SEC_INIT].unw_sec = s; - else if (strcmp(".ARM.exidx", secname) == 0) - maps[ARM_SEC_CORE].unw_sec = s; - else if (strcmp(".ARM.exidx.exit.text", secname) == 0) - maps[ARM_SEC_EXIT].unw_sec = s; - else if (strcmp(".ARM.exidx.text.unlikely", secname) == 0) - maps[ARM_SEC_UNLIKELY].unw_sec = s; - else if (strcmp(".ARM.exidx.text.hot", secname) == 0) - maps[ARM_SEC_HOT].unw_sec = s; - else if (strcmp(".init.text", secname) == 0) - maps[ARM_SEC_INIT].txt_sec = s; - else if (strcmp(".text", secname) == 0) - maps[ARM_SEC_CORE].txt_sec = s; - else if (strcmp(".exit.text", secname) == 0) - maps[ARM_SEC_EXIT].txt_sec = s; - else if (strcmp(".text.unlikely", secname) == 0) - maps[ARM_SEC_UNLIKELY].txt_sec = s; - else if (strcmp(".text.hot", secname) == 0) - maps[ARM_SEC_HOT].txt_sec = s; - } + if (!strcmp(".ARM.exidx", secname)) + txtname = ".text"; + else + txtname = secname + strlen(".ARM.exidx"); + txt_sec = find_mod_section(hdr, sechdrs, txtname); + + if (txt_sec) { + struct unwind_table *table = + unwind_table_add(s->sh_addr, + s->sh_size, + txt_sec->sh_addr, + txt_sec->sh_size); - for (i = 0; i < ARM_SEC_MAX; i++) - if (maps[i].unw_sec && maps[i].txt_sec) - mod->arch.unwind[i] = - unwind_table_add(maps[i].unw_sec->sh_addr, - maps[i].unw_sec->sh_size, - maps[i].txt_sec->sh_addr, - maps[i].txt_sec->sh_size); + list_add(&table->mod_list, unwind_list); + + /* save init table for module_arch_freeing_init */ + if (strcmp(".ARM.exidx.init.text", secname) == 0) + mod->arch.init_table = table; + } + } #endif #ifdef CONFIG_ARM_PATCH_PHYS_VIRT s = find_mod_section(hdr, sechdrs, ".pv_table"); @@ -429,19 +423,27 @@ void module_arch_cleanup(struct module *mod) { #ifdef CONFIG_ARM_UNWIND - int i; + struct unwind_table *tmp; + struct unwind_table *n; - for (i = 0; i < ARM_SEC_MAX; i++) { - unwind_table_del(mod->arch.unwind[i]); - mod->arch.unwind[i] = NULL; + list_for_each_entry_safe(tmp, n, + &mod->arch.unwind_list, mod_list) { + list_del(&tmp->mod_list); + unwind_table_del(tmp); } + mod->arch.init_table = NULL; #endif } void __weak module_arch_freeing_init(struct module *mod) { #ifdef CONFIG_ARM_UNWIND - unwind_table_del(mod->arch.unwind[ARM_SEC_INIT]); - mod->arch.unwind[ARM_SEC_INIT] = NULL; + struct unwind_table *init = mod->arch.init_table; + + if (init) { + mod->arch.init_table = NULL; + list_del(&init->mod_list); + unwind_table_del(init); + } #endif } -- Gitee From 97b810c871a4912c6f62a2b92468dd2d90a266fd Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 19 Jul 2022 22:10:12 +0800 Subject: [PATCH 524/674] af_unix: take address assignment/hash insertion into a new helper mainline inclusion from mainline-v5.14-rc1 commit 185ab886d3fb283e837283c343bf539c371e26cf category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4OM1C CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=185ab886d3fb283e837283c343bf539c371e26cf -------------------------------- Duplicated logics in all bind variants (autobind, bind-to-path, bind-to-abstract) gets taken into a common helper. Signed-off-by: Al Viro Signed-off-by: David S. Miller Signed-off-by: Baisong Zhong Reviewed-by: Yue Haibing Reviewed-by: Wei Yongjun Signed-off-by: Zheng Zengkai --- net/unix/af_unix.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index b7edca89e0ba..8c5389e4cb2c 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -262,6 +262,14 @@ static void __unix_insert_socket(struct hlist_head *list, struct sock *sk) sk_add_node(sk, list); } +static void __unix_set_addr(struct sock *sk, struct unix_address *addr, + unsigned hash) +{ + __unix_remove_socket(sk); + smp_store_release(&unix_sk(sk)->addr, addr); + __unix_insert_socket(&unix_socket_table[hash], sk); +} + static inline void unix_remove_socket(struct sock *sk) { spin_lock(&unix_table_lock); @@ -935,9 +943,7 @@ static int unix_autobind(struct socket *sock) } addr->hash ^= sk->sk_type; - __unix_remove_socket(sk); - smp_store_release(&u->addr, addr); - __unix_insert_socket(&unix_socket_table[addr->hash], sk); + __unix_set_addr(sk, addr, addr->hash); spin_unlock(&unix_table_lock); err = 0; @@ -1039,7 +1045,6 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) int err; unsigned int hash; struct unix_address *addr; - struct hlist_head *list; struct path path = { }; err = -EINVAL; @@ -1091,25 +1096,20 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1); spin_lock(&unix_table_lock); u->path = path; - list = &unix_socket_table[hash]; } else { spin_lock(&unix_table_lock); err = -EADDRINUSE; if (__unix_find_socket_byname(net, sunaddr, addr_len, sk->sk_type, hash)) { + spin_unlock(&unix_table_lock); unix_release_addr(addr); - goto out_unlock; + goto out_up; } - - list = &unix_socket_table[addr->hash]; + hash = addr->hash; } err = 0; - __unix_remove_socket(sk); - smp_store_release(&u->addr, addr); - __unix_insert_socket(list, sk); - -out_unlock: + __unix_set_addr(sk, addr, hash); spin_unlock(&unix_table_lock); out_up: mutex_unlock(&u->bindlock); -- Gitee From 4a41ddaf9b4a55bad2642712ce5ff59d113abb0f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 19 Jul 2022 22:10:13 +0800 Subject: [PATCH 525/674] unix_bind(): allocate addr earlier mainline inclusion from mainline-v5.14-rc1 commit c34d4582518ff83a4848c2d33a46be82e2499a5b category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4OM1C CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c34d4582518ff83a4848c2d33a46be82e2499a5b -------------------------------- makes it easier to massage; we do pay for that by extra work (kmalloc+memcpy+kfree) in some error cases, but those are not on the hot paths anyway. Signed-off-by: Al Viro Signed-off-by: David S. Miller Signed-off-by: Baisong Zhong Reviewed-by: Yue Haibing Reviewed-by: Wei Yongjun Signed-off-by: Zheng Zengkai --- net/unix/af_unix.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 8c5389e4cb2c..8dcd342bee10 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1061,6 +1061,15 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (err < 0) goto out; addr_len = err; + err = -ENOMEM; + addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL); + if (!addr) + goto out; + + memcpy(addr->name, sunaddr, addr_len); + addr->len = addr_len; + addr->hash = hash ^ sk->sk_type; + refcount_set(&addr->refcnt, 1); if (sun_path[0]) { umode_t mode = S_IFSOCK | @@ -1069,7 +1078,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (err) { if (err == -EEXIST) err = -EADDRINUSE; - goto out; + goto out_addr; } } @@ -1081,16 +1090,6 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (u->addr) goto out_up; - err = -ENOMEM; - addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL); - if (!addr) - goto out_up; - - memcpy(addr->name, sunaddr, addr_len); - addr->len = addr_len; - addr->hash = hash ^ sk->sk_type; - refcount_set(&addr->refcnt, 1); - if (sun_path[0]) { addr->hash = UNIX_HASH_SIZE; hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1); @@ -1102,20 +1101,23 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (__unix_find_socket_byname(net, sunaddr, addr_len, sk->sk_type, hash)) { spin_unlock(&unix_table_lock); - unix_release_addr(addr); goto out_up; } hash = addr->hash; } - err = 0; __unix_set_addr(sk, addr, hash); spin_unlock(&unix_table_lock); + addr = NULL; + err = 0; out_up: mutex_unlock(&u->bindlock); out_put: if (err) path_put(&path); +out_addr: + if (addr) + unix_release_addr(addr); out: return err; } -- Gitee From 5d545249d66573a819ceee8c0b60b66fd34a5a6c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 19 Jul 2022 22:10:14 +0800 Subject: [PATCH 526/674] unix_bind(): separate BSD and abstract cases mainline inclusion from mainline-v5.14-rc1 commit aee515170576609a0aa3413dc06a7f36f05a5fe2 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4OM1C CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=aee515170576609a0aa3413dc06a7f36f05a5fe2 -------------------------------- We do get some duplication that way, but it's minor compared to parts that are different. What we get is an ability to change locking in BSD case without making failure exits very hard to follow. Signed-off-by: Al Viro Signed-off-by: David S. Miller Signed-off-by: Baisong Zhong Reviewed-by: Yue Haibing Reviewed-by: Wei Yongjun Signed-off-by: Zheng Zengkai --- net/unix/af_unix.c | 55 ++++++++++++++++++++++++++++------------------ 1 file changed, 34 insertions(+), 21 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 8dcd342bee10..685c190e4366 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1045,7 +1045,6 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) int err; unsigned int hash; struct unix_address *addr; - struct path path = { }; err = -EINVAL; if (addr_len < offsetofend(struct sockaddr_un, sun_family) || @@ -1072,6 +1071,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) refcount_set(&addr->refcnt, 1); if (sun_path[0]) { + struct path path = { }; umode_t mode = S_IFSOCK | (SOCK_INODE(sock)->i_mode & ~current_umask()); err = unix_mknod(sun_path, mode, &path); @@ -1080,41 +1080,54 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) err = -EADDRINUSE; goto out_addr; } - } - err = mutex_lock_interruptible(&u->bindlock); - if (err) - goto out_put; + err = mutex_lock_interruptible(&u->bindlock); + if (err) { + path_put(&path); + goto out_addr; + } - err = -EINVAL; - if (u->addr) - goto out_up; + err = -EINVAL; + if (u->addr) { + mutex_unlock(&u->bindlock); + path_put(&path); + goto out_addr; + } - if (sun_path[0]) { addr->hash = UNIX_HASH_SIZE; hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1); spin_lock(&unix_table_lock); u->path = path; + __unix_set_addr(sk, addr, hash); + spin_unlock(&unix_table_lock); + mutex_unlock(&u->bindlock); + addr = NULL; + err = 0; } else { + err = mutex_lock_interruptible(&u->bindlock); + if (err) + goto out_addr; + + err = -EINVAL; + if (u->addr) { + mutex_unlock(&u->bindlock); + goto out_addr; + } + spin_lock(&unix_table_lock); err = -EADDRINUSE; if (__unix_find_socket_byname(net, sunaddr, addr_len, sk->sk_type, hash)) { spin_unlock(&unix_table_lock); - goto out_up; + mutex_unlock(&u->bindlock); + goto out_addr; } - hash = addr->hash; + __unix_set_addr(sk, addr, addr->hash); + spin_unlock(&unix_table_lock); + mutex_unlock(&u->bindlock); + addr = NULL; + err = 0; } - - __unix_set_addr(sk, addr, hash); - spin_unlock(&unix_table_lock); - addr = NULL; - err = 0; -out_up: - mutex_unlock(&u->bindlock); -out_put: - if (err) - path_put(&path); out_addr: if (addr) unix_release_addr(addr); -- Gitee From 7fb99800c8fee521e0e98d46be4fb9634e0a56e4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 19 Jul 2022 22:10:15 +0800 Subject: [PATCH 527/674] unix_bind(): take BSD and abstract address cases into new helpers mainline inclusion from mainline-v5.14-rc1 commit fa42d910a38ee310d5c6826563dd58a08735d5b0 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4OM1C CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=fa42d910a38ee310d5c6826563dd58a08735d5b0 -------------------------------- unix_bind_bsd() and unix_bind_abstract() respectively. Signed-off-by: Al Viro Signed-off-by: David S. Miller Signed-off-by: Baisong Zhong Reviewed-by: Yue Haibing Reviewed-by: Wei Yongjun Signed-off-by: Zheng Zengkai --- net/unix/af_unix.c | 147 +++++++++++++++++++++++---------------------- 1 file changed, 74 insertions(+), 73 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 685c190e4366..9172058866b4 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1035,104 +1035,105 @@ static int unix_mknod(const char *sun_path, umode_t mode, struct path *res) return err; } +static int unix_bind_bsd(struct sock *sk, struct unix_address *addr) +{ + struct unix_sock *u = unix_sk(sk); + struct path path = { }; + umode_t mode = S_IFSOCK | + (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask()); + unsigned int hash; + int err; + + err = unix_mknod(addr->name->sun_path, mode, &path); + if (err) + return err; + + err = mutex_lock_interruptible(&u->bindlock); + if (err) { + path_put(&path); + return err; + } + + if (u->addr) { + mutex_unlock(&u->bindlock); + path_put(&path); + return -EINVAL; + } + + addr->hash = UNIX_HASH_SIZE; + hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1); + spin_lock(&unix_table_lock); + u->path = path; + __unix_set_addr(sk, addr, hash); + spin_unlock(&unix_table_lock); + mutex_unlock(&u->bindlock); + return 0; +} + +static int unix_bind_abstract(struct sock *sk, unsigned hash, + struct unix_address *addr) +{ + struct unix_sock *u = unix_sk(sk); + int err; + + err = mutex_lock_interruptible(&u->bindlock); + if (err) + return err; + + if (u->addr) { + mutex_unlock(&u->bindlock); + return -EINVAL; + } + + spin_lock(&unix_table_lock); + if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len, + sk->sk_type, hash)) { + spin_unlock(&unix_table_lock); + mutex_unlock(&u->bindlock); + return -EADDRINUSE; + } + __unix_set_addr(sk, addr, addr->hash); + spin_unlock(&unix_table_lock); + mutex_unlock(&u->bindlock); + return 0; +} + static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; - struct net *net = sock_net(sk); - struct unix_sock *u = unix_sk(sk); struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; char *sun_path = sunaddr->sun_path; int err; unsigned int hash; struct unix_address *addr; - err = -EINVAL; if (addr_len < offsetofend(struct sockaddr_un, sun_family) || sunaddr->sun_family != AF_UNIX) - goto out; + return -EINVAL; - if (addr_len == sizeof(short)) { - err = unix_autobind(sock); - goto out; - } + if (addr_len == sizeof(short)) + return unix_autobind(sock); err = unix_mkname(sunaddr, addr_len, &hash); if (err < 0) - goto out; + return err; addr_len = err; - err = -ENOMEM; addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL); if (!addr) - goto out; + return -ENOMEM; memcpy(addr->name, sunaddr, addr_len); addr->len = addr_len; addr->hash = hash ^ sk->sk_type; refcount_set(&addr->refcnt, 1); - if (sun_path[0]) { - struct path path = { }; - umode_t mode = S_IFSOCK | - (SOCK_INODE(sock)->i_mode & ~current_umask()); - err = unix_mknod(sun_path, mode, &path); - if (err) { - if (err == -EEXIST) - err = -EADDRINUSE; - goto out_addr; - } - - err = mutex_lock_interruptible(&u->bindlock); - if (err) { - path_put(&path); - goto out_addr; - } - - err = -EINVAL; - if (u->addr) { - mutex_unlock(&u->bindlock); - path_put(&path); - goto out_addr; - } - - addr->hash = UNIX_HASH_SIZE; - hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1); - spin_lock(&unix_table_lock); - u->path = path; - __unix_set_addr(sk, addr, hash); - spin_unlock(&unix_table_lock); - mutex_unlock(&u->bindlock); - addr = NULL; - err = 0; - } else { - err = mutex_lock_interruptible(&u->bindlock); - if (err) - goto out_addr; - - err = -EINVAL; - if (u->addr) { - mutex_unlock(&u->bindlock); - goto out_addr; - } - - spin_lock(&unix_table_lock); - err = -EADDRINUSE; - if (__unix_find_socket_byname(net, sunaddr, addr_len, - sk->sk_type, hash)) { - spin_unlock(&unix_table_lock); - mutex_unlock(&u->bindlock); - goto out_addr; - } - __unix_set_addr(sk, addr, addr->hash); - spin_unlock(&unix_table_lock); - mutex_unlock(&u->bindlock); - addr = NULL; - err = 0; - } -out_addr: - if (addr) + if (sun_path[0]) + err = unix_bind_bsd(sk, addr); + else + err = unix_bind_abstract(sk, hash, addr); + if (err) unix_release_addr(addr); -out: - return err; + return err == -EEXIST ? -EADDRINUSE : err; } static void unix_state_double_lock(struct sock *sk1, struct sock *sk2) -- Gitee From 79d3eccb01d2c61ea60d9929f073fdda8b25ab00 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 19 Jul 2022 22:10:16 +0800 Subject: [PATCH 528/674] af_unix: Use offsetof() instead of sizeof(). mainline inclusion from mainline-v5.17-rc1 commit 755662ce78d14c1a9118df921c528b1f992ded2e category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4OM1C CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=755662ce78d14c1a9118df921c528b1f992ded2e -------------------------------- The length of the AF_UNIX socket address contains an offset to the member sun_path of struct sockaddr_un. Currently, the preceding member is just sun_family, and its type is sa_family_t and resolved to short. Therefore, the offset is represented by sizeof(short). However, it is not clear and fragile to changes in struct sockaddr_storage or sockaddr_un. This commit makes it clear and robust by rewriting sizeof() with offsetof(). Signed-off-by: Kuniyuki Iwashima Signed-off-by: Jakub Kicinski Signed-off-by: Baisong Zhong Reviewed-by: Yue Haibing Reviewed-by: Wei Yongjun Signed-off-by: Zheng Zengkai --- net/unix/af_unix.c | 19 ++++++++++++------- net/unix/diag.c | 3 ++- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 9172058866b4..cdd2a46e2a98 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -230,7 +230,8 @@ static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp { *hashp = 0; - if (len <= sizeof(short) || len > sizeof(*sunaddr)) + if (len <= offsetof(struct sockaddr_un, sun_path) || + len > sizeof(*sunaddr)) return -EINVAL; if (!sunaddr || sunaddr->sun_family != AF_UNIX) return -EINVAL; @@ -243,7 +244,8 @@ static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp * kernel address buffer. */ ((char *)sunaddr)[len] = 0; - len = strlen(sunaddr->sun_path)+1+sizeof(short); + len = strlen(sunaddr->sun_path) + + offsetof(struct sockaddr_un, sun_path) + 1; return len; } @@ -911,7 +913,8 @@ static int unix_autobind(struct socket *sock) goto out; err = -ENOMEM; - addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL); + addr = kzalloc(sizeof(*addr) + + offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL); if (!addr) goto out; @@ -919,7 +922,8 @@ static int unix_autobind(struct socket *sock) refcount_set(&addr->refcnt, 1); retry: - addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short); + addr->len = sprintf(addr->name->sun_path + 1, "%05x", ordernum) + + offsetof(struct sockaddr_un, sun_path) + 1; addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0)); spin_lock(&unix_table_lock); @@ -1111,7 +1115,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) sunaddr->sun_family != AF_UNIX) return -EINVAL; - if (addr_len == sizeof(short)) + if (addr_len == offsetof(struct sockaddr_un, sun_path)) return unix_autobind(sock); err = unix_mkname(sunaddr, addr_len, &hash); @@ -1549,7 +1553,7 @@ static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer) if (!addr) { sunaddr->sun_family = AF_UNIX; sunaddr->sun_path[0] = 0; - err = sizeof(short); + err = offsetof(struct sockaddr_un, sun_path); } else { err = addr->len; memcpy(sunaddr, addr->name, addr->len); @@ -2927,7 +2931,8 @@ static int unix_seq_show(struct seq_file *seq, void *v) seq_putc(seq, ' '); i = 0; - len = u->addr->len - sizeof(short); + len = u->addr->len - + offsetof(struct sockaddr_un, sun_path); if (!UNIX_ABSTRACT(s)) len--; else { diff --git a/net/unix/diag.c b/net/unix/diag.c index 9ff64f9df1f3..dd77e81b41a0 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -19,7 +19,8 @@ static int sk_diag_dump_name(struct sock *sk, struct sk_buff *nlskb) if (!addr) return 0; - return nla_put(nlskb, UNIX_DIAG_NAME, addr->len - sizeof(short), + return nla_put(nlskb, UNIX_DIAG_NAME, + addr->len - offsetof(struct sockaddr_un, sun_path), addr->name->sun_path); } -- Gitee From a7a7a7cf78d041127a4116e088a64fb8544d2ebc Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 19 Jul 2022 22:10:17 +0800 Subject: [PATCH 529/674] __unix_find_socket_byname(): don't pass hash and type separately mainline inclusion from mainline-v5.14-rc1 commit be752283a2a2b4bfc2df512b5d9b03a34aece252 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4OM1C CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=be752283a2a2b4bfc2df512b5d9b03a34aece252 -------------------------------- We only care about exclusive or of those, so pass that directly. Makes life simpler for callers as well... Signed-off-by: Al Viro Signed-off-by: David S. Miller Signed-off-by: Baisong Zhong Reviewed-by: Yue Haibing Reviewed-by: Wei Yongjun Signed-off-by: Zheng Zengkai --- net/unix/af_unix.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index cdd2a46e2a98..e59f5714e02f 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -288,11 +288,11 @@ static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk) static struct sock *__unix_find_socket_byname(struct net *net, struct sockaddr_un *sunname, - int len, int type, unsigned int hash) + int len, unsigned int hash) { struct sock *s; - sk_for_each(s, &unix_socket_table[hash ^ type]) { + sk_for_each(s, &unix_socket_table[hash]) { struct unix_sock *u = unix_sk(s); if (!net_eq(sock_net(s), net)) @@ -307,13 +307,12 @@ static struct sock *__unix_find_socket_byname(struct net *net, static inline struct sock *unix_find_socket_byname(struct net *net, struct sockaddr_un *sunname, - int len, int type, - unsigned int hash) + int len, unsigned int hash) { struct sock *s; spin_lock(&unix_table_lock); - s = __unix_find_socket_byname(net, sunname, len, type, hash); + s = __unix_find_socket_byname(net, sunname, len, hash); if (s) sock_hold(s); spin_unlock(&unix_table_lock); @@ -925,12 +924,12 @@ static int unix_autobind(struct socket *sock) addr->len = sprintf(addr->name->sun_path + 1, "%05x", ordernum) + offsetof(struct sockaddr_un, sun_path) + 1; addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0)); + addr->hash ^= sk->sk_type; spin_lock(&unix_table_lock); ordernum = (ordernum+1)&0xFFFFF; - if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type, - addr->hash)) { + if (__unix_find_socket_byname(net, addr->name, addr->len, addr->hash)) { spin_unlock(&unix_table_lock); /* * __unix_find_socket_byname() may take long time if many names @@ -945,7 +944,6 @@ static int unix_autobind(struct socket *sock) } goto retry; } - addr->hash ^= sk->sk_type; __unix_set_addr(sk, addr, addr->hash); spin_unlock(&unix_table_lock); @@ -992,7 +990,7 @@ static struct sock *unix_find_other(struct net *net, } } else { err = -ECONNREFUSED; - u = unix_find_socket_byname(net, sunname, len, type, hash); + u = unix_find_socket_byname(net, sunname, len, type ^ hash); if (u) { struct dentry *dentry; dentry = unix_sk(u)->path.dentry; @@ -1074,8 +1072,7 @@ static int unix_bind_bsd(struct sock *sk, struct unix_address *addr) return 0; } -static int unix_bind_abstract(struct sock *sk, unsigned hash, - struct unix_address *addr) +static int unix_bind_abstract(struct sock *sk, struct unix_address *addr) { struct unix_sock *u = unix_sk(sk); int err; @@ -1091,7 +1088,7 @@ static int unix_bind_abstract(struct sock *sk, unsigned hash, spin_lock(&unix_table_lock); if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len, - sk->sk_type, hash)) { + addr->hash)) { spin_unlock(&unix_table_lock); mutex_unlock(&u->bindlock); return -EADDRINUSE; @@ -1134,7 +1131,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (sun_path[0]) err = unix_bind_bsd(sk, addr); else - err = unix_bind_abstract(sk, hash, addr); + err = unix_bind_abstract(sk, addr); if (err) unix_release_addr(addr); return err == -EEXIST ? -EADDRINUSE : err; -- Gitee From 06b109c3b4894add5786033c766f07b04d37476a Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 19 Jul 2022 22:10:18 +0800 Subject: [PATCH 530/674] af_unix: Pass struct sock to unix_autobind(). mainline inclusion from mainline-v5.17-rc1 commit f7ed31f4615f4e1d97c0e4325c5b8a240e10073c category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4OM1C CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f7ed31f4615f4e1d97c0e4325c5b8a240e10073c -------------------------------- We do not use struct socket in unix_autobind() and pass struct sock to unix_bind_bsd() and unix_bind_abstract(). Let's pass it to unix_autobind() as well. Also, this patch fixes these errors by checkpatch.pl. ERROR: do not use assignment in if condition #1795: FILE: net/unix/af_unix.c:1795: + if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr CHECK: Logical continuations should be on the previous line #1796: FILE: net/unix/af_unix.c:1796: + if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr + && (err = unix_autobind(sock)) != 0) Signed-off-by: Kuniyuki Iwashima Signed-off-by: Jakub Kicinski Signed-off-by: Baisong Zhong Reviewed-by: Yue Haibing Reviewed-by: Wei Yongjun Signed-off-by: Zheng Zengkai --- net/unix/af_unix.c | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index e59f5714e02f..006e35065e51 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -894,15 +894,13 @@ static int unix_release(struct socket *sock) return 0; } -static int unix_autobind(struct socket *sock) +static int unix_autobind(struct sock *sk) { - struct sock *sk = sock->sk; - struct net *net = sock_net(sk); struct unix_sock *u = unix_sk(sk); - static u32 ordernum = 1; struct unix_address *addr; - int err; unsigned int retries = 0; + static u32 ordernum = 1; + int err; err = mutex_lock_interruptible(&u->bindlock); if (err) @@ -929,7 +927,8 @@ static int unix_autobind(struct socket *sock) spin_lock(&unix_table_lock); ordernum = (ordernum+1)&0xFFFFF; - if (__unix_find_socket_byname(net, addr->name, addr->len, addr->hash)) { + if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len, + addr->hash)) { spin_unlock(&unix_table_lock); /* * __unix_find_socket_byname() may take long time if many names @@ -1113,7 +1112,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) return -EINVAL; if (addr_len == offsetof(struct sockaddr_un, sun_path)) - return unix_autobind(sock); + return unix_autobind(sk); err = unix_mkname(sunaddr, addr_len, &hash); if (err < 0) @@ -1183,8 +1182,11 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, alen = err; if (test_bit(SOCK_PASSCRED, &sock->flags) && - !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0) - goto out; + !unix_sk(sk)->addr) { + err = unix_autobind(sk); + if (err) + goto out; + } restart: other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err); @@ -1284,9 +1286,11 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, goto out; addr_len = err; - if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr && - (err = unix_autobind(sock)) != 0) - goto out; + if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) { + err = unix_autobind(sk); + if (err) + goto out; + } timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); @@ -1737,9 +1741,11 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, goto out; } - if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr - && (err = unix_autobind(sock)) != 0) - goto out; + if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) { + err = unix_autobind(sk); + if (err) + goto out; + } err = -EMSGSIZE; if (len > sk->sk_sndbuf - 32) -- Gitee From 75136c5f52f91a1606125965bb16136d5c4385f0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 19 Jul 2022 22:10:19 +0800 Subject: [PATCH 531/674] fold unix_mknod() into unix_bind_bsd() mainline inclusion from mainline-v5.14-rc1 commit 71e6be6f7d2bada7099d79205779c4452d4fd35b category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4OM1C CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=71e6be6f7d2bada7099d79205779c4452d4fd35b -------------------------------- Signed-off-by: Al Viro Signed-off-by: David S. Miller Conflicts: net/unix/af_unix.c Signed-off-by: Baisong Zhong Reviewed-by: Yue Haibing Reviewed-by: Yue Haibing Reviewed-by: Wei Yongjun Signed-off-by: Zheng Zengkai --- net/unix/af_unix.c | 39 +++++++++++++++------------------------ 1 file changed, 15 insertions(+), 24 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 006e35065e51..3eba0e23c512 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1007,45 +1007,36 @@ static struct sock *unix_find_other(struct net *net, return NULL; } -static int unix_mknod(const char *sun_path, umode_t mode, struct path *res) +static int unix_bind_bsd(struct sock *sk, struct unix_address *addr) { + struct unix_sock *u = unix_sk(sk); + umode_t mode = S_IFSOCK | + (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask()); + struct path parent, path; struct dentry *dentry; - struct path path; - int err = 0; + unsigned int hash; + int err; + /* * Get the parent directory, calculate the hash for last * component. */ - dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0); - err = PTR_ERR(dentry); + dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0); if (IS_ERR(dentry)) - return err; + return PTR_ERR(dentry); /* * All right, let's create it. */ - err = security_path_mknod(&path, dentry, mode, 0); + err = security_path_mknod(&parent, dentry, mode, 0); if (!err) { - err = vfs_mknod(d_inode(path.dentry), dentry, mode, 0); + err = vfs_mknod(d_inode(parent.dentry), dentry, mode, 0); if (!err) { - res->mnt = mntget(path.mnt); - res->dentry = dget(dentry); + path.mnt = mntget(parent.mnt); + path.dentry = dget(dentry); } } - done_path_create(&path, dentry); - return err; -} - -static int unix_bind_bsd(struct sock *sk, struct unix_address *addr) -{ - struct unix_sock *u = unix_sk(sk); - struct path path = { }; - umode_t mode = S_IFSOCK | - (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask()); - unsigned int hash; - int err; - - err = unix_mknod(addr->name->sun_path, mode, &path); + done_path_create(&parent, dentry); if (err) return err; -- Gitee From 55929e5b1e8074f1c4cc1189ae2fd522515046f0 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 19 Jul 2022 22:10:20 +0800 Subject: [PATCH 532/674] af_unix: Factorise unix_find_other() based on address types. mainline inclusion from mainline-v5.17-rc1 commit fa39ef0e472961baef49ddb0e6f7b8ebb555bd8f category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4OM1C CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=fa39ef0e472961baef49ddb0e6f7b8ebb555bd8f -------------------------------- As done in the commit fa42d910a38e ("unix_bind(): take BSD and abstract address cases into new helpers"), this patch moves BSD and abstract address cases from unix_find_other() into unix_find_bsd() and unix_find_abstract(). Signed-off-by: Kuniyuki Iwashima Signed-off-by: Jakub Kicinski Conflicts: net/unix/af_unix.c Signed-off-by: Baisong Zhong Reviewed-by: Yue Haibing Reviewed-by: Wei Yongjun Signed-off-by: Zheng Zengkai --- include/linux/fs.h | 4 ++ net/unix/af_unix.c | 136 +++++++++++++++++++++++++++------------------ 2 files changed, 85 insertions(+), 55 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 18259e38dcd7..a7bc1eaa27ee 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2844,6 +2844,10 @@ static inline int bmap(struct inode *inode, sector_t *block) extern int notify_change(struct dentry *, struct iattr *, struct inode **); extern int inode_permission(struct inode *, int); extern int generic_permission(struct inode *, int); +static inline int path_permission(const struct path *path, int mask) +{ + return inode_permission(d_inode(path->dentry), mask); +} extern int __check_sticky(struct inode *dir, struct inode *inode); static inline bool execute_ok(struct inode *inode) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 3eba0e23c512..eafd1d62bf55 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -894,6 +894,87 @@ static int unix_release(struct socket *sock) return 0; } +static struct sock *unix_find_bsd(struct net *net, struct sockaddr_un *sunaddr, + int type, int *error) +{ + struct inode *inode; + struct path path; + struct sock *sk; + int err; + + err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path); + if (err) + goto fail; + + err = path_permission(&path, MAY_WRITE); + if (err) + goto path_put; + + err = -ECONNREFUSED; + inode = d_backing_inode(path.dentry); + if (!S_ISSOCK(inode->i_mode)) + goto path_put; + + sk = unix_find_socket_byinode(inode); + if (!sk) + goto path_put; + + err = -EPROTOTYPE; + if (sk->sk_type == type) + touch_atime(&path); + else + goto sock_put; + + path_put(&path); + + return sk; + +sock_put: + sock_put(sk); +path_put: + path_put(&path); +fail: + *error = err; + return NULL; +} + +static struct sock *unix_find_abstract(struct net *net, + struct sockaddr_un *sunaddr, + int addr_len, int type, + unsigned int hash, int *error) +{ + struct dentry *dentry; + struct sock *sk; + + sk = unix_find_socket_byname(net, sunaddr, addr_len, type ^ hash); + if (!sk) { + *error = -ECONNREFUSED; + return NULL; + } + + dentry = unix_sk(sk)->path.dentry; + if (dentry) + touch_atime(&unix_sk(sk)->path); + + return sk; +} + +static struct sock *unix_find_other(struct net *net, + struct sockaddr_un *sunaddr, + int addr_len, int type, + unsigned int hash, int *error) +{ + struct sock *sk; + + if (sunaddr->sun_path[0]) + sk = unix_find_bsd(net, sunaddr, type, error); + else + sk = unix_find_abstract(net, sunaddr, addr_len, type, hash, + error); + + return sk; +} + static int unix_autobind(struct sock *sk) { struct unix_sock *u = unix_sk(sk); @@ -952,61 +1033,6 @@ out: mutex_unlock(&u->bindlock); return err; } -static struct sock *unix_find_other(struct net *net, - struct sockaddr_un *sunname, int len, - int type, unsigned int hash, int *error) -{ - struct sock *u; - struct path path; - int err = 0; - - if (sunname->sun_path[0]) { - struct inode *inode; - err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path); - if (err) - goto fail; - inode = d_backing_inode(path.dentry); - err = inode_permission(inode, MAY_WRITE); - if (err) - goto put_fail; - - err = -ECONNREFUSED; - if (!S_ISSOCK(inode->i_mode)) - goto put_fail; - u = unix_find_socket_byinode(inode); - if (!u) - goto put_fail; - - if (u->sk_type == type) - touch_atime(&path); - - path_put(&path); - - err = -EPROTOTYPE; - if (u->sk_type != type) { - sock_put(u); - goto fail; - } - } else { - err = -ECONNREFUSED; - u = unix_find_socket_byname(net, sunname, len, type ^ hash); - if (u) { - struct dentry *dentry; - dentry = unix_sk(u)->path.dentry; - if (dentry) - touch_atime(&unix_sk(u)->path); - } else - goto fail; - } - return u; - -put_fail: - path_put(&path); -fail: - *error = err; - return NULL; -} - static int unix_bind_bsd(struct sock *sk, struct unix_address *addr) { struct unix_sock *u = unix_sk(sk); -- Gitee From c05abd84ed1185f7d46f23c69513624e07897f78 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 19 Jul 2022 22:10:21 +0800 Subject: [PATCH 533/674] af_unix: Return an error as a pointer in unix_find_other(). mainline inclusion from mainline-v5.17-rc1 commit aed26f557bbc94f0c778f63d7dfe86af99208f68 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4OM1C CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=aed26f557bbc94f0c778f63d7dfe86af99208f68 -------------------------------- We can return an error as a pointer and need not pass an additional argument to unix_find_other(). Signed-off-by: Kuniyuki Iwashima Signed-off-by: Jakub Kicinski Signed-off-by: Baisong Zhong Reviewed-by: Yue Haibing Reviewed-by: Wei Yongjun Signed-off-by: Zheng Zengkai --- net/unix/af_unix.c | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index eafd1d62bf55..e78477a4fa4b 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -895,7 +895,7 @@ static int unix_release(struct socket *sock) } static struct sock *unix_find_bsd(struct net *net, struct sockaddr_un *sunaddr, - int type, int *error) + int type) { struct inode *inode; struct path path; @@ -934,23 +934,20 @@ static struct sock *unix_find_bsd(struct net *net, struct sockaddr_un *sunaddr, path_put: path_put(&path); fail: - *error = err; - return NULL; + return ERR_PTR(err); } static struct sock *unix_find_abstract(struct net *net, struct sockaddr_un *sunaddr, int addr_len, int type, - unsigned int hash, int *error) + unsigned int hash) { struct dentry *dentry; struct sock *sk; sk = unix_find_socket_byname(net, sunaddr, addr_len, type ^ hash); - if (!sk) { - *error = -ECONNREFUSED; - return NULL; - } + if (!sk) + return ERR_PTR(-ECONNREFUSED); dentry = unix_sk(sk)->path.dentry; if (dentry) @@ -962,15 +959,14 @@ static struct sock *unix_find_abstract(struct net *net, static struct sock *unix_find_other(struct net *net, struct sockaddr_un *sunaddr, int addr_len, int type, - unsigned int hash, int *error) + unsigned int hash) { struct sock *sk; if (sunaddr->sun_path[0]) - sk = unix_find_bsd(net, sunaddr, type, error); + sk = unix_find_bsd(net, sunaddr, type); else - sk = unix_find_abstract(net, sunaddr, addr_len, type, hash, - error); + sk = unix_find_abstract(net, sunaddr, addr_len, type, hash); return sk; } @@ -1206,9 +1202,11 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, } restart: - other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err); - if (!other) + other = unix_find_other(net, sunaddr, alen, sock->type, hash); + if (IS_ERR(other)) { + err = PTR_ERR(other); goto out; + } unix_state_double_lock(sk, other); @@ -1330,9 +1328,12 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, restart: /* Find listening sock. */ - other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err); - if (!other) + other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash); + if (IS_ERR(other)) { + err = PTR_ERR(other); + other = NULL; goto out; + } /* Latch state of peer */ unix_state_lock(other); @@ -1803,9 +1804,12 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, goto out_free; other = unix_find_other(net, sunaddr, namelen, sk->sk_type, - hash, &err); - if (other == NULL) + hash); + if (IS_ERR(other)) { + err = PTR_ERR(other); + other = NULL; goto out_free; + } } if (sk_filter(other, skb) < 0) { -- Gitee From 4d0707b8fbdbc96b6b74d94f0131b43f07ccd010 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 19 Jul 2022 22:10:22 +0800 Subject: [PATCH 534/674] af_unix: Cut unix_validate_addr() out of unix_mkname(). mainline inclusion from mainline-v5.17-rc1 commit b8a58aa6fccc5b2940f0da18c7f02e8a1deb693a category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4OM1C CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b8a58aa6fccc5b2940f0da18c7f02e8a1deb693a -------------------------------- unix_mkname() tests socket address length and family and does some processing based on the address type. It is called in the early stage, and therefore some instructions are redundant and can end up in vain. The address length/family tests are done twice in unix_bind(). Also, the address type is rechecked later in unix_bind() and unix_find_other(), where we can do the same processing. Moreover, in the BSD address case, the hash is set to 0 but never used and confusing. This patch moves the address tests out of unix_mkname(), and the following patches move the other part into appropriate places and remove unix_mkname() finally. Signed-off-by: Kuniyuki Iwashima Signed-off-by: Jakub Kicinski Signed-off-by: Baisong Zhong Reviewed-by: Yue Haibing Reviewed-by: Wei Yongjun Signed-off-by: Zheng Zengkai --- net/unix/af_unix.c | 40 ++++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index e78477a4fa4b..ef8b4fcd52fb 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -226,15 +226,22 @@ static inline void unix_release_addr(struct unix_address *addr) * - if started by zero, it is abstract name. */ +static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len) +{ + if (addr_len <= offsetof(struct sockaddr_un, sun_path) || + addr_len > sizeof(*sunaddr)) + return -EINVAL; + + if (sunaddr->sun_family != AF_UNIX) + return -EINVAL; + + return 0; +} + static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp) { *hashp = 0; - if (len <= offsetof(struct sockaddr_un, sun_path) || - len > sizeof(*sunaddr)) - return -EINVAL; - if (!sunaddr || sunaddr->sun_family != AF_UNIX) - return -EINVAL; if (sunaddr->sun_path[0]) { /* * This may look like an off by one error but it is a bit more @@ -1120,13 +1127,14 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) unsigned int hash; struct unix_address *addr; - if (addr_len < offsetofend(struct sockaddr_un, sun_family) || - sunaddr->sun_family != AF_UNIX) - return -EINVAL; - - if (addr_len == offsetof(struct sockaddr_un, sun_path)) + if (addr_len == offsetof(struct sockaddr_un, sun_path) && + sunaddr->sun_family == AF_UNIX) return unix_autobind(sk); + err = unix_validate_addr(sunaddr, addr_len); + if (err) + return err; + err = unix_mkname(sunaddr, addr_len, &hash); if (err < 0) return err; @@ -1189,6 +1197,10 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, goto out; if (addr->sa_family != AF_UNSPEC) { + err = unix_validate_addr(sunaddr, alen); + if (err) + goto out; + err = unix_mkname(sunaddr, alen, &hash); if (err < 0) goto out; @@ -1296,6 +1308,10 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, int err; long timeo; + err = unix_validate_addr(sunaddr, addr_len); + if (err) + goto out; + err = unix_mkname(sunaddr, addr_len, &hash); if (err < 0) goto out; @@ -1747,6 +1763,10 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, goto out; if (msg->msg_namelen) { + err = unix_validate_addr(sunaddr, msg->msg_namelen); + if (err) + goto out; + err = unix_mkname(sunaddr, msg->msg_namelen, &hash); if (err < 0) goto out; -- Gitee From e6a91904b5adb313f0e81a1a41bfabb2e3940905 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 19 Jul 2022 22:10:23 +0800 Subject: [PATCH 535/674] af_unix: Copy unix_mkname() into unix_find_(bsd|abstract)(). mainline inclusion from mainline-v5.17-rc1 commit d2d8c9fddb1c11ccfa73bf0ad2b1e6b4ea7afdaf category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4OM1C CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d2d8c9fddb1c11ccfa73bf0ad2b1e6b4ea7afdaf -------------------------------- We should not call unix_mkname() before unix_find_other() and instead do the same thing where necessary based on the address type: - terminating the address with '\0' in unix_find_bsd() - calculating the hash in unix_find_abstract(). Signed-off-by: Kuniyuki Iwashima Signed-off-by: Jakub Kicinski Signed-off-by: Baisong Zhong Reviewed-by: Yue Haibing Reviewed-by: Wei Yongjun Signed-off-by: Zheng Zengkai --- net/unix/af_unix.c | 63 ++++++++++++++++++---------------------------- 1 file changed, 25 insertions(+), 38 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index ef8b4fcd52fb..333b76fd830c 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -238,19 +238,25 @@ static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len) return 0; } +static void unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len) +{ + /* This may look like an off by one error but it is a bit more + * subtle. 108 is the longest valid AF_UNIX path for a binding. + * sun_path[108] doesn't as such exist. However in kernel space + * we are guaranteed that it is a valid memory location in our + * kernel address buffer because syscall functions always pass + * a pointer of struct sockaddr_storage which has a bigger buffer + * than 108. + */ + ((char *)sunaddr)[addr_len] = 0; +} + static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp) { *hashp = 0; if (sunaddr->sun_path[0]) { - /* - * This may look like an off by one error but it is a bit more - * subtle. 108 is the longest valid AF_UNIX path for a binding. - * sun_path[108] doesn't as such exist. However in kernel space - * we are guaranteed that it is a valid memory location in our - * kernel address buffer. - */ - ((char *)sunaddr)[len] = 0; + unix_mkname_bsd(sunaddr, len); len = strlen(sunaddr->sun_path) + offsetof(struct sockaddr_un, sun_path) + 1; return len; @@ -902,13 +908,14 @@ static int unix_release(struct socket *sock) } static struct sock *unix_find_bsd(struct net *net, struct sockaddr_un *sunaddr, - int type) + int addr_len, int type) { struct inode *inode; struct path path; struct sock *sk; int err; + unix_mkname_bsd(sunaddr, addr_len); err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path); if (err) goto fail; @@ -946,9 +953,9 @@ static struct sock *unix_find_bsd(struct net *net, struct sockaddr_un *sunaddr, static struct sock *unix_find_abstract(struct net *net, struct sockaddr_un *sunaddr, - int addr_len, int type, - unsigned int hash) + int addr_len, int type) { + unsigned int hash = unix_hash_fold(csum_partial(sunaddr, addr_len, 0)); struct dentry *dentry; struct sock *sk; @@ -965,15 +972,14 @@ static struct sock *unix_find_abstract(struct net *net, static struct sock *unix_find_other(struct net *net, struct sockaddr_un *sunaddr, - int addr_len, int type, - unsigned int hash) + int addr_len, int type) { struct sock *sk; if (sunaddr->sun_path[0]) - sk = unix_find_bsd(net, sunaddr, type); + sk = unix_find_bsd(net, sunaddr, addr_len, type); else - sk = unix_find_abstract(net, sunaddr, addr_len, type, hash); + sk = unix_find_abstract(net, sunaddr, addr_len, type); return sk; } @@ -1189,7 +1195,6 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, struct net *net = sock_net(sk); struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr; struct sock *other; - unsigned int hash; int err; err = -EINVAL; @@ -1201,11 +1206,6 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, if (err) goto out; - err = unix_mkname(sunaddr, alen, &hash); - if (err < 0) - goto out; - alen = err; - if (test_bit(SOCK_PASSCRED, &sock->flags) && !unix_sk(sk)->addr) { err = unix_autobind(sk); @@ -1214,7 +1214,7 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, } restart: - other = unix_find_other(net, sunaddr, alen, sock->type, hash); + other = unix_find_other(net, sunaddr, alen, sock->type); if (IS_ERR(other)) { err = PTR_ERR(other); goto out; @@ -1303,7 +1303,6 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, struct sock *newsk = NULL; struct sock *other = NULL; struct sk_buff *skb = NULL; - unsigned int hash; int st; int err; long timeo; @@ -1312,11 +1311,6 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, if (err) goto out; - err = unix_mkname(sunaddr, addr_len, &hash); - if (err < 0) - goto out; - addr_len = err; - if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) { err = unix_autobind(sk); if (err) @@ -1344,7 +1338,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, restart: /* Find listening sock. */ - other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash); + other = unix_find_other(net, sunaddr, addr_len, sk->sk_type); if (IS_ERR(other)) { err = PTR_ERR(other); other = NULL; @@ -1744,9 +1738,7 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, struct unix_sock *u = unix_sk(sk); DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name); struct sock *other = NULL; - int namelen = 0; /* fake GCC */ int err; - unsigned int hash; struct sk_buff *skb; long timeo; struct scm_cookie scm; @@ -1766,11 +1758,6 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, err = unix_validate_addr(sunaddr, msg->msg_namelen); if (err) goto out; - - err = unix_mkname(sunaddr, msg->msg_namelen, &hash); - if (err < 0) - goto out; - namelen = err; } else { sunaddr = NULL; err = -ENOTCONN; @@ -1823,8 +1810,8 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, if (sunaddr == NULL) goto out_free; - other = unix_find_other(net, sunaddr, namelen, sk->sk_type, - hash); + other = unix_find_other(net, sunaddr, msg->msg_namelen, + sk->sk_type); if (IS_ERR(other)) { err = PTR_ERR(other); other = NULL; -- Gitee From 3687543508a837e7202cdf2195bfbdebf9dbbf8b Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 19 Jul 2022 22:10:24 +0800 Subject: [PATCH 536/674] af_unix: Remove unix_mkname(). mainline inclusion from mainline-v5.17-rc1 commit 5c32a3ed64b4c87ed6d9978074db5f0a54c4cd20 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4OM1C CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=5c32a3ed64b4c87ed6d9978074db5f0a54c4cd20 -------------------------------- This patch removes unix_mkname() and postpones calculating a hash to unix_bind_abstract(). Some BSD stuffs still remain in unix_bind() though, the next patch packs them into unix_bind_bsd(). Signed-off-by: Kuniyuki Iwashima Signed-off-by: Jakub Kicinski Signed-off-by: Baisong Zhong Reviewed-by: Yue Haibing Reviewed-by: Wei Yongjun Signed-off-by: Zheng Zengkai --- net/unix/af_unix.c | 34 +++++++++++----------------------- 1 file changed, 11 insertions(+), 23 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 333b76fd830c..9f1273fab014 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -251,21 +251,6 @@ static void unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len) ((char *)sunaddr)[addr_len] = 0; } -static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp) -{ - *hashp = 0; - - if (sunaddr->sun_path[0]) { - unix_mkname_bsd(sunaddr, len); - len = strlen(sunaddr->sun_path) + - offsetof(struct sockaddr_un, sun_path) + 1; - return len; - } - - *hashp = unix_hash_fold(csum_partial(sunaddr, len, 0)); - return len; -} - static void __unix_remove_socket(struct sock *sk) { sk_del_node_init(sk); @@ -1111,6 +1096,9 @@ static int unix_bind_abstract(struct sock *sk, struct unix_address *addr) return -EINVAL; } + addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0)); + addr->hash ^= sk->sk_type; + spin_lock(&unix_table_lock); if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len, addr->hash)) { @@ -1126,12 +1114,11 @@ static int unix_bind_abstract(struct sock *sk, struct unix_address *addr) static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { - struct sock *sk = sock->sk; struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; char *sun_path = sunaddr->sun_path; - int err; - unsigned int hash; + struct sock *sk = sock->sk; struct unix_address *addr; + int err; if (addr_len == offsetof(struct sockaddr_un, sun_path) && sunaddr->sun_family == AF_UNIX) @@ -1141,17 +1128,18 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (err) return err; - err = unix_mkname(sunaddr, addr_len, &hash); - if (err < 0) - return err; - addr_len = err; + if (sun_path[0]) { + unix_mkname_bsd(sunaddr, addr_len); + addr_len = strlen(sunaddr->sun_path) + + offsetof(struct sockaddr_un, sun_path) + 1; + } + addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL); if (!addr) return -ENOMEM; memcpy(addr->name, sunaddr, addr_len); addr->len = addr_len; - addr->hash = hash ^ sk->sk_type; refcount_set(&addr->refcnt, 1); if (sun_path[0]) -- Gitee From e73a033cd4e3796ef6885df595658646e560c607 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 19 Jul 2022 22:10:25 +0800 Subject: [PATCH 537/674] unix_bind_bsd(): move done_path_create() call after dealing with ->bindlock mainline inclusion from mainline-v5.14-rc1 commit 56c1731b280dc71febf5df80fcac1923ea973ab8 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4OM1C CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=56c1731b280dc71febf5df80fcac1923ea973ab8 -------------------------------- Final preparations for doing unlink on failure past the successful mknod. We can't hold ->bindlock over ->mknod() or ->unlink(), since either might do sb_start_write() (e.g. on overlayfs). However, we can do it while holding filesystem and VFS locks - doing kern_path_create() vfs_mknod() grab ->bindlock if u->addr had been set drop ->bindlock done_path_create return -EINVAL else assign the address to socket drop ->bindlock done_path_create return 0 would be deadlock-free. Here we massage unix_bind_bsd() to that form. We are still doing equivalent transformations. Next commit will *not* be an equivalent transformation - it will add a call of vfs_unlink() before done_path_create() in "alread bound" case. Signed-off-by: Al Viro Signed-off-by: David S. Miller Conflicts: net/unix/af_unix.c Signed-off-by: Baisong Zhong Reviewed-by: Yue Haibing Reviewed-by: Wei Yongjun Signed-off-by: Zheng Zengkai --- net/unix/af_unix.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 9f1273fab014..fbb7dcaf7d85 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1032,7 +1032,7 @@ static int unix_bind_bsd(struct sock *sk, struct unix_address *addr) struct unix_sock *u = unix_sk(sk); umode_t mode = S_IFSOCK | (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask()); - struct path parent, path; + struct path parent; struct dentry *dentry; unsigned int hash; int err; @@ -1049,36 +1049,32 @@ static int unix_bind_bsd(struct sock *sk, struct unix_address *addr) * All right, let's create it. */ err = security_path_mknod(&parent, dentry, mode, 0); - if (!err) { + if (!err) err = vfs_mknod(d_inode(parent.dentry), dentry, mode, 0); - if (!err) { - path.mnt = mntget(parent.mnt); - path.dentry = dget(dentry); - } - } - done_path_create(&parent, dentry); - if (err) + if (err) { + done_path_create(&parent, dentry); return err; - + } err = mutex_lock_interruptible(&u->bindlock); if (err) { - path_put(&path); + done_path_create(&parent, dentry); return err; } - if (u->addr) { mutex_unlock(&u->bindlock); - path_put(&path); + done_path_create(&parent, dentry); return -EINVAL; } addr->hash = UNIX_HASH_SIZE; - hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1); + hash = d_backing_inode(dentry)->i_ino & (UNIX_HASH_SIZE - 1); spin_lock(&unix_table_lock); - u->path = path; + u->path.mnt = mntget(parent.mnt); + u->path.dentry = dget(dentry); __unix_set_addr(sk, addr, hash); spin_unlock(&unix_table_lock); mutex_unlock(&u->bindlock); + done_path_create(&parent, dentry); return 0; } -- Gitee From ce559a0c9ae94877e3db8e1a19d2bdf2dc8a07a0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 19 Jul 2022 22:10:26 +0800 Subject: [PATCH 538/674] unix_bind_bsd(): unlink if we fail after successful mknod mainline inclusion from mainline-v5.14-rc1 commit c0c3b8d380a8f54c75786d41f6f9efbe761dac6c category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4OM1C CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c0c3b8d380a8f54c75786d41f6f9efbe761dac6c -------------------------------- We can do that more or less safely, since the parent is held locked all along. Yes, somebody might observe the object via dcache, only to have it disappear afterwards, but there's really no good way to prevent that. It won't race with other bind(2) or attempts to move the sucker elsewhere, or put something else in its place - locked parent prevents that. Signed-off-by: Al Viro Signed-off-by: David S. Miller Conflicts: net/unix/af_unix.c Signed-off-by: Baisong Zhong Reviewed-by: Yue Haibing Reviewed-by: Wei Yongjun Signed-off-by: Zheng Zengkai --- net/unix/af_unix.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index fbb7dcaf7d85..a443b8862c4c 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1051,20 +1051,13 @@ static int unix_bind_bsd(struct sock *sk, struct unix_address *addr) err = security_path_mknod(&parent, dentry, mode, 0); if (!err) err = vfs_mknod(d_inode(parent.dentry), dentry, mode, 0); - if (err) { - done_path_create(&parent, dentry); - return err; - } + if (err) + goto out; err = mutex_lock_interruptible(&u->bindlock); - if (err) { - done_path_create(&parent, dentry); - return err; - } - if (u->addr) { - mutex_unlock(&u->bindlock); - done_path_create(&parent, dentry); - return -EINVAL; - } + if (err) + goto out_unlink; + if (u->addr) + goto out_unlock; addr->hash = UNIX_HASH_SIZE; hash = d_backing_inode(dentry)->i_ino & (UNIX_HASH_SIZE - 1); @@ -1076,6 +1069,16 @@ static int unix_bind_bsd(struct sock *sk, struct unix_address *addr) mutex_unlock(&u->bindlock); done_path_create(&parent, dentry); return 0; + +out_unlock: + mutex_unlock(&u->bindlock); + err = -EINVAL; +out_unlink: + /* failed after successful mknod? unlink what we'd created... */ + vfs_unlink(d_inode(parent.dentry), dentry, NULL); +out: + done_path_create(&parent, dentry); + return err; } static int unix_bind_abstract(struct sock *sk, struct unix_address *addr) -- Gitee From 26b3fc648100bf91002039cf87d130b6b3389139 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 19 Jul 2022 22:10:27 +0800 Subject: [PATCH 539/674] af_unix: Allocate unix_address in unix_bind_(bsd|abstract)(). mainline inclusion from mainline-v5.17-rc1 commit 12f21c49ad83eba93d0485b8c9edcc28201bee93 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4OM1C CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=12f21c49ad83eba93d0485b8c9edcc28201bee93 -------------------------------- To terminate address with '\0' in unix_bind_bsd(), we add unix_create_addr() and call it in unix_bind_bsd() and unix_bind_abstract(). Also, unix_bind_abstract() does not return -EEXIST. Only kern_path_create() and vfs_mknod() in unix_bind_bsd() can return it, so we move the last error check in unix_bind() to unix_bind_bsd(). Signed-off-by: Kuniyuki Iwashima Signed-off-by: Jakub Kicinski Conflicts: net/unix/af_unix.c Signed-off-by: Baisong Zhong Reviewed-by: Yue Haibing Reviewed-by: Wei Yongjun Signed-off-by: Zheng Zengkai --- net/unix/af_unix.c | 105 ++++++++++++++++++++++++++++----------------- 1 file changed, 66 insertions(+), 39 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index a443b8862c4c..4a25742ed4be 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -213,6 +213,22 @@ struct sock *unix_peer_get(struct sock *s) } EXPORT_SYMBOL_GPL(unix_peer_get); +static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr, + int addr_len) +{ + struct unix_address *addr; + + addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL); + if (!addr) + return NULL; + + refcount_set(&addr->refcnt, 1); + addr->len = addr_len; + memcpy(addr->name, sunaddr, addr_len); + + return addr; +} + static inline void unix_release_addr(struct unix_address *addr) { if (refcount_dec_and_test(&addr->refcnt)) @@ -1027,23 +1043,35 @@ out: mutex_unlock(&u->bindlock); return err; } -static int unix_bind_bsd(struct sock *sk, struct unix_address *addr) +static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, + int addr_len) { - struct unix_sock *u = unix_sk(sk); umode_t mode = S_IFSOCK | (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask()); - struct path parent; + struct unix_sock *u = unix_sk(sk); + struct unix_address *addr; struct dentry *dentry; + struct path parent; unsigned int hash; int err; + unix_mkname_bsd(sunaddr, addr_len); + addr_len = strlen(sunaddr->sun_path) + + offsetof(struct sockaddr_un, sun_path) + 1; + + addr = unix_create_addr(sunaddr, addr_len); + if (!addr) + return -ENOMEM; + /* * Get the parent directory, calculate the hash for last * component. */ dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto out; + } /* * All right, let's create it. @@ -1052,7 +1080,7 @@ static int unix_bind_bsd(struct sock *sk, struct unix_address *addr) if (!err) err = vfs_mknod(d_inode(parent.dentry), dentry, mode, 0); if (err) - goto out; + goto out_path; err = mutex_lock_interruptible(&u->bindlock); if (err) goto out_unlink; @@ -1076,47 +1104,61 @@ static int unix_bind_bsd(struct sock *sk, struct unix_address *addr) out_unlink: /* failed after successful mknod? unlink what we'd created... */ vfs_unlink(d_inode(parent.dentry), dentry, NULL); -out: +out_path: done_path_create(&parent, dentry); - return err; +out: + unix_release_addr(addr); + return err == -EEXIST ? -EADDRINUSE : err; } -static int unix_bind_abstract(struct sock *sk, struct unix_address *addr) +static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr, + int addr_len) { struct unix_sock *u = unix_sk(sk); + struct unix_address *addr; int err; + addr = unix_create_addr(sunaddr, addr_len); + if (!addr) + return -ENOMEM; + err = mutex_lock_interruptible(&u->bindlock); if (err) - return err; + goto out; if (u->addr) { - mutex_unlock(&u->bindlock); - return -EINVAL; + err = -EINVAL; + goto out_mutex; } addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0)); addr->hash ^= sk->sk_type; spin_lock(&unix_table_lock); + if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len, - addr->hash)) { - spin_unlock(&unix_table_lock); - mutex_unlock(&u->bindlock); - return -EADDRINUSE; - } + addr->hash)) + goto out_spin; + __unix_set_addr(sk, addr, addr->hash); spin_unlock(&unix_table_lock); mutex_unlock(&u->bindlock); return 0; + +out_spin: + spin_unlock(&unix_table_lock); + err = -EADDRINUSE; +out_mutex: + mutex_unlock(&u->bindlock); +out: + unix_release_addr(addr); + return err; } static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; - char *sun_path = sunaddr->sun_path; struct sock *sk = sock->sk; - struct unix_address *addr; int err; if (addr_len == offsetof(struct sockaddr_un, sun_path) && @@ -1127,27 +1169,12 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (err) return err; - if (sun_path[0]) { - unix_mkname_bsd(sunaddr, addr_len); - addr_len = strlen(sunaddr->sun_path) + - offsetof(struct sockaddr_un, sun_path) + 1; - } - - addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL); - if (!addr) - return -ENOMEM; - - memcpy(addr->name, sunaddr, addr_len); - addr->len = addr_len; - refcount_set(&addr->refcnt, 1); - - if (sun_path[0]) - err = unix_bind_bsd(sk, addr); + if (sunaddr->sun_path[0]) + err = unix_bind_bsd(sk, sunaddr, addr_len); else - err = unix_bind_abstract(sk, addr); - if (err) - unix_release_addr(addr); - return err == -EEXIST ? -EADDRINUSE : err; + err = unix_bind_abstract(sk, sunaddr, addr_len); + + return err; } static void unix_state_double_lock(struct sock *sk1, struct sock *sk2) -- Gitee From 915d7d25c0d71e35f7cac439bfe7c06fa8065c99 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 19 Jul 2022 22:10:28 +0800 Subject: [PATCH 540/674] af_unix: Remove UNIX_ABSTRACT() macro and test sun_path[0] instead. mainline inclusion from mainline-v5.17-rc1 commit 5ce7ab4961a9320ca0836e06849210d088723a56 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4OM1C CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=5ce7ab4961a9320ca0836e06849210d088723a56 -------------------------------- In BSD and abstract address cases, we store sockets in the hash table with keys between 0 and UNIX_HASH_SIZE - 1. However, the hash saved in a socket varies depending on its address type; sockets with BSD addresses always have UNIX_HASH_SIZE in their unix_sk(sk)->addr->hash. This is just for the UNIX_ABSTRACT() macro used to check the address type. The difference of the saved hashes comes from the first byte of the address in the first place. So, we can test it directly. Then we can keep a real hash in each socket and replace unix_table_lock with per-hash locks in the later patch. Signed-off-by: Kuniyuki Iwashima Signed-off-by: Jakub Kicinski Conflicts: tools/testing/selftests/bpf/progs/bpf_iter_unix.c tools/testing/selftests/bpf/progs/bpf_tracing_net.h tools/testing/selftests/bpf/progs/test_skc_to_unix_sock.c Signed-off-by: Baisong Zhong Reviewed-by: Yue Haibing Reviewed-by: Wei Yongjun Signed-off-by: Zheng Zengkai --- net/unix/af_unix.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 4a25742ed4be..e66551d6a43d 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -133,8 +133,6 @@ static struct hlist_head *unix_sockets_unbound(void *addr) return &unix_socket_table[UNIX_HASH_SIZE + hash]; } -#define UNIX_ABSTRACT(sk) (unix_sk(sk)->addr->hash < UNIX_HASH_SIZE) - #ifdef CONFIG_SECURITY_NETWORK static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb) { @@ -2978,9 +2976,9 @@ static int unix_seq_show(struct seq_file *seq, void *v) i = 0; len = u->addr->len - offsetof(struct sockaddr_un, sun_path); - if (!UNIX_ABSTRACT(s)) + if (u->addr->name->sun_path[0]) { len--; - else { + } else { seq_putc(seq, '@'); i++; } -- Gitee From 2ae0412dc0863a7096624c94aac0a631e13d7e49 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 19 Jul 2022 22:10:29 +0800 Subject: [PATCH 541/674] af_unix: Return errno instead of NULL in unix_create1(). mainline inclusion from mainline-v5.15-rc4 commit f4bd73b5a950866f6c6fc98a7b684d307c5d586a category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4OM1C CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f4bd73b5a950866f6c6fc98a7b684d307c5d586a -------------------------------- unix_create1() returns NULL on error, and the callers assume that it never fails for reasons other than out of memory. So, the callers always return -ENOMEM when unix_create1() fails. However, it also returns NULL when the number of af_unix sockets exceeds twice the limit controlled by sysctl: fs.file-max. In this case, the callers should return -ENFILE like alloc_empty_file(). This patch changes unix_create1() to return the correct error value instead of NULL on error. Out of curiosity, the assumption has been wrong since 1999 due to this change introduced in 2.2.4 [0]. diff -u --recursive --new-file v2.2.3/linux/net/unix/af_unix.c linux/net/unix/af_unix.c --- v2.2.3/linux/net/unix/af_unix.c Tue Jan 19 11:32:53 1999 +++ linux/net/unix/af_unix.c Sun Mar 21 07:22:00 1999 @@ -388,6 +413,9 @@ { struct sock *sk; + if (atomic_read(&unix_nr_socks) >= 2*max_files) + return NULL; + MOD_INC_USE_COUNT; sk = sk_alloc(PF_UNIX, GFP_KERNEL, 1); if (!sk) { [0]: https://cdn.kernel.org/pub/linux/kernel/v2.2/patch-2.2.4.gz Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller Conflicts: net/unix/af_unix.c Signed-off-by: Baisong Zhong Reviewed-by: Yue Haibing Reviewed-by: Wei Yongjun Signed-off-by: Zheng Zengkai --- net/unix/af_unix.c | 50 ++++++++++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 17 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index e66551d6a43d..9589e54bd3c9 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -821,16 +821,22 @@ static struct proto unix_proto = { static struct sock *unix_create1(struct net *net, struct socket *sock, int kern) { - struct sock *sk = NULL; struct unix_sock *u; + struct sock *sk; + int err; atomic_long_inc(&unix_nr_socks); - if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) - goto out; + if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) { + err = -ENFILE; + goto err; + } sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto, kern); - if (!sk) - goto out; + + if (!sk) { + err = -ENOMEM; + goto err; + } sock_init_data(sock, sk); @@ -850,20 +856,23 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern) init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay); memset(&u->scm_stat, 0, sizeof(struct scm_stat)); unix_insert_socket(unix_sockets_unbound(sk), sk); -out: - if (sk == NULL) - atomic_long_dec(&unix_nr_socks); - else { - local_bh_disable(); - sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); - local_bh_enable(); - } + + local_bh_disable(); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + local_bh_enable(); + return sk; + +err: + atomic_long_dec(&unix_nr_socks); + return ERR_PTR(err); } static int unix_create(struct net *net, struct socket *sock, int protocol, int kern) { + struct sock *sk; + if (protocol && protocol != PF_UNIX) return -EPROTONOSUPPORT; @@ -890,7 +899,11 @@ static int unix_create(struct net *net, struct socket *sock, int protocol, return -ESOCKTNOSUPPORT; } - return unix_create1(net, sock, kern) ? 0 : -ENOMEM; + sk = unix_create1(net, sock, kern); + if (IS_ERR(sk)) + return PTR_ERR(sk); + + return 0; } static int unix_release(struct socket *sock) @@ -1336,12 +1349,15 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, we will have to recheck all again in any case. */ - err = -ENOMEM; - /* create new sock for complete connection */ newsk = unix_create1(sock_net(sk), NULL, 0); - if (newsk == NULL) + if (IS_ERR(newsk)) { + err = PTR_ERR(newsk); + newsk = NULL; goto out; + } + + err = -ENOMEM; /* Allocate skb for sending to listening sock */ skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL); -- Gitee From ff751cc10e429bd22dc113dff0912d099be193c6 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 19 Jul 2022 22:10:30 +0800 Subject: [PATCH 542/674] af_unix: Add helpers to calculate hashes. mainline inclusion from mainline-v5.17-rc1 commit f452be496a5c8f58b1a67cde79e89b9f1cfde31c category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4OM1C CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f452be496a5c8f58b1a67cde79e89b9f1cfde31c -------------------------------- This patch adds three helper functions that calculate hashes for unbound sockets and bound sockets with BSD/abstract addresses. Signed-off-by: Kuniyuki Iwashima Signed-off-by: Jakub Kicinski Signed-off-by: Baisong Zhong Reviewed-by: Yue Haibing Reviewed-by: Wei Yongjun Signed-off-by: Zheng Zengkai --- net/unix/af_unix.c | 64 +++++++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 29 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 9589e54bd3c9..114b09d22b68 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -122,15 +122,38 @@ DEFINE_SPINLOCK(unix_table_lock); EXPORT_SYMBOL_GPL(unix_table_lock); static atomic_long_t unix_nr_socks; +/* SMP locking strategy: + * hash table is protected with spinlock unix_table_lock + * each socket state is protected by separate spin lock. + */ -static struct hlist_head *unix_sockets_unbound(void *addr) +static unsigned int unix_unbound_hash(struct sock *sk) { - unsigned long hash = (unsigned long)addr; + unsigned long hash = (unsigned long)sk; hash ^= hash >> 16; hash ^= hash >> 8; - hash %= UNIX_HASH_SIZE; - return &unix_socket_table[UNIX_HASH_SIZE + hash]; + hash ^= sk->sk_type; + + return UNIX_HASH_SIZE + (hash & (UNIX_HASH_SIZE - 1)); +} + +static unsigned int unix_bsd_hash(struct inode *i) +{ + return i->i_ino & (UNIX_HASH_SIZE - 1); +} + +static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr, + int addr_len, int type) +{ + __wsum csum = csum_partial(sunaddr, addr_len, 0); + unsigned int hash; + + hash = (__force unsigned int)csum_fold(csum); + hash ^= hash >> 8; + hash ^= type; + + return hash & (UNIX_HASH_SIZE - 1); } #ifdef CONFIG_SECURITY_NETWORK @@ -161,20 +184,6 @@ static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb) } #endif /* CONFIG_SECURITY_NETWORK */ -/* - * SMP locking strategy: - * hash table is protected with spinlock unix_table_lock - * each socket state is protected by separate spin lock. - */ - -static inline unsigned int unix_hash_fold(__wsum n) -{ - unsigned int hash = (__force unsigned int)csum_fold(n); - - hash ^= hash>>8; - return hash&(UNIX_HASH_SIZE-1); -} - #define unix_peer(sk) (unix_sk(sk)->peer) static inline int unix_our_peer(struct sock *sk, struct sock *osk) @@ -333,11 +342,11 @@ static inline struct sock *unix_find_socket_byname(struct net *net, static struct sock *unix_find_socket_byinode(struct inode *i) { + unsigned int hash = unix_bsd_hash(i); struct sock *s; spin_lock(&unix_table_lock); - sk_for_each(s, - &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) { + sk_for_each(s, &unix_socket_table[hash]) { struct dentry *dentry = unix_sk(s)->path.dentry; if (dentry && d_backing_inode(dentry) == i) { @@ -855,7 +864,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern) init_waitqueue_head(&u->peer_wait); init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay); memset(&u->scm_stat, 0, sizeof(struct scm_stat)); - unix_insert_socket(unix_sockets_unbound(sk), sk); + unix_insert_socket(&unix_socket_table[unix_unbound_hash(sk)], sk); local_bh_disable(); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); @@ -967,11 +976,11 @@ static struct sock *unix_find_abstract(struct net *net, struct sockaddr_un *sunaddr, int addr_len, int type) { - unsigned int hash = unix_hash_fold(csum_partial(sunaddr, addr_len, 0)); + unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type); struct dentry *dentry; struct sock *sk; - sk = unix_find_socket_byname(net, sunaddr, addr_len, type ^ hash); + sk = unix_find_socket_byname(net, sunaddr, addr_len, hash); if (!sk) return ERR_PTR(-ECONNREFUSED); @@ -1023,8 +1032,7 @@ static int unix_autobind(struct sock *sk) retry: addr->len = sprintf(addr->name->sun_path + 1, "%05x", ordernum) + offsetof(struct sockaddr_un, sun_path) + 1; - addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0)); - addr->hash ^= sk->sk_type; + addr->hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); spin_lock(&unix_table_lock); ordernum = (ordernum+1)&0xFFFFF; @@ -1099,7 +1107,7 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, goto out_unlock; addr->hash = UNIX_HASH_SIZE; - hash = d_backing_inode(dentry)->i_ino & (UNIX_HASH_SIZE - 1); + hash = unix_bsd_hash(d_backing_inode(dentry)); spin_lock(&unix_table_lock); u->path.mnt = mntget(parent.mnt); u->path.dentry = dget(dentry); @@ -1142,9 +1150,7 @@ static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr, goto out_mutex; } - addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0)); - addr->hash ^= sk->sk_type; - + addr->hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); spin_lock(&unix_table_lock); if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len, -- Gitee From ef9bfa8e29eaa22b13a4e2c006245484b7fb6c40 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 19 Jul 2022 22:10:31 +0800 Subject: [PATCH 543/674] af_unix: Save hash in sk_hash. mainline inclusion from mainline-v5.17-rc1 commit e6b4b873896f0e9298f70d25726f4bb1e1b265ba category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4OM1C CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e6b4b873896f0e9298f70d25726f4bb1e1b265ba -------------------------------- To replace unix_table_lock with per-hash locks in the next patch, we need to save a hash in each socket because /proc/net/unix or BPF prog iterate sockets while holding a hash table lock and release it later in a different function. Currently, we store a real/pseudo hash in struct unix_address. However, we do not allocate it to unbound sockets, nor should we do just for that. For this purpose, we can use sk_hash. Then, we no longer use the hash field in struct unix_address and can remove it. Also, this patch does - rename unix_insert_socket() to unix_insert_unbound_socket() - remove the redundant list argument from __unix_insert_socket() and unix_insert_unbound_socket() - use 'unsigned int' instead of 'unsigned' in __unix_set_addr_hash() - remove 'inline' from unix_remove_socket() and unix_insert_unbound_socket(). Signed-off-by: Kuniyuki Iwashima Signed-off-by: Jakub Kicinski Signed-off-by: Baisong Zhong Reviewed-by: Yue Haibing Reviewed-by: Wei Yongjun Signed-off-by: Zheng Zengkai --- include/net/af_unix.h | 2 +- net/unix/af_unix.c | 42 +++++++++++++++++++++++------------------- 2 files changed, 24 insertions(+), 20 deletions(-) diff --git a/include/net/af_unix.h b/include/net/af_unix.h index f42fdddecd41..beb6ef56bf85 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -26,7 +26,7 @@ extern struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE]; struct unix_address { refcount_t refcnt; int len; - unsigned int hash; + unsigned int hash; /* deprecated */ struct sockaddr_un name[]; }; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 114b09d22b68..552f95c7d9e1 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -279,31 +279,33 @@ static void __unix_remove_socket(struct sock *sk) sk_del_node_init(sk); } -static void __unix_insert_socket(struct hlist_head *list, struct sock *sk) +static void __unix_insert_socket(struct sock *sk) { WARN_ON(!sk_unhashed(sk)); - sk_add_node(sk, list); + sk_add_node(sk, &unix_socket_table[sk->sk_hash]); } -static void __unix_set_addr(struct sock *sk, struct unix_address *addr, - unsigned hash) +static void __unix_set_addr_hash(struct sock *sk, struct unix_address *addr, + unsigned int hash) { __unix_remove_socket(sk); smp_store_release(&unix_sk(sk)->addr, addr); - __unix_insert_socket(&unix_socket_table[hash], sk); + + sk->sk_hash = hash; + __unix_insert_socket(sk); } -static inline void unix_remove_socket(struct sock *sk) +static void unix_remove_socket(struct sock *sk) { spin_lock(&unix_table_lock); __unix_remove_socket(sk); spin_unlock(&unix_table_lock); } -static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk) +static void unix_insert_unbound_socket(struct sock *sk) { spin_lock(&unix_table_lock); - __unix_insert_socket(list, sk); + __unix_insert_socket(sk); spin_unlock(&unix_table_lock); } @@ -849,6 +851,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern) sock_init_data(sock, sk); + sk->sk_hash = unix_unbound_hash(sk); sk->sk_allocation = GFP_KERNEL_ACCOUNT; sk->sk_write_space = unix_write_space; sk->sk_max_ack_backlog = net->unx.sysctl_max_dgram_qlen; @@ -864,7 +867,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern) init_waitqueue_head(&u->peer_wait); init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay); memset(&u->scm_stat, 0, sizeof(struct scm_stat)); - unix_insert_socket(&unix_socket_table[unix_unbound_hash(sk)], sk); + unix_insert_unbound_socket(sk); local_bh_disable(); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); @@ -1011,6 +1014,7 @@ static int unix_autobind(struct sock *sk) struct unix_address *addr; unsigned int retries = 0; static u32 ordernum = 1; + unsigned int new_hash; int err; err = mutex_lock_interruptible(&u->bindlock); @@ -1032,13 +1036,13 @@ static int unix_autobind(struct sock *sk) retry: addr->len = sprintf(addr->name->sun_path + 1, "%05x", ordernum) + offsetof(struct sockaddr_un, sun_path) + 1; - addr->hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); + new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); spin_lock(&unix_table_lock); ordernum = (ordernum+1)&0xFFFFF; if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len, - addr->hash)) { + new_hash)) { spin_unlock(&unix_table_lock); /* * __unix_find_socket_byname() may take long time if many names @@ -1054,7 +1058,7 @@ static int unix_autobind(struct sock *sk) goto retry; } - __unix_set_addr(sk, addr, addr->hash); + __unix_set_addr_hash(sk, addr, new_hash); spin_unlock(&unix_table_lock); err = 0; @@ -1069,9 +1073,9 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask()); struct unix_sock *u = unix_sk(sk); struct unix_address *addr; + unsigned int new_hash; struct dentry *dentry; struct path parent; - unsigned int hash; int err; unix_mkname_bsd(sunaddr, addr_len); @@ -1106,12 +1110,11 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, if (u->addr) goto out_unlock; - addr->hash = UNIX_HASH_SIZE; - hash = unix_bsd_hash(d_backing_inode(dentry)); + new_hash = unix_bsd_hash(d_backing_inode(dentry)); spin_lock(&unix_table_lock); u->path.mnt = mntget(parent.mnt); u->path.dentry = dget(dentry); - __unix_set_addr(sk, addr, hash); + __unix_set_addr_hash(sk, addr, new_hash); spin_unlock(&unix_table_lock); mutex_unlock(&u->bindlock); done_path_create(&parent, dentry); @@ -1135,6 +1138,7 @@ static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr, { struct unix_sock *u = unix_sk(sk); struct unix_address *addr; + unsigned int new_hash; int err; addr = unix_create_addr(sunaddr, addr_len); @@ -1150,14 +1154,14 @@ static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr, goto out_mutex; } - addr->hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); + new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); spin_lock(&unix_table_lock); if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len, - addr->hash)) + new_hash)) goto out_spin; - __unix_set_addr(sk, addr, addr->hash); + __unix_set_addr_hash(sk, addr, new_hash); spin_unlock(&unix_table_lock); mutex_unlock(&u->bindlock); return 0; -- Gitee From 0a92541d3ad1cb5f2cae7605fd7ce186b544795a Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 19 Jul 2022 22:10:32 +0800 Subject: [PATCH 544/674] af_unix: Replace the big lock with small locks. mainline inclusion from mainline-v5.17-rc1 commit afd20b9290e184c203fe22f2d6b80dc7127ba724 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4OM1C CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=afd20b9290e184c203fe22f2d6b80dc7127ba724 -------------------------------- The hash table of AF_UNIX sockets is protected by the single lock. This patch replaces it with per-hash locks. The effect is noticeable when we handle multiple sockets simultaneously. Here is a test result on an EC2 c5.24xlarge instance. It shows latency (under 10us only) in unix_insert_unbound_socket() while 64 CPUs creating 1024 sockets for each in parallel. Without this patch: nsec : count distribution 0 : 179 | | 500 : 3021 |********* | 1000 : 6271 |******************* | 1500 : 6318 |******************* | 2000 : 5828 |***************** | 2500 : 5124 |*************** | 3000 : 4426 |************* | 3500 : 3672 |*********** | 4000 : 3138 |********* | 4500 : 2811 |******** | 5000 : 2384 |******* | 5500 : 2023 |****** | 6000 : 1954 |***** | 6500 : 1737 |***** | 7000 : 1749 |***** | 7500 : 1520 |**** | 8000 : 1469 |**** | 8500 : 1394 |**** | 9000 : 1232 |*** | 9500 : 1138 |*** | 10000 : 994 |*** | With this patch: nsec : count distribution 0 : 1634 |**** | 500 : 13170 |****************************************| 1000 : 13156 |*************************************** | 1500 : 9010 |*************************** | 2000 : 6363 |******************* | 2500 : 4443 |************* | 3000 : 3240 |********* | 3500 : 2549 |******* | 4000 : 1872 |***** | 4500 : 1504 |**** | 5000 : 1247 |*** | 5500 : 1035 |*** | 6000 : 889 |** | 6500 : 744 |** | 7000 : 634 |* | 7500 : 498 |* | 8000 : 433 |* | 8500 : 355 |* | 9000 : 336 |* | 9500 : 284 | | 10000 : 243 | | Signed-off-by: Kuniyuki Iwashima Signed-off-by: Jakub Kicinski Conflicts: net/unix/af_unix.c Signed-off-by: Baisong Zhong Reviewed-by: Yue Haibing Reviewed-by: Wei Yongjun Signed-off-by: Zheng Zengkai --- include/net/af_unix.h | 2 +- net/unix/af_unix.c | 98 ++++++++++++++++++++++++++----------------- net/unix/diag.c | 20 ++++----- 3 files changed, 71 insertions(+), 49 deletions(-) diff --git a/include/net/af_unix.h b/include/net/af_unix.h index beb6ef56bf85..ae41d8ee970a 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -20,7 +20,7 @@ struct sock *unix_peer_get(struct sock *sk); #define UNIX_HASH_BITS 8 extern unsigned int unix_tot_inflight; -extern spinlock_t unix_table_lock; +extern spinlock_t unix_table_locks[2 * UNIX_HASH_SIZE]; extern struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE]; struct unix_address { diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 552f95c7d9e1..b497dee9b2a4 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -116,14 +116,14 @@ #include "scm.h" +spinlock_t unix_table_locks[2 * UNIX_HASH_SIZE]; +EXPORT_SYMBOL_GPL(unix_table_locks); struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE]; EXPORT_SYMBOL_GPL(unix_socket_table); -DEFINE_SPINLOCK(unix_table_lock); -EXPORT_SYMBOL_GPL(unix_table_lock); static atomic_long_t unix_nr_socks; /* SMP locking strategy: - * hash table is protected with spinlock unix_table_lock + * hash table is protected with spinlock unix_table_locks * each socket state is protected by separate spin lock. */ @@ -156,6 +156,25 @@ static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr, return hash & (UNIX_HASH_SIZE - 1); } +static void unix_table_double_lock(unsigned int hash1, unsigned int hash2) +{ + /* hash1 and hash2 is never the same because + * one is between 0 and UNIX_HASH_SIZE - 1, and + * another is between UNIX_HASH_SIZE and UNIX_HASH_SIZE * 2. + */ + if (hash1 > hash2) + swap(hash1, hash2); + + spin_lock(&unix_table_locks[hash1]); + spin_lock_nested(&unix_table_locks[hash2], SINGLE_DEPTH_NESTING); +} + +static void unix_table_double_unlock(unsigned int hash1, unsigned int hash2) +{ + spin_unlock(&unix_table_locks[hash1]); + spin_unlock(&unix_table_locks[hash2]); +} + #ifdef CONFIG_SECURITY_NETWORK static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb) { @@ -297,16 +316,16 @@ static void __unix_set_addr_hash(struct sock *sk, struct unix_address *addr, static void unix_remove_socket(struct sock *sk) { - spin_lock(&unix_table_lock); + spin_lock(&unix_table_locks[sk->sk_hash]); __unix_remove_socket(sk); - spin_unlock(&unix_table_lock); + spin_unlock(&unix_table_locks[sk->sk_hash]); } static void unix_insert_unbound_socket(struct sock *sk) { - spin_lock(&unix_table_lock); + spin_lock(&unix_table_locks[sk->sk_hash]); __unix_insert_socket(sk); - spin_unlock(&unix_table_lock); + spin_unlock(&unix_table_locks[sk->sk_hash]); } static struct sock *__unix_find_socket_byname(struct net *net, @@ -334,11 +353,11 @@ static inline struct sock *unix_find_socket_byname(struct net *net, { struct sock *s; - spin_lock(&unix_table_lock); + spin_lock(&unix_table_locks[hash]); s = __unix_find_socket_byname(net, sunname, len, hash); if (s) sock_hold(s); - spin_unlock(&unix_table_lock); + spin_unlock(&unix_table_locks[hash]); return s; } @@ -347,19 +366,18 @@ static struct sock *unix_find_socket_byinode(struct inode *i) unsigned int hash = unix_bsd_hash(i); struct sock *s; - spin_lock(&unix_table_lock); + spin_lock(&unix_table_locks[hash]); sk_for_each(s, &unix_socket_table[hash]) { struct dentry *dentry = unix_sk(s)->path.dentry; if (dentry && d_backing_inode(dentry) == i) { sock_hold(s); - goto found; + spin_unlock(&unix_table_locks[hash]); + return s; } } - s = NULL; -found: - spin_unlock(&unix_table_lock); - return s; + spin_unlock(&unix_table_locks[hash]); + return NULL; } /* Support code for asymmetrically connected dgram sockets @@ -1010,11 +1028,11 @@ static struct sock *unix_find_other(struct net *net, static int unix_autobind(struct sock *sk) { + unsigned int new_hash, old_hash = sk->sk_hash; struct unix_sock *u = unix_sk(sk); struct unix_address *addr; unsigned int retries = 0; static u32 ordernum = 1; - unsigned int new_hash; int err; err = mutex_lock_interruptible(&u->bindlock); @@ -1038,12 +1056,13 @@ static int unix_autobind(struct sock *sk) offsetof(struct sockaddr_un, sun_path) + 1; new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); - spin_lock(&unix_table_lock); + unix_table_double_lock(old_hash, new_hash); ordernum = (ordernum+1)&0xFFFFF; if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len, new_hash)) { - spin_unlock(&unix_table_lock); + unix_table_double_unlock(old_hash, new_hash); + /* * __unix_find_socket_byname() may take long time if many names * are already in use. @@ -1059,7 +1078,7 @@ static int unix_autobind(struct sock *sk) } __unix_set_addr_hash(sk, addr, new_hash); - spin_unlock(&unix_table_lock); + unix_table_double_unlock(old_hash, new_hash); err = 0; out: mutex_unlock(&u->bindlock); @@ -1071,9 +1090,9 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, { umode_t mode = S_IFSOCK | (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask()); + unsigned int new_hash, old_hash = sk->sk_hash; struct unix_sock *u = unix_sk(sk); struct unix_address *addr; - unsigned int new_hash; struct dentry *dentry; struct path parent; int err; @@ -1111,11 +1130,11 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, goto out_unlock; new_hash = unix_bsd_hash(d_backing_inode(dentry)); - spin_lock(&unix_table_lock); + unix_table_double_lock(old_hash, new_hash); u->path.mnt = mntget(parent.mnt); u->path.dentry = dget(dentry); __unix_set_addr_hash(sk, addr, new_hash); - spin_unlock(&unix_table_lock); + unix_table_double_unlock(old_hash, new_hash); mutex_unlock(&u->bindlock); done_path_create(&parent, dentry); return 0; @@ -1136,9 +1155,9 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr, int addr_len) { + unsigned int new_hash, old_hash = sk->sk_hash; struct unix_sock *u = unix_sk(sk); struct unix_address *addr; - unsigned int new_hash; int err; addr = unix_create_addr(sunaddr, addr_len); @@ -1155,19 +1174,19 @@ static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr, } new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); - spin_lock(&unix_table_lock); + unix_table_double_lock(old_hash, new_hash); if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len, new_hash)) goto out_spin; __unix_set_addr_hash(sk, addr, new_hash); - spin_unlock(&unix_table_lock); + unix_table_double_unlock(old_hash, new_hash); mutex_unlock(&u->bindlock); return 0; out_spin: - spin_unlock(&unix_table_lock); + unix_table_double_unlock(old_hash, new_hash); err = -EADDRINUSE; out_mutex: mutex_unlock(&u->bindlock); @@ -1469,9 +1488,9 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, * * The contents of *(otheru->addr) and otheru->path * are seen fully set up here, since we have found - * otheru in hash under unix_table_lock. Insertion + * otheru in hash under unix_table_locks. Insertion * into the hash chain we'd found it in had been done - * in an earlier critical area protected by unix_table_lock, + * in an earlier critical area protected by unix_table_locks, * the same one where we'd set *(otheru->addr) contents, * as well as otheru->path and otheru->addr itself. * @@ -2900,7 +2919,7 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1) #define get_bucket(x) ((x) >> BUCKET_SPACE) -#define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1)) +#define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1)) #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o)) static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos) @@ -2924,7 +2943,7 @@ static struct sock *unix_next_socket(struct seq_file *seq, struct sock *sk, loff_t *pos) { - unsigned long bucket; + unsigned long bucket = get_bucket(*pos); while (sk > (struct sock *)SEQ_START_TOKEN) { sk = sk_next(sk); @@ -2935,12 +2954,13 @@ static struct sock *unix_next_socket(struct seq_file *seq, } do { + spin_lock(&unix_table_locks[bucket]); sk = unix_from_bucket(seq, pos); if (sk) return sk; next_bucket: - bucket = get_bucket(*pos) + 1; + spin_unlock(&unix_table_locks[bucket++]); *pos = set_bucket_offset(bucket, 1); } while (bucket < ARRAY_SIZE(unix_socket_table)); @@ -2948,10 +2968,7 @@ static struct sock *unix_next_socket(struct seq_file *seq, } static void *unix_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(unix_table_lock) { - spin_lock(&unix_table_lock); - if (!*pos) return SEQ_START_TOKEN; @@ -2968,9 +2985,11 @@ static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void unix_seq_stop(struct seq_file *seq, void *v) - __releases(unix_table_lock) { - spin_unlock(&unix_table_lock); + struct sock *sk = v; + + if (sk) + spin_unlock(&unix_table_locks[sk->sk_hash]); } static int unix_seq_show(struct seq_file *seq, void *v) @@ -2995,7 +3014,7 @@ static int unix_seq_show(struct seq_file *seq, void *v) (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING), sock_i_ino(s)); - if (u->addr) { // under unix_table_lock here + if (u->addr) { // under unix_table_locks here int i, len; seq_putc(seq, ' '); @@ -3067,10 +3086,13 @@ static struct pernet_operations unix_net_ops = { static int __init af_unix_init(void) { - int rc = -1; + int i, rc = -1; BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb)); + for (i = 0; i < 2 * UNIX_HASH_SIZE; i++) + spin_lock_init(&unix_table_locks[i]); + rc = proto_register(&unix_proto, 1); if (rc != 0) { pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); diff --git a/net/unix/diag.c b/net/unix/diag.c index dd77e81b41a0..c522ab94d0c8 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -13,7 +13,7 @@ static int sk_diag_dump_name(struct sock *sk, struct sk_buff *nlskb) { - /* might or might not have unix_table_lock */ + /* might or might not have unix_table_locks */ struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr); if (!addr) @@ -204,13 +204,13 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) s_slot = cb->args[0]; num = s_num = cb->args[1]; - spin_lock(&unix_table_lock); for (slot = s_slot; slot < ARRAY_SIZE(unix_socket_table); s_num = 0, slot++) { struct sock *sk; num = 0; + spin_lock(&unix_table_locks[slot]); sk_for_each(sk, &unix_socket_table[slot]) { if (!net_eq(sock_net(sk), net)) continue; @@ -221,14 +221,16 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) if (sk_diag_dump(sk, skb, req, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - NLM_F_MULTI) < 0) + NLM_F_MULTI) < 0) { + spin_unlock(&unix_table_locks[slot]); goto done; + } next: num++; } + spin_unlock(&unix_table_locks[slot]); } done: - spin_unlock(&unix_table_lock); cb->args[0] = slot; cb->args[1] = num; @@ -237,21 +239,19 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) static struct sock *unix_lookup_by_ino(unsigned int ino) { - int i; struct sock *sk; + int i; - spin_lock(&unix_table_lock); for (i = 0; i < ARRAY_SIZE(unix_socket_table); i++) { + spin_lock(&unix_table_locks[i]); sk_for_each(sk, &unix_socket_table[i]) if (ino == sock_i_ino(sk)) { sock_hold(sk); - spin_unlock(&unix_table_lock); - + spin_unlock(&unix_table_locks[i]); return sk; } + spin_unlock(&unix_table_locks[i]); } - - spin_unlock(&unix_table_lock); return NULL; } -- Gitee From 93e5fa79f21ad078d7f7c055e8be023892006d05 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 19 Jul 2022 22:10:33 +0800 Subject: [PATCH 545/674] af_unix: Relax race in unix_autobind(). mainline inclusion from mainline-v5.17-rc1 commit 9acbc584c3a4e9706703039708ec947ffc152c66 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4OM1C CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=9acbc584c3a4e9706703039708ec947ffc152c66 -------------------------------- When we bind an AF_UNIX socket without a name specified, the kernel selects an available one from 0x00000 to 0xFFFFF. unix_autobind() starts searching from a number in the 'static' variable and increments it after acquiring two locks. If multiple processes try autobind, they obtain the same lock and check if a socket in the hash list has the same name. If not, one process uses it, and all except one end up retrying the _next_ number (actually not, it may be incremented by the other processes). The more we autobind sockets in parallel, the longer the latency gets. We can avoid such a race by searching for a name from a random number. These show latency in unix_autobind() while 64 CPUs are simultaneously autobind-ing 1024 sockets for each. Without this patch: usec : count distribution 0 : 1176 |*** | 2 : 3655 |*********** | 4 : 4094 |************* | 6 : 3831 |************ | 8 : 3829 |************ | 10 : 3844 |************ | 12 : 3638 |*********** | 14 : 2992 |********* | 16 : 2485 |******* | 18 : 2230 |******* | 20 : 2095 |****** | 22 : 1853 |***** | 24 : 1827 |***** | 26 : 1677 |***** | 28 : 1473 |**** | 30 : 1573 |***** | 32 : 1417 |**** | 34 : 1385 |**** | 36 : 1345 |**** | 38 : 1344 |**** | 40 : 1200 |*** | With this patch: usec : count distribution 0 : 1855 |****** | 2 : 6464 |********************* | 4 : 9936 |******************************** | 6 : 12107 |****************************************| 8 : 10441 |********************************** | 10 : 7264 |*********************** | 12 : 4254 |************** | 14 : 2538 |******** | 16 : 1596 |***** | 18 : 1088 |*** | 20 : 800 |** | 22 : 670 |** | 24 : 601 |* | 26 : 562 |* | 28 : 525 |* | 30 : 446 |* | 32 : 378 |* | 34 : 337 |* | 36 : 317 |* | 38 : 314 |* | 40 : 298 | | Signed-off-by: Kuniyuki Iwashima Signed-off-by: Jakub Kicinski Signed-off-by: Baisong Zhong Reviewed-by: Yue Haibing Reviewed-by: Wei Yongjun Signed-off-by: Zheng Zengkai --- net/unix/af_unix.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index b497dee9b2a4..de3a1ffac26d 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1031,8 +1031,7 @@ static int unix_autobind(struct sock *sk) unsigned int new_hash, old_hash = sk->sk_hash; struct unix_sock *u = unix_sk(sk); struct unix_address *addr; - unsigned int retries = 0; - static u32 ordernum = 1; + u32 lastnum, ordernum; int err; err = mutex_lock_interruptible(&u->bindlock); @@ -1048,32 +1047,35 @@ static int unix_autobind(struct sock *sk) if (!addr) goto out; + addr->len = offsetof(struct sockaddr_un, sun_path) + 6; addr->name->sun_family = AF_UNIX; refcount_set(&addr->refcnt, 1); + ordernum = prandom_u32(); + lastnum = ordernum & 0xFFFFF; retry: - addr->len = sprintf(addr->name->sun_path + 1, "%05x", ordernum) + - offsetof(struct sockaddr_un, sun_path) + 1; + ordernum = (ordernum + 1) & 0xFFFFF; + sprintf(addr->name->sun_path + 1, "%05x", ordernum); new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); unix_table_double_lock(old_hash, new_hash); - ordernum = (ordernum+1)&0xFFFFF; if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len, new_hash)) { unix_table_double_unlock(old_hash, new_hash); - /* - * __unix_find_socket_byname() may take long time if many names + /* __unix_find_socket_byname() may take long time if many names * are already in use. */ cond_resched(); - /* Give up if all names seems to be in use. */ - if (retries++ == 0xFFFFF) { + + if (ordernum == lastnum) { + /* Give up if all names seems to be in use. */ err = -ENOSPC; - kfree(addr); + unix_release_addr(addr); goto out; } + goto retry; } -- Gitee From 8b038d52a6cb7e735aaa5f100cecb70cba382981 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 13 Jul 2022 12:19:58 +0800 Subject: [PATCH 546/674] mm/hwpoison: do not lock page again when me_huge_page() successfully recovers mainline inclusion from mainline-v5.13 commit ea6d0630100b285f059d0a8d8e86f38a46407536 category: bugfix bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1 CVE: NA Intel_SIG: commit ea6d0630100b mm/hwpoison: do not lock page again when me_huge_page() successfully recovers. Backport for MCA recovery enhancing & bug fix. -------------------------------- Currently me_huge_page() temporary unlocks page to perform some actions then locks it again later. My testcase (which calls hard-offline on some tail page in a hugetlb, then accesses the address of the hugetlb range) showed that page allocation code detects this page lock on buddy page and printed out "BUG: Bad page state" message. check_new_page_bad() does not consider a page with __PG_HWPOISON as bad page, so this flag works as kind of filter, but this filtering doesn't work in this case because the "bad page" is not the actual hwpoisoned page. So stop locking page again. Actions to be taken depend on the page type of the error, so page unlocking should be done in ->action() callbacks. So let's make it assumed and change all existing callbacks that way. Link: https://lkml.kernel.org/r/20210609072029.74645-1-nao.horiguchi@gmail.com Fixes: commit 78bb920344b8 ("mm: hwpoison: dissolve in-use hugepage in unrecoverable memory error") Signed-off-by: Naoya Horiguchi Cc: Oscar Salvador Cc: Michal Hocko Cc: Tony Luck Cc: "Aneesh Kumar K.V" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Youquan Song --- mm/memory-failure.c | 44 ++++++++++++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 0519f20d2b57..74a21042845a 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -658,6 +658,7 @@ static int truncate_error_page(struct page *p, unsigned long pfn, */ static int me_kernel(struct page *p, unsigned long pfn) { + unlock_page(p); return MF_IGNORED; } @@ -667,6 +668,7 @@ static int me_kernel(struct page *p, unsigned long pfn) static int me_unknown(struct page *p, unsigned long pfn) { pr_err("Memory failure: %#lx: Unknown page state\n", pfn); + unlock_page(p); return MF_FAILED; } @@ -675,6 +677,7 @@ static int me_unknown(struct page *p, unsigned long pfn) */ static int me_pagecache_clean(struct page *p, unsigned long pfn) { + int ret; struct address_space *mapping; delete_from_lru_cache(p); @@ -683,8 +686,10 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) * For anonymous pages we're done the only reference left * should be the one m_f() holds. */ - if (PageAnon(p)) - return MF_RECOVERED; + if (PageAnon(p)) { + ret = MF_RECOVERED; + goto out; + } /* * Now truncate the page in the page cache. This is really @@ -698,7 +703,8 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) /* * Page has been teared down in the meanwhile */ - return MF_FAILED; + ret = MF_FAILED; + goto out; } /* @@ -706,7 +712,10 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) * * Open: to take i_mutex or not for this? Right now we don't. */ - return truncate_error_page(p, pfn, mapping); + ret = truncate_error_page(p, pfn, mapping); +out: + unlock_page(p); + return ret; } /* @@ -782,24 +791,26 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn) */ static int me_swapcache_dirty(struct page *p, unsigned long pfn) { + int ret; + ClearPageDirty(p); /* Trigger EIO in shmem: */ ClearPageUptodate(p); - if (!delete_from_lru_cache(p)) - return MF_DELAYED; - else - return MF_FAILED; + ret = delete_from_lru_cache(p) ? MF_FAILED : MF_DELAYED; + unlock_page(p); + return ret; } static int me_swapcache_clean(struct page *p, unsigned long pfn) { + int ret; + delete_from_swap_cache(p); - if (!delete_from_lru_cache(p)) - return MF_RECOVERED; - else - return MF_FAILED; + ret = delete_from_lru_cache(p) ? MF_FAILED : MF_RECOVERED; + unlock_page(p); + return ret; } /* @@ -820,6 +831,7 @@ static int me_huge_page(struct page *p, unsigned long pfn) mapping = page_mapping(hpage); if (mapping) { res = truncate_error_page(hpage, pfn, mapping); + unlock_page(hpage); } else { res = MF_FAILED; unlock_page(hpage); @@ -834,7 +846,6 @@ static int me_huge_page(struct page *p, unsigned long pfn) page_ref_inc(p); res = MF_RECOVERED; } - lock_page(hpage); } return res; @@ -866,6 +877,8 @@ static struct page_state { unsigned long mask; unsigned long res; enum mf_action_page_type type; + + /* Callback ->action() has to unlock the relevant page inside it. */ int (*action)(struct page *p, unsigned long pfn); } error_states[] = { { reserved, reserved, MF_MSG_KERNEL, me_kernel }, @@ -929,6 +942,7 @@ static int page_action(struct page_state *ps, struct page *p, int result; int count; + /* page p should be unlocked after returning from ps->action(). */ result = ps->action(p, pfn); count = page_count(p) - 1; @@ -1246,7 +1260,7 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags) goto out; } - res = identify_page_state(pfn, p, page_flags); + return identify_page_state(pfn, p, page_flags); out: unlock_page(head); return res; @@ -1529,6 +1543,8 @@ int memory_failure(unsigned long pfn, int flags) identify_page_state: res = identify_page_state(pfn, p, page_flags); + mutex_unlock(&mf_mutex); + return res; unlock_page: unlock_page(p); unlock_mutex: -- Gitee From 75a005c5418abcb879bdf3053d0853956cec7e73 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Mon, 28 Jun 2021 19:43:14 -0700 Subject: [PATCH 547/674] mm,hwpoison: send SIGBUS with error virutal address mainline inclusion from mainline-v5.14-rc1 commit a3f5d80ea401ac857f2910e28b15f35b2cf902f4 category: bugfix bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1 CVE: NA Intel-SIG: commit a3f5d80ea401 mm,hwpoison: send SIGBUS with error virutal address. Backport for MCA recovery enhancing & bug fix. -------------------------------- Now an action required MCE in already hwpoisoned address surely sends a SIGBUS to current process, but the SIGBUS doesn't convey error virtual address. That's not optimal for hwpoison-aware applications. To fix the issue, make memory_failure() call kill_accessing_process(), that does pagetable walk to find the error virtual address. It could find multiple virtual addresses for the same error page, and it seems hard to tell which virtual address is correct one. But that's rare and sending incorrect virtual address could be better than no address. So let's report the first found virtual address for now. [naoya.horiguchi@nec.com: fix walk_page_range() return] Link: https://lkml.kernel.org/r/20210603051055.GA244241@hori.linux.bs1.fc.nec.co.jp Link: https://lkml.kernel.org/r/20210521030156.2612074-4-nao.horiguchi@gmail.com Signed-off-by: Naoya Horiguchi Cc: Tony Luck Cc: Aili Yao Cc: Oscar Salvador Cc: David Hildenbrand Cc: Borislav Petkov Cc: Andy Lutomirski Cc: Jue Wang Cc: Borislav Petkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Youquan Song --- arch/x86/kernel/cpu/mce/core.c | 13 ++- include/linux/swapops.h | 5 ++ mm/memory-failure.c | 150 ++++++++++++++++++++++++++++++++- 3 files changed, 165 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 0ced8b250090..cdda2198deae 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1271,6 +1271,7 @@ static void kill_me_maybe(struct callback_head *cb) { struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me); int flags = MF_ACTION_REQUIRED; + int ret; p->mce_count = 0; pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr); @@ -1278,13 +1279,21 @@ static void kill_me_maybe(struct callback_head *cb) if (!p->mce_ripv) flags |= MF_MUST_KILL; - if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags) && - !(p->mce_kflags & MCE_IN_KERNEL_COPYIN)) { + ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags); + if (!ret && !(p->mce_kflags & MCE_IN_KERNEL_COPYIN)) { set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page); sync_core(); return; } + /* + * -EHWPOISON from memory_failure() means that it already sent SIGBUS + * to the current process with the proper error info, so no need to + * send SIGBUS here again. + */ + if (ret == -EHWPOISON) + return; + if (p->mce_vaddr != (void __user *)-1l) { force_sig_mceerr(BUS_MCEERR_AR, p->mce_vaddr, PAGE_SHIFT); } else { diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 0d429a102d41..708fbeb21dd3 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -332,6 +332,11 @@ static inline int is_hwpoison_entry(swp_entry_t entry) return swp_type(entry) == SWP_HWPOISON; } +static inline unsigned long hwpoison_entry_to_pfn(swp_entry_t entry) +{ + return swp_offset(entry); +} + static inline void num_poisoned_pages_inc(void) { atomic_long_inc(&num_poisoned_pages); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 74a21042845a..0035e8ca4e49 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -56,6 +56,7 @@ #include #include #include +#include #include "internal.h" #include "ras/ras_event.h" @@ -554,6 +555,148 @@ void collect_procs(struct page *page, struct list_head *tokill, } EXPORT_SYMBOL_GPL(collect_procs); +struct hwp_walk { + struct to_kill tk; + unsigned long pfn; + int flags; +}; + +static void set_to_kill(struct to_kill *tk, unsigned long addr, short shift) +{ + tk->addr = addr; + tk->size_shift = shift; +} + +static int check_hwpoisoned_entry(pte_t pte, unsigned long addr, short shift, + unsigned long poisoned_pfn, struct to_kill *tk) +{ + unsigned long pfn = 0; + + if (pte_present(pte)) { + pfn = pte_pfn(pte); + } else { + swp_entry_t swp = pte_to_swp_entry(pte); + + if (is_hwpoison_entry(swp)) + pfn = hwpoison_entry_to_pfn(swp); + } + + if (!pfn || pfn != poisoned_pfn) + return 0; + + set_to_kill(tk, addr, shift); + return 1; +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr, + struct hwp_walk *hwp) +{ + pmd_t pmd = *pmdp; + unsigned long pfn; + unsigned long hwpoison_vaddr; + + if (!pmd_present(pmd)) + return 0; + pfn = pmd_pfn(pmd); + if (pfn <= hwp->pfn && hwp->pfn < pfn + HPAGE_PMD_NR) { + hwpoison_vaddr = addr + ((hwp->pfn - pfn) << PAGE_SHIFT); + set_to_kill(&hwp->tk, hwpoison_vaddr, PAGE_SHIFT); + return 1; + } + return 0; +} +#else +static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr, + struct hwp_walk *hwp) +{ + return 0; +} +#endif + +static int hwpoison_pte_range(pmd_t *pmdp, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct hwp_walk *hwp = (struct hwp_walk *)walk->private; + int ret = 0; + pte_t *ptep; + spinlock_t *ptl; + + ptl = pmd_trans_huge_lock(pmdp, walk->vma); + if (ptl) { + ret = check_hwpoisoned_pmd_entry(pmdp, addr, hwp); + spin_unlock(ptl); + goto out; + } + + if (pmd_trans_unstable(pmdp)) + goto out; + + ptep = pte_offset_map_lock(walk->vma->vm_mm, pmdp, addr, &ptl); + for (; addr != end; ptep++, addr += PAGE_SIZE) { + ret = check_hwpoisoned_entry(*ptep, addr, PAGE_SHIFT, + hwp->pfn, &hwp->tk); + if (ret == 1) + break; + } + pte_unmap_unlock(ptep - 1, ptl); +out: + cond_resched(); + return ret; +} + +#ifdef CONFIG_HUGETLB_PAGE +static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct hwp_walk *hwp = (struct hwp_walk *)walk->private; + pte_t pte = huge_ptep_get(ptep); + struct hstate *h = hstate_vma(walk->vma); + + return check_hwpoisoned_entry(pte, addr, huge_page_shift(h), + hwp->pfn, &hwp->tk); +} +#else +#define hwpoison_hugetlb_range NULL +#endif + +static struct mm_walk_ops hwp_walk_ops = { + .pmd_entry = hwpoison_pte_range, + .hugetlb_entry = hwpoison_hugetlb_range, +}; + +/* + * Sends SIGBUS to the current process with error info. + * + * This function is intended to handle "Action Required" MCEs on already + * hardware poisoned pages. They could happen, for example, when + * memory_failure() failed to unmap the error page at the first call, or + * when multiple local machine checks happened on different CPUs. + * + * MCE handler currently has no easy access to the error virtual address, + * so this function walks page table to find it. The returned virtual address + * is proper in most cases, but it could be wrong when the application + * process has multiple entries mapping the error page. + */ +static int kill_accessing_process(struct task_struct *p, unsigned long pfn, + int flags) +{ + int ret; + struct hwp_walk priv = { + .pfn = pfn, + }; + priv.tk.tsk = p; + + mmap_read_lock(p->mm); + ret = walk_page_range(p->mm, 0, TASK_SIZE, &hwp_walk_ops, + (void *)&priv); + if (ret == 1 && priv.tk.addr) + kill_proc(&priv.tk, pfn, flags); + mmap_read_unlock(p->mm); + return ret ? -EFAULT : -EHWPOISON; +} + static const char *action_name[] = { [MF_IGNORED] = "Ignored", [MF_FAILED] = "Failed", @@ -1204,7 +1347,10 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags) if (TestSetPageHWPoison(head)) { pr_err("Memory failure: %#lx: already hardware poisoned\n", pfn); - return -EHWPOISON; + res = -EHWPOISON; + if (flags & MF_ACTION_REQUIRED) + res = kill_accessing_process(current, page_to_pfn(head), flags); + return res; } num_poisoned_pages_inc(); @@ -1409,6 +1555,8 @@ int memory_failure(unsigned long pfn, int flags) pr_err("Memory failure: %#lx: already hardware poisoned\n", pfn); res = -EHWPOISON; + if (flags & MF_ACTION_REQUIRED) + res = kill_accessing_process(current, pfn, flags); goto unlock_mutex; } -- Gitee From d1b9413b3213dc72557fc4cdb01ee3ab18d716f4 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 17 Aug 2021 17:29:41 -0700 Subject: [PATCH 548/674] x86/mce: Change to not send SIGBUS error during copy from user mainline inclusion from mainline-v5.16-rc1 commit a6e3cf70b772541c2388abdb86e5a562cfe18e63 category: bugfix bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1 CVE: NA Intel-SIG: commit a6e3cf70b772 x86/mce: Change to not send SIGBUS error during copy from user. Backport for MCA recovery enhance and bug fix. -------------------------------- Sending a SIGBUS for a copy from user is not the correct semantic. System calls should return -EFAULT (or a short count for write(2)). Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210818002942.1607544-3-tony.luck@intel.com Signed-off-by: Youquan Song --- arch/x86/kernel/cpu/mce/core.c | 36 +++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index cdda2198deae..8aeb56f5e577 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1280,7 +1280,7 @@ static void kill_me_maybe(struct callback_head *cb) flags |= MF_MUST_KILL; ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags); - if (!ret && !(p->mce_kflags & MCE_IN_KERNEL_COPYIN)) { + if (!ret) { set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page); sync_core(); return; @@ -1294,15 +1294,21 @@ static void kill_me_maybe(struct callback_head *cb) if (ret == -EHWPOISON) return; - if (p->mce_vaddr != (void __user *)-1l) { - force_sig_mceerr(BUS_MCEERR_AR, p->mce_vaddr, PAGE_SHIFT); - } else { - pr_err("Memory error not recovered"); - kill_me_now(cb); - } + pr_err("Memory error not recovered"); + kill_me_now(cb); } -static void queue_task_work(struct mce *m, char *msg, int kill_current_task) +static void kill_me_never(struct callback_head *cb) +{ + struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me); + + p->mce_count = 0; + pr_err("Kernel accessed poison in user space at %llx\n", p->mce_addr); + if (!memory_failure(p->mce_addr >> PAGE_SHIFT, 0)) + set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page); +} + +static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *)) { int count = ++current->mce_count; @@ -1312,11 +1318,7 @@ static void queue_task_work(struct mce *m, char *msg, int kill_current_task) current->mce_kflags = m->kflags; current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV); current->mce_whole_page = whole_page(m); - - if (kill_current_task) - current->mce_kill_me.func = kill_me_now; - else - current->mce_kill_me.func = kill_me_maybe; + current->mce_kill_me.func = func; } /* Ten is likely overkill. Don't expect more than two faults before task_work() */ @@ -1486,8 +1488,10 @@ noinstr void do_machine_check(struct pt_regs *regs) /* If this triggers there is no way to recover. Die hard. */ BUG_ON(!on_thread_stack() || !user_mode(regs)); - queue_task_work(&m, msg, kill_it); - + if (kill_it) + queue_task_work(&m, msg, kill_me_now); + else + queue_task_work(&m, msg, kill_me_maybe); } else { /* * Handle an MCE which has happened in kernel space but from @@ -1504,7 +1508,7 @@ noinstr void do_machine_check(struct pt_regs *regs) } if (m.kflags & MCE_IN_KERNEL_COPYIN) - queue_task_work(&m, msg, kill_it); + queue_task_work(&m, msg, kill_me_never); } instrumentation_end(); -- Gitee From 4bc1efa1413b9bdfaa10be0f049f19e2a2266253 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 31 May 2021 00:32:44 -0400 Subject: [PATCH 549/674] generic_perform_write()/iomap_write_actor(): saner logics for short copy mainline inclusion from mainline-v5.13 commit bc1bb416bbb9203e250f5c49aaf1d11b5d9c8adb category: bugfix bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1 CVE: NA Intel-SIG: commit bc1bb416bbb9 generic_perform_write()/iomap_write_actor(): saner logics for short copy. Backport for MCA recovery enhance and bug fix. -------------------------------- if we run into a short copy and ->write_end() refuses to advance at all, use the amount we'd managed to copy for the next iteration to handle. Signed-off-by: Al Viro Signed-off-by: Youquan Song --- fs/iomap/buffered-io.c | 25 ++++++++++--------------- mm/filemap.c | 24 +++++++++--------------- 2 files changed, 19 insertions(+), 30 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 3ec494f5d7ee..d5246e277f17 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -771,10 +771,6 @@ iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data, * Otherwise there's a nasty deadlock on copying from the * same page as we're writing to, without it being marked * up-to-date. - * - * Not only is this an optimisation, but it is also required - * to check that the address is actually valid, when atomic - * usercopies are used, below. */ if (unlikely(iov_iter_fault_in_readable(i, bytes))) { status = -EFAULT; @@ -791,25 +787,24 @@ iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data, copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); - copied = iomap_write_end(inode, pos, bytes, copied, page, iomap, + status = iomap_write_end(inode, pos, bytes, copied, page, iomap, srcmap); cond_resched(); - iov_iter_advance(i, copied); - if (unlikely(copied == 0)) { + if (unlikely(status == 0)) { /* - * If we were unable to copy any data at all, we must - * fall back to a single segment length write. - * - * If we didn't fallback here, we could livelock - * because not all segments in the iov can be copied at - * once without a pagefault. + * A short copy made iomap_write_end() reject the + * thing entirely. Might be memory poisoning + * halfway through, might be a race with munmap, + * might be severe memory pressure. */ - bytes = min_t(unsigned long, PAGE_SIZE - offset, - iov_iter_single_seg_count(i)); + if (copied) + bytes = copied; goto again; } + copied = status; + iov_iter_advance(i, copied); pos += copied; written += copied; length -= copied; diff --git a/mm/filemap.c b/mm/filemap.c index 3958fc3280d8..80c5c418eba8 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3510,10 +3510,6 @@ ssize_t generic_perform_write(struct file *file, * Otherwise there's a nasty deadlock on copying from the * same page as we're writing to, without it being marked * up-to-date. - * - * Not only is this an optimisation, but it is also required - * to check that the address is actually valid, when atomic - * usercopies are used, below. */ if (unlikely(iov_iter_fault_in_readable(i, bytes))) { status = -EFAULT; @@ -3540,24 +3536,22 @@ ssize_t generic_perform_write(struct file *file, page, fsdata); if (unlikely(status < 0)) break; - copied = status; cond_resched(); - iov_iter_advance(i, copied); - if (unlikely(copied == 0)) { + if (unlikely(status == 0)) { /* - * If we were unable to copy any data at all, we must - * fall back to a single segment length write. - * - * If we didn't fallback here, we could livelock - * because not all segments in the iov can be copied at - * once without a pagefault. + * A short copy made ->write_end() reject the + * thing entirely. Might be memory poisoning + * halfway through, might be a race with munmap, + * might be severe memory pressure. */ - bytes = min_t(unsigned long, PAGE_SIZE - offset, - iov_iter_single_seg_count(i)); + if (copied) + bytes = copied; goto again; } + copied = status; + iov_iter_advance(i, copied); pos += copied; written += copied; -- Gitee From a1728f8de83520f93310fc7409c502358ca46fdd Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 17 Aug 2021 17:29:42 -0700 Subject: [PATCH 550/674] x86/mce: Drop copyin special case for #MC mainline inclusion from mainline-v5.16-rc1 commit 690658471b5f28d306e6492c4585d748cb5304e8 category: bugfix bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1 CVE: NA Intel-SIG: commit 690658471b5f x86/mce: Drop copyin special case for #MC. Backport for MCA recovery enhancing & bug fix. -------------------------------- Fixes to the iterator code to handle faults that are not on page boundaries mean that the special case for machine check during copy from user is no longer needed. For a full list of those fixes, see the output of: git log --oneline v5.14 ^v5.13 -- lib/iov_iter.c Intel-SIG: commit 690658471b5f x86/mce: Drop copyin special case for #MC. backport for MCA recovery enhance and bug fix. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210818002942.1607544-4-tony.luck@intel.com Signed-off-by: Youquan Song --- arch/x86/lib/copy_user_64.S | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 77b9b2a3b5c8..e0e71ca023ce 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -234,24 +234,11 @@ EXPORT_SYMBOL(copy_user_enhanced_fast_string) */ SYM_CODE_START_LOCAL(.Lcopy_user_handle_tail) movl %edx,%ecx - cmp $X86_TRAP_MC,%eax /* check if X86_TRAP_MC */ - je 3f 1: rep movsb 2: mov %ecx,%eax ASM_CLAC ret - /* - * Return zero to pretend that this copy succeeded. This - * is counter-intuitive, but needed to prevent the code - * in lib/iov_iter.c from retrying and running back into - * the poison cache line again. The machine check handler - * will ensure that a SIGBUS is sent to the task. - */ -3: xorl %eax,%eax - ASM_CLAC - ret - _ASM_EXTABLE_CPY(1b, 2b) SYM_CODE_END(.Lcopy_user_handle_tail) -- Gitee From 3ea63cb74c8cf3446e0caf8c32682ae92d31a58f Mon Sep 17 00:00:00 2001 From: Youquan Song Date: Thu, 23 Dec 2021 12:07:01 -0800 Subject: [PATCH 551/674] x86/mce: Reduce number of machine checks taken during recovery mainline inclusion from mainline-v5.17-rc1 commit 3376136300a00df9a864b88fa969177d6c3be8e5 category: bugfix bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1 CVE: NA Intel-SIG: commit 3376136300a0 x86/mce: Reduce number of machine checks taken during recovery. Backport for MCA recovery enhancing & bug fix. -------------------------------- When any of the copy functions in arch/x86/lib/copy_user_64.S take a fault, the fixup code copies the remaining byte count from %ecx to %edx and unconditionally jumps to .Lcopy_user_handle_tail to continue the copy in case any more bytes can be copied. If the fault was #PF this may copy more bytes (because the page fault handler might have fixed the fault). But when the fault is a machine check the original copy code will have copied all the way to the poisoned cache line. So .Lcopy_user_handle_tail will just take another machine check for no good reason. Every code path to .Lcopy_user_handle_tail comes from an exception fixup path, so add a check there to check the trap type (in %eax) and simply return the count of remaining bytes if the trap was a machine check. Doing this reduces the number of machine checks taken during synthetic tests from four to three. As well as reducing the number of machine checks, this also allows Skylake generation Xeons to recover some cases that currently fail. The is because REP; MOVSB is only recoverable when source and destination are well aligned and the byte count is large. That useless call to .Lcopy_user_handle_tail may violate one or more of these conditions and generate a fatal machine check. [ Tony: Add more details to commit message. ] [ bp: Fixup comment. Also, another tip patchset which is adding straight-line speculation mitigation changes the "ret" instruction to an all-caps macro "RET". But, since gas is case-insensitive, use "RET" in the newly added asm block already in order to simplify tip branch merging on its way upstream. ] Signed-off-by: Youquan Song Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/YcTW5dh8yTGucDd+@agluck-desk2.amr.corp.intel.com Signed-off-by: Youquan Song --- arch/x86/lib/copy_user_64.S | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index e0e71ca023ce..403ba3eb4b84 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -225,6 +225,7 @@ EXPORT_SYMBOL(copy_user_enhanced_fast_string) * Don't try to copy the tail if machine check happened * * Input: + * eax trap number written by ex_handler_copy() * rdi destination * rsi source * rdx count @@ -233,12 +234,20 @@ EXPORT_SYMBOL(copy_user_enhanced_fast_string) * eax uncopied bytes or 0 if successful. */ SYM_CODE_START_LOCAL(.Lcopy_user_handle_tail) + cmp $X86_TRAP_MC,%eax + je 3f + movl %edx,%ecx 1: rep movsb 2: mov %ecx,%eax ASM_CLAC ret +3: + movl %edx,%eax + ASM_CLAC + RET + _ASM_EXTABLE_CPY(1b, 2b) SYM_CODE_END(.Lcopy_user_handle_tail) -- Gitee From 297f05904ec5352fa767b706f70b7bf9d88ab986 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Sun, 6 Feb 2022 15:10:14 +0800 Subject: [PATCH 552/674] mm/hwpoison: fix error page recovered but reported "not recovered" mainline inclusion from mainline-v5.18-rc1 commit 046545a661af2beec21de7b90ca0e35f05088a81 category: bugfix bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1 CVE: NA Intel-SIG: commit 046545a661af mm/hwpoison: fix error page recovered but reported "not recovered". Backport for MCA recovery enhancing & bug fix. -------------------------------- When an uncorrected memory error is consumed there is a race between the CMCI from the memory controller reporting an uncorrected error with a UCNA signature, and the core reporting and SRAR signature machine check when the data is about to be consumed. If the CMCI wins that race, the page is marked poisoned when uc_decode_notifier() calls memory_failure() and the machine check processing code finds the page already poisoned. It calls kill_accessing_process() to make sure a SIGBUS is sent. But returns the wrong error code. Console log looks like this: mce: Uncorrected hardware memory error in user-access at 3710b3400 Memory failure: 0x3710b3: recovery action for dirty LRU page: Recovered Memory failure: 0x3710b3: already hardware poisoned Memory failure: 0x3710b3: Sending SIGBUS to einj_mem_uc:361438 due to hardware memory corruption mce: Memory error not recovered kill_accessing_process() is supposed to return -EHWPOISON to notify that SIGBUS is already set to the process and kill_me_maybe() doesn't have to send it again. But current code simply fails to do this, so fix it to make sure to work as intended. This change avoids the noise message "Memory error not recovered" and skips duplicate SIGBUSs. [tony.luck@intel.com: reword some parts of commit message] Link: https://lkml.kernel.org/r/20220113231117.1021405-1-naoya.horiguchi@linux.dev Fixes: a3f5d80ea401 ("mm,hwpoison: send SIGBUS with error virutal address") Signed-off-by: Naoya Horiguchi Reported-by: Youquan Song Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Youquan Song --- mm/memory-failure.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 0035e8ca4e49..7c7769ff0f1d 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -693,8 +693,10 @@ static int kill_accessing_process(struct task_struct *p, unsigned long pfn, (void *)&priv); if (ret == 1 && priv.tk.addr) kill_proc(&priv.tk, pfn, flags); + else + ret = 0; mmap_read_unlock(p->mm); - return ret ? -EFAULT : -EHWPOISON; + return ret > 0 ? -EHWPOISON : -EFAULT; } static const char *action_name[] = { -- Gitee From 928722d827c692d23c11f40980c7edc536a58cd9 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Tue, 17 Nov 2020 20:49:52 +0800 Subject: [PATCH 553/674] EDAC: Add DDR5 new memory type mainline inclusion from mainline-v5.13 commit bc1c99a5971aa7571e8b9731c28fa32abe12cab8 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1 CVE: NA Intel_SIG: commit bc1c99a5971a EDAC: Add DDR5 new memory type. Backport for EDAC enhancing & bug fix. -------------------------------- Add a new entry to 'enum mem_type' and a new string to 'edac_mem_types[]' for DDR5 new memory type. Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Signed-off-by: Youquan Song --- drivers/edac/edac_mc.c | 1 + include/linux/edac.h | 3 +++ 2 files changed, 4 insertions(+) diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index f4eb071327be..34514c638f19 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -161,6 +161,7 @@ const char * const edac_mem_types[] = { [MEM_DDR4] = "Unbuffered-DDR4", [MEM_RDDR4] = "Registered-DDR4", [MEM_LRDDR4] = "Load-Reduced-DDR4-RAM", + [MEM_DDR5] = "Unbuffered-DDR5", [MEM_NVDIMM] = "Non-volatile-RAM", }; EXPORT_SYMBOL_GPL(edac_mem_types); diff --git a/include/linux/edac.h b/include/linux/edac.h index 15e8f3d8a895..6c4565cc6273 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -179,6 +179,7 @@ static inline char *mc_event_error_type(const unsigned int err_type) * @MEM_RDDR4: Registered DDR4 RAM * This is a variant of the DDR4 memories. * @MEM_LRDDR4: Load-Reduced DDR4 memory. + * @MEM_DDR5: Unbuffered DDR5 RAM * @MEM_NVDIMM: Non-volatile RAM */ enum mem_type { @@ -203,6 +204,7 @@ enum mem_type { MEM_DDR4, MEM_RDDR4, MEM_LRDDR4, + MEM_DDR5, MEM_NVDIMM, }; @@ -226,6 +228,7 @@ enum mem_type { #define MEM_FLAG_DDR4 BIT(MEM_DDR4) #define MEM_FLAG_RDDR4 BIT(MEM_RDDR4) #define MEM_FLAG_LRDDR4 BIT(MEM_LRDDR4) +#define MEM_FLAG_DDR5 BIT(MEM_DDR5) #define MEM_FLAG_NVDIMM BIT(MEM_NVDIMM) /** -- Gitee From 432fad27ed99567be07b4ebfbb7aa50b8856c83e Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Tue, 17 Nov 2020 20:49:53 +0800 Subject: [PATCH 554/674] EDAC/i10nm: Add Intel Sapphire Rapids server support mainline inclusion from mainline-v5.11-rc1 commit 479f58dda25bb46daeb937f124718e8b4aea6781 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1 CVE: NA Intel_SIG: commit 479f58dda25b EDAC/i10nm: Add Intel Sapphire Rapids server support. Backport for add EDAC SPR suppporting. -------------------------------- The Sapphire Rapids CPU model shares the same memory controller architecture with Ice Lake server. There are some configurations different from Ice Lake server as below: - The device ID for configuration agent. - The size for per channel memory-mapped I/O. - The DDR5 memory support. So add the above configurations and the Sapphire Rapids CPU model ID for EDAC support. Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Signed-off-by: Youquan Song --- drivers/edac/i10nm_base.c | 34 +++++++++++++++++++++++++--------- drivers/edac/skx_base.c | 6 +++--- drivers/edac/skx_common.c | 23 ++++++++++++++++++----- drivers/edac/skx_common.h | 16 ++++++++++++---- 4 files changed, 58 insertions(+), 21 deletions(-) diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 3a7362f968c9..4f7f9970a901 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -13,7 +13,7 @@ #include "edac_module.h" #include "skx_common.h" -#define I10NM_REVISION "v0.0.3" +#define I10NM_REVISION "v0.0.4" #define EDAC_MOD_STR "i10nm_edac" /* Debug macros */ @@ -25,11 +25,13 @@ #define I10NM_GET_IMC_BAR(d, i, reg) \ pci_read_config_dword((d)->uracu, 0xd8 + (i) * 4, &(reg)) #define I10NM_GET_DIMMMTR(m, i, j) \ - readl((m)->mbase + 0x2080c + (i) * 0x4000 + (j) * 4) + readl((m)->mbase + 0x2080c + (i) * (m)->chan_mmio_sz + (j) * 4) #define I10NM_GET_MCDDRTCFG(m, i) \ - readl((m)->mbase + 0x20970 + (i) * 0x4000) + readl((m)->mbase + 0x20970 + (i) * (m)->chan_mmio_sz) #define I10NM_GET_MCMTR(m, i) \ - readl((m)->mbase + 0x20ef8 + (i) * 0x4000) + readl((m)->mbase + 0x20ef8 + (i) * (m)->chan_mmio_sz) +#define I10NM_GET_AMAP(m, i) \ + readl((m)->mbase + 0x20814 + (i) * (m)->chan_mmio_sz) #define I10NM_GET_SCK_MMIO_BASE(reg) (GET_BITFIELD(reg, 0, 28) << 23) #define I10NM_GET_IMC_MMIO_OFFSET(reg) (GET_BITFIELD(reg, 0, 10) << 12) @@ -129,12 +131,22 @@ static struct res_config i10nm_cfg0 = { .type = I10NM, .decs_did = 0x3452, .busno_cfg_offset = 0xcc, + .ddr_chan_mmio_sz = 0x4000, }; static struct res_config i10nm_cfg1 = { .type = I10NM, .decs_did = 0x3452, .busno_cfg_offset = 0xd0, + .ddr_chan_mmio_sz = 0x4000, +}; + +static struct res_config spr_cfg = { + .type = SPR, + .decs_did = 0x3252, + .busno_cfg_offset = 0xd0, + .ddr_chan_mmio_sz = 0x8000, + .support_ddr5 = true, }; static const struct x86_cpu_id i10nm_cpuids[] = { @@ -143,6 +155,7 @@ static const struct x86_cpu_id i10nm_cpuids[] = { X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(ICELAKE_X, X86_STEPPINGS(0x0, 0x3), &i10nm_cfg0), X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(ICELAKE_X, X86_STEPPINGS(0x4, 0xf), &i10nm_cfg1), X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(ICELAKE_D, X86_STEPPINGS(0x0, 0xf), &i10nm_cfg1), + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SAPPHIRERAPIDS_X, X86_STEPPINGS(0x0, 0xf), &spr_cfg), {} }; MODULE_DEVICE_TABLE(x86cpu, i10nm_cpuids); @@ -157,12 +170,13 @@ static bool i10nm_check_ecc(struct skx_imc *imc, int chan) return !!GET_BITFIELD(mcmtr, 2, 2); } -static int i10nm_get_dimm_config(struct mem_ctl_info *mci) +static int i10nm_get_dimm_config(struct mem_ctl_info *mci, + struct res_config *cfg) { struct skx_pvt *pvt = mci->pvt_info; struct skx_imc *imc = pvt->imc; + u32 mtr, amap, mcddrtcfg; struct dimm_info *dimm; - u32 mtr, mcddrtcfg; int i, j, ndimms; for (i = 0; i < I10NM_NUM_CHANNELS; i++) { @@ -171,6 +185,7 @@ static int i10nm_get_dimm_config(struct mem_ctl_info *mci) ndimms = 0; mcddrtcfg = I10NM_GET_MCDDRTCFG(imc, i); + amap = I10NM_GET_AMAP(imc, i); for (j = 0; j < I10NM_NUM_DIMMS; j++) { dimm = edac_get_dimm(mci, i, j, 0); mtr = I10NM_GET_DIMMMTR(imc, i, j); @@ -178,8 +193,8 @@ static int i10nm_get_dimm_config(struct mem_ctl_info *mci) mtr, mcddrtcfg, imc->mc, i, j); if (IS_DIMM_PRESENT(mtr)) - ndimms += skx_get_dimm_info(mtr, 0, 0, dimm, - imc, i, j); + ndimms += skx_get_dimm_info(mtr, 0, amap, dimm, + imc, i, j, cfg); else if (IS_NVDIMM_PRESENT(mcddrtcfg, j)) ndimms += skx_get_nvdimm_info(dimm, imc, i, j, EDAC_MOD_STR); @@ -306,10 +321,11 @@ static int __init i10nm_init(void) d->imc[i].lmc = i; d->imc[i].src_id = src_id; d->imc[i].node_id = node_id; + d->imc[i].chan_mmio_sz = cfg->ddr_chan_mmio_sz; rc = skx_register_mci(&d->imc[i], d->imc[i].mdev, "Intel_10nm Socket", EDAC_MOD_STR, - i10nm_get_dimm_config); + i10nm_get_dimm_config, cfg); if (rc < 0) goto fail; } diff --git a/drivers/edac/skx_base.c b/drivers/edac/skx_base.c index f887e3166651..4dbd46575bfb 100644 --- a/drivers/edac/skx_base.c +++ b/drivers/edac/skx_base.c @@ -174,7 +174,7 @@ static bool skx_check_ecc(u32 mcmtr) return !!GET_BITFIELD(mcmtr, 2, 2); } -static int skx_get_dimm_config(struct mem_ctl_info *mci) +static int skx_get_dimm_config(struct mem_ctl_info *mci, struct res_config *cfg) { struct skx_pvt *pvt = mci->pvt_info; u32 mtr, mcmtr, amap, mcddrtcfg; @@ -195,7 +195,7 @@ static int skx_get_dimm_config(struct mem_ctl_info *mci) pci_read_config_dword(imc->chan[i].cdev, 0x80 + 4 * j, &mtr); if (IS_DIMM_PRESENT(mtr)) { - ndimms += skx_get_dimm_info(mtr, mcmtr, amap, dimm, imc, i, j); + ndimms += skx_get_dimm_info(mtr, mcmtr, amap, dimm, imc, i, j, cfg); } else if (IS_NVDIMM_PRESENT(mcddrtcfg, j)) { ndimms += skx_get_nvdimm_info(dimm, imc, i, j, EDAC_MOD_STR); @@ -705,7 +705,7 @@ static int __init skx_init(void) d->imc[i].node_id = node_id; rc = skx_register_mci(&d->imc[i], d->imc[i].chan[0].cdev, "Skylake Socket", EDAC_MOD_STR, - skx_get_dimm_config); + skx_get_dimm_config, cfg); if (rc < 0) goto fail; } diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index 2b4ce8e5ac2f..81c3e2ec6f56 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -304,15 +304,25 @@ static int skx_get_dimm_attr(u32 reg, int lobit, int hibit, int add, #define numcol(reg) skx_get_dimm_attr(reg, 0, 1, 10, 0, 2, "cols") int skx_get_dimm_info(u32 mtr, u32 mcmtr, u32 amap, struct dimm_info *dimm, - struct skx_imc *imc, int chan, int dimmno) + struct skx_imc *imc, int chan, int dimmno, + struct res_config *cfg) { - int banks = 16, ranks, rows, cols, npages; + int banks, ranks, rows, cols, npages; + enum mem_type mtype; u64 size; ranks = numrank(mtr); rows = numrow(mtr); cols = numcol(mtr); + if (cfg->support_ddr5 && (amap & 0x8)) { + banks = 32; + mtype = MEM_DDR5; + } else { + banks = 16; + mtype = MEM_DDR4; + } + /* * Compute size in 8-byte (2^3) words, then shift to MiB (2^20) */ @@ -332,7 +342,7 @@ int skx_get_dimm_info(u32 mtr, u32 mcmtr, u32 amap, struct dimm_info *dimm, dimm->nr_pages = npages; dimm->grain = 32; dimm->dtype = get_width(mtr); - dimm->mtype = MEM_DDR4; + dimm->mtype = mtype; dimm->edac_mode = EDAC_SECDED; /* likely better than this */ snprintf(dimm->label, sizeof(dimm->label), "CPU_SrcID#%u_MC#%u_Chan#%u_DIMM#%u", imc->src_id, imc->lmc, chan, dimmno); @@ -390,7 +400,8 @@ int skx_get_nvdimm_info(struct dimm_info *dimm, struct skx_imc *imc, int skx_register_mci(struct skx_imc *imc, struct pci_dev *pdev, const char *ctl_name, const char *mod_str, - get_dimm_config_f get_dimm_config) + get_dimm_config_f get_dimm_config, + struct res_config *cfg) { struct mem_ctl_info *mci; struct edac_mc_layer layers[2]; @@ -425,13 +436,15 @@ int skx_register_mci(struct skx_imc *imc, struct pci_dev *pdev, } mci->mtype_cap = MEM_FLAG_DDR4 | MEM_FLAG_NVDIMM; + if (cfg->support_ddr5) + mci->mtype_cap |= MEM_FLAG_DDR5; mci->edac_ctl_cap = EDAC_FLAG_NONE; mci->edac_cap = EDAC_FLAG_NONE; mci->mod_name = mod_str; mci->dev_name = pci_name(pdev); mci->ctl_page_to_phys = NULL; - rc = get_dimm_config(mci); + rc = get_dimm_config(mci, cfg); if (rc < 0) goto fail; diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index 78f8c1de0b71..bf56bebff138 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -59,6 +59,7 @@ struct skx_dev { struct mem_ctl_info *mci; struct pci_dev *mdev; /* for i10nm CPU */ void __iomem *mbase; /* for i10nm CPU */ + int chan_mmio_sz; /* for i10nm CPU */ u8 mc; /* system wide mc# */ u8 lmc; /* socket relative mc# */ u8 src_id, node_id; @@ -82,7 +83,8 @@ struct skx_pvt { enum type { SKX, - I10NM + I10NM, + SPR }; enum { @@ -118,9 +120,13 @@ struct res_config { unsigned int decs_did; /* Default bus number configuration register offset */ int busno_cfg_offset; + /* Per DDR channel memory-mapped I/O size */ + int ddr_chan_mmio_sz; + bool support_ddr5; }; -typedef int (*get_dimm_config_f)(struct mem_ctl_info *mci); +typedef int (*get_dimm_config_f)(struct mem_ctl_info *mci, + struct res_config *cfg); typedef bool (*skx_decode_f)(struct decoded_addr *res); typedef void (*skx_show_retry_log_f)(struct decoded_addr *res, char *msg, int len); @@ -136,14 +142,16 @@ int skx_get_all_bus_mappings(struct res_config *cfg, struct list_head **list); int skx_get_hi_lo(unsigned int did, int off[], u64 *tolm, u64 *tohm); int skx_get_dimm_info(u32 mtr, u32 mcmtr, u32 amap, struct dimm_info *dimm, - struct skx_imc *imc, int chan, int dimmno); + struct skx_imc *imc, int chan, int dimmno, + struct res_config *cfg); int skx_get_nvdimm_info(struct dimm_info *dimm, struct skx_imc *imc, int chan, int dimmno, const char *mod_str); int skx_register_mci(struct skx_imc *imc, struct pci_dev *pdev, const char *ctl_name, const char *mod_str, - get_dimm_config_f get_dimm_config); + get_dimm_config_f get_dimm_config, + struct res_config *cfg); int skx_mce_check_error(struct notifier_block *nb, unsigned long val, void *data); -- Gitee From 3dd707aa7c8e5d5cda72390ee0d22508944bdb77 Mon Sep 17 00:00:00 2001 From: Youquan Song Date: Wed, 18 Aug 2021 10:57:01 -0700 Subject: [PATCH 555/674] EDAC/i10nm: Retrieve and print retry_rd_err_log registers mainline inclusion from mainline-v5.15-rc1 commit cf4e6d52f58399c777276172ec250502e19d5e63 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1 CVE: NA Intel-SIG: commit cf4e6d52f583 EDAC/i10nm: Retrieve and print retry_rd_err_log registers. Backport for EDAC retry_rd_err_log for ICX/SPR support. -------------------------------- Retrieve and print retry_rd_err_log registers like the earlier change: commit e80634a75aba ("EDAC, skx: Retrieve and print retry_rd_err_log registers") This is a little trickier than on Skylake because of potential interference with BIOS use of the same registers. The default behavior is to ignore these registers. A module parameter retry_rd_err_log(default=0) controls the mode of operation: - 0=off : Default. - 1=bios : Linux doesn't reset any control bits, but just reports values. This is "no harm" mode, but it may miss reporting some data. - 2=linux: Linux tries to take control and resets mode bits, clears valid/UC bits after reading. This should be more reliable (especially if BIOS interference is reduced by disabling eMCA reporting mode in BIOS setup). Co-developed-by: Qiuxu Zhuo Signed-off-by: Qiuxu Zhuo Signed-off-by: Youquan Song Signed-off-by: Tony Luck Link: https://lore.kernel.org/r/20210818175701.1611513-3-tony.luck@intel.com Signed-off-by: Youquan Song --- drivers/edac/i10nm_base.c | 146 ++++++++++++++++++++++++++++++++++++++ drivers/edac/skx_base.c | 3 +- drivers/edac/skx_common.c | 4 +- drivers/edac/skx_common.h | 7 +- 4 files changed, 157 insertions(+), 3 deletions(-) diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 4f7f9970a901..7798de1e2aed 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -32,14 +32,137 @@ readl((m)->mbase + 0x20ef8 + (i) * (m)->chan_mmio_sz) #define I10NM_GET_AMAP(m, i) \ readl((m)->mbase + 0x20814 + (i) * (m)->chan_mmio_sz) +#define I10NM_GET_REG32(m, i, offset) \ + readl((m)->mbase + (i) * (m)->chan_mmio_sz + (offset)) +#define I10NM_GET_REG64(m, i, offset) \ + readq((m)->mbase + (i) * (m)->chan_mmio_sz + (offset)) +#define I10NM_SET_REG32(m, i, offset, v) \ + writel(v, (m)->mbase + (i) * (m)->chan_mmio_sz + (offset)) #define I10NM_GET_SCK_MMIO_BASE(reg) (GET_BITFIELD(reg, 0, 28) << 23) #define I10NM_GET_IMC_MMIO_OFFSET(reg) (GET_BITFIELD(reg, 0, 10) << 12) #define I10NM_GET_IMC_MMIO_SIZE(reg) ((GET_BITFIELD(reg, 13, 23) - \ GET_BITFIELD(reg, 0, 10) + 1) << 12) +#define RETRY_RD_ERR_LOG_UC BIT(1) +#define RETRY_RD_ERR_LOG_NOOVER BIT(14) +#define RETRY_RD_ERR_LOG_EN BIT(15) +#define RETRY_RD_ERR_LOG_NOOVER_UC (BIT(14) | BIT(1)) +#define RETRY_RD_ERR_LOG_OVER_UC_V (BIT(2) | BIT(1) | BIT(0)) + static struct list_head *i10nm_edac_list; +static struct res_config *res_cfg; +static int retry_rd_err_log; + +static u32 offsets_scrub_icx[] = {0x22c60, 0x22c54, 0x22c5c, 0x22c58, 0x22c28, 0x20ed8}; +static u32 offsets_scrub_spr[] = {0x22c60, 0x22c54, 0x22f08, 0x22c58, 0x22c28, 0x20ed8}; +static u32 offsets_demand_icx[] = {0x22e54, 0x22e60, 0x22e64, 0x22e58, 0x22e5c, 0x20ee0}; +static u32 offsets_demand_spr[] = {0x22e54, 0x22e60, 0x22f10, 0x22e58, 0x22e5c, 0x20ee0}; + +static void __enable_retry_rd_err_log(struct skx_imc *imc, int chan, bool enable) +{ + u32 s, d; + + if (!imc->mbase) + return; + + s = I10NM_GET_REG32(imc, chan, res_cfg->offsets_scrub[0]); + d = I10NM_GET_REG32(imc, chan, res_cfg->offsets_demand[0]); + + if (enable) { + /* Save default configurations */ + imc->chan[chan].retry_rd_err_log_s = s; + imc->chan[chan].retry_rd_err_log_d = d; + + s &= ~RETRY_RD_ERR_LOG_NOOVER_UC; + s |= RETRY_RD_ERR_LOG_EN; + d &= ~RETRY_RD_ERR_LOG_NOOVER_UC; + d |= RETRY_RD_ERR_LOG_EN; + } else { + /* Restore default configurations */ + if (imc->chan[chan].retry_rd_err_log_s & RETRY_RD_ERR_LOG_UC) + s |= RETRY_RD_ERR_LOG_UC; + if (imc->chan[chan].retry_rd_err_log_s & RETRY_RD_ERR_LOG_NOOVER) + s |= RETRY_RD_ERR_LOG_NOOVER; + if (!(imc->chan[chan].retry_rd_err_log_s & RETRY_RD_ERR_LOG_EN)) + s &= ~RETRY_RD_ERR_LOG_EN; + if (imc->chan[chan].retry_rd_err_log_d & RETRY_RD_ERR_LOG_UC) + d |= RETRY_RD_ERR_LOG_UC; + if (imc->chan[chan].retry_rd_err_log_d & RETRY_RD_ERR_LOG_NOOVER) + d |= RETRY_RD_ERR_LOG_NOOVER; + if (!(imc->chan[chan].retry_rd_err_log_d & RETRY_RD_ERR_LOG_EN)) + d &= ~RETRY_RD_ERR_LOG_EN; + } + + I10NM_SET_REG32(imc, chan, res_cfg->offsets_scrub[0], s); + I10NM_SET_REG32(imc, chan, res_cfg->offsets_demand[0], d); +} + +static void enable_retry_rd_err_log(bool enable) +{ + struct skx_dev *d; + int i, j; + + edac_dbg(2, "\n"); + + list_for_each_entry(d, i10nm_edac_list, list) + for (i = 0; i < I10NM_NUM_IMC; i++) + for (j = 0; j < I10NM_NUM_CHANNELS; j++) + __enable_retry_rd_err_log(&d->imc[i], j, enable); +} + +static void show_retry_rd_err_log(struct decoded_addr *res, char *msg, + int len, bool scrub_err) +{ + struct skx_imc *imc = &res->dev->imc[res->imc]; + u32 log0, log1, log2, log3, log4; + u32 corr0, corr1, corr2, corr3; + u64 log2a, log5; + u32 *offsets; + int n; + + if (!imc->mbase) + return; + + offsets = scrub_err ? res_cfg->offsets_scrub : res_cfg->offsets_demand; + + log0 = I10NM_GET_REG32(imc, res->channel, offsets[0]); + log1 = I10NM_GET_REG32(imc, res->channel, offsets[1]); + log3 = I10NM_GET_REG32(imc, res->channel, offsets[3]); + log4 = I10NM_GET_REG32(imc, res->channel, offsets[4]); + log5 = I10NM_GET_REG64(imc, res->channel, offsets[5]); + + if (res_cfg->type == SPR) { + log2a = I10NM_GET_REG64(imc, res->channel, offsets[2]); + n = snprintf(msg, len, " retry_rd_err_log[%.8x %.8x %.16llx %.8x %.8x %.16llx]", + log0, log1, log2a, log3, log4, log5); + } else { + log2 = I10NM_GET_REG32(imc, res->channel, offsets[2]); + n = snprintf(msg, len, " retry_rd_err_log[%.8x %.8x %.8x %.8x %.8x %.16llx]", + log0, log1, log2, log3, log4, log5); + } + + corr0 = I10NM_GET_REG32(imc, res->channel, 0x22c18); + corr1 = I10NM_GET_REG32(imc, res->channel, 0x22c1c); + corr2 = I10NM_GET_REG32(imc, res->channel, 0x22c20); + corr3 = I10NM_GET_REG32(imc, res->channel, 0x22c24); + + if (len - n > 0) + snprintf(msg + n, len - n, + " correrrcnt[%.4x %.4x %.4x %.4x %.4x %.4x %.4x %.4x]", + corr0 & 0xffff, corr0 >> 16, + corr1 & 0xffff, corr1 >> 16, + corr2 & 0xffff, corr2 >> 16, + corr3 & 0xffff, corr3 >> 16); + + /* Clear status bits */ + if (retry_rd_err_log == 2 && (log0 & RETRY_RD_ERR_LOG_OVER_UC_V)) { + log0 &= ~RETRY_RD_ERR_LOG_OVER_UC_V; + I10NM_SET_REG32(imc, res->channel, offsets[0], log0); + } +} + static struct pci_dev *pci_get_dev_wrapper(int dom, unsigned int bus, unsigned int dev, unsigned int fun) { @@ -132,6 +255,8 @@ static struct res_config i10nm_cfg0 = { .decs_did = 0x3452, .busno_cfg_offset = 0xcc, .ddr_chan_mmio_sz = 0x4000, + .offsets_scrub = offsets_scrub_icx, + .offsets_demand = offsets_demand_icx, }; static struct res_config i10nm_cfg1 = { @@ -139,6 +264,8 @@ static struct res_config i10nm_cfg1 = { .decs_did = 0x3452, .busno_cfg_offset = 0xd0, .ddr_chan_mmio_sz = 0x4000, + .offsets_scrub = offsets_scrub_icx, + .offsets_demand = offsets_demand_icx, }; static struct res_config spr_cfg = { @@ -147,6 +274,8 @@ static struct res_config spr_cfg = { .busno_cfg_offset = 0xd0, .ddr_chan_mmio_sz = 0x8000, .support_ddr5 = true, + .offsets_scrub = offsets_scrub_spr, + .offsets_demand = offsets_demand_spr, }; static const struct x86_cpu_id i10nm_cpuids[] = { @@ -286,6 +415,7 @@ static int __init i10nm_init(void) return -ENODEV; cfg = (struct res_config *)id->driver_data; + res_cfg = cfg; rc = skx_get_hi_lo(0x09a2, off, &tolm, &tohm); if (rc) @@ -339,6 +469,12 @@ static int __init i10nm_init(void) mce_register_decode_chain(&i10nm_mce_dec); setup_i10nm_debug(); + if (retry_rd_err_log && res_cfg->offsets_scrub && res_cfg->offsets_demand) { + skx_set_decode(NULL, show_retry_rd_err_log); + if (retry_rd_err_log == 2) + enable_retry_rd_err_log(true); + } + i10nm_printk(KERN_INFO, "%s\n", I10NM_REVISION); return 0; @@ -350,6 +486,13 @@ static int __init i10nm_init(void) static void __exit i10nm_exit(void) { edac_dbg(2, "\n"); + + if (retry_rd_err_log && res_cfg->offsets_scrub && res_cfg->offsets_demand) { + skx_set_decode(NULL, NULL); + if (retry_rd_err_log == 2) + enable_retry_rd_err_log(false); + } + teardown_i10nm_debug(); mce_unregister_decode_chain(&i10nm_mce_dec); skx_adxl_put(); @@ -359,5 +502,8 @@ static void __exit i10nm_exit(void) module_init(i10nm_init); module_exit(i10nm_exit); +module_param(retry_rd_err_log, int, 0444); +MODULE_PARM_DESC(retry_rd_err_log, "retry_rd_err_log: 0=off(default), 1=bios(Linux doesn't reset any control bits, but just reports values.), 2=linux(Linux tries to take control and resets mode bits, clear valid/UC bits after reading.)"); + MODULE_LICENSE("GPL v2"); MODULE_DESCRIPTION("MC Driver for Intel 10nm server processors"); diff --git a/drivers/edac/skx_base.c b/drivers/edac/skx_base.c index 4dbd46575bfb..1abc020d49ab 100644 --- a/drivers/edac/skx_base.c +++ b/drivers/edac/skx_base.c @@ -230,7 +230,8 @@ static int skx_get_dimm_config(struct mem_ctl_info *mci, struct res_config *cfg) #define SKX_ILV_TARGET(tgt) ((tgt) & 7) static void skx_show_retry_rd_err_log(struct decoded_addr *res, - char *msg, int len) + char *msg, int len, + bool scrub_err) { u32 log0, log1, log2, log3, log4; u32 corr0, corr1, corr2, corr3; diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index 81c3e2ec6f56..05ffd39d6de9 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -494,6 +494,7 @@ static void skx_mce_output_error(struct mem_ctl_info *mci, bool ripv = GET_BITFIELD(m->mcgstatus, 0, 0); bool overflow = GET_BITFIELD(m->status, 62, 62); bool uncorrected_error = GET_BITFIELD(m->status, 61, 61); + bool scrub_err = false; bool recoverable; int len; u32 core_err_cnt = GET_BITFIELD(m->status, 38, 52); @@ -545,6 +546,7 @@ static void skx_mce_output_error(struct mem_ctl_info *mci, break; case 4: optype = "memory scrubbing error"; + scrub_err = true; break; default: optype = "reserved"; @@ -567,7 +569,7 @@ static void skx_mce_output_error(struct mem_ctl_info *mci, } if (skx_show_retry_rd_err_log) - skx_show_retry_rd_err_log(res, skx_msg + len, MSG_SIZE - len); + skx_show_retry_rd_err_log(res, skx_msg + len, MSG_SIZE - len, scrub_err); edac_dbg(0, "%s\n", skx_msg); diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index bf56bebff138..22e174c740be 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -66,6 +66,8 @@ struct skx_dev { struct skx_channel { struct pci_dev *cdev; struct pci_dev *edev; + u32 retry_rd_err_log_s; + u32 retry_rd_err_log_d; struct skx_dimm { u8 close_pg; u8 bank_xor_enable; @@ -123,12 +125,15 @@ struct res_config { /* Per DDR channel memory-mapped I/O size */ int ddr_chan_mmio_sz; bool support_ddr5; + /* Offsets of retry_rd_err_log registers */ + u32 *offsets_scrub; + u32 *offsets_demand; }; typedef int (*get_dimm_config_f)(struct mem_ctl_info *mci, struct res_config *cfg); typedef bool (*skx_decode_f)(struct decoded_addr *res); -typedef void (*skx_show_retry_log_f)(struct decoded_addr *res, char *msg, int len); +typedef void (*skx_show_retry_log_f)(struct decoded_addr *res, char *msg, int len, bool scrub_err); int __init skx_adxl_get(void); void __exit skx_adxl_put(void); -- Gitee From f82dbd0e34a668a0fc38aa46107fb0f2866eb1d9 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Fri, 11 Jun 2021 10:01:18 -0700 Subject: [PATCH 556/674] EDAC/skx_common: Add new ADXL components for 2-level memory mainline inclusion from mainline-v5.14-rc1 commit 2f4348e5a86198704368a699a7c4cdeb21d569f5 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1 CVE: NA Intel-SIG: commit 2f4348e5a861 EDAC/skx_common: Add new ADXL components for 2-level memory. Backport to add EDAC 2LM support. -------------------------------- Some Intel servers may configure memory in 2 levels, using fast "near" memory (e.g. DDR) as a cache for larger, slower, "far" memory (e.g. 3D X-point). In these configurations the BIOS ADXL address translation for an address in a 2-level memory range will provide details of both the "near" and far components. Current exported ADXL components are only for 1-level memory system or for 2nd level memory of 2-level memory system. So add new ADXL components for 1st level memory of 2-level memory system to fully support 2-level memory system and the detection of memory error source(1st level memory or 2nd level memory). Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Link: https://lore.kernel.org/r/20210611170123.1057025-2-tony.luck@intel.com Signed-off-by: Youquan Song --- drivers/edac/skx_common.c | 67 ++++++++++++++++++++++++++++++++------- drivers/edac/skx_common.h | 11 +++++++ 2 files changed, 67 insertions(+), 11 deletions(-) diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index 05ffd39d6de9..3d0b391a400d 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -23,10 +23,13 @@ #include "skx_common.h" static const char * const component_names[] = { - [INDEX_SOCKET] = "ProcessorSocketId", - [INDEX_MEMCTRL] = "MemoryControllerId", - [INDEX_CHANNEL] = "ChannelId", - [INDEX_DIMM] = "DimmSlotId", + [INDEX_SOCKET] = "ProcessorSocketId", + [INDEX_MEMCTRL] = "MemoryControllerId", + [INDEX_CHANNEL] = "ChannelId", + [INDEX_DIMM] = "DimmSlotId", + [INDEX_NM_MEMCTRL] = "NmMemoryControllerId", + [INDEX_NM_CHANNEL] = "NmChannelId", + [INDEX_NM_DIMM] = "NmDimmSlotId", }; static int component_indices[ARRAY_SIZE(component_names)]; @@ -34,12 +37,14 @@ static int adxl_component_count; static const char * const *adxl_component_names; static u64 *adxl_values; static char *adxl_msg; +static unsigned long adxl_nm_bitmap; static char skx_msg[MSG_SIZE]; static skx_decode_f skx_decode; static skx_show_retry_log_f skx_show_retry_rd_err_log; static u64 skx_tolm, skx_tohm; static LIST_HEAD(dev_edac_list); +static bool skx_mem_cfg_2lm; int __init skx_adxl_get(void) { @@ -56,14 +61,25 @@ int __init skx_adxl_get(void) for (j = 0; names[j]; j++) { if (!strcmp(component_names[i], names[j])) { component_indices[i] = j; + + if (i >= INDEX_NM_FIRST) + adxl_nm_bitmap |= 1 << i; + break; } } - if (!names[j]) + if (!names[j] && i < INDEX_NM_FIRST) goto err; } + if (skx_mem_cfg_2lm) { + if (!adxl_nm_bitmap) + skx_printk(KERN_NOTICE, "Not enough ADXL components for 2-level memory.\n"); + else + edac_dbg(2, "adxl_nm_bitmap: 0x%lx\n", adxl_nm_bitmap); + } + adxl_component_names = names; while (*names++) adxl_component_count++; @@ -99,7 +115,7 @@ void __exit skx_adxl_put(void) kfree(adxl_msg); } -static bool skx_adxl_decode(struct decoded_addr *res) +static bool skx_adxl_decode(struct decoded_addr *res, bool error_in_1st_level_mem) { struct skx_dev *d; int i, len = 0; @@ -116,11 +132,20 @@ static bool skx_adxl_decode(struct decoded_addr *res) } res->socket = (int)adxl_values[component_indices[INDEX_SOCKET]]; - res->imc = (int)adxl_values[component_indices[INDEX_MEMCTRL]]; - res->channel = (int)adxl_values[component_indices[INDEX_CHANNEL]]; - res->dimm = (int)adxl_values[component_indices[INDEX_DIMM]]; + if (error_in_1st_level_mem) { + res->imc = (adxl_nm_bitmap & BIT_NM_MEMCTRL) ? + (int)adxl_values[component_indices[INDEX_NM_MEMCTRL]] : -1; + res->channel = (adxl_nm_bitmap & BIT_NM_CHANNEL) ? + (int)adxl_values[component_indices[INDEX_NM_CHANNEL]] : -1; + res->dimm = (adxl_nm_bitmap & BIT_NM_DIMM) ? + (int)adxl_values[component_indices[INDEX_NM_DIMM]] : -1; + } else { + res->imc = (int)adxl_values[component_indices[INDEX_MEMCTRL]]; + res->channel = (int)adxl_values[component_indices[INDEX_CHANNEL]]; + res->dimm = (int)adxl_values[component_indices[INDEX_DIMM]]; + } - if (res->imc > NUM_IMC - 1) { + if (res->imc > NUM_IMC - 1 || res->imc < 0) { skx_printk(KERN_ERR, "Bad imc %d\n", res->imc); return false; } @@ -151,6 +176,11 @@ static bool skx_adxl_decode(struct decoded_addr *res) return true; } +void skx_set_mem_cfg(bool mem_cfg_2lm) +{ + skx_mem_cfg_2lm = mem_cfg_2lm; +} + void skx_set_decode(skx_decode_f decode, skx_show_retry_log_f show_retry_log) { skx_decode = decode; @@ -580,6 +610,21 @@ static void skx_mce_output_error(struct mem_ctl_info *mci, optype, skx_msg); } +static bool skx_error_in_1st_level_mem(const struct mce *m) +{ + u32 errcode; + + if (!skx_mem_cfg_2lm) + return false; + + errcode = GET_BITFIELD(m->status, 0, 15); + + if ((errcode & 0xef80) != 0x280) + return false; + + return true; +} + int skx_mce_check_error(struct notifier_block *nb, unsigned long val, void *data) { @@ -599,7 +644,7 @@ int skx_mce_check_error(struct notifier_block *nb, unsigned long val, res.addr = mce->addr; if (adxl_component_count) { - if (!skx_adxl_decode(&res)) + if (!skx_adxl_decode(&res, skx_error_in_1st_level_mem(mce))) return NOTIFY_DONE; } else if (!skx_decode || !skx_decode(&res)) { return NOTIFY_DONE; diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index 22e174c740be..de9f82977f0e 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -9,6 +9,8 @@ #ifndef _SKX_COMM_EDAC_H #define _SKX_COMM_EDAC_H +#include + #define MSG_SIZE 1024 /* @@ -94,9 +96,17 @@ enum { INDEX_MEMCTRL, INDEX_CHANNEL, INDEX_DIMM, + INDEX_NM_FIRST, + INDEX_NM_MEMCTRL = INDEX_NM_FIRST, + INDEX_NM_CHANNEL, + INDEX_NM_DIMM, INDEX_MAX }; +#define BIT_NM_MEMCTRL BIT_ULL(INDEX_NM_MEMCTRL) +#define BIT_NM_CHANNEL BIT_ULL(INDEX_NM_CHANNEL) +#define BIT_NM_DIMM BIT_ULL(INDEX_NM_DIMM) + struct decoded_addr { struct skx_dev *dev; u64 addr; @@ -138,6 +148,7 @@ typedef void (*skx_show_retry_log_f)(struct decoded_addr *res, char *msg, int le int __init skx_adxl_get(void); void __exit skx_adxl_put(void); void skx_set_decode(skx_decode_f decode, skx_show_retry_log_f show_retry_log); +void skx_set_mem_cfg(bool mem_cfg_2lm); int skx_get_src_id(struct skx_dev *d, int off, u8 *id); int skx_get_node_id(struct skx_dev *d, u8 *id); -- Gitee From 6b81ee370e7b72e93ff4d3bef67c02ed4a35632b Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Fri, 11 Jun 2021 10:01:19 -0700 Subject: [PATCH 557/674] EDAC/i10nm: Add detection of memory levels for ICX/SPR servers mainline inclusion from mainline-v5.14-rc1 commit 4bd4d32e9a38d7ffb091b4109ab63c8f601e5678 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1 CVE: NA Intel-SIG: commit 4bd4d32e9a38 EDAC/i10nm: Add detection of memory levels for ICX/SPR servers. Backport to add EDAC 2LM support. -------------------------------- Current i10nm_edac driver is only for system configured in 1-level memory. If the system is configured in 2-level memory, the driver doesn't report the 1st level memory DIMM for the error address, even if the error occurs in the 1st level memory. Both Ice Lake servers and Sapphire Rapids servers can be configured in 2-level memory. Add detection of memory levels to i10nm_edac for the two kinds of servers so that the driver can report the 2nd level memory DIMM or the 1st level memory DIMM according to error source. Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Link: https://lore.kernel.org/r/20210611170123.1057025-3-tony.luck@intel.com Signed-off-by: Youquan Song --- drivers/edac/i10nm_base.c | 39 +++++++++++++++++++++++++++++++++++++++ drivers/edac/skx_common.h | 3 +++ 2 files changed, 42 insertions(+) diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 7798de1e2aed..4e2a4d8396da 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -24,6 +24,8 @@ pci_read_config_dword((d)->uracu, 0xd0, &(reg)) #define I10NM_GET_IMC_BAR(d, i, reg) \ pci_read_config_dword((d)->uracu, 0xd8 + (i) * 4, &(reg)) +#define I10NM_GET_SAD(d, offset, i, reg)\ + pci_read_config_dword((d)->sad_all, (offset) + (i) * 8, &(reg)) #define I10NM_GET_DIMMMTR(m, i, j) \ readl((m)->mbase + 0x2080c + (i) * (m)->chan_mmio_sz + (j) * 4) #define I10NM_GET_MCDDRTCFG(m, i) \ @@ -50,6 +52,10 @@ #define RETRY_RD_ERR_LOG_NOOVER_UC (BIT(14) | BIT(1)) #define RETRY_RD_ERR_LOG_OVER_UC_V (BIT(2) | BIT(1) | BIT(0)) +#define I10NM_MAX_SAD 16 +#define I10NM_SAD_ENABLE(reg) GET_BITFIELD(reg, 0, 0) +#define I10NM_SAD_NM_CACHEABLE(reg) GET_BITFIELD(reg, 5, 5) + static struct list_head *i10nm_edac_list; static struct res_config *res_cfg; @@ -186,6 +192,31 @@ static struct pci_dev *pci_get_dev_wrapper(int dom, unsigned int bus, return pdev; } +static bool i10nm_check_2lm(struct res_config *cfg) +{ + struct skx_dev *d; + u32 reg; + int i; + + list_for_each_entry(d, i10nm_edac_list, list) { + d->sad_all = pci_get_dev_wrapper(d->seg, d->bus[1], + PCI_SLOT(cfg->sad_all_devfn), + PCI_FUNC(cfg->sad_all_devfn)); + if (!d->sad_all) + continue; + + for (i = 0; i < I10NM_MAX_SAD; i++) { + I10NM_GET_SAD(d, cfg->sad_all_offset, i, reg); + if (I10NM_SAD_ENABLE(reg) && I10NM_SAD_NM_CACHEABLE(reg)) { + edac_dbg(2, "2-level memory configuration.\n"); + return true; + } + } + } + + return false; +} + static int i10nm_get_all_munits(void) { struct pci_dev *mdev; @@ -255,6 +286,8 @@ static struct res_config i10nm_cfg0 = { .decs_did = 0x3452, .busno_cfg_offset = 0xcc, .ddr_chan_mmio_sz = 0x4000, + .sad_all_devfn = PCI_DEVFN(29, 0), + .sad_all_offset = 0x108, .offsets_scrub = offsets_scrub_icx, .offsets_demand = offsets_demand_icx, }; @@ -264,6 +297,8 @@ static struct res_config i10nm_cfg1 = { .decs_did = 0x3452, .busno_cfg_offset = 0xd0, .ddr_chan_mmio_sz = 0x4000, + .sad_all_devfn = PCI_DEVFN(29, 0), + .sad_all_offset = 0x108, .offsets_scrub = offsets_scrub_icx, .offsets_demand = offsets_demand_icx, }; @@ -274,6 +309,8 @@ static struct res_config spr_cfg = { .busno_cfg_offset = 0xd0, .ddr_chan_mmio_sz = 0x8000, .support_ddr5 = true, + .sad_all_devfn = PCI_DEVFN(10, 0), + .sad_all_offset = 0x300, .offsets_scrub = offsets_scrub_spr, .offsets_demand = offsets_demand_spr, }; @@ -429,6 +466,8 @@ static int __init i10nm_init(void) return -ENODEV; } + skx_set_mem_cfg(i10nm_check_2lm(cfg)); + rc = i10nm_get_all_munits(); if (rc < 0) goto fail; diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index de9f82977f0e..1fb7540a7092 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -135,6 +135,9 @@ struct res_config { /* Per DDR channel memory-mapped I/O size */ int ddr_chan_mmio_sz; bool support_ddr5; + /* SAD device number and function number */ + unsigned int sad_all_devfn; + int sad_all_offset; /* Offsets of retry_rd_err_log registers */ u32 *offsets_scrub; u32 *offsets_demand; -- Gitee From 45875988308087e7fae3b15f217ae376765b126b Mon Sep 17 00:00:00 2001 From: root Date: Wed, 13 Jul 2022 15:28:12 +0800 Subject: [PATCH 558/674] EDAC/i10nm: Add support for high bandwidth memory mainline inclusion from mainline-v5.14-rc1 commit c945088384d00e6eb61535cc4ba25bc062090909 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1 CVE: NA Intel-SIG: commit c945088384d0 EDAC/i10nm: Add support for high bandwidth memory. Backport to add EDAC HBM support. -------------------------------- A future Xeon processor will include in-package HBM (high bandwidth memory). The in-package HBM memory controller shares the same architecture with the regular DDR memory controller. Add the HBM memory controller devices for EDAC support. Tested-by: Hongyu Ning Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Link: https://lore.kernel.org/r/20210611170123.1057025-4-tony.luck@intel.com Signed-off-by: Youquan Song --- drivers/edac/i10nm_base.c | 132 ++++++++++++++++++++++++++++++++++---- drivers/edac/skx_common.c | 15 +++-- drivers/edac/skx_common.h | 20 +++++- 3 files changed, 148 insertions(+), 19 deletions(-) diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 4e2a4d8396da..47d6d7282c58 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -13,7 +13,7 @@ #include "edac_module.h" #include "skx_common.h" -#define I10NM_REVISION "v0.0.4" +#define I10NM_REVISION "v0.0.5" #define EDAC_MOD_STR "i10nm_edac" /* Debug macros */ @@ -26,14 +26,22 @@ pci_read_config_dword((d)->uracu, 0xd8 + (i) * 4, &(reg)) #define I10NM_GET_SAD(d, offset, i, reg)\ pci_read_config_dword((d)->sad_all, (offset) + (i) * 8, &(reg)) +#define I10NM_GET_HBM_IMC_BAR(d, reg) \ + pci_read_config_dword((d)->uracu, 0xd4, &(reg)) +#define I10NM_GET_CAPID3_CFG(d, reg) \ + pci_read_config_dword((d)->pcu_cr3, 0x90, &(reg)) #define I10NM_GET_DIMMMTR(m, i, j) \ - readl((m)->mbase + 0x2080c + (i) * (m)->chan_mmio_sz + (j) * 4) + readl((m)->mbase + ((m)->hbm_mc ? 0x80c : 0x2080c) + \ + (i) * (m)->chan_mmio_sz + (j) * 4) #define I10NM_GET_MCDDRTCFG(m, i) \ - readl((m)->mbase + 0x20970 + (i) * (m)->chan_mmio_sz) + readl((m)->mbase + ((m)->hbm_mc ? 0x970 : 0x20970) + \ + (i) * (m)->chan_mmio_sz) #define I10NM_GET_MCMTR(m, i) \ - readl((m)->mbase + 0x20ef8 + (i) * (m)->chan_mmio_sz) + readl((m)->mbase + ((m)->hbm_mc ? 0xef8 : 0x20ef8) + \ + (i) * (m)->chan_mmio_sz) #define I10NM_GET_AMAP(m, i) \ - readl((m)->mbase + 0x20814 + (i) * (m)->chan_mmio_sz) + readl((m)->mbase + ((m)->hbm_mc ? 0x814 : 0x20814) + \ + (i) * (m)->chan_mmio_sz) #define I10NM_GET_REG32(m, i, offset) \ readl((m)->mbase + (i) * (m)->chan_mmio_sz + (offset)) #define I10NM_GET_REG64(m, i, offset) \ @@ -45,6 +53,12 @@ #define I10NM_GET_IMC_MMIO_OFFSET(reg) (GET_BITFIELD(reg, 0, 10) << 12) #define I10NM_GET_IMC_MMIO_SIZE(reg) ((GET_BITFIELD(reg, 13, 23) - \ GET_BITFIELD(reg, 0, 10) + 1) << 12) +#define I10NM_GET_HBM_IMC_MMIO_OFFSET(reg) \ + ((GET_BITFIELD(reg, 0, 10) << 12) + 0x140000) + +#define I10NM_HBM_IMC_MMIO_SIZE 0x9000 +#define I10NM_IS_HBM_PRESENT(reg) GET_BITFIELD(reg, 27, 30) +#define I10NM_IS_HBM_IMC(reg) GET_BITFIELD(reg, 29, 29) #define RETRY_RD_ERR_LOG_UC BIT(1) #define RETRY_RD_ERR_LOG_NOOVER BIT(14) @@ -217,7 +231,7 @@ static bool i10nm_check_2lm(struct res_config *cfg) return false; } -static int i10nm_get_all_munits(void) +static int i10nm_get_ddr_munits(void) { struct pci_dev *mdev; void __iomem *mbase; @@ -245,7 +259,7 @@ static int i10nm_get_all_munits(void) edac_dbg(2, "socket%d mmio base 0x%llx (reg 0x%x)\n", j++, base, reg); - for (i = 0; i < I10NM_NUM_IMC; i++) { + for (i = 0; i < I10NM_NUM_DDR_IMC; i++) { mdev = pci_get_dev_wrapper(d->seg, d->bus[0], 12 + i, 0); if (i == 0 && !mdev) { @@ -281,6 +295,90 @@ static int i10nm_get_all_munits(void) return 0; } +static bool i10nm_check_hbm_imc(struct skx_dev *d) +{ + u32 reg; + + if (I10NM_GET_CAPID3_CFG(d, reg)) { + i10nm_printk(KERN_ERR, "Failed to get capid3_cfg\n"); + return false; + } + + return I10NM_IS_HBM_PRESENT(reg) != 0; +} + +static int i10nm_get_hbm_munits(void) +{ + struct pci_dev *mdev; + void __iomem *mbase; + u32 reg, off, mcmtr; + struct skx_dev *d; + int i, lmc; + u64 base; + + list_for_each_entry(d, i10nm_edac_list, list) { + d->pcu_cr3 = pci_get_dev_wrapper(d->seg, d->bus[1], 30, 3); + if (!d->pcu_cr3) + return -ENODEV; + + if (!i10nm_check_hbm_imc(d)) { + i10nm_printk(KERN_DEBUG, "No hbm memory\n"); + return -ENODEV; + } + + if (I10NM_GET_SCK_BAR(d, reg)) { + i10nm_printk(KERN_ERR, "Failed to get socket bar\n"); + return -ENODEV; + } + base = I10NM_GET_SCK_MMIO_BASE(reg); + + if (I10NM_GET_HBM_IMC_BAR(d, reg)) { + i10nm_printk(KERN_ERR, "Failed to get hbm mc bar\n"); + return -ENODEV; + } + base += I10NM_GET_HBM_IMC_MMIO_OFFSET(reg); + + lmc = I10NM_NUM_DDR_IMC; + + for (i = 0; i < I10NM_NUM_HBM_IMC; i++) { + mdev = pci_get_dev_wrapper(d->seg, d->bus[0], + 12 + i / 4, 1 + i % 4); + if (i == 0 && !mdev) { + i10nm_printk(KERN_ERR, "No hbm mc found\n"); + return -ENODEV; + } + if (!mdev) + continue; + + d->imc[lmc].mdev = mdev; + off = i * I10NM_HBM_IMC_MMIO_SIZE; + + edac_dbg(2, "hbm mc%d mmio base 0x%llx size 0x%x\n", + lmc, base + off, I10NM_HBM_IMC_MMIO_SIZE); + + mbase = ioremap(base + off, I10NM_HBM_IMC_MMIO_SIZE); + if (!mbase) { + i10nm_printk(KERN_ERR, "Failed to ioremap for hbm mc 0x%llx\n", + base + off); + return -ENOMEM; + } + + d->imc[lmc].mbase = mbase; + d->imc[lmc].hbm_mc = true; + + mcmtr = I10NM_GET_MCMTR(&d->imc[lmc], 0); + if (!I10NM_IS_HBM_IMC(mcmtr)) { + i10nm_printk(KERN_ERR, "This isn't an hbm mc!\n"); + return -ENODEV; + } + + lmc++; + } + } + + return 0; +} + static struct res_config i10nm_cfg0 = { .type = I10NM, .decs_did = 0x3452, @@ -308,6 +406,7 @@ static struct res_config spr_cfg = { .decs_did = 0x3252, .busno_cfg_offset = 0xd0, .ddr_chan_mmio_sz = 0x8000, + .hbm_chan_mmio_sz = 0x4000, .support_ddr5 = true, .sad_all_devfn = PCI_DEVFN(10, 0), .sad_all_offset = 0x300, @@ -345,14 +444,14 @@ static int i10nm_get_dimm_config(struct mem_ctl_info *mci, struct dimm_info *dimm; int i, j, ndimms; - for (i = 0; i < I10NM_NUM_CHANNELS; i++) { + for (i = 0; i < imc->num_channels; i++) { if (!imc->mbase) continue; ndimms = 0; mcddrtcfg = I10NM_GET_MCDDRTCFG(imc, i); amap = I10NM_GET_AMAP(imc, i); - for (j = 0; j < I10NM_NUM_DIMMS; j++) { + for (j = 0; j < imc->num_dimms; j++) { dimm = edac_get_dimm(mci, i, j, 0); mtr = I10NM_GET_DIMMMTR(imc, i, j); edac_dbg(1, "dimmmtr 0x%x mcddrtcfg 0x%x (mc%d ch%d dimm%d)\n", @@ -468,8 +567,9 @@ static int __init i10nm_init(void) skx_set_mem_cfg(i10nm_check_2lm(cfg)); - rc = i10nm_get_all_munits(); - if (rc < 0) + rc = i10nm_get_ddr_munits(); + + if (i10nm_get_hbm_munits() && rc) goto fail; list_for_each_entry(d, i10nm_edac_list, list) { @@ -490,7 +590,15 @@ static int __init i10nm_init(void) d->imc[i].lmc = i; d->imc[i].src_id = src_id; d->imc[i].node_id = node_id; - d->imc[i].chan_mmio_sz = cfg->ddr_chan_mmio_sz; + if (d->imc[i].hbm_mc) { + d->imc[i].chan_mmio_sz = cfg->hbm_chan_mmio_sz; + d->imc[i].num_channels = I10NM_NUM_HBM_CHANNELS; + d->imc[i].num_dimms = I10NM_NUM_HBM_DIMMS; + } else { + d->imc[i].chan_mmio_sz = cfg->ddr_chan_mmio_sz; + d->imc[i].num_channels = I10NM_NUM_DDR_CHANNELS; + d->imc[i].num_dimms = I10NM_NUM_DDR_DIMMS; + } rc = skx_register_mci(&d->imc[i], d->imc[i].mdev, "Intel_10nm Socket", EDAC_MOD_STR, diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index 3d0b391a400d..0c133d32b777 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -343,9 +343,9 @@ int skx_get_dimm_info(u32 mtr, u32 mcmtr, u32 amap, struct dimm_info *dimm, ranks = numrank(mtr); rows = numrow(mtr); - cols = numcol(mtr); + cols = imc->hbm_mc ? 6 : numcol(mtr); - if (cfg->support_ddr5 && (amap & 0x8)) { + if (cfg->support_ddr5 && ((amap & 0x8) || imc->hbm_mc)) { banks = 32; mtype = MEM_DDR5; } else { @@ -374,8 +374,13 @@ int skx_get_dimm_info(u32 mtr, u32 mcmtr, u32 amap, struct dimm_info *dimm, dimm->dtype = get_width(mtr); dimm->mtype = mtype; dimm->edac_mode = EDAC_SECDED; /* likely better than this */ - snprintf(dimm->label, sizeof(dimm->label), "CPU_SrcID#%u_MC#%u_Chan#%u_DIMM#%u", - imc->src_id, imc->lmc, chan, dimmno); + + if (imc->hbm_mc) + snprintf(dimm->label, sizeof(dimm->label), "CPU_SrcID#%u_HBMC#%u_Chan#%u", + imc->src_id, imc->lmc, chan); + else + snprintf(dimm->label, sizeof(dimm->label), "CPU_SrcID#%u_MC#%u_Chan#%u_DIMM#%u", + imc->src_id, imc->lmc, chan, dimmno); return 1; } @@ -705,6 +710,8 @@ void skx_remove(void) } if (d->util_all) pci_dev_put(d->util_all); + if (d->pcu_cr3) + pci_dev_put(d->pcu_cr3); if (d->sad_all) pci_dev_put(d->sad_all); if (d->uracu) diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index 1fb7540a7092..03ac067a80b9 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -32,9 +32,17 @@ #define SKX_NUM_CHANNELS 3 /* Channels per memory controller */ #define SKX_NUM_DIMMS 2 /* Max DIMMS per channel */ -#define I10NM_NUM_IMC 4 -#define I10NM_NUM_CHANNELS 2 -#define I10NM_NUM_DIMMS 2 +#define I10NM_NUM_DDR_IMC 4 +#define I10NM_NUM_DDR_CHANNELS 2 +#define I10NM_NUM_DDR_DIMMS 2 + +#define I10NM_NUM_HBM_IMC 16 +#define I10NM_NUM_HBM_CHANNELS 2 +#define I10NM_NUM_HBM_DIMMS 1 + +#define I10NM_NUM_IMC (I10NM_NUM_DDR_IMC + I10NM_NUM_HBM_IMC) +#define I10NM_NUM_CHANNELS MAX(I10NM_NUM_DDR_CHANNELS, I10NM_NUM_HBM_CHANNELS) +#define I10NM_NUM_DIMMS MAX(I10NM_NUM_DDR_DIMMS, I10NM_NUM_HBM_DIMMS) #define MAX(a, b) ((a) > (b) ? (a) : (b)) #define NUM_IMC MAX(SKX_NUM_IMC, I10NM_NUM_IMC) @@ -56,12 +64,16 @@ struct skx_dev { struct pci_dev *sad_all; struct pci_dev *util_all; struct pci_dev *uracu; /* for i10nm CPU */ + struct pci_dev *pcu_cr3; /* for HBM memory detection */ u32 mcroute; struct skx_imc { struct mem_ctl_info *mci; struct pci_dev *mdev; /* for i10nm CPU */ void __iomem *mbase; /* for i10nm CPU */ int chan_mmio_sz; /* for i10nm CPU */ + int num_channels; /* channels per memory controller */ + int num_dimms; /* dimms per channel */ + bool hbm_mc; u8 mc; /* system wide mc# */ u8 lmc; /* socket relative mc# */ u8 src_id, node_id; @@ -134,6 +146,8 @@ struct res_config { int busno_cfg_offset; /* Per DDR channel memory-mapped I/O size */ int ddr_chan_mmio_sz; + /* Per HBM channel memory-mapped I/O size */ + int hbm_chan_mmio_sz; bool support_ddr5; /* SAD device number and function number */ unsigned int sad_all_devfn; -- Gitee From b0adb31550986f669e45c7f42f13876451a91d0e Mon Sep 17 00:00:00 2001 From: Naveen Krishna Chatradhi Date: Wed, 30 Jun 2021 20:58:24 +0530 Subject: [PATCH 559/674] EDAC/mc: Add new HBM2 memory type mainline inclusion from mainline-v5.13 commit e1ca90b7cc5cb5d3a38321cbb65ad36a59fcb574 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1 CVE: NA Intel-SIG: commit e1ca90b7cc5c EDAC/mc: Add new HBM2 memory type. Backport to add EDAC HBM support. -------------------------------- Add a new entry to 'enum mem_type' and a new string to 'edac_mem_types[]' for HBM2 (High Bandwidth Memory Gen 2) new memory type. Reviewed-by: Yazen Ghannam Signed-off-by: Muralidhara M K Signed-off-by: Naveen Krishna Chatradhi Signed-off-by: Tony Luck Link: https://lore.kernel.org/r/20210630152828.162659-4-nchatrad@amd.com Signed-off-by: Youquan Song --- drivers/edac/edac_mc.c | 1 + include/linux/edac.h | 3 +++ 2 files changed, 4 insertions(+) diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index 34514c638f19..bf4297075c22 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -163,6 +163,7 @@ const char * const edac_mem_types[] = { [MEM_LRDDR4] = "Load-Reduced-DDR4-RAM", [MEM_DDR5] = "Unbuffered-DDR5", [MEM_NVDIMM] = "Non-volatile-RAM", + [MEM_HBM2] = "High-bandwidth-memory-Gen2", }; EXPORT_SYMBOL_GPL(edac_mem_types); diff --git a/include/linux/edac.h b/include/linux/edac.h index 6c4565cc6273..490134495008 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -181,6 +181,7 @@ static inline char *mc_event_error_type(const unsigned int err_type) * @MEM_LRDDR4: Load-Reduced DDR4 memory. * @MEM_DDR5: Unbuffered DDR5 RAM * @MEM_NVDIMM: Non-volatile RAM + * @MEM_HBM2: High bandwidth Memory Gen 2. */ enum mem_type { MEM_EMPTY = 0, @@ -206,6 +207,7 @@ enum mem_type { MEM_LRDDR4, MEM_DDR5, MEM_NVDIMM, + MEM_HBM2, }; #define MEM_FLAG_EMPTY BIT(MEM_EMPTY) @@ -230,6 +232,7 @@ enum mem_type { #define MEM_FLAG_LRDDR4 BIT(MEM_LRDDR4) #define MEM_FLAG_DDR5 BIT(MEM_DDR5) #define MEM_FLAG_NVDIMM BIT(MEM_NVDIMM) +#define MEM_FLAG_HBM2 BIT(MEM_HBM2) /** * enum edac-type - Error Detection and Correction capabilities and mode -- Gitee From 4a111addf4c90fc79db41b634b5b5b8b788d0f09 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Tue, 20 Jul 2021 09:30:09 -0700 Subject: [PATCH 560/674] EDAC/skx_common: Set the memory type correctly for HBM memory mainline inclusion from mainline-v5.15-rc1 commit fd07a4a0d30b5468a1f4a0739e34f5f014df7d44 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1 CVE: NA Intel-SIG: commit fd07a4a0d30b EDAC/skx_common: Set the memory type correctly for HBM memory. Backport to add EDAC HBM support. -------------------------------- Set the memory type to MEM_HBM2 if it's managed by the HBM2 memory controller. Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Link: https://lore.kernel.org/r/20210720163009.GA1417532@agluck-desk2.amr.corp.intel.com Signed-off-by: Youquan Song --- drivers/edac/skx_common.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index 0c133d32b777..19c17c5198c5 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -345,7 +345,10 @@ int skx_get_dimm_info(u32 mtr, u32 mcmtr, u32 amap, struct dimm_info *dimm, rows = numrow(mtr); cols = imc->hbm_mc ? 6 : numcol(mtr); - if (cfg->support_ddr5 && ((amap & 0x8) || imc->hbm_mc)) { + if (imc->hbm_mc) { + banks = 32; + mtype = MEM_HBM2; + } else if (cfg->support_ddr5 && (amap & 0x8)) { banks = 32; mtype = MEM_DDR5; } else { -- Gitee From 028e1264e604bafb90f67c5533b2ad86b0cc8ea1 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Fri, 24 Dec 2021 04:11:26 -0500 Subject: [PATCH 561/674] EDAC/i10nm: Release mdev/mbase when failing to detect HBM mainline inclusion from mainline-v5.16 commit c370baa328022cbd46c59c821d1b467a97f047be category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1 CVE: NA Intel-SIG: commit c370baa32802 EDAC/i10nm: Release mdev/mbase when failing to detect HBM. Backport to add EDAC HBM support. -------------------------------- On systems without HBM (High Bandwidth Memory) mdev/mbase are not released/unmapped. Add the code to release mdev/mbase when failing to detect HBM. [Tony: re-word commit message] Cc: Fixes: c945088384d0 ("EDAC/i10nm: Add support for high bandwidth memory") Reported-by: kernel test robot Reported-by: Dan Carpenter Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Link: https://lore.kernel.org/r/20211224091126.1246-1-qiuxu.zhuo@intel.com Signed-off-by: Youquan Song --- drivers/edac/i10nm_base.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 47d6d7282c58..d63ddc9c994d 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -358,6 +358,9 @@ static int i10nm_get_hbm_munits(void) mbase = ioremap(base + off, I10NM_HBM_IMC_MMIO_SIZE); if (!mbase) { + pci_dev_put(d->imc[lmc].mdev); + d->imc[lmc].mdev = NULL; + i10nm_printk(KERN_ERR, "Failed to ioremap for hbm mc 0x%llx\n", base + off); return -ENOMEM; @@ -368,6 +371,12 @@ static int i10nm_get_hbm_munits(void) mcmtr = I10NM_GET_MCMTR(&d->imc[lmc], 0); if (!I10NM_IS_HBM_IMC(mcmtr)) { + iounmap(d->imc[lmc].mbase); + d->imc[lmc].mbase = NULL; + d->imc[lmc].hbm_mc = false; + pci_dev_put(d->imc[lmc].mdev); + d->imc[lmc].mdev = NULL; + i10nm_printk(KERN_ERR, "This isn't an hbm mc!\n"); return -ENODEV; } -- Gitee From 910f3dd810c9bab8332d53f2c32c55f5fe65a64c Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 26 Jul 2022 17:38:14 +0800 Subject: [PATCH 562/674] etherdevice: Adjust ether_addr* prototypes to silence -Wstringop-overead MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from stable-v5.10.113 commit 08ad7a770efacfecf903143f8de88d1e351a1f2d category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=08ad7a770efacfecf903143f8de88d1e351a1f2d -------------------------------- commit 2618a0dae09ef37728dab89ff60418cbe25ae6bd upstream. With GCC 12, -Wstringop-overread was warning about an implicit cast from char[6] to char[8]. However, the extra 2 bytes are always thrown away, alignment doesn't matter, and the risk of hitting the edge of unallocated memory has been accepted, so this prototype can just be converted to a regular char *. Silences: net/core/dev.c: In function ‘bpf_prog_run_generic_xdp’: net/core/dev.c:4618:21: warning: ‘ether_addr_equal_64bits’ reading 8 bytes from a region of size 6 [-Wstringop-overread] 4618 | orig_host = ether_addr_equal_64bits(eth->h_dest, > skb->dev->dev_addr); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ net/core/dev.c:4618:21: note: referencing argument 1 of type ‘const u8[8]’ {aka ‘const unsigned char[8]’} net/core/dev.c:4618:21: note: referencing argument 2 of type ‘const u8[8]’ {aka ‘const unsigned char[8]’} In file included from net/core/dev.c:91: include/linux/etherdevice.h:375:20: note: in a call to function ‘ether_addr_equal_64bits’ 375 | static inline bool ether_addr_equal_64bits(const u8 addr1[6+2], | ^~~~~~~~~~~~~~~~~~~~~~~ Reported-by: Marc Kleine-Budde Tested-by: Marc Kleine-Budde Link: https://lore.kernel.org/netdev/20220212090811.uuzk6d76agw2vv73@pengutronix.de Cc: Jakub Kicinski Cc: "David S. Miller" Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: David S. Miller Cc: Khem Raj Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- include/linux/etherdevice.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 2e5debc0373c..99209f50915f 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -127,7 +127,7 @@ static inline bool is_multicast_ether_addr(const u8 *addr) #endif } -static inline bool is_multicast_ether_addr_64bits(const u8 addr[6+2]) +static inline bool is_multicast_ether_addr_64bits(const u8 *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 #ifdef __BIG_ENDIAN @@ -352,8 +352,7 @@ static inline bool ether_addr_equal(const u8 *addr1, const u8 *addr2) * Please note that alignment of addr1 & addr2 are only guaranteed to be 16 bits. */ -static inline bool ether_addr_equal_64bits(const u8 addr1[6+2], - const u8 addr2[6+2]) +static inline bool ether_addr_equal_64bits(const u8 *addr1, const u8 *addr2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 u64 fold = (*(const u64 *)addr1) ^ (*(const u64 *)addr2); -- Gitee From 0ccd441a7641338e1220f8a244da9159b653ff5b Mon Sep 17 00:00:00 2001 From: Xiongwei Song Date: Tue, 26 Jul 2022 17:38:15 +0800 Subject: [PATCH 563/674] mm: page_alloc: fix building error on -Werror=array-compare stable inclusion from stable-v5.10.113 commit 69848f9488bc1088bcd9a73987dff8d2cb47a060 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=69848f9488bc1088bcd9a73987dff8d2cb47a060 -------------------------------- commit ca831f29f8f25c97182e726429b38c0802200c8f upstream. Arthur Marsh reported we would hit the error below when building kernel with gcc-12: CC mm/page_alloc.o mm/page_alloc.c: In function `mem_init_print_info': mm/page_alloc.c:8173:27: error: comparison between two arrays [-Werror=array-compare] 8173 | if (start <= pos && pos < end && size > adj) \ | In C++20, the comparision between arrays should be warned. Link: https://lkml.kernel.org/r/20211125130928.32465-1-sxwjean@me.com Signed-off-by: Xiongwei Song Reported-by: Arthur Marsh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Cc: Khem Raj Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ec73cca1726c..cf9c69d631f3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7974,7 +7974,7 @@ void __init mem_init_print_info(const char *str) */ #define adj_init_size(start, end, size, pos, adj) \ do { \ - if (start <= pos && pos < end && size > adj) \ + if (&start[0] <= &pos[0] && &pos[0] < &end[0] && size > adj) \ size -= adj; \ } while (0) -- Gitee From d0005c0a490428fb4f12a085d77c1bb34bf1dbf5 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Tue, 26 Jul 2022 17:38:16 +0800 Subject: [PATCH 564/674] tracing: Dump stacktrace trigger to the corresponding instance mainline inclusion from mainline-v5.17-rc6 commit ce33c845b030c9cf768370c951bc699470b09fa7 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ce33c845b030c9cf768370c951bc699470b09fa7 -------------------------------- The stacktrace event trigger is not dumping the stacktrace to the instance where it was enabled, but to the global "instance." Use the private_data, pointing to the trigger file, to figure out the corresponding trace instance, and use it in the trigger action, like snapshot_trigger does. Link: https://lkml.kernel.org/r/afbb0b4f18ba92c276865bc97204d438473f4ebc.1645396236.git.bristot@kernel.org Cc: stable@vger.kernel.org Fixes: ae63b31e4d0e2 ("tracing: Separate out trace events from global variables") Reviewed-by: Tom Zanussi Tested-by: Tom Zanussi Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) [zzk: __trace_stack() was changed due to the merge of "36590c50b2d0 tracing: Merge irqflags + preempt counter", cherry-pick from mainline ce33c845b030c9 instead of stable 5.10.y] Signed-off-by: Zheng Zengkai Reviewed-by: Yang Jihong Acked-by: Xie XiuQi --- kernel/trace/trace_events_trigger.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index d0309de2f84f..3c6229f16e81 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -1219,7 +1219,12 @@ static void stacktrace_trigger(struct event_trigger_data *data, void *rec, struct ring_buffer_event *event) { - trace_dump_stack(STACK_SKIP); + struct trace_event_file *file = data->private_data; + + if (file) + __trace_stack(file->tr, tracing_gen_ctx(), STACK_SKIP); + else + trace_dump_stack(STACK_SKIP); } static void -- Gitee From 32af01826a81a3f51b3e13d211351569f762f6c2 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 26 Jul 2022 17:38:17 +0800 Subject: [PATCH 565/674] perf tools: Fix segfault accessing sample_id xyarray stable inclusion from stable-v5.10.113 commit 378061c9b886994fa045186390d61a5e7c696ae3 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=378061c9b886994fa045186390d61a5e7c696ae3 -------------------------------- commit a668cc07f990d2ed19424d5c1a529521a9d1cee1 upstream. perf_evsel::sample_id is an xyarray which can cause a segfault when accessed beyond its size. e.g. # perf record -e intel_pt// -C 1 sleep 1 Segmentation fault (core dumped) # That is happening because a dummy event is opened to capture text poke events accross all CPUs, however the mmap logic is allocating according to the number of user_requested_cpus. In general, perf sometimes uses the evsel cpus to open events, and sometimes the evlist user_requested_cpus. However, it is not necessary to determine which case is which because the opened event file descriptors are also in an xyarray, the size of whch can be used to correctly allocate the size of the sample_id xyarray, because there is one ID per file descriptor. Note, in the affected code path, perf_evsel fd array is subsequently used to get the file descriptor for the mmap, so it makes sense for the xyarrays to be the same size there. Fixes: d1a177595b3a824c ("libperf: Adopt perf_evlist__mmap()/munmap() from tools/perf") Fixes: 246eba8e9041c477 ("perf tools: Add support for PERF_RECORD_TEXT_POKE") Signed-off-by: Adrian Hunter Acked-by: Ian Rogers Cc: Adrian Hunter Cc: Jiri Olsa Cc: stable@vger.kernel.org # 5.5+ Link: https://lore.kernel.org/r/20220413114232.26914-1-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- tools/lib/perf/evlist.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/lib/perf/evlist.c b/tools/lib/perf/evlist.c index 17465d454a0e..f76b1a9d5a6e 100644 --- a/tools/lib/perf/evlist.c +++ b/tools/lib/perf/evlist.c @@ -571,7 +571,6 @@ int perf_evlist__mmap_ops(struct perf_evlist *evlist, { struct perf_evsel *evsel; const struct perf_cpu_map *cpus = evlist->cpus; - const struct perf_thread_map *threads = evlist->threads; if (!ops || !ops->get || !ops->mmap) return -EINVAL; @@ -583,7 +582,7 @@ int perf_evlist__mmap_ops(struct perf_evlist *evlist, perf_evlist__for_each_entry(evlist, evsel) { if ((evsel->attr.read_format & PERF_FORMAT_ID) && evsel->sample_id == NULL && - perf_evsel__alloc_id(evsel, perf_cpu_map__nr(cpus), threads->nr) < 0) + perf_evsel__alloc_id(evsel, evsel->fd->max_x, evsel->fd->max_y) < 0) return -ENOMEM; } -- Gitee From 9f91d21694fddc986c0beb30b2680d69457e66ec Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Tue, 26 Jul 2022 17:38:18 +0800 Subject: [PATCH 566/674] gfs2: assign rgrp glock before compute_bitstructs stable inclusion from stable-v5.10.113 commit 04dd45d9776eb4802df8b0a0a6fcdf27842fbdd1 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=04dd45d9776eb4802df8b0a0a6fcdf27842fbdd1 -------------------------------- commit 428f651cb80b227af47fc302e4931791f2fb4741 upstream. Before this patch, function read_rindex_entry called compute_bitstructs before it allocated a glock for the rgrp. But if compute_bitstructs found a problem with the rgrp, it called gfs2_consist_rgrpd, and that called gfs2_dump_glock for rgd->rd_gl which had not yet been assigned. read_rindex_entry compute_bitstructs gfs2_consist_rgrpd gfs2_dump_glock <---------rgd->rd_gl was not set. This patch changes read_rindex_entry so it assigns an rgrp glock before calling compute_bitstructs so gfs2_dump_glock does not reference an unassigned pointer. If an error is discovered, the glock must also be put, so a new goto and label were added. Reported-by: syzbot+c6fd14145e2f62ca0784@syzkaller.appspotmail.com Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- fs/gfs2/rgrp.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index dc55b029afaa..c5bde789a16d 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -906,15 +906,15 @@ static int read_rindex_entry(struct gfs2_inode *ip) rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes); spin_lock_init(&rgd->rd_rsspin); - error = compute_bitstructs(rgd); - if (error) - goto fail; - error = gfs2_glock_get(sdp, rgd->rd_addr, &gfs2_rgrp_glops, CREATE, &rgd->rd_gl); if (error) goto fail; + error = compute_bitstructs(rgd); + if (error) + goto fail_glock; + rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr; rgd->rd_flags &= ~(GFS2_RDF_UPTODATE | GFS2_RDF_PREFERRED); if (rgd->rd_data > sdp->sd_max_rg_data) @@ -928,6 +928,7 @@ static int read_rindex_entry(struct gfs2_inode *ip) } error = 0; /* someone else read in the rgrp; free it and ignore it */ +fail_glock: gfs2_glock_put(rgd->rd_gl); fail: -- Gitee From 302f6418e23bbcecc28eafb2b9b50b3fd5de7806 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 26 Jul 2022 17:38:19 +0800 Subject: [PATCH 567/674] ALSA: usb-audio: Clear MIDI port active flag after draining stable inclusion from stable-v5.10.113 commit 8ce3820fc9d496100fb39008a952eebfb695b060 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=8ce3820fc9d496100fb39008a952eebfb695b060 -------------------------------- commit 0665886ad1392e6b5bae85d7a6ccbed48dca1522 upstream. When a rawmidi output stream is closed, it calls the drain at first, then does trigger-off only when the drain returns -ERESTARTSYS as a fallback. It implies that each driver should turn off the stream properly after the drain. Meanwhile, USB-audio MIDI interface didn't change the port->active flag after the drain. This may leave the output work picking up the port that is closed right now, which eventually leads to a use-after-free for the already released rawmidi object. This patch fixes the bug by properly clearing the port->active flag after the output drain. Reported-by: syzbot+70e777a39907d6d5fd0a@syzkaller.appspotmail.com Cc: Link: https://lore.kernel.org/r/00000000000011555605dceaff03@google.com Link: https://lore.kernel.org/r/20220420130247.22062-1-tiwai@suse.de Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- sound/usb/midi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/usb/midi.c b/sound/usb/midi.c index fa91290ad89d..84676a8fb60d 100644 --- a/sound/usb/midi.c +++ b/sound/usb/midi.c @@ -1210,6 +1210,7 @@ static void snd_usbmidi_output_drain(struct snd_rawmidi_substream *substream) } while (drain_urbs && timeout); finish_wait(&ep->drain_wait, &wait); } + port->active = 0; spin_unlock_irq(&ep->buffer_lock); } -- Gitee From 6942c4a9bb82dd73e524481428c69ffad9c9a445 Mon Sep 17 00:00:00 2001 From: Tim Crawford Date: Tue, 26 Jul 2022 17:38:20 +0800 Subject: [PATCH 568/674] ALSA: hda/realtek: Add quirk for Clevo NP70PNP stable inclusion from stable-v5.10.113 commit cf9b19546494b2e5df87486752946ae71c6775db category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=cf9b19546494b2e5df87486752946ae71c6775db -------------------------------- commit 86222af07abf1f5f07a5873cc399c29ab8a9b8b8 upstream. Fixes headset detection on Clevo NP70PNP. Signed-off-by: Tim Crawford Cc: Link: https://lore.kernel.org/r/20220421170412.3697-1-tcrawford@system76.com Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 11d653190e6e..b5168959fcf6 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -8897,6 +8897,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1558, 0x8562, "Clevo NH[5|7][0-9]RZ[Q]", ALC269_FIXUP_DMIC), SND_PCI_QUIRK(0x1558, 0x8668, "Clevo NP50B[BE]", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x866d, "Clevo NP5[05]PN[HJK]", ALC256_FIXUP_SYSTEM76_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0x1558, 0x867c, "Clevo NP7[01]PNP", ALC256_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x867d, "Clevo NP7[01]PN[HJK]", ALC256_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x8680, "Clevo NJ50LU", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x8686, "Clevo NH50[CZ]U", ALC256_FIXUP_MIC_NO_PRESENCE_AND_RESUME), -- Gitee From 83ab10a7a9ff529b76e8517e74380568f7b81797 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 26 Jul 2022 17:38:21 +0800 Subject: [PATCH 569/674] ASoC: atmel: Remove system clock tree configuration for at91sam9g20ek stable inclusion from stable-v5.10.113 commit 608fc58858bfa7552a9824c2f0e4a3ab8dd4efaa category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=608fc58858bfa7552a9824c2f0e4a3ab8dd4efaa -------------------------------- [ Upstream commit c775cbf62ed4911e4f0f23880f01815753123690 ] The MCLK of the WM8731 on the AT91SAM9G20-EK board is connected to the PCK0 output of the SoC, intended in the reference software to be supplied using PLLB and programmed to 12MHz. As originally written for use with a board file the audio driver was responsible for configuring the entire tree but in the conversion to the common clock framework the registration of the named pck0 and pllb clocks was removed so the driver has failed to instantiate ever since. Since the WM8731 driver has had support for managing a MCLK provided via the common clock framework for some time we can simply drop all the clock management code from the machine driver other than configuration of the sysclk rate, the CODEC driver still respects that configuration from the machine driver. Fixes: ff78a189b0ae55f ("ARM: at91: remove old at91-specific clock driver") Signed-off-by: Mark Brown Reviewed-by: Codrin Ciubotariu Link: https://lore.kernel.org/r/20220325154241.1600757-2-broonie@kernel.org Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- sound/soc/atmel/sam9g20_wm8731.c | 61 -------------------------------- 1 file changed, 61 deletions(-) diff --git a/sound/soc/atmel/sam9g20_wm8731.c b/sound/soc/atmel/sam9g20_wm8731.c index 8a55d59a6c2a..d243de5f23dc 100644 --- a/sound/soc/atmel/sam9g20_wm8731.c +++ b/sound/soc/atmel/sam9g20_wm8731.c @@ -46,35 +46,6 @@ */ #undef ENABLE_MIC_INPUT -static struct clk *mclk; - -static int at91sam9g20ek_set_bias_level(struct snd_soc_card *card, - struct snd_soc_dapm_context *dapm, - enum snd_soc_bias_level level) -{ - static int mclk_on; - int ret = 0; - - switch (level) { - case SND_SOC_BIAS_ON: - case SND_SOC_BIAS_PREPARE: - if (!mclk_on) - ret = clk_enable(mclk); - if (ret == 0) - mclk_on = 1; - break; - - case SND_SOC_BIAS_OFF: - case SND_SOC_BIAS_STANDBY: - if (mclk_on) - clk_disable(mclk); - mclk_on = 0; - break; - } - - return ret; -} - static const struct snd_soc_dapm_widget at91sam9g20ek_dapm_widgets[] = { SND_SOC_DAPM_MIC("Int Mic", NULL), SND_SOC_DAPM_SPK("Ext Spk", NULL), @@ -135,7 +106,6 @@ static struct snd_soc_card snd_soc_at91sam9g20ek = { .owner = THIS_MODULE, .dai_link = &at91sam9g20ek_dai, .num_links = 1, - .set_bias_level = at91sam9g20ek_set_bias_level, .dapm_widgets = at91sam9g20ek_dapm_widgets, .num_dapm_widgets = ARRAY_SIZE(at91sam9g20ek_dapm_widgets), @@ -148,7 +118,6 @@ static int at91sam9g20ek_audio_probe(struct platform_device *pdev) { struct device_node *np = pdev->dev.of_node; struct device_node *codec_np, *cpu_np; - struct clk *pllb; struct snd_soc_card *card = &snd_soc_at91sam9g20ek; int ret; @@ -162,31 +131,6 @@ static int at91sam9g20ek_audio_probe(struct platform_device *pdev) return -EINVAL; } - /* - * Codec MCLK is supplied by PCK0 - set it up. - */ - mclk = clk_get(NULL, "pck0"); - if (IS_ERR(mclk)) { - dev_err(&pdev->dev, "Failed to get MCLK\n"); - ret = PTR_ERR(mclk); - goto err; - } - - pllb = clk_get(NULL, "pllb"); - if (IS_ERR(pllb)) { - dev_err(&pdev->dev, "Failed to get PLLB\n"); - ret = PTR_ERR(pllb); - goto err_mclk; - } - ret = clk_set_parent(mclk, pllb); - clk_put(pllb); - if (ret != 0) { - dev_err(&pdev->dev, "Failed to set MCLK parent\n"); - goto err_mclk; - } - - clk_set_rate(mclk, MCLK_RATE); - card->dev = &pdev->dev; /* Parse device node info */ @@ -230,9 +174,6 @@ static int at91sam9g20ek_audio_probe(struct platform_device *pdev) return ret; -err_mclk: - clk_put(mclk); - mclk = NULL; err: atmel_ssc_put_audio(0); return ret; @@ -242,8 +183,6 @@ static int at91sam9g20ek_audio_remove(struct platform_device *pdev) { struct snd_soc_card *card = platform_get_drvdata(pdev); - clk_disable(mclk); - mclk = NULL; snd_soc_unregister_card(card); atmel_ssc_put_audio(0); -- Gitee From f7ec810a47d7f46e7b1739befe01d4850fd7ca31 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Tue, 26 Jul 2022 17:38:22 +0800 Subject: [PATCH 570/674] ASoC: msm8916-wcd-digital: Check failure for devm_snd_soc_register_component stable inclusion from stable-v5.10.113 commit b6f474cd3097b9570d5c2c30075ca3506fbbcf33 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=b6f474cd3097b9570d5c2c30075ca3506fbbcf33 -------------------------------- [ Upstream commit e927b05f3cc20de87f6b7d912a5bbe556931caca ] devm_snd_soc_register_component() may fails, we should check the error and do the corresponding error handling. Fixes: 150db8c5afa1 ("ASoC: codecs: Add msm8916-wcd digital codec") Signed-off-by: Miaoqian Lin Link: https://lore.kernel.org/r/20220403115239.30140-1-linmq006@gmail.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- sound/soc/codecs/msm8916-wcd-digital.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/sound/soc/codecs/msm8916-wcd-digital.c b/sound/soc/codecs/msm8916-wcd-digital.c index 9ad7fc0baf07..20a07c92b2fc 100644 --- a/sound/soc/codecs/msm8916-wcd-digital.c +++ b/sound/soc/codecs/msm8916-wcd-digital.c @@ -1206,9 +1206,16 @@ static int msm8916_wcd_digital_probe(struct platform_device *pdev) dev_set_drvdata(dev, priv); - return devm_snd_soc_register_component(dev, &msm8916_wcd_digital, + ret = devm_snd_soc_register_component(dev, &msm8916_wcd_digital, msm8916_wcd_digital_dai, ARRAY_SIZE(msm8916_wcd_digital_dai)); + if (ret) + goto err_mclk; + + return 0; + +err_mclk: + clk_disable_unprepare(priv->mclk); err_clk: clk_disable_unprepare(priv->ahbclk); return ret; -- Gitee From ecf9eaf316dc293ccd09a729f924840b09fe60db Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Tue, 26 Jul 2022 17:38:23 +0800 Subject: [PATCH 571/674] ASoC: codecs: wcd934x: do not switch off SIDO Buck when codec is in use stable inclusion from stable-v5.10.113 commit 12aa8021c7a72811cd8096b1e456cd5d265896d9 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=12aa8021c7a72811cd8096b1e456cd5d265896d9 -------------------------------- [ Upstream commit db6dd1bee63d1d88fbddfe07af800af5948ac28e ] SIDO(Single-Inductor Dual-Ouput) Buck powers up both analog and digital circuits along with internal memory, powering off this is the last thing that codec should do when going to very low power. Current code was powering off this Buck if there are no users of sysclk, which is not correct. Powering off this buck will result in no register access. This code path was never tested until recently after adding pm support in SoundWire controller. Fix this by removing the buck poweroff when the codec is active and also the code that is not used. Without this patch all the read/write transactions will never complete and results in SLIMBus Errors like: qcom,slim-ngd qcom,slim-ngd.1: Tx:MT:0x0, MC:0x60, LA:0xcf failed:-110 wcd934x-codec wcd934x-codec.1.auto: ASoC: error at soc_component_read_no_lock on wcd934x-codec.1.auto for register: [0x00000d05] -110 qcom,slim-ngd-ctrl 171c0000.slim: Error Interrupt received 0x82000000 Reported-by: Amit Pundir Fixes: a61f3b4f476e ("ASoC: wcd934x: add support to wcd9340/wcd9341 codec") Signed-off-by: Srinivas Kandagatla Tested-by: Amit Pundir Link: https://lore.kernel.org/r/20220407094313.2880-1-srinivas.kandagatla@linaro.org Signed-off-by: Mark Brown Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- sound/soc/codecs/wcd934x.c | 26 +------------------------- 1 file changed, 1 insertion(+), 25 deletions(-) diff --git a/sound/soc/codecs/wcd934x.c b/sound/soc/codecs/wcd934x.c index 8540ac230d0e..fd704df9b175 100644 --- a/sound/soc/codecs/wcd934x.c +++ b/sound/soc/codecs/wcd934x.c @@ -1188,29 +1188,7 @@ static int wcd934x_set_sido_input_src(struct wcd934x_codec *wcd, int sido_src) if (sido_src == wcd->sido_input_src) return 0; - if (sido_src == SIDO_SOURCE_INTERNAL) { - regmap_update_bits(wcd->regmap, WCD934X_ANA_BUCK_CTL, - WCD934X_ANA_BUCK_HI_ACCU_EN_MASK, 0); - usleep_range(100, 110); - regmap_update_bits(wcd->regmap, WCD934X_ANA_BUCK_CTL, - WCD934X_ANA_BUCK_HI_ACCU_PRE_ENX_MASK, 0x0); - usleep_range(100, 110); - regmap_update_bits(wcd->regmap, WCD934X_ANA_RCO, - WCD934X_ANA_RCO_BG_EN_MASK, 0); - usleep_range(100, 110); - regmap_update_bits(wcd->regmap, WCD934X_ANA_BUCK_CTL, - WCD934X_ANA_BUCK_PRE_EN1_MASK, - WCD934X_ANA_BUCK_PRE_EN1_ENABLE); - usleep_range(100, 110); - regmap_update_bits(wcd->regmap, WCD934X_ANA_BUCK_CTL, - WCD934X_ANA_BUCK_PRE_EN2_MASK, - WCD934X_ANA_BUCK_PRE_EN2_ENABLE); - usleep_range(100, 110); - regmap_update_bits(wcd->regmap, WCD934X_ANA_BUCK_CTL, - WCD934X_ANA_BUCK_HI_ACCU_EN_MASK, - WCD934X_ANA_BUCK_HI_ACCU_ENABLE); - usleep_range(100, 110); - } else if (sido_src == SIDO_SOURCE_RCO_BG) { + if (sido_src == SIDO_SOURCE_RCO_BG) { regmap_update_bits(wcd->regmap, WCD934X_ANA_RCO, WCD934X_ANA_RCO_BG_EN_MASK, WCD934X_ANA_RCO_BG_ENABLE); @@ -1296,8 +1274,6 @@ static int wcd934x_disable_ana_bias_and_syclk(struct wcd934x_codec *wcd) regmap_update_bits(wcd->regmap, WCD934X_CLK_SYS_MCLK_PRG, WCD934X_EXT_CLK_BUF_EN_MASK | WCD934X_MCLK_EN_MASK, 0x0); - wcd934x_set_sido_input_src(wcd, SIDO_SOURCE_INTERNAL); - regmap_update_bits(wcd->regmap, WCD934X_ANA_BIAS, WCD934X_ANA_BIAS_EN_MASK, 0); regmap_update_bits(wcd->regmap, WCD934X_ANA_BIAS, -- Gitee From 4be6d449af6b349bdeeb388d9a40f814d8120112 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Tue, 26 Jul 2022 17:38:24 +0800 Subject: [PATCH 572/674] dmaengine: imx-sdma: Fix error checking in sdma_event_remap stable inclusion from stable-v5.10.113 commit 9bc949a181ba805b76b81d1ae1519af093328b81 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=9bc949a181ba805b76b81d1ae1519af093328b81 -------------------------------- [ Upstream commit 7104b9cb35a33ad803a1adbbfa50569b008faf15 ] of_parse_phandle() returns NULL on errors, rather than error pointers. Using NULL check on grp_np to fix this. Fixes: d078cd1b4185 ("dmaengine: imx-sdma: Add imx6sx platform support") Signed-off-by: Miaoqian Lin Link: https://lore.kernel.org/r/20220308064952.15743-1-linmq006@gmail.com Signed-off-by: Vinod Koul Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/dma/imx-sdma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/dma/imx-sdma.c b/drivers/dma/imx-sdma.c index 306f93e4b26a..792c91cd1608 100644 --- a/drivers/dma/imx-sdma.c +++ b/drivers/dma/imx-sdma.c @@ -1789,7 +1789,7 @@ static int sdma_event_remap(struct sdma_engine *sdma) u32 reg, val, shift, num_map, i; int ret = 0; - if (IS_ERR(np) || IS_ERR(gpr_np)) + if (IS_ERR(np) || !gpr_np) goto out; event_remap = of_find_property(np, propname, NULL); @@ -1837,7 +1837,7 @@ static int sdma_event_remap(struct sdma_engine *sdma) } out: - if (!IS_ERR(gpr_np)) + if (gpr_np) of_node_put(gpr_np); return ret; -- Gitee From dfc6a02d0e0f49da209a4e3fce9e1e52775ae7a8 Mon Sep 17 00:00:00 2001 From: zhangqilong Date: Tue, 26 Jul 2022 17:38:25 +0800 Subject: [PATCH 573/674] dmaengine: mediatek:Fix PM usage reference leak of mtk_uart_apdma_alloc_chan_resources stable inclusion from stable-v5.10.113 commit f714abf28f819849d2fd93a4a8db15cff3f8798e category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f714abf28f819849d2fd93a4a8db15cff3f8798e -------------------------------- [ Upstream commit 545b2baac89b859180e51215468c05d85ea8465a ] pm_runtime_get_sync will increment pm usage counter even it failed. Forgetting to putting operation will result in reference leak here. We fix it: 1) Replacing it with pm_runtime_resume_and_get to keep usage counter balanced. 2) Add putting operation before returning error. Fixes:9135408c3ace4 ("dmaengine: mediatek: Add MediaTek UART APDMA support") Signed-off-by: Zhang Qilong Link: https://lore.kernel.org/r/20220319022142.142709-1-zhangqilong3@huawei.com Signed-off-by: Vinod Koul Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/dma/mediatek/mtk-uart-apdma.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/dma/mediatek/mtk-uart-apdma.c b/drivers/dma/mediatek/mtk-uart-apdma.c index 375e7e647df6..a1517ef1f4a0 100644 --- a/drivers/dma/mediatek/mtk-uart-apdma.c +++ b/drivers/dma/mediatek/mtk-uart-apdma.c @@ -274,7 +274,7 @@ static int mtk_uart_apdma_alloc_chan_resources(struct dma_chan *chan) unsigned int status; int ret; - ret = pm_runtime_get_sync(mtkd->ddev.dev); + ret = pm_runtime_resume_and_get(mtkd->ddev.dev); if (ret < 0) { pm_runtime_put_noidle(chan->device->dev); return ret; @@ -288,18 +288,21 @@ static int mtk_uart_apdma_alloc_chan_resources(struct dma_chan *chan) ret = readx_poll_timeout(readl, c->base + VFF_EN, status, !status, 10, 100); if (ret) - return ret; + goto err_pm; ret = request_irq(c->irq, mtk_uart_apdma_irq_handler, IRQF_TRIGGER_NONE, KBUILD_MODNAME, chan); if (ret < 0) { dev_err(chan->device->dev, "Can't request dma IRQ\n"); - return -EINVAL; + ret = -EINVAL; + goto err_pm; } if (mtkd->support_33bits) mtk_uart_apdma_write(c, VFF_4G_SUPPORT, VFF_4G_SUPPORT_CLR_B); +err_pm: + pm_runtime_put_noidle(mtkd->ddev.dev); return ret; } -- Gitee From 4ab150b4794d5ccbae344f03323fce6d73d66fcd Mon Sep 17 00:00:00 2001 From: Allen-KH Cheng Date: Tue, 26 Jul 2022 17:38:26 +0800 Subject: [PATCH 574/674] spi: spi-mtk-nor: initialize spi controller after resume stable inclusion from stable-v5.10.113 commit 3f7914dbeacdffe4fa198dee7afefa74998b5d8d category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=3f7914dbeacdffe4fa198dee7afefa74998b5d8d -------------------------------- [ Upstream commit 317c2045618cc1f8d38beb8c93a7bdb6ad8638c6 ] After system resumes, the registers of nor controller are initialized with default values. The nor controller will not function properly. To handle both issues above, we add mtk_nor_init() in mtk_nor_resume after pm_runtime_force_resume(). Fixes: 3bfd9103c7af ("spi: spi-mtk-nor: Add power management support") Signed-off-by: Allen-KH Cheng Reviewed-by: Rex-BC Chen Link: https://lore.kernel.org/r/20220412115743.22641-1-allen-kh.cheng@mediatek.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/spi/spi-mtk-nor.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/spi/spi-mtk-nor.c b/drivers/spi/spi-mtk-nor.c index 288f6c2bbd57..106e3cacba4c 100644 --- a/drivers/spi/spi-mtk-nor.c +++ b/drivers/spi/spi-mtk-nor.c @@ -895,7 +895,17 @@ static int __maybe_unused mtk_nor_suspend(struct device *dev) static int __maybe_unused mtk_nor_resume(struct device *dev) { - return pm_runtime_force_resume(dev); + struct spi_controller *ctlr = dev_get_drvdata(dev); + struct mtk_nor *sp = spi_controller_get_devdata(ctlr); + int ret; + + ret = pm_runtime_force_resume(dev); + if (ret) + return ret; + + mtk_nor_init(sp); + + return 0; } static const struct dev_pm_ops mtk_nor_pm_ops = { -- Gitee From 79b7e6ed7a6170a8f469c8035db58f163600a410 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Tue, 26 Jul 2022 17:38:27 +0800 Subject: [PATCH 575/674] esp: limit skb_page_frag_refill use to a single page stable inclusion from stable-v5.10.113 commit c075c3ea031757f8ea2d34567565b61a868c08d5 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=c075c3ea031757f8ea2d34567565b61a868c08d5 -------------------------------- [ Upstream commit 5bd8baab087dff657e05387aee802e70304cc813 ] Commit ebe48d368e97 ("esp: Fix possible buffer overflow in ESP transformation") tried to fix skb_page_frag_refill usage in ESP by capping allocsize to 32k, but that doesn't completely solve the issue, as skb_page_frag_refill may return a single page. If that happens, we will write out of bounds, despite the check introduced in the previous patch. This patch forces COW in cases where we would end up calling skb_page_frag_refill with a size larger than a page (first in esp_output_head with tailen, then in esp_output_tail with skb->data_len). Fixes: cac2661c53f3 ("esp4: Avoid skb_cow_data whenever possible") Fixes: 03e2a30f6a27 ("esp6: Avoid skb_cow_data whenever possible") Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- include/net/esp.h | 2 -- net/ipv4/esp4.c | 5 ++--- net/ipv6/esp6.c | 5 ++--- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/include/net/esp.h b/include/net/esp.h index 90cd02ff77ef..9c5637d41d95 100644 --- a/include/net/esp.h +++ b/include/net/esp.h @@ -4,8 +4,6 @@ #include -#define ESP_SKB_FRAG_MAXSIZE (PAGE_SIZE << SKB_FRAG_PAGE_ORDER) - struct ip_esp_hdr; static inline struct ip_esp_hdr *ip_esp_hdr(const struct sk_buff *skb) diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 9aae82145bc1..20d738137841 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -448,7 +448,6 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * struct page *page; struct sk_buff *trailer; int tailen = esp->tailen; - unsigned int allocsz; /* this is non-NULL only with TCP/UDP Encapsulation */ if (x->encap) { @@ -458,8 +457,8 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * return err; } - allocsz = ALIGN(skb->data_len + tailen, L1_CACHE_BYTES); - if (allocsz > ESP_SKB_FRAG_MAXSIZE) + if (ALIGN(tailen, L1_CACHE_BYTES) > PAGE_SIZE || + ALIGN(skb->data_len, L1_CACHE_BYTES) > PAGE_SIZE) goto cow; if (!skb_cloned(skb)) { diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 20c7bef6829e..cb28f8928f9e 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -483,7 +483,6 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info struct page *page; struct sk_buff *trailer; int tailen = esp->tailen; - unsigned int allocsz; if (x->encap) { int err = esp6_output_encap(x, skb, esp); @@ -492,8 +491,8 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info return err; } - allocsz = ALIGN(skb->data_len + tailen, L1_CACHE_BYTES); - if (allocsz > ESP_SKB_FRAG_MAXSIZE) + if (ALIGN(tailen, L1_CACHE_BYTES) > PAGE_SIZE || + ALIGN(skb->data_len, L1_CACHE_BYTES) > PAGE_SIZE) goto cow; if (!skb_cloned(skb)) { -- Gitee From f648582b76cf544acd8efbeed6f5fa0020acf160 Mon Sep 17 00:00:00 2001 From: Sasha Neftin Date: Tue, 26 Jul 2022 17:38:28 +0800 Subject: [PATCH 576/674] igc: Fix infinite loop in release_swfw_sync stable inclusion from stable-v5.10.113 commit 46b0e4f998ceead45f273be4d282b7fbb70f6125 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=46b0e4f998ceead45f273be4d282b7fbb70f6125 -------------------------------- [ Upstream commit 907862e9aef75bf89e2b265efcc58870be06081e ] An infinite loop may occur if we fail to acquire the HW semaphore, which is needed for resource release. This will typically happen if the hardware is surprise-removed. At this stage there is nothing to do, except log an error and quit. Fixes: c0071c7aa5fe ("igc: Add HW initialization code") Suggested-by: Dima Ruinskiy Signed-off-by: Sasha Neftin Tested-by: Naama Meir Signed-off-by: Tony Nguyen Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/net/ethernet/intel/igc/igc_i225.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_i225.c b/drivers/net/ethernet/intel/igc/igc_i225.c index 553d6bc78e6b..624236a4202e 100644 --- a/drivers/net/ethernet/intel/igc/igc_i225.c +++ b/drivers/net/ethernet/intel/igc/igc_i225.c @@ -156,8 +156,15 @@ void igc_release_swfw_sync_i225(struct igc_hw *hw, u16 mask) { u32 swfw_sync; - while (igc_get_hw_semaphore_i225(hw)) - ; /* Empty */ + /* Releasing the resource requires first getting the HW semaphore. + * If we fail to get the semaphore, there is nothing we can do, + * except log an error and quit. We are not allowed to hang here + * indefinitely, as it may cause denial of service or system crash. + */ + if (igc_get_hw_semaphore_i225(hw)) { + hw_dbg("Failed to release SW_FW_SYNC.\n"); + return; + } swfw_sync = rd32(IGC_SW_FW_SYNC); swfw_sync &= ~mask; -- Gitee From 39d9023b5d9c8d03091fb5c7c0781836854451bb Mon Sep 17 00:00:00 2001 From: Sasha Neftin Date: Tue, 26 Jul 2022 17:38:29 +0800 Subject: [PATCH 577/674] igc: Fix BUG: scheduling while atomic stable inclusion from stable-v5.10.113 commit fc7116a79a86500e15520658f279b6536ff9f700 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=fc7116a79a86500e15520658f279b6536ff9f700 -------------------------------- [ Upstream commit c80a29f0fe9b6f5457e0788e27d1110577eba99b ] Replace usleep_range() method with udelay() method to allow atomic contexts in low-level MDIO access functions. The following issue can be seen by doing the following: $ modprobe -r bonding $ modprobe -v bonding max_bonds=1 mode=1 miimon=100 use_carrier=0 $ ip link set bond0 up $ ifenslave bond0 eth0 eth1 [ 982.357308] BUG: scheduling while atomic: kworker/u64:0/9/0x00000002 [ 982.364431] INFO: lockdep is turned off. [ 982.368824] Modules linked in: bonding sctp ip6_udp_tunnel udp_tunnel mlx4_ib ib_uverbs ib_core mlx4_en mlx4_core nfp tls sunrpc intel_rapl_msr iTCO_wdt iTCO_vendor_support mxm_wmi dcdbas intel_rapl_common sb_edac x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel kvm irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel rapl intel_cstate intel_uncore pcspkr lpc_ich mei_me ipmi_ssif mei ipmi_si ipmi_devintf ipmi_msghandler wmi acpi_power_meter xfs libcrc32c sr_mod cdrom sd_mod t10_pi sg mgag200 drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops drm ahci libahci crc32c_intel libata i2c_algo_bit tg3 megaraid_sas igc dm_mirror dm_region_hash dm_log dm_mod [last unloaded: bonding] [ 982.437941] CPU: 25 PID: 9 Comm: kworker/u64:0 Kdump: loaded Tainted: G W --------- - - 4.18.0-348.el8.x86_64+debug #1 [ 982.451333] Hardware name: Dell Inc. PowerEdge R730/0H21J3, BIOS 2.7.0 12/005/2017 [ 982.459791] Workqueue: bond0 bond_mii_monitor [bonding] [ 982.465622] Call Trace: [ 982.468355] dump_stack+0x8e/0xd0 [ 982.472056] __schedule_bug.cold.60+0x3a/0x60 [ 982.476919] __schedule+0x147b/0x1bc0 [ 982.481007] ? firmware_map_remove+0x16b/0x16b [ 982.485967] ? hrtimer_fixup_init+0x40/0x40 [ 982.490625] schedule+0xd9/0x250 [ 982.494227] schedule_hrtimeout_range_clock+0x10d/0x2c0 [ 982.500058] ? hrtimer_nanosleep_restart+0x130/0x130 [ 982.505598] ? hrtimer_init_sleeper_on_stack+0x90/0x90 [ 982.511332] ? usleep_range+0x88/0x130 [ 982.515514] ? recalibrate_cpu_khz+0x10/0x10 [ 982.520279] ? ktime_get+0xab/0x1c0 [ 982.524175] ? usleep_range+0x88/0x130 [ 982.528355] usleep_range+0xdd/0x130 [ 982.532344] ? console_conditional_schedule+0x30/0x30 [ 982.537987] ? igc_put_hw_semaphore+0x17/0x60 [igc] [ 982.543432] igc_read_phy_reg_gpy+0x111/0x2b0 [igc] [ 982.548887] igc_phy_has_link+0xfa/0x260 [igc] [ 982.553847] ? igc_get_phy_id+0x210/0x210 [igc] [ 982.558894] ? lock_acquire+0x34d/0x890 [ 982.563187] ? lock_downgrade+0x710/0x710 [ 982.567659] ? rcu_read_unlock+0x50/0x50 [ 982.572039] igc_check_for_copper_link+0x106/0x210 [igc] [ 982.577970] ? igc_config_fc_after_link_up+0x840/0x840 [igc] [ 982.584286] ? rcu_read_unlock+0x50/0x50 [ 982.588661] ? lock_release+0x591/0xb80 [ 982.592939] ? lock_release+0x591/0xb80 [ 982.597220] igc_has_link+0x113/0x330 [igc] [ 982.601887] ? lock_downgrade+0x710/0x710 [ 982.606362] igc_ethtool_get_link+0x6d/0x90 [igc] [ 982.611614] bond_check_dev_link+0x131/0x2c0 [bonding] [ 982.617350] ? bond_time_in_interval+0xd0/0xd0 [bonding] [ 982.623277] ? rcu_read_lock_held+0x62/0xc0 [ 982.627944] ? rcu_read_lock_sched_held+0xe0/0xe0 [ 982.633198] bond_mii_monitor+0x314/0x2500 [bonding] [ 982.638738] ? lock_contended+0x880/0x880 [ 982.643214] ? bond_miimon_link_change+0xa0/0xa0 [bonding] [ 982.649336] ? lock_acquire+0x34d/0x890 [ 982.653615] ? lock_downgrade+0x710/0x710 [ 982.658089] ? debug_object_deactivate+0x221/0x340 [ 982.663436] ? rcu_read_unlock+0x50/0x50 [ 982.667811] ? debug_print_object+0x2b0/0x2b0 [ 982.672672] ? __switch_to_asm+0x41/0x70 [ 982.677049] ? __switch_to_asm+0x35/0x70 [ 982.681426] ? _raw_spin_unlock_irq+0x24/0x40 [ 982.686288] ? trace_hardirqs_on+0x20/0x195 [ 982.690956] ? _raw_spin_unlock_irq+0x24/0x40 [ 982.695818] process_one_work+0x8f0/0x1770 [ 982.700390] ? pwq_dec_nr_in_flight+0x320/0x320 [ 982.705443] ? debug_show_held_locks+0x50/0x50 [ 982.710403] worker_thread+0x87/0xb40 [ 982.714489] ? process_one_work+0x1770/0x1770 [ 982.719349] kthread+0x344/0x410 [ 982.722950] ? kthread_insert_work_sanity_check+0xd0/0xd0 [ 982.728975] ret_from_fork+0x3a/0x50 Fixes: 5586838fe9ce ("igc: Add code for PHY support") Reported-by: Corinna Vinschen Suggested-by: Dima Ruinskiy Signed-off-by: Sasha Neftin Tested-by: Corinna Vinschen Tested-by: Naama Meir Signed-off-by: Tony Nguyen Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/net/ethernet/intel/igc/igc_phy.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_phy.c b/drivers/net/ethernet/intel/igc/igc_phy.c index e380b7a3ea63..8de4de2e5636 100644 --- a/drivers/net/ethernet/intel/igc/igc_phy.c +++ b/drivers/net/ethernet/intel/igc/igc_phy.c @@ -583,7 +583,7 @@ static s32 igc_read_phy_reg_mdic(struct igc_hw *hw, u32 offset, u16 *data) * the lower time out */ for (i = 0; i < IGC_GEN_POLL_TIMEOUT; i++) { - usleep_range(500, 1000); + udelay(50); mdic = rd32(IGC_MDIC); if (mdic & IGC_MDIC_READY) break; @@ -640,7 +640,7 @@ static s32 igc_write_phy_reg_mdic(struct igc_hw *hw, u32 offset, u16 data) * the lower time out */ for (i = 0; i < IGC_GEN_POLL_TIMEOUT; i++) { - usleep_range(500, 1000); + udelay(50); mdic = rd32(IGC_MDIC); if (mdic & IGC_MDIC_READY) break; -- Gitee From ab18d18b75f5f270c8f87b9743007d904f37b66e Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 26 Jul 2022 17:38:30 +0800 Subject: [PATCH 578/674] rxrpc: Restore removed timer deletion stable inclusion from stable-v5.10.113 commit 60592f16a456d239df9060968dd9cb561d0a6fb0 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=60592f16a456d239df9060968dd9cb561d0a6fb0 -------------------------------- [ Upstream commit ee3b0826b4764f6c13ad6db67495c5a1c38e9025 ] A recent patch[1] from Eric Dumazet flipped the order in which the keepalive timer and the keepalive worker were cancelled in order to fix a syzbot reported issue[2]. Unfortunately, this enables the mirror image bug whereby the timer races with rxrpc_exit_net(), restarting the worker after it has been cancelled: CPU 1 CPU 2 =============== ===================== if (rxnet->live) rxnet->live = false; cancel_work_sync(&rxnet->peer_keepalive_work); rxrpc_queue_work(&rxnet->peer_keepalive_work); del_timer_sync(&rxnet->peer_keepalive_timer); Fix this by restoring the removed del_timer_sync() so that we try to remove the timer twice. If the timer runs again, it should see ->live == false and not restart the worker. Fixes: 1946014ca3b1 ("rxrpc: fix a race in rxrpc_exit_net()") Signed-off-by: David Howells cc: Eric Dumazet cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://lore.kernel.org/r/20220404183439.3537837-1-eric.dumazet@gmail.com/ [1] Link: https://syzkaller.appspot.com/bug?extid=724378c4bb58f703b09a [2] Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- net/rxrpc/net_ns.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c index f15d6942da45..cc7e30733feb 100644 --- a/net/rxrpc/net_ns.c +++ b/net/rxrpc/net_ns.c @@ -113,7 +113,9 @@ static __net_exit void rxrpc_exit_net(struct net *net) struct rxrpc_net *rxnet = rxrpc_net(net); rxnet->live = false; + del_timer_sync(&rxnet->peer_keepalive_timer); cancel_work_sync(&rxnet->peer_keepalive_work); + /* Remove the timer again as the worker may have restarted it. */ del_timer_sync(&rxnet->peer_keepalive_timer); rxrpc_destroy_all_calls(rxnet); rxrpc_destroy_all_connections(rxnet); -- Gitee From 27b6e01f60e73dc8944d2d18ed50a60a32e68f08 Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Tue, 26 Jul 2022 17:38:31 +0800 Subject: [PATCH 579/674] net/smc: Fix sock leak when release after smc_shutdown() stable inclusion from stable-v5.10.113 commit a499cb5f3ef9f976eac96c02adbcc505764e2b91 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=a499cb5f3ef9f976eac96c02adbcc505764e2b91 -------------------------------- [ Upstream commit 1a74e99323746353bba11562a2f2d0aa8102f402 ] Since commit e5d5aadcf3cd ("net/smc: fix sk_refcnt underflow on linkdown and fallback"), for a fallback connection, __smc_release() does not call sock_put() if its state is already SMC_CLOSED. When calling smc_shutdown() after falling back, its state is set to SMC_CLOSED but does not call sock_put(), so this patch calls it. Reported-and-tested-by: syzbot+6e29a053eb165bd50de5@syzkaller.appspotmail.com Fixes: e5d5aadcf3cd ("net/smc: fix sk_refcnt underflow on linkdown and fallback") Signed-off-by: Tony Lu Acked-by: Karsten Graul Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- net/smc/af_smc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 4f16d406ad8e..1b98f3241150 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2144,8 +2144,10 @@ static int smc_shutdown(struct socket *sock, int how) if (smc->use_fallback) { rc = kernel_sock_shutdown(smc->clcsock, how); sk->sk_shutdown = smc->clcsock->sk->sk_shutdown; - if (sk->sk_shutdown == SHUTDOWN_MASK) + if (sk->sk_shutdown == SHUTDOWN_MASK) { sk->sk_state = SMC_CLOSED; + sock_put(sk); + } goto out; } switch (how) { -- Gitee From 3059be689186bbdbc081b5cdc9f8b08e78487795 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 26 Jul 2022 17:38:32 +0800 Subject: [PATCH 580/674] net/packet: fix packet_sock xmit return value checking stable inclusion from stable-v5.10.113 commit 8fb76adb89f0d576c107e05fd922bf3d04c470d9 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=8fb76adb89f0d576c107e05fd922bf3d04c470d9 -------------------------------- [ Upstream commit 29e8e659f984be00d75ec5fef4e37c88def72712 ] packet_sock xmit could be dev_queue_xmit, which also returns negative errors. So only checking positive errors is not enough, or userspace sendmsg may return success while packet is not send out. Move the net_xmit_errno() assignment in the braces as checkpatch.pl said do not use assignment in if condition. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: Flavio Leitner Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- net/packet/af_packet.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index d0c95d7dd292..5ee600d108a0 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2817,8 +2817,9 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) status = TP_STATUS_SEND_REQUEST; err = po->xmit(skb); - if (unlikely(err > 0)) { - err = net_xmit_errno(err); + if (unlikely(err != 0)) { + if (err > 0) + err = net_xmit_errno(err); if (err && __packet_get_status(po, ph) == TP_STATUS_AVAILABLE) { /* skb was destructed already */ @@ -3019,8 +3020,12 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) skb->no_fcs = 1; err = po->xmit(skb); - if (err > 0 && (err = net_xmit_errno(err)) != 0) - goto out_unlock; + if (unlikely(err != 0)) { + if (err > 0) + err = net_xmit_errno(err); + if (err) + goto out_unlock; + } dev_put(dev); -- Gitee From beeab1369997e4479ef51ac7c3a97460393a01e0 Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Tue, 26 Jul 2022 17:38:33 +0800 Subject: [PATCH 581/674] ip6_gre: Avoid updating tunnel->tun_hlen in __gre6_xmit() stable inclusion from stable-v5.10.113 commit 200f96ebb389788dea563e8a0b1105863a836039 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=200f96ebb389788dea563e8a0b1105863a836039 -------------------------------- [ Upstream commit f40c064e933d7787ca7411b699504d7a2664c1f5 ] Do not update tunnel->tun_hlen in data plane code. Use a local variable instead, just like "tunnel_hlen" in net/ipv4/ip_gre.c:gre_fb_xmit(). Co-developed-by: Cong Wang Signed-off-by: Cong Wang Signed-off-by: Peilin Ye Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- net/ipv6/ip6_gre.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 9a0263f25232..949d6fbc1ca0 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -743,6 +743,7 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; __be16 flags; + int tun_hlen; tun_info = skb_tunnel_info_txcheck(skb); if (IS_ERR(tun_info) || @@ -760,9 +761,9 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, dsfield = key->tos; flags = key->tun_flags & (TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ); - tunnel->tun_hlen = gre_calc_hlen(flags); + tun_hlen = gre_calc_hlen(flags); - gre_build_header(skb, tunnel->tun_hlen, + gre_build_header(skb, tun_hlen, flags, protocol, tunnel_id_to_key32(tun_info->key.tun_id), (flags & TUNNEL_SEQ) ? htonl(tunnel->o_seqno++) -- Gitee From be914f9d3ae55f43b0733a64e2b769bcd02d7d0c Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Tue, 26 Jul 2022 17:38:34 +0800 Subject: [PATCH 582/674] ip6_gre: Fix skb_under_panic in __gre6_xmit() stable inclusion from stable-v5.10.113 commit 93366275be72bc55cada31a2a1273147553acab8 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=93366275be72bc55cada31a2a1273147553acab8 -------------------------------- [ Upstream commit ab198e1d0dd8dc4bc7575fb50758e2cbd51e14e1 ] Feng reported an skb_under_panic BUG triggered by running test_ip6gretap() in tools/testing/selftests/bpf/test_tunnel.sh: [ 82.492551] skbuff: skb_under_panic: text:ffffffffb268bb8e len:403 put:12 head:ffff9997c5480000 data:ffff9997c547fff8 tail:0x18b end:0x2c0 dev:ip6gretap11 <...> [ 82.607380] Call Trace: [ 82.609389] [ 82.611136] skb_push.cold.109+0x10/0x10 [ 82.614289] __gre6_xmit+0x41e/0x590 [ 82.617169] ip6gre_tunnel_xmit+0x344/0x3f0 [ 82.620526] dev_hard_start_xmit+0xf1/0x330 [ 82.623882] sch_direct_xmit+0xe4/0x250 [ 82.626961] __dev_queue_xmit+0x720/0xfe0 <...> [ 82.633431] packet_sendmsg+0x96a/0x1cb0 [ 82.636568] sock_sendmsg+0x30/0x40 <...> The following sequence of events caused the BUG: 1. During ip6gretap device initialization, tunnel->tun_hlen (e.g. 4) is calculated based on old flags (see ip6gre_calc_hlen()); 2. packet_snd() reserves header room for skb A, assuming tunnel->tun_hlen is 4; 3. Later (in clsact Qdisc), the eBPF program sets a new tunnel key for skb A using bpf_skb_set_tunnel_key() (see _ip6gretap_set_tunnel()); 4. __gre6_xmit() detects the new tunnel key, and recalculates "tun_hlen" (e.g. 12) based on new flags (e.g. TUNNEL_KEY and TUNNEL_SEQ); 5. gre_build_header() calls skb_push() with insufficient reserved header room, triggering the BUG. As sugguested by Cong, fix it by moving the call to skb_cow_head() after the recalculation of tun_hlen. Reproducer: OBJ=$LINUX/tools/testing/selftests/bpf/test_tunnel_kern.o ip netns add at_ns0 ip link add veth0 type veth peer name veth1 ip link set veth0 netns at_ns0 ip netns exec at_ns0 ip addr add 172.16.1.100/24 dev veth0 ip netns exec at_ns0 ip link set dev veth0 up ip link set dev veth1 up mtu 1500 ip addr add dev veth1 172.16.1.200/24 ip netns exec at_ns0 ip addr add ::11/96 dev veth0 ip netns exec at_ns0 ip link set dev veth0 up ip addr add dev veth1 ::22/96 ip link set dev veth1 up ip netns exec at_ns0 \ ip link add dev ip6gretap00 type ip6gretap seq flowlabel 0xbcdef key 2 \ local ::11 remote ::22 ip netns exec at_ns0 ip addr add dev ip6gretap00 10.1.1.100/24 ip netns exec at_ns0 ip addr add dev ip6gretap00 fc80::100/96 ip netns exec at_ns0 ip link set dev ip6gretap00 up ip link add dev ip6gretap11 type ip6gretap external ip addr add dev ip6gretap11 10.1.1.200/24 ip addr add dev ip6gretap11 fc80::200/24 ip link set dev ip6gretap11 up tc qdisc add dev ip6gretap11 clsact tc filter add dev ip6gretap11 egress bpf da obj $OBJ sec ip6gretap_set_tunnel tc filter add dev ip6gretap11 ingress bpf da obj $OBJ sec ip6gretap_get_tunnel ping6 -c 3 -w 10 -q ::11 Fixes: 6712abc168eb ("ip6_gre: add ip6 gre and gretap collect_md mode") Reported-by: Feng Zhou Co-developed-by: Cong Wang Signed-off-by: Cong Wang Signed-off-by: Peilin Ye Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- net/ipv6/ip6_gre.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 949d6fbc1ca0..1f6c752f13b4 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -733,9 +733,6 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, else fl6->daddr = tunnel->parms.raddr; - if (skb_cow_head(skb, dev->needed_headroom ?: tunnel->hlen)) - return -ENOMEM; - /* Push GRE header. */ protocol = (dev->type == ARPHRD_ETHER) ? htons(ETH_P_TEB) : proto; @@ -763,6 +760,9 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, (TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ); tun_hlen = gre_calc_hlen(flags); + if (skb_cow_head(skb, dev->needed_headroom ?: tun_hlen + tunnel->encap_hlen)) + return -ENOMEM; + gre_build_header(skb, tun_hlen, flags, protocol, tunnel_id_to_key32(tun_info->key.tun_id), @@ -773,6 +773,9 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, if (tunnel->parms.o_flags & TUNNEL_SEQ) tunnel->o_seqno++; + if (skb_cow_head(skb, dev->needed_headroom ?: tunnel->hlen)) + return -ENOMEM; + gre_build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags, protocol, tunnel->parms.o_key, htonl(tunnel->o_seqno)); -- Gitee From cf223164578f4184522fc77982b2d795305ba27e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Jul 2022 17:38:35 +0800 Subject: [PATCH 583/674] net/sched: cls_u32: fix possible leak in u32_init_knode() stable inclusion from stable-v5.10.113 commit 0ac8f83d8f6471e39b3cfe8ee48560df3bf9c451 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=0ac8f83d8f6471e39b3cfe8ee48560df3bf9c451 -------------------------------- [ Upstream commit ec5b0f605b105457f257f2870acad4a5d463984b ] While investigating a related syzbot report, I found that whenever call to tcf_exts_init() from u32_init_knode() is failing, we end up with an elevated refcount on ht->refcnt To avoid that, only increase the refcount after all possible errors have been evaluated. Fixes: b9a24bb76bf6 ("net_sched: properly handle failure case of tcf_exts_init()") Signed-off-by: Eric Dumazet Cc: Cong Wang Cc: Jiri Pirko Acked-by: Jamal Hadi Salim Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- net/sched/cls_u32.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index b61db335c49d..da042bc8b239 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -814,10 +814,6 @@ static struct tc_u_knode *u32_init_knode(struct net *net, struct tcf_proto *tp, new->flags = n->flags; RCU_INIT_POINTER(new->ht_down, ht); - /* bump reference count as long as we hold pointer to structure */ - if (ht) - ht->refcnt++; - #ifdef CONFIG_CLS_U32_PERF /* Statistics may be incremented by readers during update * so we must keep them in tact. When the node is later destroyed @@ -839,6 +835,10 @@ static struct tc_u_knode *u32_init_knode(struct net *net, struct tcf_proto *tp, return NULL; } + /* bump reference count as long as we hold pointer to structure */ + if (ht) + ht->refcnt++; + return new; } -- Gitee From aa8ab17792740416d50918eaf63bc38ed591fbc2 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 26 Jul 2022 17:38:36 +0800 Subject: [PATCH 584/674] l3mdev: l3mdev_master_upper_ifindex_by_index_rcu should be using netdev_master_upper_dev_get_rcu stable inclusion from stable-v5.10.113 commit 078d839f11acfa9bf5b549d2189abda9f2a943a0 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=078d839f11acfa9bf5b549d2189abda9f2a943a0 -------------------------------- [ Upstream commit 83daab06252ee5d0e1f4373ff28b79304945fc19 ] Next patch uses l3mdev_master_upper_ifindex_by_index_rcu which throws a splat with debug kernels: [13783.087570] ------------[ cut here ]------------ [13783.093974] RTNL: assertion failed at net/core/dev.c (6702) [13783.100761] WARNING: CPU: 3 PID: 51132 at net/core/dev.c:6702 netdev_master_upper_dev_get+0x16a/0x1a0 [13783.184226] CPU: 3 PID: 51132 Comm: kworker/3:3 Not tainted 5.17.0-custom-100090-g6f963aafb1cc #682 [13783.194788] Hardware name: Mellanox Technologies Ltd. MSN2010/SA002610, BIOS 5.6.5 08/24/2017 [13783.204755] Workqueue: mld mld_ifc_work [ipv6] [13783.210338] RIP: 0010:netdev_master_upper_dev_get+0x16a/0x1a0 [13783.217209] Code: 0f 85 e3 fe ff ff e8 65 ac ec fe ba 2e 1a 00 00 48 c7 c6 60 6f 38 83 48 c7 c7 c0 70 38 83 c6 05 5e b5 d7 01 01 e8 c6 29 52 00 <0f> 0b e9 b8 fe ff ff e8 5a 6c 35 ff e9 1c ff ff ff 48 89 ef e8 7d [13783.238659] RSP: 0018:ffffc9000b37f5a8 EFLAGS: 00010286 [13783.244995] RAX: 0000000000000000 RBX: ffff88812ee5c000 RCX: 0000000000000000 [13783.253379] RDX: ffff88811ce09d40 RSI: ffffffff812d0fcd RDI: fffff5200166fea7 [13783.261769] RBP: 0000000000000000 R08: 0000000000000001 R09: ffff8882375f4287 [13783.270138] R10: ffffed1046ebe850 R11: 0000000000000001 R12: dffffc0000000000 [13783.278510] R13: 0000000000000275 R14: ffffc9000b37f688 R15: ffff8881273b4af8 [13783.286870] FS: 0000000000000000(0000) GS:ffff888237400000(0000) knlGS:0000000000000000 [13783.296352] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [13783.303177] CR2: 00007ff25fc9b2e8 CR3: 0000000174d23000 CR4: 00000000001006e0 [13783.311546] Call Trace: [13783.314660] [13783.317553] l3mdev_master_upper_ifindex_by_index_rcu+0x43/0xe0 ... Change l3mdev_master_upper_ifindex_by_index_rcu to use netdev_master_upper_dev_get_rcu. Fixes: 6a6d6681ac1a ("l3mdev: add function to retreive upper master") Signed-off-by: Ido Schimmel Signed-off-by: David Ahern Cc: Alexis Bauvin Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- net/l3mdev/l3mdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c index 864326f150e2..f2c3a61ad134 100644 --- a/net/l3mdev/l3mdev.c +++ b/net/l3mdev/l3mdev.c @@ -147,7 +147,7 @@ int l3mdev_master_upper_ifindex_by_index_rcu(struct net *net, int ifindex) dev = dev_get_by_index_rcu(net, ifindex); while (dev && !netif_is_l3_master(dev)) - dev = netdev_master_upper_dev_get(dev); + dev = netdev_master_upper_dev_get_rcu(dev); return dev ? dev->ifindex : 0; } -- Gitee From 3bd107cbe765f64dbde8b4282a01282e5a9be749 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Jul 2022 17:38:37 +0800 Subject: [PATCH 585/674] ipv6: make ip6_rt_gc_expire an atomic_t stable inclusion from stable-v5.10.113 commit 49516e6ed91434d022a800321a8bc7d8054f62ac category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=49516e6ed91434d022a800321a8bc7d8054f62ac -------------------------------- [ Upstream commit 9cb7c013420f98fa6fd12fc6a5dc055170c108db ] Reads and Writes to ip6_rt_gc_expire always have been racy, as syzbot reported lately [1] There is a possible risk of under-flow, leading to unexpected high value passed to fib6_run_gc(), although I have not observed this in the field. Hosts hitting ip6_dst_gc() very hard are under pretty bad state anyway. [1] BUG: KCSAN: data-race in ip6_dst_gc / ip6_dst_gc read-write to 0xffff888102110744 of 4 bytes by task 13165 on cpu 1: ip6_dst_gc+0x1f3/0x220 net/ipv6/route.c:3311 dst_alloc+0x9b/0x160 net/core/dst.c:86 ip6_dst_alloc net/ipv6/route.c:344 [inline] icmp6_dst_alloc+0xb2/0x360 net/ipv6/route.c:3261 mld_sendpack+0x2b9/0x580 net/ipv6/mcast.c:1807 mld_send_cr net/ipv6/mcast.c:2119 [inline] mld_ifc_work+0x576/0x800 net/ipv6/mcast.c:2651 process_one_work+0x3d3/0x720 kernel/workqueue.c:2289 worker_thread+0x618/0xa70 kernel/workqueue.c:2436 kthread+0x1a9/0x1e0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 read-write to 0xffff888102110744 of 4 bytes by task 11607 on cpu 0: ip6_dst_gc+0x1f3/0x220 net/ipv6/route.c:3311 dst_alloc+0x9b/0x160 net/core/dst.c:86 ip6_dst_alloc net/ipv6/route.c:344 [inline] icmp6_dst_alloc+0xb2/0x360 net/ipv6/route.c:3261 mld_sendpack+0x2b9/0x580 net/ipv6/mcast.c:1807 mld_send_cr net/ipv6/mcast.c:2119 [inline] mld_ifc_work+0x576/0x800 net/ipv6/mcast.c:2651 process_one_work+0x3d3/0x720 kernel/workqueue.c:2289 worker_thread+0x618/0xa70 kernel/workqueue.c:2436 kthread+0x1a9/0x1e0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 value changed: 0x00000bb3 -> 0x00000ba9 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 11607 Comm: kworker/0:21 Not tainted 5.18.0-rc1-syzkaller-00037-g42e7a03d3bad-dirty #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: mld mld_ifc_work Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reported-by: syzbot Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20220413181333.649424-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Conflicts: include/net/netns/ipv6.h Acked-by: Xie XiuQi --- include/net/netns/ipv6.h | 4 ++-- net/ipv6/route.c | 11 ++++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index b2a28201f4fd..35e0c4a6c71f 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -79,8 +79,8 @@ struct netns_ipv6 { struct dst_ops ip6_dst_ops; rwlock_t fib6_walker_lock; spinlock_t fib6_gc_lock; - unsigned int ip6_rt_gc_expire; - unsigned long ip6_rt_last_gc; + atomic_t ip6_rt_gc_expire; + unsigned long ip6_rt_last_gc; #ifdef CONFIG_IPV6_MULTIPLE_TABLES unsigned int fib6_rules_require_fldissect; #endif diff --git a/net/ipv6/route.c b/net/ipv6/route.c index a44ad9637e8a..4ef59dc515e5 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3187,6 +3187,7 @@ static int ip6_dst_gc(struct dst_ops *ops) int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; + unsigned int val; int entries; entries = dst_entries_get_fast(ops); @@ -3197,13 +3198,13 @@ static int ip6_dst_gc(struct dst_ops *ops) entries <= rt_max_size) goto out; - net->ipv6.ip6_rt_gc_expire++; - fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true); + fib6_run_gc(atomic_inc_return(&net->ipv6.ip6_rt_gc_expire), net, true); entries = dst_entries_get_slow(ops); if (entries < ops->gc_thresh) - net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; + atomic_set(&net->ipv6.ip6_rt_gc_expire, rt_gc_timeout >> 1); out: - net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity; + val = atomic_read(&net->ipv6.ip6_rt_gc_expire); + atomic_set(&net->ipv6.ip6_rt_gc_expire, val - (val >> rt_elasticity)); return entries > rt_max_size; } @@ -6358,7 +6359,7 @@ static int __net_init ip6_route_net_init(struct net *net) net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; net->ipv6.sysctl.skip_notify_on_dev_down = 0; - net->ipv6.ip6_rt_gc_expire = 30*HZ; + atomic_set(&net->ipv6.ip6_rt_gc_expire, 30*HZ); ret = 0; out: -- Gitee From 6007bb2484ce25d53b3df5fb1dba2aea01cef119 Mon Sep 17 00:00:00 2001 From: Xu Jia Date: Tue, 26 Jul 2022 17:38:38 +0800 Subject: [PATCH 586/674] ipv6: fix kabi for ip6_rt_gc_expire in struct netns_ipv6 hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH -------------------------------- Making ip6_rt_gc_expire atomic breaks the KABI of struct netns_ipv6. This patch uses KABI_REPLACE to fix it. Signed-off-by: Xu Jia Signed-off-by: Zheng Zengkai Reviewed-by: Yue Haibing Acked-by: Xie XiuQi --- include/net/netns/ipv6.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 35e0c4a6c71f..31d0cf3c7377 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -79,7 +79,7 @@ struct netns_ipv6 { struct dst_ops ip6_dst_ops; rwlock_t fib6_walker_lock; spinlock_t fib6_gc_lock; - atomic_t ip6_rt_gc_expire; + KABI_REPLACE(unsigned int ip6_rt_gc_expire, atomic_t ip6_rt_gc_expire) unsigned long ip6_rt_last_gc; #ifdef CONFIG_IPV6_MULTIPLE_TABLES unsigned int fib6_rules_require_fldissect; -- Gitee From d7434e9cbc0f809add85594b97bcd8731050cfe9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Jul 2022 17:38:39 +0800 Subject: [PATCH 587/674] netlink: reset network and mac headers in netlink_dump() stable inclusion from stable-v5.10.113 commit 3d55b195747c1259c994b868a509beae947a7c6e category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=3d55b195747c1259c994b868a509beae947a7c6e -------------------------------- [ Upstream commit 99c07327ae11e24886d552dddbe4537bfca2765d ] netlink_dump() is allocating an skb, reserves space in it but forgets to reset network header. This allows a BPF program, invoked later from sk_filter() to access uninitialized kernel memory from the reserved space. Theorically mac header reset could be omitted, because it is set to a special initial value. bpf_internal_load_pointer_neg_helper calls skb_mac_header() without checking skb_mac_header_was_set(). Relying on skb->len not being too big seems fragile. We also could add a sanity check in bpf_internal_load_pointer_neg_helper() to avoid surprises in the future. syzbot report was: BUG: KMSAN: uninit-value in ___bpf_prog_run+0xa22b/0xb420 kernel/bpf/core.c:1637 ___bpf_prog_run+0xa22b/0xb420 kernel/bpf/core.c:1637 __bpf_prog_run32+0x121/0x180 kernel/bpf/core.c:1796 bpf_dispatcher_nop_func include/linux/bpf.h:784 [inline] __bpf_prog_run include/linux/filter.h:626 [inline] bpf_prog_run include/linux/filter.h:633 [inline] __bpf_prog_run_save_cb+0x168/0x580 include/linux/filter.h:756 bpf_prog_run_save_cb include/linux/filter.h:770 [inline] sk_filter_trim_cap+0x3bc/0x8c0 net/core/filter.c:150 sk_filter include/linux/filter.h:905 [inline] netlink_dump+0xe0c/0x16c0 net/netlink/af_netlink.c:2276 netlink_recvmsg+0x1129/0x1c80 net/netlink/af_netlink.c:2002 sock_recvmsg_nosec net/socket.c:948 [inline] sock_recvmsg net/socket.c:966 [inline] sock_read_iter+0x5a9/0x630 net/socket.c:1039 do_iter_readv_writev+0xa7f/0xc70 do_iter_read+0x52c/0x14c0 fs/read_write.c:786 vfs_readv fs/read_write.c:906 [inline] do_readv+0x432/0x800 fs/read_write.c:943 __do_sys_readv fs/read_write.c:1034 [inline] __se_sys_readv fs/read_write.c:1031 [inline] __x64_sys_readv+0xe5/0x120 fs/read_write.c:1031 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x54/0xd0 arch/x86/entry/common.c:81 entry_SYSCALL_64_after_hwframe+0x44/0xae Uninit was stored to memory at: ___bpf_prog_run+0x96c/0xb420 kernel/bpf/core.c:1558 __bpf_prog_run32+0x121/0x180 kernel/bpf/core.c:1796 bpf_dispatcher_nop_func include/linux/bpf.h:784 [inline] __bpf_prog_run include/linux/filter.h:626 [inline] bpf_prog_run include/linux/filter.h:633 [inline] __bpf_prog_run_save_cb+0x168/0x580 include/linux/filter.h:756 bpf_prog_run_save_cb include/linux/filter.h:770 [inline] sk_filter_trim_cap+0x3bc/0x8c0 net/core/filter.c:150 sk_filter include/linux/filter.h:905 [inline] netlink_dump+0xe0c/0x16c0 net/netlink/af_netlink.c:2276 netlink_recvmsg+0x1129/0x1c80 net/netlink/af_netlink.c:2002 sock_recvmsg_nosec net/socket.c:948 [inline] sock_recvmsg net/socket.c:966 [inline] sock_read_iter+0x5a9/0x630 net/socket.c:1039 do_iter_readv_writev+0xa7f/0xc70 do_iter_read+0x52c/0x14c0 fs/read_write.c:786 vfs_readv fs/read_write.c:906 [inline] do_readv+0x432/0x800 fs/read_write.c:943 __do_sys_readv fs/read_write.c:1034 [inline] __se_sys_readv fs/read_write.c:1031 [inline] __x64_sys_readv+0xe5/0x120 fs/read_write.c:1031 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x54/0xd0 arch/x86/entry/common.c:81 entry_SYSCALL_64_after_hwframe+0x44/0xae Uninit was created at: slab_post_alloc_hook mm/slab.h:737 [inline] slab_alloc_node mm/slub.c:3244 [inline] __kmalloc_node_track_caller+0xde3/0x14f0 mm/slub.c:4972 kmalloc_reserve net/core/skbuff.c:354 [inline] __alloc_skb+0x545/0xf90 net/core/skbuff.c:426 alloc_skb include/linux/skbuff.h:1158 [inline] netlink_dump+0x30f/0x16c0 net/netlink/af_netlink.c:2242 netlink_recvmsg+0x1129/0x1c80 net/netlink/af_netlink.c:2002 sock_recvmsg_nosec net/socket.c:948 [inline] sock_recvmsg net/socket.c:966 [inline] sock_read_iter+0x5a9/0x630 net/socket.c:1039 do_iter_readv_writev+0xa7f/0xc70 do_iter_read+0x52c/0x14c0 fs/read_write.c:786 vfs_readv fs/read_write.c:906 [inline] do_readv+0x432/0x800 fs/read_write.c:943 __do_sys_readv fs/read_write.c:1034 [inline] __se_sys_readv fs/read_write.c:1031 [inline] __x64_sys_readv+0xe5/0x120 fs/read_write.c:1031 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x54/0xd0 arch/x86/entry/common.c:81 entry_SYSCALL_64_after_hwframe+0x44/0xae CPU: 0 PID: 3470 Comm: syz-executor751 Not tainted 5.17.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Fixes: db65a3aaf29e ("netlink: Trim skb to alloc size to avoid MSG_TRUNC") Fixes: 9063e21fb026 ("netlink: autosize skb lengthes") Signed-off-by: Eric Dumazet Reported-by: syzbot Link: https://lore.kernel.org/r/20220415181442.551228-1-eric.dumazet@gmail.com Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- net/netlink/af_netlink.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index f37916156ca5..cbfb601c4ee9 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2276,6 +2276,13 @@ static int netlink_dump(struct sock *sk) * single netdev. The outcome is MSG_TRUNC error. */ skb_reserve(skb, skb_tailroom(skb) - alloc_size); + + /* Make sure malicious BPF programs can not read unitialized memory + * from skb->head -> skb->data + */ + skb_reset_network_header(skb); + skb_reset_mac_header(skb); + netlink_skb_set_owner_r(skb, sk); if (nlk->dump_done_errno > 0) { -- Gitee From ceaf8ddd59c3450db33261efc14a28835bafb9ce Mon Sep 17 00:00:00 2001 From: Kevin Hao Date: Tue, 26 Jul 2022 17:38:40 +0800 Subject: [PATCH 588/674] net: stmmac: Use readl_poll_timeout_atomic() in atomic state stable inclusion from stable-v5.10.113 commit f593f49fcd174ebe0ebeebecf46fadc134593ddb category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f593f49fcd174ebe0ebeebecf46fadc134593ddb -------------------------------- [ Upstream commit 234901de2bc6847eaa0aeb4aba62c31ffb8d3ad6 ] The init_systime() may be invoked in atomic state. We have observed the following call trace when running "phc_ctl /dev/ptp0 set" on a Intel Agilex board. BUG: sleeping function called from invalid context at drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c:74 in_atomic(): 1, irqs_disabled(): 128, non_block: 0, pid: 381, name: phc_ctl preempt_count: 1, expected: 0 RCU nest depth: 0, expected: 0 Preemption disabled at: [] stmmac_set_time+0x34/0x8c CPU: 2 PID: 381 Comm: phc_ctl Not tainted 5.18.0-rc2-next-20220414-yocto-standard+ #567 Hardware name: SoCFPGA Agilex SoCDK (DT) Call trace: dump_backtrace.part.0+0xc4/0xd0 show_stack+0x24/0x40 dump_stack_lvl+0x7c/0xa0 dump_stack+0x18/0x34 __might_resched+0x154/0x1c0 __might_sleep+0x58/0x90 init_systime+0x78/0x120 stmmac_set_time+0x64/0x8c ptp_clock_settime+0x60/0x9c pc_clock_settime+0x6c/0xc0 __arm64_sys_clock_settime+0x88/0xf0 invoke_syscall+0x5c/0x130 el0_svc_common.constprop.0+0x4c/0x100 do_el0_svc+0x7c/0xa0 el0_svc+0x58/0xcc el0t_64_sync_handler+0xa4/0x130 el0t_64_sync+0x18c/0x190 So we should use readl_poll_timeout_atomic() here instead of readl_poll_timeout(). Also adjust the delay time to 10us to fix a "__bad_udelay" build error reported by "kernel test robot ". I have tested this on Intel Agilex and NXP S32G boards, there is no delay needed at all. So the 10us delay should be long enough for most cases. Fixes: ff8ed737860e ("net: stmmac: use readl_poll_timeout() function in init_systime()") Signed-off-by: Kevin Hao Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c index 07b1b8374cd2..53efcc9c40e2 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c @@ -68,9 +68,9 @@ static int init_systime(void __iomem *ioaddr, u32 sec, u32 nsec) writel(value, ioaddr + PTP_TCR); /* wait for present system time initialize to complete */ - return readl_poll_timeout(ioaddr + PTP_TCR, value, + return readl_poll_timeout_atomic(ioaddr + PTP_TCR, value, !(value & PTP_TCR_TSINIT), - 10000, 100000); + 10, 100000); } static int config_addend(void __iomem *ioaddr, u32 addend) -- Gitee From 1d394f14128d12cf5b6313d895622aad5fd5a5ba Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 26 Jul 2022 17:38:41 +0800 Subject: [PATCH 589/674] dmaengine: idxd: add RO check for wq max_batch_size write stable inclusion from stable-v5.10.113 commit 9a3c026dc3a59fa23c63d90752f07538a3c75646 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=9a3c026dc3a59fa23c63d90752f07538a3c75646 -------------------------------- [ Upstream commit 66903461ffed0b66fc3e0200082d4e09365aacdc ] Block wq_max_batch_size_store() when the device is configured as read-only and not configurable. Fixes: e7184b159dd3 ("dmaengine: idxd: add support for configurable max wq batch size") Reported-by: Bernice Zhang Tested-by: Bernice Zhang Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/164971493551.2201159.1942042593642155209.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/dma/idxd/sysfs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 7b41cdff1a2c..5bf4b4be64e4 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -1132,6 +1132,9 @@ static ssize_t wq_max_batch_size_store(struct device *dev, struct device_attribu u64 batch_size; int rc; + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + if (wq->state != IDXD_WQ_DISABLED) return -EPERM; -- Gitee From c324bed6bcd3c12392271dddbbbd03c23acddd8d Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 26 Jul 2022 17:38:42 +0800 Subject: [PATCH 590/674] dmaengine: idxd: add RO check for wq max_transfer_size write stable inclusion from stable-v5.10.113 commit 520aab8b723cb2bffef957844b634af04e96428d category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=520aab8b723cb2bffef957844b634af04e96428d -------------------------------- [ Upstream commit 505a2d1032ae656b0a8c736be110255503941cde ] Block wq_max_transfer_size_store() when the device is configured as read-only and not configurable. Fixes: d7aad5550eca ("dmaengine: idxd: add support for configurable max wq xfer size") Reported-by: Bernice Zhang Tested-by: Bernice Zhang Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/164971488154.2200913.10706665404118545941.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/dma/idxd/sysfs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 5bf4b4be64e4..51af0dfc3c63 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -1098,6 +1098,9 @@ static ssize_t wq_max_transfer_size_store(struct device *dev, struct device_attr u64 xfer_size; int rc; + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + if (wq->state != IDXD_WQ_DISABLED) return -EPERM; -- Gitee From aec8147faa831c7bd51628f95d992afe0942c107 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 26 Jul 2022 17:38:43 +0800 Subject: [PATCH 591/674] selftests: mlxsw: vxlan_flooding: Prevent flooding of unwanted packets stable inclusion from stable-v5.10.113 commit 3bf8ca35017024fa1cad55344f798cd5cd131c16 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=3bf8ca35017024fa1cad55344f798cd5cd131c16 -------------------------------- [ Upstream commit 044011fdf162c5dd61c02841930c8f438a9adadb ] The test verifies that packets are correctly flooded by the bridge and the VXLAN device by matching on the encapsulated packets at the other end. However, if packets other than those generated by the test also ingress the bridge (e.g., MLD packets), they will be flooded as well and interfere with the expected count. Make the test more robust by making sure that only the packets generated by the test can ingress the bridge. Drop all the rest using tc filters on the egress of 'br0' and 'h1'. In the software data path, the problem can be solved by matching on the inner destination MAC or dropping unwanted packets at the egress of the VXLAN device, but this is not currently supported by mlxsw. Fixes: 94d302deae25 ("selftests: mlxsw: Add a test for VxLAN flooding") Signed-off-by: Ido Schimmel Reviewed-by: Amit Cohen Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- .../drivers/net/mlxsw/vxlan_flooding.sh | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tools/testing/selftests/drivers/net/mlxsw/vxlan_flooding.sh b/tools/testing/selftests/drivers/net/mlxsw/vxlan_flooding.sh index fedcb7b35af9..af5ea50ed5c0 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/vxlan_flooding.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/vxlan_flooding.sh @@ -172,6 +172,17 @@ flooding_filters_add() local lsb local i + # Prevent unwanted packets from entering the bridge and interfering + # with the test. + tc qdisc add dev br0 clsact + tc filter add dev br0 egress protocol all pref 1 handle 1 \ + matchall skip_hw action drop + tc qdisc add dev $h1 clsact + tc filter add dev $h1 egress protocol all pref 1 handle 1 \ + flower skip_hw dst_mac de:ad:be:ef:13:37 action pass + tc filter add dev $h1 egress protocol all pref 2 handle 2 \ + matchall skip_hw action drop + tc qdisc add dev $rp2 clsact for i in $(eval echo {1..$num_remotes}); do @@ -194,6 +205,12 @@ flooding_filters_del() done tc qdisc del dev $rp2 clsact + + tc filter del dev $h1 egress protocol all pref 2 handle 2 matchall + tc filter del dev $h1 egress protocol all pref 1 handle 1 flower + tc qdisc del dev $h1 clsact + tc filter del dev br0 egress protocol all pref 1 handle 1 matchall + tc qdisc del dev br0 clsact } flooding_check_packets() -- Gitee From a260b077acedec14a79fb4dc4e06619ac344af7f Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 26 Jul 2022 17:38:44 +0800 Subject: [PATCH 592/674] arm64/mm: Remove [PUD|PMD]_TABLE_BIT from [pud|pmd]_bad() stable inclusion from stable-v5.10.113 commit 18ff7a2efa4e14a6bd86aad31a6a8cebdf0dbabc category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=18ff7a2efa4e14a6bd86aad31a6a8cebdf0dbabc -------------------------------- [ Upstream commit e377ab82311af95c99648c6424a6b888a0ccb102 ] Semantics wise, [pud|pmd]_bad() have always implied that a given [PUD|PMD] entry does not have a pointer to the next level page table. This had been made clear in the commit a1c76574f345 ("arm64: mm: use *_sect to check for section maps"). Hence explicitly check for a table entry rather than just testing a single bit. This basically redefines [pud|pmd]_bad() in terms of [pud|pmd]_table() making the semantics clear. Cc: Catalin Marinas Cc: Will Deacon Cc: Mark Rutland Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Anshuman Khandual Acked-by: Mark Rutland Acked-by: Catalin Marinas Acked-by: Mark Rutland Link: https://lore.kernel.org/r/1620644871-26280-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Will Deacon Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- arch/arm64/include/asm/pgtable.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index daed33697d98..19d18bca6958 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -516,13 +516,12 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, #define pmd_none(pmd) (!pmd_val(pmd)) -#define pmd_bad(pmd) (!(pmd_val(pmd) & PMD_TABLE_BIT)) - #define pmd_table(pmd) ((pmd_val(pmd) & PMD_TYPE_MASK) == \ PMD_TYPE_TABLE) #define pmd_sect(pmd) ((pmd_val(pmd) & PMD_TYPE_MASK) == \ PMD_TYPE_SECT) #define pmd_leaf(pmd) pmd_sect(pmd) +#define pmd_bad(pmd) (!pmd_table(pmd)) #if defined(CONFIG_ARM64_64K_PAGES) || CONFIG_PGTABLE_LEVELS < 3 static inline bool pud_sect(pud_t pud) { return false; } @@ -606,7 +605,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) pr_err("%s:%d: bad pmd %016llx.\n", __FILE__, __LINE__, pmd_val(e)) #define pud_none(pud) (!pud_val(pud)) -#define pud_bad(pud) (!(pud_val(pud) & PUD_TABLE_BIT)) +#define pud_bad(pud) (!pud_table(pud)) #define pud_present(pud) pte_present(pud_pte(pud)) #define pud_leaf(pud) pud_sect(pud) #define pud_valid(pud) pte_valid(pud_pte(pud)) -- Gitee From 286861f72d40a9d40e51210f620f52c090483b8a Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 26 Jul 2022 17:38:45 +0800 Subject: [PATCH 593/674] arm64: mm: fix p?d_leaf() stable inclusion from stable-v5.10.113 commit 052e4a661f90d27586b4364764dcfb2c022b00ee category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=052e4a661f90d27586b4364764dcfb2c022b00ee -------------------------------- [ Upstream commit 23bc8f69f0eceecbb87c3801d2e48827d2dca92b ] The pmd_leaf() is used to test a leaf mapped PMD, however, it misses the PROT_NONE mapped PMD on arm64. Fix it. A real world issue [1] caused by this was reported by Qian Cai. Also fix pud_leaf(). Link: https://patchwork.kernel.org/comment/24798260/ [1] Fixes: 8aa82df3c123 ("arm64: mm: add p?d_leaf() definitions") Reported-by: Qian Cai Signed-off-by: Muchun Song Link: https://lore.kernel.org/r/20220422060033.48711-1-songmuchun@bytedance.com Signed-off-by: Will Deacon Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- arch/arm64/include/asm/pgtable.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 19d18bca6958..ab2443900f4e 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -520,7 +520,7 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, PMD_TYPE_TABLE) #define pmd_sect(pmd) ((pmd_val(pmd) & PMD_TYPE_MASK) == \ PMD_TYPE_SECT) -#define pmd_leaf(pmd) pmd_sect(pmd) +#define pmd_leaf(pmd) (pmd_present(pmd) && !pmd_table(pmd)) #define pmd_bad(pmd) (!pmd_table(pmd)) #if defined(CONFIG_ARM64_64K_PAGES) || CONFIG_PGTABLE_LEVELS < 3 @@ -607,7 +607,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) #define pud_none(pud) (!pud_val(pud)) #define pud_bad(pud) (!pud_table(pud)) #define pud_present(pud) pte_present(pud_pte(pud)) -#define pud_leaf(pud) pud_sect(pud) +#define pud_leaf(pud) (pud_present(pud) && !pud_table(pud)) #define pud_valid(pud) pte_valid(pud_pte(pud)) static inline void set_pud(pud_t *pudp, pud_t pud) -- Gitee From be31be301f9aa6457bb1cd1ac8ba374db3ecba79 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 26 Jul 2022 17:38:46 +0800 Subject: [PATCH 594/674] ARM: vexpress/spc: Avoid negative array index when !SMP stable inclusion from stable-v5.10.113 commit d513ea9b7ef822b638857c7114d26af108e4bab1 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=d513ea9b7ef822b638857c7114d26af108e4bab1 -------------------------------- [ Upstream commit b3f1dd52c991d79118f35e6d1bf4d7cb09882e38 ] When building multi_v7_defconfig+CONFIG_SMP=n, -Warray-bounds exposes a couple negative array index accesses: arch/arm/mach-vexpress/spc.c: In function 've_spc_clk_init': arch/arm/mach-vexpress/spc.c:583:21: warning: array subscript -1 is below array bounds of 'bool[2]' {aka '_Bool[2]'} [-Warray-bounds] 583 | if (init_opp_table[cluster]) | ~~~~~~~~~~~~~~^~~~~~~~~ arch/arm/mach-vexpress/spc.c:556:7: note: while referencing 'init_opp_table' 556 | bool init_opp_table[MAX_CLUSTERS] = { false }; | ^~~~~~~~~~~~~~ arch/arm/mach-vexpress/spc.c:592:18: warning: array subscript -1 is below array bounds of 'bool[2]' {aka '_Bool[2]'} [-Warray-bounds] 592 | init_opp_table[cluster] = true; | ~~~~~~~~~~~~~~^~~~~~~~~ arch/arm/mach-vexpress/spc.c:556:7: note: while referencing 'init_opp_table' 556 | bool init_opp_table[MAX_CLUSTERS] = { false }; | ^~~~~~~~~~~~~~ Skip this logic when built !SMP. Link: https://lore.kernel.org/r/20220331190443.851661-1-keescook@chromium.org Cc: Liviu Dudau Cc: Sudeep Holla Cc: Lorenzo Pieralisi Cc: Russell King Cc: linux-arm-kernel@lists.infradead.org Acked-by: Liviu Dudau Signed-off-by: Kees Cook Signed-off-by: Sudeep Holla Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- arch/arm/mach-vexpress/spc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/mach-vexpress/spc.c b/arch/arm/mach-vexpress/spc.c index 1da11bdb1dfb..1c6500c4e6a1 100644 --- a/arch/arm/mach-vexpress/spc.c +++ b/arch/arm/mach-vexpress/spc.c @@ -580,7 +580,7 @@ static int __init ve_spc_clk_init(void) } cluster = topology_physical_package_id(cpu_dev->id); - if (init_opp_table[cluster]) + if (cluster < 0 || init_opp_table[cluster]) continue; if (ve_init_opp_table(cpu_dev)) -- Gitee From c8a5122e572911c692240f7a894883779217a520 Mon Sep 17 00:00:00 2001 From: Sameer Pujar Date: Tue, 26 Jul 2022 17:38:47 +0800 Subject: [PATCH 595/674] reset: tegra-bpmp: Restore Handle errors in BPMP response stable inclusion from stable-v5.10.113 commit cb17b56a9b4de46c057691b524c692c10e8d9d51 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=cb17b56a9b4de46c057691b524c692c10e8d9d51 -------------------------------- [ Upstream commit d1da1052ffad63aa5181b69f20a6952e31f339c2 ] This reverts following commit 69125b4b9440 ("reset: tegra-bpmp: Revert Handle errors in BPMP response"). The Tegra194 HDA reset failure is fixed by commit d278dc9151a0 ("ALSA: hda/tegra: Fix Tegra194 HDA reset failure"). The temporary revert of original commit c045ceb5a145 ("reset: tegra-bpmp: Handle errors in BPMP response") can be removed now. Signed-off-by: Sameer Pujar Tested-by: Jon Hunter Reviewed-by: Jon Hunter Acked-by: Thierry Reding Signed-off-by: Philipp Zabel Link: https://lore.kernel.org/r/1641995806-15245-1-git-send-email-spujar@nvidia.com Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/reset/tegra/reset-bpmp.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/reset/tegra/reset-bpmp.c b/drivers/reset/tegra/reset-bpmp.c index 24d3395964cc..4c5bba52b105 100644 --- a/drivers/reset/tegra/reset-bpmp.c +++ b/drivers/reset/tegra/reset-bpmp.c @@ -20,6 +20,7 @@ static int tegra_bpmp_reset_common(struct reset_controller_dev *rstc, struct tegra_bpmp *bpmp = to_tegra_bpmp(rstc); struct mrq_reset_request request; struct tegra_bpmp_message msg; + int err; memset(&request, 0, sizeof(request)); request.cmd = command; @@ -30,7 +31,13 @@ static int tegra_bpmp_reset_common(struct reset_controller_dev *rstc, msg.tx.data = &request; msg.tx.size = sizeof(request); - return tegra_bpmp_transfer(bpmp, &msg); + err = tegra_bpmp_transfer(bpmp, &msg); + if (err) + return err; + if (msg.rx.ret) + return -EINVAL; + + return 0; } static int tegra_bpmp_reset_module(struct reset_controller_dev *rstc, -- Gitee From f0894049e9ec662a3fbb60d84f3124663a1efb9b Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Tue, 26 Jul 2022 17:38:48 +0800 Subject: [PATCH 596/674] platform/x86: samsung-laptop: Fix an unsigned comparison which can never be negative stable inclusion from stable-v5.10.113 commit 490815f0b50e42238a22ea4b2dc64105f5a59e6f category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=490815f0b50e42238a22ea4b2dc64105f5a59e6f -------------------------------- [ Upstream commit 0284d4d1be753f648f28b77bdfbe6a959212af5c ] Eliminate the follow smatch warnings: drivers/platform/x86/samsung-laptop.c:1124 kbd_led_set() warn: unsigned 'value' is never less than zero. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Link: https://lore.kernel.org/r/20220322061830.105579-1-jiapeng.chong@linux.alibaba.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/platform/x86/samsung-laptop.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/platform/x86/samsung-laptop.c b/drivers/platform/x86/samsung-laptop.c index d5cec6e35bb8..0e456c39a603 100644 --- a/drivers/platform/x86/samsung-laptop.c +++ b/drivers/platform/x86/samsung-laptop.c @@ -1121,8 +1121,6 @@ static void kbd_led_set(struct led_classdev *led_cdev, if (value > samsung->kbd_led.max_brightness) value = samsung->kbd_led.max_brightness; - else if (value < 0) - value = 0; samsung->kbd_led_wk = value; queue_work(samsung->led_workqueue, &samsung->kbd_led_work); -- Gitee From 45f7650be7ed083d437c26a46a5148ee50c5641b Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 26 Jul 2022 17:38:49 +0800 Subject: [PATCH 597/674] ALSA: usb-audio: Fix undefined behavior due to shift overflowing the constant MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from stable-v5.10.113 commit cd227ac03f2aa918ee3672fe2a5960dd0e7be331 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=cd227ac03f2aa918ee3672fe2a5960dd0e7be331 -------------------------------- [ Upstream commit 1ef8715975de8bd481abbd0839ed4f49d9e5b0ff ] Fix: sound/usb/midi.c: In function ‘snd_usbmidi_out_endpoint_create’: sound/usb/midi.c:1389:2: error: case label does not reduce to an integer constant case USB_ID(0xfc08, 0x0101): /* Unknown vendor Cable */ ^~~~ See https://lore.kernel.org/r/YkwQ6%2BtIH8GQpuct@zn.tnic for the gory details as to why it triggers with older gccs only. [ A slight correction with parentheses around the argument by tiwai ] Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220405151517.29753-3-bp@alien8.de Signed-off-by: Takashi Iwai Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- sound/usb/usbaudio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/usb/usbaudio.h b/sound/usb/usbaudio.h index e54a98f46549..d8e31ee03b9d 100644 --- a/sound/usb/usbaudio.h +++ b/sound/usb/usbaudio.h @@ -8,7 +8,7 @@ */ /* handling of USB vendor/product ID pairs as 32-bit numbers */ -#define USB_ID(vendor, product) (((vendor) << 16) | (product)) +#define USB_ID(vendor, product) (((unsigned int)(vendor) << 16) | (product)) #define USB_ID_VENDOR(id) ((id) >> 16) #define USB_ID_PRODUCT(id) ((u16)(id)) -- Gitee From bade8d4a2dbd0d43173f01e1a0ec525814c13f44 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 26 Jul 2022 17:38:50 +0800 Subject: [PATCH 598/674] arm64: dts: imx: Fix imx8*-var-som touchscreen property sizes stable inclusion from stable-v5.10.113 commit 8e7ea11364758d43e577b7835b8e98f27927d56c category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=8e7ea11364758d43e577b7835b8e98f27927d56c -------------------------------- [ Upstream commit 1bc12d301594eafde0a8529d28d459af81053b3a ] The common touchscreen properties are all 32-bit, not 16-bit. These properties must not be too important as they are all ignored in case of an error reading them. Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/Yk3moe6Hz8ELM0iS@robh.at.kernel.org' Signed-off-by: Arnd Bergmann Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- arch/arm64/boot/dts/freescale/imx8mm-var-som.dtsi | 8 ++++---- arch/arm64/boot/dts/freescale/imx8mn-var-som.dtsi | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/imx8mm-var-som.dtsi b/arch/arm64/boot/dts/freescale/imx8mm-var-som.dtsi index 49082529764f..0fac1f3f7f47 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm-var-som.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mm-var-som.dtsi @@ -89,12 +89,12 @@ touchscreen@0 { pendown-gpio = <&gpio1 3 GPIO_ACTIVE_LOW>; ti,x-min = /bits/ 16 <125>; - touchscreen-size-x = /bits/ 16 <4008>; + touchscreen-size-x = <4008>; ti,y-min = /bits/ 16 <282>; - touchscreen-size-y = /bits/ 16 <3864>; + touchscreen-size-y = <3864>; ti,x-plate-ohms = /bits/ 16 <180>; - touchscreen-max-pressure = /bits/ 16 <255>; - touchscreen-average-samples = /bits/ 16 <10>; + touchscreen-max-pressure = <255>; + touchscreen-average-samples = <10>; ti,debounce-tol = /bits/ 16 <3>; ti,debounce-rep = /bits/ 16 <1>; ti,settle-delay-usec = /bits/ 16 <150>; diff --git a/arch/arm64/boot/dts/freescale/imx8mn-var-som.dtsi b/arch/arm64/boot/dts/freescale/imx8mn-var-som.dtsi index 7f356edf9f91..f6287f174355 100644 --- a/arch/arm64/boot/dts/freescale/imx8mn-var-som.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mn-var-som.dtsi @@ -70,12 +70,12 @@ touchscreen@0 { pendown-gpio = <&gpio1 3 GPIO_ACTIVE_LOW>; ti,x-min = /bits/ 16 <125>; - touchscreen-size-x = /bits/ 16 <4008>; + touchscreen-size-x = <4008>; ti,y-min = /bits/ 16 <282>; - touchscreen-size-y = /bits/ 16 <3864>; + touchscreen-size-y = <3864>; ti,x-plate-ohms = /bits/ 16 <180>; - touchscreen-max-pressure = /bits/ 16 <255>; - touchscreen-average-samples = /bits/ 16 <10>; + touchscreen-max-pressure = <255>; + touchscreen-average-samples = <10>; ti,debounce-tol = /bits/ 16 <3>; ti,debounce-rep = /bits/ 16 <1>; ti,settle-delay-usec = /bits/ 16 <150>; -- Gitee From b9a0a6d062e89d410cd33f03b17ed7840e096128 Mon Sep 17 00:00:00 2001 From: Hongbin Wang Date: Tue, 26 Jul 2022 17:38:51 +0800 Subject: [PATCH 599/674] vxlan: fix error return code in vxlan_fdb_append stable inclusion from stable-v5.10.113 commit e129c55153c8aba74e4c73451399f12dacf6347d category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=e129c55153c8aba74e4c73451399f12dacf6347d -------------------------------- [ Upstream commit 7cea5560bf656b84f9ed01c0cc829d4eecd0640b ] When kmalloc and dst_cache_init failed, should return ENOMEM rather than ENOBUFS. Signed-off-by: Hongbin Wang Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/net/vxlan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index c362a6ac94c4..73953b0141ba 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -710,11 +710,11 @@ static int vxlan_fdb_append(struct vxlan_fdb *f, rd = kmalloc(sizeof(*rd), GFP_ATOMIC); if (rd == NULL) - return -ENOBUFS; + return -ENOMEM; if (dst_cache_init(&rd->dst_cache, GFP_ATOMIC)) { kfree(rd); - return -ENOBUFS; + return -ENOMEM; } rd->remote_ip = *ip; -- Gitee From 62f154bc19641ca00a8b926c5d28d3d1271eea1c Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 26 Jul 2022 17:38:52 +0800 Subject: [PATCH 600/674] cifs: Check the IOCB_DIRECT flag, not O_DIRECT stable inclusion from stable-v5.10.113 commit 5bef9fc38ffa3e0a712fcd1e07f4f123ba160242 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=5bef9fc38ffa3e0a712fcd1e07f4f123ba160242 -------------------------------- [ Upstream commit 994fd530a512597ffcd713b0f6d5bc916c5698f0 ] Use the IOCB_DIRECT indicator flag on the I/O context rather than checking to see if the file was opened O_DIRECT. Signed-off-by: David Howells cc: Steve French cc: Shyam Prasad N cc: Rohith Surabattula cc: linux-cifs@vger.kernel.org Signed-off-by: Steve French Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- fs/cifs/cifsfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index aa5a4d759ca2..370188b2a55d 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -898,7 +898,7 @@ cifs_loose_read_iter(struct kiocb *iocb, struct iov_iter *iter) ssize_t rc; struct inode *inode = file_inode(iocb->ki_filp); - if (iocb->ki_filp->f_flags & O_DIRECT) + if (iocb->ki_flags & IOCB_DIRECT) return cifs_user_readv(iocb, iter); rc = cifs_revalidate_mapping(inode); -- Gitee From 652cce7aeaa6bf232f67c210d878f48733c34045 Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Tue, 26 Jul 2022 17:38:53 +0800 Subject: [PATCH 601/674] net: atlantic: Avoid out-of-bounds indexing stable inclusion from stable-v5.10.113 commit 0de9c104d04aed5e464bd788abb423ed333980c7 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=0de9c104d04aed5e464bd788abb423ed333980c7 -------------------------------- [ Upstream commit 8d3a6c37d50d5a0504c126c932cc749e6dd9c78f ] UBSAN warnings are observed on atlantic driver: [ 294.432996] UBSAN: array-index-out-of-bounds in /build/linux-Qow4fL/linux-5.15.0/drivers/net/ethernet/aquantia/atlantic/aq_nic.c:484:48 [ 294.433695] index 8 is out of range for type 'aq_vec_s *[8]' The ring is dereferenced right before breaking out the loop, to prevent that from happening, only use the index in the loop to fix the issue. BugLink: https://bugs.launchpad.net/bugs/1958770 Tested-by: Mario Limonciello Signed-off-by: Kai-Heng Feng Reviewed-by: Igor Russkikh Link: https://lore.kernel.org/r/20220408022204.16815-1-kai.heng.feng@canonical.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- .../net/ethernet/aquantia/atlantic/aq_nic.c | 8 +++---- .../net/ethernet/aquantia/atlantic/aq_vec.c | 24 +++++++++---------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c index 0cf8ae8aeac8..2fb4126ae8d8 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c @@ -480,8 +480,8 @@ int aq_nic_start(struct aq_nic_s *self) if (err < 0) goto err_exit; - for (i = 0U, aq_vec = self->aq_vec[0]; - self->aq_vecs > i; ++i, aq_vec = self->aq_vec[i]) { + for (i = 0U; self->aq_vecs > i; ++i) { + aq_vec = self->aq_vec[i]; err = aq_vec_start(aq_vec); if (err < 0) goto err_exit; @@ -511,8 +511,8 @@ int aq_nic_start(struct aq_nic_s *self) mod_timer(&self->polling_timer, jiffies + AQ_CFG_POLLING_TIMER_INTERVAL); } else { - for (i = 0U, aq_vec = self->aq_vec[0]; - self->aq_vecs > i; ++i, aq_vec = self->aq_vec[i]) { + for (i = 0U; self->aq_vecs > i; ++i) { + aq_vec = self->aq_vec[i]; err = aq_pci_func_alloc_irq(self, i, self->ndev->name, aq_vec_isr, aq_vec, aq_vec_get_affinity_mask(aq_vec)); diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_vec.c b/drivers/net/ethernet/aquantia/atlantic/aq_vec.c index f4774cf051c9..6ab1f3212d24 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_vec.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_vec.c @@ -43,8 +43,8 @@ static int aq_vec_poll(struct napi_struct *napi, int budget) if (!self) { err = -EINVAL; } else { - for (i = 0U, ring = self->ring[0]; - self->tx_rings > i; ++i, ring = self->ring[i]) { + for (i = 0U; self->tx_rings > i; ++i) { + ring = self->ring[i]; u64_stats_update_begin(&ring[AQ_VEC_RX_ID].stats.rx.syncp); ring[AQ_VEC_RX_ID].stats.rx.polls++; u64_stats_update_end(&ring[AQ_VEC_RX_ID].stats.rx.syncp); @@ -182,8 +182,8 @@ int aq_vec_init(struct aq_vec_s *self, const struct aq_hw_ops *aq_hw_ops, self->aq_hw_ops = aq_hw_ops; self->aq_hw = aq_hw; - for (i = 0U, ring = self->ring[0]; - self->tx_rings > i; ++i, ring = self->ring[i]) { + for (i = 0U; self->tx_rings > i; ++i) { + ring = self->ring[i]; err = aq_ring_init(&ring[AQ_VEC_TX_ID], ATL_RING_TX); if (err < 0) goto err_exit; @@ -224,8 +224,8 @@ int aq_vec_start(struct aq_vec_s *self) unsigned int i = 0U; int err = 0; - for (i = 0U, ring = self->ring[0]; - self->tx_rings > i; ++i, ring = self->ring[i]) { + for (i = 0U; self->tx_rings > i; ++i) { + ring = self->ring[i]; err = self->aq_hw_ops->hw_ring_tx_start(self->aq_hw, &ring[AQ_VEC_TX_ID]); if (err < 0) @@ -248,8 +248,8 @@ void aq_vec_stop(struct aq_vec_s *self) struct aq_ring_s *ring = NULL; unsigned int i = 0U; - for (i = 0U, ring = self->ring[0]; - self->tx_rings > i; ++i, ring = self->ring[i]) { + for (i = 0U; self->tx_rings > i; ++i) { + ring = self->ring[i]; self->aq_hw_ops->hw_ring_tx_stop(self->aq_hw, &ring[AQ_VEC_TX_ID]); @@ -268,8 +268,8 @@ void aq_vec_deinit(struct aq_vec_s *self) if (!self) goto err_exit; - for (i = 0U, ring = self->ring[0]; - self->tx_rings > i; ++i, ring = self->ring[i]) { + for (i = 0U; self->tx_rings > i; ++i) { + ring = self->ring[i]; aq_ring_tx_clean(&ring[AQ_VEC_TX_ID]); aq_ring_rx_deinit(&ring[AQ_VEC_RX_ID]); } @@ -297,8 +297,8 @@ void aq_vec_ring_free(struct aq_vec_s *self) if (!self) goto err_exit; - for (i = 0U, ring = self->ring[0]; - self->tx_rings > i; ++i, ring = self->ring[i]) { + for (i = 0U; self->tx_rings > i; ++i) { + ring = self->ring[i]; aq_ring_free(&ring[AQ_VEC_TX_ID]); if (i < self->rx_rings) aq_ring_free(&ring[AQ_VEC_RX_ID]); -- Gitee From e10c83e7cbb797c3b2d7dbe6c704fa84cebf6fc6 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 26 Jul 2022 17:38:54 +0800 Subject: [PATCH 602/674] mt76: Fix undefined behavior due to shift overflowing the constant MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from stable-v5.10.113 commit 202748f441488e08c730736bf5def1ae08f16811 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=202748f441488e08c730736bf5def1ae08f16811 -------------------------------- [ Upstream commit dbc2b1764734857d68425468ffa8486e97ab89df ] Fix: drivers/net/wireless/mediatek/mt76/mt76x2/pci.c: In function ‘mt76x2e_probe’: ././include/linux/compiler_types.h:352:38: error: call to ‘__compiletime_assert_946’ \ declared with attribute error: FIELD_PREP: mask is not constant _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__) See https://lore.kernel.org/r/YkwQ6%2BtIH8GQpuct@zn.tnic for the gory details as to why it triggers with older gccs only. Signed-off-by: Borislav Petkov Cc: Felix Fietkau Cc: Lorenzo Bianconi Cc: Ryder Lee Cc: Shayne Chen Cc: Sean Wang Cc: Kalle Valo Cc: "David S. Miller" Cc: Jakub Kicinski Cc: linux-wireless@vger.kernel.org Cc: netdev@vger.kernel.org Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20220405151517.29753-9-bp@alien8.de Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/net/wireless/mediatek/mt76/mt76x2/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2/pci.c b/drivers/net/wireless/mediatek/mt76/mt76x2/pci.c index ecaf85b483ac..e57e49a722dc 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x2/pci.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x2/pci.c @@ -80,7 +80,7 @@ mt76x2e_probe(struct pci_dev *pdev, const struct pci_device_id *id) mt76_rmw_field(dev, 0x15a10, 0x1f << 16, 0x9); /* RG_SSUSB_G1_CDR_BIC_LTR = 0xf */ - mt76_rmw_field(dev, 0x15a0c, 0xf << 28, 0xf); + mt76_rmw_field(dev, 0x15a0c, 0xfU << 28, 0xf); /* RG_SSUSB_CDR_BR_PE1D = 0x3 */ mt76_rmw_field(dev, 0x15c58, 0x3 << 6, 0x3); -- Gitee From d89078f497c0d04cb4ec777eca7bc4a451c68b00 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 26 Jul 2022 17:38:55 +0800 Subject: [PATCH 603/674] brcmfmac: sdio: Fix undefined behavior due to shift overflowing the constant MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from stable-v5.10.113 commit b3afe5a7fd7548fa3a818f8dd20e22db4b8db019 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=b3afe5a7fd7548fa3a818f8dd20e22db4b8db019 -------------------------------- [ Upstream commit 6fb3a5868b2117611f41e421e10e6a8c2a13039a ] Fix: drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c: In function ‘brcmf_sdio_drivestrengthinit’: drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c:3798:2: error: case label does not reduce to an integer constant case SDIOD_DRVSTR_KEY(BRCM_CC_43143_CHIP_ID, 17): ^~~~ drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c:3809:2: error: case label does not reduce to an integer constant case SDIOD_DRVSTR_KEY(BRCM_CC_43362_CHIP_ID, 13): ^~~~ See https://lore.kernel.org/r/YkwQ6%2BtIH8GQpuct@zn.tnic for the gory details as to why it triggers with older gccs only. Signed-off-by: Borislav Petkov Cc: Arend van Spriel Cc: Franky Lin Cc: Hante Meuleman Cc: Kalle Valo Cc: "David S. Miller" Cc: Jakub Kicinski Cc: brcm80211-dev-list.pdl@broadcom.com Cc: netdev@vger.kernel.org Acked-by: Arend van Spriel Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/Ykx0iRlvtBnKqtbG@zn.tnic Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c index 6d5d5c39c635..9929e90866f0 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c @@ -557,7 +557,7 @@ enum brcmf_sdio_frmtype { BRCMF_SDIO_FT_SUB, }; -#define SDIOD_DRVSTR_KEY(chip, pmu) (((chip) << 16) | (pmu)) +#define SDIOD_DRVSTR_KEY(chip, pmu) (((unsigned int)(chip) << 16) | (pmu)) /* SDIO Pad drive strength to select value mappings */ struct sdiod_drive_str { -- Gitee From d1c00ee05775fa5b9a20a1392fe710e785fc68c9 Mon Sep 17 00:00:00 2001 From: Lv Ruyi Date: Tue, 26 Jul 2022 17:38:56 +0800 Subject: [PATCH 604/674] dpaa_eth: Fix missing of_node_put in dpaa_get_ts_info() stable inclusion from stable-v5.10.113 commit 8d71edabb0abe6c8e3e1601e73f63f81783b6cc0 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=8d71edabb0abe6c8e3e1601e73f63f81783b6cc0 -------------------------------- [ Upstream commit 1a7eb80d170c28be2928433702256fe2a0bd1e0f ] Both of of_get_parent() and of_parse_phandle() return node pointer with refcount incremented, use of_node_put() on it to decrease refcount when done. Reported-by: Zeal Robot Signed-off-by: Lv Ruyi Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c b/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c index 763d2c7b5fb1..5750f9a56393 100644 --- a/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c @@ -489,11 +489,15 @@ static int dpaa_get_ts_info(struct net_device *net_dev, info->phc_index = -1; fman_node = of_get_parent(mac_node); - if (fman_node) + if (fman_node) { ptp_node = of_parse_phandle(fman_node, "ptimer-handle", 0); + of_node_put(fman_node); + } - if (ptp_node) + if (ptp_node) { ptp_dev = of_find_device_by_node(ptp_node); + of_node_put(ptp_node); + } if (ptp_dev) ptp = platform_get_drvdata(ptp_dev); -- Gitee From 63fc6600e0de84218fd9587b1037e42025fb27c9 Mon Sep 17 00:00:00 2001 From: Xiaoke Wang Date: Tue, 26 Jul 2022 17:38:57 +0800 Subject: [PATCH 605/674] drm/msm/mdp5: check the return of kzalloc() stable inclusion from stable-v5.10.113 commit 9581e07b549bd911ce3af12b57ea2646b4995ebf category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=9581e07b549bd911ce3af12b57ea2646b4995ebf -------------------------------- [ Upstream commit 047ae665577776b7feb11bd4f81f46627cff95e7 ] kzalloc() is a memory allocation function which can return NULL when some internal memory errors happen. So it is better to check it to prevent potential wrong memory access. Besides, since mdp5_plane_reset() is void type, so we should better set `plane-state` to NULL after releasing it. Signed-off-by: Xiaoke Wang Reviewed-by: Dmitry Baryshkov Patchwork: https://patchwork.freedesktop.org/patch/481055/ Link: https://lore.kernel.org/r/tencent_8E2A1C78140EE1784AB2FF4B2088CC0AB908@qq.com Signed-off-by: Dmitry Baryshkov Signed-off-by: Rob Clark Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/gpu/drm/msm/disp/mdp5/mdp5_plane.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/msm/disp/mdp5/mdp5_plane.c b/drivers/gpu/drm/msm/disp/mdp5/mdp5_plane.c index 83423092de2f..da0799333970 100644 --- a/drivers/gpu/drm/msm/disp/mdp5/mdp5_plane.c +++ b/drivers/gpu/drm/msm/disp/mdp5/mdp5_plane.c @@ -179,7 +179,10 @@ static void mdp5_plane_reset(struct drm_plane *plane) drm_framebuffer_put(plane->state->fb); kfree(to_mdp5_plane_state(plane->state)); + plane->state = NULL; mdp5_state = kzalloc(sizeof(*mdp5_state), GFP_KERNEL); + if (!mdp5_state) + return; /* assign default blend parameters */ mdp5_state->alpha = 255; -- Gitee From 6f2cb02318c126edf72a06c12938b3bc93a7de1a Mon Sep 17 00:00:00 2001 From: Tomas Melin Date: Tue, 26 Jul 2022 17:38:58 +0800 Subject: [PATCH 606/674] net: macb: Restart tx only if queue pointer is lagging stable inclusion from stable-v5.10.113 commit a284cca3d81ae5e5f5f23c09b8298339188953a5 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=a284cca3d81ae5e5f5f23c09b8298339188953a5 -------------------------------- [ Upstream commit 5ad7f18cd82cee8e773d40cc7a1465a526f2615c ] commit 4298388574da ("net: macb: restart tx after tx used bit read") added support for restarting transmission. Restarting tx does not work in case controller asserts TXUBR interrupt and TQBP is already at the end of the tx queue. In that situation, restarting tx will immediately cause assertion of another TXUBR interrupt. The driver will end up in an infinite interrupt loop which it cannot break out of. For cases where TQBP is at the end of the tx queue, instead only clear TX_USED interrupt. As more data gets pushed to the queue, transmission will resume. This issue was observed on a Xilinx Zynq-7000 based board. During stress test of the network interface, driver would get stuck on interrupt loop within seconds or minutes causing CPU to stall. Signed-off-by: Tomas Melin Tested-by: Claudiu Beznea Reviewed-by: Claudiu Beznea Link: https://lore.kernel.org/r/20220407161659.14532-1-tomas.melin@vaisala.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/net/ethernet/cadence/macb_main.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index 78c6d133f54f..3244f69555f7 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -1531,6 +1531,7 @@ static void macb_tx_restart(struct macb_queue *queue) unsigned int head = queue->tx_head; unsigned int tail = queue->tx_tail; struct macb *bp = queue->bp; + unsigned int head_idx, tbqp; if (bp->caps & MACB_CAPS_ISR_CLEAR_ON_WRITE) queue_writel(queue, ISR, MACB_BIT(TXUBR)); @@ -1538,6 +1539,13 @@ static void macb_tx_restart(struct macb_queue *queue) if (head == tail) return; + tbqp = queue_readl(queue, TBQP) / macb_dma_desc_get_size(bp); + tbqp = macb_adj_dma_desc_idx(bp, macb_tx_ring_wrap(bp, tbqp)); + head_idx = macb_adj_dma_desc_idx(bp, macb_tx_ring_wrap(bp, head)); + + if (tbqp == head_idx) + return; + macb_writel(bp, NCR, macb_readl(bp, NCR) | MACB_BIT(TSTART)); } -- Gitee From 0389f1ff99ad7375ff5980b964ab00267a72af25 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Tue, 26 Jul 2022 17:38:59 +0800 Subject: [PATCH 607/674] scsi: qedi: Fix failed disconnect handling stable inclusion from stable-v5.10.113 commit bf28bba3041055759ce6afe5e61bc4fd709ea6e4 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=bf28bba3041055759ce6afe5e61bc4fd709ea6e4 -------------------------------- [ Upstream commit 857b06527f707f5df634b854898a191b5c1d0272 ] We set the qedi_ep state to EP_STATE_OFLDCONN_START when the ep is created. Then in qedi_set_path we kick off the offload work. If userspace times out the connection and calls ep_disconnect, qedi will only flush the offload work if the qedi_ep state has transitioned away from EP_STATE_OFLDCONN_START. If we can't connect we will not have transitioned state and will leave the offload work running, and we will free the qedi_ep from under it. This patch just has us init the work when we create the ep, then always flush it. Link: https://lore.kernel.org/r/20220408001314.5014-10-michael.christie@oracle.com Tested-by: Manish Rangankar Reviewed-by: Lee Duncan Reviewed-by: Chris Leech Acked-by: Manish Rangankar Signed-off-by: Mike Christie Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/scsi/qedi/qedi_iscsi.c | 69 +++++++++++++++++----------------- 1 file changed, 34 insertions(+), 35 deletions(-) diff --git a/drivers/scsi/qedi/qedi_iscsi.c b/drivers/scsi/qedi/qedi_iscsi.c index ba9a22e55e32..8003b3519b95 100644 --- a/drivers/scsi/qedi/qedi_iscsi.c +++ b/drivers/scsi/qedi/qedi_iscsi.c @@ -828,6 +828,37 @@ static int qedi_task_xmit(struct iscsi_task *task) return qedi_iscsi_send_ioreq(task); } +static void qedi_offload_work(struct work_struct *work) +{ + struct qedi_endpoint *qedi_ep = + container_of(work, struct qedi_endpoint, offload_work); + struct qedi_ctx *qedi; + int wait_delay = 5 * HZ; + int ret; + + qedi = qedi_ep->qedi; + + ret = qedi_iscsi_offload_conn(qedi_ep); + if (ret) { + QEDI_ERR(&qedi->dbg_ctx, + "offload error: iscsi_cid=%u, qedi_ep=%p, ret=%d\n", + qedi_ep->iscsi_cid, qedi_ep, ret); + qedi_ep->state = EP_STATE_OFLDCONN_FAILED; + return; + } + + ret = wait_event_interruptible_timeout(qedi_ep->tcp_ofld_wait, + (qedi_ep->state == + EP_STATE_OFLDCONN_COMPL), + wait_delay); + if (ret <= 0 || qedi_ep->state != EP_STATE_OFLDCONN_COMPL) { + qedi_ep->state = EP_STATE_OFLDCONN_FAILED; + QEDI_ERR(&qedi->dbg_ctx, + "Offload conn TIMEOUT iscsi_cid=%u, qedi_ep=%p\n", + qedi_ep->iscsi_cid, qedi_ep); + } +} + static struct iscsi_endpoint * qedi_ep_connect(struct Scsi_Host *shost, struct sockaddr *dst_addr, int non_blocking) @@ -876,6 +907,7 @@ qedi_ep_connect(struct Scsi_Host *shost, struct sockaddr *dst_addr, } qedi_ep = ep->dd_data; memset(qedi_ep, 0, sizeof(struct qedi_endpoint)); + INIT_WORK(&qedi_ep->offload_work, qedi_offload_work); qedi_ep->state = EP_STATE_IDLE; qedi_ep->iscsi_cid = (u32)-1; qedi_ep->qedi = qedi; @@ -1026,12 +1058,11 @@ static void qedi_ep_disconnect(struct iscsi_endpoint *ep) qedi_ep = ep->dd_data; qedi = qedi_ep->qedi; + flush_work(&qedi_ep->offload_work); + if (qedi_ep->state == EP_STATE_OFLDCONN_START) goto ep_exit_recover; - if (qedi_ep->state != EP_STATE_OFLDCONN_NONE) - flush_work(&qedi_ep->offload_work); - if (qedi_ep->conn) { qedi_conn = qedi_ep->conn; conn = qedi_conn->cls_conn->dd_data; @@ -1196,37 +1227,6 @@ static int qedi_data_avail(struct qedi_ctx *qedi, u16 vlanid) return rc; } -static void qedi_offload_work(struct work_struct *work) -{ - struct qedi_endpoint *qedi_ep = - container_of(work, struct qedi_endpoint, offload_work); - struct qedi_ctx *qedi; - int wait_delay = 5 * HZ; - int ret; - - qedi = qedi_ep->qedi; - - ret = qedi_iscsi_offload_conn(qedi_ep); - if (ret) { - QEDI_ERR(&qedi->dbg_ctx, - "offload error: iscsi_cid=%u, qedi_ep=%p, ret=%d\n", - qedi_ep->iscsi_cid, qedi_ep, ret); - qedi_ep->state = EP_STATE_OFLDCONN_FAILED; - return; - } - - ret = wait_event_interruptible_timeout(qedi_ep->tcp_ofld_wait, - (qedi_ep->state == - EP_STATE_OFLDCONN_COMPL), - wait_delay); - if ((ret <= 0) || (qedi_ep->state != EP_STATE_OFLDCONN_COMPL)) { - qedi_ep->state = EP_STATE_OFLDCONN_FAILED; - QEDI_ERR(&qedi->dbg_ctx, - "Offload conn TIMEOUT iscsi_cid=%u, qedi_ep=%p\n", - qedi_ep->iscsi_cid, qedi_ep); - } -} - static int qedi_set_path(struct Scsi_Host *shost, struct iscsi_path *path_data) { struct qedi_ctx *qedi; @@ -1342,7 +1342,6 @@ static int qedi_set_path(struct Scsi_Host *shost, struct iscsi_path *path_data) qedi_ep->dst_addr, qedi_ep->dst_port); } - INIT_WORK(&qedi_ep->offload_work, qedi_offload_work); queue_work(qedi->offload_thread, &qedi_ep->offload_work); ret = 0; -- Gitee From 7bec28503270e64c6bd754f2d882beeef7c3278a Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 26 Jul 2022 17:39:00 +0800 Subject: [PATCH 608/674] stat: fix inconsistency between struct stat and struct compat_stat stable inclusion from stable-v5.10.113 commit 76101c8e0c31938e59bbcb4beed47c8d8b89d39a category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=76101c8e0c31938e59bbcb4beed47c8d8b89d39a -------------------------------- [ Upstream commit 932aba1e169090357a77af18850a10c256b50819 ] struct stat (defined in arch/x86/include/uapi/asm/stat.h) has 32-bit st_dev and st_rdev; struct compat_stat (defined in arch/x86/include/asm/compat.h) has 16-bit st_dev and st_rdev followed by a 16-bit padding. This patch fixes struct compat_stat to match struct stat. [ Historical note: the old x86 'struct stat' did have that 16-bit field that the compat layer had kept around, but it was changes back in 2003 by "struct stat - support larger dev_t": https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git/commit/?id=e95b2065677fe32512a597a79db94b77b90c968d and back in those days, the x86_64 port was still new, and separate from the i386 code, and had already picked up the old version with a 16-bit st_dev field ] Note that we can't change compat_dev_t because it is used by compat_loop_info. Also, if the st_dev and st_rdev values are 32-bit, we don't have to use old_valid_dev to test if the value fits into them. This fixes -EOVERFLOW on filesystems that are on NVMe because NVMe uses the major number 259. Signed-off-by: Mikulas Patocka Cc: Andreas Schwab Cc: Matthew Wilcox Cc: Christoph Hellwig Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- arch/x86/include/asm/compat.h | 6 ++---- fs/stat.c | 19 ++++++++++--------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 0e327a01f50f..46a067bd7e0b 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -29,15 +29,13 @@ typedef u32 compat_caddr_t; typedef __kernel_fsid_t compat_fsid_t; struct compat_stat { - compat_dev_t st_dev; - u16 __pad1; + u32 st_dev; compat_ino_t st_ino; compat_mode_t st_mode; compat_nlink_t st_nlink; __compat_uid_t st_uid; __compat_gid_t st_gid; - compat_dev_t st_rdev; - u16 __pad2; + u32 st_rdev; u32 st_size; u32 st_blksize; u32 st_blocks; diff --git a/fs/stat.c b/fs/stat.c index 1196af4d1ea0..04550c0ba540 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -306,9 +306,6 @@ SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, stat # define choose_32_64(a,b) b #endif -#define valid_dev(x) choose_32_64(old_valid_dev(x),true) -#define encode_dev(x) choose_32_64(old_encode_dev,new_encode_dev)(x) - #ifndef INIT_STRUCT_STAT_PADDING # define INIT_STRUCT_STAT_PADDING(st) memset(&st, 0, sizeof(st)) #endif @@ -317,7 +314,9 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf) { struct stat tmp; - if (!valid_dev(stat->dev) || !valid_dev(stat->rdev)) + if (sizeof(tmp.st_dev) < 4 && !old_valid_dev(stat->dev)) + return -EOVERFLOW; + if (sizeof(tmp.st_rdev) < 4 && !old_valid_dev(stat->rdev)) return -EOVERFLOW; #if BITS_PER_LONG == 32 if (stat->size > MAX_NON_LFS) @@ -325,7 +324,7 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf) #endif INIT_STRUCT_STAT_PADDING(tmp); - tmp.st_dev = encode_dev(stat->dev); + tmp.st_dev = new_encode_dev(stat->dev); tmp.st_ino = stat->ino; if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) return -EOVERFLOW; @@ -335,7 +334,7 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf) return -EOVERFLOW; SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid)); SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid)); - tmp.st_rdev = encode_dev(stat->rdev); + tmp.st_rdev = new_encode_dev(stat->rdev); tmp.st_size = stat->size; tmp.st_atime = stat->atime.tv_sec; tmp.st_mtime = stat->mtime.tv_sec; @@ -616,11 +615,13 @@ static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf) { struct compat_stat tmp; - if (!old_valid_dev(stat->dev) || !old_valid_dev(stat->rdev)) + if (sizeof(tmp.st_dev) < 4 && !old_valid_dev(stat->dev)) + return -EOVERFLOW; + if (sizeof(tmp.st_rdev) < 4 && !old_valid_dev(stat->rdev)) return -EOVERFLOW; memset(&tmp, 0, sizeof(tmp)); - tmp.st_dev = old_encode_dev(stat->dev); + tmp.st_dev = new_encode_dev(stat->dev); tmp.st_ino = stat->ino; if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) return -EOVERFLOW; @@ -630,7 +631,7 @@ static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf) return -EOVERFLOW; SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid)); SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid)); - tmp.st_rdev = old_encode_dev(stat->rdev); + tmp.st_rdev = new_encode_dev(stat->rdev); if ((u64) stat->size > MAX_NON_LFS) return -EOVERFLOW; tmp.st_size = stat->size; -- Gitee From 47590400345a095df3f6c386c59924849fc0fe28 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Jul 2022 17:39:01 +0800 Subject: [PATCH 609/674] nvme: add a quirk to disable namespace identifiers stable inclusion from stable-v5.10.113 commit 316bd86c2261fffa37577c3d35721a9337248e54 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=316bd86c2261fffa37577c3d35721a9337248e54 -------------------------------- [ Upstream commit 00ff400e6deee00f7b15e200205b2708b63b8cf6 ] Add a quirk to disable using and exporting namespace identifiers for controllers where they are broken beyond repair. The most directly visible problem with non-unique namespace identifiers is that they break the /dev/disk/by-id/ links, with the link for a supposedly unique identifier now pointing to one of multiple possible namespaces that share the same ID, and a somewhat random selection of which one actually shows up. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/nvme/host/core.c | 24 ++++++++++++++++++------ drivers/nvme/host/nvme.h | 5 +++++ 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index dcc047f01a07..274635c0c02a 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1250,6 +1250,8 @@ static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids, warn_str, cur->nidl); return -1; } + if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) + return NVME_NIDT_EUI64_LEN; memcpy(ids->eui64, data + sizeof(*cur), NVME_NIDT_EUI64_LEN); return NVME_NIDT_EUI64_LEN; case NVME_NIDT_NGUID: @@ -1258,6 +1260,8 @@ static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids, warn_str, cur->nidl); return -1; } + if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) + return NVME_NIDT_NGUID_LEN; memcpy(ids->nguid, data + sizeof(*cur), NVME_NIDT_NGUID_LEN); return NVME_NIDT_NGUID_LEN; case NVME_NIDT_UUID: @@ -1266,6 +1270,8 @@ static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids, warn_str, cur->nidl); return -1; } + if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) + return NVME_NIDT_UUID_LEN; uuid_copy(&ids->uuid, data + sizeof(*cur)); return NVME_NIDT_UUID_LEN; case NVME_NIDT_CSI: @@ -1361,12 +1367,18 @@ static int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid, if ((*id)->ncap == 0) /* namespace not allocated or attached */ goto out_free_id; - if (ctrl->vs >= NVME_VS(1, 1, 0) && - !memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) - memcpy(ids->eui64, (*id)->eui64, sizeof(ids->eui64)); - if (ctrl->vs >= NVME_VS(1, 2, 0) && - !memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) - memcpy(ids->nguid, (*id)->nguid, sizeof(ids->nguid)); + + if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) { + dev_info(ctrl->device, + "Ignoring bogus Namespace Identifiers\n"); + } else { + if (ctrl->vs >= NVME_VS(1, 1, 0) && + !memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) + memcpy(ids->eui64, (*id)->eui64, sizeof(ids->eui64)); + if (ctrl->vs >= NVME_VS(1, 2, 0) && + !memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) + memcpy(ids->nguid, (*id)->nguid, sizeof(ids->nguid)); + } return 0; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 94cee2c566d3..11d3cc2890f9 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -151,6 +151,11 @@ enum nvme_quirks { * encoding the generation sequence number. */ NVME_QUIRK_SKIP_CID_GEN = (1 << 17), + + /* + * Reports garbage in the namespace identifiers (eui64, nguid, uuid). + */ + NVME_QUIRK_BOGUS_NID = (1 << 18), }; /* -- Gitee From 983009806cf6c01a28ac7890c08a83cc38e75bc8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Jul 2022 17:39:02 +0800 Subject: [PATCH 610/674] nvme-pci: disable namespace identifiers for Qemu controllers stable inclusion from stable-v5.10.113 commit 7ec6e06ee405756f90226b946790a5a7a65795b7 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=7ec6e06ee405756f90226b946790a5a7a65795b7 -------------------------------- [ Upstream commit 66dd346b84d79fde20832ed691a54f4881eac20d ] Qemu unconditionally reports a UUID, which depending on the qemu version is either all-null (which is incorrect but harmless) or contains a single bit set for all controllers. In addition it can also optionally report a eui64 which needs to be manually set. Disable namespace identifiers for Qemu controlles entirely even if in some cases they could be set correctly through manual intervention. Reported-by: Luis Chamberlain Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/nvme/host/pci.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index f435ab0809fb..e0a3d03198a2 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3212,7 +3212,10 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, }, { PCI_VDEVICE(INTEL, 0x5845), /* Qemu emulated controller */ .driver_data = NVME_QUIRK_IDENTIFY_CNS | - NVME_QUIRK_DISABLE_WRITE_ZEROES, }, + NVME_QUIRK_DISABLE_WRITE_ZEROES | + NVME_QUIRK_BOGUS_NID, }, + { PCI_VDEVICE(REDHAT, 0x0010), /* Qemu emulated controller */ + .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x126f, 0x2263), /* Silicon Motion unidentified */ .driver_data = NVME_QUIRK_NO_NS_DESC_LIST, }, { PCI_DEVICE(0x1bb1, 0x0100), /* Seagate Nytro Flash Storage */ -- Gitee From 3d0cf64034acd4253f9d7e4e54c0ece407faa42c Mon Sep 17 00:00:00 2001 From: Shubhrajyoti Datta Date: Tue, 26 Jul 2022 17:39:03 +0800 Subject: [PATCH 611/674] EDAC/synopsys: Read the error count from the correct register stable inclusion from stable-v5.10.113 commit 50cbc583fa838a63f8447251b88da569c3d36ba6 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=50cbc583fa838a63f8447251b88da569c3d36ba6 -------------------------------- commit e2932d1f6f055b2af2114c7e64a26dc1b5593d0c upstream. Currently, the error count is read wrongly from the status register. Read the count from the proper error count register (ERRCNT). [ bp: Massage. ] Fixes: b500b4a029d5 ("EDAC, synopsys: Add ECC support for ZynqMP DDR controller") Signed-off-by: Shubhrajyoti Datta Signed-off-by: Borislav Petkov Acked-by: Michal Simek Cc: Link: https://lore.kernel.org/r/20220414102813.4468-1-shubhrajyoti.datta@xilinx.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/edac/synopsys_edac.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/edac/synopsys_edac.c b/drivers/edac/synopsys_edac.c index 92906b56b1a2..fea44dc0484b 100644 --- a/drivers/edac/synopsys_edac.c +++ b/drivers/edac/synopsys_edac.c @@ -163,6 +163,11 @@ #define ECC_STAT_CECNT_SHIFT 8 #define ECC_STAT_BITNUM_MASK 0x7F +/* ECC error count register definitions */ +#define ECC_ERRCNT_UECNT_MASK 0xFFFF0000 +#define ECC_ERRCNT_UECNT_SHIFT 16 +#define ECC_ERRCNT_CECNT_MASK 0xFFFF + /* DDR QOS Interrupt register definitions */ #define DDR_QOS_IRQ_STAT_OFST 0x20200 #define DDR_QOSUE_MASK 0x4 @@ -418,15 +423,16 @@ static int zynqmp_get_error_info(struct synps_edac_priv *priv) base = priv->baseaddr; p = &priv->stat; + regval = readl(base + ECC_ERRCNT_OFST); + p->ce_cnt = regval & ECC_ERRCNT_CECNT_MASK; + p->ue_cnt = (regval & ECC_ERRCNT_UECNT_MASK) >> ECC_ERRCNT_UECNT_SHIFT; + if (!p->ce_cnt) + goto ue_err; + regval = readl(base + ECC_STAT_OFST); if (!regval) return 1; - p->ce_cnt = (regval & ECC_STAT_CECNT_MASK) >> ECC_STAT_CECNT_SHIFT; - p->ue_cnt = (regval & ECC_STAT_UECNT_MASK) >> ECC_STAT_UECNT_SHIFT; - if (!p->ce_cnt) - goto ue_err; - p->ceinfo.bitpos = (regval & ECC_STAT_BITNUM_MASK); regval = readl(base + ECC_CEADDR0_OFST); -- Gitee From eb179e5cfb436d79cb432f855a26e7596e24531e Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 26 Jul 2022 17:39:04 +0800 Subject: [PATCH 612/674] mm, hugetlb: allow for "high" userspace addresses stable inclusion from stable-v5.10.113 commit 6b932920b96fc3002352fe8225ec63a1cd1717ec category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=6b932920b96fc3002352fe8225ec63a1cd1717ec -------------------------------- commit 5f24d5a579d1eace79d505b148808a850b417d4c upstream. This is a fix for commit f6795053dac8 ("mm: mmap: Allow for "high" userspace addresses") for hugetlb. This patch adds support for "high" userspace addresses that are optionally supported on the system and have to be requested via a hint mechanism ("high" addr parameter to mmap). Architectures such as powerpc and x86 achieve this by making changes to their architectural versions of hugetlb_get_unmapped_area() function. However, arm64 uses the generic version of that function. So take into account arch_get_mmap_base() and arch_get_mmap_end() in hugetlb_get_unmapped_area(). To allow that, move those two macros out of mm/mmap.c into include/linux/sched/mm.h If these macros are not defined in architectural code then they default to (TASK_SIZE) and (base) so should not introduce any behavioural changes to architectures that do not define them. For the time being, only ARM64 is affected by this change. Catalin (ARM64) said "We should have fixed hugetlb_get_unmapped_area() as well when we added support for 52-bit VA. The reason for commit f6795053dac8 was to prevent normal mmap() from returning addresses above 48-bit by default as some user-space had hard assumptions about this. It's a slight ABI change if you do this for hugetlb_get_unmapped_area() but I doubt anyone would notice. It's more likely that the current behaviour would cause issues, so I'd rather have them consistent. Basically when arm64 gained support for 52-bit addresses we did not want user-space calling mmap() to suddenly get such high addresses, otherwise we could have inadvertently broken some programs (similar behaviour to x86 here). Hence we added commit f6795053dac8. But we missed hugetlbfs which could still get such high mmap() addresses. So in theory that's a potential regression that should have bee addressed at the same time as commit f6795053dac8 (and before arm64 enabled 52-bit addresses)" Link: https://lkml.kernel.org/r/ab847b6edb197bffdfe189e70fb4ac76bfe79e0d.1650033747.git.christophe.leroy@csgroup.eu Fixes: f6795053dac8 ("mm: mmap: Allow for "high" userspace addresses") Signed-off-by: Christophe Leroy Reviewed-by: Catalin Marinas Cc: Steve Capper Cc: Will Deacon Cc: [5.0.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Conflicts: fs/hugetlbfs/inode.c Acked-by: Xie XiuQi --- fs/hugetlbfs/inode.c | 9 +++++---- include/linux/sched/mm.h | 8 ++++++++ mm/mmap.c | 8 -------- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 6f2943465bff..8a87d1b43387 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -252,7 +252,7 @@ hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr, info.flags = 0; info.length = len; info.low_limit = current->mm->mmap_base; - info.high_limit = TASK_SIZE; + info.high_limit = arch_get_mmap_end(addr); info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; @@ -272,7 +272,7 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr, info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; info.low_limit = max(PAGE_SIZE, mmap_min_addr); - info.high_limit = current->mm->mmap_base; + info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base); info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; @@ -291,7 +291,7 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr, VM_BUG_ON(addr != -ENOMEM); info.flags = 0; info.low_limit = current->mm->mmap_base; - info.high_limit = TASK_SIZE; + info.high_limit = arch_get_mmap_end(addr); if (enable_mmap_dvpp) dvpp_mmap_get_area(&info, flags); @@ -309,6 +309,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, struct mm_struct *mm = current->mm; struct vm_area_struct *vma; struct hstate *h = hstate_file(file); + const unsigned long mmap_end = arch_get_mmap_end(addr); if (len & ~huge_page_mask(h)) return -EINVAL; @@ -328,7 +329,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, return -ENOMEM; vma = find_vma(mm, addr); - if (TASK_SIZE - len >= addr && + if (mmap_end - len >= addr && (!vma || addr + len <= vm_start_gap(vma))) return addr; } diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index dc1f4dcd9a82..e3e5e149b00e 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -106,6 +106,14 @@ static inline void mm_update_next_owner(struct mm_struct *mm) #endif /* CONFIG_MEMCG */ #ifdef CONFIG_MMU +#ifndef arch_get_mmap_end +#define arch_get_mmap_end(addr) (TASK_SIZE) +#endif + +#ifndef arch_get_mmap_base +#define arch_get_mmap_base(addr, base) (base) +#endif + extern void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack); extern unsigned long diff --git a/mm/mmap.c b/mm/mmap.c index 5ad32537604a..5489d70db84e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2404,14 +2404,6 @@ unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info) return addr; } -#ifndef arch_get_mmap_end -#define arch_get_mmap_end(addr) (TASK_SIZE) -#endif - -#ifndef arch_get_mmap_base -#define arch_get_mmap_base(addr, base) (base) -#endif - /* Get an address range which is currently unmapped. * For shmat() with addr=0. * -- Gitee From ccc63440f2c0b4d4473bfecbaf1a19e3c14afce7 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Tue, 26 Jul 2022 17:39:05 +0800 Subject: [PATCH 613/674] mm/mmu_notifier.c: fix race in mmu_interval_notifier_remove() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from stable-v5.10.113 commit 9ca66d79143980260be615b964b8dc1504a5d0c6 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=9ca66d79143980260be615b964b8dc1504a5d0c6 -------------------------------- commit 319561669a59d8e9206ab311ae5433ef92fd79d1 upstream. In some cases it is possible for mmu_interval_notifier_remove() to race with mn_tree_inv_end() allowing it to return while the notifier data structure is still in use. Consider the following sequence: CPU0 - mn_tree_inv_end() CPU1 - mmu_interval_notifier_remove() ----------------------------------- ------------------------------------ spin_lock(subscriptions->lock); seq = subscriptions->invalidate_seq; spin_lock(subscriptions->lock); spin_unlock(subscriptions->lock); subscriptions->invalidate_seq++; wait_event(invalidate_seq != seq); return; interval_tree_remove(interval_sub); kfree(interval_sub); spin_unlock(subscriptions->lock); wake_up_all(); As the wait_event() condition is true it will return immediately. This can lead to use-after-free type errors if the caller frees the data structure containing the interval notifier subscription while it is still on a deferred list. Fix this by taking the appropriate lock when reading invalidate_seq to ensure proper synchronisation. I observed this whilst running stress testing during some development. You do have to be pretty unlucky, but it leads to the usual problems of use-after-free (memory corruption, kernel crash, difficult to diagnose WARN_ON, etc). Link: https://lkml.kernel.org/r/20220420043734.476348-1-apopple@nvidia.com Fixes: 99cb252f5e68 ("mm/mmu_notifier: add an interval tree notifier") Signed-off-by: Alistair Popple Signed-off-by: Jason Gunthorpe Cc: Christian König Cc: John Hubbard Cc: Ralph Campbell Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- mm/mmu_notifier.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 07f42a7a6065..9165ca619c8c 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -1043,6 +1043,18 @@ int mmu_interval_notifier_insert_locked( } EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert_locked); +static bool +mmu_interval_seq_released(struct mmu_notifier_subscriptions *subscriptions, + unsigned long seq) +{ + bool ret; + + spin_lock(&subscriptions->lock); + ret = subscriptions->invalidate_seq != seq; + spin_unlock(&subscriptions->lock); + return ret; +} + /** * mmu_interval_notifier_remove - Remove a interval notifier * @interval_sub: Interval subscription to unregister @@ -1090,7 +1102,7 @@ void mmu_interval_notifier_remove(struct mmu_interval_notifier *interval_sub) lock_map_release(&__mmu_notifier_invalidate_range_start_map); if (seq) wait_event(subscriptions->wq, - READ_ONCE(subscriptions->invalidate_seq) != seq); + mmu_interval_seq_released(subscriptions, seq)); /* pairs with mmgrab in mmu_interval_notifier_insert() */ mmdrop(mm); -- Gitee From 0affb0e98c917096fe752a041443b815bb2b61ba Mon Sep 17 00:00:00 2001 From: Zheyu Ma Date: Tue, 26 Jul 2022 17:39:06 +0800 Subject: [PATCH 614/674] ata: pata_marvell: Check the 'bmdma_addr' beforing reading stable inclusion from stable-v5.10.113 commit cf23a960c5c62ddd8283a855a742977ee983a62a category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=cf23a960c5c62ddd8283a855a742977ee983a62a -------------------------------- commit aafa9f958342db36c17ac2a7f1b841032c96feb4 upstream. Before detecting the cable type on the dma bar, the driver should check whether the 'bmdma_addr' is zero, which means the adapter does not support DMA, otherwise we will get the following error: [ 5.146634] Bad IO access at port 0x1 (return inb(port)) [ 5.147206] WARNING: CPU: 2 PID: 303 at lib/iomap.c:44 ioread8+0x4a/0x60 [ 5.150856] RIP: 0010:ioread8+0x4a/0x60 [ 5.160238] Call Trace: [ 5.160470] [ 5.160674] marvell_cable_detect+0x6e/0xc0 [pata_marvell] [ 5.161728] ata_eh_recover+0x3520/0x6cc0 [ 5.168075] ata_do_eh+0x49/0x3c0 Signed-off-by: Zheyu Ma Signed-off-by: Damien Le Moal Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/ata/pata_marvell.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/ata/pata_marvell.c b/drivers/ata/pata_marvell.c index b066809ba9a1..c56f4043b0cc 100644 --- a/drivers/ata/pata_marvell.c +++ b/drivers/ata/pata_marvell.c @@ -83,6 +83,8 @@ static int marvell_cable_detect(struct ata_port *ap) switch(ap->port_no) { case 0: + if (!ap->ioaddr.bmdma_addr) + return ATA_CBL_PATA_UNK; if (ioread8(ap->ioaddr.bmdma_addr + 1) & 1) return ATA_CBL_PATA40; return ATA_CBL_PATA80; -- Gitee From d5e4b0822bc88bcd32070ab02176509d7a9eb06a Mon Sep 17 00:00:00 2001 From: Xiaomeng Tong Date: Tue, 26 Jul 2022 17:39:07 +0800 Subject: [PATCH 615/674] dma: at_xdmac: fix a missing check on list iterator stable inclusion from stable-v5.10.113 commit 358a3846f6a950b8db803262e74081ad4ab235d4 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=358a3846f6a950b8db803262e74081ad4ab235d4 -------------------------------- commit 206680c4e46b62fd8909385e0874a36952595b85 upstream. The bug is here: __func__, desc, &desc->tx_dma_desc.phys, ret, cookie, residue); The list iterator 'desc' will point to a bogus position containing HEAD if the list is empty or no element is found. To avoid dev_dbg() prints a invalid address, use a new variable 'iter' as the list iterator, while use the origin variable 'desc' as a dedicated pointer to point to the found element. Cc: stable@vger.kernel.org Fixes: 82e2424635f4c ("dmaengine: xdmac: fix print warning on dma_addr_t variable") Signed-off-by: Xiaomeng Tong Link: https://lore.kernel.org/r/20220327061154.4867-1-xiam0nd.tong@gmail.com Signed-off-by: Vinod Koul Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/dma/at_xdmac.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c index 90afba0b36fe..47552db6b8dc 100644 --- a/drivers/dma/at_xdmac.c +++ b/drivers/dma/at_xdmac.c @@ -1390,7 +1390,7 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie, { struct at_xdmac_chan *atchan = to_at_xdmac_chan(chan); struct at_xdmac *atxdmac = to_at_xdmac(atchan->chan.device); - struct at_xdmac_desc *desc, *_desc; + struct at_xdmac_desc *desc, *_desc, *iter; struct list_head *descs_list; enum dma_status ret; int residue, retry; @@ -1505,11 +1505,13 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie, * microblock. */ descs_list = &desc->descs_list; - list_for_each_entry_safe(desc, _desc, descs_list, desc_node) { - dwidth = at_xdmac_get_dwidth(desc->lld.mbr_cfg); - residue -= (desc->lld.mbr_ubc & 0xffffff) << dwidth; - if ((desc->lld.mbr_nda & 0xfffffffc) == cur_nda) + list_for_each_entry_safe(iter, _desc, descs_list, desc_node) { + dwidth = at_xdmac_get_dwidth(iter->lld.mbr_cfg); + residue -= (iter->lld.mbr_ubc & 0xffffff) << dwidth; + if ((iter->lld.mbr_nda & 0xfffffffc) == cur_nda) { + desc = iter; break; + } } residue += cur_ubc << dwidth; -- Gitee From 2d19822cbd21ac865ee8a7dc3376e640a457ee1f Mon Sep 17 00:00:00 2001 From: Manuel Ullmann Date: Tue, 26 Jul 2022 17:39:08 +0800 Subject: [PATCH 616/674] net: atlantic: invert deep par in pm functions, preventing null derefs stable inclusion from stable-v5.10.113 commit ba2716da233618c3f361b8ece818a0e91de7a8f0 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=ba2716da233618c3f361b8ece818a0e91de7a8f0 -------------------------------- commit cbe6c3a8f8f4315b96e46e1a1c70393c06d95a4c upstream. This will reset deeply on freeze and thaw instead of suspend and resume and prevent null pointer dereferences of the uninitialized ring 0 buffer while thawing. The impact is an indefinitely hanging kernel. You can't switch consoles after this and the only possible user interaction is SysRq. BUG: kernel NULL pointer dereference RIP: 0010:aq_ring_rx_fill+0xcf/0x210 [atlantic] aq_vec_init+0x85/0xe0 [atlantic] aq_nic_init+0xf7/0x1d0 [atlantic] atl_resume_common+0x4f/0x100 [atlantic] pci_pm_thaw+0x42/0xa0 resolves in aq_ring.o to ``` 0000000000000ae0 : { /* ... */ baf: 48 8b 43 08 mov 0x8(%rbx),%rax buff->flags = 0U; /* buff is NULL */ ``` The bug has been present since the introduction of the new pm code in 8aaa112a57c1 ("net: atlantic: refactoring pm logic") and was hidden until 8ce84271697a ("net: atlantic: changes for multi-TC support"), which refactored the aq_vec_{free,alloc} functions into aq_vec_{,ring}_{free,alloc}, but is technically not wrong. The original functions just always reinitialized the buffers on S3/S4. If the interface is down before freezing, the bug does not occur. It does not matter, whether the initrd contains and loads the module before thawing. So the fix is to invert the boolean parameter deep in all pm function calls, which was clearly intended to be set like that. First report was on Github [1], which you have to guess from the resume logs in the posted dmesg snippet. Recently I posted one on Bugzilla [2], since I did not have an AQC device so far. #regzbot introduced: 8ce84271697a #regzbot from: koo5 #regzbot monitor: https://github.com/Aquantia/AQtion/issues/32 Fixes: 8aaa112a57c1 ("net: atlantic: refactoring pm logic") Link: https://github.com/Aquantia/AQtion/issues/32 [1] Link: https://bugzilla.kernel.org/show_bug.cgi?id=215798 [2] Cc: stable@vger.kernel.org Reported-by: koo5 Signed-off-by: Manuel Ullmann Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c b/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c index 1826253f97dc..bdfd462c74db 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c @@ -450,22 +450,22 @@ static int atl_resume_common(struct device *dev, bool deep) static int aq_pm_freeze(struct device *dev) { - return aq_suspend_common(dev, false); + return aq_suspend_common(dev, true); } static int aq_pm_suspend_poweroff(struct device *dev) { - return aq_suspend_common(dev, true); + return aq_suspend_common(dev, false); } static int aq_pm_thaw(struct device *dev) { - return atl_resume_common(dev, false); + return atl_resume_common(dev, true); } static int aq_pm_resume_restore(struct device *dev) { - return atl_resume_common(dev, true); + return atl_resume_common(dev, false); } static const struct dev_pm_ops aq_pm_ops = { -- Gitee From e85467a78402f38cdbc4a15730731610dd8aa195 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Tue, 26 Jul 2022 17:39:09 +0800 Subject: [PATCH 617/674] xtensa: patch_text: Fixup last cpu should be master stable inclusion from stable-v5.10.113 commit f399ab11dd6cc0a6927029371fbe2ec027181598 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f399ab11dd6cc0a6927029371fbe2ec027181598 -------------------------------- commit ee69d4be8fd064cd08270b4808d2dfece3614ee0 upstream. These patch_text implementations are using stop_machine_cpuslocked infrastructure with atomic cpu_count. The original idea: When the master CPU patch_text, the others should wait for it. But current implementation is using the first CPU as master, which couldn't guarantee the remaining CPUs are waiting. This patch changes the last CPU as the master to solve the potential risk. Fixes: 64711f9a47d4 ("xtensa: implement jump_label support") Signed-off-by: Guo Ren Signed-off-by: Guo Ren Reviewed-by: Max Filippov Reviewed-by: Masami Hiramatsu Cc: Message-Id: <20220407073323.743224-4-guoren@kernel.org> Signed-off-by: Max Filippov Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- arch/xtensa/kernel/jump_label.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/xtensa/kernel/jump_label.c b/arch/xtensa/kernel/jump_label.c index 0dde21e0d3de..ad1841cecdfb 100644 --- a/arch/xtensa/kernel/jump_label.c +++ b/arch/xtensa/kernel/jump_label.c @@ -40,7 +40,7 @@ static int patch_text_stop_machine(void *data) { struct patch *patch = data; - if (atomic_inc_return(&patch->cpu_count) == 1) { + if (atomic_inc_return(&patch->cpu_count) == num_online_cpus()) { local_patch_text(patch->addr, patch->data, patch->sz); atomic_inc(&patch->cpu_count); } else { -- Gitee From 90fa436f8a1afa90f01292c96ed7813fe10371d5 Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Tue, 26 Jul 2022 17:39:10 +0800 Subject: [PATCH 618/674] xtensa: fix a7 clobbering in coprocessor context load/store stable inclusion from stable-v5.10.113 commit 19f6dcb1f0f0f8523976c8aa1800856c9b4f35c3 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=19f6dcb1f0f0f8523976c8aa1800856c9b4f35c3 -------------------------------- commit 839769c35477d4acc2369e45000ca7b0b6af39a7 upstream. Fast coprocessor exception handler saves a3..a6, but coprocessor context load/store code uses a4..a7 as temporaries, potentially clobbering a7. 'Potentially' because coprocessor state load/store macros may not use all four temporary registers (and neither FPU nor HiFi macros do). Use a3..a6 as intended. Cc: stable@vger.kernel.org Fixes: c658eac628aa ("[XTENSA] Add support for configurable registers and coprocessors") Signed-off-by: Max Filippov Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- arch/xtensa/kernel/coprocessor.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/xtensa/kernel/coprocessor.S b/arch/xtensa/kernel/coprocessor.S index 45cc0ae0af6f..c7b9f12896f2 100644 --- a/arch/xtensa/kernel/coprocessor.S +++ b/arch/xtensa/kernel/coprocessor.S @@ -29,7 +29,7 @@ .if XTENSA_HAVE_COPROCESSOR(x); \ .align 4; \ .Lsave_cp_regs_cp##x: \ - xchal_cp##x##_store a2 a4 a5 a6 a7; \ + xchal_cp##x##_store a2 a3 a4 a5 a6; \ jx a0; \ .endif @@ -46,7 +46,7 @@ .if XTENSA_HAVE_COPROCESSOR(x); \ .align 4; \ .Lload_cp_regs_cp##x: \ - xchal_cp##x##_load a2 a4 a5 a6 a7; \ + xchal_cp##x##_load a2 a3 a4 a5 a6; \ jx a0; \ .endif -- Gitee From 490085c27ad981285d7f65b677a78bd578084e98 Mon Sep 17 00:00:00 2001 From: Paolo Valerio Date: Tue, 26 Jul 2022 17:39:11 +0800 Subject: [PATCH 619/674] openvswitch: fix OOB access in reserve_sfa_size() stable inclusion from stable-v5.10.113 commit 0837ff17d052b7d755d5086208c3445867aaff82 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=0837ff17d052b7d755d5086208c3445867aaff82 -------------------------------- commit cefa91b2332d7009bc0be5d951d6cbbf349f90f8 upstream. Given a sufficiently large number of actions, while copying and reserving memory for a new action of a new flow, if next_offset is greater than MAX_ACTIONS_BUFSIZE, the function reserve_sfa_size() does not return -EMSGSIZE as expected, but it allocates MAX_ACTIONS_BUFSIZE bytes increasing actions_len by req_size. This can then lead to an OOB write access, especially when further actions need to be copied. Fix it by rearranging the flow action size check. KASAN splat below: Acked-by: Xie XiuQi ================================================================== BUG: KASAN: slab-out-of-bounds in reserve_sfa_size+0x1ba/0x380 [openvswitch] Write of size 65360 at addr ffff888147e4001c by task handler15/836 CPU: 1 PID: 836 Comm: handler15 Not tainted 5.18.0-rc1+ #27 ... Call Trace: dump_stack_lvl+0x45/0x5a print_report.cold+0x5e/0x5db ? __lock_text_start+0x8/0x8 ? reserve_sfa_size+0x1ba/0x380 [openvswitch] kasan_report+0xb5/0x130 ? reserve_sfa_size+0x1ba/0x380 [openvswitch] kasan_check_range+0xf5/0x1d0 memcpy+0x39/0x60 reserve_sfa_size+0x1ba/0x380 [openvswitch] __add_action+0x24/0x120 [openvswitch] ovs_nla_add_action+0xe/0x20 [openvswitch] ovs_ct_copy_action+0x29d/0x1130 [openvswitch] ? __kernel_text_address+0xe/0x30 ? unwind_get_return_address+0x56/0xa0 ? create_prof_cpu_mask+0x20/0x20 ? ovs_ct_verify+0xf0/0xf0 [openvswitch] ? prep_compound_page+0x198/0x2a0 ? __kasan_check_byte+0x10/0x40 ? kasan_unpoison+0x40/0x70 ? ksize+0x44/0x60 ? reserve_sfa_size+0x75/0x380 [openvswitch] __ovs_nla_copy_actions+0xc26/0x2070 [openvswitch] ? __zone_watermark_ok+0x420/0x420 ? validate_set.constprop.0+0xc90/0xc90 [openvswitch] ? __alloc_pages+0x1a9/0x3e0 ? __alloc_pages_slowpath.constprop.0+0x1da0/0x1da0 ? unwind_next_frame+0x991/0x1e40 ? __mod_node_page_state+0x99/0x120 ? __mod_lruvec_page_state+0x2e3/0x470 ? __kasan_kmalloc_large+0x90/0xe0 ovs_nla_copy_actions+0x1b4/0x2c0 [openvswitch] ovs_flow_cmd_new+0x3cd/0xb10 [openvswitch] ... Cc: stable@vger.kernel.org Fixes: f28cd2af22a0 ("openvswitch: fix flow actions reallocation") Signed-off-by: Paolo Valerio Acked-by: Eelco Chaudron Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai --- net/openvswitch/flow_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 98a7e6f64ab0..293a798e89f4 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -2436,7 +2436,7 @@ static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa, new_acts_size = max(next_offset + req_size, ksize(*sfa) * 2); if (new_acts_size > MAX_ACTIONS_BUFSIZE) { - if ((MAX_ACTIONS_BUFSIZE - next_offset) < req_size) { + if ((next_offset + req_size) > MAX_ACTIONS_BUFSIZE) { OVS_NLERR(log, "Flow action size exceeds max %u", MAX_ACTIONS_BUFSIZE); return ERR_PTR(-EMSGSIZE); -- Gitee From 140cdaa1ebf8c1c01a60b1f56522154c58dc71ce Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Tue, 26 Jul 2022 17:39:12 +0800 Subject: [PATCH 620/674] gpio: Request interrupts after IRQ is initialized MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from stable-v5.10.113 commit 54e6180c8c2d71b6f1fab0d1420f65299b26953e category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=54e6180c8c2d71b6f1fab0d1420f65299b26953e -------------------------------- commit 06fb4ecfeac7e00d6704fa5ed19299f2fefb3cc9 upstream. Commit 5467801f1fcb ("gpio: Restrict usage of GPIO chip irq members before initialization") attempted to fix a race condition that lead to a NULL pointer, but in the process caused a regression for _AEI/_EVT declared GPIOs. This manifests in messages showing deferred probing while trying to allocate IRQs like so: amd_gpio AMDI0030:00: Failed to translate GPIO pin 0x0000 to IRQ, err -517 amd_gpio AMDI0030:00: Failed to translate GPIO pin 0x002C to IRQ, err -517 amd_gpio AMDI0030:00: Failed to translate GPIO pin 0x003D to IRQ, err -517 [ .. more of the same .. ] The code for walking _AEI doesn't handle deferred probing and so this leads to non-functional GPIO interrupts. Fix this issue by moving the call to `acpi_gpiochip_request_interrupts` to occur after gc->irc.initialized is set. Fixes: 5467801f1fcb ("gpio: Restrict usage of GPIO chip irq members before initialization") Link: https://lore.kernel.org/linux-gpio/BL1PR12MB51577A77F000A008AA694675E2EF9@BL1PR12MB5157.namprd12.prod.outlook.com/ Link: https://bugzilla.suse.com/show_bug.cgi?id=1198697 Link: https://bugzilla.kernel.org/show_bug.cgi?id=215850 Link: https://gitlab.freedesktop.org/drm/amd/-/issues/1979 Link: https://gitlab.freedesktop.org/drm/amd/-/issues/1976 Reported-by: Mario Limonciello Signed-off-by: Mario Limonciello Reviewed-by: Shreeya Patel Tested-By: Samuel Čavoj Tested-By: lukeluk498@gmail.com Link: Reviewed-by: Andy Shevchenko Acked-by: Linus Walleij Reviewed-and-tested-by: Takashi Iwai Cc: Shreeya Patel Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/gpio/gpiolib.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index d18078748200..59d8affad343 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -1612,8 +1612,6 @@ static int gpiochip_add_irqchip(struct gpio_chip *gc, gpiochip_set_irq_hooks(gc); - acpi_gpiochip_request_interrupts(gc); - /* * Using barrier() here to prevent compiler from reordering * gc->irq.initialized before initialization of above @@ -1623,6 +1621,8 @@ static int gpiochip_add_irqchip(struct gpio_chip *gc, gc->irq.initialized = true; + acpi_gpiochip_request_interrupts(gc); + return 0; } -- Gitee From 9d7cda5f4e3dc4ab3acf27cd75d78b156a0cc099 Mon Sep 17 00:00:00 2001 From: Xiaomeng Tong Date: Tue, 26 Jul 2022 17:39:13 +0800 Subject: [PATCH 621/674] ASoC: soc-dapm: fix two incorrect uses of list iterator stable inclusion from stable-v5.10.113 commit 43a2a3734aa3d25ef11b90f14f18bf31fb0a7feb category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=43a2a3734aa3d25ef11b90f14f18bf31fb0a7feb -------------------------------- commit f730a46b931d894816af34a0ff8e4ad51565b39f upstream. These two bug are here: list_for_each_entry_safe_continue(w, n, list, power_list); list_for_each_entry_safe_continue(w, n, list, power_list); After the list_for_each_entry_safe_continue() exits, the list iterator will always be a bogus pointer which point to an invalid struct objdect containing HEAD member. The funciton poniter 'w->event' will be a invalid value which can lead to a control-flow hijack if the 'w' can be controlled. The original intention was to continue the outer list_for_each_entry_safe() loop with the same entry if w->event is NULL, but misunderstanding the meaning of list_for_each_entry_safe_continue(). So just add a 'continue;' to fix the bug. Cc: stable@vger.kernel.org Fixes: 163cac061c973 ("ASoC: Factor out DAPM sequence execution") Signed-off-by: Xiaomeng Tong Link: https://lore.kernel.org/r/20220329012134.9375-1-xiam0nd.tong@gmail.com Signed-off-by: Mark Brown Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- sound/soc/soc-dapm.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c index 2924d89bf0da..417732bdf286 100644 --- a/sound/soc/soc-dapm.c +++ b/sound/soc/soc-dapm.c @@ -1683,8 +1683,7 @@ static void dapm_seq_run(struct snd_soc_card *card, switch (w->id) { case snd_soc_dapm_pre: if (!w->event) - list_for_each_entry_safe_continue(w, n, list, - power_list); + continue; if (event == SND_SOC_DAPM_STREAM_START) ret = w->event(w, @@ -1696,8 +1695,7 @@ static void dapm_seq_run(struct snd_soc_card *card, case snd_soc_dapm_post: if (!w->event) - list_for_each_entry_safe_continue(w, n, list, - power_list); + continue; if (event == SND_SOC_DAPM_STREAM_START) ret = w->event(w, -- Gitee From 2e5f28ee0f8c9876e8af1a9347aee57594f8478d Mon Sep 17 00:00:00 2001 From: Sasha Neftin Date: Tue, 26 Jul 2022 17:39:14 +0800 Subject: [PATCH 622/674] e1000e: Fix possible overflow in LTR decoding stable inclusion from stable-v5.10.113 commit 7082650eb8265b583d0aea0515bcc6f65b0e8755 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=7082650eb8265b583d0aea0515bcc6f65b0e8755 -------------------------------- commit 04ebaa1cfddae5f240cc7404f009133bb0389a47 upstream. When we decode the latency and the max_latency, u16 value may not fit the required size and could lead to the wrong LTR representation. Scaling is represented as: scale 0 - 1 (2^(5*0)) = 2^0 scale 1 - 32 (2^(5 *1))= 2^5 scale 2 - 1024 (2^(5 *2)) =2^10 scale 3 - 32768 (2^(5 *3)) =2^15 scale 4 - 1048576 (2^(5 *4)) = 2^20 scale 5 - 33554432 (2^(5 *4)) = 2^25 scale 4 and scale 5 required 20 and 25 bits respectively. scale 6 reserved. Replace the u16 type with the u32 type and allow corrected LTR representation. Cc: stable@vger.kernel.org Fixes: 44a13a5d99c7 ("e1000e: Fix the max snoop/no-snoop latency for 10M") Reported-by: James Hutchinson Link: https://bugzilla.kernel.org/show_bug.cgi?id=215689 Suggested-by: Dima Ruinskiy Signed-off-by: Sasha Neftin Tested-by: Naama Meir Tested-by: James Hutchinson Signed-off-by: Tony Nguyen Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/net/ethernet/intel/e1000e/ich8lan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.c b/drivers/net/ethernet/intel/e1000e/ich8lan.c index 15b1503d5b6c..1f51252b465a 100644 --- a/drivers/net/ethernet/intel/e1000e/ich8lan.c +++ b/drivers/net/ethernet/intel/e1000e/ich8lan.c @@ -1006,8 +1006,8 @@ static s32 e1000_platform_pm_pch_lpt(struct e1000_hw *hw, bool link) { u32 reg = link << (E1000_LTRV_REQ_SHIFT + E1000_LTRV_NOSNOOP_SHIFT) | link << E1000_LTRV_REQ_SHIFT | E1000_LTRV_SEND; - u16 max_ltr_enc_d = 0; /* maximum LTR decoded by platform */ - u16 lat_enc_d = 0; /* latency decoded */ + u32 max_ltr_enc_d = 0; /* maximum LTR decoded by platform */ + u32 lat_enc_d = 0; /* latency decoded */ u16 lat_enc = 0; /* latency encoded */ if (link) { -- Gitee From d51577c959b12c0ff173292fd7ebc9090ed20a29 Mon Sep 17 00:00:00 2001 From: Sergey Matyukevich Date: Tue, 26 Jul 2022 17:39:15 +0800 Subject: [PATCH 623/674] ARC: entry: fix syscall_trace_exit argument stable inclusion from stable-v5.10.113 commit 5580b974a84b30f6da90a3a562ea0dbfb0038110 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=5580b974a84b30f6da90a3a562ea0dbfb0038110 -------------------------------- commit b1c6ecfdd06907554518ec384ce8e99889d15193 upstream. Function syscall_trace_exit expects pointer to pt_regs. However r0 is also used to keep syscall return value. Restore pointer to pt_regs before calling syscall_trace_exit. Cc: Signed-off-by: Sergey Matyukevich Signed-off-by: Vineet Gupta Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- arch/arc/kernel/entry.S | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arc/kernel/entry.S b/arch/arc/kernel/entry.S index ae656bfc31c3..301ade4d0b94 100644 --- a/arch/arc/kernel/entry.S +++ b/arch/arc/kernel/entry.S @@ -199,6 +199,7 @@ tracesys_exit: st r0, [sp, PT_r0] ; sys call return value in pt_regs ;POST Sys Call Ptrace Hook + mov r0, sp ; pt_regs needed bl @syscall_trace_exit b ret_from_exception ; NOT ret_from_system_call at is saves r0 which ; we'd done before calling post hook above -- Gitee From 1e6c493076318775eed92b3e3fe887ab4dfde197 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 26 Jul 2022 17:39:16 +0800 Subject: [PATCH 624/674] arm_pmu: Validate single/group leader events stable inclusion from stable-v5.10.113 commit c55327bc3712ca96dc05bc4c6aab805b6fa8bb74 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=c55327bc3712ca96dc05bc4c6aab805b6fa8bb74 -------------------------------- commit e5c23779f93d45e39a52758ca593bd7e62e9b4be upstream. In the case where there is only a cycle counter available (i.e. PMCR_EL0.N is 0) and an event other than CPU cycles is opened, the open should fail as the event can never possibly be scheduled. However, the event validation when an event is opened is skipped when the group leader is opened. Fix this by always validating the group leader events. Reported-by: Al Grant Cc: Will Deacon Cc: Mark Rutland Signed-off-by: Rob Herring Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20220408203330.4014015-1-robh@kernel.org Cc: Signed-off-by: Will Deacon Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/perf/arm_pmu.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index fe075d9f95e2..c87faafbdba2 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -398,6 +398,9 @@ validate_group(struct perf_event *event) if (!validate_event(event->pmu, &fake_pmu, leader)) return -EINVAL; + if (event == leader) + return 0; + for_each_sibling_event(sibling, leader) { if (!validate_event(event->pmu, &fake_pmu, sibling)) return -EINVAL; @@ -487,12 +490,7 @@ __hw_perf_event_init(struct perf_event *event) local64_set(&hwc->period_left, hwc->sample_period); } - if (event->group_leader != event) { - if (validate_group(event) != 0) - return -EINVAL; - } - - return 0; + return validate_group(event); } static int armpmu_event_init(struct perf_event *event) -- Gitee From 8109e1614b619b8cb7791a76ca932a6e9da496d8 Mon Sep 17 00:00:00 2001 From: kuyo chang Date: Tue, 26 Jul 2022 17:39:17 +0800 Subject: [PATCH 625/674] sched/pelt: Fix attach_entity_load_avg() corner case stable inclusion from stable-v5.10.113 commit 88fcfd6ee6c5a617e712b346e9c15fc3057e532e category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=88fcfd6ee6c5a617e712b346e9c15fc3057e532e -------------------------------- [ Upstream commit 40f5aa4c5eaebfeaca4566217cb9c468e28ed682 ] The warning in cfs_rq_is_decayed() triggered: SCHED_WARN_ON(cfs_rq->avg.load_avg || cfs_rq->avg.util_avg || cfs_rq->avg.runnable_avg) There exists a corner case in attach_entity_load_avg() which will cause load_sum to be zero while load_avg will not be. Consider se_weight is 88761 as per the sched_prio_to_weight[] table. Further assume the get_pelt_divider() is 47742, this gives: se->avg.load_avg is 1. However, calculating load_sum: se->avg.load_sum = div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se)); se->avg.load_sum = 1*47742/88761 = 0. Then enqueue_load_avg() adds this to the cfs_rq totals: cfs_rq->avg.load_avg += se->avg.load_avg; cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum; Resulting in load_avg being 1 with load_sum is 0, which will trigger the WARN. Fixes: f207934fb79d ("sched/fair: Align PELT windows between cfs_rq and its se") Signed-off-by: kuyo chang [peterz: massage changelog] Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Tested-by: Dietmar Eggemann Link: https://lkml.kernel.org/r/20220414090229.342-1-kuyo.chang@mediatek.com Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- kernel/sched/fair.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 50d457979db6..09f002f1fe5d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3771,11 +3771,11 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s se->avg.runnable_sum = se->avg.runnable_avg * divider; - se->avg.load_sum = divider; - if (se_weight(se)) { - se->avg.load_sum = - div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se)); - } + se->avg.load_sum = se->avg.load_avg * divider; + if (se_weight(se) < se->avg.load_sum) + se->avg.load_sum = div_u64(se->avg.load_sum, se_weight(se)); + else + se->avg.load_sum = 1; enqueue_load_avg(cfs_rq, se); cfs_rq->avg.util_avg += se->avg.util_avg; -- Gitee From 47edbfbf9e4c8b189615ea817ffc52e0dc011aed Mon Sep 17 00:00:00 2001 From: Zhipeng Xie Date: Tue, 26 Jul 2022 17:39:18 +0800 Subject: [PATCH 626/674] perf/core: Fix perf_mmap fail when CONFIG_PERF_USE_VMALLOC enabled stable inclusion from stable-v5.10.113 commit 51d9cbbb0f5a175e5b4c4a25d2ec995363304860 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=51d9cbbb0f5a175e5b4c4a25d2ec995363304860 -------------------------------- [ Upstream commit 60490e7966659b26d74bf1fa4aa8693d9a94ca88 ] This problem can be reproduced with CONFIG_PERF_USE_VMALLOC enabled on both x86_64 and aarch64 arch when using sysdig -B(using ebpf)[1]. sysdig -B works fine after rebuilding the kernel with CONFIG_PERF_USE_VMALLOC disabled. I tracked it down to the if condition event->rb->nr_pages != nr_pages in perf_mmap is true when CONFIG_PERF_USE_VMALLOC is enabled where event->rb->nr_pages = 1 and nr_pages = 2048 resulting perf_mmap to return -EINVAL. This is because when CONFIG_PERF_USE_VMALLOC is enabled, rb->nr_pages is always equal to 1. Arch with CONFIG_PERF_USE_VMALLOC enabled by default: arc/arm/csky/mips/sh/sparc/xtensa Arch with CONFIG_PERF_USE_VMALLOC disabled by default: x86_64/aarch64/... Fix this problem by using data_page_nr() [1] https://github.com/draios/sysdig Fixes: 906010b2134e ("perf_event: Provide vmalloc() based mmap() backing") Signed-off-by: Zhipeng Xie Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220209145417.6495-1-xiezhipeng1@huawei.com Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- kernel/events/core.c | 2 +- kernel/events/internal.h | 5 +++++ kernel/events/ring_buffer.c | 5 ----- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 4bd9dd6c3b72..68dc8a8e7990 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6174,7 +6174,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) again: mutex_lock(&event->mmap_mutex); if (event->rb) { - if (event->rb->nr_pages != nr_pages) { + if (data_page_nr(event->rb) != nr_pages) { ret = -EINVAL; goto unlock; } diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 228801e20788..aa23ffdaf819 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -116,6 +116,11 @@ static inline int page_order(struct perf_buffer *rb) } #endif +static inline int data_page_nr(struct perf_buffer *rb) +{ + return rb->nr_pages << page_order(rb); +} + static inline unsigned long perf_data_size(struct perf_buffer *rb) { return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index ef91ae75ca56..4032cd475000 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -856,11 +856,6 @@ void rb_free(struct perf_buffer *rb) } #else -static int data_page_nr(struct perf_buffer *rb) -{ - return rb->nr_pages << page_order(rb); -} - static struct page * __perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff) { -- Gitee From 5f70f8f020405ec27343ea6c8c7f580922443b4a Mon Sep 17 00:00:00 2001 From: Dave Stevenson Date: Tue, 26 Jul 2022 17:39:19 +0800 Subject: [PATCH 627/674] drm/panel/raspberrypi-touchscreen: Avoid NULL deref if not initialised stable inclusion from stable-v5.10.113 commit 231381f5211620cec836b921f1c7a2cf702b3e8a category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=231381f5211620cec836b921f1c7a2cf702b3e8a -------------------------------- [ Upstream commit f92055ae0acb035891e988ce345d6b81a0316423 ] If a call to rpi_touchscreen_i2c_write from rpi_touchscreen_probe fails before mipi_dsi_device_register_full is called, then in trying to log the error message if uses ts->dsi->dev when it is still NULL. Use ts->i2c->dev instead, which is initialised earlier in probe. Fixes: 2f733d6194bd ("drm/panel: Add support for the Raspberry Pi 7" Touchscreen.") Signed-off-by: Dave Stevenson Signed-off-by: Stefan Wahren Signed-off-by: Maxime Ripard Link: https://patchwork.freedesktop.org/patch/msgid/20220415162513.42190-2-stefan.wahren@i2se.com Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c b/drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c index bbdd086be7f5..90487df62480 100644 --- a/drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c +++ b/drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c @@ -229,7 +229,7 @@ static void rpi_touchscreen_i2c_write(struct rpi_touchscreen *ts, ret = i2c_smbus_write_byte_data(ts->i2c, reg, val); if (ret) - dev_err(&ts->dsi->dev, "I2C write failed: %d\n", ret); + dev_err(&ts->i2c->dev, "I2C write failed: %d\n", ret); } static int rpi_touchscreen_write(struct rpi_touchscreen *ts, u16 reg, u32 val) -- Gitee From 27f402d6441a30ef1541f3bca32429aceb1ce8de Mon Sep 17 00:00:00 2001 From: Dave Stevenson Date: Tue, 26 Jul 2022 17:39:20 +0800 Subject: [PATCH 628/674] drm/panel/raspberrypi-touchscreen: Initialise the bridge in prepare stable inclusion from stable-v5.10.113 commit 405d98427416849cf37c84c0c70bd5008b686a1e category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=405d98427416849cf37c84c0c70bd5008b686a1e -------------------------------- [ Upstream commit 5f18c0782b99e26121efa93d20b76c19e17aa1dd ] The panel has a prepare call which is before video starts, and an enable call which is after. The Toshiba bridge should be configured before video, so move the relevant power and initialisation calls to prepare. Fixes: 2f733d6194bd ("drm/panel: Add support for the Raspberry Pi 7" Touchscreen.") Signed-off-by: Dave Stevenson Signed-off-by: Stefan Wahren Signed-off-by: Maxime Ripard Link: https://patchwork.freedesktop.org/patch/msgid/20220415162513.42190-3-stefan.wahren@i2se.com Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c b/drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c index 90487df62480..4b92c6341490 100644 --- a/drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c +++ b/drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c @@ -265,7 +265,7 @@ static int rpi_touchscreen_noop(struct drm_panel *panel) return 0; } -static int rpi_touchscreen_enable(struct drm_panel *panel) +static int rpi_touchscreen_prepare(struct drm_panel *panel) { struct rpi_touchscreen *ts = panel_to_ts(panel); int i; @@ -295,6 +295,13 @@ static int rpi_touchscreen_enable(struct drm_panel *panel) rpi_touchscreen_write(ts, DSI_STARTDSI, 0x01); msleep(100); + return 0; +} + +static int rpi_touchscreen_enable(struct drm_panel *panel) +{ + struct rpi_touchscreen *ts = panel_to_ts(panel); + /* Turn on the backlight. */ rpi_touchscreen_i2c_write(ts, REG_PWM, 255); @@ -349,7 +356,7 @@ static int rpi_touchscreen_get_modes(struct drm_panel *panel, static const struct drm_panel_funcs rpi_touchscreen_funcs = { .disable = rpi_touchscreen_disable, .unprepare = rpi_touchscreen_noop, - .prepare = rpi_touchscreen_noop, + .prepare = rpi_touchscreen_prepare, .enable = rpi_touchscreen_enable, .get_modes = rpi_touchscreen_get_modes, }; -- Gitee From d4248c33b8a3403ca22cce4319412763975d9b9f Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Tue, 26 Jul 2022 17:39:21 +0800 Subject: [PATCH 629/674] KVM: PPC: Fix TCE handling for VFIO stable inclusion from stable-v5.10.113 commit f8f8b3124b899867a18a3f63e538c791e21252ac category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f8f8b3124b899867a18a3f63e538c791e21252ac -------------------------------- [ Upstream commit 26a62b750a4e6364b0393562f66759b1494c3a01 ] The LoPAPR spec defines a guest visible IOMMU with a variable page size. Currently QEMU advertises 4K, 64K, 2M, 16MB pages, a Linux VM picks the biggest (16MB). In the case of a passed though PCI device, there is a hardware IOMMU which does not support all pages sizes from the above - P8 cannot do 2MB and P9 cannot do 16MB. So for each emulated 16M IOMMU page we may create several smaller mappings ("TCEs") in the hardware IOMMU. The code wrongly uses the emulated TCE index instead of hardware TCE index in error handling. The problem is easier to see on POWER8 with multi-level TCE tables (when only the first level is preallocated) as hash mode uses real mode TCE hypercalls handlers. The kernel starts using indirect tables when VMs get bigger than 128GB (depends on the max page order). The very first real mode hcall is going to fail with H_TOO_HARD as in the real mode we cannot allocate memory for TCEs (we can in the virtual mode) but on the way out the code attempts to clear hardware TCEs using emulated TCE indexes which corrupts random kernel memory because it_offset==1<<59 is subtracted from those indexes and the resulting index is out of the TCE table bounds. This fixes kvmppc_clear_tce() to use the correct TCE indexes. While at it, this fixes TCE cache invalidation which uses emulated TCE indexes instead of the hardware ones. This went unnoticed as 64bit DMA is used these days and VMs map all RAM in one go and only then do DMA and this is when the TCE cache gets populated. Potentially this could slow down mapping, however normally 16MB emulated pages are backed by 64K hardware pages so it is one write to the "TCE Kill" per 256 updates which is not that bad considering the size of the cache (1024 TCEs or so). Fixes: ca1fc489cfa0 ("KVM: PPC: Book3S: Allow backing bigger guest IOMMU pages with smaller physical pages") Signed-off-by: Alexey Kardashevskiy Tested-by: David Gibson Reviewed-by: Frederic Barrat Reviewed-by: David Gibson Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220420050840.328223-1-aik@ozlabs.ru Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- arch/powerpc/kvm/book3s_64_vio.c | 45 +++++++++++++++-------------- arch/powerpc/kvm/book3s_64_vio_hv.c | 44 ++++++++++++++-------------- 2 files changed, 45 insertions(+), 44 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 8da93fdfa59e..c640053ab03f 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -421,13 +421,19 @@ static void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt, tbl[idx % TCES_PER_PAGE] = tce; } -static void kvmppc_clear_tce(struct mm_struct *mm, struct iommu_table *tbl, - unsigned long entry) +static void kvmppc_clear_tce(struct mm_struct *mm, struct kvmppc_spapr_tce_table *stt, + struct iommu_table *tbl, unsigned long entry) { - unsigned long hpa = 0; - enum dma_data_direction dir = DMA_NONE; + unsigned long i; + unsigned long subpages = 1ULL << (stt->page_shift - tbl->it_page_shift); + unsigned long io_entry = entry << (stt->page_shift - tbl->it_page_shift); + + for (i = 0; i < subpages; ++i) { + unsigned long hpa = 0; + enum dma_data_direction dir = DMA_NONE; - iommu_tce_xchg_no_kill(mm, tbl, entry, &hpa, &dir); + iommu_tce_xchg_no_kill(mm, tbl, io_entry + i, &hpa, &dir); + } } static long kvmppc_tce_iommu_mapped_dec(struct kvm *kvm, @@ -486,6 +492,8 @@ static long kvmppc_tce_iommu_unmap(struct kvm *kvm, break; } + iommu_tce_kill(tbl, io_entry, subpages); + return ret; } @@ -545,6 +553,8 @@ static long kvmppc_tce_iommu_map(struct kvm *kvm, break; } + iommu_tce_kill(tbl, io_entry, subpages); + return ret; } @@ -591,10 +601,9 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, ret = kvmppc_tce_iommu_map(vcpu->kvm, stt, stit->tbl, entry, ua, dir); - iommu_tce_kill(stit->tbl, entry, 1); if (ret != H_SUCCESS) { - kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl, entry); + kvmppc_clear_tce(vcpu->kvm->mm, stt, stit->tbl, entry); goto unlock_exit; } } @@ -670,13 +679,13 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, */ if (get_user(tce, tces + i)) { ret = H_TOO_HARD; - goto invalidate_exit; + goto unlock_exit; } tce = be64_to_cpu(tce); if (kvmppc_tce_to_ua(vcpu->kvm, tce, &ua)) { ret = H_PARAMETER; - goto invalidate_exit; + goto unlock_exit; } list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { @@ -685,19 +694,15 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, iommu_tce_direction(tce)); if (ret != H_SUCCESS) { - kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl, - entry); - goto invalidate_exit; + kvmppc_clear_tce(vcpu->kvm->mm, stt, stit->tbl, + entry + i); + goto unlock_exit; } } kvmppc_tce_put(stt, entry + i, tce); } -invalidate_exit: - list_for_each_entry_lockless(stit, &stt->iommu_tables, next) - iommu_tce_kill(stit->tbl, entry, npages); - unlock_exit: srcu_read_unlock(&vcpu->kvm->srcu, idx); @@ -736,20 +741,16 @@ long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu, continue; if (ret == H_TOO_HARD) - goto invalidate_exit; + return ret; WARN_ON_ONCE(1); - kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl, entry); + kvmppc_clear_tce(vcpu->kvm->mm, stt, stit->tbl, entry + i); } } for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift)) kvmppc_tce_put(stt, ioba >> stt->page_shift, tce_value); -invalidate_exit: - list_for_each_entry_lockless(stit, &stt->iommu_tables, next) - iommu_tce_kill(stit->tbl, ioba >> stt->page_shift, npages); - return ret; } EXPORT_SYMBOL_GPL(kvmppc_h_stuff_tce); diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index e5ba96c41f3f..57af53a6a2d8 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -247,13 +247,19 @@ static void iommu_tce_kill_rm(struct iommu_table *tbl, tbl->it_ops->tce_kill(tbl, entry, pages, true); } -static void kvmppc_rm_clear_tce(struct kvm *kvm, struct iommu_table *tbl, - unsigned long entry) +static void kvmppc_rm_clear_tce(struct kvm *kvm, struct kvmppc_spapr_tce_table *stt, + struct iommu_table *tbl, unsigned long entry) { - unsigned long hpa = 0; - enum dma_data_direction dir = DMA_NONE; + unsigned long i; + unsigned long subpages = 1ULL << (stt->page_shift - tbl->it_page_shift); + unsigned long io_entry = entry << (stt->page_shift - tbl->it_page_shift); + + for (i = 0; i < subpages; ++i) { + unsigned long hpa = 0; + enum dma_data_direction dir = DMA_NONE; - iommu_tce_xchg_no_kill_rm(kvm->mm, tbl, entry, &hpa, &dir); + iommu_tce_xchg_no_kill_rm(kvm->mm, tbl, io_entry + i, &hpa, &dir); + } } static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm, @@ -316,6 +322,8 @@ static long kvmppc_rm_tce_iommu_unmap(struct kvm *kvm, break; } + iommu_tce_kill_rm(tbl, io_entry, subpages); + return ret; } @@ -379,6 +387,8 @@ static long kvmppc_rm_tce_iommu_map(struct kvm *kvm, break; } + iommu_tce_kill_rm(tbl, io_entry, subpages); + return ret; } @@ -424,10 +434,8 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, ret = kvmppc_rm_tce_iommu_map(vcpu->kvm, stt, stit->tbl, entry, ua, dir); - iommu_tce_kill_rm(stit->tbl, entry, 1); - if (ret != H_SUCCESS) { - kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry); + kvmppc_rm_clear_tce(vcpu->kvm, stt, stit->tbl, entry); return ret; } } @@ -569,7 +577,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, ua = 0; if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce, &ua)) { ret = H_PARAMETER; - goto invalidate_exit; + goto unlock_exit; } list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { @@ -578,19 +586,15 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, iommu_tce_direction(tce)); if (ret != H_SUCCESS) { - kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, - entry); - goto invalidate_exit; + kvmppc_rm_clear_tce(vcpu->kvm, stt, stit->tbl, + entry + i); + goto unlock_exit; } } kvmppc_rm_tce_put(stt, entry + i, tce); } -invalidate_exit: - list_for_each_entry_lockless(stit, &stt->iommu_tables, next) - iommu_tce_kill_rm(stit->tbl, entry, npages); - unlock_exit: if (!prereg) arch_spin_unlock(&kvm->mmu_lock.rlock.raw_lock); @@ -632,20 +636,16 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu, continue; if (ret == H_TOO_HARD) - goto invalidate_exit; + return ret; WARN_ON_ONCE_RM(1); - kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry); + kvmppc_rm_clear_tce(vcpu->kvm, stt, stit->tbl, entry + i); } } for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift)) kvmppc_rm_tce_put(stt, ioba >> stt->page_shift, tce_value); -invalidate_exit: - list_for_each_entry_lockless(stit, &stt->iommu_tables, next) - iommu_tce_kill_rm(stit->tbl, ioba >> stt->page_shift, npages); - return ret; } -- Gitee From c0d651a41c25108ae1ada52d0b1f1a3e3020d53f Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Tue, 26 Jul 2022 17:39:22 +0800 Subject: [PATCH 630/674] drm/vc4: Use pm_runtime_resume_and_get to fix pm_runtime_get_sync() usage stable inclusion from stable-v5.10.113 commit 0a2cef65b32919af7df4df979c5eede5f7825f17 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=0a2cef65b32919af7df4df979c5eede5f7825f17 -------------------------------- [ Upstream commit 3d0b93d92a2790337aa9d18cb332d02356a24126 ] If the device is already in a runtime PM enabled state pm_runtime_get_sync() will return 1. Also, we need to call pm_runtime_put_noidle() when pm_runtime_get_sync() fails, so use pm_runtime_resume_and_get() instead. this function will handle this. Fixes: 4078f5757144 ("drm/vc4: Add DSI driver") Signed-off-by: Miaoqian Lin Signed-off-by: Maxime Ripard Link: https://patchwork.freedesktop.org/patch/msgid/20220420135008.2757-1-linmq006@gmail.com Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/gpu/drm/vc4/vc4_dsi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/vc4/vc4_dsi.c b/drivers/gpu/drm/vc4/vc4_dsi.c index eaf276978ee7..ad84b56f4091 100644 --- a/drivers/gpu/drm/vc4/vc4_dsi.c +++ b/drivers/gpu/drm/vc4/vc4_dsi.c @@ -835,7 +835,7 @@ static void vc4_dsi_encoder_enable(struct drm_encoder *encoder) unsigned long phy_clock; int ret; - ret = pm_runtime_get_sync(dev); + ret = pm_runtime_resume_and_get(dev); if (ret) { DRM_ERROR("Failed to runtime PM enable on DSI%d\n", dsi->port); return; -- Gitee From a0820bde54b9a6f88f03400828dd5f0fdfa2051f Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Tue, 26 Jul 2022 17:39:23 +0800 Subject: [PATCH 631/674] powerpc/perf: Fix power9 event alternatives stable inclusion from stable-v5.10.113 commit e012f9d1af54ca3c24ca0e9ec03a1a212972771c category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=e012f9d1af54ca3c24ca0e9ec03a1a212972771c -------------------------------- [ Upstream commit 0dcad700bb2776e3886fe0a645a4bf13b1e747cd ] When scheduling a group of events, there are constraint checks done to make sure all events can go in a group. Example, one of the criteria is that events in a group cannot use the same PMC. But platform specific PMU supports alternative event for some of the event codes. During perf_event_open(), if any event group doesn't match constraint check criteria, further lookup is done to find alternative event. By current design, the array of alternatives events in PMU code is expected to be sorted by column 0. This is because in find_alternative() the return criteria is based on event code comparison. ie. "event < ev_alt[i][0])". This optimisation is there since find_alternative() can be called multiple times. In power9 PMU code, the alternative event array is not sorted properly and hence there is breakage in finding alternative events. To work with existing logic, fix the alternative event array to be sorted by column 0 for power9-pmu.c Results: With alternative events, multiplexing can be avoided. That is, for example, in power9 PM_LD_MISS_L1 (0x3e054) has alternative event, PM_LD_MISS_L1_ALT (0x400f0). This is an identical event which can be programmed in a different PMC. Before: # perf stat -e r3e054,r300fc Performance counter stats for 'system wide': 1057860 r3e054 (50.21%) 379 r300fc (49.79%) 0.944329741 seconds time elapsed Since both the events are using PMC3 in this case, they are multiplexed here. After: # perf stat -e r3e054,r300fc Performance counter stats for 'system wide': 1006948 r3e054 182 r300fc Fixes: 91e0bd1e6251 ("powerpc/perf: Add PM_LD_MISS_L1 and PM_BR_2PATH to power9 event list") Signed-off-by: Athira Rajeev Reviewed-by: Madhavan Srinivasan Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220419114828.89843-1-atrajeev@linux.vnet.ibm.com Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- arch/powerpc/perf/power9-pmu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/perf/power9-pmu.c b/arch/powerpc/perf/power9-pmu.c index 2a57e93a79dc..7245355bee28 100644 --- a/arch/powerpc/perf/power9-pmu.c +++ b/arch/powerpc/perf/power9-pmu.c @@ -133,11 +133,11 @@ int p9_dd22_bl_ev[] = { /* Table of alternatives, sorted by column 0 */ static const unsigned int power9_event_alternatives[][MAX_ALT] = { - { PM_INST_DISP, PM_INST_DISP_ALT }, - { PM_RUN_CYC_ALT, PM_RUN_CYC }, - { PM_RUN_INST_CMPL_ALT, PM_RUN_INST_CMPL }, - { PM_LD_MISS_L1, PM_LD_MISS_L1_ALT }, { PM_BR_2PATH, PM_BR_2PATH_ALT }, + { PM_INST_DISP, PM_INST_DISP_ALT }, + { PM_RUN_CYC_ALT, PM_RUN_CYC }, + { PM_LD_MISS_L1, PM_LD_MISS_L1_ALT }, + { PM_RUN_INST_CMPL_ALT, PM_RUN_INST_CMPL }, }; static int power9_get_alternatives(u64 event, unsigned int flags, u64 alt[]) -- Gitee From ce66add4b99dde3edf99491d41ea364b6005683c Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Tue, 26 Jul 2022 17:39:24 +0800 Subject: [PATCH 632/674] perf report: Set PERF_SAMPLE_DATA_SRC bit for Arm SPE event stable inclusion from stable-v5.10.113 commit 19590bbc691d81f03d2a24a3ec30c399ebe071e0 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=19590bbc691d81f03d2a24a3ec30c399ebe071e0 -------------------------------- [ Upstream commit ccb17caecfbd542f49a2a79ae088136ba8bfb794 ] Since commit bb30acae4c4dacfa ("perf report: Bail out --mem-mode if mem info is not available") "perf mem report" and "perf report --mem-mode" don't report result if the PERF_SAMPLE_DATA_SRC bit is missed in sample type. The commit ffab487052054162 ("perf: arm-spe: Fix perf report --mem-mode") partially fixes the issue. It adds PERF_SAMPLE_DATA_SRC bit for Arm SPE event, this allows the perf data file generated by kernel v5.18-rc1 or later version can be reported properly. On the other hand, perf tool still fails to be backward compatibility for a data file recorded by an older version's perf which contains Arm SPE trace data. This patch is a workaround in reporting phase, when detects ARM SPE PMU event and without PERF_SAMPLE_DATA_SRC bit, it will force to set the bit in the sample type and give a warning info. Fixes: bb30acae4c4dacfa ("perf report: Bail out --mem-mode if mem info is not available") Reviewed-by: James Clark Signed-off-by: Leo Yan Tested-by: German Gomez Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Link: https://lore.kernel.org/r/20220414123201.842754-1-leo.yan@linaro.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- tools/perf/builtin-report.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 91cab5cdfbc1..b55ee073c2f7 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -340,6 +340,7 @@ static int report__setup_sample_type(struct report *rep) struct perf_session *session = rep->session; u64 sample_type = evlist__combined_sample_type(session->evlist); bool is_pipe = perf_data__is_pipe(session->data); + struct evsel *evsel; if (session->itrace_synth_opts->callchain || session->itrace_synth_opts->add_callchain || @@ -394,6 +395,19 @@ static int report__setup_sample_type(struct report *rep) } if (sort__mode == SORT_MODE__MEMORY) { + /* + * FIXUP: prior to kernel 5.18, Arm SPE missed to set + * PERF_SAMPLE_DATA_SRC bit in sample type. For backward + * compatibility, set the bit if it's an old perf data file. + */ + evlist__for_each_entry(session->evlist, evsel) { + if (strstr(evsel->name, "arm_spe") && + !(sample_type & PERF_SAMPLE_DATA_SRC)) { + evsel->core.attr.sample_type |= PERF_SAMPLE_DATA_SRC; + sample_type |= PERF_SAMPLE_DATA_SRC; + } + } + if (!is_pipe && !(sample_type & PERF_SAMPLE_DATA_SRC)) { ui__error("Selected --mem-mode but no mem data. " "Did you call perf record without -d?\n"); -- Gitee From 97cada6b51f5737aba63d7d3f69cf413a3c30a1b Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 26 Jul 2022 17:39:25 +0800 Subject: [PATCH 633/674] ext4: fix fallocate to use file_modified to update permissions consistently stable inclusion from stable-v5.10.113 commit f6038d43b25bba1cd50d2a77e207f6550aee9954 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f6038d43b25bba1cd50d2a77e207f6550aee9954 -------------------------------- commit ad5cd4f4ee4d5fcdb1bfb7a0c073072961e70783 upstream. Since the initial introduction of (posix) fallocate back at the turn of the century, it has been possible to use this syscall to change the user-visible contents of files. This can happen by extending the file size during a preallocation, or through any of the newer modes (punch, zero, collapse, insert range). Because the call can be used to change file contents, we should treat it like we do any other modification to a file -- update the mtime, and drop set[ug]id privileges/capabilities. The VFS function file_modified() does all this for us if pass it a locked inode, so let's make fallocate drop permissions correctly. Signed-off-by: Darrick J. Wong Link: https://lore.kernel.org/r/20220308185043.GA117678@magnolia Signed-off-by: Theodore Ts'o Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- fs/ext4/ext4.h | 2 +- fs/ext4/extents.c | 32 +++++++++++++++++++++++++------- fs/ext4/inode.c | 7 ++++++- 3 files changed, 32 insertions(+), 9 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 277f89d5de03..df43cda8fc2f 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2908,7 +2908,7 @@ extern int ext4_inode_attach_jinode(struct inode *inode); extern int ext4_can_truncate(struct inode *inode); extern int ext4_truncate(struct inode *); extern int ext4_break_layouts(struct inode *); -extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length); +extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length); extern void ext4_set_inode_flags(struct inode *, bool init); extern int ext4_alloc_da_blocks(struct inode *inode); extern void ext4_set_aops(struct inode *inode); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 4323186bae78..9d06695c04ab 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4510,9 +4510,9 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, return ret > 0 ? ret2 : ret; } -static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len); +static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len); -static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len); +static int ext4_insert_range(struct file *file, loff_t offset, loff_t len); static long ext4_zero_range(struct file *file, loff_t offset, loff_t len, int mode) @@ -4583,6 +4583,10 @@ static long ext4_zero_range(struct file *file, loff_t offset, /* Wait all existing dio workers, newcomers will block on i_mutex */ inode_dio_wait(inode); + ret = file_modified(file); + if (ret) + goto out_mutex; + /* Preallocate the range including the unaligned edges */ if (partial_begin || partial_end) { ret = ext4_alloc_file_blocks(file, @@ -4707,17 +4711,17 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) goto exit; if (mode & FALLOC_FL_PUNCH_HOLE) { - ret = ext4_punch_hole(inode, offset, len); + ret = ext4_punch_hole(file, offset, len); goto exit; } if (mode & FALLOC_FL_COLLAPSE_RANGE) { - ret = ext4_collapse_range(inode, offset, len); + ret = ext4_collapse_range(file, offset, len); goto exit; } if (mode & FALLOC_FL_INSERT_RANGE) { - ret = ext4_insert_range(inode, offset, len); + ret = ext4_insert_range(file, offset, len); goto exit; } @@ -4753,6 +4757,10 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) /* Wait all existing dio workers, newcomers will block on i_mutex */ inode_dio_wait(inode); + ret = file_modified(file); + if (ret) + goto out; + ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags); if (ret) goto out; @@ -5255,8 +5263,9 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, * This implements the fallocate's collapse range functionality for ext4 * Returns: 0 and non-zero on error. */ -static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) +static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len) { + struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; ext4_lblk_t punch_start, punch_stop; handle_t *handle; @@ -5307,6 +5316,10 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) /* Wait for existing dio to complete */ inode_dio_wait(inode); + ret = file_modified(file); + if (ret) + goto out_mutex; + /* * Prevent page faults from reinstantiating pages we have released from * page cache. @@ -5401,8 +5414,9 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) * by len bytes. * Returns 0 on success, error otherwise. */ -static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) +static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) { + struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; handle_t *handle; struct ext4_ext_path *path; @@ -5458,6 +5472,10 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) /* Wait for existing dio to complete */ inode_dio_wait(inode); + ret = file_modified(file); + if (ret) + goto out_mutex; + /* * Prevent page faults from reinstantiating pages we have released from * page cache. diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e85c238edd85..69fcf82f2c81 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3972,8 +3972,9 @@ int ext4_break_layouts(struct inode *inode) * Returns: 0 on success or negative on failure */ -int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) +int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) { + struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; ext4_lblk_t first_block, stop_block; struct address_space *mapping = inode->i_mapping; @@ -4026,6 +4027,10 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) /* Wait all existing dio workers, newcomers will block on i_mutex */ inode_dio_wait(inode); + ret = file_modified(file); + if (ret) + goto out_mutex; + /* * Prevent page faults from reinstantiating pages we have released from * page cache. -- Gitee From aaf9e2faf5bb4ea97264d9f45f6c6e35f73f17e8 Mon Sep 17 00:00:00 2001 From: Tadeusz Struk Date: Tue, 26 Jul 2022 17:39:26 +0800 Subject: [PATCH 634/674] ext4: limit length to bitmap_maxbytes - blocksize in punch_hole stable inclusion from stable-v5.10.113 commit 22c450d39f8922ae26de459cf4f83b2b294f207e category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=22c450d39f8922ae26de459cf4f83b2b294f207e -------------------------------- commit 2da376228a2427501feb9d15815a45dbdbdd753e upstream. Syzbot found an issue [1] in ext4_fallocate(). The C reproducer [2] calls fallocate(), passing size 0xffeffeff000ul, and offset 0x1000000ul, which, when added together exceed the bitmap_maxbytes for the inode. This triggers a BUG in ext4_ind_remove_space(). According to the comments in this function the 'end' parameter needs to be one block after the last block to be removed. In the case when the BUG is triggered it points to the last block. Modify the ext4_punch_hole() function and add constraint that caps the length to satisfy the one before laster block requirement. LINK: [1] https://syzkaller.appspot.com/bug?id=b80bd9cf348aac724a4f4dff251800106d721331 LINK: [2] https://syzkaller.appspot.com/text?tag=ReproC&x=14ba0238700000 Fixes: a4bb6b64e39a ("ext4: enable "punch hole" functionality") Reported-by: syzbot+7a806094edd5d07ba029@syzkaller.appspotmail.com Signed-off-by: Tadeusz Struk Link: https://lore.kernel.org/r/20220331200515.153214-1-tadeusz.struk@linaro.org Signed-off-by: Theodore Ts'o Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- fs/ext4/inode.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 69fcf82f2c81..f4e0c7cc4820 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3978,7 +3978,8 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) struct super_block *sb = inode->i_sb; ext4_lblk_t first_block, stop_block; struct address_space *mapping = inode->i_mapping; - loff_t first_block_offset, last_block_offset; + loff_t first_block_offset, last_block_offset, max_length; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); handle_t *handle; unsigned int credits; int ret = 0, ret2 = 0; @@ -4012,6 +4013,14 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) offset; } + /* + * For punch hole the length + offset needs to be within one block + * before last range. Adjust the length if it goes beyond that limit. + */ + max_length = sbi->s_bitmap_maxbytes - inode->i_sb->s_blocksize; + if (offset + length > max_length) + length = max_length - offset; + if (offset & (sb->s_blocksize - 1) || (offset + length) & (sb->s_blocksize - 1)) { /* -- Gitee From cf4de22f4fd1ee1f6367a12848686f71d1293010 Mon Sep 17 00:00:00 2001 From: "wangjianjian (C)" Date: Tue, 26 Jul 2022 17:39:27 +0800 Subject: [PATCH 635/674] ext4, doc: fix incorrect h_reserved size stable inclusion from stable-v5.10.113 commit 0c54b093766becb9c83317232c93290cf612b8ff category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=0c54b093766becb9c83317232c93290cf612b8ff -------------------------------- commit 7102ffe4c166ca0f5e35137e9f9de83768c2d27d upstream. According to document and code, ext4_xattr_header's size is 32 bytes, so h_reserved size should be 3. Signed-off-by: Wang Jianjian Link: https://lore.kernel.org/r/92fcc3a6-7d77-8c09-4126-377fcb4c46a5@huawei.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- Documentation/filesystems/ext4/attributes.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/filesystems/ext4/attributes.rst b/Documentation/filesystems/ext4/attributes.rst index 54386a010a8d..871d2da7a0a9 100644 --- a/Documentation/filesystems/ext4/attributes.rst +++ b/Documentation/filesystems/ext4/attributes.rst @@ -76,7 +76,7 @@ The beginning of an extended attribute block is in - Checksum of the extended attribute block. * - 0x14 - \_\_u32 - - h\_reserved[2] + - h\_reserved[3] - Zero. The checksum is calculated against the FS UUID, the 64-bit block number -- Gitee From 27d40cc66fb275b8c38be0766f63cd14118d4423 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 26 Jul 2022 17:39:28 +0800 Subject: [PATCH 636/674] ext4: fix overhead calculation to account for the reserved gdt blocks stable inclusion from stable-v5.10.113 commit 4789149b9ea2a1893c62d816742f1a76514fc901 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=4789149b9ea2a1893c62d816742f1a76514fc901 -------------------------------- commit 10b01ee92df52c8d7200afead4d5e5f55a5c58b1 upstream. The kernel calculation was underestimating the overhead by not taking into account the reserved gdt blocks. With this change, the overhead calculated by the kernel matches the overhead calculation in mke2fs. Signed-off-by: Theodore Ts'o Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- fs/ext4/super.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 26c938360895..34e4e00445da 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3912,9 +3912,11 @@ static int count_overhead(struct super_block *sb, ext4_group_t grp, ext4_fsblk_t first_block, last_block, b; ext4_group_t i, ngroups = ext4_get_groups_count(sb); int s, j, count = 0; + int has_super = ext4_bg_has_super(sb, grp); if (!ext4_has_feature_bigalloc(sb)) - return (ext4_bg_has_super(sb, grp) + ext4_bg_num_gdb(sb, grp) + + return (has_super + ext4_bg_num_gdb(sb, grp) + + (has_super ? le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0) + sbi->s_itb_per_group + 2); first_block = le32_to_cpu(sbi->s_es->s_first_data_block) + -- Gitee From fd9c2ff88c62fbb32351bc8165f473a54e91817c Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 26 Jul 2022 17:39:29 +0800 Subject: [PATCH 637/674] ext4: force overhead calculation if the s_overhead_cluster makes no sense stable inclusion from stable-v5.10.113 commit e1e96e37272156d691203a3725b876787f38c8f2 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=e1e96e37272156d691203a3725b876787f38c8f2 -------------------------------- commit 85d825dbf4899a69407338bae462a59aa9a37326 upstream. If the file system does not use bigalloc, calculating the overhead is cheap, so force the recalculation of the overhead so we don't have to trust the precalculated overhead in the superblock. Signed-off-by: Theodore Ts'o Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- fs/ext4/super.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 34e4e00445da..706a159817ee 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4977,9 +4977,18 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) * Get the # of file system overhead blocks from the * superblock if present. */ - if (es->s_overhead_clusters) - sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters); - else { + sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters); + /* ignore the precalculated value if it is ridiculous */ + if (sbi->s_overhead > ext4_blocks_count(es)) + sbi->s_overhead = 0; + /* + * If the bigalloc feature is not enabled recalculating the + * overhead doesn't take long, so we might as well just redo + * it to make sure we are using the correct value. + */ + if (!ext4_has_feature_bigalloc(sb)) + sbi->s_overhead = 0; + if (sbi->s_overhead == 0) { err = ext4_calculate_overhead(sb); if (err) goto failed_mount_wq; -- Gitee From f120da4c1f854aadb526a8fd86e3caf0c9fdbe5e Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Tue, 26 Jul 2022 17:39:30 +0800 Subject: [PATCH 638/674] can: isotp: stop timeout monitoring when no first frame was sent stable inclusion from stable-v5.10.113 commit 50aac44273600cb0ae1efd010bb1de7701444a41 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=50aac44273600cb0ae1efd010bb1de7701444a41 -------------------------------- commit d73497081710c876c3c61444445512989e102152 upstream. The first attempt to fix a the 'impossible' WARN_ON_ONCE(1) in isotp_tx_timer_handler() focussed on the identical CAN IDs created by the syzbot reproducer and lead to upstream fix/commit 3ea566422cbd ("can: isotp: sanitize CAN ID checks in isotp_bind()"). But this did not catch the root cause of the wrong tx.state in the tx_timer handler. In the isotp 'first frame' case a timeout monitoring needs to be started before the 'first frame' is send. But when this sending failed the timeout monitoring for this specific frame has to be disabled too. Otherwise the tx_timer is fired with the 'warn me' tx.state of ISOTP_IDLE. Fixes: e057dd3fc20f ("can: add ISO 15765-2:2016 transport protocol") Link: https://lore.kernel.org/all/20220405175112.2682-1-socketcan@hartkopp.net Reported-by: syzbot+2339c27f5c66c652843e@syzkaller.appspotmail.com Signed-off-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- net/can/isotp.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/net/can/isotp.c b/net/can/isotp.c index 9a4a9c5a9f24..c515bbd46c67 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -864,6 +864,7 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) struct canfd_frame *cf; int ae = (so->opt.flags & CAN_ISOTP_EXTEND_ADDR) ? 1 : 0; int wait_tx_done = (so->opt.flags & CAN_ISOTP_WAIT_TX_DONE) ? 1 : 0; + s64 hrtimer_sec = 0; int off; int err; @@ -962,7 +963,9 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) isotp_create_fframe(cf, so, ae); /* start timeout for FC */ - hrtimer_start(&so->txtimer, ktime_set(1, 0), HRTIMER_MODE_REL_SOFT); + hrtimer_sec = 1; + hrtimer_start(&so->txtimer, ktime_set(hrtimer_sec, 0), + HRTIMER_MODE_REL_SOFT); } /* send the first or only CAN frame */ @@ -975,6 +978,11 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) if (err) { pr_notice_once("can-isotp: %s: can_send_ret %d\n", __func__, err); + + /* no transmission -> no timeout monitoring */ + if (hrtimer_sec) + hrtimer_cancel(&so->txtimer); + goto err_out_drop; } -- Gitee From 0c5ab04a2f48e4ea5814c339410bc679062b41ea Mon Sep 17 00:00:00 2001 From: Tudor Ambarus Date: Tue, 26 Jul 2022 17:39:31 +0800 Subject: [PATCH 639/674] spi: atmel-quadspi: Fix the buswidth adjustment between spi-mem and controller stable inclusion from stable-v5.10.113 commit dccee748af17fc087ff5017152e532ef8e18c8c0 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=dccee748af17fc087ff5017152e532ef8e18c8c0 -------------------------------- commit 8c235cc25087495c4288d94f547e9d3061004991 upstream. Use the spi_mem_default_supports_op() core helper in order to take into account the buswidth specified by the user in device tree. Cc: Fixes: 0e6aae08e9ae ("spi: Add QuadSPI driver for Atmel SAMA5D2") Signed-off-by: Tudor Ambarus Link: https://lore.kernel.org/r/20220406133604.455356-1-tudor.ambarus@microchip.com Signed-off-by: Mark Brown Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/spi/atmel-quadspi.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/spi/atmel-quadspi.c b/drivers/spi/atmel-quadspi.c index 1e63fd4821f9..8aa89d93db11 100644 --- a/drivers/spi/atmel-quadspi.c +++ b/drivers/spi/atmel-quadspi.c @@ -277,6 +277,9 @@ static int atmel_qspi_find_mode(const struct spi_mem_op *op) static bool atmel_qspi_supports_op(struct spi_mem *mem, const struct spi_mem_op *op) { + if (!spi_mem_default_supports_op(mem, op)) + return false; + if (atmel_qspi_find_mode(op) < 0) return false; -- Gitee From dd76b5fe66faa3615f8d284d1fd07f1cccabf125 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Tue, 26 Jul 2022 17:39:32 +0800 Subject: [PATCH 640/674] staging: ion: Prevent incorrect reference counting behavour stable inclusion from stable-v5.10.113 commit fea24b07edfc348c67a019b6e17b39c0698e631f category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=fea24b07edfc348c67a019b6e17b39c0698e631f -------------------------------- Supply additional check in order to prevent unexpected results. Fixes: b892bf75b2034 ("ion: Switch ion to use dma-buf") Suggested-by: Dan Carpenter Signed-off-by: Lee Jones Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/staging/android/ion/ion.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/staging/android/ion/ion.c b/drivers/staging/android/ion/ion.c index e1fe03ceb7f1..e6d4a3ee6cda 100644 --- a/drivers/staging/android/ion/ion.c +++ b/drivers/staging/android/ion/ion.c @@ -114,6 +114,9 @@ static void *ion_buffer_kmap_get(struct ion_buffer *buffer) void *vaddr; if (buffer->kmap_cnt) { + if (buffer->kmap_cnt == INT_MAX) + return ERR_PTR(-EOVERFLOW); + buffer->kmap_cnt++; return buffer->vaddr; } -- Gitee From 9a1fc0d8feaf050654872afaa9e66323f12e4a11 Mon Sep 17 00:00:00 2001 From: Khazhismel Kumykov Date: Tue, 26 Jul 2022 17:39:33 +0800 Subject: [PATCH 641/674] block/compat_ioctl: fix range check in BLKGETSIZE stable inclusion from stable-v5.10.113 commit 8bedbc8f7f35f533000b347644b6bf1f62524676 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=8bedbc8f7f35f533000b347644b6bf1f62524676 -------------------------------- commit ccf16413e520164eb718cf8b22a30438da80ff23 upstream. kernel ulong and compat_ulong_t may not be same width. Use type directly to eliminate mismatches. This would result in truncation rather than EFBIG for 32bit mode for large disks. Reviewed-by: Bart Van Assche Signed-off-by: Khazhismel Kumykov Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220414224056.2875681-1-khazhy@google.com Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- block/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/ioctl.c b/block/ioctl.c index 8171858dc8a9..7c578f84a4fd 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -683,7 +683,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) (bdev->bd_bdi->ra_pages * PAGE_SIZE) / 512); case BLKGETSIZE: size = i_size_read(bdev->bd_inode); - if ((size >> 9) > ~0UL) + if ((size >> 9) > ~(compat_ulong_t)0) return -EFBIG; return compat_put_ulong(argp, size >> 9); -- Gitee From 156b297acb77278b3f11077a9e8f92e81c87aebe Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Tue, 26 Jul 2022 17:39:34 +0800 Subject: [PATCH 642/674] Revert "net: micrel: fix KS8851_MLL Kconfig" stable inclusion from stable-v5.10.113 commit 7992fdb045fbc7cb0e34eba464b73044585c0638 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=7992fdb045fbc7cb0e34eba464b73044585c0638 -------------------------------- This reverts commit 1ff5359afa5ec0dd09fe76183dc4fa24b50e4125 which is commit c3efcedd272aa6dd5929e20cf902a52ddaa1197a upstream. The upstream commit c3efcedd272a ("net: micrel: fix KS8851_MLL Kconfig") depends on e5f31552674e ("ethernet: fix PTP_1588_CLOCK dependencies") which is not part of Linux 5.10.y . Revert the aforementioned commit to prevent breakage in 5.10.y . Signed-off-by: Marek Vasut Cc: David S. Miller Cc: Jakub Kicinski Cc: Paolo Abeni Cc: Randy Dunlap Cc: Sasha Levin Cc: # 5.10.x Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/net/ethernet/micrel/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/micrel/Kconfig b/drivers/net/ethernet/micrel/Kconfig index 9ceb7e1fb169..42bc014136fe 100644 --- a/drivers/net/ethernet/micrel/Kconfig +++ b/drivers/net/ethernet/micrel/Kconfig @@ -37,7 +37,6 @@ config KS8851 config KS8851_MLL tristate "Micrel KS8851 MLL" depends on HAS_IOMEM - depends on PTP_1588_CLOCK_OPTIONAL select MII select CRC32 select EEPROM_93CX6 -- Gitee From 03a99552eb74a789de7de2a13b0ea9b31dcf2dfa Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Thu, 28 Jul 2022 18:06:31 +0800 Subject: [PATCH 643/674] bpf: Don't redirect packets with invalid pkt_len mainline inclusion from mainline-v5.19-rc6 commit fd1894224407c484f652ad456e1ce423e89bb3eb category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HWKR CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/commit/?id=fd1894224407 -------------------------------- Syzbot found an issue [1]: fq_codel_drop() try to drop a flow whitout any skbs, that is, the flow->head is null. The root cause, as the [2] says, is because that bpf_prog_test_run_skb() run a bpf prog which redirects empty skbs. So we should determine whether the length of the packet modified by bpf prog or others like bpf_prog_test is valid before forwarding it directly. LINK: [1] https://syzkaller.appspot.com/bug?id=0b84da80c2917757915afa89f7738a9d16ec96c5 LINK: [2] https://www.spinics.net/lists/netdev/msg777503.html Reported-by: syzbot+7a12909485b94426aceb@syzkaller.appspotmail.com Signed-off-by: Zhengchao Shao Reviewed-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20220715115559.139691-1-shaozhengchao@huawei.com Signed-off-by: Alexei Starovoitov Reviewed-by: Wei Yongjun Reviewed-by: Yue Haibing Signed-off-by: Zheng Zengkai --- include/linux/skbuff.h | 8 ++++++++ net/bpf/test_run.c | 3 +++ net/core/dev.c | 1 + 3 files changed, 12 insertions(+) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 68efccc15a87..72cfb047fd2c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2251,6 +2251,14 @@ static inline void skb_set_tail_pointer(struct sk_buff *skb, const int offset) #endif /* NET_SKBUFF_DATA_USES_OFFSET */ +static inline void skb_assert_len(struct sk_buff *skb) +{ +#ifdef CONFIG_DEBUG_NET + if (WARN_ONCE(!skb->len, "%s\n", __func__)) + DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false); +#endif /* CONFIG_DEBUG_NET */ +} + /* * Add data to an sk_buff */ diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 99712d35e535..f266a9453c8e 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -398,6 +398,9 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) { struct qdisc_skb_cb *cb = (struct qdisc_skb_cb *)skb->cb; + if (!skb->len) + return -EINVAL; + if (!__skb) return 0; diff --git a/net/core/dev.c b/net/core/dev.c index 12089c484b30..8e4de36eede8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4094,6 +4094,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) bool again = false; skb_reset_mac_header(skb); + skb_assert_len(skb); if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP)) __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED); -- Gitee From 92561effc05f39fbf33d80fd0f9e5978fb7b3a71 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Thu, 28 Jul 2022 18:06:32 +0800 Subject: [PATCH 644/674] ucounts: add missing data type changes mainline inclusion from mainline-v5.14-rc6 commit f153c2246783ba210493054d99c66353f56423c9 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5IDIC CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f153c2246783ba210493054d99c66353f56423c9 -------------------------------- commit f9c82a4ea89c3 ("Increase size of ucounts to atomic_long_t") changed the data type of ucounts/ucounts_max to long, but missed to adjust a few other places. This is noticeable on big endian platforms from user space because the /proc/sys/user/max_*_names files all contain 0. v4 - Made the min and max constants long so the sysctl values are actually settable on little endian machines. -- EWB Fixes: f9c82a4ea89c ("Increase size of ucounts to atomic_long_t") Signed-off-by: Sven Schnelle Tested-by: Nathan Chancellor Tested-by: Linux Kernel Functional Testing Acked-by: Alexey Gladkov v1: https://lkml.kernel.org/r/20210721115800.910778-1-svens@linux.ibm.com v2: https://lkml.kernel.org/r/20210721125233.1041429-1-svens@linux.ibm.com v3: https://lkml.kernel.org/r/20210730062854.3601635-1-svens@linux.ibm.com Link: https://lkml.kernel.org/r/8735rijqlv.fsf_-_@disp2133 Signed-off-by: Eric W. Biederman Conflict: fs/notify/fanotify/fanotify_user.c Signed-off-by: Li Nan Reviewed-by: Zhang Yi Signed-off-by: Zheng Zengkai --- fs/notify/inotify/inotify_user.c | 17 +++++++++++------ kernel/ucount.c | 19 +++++++++++-------- 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 5f6c6bf65909..f13b64729d08 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -46,22 +46,27 @@ struct kmem_cache *inotify_inode_mark_cachep __read_mostly; #include +static long it_zero = 0; +static long it_int_max = INT_MAX; + struct ctl_table inotify_table[] = { { .procname = "max_user_instances", .data = &init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES], - .maxlen = sizeof(int), + .maxlen = sizeof(long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &it_zero, + .extra2 = &it_int_max, }, { .procname = "max_user_watches", .data = &init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES], - .maxlen = sizeof(int), + .maxlen = sizeof(long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &it_zero, + .extra2 = &it_int_max, }, { .procname = "max_queued_events", diff --git a/kernel/ucount.c b/kernel/ucount.c index dff1d9b739d2..1f5825b674d8 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -52,14 +52,17 @@ static struct ctl_table_root set_root = { .permissions = set_permissions, }; -#define UCOUNT_ENTRY(name) \ - { \ - .procname = name, \ - .maxlen = sizeof(int), \ - .mode = 0644, \ - .proc_handler = proc_dointvec_minmax, \ - .extra1 = SYSCTL_ZERO, \ - .extra2 = SYSCTL_INT_MAX, \ +static long ue_zero = 0; +static long ue_int_max = INT_MAX; + +#define UCOUNT_ENTRY(name) \ + { \ + .procname = name, \ + .maxlen = sizeof(long), \ + .mode = 0644, \ + .proc_handler = proc_doulongvec_minmax, \ + .extra1 = &ue_zero, \ + .extra2 = &ue_int_max, \ } static struct ctl_table user_table[] = { UCOUNT_ENTRY("max_user_namespaces"), -- Gitee From a51eb65ba1c5c5fcf7c026732d5ace1f9cbaf8f9 Mon Sep 17 00:00:00 2001 From: Luo Meng Date: Thu, 28 Jul 2022 18:06:33 +0800 Subject: [PATCH 645/674] block: Fix warning in bd_link_disk_holder() hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ETAB CVE: NA -------------------------------- Warning reports as follows: WARNING: CPU: 3 PID: 674 at fs/block_dev.c:1272 bd_link_disk_holder+0xcd/0x270 Modules linked in: null_blk(+) CPU: 3 PID: 674 Comm: dmsetup Not tainted 5.10.0-16691-gf6076432827d-dirty #158 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-4 RIP: 0010:bd_link_disk_holder+0xcd/0x270 Code: 69 73 ee 00 44 89 e8 5b 48 83 05 c5 bf 6d 0c 01 5d 41 5c 41 5d 41 5e 41 8 RSP: 0018:ffffc9000049bbb8 EFLAGS: 00010202 RAX: ffff888104e39038 RBX: ffff888104185000 RCX: 0000000000000000 RDX: 0000000000000001 RSI: ffffffffaa085692 RDI: 0000000000000000 RBP: ffff88810cc2ae00 R08: ffffffffa853659b R09: 0000000000000000 R10: ffffc9000049bbb0 R11: 720030626c6c756e R12: ffff88810e800000 R13: ffff88810e800090 R14: ffff888103570c98 R15: ffff888103570c80 FS: 00007fb49dc13dc0(0000) GS:ffff88813bd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007ff994ebde70 CR3: 000000010d54a000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: dm_get_table_device+0x175/0x300 dm_get_device+0x238/0x360 linear_ctr+0xee/0x170 dm_table_add_target+0x199/0x4b0 table_load+0x18c/0x480 ? table_clear+0x190/0x190 ctl_ioctl+0x21d/0x640 ? check_preemption_disabled+0x140/0x150 dm_ctl_ioctl+0x12/0x20 __se_sys_ioctl+0xb1/0x100 __x64_sys_ioctl+0x1e/0x30 do_syscall_64+0x45/0x70 entry_SYSCALL_64_after_hwframe+0x44/0xa9 This can reproduce by concurrent operations: 1. modprobe null_blk 2. echo -e "0 10000 linear /dev/nullb0 0" > table dmsetup create xxx table t1: create disk a | t2: dm setup | device_add_disk | dev->devt = devt | | dm_get_table_device | open_table_device | blkdev_get_by_dev -> succeed | bd_link_disk_holder | -> holder_dir is still NULL register_disk -> create holder_dir kobject_create_and_add device_add_disk() will set devt before creating holder_dir, which leaves a window that dm_get_table_device() can find the disk by devt while it's holder_dir is NULL. So move GENHD_FL_UP in blk_register_queue() to avoid this warning and fix a NULL-ptr in __blk_mq_sched_bio_merge(). Signed-off-by: Luo Meng Reviewed-by: Jason Yan Signed-off-by: Yu Kuai Reviewed-by: Jason Yan Signed-off-by: Zheng Zengkai --- block/blk-sysfs.c | 6 ++++++ block/genhd.c | 2 -- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 0a4fcbda8ab4..aff53c3ae836 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -910,6 +910,12 @@ int blk_register_queue(struct gendisk *disk) kobject_uevent(&q->elevator->kobj, KOBJ_ADD); mutex_unlock(&q->sysfs_lock); + + /* + * Set the flag at last, so that block devcie can't be opened + * before it's registration is done. + */ + disk->flags |= GENHD_FL_UP; ret = 0; unlock: mutex_unlock(&q->sysfs_dir_lock); diff --git a/block/genhd.c b/block/genhd.c index 8b37fcfa10d1..9d91f880ea95 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -799,8 +799,6 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, WARN_ON(!disk->minors && !(disk->flags & (GENHD_FL_EXT_DEVT | GENHD_FL_HIDDEN))); - disk->flags |= GENHD_FL_UP; - retval = blk_alloc_devt(&disk->part0, &devt); if (retval) { WARN_ON(1); -- Gitee From d7c2ddc8456d91991dbc87da304742d60d577b41 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 28 Jul 2022 18:06:34 +0800 Subject: [PATCH 646/674] block: fix that part scan is disabled in device_add_disk() hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ETAB CVE: NA -------------------------------- Patch ("block: Fix warning in bd_link_disk_holder()") moves the setting of flag 'GENHD_FL_UP' behind blkdev_get, which will disabled part scan: devcie_add_disk register_disk blkdev_get __blkdev_get bdev_get_gendisk get_gendisk -> failed because 'GENHD_FL_UP' is not set And this will cause tests block/017, block/018 and scsi/004 to fail. Fix the problem by moving part scan as well. Signed-off-by: Yu Kuai Reviewed-by: Jason Yan Signed-off-by: Zheng Zengkai --- block/blk-sysfs.c | 33 +++++++++++++++++++++++++++++++++ block/genhd.c | 27 --------------------------- 2 files changed, 33 insertions(+), 27 deletions(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index aff53c3ae836..f7cd16cec0ed 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -821,6 +821,38 @@ struct kobj_type blk_queue_ktype = { .release = blk_release_queue, }; +static void disk_scan_partitions(struct gendisk *disk) +{ + struct block_device *bdev; + + if (!get_capacity(disk) || !disk_part_scan_enabled(disk)) + return; + + set_bit(GD_NEED_PART_SCAN, &disk->state); + bdev = blkdev_get_by_dev(disk_devt(disk), FMODE_READ, NULL); + if (!IS_ERR(bdev)) + blkdev_put(bdev, FMODE_READ); +} + +static void disk_init_partition(struct gendisk *disk) +{ + struct device *ddev = disk_to_dev(disk); + struct disk_part_iter piter; + struct hd_struct *part; + + disk_scan_partitions(disk); + + /* announce disk after possible partitions are created */ + dev_set_uevent_suppress(ddev, 0); + kobject_uevent(&ddev->kobj, KOBJ_ADD); + + /* announce possible partitions */ + disk_part_iter_init(&piter, disk, 0); + while ((part = disk_part_iter_next(&piter))) + kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD); + disk_part_iter_exit(&piter); +} + /** * blk_register_queue - register a block layer queue with sysfs * @disk: Disk of which the request queue should be registered with sysfs. @@ -916,6 +948,7 @@ int blk_register_queue(struct gendisk *disk) * before it's registration is done. */ disk->flags |= GENHD_FL_UP; + disk_init_partition(disk); ret = 0; unlock: mutex_unlock(&q->sysfs_dir_lock); diff --git a/block/genhd.c b/block/genhd.c index 9d91f880ea95..021c9c2d7231 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -687,25 +687,10 @@ static int exact_lock(dev_t devt, void *data) return 0; } -static void disk_scan_partitions(struct gendisk *disk) -{ - struct block_device *bdev; - - if (!get_capacity(disk) || !disk_part_scan_enabled(disk)) - return; - - set_bit(GD_NEED_PART_SCAN, &disk->state); - bdev = blkdev_get_by_dev(disk_devt(disk), FMODE_READ, NULL); - if (!IS_ERR(bdev)) - blkdev_put(bdev, FMODE_READ); -} - static void register_disk(struct device *parent, struct gendisk *disk, const struct attribute_group **groups) { struct device *ddev = disk_to_dev(disk); - struct disk_part_iter piter; - struct hd_struct *part; int err; ddev->parent = parent; @@ -743,18 +728,6 @@ static void register_disk(struct device *parent, struct gendisk *disk, if (disk->flags & GENHD_FL_HIDDEN) return; - disk_scan_partitions(disk); - - /* announce disk after possible partitions are created */ - dev_set_uevent_suppress(ddev, 0); - kobject_uevent(&ddev->kobj, KOBJ_ADD); - - /* announce possible partitions */ - disk_part_iter_init(&piter, disk, 0); - while ((part = disk_part_iter_next(&piter))) - kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD); - disk_part_iter_exit(&piter); - if (disk->queue->backing_dev_info->dev) { err = sysfs_create_link(&ddev->kobj, &disk->queue->backing_dev_info->dev->kobj, -- Gitee From 67b9d2775f9adafd9a4f5d8a7cc05f9cce884f1a Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 28 Jul 2022 18:06:35 +0800 Subject: [PATCH 647/674] block: prevent lockdep false positive warning about 'bd_mutex' hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ETAB CVE: NA -------------------------------- Patch ("block: fix that part scan is disabled in device_add_disk()") confuse lockdep to produce following warning: ===================================================== WARNING: possible circular locking dependency detected 4.18.0+ #2 Tainted: G ---------r- - ------------------------------------------------------ syz-executor.0/4652 is trying to acquire lock: 00000000ad5f5a19 (&mddev->open_mutex){+.+.}, at: md_open+0x13a/0x260 home/install/linux-rh-3-10/drivers/md/md.c:7626 but task is already holding lock: 000000005c3a3fea (&bdev->bd_mutex){+.+.}, at: __blkdev_get+0x156/0x1490 home/install/linux-rh-3-10/fs/block_dev.c:1583 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #2 (&bdev->bd_mutex){+.+.}: __mutex_lock_common home/install/linux-rh-3-10/kernel/locking/mutex.c:925 [inline] __mutex_lock+0x105/0x1270 home/install/linux-rh-3-10/kernel/locking/mutex.c:1072 __blkdev_get+0x156/0x1490 home/install/linux-rh-3-10/fs/block_dev.c:1583 blkdev_get+0x33c/0xac0 home/install/linux-rh-3-10/fs/block_dev.c:1735 disk_init_partition home/install/linux-rh-3-10/block/blk-sysfs.c:972 [inline] blk_register_queue+0x5ed/0x6c0 home/install/linux-rh-3-10/block/blk-sysfs.c:1055 __device_add_disk+0xab5/0xd70 home/install/linux-rh-3-10/block/genhd.c:729 sd_probe_async+0x447/0x852 home/install/linux-rh-3-10/drivers/scsi/sd.c:3249 async_run_entry_fn+0xe1/0x700 home/install/linux-rh-3-10/kernel/async.c:127 process_one_work+0x9cf/0x1940 home/install/linux-rh-3-10/kernel/workqueue.c:2175 worker_thread+0x91/0xc50 home/install/linux-rh-3-10/kernel/workqueue.c:2321 kthread+0x33a/0x400 home/install/linux-rh-3-10/kernel/kthread.c:257 ret_from_fork+0x3a/0x50 home/install/linux-rh-3-10/arch/x86/entry/entry_64.S:355 -> #1 (&q->sysfs_dir_lock){+.+.}: __mutex_lock_common home/install/linux-rh-3-10/kernel/locking/mutex.c:925 [inline] __mutex_lock+0x105/0x1270 home/install/linux-rh-3-10/kernel/locking/mutex.c:1072 blk_register_queue+0x143/0x6c0 home/install/linux-rh-3-10/block/blk-sysfs.c:1010 __device_add_disk+0xab5/0xd70 home/install/linux-rh-3-10/block/genhd.c:729 add_disk home/install/linux-rh-3-10/./include/linux/genhd.h:447 [inline] md_alloc+0xb06/0x10d0 home/install/linux-rh-3-10/drivers/md/md.c:5525 md_probe+0x32/0x60 home/install/linux-rh-3-10/drivers/md/md.c:5554 kobj_lookup+0x2d2/0x450 home/install/linux-rh-3-10/drivers/base/map.c:152 get_gendisk+0x3b/0x360 home/install/linux-rh-3-10/block/genhd.c:860 bdev_get_gendisk home/install/linux-rh-3-10/fs/block_dev.c:1181 [inline] __blkdev_get+0x3b6/0x1490 home/install/linux-rh-3-10/fs/block_dev.c:1578 blkdev_get+0x33c/0xac0 home/install/linux-rh-3-10/fs/block_dev.c:1735 blkdev_open+0x1c2/0x250 home/install/linux-rh-3-10/fs/block_dev.c:1923 do_dentry_open+0x686/0xf50 home/install/linux-rh-3-10/fs/open.c:777 do_last home/install/linux-rh-3-10/fs/namei.c:3449 [inline] path_openat+0x92f/0x28c0 home/install/linux-rh-3-10/fs/namei.c:3578 do_filp_open+0x1aa/0x2b0 home/install/linux-rh-3-10/fs/namei.c:3613 do_sys_open+0x307/0x490 home/install/linux-rh-3-10/fs/open.c:1075 do_syscall_64+0xca/0x5c0 home/install/linux-rh-3-10/arch/x86/entry/common.c:298 entry_SYSCALL_64_after_hwframe+0x6a/0xdf -> #0 (&mddev->open_mutex){+.+.}: lock_acquire+0x10b/0x3a0 home/install/linux-rh-3-10/kernel/locking/lockdep.c:3868 __mutex_lock_common home/install/linux-rh-3-10/kernel/locking/mutex.c:925 [inline] __mutex_lock+0x105/0x1270 home/install/linux-rh-3-10/kernel/locking/mutex.c:1072 md_open+0x13a/0x260 home/install/linux-rh-3-10/drivers/md/md.c:7626 __blkdev_get+0x2dc/0x1490 home/install/linux-rh-3-10/fs/block_dev.c:1599 blkdev_get+0x33c/0xac0 home/install/linux-rh-3-10/fs/block_dev.c:1735 blkdev_open+0x1c2/0x250 home/install/linux-rh-3-10/fs/block_dev.c:1923 do_dentry_open+0x686/0xf50 home/install/linux-rh-3-10/fs/open.c:777 do_last home/install/linux-rh-3-10/fs/namei.c:3449 [inline] path_openat+0x92f/0x28c0 home/install/linux-rh-3-10/fs/namei.c:3578 do_filp_open+0x1aa/0x2b0 home/install/linux-rh-3-10/fs/namei.c:3613 do_sys_open+0x307/0x490 home/install/linux-rh-3-10/fs/open.c:1075 do_syscall_64+0xca/0x5c0 home/install/linux-rh-3-10/arch/x86/entry/common.c:298 entry_SYSCALL_64_after_hwframe+0x6a/0xdf other info that might help us debug this: Chain exists of: &mddev->open_mutex --> &q->sysfs_dir_lock --> &bdev->bd_mutex Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&bdev->bd_mutex); lock(&q->sysfs_dir_lock); lock(&bdev->bd_mutex); lock(&mddev->open_mutex); *** DEADLOCK *** Since 'bd_mutex' and 'sysfs_dir_lock' is different is for each device, deadlock between md_open() and sd_probe_async() is impossible. However, lockdep is treating 'bd_mutex' and 'sysfs_dir_lock' from different devices the same, and patch "block: fix that part scan is disabled in device_add_disk()" is holding 'bd_mutex' inside 'sysfs_dir_lock', which causes the false positive warning. Fix the false positive warning by don't grab 'bd_mutex' inside 'sysfs_dir_lock'. Signed-off-by: Yu Kuai Reviewed-by: Jason Yan Signed-off-by: Zheng Zengkai --- block/blk-sysfs.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index f7cd16cec0ed..548d758365c6 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -948,10 +948,17 @@ int blk_register_queue(struct gendisk *disk) * before it's registration is done. */ disk->flags |= GENHD_FL_UP; - disk_init_partition(disk); ret = 0; unlock: mutex_unlock(&q->sysfs_dir_lock); + /* + * Init partitions after releasing 'sysfs_dir_lock', otherwise lockdep + * will be confused because it will treat 'bd_mutex' from different + * devices as the same lock. + */ + if (!ret) + disk_init_partition(disk); + return ret; } EXPORT_SYMBOL_GPL(blk_register_queue); -- Gitee From 77eabe0b93e9dd68b400d52c9caa9282ca6094df Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 28 Jul 2022 18:06:36 +0800 Subject: [PATCH 648/674] inotify: show inotify mask flags in proc fdinfo mainline inclusion from mainline-v5.19-rc1 commit a32e697cda27679a0327ae2cafdad8c7170f548f category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5IHD1 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=a32e697cda27679a0327ae2cafdad8c7170f548f -------------------------------- The inotify mask flags IN_ONESHOT and IN_EXCL_UNLINK are not "internal to kernel" and should be exposed in procfs fdinfo so CRIU can restore them. Fixes: 6933599697c9 ("inotify: hide internal kernel bits from fdinfo") Link: https://lore.kernel.org/r/20220422120327.3459282-2-amir73il@gmail.com Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Signed-off-by: Li Nan Reviewed-by: Zhang Yi Signed-off-by: Zheng Zengkai --- fs/notify/fdinfo.c | 11 ++--------- fs/notify/inotify/inotify.h | 12 ++++++++++++ fs/notify/inotify/inotify_user.c | 2 +- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index f0d6b54be412..765b50aeadd2 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -83,16 +83,9 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark); inode = igrab(fsnotify_conn_inode(mark->connector)); if (inode) { - /* - * IN_ALL_EVENTS represents all of the mask bits - * that we expose to userspace. There is at - * least one bit (FS_EVENT_ON_CHILD) which is - * used only internally to the kernel. - */ - u32 mask = mark->mask & IN_ALL_EVENTS; - seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:%x ", + seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:0 ", inode_mark->wd, inode->i_ino, inode->i_sb->s_dev, - mask, mark->ignored_mask); + inotify_mark_user_mask(mark)); show_mark_fhandle(m, inode); seq_putc(m, '\n'); iput(inode); diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h index 2007e3711916..8f00151eb731 100644 --- a/fs/notify/inotify/inotify.h +++ b/fs/notify/inotify/inotify.h @@ -22,6 +22,18 @@ static inline struct inotify_event_info *INOTIFY_E(struct fsnotify_event *fse) return container_of(fse, struct inotify_event_info, fse); } +/* + * INOTIFY_USER_FLAGS represents all of the mask bits that we expose to + * userspace. There is at least one bit (FS_EVENT_ON_CHILD) which is + * used only internally to the kernel. + */ +#define INOTIFY_USER_MASK (IN_ALL_EVENTS | IN_ONESHOT | IN_EXCL_UNLINK) + +static inline __u32 inotify_mark_user_mask(struct fsnotify_mark *fsn_mark) +{ + return fsn_mark->mask & INOTIFY_USER_MASK; +} + extern void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group); extern int inotify_handle_inode_event(struct fsnotify_mark *inode_mark, diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index f13b64729d08..3986f1877457 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -93,7 +93,7 @@ static inline __u32 inotify_arg_to_mask(struct inode *inode, u32 arg) mask |= FS_EVENT_ON_CHILD; /* mask off the flags used to open the fd */ - mask |= (arg & (IN_ALL_EVENTS | IN_ONESHOT | IN_EXCL_UNLINK)); + mask |= (arg & INOTIFY_USER_MASK); return mask; } -- Gitee From c248fb22ed21b7bf4308bfcdf962d11b4f73f62c Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Thu, 28 Jul 2022 18:06:37 +0800 Subject: [PATCH 649/674] fbcon: Disallow setting font bigger than screen size stable inclusion from stable-v5.10.130 commit b727561ddc9360de9631af2d970d8ffed676a750 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5IQ4M CVE: CVE-2021-33655 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=b727561ddc9360de9631af2d970d8ffed676a750 -------------------------------- commit 65a01e601dbba8b7a51a2677811f70f783766682 upstream. Prevent that users set a font size which is bigger than the physical screen. It's unlikely this may happen (because screens are usually much larger than the fonts and each font char is limited to 32x32 pixels), but it may happen on smaller screens/LCD displays. Signed-off-by: Helge Deller Reviewed-by: Daniel Vetter Reviewed-by: Geert Uytterhoeven Cc: stable@vger.kernel.org # v4.14+ Signed-off-by: Greg Kroah-Hartman Signed-off-by: Chen Jun Reviewed-by: Xiu Jianfeng Reviewed-by: Weilong Chen Signed-off-by: Zheng Zengkai --- drivers/video/fbdev/core/fbcon.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index f102519ccefb..8d81e9321cf7 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -2510,6 +2510,11 @@ static int fbcon_set_font(struct vc_data *vc, struct console_font *font, if (charcount != 256 && charcount != 512) return -EINVAL; + /* font bigger than screen resolution ? */ + if (w > FBCON_SWAP(info->var.rotate, info->var.xres, info->var.yres) || + h > FBCON_SWAP(info->var.rotate, info->var.yres, info->var.xres)) + return -EINVAL; + /* Make sure drawing engine can handle the font */ if (!(info->pixmap.blit_x & (1 << (font->width - 1))) || !(info->pixmap.blit_y & (1 << (font->height - 1)))) -- Gitee From e664e9804a489f76e72be7601f53654b7d29014f Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Thu, 28 Jul 2022 18:06:38 +0800 Subject: [PATCH 650/674] fbcon: Prevent that screen size is smaller than font size stable inclusion from stable-v5.10.130 commit cecb806c766c78e1be62b6b7b1483ef59bbaeabe category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5IQ4M CVE: CVE-2021-33655 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=cecb806c766c78e1be62b6b7b1483ef59bbaeabe -------------------------------- commit e64242caef18b4a5840b0e7a9bff37abd4f4f933 upstream. We need to prevent that users configure a screen size which is smaller than the currently selected font size. Otherwise rendering chars on the screen will access memory outside the graphics memory region. This patch adds a new function fbcon_modechange_possible() which implements this check and which later may be extended with other checks if necessary. The new function is called from the FBIOPUT_VSCREENINFO ioctl handler in fbmem.c, which will return -EINVAL if userspace asked for a too small screen size. Signed-off-by: Helge Deller Reviewed-by: Geert Uytterhoeven Cc: stable@vger.kernel.org # v5.4+ Signed-off-by: Greg Kroah-Hartman Signed-off-by: Chen Jun Reviewed-by: Xiu Jianfeng Reviewed-by: Weilong Chen Signed-off-by: Zheng Zengkai --- drivers/video/fbdev/core/fbcon.c | 28 ++++++++++++++++++++++++++++ drivers/video/fbdev/core/fbmem.c | 4 +++- include/linux/fbcon.h | 4 ++++ 3 files changed, 35 insertions(+), 1 deletion(-) diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index 8d81e9321cf7..b4260a830e78 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -2776,6 +2776,34 @@ void fbcon_update_vcs(struct fb_info *info, bool all) } EXPORT_SYMBOL(fbcon_update_vcs); +/* let fbcon check if it supports a new screen resolution */ +int fbcon_modechange_possible(struct fb_info *info, struct fb_var_screeninfo *var) +{ + struct fbcon_ops *ops = info->fbcon_par; + struct vc_data *vc; + unsigned int i; + + WARN_CONSOLE_UNLOCKED(); + + if (!ops) + return 0; + + /* prevent setting a screen size which is smaller than font size */ + for (i = first_fb_vc; i <= last_fb_vc; i++) { + vc = vc_cons[i].d; + if (!vc || vc->vc_mode != KD_TEXT || + registered_fb[con2fb_map[i]] != info) + continue; + + if (vc->vc_font.width > FBCON_SWAP(var->rotate, var->xres, var->yres) || + vc->vc_font.height > FBCON_SWAP(var->rotate, var->yres, var->xres)) + return -EINVAL; + } + + return 0; +} +EXPORT_SYMBOL_GPL(fbcon_modechange_possible); + int fbcon_mode_deleted(struct fb_info *info, struct fb_videomode *mode) { diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c index 00939ca2065a..d2cffaf3fda8 100644 --- a/drivers/video/fbdev/core/fbmem.c +++ b/drivers/video/fbdev/core/fbmem.c @@ -1109,7 +1109,9 @@ static long do_fb_ioctl(struct fb_info *info, unsigned int cmd, return -EFAULT; console_lock(); lock_fb_info(info); - ret = fb_set_var(info, &var); + ret = fbcon_modechange_possible(info, &var); + if (!ret) + ret = fb_set_var(info, &var); if (!ret) fbcon_update_vcs(info, var.activate & FB_ACTIVATE_ALL); unlock_fb_info(info); diff --git a/include/linux/fbcon.h b/include/linux/fbcon.h index ff5596dd30f8..2382dec6d6ab 100644 --- a/include/linux/fbcon.h +++ b/include/linux/fbcon.h @@ -15,6 +15,8 @@ void fbcon_new_modelist(struct fb_info *info); void fbcon_get_requirement(struct fb_info *info, struct fb_blit_caps *caps); void fbcon_fb_blanked(struct fb_info *info, int blank); +int fbcon_modechange_possible(struct fb_info *info, + struct fb_var_screeninfo *var); void fbcon_update_vcs(struct fb_info *info, bool all); void fbcon_remap_all(struct fb_info *info); int fbcon_set_con2fb_map_ioctl(void __user *argp); @@ -33,6 +35,8 @@ static inline void fbcon_new_modelist(struct fb_info *info) {} static inline void fbcon_get_requirement(struct fb_info *info, struct fb_blit_caps *caps) {} static inline void fbcon_fb_blanked(struct fb_info *info, int blank) {} +static inline int fbcon_modechange_possible(struct fb_info *info, + struct fb_var_screeninfo *var) { return 0; } static inline void fbcon_update_vcs(struct fb_info *info, bool all) {} static inline void fbcon_remap_all(struct fb_info *info) {} static inline int fbcon_set_con2fb_map_ioctl(void __user *argp) { return 0; } -- Gitee From 9ba05bbca9335251bfdaa95a344034c31969c8a0 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Thu, 28 Jul 2022 18:06:39 +0800 Subject: [PATCH 651/674] fbmem: Check virtual screen sizes in fb_set_var() stable inclusion from stable-v5.10.130 commit b81212828ad19ab3eccf00626cd04099215060bf category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5IQ4M CVE: CVE-2021-33655 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=b81212828ad19ab3eccf00626cd04099215060bf -------------------------------- commit 6c11df58fd1ac0aefcb3b227f72769272b939e56 upstream. Verify that the fbdev or drm driver correctly adjusted the virtual screen sizes. On failure report the failing driver and reject the screen size change. Signed-off-by: Helge Deller Reviewed-by: Geert Uytterhoeven Cc: stable@vger.kernel.org # v5.4+ Signed-off-by: Greg Kroah-Hartman Signed-off-by: Chen Jun Reviewed-by: Xiu Jianfeng Reviewed-by: Weilong Chen Signed-off-by: Zheng Zengkai --- drivers/video/fbdev/core/fbmem.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c index d2cffaf3fda8..3b3ccb235522 100644 --- a/drivers/video/fbdev/core/fbmem.c +++ b/drivers/video/fbdev/core/fbmem.c @@ -1019,6 +1019,16 @@ fb_set_var(struct fb_info *info, struct fb_var_screeninfo *var) if (ret) return ret; + /* verify that virtual resolution >= physical resolution */ + if (var->xres_virtual < var->xres || + var->yres_virtual < var->yres) { + pr_warn("WARNING: fbcon: Driver '%s' missed to adjust virtual screen size (%ux%u vs. %ux%u)\n", + info->fix.id, + var->xres_virtual, var->yres_virtual, + var->xres, var->yres); + return -EINVAL; + } + if ((var->activate & FB_ACTIVATE_MASK) != FB_ACTIVATE_NOW) return 0; -- Gitee From 89e9ad6dfaef94bddf0aa0dcd1e2c0bd69dcbb57 Mon Sep 17 00:00:00 2001 From: Eric Snowberg Date: Thu, 28 Jul 2022 18:06:40 +0800 Subject: [PATCH 652/674] lockdown: Fix kexec lockdown bypass with ima policy mainline inclusion from mainline-v5.19-rc8 commit 543ce63b664e2c2f9533d089a4664b559c3e6b5b category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5I0FP CVE: CVE-2022-21505 Reference: https://seclists.org/oss-sec/2022/q3/57 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=543ce63b664e2c2f9533d089a4664b559c3e6b5b -------------------------------- The lockdown LSM is primarily used in conjunction with UEFI Secure Boot. This LSM may also be used on machines without UEFI. It can also be enabled when UEFI Secure Boot is disabled. One of lockdown's features is to prevent kexec from loading untrusted kernels. Lockdown can be enabled through a bootparam or after the kernel has booted through securityfs. If IMA appraisal is used with the "ima_appraise=log" boot param, lockdown can be defeated with kexec on any machine when Secure Boot is disabled or unavailable. IMA prevents setting "ima_appraise=log" from the boot param when Secure Boot is enabled, but this does not cover cases where lockdown is used without Secure Boot. To defeat lockdown, boot without Secure Boot and add ima_appraise=log to the kernel command line; then: $ echo "integrity" > /sys/kernel/security/lockdown $ echo "appraise func=KEXEC_KERNEL_CHECK appraise_type=imasig" > \ /sys/kernel/security/ima/policy $ kexec -ls unsigned-kernel Add a call to verify ima appraisal is set to "enforce" whenever lockdown is enabled. This fixes CVE-2022-21505. Cc: stable@vger.kernel.org Fixes: 29d3c1c8dfe7 ("kexec: Allow kexec_file() with appropriate IMA policy when locked down") Signed-off-by: Eric Snowberg Acked-by: Mimi Zohar Reviewed-by: John Haxby Signed-off-by: Linus Torvalds Signed-off-by: GUO Zihua Reviewed-by: Xiu Jianfeng Reviewed-by: Wang Weiyang Signed-off-by: Zheng Zengkai --- security/integrity/ima/ima_policy.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c index 5a06050729a7..b1ab4b3d99fb 100644 --- a/security/integrity/ima/ima_policy.c +++ b/security/integrity/ima/ima_policy.c @@ -1900,6 +1900,10 @@ bool ima_appraise_signature(enum kernel_read_file_id id) if (id >= READING_MAX_ID) return false; + if (id == READING_KEXEC_IMAGE && !(ima_appraise & IMA_APPRAISE_ENFORCE) + && security_locked_down(LOCKDOWN_KEXEC)) + return false; + func = read_idmap[id] ?: FILE_CHECK; rcu_read_lock(); -- Gitee From 6658b33b26920a82df3203c6ecefd00b5820e921 Mon Sep 17 00:00:00 2001 From: ZhaoLong Wang Date: Thu, 28 Jul 2022 18:06:41 +0800 Subject: [PATCH 653/674] ubifs: Fix the issue that UBIFS be read-only due to truncate in the encrypted directory hulk inclusion category: bugfix bugzilla: 187163, https://gitee.com/openeuler/kernel/issues/I5GBC4 CVE: NA -------------------------------- The ubifs_compress() function does not compress the data When the data length is short than 128 bytes or the compressed data length is not ideal.It cause that the compressed length of the truncated data in the truncate_data_node() function may be greater than the length of the raw data read from the flash. The above two lengths are transferred to the ubifs_encrypt() function as parameters. This may lead to assertion fails and then the file system becomes read-only. This patch use the actual length of the data in the memory as the input parameter for assert comparison, which avoids the problem. Signed-off-by: ZhaoLong Wang Reviewed-by: zhihao Cheng Reviewed-by: Zhang Yi Signed-off-by: Zheng Zengkai --- fs/ubifs/crypto.c | 11 +++++++++++ fs/ubifs/journal.c | 29 +++++++++++++++++------------ 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c index 22be7aeb96c4..1dc22cf29c65 100644 --- a/fs/ubifs/crypto.c +++ b/fs/ubifs/crypto.c @@ -24,6 +24,17 @@ static bool ubifs_crypt_empty_dir(struct inode *inode) return ubifs_check_dir_empty(inode) == 0; } +/** + * ubifs_encrypt - Encrypt data. + * @inode: inode which refers to the data node + * @dn: data node to encrypt + * @in_len: length of data to be compressed + * @out_len: allocated memory size for the data area of @dn + * @block: logical block number of the block + * + * This function encrypt a possibly-compressed data in the data node. + * The encrypted data length will store in @out_len. + */ int ubifs_encrypt(const struct inode *inode, struct ubifs_data_node *dn, unsigned int in_len, unsigned int *out_len, int block) { diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 72586512e51f..ee9888087983 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -1472,23 +1472,25 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, * @block: data block number * @dn: data node to re-compress * @new_len: new length + * @dn_size: size of the data node @dn in memory * * This function is used when an inode is truncated and the last data node of * the inode has to be re-compressed/encrypted and re-written. */ static int truncate_data_node(const struct ubifs_info *c, const struct inode *inode, unsigned int block, struct ubifs_data_node *dn, - int *new_len) + int *new_len, int dn_size) { void *buf; - int err, dlen, compr_type, out_len, old_dlen; + int err, dlen, compr_type, out_len, data_size; out_len = le32_to_cpu(dn->size); buf = kmalloc_array(out_len, WORST_COMPR_FACTOR, GFP_NOFS); if (!buf) return -ENOMEM; - dlen = old_dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; + dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; + data_size = dn_size - UBIFS_DATA_NODE_SZ; compr_type = le16_to_cpu(dn->compr_type); if (IS_ENCRYPTED(inode)) { @@ -1508,11 +1510,11 @@ static int truncate_data_node(const struct ubifs_info *c, const struct inode *in } if (IS_ENCRYPTED(inode)) { - err = ubifs_encrypt(inode, dn, out_len, &old_dlen, block); + err = ubifs_encrypt(inode, dn, out_len, &data_size, block); if (err) goto out; - out_len = old_dlen; + out_len = data_size; } else { dn->compr_size = 0; } @@ -1549,7 +1551,7 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, struct ubifs_ino_node *ino; struct ubifs_trun_node *trun; struct ubifs_data_node *dn; - int err, dlen, len, lnum, offs, bit, sz, sync = IS_SYNC(inode); + int err, dlen, len, lnum, offs, bit, sz, dn_size, sync = IS_SYNC(inode); struct ubifs_inode *ui = ubifs_inode(inode); ino_t inum = inode->i_ino; unsigned int blk; @@ -1562,10 +1564,13 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, ubifs_assert(c, S_ISREG(inode->i_mode)); ubifs_assert(c, mutex_is_locked(&ui->ui_mutex)); - sz = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ + - UBIFS_MAX_DATA_NODE_SZ * WORST_COMPR_FACTOR; + dn_size = COMPRESSED_DATA_NODE_BUF_SZ; + + if (IS_ENCRYPTED(inode)) + dn_size += UBIFS_CIPHER_BLOCK_SIZE; - sz += ubifs_auth_node_sz(c); + sz = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ + + dn_size + ubifs_auth_node_sz(c); ino = kmalloc(sz, GFP_NOFS); if (!ino) @@ -1596,15 +1601,15 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, if (dn_len <= 0 || dn_len > UBIFS_BLOCK_SIZE) { ubifs_err(c, "bad data node (block %u, inode %lu)", blk, inode->i_ino); - ubifs_dump_node(c, dn, sz - UBIFS_INO_NODE_SZ - - UBIFS_TRUN_NODE_SZ); + ubifs_dump_node(c, dn, dn_size); goto out_free; } if (dn_len <= dlen) dlen = 0; /* Nothing to do */ else { - err = truncate_data_node(c, inode, blk, dn, &dlen); + err = truncate_data_node(c, inode, blk, dn, + &dlen, dn_size); if (err) goto out_free; } -- Gitee From baf97eb4ed59819bb8e2d59570057de458a351d1 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Thu, 28 Jul 2022 18:06:42 +0800 Subject: [PATCH 654/674] ubifs: Fix AA deadlock when setting xattr for encrypted file hulk inclusion category: bugfix bugzilla: 187250, https://gitee.com/openeuler/kernel/issues/I5HSMS CVE: NA ------------------------------------------------- Following process: vfs_setxattr(host) ubifs_xattr_set down_write(host_ui->xattr_sem) <- lock first time create_xattr ubifs_new_inode(host) fscrypt_prepare_new_inode(host) fscrypt_policy_to_inherit(host) if (IS_ENCRYPTED(inode)) fscrypt_require_key(host) fscrypt_get_encryption_info(host) ubifs_xattr_get(host) down_read(host_ui->xattr_sem) <- AA deadlock , which may trigger an AA deadlock problem: [ 102.620871] INFO: task setfattr:1599 blocked for more than 10 seconds. [ 102.625298] Not tainted 5.19.0-rc7-00001-gb666b6823ce0-dirty #711 [ 102.628732] task:setfattr state:D stack: 0 pid: 1599 [ 102.628749] Call Trace: [ 102.628753] [ 102.628776] __schedule+0x482/0x1060 [ 102.629964] schedule+0x92/0x1a0 [ 102.629976] rwsem_down_read_slowpath+0x287/0x8c0 [ 102.629996] down_read+0x84/0x170 [ 102.630585] ubifs_xattr_get+0xd1/0x370 [ubifs] [ 102.630730] ubifs_crypt_get_context+0x1f/0x30 [ubifs] [ 102.630791] fscrypt_get_encryption_info+0x7d/0x1c0 [ 102.630810] fscrypt_policy_to_inherit+0x56/0xc0 [ 102.630817] fscrypt_prepare_new_inode+0x35/0x160 [ 102.630830] ubifs_new_inode+0xcc/0x4b0 [ubifs] [ 102.630873] ubifs_xattr_set+0x591/0x9f0 [ubifs] [ 102.630961] xattr_set+0x8c/0x3e0 [ubifs] [ 102.631003] __vfs_setxattr+0x71/0xc0 [ 102.631026] vfs_setxattr+0x105/0x270 [ 102.631034] do_setxattr+0x6d/0x110 [ 102.631041] setxattr+0xa0/0xd0 [ 102.631087] __x64_sys_setxattr+0x2f/0x40 Fetch a reproducer in [Link]. Just like ext4 does, which skips encrypting for inode with EXT4_EA_INODE_FL flag. Stop encypting xattr inode for ubifs. Link: https://bugzilla.kernel.org/show_bug.cgi?id=216260 Fixes: f4e3634a3b64222 ("ubifs: Fix races between xattr_{set|get} ...") Fixes: d475a507457b5ca ("ubifs: Add skeleton for fscrypto") Signed-off-by: Zhihao Cheng Reviewed-by: Zhang Yi Signed-off-by: Zheng Zengkai --- fs/ubifs/dir.c | 25 ++++++++++++++----------- fs/ubifs/ubifs.h | 2 +- fs/ubifs/xattr.c | 2 +- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index f5777f59a101..acd7e83a35e4 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -68,13 +68,14 @@ static int inherit_flags(const struct inode *dir, umode_t mode) * @c: UBIFS file-system description object * @dir: parent directory inode * @mode: inode mode flags + * @is_xattr: whether the inode is xattr inode * * This function finds an unused inode number, allocates new inode and * initializes it. Returns new inode in case of success and an error code in * case of failure. */ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, - umode_t mode) + umode_t mode, bool is_xattr) { int err; struct inode *inode; @@ -99,10 +100,12 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, current_time(inode); inode->i_mapping->nrpages = 0; - err = fscrypt_prepare_new_inode(dir, inode, &encrypted); - if (err) { - ubifs_err(c, "fscrypt_prepare_new_inode failed: %i", err); - goto out_iput; + if (!is_xattr) { + err = fscrypt_prepare_new_inode(dir, inode, &encrypted); + if (err) { + ubifs_err(c, "fscrypt_prepare_new_inode failed: %i", err); + goto out_iput; + } } switch (mode & S_IFMT) { @@ -308,7 +311,7 @@ static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode, sz_change = CALC_DENT_SIZE(fname_len(&nm)); - inode = ubifs_new_inode(c, dir, mode); + inode = ubifs_new_inode(c, dir, mode, false); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_fname; @@ -369,7 +372,7 @@ static struct inode *create_whiteout(struct inode *dir, struct dentry *dentry) if (err) return ERR_PTR(err); - inode = ubifs_new_inode(c, dir, mode); + inode = ubifs_new_inode(c, dir, mode, false); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_free; @@ -461,7 +464,7 @@ static int ubifs_tmpfile(struct inode *dir, struct dentry *dentry, return err; } - inode = ubifs_new_inode(c, dir, mode); + inode = ubifs_new_inode(c, dir, mode, false); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_budg; @@ -1002,7 +1005,7 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) sz_change = CALC_DENT_SIZE(fname_len(&nm)); - inode = ubifs_new_inode(c, dir, S_IFDIR | mode); + inode = ubifs_new_inode(c, dir, S_IFDIR | mode, false); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_fname; @@ -1089,7 +1092,7 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry, sz_change = CALC_DENT_SIZE(fname_len(&nm)); - inode = ubifs_new_inode(c, dir, mode); + inode = ubifs_new_inode(c, dir, mode, false); if (IS_ERR(inode)) { kfree(dev); err = PTR_ERR(inode); @@ -1171,7 +1174,7 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry, sz_change = CALC_DENT_SIZE(fname_len(&nm)); - inode = ubifs_new_inode(c, dir, S_IFLNK | S_IRWXUGO); + inode = ubifs_new_inode(c, dir, S_IFLNK | S_IRWXUGO, false); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_fname; diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 9a4a3191ed07..15cfee7c1125 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1996,7 +1996,7 @@ int ubifs_update_time(struct inode *inode, struct timespec64 *time, int flags); /* dir.c */ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, - umode_t mode); + umode_t mode, bool is_xattr); int ubifs_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); int ubifs_check_dir_empty(struct inode *dir); diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 258b97939ba2..81efe638acd5 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -110,7 +110,7 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, if (err) return err; - inode = ubifs_new_inode(c, host, S_IFREG | S_IRWXUGO); + inode = ubifs_new_inode(c, host, S_IFREG | S_IRWXUGO, true); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_budg; -- Gitee From 156e60a42ffffe594f7cf38df51ee07b7f0fc99c Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Thu, 28 Jul 2022 18:06:43 +0800 Subject: [PATCH 655/674] Revert "mm/dynamic_hugetlb: disable dynamic hugetlb if hugetlb_vmemmap is enabled" hulk inclusion category: bugfix bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA -------------------------------- Will disable hugetlb_vmemmap when dynamic hugetlb is enabled in later patch. This reverts commit c7ae7c0dd37aa112f9cb3e23878f4c5fe67f86bc. Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- mm/dynamic_hugetlb.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/mm/dynamic_hugetlb.c b/mm/dynamic_hugetlb.c index eb9b528b73de..f8ebc8ab7d60 100644 --- a/mm/dynamic_hugetlb.c +++ b/mm/dynamic_hugetlb.c @@ -1133,17 +1133,6 @@ void __init dynamic_hugetlb_init(void) if (!enable_dhugetlb) return; - /* - * The dynamic_hugetlb feature need to split and merge pages frequently. - * hugetlb_vmemmap will affects the perforemance of page split and merge. - * If want to use dynamic hugetlb, please disable hugetlb_vmemmap. - */ - if (hugetlb_free_vmemmap_enabled) { - enable_dhugetlb = false; - pr_info("Please set hugetlb_free_vmemmap=off if want to enable dynamic hugetlb\n"); - return; - } - count = max(hugepage_index(max_pfn), (unsigned long)DEFAULT_PAGELIST_COUNT); size = sizeof(struct dhugetlb_pagelist) + count * sizeof(struct dhugetlb_pool *); dhugetlb_pagelist_t = kzalloc(size, GFP_KERNEL); -- Gitee From 2706bcc0550ce47604e18a68f6bd8d0f78f0136d Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 28 Jul 2022 18:06:44 +0800 Subject: [PATCH 656/674] jump_label: Provide CONFIG-driven build state defaults mainline inclusion from mainline-v5.13-rc1 commit 0d66ccc1627013c95f1e7ef10b95b8451cd7834e category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0d66ccc1627013c95f1e7ef10b95b8451cd7834e -------------------------------- As shown in the comment in jump_label.h, choosing the initial state of static branches changes the assembly layout. If the condition is expected to be likely it's inline, and if unlikely it is out of line via a jump. A few places in the kernel use (or could be using) a CONFIG to choose the default state, which would give a small performance benefit to their compile-time declared default. Provide the infrastructure to do this. Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210401232347.2791257-2-keescook@chromium.org Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- include/linux/jump_label.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 7e1dce5670fc..d01c23025af0 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -388,6 +388,21 @@ struct static_key_false { [0 ... (count) - 1] = STATIC_KEY_FALSE_INIT, \ } +#define _DEFINE_STATIC_KEY_1(name) DEFINE_STATIC_KEY_TRUE(name) +#define _DEFINE_STATIC_KEY_0(name) DEFINE_STATIC_KEY_FALSE(name) +#define DEFINE_STATIC_KEY_MAYBE(cfg, name) \ + __PASTE(_DEFINE_STATIC_KEY_, IS_ENABLED(cfg))(name) + +#define _DEFINE_STATIC_KEY_RO_1(name) DEFINE_STATIC_KEY_TRUE_RO(name) +#define _DEFINE_STATIC_KEY_RO_0(name) DEFINE_STATIC_KEY_FALSE_RO(name) +#define DEFINE_STATIC_KEY_MAYBE_RO(cfg, name) \ + __PASTE(_DEFINE_STATIC_KEY_RO_, IS_ENABLED(cfg))(name) + +#define _DECLARE_STATIC_KEY_1(name) DECLARE_STATIC_KEY_TRUE(name) +#define _DECLARE_STATIC_KEY_0(name) DECLARE_STATIC_KEY_FALSE(name) +#define DECLARE_STATIC_KEY_MAYBE(cfg, name) \ + __PASTE(_DECLARE_STATIC_KEY_, IS_ENABLED(cfg))(name) + extern bool ____wrong_branch_error(void); #define static_key_enabled(x) \ @@ -488,6 +503,10 @@ extern bool ____wrong_branch_error(void); #endif /* CONFIG_JUMP_LABEL */ +#define static_branch_maybe(config, x) \ + (IS_ENABLED(config) ? static_branch_likely(x) \ + : static_branch_unlikely(x)) + /* * Advanced usage; refcount, branch is enabled when: count != 0 */ -- Gitee From 2722aa58c0fe3bd8c73b80786b1120e317c41365 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 28 Jul 2022 18:06:45 +0800 Subject: [PATCH 657/674] mm: make compound_head const-preserving mainline inclusion from mainline-v5.14-rc1 commit 0f2317e34e2c7b97efd4600122115410795ebeea category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0f2317e34e2c7b97efd4600122115410795ebeea -------------------------------- If you pass a const pointer to compound_head(), you get a const pointer back; if you pass a mutable pointer, you get a mutable pointer back. Also remove an unnecessary forward definition of struct page; we're about to dereference page->compound_head, so it must already have been defined. Link: https://lkml.kernel.org/r/20210416231531.2521383-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Vlastimil Babka Reviewed-by: Anshuman Khandual Reviewed-by: William Kucharski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- include/linux/page-flags.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 65e1cbe1d1ce..b40b4e0ada31 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -194,17 +194,17 @@ enum pageflags { #ifndef __GENERATING_BOUNDS_H -struct page; /* forward declaration */ - -static inline struct page *compound_head(struct page *page) +static inline unsigned long _compound_head(const struct page *page) { unsigned long head = READ_ONCE(page->compound_head); if (unlikely(head & 1)) - return (struct page *) (head - 1); - return page; + return head - 1; + return (unsigned long)page; } +#define compound_head(page) ((typeof(page))_compound_head(page)) + static __always_inline int PageTail(struct page *page) { return READ_ONCE(page->compound_head) & 1; -- Gitee From fe0aed0230fbd8246fb028d126548fa69b4f9bf4 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Jul 2022 18:06:46 +0800 Subject: [PATCH 658/674] mm: hugetlb: free the 2nd vmemmap page associated with each HugeTLB page mainline inclusion from mainline-v5.18-rc1 commit e7d324850bfcb30df563d144c0363cc44595277d category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e7d324850bfcb30df563d144c0363cc44595277d -------------------------------- Patch series "Free the 2nd vmemmap page associated with each HugeTLB page", v7. This series can minimize the overhead of struct page for 2MB HugeTLB pages significantly. It further reduces the overhead of struct page by 12.5% for a 2MB HugeTLB compared to the previous approach, which means 2GB per 1TB HugeTLB. It is a nice gain. Comments and reviews are welcome. Thanks. The main implementation and details can refer to the commit log of patch 1. In this series, I have changed the following four helpers, the following table shows the impact of the overhead of those helpers. +------------------+-----------------------+ | APIs | head page | tail page | +------------------+-----------+-----------+ | PageHead() | Y | N | +------------------+-----------+-----------+ | PageTail() | Y | N | +------------------+-----------+-----------+ | PageCompound() | N | N | +------------------+-----------+-----------+ | compound_head() | Y | N | +------------------+-----------+-----------+ Y: Overhead is increased. N: Overhead is _NOT_ increased. It shows that the overhead of those helpers on a tail page don't change between "hugetlb_free_vmemmap=on" and "hugetlb_free_vmemmap=off". But the overhead on a head page will be increased when "hugetlb_free_vmemmap=on" (except PageCompound()). So I believe that Matthew Wilcox's folio series will help with this. The users of PageHead() and PageTail() are much less than compound_head() and most users of PageTail() are VM_BUG_ON(), so I have done some tests about the overhead of compound_head() on head pages. I have tested the overhead of calling compound_head() on a head page, which is 2.11ns (Measure the call time of 10 million times compound_head(), and then average). For a head page whose address is not aligned with PAGE_SIZE or a non-compound page, the overhead of compound_head() is 2.54ns which is increased by 20%. For a head page whose address is aligned with PAGE_SIZE, the overhead of compound_head() is 2.97ns which is increased by 40%. Most pages are the former. I do not think the overhead is significant since the overhead of compound_head() itself is low. This patch (of 5): This patch minimizes the overhead of struct page for 2MB HugeTLB pages significantly. It further reduces the overhead of struct page by 12.5% for a 2MB HugeTLB compared to the previous approach, which means 2GB per 1TB HugeTLB (2MB type). After the feature of "Free sonme vmemmap pages of HugeTLB page" is enabled, the mapping of the vmemmap addresses associated with a 2MB HugeTLB page becomes the figure below. HugeTLB struct pages(8 pages) page frame(8 pages) +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+---> PG_head | | | 0 | -------------> | 0 | | | +-----------+ +-----------+ | | | 1 | -------------> | 1 | | | +-----------+ +-----------+ | | | 2 | ----------------^ ^ ^ ^ ^ ^ | | +-----------+ | | | | | | | | 3 | ------------------+ | | | | | | +-----------+ | | | | | | | 4 | --------------------+ | | | | 2MB | +-----------+ | | | | | | 5 | ----------------------+ | | | | +-----------+ | | | | | 6 | ------------------------+ | | | +-----------+ | | | | 7 | --------------------------+ | | +-----------+ | | | | | | +-----------+ As we can see, the 2nd vmemmap page frame (indexed by 1) is reused and remaped. However, the 2nd vmemmap page frame is also can be freed to the buddy allocator, then we can change the mapping from the figure above to the figure below. HugeTLB struct pages(8 pages) page frame(8 pages) +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+---> PG_head | | | 0 | -------------> | 0 | | | +-----------+ +-----------+ | | | 1 | ---------------^ ^ ^ ^ ^ ^ ^ | | +-----------+ | | | | | | | | | 2 | -----------------+ | | | | | | | +-----------+ | | | | | | | | 3 | -------------------+ | | | | | | +-----------+ | | | | | | | 4 | ---------------------+ | | | | 2MB | +-----------+ | | | | | | 5 | -----------------------+ | | | | +-----------+ | | | | | 6 | -------------------------+ | | | +-----------+ | | | | 7 | ---------------------------+ | | +-----------+ | | | | | | +-----------+ After we do this, all tail vmemmap pages (1-7) are mapped to the head vmemmap page frame (0). In other words, there are more than one page struct with PG_head associated with each HugeTLB page. We __know__ that there is only one head page struct, the tail page structs with PG_head are fake head page structs. We need an approach to distinguish between those two different types of page structs so that compound_head(), PageHead() and PageTail() can work properly if the parameter is the tail page struct but with PG_head. The following code snippet describes how to distinguish between real and fake head page struct. if (test_bit(PG_head, &page->flags)) { unsigned long head = READ_ONCE(page[1].compound_head); if (head & 1) { if (head == (unsigned long)page + 1) ==> head page struct else ==> tail page struct } else ==> head page struct } We can safely access the field of the @page[1] with PG_head because the @page is a compound page composed with at least two contiguous pages. [songmuchun@bytedance.com: restore lost comment changes] Link: https://lkml.kernel.org/r/20211101031651.75851-1-songmuchun@bytedance.com Link: https://lkml.kernel.org/r/20211101031651.75851-2-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Barry Song Cc: Mike Kravetz Cc: Oscar Salvador Cc: Michal Hocko Cc: David Hildenbrand Cc: Chen Huang Cc: Bodeddula Balasubramaniam Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Xiongchun Duan Cc: Fam Zheng Cc: Qi Zheng Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Conflicts: include/linux/page-flags.h Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- .../admin-guide/kernel-parameters.txt | 2 +- include/linux/page-flags.h | 73 ++++++++++++++++++- mm/hugetlb_vmemmap.c | 62 +++++++++------- mm/sparse-vmemmap.c | 21 ++++++ 4 files changed, 125 insertions(+), 33 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 2b04cf8fbab4..e0957b73f63d 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1596,7 +1596,7 @@ [KNL] Reguires CONFIG_HUGETLB_PAGE_FREE_VMEMMAP enabled. Allows heavy hugetlb users to free up some more - memory (6 * PAGE_SIZE for each 2MB hugetlb page). + memory (7 * PAGE_SIZE for each 2MB hugetlb page). Format: { on | off (default) } on: enable the feature diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index b40b4e0ada31..7b31a81be7e1 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -194,25 +194,82 @@ enum pageflags { #ifndef __GENERATING_BOUNDS_H +#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP +extern bool hugetlb_free_vmemmap_enabled; + +/* + * If the feature of freeing some vmemmap pages associated with each HugeTLB + * page is enabled, the head vmemmap page frame is reused and all of the tail + * vmemmap addresses map to the head vmemmap page frame (furture details can + * refer to the figure at the head of the mm/hugetlb_vmemmap.c). In other + * words, there are more than one page struct with PG_head associated with each + * HugeTLB page. We __know__ that there is only one head page struct, the tail + * page structs with PG_head are fake head page structs. We need an approach + * to distinguish between those two different types of page structs so that + * compound_head() can return the real head page struct when the parameter is + * the tail page struct but with PG_head. + * + * The page_fixed_fake_head() returns the real head page struct if the @page is + * fake page head, otherwise, returns @page which can either be a true page + * head or tail. + */ +static __always_inline const struct page *page_fixed_fake_head(const struct page *page) +{ + if (!hugetlb_free_vmemmap_enabled) + return page; + + /* + * Only addresses aligned with PAGE_SIZE of struct page may be fake head + * struct page. The alignment check aims to avoid access the fields ( + * e.g. compound_head) of the @page[1]. It can avoid touch a (possibly) + * cold cacheline in some cases. + */ + if (IS_ALIGNED((unsigned long)page, PAGE_SIZE) && + test_bit(PG_head, &page->flags)) { + /* + * We can safely access the field of the @page[1] with PG_head + * because the @page is a compound page composed with at least + * two contiguous pages. + */ + unsigned long head = READ_ONCE(page[1].compound_head); + + if (likely(head & 1)) + return (const struct page *)(head - 1); + } + return page; +} +#else +static inline const struct page *page_fixed_fake_head(const struct page *page) +{ + return page; +} +#endif + +static __always_inline int page_is_fake_head(struct page *page) +{ + return page_fixed_fake_head(page) != page; +} + static inline unsigned long _compound_head(const struct page *page) { unsigned long head = READ_ONCE(page->compound_head); if (unlikely(head & 1)) return head - 1; - return (unsigned long)page; + return (unsigned long)page_fixed_fake_head(page); } #define compound_head(page) ((typeof(page))_compound_head(page)) static __always_inline int PageTail(struct page *page) { - return READ_ONCE(page->compound_head) & 1; + return READ_ONCE(page->compound_head) & 1 || page_is_fake_head(page); } static __always_inline int PageCompound(struct page *page) { - return test_bit(PG_head, &page->flags) || PageTail(page); + return test_bit(PG_head, &page->flags) || + READ_ONCE(page->compound_head) & 1; } #define PAGE_POISON_PATTERN -1l @@ -600,7 +657,15 @@ static inline void set_page_writeback_keepwrite(struct page *page) test_set_page_writeback_keepwrite(page); } -__PAGEFLAG(Head, head, PF_ANY) CLEARPAGEFLAG(Head, head, PF_ANY) +static __always_inline int PageHead(struct page *page) +{ + PF_POISONED_CHECK(page); + return test_bit(PG_head, &page->flags) && !page_is_fake_head(page); +} + +__SETPAGEFLAG(Head, head, PF_ANY) +__CLEARPAGEFLAG(Head, head, PF_ANY) +CLEARPAGEFLAG(Head, head, PF_ANY) static __always_inline void set_compound_head(struct page *page, struct page *head) { diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index c540c21e26f5..4977f5a520c2 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -124,9 +124,9 @@ * page of page structs (page 0) associated with the HugeTLB page contains the 4 * page structs necessary to describe the HugeTLB. The only use of the remaining * pages of page structs (page 1 to page 7) is to point to page->compound_head. - * Therefore, we can remap pages 2 to 7 to page 1. Only 2 pages of page structs + * Therefore, we can remap pages 1 to 7 to page 0. Only 1 page of page structs * will be used for each HugeTLB page. This will allow us to free the remaining - * 6 pages to the buddy allocator. + * 7 pages to the buddy allocator. * * Here is how things look after remapping. * @@ -134,30 +134,30 @@ * +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+ * | | | 0 | -------------> | 0 | * | | +-----------+ +-----------+ - * | | | 1 | -------------> | 1 | - * | | +-----------+ +-----------+ - * | | | 2 | ----------------^ ^ ^ ^ ^ ^ - * | | +-----------+ | | | | | - * | | | 3 | ------------------+ | | | | - * | | +-----------+ | | | | - * | | | 4 | --------------------+ | | | - * | PMD | +-----------+ | | | - * | level | | 5 | ----------------------+ | | - * | mapping | +-----------+ | | - * | | | 6 | ------------------------+ | - * | | +-----------+ | - * | | | 7 | --------------------------+ + * | | | 1 | ---------------^ ^ ^ ^ ^ ^ ^ + * | | +-----------+ | | | | | | + * | | | 2 | -----------------+ | | | | | + * | | +-----------+ | | | | | + * | | | 3 | -------------------+ | | | | + * | | +-----------+ | | | | + * | | | 4 | ---------------------+ | | | + * | PMD | +-----------+ | | | + * | level | | 5 | -----------------------+ | | + * | mapping | +-----------+ | | + * | | | 6 | -------------------------+ | + * | | +-----------+ | + * | | | 7 | ---------------------------+ * | | +-----------+ * | | * | | * | | * +-----------+ * - * When a HugeTLB is freed to the buddy system, we should allocate 6 pages for + * When a HugeTLB is freed to the buddy system, we should allocate 7 pages for * vmemmap pages and restore the previous mapping relationship. * * For the HugeTLB page of the pud level mapping. It is similar to the former. - * We also can use this approach to free (PAGE_SIZE - 2) vmemmap pages. + * We also can use this approach to free (PAGE_SIZE - 1) vmemmap pages. * * Apart from the HugeTLB page of the pmd/pud level mapping, some architectures * (e.g. aarch64) provides a contiguous bit in the translation table entries @@ -166,7 +166,13 @@ * * The contiguous bit is used to increase the mapping size at the pmd and pte * (last) level. So this type of HugeTLB page can be optimized only when its - * size of the struct page structs is greater than 2 pages. + * size of the struct page structs is greater than 1 page. + * + * Notice: The head vmemmap page is not freed to the buddy allocator and all + * tail vmemmap pages are mapped to the head vmemmap page frame. So we can see + * more than one struct page struct with PG_head (e.g. 8 per 2 MB HugeTLB page) + * associated with each HugeTLB page. The compound_head() can handle this + * correctly (more details refer to the comment above compound_head()). */ #define pr_fmt(fmt) "HugeTLB: " fmt @@ -175,19 +181,21 @@ /* * There are a lot of struct page structures associated with each HugeTLB page. * For tail pages, the value of compound_head is the same. So we can reuse first - * page of tail page structures. We map the virtual addresses of the remaining - * pages of tail page structures to the first tail page struct, and then free - * these page frames. Therefore, we need to reserve two pages as vmemmap areas. + * page of head page structures. We map the virtual addresses of all the pages + * of tail page structures to the head page struct, and then free these page + * frames. Therefore, we need to reserve one pages as vmemmap areas. */ -#define RESERVE_VMEMMAP_NR 2U +#define RESERVE_VMEMMAP_NR 1U #define RESERVE_VMEMMAP_SIZE (RESERVE_VMEMMAP_NR << PAGE_SHIFT) -bool hugetlb_free_vmemmap_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON); +bool hugetlb_free_vmemmap_enabled __read_mostly = + IS_ENABLED(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON); +EXPORT_SYMBOL(hugetlb_free_vmemmap_enabled); static int __init early_hugetlb_free_vmemmap_param(char *buf) { /* We cannot optimize if a "struct page" crosses page boundaries. */ - if ((!is_power_of_2(sizeof(struct page)))) { + if (!is_power_of_2(sizeof(struct page))) { pr_warn("cannot free vmemmap pages because \"struct page\" crosses page boundaries\n"); return 0; } @@ -236,7 +244,6 @@ int alloc_huge_page_vmemmap(struct hstate *h, struct page *head) */ ret = vmemmap_remap_alloc(vmemmap_addr, vmemmap_end, vmemmap_reuse, GFP_KERNEL | __GFP_NORETRY | __GFP_THISNODE); - if (!ret) ClearHPageVmemmapOptimized(head); @@ -282,9 +289,8 @@ void __init hugetlb_vmemmap_init(struct hstate *h) vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT; /* - * The head page and the first tail page are not to be freed to buddy - * allocator, the other pages will map to the first tail page, so they - * can be freed. + * The head page is not to be freed to buddy allocator, the other tail + * pages will map to the head page, so they can be freed. * * Could RESERVE_VMEMMAP_NR be greater than @vmemmap_pages? It is true * on some architectures (e.g. aarch64). See Documentation/arm64/ diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 396a49462894..39bdbf0b28dd 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -245,6 +245,26 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, set_pte_at(&init_mm, addr, pte, entry); } +/* + * How many struct page structs need to be reset. When we reuse the head + * struct page, the special metadata (e.g. page->flags or page->mapping) + * cannot copy to the tail struct page structs. The invalid value will be + * checked in the free_tail_pages_check(). In order to avoid the message + * of "corrupted mapping in tail page". We need to reset at least 3 (one + * head struct page struct and two tail struct page structs) struct page + * structs. + */ +#define NR_RESET_STRUCT_PAGE 3 + +static inline void reset_struct_pages(struct page *start) +{ + int i; + struct page *from = start + NR_RESET_STRUCT_PAGE; + + for (i = 0; i < NR_RESET_STRUCT_PAGE; i++) + memcpy(start + i, from, sizeof(*from)); +} + static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, struct vmemmap_remap_walk *walk) { @@ -258,6 +278,7 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, list_del(&page->lru); to = page_to_virt(page); copy_page(to, (void *)walk->reuse_addr); + reset_struct_pages(to); set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot)); } -- Gitee From 9b4aa808afd36ebad9567413eddf5a0db8b8441f Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Jul 2022 18:06:47 +0800 Subject: [PATCH 659/674] mm: hugetlb: replace hugetlb_free_vmemmap_enabled with a static_key mainline inclusion from mainline-v5.18-rc1 commit a6b40850c442bf996e729e1d441d3dbc37cea171 category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a6b40850c442bf996e729e1d441d3dbc37cea171 -------------------------------- The page_fixed_fake_head() is used throughout memory management and the conditional check requires checking a global variable, although the overhead of this check may be small, it increases when the memory cache comes under pressure. Also, the global variable will not be modified after system boot, so it is very appropriate to use static key machanism. Link: https://lkml.kernel.org/r/20211101031651.75851-3-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Barry Song Cc: Bodeddula Balasubramaniam Cc: Chen Huang Cc: David Hildenbrand Cc: Fam Zheng Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Qi Zheng Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Conflicts: mm/memory_hotplug.c Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- include/linux/hugetlb.h | 6 ------ include/linux/page-flags.h | 16 ++++++++++++++-- mm/hugetlb_vmemmap.c | 12 ++++++------ 3 files changed, 20 insertions(+), 14 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 634630ebc8a7..fd1dc8d29436 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -1103,12 +1103,6 @@ static inline int hugetlb_insert_hugepage_pte_by_pa(struct mm_struct *mm, } #endif /* CONFIG_HUGETLB_PAGE */ -#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP -extern bool hugetlb_free_vmemmap_enabled; -#else -#define hugetlb_free_vmemmap_enabled false -#endif - static inline spinlock_t *huge_pte_lock(struct hstate *h, struct mm_struct *mm, pte_t *pte) { diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 7b31a81be7e1..81061122af68 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -195,7 +195,14 @@ enum pageflags { #ifndef __GENERATING_BOUNDS_H #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP -extern bool hugetlb_free_vmemmap_enabled; +DECLARE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, + hugetlb_free_vmemmap_enabled_key); + +static __always_inline bool hugetlb_free_vmemmap_enabled(void) +{ + return static_branch_maybe(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, + &hugetlb_free_vmemmap_enabled_key); +} /* * If the feature of freeing some vmemmap pages associated with each HugeTLB @@ -215,7 +222,7 @@ extern bool hugetlb_free_vmemmap_enabled; */ static __always_inline const struct page *page_fixed_fake_head(const struct page *page) { - if (!hugetlb_free_vmemmap_enabled) + if (!hugetlb_free_vmemmap_enabled()) return page; /* @@ -243,6 +250,11 @@ static inline const struct page *page_fixed_fake_head(const struct page *page) { return page; } + +static inline bool hugetlb_free_vmemmap_enabled(void) +{ + return false; +} #endif static __always_inline int page_is_fake_head(struct page *page) diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 4977f5a520c2..791626983c2e 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -188,9 +188,9 @@ #define RESERVE_VMEMMAP_NR 1U #define RESERVE_VMEMMAP_SIZE (RESERVE_VMEMMAP_NR << PAGE_SHIFT) -bool hugetlb_free_vmemmap_enabled __read_mostly = - IS_ENABLED(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON); -EXPORT_SYMBOL(hugetlb_free_vmemmap_enabled); +DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, + hugetlb_free_vmemmap_enabled_key); +EXPORT_SYMBOL(hugetlb_free_vmemmap_enabled_key); static int __init early_hugetlb_free_vmemmap_param(char *buf) { @@ -204,9 +204,9 @@ static int __init early_hugetlb_free_vmemmap_param(char *buf) return -EINVAL; if (!strcmp(buf, "on")) - hugetlb_free_vmemmap_enabled = true; + static_branch_enable(&hugetlb_free_vmemmap_enabled_key); else if (!strcmp(buf, "off")) - hugetlb_free_vmemmap_enabled = false; + static_branch_disable(&hugetlb_free_vmemmap_enabled_key); else return -EINVAL; @@ -284,7 +284,7 @@ void __init hugetlb_vmemmap_init(struct hstate *h) BUILD_BUG_ON(__NR_USED_SUBPAGE >= RESERVE_VMEMMAP_SIZE / sizeof(struct page)); - if (!hugetlb_free_vmemmap_enabled) + if (!hugetlb_free_vmemmap_enabled()) return; vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT; -- Gitee From b31b84bf5c6d282b2e3bdcf8dc6df4f395efcb19 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Jul 2022 18:06:48 +0800 Subject: [PATCH 660/674] mm: sparsemem: use page table lock to protect kernel pmd operations mainline inclusion from mainline-v5.18-rc1 commit d8d55f5616cf3b900a23a72dd24e7b07211e7859 category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d8d55f5616cf3b900a23a72dd24e7b07211e7859 -------------------------------- The init_mm.page_table_lock is used to protect kernel page tables, we can use it to serialize splitting vmemmap PMD mappings instead of mmap write lock, which can increase the concurrency of vmemmap_remap_free(). Actually, It increase the concurrency between allocations of HugeTLB pages. But it is not the only benefit. There are a lot of users of mmap read lock of init_mm. The mmap write lock is holding through vmemmap_remap_free(), removing mmap write lock usage to make it does not affect other users of mmap read lock. It is not making anything worse and always a win to move. Now the kernel page table walker does not hold the page_table_lock when walking pmd entries. There may be consistency issue of a pmd entry, because pmd entry might change from a huge pmd entry to a PTE page table. There is only one user of kernel page table walker, namely ptdump. The ptdump already considers the consistency, which use a local variable to cache the value of pmd entry. But we also need to update ->action to ACTION_CONTINUE to make sure the walker does not walk every pte entry again when concurrent thread has split the huge pmd. Link: https://lkml.kernel.org/r/20211101031651.75851-4-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: Barry Song Cc: Bodeddula Balasubramaniam Cc: Chen Huang Cc: David Hildenbrand Cc: Fam Zheng Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Qi Zheng Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Conflicts: mm/sparse-vmemmap.c Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- mm/ptdump.c | 16 +++++++++++---- mm/sparse-vmemmap.c | 47 ++++++++++++++++++++++++++++++--------------- 2 files changed, 43 insertions(+), 20 deletions(-) diff --git a/mm/ptdump.c b/mm/ptdump.c index 93f2f63dc52d..43661863096b 100644 --- a/mm/ptdump.c +++ b/mm/ptdump.c @@ -39,8 +39,10 @@ static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr, if (st->effective_prot) st->effective_prot(st, 0, pgd_val(val)); - if (pgd_leaf(val)) + if (pgd_leaf(val)) { st->note_page(st, addr, 0, pgd_val(val)); + walk->action = ACTION_CONTINUE; + } return 0; } @@ -59,8 +61,10 @@ static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr, if (st->effective_prot) st->effective_prot(st, 1, p4d_val(val)); - if (p4d_leaf(val)) + if (p4d_leaf(val)) { st->note_page(st, addr, 1, p4d_val(val)); + walk->action = ACTION_CONTINUE; + } return 0; } @@ -79,8 +83,10 @@ static int ptdump_pud_entry(pud_t *pud, unsigned long addr, if (st->effective_prot) st->effective_prot(st, 2, pud_val(val)); - if (pud_leaf(val)) + if (pud_leaf(val)) { st->note_page(st, addr, 2, pud_val(val)); + walk->action = ACTION_CONTINUE; + } return 0; } @@ -98,8 +104,10 @@ static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr, if (st->effective_prot) st->effective_prot(st, 3, pmd_val(val)); - if (pmd_leaf(val)) + if (pmd_leaf(val)) { st->note_page(st, addr, 3, pmd_val(val)); + walk->action = ACTION_CONTINUE; + } return 0; } diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 39bdbf0b28dd..eef9053f948f 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -53,8 +53,7 @@ struct vmemmap_remap_walk { struct list_head *vmemmap_pages; }; -static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, - struct vmemmap_remap_walk *walk) +static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) { pmd_t __pmd; int i; @@ -76,15 +75,34 @@ static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, set_pte_at(&init_mm, addr, pte, entry); } - /* Make pte visible before pmd. See comment in __pte_alloc(). */ - smp_wmb(); - pmd_populate_kernel(&init_mm, pmd, pgtable); - - flush_tlb_kernel_range(start, start + PMD_SIZE); + spin_lock(&init_mm.page_table_lock); + if (likely(pmd_leaf(*pmd))) { + /* Make pte visible before pmd. See comment in __pte_alloc(). */ + smp_wmb(); + pmd_populate_kernel(&init_mm, pmd, pgtable); + flush_tlb_kernel_range(start, start + PMD_SIZE); + } else { + pte_free_kernel(&init_mm, pgtable); + } + spin_unlock(&init_mm.page_table_lock); return 0; } +static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) +{ + int leaf; + + spin_lock(&init_mm.page_table_lock); + leaf = pmd_leaf(*pmd); + spin_unlock(&init_mm.page_table_lock); + + if (!leaf) + return 0; + + return __split_vmemmap_huge_pmd(pmd, start); +} + static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct vmemmap_remap_walk *walk) @@ -121,13 +139,12 @@ static int vmemmap_pmd_range(pud_t *pud, unsigned long addr, pmd = pmd_offset(pud, addr); do { - if (pmd_leaf(*pmd)) { - int ret; + int ret; + + ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK); + if (ret) + return ret; - ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK, walk); - if (ret) - return ret; - } next = pmd_addr_end(addr, end); vmemmap_pte_range(pmd, addr, next, walk); } while (pmd++, addr = next, addr != end); @@ -321,10 +338,8 @@ int vmemmap_remap_free(unsigned long start, unsigned long end, */ BUG_ON(start - reuse != PAGE_SIZE); - mmap_write_lock(&init_mm); + mmap_read_lock(&init_mm); ret = vmemmap_remap_range(reuse, end, &walk); - mmap_write_downgrade(&init_mm); - if (ret && walk.nr_walked) { end = reuse + walk.nr_walked * PAGE_SIZE; /* -- Gitee From 7d9a273f5bc8369c07ac2ba082984cb1d1e9b182 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Jul 2022 18:06:49 +0800 Subject: [PATCH 661/674] selftests: vm: add a hugetlb test case mainline inclusion from mainline-v5.18-rc1 commit b147c89cd429321a59147368378c8aba17c8480f category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b147c89cd429321a59147368378c8aba17c8480f -------------------------------- Since the head vmemmap page frame associated with each HugeTLB page is reused, we should hide the PG_head flag of tail struct page from the user. Add a tese case to check whether it is work properly. The test steps are as follows. 1) alloc 2MB hugeTLB 2) get each page frame 3) apply those APIs in each page frame 4) Those APIs work completely the same as before. Reading the flags of a page by /proc/kpageflags is done in stable_page_flags(), which has invoked PageHead(), PageTail(), PageCompound() and compound_head(). If those APIs work properly, the head page must have 15 and 17 bits set. And tail pages must have 16 and 17 bits set but 15 bit unset. Those flags are checked in check_page_flags(). Link: https://lkml.kernel.org/r/20211101031651.75851-5-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Barry Song Cc: Bodeddula Balasubramaniam Cc: Chen Huang Cc: David Hildenbrand Cc: Fam Zheng Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Qi Zheng Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Conflicts: tools/testing/selftests/vm/Makefile tools/testing/selftests/vm/run_vmtests.sh Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- tools/testing/selftests/vm/.gitignore | 1 + tools/testing/selftests/vm/Makefile | 1 + tools/testing/selftests/vm/hugepage-vmemmap.c | 144 ++++++++++++++++++ tools/testing/selftests/vm/run_vmtests | 11 ++ 4 files changed, 157 insertions(+) create mode 100644 tools/testing/selftests/vm/hugepage-vmemmap.c diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore index b3a183c36cb5..905dc4efa879 100644 --- a/tools/testing/selftests/vm/.gitignore +++ b/tools/testing/selftests/vm/.gitignore @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only hugepage-mmap hugepage-shm +hugepage-vmemmap khugepaged map_hugetlb map_populate diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index b150cc837177..549761bc7193 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -27,6 +27,7 @@ TEST_GEN_FILES += gup_benchmark TEST_GEN_FILES += hmm-tests TEST_GEN_FILES += hugepage-mmap TEST_GEN_FILES += hugepage-shm +TEST_GEN_FILES += hugepage-vmemmap TEST_GEN_FILES += map_hugetlb TEST_GEN_FILES += map_fixed_noreplace TEST_GEN_FILES += map_populate diff --git a/tools/testing/selftests/vm/hugepage-vmemmap.c b/tools/testing/selftests/vm/hugepage-vmemmap.c new file mode 100644 index 000000000000..557bdbd4f87e --- /dev/null +++ b/tools/testing/selftests/vm/hugepage-vmemmap.c @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * A test case of using hugepage memory in a user application using the + * mmap system call with MAP_HUGETLB flag. Before running this program + * make sure the administrator has allocated enough default sized huge + * pages to cover the 2 MB allocation. + */ +#include +#include +#include +#include +#include + +#define MAP_LENGTH (2UL * 1024 * 1024) + +#ifndef MAP_HUGETLB +#define MAP_HUGETLB 0x40000 /* arch specific */ +#endif + +#define PAGE_SIZE 4096 + +#define PAGE_COMPOUND_HEAD (1UL << 15) +#define PAGE_COMPOUND_TAIL (1UL << 16) +#define PAGE_HUGE (1UL << 17) + +#define HEAD_PAGE_FLAGS (PAGE_COMPOUND_HEAD | PAGE_HUGE) +#define TAIL_PAGE_FLAGS (PAGE_COMPOUND_TAIL | PAGE_HUGE) + +#define PM_PFRAME_BITS 55 +#define PM_PFRAME_MASK ~((1UL << PM_PFRAME_BITS) - 1) + +/* + * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages. + * That means the addresses starting with 0x800000... will need to be + * specified. Specifying a fixed address is not required on ppc64, i386 + * or x86_64. + */ +#ifdef __ia64__ +#define MAP_ADDR (void *)(0x8000000000000000UL) +#define MAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED) +#else +#define MAP_ADDR NULL +#define MAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB) +#endif + +static void write_bytes(char *addr, size_t length) +{ + unsigned long i; + + for (i = 0; i < length; i++) + *(addr + i) = (char)i; +} + +static unsigned long virt_to_pfn(void *addr) +{ + int fd; + unsigned long pagemap; + + fd = open("/proc/self/pagemap", O_RDONLY); + if (fd < 0) + return -1UL; + + lseek(fd, (unsigned long)addr / PAGE_SIZE * sizeof(pagemap), SEEK_SET); + read(fd, &pagemap, sizeof(pagemap)); + close(fd); + + return pagemap & ~PM_PFRAME_MASK; +} + +static int check_page_flags(unsigned long pfn) +{ + int fd, i; + unsigned long pageflags; + + fd = open("/proc/kpageflags", O_RDONLY); + if (fd < 0) + return -1; + + lseek(fd, pfn * sizeof(pageflags), SEEK_SET); + + read(fd, &pageflags, sizeof(pageflags)); + if ((pageflags & HEAD_PAGE_FLAGS) != HEAD_PAGE_FLAGS) { + close(fd); + printf("Head page flags (%lx) is invalid\n", pageflags); + return -1; + } + + /* + * pages other than the first page must be tail and shouldn't be head; + * this also verifies kernel has correctly set the fake page_head to tail + * while hugetlb_free_vmemmap is enabled. + */ + for (i = 1; i < MAP_LENGTH / PAGE_SIZE; i++) { + read(fd, &pageflags, sizeof(pageflags)); + if ((pageflags & TAIL_PAGE_FLAGS) != TAIL_PAGE_FLAGS || + (pageflags & HEAD_PAGE_FLAGS) == HEAD_PAGE_FLAGS) { + close(fd); + printf("Tail page flags (%lx) is invalid\n", pageflags); + return -1; + } + } + + close(fd); + + return 0; +} + +int main(int argc, char **argv) +{ + void *addr; + unsigned long pfn; + + addr = mmap(MAP_ADDR, MAP_LENGTH, PROT_READ | PROT_WRITE, MAP_FLAGS, -1, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + + /* Trigger allocation of HugeTLB page. */ + write_bytes(addr, MAP_LENGTH); + + pfn = virt_to_pfn(addr); + if (pfn == -1UL) { + munmap(addr, MAP_LENGTH); + perror("virt_to_pfn"); + exit(1); + } + + printf("Returned address is %p whose pfn is %lx\n", addr, pfn); + + if (check_page_flags(pfn) < 0) { + munmap(addr, MAP_LENGTH); + perror("check_page_flags"); + exit(1); + } + + /* munmap() length of MAP_HUGETLB memory must be hugepage aligned */ + if (munmap(addr, MAP_LENGTH)) { + perror("munmap"); + exit(1); + } + + return 0; +} diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests index d578ad831813..949c71fe5951 100755 --- a/tools/testing/selftests/vm/run_vmtests +++ b/tools/testing/selftests/vm/run_vmtests @@ -108,6 +108,17 @@ else echo "[PASS]" fi +echo "------------------------" +echo "running hugepage-vmemmap" +echo "------------------------" +./hugepage-vmemmap +if [ $? -ne 0 ]; then + echo "[FAIL]" + exitcode=1 +else + echo "[PASS]" +fi + echo "NOTE: The above hugetlb tests provide minimal coverage. Use" echo " https://github.com/libhugetlbfs/libhugetlbfs.git for" echo " hugetlb regression testing." -- Gitee From 22f969cffb0be543692d6942f41cf3735fb66790 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Jul 2022 18:06:50 +0800 Subject: [PATCH 662/674] mm: sparsemem: move vmemmap related to HugeTLB to CONFIG_HUGETLB_PAGE_FREE_VMEMMAP mainline inclusion from mainline-v5.18-rc1 commit e54084173487804f5e2f23facf107fd9336e637e category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e54084173487804f5e2f23facf107fd9336e637e -------------------------------- The vmemmap_remap_free/alloc are relevant to HugeTLB, so move those functiongs to the scope of CONFIG_HUGETLB_PAGE_FREE_VMEMMAP. Link: https://lkml.kernel.org/r/20211101031651.75851-6-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Barry Song Cc: Bodeddula Balasubramaniam Cc: Chen Huang Cc: David Hildenbrand Cc: Fam Zheng Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Qi Zheng Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- include/linux/mm.h | 2 ++ mm/sparse-vmemmap.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 25891b581bf4..efa3972a23bf 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3104,10 +3104,12 @@ static inline void print_vma_addr(char *prefix, unsigned long rip) } #endif +#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP int vmemmap_remap_free(unsigned long start, unsigned long end, unsigned long reuse); int vmemmap_remap_alloc(unsigned long start, unsigned long end, unsigned long reuse, gfp_t gfp_mask); +#endif void *sparse_buffer_alloc(unsigned long size); struct page * __populate_section_memmap(unsigned long pfn, diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index eef9053f948f..e5ed2680ec57 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -34,6 +34,7 @@ #include #include +#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP /** * struct vmemmap_remap_walk - walk vmemmap page table * @@ -419,6 +420,7 @@ int vmemmap_remap_alloc(unsigned long start, unsigned long end, return 0; } +#endif /* CONFIG_HUGETLB_PAGE_FREE_VMEMMAP */ /* * Allocate a block of memory to be used to back the virtual memory map -- Gitee From 0a5d68bb279164b7ec023203dc9f0d6a59fc17f1 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Thu, 28 Jul 2022 18:06:51 +0800 Subject: [PATCH 663/674] Revert "arm64: mm: hugetlb: add support for free vmemmap pages of HugeTLB" hulk inclusion category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO -------------------------------- There is a formal solution to support hugetlb vmemmap feature on arm64. This reverts commit 5838d235a395e221968e1f024bfaf8650278a522. Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- fs/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/Kconfig b/fs/Kconfig index 6e723c90a506..37a8895a3254 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -239,7 +239,7 @@ config HUGETLB_PAGE config HUGETLB_PAGE_FREE_VMEMMAP def_bool HUGETLB_PAGE - depends on X86_64 || ARM64 + depends on X86_64 depends on SPARSEMEM_VMEMMAP config HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON -- Gitee From e457e43c459e8ecc1679f5f356f11c84ada79bbf Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Jul 2022 18:06:52 +0800 Subject: [PATCH 664/674] mm: hugetlb_vmemmap: introduce ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP mainline inclusion from mainline-v5.19-rc1 commit 2e4ec02bbcc05b8905d65c763ebde6bc85508e90 category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=2e4ec02bbcc05b8905d65c763ebde6bc85508e90 -------------------------------- The feature of minimizing overhead of struct page associated with each HugeTLB page is implemented on x86_64, however, the infrastructure of this feature is already there, we could easily enable it for other architectures. Introduce ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP for other architectures to be easily enabled. Just select this config if they want to enable this feature. Link: https://lkml.kernel.org/r/20220331065640.5777-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Suggested-by: Andrew Morton Reviewed-by: Barry Song Tested-by: Barry Song Reviewed-by: Anshuman Khandual Cc: Bodeddula Balasubramaniam Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Rientjes Cc: Fam Zheng Cc: James Morse Cc: Mark Rutland Cc: Mike Kravetz Cc: Oscar Salvador Cc: Will Deacon Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- arch/x86/Kconfig | 1 + fs/Kconfig | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d17396ef4323..661e05e1f762 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -103,6 +103,7 @@ config X86 select ARCH_WANT_DEFAULT_BPF_JIT if X86_64 select ARCH_WANTS_DYNAMIC_TASK_STRUCT select ARCH_WANT_HUGE_PMD_SHARE + select ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP if X86_64 select ARCH_WANT_LD_ORPHAN_WARN select ARCH_WANTS_THP_SWAP if X86_64 select BUILDTIME_TABLE_SORT diff --git a/fs/Kconfig b/fs/Kconfig index 37a8895a3254..b60a7614cb16 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -237,9 +237,17 @@ config HUGETLBFS config HUGETLB_PAGE def_bool HUGETLBFS +# +# Select this config option from the architecture Kconfig, if it is preferred +# to enable the feature of minimizing overhead of struct page associated with +# each HugeTLB page. +# +config ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP + bool + config HUGETLB_PAGE_FREE_VMEMMAP def_bool HUGETLB_PAGE - depends on X86_64 + depends on ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP depends on SPARSEMEM_VMEMMAP config HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON -- Gitee From de126e063c1e89c698fe2304d0e590f69157ac66 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Jul 2022 18:06:53 +0800 Subject: [PATCH 665/674] arm64: mm: hugetlb: enable HUGETLB_PAGE_FREE_VMEMMAP for arm64 mainline inclusion from mainline-v5.19-rc1 commit 1e63ac088f20f7a4425c430c31ecd3cf167fb3f2 category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=1e63ac088f20f7a4425c430c31ecd3cf167fb3f2 -------------------------------- The feature of minimizing overhead of struct page associated with each HugeTLB page aims to free its vmemmap pages (used as struct page) to save memory, where is ~14GB/16GB per 1TB HugeTLB pages (2MB/1GB type). In short, when a HugeTLB page is allocated or freed, the vmemmap array representing the range associated with the page will need to be remapped. When a page is allocated, vmemmap pages are freed after remapping. When a page is freed, previously discarded vmemmap pages must be allocated before remapping. More implementations and details can be found here [1]. The infrastructure of freeing vmemmap pages associated with each HugeTLB page is already there, we can easily enable HUGETLB_PAGE_FREE_VMEMMAP for arm64, the only thing to be fixed is flush_dcache_page() . flush_dcache_page() need to be adapted to operate on the head page's flags since the tail vmemmap pages are mapped with read-only after the feature is enabled (clear operation is not permitted). There was some discussions about this in the thread [2], but there was no conclusion in the end. And I copied the concern proposed by Anshuman to here and explain why those concern is superfluous. It is safe to enable it for x86_64 as well as arm64. 1st concern: ''' But what happens when a hot remove section's vmemmap area (which is being teared down) is nearby another vmemmap area which is either created or being destroyed for HugeTLB alloc/free purpose. As you mentioned HugeTLB pages inside the hot remove section might be safe. But what about other HugeTLB areas whose vmemmap area shares page table entries with vmemmap entries for a section being hot removed ? Massive HugeTLB alloc /use/free test cycle using memory just adjacent to a memory hotplug area, which is always added and removed periodically, should be able to expose this problem. ''' Answer: At the time memory is removed, all HugeTLB pages either have been migrated away or dissolved. So there is no race between memory hot remove and free_huge_page_vmemmap(). Therefore, HugeTLB pages inside the hot remove section is safe. Let's talk your question "what about other HugeTLB areas whose vmemmap area shares page table entries with vmemmap entries for a section being hot removed ?", the question is not established. The minimal granularity size of hotplug memory 128MB (on arm64, 4k base page), any HugeTLB smaller than 128MB is within a section, then, there is no share PTE page tables between HugeTLB in this section and ones in other sections and a HugeTLB page could not cross two sections. In this case, the section cannot be freed. Any HugeTLB bigger than 128MB (section size) whose vmemmap pages is an integer multiple of 2MB (PMD-mapped). As long as: 1) HugeTLBs are naturally aligned, power-of-two sizes 2) The HugeTLB size >= the section size 3) The HugeTLB size >= the vmemmap leaf mapping size Then a HugeTLB will not share any leaf page table entries with *anything else*, but will share intermediate entries. In this case, at the time memory is removed, all HugeTLB pages either have been migrated away or dissolved. So there is also no race between memory hot remove and free_huge_page_vmemmap(). 2nd concern: ''' differently, not sure if ptdump would require any synchronization. Dumping an wrong value is probably okay but crashing because a page table entry is being freed after ptdump acquired the pointer is bad. On arm64, ptdump() is protected against hotremove via [get|put]_online_mems(). ''' Answer: The ptdump should be fine since vmemmap_remap_free() only exchanges PTEs or splits the PMD entry (which means allocating a PTE page table). Both operations do not free any page tables (PTE), so ptdump cannot run into a UAF on any page tables. The worst case is just dumping an wrong value. [1] https://lore.kernel.org/all/20210510030027.56044-1-songmuchun@bytedance.com/ [2] https://lore.kernel.org/all/20210518091826.36937-1-songmuchun@bytedance.com/ [songmuchun@bytedance.com: restructure the code comment inside flush_dcache_page()] Link: https://lkml.kernel.org/r/20220414072646.21910-1-songmuchun@bytedance.com Link: https://lkml.kernel.org/r/20220331065640.5777-2-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Barry Song Tested-by: Barry Song Cc: Will Deacon Cc: David Hildenbrand Cc: Bodeddula Balasubramaniam Cc: Oscar Salvador Cc: Mike Kravetz Cc: David Rientjes Cc: Mark Rutland Cc: Catalin Marinas Cc: James Morse Cc: Xiongchun Duan Cc: Fam Zheng Signed-off-by: Andrew Morton Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- arch/arm64/Kconfig | 1 + arch/arm64/mm/flush.c | 14 ++++++++++++++ 2 files changed, 15 insertions(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index e253fdba1249..d9da5c4f91e0 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -81,6 +81,7 @@ config ARM64 select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT select ARCH_WANT_FRAME_POINTERS select ARCH_WANT_HUGE_PMD_SHARE if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36) + select ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP select ARCH_WANT_LD_ORPHAN_WARN select ARCH_WANT_RESERVE_CRASH_KERNEL if KEXEC_CORE select ARCH_HAS_UBSAN_SANITIZE_ALL diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index 145fe60c5e68..2bb6defad92f 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c @@ -69,6 +69,20 @@ EXPORT_SYMBOL_GPL(__sync_icache_dcache); */ void flush_dcache_page(struct page *page) { + /* + * Only the head page's flags of HugeTLB can be cleared since the tail + * vmemmap pages associated with each HugeTLB page are mapped with + * read-only when CONFIG_HUGETLB_PAGE_FREE_VMEMMAP is enabled (more + * details can refer to vmemmap_remap_pte()). Although + * __sync_icache_dcache() only set PG_dcache_clean flag on the head + * page struct, there is more than one page struct with PG_dcache_clean + * associated with the HugeTLB page since the head vmemmap page frame + * is reused (more details can refer to the comments above + * page_fixed_fake_head()). + */ + if (hugetlb_free_vmemmap_enabled() && PageHuge(page)) + page = compound_head(page); + if (test_bit(PG_dcache_clean, &page->flags)) clear_bit(PG_dcache_clean, &page->flags); } -- Gitee From b8dfc94bd51821c5467f622ffec51c5bfd37c342 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Jul 2022 18:06:54 +0800 Subject: [PATCH 666/674] mm: hugetlb_vmemmap: cleanup hugetlb_vmemmap related functions mainline inclusion from mainline-v5.19-rc1 commit 5981611d0a006472d367d7a8e6ead8afaecf17c7 category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=5981611d0a006472d367d7a8e6ead8afaecf17c7 -------------------------------- Patch series "cleanup hugetlb_vmemmap". The word of "free" is not expressive enough to express the feature of optimizing vmemmap pages associated with each HugeTLB, rename this keywork to "optimize" is more clear. In this series, cheanup related codes to make it more clear and expressive. This is suggested by David. This patch (of 3): The word of "free" is not expressive enough to express the feature of optimizing vmemmap pages associated with each HugeTLB, rename this keywork to "optimize". And some function names are prefixed with "huge_page" instead of "hugetlb", it is easily to be confused with THP. In this patch, cheanup related functions to make code more clear and expressive. Link: https://lkml.kernel.org/r/20220404074652.68024-1-songmuchun@bytedance.com Link: https://lkml.kernel.org/r/20220404074652.68024-2-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: David Hildenbrand Cc: Mike Kravetz Signed-off-by: Andrew Morton Conflicts: mm/hugetlb.c Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- include/linux/hugetlb.h | 2 +- mm/hugetlb.c | 8 ++++---- mm/hugetlb_vmemmap.c | 42 ++++++++++++++++++++--------------------- mm/hugetlb_vmemmap.h | 20 ++++++++++---------- 4 files changed, 35 insertions(+), 37 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index fd1dc8d29436..218fc150d7f8 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -593,7 +593,7 @@ struct hstate { unsigned int surplus_huge_pages_node[MAX_NUMNODES]; unsigned int resv_huge_pages_node[MAX_NUMNODES]; #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP - unsigned int nr_free_vmemmap_pages; + unsigned int optimize_vmemmap_pages; #endif #ifdef CONFIG_CGROUP_HUGETLB /* cgroup control files */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a8c815386ecc..c5168c7f282a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1448,7 +1448,7 @@ static void __update_and_free_page(struct hstate *h, struct page *page) if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) return; - if (alloc_huge_page_vmemmap(h, page)) { + if (hugetlb_vmemmap_alloc(h, page)) { spin_lock_irq(&hugetlb_lock); /* * If we cannot allocate vmemmap pages, just refuse to free the @@ -1519,7 +1519,7 @@ static DECLARE_WORK(free_hpage_work, free_hpage_workfn); static inline void flush_free_hpage_work(struct hstate *h) { - if (free_vmemmap_pages_per_hpage(h)) + if (hugetlb_optimize_vmemmap_pages(h)) flush_work(&free_hpage_work); } @@ -1642,7 +1642,7 @@ void free_huge_page(struct page *page) static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) { - free_huge_page_vmemmap(h, page); + hugetlb_vmemmap_free(h, page); INIT_LIST_HEAD(&page->lru); set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); hugetlb_set_page_subpool(page, NULL); @@ -2066,7 +2066,7 @@ int dissolve_free_huge_page(struct page *page) * Attempt to allocate vmemmmap here so that we can take * appropriate action on failure. */ - rc = alloc_huge_page_vmemmap(h, head); + rc = hugetlb_vmemmap_alloc(h, head); if (!rc) { /* * Move PageHWPoison flag from head page to the raw diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 791626983c2e..91b79b9d9e25 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Free some vmemmap pages of HugeTLB + * Optimize vmemmap pages associated with HugeTLB * * Copyright (c) 2020, Bytedance. All rights reserved. * @@ -192,7 +192,7 @@ DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, hugetlb_free_vmemmap_enabled_key); EXPORT_SYMBOL(hugetlb_free_vmemmap_enabled_key); -static int __init early_hugetlb_free_vmemmap_param(char *buf) +static int __init hugetlb_vmemmap_early_param(char *buf) { /* We cannot optimize if a "struct page" crosses page boundaries. */ if (!is_power_of_2(sizeof(struct page))) { @@ -212,29 +212,26 @@ static int __init early_hugetlb_free_vmemmap_param(char *buf) return 0; } -early_param("hugetlb_free_vmemmap", early_hugetlb_free_vmemmap_param); - -static inline unsigned long free_vmemmap_pages_size_per_hpage(struct hstate *h) -{ - return (unsigned long)free_vmemmap_pages_per_hpage(h) << PAGE_SHIFT; -} +early_param("hugetlb_free_vmemmap", hugetlb_vmemmap_early_param); /* * Previously discarded vmemmap pages will be allocated and remapping * after this function returns zero. */ -int alloc_huge_page_vmemmap(struct hstate *h, struct page *head) +int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head) { int ret; unsigned long vmemmap_addr = (unsigned long)head; - unsigned long vmemmap_end, vmemmap_reuse; + unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages; if (!HPageVmemmapOptimized(head)) return 0; - vmemmap_addr += RESERVE_VMEMMAP_SIZE; - vmemmap_end = vmemmap_addr + free_vmemmap_pages_size_per_hpage(h); - vmemmap_reuse = vmemmap_addr - PAGE_SIZE; + vmemmap_addr += RESERVE_VMEMMAP_SIZE; + vmemmap_pages = hugetlb_optimize_vmemmap_pages(h); + vmemmap_end = vmemmap_addr + (vmemmap_pages << PAGE_SHIFT); + vmemmap_reuse = vmemmap_addr - PAGE_SIZE; + /* * The pages which the vmemmap virtual address range [@vmemmap_addr, * @vmemmap_end) are mapped to are freed to the buddy allocator, and @@ -250,17 +247,18 @@ int alloc_huge_page_vmemmap(struct hstate *h, struct page *head) return ret; } -void free_huge_page_vmemmap(struct hstate *h, struct page *head) +void hugetlb_vmemmap_free(struct hstate *h, struct page *head) { unsigned long vmemmap_addr = (unsigned long)head; - unsigned long vmemmap_end, vmemmap_reuse; + unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages; - if (!free_vmemmap_pages_per_hpage(h)) + vmemmap_pages = hugetlb_optimize_vmemmap_pages(h); + if (!vmemmap_pages) return; - vmemmap_addr += RESERVE_VMEMMAP_SIZE; - vmemmap_end = vmemmap_addr + free_vmemmap_pages_size_per_hpage(h); - vmemmap_reuse = vmemmap_addr - PAGE_SIZE; + vmemmap_addr += RESERVE_VMEMMAP_SIZE; + vmemmap_end = vmemmap_addr + (vmemmap_pages << PAGE_SHIFT); + vmemmap_reuse = vmemmap_addr - PAGE_SIZE; /* * Remap the vmemmap virtual address range [@vmemmap_addr, @vmemmap_end) @@ -297,8 +295,8 @@ void __init hugetlb_vmemmap_init(struct hstate *h) * hugetlbpage.rst for more details. */ if (likely(vmemmap_pages > RESERVE_VMEMMAP_NR)) - h->nr_free_vmemmap_pages = vmemmap_pages - RESERVE_VMEMMAP_NR; + h->optimize_vmemmap_pages = vmemmap_pages - RESERVE_VMEMMAP_NR; - pr_info("can free %d vmemmap pages for %s\n", h->nr_free_vmemmap_pages, - h->name); + pr_info("can optimize %d vmemmap pages for %s\n", + h->optimize_vmemmap_pages, h->name); } diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h index cb2bef8f9e73..9760537849b5 100644 --- a/mm/hugetlb_vmemmap.h +++ b/mm/hugetlb_vmemmap.h @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Free some vmemmap pages of HugeTLB + * Optimize vmemmap pages associated with HugeTLB * * Copyright (c) 2020, Bytedance. All rights reserved. * @@ -11,25 +11,25 @@ #include #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP -int alloc_huge_page_vmemmap(struct hstate *h, struct page *head); -void free_huge_page_vmemmap(struct hstate *h, struct page *head); +int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head); +void hugetlb_vmemmap_free(struct hstate *h, struct page *head); void hugetlb_vmemmap_init(struct hstate *h); /* - * How many vmemmap pages associated with a HugeTLB page that can be freed - * to the buddy allocator. + * How many vmemmap pages associated with a HugeTLB page that can be + * optimized and freed to the buddy allocator. */ -static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h) +static inline unsigned int hugetlb_optimize_vmemmap_pages(struct hstate *h) { - return h->nr_free_vmemmap_pages; + return h->optimize_vmemmap_pages; } #else -static inline int alloc_huge_page_vmemmap(struct hstate *h, struct page *head) +static inline int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head) { return 0; } -static inline void free_huge_page_vmemmap(struct hstate *h, struct page *head) +static inline void hugetlb_vmemmap_free(struct hstate *h, struct page *head) { } @@ -37,7 +37,7 @@ static inline void hugetlb_vmemmap_init(struct hstate *h) { } -static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h) +static inline unsigned int hugetlb_optimize_vmemmap_pages(struct hstate *h) { return 0; } -- Gitee From 142ca734a4a1aab74f8c37818b634f996ef49e08 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Jul 2022 18:06:55 +0800 Subject: [PATCH 667/674] mm: hugetlb_vmemmap: cleanup hugetlb_free_vmemmap_enabled* mainline inclusion from mainline-v5.19-rc1 commit f10f1442c309ccef7a80ba3dc4abde0978e86fb4 category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f10f1442c309ccef7a80ba3dc4abde0978e86fb4 -------------------------------- The word of "free" is not expressive enough to express the feature of optimizing vmemmap pages associated with each HugeTLB, rename this keywork to "optimize". In this patch , cheanup the static key and hugetlb_free_vmemmap_enabled() to make code more expressive. Link: https://lkml.kernel.org/r/20220404074652.68024-3-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: David Hildenbrand Cc: Mike Kravetz Signed-off-by: Andrew Morton Conflicts: mm/memory_hotplug.c Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- arch/arm64/mm/flush.c | 2 +- include/linux/page-flags.h | 12 ++++++------ mm/hugetlb_vmemmap.c | 10 +++++----- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index 2bb6defad92f..892e53e9c788 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c @@ -80,7 +80,7 @@ void flush_dcache_page(struct page *page) * is reused (more details can refer to the comments above * page_fixed_fake_head()). */ - if (hugetlb_free_vmemmap_enabled() && PageHuge(page)) + if (hugetlb_optimize_vmemmap_enabled() && PageHuge(page)) page = compound_head(page); if (test_bit(PG_dcache_clean, &page->flags)) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 81061122af68..d1cf17a71a12 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -196,16 +196,16 @@ enum pageflags { #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP DECLARE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, - hugetlb_free_vmemmap_enabled_key); + hugetlb_optimize_vmemmap_key); -static __always_inline bool hugetlb_free_vmemmap_enabled(void) +static __always_inline bool hugetlb_optimize_vmemmap_enabled(void) { return static_branch_maybe(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, - &hugetlb_free_vmemmap_enabled_key); + &hugetlb_optimize_vmemmap_key); } /* - * If the feature of freeing some vmemmap pages associated with each HugeTLB + * If the feature of optimizing vmemmap pages associated with each HugeTLB * page is enabled, the head vmemmap page frame is reused and all of the tail * vmemmap addresses map to the head vmemmap page frame (furture details can * refer to the figure at the head of the mm/hugetlb_vmemmap.c). In other @@ -222,7 +222,7 @@ static __always_inline bool hugetlb_free_vmemmap_enabled(void) */ static __always_inline const struct page *page_fixed_fake_head(const struct page *page) { - if (!hugetlb_free_vmemmap_enabled()) + if (!hugetlb_optimize_vmemmap_enabled()) return page; /* @@ -251,7 +251,7 @@ static inline const struct page *page_fixed_fake_head(const struct page *page) return page; } -static inline bool hugetlb_free_vmemmap_enabled(void) +static inline bool hugetlb_optimize_vmemmap_enabled(void) { return false; } diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 91b79b9d9e25..f25294973398 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -189,8 +189,8 @@ #define RESERVE_VMEMMAP_SIZE (RESERVE_VMEMMAP_NR << PAGE_SHIFT) DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, - hugetlb_free_vmemmap_enabled_key); -EXPORT_SYMBOL(hugetlb_free_vmemmap_enabled_key); + hugetlb_optimize_vmemmap_key); +EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); static int __init hugetlb_vmemmap_early_param(char *buf) { @@ -204,9 +204,9 @@ static int __init hugetlb_vmemmap_early_param(char *buf) return -EINVAL; if (!strcmp(buf, "on")) - static_branch_enable(&hugetlb_free_vmemmap_enabled_key); + static_branch_enable(&hugetlb_optimize_vmemmap_key); else if (!strcmp(buf, "off")) - static_branch_disable(&hugetlb_free_vmemmap_enabled_key); + static_branch_disable(&hugetlb_optimize_vmemmap_key); else return -EINVAL; @@ -282,7 +282,7 @@ void __init hugetlb_vmemmap_init(struct hstate *h) BUILD_BUG_ON(__NR_USED_SUBPAGE >= RESERVE_VMEMMAP_SIZE / sizeof(struct page)); - if (!hugetlb_free_vmemmap_enabled()) + if (!hugetlb_optimize_vmemmap_enabled()) return; vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT; -- Gitee From f1f5face0c06f0526c21122eb601f4e5482d612b Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Jul 2022 18:06:56 +0800 Subject: [PATCH 668/674] mm: hugetlb_vmemmap: cleanup CONFIG_HUGETLB_PAGE_FREE_VMEMMAP* mainline inclusion from mainline-v5.19-rc1 commit 47010c040dec8af6347ec6259104fc13f7e7e30a category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=47010c040dec8af6347ec6259104fc13f7e7e30a -------------------------------- The word of "free" is not expressive enough to express the feature of optimizing vmemmap pages associated with each HugeTLB, rename this keywork to "optimize". In this patch , cheanup configs to make code more expressive. Link: https://lkml.kernel.org/r/20220404074652.68024-4-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: Mike Kravetz Cc: David Hildenbrand Signed-off-by: Andrew Morton Conflicts: arch/arm64/configs/openeuler_defconfig arch/x86/configs/openeuler_defconfig Documentation/admin-guide/kernel-parameters.txt include/linux/hugetlb.h mm/Makefile Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- Documentation/admin-guide/kernel-parameters.txt | 5 ++++- Documentation/admin-guide/mm/hugetlbpage.rst | 2 +- arch/arm64/Kconfig | 2 +- arch/arm64/configs/openeuler_defconfig | 4 ++-- arch/arm64/mm/flush.c | 2 +- arch/x86/Kconfig | 2 +- arch/x86/configs/openeuler_defconfig | 4 ++-- arch/x86/mm/init_64.c | 2 +- fs/Kconfig | 16 ++++++++-------- include/linux/hugetlb.h | 2 +- include/linux/mm.h | 2 +- include/linux/page-flags.h | 6 +++--- mm/Makefile | 2 +- mm/hugetlb_vmemmap.c | 4 ++-- mm/hugetlb_vmemmap.h | 4 ++-- mm/sparse-vmemmap.c | 4 ++-- 16 files changed, 33 insertions(+), 30 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index e0957b73f63d..86a7ea4a9964 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1593,7 +1593,7 @@ Format: size[KMG] hugetlb_free_vmemmap= - [KNL] Reguires CONFIG_HUGETLB_PAGE_FREE_VMEMMAP + [KNL] Reguires CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP enabled. Allows heavy hugetlb users to free up some more memory (7 * PAGE_SIZE for each 2MB hugetlb page). @@ -1602,6 +1602,9 @@ on: enable the feature off: disable the feature + Built with CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON=y, + the default is on. + hugevmalloc [KNL,PPC,ARM64,X86] Requires CONFIG_HAVE_ARCH_HUGE_VMALLOC Format: { on | off } Default set by CONFIG_HUGE_VMALLOC_DEFAULT_ENABLED. diff --git a/Documentation/admin-guide/mm/hugetlbpage.rst b/Documentation/admin-guide/mm/hugetlbpage.rst index d70828c07658..0f8acc4a6cf0 100644 --- a/Documentation/admin-guide/mm/hugetlbpage.rst +++ b/Documentation/admin-guide/mm/hugetlbpage.rst @@ -164,7 +164,7 @@ default_hugepagesz will all result in 256 2M huge pages being allocated. Valid default huge page size is architecture dependent. hugetlb_free_vmemmap - When CONFIG_HUGETLB_PAGE_FREE_VMEMMAP is set, this enables freeing + When CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP is set, this enables optimizing unused vmemmap pages associated with each HugeTLB page. When multiple huge page sizes are supported, ``/proc/sys/vm/nr_hugepages`` diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index d9da5c4f91e0..c4f6c80ea976 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -81,7 +81,7 @@ config ARM64 select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT select ARCH_WANT_FRAME_POINTERS select ARCH_WANT_HUGE_PMD_SHARE if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36) - select ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP + select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP select ARCH_WANT_LD_ORPHAN_WARN select ARCH_WANT_RESERVE_CRASH_KERNEL if KEXEC_CORE select ARCH_HAS_UBSAN_SANITIZE_ALL diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index d246fd508ef6..593ac67497e3 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -6281,8 +6281,8 @@ CONFIG_TMPFS_XATTR=y # CONFIG_TMPFS_INODE64 is not set CONFIG_HUGETLBFS=y CONFIG_HUGETLB_PAGE=y -CONFIG_HUGETLB_PAGE_FREE_VMEMMAP=y -# CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON is not set +CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP=y +# CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON is not set CONFIG_MEMFD_CREATE=y CONFIG_ARCH_HAS_GIGANTIC_PAGE=y CONFIG_CONFIGFS_FS=y diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index 892e53e9c788..c7678e7df53a 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c @@ -72,7 +72,7 @@ void flush_dcache_page(struct page *page) /* * Only the head page's flags of HugeTLB can be cleared since the tail * vmemmap pages associated with each HugeTLB page are mapped with - * read-only when CONFIG_HUGETLB_PAGE_FREE_VMEMMAP is enabled (more + * read-only when CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP is enabled (more * details can refer to vmemmap_remap_pte()). Although * __sync_icache_dcache() only set PG_dcache_clean flag on the head * page struct, there is more than one page struct with PG_dcache_clean diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 661e05e1f762..f39cfb5a6535 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -103,7 +103,7 @@ config X86 select ARCH_WANT_DEFAULT_BPF_JIT if X86_64 select ARCH_WANTS_DYNAMIC_TASK_STRUCT select ARCH_WANT_HUGE_PMD_SHARE - select ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP if X86_64 + select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP if X86_64 select ARCH_WANT_LD_ORPHAN_WARN select ARCH_WANTS_THP_SWAP if X86_64 select BUILDTIME_TABLE_SORT diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index 3eac70518e6f..f013c7b95881 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -7370,8 +7370,8 @@ CONFIG_TMPFS_XATTR=y # CONFIG_TMPFS_INODE64 is not set CONFIG_HUGETLBFS=y CONFIG_HUGETLB_PAGE=y -CONFIG_HUGETLB_PAGE_FREE_VMEMMAP=y -# CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON is not set +CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP=y +# CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON is not set CONFIG_DYNAMIC_HUGETLB=y CONFIG_MEMFD_CREATE=y CONFIG_ARCH_HAS_GIGANTIC_PAGE=y diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 1eed5eaee41f..b1d5c05aeca8 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1225,7 +1225,7 @@ static struct kcore_list kcore_vsyscall; static void __init register_page_bootmem_info(void) { -#if defined(CONFIG_NUMA) || defined(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP) +#if defined(CONFIG_NUMA) || defined(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP) int i; for_each_online_node(i) diff --git a/fs/Kconfig b/fs/Kconfig index b60a7614cb16..aa097ca64ef6 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -242,22 +242,22 @@ config HUGETLB_PAGE # to enable the feature of minimizing overhead of struct page associated with # each HugeTLB page. # -config ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP +config ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP bool -config HUGETLB_PAGE_FREE_VMEMMAP +config HUGETLB_PAGE_OPTIMIZE_VMEMMAP def_bool HUGETLB_PAGE - depends on ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP + depends on ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP depends on SPARSEMEM_VMEMMAP -config HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON - bool "Default freeing vmemmap pages of HugeTLB to on" +config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON + bool "Default optimizing vmemmap pages of HugeTLB to on" default n - depends on HUGETLB_PAGE_FREE_VMEMMAP + depends on HUGETLB_PAGE_OPTIMIZE_VMEMMAP help - When using HUGETLB_PAGE_FREE_VMEMMAP, the freeing unused vmemmap + When using HUGETLB_PAGE_OPTIMIZE_VMEMMAP, the optimizing unused vmemmap pages associated with each HugeTLB page is default off. Say Y here - to enable freeing vmemmap pages of HugeTLB by default. It can then + to enable optimizing vmemmap pages of HugeTLB by default. It can then be disabled on the command line via hugetlb_free_vmemmap=off. config DYNAMIC_HUGETLB diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 218fc150d7f8..0dfe08439095 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -592,7 +592,7 @@ struct hstate { unsigned int free_huge_pages_node[MAX_NUMNODES]; unsigned int surplus_huge_pages_node[MAX_NUMNODES]; unsigned int resv_huge_pages_node[MAX_NUMNODES]; -#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP unsigned int optimize_vmemmap_pages; #endif #ifdef CONFIG_CGROUP_HUGETLB diff --git a/include/linux/mm.h b/include/linux/mm.h index efa3972a23bf..1ae73cc4b806 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3104,7 +3104,7 @@ static inline void print_vma_addr(char *prefix, unsigned long rip) } #endif -#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP int vmemmap_remap_free(unsigned long start, unsigned long end, unsigned long reuse); int vmemmap_remap_alloc(unsigned long start, unsigned long end, diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index d1cf17a71a12..26b36cac9307 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -194,13 +194,13 @@ enum pageflags { #ifndef __GENERATING_BOUNDS_H -#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP -DECLARE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP +DECLARE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON, hugetlb_optimize_vmemmap_key); static __always_inline bool hugetlb_optimize_vmemmap_enabled(void) { - return static_branch_maybe(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, + return static_branch_maybe(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON, &hugetlb_optimize_vmemmap_key); } diff --git a/mm/Makefile b/mm/Makefile index d2a6a786f915..e83233177c7a 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -71,7 +71,7 @@ obj-$(CONFIG_FRONTSWAP) += frontswap.o obj-$(CONFIG_ZSWAP) += zswap.o obj-$(CONFIG_HAS_DMA) += dmapool.o obj-$(CONFIG_HUGETLBFS) += hugetlb.o -obj-$(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP) += hugetlb_vmemmap.o +obj-$(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP) += hugetlb_vmemmap.o obj-$(CONFIG_DYNAMIC_HUGETLB) += dynamic_hugetlb.o obj-$(CONFIG_NUMA) += mempolicy.o obj-$(CONFIG_SPARSEMEM) += sparse.o diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index f25294973398..2655434a946b 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -188,7 +188,7 @@ #define RESERVE_VMEMMAP_NR 1U #define RESERVE_VMEMMAP_SIZE (RESERVE_VMEMMAP_NR << PAGE_SHIFT) -DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, +DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON, hugetlb_optimize_vmemmap_key); EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); @@ -276,7 +276,7 @@ void __init hugetlb_vmemmap_init(struct hstate *h) /* * There are only (RESERVE_VMEMMAP_SIZE / sizeof(struct page)) struct - * page structs that can be used when CONFIG_HUGETLB_PAGE_FREE_VMEMMAP, + * page structs that can be used when CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP, * so add a BUILD_BUG_ON to catch invalid usage of the tail struct page. */ BUILD_BUG_ON(__NR_USED_SUBPAGE >= diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h index 9760537849b5..109b0a53b6fe 100644 --- a/mm/hugetlb_vmemmap.h +++ b/mm/hugetlb_vmemmap.h @@ -10,7 +10,7 @@ #define _LINUX_HUGETLB_VMEMMAP_H #include -#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head); void hugetlb_vmemmap_free(struct hstate *h, struct page *head); void hugetlb_vmemmap_init(struct hstate *h); @@ -41,5 +41,5 @@ static inline unsigned int hugetlb_optimize_vmemmap_pages(struct hstate *h) { return 0; } -#endif /* CONFIG_HUGETLB_PAGE_FREE_VMEMMAP */ +#endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */ #endif /* _LINUX_HUGETLB_VMEMMAP_H */ diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index e5ed2680ec57..5b40a7473dc8 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -34,7 +34,7 @@ #include #include -#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP /** * struct vmemmap_remap_walk - walk vmemmap page table * @@ -420,7 +420,7 @@ int vmemmap_remap_alloc(unsigned long start, unsigned long end, return 0; } -#endif /* CONFIG_HUGETLB_PAGE_FREE_VMEMMAP */ +#endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */ /* * Allocate a block of memory to be used to back the virtual memory map -- Gitee From 301b00fff3e70b91a9bc47af2cabce76f48cdb75 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Thu, 28 Jul 2022 18:06:57 +0800 Subject: [PATCH 669/674] sysctl: add a new register_sysctl_init() interface mainline inclusion from mainline-v5.17-rc1 commit 3ddd9a808cee7284931312f2f3e854c9617f44b2 category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3ddd9a808cee7284931312f2f3e854c9617f44b2 -------------------------------- Patch series "sysctl: first set of kernel/sysctl cleanups", v2. Finally had time to respin the series of the work we had started last year on cleaning up the kernel/sysct.c kitchen sink. People keeps stuffing their sysctls in that file and this creates a maintenance burden. So this effort is aimed at placing sysctls where they actually belong. I'm going to split patches up into series as there is quite a bit of work. This first set adds register_sysctl_init() for uses of registerting a sysctl on the init path, adds const where missing to a few places, generalizes common values so to be more easy to share, and starts the move of a few kernel/sysctl.c out where they belong. The majority of rework on v2 in this first patch set is 0-day fixes. Eric Biederman's feedback is later addressed in subsequent patch sets. I'll only post the first two patch sets for now. We can address the rest once the first two patch sets get completely reviewed / Acked. This patch (of 9): The kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. Today though folks heavily rely on tables on kernel/sysctl.c so they can easily just extend this table with their needed sysctls. In order to help users move their sysctls out we need to provide a helper which can be used during code initialization. We special-case the initialization use of register_sysctl() since it *is* safe to fail, given all that sysctls do is provide a dynamic interface to query or modify at runtime an existing variable. So the use case of register_sysctl() on init should *not* stop if the sysctls don't end up getting registered. It would be counter productive to stop boot if a simple sysctl registration failed. Provide a helper for init then, and document the recommended init levels to use for callers of this routine. We will later use this in subsequent patches to start slimming down kernel/sysctl.c tables and moving sysctl registration to the code which actually needs these sysctls. [mcgrof@kernel.org: major commit log and documentation rephrasing also moved to fs/proc/proc_sysctl.c ] Link: https://lkml.kernel.org/r/20211123202347.818157-1-mcgrof@kernel.org Link: https://lkml.kernel.org/r/20211123202347.818157-2-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Reviewed-by: Kees Cook Cc: Iurii Zaikin Cc: "Eric W. Biederman" Cc: Peter Zijlstra Cc: Greg Kroah-Hartman Cc: Paul Turner Cc: Andy Shevchenko Cc: Sebastian Reichel Cc: Tetsuo Handa Cc: Petr Mladek Cc: Sergey Senozhatsky Cc: Qing Wang Cc: Benjamin LaHaise Cc: Al Viro Cc: Jan Kara Cc: Amir Goldstein Cc: Stephen Kitt Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Clemens Ladisch Cc: David Airlie Cc: Jani Nikula Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Phillip Potter Cc: Rodrigo Vivi Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- fs/proc/proc_sysctl.c | 33 +++++++++++++++++++++++++++++++++ include/linux/sysctl.h | 3 +++ 2 files changed, 36 insertions(+) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index ffed75f833b7..df435cd91a5b 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "internal.h" static const struct dentry_operations proc_sys_dentry_operations; @@ -1380,6 +1381,38 @@ struct ctl_table_header *register_sysctl(const char *path, struct ctl_table *tab } EXPORT_SYMBOL(register_sysctl); +/** + * __register_sysctl_init() - register sysctl table to path + * @path: path name for sysctl base + * @table: This is the sysctl table that needs to be registered to the path + * @table_name: The name of sysctl table, only used for log printing when + * registration fails + * + * The sysctl interface is used by userspace to query or modify at runtime + * a predefined value set on a variable. These variables however have default + * values pre-set. Code which depends on these variables will always work even + * if register_sysctl() fails. If register_sysctl() fails you'd just loose the + * ability to query or modify the sysctls dynamically at run time. Chances of + * register_sysctl() failing on init are extremely low, and so for both reasons + * this function does not return any error as it is used by initialization code. + * + * Context: Can only be called after your respective sysctl base path has been + * registered. So for instance, most base directories are registered early on + * init before init levels are processed through proc_sys_init() and + * sysctl_init(). + */ +void __init __register_sysctl_init(const char *path, struct ctl_table *table, + const char *table_name) +{ + struct ctl_table_header *hdr = register_sysctl(path, table); + + if (unlikely(!hdr)) { + pr_err("failed when register_sysctl %s to %s\n", table_name, path); + return; + } + kmemleak_not_leak(hdr); +} + static char *append_path(const char *path, char *pos, const char *name) { int namelen; diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 51298a4f4623..161eba9fd912 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -195,6 +195,9 @@ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, void unregister_sysctl_table(struct ctl_table_header * table); extern int sysctl_init(void); +extern void __register_sysctl_init(const char *path, struct ctl_table *table, + const char *table_name); +#define register_sysctl_init(path, table) __register_sysctl_init(path, table, #table) void do_sysctl_args(void); extern int pwrsw_enabled; -- Gitee From 9e38aa2f71a1de04192e4c55f788f9c36f27d0a0 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Jul 2022 18:06:58 +0800 Subject: [PATCH 670/674] mm: hugetlb_vmemmap: disable hugetlb_optimize_vmemmap when struct page crosses page boundaries mainline inclusion from mainline-v5.19-rc1 commit 0effdf461c5789be02d40c1868c70cc02ea24627 category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0effdf461c5789be02d40c1868c70cc02ea24627 -------------------------------- Patch series "add hugetlb_optimize_vmemmap sysctl", v11. This series aims to add hugetlb_optimize_vmemmap sysctl to enable or disable the feature of optimizing vmemmap pages associated with HugeTLB pages. This patch (of 4): If the size of "struct page" is not the power of two but with the feature of minimizing overhead of struct page associated with each HugeTLB is enabled, then the vmemmap pages of HugeTLB will be corrupted after remapping (panic is about to happen in theory). But this only exists when !CONFIG_MEMCG && !CONFIG_SLUB on x86_64. However, it is not a conventional configuration nowadays. So it is not a real word issue, just the result of a code review. But we cannot prevent anyone from configuring that combined configure. This hugetlb_optimize_vmemmap should be disable in this case to fix this issue. Link: https://lkml.kernel.org/r/20220512041142.39501-1-songmuchun@bytedance.com Link: https://lkml.kernel.org/r/20220512041142.39501-2-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Mike Kravetz Cc: David Hildenbrand Cc: Iurii Zaikin Cc: Jonathan Corbet Cc: Kees Cook Cc: Luis Chamberlain Cc: Masahiro Yamada Cc: Oscar Salvador Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- mm/hugetlb_vmemmap.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 2655434a946b..4b8e59eeb971 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -194,12 +194,6 @@ EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); static int __init hugetlb_vmemmap_early_param(char *buf) { - /* We cannot optimize if a "struct page" crosses page boundaries. */ - if (!is_power_of_2(sizeof(struct page))) { - pr_warn("cannot free vmemmap pages because \"struct page\" crosses page boundaries\n"); - return 0; - } - if (!buf) return -EINVAL; @@ -285,6 +279,12 @@ void __init hugetlb_vmemmap_init(struct hstate *h) if (!hugetlb_optimize_vmemmap_enabled()) return; + if (!is_power_of_2(sizeof(struct page))) { + pr_warn_once("cannot optimize vmemmap pages because \"struct page\" crosses page boundaries\n"); + static_branch_disable(&hugetlb_optimize_vmemmap_key); + return; + } + vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT; /* * The head page is not to be freed to buddy allocator, the other tail -- Gitee From 6233471f1e8239cde7f7e49a4d9545b17cab51ac Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Jul 2022 18:06:59 +0800 Subject: [PATCH 671/674] mm: hugetlb_vmemmap: use kstrtobool for hugetlb_vmemmap param parsing mainline inclusion from mainline-v5.19-rc1 commit 9c54c522bb76cbef480722bd44059e2ba8304bd2 category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=9c54c522bb76cbef480722bd44059e2ba8304bd2 -------------------------------- Use kstrtobool rather than open coding "on" and "off" parsing in mm/hugetlb_vmemmap.c, which is more powerful to handle all kinds of parameters like 'Yy1Nn0' or [oO][NnFf] for "on" and "off". Link: https://lkml.kernel.org/r/20220512041142.39501-4-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Mike Kravetz Cc: David Hildenbrand Cc: Iurii Zaikin Cc: Jonathan Corbet Cc: Kees Cook Cc: Luis Chamberlain Cc: Masahiro Yamada Cc: Oscar Salvador Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- Documentation/admin-guide/kernel-parameters.txt | 6 +++--- mm/hugetlb_vmemmap.c | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 86a7ea4a9964..247acf7fc837 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1597,10 +1597,10 @@ enabled. Allows heavy hugetlb users to free up some more memory (7 * PAGE_SIZE for each 2MB hugetlb page). - Format: { on | off (default) } + Format: { [oO][Nn]/Y/y/1 | [oO][Ff]/N/n/0 (default) } - on: enable the feature - off: disable the feature + [oO][Nn]/Y/y/1: enable the feature + [oO][Ff]/N/n/0: disable the feature Built with CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON=y, the default is on. diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 4b8e59eeb971..112e74504905 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -194,15 +194,15 @@ EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); static int __init hugetlb_vmemmap_early_param(char *buf) { - if (!buf) + bool enable; + + if (kstrtobool(buf, &enable)) return -EINVAL; - if (!strcmp(buf, "on")) + if (enable) static_branch_enable(&hugetlb_optimize_vmemmap_key); - else if (!strcmp(buf, "off")) - static_branch_disable(&hugetlb_optimize_vmemmap_key); else - return -EINVAL; + static_branch_disable(&hugetlb_optimize_vmemmap_key); return 0; } -- Gitee From 6b2239c22b56bddcdefd09fd58b6672f2b1f1566 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Jul 2022 18:07:00 +0800 Subject: [PATCH 672/674] mm: hugetlb_vmemmap: add hugetlb_optimize_vmemmap sysctl mainline inclusion from mainline-v5.19-rc1 commit 78f39084b41d287aedb2ea55f2c1895cfa11d61a category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=78f39084b41d287aedb2ea55f2c1895cfa11d61a -------------------------------- We must add hugetlb_free_vmemmap=on (or "off") to the boot cmdline and reboot the server to enable or disable the feature of optimizing vmemmap pages associated with HugeTLB pages. However, rebooting usually takes a long time. So add a sysctl to enable or disable the feature at runtime without rebooting. Why we need this? There are 3 use cases. 1) The feature of minimizing overhead of struct page associated with each HugeTLB is disabled by default without passing "hugetlb_free_vmemmap=on" to the boot cmdline. When we (ByteDance) deliver the servers to the users who want to enable this feature, they have to configure the grub (change boot cmdline) and reboot the servers, whereas rebooting usually takes a long time (we have thousands of servers). It's a very bad experience for the users. So we need a approach to enable this feature after rebooting. This is a use case in our practical environment. 2) Some use cases are that HugeTLB pages are allocated 'on the fly' instead of being pulled from the HugeTLB pool, those workloads would be affected with this feature enabled. Those workloads could be identified by the characteristics of they never explicitly allocating huge pages with 'nr_hugepages' but only set 'nr_overcommit_hugepages' and then let the pages be allocated from the buddy allocator at fault time. We can confirm it is a real use case from the commit 099730d67417. For those workloads, the page fault time could be ~2x slower than before. We suspect those users want to disable this feature if the system has enabled this before and they don't think the memory savings benefit is enough to make up for the performance drop. 3) If the workload which wants vmemmap pages to be optimized and the workload which wants to set 'nr_overcommit_hugepages' and does not want the extera overhead at fault time when the overcommitted pages be allocated from the buddy allocator are deployed in the same server. The user could enable this feature and set 'nr_hugepages' and 'nr_overcommit_hugepages', then disable the feature. In this case, the overcommited HugeTLB pages will not encounter the extra overhead at fault time. Link: https://lkml.kernel.org/r/20220512041142.39501-5-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Mike Kravetz Cc: Jonathan Corbet Cc: Luis Chamberlain Cc: Kees Cook Cc: Iurii Zaikin Cc: Oscar Salvador Cc: David Hildenbrand Cc: Masahiro Yamada Cc: Xiongchun Duan Signed-off-by: Andrew Morton Conflicts: include/linux/memory_hotplug.h mm/hugetlb_vmemmap.c mm/memory_hotplug.c Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- Documentation/admin-guide/sysctl/vm.rst | 39 +++++++++++ mm/hugetlb_vmemmap.c | 92 ++++++++++++++++++++++--- 2 files changed, 122 insertions(+), 9 deletions(-) diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index a5fbef4740c2..5de629b932ae 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -560,6 +560,45 @@ Change the minimum size of the hugepage pool. See Documentation/admin-guide/mm/hugetlbpage.rst +hugetlb_optimize_vmemmap +======================== + +This knob is not available when memory_hotplug.memmap_on_memory (kernel parameter) +is configured or the size of 'struct page' (a structure defined in +include/linux/mm_types.h) is not power of two (an unusual system config could +result in this). + +Enable (set to 1) or disable (set to 0) the feature of optimizing vmemmap pages +associated with each HugeTLB page. + +Once enabled, the vmemmap pages of subsequent allocation of HugeTLB pages from +buddy allocator will be optimized (7 pages per 2MB HugeTLB page and 4095 pages +per 1GB HugeTLB page), whereas already allocated HugeTLB pages will not be +optimized. When those optimized HugeTLB pages are freed from the HugeTLB pool +to the buddy allocator, the vmemmap pages representing that range needs to be +remapped again and the vmemmap pages discarded earlier need to be rellocated +again. If your use case is that HugeTLB pages are allocated 'on the fly' (e.g. +never explicitly allocating HugeTLB pages with 'nr_hugepages' but only set +'nr_overcommit_hugepages', those overcommitted HugeTLB pages are allocated 'on +the fly') instead of being pulled from the HugeTLB pool, you should weigh the +benefits of memory savings against the more overhead (~2x slower than before) +of allocation or freeing HugeTLB pages between the HugeTLB pool and the buddy +allocator. Another behavior to note is that if the system is under heavy memory +pressure, it could prevent the user from freeing HugeTLB pages from the HugeTLB +pool to the buddy allocator since the allocation of vmemmap pages could be +failed, you have to retry later if your system encounter this situation. + +Once disabled, the vmemmap pages of subsequent allocation of HugeTLB pages from +buddy allocator will not be optimized meaning the extra overhead at allocation +time from buddy allocator disappears, whereas already optimized HugeTLB pages +will not be affected. If you want to make sure there are no optimized HugeTLB +pages, you can set "nr_hugepages" to 0 first and then disable this. Note that +writing 0 to nr_hugepages will make any "in use" HugeTLB pages become surplus +pages. So, those surplus pages are still optimized until they are no longer +in use. You would need to wait for those surplus pages to be released before +there are no optimized pages in the system. + + nr_hugepages_mempolicy ====================== diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 112e74504905..4340bf6c5551 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -188,21 +188,40 @@ #define RESERVE_VMEMMAP_NR 1U #define RESERVE_VMEMMAP_SIZE (RESERVE_VMEMMAP_NR << PAGE_SHIFT) +enum vmemmap_optimize_mode { + VMEMMAP_OPTIMIZE_OFF, + VMEMMAP_OPTIMIZE_ON, +}; + DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON, hugetlb_optimize_vmemmap_key); EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); +static enum vmemmap_optimize_mode vmemmap_optimize_mode = + IS_ENABLED(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON); + +static void vmemmap_optimize_mode_switch(enum vmemmap_optimize_mode to) +{ + if (vmemmap_optimize_mode == to) + return; + + if (to == VMEMMAP_OPTIMIZE_OFF) + static_branch_dec(&hugetlb_optimize_vmemmap_key); + else + static_branch_inc(&hugetlb_optimize_vmemmap_key); + WRITE_ONCE(vmemmap_optimize_mode, to); +} + static int __init hugetlb_vmemmap_early_param(char *buf) { bool enable; + enum vmemmap_optimize_mode mode; if (kstrtobool(buf, &enable)) return -EINVAL; - if (enable) - static_branch_enable(&hugetlb_optimize_vmemmap_key); - else - static_branch_disable(&hugetlb_optimize_vmemmap_key); + mode = enable ? VMEMMAP_OPTIMIZE_ON : VMEMMAP_OPTIMIZE_OFF; + vmemmap_optimize_mode_switch(mode); return 0; } @@ -235,8 +254,10 @@ int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head) */ ret = vmemmap_remap_alloc(vmemmap_addr, vmemmap_end, vmemmap_reuse, GFP_KERNEL | __GFP_NORETRY | __GFP_THISNODE); - if (!ret) + if (!ret) { ClearHPageVmemmapOptimized(head); + static_branch_dec(&hugetlb_optimize_vmemmap_key); + } return ret; } @@ -250,6 +271,11 @@ void hugetlb_vmemmap_free(struct hstate *h, struct page *head) if (!vmemmap_pages) return; + if (READ_ONCE(vmemmap_optimize_mode) == VMEMMAP_OPTIMIZE_OFF) + return; + + static_branch_inc(&hugetlb_optimize_vmemmap_key); + vmemmap_addr += RESERVE_VMEMMAP_SIZE; vmemmap_end = vmemmap_addr + (vmemmap_pages << PAGE_SHIFT); vmemmap_reuse = vmemmap_addr - PAGE_SIZE; @@ -259,7 +285,9 @@ void hugetlb_vmemmap_free(struct hstate *h, struct page *head) * to the page which @vmemmap_reuse is mapped to, then free the pages * which the range [@vmemmap_addr, @vmemmap_end] is mapped to. */ - if (!vmemmap_remap_free(vmemmap_addr, vmemmap_end, vmemmap_reuse)) + if (vmemmap_remap_free(vmemmap_addr, vmemmap_end, vmemmap_reuse)) + static_branch_dec(&hugetlb_optimize_vmemmap_key); + else SetHPageVmemmapOptimized(head); } @@ -276,9 +304,6 @@ void __init hugetlb_vmemmap_init(struct hstate *h) BUILD_BUG_ON(__NR_USED_SUBPAGE >= RESERVE_VMEMMAP_SIZE / sizeof(struct page)); - if (!hugetlb_optimize_vmemmap_enabled()) - return; - if (!is_power_of_2(sizeof(struct page))) { pr_warn_once("cannot optimize vmemmap pages because \"struct page\" crosses page boundaries\n"); static_branch_disable(&hugetlb_optimize_vmemmap_key); @@ -300,3 +325,52 @@ void __init hugetlb_vmemmap_init(struct hstate *h) pr_info("can optimize %d vmemmap pages for %s\n", h->optimize_vmemmap_pages, h->name); } + +#ifdef CONFIG_PROC_SYSCTL +static int hugetlb_optimize_vmemmap_handler(struct ctl_table *table, int write, + void *buffer, size_t *length, + loff_t *ppos) +{ + int ret; + enum vmemmap_optimize_mode mode; + static DEFINE_MUTEX(sysctl_mutex); + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + mutex_lock(&sysctl_mutex); + mode = vmemmap_optimize_mode; + table->data = &mode; + ret = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (write && !ret) + vmemmap_optimize_mode_switch(mode); + mutex_unlock(&sysctl_mutex); + + return ret; +} + +static struct ctl_table hugetlb_vmemmap_sysctls[] = { + { + .procname = "hugetlb_optimize_vmemmap", + .maxlen = sizeof(enum vmemmap_optimize_mode), + .mode = 0644, + .proc_handler = hugetlb_optimize_vmemmap_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { } +}; + +static __init int hugetlb_vmemmap_sysctls_init(void) +{ + /* + * If "memory_hotplug.memmap_on_memory" is enabled or "struct page" + * crosses page boundaries, the vmemmap pages cannot be optimized. + */ + if (is_power_of_2(sizeof(struct page))) + register_sysctl_init("vm", hugetlb_vmemmap_sysctls); + + return 0; +} +late_initcall(hugetlb_vmemmap_sysctls_init); +#endif /* CONFIG_PROC_SYSCTL */ -- Gitee From 1d2bbc77596574b078cd2b8dec8725d8c38617ea Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Jul 2022 18:07:01 +0800 Subject: [PATCH 673/674] mm: hugetlb_vmemmap: fix CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON mainline inclusion from mainline-v5.19-rc1 commit 0111def915b280c64c05f73f01b59ca404255aa3 category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0111def915b280c64c05f73f01b59ca404255aa3 -------------------------------- The following: commit 47010c040dec ("mm: hugetlb_vmemmap: cleanup CONFIG_HUGETLB_PAGE_FREE_VMEMMAP*") forgot to update CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON used in vmemmap_optimize_mode to CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON. The result is we cannot enable hugetlb_optimize_vmemmap at boot time when we configure CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON. Fix it. Link: https://lkml.kernel.org/r/20220527081948.68832-1-songmuchun@bytedance.com Fixes: 47010c040dec ("mm: hugetlb_vmemmap: cleanup CONFIG_HUGETLB_PAGE_FREE_VMEMMAP*") Signed-off-by: Muchun Song Reported-by: Vlastimil Babka Acked-by: Vlastimil Babka Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- mm/hugetlb_vmemmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 4340bf6c5551..e9f63cb9e3d4 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -198,7 +198,7 @@ DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON, EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); static enum vmemmap_optimize_mode vmemmap_optimize_mode = - IS_ENABLED(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON); + IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON); static void vmemmap_optimize_mode_switch(enum vmemmap_optimize_mode to) { -- Gitee From f1cfb6175ca81adf5ad5448ea4ad8cd205acf96b Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Thu, 28 Jul 2022 18:07:02 +0800 Subject: [PATCH 674/674] mm: hugetlb_vmemmap: disable hugetlb_vmemmap when dynamic hugetlb is enabled hulk inclusion category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA -------------------------------- Disable hugetlb_vmemmap when dynamic hugetlb is enabled. By the way, fix a similar spelling error. Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- mm/dynamic_hugetlb.c | 4 ++-- mm/huge_memory.c | 2 +- mm/hugetlb_vmemmap.c | 11 ++++++++++- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/mm/dynamic_hugetlb.c b/mm/dynamic_hugetlb.c index f8ebc8ab7d60..c1b968a7e668 100644 --- a/mm/dynamic_hugetlb.c +++ b/mm/dynamic_hugetlb.c @@ -1150,6 +1150,6 @@ static int __init dynamic_hugetlb_setup(char *s) { if (!strcmp(s, "on")) enable_dhugetlb = true; - return 1; + return 0; } -__setup("dynamic_hugetlb=", dynamic_hugetlb_setup); +early_param("dynamic_hugetlb", dynamic_hugetlb_setup); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index bfe079e294cb..79c855b5adad 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -401,7 +401,7 @@ static int __init hugepage_init(void) */ if (enable_dhugetlb) { transparent_hugepage_flags = 0; - pr_info("transparent hugepage is disabled due to confilct with dynamic hugetlb\n"); + pr_info("transparent hugepage is disabled due to conflict with dynamic hugetlb\n"); return -EINVAL; } diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index e9f63cb9e3d4..7ec8560d267d 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -176,6 +176,7 @@ */ #define pr_fmt(fmt) "HugeTLB: " fmt +#include #include "hugetlb_vmemmap.h" /* @@ -304,6 +305,12 @@ void __init hugetlb_vmemmap_init(struct hstate *h) BUILD_BUG_ON(__NR_USED_SUBPAGE >= RESERVE_VMEMMAP_SIZE / sizeof(struct page)); + if (enable_dhugetlb) { + pr_warn_once("cannot optimize vmemmap pages due to conflict with dynamic hugetlb\n"); + static_branch_disable(&hugetlb_optimize_vmemmap_key); + return; + } + if (!is_power_of_2(sizeof(struct page))) { pr_warn_once("cannot optimize vmemmap pages because \"struct page\" crosses page boundaries\n"); static_branch_disable(&hugetlb_optimize_vmemmap_key); @@ -366,8 +373,10 @@ static __init int hugetlb_vmemmap_sysctls_init(void) /* * If "memory_hotplug.memmap_on_memory" is enabled or "struct page" * crosses page boundaries, the vmemmap pages cannot be optimized. + * If "dynamic hugetlb" is enabled, the vmemmap pages cannot be + * optimized. */ - if (is_power_of_2(sizeof(struct page))) + if (is_power_of_2(sizeof(struct page)) && !enable_dhugetlb) register_sysctl_init("vm", hugetlb_vmemmap_sysctls); return 0; -- Gitee