From 84ddf6b0e04d8c0ec40735094a58e35ef3623fe5 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 13 Nov 2020 00:01:17 +0200
Subject: [PATCH 001/674] x86/mm: Signal SIGSEGV with PF_SGX

mainline inclusion
from mainline-5.11
commit 74faeee06db81a06add0def6a394210c8fef0ab7
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit 74faeee06db8 x86/mm: Signal SIGSEGV with PF_SGX.
Backport for SGX Foundations support

--------------------------------

The x86 architecture has a set of page fault error codes.  These indicate
things like whether the fault occurred from a write, or whether it
originated in userspace.

The SGX hardware architecture has its own per-page memory management
metadata (EPCM) [*] and hardware which is separate from the normal x86 MMU.
The architecture has a new page fault error code: PF_SGX.  This new error
code bit is set whenever a page fault occurs as the result of the SGX MMU.

These faults occur for a variety of reasons.  For instance, an access
attempt to enclave memory from outside the enclave causes a PF_SGX fault.
PF_SGX would also be set for permission conflicts, such as if a write to an
enclave page occurs and the page is marked read-write in the x86 page
tables but is read-only in the EPCM.

These faults do not always indicate errors, though.  SGX pages are
encrypted with a key that is destroyed at hardware reset, including
suspend. Throwing a SIGSEGV allows user space software to react and recover
when these events occur.

Include PF_SGX in the PF error codes list and throw SIGSEGV when it is
encountered.

[*] Intel SDM: 36.5.1 Enclave Page Cache Map (EPCM)

 [ bp: Add bit 15 to the comment above enum x86_pf_error_code too. ]

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Jethro Beekman <jethro@fortanix.com>
Link: https://lkml.kernel.org/r/20201112220135.165028-7-jarkko@kernel.org
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/include/asm/trap_pf.h |  2 ++
 arch/x86/mm/fault.c            | 12 ++++++++++++
 2 files changed, 14 insertions(+)

diff --git a/arch/x86/include/asm/trap_pf.h b/arch/x86/include/asm/trap_pf.h
index 305bc1214aef..10b1de500ab1 100644
--- a/arch/x86/include/asm/trap_pf.h
+++ b/arch/x86/include/asm/trap_pf.h
@@ -11,6 +11,7 @@
  *   bit 3 ==				1: use of reserved bit detected
  *   bit 4 ==				1: fault was an instruction fetch
  *   bit 5 ==				1: protection keys block access
+ *   bit 15 ==				1: SGX MMU page-fault
  */
 enum x86_pf_error_code {
 	X86_PF_PROT	=		1 << 0,
@@ -19,6 +20,7 @@ enum x86_pf_error_code {
 	X86_PF_RSVD	=		1 << 3,
 	X86_PF_INSTR	=		1 << 4,
 	X86_PF_PK	=		1 << 5,
+	X86_PF_SGX	=		1 << 15,
 };
 
 #endif /* _ASM_X86_TRAP_PF_H */
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 34112c63b347..058ff3f6944c 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1123,6 +1123,18 @@ access_error(unsigned long error_code, struct vm_area_struct *vma)
 	if (error_code & X86_PF_PK)
 		return 1;
 
+	/*
+	 * SGX hardware blocked the access.  This usually happens
+	 * when the enclave memory contents have been destroyed, like
+	 * after a suspend/resume cycle. In any case, the kernel can't
+	 * fix the cause of the fault.  Handle the fault as an access
+	 * error even in cases where no actual access violation
+	 * occurred.  This allows userspace to rebuild the enclave in
+	 * response to the signal.
+	 */
+	if (unlikely(error_code & X86_PF_SGX))
+		return 1;
+
 	/*
 	 * Make sure to check the VMA so that we do not perform
 	 * faults just to hit a X86_PF_PK as soon as we fill in a
-- 
Gitee


From 413a8d39e19850afc962b34f23deb0728719fc6e Mon Sep 17 00:00:00 2001
From: Kai Huang <kai.huang@intel.com>
Date: Fri, 19 Mar 2021 20:22:17 +1300
Subject: [PATCH 002/674] x86/cpufeatures: Make SGX_LC feature bit depend on
 SGX bit

mainline inclusion
from mainline-5.13
commit e9a15a40e857fc6ccfbb05fec7b184e9003057df
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit e9a15a40e857 x86/cpufeatures: Make SGX_LC feature bit
depend on SGX bit.
Backport for SGX virtualization support

--------------------------------

Move SGX_LC feature bit to CPUID dependency table to make clearing all
SGX feature bits easier. Also remove clear_sgx_caps() since it is just
a wrapper of setup_clear_cpu_cap(X86_FEATURE_SGX) now.

Suggested-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Sean Christopherson <seanjc@google.com>
Acked-by: Dave Hansen <dave.hansen@intel.com>
Acked-by: Jarkko Sakkinen <jarkko@kernel.org>
Link: https://lkml.kernel.org/r/5d4220fd0a39f52af024d3fa166231c1d498dd10.1616136308.git.kai.huang@intel.com
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/kernel/cpu/cpuid-deps.c |  1 +
 arch/x86/kernel/cpu/feat_ctl.c   | 12 +++---------
 2 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
index d502241995a3..b34632ea6d70 100644
--- a/arch/x86/kernel/cpu/cpuid-deps.c
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -71,6 +71,7 @@ static const struct cpuid_dep cpuid_deps[] = {
 	{ X86_FEATURE_AVX512_BF16,		X86_FEATURE_AVX512VL  },
 	{ X86_FEATURE_ENQCMD,			X86_FEATURE_XSAVES    },
 	{ X86_FEATURE_PER_THREAD_MBA,		X86_FEATURE_MBA       },
+	{ X86_FEATURE_SGX_LC,			X86_FEATURE_SGX	      },
 	{}
 };
 
diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c
index 3b1b01f2b248..27533a6e04fa 100644
--- a/arch/x86/kernel/cpu/feat_ctl.c
+++ b/arch/x86/kernel/cpu/feat_ctl.c
@@ -93,15 +93,9 @@ static void init_vmx_capabilities(struct cpuinfo_x86 *c)
 }
 #endif /* CONFIG_X86_VMX_FEATURE_NAMES */
 
-static void clear_sgx_caps(void)
-{
-	setup_clear_cpu_cap(X86_FEATURE_SGX);
-	setup_clear_cpu_cap(X86_FEATURE_SGX_LC);
-}
-
 static int __init nosgx(char *str)
 {
-	clear_sgx_caps();
+	setup_clear_cpu_cap(X86_FEATURE_SGX);
 
 	return 0;
 }
@@ -116,7 +110,7 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c)
 
 	if (rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr)) {
 		clear_cpu_cap(c, X86_FEATURE_VMX);
-		clear_sgx_caps();
+		clear_cpu_cap(c, X86_FEATURE_SGX);
 		return;
 	}
 
@@ -177,6 +171,6 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c)
 	    !(msr & FEAT_CTL_SGX_LC_ENABLED) || !enable_sgx) {
 		if (enable_sgx)
 			pr_err_once("SGX disabled by BIOS\n");
-		clear_sgx_caps();
+		clear_cpu_cap(c, X86_FEATURE_SGX);
 	}
 }
-- 
Gitee


From 4a5aef2a96bdfe125d9f893aec4d226f573deb5c Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 19 Mar 2021 20:22:18 +1300
Subject: [PATCH 003/674] x86/cpufeatures: Add SGX1 and SGX2 sub-features

mainline inclusion
from mainline-5.13
commit b8921dccf3b25798409d35155b5d127085de72c2
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit b8921dccf3b2 x86/cpufeatures: Add SGX1 and SGX2 sub-features.
Backport for SGX virtualization support

--------------------------------

Add SGX1 and SGX2 feature flags, via CPUID.0x12.0x0.EAX, as scattered
features, since adding a new leaf for only two bits would be wasteful.
As part of virtualizing SGX, KVM will expose the SGX CPUID leafs to its
guest, and to do so correctly needs to query hardware and kernel support
for SGX1 and SGX2.

Suppress both SGX1 and SGX2 from /proc/cpuinfo. SGX1 basically means
SGX, and for SGX2 there is no concrete use case of using it in
/proc/cpuinfo.

Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Dave Hansen <dave.hansen@intel.com>
Acked-by: Jarkko Sakkinen <jarkko@kernel.org>
Link: https://lkml.kernel.org/r/d787827dbfca6b3210ac3e432e3ac1202727e786.1616136308.git.kai.huang@intel.com
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/include/asm/cpufeatures.h | 2 ++
 arch/x86/kernel/cpu/cpuid-deps.c   | 2 ++
 arch/x86/kernel/cpu/scattered.c    | 2 ++
 3 files changed, 6 insertions(+)

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 4da419226377..54966a376b99 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -291,6 +291,8 @@
 #define X86_FEATURE_FENCE_SWAPGS_KERNEL	(11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */
 #define X86_FEATURE_SPLIT_LOCK_DETECT	(11*32+ 6) /* #AC for split lock */
 #define X86_FEATURE_PER_THREAD_MBA	(11*32+ 7) /* "" Per-thread Memory Bandwidth Allocation */
+#define X86_FEATURE_SGX1		(11*32+ 8) /* "" Basic SGX */
+#define X86_FEATURE_SGX2		(11*32+ 9) /* "" SGX Enclave Dynamic Memory Management (EDMM) */
 
 /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
 #define X86_FEATURE_AVX512_BF16		(12*32+ 5) /* AVX512 BFLOAT16 instructions */
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
index b34632ea6d70..28c75066815c 100644
--- a/arch/x86/kernel/cpu/cpuid-deps.c
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -72,6 +72,8 @@ static const struct cpuid_dep cpuid_deps[] = {
 	{ X86_FEATURE_ENQCMD,			X86_FEATURE_XSAVES    },
 	{ X86_FEATURE_PER_THREAD_MBA,		X86_FEATURE_MBA       },
 	{ X86_FEATURE_SGX_LC,			X86_FEATURE_SGX	      },
+	{ X86_FEATURE_SGX1,			X86_FEATURE_SGX       },
+	{ X86_FEATURE_SGX2,			X86_FEATURE_SGX1      },
 	{}
 };
 
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 866c9a9bcdee..839b54a08e09 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -36,6 +36,8 @@ static const struct cpuid_bit cpuid_bits[] = {
 	{ X86_FEATURE_CDP_L2,		CPUID_ECX,  2, 0x00000010, 2 },
 	{ X86_FEATURE_MBA,		CPUID_EBX,  3, 0x00000010, 0 },
 	{ X86_FEATURE_PER_THREAD_MBA,	CPUID_ECX,  0, 0x00000010, 3 },
+	{ X86_FEATURE_SGX1,		CPUID_EAX,  0, 0x00000012, 0 },
+	{ X86_FEATURE_SGX2,		CPUID_EAX,  1, 0x00000012, 0 },
 	{ X86_FEATURE_HW_PSTATE,	CPUID_EDX,  7, 0x80000007, 0 },
 	{ X86_FEATURE_CPB,		CPUID_EDX,  9, 0x80000007, 0 },
 	{ X86_FEATURE_PROC_FEEDBACK,    CPUID_EDX, 11, 0x80000007, 0 },
-- 
Gitee


From e230e731c6d6ec793fb02b448508920d28ef2bca Mon Sep 17 00:00:00 2001
From: Kai Huang <kai.huang@intel.com>
Date: Thu, 25 Mar 2021 22:30:57 +1300
Subject: [PATCH 004/674] x86/sgx: Wipe out EREMOVE from sgx_free_epc_page()

mainline inclusion
from mainline-5.13
commit b0c7459be0670fabe080e30906ba9fe62df5e02c
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit b0c7459be067 x86/sgx: Wipe out EREMOVE from
sgx_free_epc_page().
Backport for SGX virtualization support

--------------------------------

EREMOVE takes a page and removes any association between that page and
an enclave. It must be run on a page before it can be added into another
enclave. Currently, EREMOVE is run as part of pages being freed into the
SGX page allocator. It is not expected to fail, as it would indicate a
use-after-free of EPC pages. Rather than add the page back to the pool
of available EPC pages, the kernel intentionally leaks the page to avoid
additional errors in the future.

However, KVM does not track how guest pages are used, which means that
SGX virtualization use of EREMOVE might fail. Specifically, it is
legitimate that EREMOVE returns SGX_CHILD_PRESENT for EPC assigned to
KVM guest, because KVM/kernel doesn't track SECS pages.

To allow SGX/KVM to introduce a more permissive EREMOVE helper and
to let the SGX virtualization code use the allocator directly, break
out the EREMOVE call from the SGX page allocator. Rename the original
sgx_free_epc_page() to sgx_encl_free_epc_page(), indicating that
it is used to free an EPC page assigned to a host enclave. Replace
sgx_free_epc_page() with sgx_encl_free_epc_page() in all call sites so
there's no functional change.

At the same time, improve the error message when EREMOVE fails, and
add documentation to explain to the user what that failure means and
to suggest to the user what to do when this bug happens in the case it
happens.

 [ bp: Massage commit message, fix typos and sanitize text, simplify. ]

Signed-off-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Link: https://lkml.kernel.org/r/20210325093057.122834-1-kai.huang@intel.com
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 Documentation/x86/sgx.rst       | 25 +++++++++++++++++++++++++
 arch/x86/kernel/cpu/sgx/encl.c  | 31 ++++++++++++++++++++++++++-----
 arch/x86/kernel/cpu/sgx/encl.h  |  1 +
 arch/x86/kernel/cpu/sgx/ioctl.c |  6 +++---
 arch/x86/kernel/cpu/sgx/main.c  | 14 +++++---------
 arch/x86/kernel/cpu/sgx/sgx.h   |  4 ++++
 6 files changed, 64 insertions(+), 17 deletions(-)

diff --git a/Documentation/x86/sgx.rst b/Documentation/x86/sgx.rst
index eaee1368b4fd..f90076e67cde 100644
--- a/Documentation/x86/sgx.rst
+++ b/Documentation/x86/sgx.rst
@@ -209,3 +209,28 @@ An application may be loaded into a container enclave which is specially
 configured with a library OS and run-time which permits the application to run.
 The enclave run-time and library OS work together to execute the application
 when a thread enters the enclave.
+
+Impact of Potential Kernel SGX Bugs
+===================================
+
+EPC leaks
+---------
+
+When EPC page leaks happen, a WARNING like this is shown in dmesg:
+
+"EREMOVE returned ... and an EPC page was leaked.  SGX may become unusable..."
+
+This is effectively a kernel use-after-free of an EPC page, and due
+to the way SGX works, the bug is detected at freeing. Rather than
+adding the page back to the pool of available EPC pages, the kernel
+intentionally leaks the page to avoid additional errors in the future.
+
+When this happens, the kernel will likely soon leak more EPC pages, and
+SGX will likely become unusable because the memory available to SGX is
+limited. However, while this may be fatal to SGX, the rest of the kernel
+is unlikely to be impacted and should continue to work.
+
+As a result, when this happpens, user should stop running any new
+SGX workloads, (or just any new workloads), and migrate all valuable
+workloads. Although a machine reboot can recover all EPC memory, the bug
+should be reported to Linux developers.
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
index 3aaa709b5054..0c9f6cdd34a4 100644
--- a/arch/x86/kernel/cpu/sgx/encl.c
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -212,7 +212,7 @@ static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page,
 
 	ret = __sgx_encl_eldu(encl_page, epc_page, secs_page);
 	if (ret) {
-		sgx_free_epc_page(epc_page);
+		sgx_encl_free_epc_page(epc_page);
 		return ERR_PTR(ret);
 	}
 
@@ -538,7 +538,7 @@ void sgx_encl_release(struct kref *ref)
 			if (sgx_unmark_page_reclaimable(entry->epc_page))
 				continue;
 
-			sgx_free_epc_page(entry->epc_page);
+			sgx_encl_free_epc_page(entry->epc_page);
 			encl->secs_child_cnt--;
 			entry->epc_page = NULL;
 		}
@@ -549,7 +549,7 @@ void sgx_encl_release(struct kref *ref)
 	xa_destroy(&encl->page_array);
 
 	if (!encl->secs_child_cnt && encl->secs.epc_page) {
-		sgx_free_epc_page(encl->secs.epc_page);
+		sgx_encl_free_epc_page(encl->secs.epc_page);
 		encl->secs.epc_page = NULL;
 	}
 
@@ -557,7 +557,7 @@ void sgx_encl_release(struct kref *ref)
 		va_page = list_first_entry(&encl->va_pages, struct sgx_va_page,
 					   list);
 		list_del(&va_page->list);
-		sgx_free_epc_page(va_page->epc_page);
+		sgx_encl_free_epc_page(va_page->epc_page);
 		kfree(va_page);
 	}
 
@@ -818,7 +818,7 @@ struct sgx_epc_page *sgx_alloc_va_page(void)
 	ret = __epa(sgx_get_epc_virt_addr(epc_page));
 	if (ret) {
 		WARN_ONCE(1, "EPA returned %d (0x%x)", ret, ret);
-		sgx_free_epc_page(epc_page);
+		sgx_encl_free_epc_page(epc_page);
 		return ERR_PTR(-EFAULT);
 	}
 
@@ -867,3 +867,24 @@ bool sgx_va_page_full(struct sgx_va_page *va_page)
 
 	return slot == SGX_VA_SLOT_COUNT;
 }
+
+/**
+ * sgx_encl_free_epc_page - free an EPC page assigned to an enclave
+ * @page:	EPC page to be freed
+ *
+ * Free an EPC page assigned to an enclave. It does EREMOVE for the page, and
+ * only upon success, it puts the page back to free page list.  Otherwise, it
+ * gives a WARNING to indicate page is leaked.
+ */
+void sgx_encl_free_epc_page(struct sgx_epc_page *page)
+{
+	int ret;
+
+	WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED);
+
+	ret = __eremove(sgx_get_epc_virt_addr(page));
+	if (WARN_ONCE(ret, EREMOVE_ERROR_MESSAGE, ret, ret))
+		return;
+
+	sgx_free_epc_page(page);
+}
diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h
index d8d30ccbef4c..6e74f85b6264 100644
--- a/arch/x86/kernel/cpu/sgx/encl.h
+++ b/arch/x86/kernel/cpu/sgx/encl.h
@@ -115,5 +115,6 @@ struct sgx_epc_page *sgx_alloc_va_page(void);
 unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page);
 void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset);
 bool sgx_va_page_full(struct sgx_va_page *va_page);
+void sgx_encl_free_epc_page(struct sgx_epc_page *page);
 
 #endif /* _X86_ENCL_H */
diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c
index 2e10367ea66c..354e309fcdb7 100644
--- a/arch/x86/kernel/cpu/sgx/ioctl.c
+++ b/arch/x86/kernel/cpu/sgx/ioctl.c
@@ -47,7 +47,7 @@ static void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page)
 	encl->page_cnt--;
 
 	if (va_page) {
-		sgx_free_epc_page(va_page->epc_page);
+		sgx_encl_free_epc_page(va_page->epc_page);
 		list_del(&va_page->list);
 		kfree(va_page);
 	}
@@ -117,7 +117,7 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs)
 	return 0;
 
 err_out:
-	sgx_free_epc_page(encl->secs.epc_page);
+	sgx_encl_free_epc_page(encl->secs.epc_page);
 	encl->secs.epc_page = NULL;
 
 err_out_backing:
@@ -365,7 +365,7 @@ static int sgx_encl_add_page(struct sgx_encl *encl, unsigned long src,
 	mmap_read_unlock(current->mm);
 
 err_out_free:
-	sgx_free_epc_page(epc_page);
+	sgx_encl_free_epc_page(epc_page);
 	kfree(encl_page);
 
 	return ret;
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 13a7599ce7d4..b227629b1e9c 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -294,7 +294,7 @@ static void sgx_reclaimer_write(struct sgx_epc_page *epc_page,
 
 		sgx_encl_ewb(encl->secs.epc_page, &secs_backing);
 
-		sgx_free_epc_page(encl->secs.epc_page);
+		sgx_encl_free_epc_page(encl->secs.epc_page);
 		encl->secs.epc_page = NULL;
 
 		sgx_encl_put_backing(&secs_backing, true);
@@ -609,19 +609,15 @@ struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim)
  * sgx_free_epc_page() - Free an EPC page
  * @page:	an EPC page
  *
- * Call EREMOVE for an EPC page and insert it back to the list of free pages.
+ * Put the EPC page back to the list of free pages. It's the caller's
+ * responsibility to make sure that the page is in uninitialized state. In other
+ * words, do EREMOVE, EWB or whatever operation is necessary before calling
+ * this function.
  */
 void sgx_free_epc_page(struct sgx_epc_page *page)
 {
 	struct sgx_epc_section *section = &sgx_epc_sections[page->section];
 	struct sgx_numa_node *node = section->node;
-	int ret;
-
-	WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED);
-
-	ret = __eremove(sgx_get_epc_virt_addr(page));
-	if (WARN_ONCE(ret, "EREMOVE returned %d (0x%x)", ret, ret))
-		return;
 
 	spin_lock(&node->lock);
 
diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h
index 2a2b5c857451..65b2e88dbedd 100644
--- a/arch/x86/kernel/cpu/sgx/sgx.h
+++ b/arch/x86/kernel/cpu/sgx/sgx.h
@@ -13,6 +13,10 @@
 #undef pr_fmt
 #define pr_fmt(fmt) "sgx: " fmt
 
+#define EREMOVE_ERROR_MESSAGE \
+	"EREMOVE returned %d (0x%x) and an EPC page was leaked. SGX may become unusable. " \
+	"Refer to Documentation/x86/sgx.rst for more information."
+
 #define SGX_MAX_EPC_SECTIONS		8
 #define SGX_EEXTEND_BLOCK_SIZE		256
 #define SGX_NR_TO_SCAN			16
-- 
Gitee


From afb84915d55a274c8723c9d0ece90c63f28621e6 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 19 Mar 2021 20:22:20 +1300
Subject: [PATCH 005/674] x86/sgx: Add SGX_CHILD_PRESENT hardware error code

mainline inclusion
from mainline-5.13
commit 231d3dbdda192e3b3c7b79f4c3b0616f6c7f31b7
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit 231d3dbdda19 x86/sgx: Add SGX_CHILD_PRESENT hardware
error code.
Backport for SGX virtualization support

--------------------------------

SGX driver can accurately track how enclave pages are used.  This
enables SECS to be specifically targeted and EREMOVE'd only after all
child pages have been EREMOVE'd.  This ensures that SGX driver will
never encounter SGX_CHILD_PRESENT in normal operation.

Virtual EPC is different.  The host does not track how EPC pages are
used by the guest, so it cannot guarantee EREMOVE success.  It might,
for instance, encounter a SECS with a non-zero child count.

Add a definition of SGX_CHILD_PRESENT.  It will be used exclusively by
the SGX virtualization driver to handle recoverable EREMOVE errors when
saniziting EPC pages after they are freed.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Dave Hansen <dave.hansen@intel.com>
Acked-by: Jarkko Sakkinen <jarkko@kernel.org>
Link: https://lkml.kernel.org/r/050b198e882afde7e6eba8e6a0d4da39161dbb5a.1616136308.git.kai.huang@intel.com
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/include/asm/sgx.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h
index d96e5451d28a..14bb5f7e221c 100644
--- a/arch/x86/include/asm/sgx.h
+++ b/arch/x86/include/asm/sgx.h
@@ -31,12 +31,14 @@
  * enum sgx_return_code - The return code type for ENCLS, ENCLU and ENCLV
  * %SGX_NOT_TRACKED:		Previous ETRACK's shootdown sequence has not
  *				been completed yet.
+ * %SGX_CHILD_PRESENT		SECS has child pages present in the EPC.
  * %SGX_INVALID_EINITTOKEN:	EINITTOKEN is invalid and enclave signer's
  *				public key does not match IA32_SGXLEPUBKEYHASH.
  * %SGX_UNMASKED_EVENT:		An unmasked event, e.g. INTR, was received
  */
 enum sgx_return_code {
 	SGX_NOT_TRACKED			= 11,
+	SGX_CHILD_PRESENT		= 13,
 	SGX_INVALID_EINITTOKEN		= 16,
 	SGX_UNMASKED_EVENT		= 128,
 };
-- 
Gitee


From 4fca036fe49fc7b29c8c2fb4e67a5dc64041507c Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 19 Mar 2021 20:22:21 +1300
Subject: [PATCH 006/674] x86/sgx: Introduce virtual EPC for use by KVM guests

mainline inclusion
from mainline-5.13
commit 540745ddbc70eabdc7dbd3fcc00fe4fb17cd59ba
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit 540745ddbc70 x86/sgx: Introduce virtual EPC for use by
KVM guests.
Backport for SGX virtualization support

--------------------------------

Add a misc device /dev/sgx_vepc to allow userspace to allocate "raw"
Enclave Page Cache (EPC) without an associated enclave. The intended
and only known use case for raw EPC allocation is to expose EPC to a
KVM guest, hence the 'vepc' moniker, virt.{c,h} files and X86_SGX_KVM
Kconfig.

The SGX driver uses the misc device /dev/sgx_enclave to support
userspace in creating an enclave. Each file descriptor returned from
opening /dev/sgx_enclave represents an enclave. Unlike the SGX driver,
KVM doesn't control how the guest uses the EPC, therefore EPC allocated
to a KVM guest is not associated with an enclave, and /dev/sgx_enclave
is not suitable for allocating EPC for a KVM guest.

Having separate device nodes for the SGX driver and KVM virtual EPC also
allows separate permission control for running host SGX enclaves and KVM
SGX guests.

To use /dev/sgx_vepc to allocate a virtual EPC instance with particular
size, the hypervisor opens /dev/sgx_vepc, and uses mmap() with the
intended size to get an address range of virtual EPC. Then it may use
the address range to create one KVM memory slot as virtual EPC for
a guest.

Implement the "raw" EPC allocation in the x86 core-SGX subsystem via
/dev/sgx_vepc rather than in KVM. Doing so has two major advantages:

  - Does not require changes to KVM's uAPI, e.g. EPC gets handled as
    just another memory backend for guests.

  - EPC management is wholly contained in the SGX subsystem, e.g. SGX
    does not have to export any symbols, changes to reclaim flows don't
    need to be routed through KVM, SGX's dirty laundry doesn't have to
    get aired out for the world to see, and so on and so forth.

The virtual EPC pages allocated to guests are currently not reclaimable.
Reclaiming an EPC page used by enclave requires a special reclaim
mechanism separate from normal page reclaim, and that mechanism is not
supported for virutal EPC pages. Due to the complications of handling
reclaim conflicts between guest and host, reclaiming virtual EPC pages
is significantly more complex than basic support for SGX virtualization.

 [ bp:
   - Massage commit message and comments
   - use cpu_feature_enabled()
   - vertically align struct members init
   - massage Virtual EPC clarification text
   - move Kconfig prompt to Virtualization ]

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Co-developed-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Dave Hansen <dave.hansen@intel.com>
Acked-by: Jarkko Sakkinen <jarkko@kernel.org>
Link: https://lkml.kernel.org/r/0c38ced8c8e5a69872db4d6a1c0dabd01e07cad7.1616136308.git.kai.huang@intel.com
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 Documentation/x86/sgx.rst        |  16 ++
 arch/x86/kernel/cpu/sgx/Makefile |   1 +
 arch/x86/kernel/cpu/sgx/sgx.h    |   9 ++
 arch/x86/kernel/cpu/sgx/virt.c   | 259 +++++++++++++++++++++++++++++++
 arch/x86/kvm/Kconfig             |  12 ++
 5 files changed, 297 insertions(+)
 create mode 100644 arch/x86/kernel/cpu/sgx/virt.c

diff --git a/Documentation/x86/sgx.rst b/Documentation/x86/sgx.rst
index f90076e67cde..dd0ac96ff9ef 100644
--- a/Documentation/x86/sgx.rst
+++ b/Documentation/x86/sgx.rst
@@ -234,3 +234,19 @@ As a result, when this happpens, user should stop running any new
 SGX workloads, (or just any new workloads), and migrate all valuable
 workloads. Although a machine reboot can recover all EPC memory, the bug
 should be reported to Linux developers.
+
+
+Virtual EPC
+===========
+
+The implementation has also a virtual EPC driver to support SGX enclaves
+in guests. Unlike the SGX driver, an EPC page allocated by the virtual
+EPC driver doesn't have a specific enclave associated with it. This is
+because KVM doesn't track how a guest uses EPC pages.
+
+As a result, the SGX core page reclaimer doesn't support reclaiming EPC
+pages allocated to KVM guests through the virtual EPC driver. If the
+user wants to deploy SGX applications both on the host and in guests
+on the same machine, the user should reserve enough EPC (by taking out
+total virtual EPC size of all SGX VMs from the physical EPC size) for
+host SGX applications so they can run with acceptable performance.
diff --git a/arch/x86/kernel/cpu/sgx/Makefile b/arch/x86/kernel/cpu/sgx/Makefile
index 91d3dc784a29..9c1656779b2a 100644
--- a/arch/x86/kernel/cpu/sgx/Makefile
+++ b/arch/x86/kernel/cpu/sgx/Makefile
@@ -3,3 +3,4 @@ obj-y += \
 	encl.o \
 	ioctl.o \
 	main.o
+obj-$(CONFIG_X86_SGX_KVM)	+= virt.o
diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h
index 65b2e88dbedd..e4cbc71bf136 100644
--- a/arch/x86/kernel/cpu/sgx/sgx.h
+++ b/arch/x86/kernel/cpu/sgx/sgx.h
@@ -84,4 +84,13 @@ void sgx_mark_page_reclaimable(struct sgx_epc_page *page);
 int sgx_unmark_page_reclaimable(struct sgx_epc_page *page);
 struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim);
 
+#ifdef CONFIG_X86_SGX_KVM
+int __init sgx_vepc_init(void);
+#else
+static inline int __init sgx_vepc_init(void)
+{
+	return -ENODEV;
+}
+#endif
+
 #endif /* _X86_SGX_H */
diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c
new file mode 100644
index 000000000000..259cc46ad78c
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/virt.c
@@ -0,0 +1,259 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Device driver to expose SGX enclave memory to KVM guests.
+ *
+ * Copyright(c) 2021 Intel Corporation.
+ */
+
+#include <linux/miscdevice.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/slab.h>
+#include <linux/xarray.h>
+#include <asm/sgx.h>
+#include <uapi/asm/sgx.h>
+
+#include "encls.h"
+#include "sgx.h"
+
+struct sgx_vepc {
+	struct xarray page_array;
+	struct mutex lock;
+};
+
+/*
+ * Temporary SECS pages that cannot be EREMOVE'd due to having child in other
+ * virtual EPC instances, and the lock to protect it.
+ */
+static struct mutex zombie_secs_pages_lock;
+static struct list_head zombie_secs_pages;
+
+static int __sgx_vepc_fault(struct sgx_vepc *vepc,
+			    struct vm_area_struct *vma, unsigned long addr)
+{
+	struct sgx_epc_page *epc_page;
+	unsigned long index, pfn;
+	int ret;
+
+	WARN_ON(!mutex_is_locked(&vepc->lock));
+
+	/* Calculate index of EPC page in virtual EPC's page_array */
+	index = vma->vm_pgoff + PFN_DOWN(addr - vma->vm_start);
+
+	epc_page = xa_load(&vepc->page_array, index);
+	if (epc_page)
+		return 0;
+
+	epc_page = sgx_alloc_epc_page(vepc, false);
+	if (IS_ERR(epc_page))
+		return PTR_ERR(epc_page);
+
+	ret = xa_err(xa_store(&vepc->page_array, index, epc_page, GFP_KERNEL));
+	if (ret)
+		goto err_free;
+
+	pfn = PFN_DOWN(sgx_get_epc_phys_addr(epc_page));
+
+	ret = vmf_insert_pfn(vma, addr, pfn);
+	if (ret != VM_FAULT_NOPAGE) {
+		ret = -EFAULT;
+		goto err_delete;
+	}
+
+	return 0;
+
+err_delete:
+	xa_erase(&vepc->page_array, index);
+err_free:
+	sgx_free_epc_page(epc_page);
+	return ret;
+}
+
+static vm_fault_t sgx_vepc_fault(struct vm_fault *vmf)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	struct sgx_vepc *vepc = vma->vm_private_data;
+	int ret;
+
+	mutex_lock(&vepc->lock);
+	ret = __sgx_vepc_fault(vepc, vma, vmf->address);
+	mutex_unlock(&vepc->lock);
+
+	if (!ret)
+		return VM_FAULT_NOPAGE;
+
+	if (ret == -EBUSY && (vmf->flags & FAULT_FLAG_ALLOW_RETRY)) {
+		mmap_read_unlock(vma->vm_mm);
+		return VM_FAULT_RETRY;
+	}
+
+	return VM_FAULT_SIGBUS;
+}
+
+const struct vm_operations_struct sgx_vepc_vm_ops = {
+	.fault = sgx_vepc_fault,
+};
+
+static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct sgx_vepc *vepc = file->private_data;
+
+	if (!(vma->vm_flags & VM_SHARED))
+		return -EINVAL;
+
+	vma->vm_ops = &sgx_vepc_vm_ops;
+	/* Don't copy VMA in fork() */
+	vma->vm_flags |= VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY;
+	vma->vm_private_data = vepc;
+
+	return 0;
+}
+
+static int sgx_vepc_free_page(struct sgx_epc_page *epc_page)
+{
+	int ret;
+
+	/*
+	 * Take a previously guest-owned EPC page and return it to the
+	 * general EPC page pool.
+	 *
+	 * Guests can not be trusted to have left this page in a good
+	 * state, so run EREMOVE on the page unconditionally.  In the
+	 * case that a guest properly EREMOVE'd this page, a superfluous
+	 * EREMOVE is harmless.
+	 */
+	ret = __eremove(sgx_get_epc_virt_addr(epc_page));
+	if (ret) {
+		/*
+		 * Only SGX_CHILD_PRESENT is expected, which is because of
+		 * EREMOVE'ing an SECS still with child, in which case it can
+		 * be handled by EREMOVE'ing the SECS again after all pages in
+		 * virtual EPC have been EREMOVE'd. See comments in below in
+		 * sgx_vepc_release().
+		 *
+		 * The user of virtual EPC (KVM) needs to guarantee there's no
+		 * logical processor is still running in the enclave in guest,
+		 * otherwise EREMOVE will get SGX_ENCLAVE_ACT which cannot be
+		 * handled here.
+		 */
+		WARN_ONCE(ret != SGX_CHILD_PRESENT, EREMOVE_ERROR_MESSAGE,
+			  ret, ret);
+		return ret;
+	}
+
+	sgx_free_epc_page(epc_page);
+
+	return 0;
+}
+
+static int sgx_vepc_release(struct inode *inode, struct file *file)
+{
+	struct sgx_vepc *vepc = file->private_data;
+	struct sgx_epc_page *epc_page, *tmp, *entry;
+	unsigned long index;
+
+	LIST_HEAD(secs_pages);
+
+	xa_for_each(&vepc->page_array, index, entry) {
+		/*
+		 * Remove all normal, child pages.  sgx_vepc_free_page()
+		 * will fail if EREMOVE fails, but this is OK and expected on
+		 * SECS pages.  Those can only be EREMOVE'd *after* all their
+		 * child pages. Retries below will clean them up.
+		 */
+		if (sgx_vepc_free_page(entry))
+			continue;
+
+		xa_erase(&vepc->page_array, index);
+	}
+
+	/*
+	 * Retry EREMOVE'ing pages.  This will clean up any SECS pages that
+	 * only had children in this 'epc' area.
+	 */
+	xa_for_each(&vepc->page_array, index, entry) {
+		epc_page = entry;
+		/*
+		 * An EREMOVE failure here means that the SECS page still
+		 * has children.  But, since all children in this 'sgx_vepc'
+		 * have been removed, the SECS page must have a child on
+		 * another instance.
+		 */
+		if (sgx_vepc_free_page(epc_page))
+			list_add_tail(&epc_page->list, &secs_pages);
+
+		xa_erase(&vepc->page_array, index);
+	}
+
+	/*
+	 * SECS pages are "pinned" by child pages, and "unpinned" once all
+	 * children have been EREMOVE'd.  A child page in this instance
+	 * may have pinned an SECS page encountered in an earlier release(),
+	 * creating a zombie.  Since some children were EREMOVE'd above,
+	 * try to EREMOVE all zombies in the hopes that one was unpinned.
+	 */
+	mutex_lock(&zombie_secs_pages_lock);
+	list_for_each_entry_safe(epc_page, tmp, &zombie_secs_pages, list) {
+		/*
+		 * Speculatively remove the page from the list of zombies,
+		 * if the page is successfully EREMOVE'd it will be added to
+		 * the list of free pages.  If EREMOVE fails, throw the page
+		 * on the local list, which will be spliced on at the end.
+		 */
+		list_del(&epc_page->list);
+
+		if (sgx_vepc_free_page(epc_page))
+			list_add_tail(&epc_page->list, &secs_pages);
+	}
+
+	if (!list_empty(&secs_pages))
+		list_splice_tail(&secs_pages, &zombie_secs_pages);
+	mutex_unlock(&zombie_secs_pages_lock);
+
+	kfree(vepc);
+
+	return 0;
+}
+
+static int sgx_vepc_open(struct inode *inode, struct file *file)
+{
+	struct sgx_vepc *vepc;
+
+	vepc = kzalloc(sizeof(struct sgx_vepc), GFP_KERNEL);
+	if (!vepc)
+		return -ENOMEM;
+	mutex_init(&vepc->lock);
+	xa_init(&vepc->page_array);
+
+	file->private_data = vepc;
+
+	return 0;
+}
+
+static const struct file_operations sgx_vepc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= sgx_vepc_open,
+	.release	= sgx_vepc_release,
+	.mmap		= sgx_vepc_mmap,
+};
+
+static struct miscdevice sgx_vepc_dev = {
+	.minor		= MISC_DYNAMIC_MINOR,
+	.name		= "sgx_vepc",
+	.nodename	= "sgx_vepc",
+	.fops		= &sgx_vepc_fops,
+};
+
+int __init sgx_vepc_init(void)
+{
+	/* SGX virtualization requires KVM to work */
+	if (!cpu_feature_enabled(X86_FEATURE_VMX))
+		return -ENODEV;
+
+	INIT_LIST_HEAD(&zombie_secs_pages);
+	mutex_init(&zombie_secs_pages_lock);
+
+	return misc_register(&sgx_vepc_dev);
+}
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index f92dfd8ef10d..71fb38d83e4a 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -84,6 +84,18 @@ config KVM_INTEL
 	  To compile this as a module, choose M here: the module
 	  will be called kvm-intel.
 
+config X86_SGX_KVM
+	bool "Software Guard eXtensions (SGX) Virtualization"
+	depends on X86_SGX && KVM_INTEL
+	help
+
+	  Enables KVM guests to create SGX enclaves.
+
+	  This includes support to expose "raw" unreclaimable enclave memory to
+	  guests via a device node, e.g. /dev/sgx_vepc.
+
+	  If unsure, say N.
+
 config KVM_AMD
 	tristate "KVM for AMD processors support"
 	depends on KVM
-- 
Gitee


From ecdadaa2be1be5ff54534b2df23a83185f4c4c99 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 19 Mar 2021 20:22:58 +1300
Subject: [PATCH 007/674] x86/cpu/intel: Allow SGX virtualization without
 Launch Control support

mainline inclusion
from mainline-5.13
commit 332bfc7becf479de8a55864cc5ed0024baea28aa
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit 332bfc7becf4 x86/cpu/intel: Allow SGX virtualization
without Launch Control support.
Backport for SGX virtualization support

--------------------------------

The kernel will currently disable all SGX support if the hardware does
not support launch control.  Make it more permissive to allow SGX
virtualization on systems without Launch Control support.  This will
allow KVM to expose SGX to guests that have less-strict requirements on
the availability of flexible launch control.

Improve error message to distinguish between three cases.  There are two
cases where SGX support is completely disabled:
1) SGX has been disabled completely by the BIOS
2) SGX LC is locked by the BIOS.  Bare-metal support is disabled because
   of LC unavailability.  SGX virtualization is unavailable (because of
   Kconfig).
One where it is partially available:
3) SGX LC is locked by the BIOS.  Bare-metal support is disabled because
   of LC unavailability.  SGX virtualization is supported.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Co-developed-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Jarkko Sakkinen <jarkko@kernel.org>
Acked-by: Dave Hansen <dave.hansen@intel.com>
Link: https://lkml.kernel.org/r/b3329777076509b3b601550da288c8f3c406a865.1616136308.git.kai.huang@intel.com
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/kernel/cpu/feat_ctl.c | 59 +++++++++++++++++++++++++---------
 1 file changed, 44 insertions(+), 15 deletions(-)

diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c
index 27533a6e04fa..da696eb4821a 100644
--- a/arch/x86/kernel/cpu/feat_ctl.c
+++ b/arch/x86/kernel/cpu/feat_ctl.c
@@ -104,8 +104,9 @@ early_param("nosgx", nosgx);
 
 void init_ia32_feat_ctl(struct cpuinfo_x86 *c)
 {
+	bool enable_sgx_kvm = false, enable_sgx_driver = false;
 	bool tboot = tboot_enabled();
-	bool enable_sgx;
+	bool enable_vmx;
 	u64 msr;
 
 	if (rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr)) {
@@ -114,13 +115,19 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c)
 		return;
 	}
 
-	/*
-	 * Enable SGX if and only if the kernel supports SGX and Launch Control
-	 * is supported, i.e. disable SGX if the LE hash MSRs can't be written.
-	 */
-	enable_sgx = cpu_has(c, X86_FEATURE_SGX) &&
-		     cpu_has(c, X86_FEATURE_SGX_LC) &&
-		     IS_ENABLED(CONFIG_X86_SGX);
+	enable_vmx = cpu_has(c, X86_FEATURE_VMX) &&
+		     IS_ENABLED(CONFIG_KVM_INTEL);
+
+	if (cpu_has(c, X86_FEATURE_SGX) && IS_ENABLED(CONFIG_X86_SGX)) {
+		/*
+		 * Separate out SGX driver enabling from KVM.  This allows KVM
+		 * guests to use SGX even if the kernel SGX driver refuses to
+		 * use it.  This happens if flexible Launch Control is not
+		 * available.
+		 */
+		enable_sgx_driver = cpu_has(c, X86_FEATURE_SGX_LC);
+		enable_sgx_kvm = enable_vmx && IS_ENABLED(CONFIG_X86_SGX_KVM);
+	}
 
 	if (msr & FEAT_CTL_LOCKED)
 		goto update_caps;
@@ -136,15 +143,18 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c)
 	 * i.e. KVM is enabled, to avoid unnecessarily adding an attack vector
 	 * for the kernel, e.g. using VMX to hide malicious code.
 	 */
-	if (cpu_has(c, X86_FEATURE_VMX) && IS_ENABLED(CONFIG_KVM_INTEL)) {
+	if (enable_vmx) {
 		msr |= FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
 
 		if (tboot)
 			msr |= FEAT_CTL_VMX_ENABLED_INSIDE_SMX;
 	}
 
-	if (enable_sgx)
-		msr |= FEAT_CTL_SGX_ENABLED | FEAT_CTL_SGX_LC_ENABLED;
+	if (enable_sgx_kvm || enable_sgx_driver) {
+		msr |= FEAT_CTL_SGX_ENABLED;
+		if (enable_sgx_driver)
+			msr |= FEAT_CTL_SGX_LC_ENABLED;
+	}
 
 	wrmsrl(MSR_IA32_FEAT_CTL, msr);
 
@@ -167,10 +177,29 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c)
 	}
 
 update_sgx:
-	if (!(msr & FEAT_CTL_SGX_ENABLED) ||
-	    !(msr & FEAT_CTL_SGX_LC_ENABLED) || !enable_sgx) {
-		if (enable_sgx)
-			pr_err_once("SGX disabled by BIOS\n");
+	if (!(msr & FEAT_CTL_SGX_ENABLED)) {
+		if (enable_sgx_kvm || enable_sgx_driver)
+			pr_err_once("SGX disabled by BIOS.\n");
 		clear_cpu_cap(c, X86_FEATURE_SGX);
+		return;
+	}
+
+	/*
+	 * VMX feature bit may be cleared due to being disabled in BIOS,
+	 * in which case SGX virtualization cannot be supported either.
+	 */
+	if (!cpu_has(c, X86_FEATURE_VMX) && enable_sgx_kvm) {
+		pr_err_once("SGX virtualization disabled due to lack of VMX.\n");
+		enable_sgx_kvm = 0;
+	}
+
+	if (!(msr & FEAT_CTL_SGX_LC_ENABLED) && enable_sgx_driver) {
+		if (!enable_sgx_kvm) {
+			pr_err_once("SGX Launch Control is locked. Disable SGX.\n");
+			clear_cpu_cap(c, X86_FEATURE_SGX);
+		} else {
+			pr_err_once("SGX Launch Control is locked. Support SGX virtualization only.\n");
+			clear_cpu_cap(c, X86_FEATURE_SGX_LC);
+		}
 	}
 }
-- 
Gitee


From 90ec4287f6bfbc994b552351709f2e547b745453 Mon Sep 17 00:00:00 2001
From: Kai Huang <kai.huang@intel.com>
Date: Fri, 19 Mar 2021 20:23:02 +1300
Subject: [PATCH 008/674] x86/sgx: Initialize virtual EPC driver even when SGX
 driver is disabled

mainline inclusion
from mainline-5.13
commit faa7d3e6f3b983a28bf0f88f82dcb1c162e61105
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit faa7d3e6f3b9 x86/sgx: Initialize virtual EPC driver
even when SGX driver is disabled.
Backport for SGX virtualization support

--------------------------------

Modify sgx_init() to always try to initialize the virtual EPC driver,
even if the SGX driver is disabled.  The SGX driver might be disabled
if SGX Launch Control is in locked mode, or not supported in the
hardware at all.  This allows (non-Linux) guests that support non-LC
configurations to use SGX.

 [ bp: De-silli-fy the test. ]

Signed-off-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Sean Christopherson <seanjc@google.com>
Acked-by: Jarkko Sakkinen <jarkko@kernel.org>
Acked-by: Dave Hansen <dave.hansen@intel.com>
Link: https://lkml.kernel.org/r/d35d17a02bbf8feef83a536cec8b43746d4ea557.1616136308.git.kai.huang@intel.com
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/kernel/cpu/sgx/main.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index b227629b1e9c..1c8a228b0104 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -743,8 +743,17 @@ static int __init sgx_init(void)
 		goto err_page_cache;
 	}
 
+	/*
+	 * Always try to initialize the native *and* KVM drivers.
+	 * The KVM driver is less picky than the native one and
+	 * can function if the native one is not supported on the
+	 * current system or fails to initialize.
+	 *
+	 * Error out only if both fail to initialize.
+	 */
 	ret = sgx_drv_init();
-	if (ret)
+
+	if (sgx_vepc_init() && ret)
 		goto err_kthread;
 
 	return 0;
-- 
Gitee


From 42e4ea5524ab1d223cba74c8c8cde8367728b108 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 19 Mar 2021 20:23:04 +1300
Subject: [PATCH 009/674] x86/sgx: Move ENCLS leaf definitions to sgx.h

mainline inclusion
from mainline-5.13
commit 9c55c78a73ce6e62a1d46ba6e4f242c23c29b812
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit 9c55c78a73ce x86/sgx: Move ENCLS leaf definitions to sgx.h.
Backport for SGX virtualization support

--------------------------------

Move the ENCLS leaf definitions to sgx.h so that they can be used by
KVM.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Jarkko Sakkinen <jarkko@kernel.org>
Acked-by: Dave Hansen <dave.hansen@intel.com>
Link: https://lkml.kernel.org/r/2e6cd7c5c1ced620cfcd292c3c6c382827fde6b2.1616136308.git.kai.huang@intel.com
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/include/asm/sgx.h      | 15 +++++++++++++++
 arch/x86/kernel/cpu/sgx/encls.h | 15 ---------------
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h
index 14bb5f7e221c..34f44238d1d1 100644
--- a/arch/x86/include/asm/sgx.h
+++ b/arch/x86/include/asm/sgx.h
@@ -27,6 +27,21 @@
 /* The bitmask for the EPC section type. */
 #define SGX_CPUID_EPC_MASK	GENMASK(3, 0)
 
+enum sgx_encls_function {
+	ECREATE	= 0x00,
+	EADD	= 0x01,
+	EINIT	= 0x02,
+	EREMOVE	= 0x03,
+	EDGBRD	= 0x04,
+	EDGBWR	= 0x05,
+	EEXTEND	= 0x06,
+	ELDU	= 0x08,
+	EBLOCK	= 0x09,
+	EPA	= 0x0A,
+	EWB	= 0x0B,
+	ETRACK	= 0x0C,
+};
+
 /**
  * enum sgx_return_code - The return code type for ENCLS, ENCLU and ENCLV
  * %SGX_NOT_TRACKED:		Previous ETRACK's shootdown sequence has not
diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h
index 443188fe7e70..be5c49689980 100644
--- a/arch/x86/kernel/cpu/sgx/encls.h
+++ b/arch/x86/kernel/cpu/sgx/encls.h
@@ -11,21 +11,6 @@
 #include <asm/traps.h>
 #include "sgx.h"
 
-enum sgx_encls_function {
-	ECREATE	= 0x00,
-	EADD	= 0x01,
-	EINIT	= 0x02,
-	EREMOVE	= 0x03,
-	EDGBRD	= 0x04,
-	EDGBWR	= 0x05,
-	EEXTEND	= 0x06,
-	ELDU	= 0x08,
-	EBLOCK	= 0x09,
-	EPA	= 0x0A,
-	EWB	= 0x0B,
-	ETRACK	= 0x0C,
-};
-
 /**
  * ENCLS_FAULT_FLAG - flag signifying an ENCLS return code is a trapnr
  *
-- 
Gitee


From f3a767a7cb95fd64fe8c5727ffce40d38c43bd93 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 19 Mar 2021 20:23:05 +1300
Subject: [PATCH 010/674] x86/sgx: Add SGX2 ENCLS leaf definitions (EAUG,
 EMODPR and EMODT)

mainline inclusion
from mainline-5.13
commit 32ddda8e445df3de477db14d386fb3518042224a
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit 32ddda8e445d x86/sgx: Add SGX2 ENCLS leaf definitions
(EAUG, EMODPR and EMODT).
Backport for SGX virtualization support

--------------------------------

Define the ENCLS leafs that are available with SGX2, also referred to as
Enclave Dynamic Memory Management (EDMM).  The leafs will be used by KVM
to conditionally expose SGX2 capabilities to guests.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Jarkko Sakkinen <jarkko@kernel.org>
Acked-by: Dave Hansen <dave.hansen@intel.com>
Link: https://lkml.kernel.org/r/5f0970c251ebcc6d5add132f0d750cc753b7060f.1616136308.git.kai.huang@intel.com
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/include/asm/sgx.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h
index 34f44238d1d1..3b025afec0a7 100644
--- a/arch/x86/include/asm/sgx.h
+++ b/arch/x86/include/asm/sgx.h
@@ -40,6 +40,9 @@ enum sgx_encls_function {
 	EPA	= 0x0A,
 	EWB	= 0x0B,
 	ETRACK	= 0x0C,
+	EAUG	= 0x0D,
+	EMODPR	= 0x0E,
+	EMODT	= 0x0F,
 };
 
 /**
-- 
Gitee


From 42413c8aded5b5c845dc7a975e6dcf26437f03d9 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 19 Mar 2021 20:23:06 +1300
Subject: [PATCH 011/674] x86/sgx: Add encls_faulted() helper

mainline inclusion
from mainline-5.13
commit a67136b458e5e63822b19c35794451122fe2bf3e
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit a67136b458e5 x86/sgx: Add encls_faulted() helper.
Backport for SGX virtualization support

--------------------------------

Add a helper to extract the fault indicator from an encoded ENCLS return
value.  SGX virtualization will also need to detect ENCLS faults.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Jarkko Sakkinen <jarkko@kernel.org>
Acked-by: Dave Hansen <dave.hansen@intel.com>
Link: https://lkml.kernel.org/r/c1f955898110de2f669da536fc6cf62e003dff88.1616136308.git.kai.huang@intel.com
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/kernel/cpu/sgx/encls.h | 15 ++++++++++++++-
 arch/x86/kernel/cpu/sgx/ioctl.c |  2 +-
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h
index be5c49689980..9b204843b78d 100644
--- a/arch/x86/kernel/cpu/sgx/encls.h
+++ b/arch/x86/kernel/cpu/sgx/encls.h
@@ -40,6 +40,19 @@
 	} while (0);							  \
 }
 
+/*
+ * encls_faulted() - Check if an ENCLS leaf faulted given an error code
+ * @ret:	the return value of an ENCLS leaf function call
+ *
+ * Return:
+ * - true:	ENCLS leaf faulted.
+ * - false:	Otherwise.
+ */
+static inline bool encls_faulted(int ret)
+{
+	return ret & ENCLS_FAULT_FLAG;
+}
+
 /**
  * encls_failed() - Check if an ENCLS function failed
  * @ret:	the return value of an ENCLS function call
@@ -50,7 +63,7 @@
  */
 static inline bool encls_failed(int ret)
 {
-	if (ret & ENCLS_FAULT_FLAG)
+	if (encls_faulted(ret))
 		return ENCLS_TRAPNR(ret) != X86_TRAP_PF;
 
 	return !!ret;
diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c
index 354e309fcdb7..11e3f9635c24 100644
--- a/arch/x86/kernel/cpu/sgx/ioctl.c
+++ b/arch/x86/kernel/cpu/sgx/ioctl.c
@@ -568,7 +568,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct,
 		}
 	}
 
-	if (ret & ENCLS_FAULT_FLAG) {
+	if (encls_faulted(ret)) {
 		if (encls_failed(ret))
 			ENCLS_WARN(ret, "EINIT");
 
-- 
Gitee


From 74ecbf21604bc194537a72ddc330c8cc2ec8614d Mon Sep 17 00:00:00 2001
From: Kai Huang <kai.huang@intel.com>
Date: Fri, 19 Mar 2021 20:23:07 +1300
Subject: [PATCH 012/674] x86/sgx: Add helper to update SGX_LEPUBKEYHASHn MSRs

mainline inclusion
from mainline-5.13
commit 73916b6a0c714258f9c2619408a66c6696a761a7
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit 73916b6a0c71 x86/sgx: Add helper to update
SGX_LEPUBKEYHASHn MSRs.
Backport for SGX virtualization support

--------------------------------

Add a helper to update SGX_LEPUBKEYHASHn MSRs.  SGX virtualization also
needs to update those MSRs based on guest's "virtual" SGX_LEPUBKEYHASHn
before EINIT from guest.

Signed-off-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Dave Hansen <dave.hansen@intel.com>
Acked-by: Jarkko Sakkinen <jarkko@kernel.org>
Link: https://lkml.kernel.org/r/dfb7cd39d4dd62ea27703b64afdd8bccb579f623.1616136308.git.kai.huang@intel.com
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/kernel/cpu/sgx/ioctl.c |  5 ++---
 arch/x86/kernel/cpu/sgx/main.c  | 16 ++++++++++++++++
 arch/x86/kernel/cpu/sgx/sgx.h   |  2 ++
 3 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c
index 11e3f9635c24..7be9c064a640 100644
--- a/arch/x86/kernel/cpu/sgx/ioctl.c
+++ b/arch/x86/kernel/cpu/sgx/ioctl.c
@@ -495,7 +495,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct,
 			 void *token)
 {
 	u64 mrsigner[4];
-	int i, j, k;
+	int i, j;
 	void *addr;
 	int ret;
 
@@ -544,8 +544,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct,
 
 			preempt_disable();
 
-			for (k = 0; k < 4; k++)
-				wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + k, mrsigner[k]);
+			sgx_update_lepubkeyhash(mrsigner);
 
 			ret = __einit(sigstruct, token, addr);
 
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 1c8a228b0104..227f1e2ad9cf 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -727,6 +727,22 @@ static bool __init sgx_page_cache_init(void)
 	return true;
 }
 
+/*
+ * Update the SGX_LEPUBKEYHASH MSRs to the values specified by caller.
+ * Bare-metal driver requires to update them to hash of enclave's signer
+ * before EINIT. KVM needs to update them to guest's virtual MSR values
+ * before doing EINIT from guest.
+ */
+void sgx_update_lepubkeyhash(u64 *lepubkeyhash)
+{
+	int i;
+
+	WARN_ON_ONCE(preemptible());
+
+	for (i = 0; i < 4; i++)
+		wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]);
+}
+
 static int __init sgx_init(void)
 {
 	int ret;
diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h
index e4cbc71bf136..4628acec0009 100644
--- a/arch/x86/kernel/cpu/sgx/sgx.h
+++ b/arch/x86/kernel/cpu/sgx/sgx.h
@@ -93,4 +93,6 @@ static inline int __init sgx_vepc_init(void)
 }
 #endif
 
+void sgx_update_lepubkeyhash(u64 *lepubkeyhash);
+
 #endif /* _X86_SGX_H */
-- 
Gitee


From 0d53491c285d1a148bb030a61592383e7bc8cb92 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 19 Mar 2021 20:23:08 +1300
Subject: [PATCH 013/674] x86/sgx: Add helpers to expose ECREATE and EINIT to
 KVM

mainline inclusion
from mainline-5.13
commit d155030b1e7c0e448aab22a803f7a71ea2e117d7
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit d155030b1e7c x86/sgx: Add helpers to expose ECREATE and
EINIT to KVM.
Backport for SGX virtualization support

--------------------------------

The host kernel must intercept ECREATE to impose policies on guests, and
intercept EINIT to be able to write guest's virtual SGX_LEPUBKEYHASH MSR
values to hardware before running guest's EINIT so it can run correctly
according to hardware behavior.

Provide wrappers around __ecreate() and __einit() to hide the ugliness
of overloading the ENCLS return value to encode multiple error formats
in a single int.  KVM will trap-and-execute ECREATE and EINIT as part
of SGX virtualization, and reflect ENCLS execution result to guest by
setting up guest's GPRs, or on an exception, injecting the correct fault
based on return value of __ecreate() and __einit().

Use host userspace addresses (provided by KVM based on guest physical
address of ENCLS parameters) to execute ENCLS/EINIT when possible.
Accesses to both EPC and memory originating from ENCLS are subject to
segmentation and paging mechanisms.  It's also possible to generate
kernel mappings for ENCLS parameters by resolving PFN but using
__uaccess_xx() is simpler.

 [ bp: Return early if the __user memory accesses fail, use
   cpu_feature_enabled(). ]

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Jarkko Sakkinen <jarkko@kernel.org>
Link: https://lkml.kernel.org/r/20e09daf559aa5e9e680a0b4b5fba940f1bad86e.1616136308.git.kai.huang@intel.com
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/include/asm/sgx.h     |   7 ++
 arch/x86/kernel/cpu/sgx/virt.c | 117 +++++++++++++++++++++++++++++++++
 2 files changed, 124 insertions(+)

diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h
index 3b025afec0a7..954042e04102 100644
--- a/arch/x86/include/asm/sgx.h
+++ b/arch/x86/include/asm/sgx.h
@@ -365,4 +365,11 @@ struct sgx_sigstruct {
  * comment!
  */
 
+#ifdef CONFIG_X86_SGX_KVM
+int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs,
+		     int *trapnr);
+int sgx_virt_einit(void __user *sigstruct, void __user *token,
+		   void __user *secs, u64 *lepubkeyhash, int *trapnr);
+#endif
+
 #endif /* _ASM_X86_SGX_H */
diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c
index 259cc46ad78c..7d221eac716a 100644
--- a/arch/x86/kernel/cpu/sgx/virt.c
+++ b/arch/x86/kernel/cpu/sgx/virt.c
@@ -257,3 +257,120 @@ int __init sgx_vepc_init(void)
 
 	return misc_register(&sgx_vepc_dev);
 }
+
+/**
+ * sgx_virt_ecreate() - Run ECREATE on behalf of guest
+ * @pageinfo:	Pointer to PAGEINFO structure
+ * @secs:	Userspace pointer to SECS page
+ * @trapnr:	trap number injected to guest in case of ECREATE error
+ *
+ * Run ECREATE on behalf of guest after KVM traps ECREATE for the purpose
+ * of enforcing policies of guest's enclaves, and return the trap number
+ * which should be injected to guest in case of any ECREATE error.
+ *
+ * Return:
+ * -  0:	ECREATE was successful.
+ * - <0:	on error.
+ */
+int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs,
+		     int *trapnr)
+{
+	int ret;
+
+	/*
+	 * @secs is an untrusted, userspace-provided address.  It comes from
+	 * KVM and is assumed to be a valid pointer which points somewhere in
+	 * userspace.  This can fault and call SGX or other fault handlers when
+	 * userspace mapping @secs doesn't exist.
+	 *
+	 * Add a WARN() to make sure @secs is already valid userspace pointer
+	 * from caller (KVM), who should already have handled invalid pointer
+	 * case (for instance, made by malicious guest).  All other checks,
+	 * such as alignment of @secs, are deferred to ENCLS itself.
+	 */
+	if (WARN_ON_ONCE(!access_ok(secs, PAGE_SIZE)))
+		return -EINVAL;
+
+	__uaccess_begin();
+	ret = __ecreate(pageinfo, (void *)secs);
+	__uaccess_end();
+
+	if (encls_faulted(ret)) {
+		*trapnr = ENCLS_TRAPNR(ret);
+		return -EFAULT;
+	}
+
+	/* ECREATE doesn't return an error code, it faults or succeeds. */
+	WARN_ON_ONCE(ret);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(sgx_virt_ecreate);
+
+static int __sgx_virt_einit(void __user *sigstruct, void __user *token,
+			    void __user *secs)
+{
+	int ret;
+
+	/*
+	 * Make sure all userspace pointers from caller (KVM) are valid.
+	 * All other checks deferred to ENCLS itself.  Also see comment
+	 * for @secs in sgx_virt_ecreate().
+	 */
+#define SGX_EINITTOKEN_SIZE	304
+	if (WARN_ON_ONCE(!access_ok(sigstruct, sizeof(struct sgx_sigstruct)) ||
+			 !access_ok(token, SGX_EINITTOKEN_SIZE) ||
+			 !access_ok(secs, PAGE_SIZE)))
+		return -EINVAL;
+
+	__uaccess_begin();
+	ret = __einit((void *)sigstruct, (void *)token, (void *)secs);
+	__uaccess_end();
+
+	return ret;
+}
+
+/**
+ * sgx_virt_einit() - Run EINIT on behalf of guest
+ * @sigstruct:		Userspace pointer to SIGSTRUCT structure
+ * @token:		Userspace pointer to EINITTOKEN structure
+ * @secs:		Userspace pointer to SECS page
+ * @lepubkeyhash:	Pointer to guest's *virtual* SGX_LEPUBKEYHASH MSR values
+ * @trapnr:		trap number injected to guest in case of EINIT error
+ *
+ * Run EINIT on behalf of guest after KVM traps EINIT. If SGX_LC is available
+ * in host, SGX driver may rewrite the hardware values at wish, therefore KVM
+ * needs to update hardware values to guest's virtual MSR values in order to
+ * ensure EINIT is executed with expected hardware values.
+ *
+ * Return:
+ * -  0:	EINIT was successful.
+ * - <0:	on error.
+ */
+int sgx_virt_einit(void __user *sigstruct, void __user *token,
+		   void __user *secs, u64 *lepubkeyhash, int *trapnr)
+{
+	int ret;
+
+	if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) {
+		ret = __sgx_virt_einit(sigstruct, token, secs);
+	} else {
+		preempt_disable();
+
+		sgx_update_lepubkeyhash(lepubkeyhash);
+
+		ret = __sgx_virt_einit(sigstruct, token, secs);
+		preempt_enable();
+	}
+
+	/* Propagate up the error from the WARN_ON_ONCE in __sgx_virt_einit() */
+	if (ret == -EINVAL)
+		return ret;
+
+	if (encls_faulted(ret)) {
+		*trapnr = ENCLS_TRAPNR(ret);
+		return -EFAULT;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(sgx_virt_einit);
-- 
Gitee


From 6825a358d3e30c17aa88ed64ad06811a60aa9e3d Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 19 Mar 2021 20:23:09 +1300
Subject: [PATCH 014/674] x86/sgx: Move provisioning device creation out of SGX
 driver

mainline inclusion
from mainline-5.13
commit b3754e5d3da320af2bebb7a690002685c7f5c15c
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit b3754e5d3da3 x86/sgx: Move provisioning device creation
out of SGX driver.
Backport for SGX virtualization support

--------------------------------

And extract sgx_set_attribute() out of sgx_ioc_enclave_provision() and
export it as symbol for KVM to use.

The provisioning key is sensitive. The SGX driver only allows to create
an enclave which can access the provisioning key when the enclave
creator has permission to open /dev/sgx_provision. It should apply to
a VM as well, as the provisioning key is platform-specific, thus an
unrestricted VM can also potentially compromise the provisioning key.

Move the provisioning device creation out of sgx_drv_init() to
sgx_init() as a preparation for adding SGX virtualization support,
so that even if the SGX driver is not enabled due to flexible launch
control not being available, SGX virtualization can still be enabled,
and use it to restrict a VM's capability of being able to access the
provisioning key.

 [ bp: Massage commit message. ]

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Acked-by: Dave Hansen <dave.hansen@intel.com>
Link: https://lkml.kernel.org/r/0f4d044d621561f26d5f4ef73e8dc6cd18cc7e79.1616136308.git.kai.huang@intel.com
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/include/asm/sgx.h       |  3 ++
 arch/x86/kernel/cpu/sgx/driver.c | 17 ----------
 arch/x86/kernel/cpu/sgx/ioctl.c  | 16 ++-------
 arch/x86/kernel/cpu/sgx/main.c   | 57 +++++++++++++++++++++++++++++++-
 4 files changed, 61 insertions(+), 32 deletions(-)

diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h
index 954042e04102..a16e2c9154a3 100644
--- a/arch/x86/include/asm/sgx.h
+++ b/arch/x86/include/asm/sgx.h
@@ -372,4 +372,7 @@ int sgx_virt_einit(void __user *sigstruct, void __user *token,
 		   void __user *secs, u64 *lepubkeyhash, int *trapnr);
 #endif
 
+int sgx_set_attribute(unsigned long *allowed_attributes,
+		      unsigned int attribute_fd);
+
 #endif /* _ASM_X86_SGX_H */
diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c
index 8ce6d8371cfb..aa9b8b868867 100644
--- a/arch/x86/kernel/cpu/sgx/driver.c
+++ b/arch/x86/kernel/cpu/sgx/driver.c
@@ -136,10 +136,6 @@ static const struct file_operations sgx_encl_fops = {
 	.get_unmapped_area	= sgx_get_unmapped_area,
 };
 
-const struct file_operations sgx_provision_fops = {
-	.owner			= THIS_MODULE,
-};
-
 static struct miscdevice sgx_dev_enclave = {
 	.minor = MISC_DYNAMIC_MINOR,
 	.name = "sgx_enclave",
@@ -147,13 +143,6 @@ static struct miscdevice sgx_dev_enclave = {
 	.fops = &sgx_encl_fops,
 };
 
-static struct miscdevice sgx_dev_provision = {
-	.minor = MISC_DYNAMIC_MINOR,
-	.name = "sgx_provision",
-	.nodename = "sgx_provision",
-	.fops = &sgx_provision_fops,
-};
-
 int __init sgx_drv_init(void)
 {
 	unsigned int eax, ebx, ecx, edx;
@@ -187,11 +176,5 @@ int __init sgx_drv_init(void)
 	if (ret)
 		return ret;
 
-	ret = misc_register(&sgx_dev_provision);
-	if (ret) {
-		misc_deregister(&sgx_dev_enclave);
-		return ret;
-	}
-
 	return 0;
 }
diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c
index 7be9c064a640..83df20e3e633 100644
--- a/arch/x86/kernel/cpu/sgx/ioctl.c
+++ b/arch/x86/kernel/cpu/sgx/ioctl.c
@@ -2,6 +2,7 @@
 /*  Copyright(c) 2016-20 Intel Corporation. */
 
 #include <asm/mman.h>
+#include <asm/sgx.h>
 #include <linux/mman.h>
 #include <linux/delay.h>
 #include <linux/file.h>
@@ -666,24 +667,11 @@ static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg)
 static long sgx_ioc_enclave_provision(struct sgx_encl *encl, void __user *arg)
 {
 	struct sgx_enclave_provision params;
-	struct file *file;
 
 	if (copy_from_user(&params, arg, sizeof(params)))
 		return -EFAULT;
 
-	file = fget(params.fd);
-	if (!file)
-		return -EINVAL;
-
-	if (file->f_op != &sgx_provision_fops) {
-		fput(file);
-		return -EINVAL;
-	}
-
-	encl->attributes_mask |= SGX_ATTR_PROVISIONKEY;
-
-	fput(file);
-	return 0;
+	return sgx_set_attribute(&encl->attributes_mask, params.fd);
 }
 
 long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 227f1e2ad9cf..92cb11dffd4c 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -1,14 +1,17 @@
 // SPDX-License-Identifier: GPL-2.0
 /*  Copyright(c) 2016-20 Intel Corporation. */
 
+#include <linux/file.h>
 #include <linux/freezer.h>
 #include <linux/highmem.h>
 #include <linux/kthread.h>
+#include <linux/miscdevice.h>
 #include <linux/pagemap.h>
 #include <linux/ratelimit.h>
 #include <linux/sched/mm.h>
 #include <linux/sched/signal.h>
 #include <linux/slab.h>
+#include <asm/sgx.h>
 #include "driver.h"
 #include "encl.h"
 #include "encls.h"
@@ -743,6 +746,51 @@ void sgx_update_lepubkeyhash(u64 *lepubkeyhash)
 		wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]);
 }
 
+const struct file_operations sgx_provision_fops = {
+	.owner			= THIS_MODULE,
+};
+
+static struct miscdevice sgx_dev_provision = {
+	.minor = MISC_DYNAMIC_MINOR,
+	.name = "sgx_provision",
+	.nodename = "sgx_provision",
+	.fops = &sgx_provision_fops,
+};
+
+/**
+ * sgx_set_attribute() - Update allowed attributes given file descriptor
+ * @allowed_attributes:		Pointer to allowed enclave attributes
+ * @attribute_fd:		File descriptor for specific attribute
+ *
+ * Append enclave attribute indicated by file descriptor to allowed
+ * attributes. Currently only SGX_ATTR_PROVISIONKEY indicated by
+ * /dev/sgx_provision is supported.
+ *
+ * Return:
+ * -0:		SGX_ATTR_PROVISIONKEY is appended to allowed_attributes
+ * -EINVAL:	Invalid, or not supported file descriptor
+ */
+int sgx_set_attribute(unsigned long *allowed_attributes,
+		      unsigned int attribute_fd)
+{
+	struct file *file;
+
+	file = fget(attribute_fd);
+	if (!file)
+		return -EINVAL;
+
+	if (file->f_op != &sgx_provision_fops) {
+		fput(file);
+		return -EINVAL;
+	}
+
+	*allowed_attributes |= SGX_ATTR_PROVISIONKEY;
+
+	fput(file);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(sgx_set_attribute);
+
 static int __init sgx_init(void)
 {
 	int ret;
@@ -759,6 +807,10 @@ static int __init sgx_init(void)
 		goto err_page_cache;
 	}
 
+	ret = misc_register(&sgx_dev_provision);
+	if (ret)
+		goto err_kthread;
+
 	/*
 	 * Always try to initialize the native *and* KVM drivers.
 	 * The KVM driver is less picky than the native one and
@@ -770,10 +822,13 @@ static int __init sgx_init(void)
 	ret = sgx_drv_init();
 
 	if (sgx_vepc_init() && ret)
-		goto err_kthread;
+		goto err_provision;
 
 	return 0;
 
+err_provision:
+	misc_deregister(&sgx_dev_provision);
+
 err_kthread:
 	kthread_stop(ksgxd_tsk);
 
-- 
Gitee


From d0cb049e9ddf3baa8e86ddde2321c1a489965b19 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Mon, 12 Apr 2021 16:21:33 +1200
Subject: [PATCH 015/674] KVM: x86: Export kvm_mmu_gva_to_gpa_{read,write}()
 for SGX (VMX)

mainline inclusion
from mainline-5.13
commit 54f958cdaa8c43c0e9b9ef850ae613a6e1bda44e
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit 54f958cdaa8c KVM: x86: Export
kvm_mmu_gva_to_gpa_{read,write}() for SGX (VMX).
Backport for SGX virtualization support

--------------------------------

Export the gva_to_gpa() helpers for use by SGX virtualization when
executing ENCLS[ECREATE] and ENCLS[EINIT] on behalf of the guest.
To execute ECREATE and EINIT, KVM must obtain the GPA of the target
Secure Enclave Control Structure (SECS) in order to get its
corresponding HVA.

Because the SECS must reside in the Enclave Page Cache (EPC), copying
the SECS's data to a host-controlled buffer via existing exported
helpers is not a viable option as the EPC is not readable or writable
by the kernel.

SGX virtualization will also use gva_to_gpa() to obtain HVAs for
non-EPC pages in order to pass user pointers directly to ECREATE and
EINIT, which avoids having to copy pages worth of data into the kernel.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Acked-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Kai Huang <kai.huang@intel.com>
Message-Id: <02f37708321bcdfaa2f9d41c8478affa6e84b04d.1618196135.git.kai.huang@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/kvm/x86.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1f857bc5ac6e..4595c7dbfa93 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6005,6 +6005,7 @@ gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
 	u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
 	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
 }
+EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_read);
 
  gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
 				struct x86_exception *exception)
@@ -6021,6 +6022,7 @@ gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
 	access |= PFERR_WRITE_MASK;
 	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
 }
+EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_write);
 
 /* uses this to access any guest's mapped memory without checking CPL */
 gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
-- 
Gitee


From a10fe21a2e72d80e26fb880169c49fea5316a9c9 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Mon, 12 Apr 2021 16:21:34 +1200
Subject: [PATCH 016/674] KVM: x86: Define new #PF SGX error code bit

mainline inclusion
from mainline-5.13
commit 00e7646c3563d2f1a46a8fa1824c32373d77a8be
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit 00e7646c3563 KVM: x86: Define new #PF SGX error code bit.
Backport for SGX virtualization support

--------------------------------

Page faults that are signaled by the SGX Enclave Page Cache Map (EPCM),
as opposed to the traditional IA32/EPT page tables, set an SGX bit in
the error code to indicate that the #PF was induced by SGX.  KVM will
need to emulate this behavior as part of its trap-and-execute scheme for
virtualizing SGX Launch Control, e.g. to inject SGX-induced #PFs if
EINIT faults in the host, and to support live migration.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Kai Huang <kai.huang@intel.com>
Message-Id: <e170c5175cb9f35f53218a7512c9e3db972b97a2.1618196135.git.kai.huang@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/include/asm/kvm_host.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a3bf158f5b12..80556cef8c48 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -216,6 +216,7 @@ enum x86_intercept_stage;
 #define PFERR_RSVD_BIT 3
 #define PFERR_FETCH_BIT 4
 #define PFERR_PK_BIT 5
+#define PFERR_SGX_BIT 15
 #define PFERR_GUEST_FINAL_BIT 32
 #define PFERR_GUEST_PAGE_BIT 33
 
@@ -225,6 +226,7 @@ enum x86_intercept_stage;
 #define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT)
 #define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT)
 #define PFERR_PK_MASK (1U << PFERR_PK_BIT)
+#define PFERR_SGX_MASK (1U << PFERR_SGX_BIT)
 #define PFERR_GUEST_FINAL_MASK (1ULL << PFERR_GUEST_FINAL_BIT)
 #define PFERR_GUEST_PAGE_MASK (1ULL << PFERR_GUEST_PAGE_BIT)
 
-- 
Gitee


From 09afd8d6d01f3a8f5f9e84180796ee07ba93f5c2 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Mon, 12 Apr 2021 16:21:35 +1200
Subject: [PATCH 017/674] KVM: x86: Add support for reverse CPUID lookup of
 scattered features

mainline inclusion
from mainline-5.13
commit 4e66c0cb79b732b01b82e094b21b8e22a20dff83
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit 4e66c0cb79b7 KVM: x86: Add support for reverse CPUID
lookup of scattered features.
Backport for SGX virtualization support

--------------------------------

Introduce a scheme that allows KVM's CPUID magic to support features
that are scattered in the kernel's feature words.  To advertise and/or
query guest support for CPUID-based features, KVM requires the bit
number of an X86_FEATURE_* to match the bit number in its associated
CPUID entry.  For scattered features, this does not hold true.

Add a framework to allow defining KVM-only words, stored in
kvm_cpu_caps after the shared kernel caps, that can be used to gather
the scattered feature bits by translating X86_FEATURE_* flags into their
KVM-defined feature.

Note, because reverse_cpuid_check() effectively forces kvm_cpu_caps
lookups to be resolved at compile time, there is no runtime cost for
translating from kernel-defined to kvm-defined features.

More details here:  https://lkml.kernel.org/r/X/jxCOLG+HUO4QlZ@google.com

Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Kai Huang <kai.huang@intel.com>
Message-Id: <16cad8d00475f67867fb36701fc7fb7c1ec86ce1.1618196135.git.kai.huang@intel.com>
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/kvm/cpuid.c | 32 +++++++++++++++++++++++++++-----
 arch/x86/kvm/cpuid.h | 39 ++++++++++++++++++++++++++++++++++-----
 2 files changed, 61 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 41b0dc37720e..c5fa5c62e391 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -28,7 +28,7 @@
  * Unlike "struct cpuinfo_x86.x86_capability", kvm_cpu_caps doesn't need to be
  * aligned to sizeof(unsigned long) because it's not accessed via bitops.
  */
-u32 kvm_cpu_caps[NCAPINTS] __read_mostly;
+u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly;
 EXPORT_SYMBOL_GPL(kvm_cpu_caps);
 
 static u32 xstate_required_size(u64 xstate_bv, bool compacted)
@@ -53,6 +53,7 @@ static u32 xstate_required_size(u64 xstate_bv, bool compacted)
 }
 
 #define F feature_bit
+#define SF(name) (boot_cpu_has(X86_FEATURE_##name) ? F(name) : 0)
 
 static inline struct kvm_cpuid_entry2 *cpuid_entry2_find(
 	struct kvm_cpuid_entry2 *entries, int nent, u32 function, u32 index)
@@ -330,13 +331,13 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
 	return r;
 }
 
-static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask)
+/* Mask kvm_cpu_caps for @leaf with the raw CPUID capabilities of this CPU. */
+static __always_inline void __kvm_cpu_cap_mask(enum cpuid_leafs leaf)
 {
 	const struct cpuid_reg cpuid = x86_feature_cpuid(leaf * 32);
 	struct kvm_cpuid_entry2 entry;
 
 	reverse_cpuid_check(leaf);
-	kvm_cpu_caps[leaf] &= mask;
 
 	cpuid_count(cpuid.function, cpuid.index,
 		    &entry.eax, &entry.ebx, &entry.ecx, &entry.edx);
@@ -344,6 +345,26 @@ static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask)
 	kvm_cpu_caps[leaf] &= *__cpuid_entry_get_reg(&entry, cpuid.reg);
 }
 
+static __always_inline void kvm_cpu_cap_init_scattered(enum cpuid_leafs leaf, u32 mask)
+{
+	/* Use kvm_cpu_cap_mask for non-scattered leafs. */
+	BUILD_BUG_ON(leaf < NCAPINTS);
+
+	kvm_cpu_caps[leaf] = mask;
+
+	__kvm_cpu_cap_mask(leaf);
+}
+
+static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask)
+{
+	/* Use kvm_cpu_cap_init_scattered for scattered leafs. */
+	BUILD_BUG_ON(leaf >= NCAPINTS);
+
+	kvm_cpu_caps[leaf] &= mask;
+
+	__kvm_cpu_cap_mask(leaf);
+}
+
 void kvm_set_cpu_caps(void)
 {
 	unsigned int f_nx = is_efer_nx() ? F(NX) : 0;
@@ -354,12 +375,13 @@ void kvm_set_cpu_caps(void)
 	unsigned int f_gbpages = 0;
 	unsigned int f_lm = 0;
 #endif
+	memset(kvm_cpu_caps, 0, sizeof(kvm_cpu_caps));
 
-	BUILD_BUG_ON(sizeof(kvm_cpu_caps) >
+	BUILD_BUG_ON(sizeof(kvm_cpu_caps) - (NKVMCAPINTS * sizeof(*kvm_cpu_caps)) >
 		     sizeof(boot_cpu_data.x86_capability));
 
 	memcpy(&kvm_cpu_caps, &boot_cpu_data.x86_capability,
-	       sizeof(kvm_cpu_caps));
+	       sizeof(kvm_cpu_caps) - (NKVMCAPINTS * sizeof(*kvm_cpu_caps)));
 
 	kvm_cpu_cap_mask(CPUID_1_ECX,
 		/*
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index dc921d76e42e..2041e2f07347 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -7,7 +7,20 @@
 #include <asm/processor.h>
 #include <uapi/asm/kvm_para.h>
 
-extern u32 kvm_cpu_caps[NCAPINTS] __read_mostly;
+/*
+ * Hardware-defined CPUID leafs that are scattered in the kernel, but need to
+ * be directly used by KVM.  Note, these word values conflict with the kernel's
+ * "bug" caps, but KVM doesn't use those.
+ */
+enum kvm_only_cpuid_leafs {
+	NR_KVM_CPU_CAPS = NCAPINTS,
+
+	NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS,
+};
+
+#define X86_KVM_FEATURE(w, f)		((w)*32 + (f))
+
+extern u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly;
 void kvm_set_cpu_caps(void);
 
 void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu);
@@ -83,6 +96,20 @@ static __always_inline void reverse_cpuid_check(unsigned int x86_leaf)
 	BUILD_BUG_ON(reverse_cpuid[x86_leaf].function == 0);
 }
 
+/*
+ * Translate feature bits that are scattered in the kernel's cpufeatures word
+ * into KVM feature words that align with hardware's definitions.
+ */
+static __always_inline u32 __feature_translate(int x86_feature)
+{
+	return x86_feature;
+}
+
+static __always_inline u32 __feature_leaf(int x86_feature)
+{
+	return __feature_translate(x86_feature) / 32;
+}
+
 /*
  * Retrieve the bit mask from an X86_FEATURE_* definition.  Features contain
  * the hardware defined bit number (stored in bits 4:0) and a software defined
@@ -91,6 +118,8 @@ static __always_inline void reverse_cpuid_check(unsigned int x86_leaf)
  */
 static __always_inline u32 __feature_bit(int x86_feature)
 {
+	x86_feature = __feature_translate(x86_feature);
+
 	reverse_cpuid_check(x86_feature / 32);
 	return 1 << (x86_feature & 31);
 }
@@ -99,7 +128,7 @@ static __always_inline u32 __feature_bit(int x86_feature)
 
 static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned int x86_feature)
 {
-	unsigned int x86_leaf = x86_feature / 32;
+	unsigned int x86_leaf = __feature_leaf(x86_feature);
 
 	reverse_cpuid_check(x86_leaf);
 	return reverse_cpuid[x86_leaf];
@@ -291,7 +320,7 @@ static inline bool cpuid_fault_enabled(struct kvm_vcpu *vcpu)
 
 static __always_inline void kvm_cpu_cap_clear(unsigned int x86_feature)
 {
-	unsigned int x86_leaf = x86_feature / 32;
+	unsigned int x86_leaf = __feature_leaf(x86_feature);
 
 	reverse_cpuid_check(x86_leaf);
 	kvm_cpu_caps[x86_leaf] &= ~__feature_bit(x86_feature);
@@ -299,7 +328,7 @@ static __always_inline void kvm_cpu_cap_clear(unsigned int x86_feature)
 
 static __always_inline void kvm_cpu_cap_set(unsigned int x86_feature)
 {
-	unsigned int x86_leaf = x86_feature / 32;
+	unsigned int x86_leaf = __feature_leaf(x86_feature);
 
 	reverse_cpuid_check(x86_leaf);
 	kvm_cpu_caps[x86_leaf] |= __feature_bit(x86_feature);
@@ -307,7 +336,7 @@ static __always_inline void kvm_cpu_cap_set(unsigned int x86_feature)
 
 static __always_inline u32 kvm_cpu_cap_get(unsigned int x86_feature)
 {
-	unsigned int x86_leaf = x86_feature / 32;
+	unsigned int x86_leaf = __feature_leaf(x86_feature);
 
 	reverse_cpuid_check(x86_leaf);
 	return kvm_cpu_caps[x86_leaf] & __feature_bit(x86_feature);
-- 
Gitee


From c66f4510fb74765e574864d5144f149a4b059d02 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Mon, 12 Apr 2021 16:21:36 +1200
Subject: [PATCH 018/674] KVM: x86: Add reverse-CPUID lookup support for
 scattered SGX features

mainline inclusion
from mainline-5.13
commit 01de8682b32d3ed4f0136f7379e1e3ae2e563308
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit 01de8682b32d KVM: x86: Add reverse-CPUID lookup support
for scattered SGX features.
Backport for SGX virtualization support

--------------------------------

Define a new KVM-only feature word for advertising and querying SGX
sub-features in CPUID.0x12.0x0.EAX.  Because SGX1 and SGX2 are scattered
in the kernel's feature word, they need to be translated so that the
bit numbers match those of hardware.

Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Kai Huang <kai.huang@intel.com>
Message-Id: <e797c533f4c71ae89265bbb15a02aef86b67cbec.1618196135.git.kai.huang@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/kvm/cpuid.h | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 2041e2f07347..fbef5b730805 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -13,12 +13,17 @@
  * "bug" caps, but KVM doesn't use those.
  */
 enum kvm_only_cpuid_leafs {
-	NR_KVM_CPU_CAPS = NCAPINTS,
+	CPUID_12_EAX	 = NCAPINTS,
+	NR_KVM_CPU_CAPS,
 
 	NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS,
 };
 
-#define X86_KVM_FEATURE(w, f)		((w)*32 + (f))
+#define KVM_X86_FEATURE(w, f)		((w)*32 + (f))
+
+/* Intel-defined SGX sub-features, CPUID level 0x12 (EAX). */
+#define KVM_X86_FEATURE_SGX1		KVM_X86_FEATURE(CPUID_12_EAX, 0)
+#define KVM_X86_FEATURE_SGX2		KVM_X86_FEATURE(CPUID_12_EAX, 1)
 
 extern u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly;
 void kvm_set_cpu_caps(void);
@@ -76,6 +81,7 @@ static const struct cpuid_reg reverse_cpuid[] = {
 	[CPUID_8000_0007_EBX] = {0x80000007, 0, CPUID_EBX},
 	[CPUID_7_EDX]         = {         7, 0, CPUID_EDX},
 	[CPUID_7_1_EAX]       = {         7, 1, CPUID_EAX},
+	[CPUID_12_EAX]        = {0x00000012, 0, CPUID_EAX},
 };
 
 /*
@@ -102,6 +108,11 @@ static __always_inline void reverse_cpuid_check(unsigned int x86_leaf)
  */
 static __always_inline u32 __feature_translate(int x86_feature)
 {
+	if (x86_feature == X86_FEATURE_SGX1)
+		return KVM_X86_FEATURE_SGX1;
+	else if (x86_feature == X86_FEATURE_SGX2)
+		return KVM_X86_FEATURE_SGX2;
+
 	return x86_feature;
 }
 
-- 
Gitee


From 08f41fc48faf19e8926e2d8fed0ae31967c63ed0 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Mon, 12 Apr 2021 16:21:37 +1200
Subject: [PATCH 019/674] KVM: VMX: Add basic handling of VM-Exit from SGX
 enclave

mainline inclusion
from mainline-5.13
commit 3c0c2ad1ae75963c05bf89ec91918c6a53a72696
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit 3c0c2ad1ae75 KVM: VMX: Add basic handling of VM-Exit
from SGX enclave
Backport for SGX virtualization support

--------------------------------

Add support for handling VM-Exits that originate from a guest SGX
enclave.  In SGX, an "enclave" is a new CPL3-only execution environment,
wherein the CPU and memory state is protected by hardware to make the
state inaccesible to code running outside of the enclave.  When exiting
an enclave due to an asynchronous event (from the perspective of the
enclave), e.g. exceptions, interrupts, and VM-Exits, the enclave's state
is automatically saved and scrubbed (the CPU loads synthetic state), and
then reloaded when re-entering the enclave.  E.g. after an instruction
based VM-Exit from an enclave, vmcs.GUEST_RIP will not contain the RIP
of the enclave instruction that trigered VM-Exit, but will instead point
to a RIP in the enclave's untrusted runtime (the guest userspace code
that coordinates entry/exit to/from the enclave).

To help a VMM recognize and handle exits from enclaves, SGX adds bits to
existing VMCS fields, VM_EXIT_REASON.VMX_EXIT_REASON_FROM_ENCLAVE and
GUEST_INTERRUPTIBILITY_INFO.GUEST_INTR_STATE_ENCLAVE_INTR.  Define the
new architectural bits, and add a boolean to struct vcpu_vmx to cache
VMX_EXIT_REASON_FROM_ENCLAVE.  Clear the bit in exit_reason so that
checks against exit_reason do not need to account for SGX, e.g.
"if (exit_reason == EXIT_REASON_EXCEPTION_NMI)" continues to work.

KVM is a largely a passive observer of the new bits, e.g. KVM needs to
account for the bits when propagating information to a nested VMM, but
otherwise doesn't need to act differently for the majority of VM-Exits
from enclaves.

The one scenario that is directly impacted is emulation, which is for
all intents and purposes impossible[1] since KVM does not have access to
the RIP or instruction stream that triggered the VM-Exit.  The inability
to emulate is a non-issue for KVM, as most instructions that might
trigger VM-Exit unconditionally #UD in an enclave (before the VM-Exit
check.  For the few instruction that conditionally #UD, KVM either never
sets the exiting control, e.g. PAUSE_EXITING[2], or sets it if and only
if the feature is not exposed to the guest in order to inject a #UD,
e.g. RDRAND_EXITING.

But, because it is still possible for a guest to trigger emulation,
e.g. MMIO, inject a #UD if KVM ever attempts emulation after a VM-Exit
from an enclave.  This is architecturally accurate for instruction
VM-Exits, and for MMIO it's the least bad choice, e.g. it's preferable
to killing the VM.  In practice, only broken or particularly stupid
guests should ever encounter this behavior.

Add a WARN in skip_emulated_instruction to detect any attempt to
modify the guest's RIP during an SGX enclave VM-Exit as all such flows
should either be unreachable or must handle exits from enclaves before
getting to skip_emulated_instruction.

[1] Impossible for all practical purposes.  Not truly impossible
    since KVM could implement some form of para-virtualization scheme.

[2] PAUSE_LOOP_EXITING only affects CPL0 and enclaves exist only at
    CPL3, so we also don't need to worry about that interaction.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Kai Huang <kai.huang@intel.com>
Message-Id: <315f54a8507d09c292463ef29104e1d4c62e9090.1618196135.git.kai.huang@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/include/asm/vmx.h      |  1 +
 arch/x86/include/uapi/asm/vmx.h |  1 +
 arch/x86/kvm/vmx/nested.c       |  2 ++
 arch/x86/kvm/vmx/vmx.c          | 45 +++++++++++++++++++++++++++++++--
 4 files changed, 47 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index f8ba5289ecb0..c6f028bac3ff 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -371,6 +371,7 @@ enum vmcs_field {
 #define GUEST_INTR_STATE_MOV_SS		0x00000002
 #define GUEST_INTR_STATE_SMI		0x00000004
 #define GUEST_INTR_STATE_NMI		0x00000008
+#define GUEST_INTR_STATE_ENCLAVE_INTR	0x00000010
 
 /* GUEST_ACTIVITY_STATE flags */
 #define GUEST_ACTIVITY_ACTIVE		0
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index b8ff9e8ac0d5..df6707a76a3d 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -27,6 +27,7 @@
 
 
 #define VMX_EXIT_REASONS_FAILED_VMENTRY         0x80000000
+#define VMX_EXIT_REASONS_SGX_ENCLAVE_MODE	0x08000000
 
 #define EXIT_REASON_EXCEPTION_NMI       0
 #define EXIT_REASON_EXTERNAL_INTERRUPT  1
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 0c2389d0fdaf..0f871faeff1a 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -4142,6 +4142,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 {
 	/* update exit information fields: */
 	vmcs12->vm_exit_reason = vm_exit_reason;
+	if (to_vmx(vcpu)->exit_reason.enclave_mode)
+		vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE;
 	vmcs12->exit_qualification = exit_qualification;
 	vmcs12->vm_exit_intr_info = exit_intr_info;
 
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 79889d27aa5b..e6f8c3f52ffa 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1630,12 +1630,25 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
 
 static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len)
 {
+	/*
+	 * Emulation of instructions in SGX enclaves is impossible as RIP does
+	 * not point  tthe failing instruction, and even if it did, the code
+	 * stream is inaccessible.  Inject #UD instead of exiting to userspace
+	 * so that guest userspace can't DoS the guest simply by triggering
+	 * emulation (enclaves are CPL3 only).
+	 */
+	if (to_vmx(vcpu)->exit_reason.enclave_mode) {
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return false;
+	}
 	return true;
 }
 
 static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
 {
+	union vmx_exit_reason exit_reason = to_vmx(vcpu)->exit_reason;
 	unsigned long rip, orig_rip;
+	u32 instr_len;
 
 	/*
 	 * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on
@@ -1646,9 +1659,33 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
 	 * i.e. we end up advancing IP with some random value.
 	 */
 	if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
-	    to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) {
+	    exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) {
+		instr_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+
+		/*
+		 * Emulating an enclave's instructions isn't supported as KVM
+		 * cannot access the enclave's memory or its true RIP, e.g. the
+		 * vmcs.GUEST_RIP points at the exit point of the enclave, not
+		 * the RIP that actually triggered the VM-Exit.  But, because
+		 * most instructions that cause VM-Exit will #UD in an enclave,
+		 * most instruction-based VM-Exits simply do not occur.
+		 *
+		 * There are a few exceptions, notably the debug instructions
+		 * INT1ICEBRK and INT3, as they are allowed in debug enclaves
+		 * and generate #DB/#BP as expected, which KVM might intercept.
+		 * But again, the CPU does the dirty work and saves an instr
+		 * length of zero so VMMs don't shoot themselves in the foot.
+		 * WARN if KVM tries to skip a non-zero length instruction on
+		 * a VM-Exit from an enclave.
+		 */
+		if (!instr_len)
+			goto rip_updated;
+
+		WARN(exit_reason.enclave_mode,
+		     "KVM: skipping instruction after SGX enclave VM-Exit");
+
 		orig_rip = kvm_rip_read(vcpu);
-		rip = orig_rip + vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+		rip = orig_rip + instr_len;
 #ifdef CONFIG_X86_64
 		/*
 		 * We need to mask out the high 32 bits of RIP if not in 64-bit
@@ -1664,6 +1701,7 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
 			return 0;
 	}
 
+rip_updated:
 	/* skipping an emulated instruction also counts */
 	vmx_set_interrupt_shadow(vcpu, 0);
 
@@ -5508,6 +5546,9 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
 {
 	gpa_t gpa;
 
+	if (!vmx_can_emulate_instruction(vcpu, NULL, 0))
+		return 1;
+
 	/*
 	 * A nested guest cannot optimize MMIO vmexits, because we have an
 	 * nGPA here instead of the required GPA.
-- 
Gitee


From e4e22234d060efeaae35860cf411bdf68ba6bfe7 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Mon, 12 Apr 2021 16:21:38 +1200
Subject: [PATCH 020/674] KVM: VMX: Frame in ENCLS handler for SGX
 virtualization

mainline inclusion
from mainline-5.13
commit 9798adbc04cf1b14325dc7e2c882639693516a69
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit 9798adbc04cf KVM: VMX: Frame in ENCLS handler for
SGX virtualization.
Backport for SGX virtualization support

--------------------------------

Introduce sgx.c and sgx.h, along with the framework for handling ENCLS
VM-Exits.  Add a bool, enable_sgx, that will eventually be wired up to a
module param to control whether or not SGX virtualization is enabled at
runtime.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Kai Huang <kai.huang@intel.com>
Message-Id: <1c782269608b2f5e1034be450f375a8432fb705d.1618196135.git.kai.huang@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/kvm/Makefile  |  2 ++
 arch/x86/kvm/vmx/sgx.c | 50 ++++++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/vmx/sgx.h | 15 +++++++++++++
 arch/x86/kvm/vmx/vmx.c |  9 +++++---
 4 files changed, 73 insertions(+), 3 deletions(-)
 create mode 100644 arch/x86/kvm/vmx/sgx.c
 create mode 100644 arch/x86/kvm/vmx/sgx.h

diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index b804444e16d4..1c6c9adcb730 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -20,6 +20,8 @@ kvm-y			+= x86.o emulate.o i8259.o irq.o lapic.o \
 
 kvm-intel-y		+= vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \
 			   vmx/evmcs.o vmx/nested.o vmx/posted_intr.o
+kvm-intel-$(CONFIG_X86_SGX_KVM)	+= vmx/sgx.o
+
 kvm-amd-y		+= svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o svm/sev.o
 
 obj-$(CONFIG_KVM)	+= kvm.o
diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c
new file mode 100644
index 000000000000..d874eb180b7d
--- /dev/null
+++ b/arch/x86/kvm/vmx/sgx.c
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0
+/*  Copyright(c) 2021 Intel Corporation. */
+
+#include <asm/sgx.h>
+
+#include "cpuid.h"
+#include "kvm_cache_regs.h"
+#include "sgx.h"
+#include "vmx.h"
+#include "x86.h"
+
+bool __read_mostly enable_sgx;
+
+static inline bool encls_leaf_enabled_in_guest(struct kvm_vcpu *vcpu, u32 leaf)
+{
+	if (!enable_sgx || !guest_cpuid_has(vcpu, X86_FEATURE_SGX))
+		return false;
+
+	if (leaf >= ECREATE && leaf <= ETRACK)
+		return guest_cpuid_has(vcpu, X86_FEATURE_SGX1);
+
+	if (leaf >= EAUG && leaf <= EMODT)
+		return guest_cpuid_has(vcpu, X86_FEATURE_SGX2);
+
+	return false;
+}
+
+static inline bool sgx_enabled_in_guest_bios(struct kvm_vcpu *vcpu)
+{
+	const u64 bits = FEAT_CTL_SGX_ENABLED | FEAT_CTL_LOCKED;
+
+	return (to_vmx(vcpu)->msr_ia32_feature_control & bits) == bits;
+}
+
+int handle_encls(struct kvm_vcpu *vcpu)
+{
+	u32 leaf = (u32)kvm_rax_read(vcpu);
+
+	if (!encls_leaf_enabled_in_guest(vcpu, leaf)) {
+		kvm_queue_exception(vcpu, UD_VECTOR);
+	} else if (!sgx_enabled_in_guest_bios(vcpu)) {
+		kvm_inject_gp(vcpu, 0);
+	} else {
+		WARN(1, "KVM: unexpected exit on ENCLS[%u]", leaf);
+		vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
+		vcpu->run->hw.hardware_exit_reason = EXIT_REASON_ENCLS;
+		return 0;
+	}
+	return 1;
+}
diff --git a/arch/x86/kvm/vmx/sgx.h b/arch/x86/kvm/vmx/sgx.h
new file mode 100644
index 000000000000..6e17ecd4aca3
--- /dev/null
+++ b/arch/x86/kvm/vmx/sgx.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_SGX_H
+#define __KVM_X86_SGX_H
+
+#include <linux/kvm_host.h>
+
+#ifdef CONFIG_X86_SGX_KVM
+extern bool __read_mostly enable_sgx;
+
+int handle_encls(struct kvm_vcpu *vcpu);
+#else
+#define enable_sgx 0
+#endif
+
+#endif /* __KVM_X86_SGX_H */
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index e6f8c3f52ffa..578dc24b6195 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -58,6 +58,7 @@
 #include "mmu.h"
 #include "nested.h"
 #include "pmu.h"
+#include "sgx.h"
 #include "trace.h"
 #include "vmcs.h"
 #include "vmcs12.h"
@@ -5800,16 +5801,18 @@ static int handle_vmx_instruction(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+#ifndef CONFIG_X86_SGX_KVM
 static int handle_encls(struct kvm_vcpu *vcpu)
 {
 	/*
-	 * SGX virtualization is not yet supported.  There is no software
-	 * enable bit for SGX, so we have to trap ENCLS and inject a #UD
-	 * to prevent the guest from executing ENCLS.
+	 * SGX virtualization is disabled.  There is no software enable bit for
+	 * SGX, so KVM intercepts all ENCLS leafs and injects a #UD to prevent
+	 * the guest from executing ENCLS (when SGX is supported by hardware).
 	 */
 	kvm_queue_exception(vcpu, UD_VECTOR);
 	return 1;
 }
+#endif /* CONFIG_X86_SGX_KVM */
 
 /*
  * The exit handlers return 1 if the exit was handled fully and guest execution
-- 
Gitee


From 6ad6cc5bd16c19b3d0680fb0a95a8e2282dfd825 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Mon, 12 Apr 2021 16:21:39 +1200
Subject: [PATCH 021/674] KVM: VMX: Add SGX ENCLS[ECREATE] handler to enforce
 CPUID restrictions

mainline inclusion
from mainline-5.13
commit 70210c044b4ea8f05e93ec62abc30cab4233ec88
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit 70210c044b4e KVM: VMX: Add SGX ENCLS[ECREATE] handler
to enforce CPUID restrictions.
Backport for SGX virtualization support

--------------------------------

Add an ECREATE handler that will be used to intercept ECREATE for the
purpose of enforcing and enclave's MISCSELECT, ATTRIBUTES and XFRM, i.e.
to allow userspace to restrict SGX features via CPUID.  ECREATE will be
intercepted when any of the aforementioned masks diverges from hardware
in order to enforce the desired CPUID model, i.e. inject #GP if the
guest attempts to set a bit that hasn't been enumerated as allowed-1 in
CPUID.

Note, access to the PROVISIONKEY is not yet supported.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Co-developed-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Kai Huang <kai.huang@intel.com>
Message-Id: <c3a97684f1b71b4f4626a1fc3879472a95651725.1618196135.git.kai.huang@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/include/asm/kvm_host.h |   3 +
 arch/x86/kvm/vmx/sgx.c          | 275 ++++++++++++++++++++++++++++++++
 2 files changed, 278 insertions(+)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 80556cef8c48..559063fd8907 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -994,6 +994,9 @@ struct kvm_arch {
 
 	bool bus_lock_detection_enabled;
 
+	/* Guest can access the SGX PROVISIONKEY. */
+	bool sgx_provisioning_allowed;
+
 	/* Deflect RDMSR and WRMSR to user space when they trigger a #GP */
 	u32 user_space_msr_mask;
 
diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c
index d874eb180b7d..2cf9d7cf818c 100644
--- a/arch/x86/kvm/vmx/sgx.c
+++ b/arch/x86/kvm/vmx/sgx.c
@@ -11,6 +11,279 @@
 
 bool __read_mostly enable_sgx;
 
+/*
+ * ENCLS's memory operands use a fixed segment (DS) and a fixed
+ * address size based on the mode.  Related prefixes are ignored.
+ */
+static int sgx_get_encls_gva(struct kvm_vcpu *vcpu, unsigned long offset,
+			     int size, int alignment, gva_t *gva)
+{
+	struct kvm_segment s;
+	bool fault;
+
+	/* Skip vmcs.GUEST_DS retrieval for 64-bit mode to avoid VMREADs. */
+	*gva = offset;
+	if (!is_long_mode(vcpu)) {
+		vmx_get_segment(vcpu, &s, VCPU_SREG_DS);
+		*gva += s.base;
+	}
+
+	if (!IS_ALIGNED(*gva, alignment)) {
+		fault = true;
+	} else if (likely(is_long_mode(vcpu))) {
+		fault = is_noncanonical_address(*gva, vcpu);
+	} else {
+		*gva &= 0xffffffff;
+		fault = (s.unusable) ||
+			(s.type != 2 && s.type != 3) ||
+			(*gva > s.limit) ||
+			((s.base != 0 || s.limit != 0xffffffff) &&
+			(((u64)*gva + size - 1) > s.limit + 1));
+	}
+	if (fault)
+		kvm_inject_gp(vcpu, 0);
+	return fault ? -EINVAL : 0;
+}
+
+static void sgx_handle_emulation_failure(struct kvm_vcpu *vcpu, u64 addr,
+					 unsigned int size)
+{
+	vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+	vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+	vcpu->run->internal.ndata = 2;
+	vcpu->run->internal.data[0] = addr;
+	vcpu->run->internal.data[1] = size;
+}
+
+static int sgx_read_hva(struct kvm_vcpu *vcpu, unsigned long hva, void *data,
+			unsigned int size)
+{
+	if (__copy_from_user(data, (void __user *)hva, size)) {
+		sgx_handle_emulation_failure(vcpu, hva, size);
+		return -EFAULT;
+	}
+
+	return 0;
+}
+
+static int sgx_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t gva, bool write,
+			  gpa_t *gpa)
+{
+	struct x86_exception ex;
+
+	if (write)
+		*gpa = kvm_mmu_gva_to_gpa_write(vcpu, gva, &ex);
+	else
+		*gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, &ex);
+
+	if (*gpa == UNMAPPED_GVA) {
+		kvm_inject_emulated_page_fault(vcpu, &ex);
+		return -EFAULT;
+	}
+
+	return 0;
+}
+
+static int sgx_gpa_to_hva(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned long *hva)
+{
+	*hva = kvm_vcpu_gfn_to_hva(vcpu, PFN_DOWN(gpa));
+	if (kvm_is_error_hva(*hva)) {
+		sgx_handle_emulation_failure(vcpu, gpa, 1);
+		return -EFAULT;
+	}
+
+	*hva |= gpa & ~PAGE_MASK;
+
+	return 0;
+}
+
+static int sgx_inject_fault(struct kvm_vcpu *vcpu, gva_t gva, int trapnr)
+{
+	struct x86_exception ex;
+
+	/*
+	 * A non-EPCM #PF indicates a bad userspace HVA.  This *should* check
+	 * for PFEC.SGX and not assume any #PF on SGX2 originated in the EPC,
+	 * but the error code isn't (yet) plumbed through the ENCLS helpers.
+	 */
+	if (trapnr == PF_VECTOR && !boot_cpu_has(X86_FEATURE_SGX2)) {
+		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+		vcpu->run->internal.ndata = 0;
+		return 0;
+	}
+
+	/*
+	 * If the guest thinks it's running on SGX2 hardware, inject an SGX
+	 * #PF if the fault matches an EPCM fault signature (#GP on SGX1,
+	 * #PF on SGX2).  The assumption is that EPCM faults are much more
+	 * likely than a bad userspace address.
+	 */
+	if ((trapnr == PF_VECTOR || !boot_cpu_has(X86_FEATURE_SGX2)) &&
+	    guest_cpuid_has(vcpu, X86_FEATURE_SGX2)) {
+		memset(&ex, 0, sizeof(ex));
+		ex.vector = PF_VECTOR;
+		ex.error_code = PFERR_PRESENT_MASK | PFERR_WRITE_MASK |
+				PFERR_SGX_MASK;
+		ex.address = gva;
+		ex.error_code_valid = true;
+		ex.nested_page_fault = false;
+		kvm_inject_page_fault(vcpu, &ex);
+	} else {
+		kvm_inject_gp(vcpu, 0);
+	}
+	return 1;
+}
+
+static int __handle_encls_ecreate(struct kvm_vcpu *vcpu,
+				  struct sgx_pageinfo *pageinfo,
+				  unsigned long secs_hva,
+				  gva_t secs_gva)
+{
+	struct sgx_secs *contents = (struct sgx_secs *)pageinfo->contents;
+	struct kvm_cpuid_entry2 *sgx_12_0, *sgx_12_1;
+	u64 attributes, xfrm, size;
+	u32 miscselect;
+	u8 max_size_log2;
+	int trapnr, ret;
+
+	sgx_12_0 = kvm_find_cpuid_entry(vcpu, 0x12, 0);
+	sgx_12_1 = kvm_find_cpuid_entry(vcpu, 0x12, 1);
+	if (!sgx_12_0 || !sgx_12_1) {
+		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+		vcpu->run->internal.ndata = 0;
+		return 0;
+	}
+
+	miscselect = contents->miscselect;
+	attributes = contents->attributes;
+	xfrm = contents->xfrm;
+	size = contents->size;
+
+	/* Enforce restriction of access to the PROVISIONKEY. */
+	if (!vcpu->kvm->arch.sgx_provisioning_allowed &&
+	    (attributes & SGX_ATTR_PROVISIONKEY)) {
+		if (sgx_12_1->eax & SGX_ATTR_PROVISIONKEY)
+			pr_warn_once("KVM: SGX PROVISIONKEY advertised but not allowed\n");
+		kvm_inject_gp(vcpu, 0);
+		return 1;
+	}
+
+	/* Enforce CPUID restrictions on MISCSELECT, ATTRIBUTES and XFRM. */
+	if ((u32)miscselect & ~sgx_12_0->ebx ||
+	    (u32)attributes & ~sgx_12_1->eax ||
+	    (u32)(attributes >> 32) & ~sgx_12_1->ebx ||
+	    (u32)xfrm & ~sgx_12_1->ecx ||
+	    (u32)(xfrm >> 32) & ~sgx_12_1->edx) {
+		kvm_inject_gp(vcpu, 0);
+		return 1;
+	}
+
+	/* Enforce CPUID restriction on max enclave size. */
+	max_size_log2 = (attributes & SGX_ATTR_MODE64BIT) ? sgx_12_0->edx >> 8 :
+							    sgx_12_0->edx;
+	if (size >= BIT_ULL(max_size_log2))
+		kvm_inject_gp(vcpu, 0);
+
+	/*
+	 * sgx_virt_ecreate() returns:
+	 *  1) 0:	ECREATE was successful
+	 *  2) -EFAULT:	ECREATE was run but faulted, and trapnr was set to the
+	 *		exception number.
+	 *  3) -EINVAL:	access_ok() on @secs_hva failed. This should never
+	 *		happen as KVM checks host addresses at memslot creation.
+	 *		sgx_virt_ecreate() has already warned in this case.
+	 */
+	ret = sgx_virt_ecreate(pageinfo, (void __user *)secs_hva, &trapnr);
+	if (!ret)
+		return kvm_skip_emulated_instruction(vcpu);
+	if (ret == -EFAULT)
+		return sgx_inject_fault(vcpu, secs_gva, trapnr);
+
+	return ret;
+}
+
+static int handle_encls_ecreate(struct kvm_vcpu *vcpu)
+{
+	gva_t pageinfo_gva, secs_gva;
+	gva_t metadata_gva, contents_gva;
+	gpa_t metadata_gpa, contents_gpa, secs_gpa;
+	unsigned long metadata_hva, contents_hva, secs_hva;
+	struct sgx_pageinfo pageinfo;
+	struct sgx_secs *contents;
+	struct x86_exception ex;
+	int r;
+
+	if (sgx_get_encls_gva(vcpu, kvm_rbx_read(vcpu), 32, 32, &pageinfo_gva) ||
+	    sgx_get_encls_gva(vcpu, kvm_rcx_read(vcpu), 4096, 4096, &secs_gva))
+		return 1;
+
+	/*
+	 * Copy the PAGEINFO to local memory, its pointers need to be
+	 * translated, i.e. we need to do a deep copy/translate.
+	 */
+	r = kvm_read_guest_virt(vcpu, pageinfo_gva, &pageinfo,
+				sizeof(pageinfo), &ex);
+	if (r == X86EMUL_PROPAGATE_FAULT) {
+		kvm_inject_emulated_page_fault(vcpu, &ex);
+		return 1;
+	} else if (r != X86EMUL_CONTINUE) {
+		sgx_handle_emulation_failure(vcpu, pageinfo_gva,
+					     sizeof(pageinfo));
+		return 0;
+	}
+
+	if (sgx_get_encls_gva(vcpu, pageinfo.metadata, 64, 64, &metadata_gva) ||
+	    sgx_get_encls_gva(vcpu, pageinfo.contents, 4096, 4096,
+			      &contents_gva))
+		return 1;
+
+	/*
+	 * Translate the SECINFO, SOURCE and SECS pointers from GVA to GPA.
+	 * Resume the guest on failure to inject a #PF.
+	 */
+	if (sgx_gva_to_gpa(vcpu, metadata_gva, false, &metadata_gpa) ||
+	    sgx_gva_to_gpa(vcpu, contents_gva, false, &contents_gpa) ||
+	    sgx_gva_to_gpa(vcpu, secs_gva, true, &secs_gpa))
+		return 1;
+
+	/*
+	 * ...and then to HVA.  The order of accesses isn't architectural, i.e.
+	 * KVM doesn't have to fully process one address at a time.  Exit to
+	 * userspace if a GPA is invalid.
+	 */
+	if (sgx_gpa_to_hva(vcpu, metadata_gpa, &metadata_hva) ||
+	    sgx_gpa_to_hva(vcpu, contents_gpa, &contents_hva) ||
+	    sgx_gpa_to_hva(vcpu, secs_gpa, &secs_hva))
+		return 0;
+
+	/*
+	 * Copy contents into kernel memory to prevent TOCTOU attack. E.g. the
+	 * guest could do ECREATE w/ SECS.SGX_ATTR_PROVISIONKEY=0, and
+	 * simultaneously set SGX_ATTR_PROVISIONKEY to bypass the check to
+	 * enforce restriction of access to the PROVISIONKEY.
+	 */
+	contents = (struct sgx_secs *)__get_free_page(GFP_KERNEL_ACCOUNT);
+	if (!contents)
+		return -ENOMEM;
+
+	/* Exit to userspace if copying from a host userspace address fails. */
+	if (sgx_read_hva(vcpu, contents_hva, (void *)contents, PAGE_SIZE)) {
+		free_page((unsigned long)contents);
+		return 0;
+	}
+
+	pageinfo.metadata = metadata_hva;
+	pageinfo.contents = (u64)contents;
+
+	r = __handle_encls_ecreate(vcpu, &pageinfo, secs_hva, secs_gva);
+
+	free_page((unsigned long)contents);
+
+	return r;
+}
+
 static inline bool encls_leaf_enabled_in_guest(struct kvm_vcpu *vcpu, u32 leaf)
 {
 	if (!enable_sgx || !guest_cpuid_has(vcpu, X86_FEATURE_SGX))
@@ -41,6 +314,8 @@ int handle_encls(struct kvm_vcpu *vcpu)
 	} else if (!sgx_enabled_in_guest_bios(vcpu)) {
 		kvm_inject_gp(vcpu, 0);
 	} else {
+		if (leaf == ECREATE)
+			return handle_encls_ecreate(vcpu);
 		WARN(1, "KVM: unexpected exit on ENCLS[%u]", leaf);
 		vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
 		vcpu->run->hw.hardware_exit_reason = EXIT_REASON_ENCLS;
-- 
Gitee


From 1766b14ea2c20f12432e24e2cf3fc6f403d9445c Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Mon, 12 Apr 2021 16:21:40 +1200
Subject: [PATCH 022/674] KVM: VMX: Add emulation of SGX Launch Control LE hash
 MSRs

mainline inclusion
from mainline-5.13
commit 8f102445d4045384799627c53d82c45ca2cad3a5
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit 8f102445d404 KVM: VMX: Add emulation of SGX Launch
Control LE hash MSRs.
Backport for SGX virtualization support

--------------------------------

Emulate the four Launch Enclave public key hash MSRs (LE hash MSRs) that
exist on CPUs that support SGX Launch Control (LC).  SGX LC modifies the
behavior of ENCLS[EINIT] to use the LE hash MSRs when verifying the key
used to sign an enclave.  On CPUs without LC support, the LE hash is
hardwired into the CPU to an Intel controlled key (the Intel key is also
the reset value of the LE hash MSRs). Track the guest's desired hash so
that a future patch can stuff the hash into the hardware MSRs when
executing EINIT on behalf of the guest, when those MSRs are writable in
host.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Co-developed-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Kai Huang <kai.huang@intel.com>
Message-Id: <c58ef601ddf88f3a113add837969533099b1364a.1618196135.git.kai.huang@intel.com>
[Add a comment regarding the MSRs being available until SGX is locked.
 - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/kvm/vmx/sgx.c | 35 +++++++++++++++++++++++++++++++++++
 arch/x86/kvm/vmx/sgx.h |  6 ++++++
 arch/x86/kvm/vmx/vmx.c | 31 +++++++++++++++++++++++++++++++
 arch/x86/kvm/vmx/vmx.h |  3 +++
 4 files changed, 75 insertions(+)

diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c
index 2cf9d7cf818c..d53e6b17b320 100644
--- a/arch/x86/kvm/vmx/sgx.c
+++ b/arch/x86/kvm/vmx/sgx.c
@@ -11,6 +11,9 @@
 
 bool __read_mostly enable_sgx;
 
+/* Initial value of guest's virtual SGX_LEPUBKEYHASHn MSRs */
+static u64 sgx_pubkey_hash[4] __ro_after_init;
+
 /*
  * ENCLS's memory operands use a fixed segment (DS) and a fixed
  * address size based on the mode.  Related prefixes are ignored.
@@ -323,3 +326,35 @@ int handle_encls(struct kvm_vcpu *vcpu)
 	}
 	return 1;
 }
+
+void setup_default_sgx_lepubkeyhash(void)
+{
+	/*
+	 * Use Intel's default value for Skylake hardware if Launch Control is
+	 * not supported, i.e. Intel's hash is hardcoded into silicon, or if
+	 * Launch Control is supported and enabled, i.e. mimic the reset value
+	 * and let the guest write the MSRs at will.  If Launch Control is
+	 * supported but disabled, then use the current MSR values as the hash
+	 * MSRs exist but are read-only (locked and not writable).
+	 */
+	if (!enable_sgx || boot_cpu_has(X86_FEATURE_SGX_LC) ||
+	    rdmsrl_safe(MSR_IA32_SGXLEPUBKEYHASH0, &sgx_pubkey_hash[0])) {
+		sgx_pubkey_hash[0] = 0xa6053e051270b7acULL;
+		sgx_pubkey_hash[1] = 0x6cfbe8ba8b3b413dULL;
+		sgx_pubkey_hash[2] = 0xc4916d99f2b3735dULL;
+		sgx_pubkey_hash[3] = 0xd4f8c05909f9bb3bULL;
+	} else {
+		/* MSR_IA32_SGXLEPUBKEYHASH0 is read above */
+		rdmsrl(MSR_IA32_SGXLEPUBKEYHASH1, sgx_pubkey_hash[1]);
+		rdmsrl(MSR_IA32_SGXLEPUBKEYHASH2, sgx_pubkey_hash[2]);
+		rdmsrl(MSR_IA32_SGXLEPUBKEYHASH3, sgx_pubkey_hash[3]);
+	}
+}
+
+void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	memcpy(vmx->msr_ia32_sgxlepubkeyhash, sgx_pubkey_hash,
+	       sizeof(sgx_pubkey_hash));
+}
diff --git a/arch/x86/kvm/vmx/sgx.h b/arch/x86/kvm/vmx/sgx.h
index 6e17ecd4aca3..6502fa52c7e9 100644
--- a/arch/x86/kvm/vmx/sgx.h
+++ b/arch/x86/kvm/vmx/sgx.h
@@ -8,8 +8,14 @@
 extern bool __read_mostly enable_sgx;
 
 int handle_encls(struct kvm_vcpu *vcpu);
+
+void setup_default_sgx_lepubkeyhash(void);
+void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu);
 #else
 #define enable_sgx 0
+
+static inline void setup_default_sgx_lepubkeyhash(void) { }
+static inline void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu) { }
 #endif
 
 #endif /* __KVM_X86_SGX_H */
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 578dc24b6195..df1cdca49fd2 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1964,6 +1964,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	case MSR_IA32_FEAT_CTL:
 		msr_info->data = vmx->msr_ia32_feature_control;
 		break;
+	case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
+		if (!msr_info->host_initiated &&
+		    !guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC))
+			return 1;
+		msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash
+			[msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0];
+		break;
 	case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
 		if (!nested_vmx_allowed(vcpu))
 			return 1;
@@ -2259,6 +2266,26 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		if (msr_info->host_initiated && data == 0)
 			vmx_leave_nested(vcpu);
 		break;
+	case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
+		/*
+		 * On real hardware, the LE hash MSRs are writable before
+		 * the firmware sets bit 0 in MSR 0x7a ("activating" SGX),
+		 * at which point SGX related bits in IA32_FEATURE_CONTROL
+		 * become writable.
+		 *
+		 * KVM does not emulate SGX activation for simplicity, so
+		 * allow writes to the LE hash MSRs if IA32_FEATURE_CONTROL
+		 * is unlocked.  This is technically not architectural
+		 * behavior, but it's close enough.
+		 */
+		if (!msr_info->host_initiated &&
+		    (!guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC) ||
+		    ((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) &&
+		    !(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED))))
+			return 1;
+		vmx->msr_ia32_sgxlepubkeyhash
+			[msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data;
+		break;
 	case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
 		if (!msr_info->host_initiated)
 			return 1; /* they are read-only */
@@ -7145,6 +7172,8 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
 	else
 		memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs));
 
+	vcpu_setup_sgx_lepubkeyhash(vcpu);
+
 	vmx->nested.posted_intr_nv = -1;
 	vmx->nested.current_vmptr = -1ull;
 
@@ -8087,6 +8116,8 @@ static __init int hardware_setup(void)
 	if (!enable_ept || !cpu_has_vmx_intel_pt())
 		pt_mode = PT_MODE_SYSTEM;
 
+	setup_default_sgx_lepubkeyhash();
+
 	if (nested) {
 		nested_vmx_setup_ctls_msrs(&vmcs_config.nested,
 					   vmx_capability.ept);
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 05eca210a5ff..1848fef1f96e 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -324,6 +324,9 @@ struct vcpu_vmx {
 	 */
 	u64 msr_ia32_feature_control;
 	u64 msr_ia32_feature_control_valid_bits;
+	/* SGX Launch Control public key hash */
+	u64 msr_ia32_sgxlepubkeyhash[4];
+
 	u64 ept_pointer;
 	u64 msr_ia32_mcu_opt_ctrl;
 	bool disable_fb_clear;
-- 
Gitee


From 97c47cac100800f3bcb258ad401e290dcd20a056 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Mon, 12 Apr 2021 16:21:41 +1200
Subject: [PATCH 023/674] KVM: VMX: Add ENCLS[EINIT] handler to support SGX
 Launch Control (LC)

mainline inclusion
from mainline-5.13
commit b6f084ca553845135ccade79ce6548035e52884a
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit b6f084ca5538 KVM: VMX: Add ENCLS[EINIT] handler to support
SGX Launch Control (LC).
Backport for SGX virtualization support

--------------------------------

Add a VM-Exit handler to trap-and-execute EINIT when SGX LC is enabled
in the host.  When SGX LC is enabled, the host kernel may rewrite the
hardware values at will, e.g. to launch enclaves with different signers,
thus KVM needs to intercept EINIT to ensure it is executed with the
correct LE hash (even if the guest sees a hardwired hash).

Switching the LE hash MSRs on VM-Enter/VM-Exit is not a viable option as
writing the MSRs is prohibitively expensive, e.g. on SKL hardware each
WRMSR is ~400 cycles.  And because EINIT takes tens of thousands of
cycles to execute, the ~1500 cycle overhead to trap-and-execute EINIT is
unlikely to be noticed by the guest, let alone impact its overall SGX
performance.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Kai Huang <kai.huang@intel.com>
Message-Id: <57c92fa4d2083eb3be9e6355e3882fc90cffea87.1618196135.git.kai.huang@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/kvm/vmx/sgx.c | 64 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)

diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c
index d53e6b17b320..2eed5da91698 100644
--- a/arch/x86/kvm/vmx/sgx.c
+++ b/arch/x86/kvm/vmx/sgx.c
@@ -287,6 +287,68 @@ static int handle_encls_ecreate(struct kvm_vcpu *vcpu)
 	return r;
 }
 
+static int handle_encls_einit(struct kvm_vcpu *vcpu)
+{
+	unsigned long sig_hva, secs_hva, token_hva, rflags;
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	gva_t sig_gva, secs_gva, token_gva;
+	gpa_t sig_gpa, secs_gpa, token_gpa;
+	int ret, trapnr;
+
+	if (sgx_get_encls_gva(vcpu, kvm_rbx_read(vcpu), 1808, 4096, &sig_gva) ||
+	    sgx_get_encls_gva(vcpu, kvm_rcx_read(vcpu), 4096, 4096, &secs_gva) ||
+	    sgx_get_encls_gva(vcpu, kvm_rdx_read(vcpu), 304, 512, &token_gva))
+		return 1;
+
+	/*
+	 * Translate the SIGSTRUCT, SECS and TOKEN pointers from GVA to GPA.
+	 * Resume the guest on failure to inject a #PF.
+	 */
+	if (sgx_gva_to_gpa(vcpu, sig_gva, false, &sig_gpa) ||
+	    sgx_gva_to_gpa(vcpu, secs_gva, true, &secs_gpa) ||
+	    sgx_gva_to_gpa(vcpu, token_gva, false, &token_gpa))
+		return 1;
+
+	/*
+	 * ...and then to HVA.  The order of accesses isn't architectural, i.e.
+	 * KVM doesn't have to fully process one address at a time.  Exit to
+	 * userspace if a GPA is invalid.  Note, all structures are aligned and
+	 * cannot split pages.
+	 */
+	if (sgx_gpa_to_hva(vcpu, sig_gpa, &sig_hva) ||
+	    sgx_gpa_to_hva(vcpu, secs_gpa, &secs_hva) ||
+	    sgx_gpa_to_hva(vcpu, token_gpa, &token_hva))
+		return 0;
+
+	ret = sgx_virt_einit((void __user *)sig_hva, (void __user *)token_hva,
+			     (void __user *)secs_hva,
+			     vmx->msr_ia32_sgxlepubkeyhash, &trapnr);
+
+	if (ret == -EFAULT)
+		return sgx_inject_fault(vcpu, secs_gva, trapnr);
+
+	/*
+	 * sgx_virt_einit() returns -EINVAL when access_ok() fails on @sig_hva,
+	 * @token_hva or @secs_hva. This should never happen as KVM checks host
+	 * addresses at memslot creation. sgx_virt_einit() has already warned
+	 * in this case, so just return.
+	 */
+	if (ret < 0)
+		return ret;
+
+	rflags = vmx_get_rflags(vcpu) & ~(X86_EFLAGS_CF | X86_EFLAGS_PF |
+					  X86_EFLAGS_AF | X86_EFLAGS_SF |
+					  X86_EFLAGS_OF);
+	if (ret)
+		rflags |= X86_EFLAGS_ZF;
+	else
+		rflags &= ~X86_EFLAGS_ZF;
+	vmx_set_rflags(vcpu, rflags);
+
+	kvm_rax_write(vcpu, ret);
+	return kvm_skip_emulated_instruction(vcpu);
+}
+
 static inline bool encls_leaf_enabled_in_guest(struct kvm_vcpu *vcpu, u32 leaf)
 {
 	if (!enable_sgx || !guest_cpuid_has(vcpu, X86_FEATURE_SGX))
@@ -319,6 +381,8 @@ int handle_encls(struct kvm_vcpu *vcpu)
 	} else {
 		if (leaf == ECREATE)
 			return handle_encls_ecreate(vcpu);
+		if (leaf == EINIT)
+			return handle_encls_einit(vcpu);
 		WARN(1, "KVM: unexpected exit on ENCLS[%u]", leaf);
 		vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
 		vcpu->run->hw.hardware_exit_reason = EXIT_REASON_ENCLS;
-- 
Gitee


From 5698b7e86eb371e324b632402364f89956e59e38 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Mon, 12 Apr 2021 16:21:42 +1200
Subject: [PATCH 024/674] KVM: VMX: Enable SGX virtualization for SGX1, SGX2
 and LC

mainline inclusion
from mainline-5.13
commit 72add915fbd5bf5c57deee3da5b2605e966ac199
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit 72add915fbd5 KVM: VMX: Enable SGX virtualization for
SGX1, SGX2 and LC.
Backport for SGX virtualization support

--------------------------------

Enable SGX virtualization now that KVM has the VM-Exit handlers needed
to trap-and-execute ENCLS to ensure correctness and/or enforce the CPU
model exposed to the guest.  Add a KVM module param, "sgx", to allow an
admin to disable SGX virtualization independent of the kernel.

When supported in hardware and the kernel, advertise SGX1, SGX2 and SGX
LC to userspace via CPUID and wire up the ENCLS_EXITING bitmap based on
the guest's SGX capabilities, i.e. to allow ENCLS to be executed in an
SGX-enabled guest.  With the exception of the provision key, all SGX
attribute bits may be exposed to the guest.  Guest access to the
provision key, which is controlled via securityfs, will be added in a
future patch.

Note, KVM does not yet support exposing ENCLS_C leafs or ENCLV leafs.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Kai Huang <kai.huang@intel.com>
Message-Id: <a99e9c23310c79f2f4175c1af4c4cbcef913c3e5.1618196135.git.kai.huang@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/kvm/cpuid.c      | 57 +++++++++++++++++++++++++++-
 arch/x86/kvm/vmx/nested.c | 27 +++++++++++--
 arch/x86/kvm/vmx/nested.h |  5 +++
 arch/x86/kvm/vmx/sgx.c    | 80 ++++++++++++++++++++++++++++++++++++++-
 arch/x86/kvm/vmx/sgx.h    | 13 +++++++
 arch/x86/kvm/vmx/vmcs12.c |  1 +
 arch/x86/kvm/vmx/vmcs12.h |  4 +-
 arch/x86/kvm/vmx/vmx.c    | 26 ++++++++++++-
 8 files changed, 204 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index c5fa5c62e391..6d6130ecf250 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -18,6 +18,7 @@
 #include <asm/processor.h>
 #include <asm/user.h>
 #include <asm/fpu/xstate.h>
+#include <asm/sgx.h>
 #include "cpuid.h"
 #include "lapic.h"
 #include "mmu.h"
@@ -170,6 +171,21 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
 		vcpu->arch.guest_supported_xcr0 =
 			(best->eax | ((u64)best->edx << 32)) & supported_xcr0;
 
+	/*
+	 * Bits 127:0 of the allowed SECS.ATTRIBUTES (CPUID.0x12.0x1) enumerate
+	 * the supported XSAVE Feature Request Mask (XFRM), i.e. the enclave's
+	 * requested XCR0 value.  The enclave's XFRM must be a subset of XCRO
+	 * at the time of EENTER, thus adjust the allowed XFRM by the guest's
+	 * supported XCR0.  Similar to XCR0 handling, FP and SSE are forced to
+	 * '1' even on CPUs that don't support XSAVE.
+	 */
+	best = kvm_find_cpuid_entry(vcpu, 0x12, 0x1);
+	if (best) {
+		best->ecx &= vcpu->arch.guest_supported_xcr0 & 0xffffffff;
+		best->edx &= vcpu->arch.guest_supported_xcr0 >> 32;
+		best->ecx |= XFEATURE_MASK_FPSSE;
+	}
+
 	kvm_update_pv_runtime(vcpu);
 
 	vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
@@ -412,7 +428,7 @@ void kvm_set_cpu_caps(void)
 	);
 
 	kvm_cpu_cap_mask(CPUID_7_0_EBX,
-		F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
+		F(FSGSBASE) | F(SGX) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
 		F(BMI2) | F(ERMS) | 0 /*INVPCID*/ | F(RTM) | 0 /*MPX*/ | F(RDSEED) |
 		F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) |
 		F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) |
@@ -423,7 +439,8 @@ void kvm_set_cpu_caps(void)
 		F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(RDPID) |
 		F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
 		F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
-		F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/
+		F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ |
+		F(SGX_LC)
 	);
 	/* Set LA57 based on hardware capability. */
 	if (cpuid_ecx(7) & F(LA57))
@@ -462,6 +479,10 @@ void kvm_set_cpu_caps(void)
 		F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES)
 	);
 
+	kvm_cpu_cap_init_scattered(CPUID_12_EAX,
+		SF(SGX1) | SF(SGX2)
+	);
+
 	kvm_cpu_cap_mask(CPUID_8000_0001_ECX,
 		F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
 		F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
@@ -785,6 +806,38 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
 			entry->edx = 0;
 		}
 		break;
+	case 0x12:
+		/* Intel SGX */
+		if (!kvm_cpu_cap_has(X86_FEATURE_SGX)) {
+			entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+			break;
+		}
+
+		/*
+		 * Index 0: Sub-features, MISCSELECT (a.k.a extended features)
+		 * and max enclave sizes.   The SGX sub-features and MISCSELECT
+		 * are restricted by kernel and KVM capabilities (like most
+		 * feature flags), while enclave size is unrestricted.
+		 */
+		cpuid_entry_override(entry, CPUID_12_EAX);
+		entry->ebx &= SGX_MISC_EXINFO;
+
+		entry = do_host_cpuid(array, function, 1);
+		if (!entry)
+			goto out;
+
+		/*
+		 * Index 1: SECS.ATTRIBUTES.  ATTRIBUTES are restricted a la
+		 * feature flags.  Advertise all supported flags, including
+		 * privileged attributes that require explicit opt-in from
+		 * userspace.  ATTRIBUTES.XFRM is not adjusted as userspace is
+		 * expected to derive it from supported XCR0.
+		 */
+		entry->eax &= SGX_ATTR_DEBUG | SGX_ATTR_MODE64BIT |
+			      /* PROVISIONKEY | */ SGX_ATTR_EINITTOKENKEY |
+			      SGX_ATTR_KSS;
+		entry->ebx &= 0;
+		break;
 	/* Intel PT */
 	case 0x14:
 		if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) {
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 0f871faeff1a..d2a0464e95c7 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -11,6 +11,7 @@
 #include "mmu.h"
 #include "nested.h"
 #include "pmu.h"
+#include "sgx.h"
 #include "trace.h"
 #include "x86.h"
 
@@ -2326,6 +2327,9 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
 		if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
 		    exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
 
+		if (exec_control & SECONDARY_EXEC_ENCLS_EXITING)
+			vmx_write_encls_bitmap(&vmx->vcpu, vmcs12);
+
 		secondary_exec_controls_set(vmx, exec_control);
 	}
 
@@ -5738,6 +5742,21 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
 	return false;
 }
 
+static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu,
+					  struct vmcs12 *vmcs12)
+{
+	u32 encls_leaf;
+
+	if (!guest_cpuid_has(vcpu, X86_FEATURE_SGX) ||
+	    !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING))
+		return false;
+
+	encls_leaf = kvm_rax_read(vcpu);
+	if (encls_leaf > 62)
+		encls_leaf = 63;
+	return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf);
+}
+
 static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu,
 	struct vmcs12 *vmcs12, gpa_t bitmap)
 {
@@ -5835,9 +5854,6 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu,
 	case EXIT_REASON_VMFUNC:
 		/* VM functions are emulated through L2->L0 vmexits. */
 		return true;
-	case EXIT_REASON_ENCLS:
-		/* SGX is never exposed to L1 */
-		return true;
 	default:
 		break;
 	}
@@ -5961,6 +5977,8 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu,
 	case EXIT_REASON_TPAUSE:
 		return nested_cpu_has2(vmcs12,
 			SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE);
+	case EXIT_REASON_ENCLS:
+		return nested_vmx_exit_handled_encls(vcpu, vmcs12);
 	default:
 		return true;
 	}
@@ -6536,6 +6554,9 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps)
 		msrs->secondary_ctls_high |=
 			SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
 
+	if (enable_sgx)
+		msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING;
+
 	/* miscellaneous data */
 	rdmsr(MSR_IA32_VMX_MISC,
 		msrs->misc_low,
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
index 197148d76b8f..184418baeb3c 100644
--- a/arch/x86/kvm/vmx/nested.h
+++ b/arch/x86/kvm/vmx/nested.h
@@ -244,6 +244,11 @@ static inline bool nested_exit_on_intr(struct kvm_vcpu *vcpu)
 		PIN_BASED_EXT_INTR_MASK;
 }
 
+static inline bool nested_cpu_has_encls_exit(struct vmcs12 *vmcs12)
+{
+	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING);
+}
+
 /*
  * if fixed0[i] == 1: val[i] must be 1
  * if fixed1[i] == 0: val[i] must be 0
diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c
index 2eed5da91698..6693ebdc0770 100644
--- a/arch/x86/kvm/vmx/sgx.c
+++ b/arch/x86/kvm/vmx/sgx.c
@@ -5,11 +5,13 @@
 
 #include "cpuid.h"
 #include "kvm_cache_regs.h"
+#include "nested.h"
 #include "sgx.h"
 #include "vmx.h"
 #include "x86.h"
 
-bool __read_mostly enable_sgx;
+bool __read_mostly enable_sgx = 1;
+module_param_named(sgx, enable_sgx, bool, 0444);
 
 /* Initial value of guest's virtual SGX_LEPUBKEYHASHn MSRs */
 static u64 sgx_pubkey_hash[4] __ro_after_init;
@@ -422,3 +424,79 @@ void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu)
 	memcpy(vmx->msr_ia32_sgxlepubkeyhash, sgx_pubkey_hash,
 	       sizeof(sgx_pubkey_hash));
 }
+
+/*
+ * ECREATE must be intercepted to enforce MISCSELECT, ATTRIBUTES and XFRM
+ * restrictions if the guest's allowed-1 settings diverge from hardware.
+ */
+static bool sgx_intercept_encls_ecreate(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *guest_cpuid;
+	u32 eax, ebx, ecx, edx;
+
+	if (!vcpu->kvm->arch.sgx_provisioning_allowed)
+		return true;
+
+	guest_cpuid = kvm_find_cpuid_entry(vcpu, 0x12, 0);
+	if (!guest_cpuid)
+		return true;
+
+	cpuid_count(0x12, 0, &eax, &ebx, &ecx, &edx);
+	if (guest_cpuid->ebx != ebx || guest_cpuid->edx != edx)
+		return true;
+
+	guest_cpuid = kvm_find_cpuid_entry(vcpu, 0x12, 1);
+	if (!guest_cpuid)
+		return true;
+
+	cpuid_count(0x12, 1, &eax, &ebx, &ecx, &edx);
+	if (guest_cpuid->eax != eax || guest_cpuid->ebx != ebx ||
+	    guest_cpuid->ecx != ecx || guest_cpuid->edx != edx)
+		return true;
+
+	return false;
+}
+
+void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+	/*
+	 * There is no software enable bit for SGX that is virtualized by
+	 * hardware, e.g. there's no CR4.SGXE, so when SGX is disabled in the
+	 * guest (either by the host or by the guest's BIOS) but enabled in the
+	 * host, trap all ENCLS leafs and inject #UD/#GP as needed to emulate
+	 * the expected system behavior for ENCLS.
+	 */
+	u64 bitmap = -1ull;
+
+	/* Nothing to do if hardware doesn't support SGX */
+	if (!cpu_has_vmx_encls_vmexit())
+		return;
+
+	if (guest_cpuid_has(vcpu, X86_FEATURE_SGX) &&
+	    sgx_enabled_in_guest_bios(vcpu)) {
+		if (guest_cpuid_has(vcpu, X86_FEATURE_SGX1)) {
+			bitmap &= ~GENMASK_ULL(ETRACK, ECREATE);
+			if (sgx_intercept_encls_ecreate(vcpu))
+				bitmap |= (1 << ECREATE);
+		}
+
+		if (guest_cpuid_has(vcpu, X86_FEATURE_SGX2))
+			bitmap &= ~GENMASK_ULL(EMODT, EAUG);
+
+		/*
+		 * Trap and execute EINIT if launch control is enabled in the
+		 * host using the guest's values for launch control MSRs, even
+		 * if the guest's values are fixed to hardware default values.
+		 * The MSRs are not loaded/saved on VM-Enter/VM-Exit as writing
+		 * the MSRs is extraordinarily expensive.
+		 */
+		if (boot_cpu_has(X86_FEATURE_SGX_LC))
+			bitmap |= (1 << EINIT);
+
+		if (!vmcs12 && is_guest_mode(vcpu))
+			vmcs12 = get_vmcs12(vcpu);
+		if (vmcs12 && nested_cpu_has_encls_exit(vmcs12))
+			bitmap |= vmcs12->encls_exiting_bitmap;
+	}
+	vmcs_write64(ENCLS_EXITING_BITMAP, bitmap);
+}
diff --git a/arch/x86/kvm/vmx/sgx.h b/arch/x86/kvm/vmx/sgx.h
index 6502fa52c7e9..a400888b376d 100644
--- a/arch/x86/kvm/vmx/sgx.h
+++ b/arch/x86/kvm/vmx/sgx.h
@@ -4,6 +4,9 @@
 
 #include <linux/kvm_host.h>
 
+#include "capabilities.h"
+#include "vmx_ops.h"
+
 #ifdef CONFIG_X86_SGX_KVM
 extern bool __read_mostly enable_sgx;
 
@@ -11,11 +14,21 @@ int handle_encls(struct kvm_vcpu *vcpu);
 
 void setup_default_sgx_lepubkeyhash(void);
 void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu);
+
+void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12);
 #else
 #define enable_sgx 0
 
 static inline void setup_default_sgx_lepubkeyhash(void) { }
 static inline void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu) { }
+
+static inline void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu,
+					  struct vmcs12 *vmcs12)
+{
+	/* Nothing to do if hardware doesn't support SGX */
+	if (cpu_has_vmx_encls_vmexit())
+		vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
+}
 #endif
 
 #endif /* __KVM_X86_SGX_H */
diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c
index c8e51c004f78..034adb6404dc 100644
--- a/arch/x86/kvm/vmx/vmcs12.c
+++ b/arch/x86/kvm/vmx/vmcs12.c
@@ -50,6 +50,7 @@ const unsigned short vmcs_field_to_offset_table[] = {
 	FIELD64(VMREAD_BITMAP, vmread_bitmap),
 	FIELD64(VMWRITE_BITMAP, vmwrite_bitmap),
 	FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
+	FIELD64(ENCLS_EXITING_BITMAP, encls_exiting_bitmap),
 	FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
 	FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
 	FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h
index 80232daf00ff..13494956d0e9 100644
--- a/arch/x86/kvm/vmx/vmcs12.h
+++ b/arch/x86/kvm/vmx/vmcs12.h
@@ -69,7 +69,8 @@ struct __packed vmcs12 {
 	u64 vm_function_control;
 	u64 eptp_list_address;
 	u64 pml_address;
-	u64 padding64[3]; /* room for future expansion */
+	u64 encls_exiting_bitmap;
+	u64 padding64[2]; /* room for future expansion */
 	/*
 	 * To allow migration of L1 (complete with its L2 guests) between
 	 * machines of different natural widths (32 or 64 bit), we cannot have
@@ -256,6 +257,7 @@ static inline void vmx_check_vmcs12_offsets(void)
 	CHECK_OFFSET(vm_function_control, 296);
 	CHECK_OFFSET(eptp_list_address, 304);
 	CHECK_OFFSET(pml_address, 312);
+	CHECK_OFFSET(encls_exiting_bitmap, 320);
 	CHECK_OFFSET(cr0_guest_host_mask, 344);
 	CHECK_OFFSET(cr4_guest_host_mask, 352);
 	CHECK_OFFSET(cr0_read_shadow, 360);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index df1cdca49fd2..6bb07e495eca 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -2265,6 +2265,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		vmx->msr_ia32_feature_control = data;
 		if (msr_info->host_initiated && data == 0)
 			vmx_leave_nested(vcpu);
+
+		/* SGX may be enabled/disabled by guest's firmware */
+		vmx_write_encls_bitmap(vcpu, NULL);
 		break;
 	case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
 		/*
@@ -4554,8 +4557,7 @@ static void init_vmcs(struct vcpu_vmx *vmx)
 		vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
 	}
 
-	if (cpu_has_vmx_encls_vmexit())
-		vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
+	vmx_write_encls_bitmap(&vmx->vcpu, NULL);
 
 	if (vmx_pt_mode_is_host_guest()) {
 		memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc));
@@ -7500,6 +7502,19 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
 
 	set_cr4_guest_host_mask(vmx);
 
+	vmx_write_encls_bitmap(vcpu, NULL);
+	if (guest_cpuid_has(vcpu, X86_FEATURE_SGX))
+		vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED;
+	else
+		vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED;
+
+	if (guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC))
+		vmx->msr_ia32_feature_control_valid_bits |=
+			FEAT_CTL_SGX_LC_ENABLED;
+	else
+		vmx->msr_ia32_feature_control_valid_bits &=
+			~FEAT_CTL_SGX_LC_ENABLED;
+
 	/* Refresh #PF interception to account for MAXPHYADDR changes. */
 	update_exception_bitmap(vcpu);
 }
@@ -7520,6 +7535,13 @@ static __init void vmx_set_cpu_caps(void)
 	if (vmx_pt_mode_is_host_guest())
 		kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT);
 
+	if (!enable_sgx) {
+		kvm_cpu_cap_clear(X86_FEATURE_SGX);
+		kvm_cpu_cap_clear(X86_FEATURE_SGX_LC);
+		kvm_cpu_cap_clear(X86_FEATURE_SGX1);
+		kvm_cpu_cap_clear(X86_FEATURE_SGX2);
+	}
+
 	if (vmx_umip_emulated())
 		kvm_cpu_cap_set(X86_FEATURE_UMIP);
 
-- 
Gitee


From 2c1e5e2fae715defb064ad661e8f8812ded8f3c3 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Mon, 12 Apr 2021 16:21:43 +1200
Subject: [PATCH 025/674] KVM: x86: Add capability to grant VM access to
 privileged SGX attribute

mainline inclusion
from mainline-5.13
commit fe7e948837f312d87853b3fce743795d1ae3715a
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit fe7e948837f3 KVM: x86: Add capability to grant VM
access to privileged SGX attribute.
Backport for SGX virtualization support

--------------------------------

Add a capability, KVM_CAP_SGX_ATTRIBUTE, that can be used by userspace
to grant a VM access to a priveleged attribute, with args[0] holding a
file handle to a valid SGX attribute file.

The SGX subsystem restricts access to a subset of enclave attributes to
provide additional security for an uncompromised kernel, e.g. to prevent
malware from using the PROVISIONKEY to ensure its nodes are running
inside a geniune SGX enclave and/or to obtain a stable fingerprint.

To prevent userspace from circumventing such restrictions by running an
enclave in a VM, KVM restricts guest access to privileged attributes by
default.

Cc: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Kai Huang <kai.huang@intel.com>
Message-Id: <0b099d65e933e068e3ea934b0523bab070cb8cea.1618196135.git.kai.huang@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 Documentation/virt/kvm/api.rst | 23 +++++++++++++++++++++++
 arch/x86/kvm/cpuid.c           |  2 +-
 arch/x86/kvm/x86.c             | 21 +++++++++++++++++++++
 include/uapi/linux/kvm.h       |  1 +
 4 files changed, 46 insertions(+), 1 deletion(-)

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 56deaceab625..671bbd90f15b 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -6197,6 +6197,29 @@ KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR exit notifications which user space
 can then handle to implement model specific MSR handling and/or user notifications
 to inform a user that an MSR was not handled.
 
+7.25 KVM_CAP_SGX_ATTRIBUTE
+----------------------
+
+:Architectures: x86
+:Target: VM
+:Parameters: args[0] is a file handle of a SGX attribute file in securityfs
+:Returns: 0 on success, -EINVAL if the file handle is invalid or if a requested
+          attribute is not supported by KVM.
+
+KVM_CAP_SGX_ATTRIBUTE enables a userspace VMM to grant a VM access to one or
+more priveleged enclave attributes.  args[0] must hold a file handle to a valid
+SGX attribute file corresponding to an attribute that is supported/restricted
+by KVM (currently only PROVISIONKEY).
+
+The SGX subsystem restricts access to a subset of enclave attributes to provide
+additional security for an uncompromised kernel, e.g. use of the PROVISIONKEY
+is restricted to deter malware from using the PROVISIONKEY to obtain a stable
+system fingerprint.  To prevent userspace from circumventing such restrictions
+by running an enclave in a VM, KVM prevents access to privileged attributes by
+default.
+
+See Documentation/x86/sgx/2.Kernel-internals.rst for more details.
+
 8. Other capabilities.
 ======================
 
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 6d6130ecf250..72e7db3a0ad1 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -834,7 +834,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
 		 * expected to derive it from supported XCR0.
 		 */
 		entry->eax &= SGX_ATTR_DEBUG | SGX_ATTR_MODE64BIT |
-			      /* PROVISIONKEY | */ SGX_ATTR_EINITTOKENKEY |
+			      SGX_ATTR_PROVISIONKEY | SGX_ATTR_EINITTOKENKEY |
 			      SGX_ATTR_KSS;
 		entry->ebx &= 0;
 		break;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4595c7dbfa93..4e28a6d6dd1a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -74,6 +74,7 @@
 #include <asm/tlbflush.h>
 #include <asm/intel_pt.h>
 #include <asm/emulate_prefix.h>
+#include <asm/sgx.h>
 #include <clocksource/hyperv_timer.h>
 
 #define CREATE_TRACE_POINTS
@@ -3842,6 +3843,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_X86_USER_SPACE_MSR:
 	case KVM_CAP_X86_MSR_FILTER:
 	case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
+#ifdef CONFIG_X86_SGX_KVM
+	case KVM_CAP_SGX_ATTRIBUTE:
+#endif
 		r = 1;
 		break;
 	case KVM_CAP_SYNC_REGS:
@@ -5394,6 +5398,23 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
 		kvm->arch.user_space_msr_mask = cap->args[0];
 		r = 0;
 		break;
+#ifdef CONFIG_X86_SGX_KVM
+	case KVM_CAP_SGX_ATTRIBUTE: {
+		unsigned long allowed_attributes = 0;
+
+		r = sgx_set_attribute(&allowed_attributes, cap->args[0]);
+		if (r)
+			break;
+
+		/* KVM only supports the PROVISIONKEY privileged attribute. */
+		if ((allowed_attributes & SGX_ATTR_PROVISIONKEY) &&
+		    !(allowed_attributes & ~SGX_ATTR_PROVISIONKEY))
+			kvm->arch.sgx_provisioning_allowed = true;
+		else
+			r = -EINVAL;
+		break;
+	}
+#endif
 	default:
 		r = -EINVAL;
 		break;
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 3af8b0164f1e..4e0ebd66368f 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1061,6 +1061,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_X86_USER_SPACE_MSR 188
 #define KVM_CAP_X86_MSR_FILTER 189
 #define KVM_CAP_ENFORCE_PV_FEATURE_CPUID 190
+#define KVM_CAP_SGX_ATTRIBUTE 196
 
 #define KVM_CAP_ARM_CPU_FEATURE 555
 
-- 
Gitee


From 90772c1249e2e5fa19753dc3025fb88ccb150d5e Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Tue, 20 Apr 2021 18:08:50 -0700
Subject: [PATCH 026/674] KVM: x86: Fix implicit enum conversion goof in
 scattered reverse CPUID code

mainline inclusion
from mainline-5.13
commit 462f8ddebccbb8a364b154008212052d515ac6b1
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit 462f8ddebccb KVM: x86: Fix implicit enum conversion
goof in scattered reverse CPUID code.
Backport for SGX virtualization support

--------------------------------

Take "enum kvm_only_cpuid_leafs" in scattered specific CPUID helpers
(which is obvious in hindsight), and use "unsigned int" for leafs that
can be the kernel's standard "enum cpuid_leaf" or the aforementioned
KVM-only variant.  Loss of the enum params is a bit disapponting, but
gcc obviously isn't providing any extra sanity checks, and the various
BUILD_BUG_ON() assertions ensure the input is in range.

This fixes implicit enum conversions that are detected by clang-11:

arch/x86/kvm/cpuid.c:499:29: warning: implicit conversion from enumeration type 'enum kvm_only_cpuid_leafs' to different enumeration type 'enum cpuid_leafs' [-Wenum-conversion]
        kvm_cpu_cap_init_scattered(CPUID_12_EAX,
        ~~~~~~~~~~~~~~~~~~~~~~~~~~ ^~~~~~~~~~~~
arch/x86/kvm/cpuid.c:837:31: warning: implicit conversion from enumeration type 'enum kvm_only_cpuid_leafs' to different enumeration type 'enum cpuid_leafs' [-Wenum-conversion]
                cpuid_entry_override(entry, CPUID_12_EAX);
                ~~~~~~~~~~~~~~~~~~~~        ^~~~~~~~~~~~
2 warnings generated.

Fixes: 4e66c0cb79b7 ("KVM: x86: Add support for reverse CPUID lookup of scattered features")
Cc: Kai Huang <kai.huang@intel.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20210421010850.3009718-1-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/kvm/cpuid.c | 5 +++--
 arch/x86/kvm/cpuid.h | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 72e7db3a0ad1..911c6efe60aa 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -348,7 +348,7 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
 }
 
 /* Mask kvm_cpu_caps for @leaf with the raw CPUID capabilities of this CPU. */
-static __always_inline void __kvm_cpu_cap_mask(enum cpuid_leafs leaf)
+static __always_inline void __kvm_cpu_cap_mask(unsigned int leaf)
 {
 	const struct cpuid_reg cpuid = x86_feature_cpuid(leaf * 32);
 	struct kvm_cpuid_entry2 entry;
@@ -361,7 +361,8 @@ static __always_inline void __kvm_cpu_cap_mask(enum cpuid_leafs leaf)
 	kvm_cpu_caps[leaf] &= *__cpuid_entry_get_reg(&entry, cpuid.reg);
 }
 
-static __always_inline void kvm_cpu_cap_init_scattered(enum cpuid_leafs leaf, u32 mask)
+static __always_inline
+void kvm_cpu_cap_init_scattered(enum kvm_only_cpuid_leafs leaf, u32 mask)
 {
 	/* Use kvm_cpu_cap_mask for non-scattered leafs. */
 	BUILD_BUG_ON(leaf < NCAPINTS);
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index fbef5b730805..9fbc0de7c337 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -218,7 +218,7 @@ static __always_inline void cpuid_entry_change(struct kvm_cpuid_entry2 *entry,
 }
 
 static __always_inline void cpuid_entry_override(struct kvm_cpuid_entry2 *entry,
-						 enum cpuid_leafs leaf)
+						 unsigned int leaf)
 {
 	u32 *reg = cpuid_entry_get_reg(entry, leaf * 32);
 
-- 
Gitee


From ef2f0cf07504e7ae045d5117f20271fc12181baf Mon Sep 17 00:00:00 2001
From: Jarkko Sakkinen <jarkko@kernel.org>
Date: Thu, 8 Apr 2021 12:29:24 +0300
Subject: [PATCH 027/674] x86/sgx: Do not update sgx_nr_free_pages in
 sgx_setup_epc_section()

mainline inclusion
from mainline-5.13
commit ae40aaf6bdbf0354a75b8284a0de453fcf5f4d32
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit ae40aaf6bdbf x86/sgx: Do not update sgx_nr_free_pages in
sgx_setup_epc_section().
Backport for SGX virtualization support

--------------------------------

The commit in Fixes: changed the SGX EPC page sanitization to end up in
sgx_free_epc_page() which puts clean and sanitized pages on the free
list.

This was done for the reason that it is best to keep the logic to assign
available-for-use EPC pages to the correct NUMA lists in a single
location.

sgx_nr_free_pages is also incremented by sgx_free_epc_pages() but those
pages which are being added there per EPC section do not belong to the
free list yet because they haven't been sanitized yet - they land on the
dirty list first and the sanitization happens later when ksgxd starts
massaging them.

So remove that addition there and have sgx_free_epc_page() do that
solely.

 [ bp: Sanitize commit message too. ]

Fixes: 51ab30eb2ad4 ("x86/sgx: Replace section->init_laundry_list with sgx_dirty_page_list")
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Link: https://lkml.kernel.org/r/20210408092924.7032-1-jarkko@kernel.org
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/kernel/cpu/sgx/main.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 92cb11dffd4c..ad904747419e 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -656,7 +656,6 @@ static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
 		list_add_tail(&section->pages[i].list, &sgx_dirty_page_list);
 	}
 
-	sgx_nr_free_pages += nr_pages;
 	return true;
 }
 
-- 
Gitee


From b2386478f855d09c726379aecc85a95bcbd461af Mon Sep 17 00:00:00 2001
From: Kai Huang <kai.huang@intel.com>
Date: Tue, 15 Jun 2021 22:16:39 +1200
Subject: [PATCH 028/674] x86/sgx: Add missing xa_destroy() when virtual EPC is
 destroyed

mainline inclusion
from mainline-5.13
commit 4692bc775d2180a937335ccba0edce557103d44a
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit 4692bc775d21 x86/sgx: Add missing xa_destroy() when
virtual EPC is destroyed.
Backport for SGX virtualization support

--------------------------------

xa_destroy() needs to be called to destroy a virtual EPC's page array
before calling kfree() to free the virtual EPC. Currently it is not
called so add the missing xa_destroy().

Fixes: 540745ddbc70 ("x86/sgx: Introduce virtual EPC for use by KVM guests")
Signed-off-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Dave Hansen <dave.hansen@intel.com>
Tested-by: Yang Zhong <yang.zhong@intel.com>
Link: https://lkml.kernel.org/r/20210615101639.291929-1-kai.huang@intel.com
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/kernel/cpu/sgx/virt.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c
index 7d221eac716a..8f7dd1dc4189 100644
--- a/arch/x86/kernel/cpu/sgx/virt.c
+++ b/arch/x86/kernel/cpu/sgx/virt.c
@@ -212,6 +212,7 @@ static int sgx_vepc_release(struct inode *inode, struct file *file)
 		list_splice_tail(&secs_pages, &zombie_secs_pages);
 	mutex_unlock(&zombie_secs_pages_lock);
 
+	xa_destroy(&vepc->page_array);
 	kfree(vepc);
 
 	return 0;
-- 
Gitee


From 254084d399e4a2d6a1b1fb082568c77d2b8c7c42 Mon Sep 17 00:00:00 2001
From: Liam Howlett <liam.howlett@oracle.com>
Date: Tue, 23 Mar 2021 13:42:21 +0000
Subject: [PATCH 029/674] i915_vma: Rename vma_lookup to i915_vma_lookup

mainline inclusion
from mainline-5.13
commit 547be6a479fd19fe1071d2cf76ed574fbff13481
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit 547be6a479fd i915_vma: Rename vma_lookup to
i915_vma_lookup.
Backport for SGX virtualization support

--------------------------------

Use i915 prefix to avoid name collision with future vma_lookup() in mm.

Signed-off-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/msgid/20210323134208.3077275-1-Liam.Howlett@Oracle.com
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 drivers/gpu/drm/i915/i915_vma.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
index 50a86fd89d00..9aa4a6ce9fbf 100644
--- a/drivers/gpu/drm/i915/i915_vma.c
+++ b/drivers/gpu/drm/i915/i915_vma.c
@@ -230,7 +230,7 @@ vma_create(struct drm_i915_gem_object *obj,
 }
 
 static struct i915_vma *
-vma_lookup(struct drm_i915_gem_object *obj,
+i915_vma_lookup(struct drm_i915_gem_object *obj,
 	   struct i915_address_space *vm,
 	   const struct i915_ggtt_view *view)
 {
@@ -278,7 +278,7 @@ i915_vma_instance(struct drm_i915_gem_object *obj,
 	GEM_BUG_ON(!atomic_read(&vm->open));
 
 	spin_lock(&obj->vma.lock);
-	vma = vma_lookup(obj, vm, view);
+	vma = i915_vma_lookup(obj, vm, view);
 	spin_unlock(&obj->vma.lock);
 
 	/* vma_create() will resolve the race if another creates the vma */
-- 
Gitee


From 24b6b038317e9d646e4d8ff17915a7454ec287a3 Mon Sep 17 00:00:00 2001
From: Liam Howlett <liam.howlett@oracle.com>
Date: Mon, 28 Jun 2021 19:38:50 -0700
Subject: [PATCH 030/674] mm: add vma_lookup(), update find_vma_intersection()
 comments

mainline inclusion
from mainline-5.14
commit ce6d42f2e4a2d98898419743b037a95661e3ac9d
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit ce6d42f2e4a2  mm: add vma_lookup(), update
find_vma_intersection() comments.
Backport for SGX virtualization support

--------------------------------

Patch series "mm: Add vma_lookup()", v2.

Many places in the kernel use find_vma() to get a vma and then check the
start address of the vma to ensure the next vma was not returned.

Other places use the find_vma_intersection() call with add, addr + 1 as
the range; looking for just the vma at a specific address.

The third use of find_vma() is by developers who do not know that the
function starts searching at the provided address upwards for the next
vma.  This results in a bug that is often overlooked for a long time.

Adding the new vma_lookup() function will allow for cleaner code by
removing the find_vma() calls which check limits, making
find_vma_intersection() calls of a single address to be shorter, and
potentially reduce the incorrect uses of find_vma().

This patch (of 22):

Many places in the kernel use find_vma() to get a vma and then check the
start address of the vma to ensure the next vma was not returned.

Other places use the find_vma_intersection() call with add, addr + 1 as
the range; looking for just the vma at a specific address.

The third use of find_vma() is by developers who do not know that the
function starts searching at the provided address upwards for the next
vma.  This results in a bug that is often overlooked for a long time.

Adding the new vma_lookup() function will allow for cleaner code by
removing the find_vma() calls which check limits, making
find_vma_intersection() calls of a single address to be shorter, and
potentially reduce the incorrect uses of find_vma().

Also change find_vma_intersection() comments and declaration to be of the
correct length and add kernel documentation style comment.

Link: https://lkml.kernel.org/r/20210521174745.2219620-1-Liam.Howlett@Oracle.com
Link: https://lkml.kernel.org/r/20210521174745.2219620-2-Liam.Howlett@Oracle.com
Signed-off-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
Reviewed-by: Laurent Dufour <ldufour@linux.ibm.com>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Davidlohr Bueso <dbueso@suse.de>
Cc: David Miller <davem@davemloft.net>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 include/linux/mm.h | 36 ++++++++++++++++++++++++++++++++----
 1 file changed, 32 insertions(+), 4 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index a886f48b6a0e..b0ddfac0486b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2712,17 +2712,45 @@ extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long add
 extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr,
 					     struct vm_area_struct **pprev);
 
-/* Look up the first VMA which intersects the interval start_addr..end_addr-1,
-   NULL if none.  Assume start_addr < end_addr. */
-static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr)
+/**
+ * find_vma_intersection() - Look up the first VMA which intersects the interval
+ * @mm: The process address space.
+ * @start_addr: The inclusive start user address.
+ * @end_addr: The exclusive end user address.
+ *
+ * Returns: The first VMA within the provided range, %NULL otherwise.  Assumes
+ * start_addr < end_addr.
+ */
+static inline
+struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
+					     unsigned long start_addr,
+					     unsigned long end_addr)
 {
-	struct vm_area_struct * vma = find_vma(mm,start_addr);
+	struct vm_area_struct *vma = find_vma(mm, start_addr);
 
 	if (vma && end_addr <= vma->vm_start)
 		vma = NULL;
 	return vma;
 }
 
+/**
+ * vma_lookup() - Find a VMA at a specific address
+ * @mm: The process address space.
+ * @addr: The user address.
+ *
+ * Return: The vm_area_struct at the given address, %NULL otherwise.
+ */
+static inline
+struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr)
+{
+	struct vm_area_struct *vma = find_vma(mm, addr);
+
+	if (vma && addr < vma->vm_start)
+		vma = NULL;
+
+	return vma;
+}
+
 static inline unsigned long vm_start_gap(struct vm_area_struct *vma)
 {
 	unsigned long vm_start = vma->vm_start;
-- 
Gitee


From daeb457c98273f7cb1a1eb1dd9a2eb2473b817c2 Mon Sep 17 00:00:00 2001
From: Liam Howlett <liam.howlett@oracle.com>
Date: Mon, 28 Jun 2021 19:39:14 -0700
Subject: [PATCH 031/674] x86/sgx: use vma_lookup() in sgx_encl_find()

mainline inclusion
from mainline-5.14
commit 9ce2c3fc0be6e7d0bb2236a33bbb7a0f1943bd81
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit 9ce2c3fc0be6 x86/sgx: use vma_lookup() in sgx_encl_find().
Backport for SGX virtualization support

--------------------------------

Use vma_lookup() to find the VMA at a specific address.  As vma_lookup()
will return NULL if the address is not within any VMA, the start address
no longer needs to be validated.

Link: https://lkml.kernel.org/r/20210521174745.2219620-10-Liam.Howlett@Oracle.com
Signed-off-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
Reviewed-by: Laurent Dufour <ldufour@linux.ibm.com>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Davidlohr Bueso <dbueso@suse.de>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/kernel/cpu/sgx/encl.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h
index 6e74f85b6264..fec43ca65065 100644
--- a/arch/x86/kernel/cpu/sgx/encl.h
+++ b/arch/x86/kernel/cpu/sgx/encl.h
@@ -91,8 +91,8 @@ static inline int sgx_encl_find(struct mm_struct *mm, unsigned long addr,
 {
 	struct vm_area_struct *result;
 
-	result = find_vma(mm, addr);
-	if (!result || result->vm_ops != &sgx_vm_ops || addr < result->vm_start)
+	result = vma_lookup(mm, addr);
+	if (!result || result->vm_ops != &sgx_vm_ops)
 		return -EINVAL;
 
 	*vma = result;
-- 
Gitee


From 75c51ab6a7038468a77ba92da865408e4b562391 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 21 Oct 2021 16:11:54 -0400
Subject: [PATCH 032/674] x86/sgx/virt: extract sgx_vepc_remove_page

mainline inclusion
from mainline-5.14
commit fd5128e622d7834bb3f7ee23c2bbea8db63cebaf
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit fd5128e622d7 x86/sgx/virt: extract sgx_vepc_remove_page.
Backport for SGX virtualization support

--------------------------------

For bare-metal SGX on real hardware, the hardware provides guarantees
SGX state at reboot.  For instance, all pages start out uninitialized.
The vepc driver provides a similar guarantee today for freshly-opened
vepc instances, but guests such as Windows expect all pages to be in
uninitialized state on startup, including after every guest reboot.

One way to do this is to simply close and reopen the /dev/sgx_vepc file
descriptor and re-mmap the virtual EPC.  However, this is problematic
because it prevents sandboxing the userspace (for example forbidding
open() after the guest starts; this is doable with heavy use of SCM_RIGHTS
file descriptor passing).

In order to implement this, we will need a ioctl that performs
EREMOVE on all pages mapped by a /dev/sgx_vepc file descriptor:
other possibilities, such as closing and reopening the device,
are racy.

Start the implementation by creating a separate function with just
the __eremove wrapper.

Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Reviewed-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Link: https://lkml.kernel.org/r/20211021201155.1523989-2-pbonzini@redhat.com
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/kernel/cpu/sgx/virt.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c
index 8f7dd1dc4189..09674c824ae9 100644
--- a/arch/x86/kernel/cpu/sgx/virt.c
+++ b/arch/x86/kernel/cpu/sgx/virt.c
@@ -111,10 +111,8 @@ static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma)
 	return 0;
 }
 
-static int sgx_vepc_free_page(struct sgx_epc_page *epc_page)
+static int sgx_vepc_remove_page(struct sgx_epc_page *epc_page)
 {
-	int ret;
-
 	/*
 	 * Take a previously guest-owned EPC page and return it to the
 	 * general EPC page pool.
@@ -124,7 +122,12 @@ static int sgx_vepc_free_page(struct sgx_epc_page *epc_page)
 	 * case that a guest properly EREMOVE'd this page, a superfluous
 	 * EREMOVE is harmless.
 	 */
-	ret = __eremove(sgx_get_epc_virt_addr(epc_page));
+	return __eremove(sgx_get_epc_virt_addr(epc_page));
+}
+
+static int sgx_vepc_free_page(struct sgx_epc_page *epc_page)
+{
+	int ret = sgx_vepc_remove_page(epc_page);
 	if (ret) {
 		/*
 		 * Only SGX_CHILD_PRESENT is expected, which is because of
@@ -144,7 +147,6 @@ static int sgx_vepc_free_page(struct sgx_epc_page *epc_page)
 	}
 
 	sgx_free_epc_page(epc_page);
-
 	return 0;
 }
 
-- 
Gitee


From 1c4d239038fb183fe2cba97798a0fed4d4601519 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 21 Oct 2021 16:11:55 -0400
Subject: [PATCH 033/674] x86/sgx/virt: implement SGX_IOC_VEPC_REMOVE ioctl

mainline inclusion
from mainline-5.16
commit ae095b16fc652f459e6c16a256834985c85ecc4d
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit ae095b16fc65 x86/sgx/virt: implement SGX_IOC_VEPC_REMOVE
ioctl.
Backport for SGX virtualization support

--------------------------------

For bare-metal SGX on real hardware, the hardware provides guarantees
SGX state at reboot.  For instance, all pages start out uninitialized.
The vepc driver provides a similar guarantee today for freshly-opened
vepc instances, but guests such as Windows expect all pages to be in
uninitialized state on startup, including after every guest reboot.

Some userspace implementations of virtual SGX would rather avoid having
to close and reopen the /dev/sgx_vepc file descriptor and re-mmap the
virtual EPC.  For example, they could sandbox themselves after the guest
starts and forbid further calls to open(), in order to mitigate exploits
from untrusted guests.

Therefore, add a ioctl that does this with EREMOVE.  Userspace can
invoke the ioctl to bring its vEPC pages back to uninitialized state.
There is a possibility that some pages fail to be removed if they are
SECS pages, and the child and SECS pages could be in separate vEPC
regions.  Therefore, the ioctl returns the number of EREMOVE failures,
telling userspace to try the ioctl again after it's done with all
vEPC regions.  A more verbose description of the correct usage and
the possible error conditions is documented in sgx.rst.

Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Reviewed-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Link: https://lkml.kernel.org/r/20211021201155.1523989-3-pbonzini@redhat.com
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 Documentation/x86/sgx.rst       | 35 ++++++++++++++++++++++
 arch/x86/include/uapi/asm/sgx.h |  2 ++
 arch/x86/kernel/cpu/sgx/virt.c  | 53 +++++++++++++++++++++++++++++++++
 3 files changed, 90 insertions(+)

diff --git a/Documentation/x86/sgx.rst b/Documentation/x86/sgx.rst
index dd0ac96ff9ef..a608f667fb95 100644
--- a/Documentation/x86/sgx.rst
+++ b/Documentation/x86/sgx.rst
@@ -250,3 +250,38 @@ user wants to deploy SGX applications both on the host and in guests
 on the same machine, the user should reserve enough EPC (by taking out
 total virtual EPC size of all SGX VMs from the physical EPC size) for
 host SGX applications so they can run with acceptable performance.
+
+Architectural behavior is to restore all EPC pages to an uninitialized
+state also after a guest reboot.  Because this state can be reached only
+through the privileged ``ENCLS[EREMOVE]`` instruction, ``/dev/sgx_vepc``
+provides the ``SGX_IOC_VEPC_REMOVE_ALL`` ioctl to execute the instruction
+on all pages in the virtual EPC.
+
+``EREMOVE`` can fail for three reasons.  Userspace must pay attention
+to expected failures and handle them as follows:
+
+1. Page removal will always fail when any thread is running in the
+   enclave to which the page belongs.  In this case the ioctl will
+   return ``EBUSY`` independent of whether it has successfully removed
+   some pages; userspace can avoid these failures by preventing execution
+   of any vcpu which maps the virtual EPC.
+
+2. Page removal will cause a general protection fault if two calls to
+   ``EREMOVE`` happen concurrently for pages that refer to the same
+   "SECS" metadata pages.  This can happen if there are concurrent
+   invocations to ``SGX_IOC_VEPC_REMOVE_ALL``, or if a ``/dev/sgx_vepc``
+   file descriptor in the guest is closed at the same time as
+   ``SGX_IOC_VEPC_REMOVE_ALL``; it will also be reported as ``EBUSY``.
+   This can be avoided in userspace by serializing calls to the ioctl()
+   and to close(), but in general it should not be a problem.
+
+3. Finally, page removal will fail for SECS metadata pages which still
+   have child pages.  Child pages can be removed by executing
+   ``SGX_IOC_VEPC_REMOVE_ALL`` on all ``/dev/sgx_vepc`` file descriptors
+   mapped into the guest.  This means that the ioctl() must be called
+   twice: an initial set of calls to remove child pages and a subsequent
+   set of calls to remove SECS pages.  The second set of calls is only
+   required for those mappings that returned a nonzero value from the
+   first call.  It indicates a bug in the kernel or the userspace client
+   if any of the second round of ``SGX_IOC_VEPC_REMOVE_ALL`` calls has
+   a return code other than 0.
diff --git a/arch/x86/include/uapi/asm/sgx.h b/arch/x86/include/uapi/asm/sgx.h
index 791e45334a4a..c815a6fec9aa 100644
--- a/arch/x86/include/uapi/asm/sgx.h
+++ b/arch/x86/include/uapi/asm/sgx.h
@@ -27,6 +27,8 @@ enum sgx_page_flags {
 	_IOW(SGX_MAGIC, 0x02, struct sgx_enclave_init)
 #define SGX_IOC_ENCLAVE_PROVISION \
 	_IOW(SGX_MAGIC, 0x03, struct sgx_enclave_provision)
+#define SGX_IOC_VEPC_REMOVE_ALL \
+	_IO(SGX_MAGIC, 0x04)
 
 /**
  * struct sgx_enclave_create - parameter structure for the
diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c
index 09674c824ae9..b4f9a50de776 100644
--- a/arch/x86/kernel/cpu/sgx/virt.c
+++ b/arch/x86/kernel/cpu/sgx/virt.c
@@ -150,6 +150,41 @@ static int sgx_vepc_free_page(struct sgx_epc_page *epc_page)
 	return 0;
 }
 
+static long sgx_vepc_remove_all(struct sgx_vepc *vepc)
+{
+	struct sgx_epc_page *entry;
+	unsigned long index;
+	long failures = 0;
+
+	xa_for_each(&vepc->page_array, index, entry) {
+		int ret = sgx_vepc_remove_page(entry);
+		if (ret) {
+			if (ret == SGX_CHILD_PRESENT) {
+				/* The page is a SECS, userspace will retry.  */
+				failures++;
+			} else {
+				/*
+				 * Report errors due to #GP or SGX_ENCLAVE_ACT; do not
+				 * WARN, as userspace can induce said failures by
+				 * calling the ioctl concurrently on multiple vEPCs or
+				 * while one or more CPUs is running the enclave.  Only
+				 * a #PF on EREMOVE indicates a kernel/hardware issue.
+				 */
+				WARN_ON_ONCE(encls_faulted(ret) &&
+					     ENCLS_TRAPNR(ret) != X86_TRAP_GP);
+				return -EBUSY;
+			}
+		}
+		cond_resched();
+	}
+
+	/*
+	 * Return the number of SECS pages that failed to be removed, so
+	 * userspace knows that it has to retry.
+	 */
+	return failures;
+}
+
 static int sgx_vepc_release(struct inode *inode, struct file *file)
 {
 	struct sgx_vepc *vepc = file->private_data;
@@ -235,9 +270,27 @@ static int sgx_vepc_open(struct inode *inode, struct file *file)
 	return 0;
 }
 
+static long sgx_vepc_ioctl(struct file *file,
+			   unsigned int cmd, unsigned long arg)
+{
+	struct sgx_vepc *vepc = file->private_data;
+
+	switch (cmd) {
+	case SGX_IOC_VEPC_REMOVE_ALL:
+		if (arg)
+			return -EINVAL;
+		return sgx_vepc_remove_all(vepc);
+
+	default:
+		return -ENOTTY;
+	}
+}
+
 static const struct file_operations sgx_vepc_fops = {
 	.owner		= THIS_MODULE,
 	.open		= sgx_vepc_open,
+	.unlocked_ioctl	= sgx_vepc_ioctl,
+	.compat_ioctl	= sgx_vepc_ioctl,
 	.release	= sgx_vepc_release,
 	.mmap		= sgx_vepc_mmap,
 };
-- 
Gitee


From e08294e44e67f41007227e510df79a2a7967b3b8 Mon Sep 17 00:00:00 2001
From: Reinette Chatre <reinette.chatre@intel.com>
Date: Mon, 15 Nov 2021 11:29:04 -0800
Subject: [PATCH 034/674] x86/sgx: Fix free page accounting

mainline inclusion
from mainline-5.16
commit ac5d272a0ad0419f52e08c91953356e32b075af7
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit ac5d272a0ad0 x86/sgx: Fix free page accounting
Backport for SGX virtualization support

--------------------------------

The SGX driver maintains a single global free page counter,
sgx_nr_free_pages, that reflects the number of free pages available
across all NUMA nodes. Correspondingly, a list of free pages is
associated with each NUMA node and sgx_nr_free_pages is updated
every time a page is added or removed from any of the free page
lists. The main usage of sgx_nr_free_pages is by the reclaimer
that runs when it (sgx_nr_free_pages) goes below a watermark
to ensure that there are always some free pages available to, for
example, support efficient page faults.

With sgx_nr_free_pages accessed and modified from a few places
it is essential to ensure that these accesses are done safely but
this is not the case. sgx_nr_free_pages is read without any
protection and updated with inconsistent protection by any one
of the spin locks associated with the individual NUMA nodes.
For example:

      CPU_A                                 CPU_B
      -----                                 -----
 spin_lock(&nodeA->lock);              spin_lock(&nodeB->lock);
 ...                                   ...
 sgx_nr_free_pages--;  /* NOT SAFE */  sgx_nr_free_pages--;

 spin_unlock(&nodeA->lock);            spin_unlock(&nodeB->lock);

Since sgx_nr_free_pages may be protected by different spin locks
while being modified from different CPUs, the following scenario
is possible:

      CPU_A                                CPU_B
      -----                                -----
{sgx_nr_free_pages = 100}
 spin_lock(&nodeA->lock);              spin_lock(&nodeB->lock);
 sgx_nr_free_pages--;                  sgx_nr_free_pages--;
 /* LOAD sgx_nr_free_pages = 100 */    /* LOAD sgx_nr_free_pages = 100 */
 /* sgx_nr_free_pages--          */    /* sgx_nr_free_pages--          */
 /* STORE sgx_nr_free_pages = 99 */    /* STORE sgx_nr_free_pages = 99 */
 spin_unlock(&nodeA->lock);            spin_unlock(&nodeB->lock);

In the above scenario, sgx_nr_free_pages is decremented from two CPUs
but instead of sgx_nr_free_pages ending with a value that is two less
than it started with, it was only decremented by one while the number
of free pages were actually reduced by two. The consequence of
sgx_nr_free_pages not being protected is that its value may not
accurately reflect the actual number of free pages on the system,
impacting the availability of free pages in support of many flows.

The problematic scenario is when the reclaimer does not run because it
believes there to be sufficient free pages while any attempt to allocate
a page fails because there are no free pages available. In the SGX driver
the reclaimer's watermark is only 32 pages so after encountering the
above example scenario 32 times a user space hang is possible when there
are no more free pages because of repeated page faults caused by no
free pages made available.

The following flow was encountered:
asm_exc_page_fault
 ...
   sgx_vma_fault()
     sgx_encl_load_page()
       sgx_encl_eldu() // Encrypted page needs to be loaded from backing
                       // storage into newly allocated SGX memory page
         sgx_alloc_epc_page() // Allocate a page of SGX memory
           __sgx_alloc_epc_page() // Fails, no free SGX memory
           ...
           if (sgx_should_reclaim(SGX_NR_LOW_PAGES)) // Wake reclaimer
             wake_up(&ksgxd_waitq);
           return -EBUSY; // Return -EBUSY giving reclaimer time to run
       return -EBUSY;
     return -EBUSY;
   return VM_FAULT_NOPAGE;

The reclaimer is triggered in above flow with the following code:

static bool sgx_should_reclaim(unsigned long watermark)
{
        return sgx_nr_free_pages < watermark &&
               !list_empty(&sgx_active_page_list);
}

In the problematic scenario there were no free pages available yet the
value of sgx_nr_free_pages was above the watermark. The allocation of
SGX memory thus always failed because of a lack of free pages while no
free pages were made available because the reclaimer is never started
because of sgx_nr_free_pages' incorrect value. The consequence was that
user space kept encountering VM_FAULT_NOPAGE that caused the same
address to be accessed repeatedly with the same result.

Change the global free page counter to an atomic type that
ensures simultaneous updates are done safely. While doing so, move
the updating of the variable outside of the spin lock critical
section to which it does not belong.

Cc: stable@vger.kernel.org
Fixes: 901ddbb9ecf5 ("x86/sgx: Add a basic NUMA allocation scheme to sgx_alloc_epc_page()")
Suggested-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Acked-by: Jarkko Sakkinen <jarkko@kernel.org>
Link: https://lkml.kernel.org/r/a95a40743bbd3f795b465f30922dde7f1ea9e0eb.1637004094.git.reinette.chatre@intel.com
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/kernel/cpu/sgx/main.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index ad904747419e..379eeefa744a 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -28,8 +28,7 @@ static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq);
 static LIST_HEAD(sgx_active_page_list);
 static DEFINE_SPINLOCK(sgx_reclaimer_lock);
 
-/* The free page list lock protected variables prepend the lock. */
-static unsigned long sgx_nr_free_pages;
+static atomic_long_t sgx_nr_free_pages = ATOMIC_LONG_INIT(0);
 
 /* Nodes with one or more EPC sections. */
 static nodemask_t sgx_numa_mask;
@@ -403,14 +402,15 @@ static void sgx_reclaim_pages(void)
 
 		spin_lock(&node->lock);
 		list_add_tail(&epc_page->list, &node->free_page_list);
-		sgx_nr_free_pages++;
 		spin_unlock(&node->lock);
+		atomic_long_inc(&sgx_nr_free_pages);
 	}
 }
 
 static bool sgx_should_reclaim(unsigned long watermark)
 {
-	return sgx_nr_free_pages < watermark && !list_empty(&sgx_active_page_list);
+	return atomic_long_read(&sgx_nr_free_pages) < watermark &&
+	       !list_empty(&sgx_active_page_list);
 }
 
 static int ksgxd(void *p)
@@ -471,9 +471,9 @@ static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid)
 
 	page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list);
 	list_del_init(&page->list);
-	sgx_nr_free_pages--;
 
 	spin_unlock(&node->lock);
+	atomic_long_dec(&sgx_nr_free_pages);
 
 	return page;
 }
@@ -625,9 +625,9 @@ void sgx_free_epc_page(struct sgx_epc_page *page)
 	spin_lock(&node->lock);
 
 	list_add_tail(&page->list, &node->free_page_list);
-	sgx_nr_free_pages++;
 
 	spin_unlock(&node->lock);
+	atomic_long_inc(&sgx_nr_free_pages);
 }
 
 static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
-- 
Gitee


From 7e7b20e483355faeda1cef2a88e8809d8657f891 Mon Sep 17 00:00:00 2001
From: Reinette Chatre <reinette.chatre@intel.com>
Date: Tue, 8 Feb 2022 10:48:07 -0800
Subject: [PATCH 035/674] x86/sgx: Silence softlockup detection when releasing
 large enclaves

mainline inclusion
from mainline-5.17
commit 8795359e35bc33bf86b6d0765aa7f37431db3b9c
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit 8795359e35bc x86/sgx: Silence softlockup detection
when releasing large enclaves.
Backport for SGX virtualization support

--------------------------------

Vijay reported that the "unclobbered_vdso_oversubscribed" selftest
triggers the softlockup detector.

Actual SGX systems have 128GB of enclave memory or more.  The
"unclobbered_vdso_oversubscribed" selftest creates one enclave which
consumes all of the enclave memory on the system. Tearing down such a
large enclave takes around a minute, most of it in the loop where
the EREMOVE instruction is applied to each individual 4k enclave page.

Spending one minute in a loop triggers the softlockup detector.

Add a cond_resched() to give other tasks a chance to run and placate
the softlockup detector.

Cc: stable@vger.kernel.org
Fixes: 1728ab54b4be ("x86/sgx: Add a page reclaimer")
Reported-by: Vijay Dhanraj <vijay.dhanraj@intel.com>
Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Tested-by: Jarkko Sakkinen <jarkko@kernel.org>  (kselftest as sanity check)
Link: https://lkml.kernel.org/r/ced01cac1e75f900251b0a4ae1150aa8ebd295ec.1644345232.git.reinette.chatre@intel.com
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/kernel/cpu/sgx/encl.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
index 0c9f6cdd34a4..9a1a93ed2562 100644
--- a/arch/x86/kernel/cpu/sgx/encl.c
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -544,6 +544,8 @@ void sgx_encl_release(struct kref *ref)
 		}
 
 		kfree(entry);
+		/* Invoke scheduler to prevent soft lockups. */
+		cond_resched();
 	}
 
 	xa_destroy(&encl->page_array);
-- 
Gitee


From b2d2fe399208c58fcf31fd46a59cd53e9cd975ce Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Date: Sun, 27 Dec 2020 12:11:16 +0200
Subject: [PATCH 036/674] intel_idle: add SnowRidge C-state table

mainline inclusion
from mainline-v5.11-rc2
commit 9cf93f056f783f986c19f40d5304d1bcffa0fc0d
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5BECY
CVE: NA

Intel-SIG: commit 9cf93f056f78 intel_idle: add SnowRidge C-state table.
Backport intel_idle driver

--------------------------------

Add C-state table for the SnowRidge SoC which is found on Intel Jacobsville
platforms.

The following has been changed.

 1. C1E latency changed from 10us to 15us. It was measured using the
    open source "wult" tool (the "nic" method, 15us is the 99.99th
    percentile).

 2. C1E power break even changed from 20us to 25us, which may result
    in less C1E residency in some workloads.

 3. C6 latency changed from 50us to 130us. Measured the same way as C1E.

The C6 C-state is supported only by some SnowRidge revisions, so add a C-state
table commentary about this.

On SnowRidge, C6 support is enumerated via the usual mechanism: "mwait" leaf of
the "cpuid" instruction. The 'intel_idle' driver does check this leaf, so even
though C6 is present in the table, the driver will only use it if the CPU does
support it.

Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/idle/intel_idle.c | 41 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 40 insertions(+), 1 deletion(-)

diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index d79335506ecd..28f93b9aa51b 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -963,6 +963,39 @@ static struct cpuidle_state dnv_cstates[] __initdata = {
 		.enter = NULL }
 };
 
+/*
+ * Note, depending on HW and FW revision, SnowRidge SoC may or may not support
+ * C6, and this is indicated in the CPUID mwait leaf.
+ */
+static struct cpuidle_state snr_cstates[] __initdata = {
+	{
+		.name = "C1",
+		.desc = "MWAIT 0x00",
+		.flags = MWAIT2flg(0x00),
+		.exit_latency = 2,
+		.target_residency = 2,
+		.enter = &intel_idle,
+		.enter_s2idle = intel_idle_s2idle, },
+	{
+		.name = "C1E",
+		.desc = "MWAIT 0x01",
+		.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
+		.exit_latency = 15,
+		.target_residency = 25,
+		.enter = &intel_idle,
+		.enter_s2idle = intel_idle_s2idle, },
+	{
+		.name = "C6",
+		.desc = "MWAIT 0x20",
+		.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
+		.exit_latency = 130,
+		.target_residency = 500,
+		.enter = &intel_idle,
+		.enter_s2idle = intel_idle_s2idle, },
+	{
+		.enter = NULL }
+};
+
 static const struct idle_cpu idle_cpu_nehalem __initconst = {
 	.state_table = nehalem_cstates,
 	.auto_demotion_disable_flags = NHM_C1_AUTO_DEMOTE | NHM_C3_AUTO_DEMOTE,
@@ -1084,6 +1117,12 @@ static const struct idle_cpu idle_cpu_dnv __initconst = {
 	.use_acpi = true,
 };
 
+static const struct idle_cpu idle_cpu_snr __initconst = {
+	.state_table = snr_cstates,
+	.disable_promotion_to_c1e = true,
+	.use_acpi = true,
+};
+
 static const struct x86_cpu_id intel_idle_ids[] __initconst = {
 	X86_MATCH_INTEL_FAM6_MODEL(NEHALEM_EP,		&idle_cpu_nhx),
 	X86_MATCH_INTEL_FAM6_MODEL(NEHALEM,		&idle_cpu_nehalem),
@@ -1122,7 +1161,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
 	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT,	&idle_cpu_bxt),
 	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_PLUS,	&idle_cpu_bxt),
 	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_D,	&idle_cpu_dnv),
-	X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D,	&idle_cpu_dnv),
+	X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D,	&idle_cpu_snr),
 	{}
 };
 
-- 
Gitee


From 5f8214310181dabd9120b6a1c57d40bab63c8d38 Mon Sep 17 00:00:00 2001
From: Tom Rix <trix@redhat.com>
Date: Fri, 15 Jan 2021 15:56:46 -0800
Subject: [PATCH 037/674] intel_idle: remove definition of DEBUG

mainline inclusion
from mainline-v5.12-rc2
commit 651bc5816c39e57833fea4478c8ecfb72ad47e44
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5BECY
CVE: NA

Intel-SIG: commit 651bc5816c39 intel_idle: remove definition
of DEBUG.
Backport intel_idle driver

--------------------------------

Defining DEBUG should only be done in development.
So remove DEBUG.

Signed-off-by: Tom Rix <trix@redhat.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/idle/intel_idle.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 28f93b9aa51b..3273360f30f7 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -37,7 +37,7 @@
  */
 
 /* un-comment DEBUG to enable pr_debug() statements */
-#define DEBUG
+/* #define DEBUG */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
-- 
Gitee


From 0fdfc2a57e517ae371d8520f31549e41aa7d9e96 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Date: Mon, 8 Mar 2021 16:31:46 +0200
Subject: [PATCH 038/674] intel_idle: update ICX C6 data

mainline inclusion
from mainline-v5.13-rc1
commit d484b8bfc6fa71a088e4ac85d9ce11aa0385867e
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5BECY
CVE: NA

Intel-SIG: commit d484b8bfc6fa intel_idle: update ICX C6 data.
Backport intel_idle driver

--------------------------------

Change IceLake Xeon C6 latency from 128 us to 170 us. The latency
was measured with the "wult" tool and corresponds to the 99.99th
percentile when measuring with the "nic" method. Note, the 128 us
figure correspond to the median latency, but in intel_idle we use
the "worst case" latency figure instead.

C6 target residency was increased from 384 us to 600 us, which may
result in less C6 residency in some workloads. This value was tested
and compared to values 384, and 1000. Value 600 is a reasonable
tradeoff between power and performance.

Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Acked-by: Zhang Rui <rui.zhang@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/idle/intel_idle.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 3273360f30f7..6cac0b748efa 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -744,8 +744,8 @@ static struct cpuidle_state icx_cstates[] __initdata = {
 		.name = "C6",
 		.desc = "MWAIT 0x20",
 		.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
-		.exit_latency = 128,
-		.target_residency = 384,
+		.exit_latency = 170,
+		.target_residency = 600,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
-- 
Gitee


From 442af217b8e8c37793a7a7928bf88c4f93fa1aed Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Date: Wed, 7 Apr 2021 09:10:28 +0300
Subject: [PATCH 039/674] intel_idle: add Iclelake-D support

mainline inclusion
from mainline-v5.13-rc1
commit 22141d5f411895bb1b0df2a6b05f702e11e63918
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5BECY
CVE: NA

Intel-SIG: commit 22141d5f4118 intel_idle: add Iclelake-D support.
Backport intel_idle driver

--------------------------------

This patch adds Icelake Xeon D support to the intel_idle driver.

Since Icelake D and Icelake SP C-state characteristics the same,
we use Icelake SP C-states table for Icelake D as well.

Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Acked-by: Chen Yu <yu.c.chen@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/idle/intel_idle.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 6cac0b748efa..ec1b9d306ba6 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -1156,6 +1156,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
 	X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE,		&idle_cpu_skl),
 	X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X,		&idle_cpu_skx),
 	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X,		&idle_cpu_icx),
+	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D,		&idle_cpu_icx),
 	X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL,	&idle_cpu_knl),
 	X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM,	&idle_cpu_knl),
 	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT,	&idle_cpu_bxt),
-- 
Gitee


From d9030dcd45e39d16269fc298871802e0bd3222b5 Mon Sep 17 00:00:00 2001
From: Chen Yu <yu.c.chen@intel.com>
Date: Fri, 28 May 2021 11:20:54 +0800
Subject: [PATCH 040/674] intel_idle: Adjust the SKX C6 parameters if PC6 is
 disabled

mainline inclusion
from mainline-v5.14-rc1
commit 64233338499126c5c31e07165735ab5441c7e45a
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5BECY
CVE: NA

Intel-SIG: commit 642333384991 intel_idle: Adjust the SKX C6 parameters
if PC6 is disabled.
Backport intel_idle driver

--------------------------------

Because cpuidle assumes worst-case C-state parameters, PC6 parameters
are used for describing C6, which is worst-case for requesting CC6.
When PC6 is enabled, this is appropriate. But if PC6 is disabled
in the BIOS, the exit latency and target residency should be adjusted
accordingly.

Exit latency:
Previously the C6 exit latency was measured as the PC6 exit latency.
With PC6 disabled, the C6 exit latency should be the one of CC6.

Target residency:
With PC6 disabled, the idle duration within [CC6, PC6) would make the
idle governor choose C1E over C6. This would cause low energy-efficiency.
We should lower the bar to request C6 when PC6 is disabled.

To fill this gap, check if PC6 is disabled in the BIOS in the
MSR_PKG_CST_CONFIG_CONTROL(0xe2) register. If so, use the CC6 exit latency
for C6 and set target_residency to 3 times of the new exit latency. [This
is consistent with how intel_idle driver uses _CST to calculate the
target_residency.] As a result, the OS would be more likely to choose C6
over C1E when PC6 is disabled, which is reasonable, because if C6 is
enabled, it implies that the user cares about energy, so choosing C6 more
frequently makes sense.

The new CC6 exit latency of 92us was measured with wult[1] on SKX via NIC
wakeup as the 99.99th percentile. Also CLX and CPX both have the same CPU
model number as SkX, but their CC6 exit latencies are similar to the SKX
one, 96us and 89us respectively, so reuse the SKX value for them.

There is a concern that it might be better to use a more generic approach
instead of optimizing every platform. However, if the required code
complexity and different PC6 bit interpretation on different platforms
are taken into account, tuning the code per platform seems to be an
acceptable tradeoff.

Link: https://intel.github.io/wult/ # [1]
Suggested-by: Len Brown <len.brown@intel.com>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
Reviewed-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
[ rjw: Subject and changelog edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/idle/intel_idle.c | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index ec1b9d306ba6..e6c543b5ee1d 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -1484,6 +1484,36 @@ static void __init sklh_idle_state_table_update(void)
 	skl_cstates[6].flags |= CPUIDLE_FLAG_UNUSABLE;	/* C9-SKL */
 }
 
+/**
+ * skx_idle_state_table_update - Adjust the Sky Lake/Cascade Lake
+ * idle states table.
+ */
+static void __init skx_idle_state_table_update(void)
+{
+	unsigned long long msr;
+
+	rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr);
+
+	/*
+	 * 000b: C0/C1 (no package C-state support)
+	 * 001b: C2
+	 * 010b: C6 (non-retention)
+	 * 011b: C6 (retention)
+	 * 111b: No Package C state limits.
+	 */
+	if ((msr & 0x7) < 2) {
+		/*
+		 * Uses the CC6 + PC0 latency and 3 times of
+		 * latency for target_residency if the PC6
+		 * is disabled in BIOS. This is consistent
+		 * with how intel_idle driver uses _CST
+		 * to set the target_residency.
+		 */
+		skx_cstates[2].exit_latency = 92;
+		skx_cstates[2].target_residency = 276;
+	}
+}
+
 static bool __init intel_idle_verify_cstate(unsigned int mwait_hint)
 {
 	unsigned int mwait_cstate = MWAIT_HINT2CSTATE(mwait_hint) + 1;
@@ -1515,6 +1545,9 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv)
 	case INTEL_FAM6_SKYLAKE:
 		sklh_idle_state_table_update();
 		break;
+	case INTEL_FAM6_SKYLAKE_X:
+		skx_idle_state_table_update();
+		break;
 	}
 
 	for (cstate = 0; cstate < CPUIDLE_STATE_MAX; ++cstate) {
-- 
Gitee


From 9b77001c0ae72da913c2c624575d73dcdd692bdb Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Date: Fri, 17 Sep 2021 10:20:22 +0300
Subject: [PATCH 041/674] intel_idle: enable interrupts before C1 on Xeons

mainline inclusion
from mainline-v5.16-rc1
commit c227233ad64c77e57db738ab0e46439db71822a3
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5BECY
CVE: NA

Intel-SIG: commit c227233ad64c intel_idle: enable interrupts before C1 on Xeons.
Backport intel_idle driver

--------------------------------

Enable local interrupts before requesting C1 on the last two generations
of Intel Xeon platforms: Sky Lake, Cascade Lake, Cooper Lake, Ice Lake.
This decreases average C1 interrupt latency by about 5-10%, as measured
with the 'wult' tool.

The '->enter()' function of the driver enters C-states with local
interrupts disabled by executing the 'monitor' and 'mwait' pair of
instructions. If an interrupt happens, the CPU exits the C-state and
continues executing instructions after 'mwait'. It does not jump to
the interrupt handler, because local interrupts are disabled. The
cpuidle subsystem enables interrupts a bit later, after doing some
housekeeping.

With this patch, we enable local interrupts before requesting C1. In
this case, if the CPU wakes up because of an interrupt, it will jump
to the interrupt handler right away. The cpuidle housekeeping will be
done after the pending interrupt(s) are handled.

Enabling interrupts before entering a C-state has measurable impact
for faster C-states, like C1. Deeper, but slower C-states like C6 do
not really benefit from this sort of change, because their latency is
a lot higher comparing to the delay added by cpuidle housekeeping.

This change was also tested with cyclictest and dbench. In case of Ice
Lake, the average cyclictest latency decreased by 5.1%, and the average
'dbench' throughput increased by about 0.8%. Both tests were run for 4
hours with only C1 enabled (all other idle states, including 'POLL',
were disabled). CPU frequency was pinned to HFM, and uncore frequency
was pinned to the maximum value. The other platforms had similar
single-digit percentage improvements.

It is worth noting that this patch affects 'cpuidle' statistics a tiny
bit.  Before this patch, C1 residency did not include the interrupt
handling time, but with this patch, it will include it. This is similar
to what happens in case of the 'POLL' state, which also runs with
interrupts enabled.

Suggested-by: Len Brown <len.brown@intel.com>
Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/idle/intel_idle.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index e6c543b5ee1d..0b66e25c0e2d 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -88,6 +88,12 @@ static struct cpuidle_state *cpuidle_state_table __initdata;
 
 static unsigned int mwait_substates __initdata;
 
+/*
+ * Enable interrupts before entering the C-state. On some platforms and for
+ * some C-states, this may measurably decrease interrupt latency.
+ */
+#define CPUIDLE_FLAG_IRQ_ENABLE		BIT(14)
+
 /*
  * Enable this state by default even if the ACPI _CST does not list it.
  */
@@ -127,6 +133,9 @@ static __cpuidle int intel_idle(struct cpuidle_device *dev,
 	unsigned long eax = flg2MWAIT(state->flags);
 	unsigned long ecx = 1; /* break on interrupt flag */
 
+	if (state->flags & CPUIDLE_FLAG_IRQ_ENABLE)
+		local_irq_enable();
+
 	mwait_idle_with_hints(eax, ecx);
 
 	return index;
@@ -698,7 +707,7 @@ static struct cpuidle_state skx_cstates[] __initdata = {
 	{
 		.name = "C1",
 		.desc = "MWAIT 0x00",
-		.flags = MWAIT2flg(0x00),
+		.flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_IRQ_ENABLE,
 		.exit_latency = 2,
 		.target_residency = 2,
 		.enter = &intel_idle,
@@ -727,7 +736,7 @@ static struct cpuidle_state icx_cstates[] __initdata = {
 	{
 		.name = "C1",
 		.desc = "MWAIT 0x00",
-		.flags = MWAIT2flg(0x00),
+		.flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_IRQ_ENABLE,
 		.exit_latency = 1,
 		.target_residency = 1,
 		.enter = &intel_idle,
-- 
Gitee


From 2a0225ab4315ef488b15607607a55cbf308a1e0a Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Date: Wed, 2 Mar 2022 10:15:58 +0200
Subject: [PATCH 042/674] intel_idle: add SPR support

mainline inclusion
from mainline-v5.18-rc1
commit 9edf3c0ffef0ec1bed8300315852b5c6a0997130
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5BECY
CVE: NA

Intel-SIG: commit 9edf3c0ffef0 intel_idle: add SPR support.
Backport intel_idle driver

--------------------------------

Add Sapphire Rapids Xeon support.

Up until very recently, the C1 and C1E C-states were independent, but this
has changed in some new chips, including Sapphire Rapids Xeon (SPR). In these
chips the C1 and C1E states cannot be enabled at the same time. The "C1E
promotion" bit in 'MSR_IA32_POWER_CTL' also has its semantics changed a bit.

Here are the C1, C1E, and "C1E promotion" bit rules on Xeons before SPR.

1. If C1E promotion bit is disabled.
   a. C1  requests end up with C1  C-state.
   b. C1E requests end up with C1E C-state.
2. If C1E promotion bit is enabled.
   a. C1  requests end up with C1E C-state.
   b. C1E requests end up with C1E C-state.

Here are the C1, C1E, and "C1E promotion" bit rules on Sapphire Rapids Xeon.
1. If C1E promotion bit is disabled.
   a. C1  requests end up with C1 C-state.
   b. C1E requests end up with C1 C-state.
2. If C1E promotion bit is enabled.
   a. C1  requests end up with C1E C-state.
   b. C1E requests end up with C1E C-state.

Before SPR Xeon, the 'intel_idle' driver was disabling C1E promotion and was
exposing C1 and C1E as independent C-states. But on SPR, C1 and C1E cannot be
enabled at the same time.

This patch adds both C1 and C1E states. However, C1E is marked as with the
"CPUIDLE_FLAG_UNUSABLE" flag, which means that in won't be registered by
default. The C1E promotion bit will be cleared, which means that by default
only C1 and C6 will be registered on SPR.

The next patch will add an option for enabling C1E and disabling C1 on SPR.

Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/idle/intel_idle.c | 47 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 0b66e25c0e2d..1c7c25909e54 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -761,6 +761,46 @@ static struct cpuidle_state icx_cstates[] __initdata = {
 		.enter = NULL }
 };
 
+/*
+ * On Sapphire Rapids Xeon C1 has to be disabled if C1E is enabled, and vice
+ * versa. On SPR C1E is enabled only if "C1E promotion" bit is set in
+ * MSR_IA32_POWER_CTL. But in this case there effectively no C1, because C1
+ * requests are promoted to C1E. If the "C1E promotion" bit is cleared, then
+ * both C1 and C1E requests end up with C1, so there is effectively no C1E.
+ *
+ * By default we enable C1 and disable C1E by marking it with
+ * 'CPUIDLE_FLAG_UNUSABLE'.
+ */
+static struct cpuidle_state spr_cstates[] __initdata = {
+	{
+		.name = "C1",
+		.desc = "MWAIT 0x00",
+		.flags = MWAIT2flg(0x00),
+		.exit_latency = 1,
+		.target_residency = 1,
+		.enter = &intel_idle,
+		.enter_s2idle = intel_idle_s2idle, },
+	{
+		.name = "C1E",
+		.desc = "MWAIT 0x01",
+		.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE | \
+					   CPUIDLE_FLAG_UNUSABLE,
+		.exit_latency = 2,
+		.target_residency = 4,
+		.enter = &intel_idle,
+		.enter_s2idle = intel_idle_s2idle, },
+	{
+		.name = "C6",
+		.desc = "MWAIT 0x20",
+		.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
+		.exit_latency = 290,
+		.target_residency = 800,
+		.enter = &intel_idle,
+		.enter_s2idle = intel_idle_s2idle, },
+	{
+		.enter = NULL }
+};
+
 static struct cpuidle_state atom_cstates[] __initdata = {
 	{
 		.name = "C1E",
@@ -1104,6 +1144,12 @@ static const struct idle_cpu idle_cpu_icx __initconst = {
 	.use_acpi = true,
 };
 
+static const struct idle_cpu idle_cpu_spr __initconst = {
+	.state_table = spr_cstates,
+	.disable_promotion_to_c1e = true,
+	.use_acpi = true,
+};
+
 static const struct idle_cpu idle_cpu_avn __initconst = {
 	.state_table = avn_cstates,
 	.disable_promotion_to_c1e = true,
@@ -1166,6 +1212,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
 	X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X,		&idle_cpu_skx),
 	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X,		&idle_cpu_icx),
 	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D,		&idle_cpu_icx),
+	X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X,	&idle_cpu_spr),
 	X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL,	&idle_cpu_knl),
 	X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM,	&idle_cpu_knl),
 	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT,	&idle_cpu_bxt),
-- 
Gitee


From fecfa8c2b8f7e0fafb27357c19741d6f1024d0f3 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Date: Wed, 2 Mar 2022 10:15:59 +0200
Subject: [PATCH 043/674] intel_idle: add 'preferred_cstates' module argument

mainline inclusion
from mainline-v5.18-rc1
commit da0e58c038e60e7e65d30813ebdfe91687aa8a24
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5BECY
CVE: NA

Intel-SIG: commit da0e58c038e6 intel_idle: add 'preferred_cstates' module argument.
Backport intel_idle driver

--------------------------------

On Sapphire Rapids Xeon (SPR) the C1 and C1E states are basically mutually
exclusive - only one of them can be enabled. By default, 'intel_idle' driver
enables C1 and disables C1E. However, some users prefer to use C1E instead of
C1, because it saves more energy.

This patch adds a new module parameter ('preferred_cstates') for enabling C1E
and disabling C1. Here is the idea behind it.

1. This option has effect only for "mutually exclusive" C-states like C1 and
   C1E on SPR.
2. It does not have any effect on independent C-states, which do not require
   other C-states to be disabled (most states on most platforms as of today).
3. For mutually exclusive C-states, the 'intel_idle' driver always has a
   reasonable default, such as enabling C1 on SPR by default. On other
   platforms, the default may be different.
4. Users can override the default using the 'preferred_cstates' parameter.
5. The parameter accepts the preferred C-states bit-mask, similarly to the
   existing 'states_off' parameter.
6. This parameter is not limited to C1/C1E, and leaves room for supporting
   other mutually exclusive C-states, if they come in the future.

Today 'intel_idle' can only be compiled-in, which means that on SPR, in order
to disable C1 and enable C1E, users should boot with the following kernel
argument: intel_idle.preferred_cstates=4

Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/idle/intel_idle.c | 46 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 1c7c25909e54..b2688c326522 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -64,6 +64,7 @@ static struct cpuidle_driver intel_idle_driver = {
 /* intel_idle.max_cstate=0 disables driver */
 static int max_cstate = CPUIDLE_STATE_MAX - 1;
 static unsigned int disabled_states_mask;
+static unsigned int preferred_states_mask;
 
 static struct cpuidle_device __percpu *intel_idle_cpuidle_devices;
 
@@ -1400,6 +1401,8 @@ static inline void intel_idle_init_cstates_acpi(struct cpuidle_driver *drv) { }
 static inline bool intel_idle_off_by_default(u32 mwait_hint) { return false; }
 #endif /* !CONFIG_ACPI_PROCESSOR_CSTATE */
 
+static void c1e_promotion_enable(void);
+
 /**
  * ivt_idle_state_table_update - Tune the idle states table for Ivy Town.
  *
@@ -1570,6 +1573,26 @@ static void __init skx_idle_state_table_update(void)
 	}
 }
 
+/**
+ * spr_idle_state_table_update - Adjust Sapphire Rapids idle states table.
+ */
+static void __init spr_idle_state_table_update(void)
+{
+	/* Check if user prefers C1E over C1. */
+	if (preferred_states_mask & BIT(2)) {
+		if (preferred_states_mask & BIT(1))
+			/* Both can't be enabled, stick to the defaults. */
+			return;
+
+		spr_cstates[0].flags |= CPUIDLE_FLAG_UNUSABLE;
+		spr_cstates[1].flags &= ~CPUIDLE_FLAG_UNUSABLE;
+
+		/* Enable C1E using the "C1E promotion" bit. */
+		c1e_promotion_enable();
+		disable_promotion_to_c1e = false;
+	}
+}
+
 static bool __init intel_idle_verify_cstate(unsigned int mwait_hint)
 {
 	unsigned int mwait_cstate = MWAIT_HINT2CSTATE(mwait_hint) + 1;
@@ -1604,6 +1627,9 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv)
 	case INTEL_FAM6_SKYLAKE_X:
 		skx_idle_state_table_update();
 		break;
+	case INTEL_FAM6_SAPPHIRERAPIDS_X:
+		spr_idle_state_table_update();
+		break;
 	}
 
 	for (cstate = 0; cstate < CPUIDLE_STATE_MAX; ++cstate) {
@@ -1676,6 +1702,15 @@ static void auto_demotion_disable(void)
 	wrmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr_bits);
 }
 
+static void c1e_promotion_enable(void)
+{
+	unsigned long long msr_bits;
+
+	rdmsrl(MSR_IA32_POWER_CTL, msr_bits);
+	msr_bits |= 0x2;
+	wrmsrl(MSR_IA32_POWER_CTL, msr_bits);
+}
+
 static void c1e_promotion_disable(void)
 {
 	unsigned long long msr_bits;
@@ -1845,3 +1880,14 @@ module_param(max_cstate, int, 0444);
  */
 module_param_named(states_off, disabled_states_mask, uint, 0444);
 MODULE_PARM_DESC(states_off, "Mask of disabled idle states");
+/*
+ * Some platforms come with mutually exclusive C-states, so that if one is
+ * enabled, the other C-states must not be used. Example: C1 and C1E on
+ * Sapphire Rapids platform. This parameter allows for selecting the
+ * preferred C-states among the groups of mutually exclusive C-states - the
+ * selected C-states will be registered, the other C-states from the mutually
+ * exclusive group won't be registered. If the platform has no mutually
+ * exclusive C-states, this parameter has no effect.
+ */
+module_param_named(preferred_cstates, preferred_states_mask, uint, 0444);
+MODULE_PARM_DESC(preferred_cstates, "Mask of preferred idle states");
-- 
Gitee


From 5e98d44d9f6b0f2b43886a483fbce9ba3cc0a402 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Date: Wed, 2 Mar 2022 10:16:00 +0200
Subject: [PATCH 044/674] intel_idle: add core C6 optimization for SPR

mainline inclusion
from mainline-v5.18-rc1
commit 3a9cf77b60dc9839b6674943bb7c9dcd524b6294
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5BECY
CVE: NA

Intel-SIG: commit 3a9cf77b60dc intel_idle: add core C6 optimization for SPR.
Backport intel_idle driver

--------------------------------

Add a Sapphire Rapids Xeon C6 optimization, similar to what we have for Sky Lake
Xeon: if package C6 is disabled, adjust C6 exit latency and target residency to
match core C6 values, instead of using the default package C6 values.

Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/idle/intel_idle.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index b2688c326522..e385ddf15b32 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -1578,6 +1578,8 @@ static void __init skx_idle_state_table_update(void)
  */
 static void __init spr_idle_state_table_update(void)
 {
+	unsigned long long msr;
+
 	/* Check if user prefers C1E over C1. */
 	if (preferred_states_mask & BIT(2)) {
 		if (preferred_states_mask & BIT(1))
@@ -1591,6 +1593,19 @@ static void __init spr_idle_state_table_update(void)
 		c1e_promotion_enable();
 		disable_promotion_to_c1e = false;
 	}
+
+	/*
+	 * By default, the C6 state assumes the worst-case scenario of package
+	 * C6. However, if PC6 is disabled, we update the numbers to match
+	 * core C6.
+	 */
+	rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr);
+
+	/* Limit value 2 and above allow for PC6. */
+	if ((msr & 0x7) < 2) {
+		spr_cstates[2].exit_latency = 190;
+		spr_cstates[2].target_residency = 600;
+	}
 }
 
 static bool __init intel_idle_verify_cstate(unsigned int mwait_hint)
-- 
Gitee


From 2b213f7240392736d1280bff47b7f8a7dde70f39 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Tue, 15 Mar 2022 20:35:47 +0100
Subject: [PATCH 045/674] cpuidle: intel_idle: Update intel_idle() kerneldoc
 comment

mainline inclusion
from mainline-v5.18-rc1
commit a335b1e6bb29300d3bc6749763a4298627e594ba
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5BECY
CVE: NA

Intel-SIG: commit a335b1e6bb29 cpuidle: intel_idle: Update intel_idle()
kerneldoc comment.
Backport intel_idle driver

--------------------------------

Commit bf9282dc26e7 ("cpuidle: Make CPUIDLE_FLAG_TLB_FLUSHED generic")
moved the leave_mm() call away from intel_idle(), but it didn't update
its kerneldoc comment accordingly, so do that now.

Fixes: bf9282dc26e7 ("cpuidle: Make CPUIDLE_FLAG_TLB_FLUSHED generic")

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/idle/intel_idle.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index e385ddf15b32..4ba4ab974dbe 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -122,9 +122,6 @@ static unsigned int mwait_substates __initdata;
  * If the local APIC timer is not known to be reliable in the target idle state,
  * enable one-shot tick broadcasting for the target CPU before executing MWAIT.
  *
- * Optionally call leave_mm() for the target CPU upfront to avoid wakeups due to
- * flushing user TLBs.
- *
  * Must be called under local_irq_disable().
  */
 static __cpuidle int intel_idle(struct cpuidle_device *dev,
-- 
Gitee


From e724de09b2346397db17e008b27352ca9d794c09 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Tue, 15 Mar 2022 20:36:42 +0100
Subject: [PATCH 046/674] cpuidle: intel_idle: Drop redundant backslash at line
 end

mainline inclusion
from mainline-v5.18-rc1
commit 03eb65224e5711e7a2f34b500d44866b322a249a
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5BECY
CVE: NA

Intel-SIG: commit 03eb65224e57 cpuidle: intel_idle: Drop redundant
backslash at line end.
Backport intel_idle driver

--------------------------------

Drop a redundant backslash character at the end of a line in the
spr_cstates[] definition.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/idle/intel_idle.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 4ba4ab974dbe..b7640cfe0020 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -781,7 +781,7 @@ static struct cpuidle_state spr_cstates[] __initdata = {
 	{
 		.name = "C1E",
 		.desc = "MWAIT 0x01",
-		.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE | \
+		.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE |
 					   CPUIDLE_FLAG_UNUSABLE,
 		.exit_latency = 2,
 		.target_residency = 4,
-- 
Gitee


From 950d18c9e2a1398787f9327a29ed0eb9fb7db3a8 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Date: Wed, 27 Apr 2022 09:08:52 +0300
Subject: [PATCH 047/674] intel_idle: Fix the 'preferred_cstates' module
 parameter

mainline inclusion
from mainline-v5.18-rc5
commit 39c184a6a9a7a99950b321d55fe713175cf1d404
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5BECY
CVE: NA

Intel-SIG: commit 39c184a6a9a7 intel_idle: Fix the 'preferred_cstates' module
parameter.
Backport for intel_idle driver

--------------------------------

Problem description.

When user boots kernel up with the 'intel_idle.preferred_cstates=4' option,
we enable C1E and disable C1 states on Sapphire Rapids Xeon (SPR). In order
for C1E to work on SPR, we have to enable the C1E promotion bit on all
CPUs.  However, we enable it only on one CPU.

Fix description.

The 'intel_idle' driver already has the infrastructure for disabling C1E
promotion on every CPU. This patch uses the same infrastructure for
enabling C1E promotion on every CPU. It changes the boolean
'disable_promotion_to_c1e' variable to a tri-state 'c1e_promotion'
variable.

Tested on a 2-socket SPR system. I verified the following combinations:

 * C1E promotion enabled and disabled in BIOS.
 * Booted with and without the 'intel_idle.preferred_cstates=4' kernel
   argument.

In all 4 cases C1E promotion was correctly set on all CPUs.

Also tested on an old Broadwell system, just to make sure it does not cause
a regression. C1E promotion was correctly disabled on that system, both C1
and C1E were exposed (as expected).

Fixes: da0e58c038e6 ("intel_idle: add 'preferred_cstates' module argument")
Reported-by: Jan Beulich <jbeulich@suse.com>
Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
[ rjw: Minor changelog edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/idle/intel_idle.c | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index b7640cfe0020..cf5ed4c1d02c 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -69,7 +69,12 @@ static unsigned int preferred_states_mask;
 static struct cpuidle_device __percpu *intel_idle_cpuidle_devices;
 
 static unsigned long auto_demotion_disable_flags;
-static bool disable_promotion_to_c1e;
+
+static enum {
+	C1E_PROMOTION_PRESERVE,
+	C1E_PROMOTION_ENABLE,
+	C1E_PROMOTION_DISABLE
+} c1e_promotion = C1E_PROMOTION_PRESERVE;
 
 struct idle_cpu {
 	struct cpuidle_state *state_table;
@@ -1398,8 +1403,6 @@ static inline void intel_idle_init_cstates_acpi(struct cpuidle_driver *drv) { }
 static inline bool intel_idle_off_by_default(u32 mwait_hint) { return false; }
 #endif /* !CONFIG_ACPI_PROCESSOR_CSTATE */
 
-static void c1e_promotion_enable(void);
-
 /**
  * ivt_idle_state_table_update - Tune the idle states table for Ivy Town.
  *
@@ -1587,8 +1590,7 @@ static void __init spr_idle_state_table_update(void)
 		spr_cstates[1].flags &= ~CPUIDLE_FLAG_UNUSABLE;
 
 		/* Enable C1E using the "C1E promotion" bit. */
-		c1e_promotion_enable();
-		disable_promotion_to_c1e = false;
+		c1e_promotion = C1E_PROMOTION_ENABLE;
 	}
 
 	/*
@@ -1754,7 +1756,9 @@ static int intel_idle_cpu_init(unsigned int cpu)
 	if (auto_demotion_disable_flags)
 		auto_demotion_disable();
 
-	if (disable_promotion_to_c1e)
+	if (c1e_promotion == C1E_PROMOTION_ENABLE)
+		c1e_promotion_enable();
+	else if (c1e_promotion == C1E_PROMOTION_DISABLE)
 		c1e_promotion_disable();
 
 	return 0;
@@ -1833,7 +1837,8 @@ static int __init intel_idle_init(void)
 	if (icpu) {
 		cpuidle_state_table = icpu->state_table;
 		auto_demotion_disable_flags = icpu->auto_demotion_disable_flags;
-		disable_promotion_to_c1e = icpu->disable_promotion_to_c1e;
+		if (icpu->disable_promotion_to_c1e)
+			c1e_promotion = C1E_PROMOTION_DISABLE;
 		if (icpu->use_acpi || force_use_acpi)
 			intel_idle_acpi_cst_extract();
 	} else if (!intel_idle_acpi_cst_extract()) {
-- 
Gitee


From ca838b2fe275e9acf0aaf8d31c1ab6bd96609c86 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Date: Wed, 27 Apr 2022 09:08:53 +0300
Subject: [PATCH 048/674] intel_idle: Fix SPR C6 optimization

mainline inclusion
from mainline-v5.18-rc5
commit 7eac3bd38d18cd3317756649921b8264ddfee692
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5BECY
CVE: NA

Intel-SIG: commit 7eac3bd38d18 intel_idle: Fix SPR C6 optimization.
Backport for intel_idle driver

--------------------------------

The Sapphire Rapids (SPR) C6 optimization was added to the end of the
'spr_idle_state_table_update()' function. However, the function has a
'return' which may happen before the optimization has a chance to run.
And this may prevent the optimization from happening.

This is an unlikely scenario, but possible if user boots with, say,
the 'intel_idle.preferred_cstates=6' kernel boot option.

This patch fixes the issue by eliminating the problematic 'return'
statement.

Fixes: 3a9cf77b60dc ("intel_idle: add core C6 optimization for SPR")
Suggested-by: Jan Beulich <jbeulich@suse.com>
Reported-by: Jan Beulich <jbeulich@suse.com>
Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
[ rjw: Minor changelog edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/idle/intel_idle.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index cf5ed4c1d02c..47551ab73ca8 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -1581,11 +1581,9 @@ static void __init spr_idle_state_table_update(void)
 	unsigned long long msr;
 
 	/* Check if user prefers C1E over C1. */
-	if (preferred_states_mask & BIT(2)) {
-		if (preferred_states_mask & BIT(1))
-			/* Both can't be enabled, stick to the defaults. */
-			return;
-
+	if ((preferred_states_mask & BIT(2)) &&
+	    !(preferred_states_mask & BIT(1))) {
+		/* Disable C1 and enable C1E. */
 		spr_cstates[0].flags |= CPUIDLE_FLAG_UNUSABLE;
 		spr_cstates[1].flags &= ~CPUIDLE_FLAG_UNUSABLE;
 
-- 
Gitee


From 0ce855fdbd1fbc84b9902e78eb9468834723b77c Mon Sep 17 00:00:00 2001
From: "David E. Box" <david.e.box@linux.intel.com>
Date: Wed, 28 Oct 2020 18:44:45 -0700
Subject: [PATCH 049/674] PCI: Add defines for Designated Vendor-Specific
 Extended Capability

mainline inclusion
from mainline-v5.11-rc1
commit 1dc2da5cd51f648de6d1df87e2bc6ea13f72f19c
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596K9
CVE: NA

Intel-SIG: commit 1dc2da5cd51f PCI: Add defines for Designated Vendor-Specific
Extended Capability.
Backport for intel PMT (Platform Monitoring Technology) support
--------------------------------

Add PCIe Designated Vendor-Specific Extended Capability (DVSEC) and defines
for the header offsets. Defined in PCIe r5.0, sec 7.9.6.

Signed-off-by: David E. Box <david.e.box@linux.intel.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 include/uapi/linux/pci_regs.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index 7e0d526dd96f..9e257fe9efa5 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -729,6 +729,7 @@
 #define PCI_EXT_CAP_ID_DPC	0x1D	/* Downstream Port Containment */
 #define PCI_EXT_CAP_ID_L1SS	0x1E	/* L1 PM Substates */
 #define PCI_EXT_CAP_ID_PTM	0x1F	/* Precision Time Measurement */
+#define PCI_EXT_CAP_ID_DVSEC	0x23	/* Designated Vendor-Specific */
 #define PCI_EXT_CAP_ID_DLF	0x25	/* Data Link Feature */
 #define PCI_EXT_CAP_ID_PL_16GT	0x26	/* Physical Layer 16.0 GT/s */
 #define PCI_EXT_CAP_ID_MAX	PCI_EXT_CAP_ID_PL_16GT
@@ -1079,6 +1080,10 @@
 #define  PCI_L1SS_CTL1_LTR_L12_TH_SCALE	0xe0000000  /* LTR_L1.2_THRESHOLD_Scale */
 #define PCI_L1SS_CTL2		0x0c	/* Control 2 Register */
 
+/* Designated Vendor-Specific (DVSEC, PCI_EXT_CAP_ID_DVSEC) */
+#define PCI_DVSEC_HEADER1		0x4 /* Designated Vendor-Specific Header1 */
+#define PCI_DVSEC_HEADER2		0x8 /* Designated Vendor-Specific Header2 */
+
 /* Data Link Feature */
 #define PCI_DLF_CAP		0x04	/* Capabilities Register */
 #define  PCI_DLF_EXCHANGE_ENABLE	0x80000000  /* Data Link Feature Exchange Enable */
-- 
Gitee


From e615c7102016697dfa1ee9c6940d09ae75c8a11e Mon Sep 17 00:00:00 2001
From: "David E. Box" <david.e.box@linux.intel.com>
Date: Wed, 28 Oct 2020 18:55:33 -0700
Subject: [PATCH 050/674] mfd: Intel Platform Monitoring Technology support

mainline inclusion
from mainline-v5.11-rc1
commit 4f8217d5b0ca8ace78a27dc371b87697eedc421d
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596K9
CVE: NA

Intel-SIG: commit 4f8217d5b0ca mfd: Intel Platform Monitoring Technology
support.
Backport for intel PMT (Platform Monitoring Technology) support

--------------------------------

Intel Platform Monitoring Technology (PMT) is an architecture for
enumerating and accessing hardware monitoring facilities. PMT supports
multiple types of monitoring capabilities. This driver creates platform
devices for each type so that they may be managed by capability specific
drivers (to be introduced). Capabilities are discovered using PCIe DVSEC
ids. Support is included for the 3 current capability types, Telemetry,
Watcher, and Crashlog. The features are available on new Intel platforms
starting from Tiger Lake for which support is added. This patch adds
support for Tiger Lake (TGL), Alder Lake (ADL), and Out-of-Band Management
Services Module (OOBMSM).

Also add a quirk mechanism for several early hardware differences and bugs.
For Tiger Lake and Alder Lake, do not support Watcher and Crashlog
capabilities since they will not be compatible with future product. Also,
fix use a quirk to fix the discovery table offset.

Co-developed-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
Signed-off-by: David E. Box <david.e.box@linux.intel.com>
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 MAINTAINERS             |   5 +
 drivers/mfd/Kconfig     |  10 ++
 drivers/mfd/Makefile    |   1 +
 drivers/mfd/intel_pmt.c | 223 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 239 insertions(+)
 create mode 100644 drivers/mfd/intel_pmt.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 23a23bd94c00..cdffd5192e09 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9038,6 +9038,11 @@ F:	drivers/mfd/intel_soc_pmic*
 F:	include/linux/mfd/intel_msic.h
 F:	include/linux/mfd/intel_soc_pmic*
 
+INTEL PMT DRIVER
+M:	"David E. Box" <david.e.box@linux.intel.com>
+S:	Maintained
+F:	drivers/mfd/intel_pmt.c
+
 INTEL PRO/WIRELESS 2100, 2200BG, 2915ABG NETWORK CONNECTION SUPPORT
 M:	Stanislav Yakovlev <stas.yakovlev@gmail.com>
 L:	linux-wireless@vger.kernel.org
diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index 15680c3c9279..3f9f84f9f288 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -699,6 +699,16 @@ config MFD_INTEL_PMC_BXT
 	  Register and P-unit access. In addition this creates devices
 	  for iTCO watchdog and telemetry that are part of the PMC.
 
+config MFD_INTEL_PMT
+	tristate "Intel Platform Monitoring Technology (PMT) support"
+	depends on PCI
+	select MFD_CORE
+	help
+	  The Intel Platform Monitoring Technology (PMT) is an interface that
+	  provides access to hardware monitor registers. This driver supports
+	  Telemetry, Watcher, and Crashlog PMT capabilities/devices for
+	  platforms starting from Tiger Lake.
+
 config MFD_IPAQ_MICRO
 	bool "Atmel Micro ASIC (iPAQ h3100/h3600/h3700) Support"
 	depends on SA1100_H3100 || SA1100_H3600
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index fb1df45a301e..ce8f1c0583d5 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -216,6 +216,7 @@ obj-$(CONFIG_MFD_INTEL_LPSS_PCI)	+= intel-lpss-pci.o
 obj-$(CONFIG_MFD_INTEL_LPSS_ACPI)	+= intel-lpss-acpi.o
 obj-$(CONFIG_MFD_INTEL_MSIC)	+= intel_msic.o
 obj-$(CONFIG_MFD_INTEL_PMC_BXT)	+= intel_pmc_bxt.o
+obj-$(CONFIG_MFD_INTEL_PMT)	+= intel_pmt.o
 obj-$(CONFIG_MFD_PALMAS)	+= palmas.o
 obj-$(CONFIG_MFD_VIPERBOARD)    += viperboard.o
 obj-$(CONFIG_MFD_RC5T583)	+= rc5t583.o rc5t583-irq.o
diff --git a/drivers/mfd/intel_pmt.c b/drivers/mfd/intel_pmt.c
new file mode 100644
index 000000000000..744b230cdcca
--- /dev/null
+++ b/drivers/mfd/intel_pmt.c
@@ -0,0 +1,223 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel Platform Monitoring Technology PMT driver
+ *
+ * Copyright (c) 2020, Intel Corporation.
+ * All Rights Reserved.
+ *
+ * Author: David E. Box <david.e.box@linux.intel.com>
+ */
+
+#include <linux/bits.h>
+#include <linux/kernel.h>
+#include <linux/mfd/core.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/platform_device.h>
+#include <linux/pm.h>
+#include <linux/pm_runtime.h>
+#include <linux/types.h>
+
+/* Intel DVSEC capability vendor space offsets */
+#define INTEL_DVSEC_ENTRIES		0xA
+#define INTEL_DVSEC_SIZE		0xB
+#define INTEL_DVSEC_TABLE		0xC
+#define INTEL_DVSEC_TABLE_BAR(x)	((x) & GENMASK(2, 0))
+#define INTEL_DVSEC_TABLE_OFFSET(x)	((x) & GENMASK(31, 3))
+#define INTEL_DVSEC_ENTRY_SIZE		4
+
+/* PMT capabilities */
+#define DVSEC_INTEL_ID_TELEMETRY	2
+#define DVSEC_INTEL_ID_WATCHER		3
+#define DVSEC_INTEL_ID_CRASHLOG		4
+
+struct intel_dvsec_header {
+	u16	length;
+	u16	id;
+	u8	num_entries;
+	u8	entry_size;
+	u8	tbir;
+	u32	offset;
+};
+
+enum pmt_quirks {
+	/* Watcher capability not supported */
+	PMT_QUIRK_NO_WATCHER	= BIT(0),
+
+	/* Crashlog capability not supported */
+	PMT_QUIRK_NO_CRASHLOG	= BIT(1),
+
+	/* Use shift instead of mask to read discovery table offset */
+	PMT_QUIRK_TABLE_SHIFT	= BIT(2),
+};
+
+struct pmt_platform_info {
+	unsigned long quirks;
+};
+
+static const struct pmt_platform_info tgl_info = {
+	.quirks = PMT_QUIRK_NO_WATCHER | PMT_QUIRK_NO_CRASHLOG |
+		  PMT_QUIRK_TABLE_SHIFT,
+};
+
+static int pmt_add_dev(struct pci_dev *pdev, struct intel_dvsec_header *header,
+		       unsigned long quirks)
+{
+	struct device *dev = &pdev->dev;
+	struct resource *res, *tmp;
+	struct mfd_cell *cell;
+	const char *name;
+	int count = header->num_entries;
+	int size = header->entry_size;
+	int id = header->id;
+	int i;
+
+	switch (id) {
+	case DVSEC_INTEL_ID_TELEMETRY:
+		name = "pmt_telemetry";
+		break;
+	case DVSEC_INTEL_ID_WATCHER:
+		if (quirks & PMT_QUIRK_NO_WATCHER) {
+			dev_info(dev, "Watcher not supported\n");
+			return 0;
+		}
+		name = "pmt_watcher";
+		break;
+	case DVSEC_INTEL_ID_CRASHLOG:
+		if (quirks & PMT_QUIRK_NO_CRASHLOG) {
+			dev_info(dev, "Crashlog not supported\n");
+			return 0;
+		}
+		name = "pmt_crashlog";
+		break;
+	default:
+		dev_err(dev, "Unrecognized PMT capability: %d\n", id);
+		return -EINVAL;
+	}
+
+	if (!header->num_entries || !header->entry_size) {
+		dev_err(dev, "Invalid count or size for %s header\n", name);
+		return -EINVAL;
+	}
+
+	cell = devm_kzalloc(dev, sizeof(*cell), GFP_KERNEL);
+	if (!cell)
+		return -ENOMEM;
+
+	res = devm_kcalloc(dev, count, sizeof(*res), GFP_KERNEL);
+	if (!res)
+		return -ENOMEM;
+
+	if (quirks & PMT_QUIRK_TABLE_SHIFT)
+		header->offset >>= 3;
+
+	/*
+	 * The PMT DVSEC contains the starting offset and count for a block of
+	 * discovery tables, each providing access to monitoring facilities for
+	 * a section of the device. Create a resource list of these tables to
+	 * provide to the driver.
+	 */
+	for (i = 0, tmp = res; i < count; i++, tmp++) {
+		tmp->start = pdev->resource[header->tbir].start +
+			     header->offset + i * (size << 2);
+		tmp->end = tmp->start + (size << 2) - 1;
+		tmp->flags = IORESOURCE_MEM;
+	}
+
+	cell->resources = res;
+	cell->num_resources = count;
+	cell->name = name;
+
+	return devm_mfd_add_devices(dev, PLATFORM_DEVID_AUTO, cell, 1, NULL, 0,
+				    NULL);
+}
+
+static int pmt_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+	struct pmt_platform_info *info;
+	unsigned long quirks = 0;
+	bool found_devices = false;
+	int ret, pos = 0;
+
+	ret = pcim_enable_device(pdev);
+	if (ret)
+		return ret;
+
+	info = (struct pmt_platform_info *)id->driver_data;
+
+	if (info)
+		quirks = info->quirks;
+
+	do {
+		struct intel_dvsec_header header;
+		u32 table;
+		u16 vid;
+
+		pos = pci_find_next_ext_capability(pdev, pos, PCI_EXT_CAP_ID_DVSEC);
+		if (!pos)
+			break;
+
+		pci_read_config_word(pdev, pos + PCI_DVSEC_HEADER1, &vid);
+		if (vid != PCI_VENDOR_ID_INTEL)
+			continue;
+
+		pci_read_config_word(pdev, pos + PCI_DVSEC_HEADER2,
+				     &header.id);
+		pci_read_config_byte(pdev, pos + INTEL_DVSEC_ENTRIES,
+				     &header.num_entries);
+		pci_read_config_byte(pdev, pos + INTEL_DVSEC_SIZE,
+				     &header.entry_size);
+		pci_read_config_dword(pdev, pos + INTEL_DVSEC_TABLE,
+				      &table);
+
+		header.tbir = INTEL_DVSEC_TABLE_BAR(table);
+		header.offset = INTEL_DVSEC_TABLE_OFFSET(table);
+
+		ret = pmt_add_dev(pdev, &header, quirks);
+		if (ret) {
+			dev_warn(&pdev->dev,
+				 "Failed to add device for DVSEC id %d\n",
+				 header.id);
+			continue;
+		}
+
+		found_devices = true;
+	} while (true);
+
+	if (!found_devices)
+		return -ENODEV;
+
+	pm_runtime_put(&pdev->dev);
+	pm_runtime_allow(&pdev->dev);
+
+	return 0;
+}
+
+static void pmt_pci_remove(struct pci_dev *pdev)
+{
+	pm_runtime_forbid(&pdev->dev);
+	pm_runtime_get_sync(&pdev->dev);
+}
+
+#define PCI_DEVICE_ID_INTEL_PMT_ADL	0x467d
+#define PCI_DEVICE_ID_INTEL_PMT_OOBMSM	0x09a7
+#define PCI_DEVICE_ID_INTEL_PMT_TGL	0x9a0d
+static const struct pci_device_id pmt_pci_ids[] = {
+	{ PCI_DEVICE_DATA(INTEL, PMT_ADL, &tgl_info) },
+	{ PCI_DEVICE_DATA(INTEL, PMT_OOBMSM, NULL) },
+	{ PCI_DEVICE_DATA(INTEL, PMT_TGL, &tgl_info) },
+	{ }
+};
+MODULE_DEVICE_TABLE(pci, pmt_pci_ids);
+
+static struct pci_driver pmt_pci_driver = {
+	.name = "intel-pmt",
+	.id_table = pmt_pci_ids,
+	.probe = pmt_pci_probe,
+	.remove = pmt_pci_remove,
+};
+module_pci_driver(pmt_pci_driver);
+
+MODULE_AUTHOR("David E. Box <david.e.box@linux.intel.com>");
+MODULE_DESCRIPTION("Intel Platform Monitoring Technology PMT driver");
+MODULE_LICENSE("GPL v2");
-- 
Gitee


From 71bb316092365fe81085377dce3e97d2a16669d5 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@linux.intel.com>
Date: Wed, 28 Oct 2020 18:55:34 -0700
Subject: [PATCH 051/674] platform/x86: Intel PMT class driver

mainline inclusion
from mainline-v5.11-rc1
commit e2729113ce66d8d21f729b41bc3ed3feaf1acf69
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596K9
CVE: NA

Intel-SIG: commit e2729113ce66 platform/x86: Intel PMT class driver.
Backport for intel PMT (Platform Monitoring Technology) support

--------------------------------

Intel Platform Monitoring Technology is meant to provide a common way to
access telemetry and system metrics.

Register mappings are not provided by the driver. Instead, a GUID is read
from a header for each endpoint. The GUID identifies the device and is to
be used with an XML, provided by the vendor, to discover the available set
of metrics and their register mapping.  This allows firmware updates to
modify the register space without needing to update the driver every time
with new mappings. Firmware writes a new GUID in this case to specify the
new mapping.  Software tools with access to the associated XML file can
then interpret the changes.

The module manages access to all Intel PMT endpoints on a system,
independent of the device exporting them. It creates an intel_pmt class
to manage the devices. For each telemetry endpoint, sysfs files provide
GUID and size information as well as a pointer to the parent device the
telemetry came from. Software may discover the association between
endpoints and devices by iterating through the list in sysfs, or by looking
for the existence of the class folder under the device of interest.  A
binary sysfs attribute of the same name allows software to then read or map
the telemetry space for direct access.

Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
Signed-off-by: David E. Box <david.e.box@linux.intel.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 .../ABI/testing/sysfs-class-intel_pmt         |  54 ++++
 MAINTAINERS                                   |   1 +
 drivers/platform/x86/Kconfig                  |  12 +
 drivers/platform/x86/Makefile                 |   1 +
 drivers/platform/x86/intel_pmt_class.c        | 297 ++++++++++++++++++
 drivers/platform/x86/intel_pmt_class.h        |  52 +++
 6 files changed, 417 insertions(+)
 create mode 100644 Documentation/ABI/testing/sysfs-class-intel_pmt
 create mode 100644 drivers/platform/x86/intel_pmt_class.c
 create mode 100644 drivers/platform/x86/intel_pmt_class.h

diff --git a/Documentation/ABI/testing/sysfs-class-intel_pmt b/Documentation/ABI/testing/sysfs-class-intel_pmt
new file mode 100644
index 000000000000..926b5cf95fd1
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-class-intel_pmt
@@ -0,0 +1,54 @@
+What:		/sys/class/intel_pmt/
+Date:		October 2020
+KernelVersion:	5.10
+Contact:	David Box <david.e.box@linux.intel.com>
+Description:
+		The intel_pmt/ class directory contains information for
+		devices that expose hardware telemetry using Intel Platform
+		Monitoring Technology (PMT)
+
+What:		/sys/class/intel_pmt/telem<x>
+Date:		October 2020
+KernelVersion:	5.10
+Contact:	David Box <david.e.box@linux.intel.com>
+Description:
+		The telem<x> directory contains files describing an instance of
+		a PMT telemetry device that exposes hardware telemetry. Each
+		telem<x> directory has an associated telem file. This file
+		may be opened and mapped or read to access the telemetry space
+		of the device. The register layout of the telemetry space is
+		determined from an XML file that matches the PCI device id and
+		GUID for the device.
+
+What:		/sys/class/intel_pmt/telem<x>/telem
+Date:		October 2020
+KernelVersion:	5.10
+Contact:	David Box <david.e.box@linux.intel.com>
+Description:
+		(RO) The telemetry data for this telemetry device. This file
+		may be mapped or read to obtain the data.
+
+What:		/sys/class/intel_pmt/telem<x>/guid
+Date:		October 2020
+KernelVersion:	5.10
+Contact:	David Box <david.e.box@linux.intel.com>
+Description:
+		(RO) The GUID for this telemetry device. The GUID identifies
+		the version of the XML file for the parent device that is to
+		be used to get the register layout.
+
+What:		/sys/class/intel_pmt/telem<x>/size
+Date:		October 2020
+KernelVersion:	5.10
+Contact:	David Box <david.e.box@linux.intel.com>
+Description:
+		(RO) The size of telemetry region in bytes that corresponds to
+		the mapping size for the telem file.
+
+What:		/sys/class/intel_pmt/telem<x>/offset
+Date:		October 2020
+KernelVersion:	5.10
+Contact:	David Box <david.e.box@linux.intel.com>
+Description:
+		(RO) The offset of telemetry region in bytes that corresponds to
+		the mapping for the telem file.
diff --git a/MAINTAINERS b/MAINTAINERS
index cdffd5192e09..466b1c599848 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9042,6 +9042,7 @@ INTEL PMT DRIVER
 M:	"David E. Box" <david.e.box@linux.intel.com>
 S:	Maintained
 F:	drivers/mfd/intel_pmt.c
+F:	drivers/platform/x86/intel_pmt_*
 
 INTEL PRO/WIRELESS 2100, 2200BG, 2915ABG NETWORK CONNECTION SUPPORT
 M:	Stanislav Yakovlev <stas.yakovlev@gmail.com>
diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
index a1858689d6e1..9118f76ed3e9 100644
--- a/drivers/platform/x86/Kconfig
+++ b/drivers/platform/x86/Kconfig
@@ -1362,6 +1362,18 @@ config INTEL_PMC_CORE
 		- LTR Ignore
 		- MPHY/PLL gating status (Sunrisepoint PCH only)
 
+config INTEL_PMT_CLASS
+	tristate "Intel Platform Monitoring Technology (PMT) Class driver"
+	help
+	  The Intel Platform Monitoring Technology (PMT) class driver provides
+	  the basic sysfs interface and file hierarchy uses by PMT devices.
+
+	  For more information, see:
+	  <file:Documentation/ABI/testing/sysfs-class-intel_pmt>
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called intel_pmt_class.
+
 config INTEL_PUNIT_IPC
 	tristate "Intel P-Unit IPC Driver"
 	help
diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile
index 5f823f7eff45..f4b1f87f2401 100644
--- a/drivers/platform/x86/Makefile
+++ b/drivers/platform/x86/Makefile
@@ -140,6 +140,7 @@ obj-$(CONFIG_INTEL_MFLD_THERMAL)	+= intel_mid_thermal.o
 obj-$(CONFIG_INTEL_MID_POWER_BUTTON)	+= intel_mid_powerbtn.o
 obj-$(CONFIG_INTEL_MRFLD_PWRBTN)	+= intel_mrfld_pwrbtn.o
 obj-$(CONFIG_INTEL_PMC_CORE)		+= intel_pmc_core.o intel_pmc_core_pltdrv.o
+obj-$(CONFIG_INTEL_PMT_CLASS)		+= intel_pmt_class.o
 obj-$(CONFIG_INTEL_PUNIT_IPC)		+= intel_punit_ipc.o
 obj-$(CONFIG_INTEL_SCU_IPC)		+= intel_scu_ipc.o
 obj-$(CONFIG_INTEL_SCU_PCI)		+= intel_scu_pcidrv.o
diff --git a/drivers/platform/x86/intel_pmt_class.c b/drivers/platform/x86/intel_pmt_class.c
new file mode 100644
index 000000000000..aa88dc23bbde
--- /dev/null
+++ b/drivers/platform/x86/intel_pmt_class.c
@@ -0,0 +1,297 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel Platform Monitory Technology Telemetry driver
+ *
+ * Copyright (c) 2020, Intel Corporation.
+ * All Rights Reserved.
+ *
+ * Author: "Alexander Duyck" <alexander.h.duyck@linux.intel.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/pci.h>
+
+#include "intel_pmt_class.h"
+
+#define PMT_XA_START		0
+#define PMT_XA_MAX		INT_MAX
+#define PMT_XA_LIMIT		XA_LIMIT(PMT_XA_START, PMT_XA_MAX)
+
+/*
+ * sysfs
+ */
+static ssize_t
+intel_pmt_read(struct file *filp, struct kobject *kobj,
+	       struct bin_attribute *attr, char *buf, loff_t off,
+	       size_t count)
+{
+	struct intel_pmt_entry *entry = container_of(attr,
+						     struct intel_pmt_entry,
+						     pmt_bin_attr);
+
+	if (off < 0)
+		return -EINVAL;
+
+	if (off >= entry->size)
+		return 0;
+
+	if (count > entry->size - off)
+		count = entry->size - off;
+
+	memcpy_fromio(buf, entry->base + off, count);
+
+	return count;
+}
+
+static int
+intel_pmt_mmap(struct file *filp, struct kobject *kobj,
+		struct bin_attribute *attr, struct vm_area_struct *vma)
+{
+	struct intel_pmt_entry *entry = container_of(attr,
+						     struct intel_pmt_entry,
+						     pmt_bin_attr);
+	unsigned long vsize = vma->vm_end - vma->vm_start;
+	struct device *dev = kobj_to_dev(kobj);
+	unsigned long phys = entry->base_addr;
+	unsigned long pfn = PFN_DOWN(phys);
+	unsigned long psize;
+
+	if (vma->vm_flags & (VM_WRITE | VM_MAYWRITE))
+		return -EROFS;
+
+	psize = (PFN_UP(entry->base_addr + entry->size) - pfn) * PAGE_SIZE;
+	if (vsize > psize) {
+		dev_err(dev, "Requested mmap size is too large\n");
+		return -EINVAL;
+	}
+
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+	if (io_remap_pfn_range(vma, vma->vm_start, pfn,
+		vsize, vma->vm_page_prot))
+		return -EAGAIN;
+
+	return 0;
+}
+
+static ssize_t
+guid_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct intel_pmt_entry *entry = dev_get_drvdata(dev);
+
+	return sprintf(buf, "0x%x\n", entry->guid);
+}
+static DEVICE_ATTR_RO(guid);
+
+static ssize_t size_show(struct device *dev, struct device_attribute *attr,
+			 char *buf)
+{
+	struct intel_pmt_entry *entry = dev_get_drvdata(dev);
+
+	return sprintf(buf, "%zu\n", entry->size);
+}
+static DEVICE_ATTR_RO(size);
+
+static ssize_t
+offset_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct intel_pmt_entry *entry = dev_get_drvdata(dev);
+
+	return sprintf(buf, "%lu\n", offset_in_page(entry->base_addr));
+}
+static DEVICE_ATTR_RO(offset);
+
+static struct attribute *intel_pmt_attrs[] = {
+	&dev_attr_guid.attr,
+	&dev_attr_size.attr,
+	&dev_attr_offset.attr,
+	NULL
+};
+ATTRIBUTE_GROUPS(intel_pmt);
+
+static struct class intel_pmt_class = {
+	.name = "intel_pmt",
+	.owner = THIS_MODULE,
+	.dev_groups = intel_pmt_groups,
+};
+
+static int intel_pmt_populate_entry(struct intel_pmt_entry *entry,
+				    struct intel_pmt_header *header,
+				    struct device *dev,
+				    struct resource *disc_res)
+{
+	struct pci_dev *pci_dev = to_pci_dev(dev->parent);
+	u8 bir;
+
+	/*
+	 * The base offset should always be 8 byte aligned.
+	 *
+	 * For non-local access types the lower 3 bits of base offset
+	 * contains the index of the base address register where the
+	 * telemetry can be found.
+	 */
+	bir = GET_BIR(header->base_offset);
+
+	/* Local access and BARID only for now */
+	switch (header->access_type) {
+	case ACCESS_LOCAL:
+		if (bir) {
+			dev_err(dev,
+				"Unsupported BAR index %d for access type %d\n",
+				bir, header->access_type);
+			return -EINVAL;
+		}
+		/*
+		 * For access_type LOCAL, the base address is as follows:
+		 * base address = end of discovery region + base offset
+		 */
+		entry->base_addr = disc_res->end + 1 + header->base_offset;
+		break;
+	case ACCESS_BARID:
+		/*
+		 * If another BAR was specified then the base offset
+		 * represents the offset within that BAR. SO retrieve the
+		 * address from the parent PCI device and add offset.
+		 */
+		entry->base_addr = pci_resource_start(pci_dev, bir) +
+				   GET_ADDRESS(header->base_offset);
+		break;
+	default:
+		dev_err(dev, "Unsupported access type %d\n",
+			header->access_type);
+		return -EINVAL;
+	}
+
+	entry->guid = header->guid;
+	entry->size = header->size;
+
+	return 0;
+}
+
+static int intel_pmt_dev_register(struct intel_pmt_entry *entry,
+				  struct intel_pmt_namespace *ns,
+				  struct device *parent)
+{
+	struct resource res;
+	struct device *dev;
+	int ret;
+
+	ret = xa_alloc(ns->xa, &entry->devid, entry, PMT_XA_LIMIT, GFP_KERNEL);
+	if (ret)
+		return ret;
+
+	dev = device_create(&intel_pmt_class, parent, MKDEV(0, 0), entry,
+			    "%s%d", ns->name, entry->devid);
+
+	if (IS_ERR(dev)) {
+		dev_err(parent, "Could not create %s%d device node\n",
+			ns->name, entry->devid);
+		ret = PTR_ERR(dev);
+		goto fail_dev_create;
+	}
+
+	entry->kobj = &dev->kobj;
+
+	if (ns->attr_grp) {
+		ret = sysfs_create_group(entry->kobj, ns->attr_grp);
+		if (ret)
+			goto fail_sysfs;
+	}
+
+	/* if size is 0 assume no data buffer, so no file needed */
+	if (!entry->size)
+		return 0;
+
+	res.start = entry->base_addr;
+	res.end = res.start + entry->size - 1;
+	res.flags = IORESOURCE_MEM;
+
+	entry->base = devm_ioremap_resource(dev, &res);
+	if (IS_ERR(entry->base)) {
+		ret = PTR_ERR(entry->base);
+		goto fail_ioremap;
+	}
+
+	sysfs_bin_attr_init(&entry->pmt_bin_attr);
+	entry->pmt_bin_attr.attr.name = ns->name;
+	entry->pmt_bin_attr.attr.mode = 0440;
+	entry->pmt_bin_attr.mmap = intel_pmt_mmap;
+	entry->pmt_bin_attr.read = intel_pmt_read;
+	entry->pmt_bin_attr.size = entry->size;
+
+	ret = sysfs_create_bin_file(&dev->kobj, &entry->pmt_bin_attr);
+	if (!ret)
+		return 0;
+
+fail_ioremap:
+	sysfs_remove_group(entry->kobj, ns->attr_grp);
+fail_sysfs:
+	device_unregister(dev);
+fail_dev_create:
+	xa_erase(ns->xa, entry->devid);
+
+	return ret;
+}
+
+int intel_pmt_dev_create(struct intel_pmt_entry *entry,
+			 struct intel_pmt_namespace *ns,
+			 struct platform_device *pdev, int idx)
+{
+	struct intel_pmt_header header;
+	struct resource	*disc_res;
+	int ret = -ENODEV;
+
+	disc_res = platform_get_resource(pdev, IORESOURCE_MEM, idx);
+	if (!disc_res)
+		return ret;
+
+	entry->disc_table = devm_platform_ioremap_resource(pdev, idx);
+	if (IS_ERR(entry->disc_table))
+		return PTR_ERR(entry->disc_table);
+
+	ret = ns->pmt_header_decode(entry, &header, &pdev->dev);
+	if (ret)
+		return ret;
+
+	ret = intel_pmt_populate_entry(entry, &header, &pdev->dev, disc_res);
+	if (ret)
+		return ret;
+
+	return intel_pmt_dev_register(entry, ns, &pdev->dev);
+
+}
+EXPORT_SYMBOL_GPL(intel_pmt_dev_create);
+
+void intel_pmt_dev_destroy(struct intel_pmt_entry *entry,
+			   struct intel_pmt_namespace *ns)
+{
+	struct device *dev = kobj_to_dev(entry->kobj);
+
+	if (entry->size)
+		sysfs_remove_bin_file(entry->kobj, &entry->pmt_bin_attr);
+
+	if (ns->attr_grp)
+		sysfs_remove_group(entry->kobj, ns->attr_grp);
+
+	device_unregister(dev);
+	xa_erase(ns->xa, entry->devid);
+}
+EXPORT_SYMBOL_GPL(intel_pmt_dev_destroy);
+
+static int __init pmt_class_init(void)
+{
+	return class_register(&intel_pmt_class);
+}
+
+static void __exit pmt_class_exit(void)
+{
+	class_unregister(&intel_pmt_class);
+}
+
+module_init(pmt_class_init);
+module_exit(pmt_class_exit);
+
+MODULE_AUTHOR("Alexander Duyck <alexander.h.duyck@linux.intel.com>");
+MODULE_DESCRIPTION("Intel PMT Class driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/platform/x86/intel_pmt_class.h b/drivers/platform/x86/intel_pmt_class.h
new file mode 100644
index 000000000000..de8f8139ba31
--- /dev/null
+++ b/drivers/platform/x86/intel_pmt_class.h
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _INTEL_PMT_CLASS_H
+#define _INTEL_PMT_CLASS_H
+
+#include <linux/platform_device.h>
+#include <linux/xarray.h>
+#include <linux/types.h>
+#include <linux/bits.h>
+#include <linux/err.h>
+#include <linux/io.h>
+
+/* PMT access types */
+#define ACCESS_BARID		2
+#define ACCESS_LOCAL		3
+
+/* PMT discovery base address/offset register layout */
+#define GET_BIR(v)		((v) & GENMASK(2, 0))
+#define GET_ADDRESS(v)		((v) & GENMASK(31, 3))
+
+struct intel_pmt_entry {
+	struct bin_attribute	pmt_bin_attr;
+	struct kobject		*kobj;
+	void __iomem		*disc_table;
+	void __iomem		*base;
+	unsigned long		base_addr;
+	size_t			size;
+	u32			guid;
+	int			devid;
+};
+
+struct intel_pmt_header {
+	u32	base_offset;
+	u32	size;
+	u32	guid;
+	u8	access_type;
+};
+
+struct intel_pmt_namespace {
+	const char *name;
+	struct xarray *xa;
+	const struct attribute_group *attr_grp;
+	int (*pmt_header_decode)(struct intel_pmt_entry *entry,
+				 struct intel_pmt_header *header,
+				 struct device *dev);
+};
+
+int intel_pmt_dev_create(struct intel_pmt_entry *entry,
+			 struct intel_pmt_namespace *ns,
+			 struct platform_device *pdev, int idx);
+void intel_pmt_dev_destroy(struct intel_pmt_entry *entry,
+			   struct intel_pmt_namespace *ns);
+#endif
-- 
Gitee


From 0d50cb6937086b9a8aa6e24a6ff5ff4a981b23c5 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@linux.intel.com>
Date: Wed, 28 Oct 2020 18:55:35 -0700
Subject: [PATCH 052/674] platform/x86: Intel PMT Telemetry capability driver

mainline inclusion
from mainline-v5.11-rc1
commit 68fe8e6e2c4b04e2733d77834f55a4a0e172b770
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596K9
CVE: NA

Intel-SIG: commit 68fe8e6e2c4b platform/x86: Intel PMT Telemetry capability
driver.
Backport for intel PMT (Platform Monitoring Technology) support

--------------------------------

PMT Telemetry is a capability of the Intel Platform Monitoring Technology.
The Telemetry capability provides access to device telemetry metrics that
provide hardware performance data to users from read-only register spaces.

With this driver present the intel_pmt directory can be populated with
telem<x> devices. These devices will contain the standard intel_pmt sysfs
data and a "telem" binary sysfs attribute which can be used to access the
telemetry data.

Also create a PCI device id list for early telemetry hardware that require
workarounds for known issues.

Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
Co-developed-by: David E. Box <david.e.box@linux.intel.com>
Signed-off-by: David E. Box <david.e.box@linux.intel.com>
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/platform/x86/Kconfig               |  11 ++
 drivers/platform/x86/Makefile              |   1 +
 drivers/platform/x86/intel_pmt_telemetry.c | 160 +++++++++++++++++++++
 3 files changed, 172 insertions(+)
 create mode 100644 drivers/platform/x86/intel_pmt_telemetry.c

diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
index 9118f76ed3e9..ac93c807eb9a 100644
--- a/drivers/platform/x86/Kconfig
+++ b/drivers/platform/x86/Kconfig
@@ -1374,6 +1374,17 @@ config INTEL_PMT_CLASS
 	  To compile this driver as a module, choose M here: the module
 	  will be called intel_pmt_class.
 
+config INTEL_PMT_TELEMETRY
+	tristate "Intel Platform Monitoring Technology (PMT) Telemetry driver"
+	select INTEL_PMT_CLASS
+	help
+	  The Intel Platform Monitory Technology (PMT) Telemetry driver provides
+	  access to hardware telemetry metrics on devices that support the
+	  feature.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called intel_pmt_telemetry.
+
 config INTEL_PUNIT_IPC
 	tristate "Intel P-Unit IPC Driver"
 	help
diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile
index f4b1f87f2401..6a7b61f59ea8 100644
--- a/drivers/platform/x86/Makefile
+++ b/drivers/platform/x86/Makefile
@@ -141,6 +141,7 @@ obj-$(CONFIG_INTEL_MID_POWER_BUTTON)	+= intel_mid_powerbtn.o
 obj-$(CONFIG_INTEL_MRFLD_PWRBTN)	+= intel_mrfld_pwrbtn.o
 obj-$(CONFIG_INTEL_PMC_CORE)		+= intel_pmc_core.o intel_pmc_core_pltdrv.o
 obj-$(CONFIG_INTEL_PMT_CLASS)		+= intel_pmt_class.o
+obj-$(CONFIG_INTEL_PMT_TELEMETRY)	+= intel_pmt_telemetry.o
 obj-$(CONFIG_INTEL_PUNIT_IPC)		+= intel_punit_ipc.o
 obj-$(CONFIG_INTEL_SCU_IPC)		+= intel_scu_ipc.o
 obj-$(CONFIG_INTEL_SCU_PCI)		+= intel_scu_pcidrv.o
diff --git a/drivers/platform/x86/intel_pmt_telemetry.c b/drivers/platform/x86/intel_pmt_telemetry.c
new file mode 100644
index 000000000000..f8a87614efa4
--- /dev/null
+++ b/drivers/platform/x86/intel_pmt_telemetry.c
@@ -0,0 +1,160 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel Platform Monitory Technology Telemetry driver
+ *
+ * Copyright (c) 2020, Intel Corporation.
+ * All Rights Reserved.
+ *
+ * Author: "David E. Box" <david.e.box@linux.intel.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/overflow.h>
+
+#include "intel_pmt_class.h"
+
+#define TELEM_DEV_NAME		"pmt_telemetry"
+
+#define TELEM_SIZE_OFFSET	0x0
+#define TELEM_GUID_OFFSET	0x4
+#define TELEM_BASE_OFFSET	0x8
+#define TELEM_ACCESS(v)		((v) & GENMASK(3, 0))
+/* size is in bytes */
+#define TELEM_SIZE(v)		(((v) & GENMASK(27, 12)) >> 10)
+
+/* Used by client hardware to identify a fixed telemetry entry*/
+#define TELEM_CLIENT_FIXED_BLOCK_GUID	0x10000000
+
+struct pmt_telem_priv {
+	int				num_entries;
+	struct intel_pmt_entry		entry[];
+};
+
+/*
+ * Early implementations of PMT on client platforms have some
+ * differences from the server platforms (which use the Out Of Band
+ * Management Services Module OOBMSM). This list tracks those
+ * platforms as needed to handle those differences. Newer client
+ * platforms are expected to be fully compatible with server.
+ */
+static const struct pci_device_id pmt_telem_early_client_pci_ids[] = {
+	{ PCI_VDEVICE(INTEL, 0x9a0d) }, /* TGL */
+	{ PCI_VDEVICE(INTEL, 0x467d) }, /* ADL */
+	{ }
+};
+
+static bool intel_pmt_is_early_client_hw(struct device *dev)
+{
+	struct pci_dev *parent = to_pci_dev(dev->parent);
+
+	return !!pci_match_id(pmt_telem_early_client_pci_ids, parent);
+}
+
+static bool pmt_telem_region_overlaps(struct intel_pmt_entry *entry,
+				      struct device *dev)
+{
+	u32 guid = readl(entry->disc_table + TELEM_GUID_OFFSET);
+
+	if (guid != TELEM_CLIENT_FIXED_BLOCK_GUID)
+		return false;
+
+	return intel_pmt_is_early_client_hw(dev);
+}
+
+static int pmt_telem_header_decode(struct intel_pmt_entry *entry,
+				   struct intel_pmt_header *header,
+				   struct device *dev)
+{
+	void __iomem *disc_table = entry->disc_table;
+
+	if (pmt_telem_region_overlaps(entry, dev))
+		return 1;
+
+	header->access_type = TELEM_ACCESS(readl(disc_table));
+	header->guid = readl(disc_table + TELEM_GUID_OFFSET);
+	header->base_offset = readl(disc_table + TELEM_BASE_OFFSET);
+
+	/* Size is measured in DWORDS, but accessor returns bytes */
+	header->size = TELEM_SIZE(readl(disc_table));
+
+	return 0;
+}
+
+static DEFINE_XARRAY_ALLOC(telem_array);
+static struct intel_pmt_namespace pmt_telem_ns = {
+	.name = "telem",
+	.xa = &telem_array,
+	.pmt_header_decode = pmt_telem_header_decode,
+};
+
+static int pmt_telem_remove(struct platform_device *pdev)
+{
+	struct pmt_telem_priv *priv = platform_get_drvdata(pdev);
+	int i;
+
+	for (i = 0; i < priv->num_entries; i++)
+		intel_pmt_dev_destroy(&priv->entry[i], &pmt_telem_ns);
+
+	return 0;
+}
+
+static int pmt_telem_probe(struct platform_device *pdev)
+{
+	struct pmt_telem_priv *priv;
+	size_t size;
+	int i, ret;
+
+	size = struct_size(priv, entry, pdev->num_resources);
+	priv = devm_kzalloc(&pdev->dev, size, GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+
+	platform_set_drvdata(pdev, priv);
+
+	for (i = 0; i < pdev->num_resources; i++) {
+		struct intel_pmt_entry *entry = &priv->entry[i];
+
+		ret = intel_pmt_dev_create(entry, &pmt_telem_ns, pdev, i);
+		if (ret < 0)
+			goto abort_probe;
+		if (ret)
+			continue;
+
+		priv->num_entries++;
+	}
+
+	return 0;
+abort_probe:
+	pmt_telem_remove(pdev);
+	return ret;
+}
+
+static struct platform_driver pmt_telem_driver = {
+	.driver = {
+		.name   = TELEM_DEV_NAME,
+	},
+	.remove = pmt_telem_remove,
+	.probe  = pmt_telem_probe,
+};
+
+static int __init pmt_telem_init(void)
+{
+	return platform_driver_register(&pmt_telem_driver);
+}
+module_init(pmt_telem_init);
+
+static void __exit pmt_telem_exit(void)
+{
+	platform_driver_unregister(&pmt_telem_driver);
+	xa_destroy(&telem_array);
+}
+module_exit(pmt_telem_exit);
+
+MODULE_AUTHOR("David E. Box <david.e.box@linux.intel.com>");
+MODULE_DESCRIPTION("Intel PMT Telemetry driver");
+MODULE_ALIAS("platform:" TELEM_DEV_NAME);
+MODULE_LICENSE("GPL v2");
-- 
Gitee


From 140652ff28bd44545c96b7a23da8f9fd101f8b33 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@linux.intel.com>
Date: Wed, 28 Oct 2020 18:55:36 -0700
Subject: [PATCH 053/674] platform/x86: Intel PMT Crashlog capability driver

mainline inclusion
from mainline-v5.11-rc1
commit 5ef9998c96b0c99c49c202054586967e609286d2
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596K9
CVE: NA

Intel-SIG: commit 5ef9998c96b0 platform/x86: Intel PMT Crashlog
capability driver.
Backport for intel PMT (Platform Monitoring Technology) support

--------------------------------

Add support for the Intel Platform Monitoring Technology crashlog
interface. This interface provides a few sysfs values to allow for
controlling the crashlog telemetry interface as well as a character
driver to allow for mapping the crashlog memory region so that it can be
accessed after a crashlog has been recorded.

This driver is meant to only support the server version of the crashlog
which is identified as crash_type 1 with a version of zero. Currently no
other types are supported.

Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
Signed-off-by: David E. Box <david.e.box@linux.intel.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 .../ABI/testing/sysfs-class-intel_pmt         |  65 ++++
 drivers/platform/x86/Kconfig                  |  11 +
 drivers/platform/x86/Makefile                 |   1 +
 drivers/platform/x86/intel_pmt_crashlog.c     | 328 ++++++++++++++++++
 4 files changed, 405 insertions(+)
 create mode 100644 drivers/platform/x86/intel_pmt_crashlog.c

diff --git a/Documentation/ABI/testing/sysfs-class-intel_pmt b/Documentation/ABI/testing/sysfs-class-intel_pmt
index 926b5cf95fd1..ed4c886a21b1 100644
--- a/Documentation/ABI/testing/sysfs-class-intel_pmt
+++ b/Documentation/ABI/testing/sysfs-class-intel_pmt
@@ -52,3 +52,68 @@ Contact:	David Box <david.e.box@linux.intel.com>
 Description:
 		(RO) The offset of telemetry region in bytes that corresponds to
 		the mapping for the telem file.
+
+What:		/sys/class/intel_pmt/crashlog<x>
+Date:		October 2020
+KernelVersion:	5.10
+Contact:	Alexander Duyck <alexander.h.duyck@linux.intel.com>
+Description:
+		The crashlog<x> directory contains files for configuring an
+		instance of a PMT crashlog device that can perform crash data
+		recording. Each crashlog<x> device has an associated crashlog
+		file. This file can be opened and mapped or read to access the
+		resulting crashlog buffer. The register layout for the buffer
+		can be determined from an XML file of specified GUID for the
+		parent device.
+
+What:		/sys/class/intel_pmt/crashlog<x>/crashlog
+Date:		October 2020
+KernelVersion:	5.10
+Contact:	David Box <david.e.box@linux.intel.com>
+Description:
+		(RO) The crashlog buffer for this crashlog device. This file
+		may be mapped or read to obtain the data.
+
+What:		/sys/class/intel_pmt/crashlog<x>/guid
+Date:		October 2020
+KernelVersion:	5.10
+Contact:	Alexander Duyck <alexander.h.duyck@linux.intel.com>
+Description:
+		(RO) The GUID for this crashlog device. The GUID identifies the
+		version of the XML file for the parent device that should be
+		used to determine the register layout.
+
+What:		/sys/class/intel_pmt/crashlog<x>/size
+Date:		October 2020
+KernelVersion:	5.10
+Contact:	Alexander Duyck <alexander.h.duyck@linux.intel.com>
+Description:
+		(RO) The length of the result buffer in bytes that corresponds
+		to the size for the crashlog buffer.
+
+What:		/sys/class/intel_pmt/crashlog<x>/offset
+Date:		October 2020
+KernelVersion:	5.10
+Contact:	Alexander Duyck <alexander.h.duyck@linux.intel.com>
+Description:
+		(RO) The offset of the buffer in bytes that corresponds
+		to the mapping for the crashlog device.
+
+What:		/sys/class/intel_pmt/crashlog<x>/enable
+Date:		October 2020
+KernelVersion:	5.10
+Contact:	Alexander Duyck <alexander.h.duyck@linux.intel.com>
+Description:
+		(RW) Boolean value controlling if the crashlog functionality
+		is enabled for the crashlog device.
+
+What:		/sys/class/intel_pmt/crashlog<x>/trigger
+Date:		October 2020
+KernelVersion:	5.10
+Contact:	Alexander Duyck <alexander.h.duyck@linux.intel.com>
+Description:
+		(RW) Boolean value controlling the triggering of the crashlog
+		device node. When read it provides data on if the crashlog has
+		been triggered. When written to it can be used to either clear
+		the current trigger by writing false, or to trigger a new
+		event if the trigger is not currently set.
diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
index ac93c807eb9a..da36a682d0cf 100644
--- a/drivers/platform/x86/Kconfig
+++ b/drivers/platform/x86/Kconfig
@@ -1385,6 +1385,17 @@ config INTEL_PMT_TELEMETRY
 	  To compile this driver as a module, choose M here: the module
 	  will be called intel_pmt_telemetry.
 
+config INTEL_PMT_CRASHLOG
+	tristate "Intel Platform Monitoring Technology (PMT) Crashlog driver"
+	select INTEL_PMT_CLASS
+	help
+	  The Intel Platform Monitoring Technology (PMT) crashlog driver provides
+	  access to hardware crashlog capabilities on devices that support the
+	  feature.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called intel_pmt_crashlog.
+
 config INTEL_PUNIT_IPC
 	tristate "Intel P-Unit IPC Driver"
 	help
diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile
index 6a7b61f59ea8..ca82c1344977 100644
--- a/drivers/platform/x86/Makefile
+++ b/drivers/platform/x86/Makefile
@@ -142,6 +142,7 @@ obj-$(CONFIG_INTEL_MRFLD_PWRBTN)	+= intel_mrfld_pwrbtn.o
 obj-$(CONFIG_INTEL_PMC_CORE)		+= intel_pmc_core.o intel_pmc_core_pltdrv.o
 obj-$(CONFIG_INTEL_PMT_CLASS)		+= intel_pmt_class.o
 obj-$(CONFIG_INTEL_PMT_TELEMETRY)	+= intel_pmt_telemetry.o
+obj-$(CONFIG_INTEL_PMT_CRASHLOG)	+= intel_pmt_crashlog.o
 obj-$(CONFIG_INTEL_PUNIT_IPC)		+= intel_punit_ipc.o
 obj-$(CONFIG_INTEL_SCU_IPC)		+= intel_scu_ipc.o
 obj-$(CONFIG_INTEL_SCU_PCI)		+= intel_scu_pcidrv.o
diff --git a/drivers/platform/x86/intel_pmt_crashlog.c b/drivers/platform/x86/intel_pmt_crashlog.c
new file mode 100644
index 000000000000..97dd749c8290
--- /dev/null
+++ b/drivers/platform/x86/intel_pmt_crashlog.c
@@ -0,0 +1,328 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel Platform Monitoring Technology Crashlog driver
+ *
+ * Copyright (c) 2020, Intel Corporation.
+ * All Rights Reserved.
+ *
+ * Author: "Alexander Duyck" <alexander.h.duyck@linux.intel.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/overflow.h>
+
+#include "intel_pmt_class.h"
+
+#define DRV_NAME		"pmt_crashlog"
+
+/* Crashlog discovery header types */
+#define CRASH_TYPE_OOBMSM	1
+
+/* Control Flags */
+#define CRASHLOG_FLAG_DISABLE		BIT(27)
+
+/*
+ * Bits 28 and 29 control the state of bit 31.
+ *
+ * Bit 28 will clear bit 31, if set, allowing a new crashlog to be captured.
+ * Bit 29 will immediately trigger a crashlog to be generated, setting bit 31.
+ * Bit 30 is read-only and reserved as 0.
+ * Bit 31 is the read-only status with a 1 indicating log is complete.
+ */
+#define CRASHLOG_FLAG_TRIGGER_CLEAR	BIT(28)
+#define CRASHLOG_FLAG_TRIGGER_EXECUTE	BIT(29)
+#define CRASHLOG_FLAG_TRIGGER_COMPLETE	BIT(31)
+#define CRASHLOG_FLAG_TRIGGER_MASK	GENMASK(31, 28)
+
+/* Crashlog Discovery Header */
+#define CONTROL_OFFSET		0x0
+#define GUID_OFFSET		0x4
+#define BASE_OFFSET		0x8
+#define SIZE_OFFSET		0xC
+#define GET_ACCESS(v)		((v) & GENMASK(3, 0))
+#define GET_TYPE(v)		(((v) & GENMASK(7, 4)) >> 4)
+#define GET_VERSION(v)		(((v) & GENMASK(19, 16)) >> 16)
+/* size is in bytes */
+#define GET_SIZE(v)		((v) * sizeof(u32))
+
+struct crashlog_entry {
+	/* entry must be first member of struct */
+	struct intel_pmt_entry		entry;
+	struct mutex			control_mutex;
+};
+
+struct pmt_crashlog_priv {
+	int			num_entries;
+	struct crashlog_entry	entry[];
+};
+
+/*
+ * I/O
+ */
+static bool pmt_crashlog_complete(struct intel_pmt_entry *entry)
+{
+	u32 control = readl(entry->disc_table + CONTROL_OFFSET);
+
+	/* return current value of the crashlog complete flag */
+	return !!(control & CRASHLOG_FLAG_TRIGGER_COMPLETE);
+}
+
+static bool pmt_crashlog_disabled(struct intel_pmt_entry *entry)
+{
+	u32 control = readl(entry->disc_table + CONTROL_OFFSET);
+
+	/* return current value of the crashlog disabled flag */
+	return !!(control & CRASHLOG_FLAG_DISABLE);
+}
+
+static bool pmt_crashlog_supported(struct intel_pmt_entry *entry)
+{
+	u32 discovery_header = readl(entry->disc_table + CONTROL_OFFSET);
+	u32 crash_type, version;
+
+	crash_type = GET_TYPE(discovery_header);
+	version = GET_VERSION(discovery_header);
+
+	/*
+	 * Currently we only recognize OOBMSM version 0 devices.
+	 * We can ignore all other crashlog devices in the system.
+	 */
+	return crash_type == CRASH_TYPE_OOBMSM && version == 0;
+}
+
+static void pmt_crashlog_set_disable(struct intel_pmt_entry *entry,
+				     bool disable)
+{
+	u32 control = readl(entry->disc_table + CONTROL_OFFSET);
+
+	/* clear trigger bits so we are only modifying disable flag */
+	control &= ~CRASHLOG_FLAG_TRIGGER_MASK;
+
+	if (disable)
+		control |= CRASHLOG_FLAG_DISABLE;
+	else
+		control &= ~CRASHLOG_FLAG_DISABLE;
+
+	writel(control, entry->disc_table + CONTROL_OFFSET);
+}
+
+static void pmt_crashlog_set_clear(struct intel_pmt_entry *entry)
+{
+	u32 control = readl(entry->disc_table + CONTROL_OFFSET);
+
+	control &= ~CRASHLOG_FLAG_TRIGGER_MASK;
+	control |= CRASHLOG_FLAG_TRIGGER_CLEAR;
+
+	writel(control, entry->disc_table + CONTROL_OFFSET);
+}
+
+static void pmt_crashlog_set_execute(struct intel_pmt_entry *entry)
+{
+	u32 control = readl(entry->disc_table + CONTROL_OFFSET);
+
+	control &= ~CRASHLOG_FLAG_TRIGGER_MASK;
+	control |= CRASHLOG_FLAG_TRIGGER_EXECUTE;
+
+	writel(control, entry->disc_table + CONTROL_OFFSET);
+}
+
+/*
+ * sysfs
+ */
+static ssize_t
+enable_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct intel_pmt_entry *entry = dev_get_drvdata(dev);
+	int enabled = !pmt_crashlog_disabled(entry);
+
+	return sprintf(buf, "%d\n", enabled);
+}
+
+static ssize_t
+enable_store(struct device *dev, struct device_attribute *attr,
+	    const char *buf, size_t count)
+{
+	struct crashlog_entry *entry;
+	bool enabled;
+	int result;
+
+	entry = dev_get_drvdata(dev);
+
+	result = kstrtobool(buf, &enabled);
+	if (result)
+		return result;
+
+	mutex_lock(&entry->control_mutex);
+	pmt_crashlog_set_disable(&entry->entry, !enabled);
+	mutex_unlock(&entry->control_mutex);
+
+	return count;
+}
+static DEVICE_ATTR_RW(enable);
+
+static ssize_t
+trigger_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct intel_pmt_entry *entry;
+	int trigger;
+
+	entry = dev_get_drvdata(dev);
+	trigger = pmt_crashlog_complete(entry);
+
+	return sprintf(buf, "%d\n", trigger);
+}
+
+static ssize_t
+trigger_store(struct device *dev, struct device_attribute *attr,
+	    const char *buf, size_t count)
+{
+	struct crashlog_entry *entry;
+	bool trigger;
+	int result;
+
+	entry = dev_get_drvdata(dev);
+
+	result = kstrtobool(buf, &trigger);
+	if (result)
+		return result;
+
+	mutex_lock(&entry->control_mutex);
+
+	if (!trigger) {
+		pmt_crashlog_set_clear(&entry->entry);
+	} else if (pmt_crashlog_complete(&entry->entry)) {
+		/* we cannot trigger a new crash if one is still pending */
+		result = -EEXIST;
+		goto err;
+	} else if (pmt_crashlog_disabled(&entry->entry)) {
+		/* if device is currently disabled, return busy */
+		result = -EBUSY;
+		goto err;
+	} else {
+		pmt_crashlog_set_execute(&entry->entry);
+	}
+
+	result = count;
+err:
+	mutex_unlock(&entry->control_mutex);
+	return result;
+}
+static DEVICE_ATTR_RW(trigger);
+
+static struct attribute *pmt_crashlog_attrs[] = {
+	&dev_attr_enable.attr,
+	&dev_attr_trigger.attr,
+	NULL
+};
+
+static struct attribute_group pmt_crashlog_group = {
+	.attrs	= pmt_crashlog_attrs,
+};
+
+static int pmt_crashlog_header_decode(struct intel_pmt_entry *entry,
+				      struct intel_pmt_header *header,
+				      struct device *dev)
+{
+	void __iomem *disc_table = entry->disc_table;
+	struct crashlog_entry *crashlog;
+
+	if (!pmt_crashlog_supported(entry))
+		return 1;
+
+	/* initialize control mutex */
+	crashlog = container_of(entry, struct crashlog_entry, entry);
+	mutex_init(&crashlog->control_mutex);
+
+	header->access_type = GET_ACCESS(readl(disc_table));
+	header->guid = readl(disc_table + GUID_OFFSET);
+	header->base_offset = readl(disc_table + BASE_OFFSET);
+
+	/* Size is measured in DWORDS, but accessor returns bytes */
+	header->size = GET_SIZE(readl(disc_table + SIZE_OFFSET));
+
+	return 0;
+}
+
+static DEFINE_XARRAY_ALLOC(crashlog_array);
+static struct intel_pmt_namespace pmt_crashlog_ns = {
+	.name = "crashlog",
+	.xa = &crashlog_array,
+	.attr_grp = &pmt_crashlog_group,
+	.pmt_header_decode = pmt_crashlog_header_decode,
+};
+
+/*
+ * initialization
+ */
+static int pmt_crashlog_remove(struct platform_device *pdev)
+{
+	struct pmt_crashlog_priv *priv = platform_get_drvdata(pdev);
+	int i;
+
+	for (i = 0; i < priv->num_entries; i++)
+		intel_pmt_dev_destroy(&priv->entry[i].entry, &pmt_crashlog_ns);
+
+	return 0;
+}
+
+static int pmt_crashlog_probe(struct platform_device *pdev)
+{
+	struct pmt_crashlog_priv *priv;
+	size_t size;
+	int i, ret;
+
+	size = struct_size(priv, entry, pdev->num_resources);
+	priv = devm_kzalloc(&pdev->dev, size, GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+
+	platform_set_drvdata(pdev, priv);
+
+	for (i = 0; i < pdev->num_resources; i++) {
+		struct intel_pmt_entry *entry = &priv->entry[i].entry;
+
+		ret = intel_pmt_dev_create(entry, &pmt_crashlog_ns, pdev, i);
+		if (ret < 0)
+			goto abort_probe;
+		if (ret)
+			continue;
+
+		priv->num_entries++;
+	}
+
+	return 0;
+abort_probe:
+	pmt_crashlog_remove(pdev);
+	return ret;
+}
+
+static struct platform_driver pmt_crashlog_driver = {
+	.driver = {
+		.name   = DRV_NAME,
+	},
+	.remove = pmt_crashlog_remove,
+	.probe  = pmt_crashlog_probe,
+};
+
+static int __init pmt_crashlog_init(void)
+{
+	return platform_driver_register(&pmt_crashlog_driver);
+}
+
+static void __exit pmt_crashlog_exit(void)
+{
+	platform_driver_unregister(&pmt_crashlog_driver);
+	xa_destroy(&crashlog_array);
+}
+
+module_init(pmt_crashlog_init);
+module_exit(pmt_crashlog_exit);
+
+MODULE_AUTHOR("Alexander Duyck <alexander.h.duyck@linux.intel.com>");
+MODULE_DESCRIPTION("Intel PMT Crashlog driver");
+MODULE_ALIAS("platform:" DRV_NAME);
+MODULE_LICENSE("GPL v2");
-- 
Gitee


From c0a64500da64cf47e8d6847eff107059b39d702a Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Tue, 17 Nov 2020 10:22:51 +0300
Subject: [PATCH 054/674] platform/x86: pmt: Fix a potential Oops on error in
 probe

mainline inclusion
from mainline-v5.11-rc1
commit d3d73d25e0d9bc43fd2a6f4b4e58ff182e55b217
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596K9
CVE: NA

Intel-SIG: commit d3d73d25e0d9 platform/x86: pmt: Fix a potential Oops
on error in probe.
Backport for intel PMT (Platform Monitoring Technology) support

--------------------------------

The "ns->attr_grp" pointer can be NULL so this error handling code needs
to check for that to avoid an Oops.

Fixes: e2729113ce66 ("platform/x86: Intel PMT class driver")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Reviewed-by: David E. Box <david.e.box@linux.intel.com>
Link: https://lore.kernel.org/r/20201117072251.GC1111239@mwanda
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/platform/x86/intel_pmt_class.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/platform/x86/intel_pmt_class.c b/drivers/platform/x86/intel_pmt_class.c
index aa88dc23bbde..c8939fba4509 100644
--- a/drivers/platform/x86/intel_pmt_class.c
+++ b/drivers/platform/x86/intel_pmt_class.c
@@ -225,7 +225,8 @@ static int intel_pmt_dev_register(struct intel_pmt_entry *entry,
 		return 0;
 
 fail_ioremap:
-	sysfs_remove_group(entry->kobj, ns->attr_grp);
+	if (ns->attr_grp)
+		sysfs_remove_group(entry->kobj, ns->attr_grp);
 fail_sysfs:
 	device_unregister(dev);
 fail_dev_create:
-- 
Gitee


From ebd22f42906106451221f9ec6f628fe191cf9826 Mon Sep 17 00:00:00 2001
From: "David E. Box" <david.e.box@linux.intel.com>
Date: Tue, 26 Jan 2021 12:55:06 -0800
Subject: [PATCH 055/674] platform/x86: intel_pmt: Make INTEL_PMT_CLASS
 non-user-selectable

mainline inclusion
from mainline-v5.12-rc2
commit 35d8a973fe4d38afee944db636c3d2b1df3741a7
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596K9
CVE: NA

Intel-SIG: commit 35d8a973fe4d platform/x86: intel_pmt: Make
INTEL_PMT_CLASS non-user-selectable.
Backport for intel PMT (Platform Monitoring Technology) support

--------------------------------

Fix error in Kconfig that exposed INTEL_PMT_CLASS as a user selectable
option. It is already selected by INTEL_PMT_TELEMETRY and
INTEL_PMT_CRASHLOG which are user selectable.

Fixes: e2729113ce66 ("platform/x86: Intel PMT class driver")
Signed-off-by: David E. Box <david.e.box@linux.intel.com>
Link: https://lore.kernel.org/r/20210126205508.30907-1-david.e.box@linux.intel.com
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/platform/x86/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
index da36a682d0cf..15c8dd312af5 100644
--- a/drivers/platform/x86/Kconfig
+++ b/drivers/platform/x86/Kconfig
@@ -1363,7 +1363,7 @@ config INTEL_PMC_CORE
 		- MPHY/PLL gating status (Sunrisepoint PCH only)
 
 config INTEL_PMT_CLASS
-	tristate "Intel Platform Monitoring Technology (PMT) Class driver"
+	tristate
 	help
 	  The Intel Platform Monitoring Technology (PMT) class driver provides
 	  the basic sysfs interface and file hierarchy uses by PMT devices.
-- 
Gitee


From 40bf0eee7dfb6c7eb5adbb2d6a1494ba28c6fdca Mon Sep 17 00:00:00 2001
From: "David E. Box" <david.e.box@linux.intel.com>
Date: Tue, 26 Jan 2021 12:55:07 -0800
Subject: [PATCH 056/674] platform/x86: intel_pmt_telemetry: Add dependency on
 MFD_INTEL_PMT

mainline inclusion
from mainline-v5.12-rc2
commit f3f6da5014dea3cc005b36948abe3664b5d1f7d3
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596K9
CVE: NA

Intel-SIG: commit f3f6da5014de platform/x86: intel_pmt_telemetry: Add
dependency on MFD_INTEL_PMT.
Backport for intel PMT (Platform Monitoring Technology) support

--------------------------------

All devices that expose Intel Platform Monitoring Technology (PMT)
telemetry are currently owned by the intel_pmt MFD driver. Therefore make
the telemetry driver depend on the MFD driver for build.

Fixes: 68fe8e6e2c4b ("platform/x86: Intel PMT Telemetry capability driver")
Signed-off-by: David E. Box <david.e.box@linux.intel.com>
Link: https://lore.kernel.org/r/20210126205508.30907-2-david.e.box@linux.intel.com
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/platform/x86/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
index 15c8dd312af5..06484c5c5b8c 100644
--- a/drivers/platform/x86/Kconfig
+++ b/drivers/platform/x86/Kconfig
@@ -1376,6 +1376,7 @@ config INTEL_PMT_CLASS
 
 config INTEL_PMT_TELEMETRY
 	tristate "Intel Platform Monitoring Technology (PMT) Telemetry driver"
+	depends on MFD_INTEL_PMT
 	select INTEL_PMT_CLASS
 	help
 	  The Intel Platform Monitory Technology (PMT) Telemetry driver provides
-- 
Gitee


From e44c89b41641116e22bbb153e86b3cece9b0bfdf Mon Sep 17 00:00:00 2001
From: "David E. Box" <david.e.box@linux.intel.com>
Date: Tue, 26 Jan 2021 12:55:08 -0800
Subject: [PATCH 057/674] platform/x86: intel_pmt_crashlog: Add dependency on
 MFD_INTEL_PMT

mainline inclusion
from mainline-v5.12-rc2
commit fdd3feb37e36bec2ad75d76f8ac4d0273c5c0a91
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596K9
CVE: NA

Intel-SIG: commit fdd3feb37e36 platform/x86: intel_pmt_crashlog: Add
dependency on MFD_INTEL_PMT.
Backport for intel PMT (Platform Monitoring Technology) support

--------------------------------

All devices that expose Intel Platform Monitoring Technology (PMT)
crashlog are currently owned by the intel_pmt MFD driver. Therefore make
the crashlog driver depend on the MFD driver for build.

Fixes: 5ef9998c96b0 ("platform/x86: Intel PMT Crashlog capability driver")
Signed-off-by: David E. Box <david.e.box@linux.intel.com>
Link: https://lore.kernel.org/r/20210126205508.30907-3-david.e.box@linux.intel.com
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/platform/x86/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
index 06484c5c5b8c..a24783aa52ea 100644
--- a/drivers/platform/x86/Kconfig
+++ b/drivers/platform/x86/Kconfig
@@ -1388,6 +1388,7 @@ config INTEL_PMT_TELEMETRY
 
 config INTEL_PMT_CRASHLOG
 	tristate "Intel Platform Monitoring Technology (PMT) Crashlog driver"
+	depends on MFD_INTEL_PMT
 	select INTEL_PMT_CLASS
 	help
 	  The Intel Platform Monitoring Technology (PMT) crashlog driver provides
-- 
Gitee


From fac7d1185d749969a83168368b5a0867b2d0f81c Mon Sep 17 00:00:00 2001
From: "David E. Box" <david.e.box@linux.intel.com>
Date: Wed, 24 Feb 2021 12:10:04 -0800
Subject: [PATCH 058/674] mfd: intel_pmt: Fix nuisance messages and handling of
 disabled capabilities

mainline inclusion
from mainline-v5.13-rc1
commit a1a5c1c3df282dc122508a17500317266ef19e46
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596K9
CVE: NA

Intel-SIG: commit a1a5c1c3df28 mfd: intel_pmt: Fix nuisance messages and
handling of disabled capabilities.
Backport for intel PMT (Platform Monitoring Technology) support

--------------------------------

Some products will be available that have PMT capabilities that are not
supported. Remove the warnings in this instance to avoid nuisance messages
and confusion.

Also return an error code for capabilities that are disabled by quirk to
prevent them from keeping the driver loaded if only disabled capabilities
are found.

Fixes: 4f8217d5b0ca ("mfd: Intel Platform Monitoring Technology support")
Signed-off-by: David E. Box <david.e.box@linux.intel.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/mfd/intel_pmt.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/drivers/mfd/intel_pmt.c b/drivers/mfd/intel_pmt.c
index 744b230cdcca..65da2b17a204 100644
--- a/drivers/mfd/intel_pmt.c
+++ b/drivers/mfd/intel_pmt.c
@@ -79,19 +79,18 @@ static int pmt_add_dev(struct pci_dev *pdev, struct intel_dvsec_header *header,
 	case DVSEC_INTEL_ID_WATCHER:
 		if (quirks & PMT_QUIRK_NO_WATCHER) {
 			dev_info(dev, "Watcher not supported\n");
-			return 0;
+			return -EINVAL;
 		}
 		name = "pmt_watcher";
 		break;
 	case DVSEC_INTEL_ID_CRASHLOG:
 		if (quirks & PMT_QUIRK_NO_CRASHLOG) {
 			dev_info(dev, "Crashlog not supported\n");
-			return 0;
+			return -EINVAL;
 		}
 		name = "pmt_crashlog";
 		break;
 	default:
-		dev_err(dev, "Unrecognized PMT capability: %d\n", id);
 		return -EINVAL;
 	}
 
@@ -174,12 +173,8 @@ static int pmt_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 		header.offset = INTEL_DVSEC_TABLE_OFFSET(table);
 
 		ret = pmt_add_dev(pdev, &header, quirks);
-		if (ret) {
-			dev_warn(&pdev->dev,
-				 "Failed to add device for DVSEC id %d\n",
-				 header.id);
+		if (ret)
 			continue;
-		}
 
 		found_devices = true;
 	} while (true);
-- 
Gitee


From ad0c48149868e76d0c3c4cc023ce3ed2efda63d1 Mon Sep 17 00:00:00 2001
From: "David E. Box" <david.e.box@linux.intel.com>
Date: Wed, 24 Feb 2021 12:10:05 -0800
Subject: [PATCH 059/674] mfd: intel_pmt: Add support for DG1

mainline inclusion
from mainline-v5.13-rc1
commit aa47ad3f853ae72c32b7e46dfc8bc2c8dc2dbad7
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596K9
CVE: NA

Intel-SIG: commit aa47ad3f853a mfd: intel_pmt: Add support for DG1.
Backport for intel PMT (Platform Monitoring Technology) support

--------------------------------

Adds PMT Telemetry aggregator support for the DG1 graphics PCIe card. The
device does not have the DVSEC region in its PCI config space so hard
code the discovery table data in the driver. Also requires a fix for DG1
in the Telemetry driver for how the ACCESS_TYPE field is used.

Signed-off-by: David E. Box <david.e.box@linux.intel.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/mfd/intel_pmt.c                    | 101 +++++++++++++++------
 drivers/platform/x86/intel_pmt_class.c     |  46 ++++++++++
 drivers/platform/x86/intel_pmt_class.h     |   1 +
 drivers/platform/x86/intel_pmt_telemetry.c |  20 ----
 4 files changed, 119 insertions(+), 49 deletions(-)

diff --git a/drivers/mfd/intel_pmt.c b/drivers/mfd/intel_pmt.c
index 65da2b17a204..dd7eb614c28e 100644
--- a/drivers/mfd/intel_pmt.c
+++ b/drivers/mfd/intel_pmt.c
@@ -49,10 +49,14 @@ enum pmt_quirks {
 
 	/* Use shift instead of mask to read discovery table offset */
 	PMT_QUIRK_TABLE_SHIFT	= BIT(2),
+
+	/* DVSEC not present (provided in driver data) */
+	PMT_QUIRK_NO_DVSEC	= BIT(3),
 };
 
 struct pmt_platform_info {
 	unsigned long quirks;
+	struct intel_dvsec_header **capabilities;
 };
 
 static const struct pmt_platform_info tgl_info = {
@@ -60,6 +64,26 @@ static const struct pmt_platform_info tgl_info = {
 		  PMT_QUIRK_TABLE_SHIFT,
 };
 
+/* DG1 Platform with DVSEC quirk*/
+static struct intel_dvsec_header dg1_telemetry = {
+	.length = 0x10,
+	.id = 2,
+	.num_entries = 1,
+	.entry_size = 3,
+	.tbir = 0,
+	.offset = 0x466000,
+};
+
+static struct intel_dvsec_header *dg1_capabilities[] = {
+	&dg1_telemetry,
+	NULL
+};
+
+static const struct pmt_platform_info dg1_info = {
+	.quirks = PMT_QUIRK_NO_DVSEC,
+	.capabilities = dg1_capabilities,
+};
+
 static int pmt_add_dev(struct pci_dev *pdev, struct intel_dvsec_header *header,
 		       unsigned long quirks)
 {
@@ -147,37 +171,54 @@ static int pmt_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (info)
 		quirks = info->quirks;
 
-	do {
-		struct intel_dvsec_header header;
-		u32 table;
-		u16 vid;
+	if (info && (info->quirks & PMT_QUIRK_NO_DVSEC)) {
+		struct intel_dvsec_header **header;
 
-		pos = pci_find_next_ext_capability(pdev, pos, PCI_EXT_CAP_ID_DVSEC);
-		if (!pos)
-			break;
+		header = info->capabilities;
+		while (*header) {
+			ret = pmt_add_dev(pdev, *header, quirks);
+			if (ret)
+				dev_warn(&pdev->dev,
+					 "Failed to add device for DVSEC id %d\n",
+					 (*header)->id);
+			else
+				found_devices = true;
 
-		pci_read_config_word(pdev, pos + PCI_DVSEC_HEADER1, &vid);
-		if (vid != PCI_VENDOR_ID_INTEL)
-			continue;
-
-		pci_read_config_word(pdev, pos + PCI_DVSEC_HEADER2,
-				     &header.id);
-		pci_read_config_byte(pdev, pos + INTEL_DVSEC_ENTRIES,
-				     &header.num_entries);
-		pci_read_config_byte(pdev, pos + INTEL_DVSEC_SIZE,
-				     &header.entry_size);
-		pci_read_config_dword(pdev, pos + INTEL_DVSEC_TABLE,
-				      &table);
-
-		header.tbir = INTEL_DVSEC_TABLE_BAR(table);
-		header.offset = INTEL_DVSEC_TABLE_OFFSET(table);
-
-		ret = pmt_add_dev(pdev, &header, quirks);
-		if (ret)
-			continue;
-
-		found_devices = true;
-	} while (true);
+			++header;
+		}
+	} else {
+		do {
+			struct intel_dvsec_header header;
+			u32 table;
+			u16 vid;
+
+			pos = pci_find_next_ext_capability(pdev, pos, PCI_EXT_CAP_ID_DVSEC);
+			if (!pos)
+				break;
+
+			pci_read_config_word(pdev, pos + PCI_DVSEC_HEADER1, &vid);
+			if (vid != PCI_VENDOR_ID_INTEL)
+				continue;
+
+			pci_read_config_word(pdev, pos + PCI_DVSEC_HEADER2,
+					     &header.id);
+			pci_read_config_byte(pdev, pos + INTEL_DVSEC_ENTRIES,
+					     &header.num_entries);
+			pci_read_config_byte(pdev, pos + INTEL_DVSEC_SIZE,
+					     &header.entry_size);
+			pci_read_config_dword(pdev, pos + INTEL_DVSEC_TABLE,
+					      &table);
+
+			header.tbir = INTEL_DVSEC_TABLE_BAR(table);
+			header.offset = INTEL_DVSEC_TABLE_OFFSET(table);
+
+			ret = pmt_add_dev(pdev, &header, quirks);
+			if (ret)
+				continue;
+
+			found_devices = true;
+		} while (true);
+	}
 
 	if (!found_devices)
 		return -ENODEV;
@@ -195,10 +236,12 @@ static void pmt_pci_remove(struct pci_dev *pdev)
 }
 
 #define PCI_DEVICE_ID_INTEL_PMT_ADL	0x467d
+#define PCI_DEVICE_ID_INTEL_PMT_DG1	0x490e
 #define PCI_DEVICE_ID_INTEL_PMT_OOBMSM	0x09a7
 #define PCI_DEVICE_ID_INTEL_PMT_TGL	0x9a0d
 static const struct pci_device_id pmt_pci_ids[] = {
 	{ PCI_DEVICE_DATA(INTEL, PMT_ADL, &tgl_info) },
+	{ PCI_DEVICE_DATA(INTEL, PMT_DG1, &dg1_info) },
 	{ PCI_DEVICE_DATA(INTEL, PMT_OOBMSM, NULL) },
 	{ PCI_DEVICE_DATA(INTEL, PMT_TGL, &tgl_info) },
 	{ }
diff --git a/drivers/platform/x86/intel_pmt_class.c b/drivers/platform/x86/intel_pmt_class.c
index c8939fba4509..228e21f1ce5c 100644
--- a/drivers/platform/x86/intel_pmt_class.c
+++ b/drivers/platform/x86/intel_pmt_class.c
@@ -19,6 +19,28 @@
 #define PMT_XA_MAX		INT_MAX
 #define PMT_XA_LIMIT		XA_LIMIT(PMT_XA_START, PMT_XA_MAX)
 
+/*
+ * Early implementations of PMT on client platforms have some
+ * differences from the server platforms (which use the Out Of Band
+ * Management Services Module OOBMSM). This list tracks those
+ * platforms as needed to handle those differences. Newer client
+ * platforms are expected to be fully compatible with server.
+ */
+static const struct pci_device_id pmt_telem_early_client_pci_ids[] = {
+	{ PCI_VDEVICE(INTEL, 0x467d) }, /* ADL */
+	{ PCI_VDEVICE(INTEL, 0x490e) }, /* DG1 */
+	{ PCI_VDEVICE(INTEL, 0x9a0d) }, /* TGL */
+	{ }
+};
+
+bool intel_pmt_is_early_client_hw(struct device *dev)
+{
+	struct pci_dev *parent = to_pci_dev(dev->parent);
+
+	return !!pci_match_id(pmt_telem_early_client_pci_ids, parent);
+}
+EXPORT_SYMBOL_GPL(intel_pmt_is_early_client_hw);
+
 /*
  * sysfs
  */
@@ -147,6 +169,30 @@ static int intel_pmt_populate_entry(struct intel_pmt_entry *entry,
 		 * base address = end of discovery region + base offset
 		 */
 		entry->base_addr = disc_res->end + 1 + header->base_offset;
+
+		/*
+		 * Some hardware use a different calculation for the base address
+		 * when access_type == ACCESS_LOCAL. On the these systems
+		 * ACCCESS_LOCAL refers to an address in the same BAR as the
+		 * header but at a fixed offset. But as the header address was
+		 * supplied to the driver, we don't know which BAR it was in.
+		 * So search for the bar whose range includes the header address.
+		 */
+		if (intel_pmt_is_early_client_hw(dev)) {
+			int i;
+
+			entry->base_addr = 0;
+			for (i = 0; i < 6; i++)
+				if (disc_res->start >= pci_resource_start(pci_dev, i) &&
+				   (disc_res->start <= pci_resource_end(pci_dev, i))) {
+					entry->base_addr = pci_resource_start(pci_dev, i) +
+							   header->base_offset;
+					break;
+				}
+			if (!entry->base_addr)
+				return -EINVAL;
+		}
+
 		break;
 	case ACCESS_BARID:
 		/*
diff --git a/drivers/platform/x86/intel_pmt_class.h b/drivers/platform/x86/intel_pmt_class.h
index de8f8139ba31..1337019c2873 100644
--- a/drivers/platform/x86/intel_pmt_class.h
+++ b/drivers/platform/x86/intel_pmt_class.h
@@ -44,6 +44,7 @@ struct intel_pmt_namespace {
 				 struct device *dev);
 };
 
+bool intel_pmt_is_early_client_hw(struct device *dev);
 int intel_pmt_dev_create(struct intel_pmt_entry *entry,
 			 struct intel_pmt_namespace *ns,
 			 struct platform_device *pdev, int idx);
diff --git a/drivers/platform/x86/intel_pmt_telemetry.c b/drivers/platform/x86/intel_pmt_telemetry.c
index f8a87614efa4..9b95ef050457 100644
--- a/drivers/platform/x86/intel_pmt_telemetry.c
+++ b/drivers/platform/x86/intel_pmt_telemetry.c
@@ -34,26 +34,6 @@ struct pmt_telem_priv {
 	struct intel_pmt_entry		entry[];
 };
 
-/*
- * Early implementations of PMT on client platforms have some
- * differences from the server platforms (which use the Out Of Band
- * Management Services Module OOBMSM). This list tracks those
- * platforms as needed to handle those differences. Newer client
- * platforms are expected to be fully compatible with server.
- */
-static const struct pci_device_id pmt_telem_early_client_pci_ids[] = {
-	{ PCI_VDEVICE(INTEL, 0x9a0d) }, /* TGL */
-	{ PCI_VDEVICE(INTEL, 0x467d) }, /* ADL */
-	{ }
-};
-
-static bool intel_pmt_is_early_client_hw(struct device *dev)
-{
-	struct pci_dev *parent = to_pci_dev(dev->parent);
-
-	return !!pci_match_id(pmt_telem_early_client_pci_ids, parent);
-}
-
 static bool pmt_telem_region_overlaps(struct intel_pmt_entry *entry,
 				      struct device *dev)
 {
-- 
Gitee


From 9a2f0041975b5654471cd8f58c5446b67d14e099 Mon Sep 17 00:00:00 2001
From: "David E. Box" <david.e.box@linux.intel.com>
Date: Tue, 16 Mar 2021 19:44:54 -0700
Subject: [PATCH 060/674] platform/x86: intel_pmt_class: Initial resource to 0

mainline inclusion
from mainline-v5.13-rc1
commit 501bb68a66cfc0bc2a2458483400cb49daca974f
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596K9
CVE: NA

Intel-SIG: commit 501bb68a66cf platform/x86: intel_pmt_class: Initial resource
to 0.
Backport for intel PMT (Platform Monitoring Technology) support

--------------------------------

Initialize the struct resource in intel_pmt_dev_register to zero to avoid a
fault should the char *name field be non-zero.

Signed-off-by: David E. Box <david.e.box@linux.intel.com>
Link: https://lore.kernel.org/r/20210317024455.3071477-1-david.e.box@linux.intel.com
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/platform/x86/intel_pmt_class.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/platform/x86/intel_pmt_class.c b/drivers/platform/x86/intel_pmt_class.c
index 228e21f1ce5c..c86ff15b1ed5 100644
--- a/drivers/platform/x86/intel_pmt_class.c
+++ b/drivers/platform/x86/intel_pmt_class.c
@@ -219,7 +219,7 @@ static int intel_pmt_dev_register(struct intel_pmt_entry *entry,
 				  struct intel_pmt_namespace *ns,
 				  struct device *parent)
 {
-	struct resource res;
+	struct resource res = {0};
 	struct device *dev;
 	int ret;
 
-- 
Gitee


From 38603f41c16a1086ed099a84fa05815a9e872a0f Mon Sep 17 00:00:00 2001
From: "David E. Box" <david.e.box@linux.intel.com>
Date: Tue, 16 Mar 2021 19:44:55 -0700
Subject: [PATCH 061/674] platform/x86: intel_pmt_crashlog: Fix incorrect
 macros

mainline inclusion
from mainline-v5.12-rc5
commit 10c931cdfe64ebc38a15a485dd794915044f2111
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596K9
CVE: NA

Intel-SIG: commit 10c931cdfe64 platform/x86: intel_pmt_crashlog: Fix
incorrect macros.
Backport for intel PMT (Platform Monitoring Technology) support

--------------------------------

Fixes off-by-one bugs in the macro assignments for the crashlog control
bits. Was initially tested on emulation but bug revealed after testing on
silicon.

Fixes: 5ef9998c96b0 ("platform/x86: Intel PMT Crashlog capability driver")
Signed-off-by: David E. Box <david.e.box@linux.intel.com>
Link: https://lore.kernel.org/r/20210317024455.3071477-2-david.e.box@linux.intel.com
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/platform/x86/intel_pmt_crashlog.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/drivers/platform/x86/intel_pmt_crashlog.c b/drivers/platform/x86/intel_pmt_crashlog.c
index 97dd749c8290..92d315a16cfd 100644
--- a/drivers/platform/x86/intel_pmt_crashlog.c
+++ b/drivers/platform/x86/intel_pmt_crashlog.c
@@ -23,18 +23,17 @@
 #define CRASH_TYPE_OOBMSM	1
 
 /* Control Flags */
-#define CRASHLOG_FLAG_DISABLE		BIT(27)
+#define CRASHLOG_FLAG_DISABLE		BIT(28)
 
 /*
- * Bits 28 and 29 control the state of bit 31.
+ * Bits 29 and 30 control the state of bit 31.
  *
- * Bit 28 will clear bit 31, if set, allowing a new crashlog to be captured.
- * Bit 29 will immediately trigger a crashlog to be generated, setting bit 31.
- * Bit 30 is read-only and reserved as 0.
+ * Bit 29 will clear bit 31, if set, allowing a new crashlog to be captured.
+ * Bit 30 will immediately trigger a crashlog to be generated, setting bit 31.
  * Bit 31 is the read-only status with a 1 indicating log is complete.
  */
-#define CRASHLOG_FLAG_TRIGGER_CLEAR	BIT(28)
-#define CRASHLOG_FLAG_TRIGGER_EXECUTE	BIT(29)
+#define CRASHLOG_FLAG_TRIGGER_CLEAR	BIT(29)
+#define CRASHLOG_FLAG_TRIGGER_EXECUTE	BIT(30)
 #define CRASHLOG_FLAG_TRIGGER_COMPLETE	BIT(31)
 #define CRASHLOG_FLAG_TRIGGER_MASK	GENMASK(31, 28)
 
-- 
Gitee


From e69b71e05fbbf874203483132d39e7c72dfba6e5 Mon Sep 17 00:00:00 2001
From: Rikard Falkeborn <rikard.falkeborn@gmail.com>
Date: Sat, 5 Jun 2021 22:38:05 +0200
Subject: [PATCH 062/674] platform/x86: intel_pmt_crashlog: Constify static
 attribute_group struct

mainline inclusion
from mainline-v5.14-rc1
commit d24023e375704860c6c8b91c3af3034669aa1bc5
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596K9
CVE: NA

Intel-SIG: commit d24023e37570 platform/x86: intel_pmt_crashlog: Constify
static attribute_group struct.
Backport for intel PMT (Platform Monitoring Technology) support

--------------------------------

The only use of pmt_crashlog_group is to assign its address to the
attr_grp field in the intel_pmt_namespace struct, which is a pointer to
const attribute_group. Make it const to allow the compiler to put it in
read-only memory.

Signed-off-by: Rikard Falkeborn <rikard.falkeborn@gmail.com>
Link: https://lore.kernel.org/r/20210605203807.60547-3-rikard.falkeborn@gmail.com
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/platform/x86/intel_pmt_crashlog.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/platform/x86/intel_pmt_crashlog.c b/drivers/platform/x86/intel_pmt_crashlog.c
index 92d315a16cfd..56963ceb6345 100644
--- a/drivers/platform/x86/intel_pmt_crashlog.c
+++ b/drivers/platform/x86/intel_pmt_crashlog.c
@@ -218,7 +218,7 @@ static struct attribute *pmt_crashlog_attrs[] = {
 	NULL
 };
 
-static struct attribute_group pmt_crashlog_group = {
+static const struct attribute_group pmt_crashlog_group = {
 	.attrs	= pmt_crashlog_attrs,
 };
 
-- 
Gitee


From 21db19cf4be15474deadabe6160688a0371e9f7b Mon Sep 17 00:00:00 2001
From: "David E. Box" <david.e.box@linux.intel.com>
Date: Tue, 17 Aug 2021 15:40:17 -0700
Subject: [PATCH 063/674] platform/x86: intel_pmt_telemetry: Ignore zero sized
 entries

mainline inclusion
from mainline-v5.15-rc1
commit ef195e8a7f43924b9979b2cd81ac7fa54f24bb3c
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596K9
CVE: NA

Intel-SIG: commit ef195e8a7f43 platform/x86: intel_pmt_telemetry: Ignore zero
sized entries.
Backport for intel PMT (Platform Monitoring Technology) support

--------------------------------

Some devices may expose non-functioning entries that are reserved for
future use. These entries have zero size. Ignore them during probe.

Signed-off-by: David E. Box <david.e.box@linux.intel.com>
Link: https://lore.kernel.org/r/20210817224018.1013192-5-david.e.box@linux.intel.com
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/platform/x86/intel_pmt_telemetry.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/platform/x86/intel_pmt_telemetry.c b/drivers/platform/x86/intel_pmt_telemetry.c
index 9b95ef050457..9f845e70a1f8 100644
--- a/drivers/platform/x86/intel_pmt_telemetry.c
+++ b/drivers/platform/x86/intel_pmt_telemetry.c
@@ -61,6 +61,14 @@ static int pmt_telem_header_decode(struct intel_pmt_entry *entry,
 	/* Size is measured in DWORDS, but accessor returns bytes */
 	header->size = TELEM_SIZE(readl(disc_table));
 
+	/*
+	 * Some devices may expose non-functioning entries that are
+	 * reserved for future use. They have zero size. Do not fail
+	 * probe for these. Just ignore them.
+	 */
+	if (header->size == 0)
+		return 1;
+
 	return 0;
 }
 
-- 
Gitee


From b778ba5c868584a960628901482978560de43fd4 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Thu, 7 Jan 2021 13:23:34 +0100
Subject: [PATCH 064/674] x86/mce: Get rid of mcheck_intel_therm_init()

mainline inclusion
from mainline-5.12
commit 4f432e8bb15b352da72525144da025a46695968f
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5DSOL

Intel_SIG: commit 4f432e8bb15b x86/mce: Get rid of
mcheck_intel_therm_init().
Backport for Intel HFI (Hardware Feedback Interface) support

-------------------------------------

Move the APIC_LVTTHMR read which needs to happen on the BSP, to
intel_init_thermal(). One less boot dependency.

No functional changes.

Signed-off-by: Borislav Petkov <bp@suse.de>
Tested-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Link: https://lkml.kernel.org/r/20210201142704.12495-2-bp@alien8.de
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 arch/x86/include/asm/mce.h            |  6 ------
 arch/x86/kernel/cpu/mce/core.c        |  1 -
 arch/x86/kernel/cpu/mce/therm_throt.c | 15 ++++-----------
 3 files changed, 4 insertions(+), 18 deletions(-)

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 9b5ff423e939..4feed45f0438 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -299,12 +299,6 @@ extern int (*platform_thermal_package_notify)(__u64 msr_val);
  * callback has rate control */
 extern bool (*platform_thermal_package_rate_control)(void);
 
-#ifdef CONFIG_X86_THERMAL_VECTOR
-extern void mcheck_intel_therm_init(void);
-#else
-static inline void mcheck_intel_therm_init(void) { }
-#endif
-
 /*
  * Used by APEI to report memory error via /dev/mcelog
  */
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 0ced8b250090..ed49a4abd20e 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -2231,7 +2231,6 @@ __setup("mce", mcheck_enable);
 
 int __init mcheck_init(void)
 {
-	mcheck_intel_therm_init();
 	mce_register_decode_chain(&early_nb);
 	mce_register_decode_chain(&mce_uc_nb);
 	mce_register_decode_chain(&mce_default_nb);
diff --git a/arch/x86/kernel/cpu/mce/therm_throt.c b/arch/x86/kernel/cpu/mce/therm_throt.c
index a7cd2d203ced..5b15d7cef1d1 100644
--- a/arch/x86/kernel/cpu/mce/therm_throt.c
+++ b/arch/x86/kernel/cpu/mce/therm_throt.c
@@ -633,17 +633,6 @@ static int intel_thermal_supported(struct cpuinfo_x86 *c)
 	return 1;
 }
 
-void __init mcheck_intel_therm_init(void)
-{
-	/*
-	 * This function is only called on boot CPU. Save the init thermal
-	 * LVT value on BSP and use that value to restore APs' thermal LVT
-	 * entry BIOS programmed later
-	 */
-	if (intel_thermal_supported(&boot_cpu_data))
-		lvtthmr_init = apic_read(APIC_LVTTHMR);
-}
-
 void intel_init_thermal(struct cpuinfo_x86 *c)
 {
 	unsigned int cpu = smp_processor_id();
@@ -653,6 +642,10 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
 	if (!intel_thermal_supported(c))
 		return;
 
+	/* On the BSP? */
+	if (c == &boot_cpu_data)
+		lvtthmr_init = apic_read(APIC_LVTTHMR);
+
 	/*
 	 * First check if its enabled already, in which case there might
 	 * be some SMM goo which handles it, so we can't even put a handler
-- 
Gitee


From 3042f430fab61ead14942ce4a39be12834f08a78 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Thu, 7 Jan 2021 13:29:05 +0100
Subject: [PATCH 065/674] thermal: Move therm_throt there from x86/mce

mainline inclusion
from mainline-5.12
commit 9223d0dccb8f8523754122f68316dd1a4f39f7f8
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5DSOL

Intel_SIG: commit 9223d0dccb8f thermal: Move therm_throt there from
x86/mce.
Backport for Intel HFI (Hardware Feedback Interface) support

-------------------------------------

This functionality has nothing to do with MCE, move it to the thermal
framework and untangle it from MCE.

Requested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Tested-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Link: https://lkml.kernel.org/r/20210202121003.GD18075@zn.tnic
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 arch/x86/Kconfig                              |  4 ---
 arch/x86/include/asm/mce.h                    | 16 ----------
 arch/x86/include/asm/thermal.h                | 13 ++++++++
 arch/x86/kernel/cpu/intel.c                   |  3 ++
 arch/x86/kernel/cpu/mce/Makefile              |  2 --
 arch/x86/kernel/cpu/mce/intel.c               |  1 -
 arch/x86/kernel/irq.c                         | 21 ++++++++++++
 drivers/thermal/intel/Kconfig                 |  4 +++
 drivers/thermal/intel/Makefile                |  1 +
 .../thermal/intel}/therm_throt.c              | 32 ++++++-------------
 drivers/thermal/intel/thermal_interrupt.h     | 15 +++++++++
 drivers/thermal/intel/x86_pkg_temp_thermal.c  |  4 +--
 12 files changed, 68 insertions(+), 48 deletions(-)
 create mode 100644 arch/x86/include/asm/thermal.h
 rename {arch/x86/kernel/cpu/mce => drivers/thermal/intel}/therm_throt.c (97%)
 create mode 100644 drivers/thermal/intel/thermal_interrupt.h

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 040fb77366dd..16683d531a00 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1164,10 +1164,6 @@ config X86_MCE_INJECT
 	  If you don't know what a machine check is and you don't do kernel
 	  QA it is safe to say n.
 
-config X86_THERMAL_VECTOR
-	def_bool y
-	depends on X86_MCE_INTEL
-
 source "arch/x86/events/Kconfig"
 
 config X86_LEGACY_VM86
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 4feed45f0438..28c112695e92 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -283,22 +283,6 @@ extern void (*mce_threshold_vector)(void);
 /* Deferred error interrupt handler */
 extern void (*deferred_error_int_vector)(void);
 
-/*
- * Thermal handler
- */
-
-void intel_init_thermal(struct cpuinfo_x86 *c);
-
-/* Interrupt Handler for core thermal thresholds */
-extern int (*platform_thermal_notify)(__u64 msr_val);
-
-/* Interrupt Handler for package thermal thresholds */
-extern int (*platform_thermal_package_notify)(__u64 msr_val);
-
-/* Callback support of rate control, return true, if
- * callback has rate control */
-extern bool (*platform_thermal_package_rate_control)(void);
-
 /*
  * Used by APEI to report memory error via /dev/mcelog
  */
diff --git a/arch/x86/include/asm/thermal.h b/arch/x86/include/asm/thermal.h
new file mode 100644
index 000000000000..ddbdefd5b94f
--- /dev/null
+++ b/arch/x86/include/asm/thermal.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_THERMAL_H
+#define _ASM_X86_THERMAL_H
+
+#ifdef CONFIG_X86_THERMAL_VECTOR
+void intel_init_thermal(struct cpuinfo_x86 *c);
+bool x86_thermal_enabled(void);
+void intel_thermal_interrupt(void);
+#else
+static inline void intel_init_thermal(struct cpuinfo_x86 *c) { }
+#endif
+
+#endif /* _ASM_X86_THERMAL_H */
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 816fdbec795a..0e422a544835 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -24,6 +24,7 @@
 #include <asm/traps.h>
 #include <asm/resctrl.h>
 #include <asm/numa.h>
+#include <asm/thermal.h>
 
 #ifdef CONFIG_X86_64
 #include <linux/topology.h>
@@ -719,6 +720,8 @@ static void init_intel(struct cpuinfo_x86 *c)
 		tsx_disable();
 
 	split_lock_init();
+
+	intel_init_thermal(c);
 }
 
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/mce/Makefile b/arch/x86/kernel/cpu/mce/Makefile
index 9f020c994154..015856abdbb1 100644
--- a/arch/x86/kernel/cpu/mce/Makefile
+++ b/arch/x86/kernel/cpu/mce/Makefile
@@ -9,8 +9,6 @@ obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
 mce-inject-y			:= inject.o
 obj-$(CONFIG_X86_MCE_INJECT)	+= mce-inject.o
 
-obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o
-
 obj-$(CONFIG_ACPI_APEI)		+= apei.o
 
 obj-$(CONFIG_X86_MCELOG_LEGACY)	+= dev-mcelog.o
diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
index 24d45a1e214c..1adf67e0fcba 100644
--- a/arch/x86/kernel/cpu/mce/intel.c
+++ b/arch/x86/kernel/cpu/mce/intel.c
@@ -514,7 +514,6 @@ static void intel_ppin_init(struct cpuinfo_x86 *c)
 
 void mce_intel_feature_init(struct cpuinfo_x86 *c)
 {
-	intel_init_thermal(c);
 	intel_init_cmci();
 	intel_init_lmce();
 	intel_ppin_init(c);
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index ce904c89c6c7..cb23373bffa8 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -21,6 +21,7 @@
 #include <asm/hw_irq.h>
 #include <asm/desc.h>
 #include <asm/traps.h>
+#include <asm/thermal.h>
 
 #define CREATE_TRACE_POINTS
 #include <asm/trace/irq_vectors.h>
@@ -376,3 +377,23 @@ void fixup_irqs(void)
 	}
 }
 #endif
+
+#ifdef CONFIG_X86_THERMAL_VECTOR
+static void smp_thermal_vector(void)
+{
+	if (x86_thermal_enabled())
+		intel_thermal_interrupt();
+	else
+		pr_err("CPU%d: Unexpected LVT thermal interrupt!\n",
+		       smp_processor_id());
+}
+
+DEFINE_IDTENTRY_SYSVEC(sysvec_thermal)
+{
+	trace_thermal_apic_entry(THERMAL_APIC_VECTOR);
+	inc_irq_stat(irq_thermal_count);
+	smp_thermal_vector();
+	trace_thermal_apic_exit(THERMAL_APIC_VECTOR);
+	ack_APIC_irq();
+}
+#endif
diff --git a/drivers/thermal/intel/Kconfig b/drivers/thermal/intel/Kconfig
index 8025b21f43fa..ce4f59213c7a 100644
--- a/drivers/thermal/intel/Kconfig
+++ b/drivers/thermal/intel/Kconfig
@@ -8,6 +8,10 @@ config INTEL_POWERCLAMP
 	  enforce idle time which results in more package C-state residency. The
 	  user interface is exposed via generic thermal framework.
 
+config X86_THERMAL_VECTOR
+	def_bool y
+	depends on X86 && CPU_SUP_INTEL && X86_LOCAL_APIC
+
 config X86_PKG_TEMP_THERMAL
 	tristate "X86 package temperature thermal driver"
 	depends on X86_THERMAL_VECTOR
diff --git a/drivers/thermal/intel/Makefile b/drivers/thermal/intel/Makefile
index 0d9736ced5d4..ff2ad30ef397 100644
--- a/drivers/thermal/intel/Makefile
+++ b/drivers/thermal/intel/Makefile
@@ -10,3 +10,4 @@ obj-$(CONFIG_INTEL_QUARK_DTS_THERMAL)	+= intel_quark_dts_thermal.o
 obj-$(CONFIG_INT340X_THERMAL)  += int340x_thermal/
 obj-$(CONFIG_INTEL_BXT_PMIC_THERMAL) += intel_bxt_pmic_thermal.o
 obj-$(CONFIG_INTEL_PCH_THERMAL)	+= intel_pch_thermal.o
+obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o
diff --git a/arch/x86/kernel/cpu/mce/therm_throt.c b/drivers/thermal/intel/therm_throt.c
similarity index 97%
rename from arch/x86/kernel/cpu/mce/therm_throt.c
rename to drivers/thermal/intel/therm_throt.c
index 5b15d7cef1d1..f8e882592ba5 100644
--- a/arch/x86/kernel/cpu/mce/therm_throt.c
+++ b/drivers/thermal/intel/therm_throt.c
@@ -26,13 +26,13 @@
 #include <linux/cpu.h>
 
 #include <asm/processor.h>
+#include <asm/thermal.h>
 #include <asm/traps.h>
 #include <asm/apic.h>
-#include <asm/mce.h>
+#include <asm/irq.h>
 #include <asm/msr.h>
-#include <asm/trace/irq_vectors.h>
 
-#include "internal.h"
+#include "thermal_interrupt.h"
 
 /* How long to wait between reporting thermal events */
 #define CHECK_INTERVAL		(300 * HZ)
@@ -570,7 +570,7 @@ static void notify_thresholds(__u64 msr_val)
 }
 
 /* Thermal transition interrupt handler */
-static void intel_thermal_interrupt(void)
+void intel_thermal_interrupt(void)
 {
 	__u64 msr_val;
 
@@ -606,23 +606,6 @@ static void intel_thermal_interrupt(void)
 	}
 }
 
-static void unexpected_thermal_interrupt(void)
-{
-	pr_err("CPU%d: Unexpected LVT thermal interrupt!\n",
-		smp_processor_id());
-}
-
-static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;
-
-DEFINE_IDTENTRY_SYSVEC(sysvec_thermal)
-{
-	trace_thermal_apic_entry(THERMAL_APIC_VECTOR);
-	inc_irq_stat(irq_thermal_count);
-	smp_thermal_vector();
-	trace_thermal_apic_exit(THERMAL_APIC_VECTOR);
-	ack_APIC_irq();
-}
-
 /* Thermal monitoring depends on APIC, ACPI and clock modulation */
 static int intel_thermal_supported(struct cpuinfo_x86 *c)
 {
@@ -633,6 +616,11 @@ static int intel_thermal_supported(struct cpuinfo_x86 *c)
 	return 1;
 }
 
+bool x86_thermal_enabled(void)
+{
+	return atomic_read(&therm_throt_en);
+}
+
 void intel_init_thermal(struct cpuinfo_x86 *c)
 {
 	unsigned int cpu = smp_processor_id();
@@ -719,8 +707,6 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
 				| PACKAGE_THERM_INT_HIGH_ENABLE), h);
 	}
 
-	smp_thermal_vector = intel_thermal_interrupt;
-
 	rdmsr(MSR_IA32_MISC_ENABLE, l, h);
 	wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
 
diff --git a/drivers/thermal/intel/thermal_interrupt.h b/drivers/thermal/intel/thermal_interrupt.h
new file mode 100644
index 000000000000..53f427bb58dc
--- /dev/null
+++ b/drivers/thermal/intel/thermal_interrupt.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _INTEL_THERMAL_INTERRUPT_H
+#define _INTEL_THERMAL_INTERRUPT_H
+
+/* Interrupt Handler for package thermal thresholds */
+extern int (*platform_thermal_package_notify)(__u64 msr_val);
+
+/* Interrupt Handler for core thermal thresholds */
+extern int (*platform_thermal_notify)(__u64 msr_val);
+
+/* Callback support of rate control, return true, if
+ * callback has rate control */
+extern bool (*platform_thermal_package_rate_control)(void);
+
+#endif /* _INTEL_THERMAL_INTERRUPT_H */
diff --git a/drivers/thermal/intel/x86_pkg_temp_thermal.c b/drivers/thermal/intel/x86_pkg_temp_thermal.c
index 4f5d97329ee3..e734929b2cd0 100644
--- a/drivers/thermal/intel/x86_pkg_temp_thermal.c
+++ b/drivers/thermal/intel/x86_pkg_temp_thermal.c
@@ -17,9 +17,9 @@
 #include <linux/pm.h>
 #include <linux/thermal.h>
 #include <linux/debugfs.h>
-#include <asm/cpu_device_id.h>
-#include <asm/mce.h>
 
+#include <asm/cpu_device_id.h>
+#include "thermal_interrupt.h"
 /*
 * Rate control delay: Idea is to introduce denounce effect
 * This should be long enough to avoid reduce events, when
-- 
Gitee


From 967320d3fae4cafac795c7b457eff40953eb6d94 Mon Sep 17 00:00:00 2001
From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Date: Thu, 27 Jan 2022 11:34:49 -0800
Subject: [PATCH 066/674] x86/cpu: Add definitions for the Intel Hardware
 Feedback Interface

mainline inclusion
from mainline-5.18
commit 7b8f40b3de75c971a4e5f9308b06deb59118dbac
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5DSOL

Intel_SIG: commit 7b8f40b3de75 x86/cpu: Add definitions for the Intel
Hardware Feedback Interface.
Backport for Intel HFI (Hardware Feedback Interface) support

-------------------------------------

Add the CPUID feature bit and the model-specific registers needed to
identify and configure the Intel Hardware Feedback Interface.

Acked-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 arch/x86/include/asm/cpufeatures.h | 1 +
 arch/x86/include/asm/msr-index.h   | 6 ++++++
 2 files changed, 7 insertions(+)

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 4da419226377..9e61ccbac3c0 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -321,6 +321,7 @@
 #define X86_FEATURE_HWP_ACT_WINDOW	(14*32+ 9) /* HWP Activity Window */
 #define X86_FEATURE_HWP_EPP		(14*32+10) /* HWP Energy Perf. Preference */
 #define X86_FEATURE_HWP_PKG_REQ		(14*32+11) /* HWP Package Level Request */
+#define X86_FEATURE_HFI			(14*32+19) /* Hardware Feedback Interface */
 
 /* AMD SVM Feature Identification, CPUID level 0x8000000a (EDX), word 15 */
 #define X86_FEATURE_NPT			(15*32+ 0) /* Nested Page Table support */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 2b0af5eb5131..2f0ca77d24bc 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -706,12 +706,14 @@
 
 #define PACKAGE_THERM_STATUS_PROCHOT		(1 << 0)
 #define PACKAGE_THERM_STATUS_POWER_LIMIT	(1 << 10)
+#define PACKAGE_THERM_STATUS_HFI_UPDATED	(1 << 26)
 
 #define MSR_IA32_PACKAGE_THERM_INTERRUPT	0x000001b2
 
 #define PACKAGE_THERM_INT_HIGH_ENABLE		(1 << 0)
 #define PACKAGE_THERM_INT_LOW_ENABLE		(1 << 1)
 #define PACKAGE_THERM_INT_PLN_ENABLE		(1 << 24)
+#define PACKAGE_THERM_INT_HFI_ENABLE		(1 << 25)
 
 /* Thermal Thresholds Support */
 #define THERM_INT_THRESHOLD0_ENABLE    (1 << 15)
@@ -956,4 +958,8 @@
 #define MSR_VM_IGNNE                    0xc0010115
 #define MSR_VM_HSAVE_PA                 0xc0010117
 
+/* Hardware Feedback Interface */
+#define MSR_IA32_HW_FEEDBACK_PTR        0x17d0
+#define MSR_IA32_HW_FEEDBACK_CONFIG     0x17d1
+
 #endif /* _ASM_X86_MSR_INDEX_H */
-- 
Gitee


From cd073e4711bf1e7a723c5d951c65395b44bde852 Mon Sep 17 00:00:00 2001
From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Date: Thu, 27 Jan 2022 11:34:50 -0800
Subject: [PATCH 067/674] thermal: intel: hfi: Minimally initialize the
 Hardware Feedback Interface

mainline inclusion
from mainline-5.18
commit 1cb19cabeb0e187b6c244d0da73d27f7432c40dc
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5DSOL

Intel_SIG: commit 1cb19cabeb0e thermal: intel: hfi: Minimally initialize
the Hardware Feedback Interface.
Backport for Intel HFI (Hardware Feedback Interface) support

-------------------------------------

The Intel Hardware Feedback Interface provides guidance to the operating
system about the performance and energy efficiency capabilities of each
CPU in the system. Capabilities are numbers between 0 and 255 where a
higher number represents a higher capability. For each CPU, energy
efficiency and performance are reported as separate capabilities.

Hardware computes these capabilities based on the operating conditions of
the system such as power and thermal limits. These capabilities are shared
with the operating system in a table resident in memory. Each package in
the system has its own HFI instance. Every logical CPU in the package is
represented in the table. More than one logical CPUs may be represented in
a single table entry. When the hardware updates the table, it generates a
package-level thermal interrupt.

The size and format of the HFI table depend on the supported features and
can only be determined at runtime. To minimally initialize the HFI, parse
its features and allocate one instance per package of a data structure with
the necessary parameters to read and navigate a local copy (i.e., owned by
the driver) of individual HFI tables.

A subsequent changeset will provide per-CPU initialization and interrupt
handling.

Reviewed-by: Len Brown <len.brown@intel.com>
Co-developed by: Aubrey Li <aubrey.li@linux.intel.com>
Signed-off-by: Aubrey Li <aubrey.li@linux.intel.com>
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/thermal/intel/Kconfig       |  12 ++
 drivers/thermal/intel/Makefile      |   1 +
 drivers/thermal/intel/intel_hfi.c   | 180 ++++++++++++++++++++++++++++
 drivers/thermal/intel/intel_hfi.h   |  11 ++
 drivers/thermal/intel/therm_throt.c |   3 +
 5 files changed, 207 insertions(+)
 create mode 100644 drivers/thermal/intel/intel_hfi.c
 create mode 100644 drivers/thermal/intel/intel_hfi.h

diff --git a/drivers/thermal/intel/Kconfig b/drivers/thermal/intel/Kconfig
index ce4f59213c7a..26aea864aaca 100644
--- a/drivers/thermal/intel/Kconfig
+++ b/drivers/thermal/intel/Kconfig
@@ -79,3 +79,15 @@ config INTEL_PCH_THERMAL
 	  Enable this to support thermal reporting on certain intel PCHs.
 	  Thermal reporting device will provide temperature reading,
 	  programmable trip points and other information.
+
+config INTEL_HFI_THERMAL
+	bool "Intel Hardware Feedback Interface"
+	depends on CPU_SUP_INTEL
+	depends on X86_THERMAL_VECTOR
+	help
+	  Select this option to enable the Hardware Feedback Interface. If
+	  selected, hardware provides guidance to the operating system on
+	  the performance and energy efficiency capabilities of each CPU.
+	  These capabilities may change as a result of changes in the operating
+	  conditions of the system such power and thermal limits. If selected,
+	  the kernel relays updates in CPUs' capabilities to userspace.
diff --git a/drivers/thermal/intel/Makefile b/drivers/thermal/intel/Makefile
index ff2ad30ef397..ece3888f30af 100644
--- a/drivers/thermal/intel/Makefile
+++ b/drivers/thermal/intel/Makefile
@@ -11,3 +11,4 @@ obj-$(CONFIG_INT340X_THERMAL)  += int340x_thermal/
 obj-$(CONFIG_INTEL_BXT_PMIC_THERMAL) += intel_bxt_pmic_thermal.o
 obj-$(CONFIG_INTEL_PCH_THERMAL)	+= intel_pch_thermal.o
 obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o
+obj-$(CONFIG_INTEL_HFI_THERMAL) += intel_hfi.o
diff --git a/drivers/thermal/intel/intel_hfi.c b/drivers/thermal/intel/intel_hfi.c
new file mode 100644
index 000000000000..9ea8256cf9ae
--- /dev/null
+++ b/drivers/thermal/intel/intel_hfi.c
@@ -0,0 +1,180 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Hardware Feedback Interface Driver
+ *
+ * Copyright (c) 2021, Intel Corporation.
+ *
+ * Authors: Aubrey Li <aubrey.li@linux.intel.com>
+ *          Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+ *
+ *
+ * The Hardware Feedback Interface provides a performance and energy efficiency
+ * capability information for each CPU in the system. Depending on the processor
+ * model, hardware may periodically update these capabilities as a result of
+ * changes in the operating conditions (e.g., power limits or thermal
+ * constraints). On other processor models, there is a single HFI update
+ * at boot.
+ *
+ * This file provides functionality to process HFI updates and relay these
+ * updates to userspace.
+ */
+
+#define pr_fmt(fmt)  "intel-hfi: " fmt
+
+#include <linux/bitops.h>
+#include <linux/cpufeature.h>
+#include <linux/printk.h>
+#include <linux/processor.h>
+#include <linux/slab.h>
+#include <linux/topology.h>
+
+#include "intel_hfi.h"
+
+/* CPUID detection and enumeration definitions for HFI */
+
+#define CPUID_HFI_LEAF 6
+
+union hfi_capabilities {
+	struct {
+		u8	performance:1;
+		u8	energy_efficiency:1;
+		u8	__reserved:6;
+	} split;
+	u8 bits;
+};
+
+union cpuid6_edx {
+	struct {
+		union hfi_capabilities	capabilities;
+		u32			table_pages:4;
+		u32			__reserved:4;
+		s32			index:16;
+	} split;
+	u32 full;
+};
+
+/**
+ * struct hfi_cpu_data - HFI capabilities per CPU
+ * @perf_cap:		Performance capability
+ * @ee_cap:		Energy efficiency capability
+ *
+ * Capabilities of a logical processor in the HFI table. These capabilities are
+ * unitless.
+ */
+struct hfi_cpu_data {
+	u8	perf_cap;
+	u8	ee_cap;
+} __packed;
+
+/**
+ * struct hfi_hdr - Header of the HFI table
+ * @perf_updated:	Hardware updated performance capabilities
+ * @ee_updated:		Hardware updated energy efficiency capabilities
+ *
+ * Properties of the data in an HFI table.
+ */
+struct hfi_hdr {
+	u8	perf_updated;
+	u8	ee_updated;
+} __packed;
+
+/**
+ * struct hfi_instance - Representation of an HFI instance (i.e., a table)
+ * @local_table:	Base of the local copy of the HFI table
+ * @timestamp:		Timestamp of the last update of the local table.
+ *			Located at the base of the local table.
+ * @hdr:		Base address of the header of the local table
+ * @data:		Base address of the data of the local table
+ *
+ * A set of parameters to parse and navigate a specific HFI table.
+ */
+struct hfi_instance {
+	union {
+		void			*local_table;
+		u64			*timestamp;
+	};
+	void			*hdr;
+	void			*data;
+};
+
+/**
+ * struct hfi_features - Supported HFI features
+ * @nr_table_pages:	Size of the HFI table in 4KB pages
+ * @cpu_stride:		Stride size to locate the capability data of a logical
+ *			processor within the table (i.e., row stride)
+ * @hdr_size:		Size of the table header
+ *
+ * Parameters and supported features that are common to all HFI instances
+ */
+struct hfi_features {
+	unsigned int	nr_table_pages;
+	unsigned int	cpu_stride;
+	unsigned int	hdr_size;
+};
+
+static int max_hfi_instances;
+static struct hfi_instance *hfi_instances;
+
+static struct hfi_features hfi_features;
+
+static __init int hfi_parse_features(void)
+{
+	unsigned int nr_capabilities;
+	union cpuid6_edx edx;
+
+	if (!boot_cpu_has(X86_FEATURE_HFI))
+		return -ENODEV;
+
+	/*
+	 * If we are here we know that CPUID_HFI_LEAF exists. Parse the
+	 * supported capabilities and the size of the HFI table.
+	 */
+	edx.full = cpuid_edx(CPUID_HFI_LEAF);
+
+	if (!edx.split.capabilities.split.performance) {
+		pr_debug("Performance reporting not supported! Not using HFI\n");
+		return -ENODEV;
+	}
+
+	/*
+	 * The number of supported capabilities determines the number of
+	 * columns in the HFI table. Exclude the reserved bits.
+	 */
+	edx.split.capabilities.split.__reserved = 0;
+	nr_capabilities = hweight8(edx.split.capabilities.bits);
+
+	/* The number of 4KB pages required by the table */
+	hfi_features.nr_table_pages = edx.split.table_pages + 1;
+
+	/*
+	 * The header contains change indications for each supported feature.
+	 * The size of the table header is rounded up to be a multiple of 8
+	 * bytes.
+	 */
+	hfi_features.hdr_size = DIV_ROUND_UP(nr_capabilities, 8) * 8;
+
+	/*
+	 * Data of each logical processor is also rounded up to be a multiple
+	 * of 8 bytes.
+	 */
+	hfi_features.cpu_stride = DIV_ROUND_UP(nr_capabilities, 8) * 8;
+
+	return 0;
+}
+
+void __init intel_hfi_init(void)
+{
+	if (hfi_parse_features())
+		return;
+
+	/* There is one HFI instance per die/package. */
+	max_hfi_instances = topology_max_packages() *
+			    topology_max_die_per_package();
+
+	/*
+	 * This allocation may fail. CPU hotplug callbacks must check
+	 * for a null pointer.
+	 */
+	hfi_instances = kcalloc(max_hfi_instances, sizeof(*hfi_instances),
+				GFP_KERNEL);
+}
diff --git a/drivers/thermal/intel/intel_hfi.h b/drivers/thermal/intel/intel_hfi.h
new file mode 100644
index 000000000000..05f748b48a4e
--- /dev/null
+++ b/drivers/thermal/intel/intel_hfi.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _INTEL_HFI_H
+#define _INTEL_HFI_H
+
+#if defined(CONFIG_INTEL_HFI_THERMAL)
+void __init intel_hfi_init(void);
+#else
+static inline void intel_hfi_init(void) { }
+#endif /* CONFIG_INTEL_HFI_THERMAL */
+
+#endif /* _INTEL_HFI_H */
diff --git a/drivers/thermal/intel/therm_throt.c b/drivers/thermal/intel/therm_throt.c
index f8e882592ba5..9d26e35106a7 100644
--- a/drivers/thermal/intel/therm_throt.c
+++ b/drivers/thermal/intel/therm_throt.c
@@ -32,6 +32,7 @@
 #include <asm/irq.h>
 #include <asm/msr.h>
 
+#include "intel_hfi.h"
 #include "thermal_interrupt.h"
 
 /* How long to wait between reporting thermal events */
@@ -509,6 +510,8 @@ static __init int thermal_throttle_init_device(void)
 	if (!atomic_read(&therm_throt_en))
 		return 0;
 
+	intel_hfi_init();
+
 	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/therm:online",
 				thermal_throttle_online,
 				thermal_throttle_offline);
-- 
Gitee


From 55affa7a8345f4b8d74407b00c0223bf7183f62a Mon Sep 17 00:00:00 2001
From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Date: Thu, 27 Jan 2022 11:34:51 -0800
Subject: [PATCH 068/674] thermal: intel: hfi: Handle CPU hotplug events

mainline inclusion
from mainline-5.18
commit 2d74e6319abe278981e79166b6c2d0c3ed39b1ae
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5DSOL

Intel_SIG: commit 2d74e6319abe thermal: intel: hfi: Handle CPU hotplug
events.
Backport for Intel HFI (Hardware Feedback Interface) support

-------------------------------------

All CPUs in a package are represented in an HFI table. There exists an
HFI table per package. Thus, CPUs in a package need to coordinate to
initialize and access the table. Do such coordination during CPU hotplug.
Use the first CPU to come online in a package to initialize the HFI
instance and the data structure representing it. Other CPUs in the same
package need only to register or unregister themselves in that data
structure.

The HFI depends on both the package-level thermal management and the local
APIC thermal local vector. Thus, to ensure that a CPU coming online has an
associated HFI instance when the hardware issues an HFI event, enable the
HFI only after having enabled the local APIC thermal vector. The thermal
throttle driver takes care of the needed package-level initialization.

Reviewed-by: Len Brown <len.brown@intel.com>
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/thermal/intel/intel_hfi.c   | 205 ++++++++++++++++++++++++++++
 drivers/thermal/intel/intel_hfi.h   |   4 +
 drivers/thermal/intel/therm_throt.c |   9 ++
 3 files changed, 218 insertions(+)

diff --git a/drivers/thermal/intel/intel_hfi.c b/drivers/thermal/intel/intel_hfi.c
index 9ea8256cf9ae..8cac7ee6a20f 100644
--- a/drivers/thermal/intel/intel_hfi.c
+++ b/drivers/thermal/intel/intel_hfi.c
@@ -23,13 +23,23 @@
 
 #include <linux/bitops.h>
 #include <linux/cpufeature.h>
+#include <linux/cpumask.h>
+#include <linux/gfp.h>
+#include <linux/io.h>
+#include <linux/mutex.h>
+#include <linux/percpu-defs.h>
 #include <linux/printk.h>
 #include <linux/processor.h>
 #include <linux/slab.h>
 #include <linux/topology.h>
 
+#include <asm/msr.h>
+
 #include "intel_hfi.h"
 
+/* Hardware Feedback Interface MSR configuration bits */
+#define HW_FEEDBACK_PTR_VALID_BIT		BIT(0)
+
 /* CPUID detection and enumeration definitions for HFI */
 
 #define CPUID_HFI_LEAF 6
@@ -85,6 +95,8 @@ struct hfi_hdr {
  *			Located at the base of the local table.
  * @hdr:		Base address of the header of the local table
  * @data:		Base address of the data of the local table
+ * @cpus:		CPUs represented in this HFI table instance
+ * @hw_table:		Pointer to the HFI table of this instance
  *
  * A set of parameters to parse and navigate a specific HFI table.
  */
@@ -95,6 +107,8 @@ struct hfi_instance {
 	};
 	void			*hdr;
 	void			*data;
+	cpumask_var_t		cpus;
+	void			*hw_table;
 };
 
 /**
@@ -112,10 +126,179 @@ struct hfi_features {
 	unsigned int	hdr_size;
 };
 
+/**
+ * struct hfi_cpu_info - Per-CPU attributes to consume HFI data
+ * @index:		Row of this CPU in its HFI table
+ * @hfi_instance:	Attributes of the HFI table to which this CPU belongs
+ *
+ * Parameters to link a logical processor to an HFI table and a row within it.
+ */
+struct hfi_cpu_info {
+	s16			index;
+	struct hfi_instance	*hfi_instance;
+};
+
+static DEFINE_PER_CPU(struct hfi_cpu_info, hfi_cpu_info) = { .index = -1 };
+
 static int max_hfi_instances;
 static struct hfi_instance *hfi_instances;
 
 static struct hfi_features hfi_features;
+static DEFINE_MUTEX(hfi_instance_lock);
+
+static void init_hfi_cpu_index(struct hfi_cpu_info *info)
+{
+	union cpuid6_edx edx;
+
+	/* Do not re-read @cpu's index if it has already been initialized. */
+	if (info->index > -1)
+		return;
+
+	edx.full = cpuid_edx(CPUID_HFI_LEAF);
+	info->index = edx.split.index;
+}
+
+/*
+ * The format of the HFI table depends on the number of capabilities that the
+ * hardware supports. Keep a data structure to navigate the table.
+ */
+static void init_hfi_instance(struct hfi_instance *hfi_instance)
+{
+	/* The HFI header is below the time-stamp. */
+	hfi_instance->hdr = hfi_instance->local_table +
+			    sizeof(*hfi_instance->timestamp);
+
+	/* The HFI data starts below the header. */
+	hfi_instance->data = hfi_instance->hdr + hfi_features.hdr_size;
+}
+
+/**
+ * intel_hfi_online() - Enable HFI on @cpu
+ * @cpu:	CPU in which the HFI will be enabled
+ *
+ * Enable the HFI to be used in @cpu. The HFI is enabled at the die/package
+ * level. The first CPU in the die/package to come online does the full HFI
+ * initialization. Subsequent CPUs will just link themselves to the HFI
+ * instance of their die/package.
+ *
+ * This function is called before enabling the thermal vector in the local APIC
+ * in order to ensure that @cpu has an associated HFI instance when it receives
+ * an HFI event.
+ */
+void intel_hfi_online(unsigned int cpu)
+{
+	struct hfi_instance *hfi_instance;
+	struct hfi_cpu_info *info;
+	phys_addr_t hw_table_pa;
+	u64 msr_val;
+	u16 die_id;
+
+	/* Nothing to do if hfi_instances are missing. */
+	if (!hfi_instances)
+		return;
+
+	/*
+	 * Link @cpu to the HFI instance of its package/die. It does not
+	 * matter whether the instance has been initialized.
+	 */
+	info = &per_cpu(hfi_cpu_info, cpu);
+	die_id = topology_logical_die_id(cpu);
+	hfi_instance = info->hfi_instance;
+	if (!hfi_instance) {
+		if (die_id < 0 || die_id >= max_hfi_instances)
+			return;
+
+		hfi_instance = &hfi_instances[die_id];
+		info->hfi_instance = hfi_instance;
+	}
+
+	init_hfi_cpu_index(info);
+
+	/*
+	 * Now check if the HFI instance of the package/die of @cpu has been
+	 * initialized (by checking its header). In such case, all we have to
+	 * do is to add @cpu to this instance's cpumask.
+	 */
+	mutex_lock(&hfi_instance_lock);
+	if (hfi_instance->hdr) {
+		cpumask_set_cpu(cpu, hfi_instance->cpus);
+		goto unlock;
+	}
+
+	/*
+	 * Hardware is programmed with the physical address of the first page
+	 * frame of the table. Hence, the allocated memory must be page-aligned.
+	 */
+	hfi_instance->hw_table = alloc_pages_exact(hfi_features.nr_table_pages,
+						   GFP_KERNEL | __GFP_ZERO);
+	if (!hfi_instance->hw_table)
+		goto unlock;
+
+	hw_table_pa = virt_to_phys(hfi_instance->hw_table);
+
+	/*
+	 * Allocate memory to keep a local copy of the table that
+	 * hardware generates.
+	 */
+	hfi_instance->local_table = kzalloc(hfi_features.nr_table_pages << PAGE_SHIFT,
+					    GFP_KERNEL);
+	if (!hfi_instance->local_table)
+		goto free_hw_table;
+
+	/*
+	 * Program the address of the feedback table of this die/package. On
+	 * some processors, hardware remembers the old address of the HFI table
+	 * even after having been reprogrammed and re-enabled. Thus, do not free
+	 * the pages allocated for the table or reprogram the hardware with a
+	 * new base address. Namely, program the hardware only once.
+	 */
+	msr_val = hw_table_pa | HW_FEEDBACK_PTR_VALID_BIT;
+	wrmsrl(MSR_IA32_HW_FEEDBACK_PTR, msr_val);
+
+	init_hfi_instance(hfi_instance);
+
+	cpumask_set_cpu(cpu, hfi_instance->cpus);
+
+unlock:
+	mutex_unlock(&hfi_instance_lock);
+	return;
+
+free_hw_table:
+	free_pages_exact(hfi_instance->hw_table, hfi_features.nr_table_pages);
+	goto unlock;
+}
+
+/**
+ * intel_hfi_offline() - Disable HFI on @cpu
+ * @cpu:	CPU in which the HFI will be disabled
+ *
+ * Remove @cpu from those covered by its HFI instance.
+ *
+ * On some processors, hardware remembers previous programming settings even
+ * after being reprogrammed. Thus, keep HFI enabled even if all CPUs in the
+ * die/package of @cpu are offline. See note in intel_hfi_online().
+ */
+void intel_hfi_offline(unsigned int cpu)
+{
+	struct hfi_cpu_info *info = &per_cpu(hfi_cpu_info, cpu);
+	struct hfi_instance *hfi_instance;
+
+	/*
+	 * Check if @cpu as an associated, initialized (i.e., with a non-NULL
+	 * header). Also, HFI instances are only initialized if X86_FEATURE_HFI
+	 * is present.
+	 */
+	hfi_instance = info->hfi_instance;
+	if (!hfi_instance)
+		return;
+
+	if (!hfi_instance->hdr)
+		return;
+
+	mutex_lock(&hfi_instance_lock);
+	cpumask_clear_cpu(cpu, hfi_instance->cpus);
+	mutex_unlock(&hfi_instance_lock);
+}
 
 static __init int hfi_parse_features(void)
 {
@@ -164,6 +347,9 @@ static __init int hfi_parse_features(void)
 
 void __init intel_hfi_init(void)
 {
+	struct hfi_instance *hfi_instance;
+	int i, j;
+
 	if (hfi_parse_features())
 		return;
 
@@ -177,4 +363,23 @@ void __init intel_hfi_init(void)
 	 */
 	hfi_instances = kcalloc(max_hfi_instances, sizeof(*hfi_instances),
 				GFP_KERNEL);
+	if (!hfi_instances)
+		return;
+
+	for (i = 0; i < max_hfi_instances; i++) {
+		hfi_instance = &hfi_instances[i];
+		if (!zalloc_cpumask_var(&hfi_instance->cpus, GFP_KERNEL))
+			goto err_nomem;
+	}
+
+	return;
+
+err_nomem:
+	for (j = 0; j < i; ++j) {
+		hfi_instance = &hfi_instances[j];
+		free_cpumask_var(hfi_instance->cpus);
+	}
+
+	kfree(hfi_instances);
+	hfi_instances = NULL;
 }
diff --git a/drivers/thermal/intel/intel_hfi.h b/drivers/thermal/intel/intel_hfi.h
index 05f748b48a4e..56c6b2d75202 100644
--- a/drivers/thermal/intel/intel_hfi.h
+++ b/drivers/thermal/intel/intel_hfi.h
@@ -4,8 +4,12 @@
 
 #if defined(CONFIG_INTEL_HFI_THERMAL)
 void __init intel_hfi_init(void);
+void intel_hfi_online(unsigned int cpu);
+void intel_hfi_offline(unsigned int cpu);
 #else
 static inline void intel_hfi_init(void) { }
+static inline void intel_hfi_online(unsigned int cpu) { }
+static inline void intel_hfi_offline(unsigned int cpu) { }
 #endif /* CONFIG_INTEL_HFI_THERMAL */
 
 #endif /* _INTEL_HFI_H */
diff --git a/drivers/thermal/intel/therm_throt.c b/drivers/thermal/intel/therm_throt.c
index 9d26e35106a7..e4f95cf06b82 100644
--- a/drivers/thermal/intel/therm_throt.c
+++ b/drivers/thermal/intel/therm_throt.c
@@ -476,6 +476,13 @@ static int thermal_throttle_online(unsigned int cpu)
 	INIT_DELAYED_WORK(&state->package_throttle.therm_work, throttle_active_work);
 	INIT_DELAYED_WORK(&state->core_throttle.therm_work, throttle_active_work);
 
+	/*
+	 * The first CPU coming online will enable the HFI. Usually this causes
+	 * hardware to issue an HFI thermal interrupt. Such interrupt will reach
+	 * the CPU once we enable the thermal vector in the local APIC.
+	 */
+	intel_hfi_online(cpu);
+
 	/* Unmask the thermal vector after the above workqueues are initialized. */
 	l = apic_read(APIC_LVTTHMR);
 	apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
@@ -493,6 +500,8 @@ static int thermal_throttle_offline(unsigned int cpu)
 	l = apic_read(APIC_LVTTHMR);
 	apic_write(APIC_LVTTHMR, l | APIC_LVT_MASKED);
 
+	intel_hfi_offline(cpu);
+
 	cancel_delayed_work_sync(&state->package_throttle.therm_work);
 	cancel_delayed_work_sync(&state->core_throttle.therm_work);
 
-- 
Gitee


From 02091d14394d14819544c301e8ec0c6103efad89 Mon Sep 17 00:00:00 2001
From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Date: Thu, 27 Jan 2022 11:34:48 -0800
Subject: [PATCH 069/674] x86/Documentation: Describe the Intel Hardware
 Feedback Interface

mainline inclusion
from mainline-5.18
commit 4a960e8941bd59fe20f8f774de371f40f222a0c7
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5DSOL

Intel_SIG: commit 4a960e8941bd x86/Documentation: Describe the Intel Hardware
Feedback Interface.
Backport for Intel HFI (Hardware Feedback Interface) support

-------------------------------------

Start a documentation file to describe the purpose and operation of Intel's
Hardware Feedback Interface. Describe how this interface is used in Linux
to relay performance and energy efficiency updates to userspace.

Reviewed-by: Len Brown <len.brown@intel.com>
Suggested-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 Documentation/x86/index.rst     |  1 +
 Documentation/x86/intel-hfi.rst | 72 +++++++++++++++++++++++++++++++++
 2 files changed, 73 insertions(+)
 create mode 100644 Documentation/x86/intel-hfi.rst

diff --git a/Documentation/x86/index.rst b/Documentation/x86/index.rst
index 647a570d4931..6832df92f084 100644
--- a/Documentation/x86/index.rst
+++ b/Documentation/x86/index.rst
@@ -21,6 +21,7 @@ x86-specific Documentation
    tlb
    mtrr
    pat
+   intel-hfi
    intel-iommu
    intel_txt
    amd-memory-encryption
diff --git a/Documentation/x86/intel-hfi.rst b/Documentation/x86/intel-hfi.rst
new file mode 100644
index 000000000000..49dea58ea4fb
--- /dev/null
+++ b/Documentation/x86/intel-hfi.rst
@@ -0,0 +1,72 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+============================================================
+Hardware-Feedback Interface for scheduling on Intel Hardware
+============================================================
+
+Overview
+--------
+
+Intel has described the Hardware Feedback Interface (HFI) in the Intel 64 and
+IA-32 Architectures Software Developer's Manual (Intel SDM) Volume 3 Section
+14.6 [1]_.
+
+The HFI gives the operating system a performance and energy efficiency
+capability data for each CPU in the system. Linux can use the information from
+the HFI to influence task placement decisions.
+
+The Hardware Feedback Interface
+-------------------------------
+
+The Hardware Feedback Interface provides to the operating system information
+about the performance and energy efficiency of each CPU in the system. Each
+capability is given as a unit-less quantity in the range [0-255]. Higher values
+indicate higher capability. Energy efficiency and performance are reported in
+separate capabilities. Even though on some systems these two metrics may be
+related, they are specified as independent capabilities in the Intel SDM.
+
+These capabilities may change at runtime as a result of changes in the
+operating conditions of the system or the action of external factors. The rate
+at which these capabilities are updated is specific to each processor model. On
+some models, capabilities are set at boot time and never change. On others,
+capabilities may change every tens of milliseconds. For instance, a remote
+mechanism may be used to lower Thermal Design Power. Such change can be
+reflected in the HFI. Likewise, if the system needs to be throttled due to
+excessive heat, the HFI may reflect reduced performance on specific CPUs.
+
+The kernel or a userspace policy daemon can use these capabilities to modify
+task placement decisions. For instance, if either the performance or energy
+capabilities of a given logical processor becomes zero, it is an indication that
+the hardware recommends to the operating system to not schedule any tasks on
+that processor for performance or energy efficiency reasons, respectively.
+
+Implementation details for Linux
+--------------------------------
+
+The infrastructure to handle thermal event interrupts has two parts. In the
+Local Vector Table of a CPU's local APIC, there exists a register for the
+Thermal Monitor Register. This register controls how interrupts are delivered
+to a CPU when the thermal monitor generates and interrupt. Further details
+can be found in the Intel SDM Vol. 3 Section 10.5 [1]_.
+
+The thermal monitor may generate interrupts per CPU or per package. The HFI
+generates package-level interrupts. This monitor is configured and initialized
+via a set of machine-specific registers. Specifically, the HFI interrupt and
+status are controlled via designated bits in the IA32_PACKAGE_THERM_INTERRUPT
+and IA32_PACKAGE_THERM_STATUS registers, respectively. There exists one HFI
+table per package. Further details can be found in the Intel SDM Vol. 3
+Section 14.9 [1]_.
+
+The hardware issues an HFI interrupt after updating the HFI table and is ready
+for the operating system to consume it. CPUs receive such interrupt via the
+thermal entry in the Local APIC's Local Vector Table.
+
+When servicing such interrupt, the HFI driver parses the updated table and
+relays the update to userspace using the thermal notification framework. Given
+that there may be many HFI updates every second, the updates relayed to
+userspace are throttled at a rate of CONFIG_HZ jiffies.
+
+References
+----------
+
+.. [1] https://www.intel.com/sdm
-- 
Gitee


From 07dc6e0b04d8eca2963e138452d5b24af8f82b28 Mon Sep 17 00:00:00 2001
From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Date: Thu, 27 Jan 2022 11:34:52 -0800
Subject: [PATCH 070/674] thermal: intel: hfi: Enable notification interrupt

mainline inclusion
from mainline-5.18
commit ab09b0744a9944cbdc0ac9a5cb00bef72adf79d5
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5DSOL

Intel_SIG: commit ab09b0744a99 thermal: intel: hfi: Enable notification
interrupt.
Backport for Intel HFI (Hardware Feedback Interface) support

-------------------------------------

When hardware wants to inform the operating system about updates in the HFI
table, it issues a package-level thermal event interrupt. For this,
hardware has new interrupt and status bits in the IA32_PACKAGE_THERM_
INTERRUPT and IA32_PACKAGE_THERM_STATUS registers. The existing thermal
throttle driver already handles thermal event interrupts: it initializes
the thermal vector of the local APIC as well as per-CPU and package-level
interrupt reporting. It also provides routines to service such interrupts.
Extend its functionality to also handle HFI interrupts.

The frequency of the thermal HFI interrupt is specific to each processor
model. On some processors, a single interrupt happens as soon as the HFI is
enabled and hardware will never update HFI capabilities afterwards. On
other processors, thermal and power constraints may cause thermal HFI
interrupts every tens of milliseconds.

To not overwhelm consumers of the HFI data, use delayed work to throttle
the rate at which HFI updates are processed. Use a dedicated workqueue to
not overload system_wq if hardware issues many HFI updates.

Reviewed-by: Len Brown <len.brown@intel.com>
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/thermal/intel/intel_hfi.c   | 110 ++++++++++++++++++++++++++++
 drivers/thermal/intel/intel_hfi.h   |   2 +
 drivers/thermal/intel/therm_throt.c |  10 +++
 3 files changed, 122 insertions(+)

diff --git a/drivers/thermal/intel/intel_hfi.c b/drivers/thermal/intel/intel_hfi.c
index 8cac7ee6a20f..ebaf767bc83d 100644
--- a/drivers/thermal/intel/intel_hfi.c
+++ b/drivers/thermal/intel/intel_hfi.c
@@ -26,19 +26,27 @@
 #include <linux/cpumask.h>
 #include <linux/gfp.h>
 #include <linux/io.h>
+#include <linux/kernel.h>
 #include <linux/mutex.h>
 #include <linux/percpu-defs.h>
 #include <linux/printk.h>
 #include <linux/processor.h>
 #include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
 #include <linux/topology.h>
+#include <linux/workqueue.h>
 
 #include <asm/msr.h>
 
 #include "intel_hfi.h"
 
+#define THERM_STATUS_CLEAR_PKG_MASK (BIT(1) | BIT(3) | BIT(5) | BIT(7) | \
+				    BIT(9) | BIT(11) | BIT(26))
+
 /* Hardware Feedback Interface MSR configuration bits */
 #define HW_FEEDBACK_PTR_VALID_BIT		BIT(0)
+#define HW_FEEDBACK_CONFIG_HFI_ENABLE_BIT	BIT(0)
 
 /* CPUID detection and enumeration definitions for HFI */
 
@@ -97,6 +105,9 @@ struct hfi_hdr {
  * @data:		Base address of the data of the local table
  * @cpus:		CPUs represented in this HFI table instance
  * @hw_table:		Pointer to the HFI table of this instance
+ * @update_work:	Delayed work to process HFI updates
+ * @table_lock:		Lock to protect acceses to the table of this instance
+ * @event_lock:		Lock to process HFI interrupts
  *
  * A set of parameters to parse and navigate a specific HFI table.
  */
@@ -109,6 +120,9 @@ struct hfi_instance {
 	void			*data;
 	cpumask_var_t		cpus;
 	void			*hw_table;
+	struct delayed_work	update_work;
+	raw_spinlock_t		table_lock;
+	raw_spinlock_t		event_lock;
 };
 
 /**
@@ -146,6 +160,86 @@ static struct hfi_instance *hfi_instances;
 static struct hfi_features hfi_features;
 static DEFINE_MUTEX(hfi_instance_lock);
 
+static struct workqueue_struct *hfi_updates_wq;
+#define HFI_UPDATE_INTERVAL		HZ
+
+static void hfi_update_work_fn(struct work_struct *work)
+{
+	struct hfi_instance *hfi_instance;
+
+	hfi_instance = container_of(to_delayed_work(work), struct hfi_instance,
+				    update_work);
+	if (!hfi_instance)
+		return;
+
+	/* TODO: Consume update here. */
+}
+
+void intel_hfi_process_event(__u64 pkg_therm_status_msr_val)
+{
+	struct hfi_instance *hfi_instance;
+	int cpu = smp_processor_id();
+	struct hfi_cpu_info *info;
+	u64 new_timestamp;
+
+	if (!pkg_therm_status_msr_val)
+		return;
+
+	info = &per_cpu(hfi_cpu_info, cpu);
+	if (!info)
+		return;
+
+	/*
+	 * A CPU is linked to its HFI instance before the thermal vector in the
+	 * local APIC is unmasked. Hence, info->hfi_instance cannot be NULL
+	 * when receiving an HFI event.
+	 */
+	hfi_instance = info->hfi_instance;
+	if (unlikely(!hfi_instance)) {
+		pr_debug("Received event on CPU %d but instance was null", cpu);
+		return;
+	}
+
+	/*
+	 * On most systems, all CPUs in the package receive a package-level
+	 * thermal interrupt when there is an HFI update. It is sufficient to
+	 * let a single CPU to acknowledge the update and queue work to
+	 * process it. The remaining CPUs can resume their work.
+	 */
+	if (!raw_spin_trylock(&hfi_instance->event_lock))
+		return;
+
+	/* Skip duplicated updates. */
+	new_timestamp = *(u64 *)hfi_instance->hw_table;
+	if (*hfi_instance->timestamp == new_timestamp) {
+		raw_spin_unlock(&hfi_instance->event_lock);
+		return;
+	}
+
+	raw_spin_lock(&hfi_instance->table_lock);
+
+	/*
+	 * Copy the updated table into our local copy. This includes the new
+	 * timestamp.
+	 */
+	memcpy(hfi_instance->local_table, hfi_instance->hw_table,
+	       hfi_features.nr_table_pages << PAGE_SHIFT);
+
+	raw_spin_unlock(&hfi_instance->table_lock);
+	raw_spin_unlock(&hfi_instance->event_lock);
+
+	/*
+	 * Let hardware know that we are done reading the HFI table and it is
+	 * free to update it again.
+	 */
+	pkg_therm_status_msr_val &= THERM_STATUS_CLEAR_PKG_MASK &
+				    ~PACKAGE_THERM_STATUS_HFI_UPDATED;
+	wrmsrl(MSR_IA32_PACKAGE_THERM_STATUS, pkg_therm_status_msr_val);
+
+	queue_delayed_work(hfi_updates_wq, &hfi_instance->update_work,
+			   HFI_UPDATE_INTERVAL);
+}
+
 static void init_hfi_cpu_index(struct hfi_cpu_info *info)
 {
 	union cpuid6_edx edx;
@@ -257,8 +351,20 @@ void intel_hfi_online(unsigned int cpu)
 
 	init_hfi_instance(hfi_instance);
 
+	INIT_DELAYED_WORK(&hfi_instance->update_work, hfi_update_work_fn);
+	raw_spin_lock_init(&hfi_instance->table_lock);
+	raw_spin_lock_init(&hfi_instance->event_lock);
+
 	cpumask_set_cpu(cpu, hfi_instance->cpus);
 
+	/*
+	 * Enable the hardware feedback interface and never disable it. See
+	 * comment on programming the address of the table.
+	 */
+	rdmsrl(MSR_IA32_HW_FEEDBACK_CONFIG, msr_val);
+	msr_val |= HW_FEEDBACK_CONFIG_HFI_ENABLE_BIT;
+	wrmsrl(MSR_IA32_HW_FEEDBACK_CONFIG, msr_val);
+
 unlock:
 	mutex_unlock(&hfi_instance_lock);
 	return;
@@ -372,6 +478,10 @@ void __init intel_hfi_init(void)
 			goto err_nomem;
 	}
 
+	hfi_updates_wq = create_singlethread_workqueue("hfi-updates");
+	if (!hfi_updates_wq)
+		goto err_nomem;
+
 	return;
 
 err_nomem:
diff --git a/drivers/thermal/intel/intel_hfi.h b/drivers/thermal/intel/intel_hfi.h
index 56c6b2d75202..325aa78b745c 100644
--- a/drivers/thermal/intel/intel_hfi.h
+++ b/drivers/thermal/intel/intel_hfi.h
@@ -6,10 +6,12 @@
 void __init intel_hfi_init(void);
 void intel_hfi_online(unsigned int cpu);
 void intel_hfi_offline(unsigned int cpu);
+void intel_hfi_process_event(__u64 pkg_therm_status_msr_val);
 #else
 static inline void intel_hfi_init(void) { }
 static inline void intel_hfi_online(unsigned int cpu) { }
 static inline void intel_hfi_offline(unsigned int cpu) { }
+static inline void intel_hfi_process_event(__u64 pkg_therm_status_msr_val) { }
 #endif /* CONFIG_INTEL_HFI_THERMAL */
 
 #endif /* _INTEL_HFI_H */
diff --git a/drivers/thermal/intel/therm_throt.c b/drivers/thermal/intel/therm_throt.c
index e4f95cf06b82..8cc35a9f3862 100644
--- a/drivers/thermal/intel/therm_throt.c
+++ b/drivers/thermal/intel/therm_throt.c
@@ -615,6 +615,10 @@ void intel_thermal_interrupt(void)
 					PACKAGE_THERM_STATUS_POWER_LIMIT,
 					POWER_LIMIT_EVENT,
 					PACKAGE_LEVEL);
+
+		if (this_cpu_has(X86_FEATURE_HFI))
+			intel_hfi_process_event(msr_val &
+						PACKAGE_THERM_STATUS_HFI_UPDATED);
 	}
 }
 
@@ -717,6 +721,12 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
 			wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
 			      l | (PACKAGE_THERM_INT_LOW_ENABLE
 				| PACKAGE_THERM_INT_HIGH_ENABLE), h);
+
+		if (cpu_has(c, X86_FEATURE_HFI)) {
+			rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
+			wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
+			      l | PACKAGE_THERM_INT_HFI_ENABLE, h);
+		}
 	}
 
 	rdmsr(MSR_IA32_MISC_ENABLE, l, h);
-- 
Gitee


From b0b8a4822c7764d153f1fc03ba3b7d37c0cba077 Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Thu, 27 Jan 2022 11:34:54 -0800
Subject: [PATCH 071/674] thermal: intel: hfi: Notify user space for HFI events

mainline inclusion
from mainline-5.18
commit bd30cdfd9bd73b68e4977ce7c5540aa7b14c25cd
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5DSOL

Intel_SIG: commit bd30cdfd9bd7 thermal: intel: hfi: Notify user space for
HFI events.
Backport for Intel HFI (Hardware Feedback Interface) support

-------------------------------------

When the hardware issues an HFI event, relay a notification to user space.
This allows user space to respond by reading performance and efficiency of
each CPU and take appropriate action.

For example, when the performance and efficiency of a CPU is 0, user space
can either offline the CPU or inject idle. Also, if user space notices a
downward trend in performance, it may proactively adjust power limits to
avoid future situations in which performance drops to 0.

To avoid excessive notifications, the rate is limited by one HZ per event.
To limit the netlink message size, send parameters for up to 16 CPUs in a
single message. If there are more than 16 CPUs, issue as many messages as
needed to notify the status of all CPUs.

In the HFI specification, both performance and efficiency capabilities are
defined in the [0, 255] range. The existing implementations of HFI hardware
do not scale the maximum values to 255. Since userspace cares about
capability values that are either 0 or show a downward/upward trend, this
fact does not matter much. Relative changes in capabilities are enough. To
comply with the thermal netlink ABI, scale both performance and efficiency
capabilities to the [0, 1023] interval.

Reviewed-by: Len Brown <len.brown@intel.com>
Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/thermal/intel/Kconfig     |  1 +
 drivers/thermal/intel/intel_hfi.c | 75 ++++++++++++++++++++++++++++++-
 2 files changed, 75 insertions(+), 1 deletion(-)

diff --git a/drivers/thermal/intel/Kconfig b/drivers/thermal/intel/Kconfig
index 26aea864aaca..510d36326dfd 100644
--- a/drivers/thermal/intel/Kconfig
+++ b/drivers/thermal/intel/Kconfig
@@ -84,6 +84,7 @@ config INTEL_HFI_THERMAL
 	bool "Intel Hardware Feedback Interface"
 	depends on CPU_SUP_INTEL
 	depends on X86_THERMAL_VECTOR
+	select THERMAL_NETLINK
 	help
 	  Select this option to enable the Hardware Feedback Interface. If
 	  selected, hardware provides guidance to the operating system on
diff --git a/drivers/thermal/intel/intel_hfi.c b/drivers/thermal/intel/intel_hfi.c
index ebaf767bc83d..158a2b7c6b6f 100644
--- a/drivers/thermal/intel/intel_hfi.c
+++ b/drivers/thermal/intel/intel_hfi.c
@@ -39,6 +39,7 @@
 
 #include <asm/msr.h>
 
+#include "../thermal_core.h"
 #include "intel_hfi.h"
 
 #define THERM_STATUS_CLEAR_PKG_MASK (BIT(1) | BIT(3) | BIT(5) | BIT(7) | \
@@ -162,6 +163,78 @@ static DEFINE_MUTEX(hfi_instance_lock);
 
 static struct workqueue_struct *hfi_updates_wq;
 #define HFI_UPDATE_INTERVAL		HZ
+#define HFI_MAX_THERM_NOTIFY_COUNT	16
+
+static void get_hfi_caps(struct hfi_instance *hfi_instance,
+			 struct thermal_genl_cpu_caps *cpu_caps)
+{
+	int cpu, i = 0;
+
+	raw_spin_lock_irq(&hfi_instance->table_lock);
+	for_each_cpu(cpu, hfi_instance->cpus) {
+		struct hfi_cpu_data *caps;
+		s16 index;
+
+		index = per_cpu(hfi_cpu_info, cpu).index;
+		caps = hfi_instance->data + index * hfi_features.cpu_stride;
+		cpu_caps[i].cpu = cpu;
+
+		/*
+		 * Scale performance and energy efficiency to
+		 * the [0, 1023] interval that thermal netlink uses.
+		 */
+		cpu_caps[i].performance = caps->perf_cap << 2;
+		cpu_caps[i].efficiency = caps->ee_cap << 2;
+
+		++i;
+	}
+	raw_spin_unlock_irq(&hfi_instance->table_lock);
+}
+
+/*
+ * Call update_capabilities() when there are changes in the HFI table.
+ */
+static void update_capabilities(struct hfi_instance *hfi_instance)
+{
+	struct thermal_genl_cpu_caps *cpu_caps;
+	int i = 0, cpu_count;
+
+	/* CPUs may come online/offline while processing an HFI update. */
+	mutex_lock(&hfi_instance_lock);
+
+	cpu_count = cpumask_weight(hfi_instance->cpus);
+
+	/* No CPUs to report in this hfi_instance. */
+	if (!cpu_count)
+		goto out;
+
+	cpu_caps = kcalloc(cpu_count, sizeof(*cpu_caps), GFP_KERNEL);
+	if (!cpu_caps)
+		goto out;
+
+	get_hfi_caps(hfi_instance, cpu_caps);
+
+	if (cpu_count < HFI_MAX_THERM_NOTIFY_COUNT)
+		goto last_cmd;
+
+	/* Process complete chunks of HFI_MAX_THERM_NOTIFY_COUNT capabilities. */
+	for (i = 0;
+	     (i + HFI_MAX_THERM_NOTIFY_COUNT) <= cpu_count;
+	     i += HFI_MAX_THERM_NOTIFY_COUNT)
+		thermal_genl_cpu_capability_event(HFI_MAX_THERM_NOTIFY_COUNT,
+						  &cpu_caps[i]);
+
+	cpu_count = cpu_count - i;
+
+last_cmd:
+	/* Process the remaining capabilities if any. */
+	if (cpu_count)
+		thermal_genl_cpu_capability_event(cpu_count, &cpu_caps[i]);
+
+	kfree(cpu_caps);
+out:
+	mutex_unlock(&hfi_instance_lock);
+}
 
 static void hfi_update_work_fn(struct work_struct *work)
 {
@@ -172,7 +245,7 @@ static void hfi_update_work_fn(struct work_struct *work)
 	if (!hfi_instance)
 		return;
 
-	/* TODO: Consume update here. */
+	update_capabilities(hfi_instance);
 }
 
 void intel_hfi_process_event(__u64 pkg_therm_status_msr_val)
-- 
Gitee


From 9c1fe5896c82aa7193bdfe10b45845bcb11210bb Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Tue, 8 Feb 2022 16:15:46 -0800
Subject: [PATCH 072/674] thermal: intel: hfi: INTEL_HFI_THERMAL depends on NET

mainline inclusion
from mainline-5.18
commit c95aa2bab974394809edea28690f6504a15791b6
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5DSOL

Intel_SIG: commit c95aa2bab974 thermal: intel: hfi: INTEL_HFI_THERMAL
depends on NET.
Backport for Intel HFI (Hardware Feedback Interface) support

-------------------------------------

THERMAL_NETLINK depends on NET and since 'select' does not follow
any dependency chain, INTEL_HFI_THERMAL also should depend on NET.

Fix one Kconfig warning and 48 subsequent build errors:

WARNING: unmet direct dependencies detected for THERMAL_NETLINK
  Depends on [n]: THERMAL [=y] && NET [=n]
  Selected by [y]:
  - INTEL_HFI_THERMAL [=y] && THERMAL [=y] && (X86 [=y] || X86_INTEL_QUARK [=n] || COMPILE_TEST [=y]) && CPU_SUP_INTEL [=y] && X86_THERMAL_VECTOR [=y]

Fixes: bd30cdfd9bd7 ("thermal: intel: hfi: Notify user space for HFI events")
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Reviewed-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/thermal/intel/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/thermal/intel/Kconfig b/drivers/thermal/intel/Kconfig
index 510d36326dfd..876869d57313 100644
--- a/drivers/thermal/intel/Kconfig
+++ b/drivers/thermal/intel/Kconfig
@@ -82,6 +82,7 @@ config INTEL_PCH_THERMAL
 
 config INTEL_HFI_THERMAL
 	bool "Intel Hardware Feedback Interface"
+	depends on NET
 	depends on CPU_SUP_INTEL
 	depends on X86_THERMAL_VECTOR
 	select THERMAL_NETLINK
-- 
Gitee


From d1b287f6c142193c6dc5a5abfe2fe59e2f03cee6 Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Thu, 27 Jan 2022 11:34:53 -0800
Subject: [PATCH 073/674] thermal: netlink: Add a new event to notify CPU
 capabilities change

mainline inclusion
from mainline-5.18
commit e4b1eb24ce5a696ef7229f9926ff34d7502f0582
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5DSOL

Intel_SIG: commit e4b1eb24ce5a thermal: netlink: Add a new event to notify
CPU capabilities change.
Backport for Intel HFI (Hardware Feedback Interface) support

-------------------------------------

Add a new netlink event to notify change in CPU capabilities in terms of
performance and efficiency.

Firmware may change CPU capabilities as a result of thermal events in the
system or to account for changes in the TDP (thermal design power) level.

This notification type will allow user space to avoid running workloads
on certain CPUs or proactively adjust power limits to avoid future events.

The netlink message consists of a nested attribute
(THERMAL_GENL_ATTR_CPU_CAPABILITY) with three attributes:

 * THERMAL_GENL_ATTR_CPU_CAPABILITY_ID (type u32):
   -- logical CPU number
 * THERMAL_GENL_ATTR_CPU_CAPABILITY_PERFORMANCE (type u32):
   -- Scaled performance from 0-1023
 * THERMAL_GENL_ATTR_CPU_CAPABILITY_EFFICIENCY (type u32):
   -- Scaled efficiency from 0-1023

Reviewed-by: Len Brown <len.brown@intel.com>
Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/thermal/thermal_netlink.c | 53 +++++++++++++++++++++++++++++++
 drivers/thermal/thermal_netlink.h | 14 ++++++++
 include/uapi/linux/thermal.h      |  6 +++-
 3 files changed, 72 insertions(+), 1 deletion(-)

diff --git a/drivers/thermal/thermal_netlink.c b/drivers/thermal/thermal_netlink.c
index 41c8d47805c4..dc535831b660 100644
--- a/drivers/thermal/thermal_netlink.c
+++ b/drivers/thermal/thermal_netlink.c
@@ -43,6 +43,11 @@ static const struct nla_policy thermal_genl_policy[THERMAL_GENL_ATTR_MAX + 1] =
 	[THERMAL_GENL_ATTR_CDEV_MAX_STATE]	= { .type = NLA_U32 },
 	[THERMAL_GENL_ATTR_CDEV_NAME]		= { .type = NLA_STRING,
 						    .len = THERMAL_NAME_LENGTH },
+	/* CPU capabilities */
+	[THERMAL_GENL_ATTR_CPU_CAPABILITY]		= { .type = NLA_NESTED },
+	[THERMAL_GENL_ATTR_CPU_CAPABILITY_ID]		= { .type = NLA_U32 },
+	[THERMAL_GENL_ATTR_CPU_CAPABILITY_PERFORMANCE]	= { .type = NLA_U32 },
+	[THERMAL_GENL_ATTR_CPU_CAPABILITY_EFFICIENCY]	= { .type = NLA_U32 },
 };
 
 struct param {
@@ -58,6 +63,8 @@ struct param {
 	int temp;
 	int cdev_state;
 	int cdev_max_state;
+	struct thermal_genl_cpu_caps *cpu_capabilities;
+	int cpu_capabilities_count;
 };
 
 typedef int (*cb_t)(struct param *);
@@ -189,6 +196,42 @@ static int thermal_genl_event_gov_change(struct param *p)
 	return 0;
 }
 
+static int thermal_genl_event_cpu_capability_change(struct param *p)
+{
+	struct thermal_genl_cpu_caps *cpu_cap = p->cpu_capabilities;
+	struct sk_buff *msg = p->msg;
+	struct nlattr *start_cap;
+	int i;
+
+	start_cap = nla_nest_start(msg, THERMAL_GENL_ATTR_CPU_CAPABILITY);
+	if (!start_cap)
+		return -EMSGSIZE;
+
+	for (i = 0; i < p->cpu_capabilities_count; ++i) {
+		if (nla_put_u32(msg, THERMAL_GENL_ATTR_CPU_CAPABILITY_ID,
+				cpu_cap->cpu))
+			goto out_cancel_nest;
+
+		if (nla_put_u32(msg, THERMAL_GENL_ATTR_CPU_CAPABILITY_PERFORMANCE,
+				cpu_cap->performance))
+			goto out_cancel_nest;
+
+		if (nla_put_u32(msg, THERMAL_GENL_ATTR_CPU_CAPABILITY_EFFICIENCY,
+				cpu_cap->efficiency))
+			goto out_cancel_nest;
+
+		++cpu_cap;
+	}
+
+	nla_nest_end(msg, start_cap);
+
+	return 0;
+out_cancel_nest:
+	nla_nest_cancel(msg, start_cap);
+
+	return -EMSGSIZE;
+}
+
 int thermal_genl_event_tz_delete(struct param *p)
 	__attribute__((alias("thermal_genl_event_tz")));
 
@@ -218,6 +261,7 @@ static cb_t event_cb[] = {
 	[THERMAL_GENL_EVENT_CDEV_DELETE]	= thermal_genl_event_cdev_delete,
 	[THERMAL_GENL_EVENT_CDEV_STATE_UPDATE]	= thermal_genl_event_cdev_state_update,
 	[THERMAL_GENL_EVENT_TZ_GOV_CHANGE]	= thermal_genl_event_gov_change,
+	[THERMAL_GENL_EVENT_CPU_CAPABILITY_CHANGE] = thermal_genl_event_cpu_capability_change,
 };
 
 /*
@@ -355,6 +399,15 @@ int thermal_notify_tz_gov_change(int tz_id, const char *name)
 	return thermal_genl_send_event(THERMAL_GENL_EVENT_TZ_GOV_CHANGE, &p);
 }
 
+int thermal_genl_cpu_capability_event(int count,
+				      struct thermal_genl_cpu_caps *caps)
+{
+	struct param p = { .cpu_capabilities_count = count, .cpu_capabilities = caps };
+
+	return thermal_genl_send_event(THERMAL_GENL_EVENT_CPU_CAPABILITY_CHANGE, &p);
+}
+EXPORT_SYMBOL_GPL(thermal_genl_cpu_capability_event);
+
 /*************************** Command encoding ********************************/
 
 static int __thermal_genl_cmd_tz_get_id(struct thermal_zone_device *tz,
diff --git a/drivers/thermal/thermal_netlink.h b/drivers/thermal/thermal_netlink.h
index 828d1dddfa98..7439b245e41e 100644
--- a/drivers/thermal/thermal_netlink.h
+++ b/drivers/thermal/thermal_netlink.h
@@ -4,6 +4,12 @@
  *  Author: Daniel Lezcano <daniel.lezcano@linaro.org>
  */
 
+struct thermal_genl_cpu_caps {
+	int cpu;
+	int performance;
+	int efficiency;
+};
+
 /* Netlink notification function */
 #ifdef CONFIG_THERMAL_NETLINK
 int __init thermal_netlink_init(void);
@@ -23,6 +29,8 @@ int thermal_notify_cdev_add(int cdev_id, const char *name, int max_state);
 int thermal_notify_cdev_delete(int cdev_id);
 int thermal_notify_tz_gov_change(int tz_id, const char *name);
 int thermal_genl_sampling_temp(int id, int temp);
+int thermal_genl_cpu_capability_event(int count,
+				      struct thermal_genl_cpu_caps *caps);
 #else
 static inline int thermal_netlink_init(void)
 {
@@ -101,4 +109,10 @@ static inline int thermal_genl_sampling_temp(int id, int temp)
 {
 	return 0;
 }
+
+static inline int thermal_genl_cpu_capability_event(int count, struct cpu_capability *caps)
+{
+	return 0;
+}
+
 #endif /* CONFIG_THERMAL_NETLINK */
diff --git a/include/uapi/linux/thermal.h b/include/uapi/linux/thermal.h
index c105054cbb57..06a8cea36fe3 100644
--- a/include/uapi/linux/thermal.h
+++ b/include/uapi/linux/thermal.h
@@ -44,7 +44,10 @@ enum thermal_genl_attr {
 	THERMAL_GENL_ATTR_CDEV_MAX_STATE,
 	THERMAL_GENL_ATTR_CDEV_NAME,
 	THERMAL_GENL_ATTR_GOV_NAME,
-
+	THERMAL_GENL_ATTR_CPU_CAPABILITY,
+	THERMAL_GENL_ATTR_CPU_CAPABILITY_ID,
+	THERMAL_GENL_ATTR_CPU_CAPABILITY_PERFORMANCE,
+	THERMAL_GENL_ATTR_CPU_CAPABILITY_EFFICIENCY,
 	__THERMAL_GENL_ATTR_MAX,
 };
 #define THERMAL_GENL_ATTR_MAX (__THERMAL_GENL_ATTR_MAX - 1)
@@ -71,6 +74,7 @@ enum thermal_genl_event {
 	THERMAL_GENL_EVENT_CDEV_DELETE,		/* Cdev unbound */
 	THERMAL_GENL_EVENT_CDEV_STATE_UPDATE,	/* Cdev state updated */
 	THERMAL_GENL_EVENT_TZ_GOV_CHANGE,	/* Governor policy changed  */
+	THERMAL_GENL_EVENT_CPU_CAPABILITY_CHANGE,	/* CPU capability changed */
 	__THERMAL_GENL_EVENT_MAX,
 };
 #define THERMAL_GENL_EVENT_MAX (__THERMAL_GENL_EVENT_MAX - 1)
-- 
Gitee


From ddbef6c0c3a2bd531aa7bfe545d6eb0ad6b23f76 Mon Sep 17 00:00:00 2001
From: Nathan Chancellor <nathan@kernel.org>
Date: Mon, 7 Feb 2022 09:38:29 -0700
Subject: [PATCH 074/674] thermal: netlink: Fix parameter type of
 thermal_genl_cpu_capability_event() stub

mainline inclusion
mainline-5.18
commit 345be4275cad454ae7e25884369a9c6c25e56279
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5DSOL

Intel_SIG: commit 345be4275cad thermal: netlink: Fix parameter type of
thermal_genl_cpu_capability_event() stub.
Backport for Intel HFI (Hardware Feedback Interface) support

-------------------------------------

When building with CONFIG_THERMAL_NETLINK=n, there is a spew of warnings
along the lines of:

  In file included from drivers/thermal/thermal_core.c:27:
  In file included from drivers/thermal/thermal_core.h:15:
  drivers/thermal/thermal_netlink.h:113:71: warning: declaration of 'struct cpu_capability' will not be visible outside of this function [-Wvisibility]
  static inline int thermal_genl_cpu_capability_event(int count, struct cpu_capability *caps)
                                                                        ^
  1 warning generated.

'struct cpu_capability' is not forward declared anywhere in the header.
As it turns out, this should really be 'struct thermal_genl_cpu_caps',
which silences the warning and makes the parameter types of the stub
match the full function.

Fixes: e4b1eb24ce5a ("thermal: netlink: Add a new event to notify CPU capabilities change")
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
Reviewed-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/thermal/thermal_netlink.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/thermal/thermal_netlink.h b/drivers/thermal/thermal_netlink.h
index 7439b245e41e..fd4188470d3f 100644
--- a/drivers/thermal/thermal_netlink.h
+++ b/drivers/thermal/thermal_netlink.h
@@ -110,7 +110,7 @@ static inline int thermal_genl_sampling_temp(int id, int temp)
 	return 0;
 }
 
-static inline int thermal_genl_cpu_capability_event(int count, struct cpu_capability *caps)
+static inline int thermal_genl_cpu_capability_event(int count, struct thermal_genl_cpu_caps *caps)
 {
 	return 0;
 }
-- 
Gitee


From c200f744e7896c73d9a8417c3bcf29d32eba90f4 Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang@intel.com>
Date: Tue, 7 Dec 2021 21:17:34 +0800
Subject: [PATCH 075/674] powercap: intel_rapl: support new layout of Psys
 PowerLimit Register on SPR

mainline inclusion
mainline-5.17
commit 931da6a0de5d620425af4425344259e6ff46b654
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5BED0

Intel-SIG: commit 931da6a0de5d powercap: intel_rapl: support new layout
of Psys PowerLimit Register on SPR.
Backport for SPR RAPL Psys support.

-------------------------------------

On Sapphire Rapids, the layout of the Psys domain Power Limit Register
is different from from what it was before.

Enhance the code to support the new Psys PL register layout.

Signed-off-by: Zhang Rui <rui.zhang@intel.com>
Reported-and-tested-by: Alkattan Dana <dana.alkattan@intel.com>
[ rjw: Subject and changelog edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/powercap/intel_rapl_common.c | 61 +++++++++++++++++++++++++++-
 include/linux/intel_rapl.h           |  6 +++
 2 files changed, 65 insertions(+), 2 deletions(-)

diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c
index c9e57237d778..a994c45ca42e 100644
--- a/drivers/powercap/intel_rapl_common.c
+++ b/drivers/powercap/intel_rapl_common.c
@@ -61,6 +61,20 @@
 #define PERF_STATUS_THROTTLE_TIME_MASK 0xffffffff
 #define PP_POLICY_MASK         0x1F
 
+/*
+ * SPR has different layout for Psys Domain PowerLimit registers.
+ * There are 17 bits of PL1 and PL2 instead of 15 bits.
+ * The Enable bits and TimeWindow bits are also shifted as a result.
+ */
+#define PSYS_POWER_LIMIT1_MASK       0x1FFFF
+#define PSYS_POWER_LIMIT1_ENABLE     BIT(17)
+
+#define PSYS_POWER_LIMIT2_MASK       (0x1FFFFULL<<32)
+#define PSYS_POWER_LIMIT2_ENABLE     BIT_ULL(49)
+
+#define PSYS_TIME_WINDOW1_MASK       (0x7FULL<<19)
+#define PSYS_TIME_WINDOW2_MASK       (0x7FULL<<51)
+
 /* Non HW constants */
 #define RAPL_PRIMITIVE_DERIVED       BIT(1)	/* not from raw data */
 #define RAPL_PRIMITIVE_DUMMY         BIT(2)
@@ -97,6 +111,7 @@ struct rapl_defaults {
 				    bool to_raw);
 	unsigned int dram_domain_energy_unit;
 	unsigned int psys_domain_energy_unit;
+	bool spr_psys_bits;
 };
 static struct rapl_defaults *rapl_defaults;
 
@@ -669,12 +684,51 @@ static struct rapl_primitive_info rpi[] = {
 			    RAPL_DOMAIN_REG_PERF, TIME_UNIT, 0),
 	PRIMITIVE_INFO_INIT(PRIORITY_LEVEL, PP_POLICY_MASK, 0,
 			    RAPL_DOMAIN_REG_POLICY, ARBITRARY_UNIT, 0),
+	PRIMITIVE_INFO_INIT(PSYS_POWER_LIMIT1, PSYS_POWER_LIMIT1_MASK, 0,
+			    RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0),
+	PRIMITIVE_INFO_INIT(PSYS_POWER_LIMIT2, PSYS_POWER_LIMIT2_MASK, 32,
+			    RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0),
+	PRIMITIVE_INFO_INIT(PSYS_PL1_ENABLE, PSYS_POWER_LIMIT1_ENABLE, 17,
+			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
+	PRIMITIVE_INFO_INIT(PSYS_PL2_ENABLE, PSYS_POWER_LIMIT2_ENABLE, 49,
+			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
+	PRIMITIVE_INFO_INIT(PSYS_TIME_WINDOW1, PSYS_TIME_WINDOW1_MASK, 19,
+			    RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0),
+	PRIMITIVE_INFO_INIT(PSYS_TIME_WINDOW2, PSYS_TIME_WINDOW2_MASK, 51,
+			    RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0),
 	/* non-hardware */
 	PRIMITIVE_INFO_INIT(AVERAGE_POWER, 0, 0, 0, POWER_UNIT,
 			    RAPL_PRIMITIVE_DERIVED),
 	{NULL, 0, 0, 0},
 };
 
+static enum rapl_primitives
+prim_fixups(struct rapl_domain *rd, enum rapl_primitives prim)
+{
+	if (!rapl_defaults->spr_psys_bits)
+		return prim;
+
+	if (rd->id != RAPL_DOMAIN_PLATFORM)
+		return prim;
+
+	switch (prim) {
+	case POWER_LIMIT1:
+		return PSYS_POWER_LIMIT1;
+	case POWER_LIMIT2:
+		return PSYS_POWER_LIMIT2;
+	case PL1_ENABLE:
+		return PSYS_PL1_ENABLE;
+	case PL2_ENABLE:
+		return PSYS_PL2_ENABLE;
+	case TIME_WINDOW1:
+		return PSYS_TIME_WINDOW1;
+	case TIME_WINDOW2:
+		return PSYS_TIME_WINDOW2;
+	default:
+		return prim;
+	}
+}
+
 /* Read primitive data based on its related struct rapl_primitive_info.
  * if xlate flag is set, return translated data based on data units, i.e.
  * time, energy, and power.
@@ -692,7 +746,8 @@ static int rapl_read_data_raw(struct rapl_domain *rd,
 			      enum rapl_primitives prim, bool xlate, u64 *data)
 {
 	u64 value;
-	struct rapl_primitive_info *rp = &rpi[prim];
+	enum rapl_primitives prim_fixed = prim_fixups(rd, prim);
+	struct rapl_primitive_info *rp = &rpi[prim_fixed];
 	struct reg_action ra;
 	int cpu;
 
@@ -738,7 +793,8 @@ static int rapl_write_data_raw(struct rapl_domain *rd,
 			       enum rapl_primitives prim,
 			       unsigned long long value)
 {
-	struct rapl_primitive_info *rp = &rpi[prim];
+	enum rapl_primitives prim_fixed = prim_fixups(rd, prim);
+	struct rapl_primitive_info *rp = &rpi[prim_fixed];
 	int cpu;
 	u64 bits;
 	struct reg_action ra;
@@ -981,6 +1037,7 @@ static const struct rapl_defaults rapl_defaults_spr_server = {
 	.compute_time_window = rapl_compute_time_window_core,
 	.dram_domain_energy_unit = 15300,
 	.psys_domain_energy_unit = 1000000000,
+	.spr_psys_bits = true,
 };
 
 static const struct rapl_defaults rapl_defaults_byt = {
diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h
index 50b8398ffd21..acf72c018142 100644
--- a/include/linux/intel_rapl.h
+++ b/include/linux/intel_rapl.h
@@ -58,6 +58,12 @@ enum rapl_primitives {
 	THROTTLED_TIME,
 	PRIORITY_LEVEL,
 
+	PSYS_POWER_LIMIT1,
+	PSYS_POWER_LIMIT2,
+	PSYS_PL1_ENABLE,
+	PSYS_PL2_ENABLE,
+	PSYS_TIME_WINDOW1,
+	PSYS_TIME_WINDOW2,
 	/* below are not raw primitive data */
 	AVERAGE_POWER,
 	NR_RAPL_PRIMITIVES,
-- 
Gitee


From 198e436bbfda02c58308cdacc94f66dae432ad9f Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Mon, 11 Jul 2022 22:07:02 +0800
Subject: [PATCH 076/674] vt: drop old FONT ioctls

stable inclusion
from stable-v5.10.127
commit 3acb7dc242ca25eb258493b513ef2f4b0f2a9ad1
category: bugfix
bugzilla: 187166 https://gitee.com/src-openeuler/kernel/issues/I5GKVX
CVE: CVE-2021-33656

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?h=linux-5.10.y&id=3acb7dc242ca25eb258493b513ef2f4b0f2a9ad1

--------------------------------

commit ff2047fb755d4415ec3c70ac799889371151796d upstream.

Drop support for these ioctls:
* PIO_FONT, PIO_FONTX
* GIO_FONT, GIO_FONTX
* PIO_FONTRESET

As was demonstrated by commit 90bfdeef83f1 (tty: make FONTX ioctl use
the tty pointer they were actually passed), these ioctls are not used
from userspace, as:
1) they used to be broken (set up font on current console, not the open
   one) and racy (before the commit above)
2) KDFONTOP ioctl is used for years instead

Note that PIO_FONTRESET is defunct on most systems as VGA_CONSOLE is set
on them for ages. That turns on BROKEN_GRAPHICS_PROGRAMS which makes
PIO_FONTRESET just return an error.

We are removing KD_FONT_FLAG_OLD here as it was used only by these
removed ioctls. kd.h header exists both in kernel and uapi headers, so
we can remove the kernel one completely. Everyone includeing kd.h will
now automatically get the uapi one.

There are now unused definitions of the ioctl numbers and "struct
consolefontdesc" in kd.h, but as it is a uapi header, I am not touching
these.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20210105120239.28031-8-jslaby@suse.cz
Cc: guodaxing <guodaxing@huawei.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Xie XiuQi <xiexiuqi@huawei.com>
Reviewed-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/tty/vt/vt.c       |  39 +---------
 drivers/tty/vt/vt_ioctl.c | 151 --------------------------------------
 include/linux/kd.h        |   8 --
 3 files changed, 3 insertions(+), 195 deletions(-)
 delete mode 100644 include/linux/kd.h

diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c
index a7ee1171eeb3..0a6336d54a65 100644
--- a/drivers/tty/vt/vt.c
+++ b/drivers/tty/vt/vt.c
@@ -4625,16 +4625,8 @@ static int con_font_get(struct vc_data *vc, struct console_font_op *op)
 
 	if (op->data && font.charcount > op->charcount)
 		rc = -ENOSPC;
-	if (!(op->flags & KD_FONT_FLAG_OLD)) {
-		if (font.width > op->width || font.height > op->height) 
-			rc = -ENOSPC;
-	} else {
-		if (font.width != 8)
-			rc = -EIO;
-		else if ((op->height && font.height > op->height) ||
-			 font.height > 32)
-			rc = -ENOSPC;
-	}
+	if (font.width > op->width || font.height > op->height)
+		rc = -ENOSPC;
 	if (rc)
 		goto out;
 
@@ -4662,7 +4654,7 @@ static int con_font_set(struct vc_data *vc, struct console_font_op *op)
 		return -EINVAL;
 	if (op->charcount > 512)
 		return -EINVAL;
-	if (op->width <= 0 || op->width > 32 || op->height > 32)
+	if (op->width <= 0 || op->width > 32 || !op->height || op->height > 32)
 		return -EINVAL;
 	size = (op->width+7)/8 * 32 * op->charcount;
 	if (size > max_font_size)
@@ -4672,31 +4664,6 @@ static int con_font_set(struct vc_data *vc, struct console_font_op *op)
 	if (IS_ERR(font.data))
 		return PTR_ERR(font.data);
 
-	if (!op->height) {		/* Need to guess font height [compat] */
-		int h, i;
-		u8 *charmap = font.data;
-
-		/*
-		 * If from KDFONTOP ioctl, don't allow things which can be done
-		 * in userland,so that we can get rid of this soon
-		 */
-		if (!(op->flags & KD_FONT_FLAG_OLD)) {
-			kfree(font.data);
-			return -EINVAL;
-		}
-
-		for (h = 32; h > 0; h--)
-			for (i = 0; i < op->charcount; i++)
-				if (charmap[32*i+h-1])
-					goto nonzero;
-
-		kfree(font.data);
-		return -EINVAL;
-
-	nonzero:
-		op->height = h;
-	}
-
 	font.charcount = op->charcount;
 	font.width = op->width;
 	font.height = op->height;
diff --git a/drivers/tty/vt/vt_ioctl.c b/drivers/tty/vt/vt_ioctl.c
index a9c6ea8986af..b10b86e2c17e 100644
--- a/drivers/tty/vt/vt_ioctl.c
+++ b/drivers/tty/vt/vt_ioctl.c
@@ -486,70 +486,6 @@ static int vt_k_ioctl(struct tty_struct *tty, unsigned int cmd,
 	return 0;
 }
 
-static inline int do_fontx_ioctl(struct vc_data *vc, int cmd,
-		struct consolefontdesc __user *user_cfd,
-		struct console_font_op *op)
-{
-	struct consolefontdesc cfdarg;
-	int i;
-
-	if (copy_from_user(&cfdarg, user_cfd, sizeof(struct consolefontdesc)))
-		return -EFAULT;
-
-	switch (cmd) {
-	case PIO_FONTX:
-		op->op = KD_FONT_OP_SET;
-		op->flags = KD_FONT_FLAG_OLD;
-		op->width = 8;
-		op->height = cfdarg.charheight;
-		op->charcount = cfdarg.charcount;
-		op->data = cfdarg.chardata;
-		return con_font_op(vc, op);
-
-	case GIO_FONTX:
-		op->op = KD_FONT_OP_GET;
-		op->flags = KD_FONT_FLAG_OLD;
-		op->width = 8;
-		op->height = cfdarg.charheight;
-		op->charcount = cfdarg.charcount;
-		op->data = cfdarg.chardata;
-		i = con_font_op(vc, op);
-		if (i)
-			return i;
-		cfdarg.charheight = op->height;
-		cfdarg.charcount = op->charcount;
-		if (copy_to_user(user_cfd, &cfdarg, sizeof(struct consolefontdesc)))
-			return -EFAULT;
-		return 0;
-	}
-	return -EINVAL;
-}
-
-static int vt_io_fontreset(struct vc_data *vc, struct console_font_op *op)
-{
-	int ret;
-
-	if (__is_defined(BROKEN_GRAPHICS_PROGRAMS)) {
-		/*
-		 * With BROKEN_GRAPHICS_PROGRAMS defined, the default font is
-		 * not saved.
-		 */
-		return -ENOSYS;
-	}
-
-	op->op = KD_FONT_OP_SET_DEFAULT;
-	op->data = NULL;
-	ret = con_font_op(vc, op);
-	if (ret)
-		return ret;
-
-	console_lock();
-	con_set_default_unimap(vc);
-	console_unlock();
-
-	return 0;
-}
-
 static inline int do_unimap_ioctl(int cmd, struct unimapdesc __user *user_ud,
 		bool perm, struct vc_data *vc)
 {
@@ -574,29 +510,7 @@ static inline int do_unimap_ioctl(int cmd, struct unimapdesc __user *user_ud,
 static int vt_io_ioctl(struct vc_data *vc, unsigned int cmd, void __user *up,
 		bool perm)
 {
-	struct console_font_op op;	/* used in multiple places here */
-
 	switch (cmd) {
-	case PIO_FONT:
-		if (!perm)
-			return -EPERM;
-		op.op = KD_FONT_OP_SET;
-		op.flags = KD_FONT_FLAG_OLD | KD_FONT_FLAG_DONT_RECALC;	/* Compatibility */
-		op.width = 8;
-		op.height = 0;
-		op.charcount = 256;
-		op.data = up;
-		return con_font_op(vc, &op);
-
-	case GIO_FONT:
-		op.op = KD_FONT_OP_GET;
-		op.flags = KD_FONT_FLAG_OLD;
-		op.width = 8;
-		op.height = 32;
-		op.charcount = 256;
-		op.data = up;
-		return con_font_op(vc, &op);
-
 	case PIO_CMAP:
                 if (!perm)
 			return -EPERM;
@@ -605,20 +519,6 @@ static int vt_io_ioctl(struct vc_data *vc, unsigned int cmd, void __user *up,
 	case GIO_CMAP:
                 return con_get_cmap(up);
 
-	case PIO_FONTX:
-		if (!perm)
-			return -EPERM;
-
-		fallthrough;
-	case GIO_FONTX:
-		return do_fontx_ioctl(vc, cmd, up, &op);
-
-	case PIO_FONTRESET:
-		if (!perm)
-			return -EPERM;
-
-		return vt_io_fontreset(vc, &op);
-
 	case PIO_SCRNMAP:
 		if (!perm)
 			return -EPERM;
@@ -1099,54 +999,6 @@ void vc_SAK(struct work_struct *work)
 
 #ifdef CONFIG_COMPAT
 
-struct compat_consolefontdesc {
-	unsigned short charcount;       /* characters in font (256 or 512) */
-	unsigned short charheight;      /* scan lines per character (1-32) */
-	compat_caddr_t chardata;	/* font data in expanded form */
-};
-
-static inline int
-compat_fontx_ioctl(struct vc_data *vc, int cmd,
-		   struct compat_consolefontdesc __user *user_cfd,
-		   int perm, struct console_font_op *op)
-{
-	struct compat_consolefontdesc cfdarg;
-	int i;
-
-	if (copy_from_user(&cfdarg, user_cfd, sizeof(struct compat_consolefontdesc)))
-		return -EFAULT;
-
-	switch (cmd) {
-	case PIO_FONTX:
-		if (!perm)
-			return -EPERM;
-		op->op = KD_FONT_OP_SET;
-		op->flags = KD_FONT_FLAG_OLD;
-		op->width = 8;
-		op->height = cfdarg.charheight;
-		op->charcount = cfdarg.charcount;
-		op->data = compat_ptr(cfdarg.chardata);
-		return con_font_op(vc, op);
-
-	case GIO_FONTX:
-		op->op = KD_FONT_OP_GET;
-		op->flags = KD_FONT_FLAG_OLD;
-		op->width = 8;
-		op->height = cfdarg.charheight;
-		op->charcount = cfdarg.charcount;
-		op->data = compat_ptr(cfdarg.chardata);
-		i = con_font_op(vc, op);
-		if (i)
-			return i;
-		cfdarg.charheight = op->height;
-		cfdarg.charcount = op->charcount;
-		if (copy_to_user(user_cfd, &cfdarg, sizeof(struct compat_consolefontdesc)))
-			return -EFAULT;
-		return 0;
-	}
-	return -EINVAL;
-}
-
 struct compat_console_font_op {
 	compat_uint_t op;        /* operation code KD_FONT_OP_* */
 	compat_uint_t flags;     /* KD_FONT_FLAG_* */
@@ -1223,9 +1075,6 @@ long vt_compat_ioctl(struct tty_struct *tty,
 	/*
 	 * these need special handlers for incompatible data structures
 	 */
-	case PIO_FONTX:
-	case GIO_FONTX:
-		return compat_fontx_ioctl(vc, cmd, up, perm, &op);
 
 	case KDFONTOP:
 		return compat_kdfontop_ioctl(up, perm, &op, vc);
diff --git a/include/linux/kd.h b/include/linux/kd.h
deleted file mode 100644
index b130a18f860f..000000000000
--- a/include/linux/kd.h
+++ /dev/null
@@ -1,8 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_KD_H
-#define _LINUX_KD_H
-
-#include <uapi/linux/kd.h>
-
-#define KD_FONT_FLAG_OLD		0x80000000	/* Invoked via old interface [compat] */
-#endif /* _LINUX_KD_H */
-- 
Gitee


From 039bd396a88b93edd0008cab8f0b2e597ec072cc Mon Sep 17 00:00:00 2001
From: He Sheng <hesheng@wxiat.com>
Date: Fri, 6 May 2022 09:18:15 +0800
Subject: [PATCH 077/674] sw64: remove unused header files

Sunway inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I56OLG

--------------------------------

These header files have became unused for a long time.

Signed-off-by: He Sheng <hesheng@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/include/asm/agp.h            |  19 ----
 arch/sw_64/include/asm/asm-prototypes.h |   1 -
 arch/sw_64/include/asm/compiler.h       |   7 --
 arch/sw_64/include/asm/console.h        |  11 ---
 arch/sw_64/include/asm/elf.h            |   6 +-
 arch/sw_64/include/asm/floppy.h         | 116 ------------------------
 arch/sw_64/include/asm/special_insns.h  |  20 ----
 arch/sw_64/include/uapi/asm/console.h   |  51 -----------
 8 files changed, 2 insertions(+), 229 deletions(-)
 delete mode 100644 arch/sw_64/include/asm/agp.h
 delete mode 100644 arch/sw_64/include/asm/compiler.h
 delete mode 100644 arch/sw_64/include/asm/console.h
 delete mode 100644 arch/sw_64/include/asm/floppy.h
 delete mode 100644 arch/sw_64/include/asm/special_insns.h
 delete mode 100644 arch/sw_64/include/uapi/asm/console.h

diff --git a/arch/sw_64/include/asm/agp.h b/arch/sw_64/include/asm/agp.h
deleted file mode 100644
index e9d16888910e..000000000000
--- a/arch/sw_64/include/asm/agp.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_SW64_AGP_H
-#define _ASM_SW64_AGP_H 1
-
-#include <asm/io.h>
-
-/* dummy for now */
-
-#define map_page_into_agp(page)
-#define unmap_page_from_agp(page)
-#define flush_agp_cache() mb()
-
-/* GATT allocation. Returns/accepts GATT kernel virtual address. */
-#define alloc_gatt_pages(order)		\
-	((char *)__get_free_pages(GFP_KERNEL, (order)))
-#define free_gatt_pages(table, order)	\
-	free_pages((unsigned long)(table), (order))
-
-#endif
diff --git a/arch/sw_64/include/asm/asm-prototypes.h b/arch/sw_64/include/asm/asm-prototypes.h
index 21f4f494d74d..15bad8ef6883 100644
--- a/arch/sw_64/include/asm/asm-prototypes.h
+++ b/arch/sw_64/include/asm/asm-prototypes.h
@@ -4,7 +4,6 @@
 
 #include <linux/spinlock.h>
 #include <asm/checksum.h>
-#include <asm/console.h>
 #include <asm/page.h>
 #include <asm/string.h>
 #include <linux/uaccess.h>
diff --git a/arch/sw_64/include/asm/compiler.h b/arch/sw_64/include/asm/compiler.h
deleted file mode 100644
index 9a80aa6a0ba8..000000000000
--- a/arch/sw_64/include/asm/compiler.h
+++ /dev/null
@@ -1,7 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_SW64_COMPILER_H
-#define _ASM_SW64_COMPILER_H
-
-#include <uapi/asm/compiler.h>
-
-#endif /* _ASM_SW64_COMPILER_H */
diff --git a/arch/sw_64/include/asm/console.h b/arch/sw_64/include/asm/console.h
deleted file mode 100644
index 0c01cb740bce..000000000000
--- a/arch/sw_64/include/asm/console.h
+++ /dev/null
@@ -1,11 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_SW64_CONSOLE_H
-#define _ASM_SW64_CONSOLE_H
-
-#include <uapi/asm/console.h>
-#ifndef __ASSEMBLY__
-struct crb_struct;
-extern int callback_init_done;
-extern void callback_init(void);
-#endif /* __ASSEMBLY__ */
-#endif /* _ASM_SW64_CONSOLE_H */
diff --git a/arch/sw_64/include/asm/elf.h b/arch/sw_64/include/asm/elf.h
index 150629b0b615..abecc5653eba 100644
--- a/arch/sw_64/include/asm/elf.h
+++ b/arch/sw_64/include/asm/elf.h
@@ -3,7 +3,6 @@
 #define _ASM_SW64_ELF_H
 #ifdef __KERNEL__
 #include <asm/auxvec.h>
-#include <asm/special_insns.h>
 #endif
 /* Special values for the st_other field in the symbol table.  */
 
@@ -141,11 +140,10 @@ extern int dump_elf_task_fp(elf_fpreg_t *dest, struct task_struct *task);
 
 /*
  * This yields a mask that user programs can use to figure out what
- * instruction set this CPU supports.  This is trivial on SW-64,
- * but not so on other machines.
+ * instruction set this CPU supports.
  */
 
-#define ELF_HWCAP  (~amask(-1))
+#define ELF_HWCAP	0
 
 /*
  * This yields a string that ld.so will use to load implementation
diff --git a/arch/sw_64/include/asm/floppy.h b/arch/sw_64/include/asm/floppy.h
deleted file mode 100644
index f4646d99d80c..000000000000
--- a/arch/sw_64/include/asm/floppy.h
+++ /dev/null
@@ -1,116 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Architecture specific parts of the Floppy driver
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * Copyright (C) 1995
- */
-#ifndef _ASM_SW64_FLOPPY_H
-#define _ASM_SW64_FLOPPY_H
-
-#define fd_inb(port)		inb_p(port)
-#define fd_outb(value, port)	outb_p(value, port)
-
-#define fd_enable_dma()		enable_dma(FLOPPY_DMA)
-#define fd_disable_dma()	disable_dma(FLOPPY_DMA)
-#define fd_request_dma()	request_dma(FLOPPY_DMA, "floppy")
-#define fd_free_dma()		free_dma(FLOPPY_DMA)
-#define fd_clear_dma_ff()	clear_dma_ff(FLOPPY_DMA)
-#define fd_set_dma_mode(mode)	set_dma_mode(FLOPPY_DMA, mode)
-#define fd_set_dma_addr(addr)	set_dma_addr(FLOPPY_DMA, virt_to_bus(addr))
-#define fd_set_dma_count(count)	set_dma_count(FLOPPY_DMA, count)
-#define fd_enable_irq()		enable_irq(FLOPPY_IRQ)
-#define fd_disable_irq()	disable_irq(FLOPPY_IRQ)
-#define fd_cacheflush(addr, size) /* nothing */
-#define fd_request_irq() \
-	request_irq(FLOPPY_IRQ, floppy_interrupt, 0, "floppy", NULL)
-#define fd_free_irq()		free_irq(FLOPPY_IRQ, NULL)
-
-#ifdef CONFIG_PCI
-
-#include <linux/pci.h>
-
-#define fd_dma_setup(addr, size, mode, io) \
-	sw64_fd_dma_setup(addr, size, mode, io)
-
-static inline int
-sw64_fd_dma_setup(char *addr, unsigned long size, int mode, int io)
-{
-	static unsigned long prev_size;
-	static dma_addr_t bus_addr;
-	static char *prev_addr;
-	static int prev_dir;
-	int dir;
-
-	dir = (mode != DMA_MODE_READ) ? PCI_DMA_FROMDEVICE : PCI_DMA_TODEVICE;
-
-	if (bus_addr
-		&& (addr != prev_addr || size != prev_size || dir != prev_dir)) {
-		/* different from last time -- unmap prev */
-		bus_addr = 0;
-	}
-
-	if (!bus_addr)	/* need to map it */
-		bus_addr = virt_to_bus(addr);
-
-	/* remember this one as prev */
-	prev_addr = addr;
-	prev_size = size;
-	prev_dir = dir;
-
-	fd_clear_dma_ff();
-	fd_cacheflush(addr, size);
-	fd_set_dma_mode(mode);
-	set_dma_addr(FLOPPY_DMA, bus_addr);
-	fd_set_dma_count(size);
-	virtual_dma_port = io;
-	fd_enable_dma();
-
-	return 0;
-}
-
-#endif /* CONFIG_PCI */
-
-inline void virtual_dma_init(void)
-{
-	/* Nothing to do on an sw64 */
-}
-
-static int FDC1 = 0x3f0;
-static int FDC2 = -1;
-
-/*
- * Again, the CMOS information doesn't work on the sw64..
- */
-#define FLOPPY0_TYPE 6
-#define FLOPPY1_TYPE 0
-
-#define N_FDC 2
-#define N_DRIVE 8
-
-/*
- * Most sw64s have no problems with floppy DMA crossing 64k borders,
- * except for certain ones, like XL and RUFFIAN.
- *
- * However, the test is simple and fast, and this *is* floppy, after all,
- * so we do it for all platforms, just to make sure.
- *
- * This is advantageous in other circumstances as well, as in moving
- * about the PCI DMA windows and forcing the floppy to start doing
- * scatter-gather when it never had before, and there *is* a problem
- * on that platform... ;-}
- */
-
-static inline unsigned long CROSS_64KB(void *a, unsigned long s)
-{
-	unsigned long p = (unsigned long)a;
-
-	return ((p + s - 1) ^ p) & ~0xffffUL;
-}
-
-#define EXTRA_FLOPPY_PARAMS
-
-#endif /* __ASM_SW64_FLOPPY_H */
diff --git a/arch/sw_64/include/asm/special_insns.h b/arch/sw_64/include/asm/special_insns.h
deleted file mode 100644
index 7f5a52b20444..000000000000
--- a/arch/sw_64/include/asm/special_insns.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_SW64_SPECIAL_INSNS_H
-#define _ASM_SW64_SPECIAL_INSNS_H
-
-enum amask_enum {
-	AMASK_BWX = (1UL << 0),
-	AMASK_FIX = (1UL << 1),
-	AMASK_CIX = (1UL << 2),
-	AMASK_MAX = (1UL << 8),
-	AMASK_PRECISE_TRAP = (1UL << 9),
-};
-
-#define amask(mask)						\
-({								\
-	unsigned long __amask, __input = (mask);		\
-	__asm__ ("mov %1, %0" : "=r"(__amask) : "rI"(__input));	\
-	__amask;						\
-})
-
-#endif /* _ASM_SW64_SPECIAL_INSNS_H */
diff --git a/arch/sw_64/include/uapi/asm/console.h b/arch/sw_64/include/uapi/asm/console.h
deleted file mode 100644
index a40cd7aeb31f..000000000000
--- a/arch/sw_64/include/uapi/asm/console.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _UAPI_ASM_SW64_CONSOLE_H
-#define _UAPI_ASM_SW64_CONSOLE_H
-
-/*
- * Console callback routine numbers
- */
-#define CCB_GETC		0x01
-#define CCB_PUTS		0x02
-#define CCB_RESET_TERM		0x03
-#define CCB_SET_TERM_INT	0x04
-#define CCB_SET_TERM_CTL	0x05
-#define CCB_PROCESS_KEYCODE	0x06
-#define CCB_OPEN_CONSOLE	0x07
-#define CCB_CLOSE_CONSOLE	0x08
-
-#define CCB_OPEN		0x10
-#define CCB_CLOSE		0x11
-#define CCB_IOCTL		0x12
-#define CCB_READ		0x13
-#define CCB_WRITE		0x14
-
-#define CCB_SET_ENV		0x20
-#define CCB_RESET_ENV		0x21
-#define CCB_GET_ENV		0x22
-#define CCB_SAVE_ENV		0x23
-
-#define CCB_PSWITCH		0x30
-#define CCB_BIOS_EMUL		0x32
-
-/*
- * Environment variable numbers
- */
-#define ENV_AUTO_ACTION		0x01
-#define ENV_BOOT_DEV		0x02
-#define ENV_BOOTDEF_DEV		0x03
-#define ENV_BOOTED_DEV		0x04
-#define ENV_BOOT_FILE		0x05
-#define ENV_BOOTED_FILE		0x06
-#define ENV_BOOT_OSFLAGS	0x07
-#define ENV_BOOTED_OSFLAGS	0x08
-#define ENV_BOOT_RESET		0x09
-#define ENV_DUMP_DEV		0x0A
-#define ENV_ENABLE_AUDIT	0x0B
-#define ENV_LICENSE		0x0C
-#define ENV_CHAR_SET		0x0D
-#define ENV_LANGUAGE		0x0E
-#define ENV_TTY_DEV		0x0F
-
-
-#endif /* _UAPI_ASM_SW64_CONSOLE_H */
-- 
Gitee


From 65d722507cd65bbf42f9c00d92c7016bc92968bf Mon Sep 17 00:00:00 2001
From: He Sheng <hesheng@wxiat.com>
Date: Fri, 6 May 2022 09:28:36 +0800
Subject: [PATCH 078/674] sw64: uapi: generate some uapi headers from generic
 ones

Sunway inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I56OLG

--------------------------------

These uapi headers are proved to be the same with generic version.

Herein we remove the arch-specific statfs.h and types.h to end
support for old system.

Signed-off-by: He Sheng <hesheng@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/include/uapi/asm/Kbuild        |  1 +
 arch/sw_64/include/uapi/asm/ipcbuf.h      |  7 ----
 arch/sw_64/include/uapi/asm/kvm_para.h    |  7 ----
 arch/sw_64/include/uapi/asm/msgbuf.h      | 28 ----------------
 arch/sw_64/include/uapi/asm/poll.h        |  7 ----
 arch/sw_64/include/uapi/asm/posix_types.h | 18 -----------
 arch/sw_64/include/uapi/asm/sembuf.h      | 23 -------------
 arch/sw_64/include/uapi/asm/shmbuf.h      | 39 -----------------------
 arch/sw_64/include/uapi/asm/statfs.h      |  9 ------
 arch/sw_64/include/uapi/asm/types.h       | 28 ----------------
 10 files changed, 1 insertion(+), 166 deletions(-)
 delete mode 100644 arch/sw_64/include/uapi/asm/ipcbuf.h
 delete mode 100644 arch/sw_64/include/uapi/asm/kvm_para.h
 delete mode 100644 arch/sw_64/include/uapi/asm/msgbuf.h
 delete mode 100644 arch/sw_64/include/uapi/asm/poll.h
 delete mode 100644 arch/sw_64/include/uapi/asm/posix_types.h
 delete mode 100644 arch/sw_64/include/uapi/asm/sembuf.h
 delete mode 100644 arch/sw_64/include/uapi/asm/shmbuf.h
 delete mode 100644 arch/sw_64/include/uapi/asm/statfs.h
 delete mode 100644 arch/sw_64/include/uapi/asm/types.h

diff --git a/arch/sw_64/include/uapi/asm/Kbuild b/arch/sw_64/include/uapi/asm/Kbuild
index a01bfb9600ec..15700040f138 100644
--- a/arch/sw_64/include/uapi/asm/Kbuild
+++ b/arch/sw_64/include/uapi/asm/Kbuild
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
 # UAPI Header export list
 
+generic-y += kvm_para.h
 generated-y += unistd_64.h
diff --git a/arch/sw_64/include/uapi/asm/ipcbuf.h b/arch/sw_64/include/uapi/asm/ipcbuf.h
deleted file mode 100644
index 553cdb37052d..000000000000
--- a/arch/sw_64/include/uapi/asm/ipcbuf.h
+++ /dev/null
@@ -1,7 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _UAPI_ASM_SW64_IPCBUF_H
-#define _UAPI_ASM_SW64_IPCBUF_H
-
-#include <asm-generic/ipcbuf.h>
-
-#endif
diff --git a/arch/sw_64/include/uapi/asm/kvm_para.h b/arch/sw_64/include/uapi/asm/kvm_para.h
deleted file mode 100644
index 3c0f9fa712ab..000000000000
--- a/arch/sw_64/include/uapi/asm/kvm_para.h
+++ /dev/null
@@ -1,7 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _UAPI_ASM_SW64_KVM_PARA_H
-#define _UAPI_ASM_SW64_KVM_PARA_H
-
-#include <asm-generic/kvm_para.h>
-
-#endif
diff --git a/arch/sw_64/include/uapi/asm/msgbuf.h b/arch/sw_64/include/uapi/asm/msgbuf.h
deleted file mode 100644
index b938df3664a0..000000000000
--- a/arch/sw_64/include/uapi/asm/msgbuf.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _UAPI_ASM_SW64_MSGBUF_H
-#define _UAPI_ASM_SW64_MSGBUF_H
-
-/*
- * The msqid64_ds structure for sw64 architecture.
- * Note extra padding because this structure is passed back and forth
- * between kernel and user space.
- *
- * Pad space is left for:
- * - 2 miscellaneous 64-bit values
- */
-
-struct msqid64_ds {
-	struct ipc64_perm msg_perm;
-	long		msg_stime;	/* last msgsnd time */
-	long		msg_rtime;	/* last msgrcv time */
-	long		msg_ctime;	/* last change time */
-	unsigned long	msg_cbytes;	/* current number of bytes on queue */
-	unsigned long	msg_qnum;	/* number of messages in queue */
-	unsigned long	msg_qbytes;	/* max number of bytes on queue */
-	__kernel_pid_t	msg_lspid;	/* pid of last msgsnd */
-	__kernel_pid_t	msg_lrpid;	/* last receive pid */
-	unsigned long	__unused1;
-	unsigned long	__unused2;
-};
-
-#endif /* _UAPI_ASM_SW64_MSGBUF_H */
diff --git a/arch/sw_64/include/uapi/asm/poll.h b/arch/sw_64/include/uapi/asm/poll.h
deleted file mode 100644
index 114d0344e377..000000000000
--- a/arch/sw_64/include/uapi/asm/poll.h
+++ /dev/null
@@ -1,7 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _UAPI_ASM_SW64_POLL_H
-#define _UAPI_ASM_SW64_POLL_H
-
-#include <asm-generic/poll.h>
-
-#endif
diff --git a/arch/sw_64/include/uapi/asm/posix_types.h b/arch/sw_64/include/uapi/asm/posix_types.h
deleted file mode 100644
index 182741aaa06e..000000000000
--- a/arch/sw_64/include/uapi/asm/posix_types.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _UAPI_ASM_SW64_POSIX_TYPES_H
-#define _UAPI_ASM_SW64_POSIX_TYPES_H
-
-/*
- * This file is generally used by user-level software, so you need to
- * be a little careful about namespace pollution etc.  Also, we cannot
- * assume GCC is being used.
- */
-
-typedef unsigned long	__kernel_ino_t;
-#define __kernel_ino_t	__kernel_ino_t
-
-typedef unsigned long	__kernel_sigset_t;	/* at least 32 bits */
-
-#include <asm-generic/posix_types.h>
-
-#endif /* _UAPI_ASM_SW64_POSIX_TYPES_H */
diff --git a/arch/sw_64/include/uapi/asm/sembuf.h b/arch/sw_64/include/uapi/asm/sembuf.h
deleted file mode 100644
index 08b0876e739c..000000000000
--- a/arch/sw_64/include/uapi/asm/sembuf.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _UAPI_ASM_SW64_SEMBUF_H
-#define _UAPI_ASM_SW64_SEMBUF_H
-
-/*
- * The semid64_ds structure for sw64 architecture.
- * Note extra padding because this structure is passed back and forth
- * between kernel and user space.
- *
- * Pad space is left for:
- * - 2 miscellaneous 64-bit values
- */
-
-struct semid64_ds {
-	struct ipc64_perm sem_perm;		/* permissions .. see ipc.h */
-	long		sem_otime;		/* last semop time */
-	long		sem_ctime;		/* last change time */
-	unsigned long	sem_nsems;		/* no. of semaphores in array */
-	unsigned long	__unused1;
-	unsigned long	__unused2;
-};
-
-#endif /* _UAPI_ASM_SW64_SEMBUF_H */
diff --git a/arch/sw_64/include/uapi/asm/shmbuf.h b/arch/sw_64/include/uapi/asm/shmbuf.h
deleted file mode 100644
index 4572337bee02..000000000000
--- a/arch/sw_64/include/uapi/asm/shmbuf.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _UAPI_ASM_SW64_SHMBUF_H
-#define _UAPI_ASM_SW64_SHMBUF_H
-
-/*
- * The shmid64_ds structure for sw64 architecture.
- * Note extra padding because this structure is passed back and forth
- * between kernel and user space.
- *
- * Pad space is left for:
- * - 2 miscellaneous 64-bit values
- */
-
-struct shmid64_ds {
-	struct ipc64_perm	shm_perm;	/* operation perms */
-	size_t			shm_segsz;	/* size of segment (bytes) */
-	long			shm_atime;	/* last attach time */
-	long			shm_dtime;	/* last detach time */
-	long			shm_ctime;	/* last change time */
-	__kernel_pid_t		shm_cpid;	/* pid of creator */
-	__kernel_pid_t		shm_lpid;	/* pid of last operator */
-	unsigned long		shm_nattch;	/* no. of current attaches */
-	unsigned long		__unused1;
-	unsigned long		__unused2;
-};
-
-struct shminfo64 {
-	unsigned long	shmmax;
-	unsigned long	shmmin;
-	unsigned long	shmmni;
-	unsigned long	shmseg;
-	unsigned long	shmall;
-	unsigned long	__unused1;
-	unsigned long	__unused2;
-	unsigned long	__unused3;
-	unsigned long	__unused4;
-};
-
-#endif /* _UAPI_ASM_SW64_SHMBUF_H */
diff --git a/arch/sw_64/include/uapi/asm/statfs.h b/arch/sw_64/include/uapi/asm/statfs.h
deleted file mode 100644
index b92d719238d1..000000000000
--- a/arch/sw_64/include/uapi/asm/statfs.h
+++ /dev/null
@@ -1,9 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _UAPI_ASM_SW64_STATFS_H
-#define _UAPI_ASM_SW64_STATFS_H
-
-#include <linux/types.h>
-
-#include <asm-generic/statfs.h>
-
-#endif
diff --git a/arch/sw_64/include/uapi/asm/types.h b/arch/sw_64/include/uapi/asm/types.h
deleted file mode 100644
index 750b5181c3de..000000000000
--- a/arch/sw_64/include/uapi/asm/types.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _UAPI_ASM_SW64_TYPES_H
-#define _UAPI_ASM_SW64_TYPES_H
-
-/*
- * This file is never included by application software unless
- * explicitly requested (e.g., via linux/types.h) in which case the
- * application is Linux specific so (user-) name space pollution is
- * not a major issue.  However, for interoperability, libraries still
- * need to be careful to avoid a name clashes.
- */
-
-/*
- * This is here because we used to use l64 for sw64 and we don't want
- * to impact user mode with our change to ll64 in the kernel.
- *
- * However, some user programs are fine with this.  They can
- * flag __SANE_USERSPACE_TYPES__ to get int-ll64.h here.
- */
-#ifndef __KERNEL__
-#ifndef __SANE_USERSPACE_TYPES__
-#include <asm-generic/int-l64.h>
-#else
-#include <asm-generic/int-ll64.h>
-#endif /* __SANE_USERSPACE_TYPES__ */
-#endif /* __KERNEL__ */
-
-#endif /* _UAPI_ASM_SW64_TYPES_H */
-- 
Gitee


From e2c89aaa6b99586c72d55024e4c42c0a331864f6 Mon Sep 17 00:00:00 2001
From: He Sheng <hesheng@wxiat.com>
Date: Fri, 6 May 2022 09:30:28 +0800
Subject: [PATCH 079/674] sw64: uapi: include generic param.h

Sunway inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I56OLG

--------------------------------

Most macro definitions are the same with generic version.

Signed-off-by: He Sheng <hesheng@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/include/uapi/asm/param.h | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/arch/sw_64/include/uapi/asm/param.h b/arch/sw_64/include/uapi/asm/param.h
index 16c4934c937e..d38e8202dd97 100644
--- a/arch/sw_64/include/uapi/asm/param.h
+++ b/arch/sw_64/include/uapi/asm/param.h
@@ -2,15 +2,8 @@
 #ifndef _UAPI_ASM_SW64_PARAM_H
 #define _UAPI_ASM_SW64_PARAM_H
 
-#define HZ		100
-
 #define EXEC_PAGESIZE	8192
 
-#ifndef NOGROUP
-#define NOGROUP		(-1)
-#endif
-
-#define MAXHOSTNAMELEN	64	/* max length of hostname */
-
+#include <asm-generic/param.h>
 
 #endif /* _UAPI_ASM_SW64_PARAM_H */
-- 
Gitee


From 92f19473b28196492cbfd8153f8d48f2297a562a Mon Sep 17 00:00:00 2001
From: He Sheng <hesheng@wxiat.com>
Date: Fri, 6 May 2022 09:57:27 +0800
Subject: [PATCH 080/674] sw64: kapi: remove unused header-y from Kbuild

Sunway inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I56OLG

--------------------------------

Signed-off-by: He Sheng <hesheng@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/include/asm/Kbuild | 19 ++++---------------
 1 file changed, 4 insertions(+), 15 deletions(-)

diff --git a/arch/sw_64/include/asm/Kbuild b/arch/sw_64/include/asm/Kbuild
index e276ba366e68..a94a978edf3e 100644
--- a/arch/sw_64/include/asm/Kbuild
+++ b/arch/sw_64/include/asm/Kbuild
@@ -1,23 +1,12 @@
 # SPDX-License-Identifier: GPL-2.0
-header-y += compiler.h
-header-y += console.h
-header-y += fpu.h
-header-y += gentrap.h
-header-y += hmcall.h
-header-y += reg.h
-header-y += regdef.h
-header-y += sysinfo.h
-header-y += page.h
-header-y += elf.h
 
-generated-y += syscall_table.h
+generic-y += clkdev.h
 generic-y += export.h
 generic-y += kvm_types.h
-generic-y += rwsem.h
-
+generic-y += mcs_spinlock.h
 generic-y += qrwlock.h
 generic-y += qspinlock.h
-generic-y += mcs_spinlock.h
-generic-y += clkdev.h
+generic-y += rwsem.h
 generic-y += scatterlist.h
 generic-y += user.h
+generated-y += syscall_table.h
-- 
Gitee


From 927e8e16d604582a095d7c0ae9714f8050f0205d Mon Sep 17 00:00:00 2001
From: He Sheng <hesheng@wxiat.com>
Date: Fri, 6 May 2022 10:00:14 +0800
Subject: [PATCH 081/674] sw64: kapi: generate some kapi headers from generic
 ones

Sunway inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I56OLG

--------------------------------

These kapi headers are proved to be the same with generic version.

Signed-off-by: He Sheng <hesheng@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/include/asm/Kbuild              |   7 +-
 arch/sw_64/include/asm/div64.h             |   7 --
 arch/sw_64/include/asm/emergency-restart.h |   7 --
 arch/sw_64/include/asm/exec.h              |   7 --
 arch/sw_64/include/asm/irq_regs.h          |   7 --
 arch/sw_64/include/asm/kmap_types.h        |  15 ---
 arch/sw_64/include/asm/local.h             | 125 ---------------------
 arch/sw_64/include/asm/local64.h           |   7 --
 arch/sw_64/include/asm/param.h             |  11 --
 arch/sw_64/include/asm/parport.h           |  19 ----
 arch/sw_64/include/asm/preempt.h           |   7 --
 arch/sw_64/include/asm/seccomp.h           |  15 ---
 arch/sw_64/include/asm/sections.h          |   8 --
 arch/sw_64/include/asm/segment.h           |   7 --
 arch/sw_64/include/asm/serial.h            |  16 ---
 arch/sw_64/include/asm/shmparam.h          |   7 --
 arch/sw_64/include/asm/trace_clock.h       |  10 --
 arch/sw_64/include/asm/types.h             |   7 --
 arch/sw_64/include/asm/unaligned.h         |  12 --
 19 files changed, 6 insertions(+), 295 deletions(-)
 delete mode 100644 arch/sw_64/include/asm/div64.h
 delete mode 100644 arch/sw_64/include/asm/emergency-restart.h
 delete mode 100644 arch/sw_64/include/asm/exec.h
 delete mode 100644 arch/sw_64/include/asm/irq_regs.h
 delete mode 100644 arch/sw_64/include/asm/kmap_types.h
 delete mode 100644 arch/sw_64/include/asm/local.h
 delete mode 100644 arch/sw_64/include/asm/local64.h
 delete mode 100644 arch/sw_64/include/asm/param.h
 delete mode 100644 arch/sw_64/include/asm/parport.h
 delete mode 100644 arch/sw_64/include/asm/preempt.h
 delete mode 100644 arch/sw_64/include/asm/seccomp.h
 delete mode 100644 arch/sw_64/include/asm/sections.h
 delete mode 100644 arch/sw_64/include/asm/segment.h
 delete mode 100644 arch/sw_64/include/asm/serial.h
 delete mode 100644 arch/sw_64/include/asm/shmparam.h
 delete mode 100644 arch/sw_64/include/asm/trace_clock.h
 delete mode 100644 arch/sw_64/include/asm/types.h
 delete mode 100644 arch/sw_64/include/asm/unaligned.h

diff --git a/arch/sw_64/include/asm/Kbuild b/arch/sw_64/include/asm/Kbuild
index a94a978edf3e..d08f0b08918e 100644
--- a/arch/sw_64/include/asm/Kbuild
+++ b/arch/sw_64/include/asm/Kbuild
@@ -3,10 +3,15 @@
 generic-y += clkdev.h
 generic-y += export.h
 generic-y += kvm_types.h
+generic-y += local64.h
 generic-y += mcs_spinlock.h
+generic-y += param.h
 generic-y += qrwlock.h
 generic-y += qspinlock.h
 generic-y += rwsem.h
-generic-y += scatterlist.h
+generic-y += seccomp.h
+generic-y += segment.h
+generic-y += types.h
 generic-y += user.h
+
 generated-y += syscall_table.h
diff --git a/arch/sw_64/include/asm/div64.h b/arch/sw_64/include/asm/div64.h
deleted file mode 100644
index 306581407ba5..000000000000
--- a/arch/sw_64/include/asm/div64.h
+++ /dev/null
@@ -1,7 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_SW64_DIV64_H
-#define _ASM_SW64_DIV64_H
-
-#include <asm-generic/div64.h>
-
-#endif
diff --git a/arch/sw_64/include/asm/emergency-restart.h b/arch/sw_64/include/asm/emergency-restart.h
deleted file mode 100644
index fabb33ebf0eb..000000000000
--- a/arch/sw_64/include/asm/emergency-restart.h
+++ /dev/null
@@ -1,7 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_SW64_EMERGENCY_RESTART_H
-#define _ASM_SW64_EMERGENCY_RESTART_H
-
-#include <asm-generic/emergency-restart.h>
-
-#endif /* _ASM_SW64_EMERGENCY_RESTART_H */
diff --git a/arch/sw_64/include/asm/exec.h b/arch/sw_64/include/asm/exec.h
deleted file mode 100644
index 4a9cb71c5c47..000000000000
--- a/arch/sw_64/include/asm/exec.h
+++ /dev/null
@@ -1,7 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_SW64_EXEC_H
-#define _ASM_SW64_EXEC_H
-
-#define arch_align_stack(x) (x)
-
-#endif /* _ASM_SW64_EXEC_H */
diff --git a/arch/sw_64/include/asm/irq_regs.h b/arch/sw_64/include/asm/irq_regs.h
deleted file mode 100644
index bba48f36a40f..000000000000
--- a/arch/sw_64/include/asm/irq_regs.h
+++ /dev/null
@@ -1,7 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_SW64_IRQ_REGS_H
-#define _ASM_SW64_IRQ_REGS_H
-
-#include <asm-generic/irq_regs.h>
-
-#endif
diff --git a/arch/sw_64/include/asm/kmap_types.h b/arch/sw_64/include/asm/kmap_types.h
deleted file mode 100644
index 8e86b08dee94..000000000000
--- a/arch/sw_64/include/asm/kmap_types.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_SW64_KMAP_TYPES_H
-#define _ASM_SW64_KMAP_TYPES_H
-
-/* Dummy header just to define km_type. */
-
-#ifdef CONFIG_DEBUG_HIGHMEM
-#define  __WITH_KM_FENCE
-#endif
-
-#include <asm-generic/kmap_types.h>
-
-#undef __WITH_KM_FENCE
-
-#endif
diff --git a/arch/sw_64/include/asm/local.h b/arch/sw_64/include/asm/local.h
deleted file mode 100644
index 9144600f641d..000000000000
--- a/arch/sw_64/include/asm/local.h
+++ /dev/null
@@ -1,125 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_SW64_LOCAL_H
-#define _ASM_SW64_LOCAL_H
-
-#include <linux/percpu.h>
-#include <linux/atomic.h>
-
-typedef struct {
-	atomic_long_t a;
-} local_t;
-
-#define LOCAL_INIT(i)	{ ATOMIC_LONG_INIT(i) }
-#define local_read(l)	atomic_long_read(&(l)->a)
-#define local_set(l, i)	atomic_long_set(&(l)->a, (i))
-#define local_inc(l)	atomic_long_inc(&(l)->a)
-#define local_dec(l)	atomic_long_dec(&(l)->a)
-#define local_add(i, l)	atomic_long_add((i), (&(l)->a))
-#define local_sub(i, l)	atomic_long_sub((i), (&(l)->a))
-
-static inline long local_add_return(long i, local_t *l)
-{
-	long temp1, temp2, result, addr;
-
-	__asm__ __volatile__(
-#ifdef CONFIG_LOCK_MEMB
-	"	memb\n"
-#endif
-	"	ldi  %4, %2\n"
-	"1:	lldl %0, 0(%4)\n"
-	"	ldi  %1, 1\n"
-	"	wr_f %1\n"
-	"	addl %0, %5, %3\n"
-	"	addl %0, %5, %0\n"
-#ifdef CONFIG_LOCK_FIXUP
-	"	memb\n"
-#endif
-	"	lstl %0, 0(%4)\n"
-	"	rd_f %0\n"
-	"	beq %0, 2f\n"
-	".subsection 2\n"
-	"2:	br 1b\n"
-	".previous"
-	: "=&r" (temp1), "=&r" (temp2), "=m" (l->a.counter),
-	  "=&r" (result), "=&r" (addr)
-	: "Ir" (i), "m" (l->a.counter) : "memory");
-	return result;
-}
-
-static inline long local_sub_return(long i, local_t *l)
-{
-	long temp1, temp2, result, addr;
-
-	__asm__ __volatile__(
-#ifdef CONFIG_LOCK_MEMB
-	"	memb\n"
-#endif
-	"	ldi  %4, %2\n"
-	"1:	lldl %0, 0(%4)\n"
-	"	ldi  %1, 1\n"
-	"	wr_f %1\n"
-	"	subl %0, %5, %3\n"
-	"	subl %0, %5, %0\n"
-#ifdef CONFIG_LOCK_FIXUP
-	"       memb\n"
-#endif
-	"	lstl %0, 0(%4)\n"
-	"	rd_f %0\n"
-	"	beq %0, 2f\n"
-	".subsection 2\n"
-	"2:	br 1b\n"
-	".previous"
-	: "=&r" (temp1), "=&r" (temp2), "=m" (l->a.counter),
-	  "=&r" (result), "=&r" (addr)
-	: "Ir" (i), "m" (l->a.counter) : "memory");
-	return result;
-}
-
-#define local_cmpxchg(l, o, n) \
-	(cmpxchg_local(&((l)->a.counter), (o), (n)))
-#define local_xchg(l, n) (xchg_local(&((l)->a.counter), (n)))
-
-/**
- * local_add_unless - add unless the number is a given value
- * @l: pointer of type local_t
- * @a: the amount to add to l...
- * @u: ...unless l is equal to u.
- *
- * Atomically adds @a to @l, so long as it was not @u.
- * Returns non-zero if @l was not @u, and zero otherwise.
- */
-#define local_add_unless(l, a, u)				\
-({								\
-	long c, old;						\
-	c = local_read(l);					\
-	for (;;) {						\
-		if (unlikely(c == (u)))				\
-			break;					\
-		old = local_cmpxchg((l), c, c + (a));	\
-		if (likely(old == c))				\
-			break;					\
-		c = old;					\
-	}							\
-	c != (u);						\
-})
-#define local_inc_not_zero(l) local_add_unless((l), 1, 0)
-
-#define local_add_negative(a, l) (local_add_return((a), (l)) < 0)
-
-#define local_dec_return(l) local_sub_return(1, (l))
-
-#define local_inc_return(l) local_add_return(1, (l))
-
-#define local_sub_and_test(i, l) (local_sub_return((i), (l)) == 0)
-
-#define local_inc_and_test(l) (local_add_return(1, (l)) == 0)
-
-#define local_dec_and_test(l) (local_sub_return(1, (l)) == 0)
-
-/* Verify if faster than atomic ops */
-#define __local_inc(l)		((l)->a.counter++)
-#define __local_dec(l)		((l)->a.counter++)
-#define __local_add(i, l)	((l)->a.counter += (i))
-#define __local_sub(i, l)	((l)->a.counter -= (i))
-
-#endif /* _ASM_SW64_LOCAL_H */
diff --git a/arch/sw_64/include/asm/local64.h b/arch/sw_64/include/asm/local64.h
deleted file mode 100644
index 4278133cd8fa..000000000000
--- a/arch/sw_64/include/asm/local64.h
+++ /dev/null
@@ -1,7 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_SW64_LOCAL64_H
-#define _ASM_SW64_LOCAL64_H
-
-#include <asm-generic/local64.h>
-
-#endif
diff --git a/arch/sw_64/include/asm/param.h b/arch/sw_64/include/asm/param.h
deleted file mode 100644
index 49c5d03a3370..000000000000
--- a/arch/sw_64/include/asm/param.h
+++ /dev/null
@@ -1,11 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_SW64_PARAM_H
-#define _ASM_SW64_PARAM_H
-
-#include <uapi/asm/param.h>
-
-#undef HZ
-#define HZ		CONFIG_HZ
-#define USER_HZ		100
-#define CLOCKS_PER_SEC	USER_HZ	/* frequency at which times() counts */
-#endif /* _ASM_SW64_PARAM_H */
diff --git a/arch/sw_64/include/asm/parport.h b/arch/sw_64/include/asm/parport.h
deleted file mode 100644
index 82b9a219b797..000000000000
--- a/arch/sw_64/include/asm/parport.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * parport.h: platform-specific PC-style parport initialisation
- *
- * Copyright (C) 1999, 2000  Tim Waugh <tim@cyberelk.demon.co.uk>
- *
- * This file should only be included by drivers/parport/parport_pc.c.
- */
-
-#ifndef _ASM_SW64_PARPORT_H
-#define _ASM_SW64_PARPORT_H
-
-static int parport_pc_find_isa_ports(int autoirq, int autodma);
-static int parport_pc_find_nonpci_ports(int autoirq, int autodma)
-{
-	return parport_pc_find_isa_ports(autoirq, autodma);
-}
-
-#endif /* !(_ASM_SW64_PARPORT_H) */
diff --git a/arch/sw_64/include/asm/preempt.h b/arch/sw_64/include/asm/preempt.h
deleted file mode 100644
index dc6643a43766..000000000000
--- a/arch/sw_64/include/asm/preempt.h
+++ /dev/null
@@ -1,7 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_SW64_PREEMPT_H
-#define _ASM_SW64_PREEMPT_H
-
-#include <asm-generic/preempt.h>
-
-#endif /* _ASM_SW64_PREEMPT_H */
diff --git a/arch/sw_64/include/asm/seccomp.h b/arch/sw_64/include/asm/seccomp.h
deleted file mode 100644
index db2f298862c3..000000000000
--- a/arch/sw_64/include/asm/seccomp.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * arch/sw_64/include/asm/seccomp.h
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef _ASM_SW64_SECCOMP_H
-#define _ASM_SW64_SECCOMP_H
-
-#include <asm/unistd.h>
-#include <asm-generic/seccomp.h>
-
-#endif /* _ASM_SW64_SECCOMP_H */
diff --git a/arch/sw_64/include/asm/sections.h b/arch/sw_64/include/asm/sections.h
deleted file mode 100644
index 37dab4fde720..000000000000
--- a/arch/sw_64/include/asm/sections.h
+++ /dev/null
@@ -1,8 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_SW64_SECTIONS_H
-#define _ASM_SW64_SECTIONS_H
-
-/* nothing to see, move along */
-#include <asm-generic/sections.h>
-
-#endif
diff --git a/arch/sw_64/include/asm/segment.h b/arch/sw_64/include/asm/segment.h
deleted file mode 100644
index dc90357765e5..000000000000
--- a/arch/sw_64/include/asm/segment.h
+++ /dev/null
@@ -1,7 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_SW64_SEGMENT_H
-#define _ASM_SW64_SEGMENT_H
-
-/* Only here because we have some old header files that expect it.. */
-
-#endif
diff --git a/arch/sw_64/include/asm/serial.h b/arch/sw_64/include/asm/serial.h
deleted file mode 100644
index 059e603642b9..000000000000
--- a/arch/sw_64/include/asm/serial.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_SW64_SERIAL_H
-#define _ASM_SW64_SERIAL_H
-
-#define BASE_BAUD (1843200 / 16)
-
-/* Standard COM flags (except for COM4, because of the 8514 problem) */
-#ifdef CONFIG_SERIAL_8250_DETECT_IRQ
-#define STD_COM_FLAGS (UPF_BOOT_AUTOCONF | UPF_SKIP_TEST | UPF_AUTO_IRQ)
-#define STD_COM4_FLAGS (UPF_BOOT_AUTOCONF | UPF_AUTO_IRQ)
-#else
-#define STD_COM_FLAGS (UPF_BOOT_AUTOCONF | UPF_SKIP_TEST)
-#define STD_COM4_FLAGS UPF_BOOT_AUTOCONF
-#endif
-
-#endif /* _ASM_SW64_SERIAL_H */
diff --git a/arch/sw_64/include/asm/shmparam.h b/arch/sw_64/include/asm/shmparam.h
deleted file mode 100644
index 15f71533b1ed..000000000000
--- a/arch/sw_64/include/asm/shmparam.h
+++ /dev/null
@@ -1,7 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_SW64_SHMPARAM_H
-#define _ASM_SW64_SHMPARAM_H
-
-#define	SHMLBA PAGE_SIZE		 /* attach addr a multiple of this */
-
-#endif /* _ASM_SW64_SHMPARAM_H */
diff --git a/arch/sw_64/include/asm/trace_clock.h b/arch/sw_64/include/asm/trace_clock.h
deleted file mode 100644
index 57324215a837..000000000000
--- a/arch/sw_64/include/asm/trace_clock.h
+++ /dev/null
@@ -1,10 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_SW64_TRACE_CLOCK_H
-#define _ASM_SW64_TRACE_CLOCK_H
-
-#include <linux/compiler.h>
-#include <linux/types.h>
-
-#define ARCH_TRACE_CLOCKS
-
-#endif  /* _ASM_SW64_TRACE_CLOCK_H */
diff --git a/arch/sw_64/include/asm/types.h b/arch/sw_64/include/asm/types.h
deleted file mode 100644
index 37d626269a02..000000000000
--- a/arch/sw_64/include/asm/types.h
+++ /dev/null
@@ -1,7 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_SW64_TYPES_H
-#define _ASM_SW64_TYPES_H
-
-#include <asm-generic/int-ll64.h>
-
-#endif /* _ASM_SW64_TYPES_H */
diff --git a/arch/sw_64/include/asm/unaligned.h b/arch/sw_64/include/asm/unaligned.h
deleted file mode 100644
index 91fdff923ce5..000000000000
--- a/arch/sw_64/include/asm/unaligned.h
+++ /dev/null
@@ -1,12 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_SW64_UNALIGNED_H
-#define _ASM_SW64_UNALIGNED_H
-
-#include <linux/unaligned/le_struct.h>
-#include <linux/unaligned/be_byteshift.h>
-#include <linux/unaligned/generic.h>
-
-#define get_unaligned __get_unaligned_le
-#define put_unaligned __put_unaligned_le
-
-#endif /* _ASM_SW64_UNALIGNED_H */
-- 
Gitee


From 6519c5195084b7326aa8a26584bb2e2b4d82f19c Mon Sep 17 00:00:00 2001
From: He Sheng <hesheng@wxiat.com>
Date: Fri, 6 May 2022 10:05:54 +0800
Subject: [PATCH 082/674] sw64: move ucontext.h to uapi

Sunway inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I56OLG

--------------------------------

It's an important user interface which should be put in uapi.

Signed-off-by: He Sheng <hesheng@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/include/{ => uapi}/asm/ucontext.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)
 rename arch/sw_64/include/{ => uapi}/asm/ucontext.h (56%)

diff --git a/arch/sw_64/include/asm/ucontext.h b/arch/sw_64/include/uapi/asm/ucontext.h
similarity index 56%
rename from arch/sw_64/include/asm/ucontext.h
rename to arch/sw_64/include/uapi/asm/ucontext.h
index d40eebe988ef..c5d6e24e3e5f 100644
--- a/arch/sw_64/include/asm/ucontext.h
+++ b/arch/sw_64/include/uapi/asm/ucontext.h
@@ -1,6 +1,6 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_SW64_UCONTEXT_H
-#define _ASM_SW64_UCONTEXT_H
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_ASM_SW64_UCONTEXT_H
+#define _UAPI_ASM_SW64_UCONTEXT_H
 
 struct ucontext {
 	unsigned long		uc_flags;
@@ -11,4 +11,4 @@ struct ucontext {
 	sigset_t		uc_sigmask;	/* mask last for extensibility */
 };
 
-#endif /* _ASM_SW64_UCONTEXT_H */
+#endif /* _UAPI_ASM_SW64_UCONTEXT_H */
-- 
Gitee


From dc6f31f83df3a0ff2a9da93ffb61aee3e58ae852 Mon Sep 17 00:00:00 2001
From: He Sheng <hesheng@wxiat.com>
Date: Fri, 6 May 2022 10:30:09 +0800
Subject: [PATCH 083/674] sw64: kapi: remove redudant SMP_CACHE_BYTES

Sunway inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I56OLG

--------------------------------

SMP_CACHE_BYTES is also defined in linux/cache.h, so remove the
replicated one.

Signed-off-by: He Sheng <hesheng@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/include/asm/cache.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/arch/sw_64/include/asm/cache.h b/arch/sw_64/include/asm/cache.h
index a59a74110884..1dca2e2e04a4 100644
--- a/arch/sw_64/include/asm/cache.h
+++ b/arch/sw_64/include/asm/cache.h
@@ -5,9 +5,7 @@
 #ifndef _ASM_SW64_CACHE_H
 #define _ASM_SW64_CACHE_H
 
-#define L1_CACHE_BYTES		128
 #define L1_CACHE_SHIFT		7
-
-#define SMP_CACHE_BYTES		L1_CACHE_BYTES
+#define L1_CACHE_BYTES		(1 << L1_CACHE_SHIFT)
 
 #endif
-- 
Gitee


From cf28d11eff5d9edbb5330eed2e2bb3e75e4bc066 Mon Sep 17 00:00:00 2001
From: He Sheng <hesheng@wxiat.com>
Date: Fri, 6 May 2022 10:45:56 +0800
Subject: [PATCH 084/674] sw64: kapi: remove unimplemented IPLs

Sunway inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I56OLG

--------------------------------

There are only two IPLs so far on sw64.

Signed-off-by: He Sheng <hesheng@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/include/asm/irqflags.h | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/arch/sw_64/include/asm/irqflags.h b/arch/sw_64/include/asm/irqflags.h
index 6101b6ad2e99..b4440f25a51d 100644
--- a/arch/sw_64/include/asm/irqflags.h
+++ b/arch/sw_64/include/asm/irqflags.h
@@ -5,14 +5,6 @@
 #include <asm/hmcall.h>
 
 #define IPL_MIN		0
-#define IPL_SW0		1
-#define IPL_SW1		2
-#define IPL_DEV0	3
-#define IPL_DEV1	4
-#define IPL_TIMER	5
-#define IPL_PERF	6
-#define IPL_POWERFAIL	6
-#define IPL_MCHECK	7
 #define IPL_MAX		7
 
 #define getipl()		(rdps() & 7)
-- 
Gitee


From 211af39aab2e322ad14fd7727977946b29fff3ba Mon Sep 17 00:00:00 2001
From: He Sheng <hesheng@wxiat.com>
Date: Fri, 6 May 2022 11:09:36 +0800
Subject: [PATCH 085/674] sw64: kapi: include generic modules.h

Sunway inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I56OLG

--------------------------------

This patch removes replicated macros which have been defined in
generic version.

Signed-off-by: He Sheng <hesheng@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/include/asm/module.h | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/arch/sw_64/include/asm/module.h b/arch/sw_64/include/asm/module.h
index 55e6e333585f..d1663aab4097 100644
--- a/arch/sw_64/include/asm/module.h
+++ b/arch/sw_64/include/asm/module.h
@@ -2,18 +2,12 @@
 #ifndef _ASM_SW64_MODULE_H
 #define _ASM_SW64_MODULE_H
 
+#include <asm-generic/module.h>
+
 struct mod_arch_specific {
 	unsigned int gotsecindex;
 };
 
-#define Elf_Sym		Elf64_Sym
-#define Elf_Shdr	Elf64_Shdr
-#define Elf_Ehdr	Elf64_Ehdr
-#define Elf_Phdr	Elf64_Phdr
-#define Elf_Dyn		Elf64_Dyn
-#define Elf_Rel		Elf64_Rel
-#define Elf_Rela	Elf64_Rela
-
 #define ARCH_SHF_SMALL	SHF_SW64_GPREL
 
 #ifdef MODULE
-- 
Gitee


From fda97bdcd80c5dbced89f64ad340c18987fa39fb Mon Sep 17 00:00:00 2001
From: He Sheng <hesheng@wxiat.com>
Date: Fri, 6 May 2022 14:15:52 +0800
Subject: [PATCH 086/674] sw64: remove VGA_HOSE things

Sunway inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I56OLG

--------------------------------

SW64 has nothing to do with CONFIG_VGA_HOSE.

Signed-off-by: He Sheng <hesheng@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/Kconfig           |  8 --------
 arch/sw_64/include/asm/vga.h | 31 -------------------------------
 2 files changed, 39 deletions(-)

diff --git a/arch/sw_64/Kconfig b/arch/sw_64/Kconfig
index 472a916fd93e..8c5e87cb9b84 100644
--- a/arch/sw_64/Kconfig
+++ b/arch/sw_64/Kconfig
@@ -896,12 +896,4 @@ source "drivers/idle/Kconfig"
 
 endmenu
 
-# DUMMY_CONSOLE may be defined in drivers/video/console/Kconfig
-# but we also need it if VGA_HOSE is set
-config DUMMY_CONSOLE
-	bool
-	depends on VGA_HOSE
-	default y
-
-
 source "arch/sw_64/kvm/Kconfig"
diff --git a/arch/sw_64/include/asm/vga.h b/arch/sw_64/include/asm/vga.h
index 28adb8b8b7f1..7268b5cd4bd8 100644
--- a/arch/sw_64/include/asm/vga.h
+++ b/arch/sw_64/include/asm/vga.h
@@ -49,37 +49,6 @@ extern void scr_memcpyw(u16 *d, const u16 *s, unsigned int count);
 #define vga_readb(a)		readb((u8 __iomem *)(a))
 #define vga_writeb(v, a)	writeb(v, (u8 __iomem *)(a))
 
-#ifdef CONFIG_VGA_HOSE
-#include <linux/ioport.h>
-#include <linux/pci.h>
-
-extern struct pci_controller *pci_vga_hose;
-
-#define __is_port_vga(a) \
-	(((a) >= 0x3b0) && ((a) < 0x3e0) && \
-	((a) != 0x3b3) && ((a) != 0x3d3))
-
-#define __is_mem_vga(a) \
-	(((a) >= 0xa0000) && ((a) <= 0xc0000))
-
-#define FIXUP_IOADDR_VGA(a) do { \
-	if (pci_vga_hose && __is_port_vga(a)) \
-		(a) += pci_vga_hose->io_space->start; \
-} while (0)
-
-#define FIXUP_MEMADDR_VGA(a) do { \
-	if (pci_vga_hose && __is_mem_vga(a)) \
-		(a) += pci_vga_hose->mem_space->start; \
-} while (0)
-
-#else /* CONFIG_VGA_HOSE */
-#define pci_vga_hose 0
-#define __is_port_vga(a) 0
-#define __is_mem_vga(a) 0
-#define FIXUP_IOADDR_VGA(a)
-#define FIXUP_MEMADDR_VGA(a)
-#endif /* CONFIG_VGA_HOSE */
-
 #define VGA_MAP_MEM(x, s)	((unsigned long)ioremap(x, s))
 
 #endif
-- 
Gitee


From cfc38dc4ccccb5fd218eeb560bde21a42f381b91 Mon Sep 17 00:00:00 2001
From: He Sheng <hesheng@wxiat.com>
Date: Sat, 7 May 2022 15:00:06 +0800
Subject: [PATCH 087/674] sw64: clean up unused pci iounmap operation

Sunway inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I56OLG

--------------------------------

For sw64, IO address is mapped by direct mapping without pages.
It doesn't have to do anything to unmap PCI MMIO. The result is
that __is_mmio helper can be removed.

Signed-off-by: He Sheng <hesheng@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/include/asm/io.h | 8 --------
 arch/sw_64/kernel/pci.c     | 4 ----
 2 files changed, 12 deletions(-)

diff --git a/arch/sw_64/include/asm/io.h b/arch/sw_64/include/asm/io.h
index 6796c64f94ae..cb6c7c1f4a6b 100644
--- a/arch/sw_64/include/asm/io.h
+++ b/arch/sw_64/include/asm/io.h
@@ -218,14 +218,6 @@ static inline int __is_ioaddr(unsigned long addr)
 
 #define __is_ioaddr(a)  __is_ioaddr((unsigned long)(a))
 
-static inline int __is_mmio(const volatile void __iomem *xaddr)
-{
-	unsigned long addr = (unsigned long)xaddr;
-
-	return (addr & 0x100000000UL) == 0;
-}
-
-
 
 #define ioread16be(p) be16_to_cpu(ioread16(p))
 #define ioread32be(p) be32_to_cpu(ioread32(p))
diff --git a/arch/sw_64/kernel/pci.c b/arch/sw_64/kernel/pci.c
index 44264e3da18f..77bbd1b68176 100644
--- a/arch/sw_64/kernel/pci.c
+++ b/arch/sw_64/kernel/pci.c
@@ -358,12 +358,8 @@ asmlinkage long sys_pciconfig_iobase(long which, unsigned long bus, unsigned lon
 	return -EOPNOTSUPP;
 }
 
-/* Destroy an __iomem token.  Not copied from lib/iomap.c.  */
-
 void pci_iounmap(struct pci_dev *dev, void __iomem *addr)
 {
-	if (__is_mmio(addr))
-		iounmap(addr);
 }
 EXPORT_SYMBOL(pci_iounmap);
 
-- 
Gitee


From 751e450b08b35f491121fa7f977f1234d1366a12 Mon Sep 17 00:00:00 2001
From: He Sheng <hesheng@wxiat.com>
Date: Tue, 10 May 2022 13:50:33 +0800
Subject: [PATCH 088/674] sw64: kapi: use generic vga.h

Sunway inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I56OLG

--------------------------------

The old arch-specific vga.h takes no effect now, so use the
generic version. As a result, the arch-specific scr_memcpyw
interface and __is_ioaddr helper can be removed too.

Signed-off-by: He Sheng <hesheng@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/include/asm/io.h  |  8 ------
 arch/sw_64/include/asm/vga.h | 54 ------------------------------------
 arch/sw_64/lib/iomap.c       | 37 ------------------------
 3 files changed, 99 deletions(-)
 delete mode 100644 arch/sw_64/include/asm/vga.h

diff --git a/arch/sw_64/include/asm/io.h b/arch/sw_64/include/asm/io.h
index cb6c7c1f4a6b..e8e80c761467 100644
--- a/arch/sw_64/include/asm/io.h
+++ b/arch/sw_64/include/asm/io.h
@@ -211,14 +211,6 @@ static inline void __iounmap(volatile void __iomem *addr)
 
 #define iounmap				__iounmap
 
-static inline int __is_ioaddr(unsigned long addr)
-{
-	return addr >= (PAGE_OFFSET | IO_BASE);
-}
-
-#define __is_ioaddr(a)  __is_ioaddr((unsigned long)(a))
-
-
 #define ioread16be(p) be16_to_cpu(ioread16(p))
 #define ioread32be(p) be32_to_cpu(ioread32(p))
 #define iowrite16be(v, p) iowrite16(cpu_to_be16(v), (p))
diff --git a/arch/sw_64/include/asm/vga.h b/arch/sw_64/include/asm/vga.h
deleted file mode 100644
index 7268b5cd4bd8..000000000000
--- a/arch/sw_64/include/asm/vga.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- *	Access to VGA videoram
- *
- *	(c) 1998 Martin Mares <mj@ucw.cz>
- */
-
-#ifndef _ASM_SW64_VGA_H
-#define _ASM_SW64_VGA_H
-
-#include <asm/io.h>
-
-#define VT_BUF_HAVE_RW
-#define VT_BUF_HAVE_MEMSETW
-#define VT_BUF_HAVE_MEMCPYW
-
-static inline void scr_writew(u16 val, volatile u16 *addr)
-{
-	if (__is_ioaddr(addr))
-		__raw_writew(val, (volatile u16 __iomem *) addr);
-	else
-		*addr = val;
-}
-
-static inline u16 scr_readw(volatile const u16 *addr)
-{
-	if (__is_ioaddr(addr))
-		return __raw_readw((volatile const u16 __iomem *) addr);
-	else
-		return *addr;
-}
-
-static inline void scr_memsetw(u16 *s, u16 c, unsigned int count)
-{
-	if (__is_ioaddr(s))
-		memsetw_io((u16 __iomem *) s, c, count);
-	else
-		memsetw(s, c, count);
-}
-
-/* Do not trust that the usage will be correct; analyze the arguments.  */
-extern void scr_memcpyw(u16 *d, const u16 *s, unsigned int count);
-
-/*
- * ??? These are currently only used for downloading character sets.  As
- * such, they don't need memory barriers.  Is this all they are intended
- * to be used for?
- */
-#define vga_readb(a)		readb((u8 __iomem *)(a))
-#define vga_writeb(v, a)	writeb(v, (u8 __iomem *)(a))
-
-#define VGA_MAP_MEM(x, s)	((unsigned long)ioremap(x, s))
-
-#endif
diff --git a/arch/sw_64/lib/iomap.c b/arch/sw_64/lib/iomap.c
index 39e3d5498ae6..6fb96a7d7119 100644
--- a/arch/sw_64/lib/iomap.c
+++ b/arch/sw_64/lib/iomap.c
@@ -457,43 +457,6 @@ void _memset_c_io(volatile void __iomem *to, unsigned long c, long count)
 }
 EXPORT_SYMBOL(_memset_c_io);
 
-/*
- * A version of memcpy used by the vga console routines to move data around
- * arbitrarily between screen and main memory.
- */
-
-void
-scr_memcpyw(u16 *d, const u16 *s, unsigned int count)
-{
-	const u16 __iomem *ios = (const u16 __iomem *) s;
-	u16 __iomem *iod = (u16 __iomem *) d;
-	int s_isio = __is_ioaddr(s);
-	int d_isio = __is_ioaddr(d);
-	u16 tmp;
-
-	if (s_isio) {
-		if (d_isio) {
-			/*
-			 * FIXME: Should handle unaligned ops and
-			 * operation widening.
-			 */
-
-			count /= 2;
-			while (count--) {
-				tmp = __raw_readw(ios++);
-				__raw_writew(tmp, iod++);
-			}
-		} else
-			memcpy_fromio(d, ios, count);
-	} else {
-		if (d_isio)
-			memcpy_toio(iod, s, count);
-		else
-			memcpy(d, s, count);
-	}
-}
-EXPORT_SYMBOL(scr_memcpyw);
-
 void __iomem *ioport_map(unsigned long port, unsigned int size)
 {
 	return ioportmap(port);
-- 
Gitee


From b5601cff08accb45befdd523b35337c96550c547 Mon Sep 17 00:00:00 2001
From: He Sheng <hesheng@wxiat.com>
Date: Tue, 10 May 2022 14:21:21 +0800
Subject: [PATCH 089/674] sw64: remove unused IO_CONCAT

Sunway inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I56OLG

--------------------------------

IO interface of chipset is wrapped as a hook in sw64_platform_ops.

Signed-off-by: He Sheng <hesheng@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/include/asm/io.h | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/arch/sw_64/include/asm/io.h b/arch/sw_64/include/asm/io.h
index e8e80c761467..3138665133e1 100644
--- a/arch/sw_64/include/asm/io.h
+++ b/arch/sw_64/include/asm/io.h
@@ -64,13 +64,6 @@ static inline void * __deprecated bus_to_virt(unsigned long address)
 }
 #define isa_bus_to_virt bus_to_virt
 
-/*
- * There are different chipsets to interface the sw64 CPUs to the world.
- */
-
-#define IO_CONCAT(a, b)		_IO_CONCAT(a, b)
-#define _IO_CONCAT(a, b)	a ## _ ## b
-
 #include <asm/sw64io.h>
 
 /*
-- 
Gitee


From a75d3cc4a8a08692bdfd9037c0ea97bbd8cdcd9c Mon Sep 17 00:00:00 2001
From: He Sheng <hesheng@wxiat.com>
Date: Tue, 10 May 2022 15:34:03 +0800
Subject: [PATCH 090/674] sw64: do not include sw64io.h in io.h

Sunway inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I56OLG

--------------------------------

We consider io.h as a basic header which should be decoupled with
platforms and chipsets, so it's better to remove sw64io.h from
io.h, and do something necessary to make it build ok.

Signed-off-by: He Sheng <hesheng@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/chip/chip3/i2c-lib.c    |  2 ++
 arch/sw_64/include/asm/io.h        | 10 ----------
 arch/sw_64/include/asm/irq_impl.h  |  2 ++
 arch/sw_64/include/asm/sw64_init.h |  1 +
 arch/sw_64/kernel/pci_impl.h       |  2 ++
 arch/sw_64/kernel/proto.h          |  1 +
 arch/sw_64/lib/iomap.c             |  3 ++-
 7 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/arch/sw_64/chip/chip3/i2c-lib.c b/arch/sw_64/chip/chip3/i2c-lib.c
index ddf0a187ab5a..39b815022e69 100644
--- a/arch/sw_64/chip/chip3/i2c-lib.c
+++ b/arch/sw_64/chip/chip3/i2c-lib.c
@@ -19,6 +19,8 @@
 #include <linux/errno.h>
 #include <linux/fb.h>
 
+#include <asm/sw64io.h>
+
 #define CPLD_BUSNR	 2
 
 #ifndef _I2C_DEBUG_FLAG_
diff --git a/arch/sw_64/include/asm/io.h b/arch/sw_64/include/asm/io.h
index 3138665133e1..fc11cb82f5b5 100644
--- a/arch/sw_64/include/asm/io.h
+++ b/arch/sw_64/include/asm/io.h
@@ -64,8 +64,6 @@ static inline void * __deprecated bus_to_virt(unsigned long address)
 }
 #define isa_bus_to_virt bus_to_virt
 
-#include <asm/sw64io.h>
-
 /*
  * Generic IO read/write.  These perform native-endian accesses.
  */
@@ -177,14 +175,6 @@ extern void		outb(u8 b, unsigned long port);
 extern void		outw(u16 b, unsigned long port);
 extern void		outl(u32 b, unsigned long port);
 
-/*
- * Mapping from port numbers to __iomem space is pretty easy.
- */
-static inline void __iomem *ioportmap(unsigned long addr)
-{
-	return sw64_platform->ioportmap(addr);
-}
-
 static inline void __iomem *__ioremap(phys_addr_t addr, size_t size,
 				      pgprot_t prot)
 {
diff --git a/arch/sw_64/include/asm/irq_impl.h b/arch/sw_64/include/asm/irq_impl.h
index 713267077142..3679793d8b65 100644
--- a/arch/sw_64/include/asm/irq_impl.h
+++ b/arch/sw_64/include/asm/irq_impl.h
@@ -11,6 +11,8 @@
 #include <linux/irq.h>
 #include <linux/profile.h>
 
+#include <asm/sw64io.h>
+
 #define SW64_PCIE0_INT_BASE 17
 #define SW64_PCIE0_MSI_BASE 21
 
diff --git a/arch/sw_64/include/asm/sw64_init.h b/arch/sw_64/include/asm/sw64_init.h
index 15842d22e5ba..2d9140605d0b 100644
--- a/arch/sw_64/include/asm/sw64_init.h
+++ b/arch/sw_64/include/asm/sw64_init.h
@@ -5,6 +5,7 @@
 #include <linux/cpu.h>
 #include <linux/pci.h>
 
+#include <asm/sw64io.h>
 
 struct sw64_early_init_ops {
 	void (*setup_core_start)(struct cpumask *cpumask);
diff --git a/arch/sw_64/kernel/pci_impl.h b/arch/sw_64/kernel/pci_impl.h
index 8e541f28f4ce..6025145cb1c5 100644
--- a/arch/sw_64/kernel/pci_impl.h
+++ b/arch/sw_64/kernel/pci_impl.h
@@ -6,6 +6,8 @@
 #ifndef _SW64_KERNEL_PCI_IMPL_H
 #define	_SW64_KERNEL_PCI_IMPL_H
 
+#include <asm/sw64io.h>
+
 struct pci_dev;
 struct pci_controller;
 
diff --git a/arch/sw_64/kernel/proto.h b/arch/sw_64/kernel/proto.h
index 1a729a8f21c3..9c99baaaf1d0 100644
--- a/arch/sw_64/kernel/proto.h
+++ b/arch/sw_64/kernel/proto.h
@@ -5,6 +5,7 @@
 #include <linux/interrupt.h>
 #include <linux/io.h>
 #include <asm/pgtable.h>
+#include <asm/sw64io.h>
 
 /* ptrace.c */
 extern int ptrace_set_bpt(struct task_struct *child);
diff --git a/arch/sw_64/lib/iomap.c b/arch/sw_64/lib/iomap.c
index 6fb96a7d7119..3a8d879ef070 100644
--- a/arch/sw_64/lib/iomap.c
+++ b/arch/sw_64/lib/iomap.c
@@ -6,6 +6,7 @@
 #include <linux/module.h>
 
 #include <asm/io.h>
+#include <asm/platform.h>
 
 /*
  * Here comes the sw64 implementation of the IOMAP interfaces.
@@ -459,7 +460,7 @@ EXPORT_SYMBOL(_memset_c_io);
 
 void __iomem *ioport_map(unsigned long port, unsigned int size)
 {
-	return ioportmap(port);
+	return sw64_platform->ioportmap(port);
 }
 EXPORT_SYMBOL(ioport_map);
 
-- 
Gitee


From 5e50a69788ed5bb9a512b098a3e1c7620c705e47 Mon Sep 17 00:00:00 2001
From: Zhou Qihang <zhouqihang@wxiat.com>
Date: Wed, 11 May 2022 10:53:06 +0800
Subject: [PATCH 091/674] sw64: iommu: work around iova mapping on pci bars

Sunway inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5G5T3

--------------------------------

Since the RCs of SW64 chipset do not support Peer-to-Peer DMA, we used
to return -EINVAL error on iova mampping on PCI BARs. However, it will
cause driver loading errors when QEMU calls vfio module to passthrough
PCI devices with this kind of BARs, shown as follows:
> qemu-system-sw64: VFIO_MAP_DMA: -22
> qemu-system-sw64: vfio_dma_map(0x121c28020, 0x8800c0000000, 0x40000,
  0x4165f21b2000) = -22 (Invalid argument)
> qemu: hardware error: vfio: DMA mapping failed, unable to continue

To fix this problem, this patch will return success on the iova mapping
of PCI BARs that pretends to support this feature.

Signed-off-by: Zhou Qihang <zhouqihang@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 drivers/iommu/sw64/sunway_iommu.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/iommu/sw64/sunway_iommu.c b/drivers/iommu/sw64/sunway_iommu.c
index 8b851e0a0c20..5797adf8fbc5 100644
--- a/drivers/iommu/sw64/sunway_iommu.c
+++ b/drivers/iommu/sw64/sunway_iommu.c
@@ -1509,6 +1509,9 @@ sunway_iommu_iova_to_phys(struct iommu_domain *dom, dma_addr_t iova)
 	struct sunway_iommu_domain *sdomain = to_sunway_domain(dom);
 	unsigned long paddr, grn;
 
+	if (iova > SW64_BAR_ADDRESS)
+		return iova;
+
 	paddr = fetch_pte(sdomain, iova, PTE_LEVEL2_VAL);
 
 	if ((paddr & SW64_IOMMU_ENTRY_VALID) == 0)
@@ -1544,7 +1547,7 @@ sunway_iommu_map(struct iommu_domain *dom, unsigned long iova,
 	 * to avoid VFIO trying to map pci config space.
 	 */
 	if (iova > SW64_BAR_ADDRESS)
-		return -EINVAL;
+		return 0;
 
 	mutex_lock(&sdomain->api_lock);
 	ret = sunway_iommu_map_page(sdomain, iova, paddr, page_size);
-- 
Gitee


From fb2723595a7bee3a69596b60ce8038d5cee85384 Mon Sep 17 00:00:00 2001
From: Zhao Yihan <zhaoyihan@wxiat.com>
Date: Wed, 11 May 2022 15:13:59 +0800
Subject: [PATCH 092/674] sw64: map address by OR operation in __va()

Sunway inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I56OLG

--------------------------------

Signed-off-by: Zhao Yihan <zhaoyihan@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/include/asm/page.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/sw_64/include/asm/page.h b/arch/sw_64/include/asm/page.h
index 6e17d5e437c5..363785c9f082 100644
--- a/arch/sw_64/include/asm/page.h
+++ b/arch/sw_64/include/asm/page.h
@@ -46,7 +46,7 @@ extern unsigned long __phys_addr(unsigned long);
 #endif
 
 #define __pa(x)			__phys_addr((unsigned long)(x))
-#define __va(x)			((void *)((unsigned long) (x) + PAGE_OFFSET))
+#define __va(x)			((void *)((unsigned long) (x) | PAGE_OFFSET))
 #define virt_to_page(kaddr)	pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
 #define virt_addr_valid(kaddr)	pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
 
-- 
Gitee


From 447e5cc179c5a24a1e99eb103737523f8b549137 Mon Sep 17 00:00:00 2001
From: He Sheng <hesheng@wxiat.com>
Date: Thu, 12 May 2022 11:10:01 +0800
Subject: [PATCH 093/674] sw64: read host IO registers with rdio64 hmcall

Sunway inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I56OLG

--------------------------------

The hmcall rdio64 can access IOR directly with its PA.

Signed-off-by: He Sheng <hesheng@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/chip/chip3/chip.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/sw_64/chip/chip3/chip.c b/arch/sw_64/chip/chip3/chip.c
index 4d2f99cc6402..704588d7c3e1 100644
--- a/arch/sw_64/chip/chip3/chip.c
+++ b/arch/sw_64/chip/chip3/chip.c
@@ -54,7 +54,7 @@ static struct clocksource clocksource_longtime = {
 static u64 read_vtime(struct clocksource *cs)
 {
 	u64 result;
-	unsigned long vtime_addr = PAGE_OFFSET | IO_BASE | LONG_TIME;
+	unsigned long vtime_addr = IO_BASE | LONG_TIME;
 
 	result = rdio64(vtime_addr);
 	return result;
-- 
Gitee


From 96259d7012ceb2b857d2f53c9921498365bd212e Mon Sep 17 00:00:00 2001
From: He Sheng <hesheng@wxiat.com>
Date: Thu, 12 May 2022 11:22:07 +0800
Subject: [PATCH 094/674] sw64: map logical address with __va()

Sunway inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5G5YB

--------------------------------

Although kmem and IO space are accessed with direct mapping on sw64,
it's not a standard method to or PAGE_OFFSET and access it directly
everywhere. This patch maps PA to VA with __va(). Besides, we change
type of some variables to pointer to make the code clearer.

Signed-off-by: He Sheng <hesheng@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/chip/chip3/chip.c           |  4 +-
 arch/sw_64/chip/chip3/i2c-lib.c        | 24 +++++------
 arch/sw_64/include/asm/pci.h           |  4 +-
 arch/sw_64/include/asm/sw64io.h        | 56 +++++++++++++-------------
 arch/sw_64/kernel/dup_print.c          | 10 +++--
 arch/sw_64/kernel/pci.c                |  5 +--
 arch/sw_64/kvm/kvm-sw64.c              | 11 +++--
 arch/sw_64/platform/platform_xuelang.c |  2 +-
 8 files changed, 59 insertions(+), 57 deletions(-)

diff --git a/arch/sw_64/chip/chip3/chip.c b/arch/sw_64/chip/chip3/chip.c
index 704588d7c3e1..acd25fe99669 100644
--- a/arch/sw_64/chip/chip3/chip.c
+++ b/arch/sw_64/chip/chip3/chip.c
@@ -480,8 +480,8 @@ static void chip3_hose_init(struct pci_controller *hose)
 
 	hose->dense_mem_base = pci_io_base;
 	hose->dense_io_base = pci_io_base | PCI_LEGACY_IO;
-	hose->ep_config_space_base = PAGE_OFFSET | pci_io_base | PCI_EP_CFG;
-	hose->rc_config_space_base = PAGE_OFFSET | pci_io_base | PCI_RC_CFG;
+	hose->ep_config_space_base = __va(pci_io_base | PCI_EP_CFG);
+	hose->rc_config_space_base = __va(pci_io_base | PCI_RC_CFG);
 
 	hose->mem_space->start = pci_io_base + PCI_32BIT_MEMIO;
 	hose->mem_space->end = hose->mem_space->start + PCI_32BIT_MEMIO_SIZE - 1;
diff --git a/arch/sw_64/chip/chip3/i2c-lib.c b/arch/sw_64/chip/chip3/i2c-lib.c
index 39b815022e69..b3dcdcd32735 100644
--- a/arch/sw_64/chip/chip3/i2c-lib.c
+++ b/arch/sw_64/chip/chip3/i2c-lib.c
@@ -96,7 +96,7 @@ enum i2c_bus_operation {
 	I2C_BUS_WRITE,
 };
 
-static uint64_t m_i2c_base_address;
+static void __iomem *m_i2c_base_address;
 
 /*
  * This function get I2Cx controller base address
@@ -104,18 +104,18 @@ static uint64_t m_i2c_base_address;
  * @param i2c_controller_index  Bus Number of I2C controller.
  * @return I2C BAR.
  */
-uint64_t get_i2c_bar_addr(uint8_t i2c_controller_index)
+void __iomem *get_i2c_bar_addr(uint8_t i2c_controller_index)
 {
-	uint64_t base_addr = 0;
-
-	if (i2c_controller_index == 0)
-		base_addr = PAGE_OFFSET | IO_BASE | IIC0_BASE;
-	else if (i2c_controller_index == 1)
-		base_addr = PAGE_OFFSET | IO_BASE | IIC1_BASE;
-	else if (i2c_controller_index == 2)
-		base_addr = PAGE_OFFSET | IO_BASE | IIC2_BASE;
-
-	return base_addr;
+	switch (i2c_controller_index) {
+	case 0:
+		return __va(IO_BASE | IIC0_BASE);
+	case 1:
+		return __va(IO_BASE | IIC1_BASE);
+	case 2:
+		return __va(IO_BASE | IIC2_BASE);
+	default:
+		return NULL;
+	}
 }
 
 void write_cpu_i2c_controller(uint64_t offset, uint32_t data)
diff --git a/arch/sw_64/include/asm/pci.h b/arch/sw_64/include/asm/pci.h
index ed875e0c3162..a90f80152470 100644
--- a/arch/sw_64/include/asm/pci.h
+++ b/arch/sw_64/include/asm/pci.h
@@ -34,8 +34,8 @@ struct pci_controller {
 	unsigned long dense_io_base;
 
 	/* This one's for the kernel only.  It's in KSEG somewhere.  */
-	unsigned long ep_config_space_base;
-	unsigned long rc_config_space_base;
+	void __iomem *ep_config_space_base;
+	void __iomem *rc_config_space_base;
 
 	unsigned long index;
 	unsigned long node;
diff --git a/arch/sw_64/include/asm/sw64io.h b/arch/sw_64/include/asm/sw64io.h
index 7c032070acf0..e66aba66932b 100644
--- a/arch/sw_64/include/asm/sw64io.h
+++ b/arch/sw_64/include/asm/sw64io.h
@@ -11,20 +11,20 @@ extern void setup_chip_clocksource(void);
 #endif
 
 #define MK_RC_CFG(nid, idx) \
-	(PAGE_OFFSET | SW64_PCI_IO_BASE((nid), (idx)) | PCI_RC_CFG)
+	(SW64_PCI_IO_BASE((nid), (idx)) | PCI_RC_CFG)
 #define MK_PIU_IOR0(nid, idx) \
-	(PAGE_OFFSET | SW64_PCI_IO_BASE((nid), (idx)) | PCI_IOR0_BASE)
+	(SW64_PCI_IO_BASE((nid), (idx)) | PCI_IOR0_BASE)
 #define MK_PIU_IOR1(nid, idx) \
-	(PAGE_OFFSET | SW64_PCI_IO_BASE((nid), (idx)) | PCI_IOR1_BASE)
+	(SW64_PCI_IO_BASE((nid), (idx)) | PCI_IOR1_BASE)
 
 static inline  unsigned int
-read_rc_conf(unsigned long node, unsigned long rc_index,
-		unsigned int conf_offset)
+read_rc_conf(unsigned long node, unsigned long rc,
+		unsigned int offset)
 {
-	unsigned long addr;
+	void __iomem *addr;
 	unsigned int value;
 
-	addr = MK_RC_CFG(node, rc_index) | conf_offset;
+	addr = __va(MK_RC_CFG(node, rc) | offset);
 	value = *(volatile unsigned int *)addr;
 	mb();
 
@@ -32,24 +32,24 @@ read_rc_conf(unsigned long node, unsigned long rc_index,
 }
 
 static inline void
-write_rc_conf(unsigned long node, unsigned long rc_index,
-		unsigned int conf_offset, unsigned int data)
+write_rc_conf(unsigned long node, unsigned long rc,
+		unsigned int offset, unsigned int data)
 {
-	unsigned long addr;
+	void __iomem *addr;
 
-	addr = MK_RC_CFG(node, rc_index) | conf_offset;
+	addr = __va(MK_RC_CFG(node, rc) | offset);
 	*(unsigned int *)addr = data;
 	mb();
 }
 
 static inline  unsigned long
-read_piu_ior0(unsigned long node, unsigned long rc_index,
+read_piu_ior0(unsigned long node, unsigned long rc,
 		unsigned int reg)
 {
-	unsigned long addr;
+	void __iomem *addr;
 	unsigned long value;
 
-	addr = MK_PIU_IOR0(node, rc_index) + reg;
+	addr = __va(MK_PIU_IOR0(node, rc) + reg);
 	value = *(volatile unsigned long __iomem *)addr;
 	mb();
 
@@ -57,23 +57,24 @@ read_piu_ior0(unsigned long node, unsigned long rc_index,
 }
 
 static inline void
-write_piu_ior0(unsigned long node, unsigned long rc_index,
+write_piu_ior0(unsigned long node, unsigned long rc,
 		unsigned int reg, unsigned long data)
 {
-	unsigned long addr;
+	void __iomem *addr;
 
-	addr = MK_PIU_IOR0(node, rc_index) + reg;
+	addr = __va(MK_PIU_IOR0(node, rc) + reg);
 	*(unsigned long __iomem *)addr = data;
 	mb();
 }
 
 static inline  unsigned long
-read_piu_ior1(unsigned long node, unsigned long rc_index,
+read_piu_ior1(unsigned long node, unsigned long rc,
 		unsigned int reg)
 {
-	unsigned long addr, value;
+	void __iomem *addr;
+	unsigned long value;
 
-	addr = MK_PIU_IOR1(node, rc_index) + reg;
+	addr = __va(MK_PIU_IOR1(node, rc) + reg);
 	value = *(volatile unsigned long __iomem *)addr;
 	mb();
 
@@ -81,12 +82,12 @@ read_piu_ior1(unsigned long node, unsigned long rc_index,
 }
 
 static inline void
-write_piu_ior1(unsigned long node, unsigned long rc_index,
+write_piu_ior1(unsigned long node, unsigned long rc,
 		unsigned int reg, unsigned long data)
 {
-	unsigned long addr;
+	void __iomem *addr;
 
-	addr = MK_PIU_IOR1(node, rc_index) + reg;
+	addr = __va(MK_PIU_IOR1(node, rc) + reg);
 	*(volatile unsigned long __iomem *)addr = data;
 	mb();
 }
@@ -94,9 +95,10 @@ write_piu_ior1(unsigned long node, unsigned long rc_index,
 static inline unsigned long
 sw64_io_read(unsigned long node, unsigned long reg)
 {
-	unsigned long addr, value;
+	void __iomem *addr;
+	unsigned long value;
 
-	addr = PAGE_OFFSET | SW64_IO_BASE(node) | reg;
+	addr = __va(SW64_IO_BASE(node) | reg);
 	value = *(volatile unsigned long __iomem *)addr;
 	mb();
 
@@ -106,9 +108,9 @@ sw64_io_read(unsigned long node, unsigned long reg)
 static inline void
 sw64_io_write(unsigned long node, unsigned long reg, unsigned long data)
 {
-	unsigned long addr;
+	void __iomem *addr;
 
-	addr = PAGE_OFFSET | SW64_IO_BASE(node) | reg;
+	addr = __va(SW64_IO_BASE(node) | reg);
 	*(volatile unsigned long __iomem *)addr = data;
 	mb();
 }
diff --git a/arch/sw_64/kernel/dup_print.c b/arch/sw_64/kernel/dup_print.c
index 1aa7710b5092..02639f40a4bc 100644
--- a/arch/sw_64/kernel/dup_print.c
+++ b/arch/sw_64/kernel/dup_print.c
@@ -4,6 +4,7 @@
 #include <linux/spinlock.h>
 
 #include <asm/chip3_io.h>
+#include <asm/io.h>
 
 #ifdef CONFIG_SW64_RRK
 
@@ -18,7 +19,7 @@ unsigned long sw64_printk_offset;
  * For output the kernel message on the console
  * with full-system emulator.
  */
-#define QEMU_PRINTF_BUFF_BASE	(IO_BASE | MCU_BASE | 0x40000UL | PAGE_OFFSET)
+#define QEMU_PRINTF_BUFF_BASE	(IO_BASE | MCU_BASE | 0x40000UL)
 
 int sw64_printk(const char *fmt, va_list args)
 {
@@ -38,9 +39,10 @@ int sw64_printk(const char *fmt, va_list args)
 	} else {
 		printed_len += vscnprintf(sw64_printk_buf, 1024, fmt, args);
 		if (is_in_emul()) {
-			unsigned long write_addr = QEMU_PRINTF_BUFF_BASE;
-			*(unsigned long *)write_addr = (unsigned long)((((unsigned long)sw64_printk_buf) & 0xffffffffUL)
-					| ((unsigned long)printed_len << 32));
+			void __iomem *addr = __va(QEMU_PRINTF_BUFF_BASE);
+			u64 data = ((u64)sw64_printk_buf & 0xffffffffUL)
+					| ((u64)printed_len << 32);
+			*(u64 *)addr = data;
 		}
 	}
 	sw64_printk_offset += printed_len;
diff --git a/arch/sw_64/kernel/pci.c b/arch/sw_64/kernel/pci.c
index 77bbd1b68176..401ee0b781be 100644
--- a/arch/sw_64/kernel/pci.c
+++ b/arch/sw_64/kernel/pci.c
@@ -398,7 +398,7 @@ int sw6_pcie_read_rc_cfg(struct pci_bus *bus, unsigned int devfn,
 {
 	u32 data;
 	struct pci_controller *hose = bus->sysdata;
-	void __iomem *cfg_iobase = (void *)hose->rc_config_space_base;
+	void __iomem *cfg_iobase = hose->rc_config_space_base;
 
 	if (IS_ENABLED(CONFIG_PCI_DEBUG))
 		pr_debug("rc read addr:%px bus %d, devfn %#x, where %#x size=%d\t",
@@ -549,9 +549,8 @@ static void __iomem *sw6_pcie_map_bus(struct pci_bus *bus,
 		return NULL;
 
 	relbus = (bus->number << 24) | (devfn << 16) | where;
-	relbus |= PCI_EP_CFG;
 
-	cfg_iobase = (void *)(hose->ep_config_space_base | relbus);
+	cfg_iobase = hose->ep_config_space_base + relbus;
 
 	if (IS_ENABLED(CONFIG_PCI_DEBUG))
 		pr_debug("addr:%px bus %d, devfn %d, where %d\n",
diff --git a/arch/sw_64/kvm/kvm-sw64.c b/arch/sw_64/kvm/kvm-sw64.c
index d651d26a957a..839ee83d57d5 100644
--- a/arch/sw_64/kvm/kvm-sw64.c
+++ b/arch/sw_64/kvm/kvm-sw64.c
@@ -311,7 +311,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 	kvm->arch.host_phys_addr = (u64)addr;
 	kvm->arch.size = round_up(mem->memory_size, 8<<20);
 
-	memset((void *)(PAGE_OFFSET + addr), 0, 0x2000000);
+	memset(__va(addr), 0, 0x2000000);
 
 	return 0;
 }
@@ -344,7 +344,7 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
 	memset(&vcpu->arch.irqs_pending, 0, sizeof(vcpu->arch.irqs_pending));
 
 	if (vcpu->vcpu_id == 0)
-		memset((void *)(PAGE_OFFSET + addr), 0, 0x2000000);
+		memset(__va(addr), 0, 0x2000000);
 
 	return 0;
 }
@@ -432,18 +432,17 @@ void _debug_printk_vcpu(struct kvm_vcpu *vcpu)
 {
 	unsigned long pc = vcpu->arch.regs.pc;
 	unsigned long offset = vcpu->kvm->arch.host_phys_addr;
-	unsigned long pc_phys = PAGE_OFFSET | ((pc & 0x7fffffffUL) + offset);
+	unsigned int *pc_phys = __va((pc & 0x7fffffffUL) + offset);
 	unsigned int insn;
 	int opc, ra, disp16;
 
-	insn = *(unsigned int *)pc_phys;
-
+	insn = *pc_phys;
 	opc = (insn >> 26) & 0x3f;
 	ra = (insn >> 21) & 0x1f;
 	disp16 = insn & 0xffff;
 
 	if (opc == 0x06 && disp16 == 0x1000) /* RD_F */
-		pr_info("vcpu exit: pc = %#lx (%#lx), insn[%x] : rd_f r%d [%#lx]\n",
+		pr_info("vcpu exit: pc = %#lx (%px), insn[%x] : rd_f r%d [%#lx]\n",
 				pc, pc_phys, insn, ra, vcpu_get_reg(vcpu, ra));
 }
 
diff --git a/arch/sw_64/platform/platform_xuelang.c b/arch/sw_64/platform/platform_xuelang.c
index f0e33c664b0e..63a4b163e43e 100644
--- a/arch/sw_64/platform/platform_xuelang.c
+++ b/arch/sw_64/platform/platform_xuelang.c
@@ -54,7 +54,7 @@ static inline void __iomem *xuelang_ioportmap(unsigned long addr)
 		addr = addr | io_offset;
 	}
 
-	return (void __iomem *)(addr | PAGE_OFFSET);
+	return __va(addr);
 }
 
 struct sw64_platform_ops xuelang_ops = {
-- 
Gitee


From 53003aee7c6a44a565254b702fe73a0c44fa244b Mon Sep 17 00:00:00 2001
From: He Sheng <hesheng@wxiat.com>
Date: Thu, 12 May 2022 14:07:00 +0800
Subject: [PATCH 095/674] sw64: access IO space with readX/writeX

Sunway inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5G5YB

--------------------------------

It's more standard to access IO registers by calling readX/writeX
interfaces. In addition, the code is easier to maintain.

Signed-off-by: He Sheng <hesheng@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/chip/chip3/i2c-lib.c | 15 +++++--------
 arch/sw_64/include/asm/sw64io.h | 37 ++++++++-------------------------
 2 files changed, 14 insertions(+), 38 deletions(-)

diff --git a/arch/sw_64/chip/chip3/i2c-lib.c b/arch/sw_64/chip/chip3/i2c-lib.c
index b3dcdcd32735..e70f0f0c9a56 100644
--- a/arch/sw_64/chip/chip3/i2c-lib.c
+++ b/arch/sw_64/chip/chip3/i2c-lib.c
@@ -118,19 +118,14 @@ void __iomem *get_i2c_bar_addr(uint8_t i2c_controller_index)
 	}
 }
 
-void write_cpu_i2c_controller(uint64_t offset, uint32_t data)
+static inline void write_cpu_i2c_controller(uint64_t offset, uint32_t data)
 {
-	mb();
-	*(volatile uint32_t *)(m_i2c_base_address + offset) = data;
+	writel(data, m_i2c_base_address + offset);
 }
 
-uint32_t read_cpu_i2c_controller(uint64_t offset)
+static inline uint32_t read_cpu_i2c_controller(uint64_t offset)
 {
-	uint32_t data;
-
-	data = *(volatile uint32_t *)(m_i2c_base_address + offset);
-	mb();
-	return data;
+	return readl(m_i2c_base_address + offset);
 }
 
 static int poll_for_status_set0(uint16_t status_bit)
@@ -241,7 +236,7 @@ static int i2c_read(uint8_t reg_offset, uint8_t *buffer, uint32_t length)
 			write_cpu_i2c_controller(DW_IC_DATA_CMD, DW_IC_CMD);
 
 		if (poll_for_status_set0(DW_IC_STATUS_RFNE) == 0)
-			buffer[i] = *(uint8_t *) (m_i2c_base_address + DW_IC_DATA_CMD);
+			buffer[i] = readb(m_i2c_base_address + DW_IC_DATA_CMD);
 		else
 			pr_err("Read timeout line %d.\n", __LINE__);
 	}
diff --git a/arch/sw_64/include/asm/sw64io.h b/arch/sw_64/include/asm/sw64io.h
index e66aba66932b..7d79a5b75090 100644
--- a/arch/sw_64/include/asm/sw64io.h
+++ b/arch/sw_64/include/asm/sw64io.h
@@ -2,6 +2,7 @@
 #ifndef _ASM_SW64_SW64IO_H
 #define _ASM_SW64_SW64IO_H
 
+#include <asm/io.h>
 #include <asm/page.h>
 
 extern void setup_chip_clocksource(void);
@@ -22,13 +23,9 @@ read_rc_conf(unsigned long node, unsigned long rc,
 		unsigned int offset)
 {
 	void __iomem *addr;
-	unsigned int value;
 
 	addr = __va(MK_RC_CFG(node, rc) | offset);
-	value = *(volatile unsigned int *)addr;
-	mb();
-
-	return value;
+	return readl(addr);
 }
 
 static inline void
@@ -38,8 +35,7 @@ write_rc_conf(unsigned long node, unsigned long rc,
 	void __iomem *addr;
 
 	addr = __va(MK_RC_CFG(node, rc) | offset);
-	*(unsigned int *)addr = data;
-	mb();
+	writel(data, addr);
 }
 
 static inline  unsigned long
@@ -47,13 +43,9 @@ read_piu_ior0(unsigned long node, unsigned long rc,
 		unsigned int reg)
 {
 	void __iomem *addr;
-	unsigned long value;
 
 	addr = __va(MK_PIU_IOR0(node, rc) + reg);
-	value = *(volatile unsigned long __iomem *)addr;
-	mb();
-
-	return value;
+	return readq(addr);
 }
 
 static inline void
@@ -63,8 +55,7 @@ write_piu_ior0(unsigned long node, unsigned long rc,
 	void __iomem *addr;
 
 	addr = __va(MK_PIU_IOR0(node, rc) + reg);
-	*(unsigned long __iomem *)addr = data;
-	mb();
+	writeq(data, addr);
 }
 
 static inline  unsigned long
@@ -72,13 +63,9 @@ read_piu_ior1(unsigned long node, unsigned long rc,
 		unsigned int reg)
 {
 	void __iomem *addr;
-	unsigned long value;
 
 	addr = __va(MK_PIU_IOR1(node, rc) + reg);
-	value = *(volatile unsigned long __iomem *)addr;
-	mb();
-
-	return value;
+	return readq(addr);
 }
 
 static inline void
@@ -88,21 +75,16 @@ write_piu_ior1(unsigned long node, unsigned long rc,
 	void __iomem *addr;
 
 	addr = __va(MK_PIU_IOR1(node, rc) + reg);
-	*(volatile unsigned long __iomem *)addr = data;
-	mb();
+	writeq(data, addr);
 }
 
 static inline unsigned long
 sw64_io_read(unsigned long node, unsigned long reg)
 {
 	void __iomem *addr;
-	unsigned long value;
 
 	addr = __va(SW64_IO_BASE(node) | reg);
-	value = *(volatile unsigned long __iomem *)addr;
-	mb();
-
-	return value;
+	return readq(addr);
 }
 
 static inline void
@@ -111,7 +93,6 @@ sw64_io_write(unsigned long node, unsigned long reg, unsigned long data)
 	void __iomem *addr;
 
 	addr = __va(SW64_IO_BASE(node) | reg);
-	*(volatile unsigned long __iomem *)addr = data;
-	mb();
+	writeq(data, addr);
 }
 #endif
-- 
Gitee


From 687ba666ce3a7f7c55eaa7b325d0ad928559798c Mon Sep 17 00:00:00 2001
From: He Sheng <hesheng@wxiat.com>
Date: Thu, 21 Apr 2022 17:13:53 +0800
Subject: [PATCH 096/674] sw64: add fpu state save/restore interfaces

Sunway inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I56OLG

--------------------------------

Signed-off-by: He Sheng <hesheng@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/kernel/Makefile |   2 +-
 arch/sw_64/kernel/fpu.S    | 106 +++++++++++++++++++++++++++++++++++++
 2 files changed, 107 insertions(+), 1 deletion(-)
 create mode 100644 arch/sw_64/kernel/fpu.S

diff --git a/arch/sw_64/kernel/Makefile b/arch/sw_64/kernel/Makefile
index f6a2813b0466..293b360626f7 100644
--- a/arch/sw_64/kernel/Makefile
+++ b/arch/sw_64/kernel/Makefile
@@ -13,7 +13,7 @@ CFLAGS_REMOVE_insn.o = -pg
 CFLAGS_REMOVE_printk.o = -pg
 endif
 
-obj-y    := entry.o traps.o process.o sys_sw64.o irq.o \
+obj-y    := entry.o fpu.o traps.o process.o sys_sw64.o irq.o \
 	    irq_sw64.o signal.o setup.o ptrace.o time.o \
 	    systbls.o dup_print.o tc.o \
 	    insn.o early_init.o topology.o cacheinfo.o \
diff --git a/arch/sw_64/kernel/fpu.S b/arch/sw_64/kernel/fpu.S
new file mode 100644
index 000000000000..25dcf4740525
--- /dev/null
+++ b/arch/sw_64/kernel/fpu.S
@@ -0,0 +1,106 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+#include <asm/regdef.h>
+
+	.text
+	.set noat
+ENTRY(__fpstate_save)
+	/* a0: prev task */
+	ldi	a0, TASK_THREAD(a0)
+	ldi	t0, THREAD_CTX_FP(a0)
+	vstd	$f0, CTX_FP_F0(t0)
+	vstd	$f1, CTX_FP_F1(t0)
+	vstd	$f2, CTX_FP_F2(t0)
+	vstd	$f3, CTX_FP_F3(t0)
+	vstd	$f4, CTX_FP_F4(t0)
+	vstd	$f5, CTX_FP_F5(t0)
+	vstd	$f6, CTX_FP_F6(t0)
+	vstd	$f7, CTX_FP_F7(t0)
+	vstd	$f8, CTX_FP_F8(t0)
+	vstd	$f9, CTX_FP_F9(t0)
+	vstd	$f10, CTX_FP_F10(t0)
+	vstd	$f11, CTX_FP_F11(t0)
+	vstd	$f12, CTX_FP_F12(t0)
+	vstd	$f13, CTX_FP_F13(t0)
+	vstd	$f14, CTX_FP_F14(t0)
+	vstd	$f15, CTX_FP_F15(t0)
+	vstd	$f16, CTX_FP_F16(t0)
+	vstd	$f17, CTX_FP_F17(t0)
+	vstd	$f18, CTX_FP_F18(t0)
+	vstd	$f19, CTX_FP_F19(t0)
+	vstd	$f20, CTX_FP_F20(t0)
+	vstd	$f21, CTX_FP_F21(t0)
+	vstd	$f22, CTX_FP_F22(t0)
+	vstd	$f23, CTX_FP_F23(t0)
+	vstd	$f24, CTX_FP_F24(t0)
+	vstd	$f25, CTX_FP_F25(t0)
+	vstd	$f26, CTX_FP_F26(t0)
+	vstd	$f27, CTX_FP_F27(t0)
+	rfpcr	$f0
+	vstd	$f28, CTX_FP_F28(t0)
+	vstd	$f29, CTX_FP_F29(t0)
+	vstd	$f30, CTX_FP_F30(t0)
+	fstd	$f0, THREAD_FPCR(a0)
+	vldd	$f0, CTX_FP_F0(t0)
+	ret
+END(__fpstate_save)
+
+ENTRY(__fpstate_restore)
+	/* a0: next task */
+	ldi	a0, TASK_THREAD(a0)
+	fldd	$f0, THREAD_FPCR(a0)
+	wfpcr	$f0
+	fimovd	$f0, t1
+	and	t1, 0x3, t1
+	beq	t1, $setfpec_0
+	subl	t1, 0x1, t1
+	beq	t1, $setfpec_1
+	subl	t1, 0x1, t1
+	beq	t1, $setfpec_2
+	setfpec3
+	br	$setfpec_over
+$setfpec_0:
+	setfpec0
+	br	$setfpec_over
+$setfpec_1:
+	setfpec1
+	br	$setfpec_over
+$setfpec_2:
+	setfpec2
+$setfpec_over:
+	ldi	t0, THREAD_CTX_FP(a0)
+	vldd	$f0, CTX_FP_F0(t0)
+	vldd	$f1, CTX_FP_F1(t0)
+	vldd	$f2, CTX_FP_F2(t0)
+	vldd	$f3, CTX_FP_F3(t0)
+	vldd	$f4, CTX_FP_F4(t0)
+	vldd	$f5, CTX_FP_F5(t0)
+	vldd	$f6, CTX_FP_F6(t0)
+	vldd	$f7, CTX_FP_F7(t0)
+	vldd	$f8, CTX_FP_F8(t0)
+	vldd	$f9, CTX_FP_F9(t0)
+	vldd	$f10, CTX_FP_F10(t0)
+	vldd	$f11, CTX_FP_F11(t0)
+	vldd	$f12, CTX_FP_F12(t0)
+	vldd	$f13, CTX_FP_F13(t0)
+	vldd	$f14, CTX_FP_F14(t0)
+	vldd	$f15, CTX_FP_F15(t0)
+	vldd	$f16, CTX_FP_F16(t0)
+	vldd	$f17, CTX_FP_F17(t0)
+	vldd	$f18, CTX_FP_F18(t0)
+	vldd	$f19, CTX_FP_F19(t0)
+	vldd	$f20, CTX_FP_F20(t0)
+	vldd	$f21, CTX_FP_F21(t0)
+	vldd	$f22, CTX_FP_F22(t0)
+	vldd	$f23, CTX_FP_F23(t0)
+	vldd	$f24, CTX_FP_F24(t0)
+	vldd	$f25, CTX_FP_F25(t0)
+	vldd	$f26, CTX_FP_F26(t0)
+	vldd	$f27, CTX_FP_F27(t0)
+	vldd	$f28, CTX_FP_F28(t0)
+	vldd	$f29, CTX_FP_F29(t0)
+	vldd	$f30, CTX_FP_F30(t0)
+	ret
+END(__fpstate_restore)
-- 
Gitee


From fa1cadd2b145b88c6ffb9fd8cd540967b5f7c067 Mon Sep 17 00:00:00 2001
From: He Sheng <hesheng@wxiat.com>
Date: Fri, 22 Apr 2022 09:29:43 +0800
Subject: [PATCH 097/674] sw64: switch to generic fork like system calls

Sunway inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GF7A

--------------------------------

We put thread's callee-saved registers in thread_struct, and put
exception callee-saved registers in pt_regs. For context switch,
it switches fpu state first, and then switches integer registers
in __switch_to.

Because callee-saved registers are always saved and restored, it
will cause a little overhead.

As a result, the special sw64 fork like system calls are removed.

Signed-off-by: He Sheng <hesheng@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/include/asm/processor.h     |   3 +
 arch/sw_64/include/asm/switch_to.h     |  37 +++++-
 arch/sw_64/include/uapi/asm/ptrace.h   |  15 ++-
 arch/sw_64/kernel/asm-offsets.c        |  16 +++
 arch/sw_64/kernel/entry.S              | 173 +++++++++++++++----------
 arch/sw_64/kernel/process.c            |  21 ++-
 arch/sw_64/kernel/syscalls/syscall.tbl |   6 +-
 7 files changed, 175 insertions(+), 96 deletions(-)

diff --git a/arch/sw_64/include/asm/processor.h b/arch/sw_64/include/asm/processor.h
index 645c33a596ff..67d3b4368987 100644
--- a/arch/sw_64/include/asm/processor.h
+++ b/arch/sw_64/include/asm/processor.h
@@ -78,6 +78,9 @@ struct context_fpregs {
 struct thread_struct {
 	struct context_fpregs ctx_fp;
 	unsigned long fpcr;
+	/* Callee-saved registers */
+	unsigned long ra;
+	unsigned long s[7];	/* s0 ~ s6 */
 };
 #define INIT_THREAD  { }
 
diff --git a/arch/sw_64/include/asm/switch_to.h b/arch/sw_64/include/asm/switch_to.h
index 22045b247557..d503fc59390f 100644
--- a/arch/sw_64/include/asm/switch_to.h
+++ b/arch/sw_64/include/asm/switch_to.h
@@ -2,12 +2,41 @@
 #ifndef _ASM_SW64_SWITCH_TO_H
 #define _ASM_SW64_SWITCH_TO_H
 
-struct task_struct;
-extern struct task_struct *__switch_to(unsigned long, struct task_struct *);
+#include<linux/sched.h>
+
+extern void __fpstate_save(struct task_struct *save_to);
+extern void __fpstate_restore(struct task_struct *restore_from);
+extern struct task_struct *__switch_to(unsigned long pcb,
+		struct task_struct *prev, struct task_struct *next);
 extern void restore_da_match_after_sched(void);
-#define switch_to(P, N, L)						\
+
+static inline void fpstate_save(struct task_struct *task)
+{
+	if (likely(!(task->flags & PF_KTHREAD)))
+		__fpstate_save(task);
+}
+
+static inline void fpstate_restore(struct task_struct *task)
+{
+	if (likely(!(task->flags & PF_KTHREAD)))
+		__fpstate_restore(task);
+}
+
+static inline void __switch_to_aux(struct task_struct *prev,
+				   struct task_struct *next)
+{
+	fpstate_save(prev);
+	fpstate_restore(next);
+}
+
+
+#define switch_to(prev, next, last)					\
 do {									\
-	(L) = __switch_to(virt_to_phys(&task_thread_info(N)->pcb), (P));\
+	struct task_struct *__prev = (prev);				\
+	struct task_struct *__next = (next);				\
+	__u64 __nextpcb = virt_to_phys(&task_thread_info(__next)->pcb);	\
+	__switch_to_aux(__prev, __next);				\
+	(last) = __switch_to(__nextpcb, __prev, __next);		\
 	check_mmu_context();						\
 } while (0)
 
diff --git a/arch/sw_64/include/uapi/asm/ptrace.h b/arch/sw_64/include/uapi/asm/ptrace.h
index 96cb10891bea..4549e8d4bf57 100644
--- a/arch/sw_64/include/uapi/asm/ptrace.h
+++ b/arch/sw_64/include/uapi/asm/ptrace.h
@@ -9,12 +9,7 @@
  *
  * NOTE! I want to minimize the overhead of system calls, so this
  * struct has as little information as possible.  I does not have
- *
- *  - floating point regs: the kernel doesn't change those
- *  - r9-15: saved by the C compiler
- *
- * This makes "fork()" and "exec()" a bit more complex, but should
- * give us low system call latency.
+ * floating point regs, because the kernel doesn't change those
  */
 
 struct pt_regs {
@@ -27,6 +22,14 @@ struct pt_regs {
 	unsigned long r6;
 	unsigned long r7;
 	unsigned long r8;
+	unsigned long r9;
+	unsigned long r10;
+	unsigned long r11;
+	unsigned long r12;
+	unsigned long r13;
+	unsigned long r14;
+	unsigned long r15;
+	/* r16 ~ r18 saved by hmcode */
 	unsigned long r19;
 	unsigned long r20;
 	unsigned long r21;
diff --git a/arch/sw_64/kernel/asm-offsets.c b/arch/sw_64/kernel/asm-offsets.c
index bea12d2d96fe..349bc5c4e2f1 100644
--- a/arch/sw_64/kernel/asm-offsets.c
+++ b/arch/sw_64/kernel/asm-offsets.c
@@ -72,6 +72,13 @@ void foo(void)
 	DEFINE(PT_REGS_R6, offsetof(struct pt_regs, r6));
 	DEFINE(PT_REGS_R7, offsetof(struct pt_regs, r7));
 	DEFINE(PT_REGS_R8, offsetof(struct pt_regs, r8));
+	DEFINE(PT_REGS_R9, offsetof(struct pt_regs, r9));
+	DEFINE(PT_REGS_R10, offsetof(struct pt_regs, r10));
+	DEFINE(PT_REGS_R11, offsetof(struct pt_regs, r11));
+	DEFINE(PT_REGS_R12, offsetof(struct pt_regs, r12));
+	DEFINE(PT_REGS_R13, offsetof(struct pt_regs, r13));
+	DEFINE(PT_REGS_R14, offsetof(struct pt_regs, r14));
+	DEFINE(PT_REGS_R15, offsetof(struct pt_regs, r15));
 	DEFINE(PT_REGS_R19, offsetof(struct pt_regs, r19));
 	DEFINE(PT_REGS_R20, offsetof(struct pt_regs, r20));
 	DEFINE(PT_REGS_R21, offsetof(struct pt_regs, r21));
@@ -258,4 +265,13 @@ void foo(void)
 	DEFINE(CTX_FP_F29, offsetof(struct context_fpregs, f29));
 	DEFINE(CTX_FP_F30, offsetof(struct context_fpregs, f30));
 	BLANK();
+	OFFSET(TASK_THREAD_RA, task_struct, thread.ra);
+	OFFSET(TASK_THREAD_S0, task_struct, thread.s[0]);
+	OFFSET(TASK_THREAD_S1, task_struct, thread.s[1]);
+	OFFSET(TASK_THREAD_S2, task_struct, thread.s[2]);
+	OFFSET(TASK_THREAD_S3, task_struct, thread.s[3]);
+	OFFSET(TASK_THREAD_S4, task_struct, thread.s[4]);
+	OFFSET(TASK_THREAD_S5, task_struct, thread.s[5]);
+	OFFSET(TASK_THREAD_S6, task_struct, thread.s[6]);
+	BLANK();
 }
diff --git a/arch/sw_64/kernel/entry.S b/arch/sw_64/kernel/entry.S
index 6c40d2015439..ac62461aa9dc 100644
--- a/arch/sw_64/kernel/entry.S
+++ b/arch/sw_64/kernel/entry.S
@@ -21,52 +21,84 @@
  * the hmcode-provided values are available to the signal handler.
  */
 
-#define SAVE_ALL				\
-	ldi	$sp, -PT_REGS_PS($sp);		\
-	stl	$0, PT_REGS_R0($sp);		\
-	stl	$1, PT_REGS_R1($sp);		\
-	stl	$2, PT_REGS_R2($sp);		\
-	stl	$3, PT_REGS_R3($sp);		\
-	stl	$4, PT_REGS_R4($sp);		\
-	stl	$28, PT_REGS_R28($sp);		\
-	stl	$5, PT_REGS_R5($sp);		\
-	stl	$6, PT_REGS_R6($sp);		\
-	stl	$7, PT_REGS_R7($sp);		\
-	stl	$8, PT_REGS_R8($sp);		\
-	stl	$19, PT_REGS_R19($sp);		\
-	stl	$20, PT_REGS_R20($sp);		\
-	stl	$21, PT_REGS_R21($sp);		\
-	stl	$22, PT_REGS_R22($sp);		\
-	stl	$23, PT_REGS_R23($sp);		\
-	stl	$24, PT_REGS_R24($sp);		\
-	stl	$25, PT_REGS_R25($sp);		\
-	stl	$26, PT_REGS_R26($sp);		\
-	stl	$27, PT_REGS_R27($sp);		\
-	stl	$16, PT_REGS_TRAP_A0($sp);	\
-	stl	$17, PT_REGS_TRAP_A1($sp);	\
+	.macro SAVE_COMMON_REGS
+	ldi	$sp, -PT_REGS_PS($sp)
+	stl	$0, PT_REGS_R0($sp)
+	stl	$1, PT_REGS_R1($sp)
+	stl	$2, PT_REGS_R2($sp)
+	stl	$3, PT_REGS_R3($sp)
+	stl	$4, PT_REGS_R4($sp)
+	stl	$28, PT_REGS_R28($sp)
+	stl	$5, PT_REGS_R5($sp)
+	stl	$6, PT_REGS_R6($sp)
+	stl	$7, PT_REGS_R7($sp)
+	stl	$8, PT_REGS_R8($sp)
+	stl	$19, PT_REGS_R19($sp)
+	stl	$20, PT_REGS_R20($sp)
+	stl	$21, PT_REGS_R21($sp)
+	stl	$22, PT_REGS_R22($sp)
+	stl	$23, PT_REGS_R23($sp)
+	stl	$24, PT_REGS_R24($sp)
+	stl	$25, PT_REGS_R25($sp)
+	stl	$26, PT_REGS_R26($sp)
+	stl	$27, PT_REGS_R27($sp)
+	stl	$16, PT_REGS_TRAP_A0($sp)
+	stl	$17, PT_REGS_TRAP_A1($sp)
 	stl	$18, PT_REGS_TRAP_A2($sp)
+	.endm
 
-#define RESTORE_ALL				\
-	ldl	$0, PT_REGS_R0($sp);		\
-	ldl	$1, PT_REGS_R1($sp);		\
-	ldl	$2, PT_REGS_R2($sp);		\
-	ldl	$3, PT_REGS_R3($sp);		\
-	ldl	$4, PT_REGS_R4($sp);		\
-	ldl	$5, PT_REGS_R5($sp);		\
-	ldl	$6, PT_REGS_R6($sp);		\
-	ldl	$7, PT_REGS_R7($sp);		\
-	ldl	$8, PT_REGS_R8($sp);		\
-	ldl	$19, PT_REGS_R19($sp);		\
-	ldl	$20, PT_REGS_R20($sp);		\
-	ldl	$21, PT_REGS_R21($sp);		\
-	ldl	$22, PT_REGS_R22($sp);		\
-	ldl	$23, PT_REGS_R23($sp);		\
-	ldl	$24, PT_REGS_R24($sp);		\
-	ldl	$25, PT_REGS_R25($sp);		\
-	ldl	$26, PT_REGS_R26($sp);		\
-	ldl	$27, PT_REGS_R27($sp);		\
-	ldl	$28, PT_REGS_R28($sp);		\
+	.macro RESTORE_COMMON_REGS
+	ldl	$0, PT_REGS_R0($sp)
+	ldl	$1, PT_REGS_R1($sp)
+	ldl	$2, PT_REGS_R2($sp)
+	ldl	$3, PT_REGS_R3($sp)
+	ldl	$4, PT_REGS_R4($sp)
+	ldl	$5, PT_REGS_R5($sp)
+	ldl	$6, PT_REGS_R6($sp)
+	ldl	$7, PT_REGS_R7($sp)
+	ldl	$8, PT_REGS_R8($sp)
+	ldl	$19, PT_REGS_R19($sp)
+	ldl	$20, PT_REGS_R20($sp)
+	ldl	$21, PT_REGS_R21($sp)
+	ldl	$22, PT_REGS_R22($sp)
+	ldl	$23, PT_REGS_R23($sp)
+	ldl	$24, PT_REGS_R24($sp)
+	ldl	$25, PT_REGS_R25($sp)
+	ldl	$26, PT_REGS_R26($sp)
+	ldl	$27, PT_REGS_R27($sp)
+	ldl	$28, PT_REGS_R28($sp)
 	ldi	$sp, PT_REGS_PS($sp)
+	.endm
+
+	.macro SAVE_CALLEE_REGS
+	stl	$9, PT_REGS_R9($sp)
+	stl	$10, PT_REGS_R10($sp)
+	stl	$11, PT_REGS_R11($sp)
+	stl	$12, PT_REGS_R12($sp)
+	stl	$13, PT_REGS_R13($sp)
+	stl	$14, PT_REGS_R14($sp)
+	stl	$15, PT_REGS_R15($sp)
+	.endm
+
+	.macro RESTORE_CALLEE_REGS
+	ldl	$9, PT_REGS_R9($sp)
+	ldl	$10, PT_REGS_R10($sp)
+	ldl	$11, PT_REGS_R11($sp)
+	ldl	$12, PT_REGS_R12($sp)
+	ldl	$13, PT_REGS_R13($sp)
+	ldl	$14, PT_REGS_R14($sp)
+	ldl	$15, PT_REGS_R15($sp)
+	.endm
+
+	.macro SAVE_ALL
+	SAVE_COMMON_REGS
+	SAVE_CALLEE_REGS
+	.endm
+
+	.macro RESTORE_ALL
+	RESTORE_CALLEE_REGS
+	RESTORE_COMMON_REGS
+	.endm
 
 /*
  * Non-syscall kernel entry points.
@@ -591,19 +623,42 @@ $setfpec_over:
 	.end undo_switch_stack
 
 /*
- * The meat of the context switch code.
+ * Integer register context switch
+ * The callee-saved registers must be saved and restored.
+ *
+ *   a0: physical address of next task's pcb, used by hmcode
+ *   a1: previous task_struct (must be preserved across the switch)
+ *   a2: next task_struct
+ *
+ * The value of a1  must be preserved by this function, as that's how
+ * arguments are passed to schedule_tail.
  */
-
 	.align 4
 	.globl __switch_to
 	.ent __switch_to
 __switch_to:
 	.prologue 0
-	bsr	$1, do_switch_stack
+	/* Save context into prev->thread */
+	stl	$26, TASK_THREAD_RA($17)
+	stl	$9, TASK_THREAD_S0($17)
+	stl	$10, TASK_THREAD_S1($17)
+	stl	$11, TASK_THREAD_S2($17)
+	stl	$12, TASK_THREAD_S3($17)
+	stl	$13, TASK_THREAD_S4($17)
+	stl	$14, TASK_THREAD_S5($17)
+	stl	$15, TASK_THREAD_S6($17)
+	/* Restore context from next->thread */
+	ldl	$26, TASK_THREAD_RA($18)
+	ldl	$9, TASK_THREAD_S0($18)
+	ldl	$10, TASK_THREAD_S1($18)
+	ldl	$11, TASK_THREAD_S2($18)
+	ldl	$12, TASK_THREAD_S3($18)
+	ldl	$13, TASK_THREAD_S4($18)
+	ldl	$14, TASK_THREAD_S5($18)
+	ldl	$15, TASK_THREAD_S6($18)
 	sys_call HMC_swpctx
 	ldi	$8, 0x3fff
 	bic	$sp, $8, $8
-	bsr	$1, undo_switch_stack
 	mov	$17, $0
 	ret
 	.end __switch_to
@@ -637,30 +692,6 @@ ret_from_kernel_thread:
 	br	$31, ret_to_user
 	.end ret_from_kernel_thread
 
-/*
- * Special system calls.  Most of these are special in that they either
- * have to play switch_stack games or in some way use the pt_regs struct.
- */
-
-.macro	fork_like name
-	.align 4
-	.globl sw64_\name
-	.ent sw64_\name
-sw64_\name:
-	.prologue 0
-	bsr	$1, do_switch_stack
-	call	$26, sys_\name
-	ldl	$26, SWITCH_STACK_RA($sp)
-	ldi	$sp, SWITCH_STACK_SIZE($sp)
-	ret
-	.end	sw64_\name
-	.endm
-
-fork_like fork
-fork_like vfork
-fork_like clone
-fork_like clone3
-
 	.align 4
 	.globl sys_sigreturn
 	.ent sys_sigreturn
diff --git a/arch/sw_64/kernel/process.c b/arch/sw_64/kernel/process.c
index 4192d50f5b0e..c6e87874af19 100644
--- a/arch/sw_64/kernel/process.c
+++ b/arch/sw_64/kernel/process.c
@@ -11,6 +11,7 @@
 #include <linux/random.h>
 
 #include <asm/fpu.h>
+#include <asm/switch_to.h>
 
 #include "proto.h"
 
@@ -158,19 +159,18 @@ copy_thread(unsigned long clone_flags, unsigned long usp,
 	struct thread_info *childti = task_thread_info(p);
 	struct pt_regs *childregs = task_pt_regs(p);
 	struct pt_regs *regs = current_pt_regs();
-	struct switch_stack *childstack, *stack;
 
-	childstack = ((struct switch_stack *) childregs) - 1;
-	childti->pcb.ksp = (unsigned long) childstack;
+	childti->pcb.ksp = (unsigned long) childregs;
 	childti->pcb.flags = 7;	/* set FEN, clear everything else */
+	__fpstate_save(current);
+	p->thread = current->thread;
 
 	if (unlikely(p->flags & PF_KTHREAD)) {
 		/* kernel thread */
-		memset(childstack, 0,
-			sizeof(struct switch_stack) + sizeof(struct pt_regs));
-		childstack->r26 = (unsigned long) ret_from_kernel_thread;
-		childstack->r9 = usp;	/* function */
-		childstack->r10 = kthread_arg;
+		memset(childregs, 0, sizeof(struct pt_regs));
+		p->thread.ra = (unsigned long) ret_from_kernel_thread;
+		p->thread.s[0] = usp;	/* function */
+		p->thread.s[1] = kthread_arg;
 		childti->pcb.usp = 0;
 		return 0;
 	}
@@ -189,10 +189,7 @@ copy_thread(unsigned long clone_flags, unsigned long usp,
 	*childregs = *regs;
 	childregs->r0 = 0;
 	childregs->r19 = 0;
-	stack = ((struct switch_stack *) regs) - 1;
-	*childstack = *stack;
-	p->thread = current->thread;
-	childstack->r26 = (unsigned long) ret_from_fork;
+	p->thread.ra = (unsigned long) ret_from_fork;
 	return 0;
 }
 
diff --git a/arch/sw_64/kernel/syscalls/syscall.tbl b/arch/sw_64/kernel/syscalls/syscall.tbl
index b9b93d70124d..42a179422b6b 100644
--- a/arch/sw_64/kernel/syscalls/syscall.tbl
+++ b/arch/sw_64/kernel/syscalls/syscall.tbl
@@ -73,7 +73,7 @@
 63	common	getpgrp				sys_getpgrp
 #64 is unused
 #65 is unused
-66	common	vfork				sw64_vfork
+66	common	vfork				sys_vfork
 67	common	stat				sys_newstat
 68	common	lstat				sys_newlstat
 #69 is unused
@@ -289,7 +289,7 @@
 279	common	fsmount				sys_fsmount
 280	common	fspick				sys_fspick
 281	common	pidfd_open			sys_pidfd_open
-282	common	clone3				sw64_clone3
+282	common	clone3				sys_clone3
 283	common	close_range			sys_close_range
 284	common	openat2				sys_openat2
 285	common	pidfd_getfd			sys_pidfd_getfd
@@ -319,7 +319,7 @@
 309	common	get_kernel_syms			sys_ni_syscall
 310	common	syslog				sys_syslog
 311	common	reboot				sys_reboot
-312	common	clone				sw64_clone
+312	common	clone				sys_clone
 313	common	uselib				sys_uselib
 314	common	mlock				sys_mlock
 315	common	munlock				sys_munlock
-- 
Gitee


From cf8d42cc0e6dc5fe55895a233aa95418a157145c Mon Sep 17 00:00:00 2001
From: He Sheng <hesheng@wxiat.com>
Date: Sun, 24 Apr 2022 09:11:52 +0800
Subject: [PATCH 098/674] sw64: remove r9_r15 argument of dik_show_regs and
 die_if_kernel

Sunway inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GF7A

--------------------------------

Signed-off-by: He Sheng <hesheng@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/kernel/process.c |  2 +-
 arch/sw_64/kernel/proto.h   |  4 ++--
 arch/sw_64/kernel/ptrace.c  |  2 +-
 arch/sw_64/kernel/traps.c   | 27 ++++++++++++---------------
 arch/sw_64/mm/fault.c       |  8 ++++----
 arch/sw_64/mm/init.c        |  2 --
 6 files changed, 20 insertions(+), 25 deletions(-)

diff --git a/arch/sw_64/kernel/process.c b/arch/sw_64/kernel/process.c
index c6e87874af19..9a6812851b15 100644
--- a/arch/sw_64/kernel/process.c
+++ b/arch/sw_64/kernel/process.c
@@ -110,7 +110,7 @@ void
 show_regs(struct pt_regs *regs)
 {
 	show_regs_print_info(KERN_DEFAULT);
-	dik_show_regs(regs, NULL);
+	dik_show_regs(regs);
 }
 
 /*
diff --git a/arch/sw_64/kernel/proto.h b/arch/sw_64/kernel/proto.h
index 9c99baaaf1d0..189074f8bd5c 100644
--- a/arch/sw_64/kernel/proto.h
+++ b/arch/sw_64/kernel/proto.h
@@ -12,8 +12,8 @@ extern int ptrace_set_bpt(struct task_struct *child);
 extern int ptrace_cancel_bpt(struct task_struct *child);
 
 /* traps.c */
-extern void dik_show_regs(struct pt_regs *regs, unsigned long *r9_15);
-extern void die_if_kernel(char *str, struct pt_regs *regs, long err, unsigned long *r9_15);
+extern void dik_show_regs(struct pt_regs *regs);
+extern void die_if_kernel(char *str, struct pt_regs *regs, long err);
 
 /* timer.c */
 extern void setup_timer(void);
diff --git a/arch/sw_64/kernel/ptrace.c b/arch/sw_64/kernel/ptrace.c
index b06c98e9944b..3b29ff7114f7 100644
--- a/arch/sw_64/kernel/ptrace.c
+++ b/arch/sw_64/kernel/ptrace.c
@@ -521,7 +521,7 @@ int do_match(unsigned long address, unsigned long mmcsr, long cause, struct pt_r
 	case MMCSR__DA_MATCH:
 	case MMCSR__DV_MATCH:
 	case MMCSR__DAV_MATCH:
-		dik_show_regs(regs, (unsigned long *)regs-15);
+		dik_show_regs(regs);
 
 		if (!(current->ptrace & PT_PTRACED)) {
 			printk(" pid %d %s not be ptraced, return\n", current->pid, current->comm);
diff --git a/arch/sw_64/kernel/traps.c b/arch/sw_64/kernel/traps.c
index 99cee58e886d..b7c4e45df87b 100644
--- a/arch/sw_64/kernel/traps.c
+++ b/arch/sw_64/kernel/traps.c
@@ -22,8 +22,7 @@
 
 #include "proto.h"
 
-void
-dik_show_regs(struct pt_regs *regs, unsigned long *r9_15)
+void dik_show_regs(struct pt_regs *regs)
 {
 	printk("pc = [<%016lx>]  ra = [<%016lx>]  ps = %04lx    %s\n",
 	       regs->pc, regs->r26, regs->ps, print_tainted());
@@ -36,13 +35,12 @@ dik_show_regs(struct pt_regs *regs, unsigned long *r9_15)
 	printk("t5 = %016lx  t6 = %016lx  t7 = %016lx\n",
 	       regs->r6, regs->r7, regs->r8);
 
-	if (r9_15) {
-		printk("s0 = %016lx  s1 = %016lx  s2 = %016lx\n",
-		       r9_15[9], r9_15[10], r9_15[11]);
-		printk("s3 = %016lx  s4 = %016lx  s5 = %016lx\n",
-		       r9_15[12], r9_15[13], r9_15[14]);
-		printk("s6 = %016lx\n", r9_15[15]);
-	}
+	printk("s0 = %016lx  s1 = %016lx  s2 = %016lx\n",
+	       regs->r9, regs->r10, regs->r11);
+	printk("s3 = %016lx  s4 = %016lx  s5 = %016lx\n",
+	       regs->r12, regs->r13, regs->r14);
+	printk("s6 = %016lx\n",
+	       regs->r15);
 
 	printk("a0 = %016lx  a1 = %016lx  a2 = %016lx\n",
 	       regs->r16, regs->r17, regs->r18);
@@ -117,8 +115,7 @@ void show_stack(struct task_struct *task, unsigned long *sp, const char *loglvl)
 	dik_show_trace(sp, loglvl);
 }
 
-void
-die_if_kernel(char *str, struct pt_regs *regs, long err, unsigned long *r9_15)
+void die_if_kernel(char *str, struct pt_regs *regs, long err)
 {
 	if (regs->ps & 8)
 		return;
@@ -126,7 +123,7 @@ die_if_kernel(char *str, struct pt_regs *regs, long err, unsigned long *r9_15)
 	printk("CPU %d ", hard_smp_processor_id());
 #endif
 	printk("%s(%d): %s %ld\n", current->comm, task_pid_nr(current), str, err);
-	dik_show_regs(regs, r9_15);
+	dik_show_regs(regs);
 	add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE);
 	dik_show_trace((unsigned long *)(regs+1), KERN_DEFAULT);
 	dik_show_code((unsigned int *)regs->pc);
@@ -178,7 +175,7 @@ do_entArith(unsigned long summary, unsigned long write_mask,
 		if (si_code == 0)
 			return;
 	}
-	die_if_kernel("Arithmetic fault", regs, 0, NULL);
+	die_if_kernel("Arithmetic fault", regs, 0);
 
 	force_sig_fault(SIGFPE, si_code, (void __user *)regs->pc, 0);
 }
@@ -205,7 +202,7 @@ do_entIF(unsigned long inst_type, struct pt_regs *regs)
 			return;
 		}
 		die_if_kernel((type == 1 ? "Kernel Bug" : "Instruction fault"),
-				regs, type, NULL);
+				regs, type);
 	}
 
 	switch (type) {
@@ -297,7 +294,7 @@ do_entIF(unsigned long inst_type, struct pt_regs *regs)
 				return;
 		}
 		if ((regs->ps & ~IPL_MAX) == 0)
-			die_if_kernel("Instruction fault", regs, type, NULL);
+			die_if_kernel("Instruction fault", regs, type);
 		break;
 
 	case 3: /* FEN fault */
diff --git a/arch/sw_64/mm/fault.c b/arch/sw_64/mm/fault.c
index b580450893ba..9562bc3ba37f 100644
--- a/arch/sw_64/mm/fault.c
+++ b/arch/sw_64/mm/fault.c
@@ -31,8 +31,8 @@ static inline int notify_page_fault(struct pt_regs *regs, unsigned long mmcsr)
 }
 #endif
 
-extern void die_if_kernel(char *, struct pt_regs *, long, unsigned long *);
-extern void dik_show_regs(struct pt_regs *regs, unsigned long *r9_15);
+extern void die_if_kernel(char *, struct pt_regs *, long);
+extern void dik_show_regs(struct pt_regs *regs);
 
 void show_all_vma(void)
 {
@@ -305,7 +305,7 @@ do_page_fault(unsigned long address, unsigned long mmcsr,
 	 */
 	pr_alert("Unable to handle kernel paging request at virtual address %016lx\n",
 	       address);
-	die_if_kernel("Oops", regs, cause, (unsigned long *)regs - 16);
+	die_if_kernel("Oops", regs, cause);
 	do_exit(SIGKILL);
 
 	/*
@@ -336,7 +336,7 @@ do_page_fault(unsigned long address, unsigned long mmcsr,
 	if (unlikely(segv_debug_enabled)) {
 		pr_info("fault: want to send_segv: pid %d, cause = %#lx, mmcsr = %#lx, address = %#lx, pc %#lx\n",
 				current->pid, cause, mmcsr, address, regs->pc);
-		dik_show_regs(regs, (unsigned long *)regs-16);
+		dik_show_regs(regs);
 		show_all_vma();
 	}
 
diff --git a/arch/sw_64/mm/init.c b/arch/sw_64/mm/init.c
index 7fcd3d834ba5..3593e4b13319 100644
--- a/arch/sw_64/mm/init.c
+++ b/arch/sw_64/mm/init.c
@@ -13,8 +13,6 @@
 
 #include <asm/mmu_context.h>
 
-extern void die_if_kernel(char *, struct pt_regs *, long);
-
 struct mem_desc_t mem_desc;
 #ifndef CONFIG_NUMA
 struct numa_node_desc_t numa_nodes_desc[1];
-- 
Gitee


From 9edfcf89f2df44eda3a312e0c958914f391a14cf Mon Sep 17 00:00:00 2001
From: He Sheng <hesheng@wxiat.com>
Date: Sun, 24 Apr 2022 09:52:46 +0800
Subject: [PATCH 099/674] sw64: remove switch_stack from entMM and entSys

Sunway inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GF7A

--------------------------------

This patch removes switch_stack and its usages in mm fault entry
and system call entry. Note that ptrace accesses registers with
struct pt_regs only since strace gave up a switch_stack trick.

Signed-off-by: He Sheng <hesheng@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/kernel/entry.S  | 42 +++-----------------------------------
 arch/sw_64/kernel/ptrace.c | 12 ++---------
 2 files changed, 5 insertions(+), 49 deletions(-)

diff --git a/arch/sw_64/kernel/entry.S b/arch/sw_64/kernel/entry.S
index ac62461aa9dc..f843acd88249 100644
--- a/arch/sw_64/kernel/entry.S
+++ b/arch/sw_64/kernel/entry.S
@@ -133,31 +133,11 @@ entArith:
 	.ent entMM
 entMM:
 	SAVE_ALL
-/* save $9 - $15 so the inline exception code can manipulate them.  */
-	subl	$sp, SWITCH_STACK_RA, $sp
-	stl	$9, SWITCH_STACK_R9($sp)
-	stl	$10, SWITCH_STACK_R10($sp)
-	stl	$11, SWITCH_STACK_R11($sp)
-	stl	$12, SWITCH_STACK_R12($sp)
-	stl	$13, SWITCH_STACK_R13($sp)
-	stl	$14, SWITCH_STACK_R14($sp)
-	stl	$15, SWITCH_STACK_R15($sp)
-	addl	$sp, SWITCH_STACK_RA, $19
-/* handle the fault */
 	ldi	$8, 0x3fff
+	ldi	$26, ret_from_sys_call
 	bic	$sp, $8, $8
-	call	$26, do_page_fault
-/* reload the registers after the exception code played.  */
-	ldl	$9, SWITCH_STACK_R9($sp)
-	ldl	$10, SWITCH_STACK_R10($sp)
-	ldl	$11, SWITCH_STACK_R11($sp)
-	ldl	$12, SWITCH_STACK_R12($sp)
-	ldl	$13, SWITCH_STACK_R13($sp)
-	ldl	$14, SWITCH_STACK_R14($sp)
-	ldl	$15, SWITCH_STACK_R15($sp)
-	addl	$sp, SWITCH_STACK_RA, $sp
-/* finish up the syscall as normal.  */
-	br	ret_from_sys_call
+	mov	$sp, $19
+	call	$31, do_page_fault
 	.end entMM
 
 	.align 4
@@ -400,9 +380,7 @@ $work_resched:
 
 $work_notifysig:
 	mov	$sp, $16
-	bsr	$1, do_switch_stack
 	call	$26, do_work_pending
-	bsr	$1, undo_switch_stack
 	br	restore_all
 	.end work_pending
 
@@ -416,14 +394,9 @@ $work_notifysig:
 	.ent strace
 strace:
 	/* set up signal stack, call syscall_trace */
-	bsr	$1, do_switch_stack
 	mov	$0, $9
 	mov	$19, $10
 	call	$26, syscall_trace_enter
-	mov	$9, $18
-	mov	$10, $19
-	bsr	$1, undo_switch_stack
-
 	blt	$0, $syscall_trace_failed
 
 	/* get the system call number and the arguments back.. */
@@ -452,10 +425,7 @@ ret_from_straced:
 	stl	$31, PT_REGS_R19($sp)	/* a3=0 => no error */
 $strace_success:
 	stl	$0, PT_REGS_R0($sp)	/* save return value */
-
-	bsr	$1, do_switch_stack
 	call	$26, syscall_trace_leave
-	bsr	$1, undo_switch_stack
 	br	$31, ret_from_sys_call
 
 	.align 3
@@ -470,25 +440,19 @@ $strace_error:
 	stl	$0, PT_REGS_R0($sp)
 	stl	$1, PT_REGS_R19($sp)	/* a3 for return */
 
-	bsr	$1, do_switch_stack
 	mov	$18, $9		/* save old syscall number */
 	mov	$19, $10	/* save old a3 */
 	call	$26, syscall_trace_leave
 	mov	$9, $18
 	mov	$10, $19
-	bsr	$1, undo_switch_stack
 
 	mov	$31, $26	/* tell "ret_from_sys_call" we can restart */
 	br	ret_from_sys_call
 
 $syscall_trace_failed:
-	bsr	$1, do_switch_stack
-	mov	$18, $9
-	mov	$19, $10
 	call	$26, syscall_trace_leave
 	mov	$9, $18
 	mov	$10, $19
-	bsr	$1, undo_switch_stack
 	mov	$31, $26	/* tell "ret_from_sys_call" we can restart */
 	br	ret_from_sys_call
 	.end strace
diff --git a/arch/sw_64/kernel/ptrace.c b/arch/sw_64/kernel/ptrace.c
index 3b29ff7114f7..097590b22a5a 100644
--- a/arch/sw_64/kernel/ptrace.c
+++ b/arch/sw_64/kernel/ptrace.c
@@ -33,10 +33,6 @@
  *  | frame generated by SAVE_ALL    | |
  *  |				     | v
  *  +================================+
- *  |				     | ^
- *  | frame saved by do_switch_stack | | struct switch_stack
- *  |				     | v
- *  +================================+
  */
 
 /*
@@ -59,18 +55,14 @@ enum {
 #define PT_REG(reg) \
 	(PAGE_SIZE * 2 - sizeof(struct pt_regs) + offsetof(struct pt_regs, reg))
 
-#define SW_REG(reg) \
-	(PAGE_SIZE * 2 - sizeof(struct pt_regs) - sizeof(struct switch_stack) \
-	+ offsetof(struct switch_stack, reg))
-
 #define FP_REG(fp_regno, vector_regno) \
 	(fp_regno * 32 + vector_regno * 8)
 
 static int regoff[] = {
 	PT_REG(r0), PT_REG(r1), PT_REG(r2), PT_REG(r3),
 	PT_REG(r4), PT_REG(r5), PT_REG(r6), PT_REG(r7),
-	PT_REG(r8), SW_REG(r9), SW_REG(r10), SW_REG(r11),
-	SW_REG(r12), SW_REG(r13), SW_REG(r14), SW_REG(r15),
+	PT_REG(r8), PT_REG(r9), PT_REG(r10), PT_REG(r11),
+	PT_REG(r12), PT_REG(r13), PT_REG(r14), PT_REG(r15),
 	PT_REG(r16), PT_REG(r17), PT_REG(r18), PT_REG(r19),
 	PT_REG(r20), PT_REG(r21), PT_REG(r22), PT_REG(r23),
 	PT_REG(r24), PT_REG(r25), PT_REG(r26), PT_REG(r27),
-- 
Gitee


From 40f13ed00ba08ca82c764d04f78fdf08d7583263 Mon Sep 17 00:00:00 2001
From: He Sheng <hesheng@wxiat.com>
Date: Sun, 24 Apr 2022 12:01:24 +0800
Subject: [PATCH 100/674] sw64: remove switch_stack from signal handling

Sunway inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GF7A

--------------------------------

The new pt_regs struct holds r9 ~ r15, and it's sufficient to setup
and restore sigcontext. Take care that fpregs state of current task
should be saved and restored explicitly after switch_stack removed.

Signed-off-by: He Sheng <hesheng@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/kernel/entry.S  |  8 ++------
 arch/sw_64/kernel/signal.c | 38 ++++++++++++++++++--------------------
 2 files changed, 20 insertions(+), 26 deletions(-)

diff --git a/arch/sw_64/kernel/entry.S b/arch/sw_64/kernel/entry.S
index f843acd88249..730346754ecc 100644
--- a/arch/sw_64/kernel/entry.S
+++ b/arch/sw_64/kernel/entry.S
@@ -663,12 +663,10 @@ sys_sigreturn:
 	.prologue 0
 	ldi	$9, ret_from_straced
 	cmpult	$26, $9, $9
-	ldi	$sp, -SWITCH_STACK_SIZE($sp)
 	call	$26, do_sigreturn
 	bne	$9, 1f
 	call	$26, syscall_trace_leave
-1:	br	$1, undo_switch_stack
-	br	ret_from_sys_call
+1:	br	ret_from_sys_call
 	.end sys_sigreturn
 
 	.align 4
@@ -678,12 +676,10 @@ sys_rt_sigreturn:
 	.prologue 0
 	ldi	$9, ret_from_straced
 	cmpult	$26, $9, $9
-	ldi	$sp, -SWITCH_STACK_SIZE($sp)
 	call	$26, do_rt_sigreturn
 	bne	$9, 1f
 	call	$26, syscall_trace_leave
-1:	br	$1, undo_switch_stack
-	br	ret_from_sys_call
+1:	br	ret_from_sys_call
 	.end sys_rt_sigreturn
 
 	.align 4
diff --git a/arch/sw_64/kernel/signal.c b/arch/sw_64/kernel/signal.c
index dd0d8ff42420..887326812624 100644
--- a/arch/sw_64/kernel/signal.c
+++ b/arch/sw_64/kernel/signal.c
@@ -14,6 +14,7 @@
 
 #include <asm/ucontext.h>
 #include <asm/vdso.h>
+#include <asm/switch_to.h>
 
 #include "proto.h"
 
@@ -22,8 +23,6 @@
 
 #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
 
-asmlinkage void ret_from_sys_call(void);
-
 SYSCALL_DEFINE2(odd_sigprocmask, int, how, unsigned long, newmask)
 {
 	sigset_t oldmask;
@@ -64,13 +63,10 @@ static long
 restore_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs)
 {
 	unsigned long usp;
-	struct switch_stack *sw = (struct switch_stack *)regs - 1;
 	long err = __get_user(regs->pc, &sc->sc_pc);
 
 	current->restart_block.fn = do_no_restart_syscall;
 
-	sw->r26 = (unsigned long) ret_from_sys_call;
-
 	err |= __get_user(regs->r0, sc->sc_regs+0);
 	err |= __get_user(regs->r1, sc->sc_regs+1);
 	err |= __get_user(regs->r2, sc->sc_regs+2);
@@ -80,13 +76,13 @@ restore_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs)
 	err |= __get_user(regs->r6, sc->sc_regs+6);
 	err |= __get_user(regs->r7, sc->sc_regs+7);
 	err |= __get_user(regs->r8, sc->sc_regs+8);
-	err |= __get_user(sw->r9, sc->sc_regs+9);
-	err |= __get_user(sw->r10, sc->sc_regs+10);
-	err |= __get_user(sw->r11, sc->sc_regs+11);
-	err |= __get_user(sw->r12, sc->sc_regs+12);
-	err |= __get_user(sw->r13, sc->sc_regs+13);
-	err |= __get_user(sw->r14, sc->sc_regs+14);
-	err |= __get_user(sw->r15, sc->sc_regs+15);
+	err |= __get_user(regs->r9, sc->sc_regs+9);
+	err |= __get_user(regs->r10, sc->sc_regs+10);
+	err |= __get_user(regs->r11, sc->sc_regs+11);
+	err |= __get_user(regs->r12, sc->sc_regs+12);
+	err |= __get_user(regs->r13, sc->sc_regs+13);
+	err |= __get_user(regs->r14, sc->sc_regs+14);
+	err |= __get_user(regs->r15, sc->sc_regs+15);
 	err |= __get_user(regs->r16, sc->sc_regs+16);
 	err |= __get_user(regs->r17, sc->sc_regs+17);
 	err |= __get_user(regs->r18, sc->sc_regs+18);
@@ -107,6 +103,8 @@ restore_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs)
 	err |= __copy_from_user(&current->thread.ctx_fp,
 			&sc->sc_fpregs, sizeof(struct context_fpregs));
 	err |= __get_user(current->thread.fpcr, &sc->sc_fpcr);
+	if (likely(!err))
+		__fpstate_restore(current);
 
 	return err;
 }
@@ -191,7 +189,6 @@ static long
 setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs,
 		 unsigned long mask, unsigned long sp)
 {
-	struct switch_stack *sw = (struct switch_stack *)regs - 1;
 	long err = 0;
 
 	err |= __put_user(on_sig_stack((unsigned long)sc), &sc->sc_onstack);
@@ -208,13 +205,13 @@ setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs,
 	err |= __put_user(regs->r6, sc->sc_regs+6);
 	err |= __put_user(regs->r7, sc->sc_regs+7);
 	err |= __put_user(regs->r8, sc->sc_regs+8);
-	err |= __put_user(sw->r9, sc->sc_regs+9);
-	err |= __put_user(sw->r10, sc->sc_regs+10);
-	err |= __put_user(sw->r11, sc->sc_regs+11);
-	err |= __put_user(sw->r12, sc->sc_regs+12);
-	err |= __put_user(sw->r13, sc->sc_regs+13);
-	err |= __put_user(sw->r14, sc->sc_regs+14);
-	err |= __put_user(sw->r15, sc->sc_regs+15);
+	err |= __put_user(regs->r9, sc->sc_regs+9);
+	err |= __put_user(regs->r10, sc->sc_regs+10);
+	err |= __put_user(regs->r11, sc->sc_regs+11);
+	err |= __put_user(regs->r12, sc->sc_regs+12);
+	err |= __put_user(regs->r13, sc->sc_regs+13);
+	err |= __put_user(regs->r14, sc->sc_regs+14);
+	err |= __put_user(regs->r15, sc->sc_regs+15);
 	err |= __put_user(regs->r16, sc->sc_regs+16);
 	err |= __put_user(regs->r17, sc->sc_regs+17);
 	err |= __put_user(regs->r18, sc->sc_regs+18);
@@ -232,6 +229,7 @@ setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs,
 	err |= __put_user(sp, sc->sc_regs+30);
 	err |= __put_user(0, sc->sc_regs+31);
 	/* simd-fp */
+	__fpstate_save(current);
 	err |= __copy_to_user(&sc->sc_fpregs,
 			&current->thread.ctx_fp, sizeof(struct context_fpregs));
 	err |= __put_user(current->thread.fpcr, &sc->sc_fpcr);
-- 
Gitee


From ba8c73ce31bd4d5375b0d005b1f89d46c73477dc Mon Sep 17 00:00:00 2001
From: He Sheng <hesheng@wxiat.com>
Date: Sun, 24 Apr 2022 16:51:34 +0800
Subject: [PATCH 101/674] sw64: dump callee-saved registers from pt_regs

Sunway inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GF7A

--------------------------------

Signed-off-by: He Sheng <hesheng@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/kernel/process.c | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/arch/sw_64/kernel/process.c b/arch/sw_64/kernel/process.c
index 9a6812851b15..4e7a1418af30 100644
--- a/arch/sw_64/kernel/process.c
+++ b/arch/sw_64/kernel/process.c
@@ -199,9 +199,6 @@ copy_thread(unsigned long clone_flags, unsigned long usp,
 void
 dump_elf_thread(elf_greg_t *dest, struct pt_regs *pt, struct thread_info *ti)
 {
-	/* switch stack follows right below pt_regs: */
-	struct switch_stack *sw = ((struct switch_stack *) pt) - 1;
-
 	dest[0] = pt->r0;
 	dest[1] = pt->r1;
 	dest[2] = pt->r2;
@@ -211,13 +208,13 @@ dump_elf_thread(elf_greg_t *dest, struct pt_regs *pt, struct thread_info *ti)
 	dest[6] = pt->r6;
 	dest[7] = pt->r7;
 	dest[8] = pt->r8;
-	dest[9] = sw->r9;
-	dest[10] = sw->r10;
-	dest[11] = sw->r11;
-	dest[12] = sw->r12;
-	dest[13] = sw->r13;
-	dest[14] = sw->r14;
-	dest[15] = sw->r15;
+	dest[9] = pt->r9;
+	dest[10] = pt->r10;
+	dest[11] = pt->r11;
+	dest[12] = pt->r12;
+	dest[13] = pt->r13;
+	dest[14] = pt->r14;
+	dest[15] = pt->r15;
 	dest[16] = pt->r16;
 	dest[17] = pt->r17;
 	dest[18] = pt->r18;
-- 
Gitee


From c4c31c88e37cf8ad3a6797dba141c0ed162b5e93 Mon Sep 17 00:00:00 2001
From: He Sheng <hesheng@wxiat.com>
Date: Sun, 24 Apr 2022 16:51:45 +0800
Subject: [PATCH 102/674] sw64: get blocked thread's frame pointer from
 thread_struct

Sunway inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GF7A

--------------------------------

The thread_struct holds all provided the thread blocked through
a call to schedule(). $15 is the frame pointer in schedule() and
it is saved at thread_struct.s[6]. With frame pointer, it can
return saved PC of a blocked thread which assumes that the saved
return address is the first long in the frame.

Signed-off-by: He Sheng <hesheng@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/kernel/process.c | 29 +++++++++++------------------
 1 file changed, 11 insertions(+), 18 deletions(-)

diff --git a/arch/sw_64/kernel/process.c b/arch/sw_64/kernel/process.c
index 4e7a1418af30..1f093c5a5235 100644
--- a/arch/sw_64/kernel/process.c
+++ b/arch/sw_64/kernel/process.c
@@ -257,13 +257,6 @@ dump_elf_task_fp(elf_fpreg_t *dest, struct task_struct *task)
 EXPORT_SYMBOL(dump_elf_task_fp);
 
 /*
- * Return saved PC of a blocked thread.  This assumes the frame
- * pointer is the 6th saved long on the kernel stack and that the
- * saved return address is the first long in the frame.  This all
- * holds provided the thread blocked through a call to schedule() ($15
- * is the frame pointer in schedule() and $15 is saved at offset 48 by
- * entry.S:do_switch_stack).
- *
  * Under heavy swap load I've seen this lose in an ugly way.  So do
  * some extra sanity checking on the ranges we expect these pointers
  * to be in so that we can fail gracefully.  This is just for ps after
@@ -273,14 +266,14 @@ EXPORT_SYMBOL(dump_elf_task_fp);
 unsigned long
 thread_saved_pc(struct task_struct *t)
 {
-	unsigned long base = (unsigned long)task_stack_page(t);
-	unsigned long fp, sp = task_thread_info(t)->pcb.ksp;
+	unsigned long top, fp, sp;
 
-	if (sp > base && sp+6*8 < base + 16*1024) {
-		fp = ((unsigned long *)sp)[6];
-		if (fp > sp && fp < base + 16*1024)
-			return *(unsigned long *)fp;
-	}
+	top = (unsigned long)task_stack_page(t) + 2 * PAGE_SIZE;
+	sp = task_thread_info(t)->pcb.ksp;
+	fp = t->thread.s[6];
+
+	if (fp > sp && fp < top)
+		return *(unsigned long *)fp;
 
 	return 0;
 }
@@ -289,7 +282,7 @@ unsigned long
 get_wchan(struct task_struct *p)
 {
 	unsigned long schedule_frame;
-	unsigned long pc, base, sp;
+	unsigned long pc, top, sp;
 
 	if (!p || p == current || p->state == TASK_RUNNING)
 		return 0;
@@ -305,10 +298,10 @@ get_wchan(struct task_struct *p)
 
 	pc = thread_saved_pc(p);
 	if (in_sched_functions(pc)) {
-		base = (unsigned long)task_stack_page(p);
+		top = (unsigned long)task_stack_page(p) + 2 * PAGE_SIZE;
 		sp = task_thread_info(p)->pcb.ksp;
-		schedule_frame = ((unsigned long *)sp)[6];
-		if (schedule_frame > sp && schedule_frame < base + 16*1024)
+		schedule_frame = p->thread.s[6];
+		if (schedule_frame > sp && schedule_frame < top)
 			return ((unsigned long *)schedule_frame)[12];
 	}
 	return pc;
-- 
Gitee


From 72ed1e6e2775b868bb84d55f3c13ff8cd204d1d8 Mon Sep 17 00:00:00 2001
From: He Sheng <hesheng@wxiat.com>
Date: Mon, 25 Apr 2022 12:57:28 +0800
Subject: [PATCH 103/674] sw64: remove switch_stack and allregs from entUna

Sunway inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GF7A

--------------------------------

It's simple to maintain unified register layout for user-mode and
kernel-mode unaligned accesses. With the new struct pt_regs, we
are able to make the exception handler cleaner.

After that, the unused struct allregs can be cleaned up.

Signed-off-by: He Sheng <hesheng@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/kernel/asm-offsets.c |  41 ------------
 arch/sw_64/kernel/entry.S       | 109 +++++---------------------------
 arch/sw_64/kernel/traps.c       |  76 +++++-----------------
 3 files changed, 33 insertions(+), 193 deletions(-)

diff --git a/arch/sw_64/kernel/asm-offsets.c b/arch/sw_64/kernel/asm-offsets.c
index 349bc5c4e2f1..0e8a814f47ce 100644
--- a/arch/sw_64/kernel/asm-offsets.c
+++ b/arch/sw_64/kernel/asm-offsets.c
@@ -111,47 +111,6 @@ void foo(void)
 	DEFINE(SWITCH_STACK_RA, offsetof(struct switch_stack, r26));
 	BLANK();
 
-	DEFINE(ALLREGS_SIZE, sizeof(struct allregs));
-	DEFINE(ALLREGS_R0, offsetof(struct allregs, regs[0]));
-	DEFINE(ALLREGS_R1, offsetof(struct allregs, regs[1]));
-	DEFINE(ALLREGS_R2, offsetof(struct allregs, regs[2]));
-	DEFINE(ALLREGS_R3, offsetof(struct allregs, regs[3]));
-	DEFINE(ALLREGS_R4, offsetof(struct allregs, regs[4]));
-	DEFINE(ALLREGS_R5, offsetof(struct allregs, regs[5]));
-	DEFINE(ALLREGS_R6, offsetof(struct allregs, regs[6]));
-	DEFINE(ALLREGS_R7, offsetof(struct allregs, regs[7]));
-	DEFINE(ALLREGS_R8, offsetof(struct allregs, regs[8]));
-	DEFINE(ALLREGS_R9, offsetof(struct allregs, regs[9]));
-	DEFINE(ALLREGS_R10, offsetof(struct allregs, regs[10]));
-	DEFINE(ALLREGS_R11, offsetof(struct allregs, regs[11]));
-	DEFINE(ALLREGS_R12, offsetof(struct allregs, regs[12]));
-	DEFINE(ALLREGS_R13, offsetof(struct allregs, regs[13]));
-	DEFINE(ALLREGS_R14, offsetof(struct allregs, regs[14]));
-	DEFINE(ALLREGS_R15, offsetof(struct allregs, regs[15]));
-	DEFINE(ALLREGS_R16, offsetof(struct allregs, regs[16]));
-	DEFINE(ALLREGS_R17, offsetof(struct allregs, regs[17]));
-	DEFINE(ALLREGS_R18, offsetof(struct allregs, regs[18]));
-	DEFINE(ALLREGS_R19, offsetof(struct allregs, regs[19]));
-	DEFINE(ALLREGS_R20, offsetof(struct allregs, regs[20]));
-	DEFINE(ALLREGS_R21, offsetof(struct allregs, regs[21]));
-	DEFINE(ALLREGS_R22, offsetof(struct allregs, regs[22]));
-	DEFINE(ALLREGS_R23, offsetof(struct allregs, regs[23]));
-	DEFINE(ALLREGS_R24, offsetof(struct allregs, regs[24]));
-	DEFINE(ALLREGS_R25, offsetof(struct allregs, regs[25]));
-	DEFINE(ALLREGS_R26, offsetof(struct allregs, regs[26]));
-	DEFINE(ALLREGS_R27, offsetof(struct allregs, regs[27]));
-	DEFINE(ALLREGS_R28, offsetof(struct allregs, regs[28]));
-	DEFINE(ALLREGS_R29, offsetof(struct allregs, regs[29]));
-	DEFINE(ALLREGS_R30, offsetof(struct allregs, regs[30]));
-	DEFINE(ALLREGS_R31, offsetof(struct allregs, regs[31]));
-	DEFINE(ALLREGS_PS, offsetof(struct allregs, ps));
-	DEFINE(ALLREGS_PC, offsetof(struct allregs, pc));
-	DEFINE(ALLREGS_GP, offsetof(struct allregs, gp));
-	DEFINE(ALLREGS_A0, offsetof(struct allregs, a0));
-	DEFINE(ALLREGS_A1, offsetof(struct allregs, a1));
-	DEFINE(ALLREGS_A2, offsetof(struct allregs, a2));
-	BLANK();
-
 	DEFINE(KVM_REGS_SIZE, sizeof(struct kvm_regs));
 	DEFINE(KVM_REGS_R0, offsetof(struct kvm_regs, r0));
 	DEFINE(KVM_REGS_R1, offsetof(struct kvm_regs, r1));
diff --git a/arch/sw_64/kernel/entry.S b/arch/sw_64/kernel/entry.S
index 730346754ecc..f54b02c1b3a9 100644
--- a/arch/sw_64/kernel/entry.S
+++ b/arch/sw_64/kernel/entry.S
@@ -152,109 +152,32 @@ entIF:
 	call	$31, do_entIF
 	.end entIF
 
+/*
+ * Handle unalignment exception.
+ * We don't handle the "gp" register correctly, but if we fault on a
+ * gp-register unaligned load/store, something is _very_ wrong in the
+ * kernel anyway.
+ */
 	.align 4
 	.globl entUna
 	.ent entUna
 entUna:
-	ldi	$sp, -ALLREGS_PS($sp)
-	stl	$0, ALLREGS_R0($sp)
-	ldl	$0, ALLREGS_PS($sp)	/* get PS */
-	stl	$1, ALLREGS_R1($sp)
-	stl	$2, ALLREGS_R2($sp)
-	stl	$3, ALLREGS_R3($sp)
-	and	$0, 8, $0	/* user mode? */
-	stl	$4, ALLREGS_R4($sp)
-	bne	$0, entUnaUser	/* yup -> do user-level unaligned fault */
-	stl	$5, ALLREGS_R5($sp)
-	stl	$6, ALLREGS_R6($sp)
-	stl	$7, ALLREGS_R7($sp)
-	stl	$8, ALLREGS_R8($sp)
-	stl	$9, ALLREGS_R9($sp)
-	stl	$10, ALLREGS_R10($sp)
-	stl	$11, ALLREGS_R11($sp)
-	stl	$12, ALLREGS_R12($sp)
-	stl	$13, ALLREGS_R13($sp)
-	stl	$14, ALLREGS_R14($sp)
-	stl	$15, ALLREGS_R15($sp)
-	/* 16-18 HMCODE-saved */
-	stl	$19, ALLREGS_R19($sp)
-	stl	$20, ALLREGS_R20($sp)
-	stl	$21, ALLREGS_R21($sp)
-	stl	$22, ALLREGS_R22($sp)
-	stl	$23, ALLREGS_R23($sp)
-	stl	$24, ALLREGS_R24($sp)
-	stl	$25, ALLREGS_R25($sp)
-	stl	$26, ALLREGS_R26($sp)
-	stl	$27, ALLREGS_R27($sp)
-	stl	$28, ALLREGS_R28($sp)
-	mov	$sp, $19
-	stl	$gp, ALLREGS_R29($sp)
+	SAVE_ALL
 	ldi	$8, 0x3fff
-	stl	$31, ALLREGS_R31($sp)
 	bic	$sp, $8, $8
+	mov	$sp, $19
+	ldl	$0, PT_REGS_PS($sp)
+	and	$0, 8, $0		/* user mode ? */
+	beq	$0, 1f
+	ldi	$26, ret_from_sys_call
+	call	$31, do_entUnaUser	/* return to ret_from_syscall */
+1:	ldl	$9, PT_REGS_GP($sp)
 	call	$26, do_entUna
-	ldl	$0, ALLREGS_R0($sp)
-	ldl	$1, ALLREGS_R1($sp)
-	ldl	$2, ALLREGS_R2($sp)
-	ldl	$3, ALLREGS_R3($sp)
-	ldl	$4, ALLREGS_R4($sp)
-	ldl	$5, ALLREGS_R5($sp)
-	ldl	$6, ALLREGS_R6($sp)
-	ldl	$7, ALLREGS_R7($sp)
-	ldl	$8, ALLREGS_R8($sp)
-	ldl	$9, ALLREGS_R9($sp)
-	ldl	$10, ALLREGS_R10($sp)
-	ldl	$11, ALLREGS_R11($sp)
-	ldl	$12, ALLREGS_R12($sp)
-	ldl	$13, ALLREGS_R13($sp)
-	ldl	$14, ALLREGS_R14($sp)
-	ldl	$15, ALLREGS_R15($sp)
-	/* 16-18 HMCODE-saved */
-	ldl	$19, ALLREGS_R19($sp)
-	ldl	$20, ALLREGS_R20($sp)
-	ldl	$21, ALLREGS_R21($sp)
-	ldl	$22, ALLREGS_R22($sp)
-	ldl	$23, ALLREGS_R23($sp)
-	ldl	$24, ALLREGS_R24($sp)
-	ldl	$25, ALLREGS_R25($sp)
-	ldl	$26, ALLREGS_R26($sp)
-	ldl	$27, ALLREGS_R27($sp)
-	ldl	$28, ALLREGS_R28($sp)
-	ldl	$gp, ALLREGS_R29($sp)
-	ldi	$sp, ALLREGS_PS($sp)
+	stl	$9, PT_REGS_GP($sp)
+	RESTORE_ALL
 	sys_call HMC_rti
 	.end entUna
 
-	.align 4
-	.ent entUnaUser
-entUnaUser:
-	ldl	$0, ALLREGS_R0($sp)	/* restore original $0 */
-	ldi	$sp, ALLREGS_PS($sp)	/* pop entUna's stack frame */
-	SAVE_ALL		/* setup normal kernel stack */
-	ldi	$sp, -SWITCH_STACK_RA($sp)
-	stl	$9, SWITCH_STACK_R9($sp)
-	stl	$10, SWITCH_STACK_R10($sp)
-	stl	$11, SWITCH_STACK_R11($sp)
-	stl	$12, SWITCH_STACK_R12($sp)
-	stl	$13, SWITCH_STACK_R13($sp)
-	stl	$14, SWITCH_STACK_R14($sp)
-	stl	$15, SWITCH_STACK_R15($sp)
-	ldi	$8, 0x3fff
-	addl	$sp, SWITCH_STACK_RA, $19
-	bic	$sp, $8, $8
-	call	$26, do_entUnaUser
-	ldl	$9, SWITCH_STACK_R9($sp)
-	ldl	$10, SWITCH_STACK_R10($sp)
-	ldl	$11, SWITCH_STACK_R11($sp)
-	ldl	$12, SWITCH_STACK_R12($sp)
-	ldl	$13, SWITCH_STACK_R13($sp)
-	ldl	$14, SWITCH_STACK_R14($sp)
-	ldl	$15, SWITCH_STACK_R15($sp)
-	ldi	$sp, SWITCH_STACK_RA($sp)
-	br	ret_from_sys_call
-	.end entUnaUser
-
-
 /*
  * The system call entry point is special.  Most importantly, it looks
  * like a function call to userspace as far as clobbered registers.  We
diff --git a/arch/sw_64/kernel/traps.c b/arch/sw_64/kernel/traps.c
index b7c4e45df87b..6cd3ade905d1 100644
--- a/arch/sw_64/kernel/traps.c
+++ b/arch/sw_64/kernel/traps.c
@@ -321,41 +321,35 @@ do_entIF(unsigned long inst_type, struct pt_regs *regs)
 	force_sig_fault(SIGILL, ILL_ILLOPC, (void __user *)regs->pc, 0);
 }
 
-/*
- * entUna has a different register layout to be reasonably simple. It
- * needs access to all the integer registers (the kernel doesn't use
- * fp-regs), and it needs to have them in order for simpler access.
- *
- * Due to the non-standard register layout (and because we don't want
- * to handle floating-point regs), user-mode unaligned accesses are
- * handled separately by do_entUnaUser below.
- *
- * Oh, btw, we don't handle the "gp" register correctly, but if we fault
- * on a gp-register unaligned load/store, something is _very_ wrong
- * in the kernel anyway..
- */
-struct allregs {
-	unsigned long regs[32];
-	unsigned long ps, pc, gp, a0, a1, a2;
-};
-
 struct unaligned_stat {
 	unsigned long count, va, pc;
 } unaligned[2];
 
 
 /* Macro for exception fixup code to access integer registers. */
-#define una_reg(r) (_regs[(r) >= 16 && (r) <= 18 ? (r) + 19 : (r)])
+#define R(x)	((size_t) &((struct pt_regs *)0)->x)
+
+static int regoffsets[32] = {
+	R(r0), R(r1), R(r2), R(r3), R(r4), R(r5), R(r6), R(r7), R(r8),
+	R(r9), R(r10), R(r11), R(r12), R(r13), R(r14), R(r15),
+	R(r16), R(r17), R(r18),
+	R(r19), R(r20), R(r21), R(r22), R(r23), R(r24), R(r25), R(r26),
+	R(r27), R(r28), R(gp),
+	0, 0
+};
+
+#undef R
+
+#define una_reg(r) (*(unsigned long *)((char *)regs + regoffsets[r]))
 
 
 asmlinkage void
 do_entUna(void *va, unsigned long opcode, unsigned long reg,
-	  struct allregs *regs)
+	  struct pt_regs *regs)
 {
 	long error;
 	unsigned long tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
 	unsigned long pc = regs->pc - 4;
-	unsigned long *_regs = regs->regs;
 	const struct exception_table_entry *fixup;
 
 	unaligned[0].count++;
@@ -566,29 +560,7 @@ do_entUna(void *va, unsigned long opcode, unsigned long reg,
 	printk("%s(%d): unhandled unaligned exception\n",
 	       current->comm, task_pid_nr(current));
 
-	printk("pc = [<%016lx>]  ra = [<%016lx>]  ps = %04lx\n",
-	       pc, una_reg(26), regs->ps);
-	printk("r0 = %016lx  r1 = %016lx  r2 = %016lx\n",
-	       una_reg(0), una_reg(1), una_reg(2));
-	printk("r3 = %016lx  r4 = %016lx  r5 = %016lx\n",
-	       una_reg(3), una_reg(4), una_reg(5));
-	printk("r6 = %016lx  r7 = %016lx  r8 = %016lx\n",
-	       una_reg(6), una_reg(7), una_reg(8));
-	printk("r9 = %016lx  r10= %016lx  r11= %016lx\n",
-	       una_reg(9), una_reg(10), una_reg(11));
-	printk("r12= %016lx  r13= %016lx  r14= %016lx\n",
-	       una_reg(12), una_reg(13), una_reg(14));
-	printk("r15= %016lx\n", una_reg(15));
-	printk("r16= %016lx  r17= %016lx  r18= %016lx\n",
-	       una_reg(16), una_reg(17), una_reg(18));
-	printk("r19= %016lx  r20= %016lx  r21= %016lx\n",
-	       una_reg(19), una_reg(20), una_reg(21));
-	printk("r22= %016lx  r23= %016lx  r24= %016lx\n",
-	       una_reg(22), una_reg(23), una_reg(24));
-	printk("r25= %016lx  r27= %016lx  r28= %016lx\n",
-	       una_reg(25), una_reg(27), una_reg(28));
-	printk("gp = %016lx  sp = %p\n", regs->gp, regs+1);
-
+	dik_show_regs(regs);
 	dik_show_code((unsigned int *)pc);
 	dik_show_trace((unsigned long *)(regs+1), KERN_DEFAULT);
 
@@ -668,20 +640,6 @@ s_reg_to_mem(unsigned long s_reg)
 			 1L << 0x2c | 1L << 0x2d | /* stw stl */	\
 			 1L << 0x0d | 1L << 0x0e)  /* sth stb */
 
-#define R(x)	((size_t) &((struct pt_regs *)0)->x)
-
-static int unauser_reg_offsets[32] = {
-	R(r0), R(r1), R(r2), R(r3), R(r4), R(r5), R(r6), R(r7), R(r8),
-	/* r9 ... r15 are stored in front of regs. */
-	-56, -48, -40, -32, -24, -16, -8,
-	R(r16), R(r17), R(r18),
-	R(r19), R(r20), R(r21), R(r22), R(r23), R(r24), R(r25), R(r26),
-	R(r27), R(r28), R(gp),
-	0, 0
-};
-
-#undef R
-
 asmlinkage void
 do_entUnaUser(void __user *va, unsigned long opcode,
 	      unsigned long reg, struct pt_regs *regs)
@@ -734,7 +692,7 @@ do_entUnaUser(void __user *va, unsigned long opcode,
 		/* it's an integer load/store */
 		if (reg < 30) {
 			reg_addr = (unsigned long *)
-				((char *)regs + unauser_reg_offsets[reg]);
+				((char *)regs + regoffsets[reg]);
 		} else if (reg == 30) {
 			/* usp in HMCODE regs */
 			fake_reg = rdusp();
-- 
Gitee


From b936d36d7ef16056e8b64dda3a0110a39a543bab Mon Sep 17 00:00:00 2001
From: He Sheng <hesheng@wxiat.com>
Date: Mon, 25 Apr 2022 15:09:11 +0800
Subject: [PATCH 104/674] sw64: remove switch_stack from __sw64_vcpu_run

Sunway inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GF7A

--------------------------------

Since r9~r15 have been added to struct pt_regs, it's okay to save
and restore host callee-saved integer registers with pt_regs.

Signed-off-by: He Sheng <hesheng@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/kvm/entry.S | 57 +++++++++++++++---------------------------
 1 file changed, 20 insertions(+), 37 deletions(-)

diff --git a/arch/sw_64/kvm/entry.S b/arch/sw_64/kvm/entry.S
index 76ebdda920cb..1e089da5378e 100644
--- a/arch/sw_64/kvm/entry.S
+++ b/arch/sw_64/kvm/entry.S
@@ -34,20 +34,15 @@ ENTRY(__sw64_vcpu_run)
 	/* r18 = hcall args */
 	/* save host  pt_regs to current kernel  stack */
 	ldi	sp, -PT_REGS_SIZE(sp)
-
-	stl     $8, PT_REGS_R8(sp)
+	stl	$9, PT_REGS_R9(sp)
+	stl	$10, PT_REGS_R10(sp)
+	stl	$11, PT_REGS_R11(sp)
+	stl	$12, PT_REGS_R12(sp)
+	stl	$13, PT_REGS_R13(sp)
+	stl	$14, PT_REGS_R14(sp)
+	stl	$15, PT_REGS_R15(sp)
 	stl	$26, PT_REGS_R26(sp)
 
-	/* save host switch stack to current kernel stack */
-	ldi	sp, -SWITCH_STACK_SIZE(sp)
-	stl	$9, SWITCH_STACK_R9(sp)
-	stl	$10, SWITCH_STACK_R10(sp)
-	stl	$11, SWITCH_STACK_R11(sp)
-	stl	$12, SWITCH_STACK_R12(sp)
-	stl	$13, SWITCH_STACK_R13(sp)
-	stl	$14, SWITCH_STACK_R14(sp)
-	stl	$15, SWITCH_STACK_R15(sp)
-
 	/* restore guest switch stack from guest kvm_regs struct */
 	ldl	$0, KVM_REGS_R0($17)
 	ldl	$1, KVM_REGS_R1($17)
@@ -203,23 +198,19 @@ $g_setfpec_over:
 	stl	$27, KVM_REGS_R27($17)
 	stl	$28, KVM_REGS_R28($17)
 
-	/* restore host switch stack from host sp */
-	ldl	$9, SWITCH_STACK_R9(sp)
-	ldl	$10, SWITCH_STACK_R10(sp)
-	ldl	$11, SWITCH_STACK_R11(sp)
-	ldl	$12, SWITCH_STACK_R12(sp)
-	ldl	$13, SWITCH_STACK_R13(sp)
-	ldl	$14, SWITCH_STACK_R14(sp)
-	ldl	$15, SWITCH_STACK_R15(sp)
-
-	ldi	sp, SWITCH_STACK_SIZE(sp)
-
 	/* restore host regs from host sp */
-	ldl     $8, PT_REGS_R8(sp)
+	ldl	$9, PT_REGS_R9(sp)
+	ldl	$10, PT_REGS_R10(sp)
+	ldl	$11, PT_REGS_R11(sp)
+	ldl	$12, PT_REGS_R12(sp)
+	ldl	$13, PT_REGS_R13(sp)
+	ldl	$14, PT_REGS_R14(sp)
+	ldl	$15, PT_REGS_R15(sp)
 	ldl	$26, PT_REGS_R26(sp)
-
 	ldi	sp, PT_REGS_SIZE(sp)
 
+	ldi	$8, 0x3fff
+	bic	sp, $8, $8
 	/* restore host fpregs */
 	ldl	$1, TI_TASK($8)
 	ldi	$1, TASK_THREAD($1)
@@ -261,25 +252,17 @@ $setfpec_over:
 
 	/* Hmcode will setup in  */
 	/* restore $16 $17 $18, do interrupt trick */
-	ldi	sp, -(HOST_INT_SIZE + PT_REGS_SIZE + SWITCH_STACK_SIZE)(sp)
+	ldi	sp, -(HOST_INT_SIZE + PT_REGS_SIZE)(sp)
 	ldl	$16, HOST_INT_R16(sp)
 	ldl	$17, HOST_INT_R17(sp)
 	ldl	$18, HOST_INT_R18(sp)
-	ldi	sp, (HOST_INT_SIZE + PT_REGS_SIZE + SWITCH_STACK_SIZE)(sp)
+	ldi	sp, (HOST_INT_SIZE + PT_REGS_SIZE)(sp)
 
-	ldi	$8, 0x3fff
-	bic	sp, $8, $8
 	ldi	$19, -PT_REGS_SIZE(sp)
-
-	ldi	$26, ret_from_do_entInt_noregs
-	call	$31, do_entInt
-
-	/* ret($0) indicate hcall number */
-ret_from_do_entInt_noregs:
+	call	$26, do_entInt
 	ldl	$26, VCPU_RET_RA(sp)
 	ldl	$0, VCPU_RET_R0(sp)
-
-	/* restore r16 - r19 */
 $ret_to:
+	/* ret($0) indicate hcall number */
 	ldi	sp, VCPU_RET_SIZE(sp)	/* pop stack */
 	ret
-- 
Gitee


From b9b8f159d7d859a2a6d1fcfcfa49987af59f0c87 Mon Sep 17 00:00:00 2001
From: He Sheng <hesheng@wxiat.com>
Date: Mon, 25 Apr 2022 13:39:10 +0800
Subject: [PATCH 105/674] sw64: remove other struct switch_stack things

Sunway inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GF7A

--------------------------------

Based on the previous series of patches, the do_switch_stack,
undo_switch_stack and struct switch_stack become unused. Herein
they are cleaned up.

Signed-off-by: He Sheng <hesheng@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/include/uapi/asm/ptrace.h |  15 ----
 arch/sw_64/kernel/asm-offsets.c      |  11 ---
 arch/sw_64/kernel/entry.S            | 129 ---------------------------
 arch/sw_64/kernel/traps.c            |   5 +-
 4 files changed, 2 insertions(+), 158 deletions(-)

diff --git a/arch/sw_64/include/uapi/asm/ptrace.h b/arch/sw_64/include/uapi/asm/ptrace.h
index 4549e8d4bf57..deb7e4096c93 100644
--- a/arch/sw_64/include/uapi/asm/ptrace.h
+++ b/arch/sw_64/include/uapi/asm/ptrace.h
@@ -54,21 +54,6 @@ struct pt_regs {
 	unsigned long r18;
 };
 
-/*
- * This is the extended stack used by signal handlers and the context
- * switcher: it's pushed after the normal "struct pt_regs".
- */
-struct switch_stack {
-	unsigned long r9;
-	unsigned long r10;
-	unsigned long r11;
-	unsigned long r12;
-	unsigned long r13;
-	unsigned long r14;
-	unsigned long r15;
-	unsigned long r26;
-};
-
 #define PTRACE_GETREGS		12	/* get general purpose registers */
 #define PTRACE_SETREGS		13	/* set general purpose registers */
 #define PTRACE_GETFPREGS	14	/* get floating-point registers */
diff --git a/arch/sw_64/kernel/asm-offsets.c b/arch/sw_64/kernel/asm-offsets.c
index 0e8a814f47ce..5dc59346996f 100644
--- a/arch/sw_64/kernel/asm-offsets.c
+++ b/arch/sw_64/kernel/asm-offsets.c
@@ -100,17 +100,6 @@ void foo(void)
 	DEFINE(PT_REGS_R18, offsetof(struct pt_regs, r18));
 	BLANK();
 
-	DEFINE(SWITCH_STACK_SIZE, sizeof(struct switch_stack));
-	DEFINE(SWITCH_STACK_R9, offsetof(struct switch_stack, r9));
-	DEFINE(SWITCH_STACK_R10, offsetof(struct switch_stack, r10));
-	DEFINE(SWITCH_STACK_R11, offsetof(struct switch_stack, r11));
-	DEFINE(SWITCH_STACK_R12, offsetof(struct switch_stack, r12));
-	DEFINE(SWITCH_STACK_R13, offsetof(struct switch_stack, r13));
-	DEFINE(SWITCH_STACK_R14, offsetof(struct switch_stack, r14));
-	DEFINE(SWITCH_STACK_R15, offsetof(struct switch_stack, r15));
-	DEFINE(SWITCH_STACK_RA, offsetof(struct switch_stack, r26));
-	BLANK();
-
 	DEFINE(KVM_REGS_SIZE, sizeof(struct kvm_regs));
 	DEFINE(KVM_REGS_R0, offsetof(struct kvm_regs, r0));
 	DEFINE(KVM_REGS_R1, offsetof(struct kvm_regs, r1));
diff --git a/arch/sw_64/kernel/entry.S b/arch/sw_64/kernel/entry.S
index f54b02c1b3a9..977c774ad799 100644
--- a/arch/sw_64/kernel/entry.S
+++ b/arch/sw_64/kernel/entry.S
@@ -380,135 +380,6 @@ $syscall_trace_failed:
 	br	ret_from_sys_call
 	.end strace
 
-	.align 4
-	.ent do_switch_stack
-do_switch_stack:
-	ldi	$sp, -SWITCH_STACK_SIZE($sp)
-	flds	$f31, 0($sp) /* fillde hint */
-	stl	$9, SWITCH_STACK_R9($sp)
-	stl	$10, SWITCH_STACK_R10($sp)
-	stl	$11, SWITCH_STACK_R11($sp)
-	stl	$12, SWITCH_STACK_R12($sp)
-	stl	$13, SWITCH_STACK_R13($sp)
-	stl	$14, SWITCH_STACK_R14($sp)
-	stl	$15, SWITCH_STACK_R15($sp)
-	stl	$26, SWITCH_STACK_RA($sp)
-	// SIMD-FP
-	ldl	$9, TI_TASK($8)
-	ldi	$9, TASK_THREAD($9)
-	ldi	$10, THREAD_CTX_FP($9)
-	vstd	$f0, CTX_FP_F0($10)
-	vstd	$f1, CTX_FP_F1($10)
-	vstd	$f2, CTX_FP_F2($10)
-	vstd	$f3, CTX_FP_F3($10)
-	vstd	$f4, CTX_FP_F4($10)
-	vstd	$f5, CTX_FP_F5($10)
-	vstd	$f6, CTX_FP_F6($10)
-	vstd	$f7, CTX_FP_F7($10)
-	vstd	$f8, CTX_FP_F8($10)
-	vstd	$f9, CTX_FP_F9($10)
-	vstd	$f10, CTX_FP_F10($10)
-	vstd	$f11, CTX_FP_F11($10)
-	vstd	$f12, CTX_FP_F12($10)
-	vstd	$f13, CTX_FP_F13($10)
-	vstd	$f14, CTX_FP_F14($10)
-	vstd	$f15, CTX_FP_F15($10)
-	vstd	$f16, CTX_FP_F16($10)
-	vstd	$f17, CTX_FP_F17($10)
-	vstd	$f18, CTX_FP_F18($10)
-	vstd	$f19, CTX_FP_F19($10)
-	vstd	$f20, CTX_FP_F20($10)
-	vstd	$f21, CTX_FP_F21($10)
-	vstd	$f22, CTX_FP_F22($10)
-	vstd	$f23, CTX_FP_F23($10)
-	vstd	$f24, CTX_FP_F24($10)
-	vstd	$f25, CTX_FP_F25($10)
-	vstd	$f26, CTX_FP_F26($10)
-	vstd	$f27, CTX_FP_F27($10)
-	rfpcr	$f0
-	vstd	$f28, CTX_FP_F28($10)
-	vstd	$f29, CTX_FP_F29($10)
-	vstd	$f30, CTX_FP_F30($10)
-	fstd	$f0, THREAD_FPCR($9)
-	vldd	$f0, CTX_FP_F0($10)
-	ldl	$9, SWITCH_STACK_R9($sp)
-	ldl	$10, SWITCH_STACK_R10($sp)
-	ret	$31, ($1), 1
-	.end do_switch_stack
-
-	.align 4
-	.ent undo_switch_stack
-undo_switch_stack:
-#ifdef CONFIG_SUBARCH_C3B
-	fillcs	0($sp)		/* prefetch */
-#endif
-	ldl	$11, SWITCH_STACK_R11($sp)
-	ldl	$12, SWITCH_STACK_R12($sp)
-	ldl	$13, SWITCH_STACK_R13($sp)
-	ldl	$14, SWITCH_STACK_R14($sp)
-	ldl	$15, SWITCH_STACK_R15($sp)
-	ldl	$26, SWITCH_STACK_RA($sp)
-	// SIMD-FP
-	ldl	$9, TI_TASK($8)
-	ldi	$9, TASK_THREAD($9)
-	fldd	$f0, THREAD_FPCR($9)
-	wfpcr	$f0
-	fimovd	$f0, $10
-	and	$10, 0x3, $10
-	beq	$10, $setfpec_0
-	subl	$10, 0x1, $10
-	beq	$10, $setfpec_1
-	subl	$10, 0x1, $10
-	beq	$10, $setfpec_2
-	setfpec3
-	br	$setfpec_over
-$setfpec_0:
-	setfpec0
-	br	$setfpec_over
-$setfpec_1:
-	setfpec1
-	br	$setfpec_over
-$setfpec_2:
-	setfpec2
-$setfpec_over:
-	ldi	$10, THREAD_CTX_FP($9)
-	vldd	$f0, CTX_FP_F0($10)
-	vldd	$f1, CTX_FP_F1($10)
-	vldd	$f2, CTX_FP_F2($10)
-	vldd	$f3, CTX_FP_F3($10)
-	vldd	$f4, CTX_FP_F4($10)
-	vldd	$f5, CTX_FP_F5($10)
-	vldd	$f6, CTX_FP_F6($10)
-	vldd	$f7, CTX_FP_F7($10)
-	vldd	$f8, CTX_FP_F8($10)
-	vldd	$f9, CTX_FP_F9($10)
-	vldd	$f10, CTX_FP_F10($10)
-	vldd	$f11, CTX_FP_F11($10)
-	vldd	$f12, CTX_FP_F12($10)
-	vldd	$f13, CTX_FP_F13($10)
-	vldd	$f14, CTX_FP_F14($10)
-	vldd	$f15, CTX_FP_F15($10)
-	vldd	$f16, CTX_FP_F16($10)
-	vldd	$f17, CTX_FP_F17($10)
-	vldd	$f18, CTX_FP_F18($10)
-	vldd	$f19, CTX_FP_F19($10)
-	vldd	$f20, CTX_FP_F20($10)
-	vldd	$f21, CTX_FP_F21($10)
-	vldd	$f22, CTX_FP_F22($10)
-	vldd	$f23, CTX_FP_F23($10)
-	vldd	$f24, CTX_FP_F24($10)
-	vldd	$f25, CTX_FP_F25($10)
-	vldd	$f26, CTX_FP_F26($10)
-	vldd	$f27, CTX_FP_F27($10)
-	vldd	$f28, CTX_FP_F28($10)
-	vldd	$f29, CTX_FP_F29($10)
-	vldd	$f30, CTX_FP_F30($10)
-	ldl	$9, SWITCH_STACK_R9($sp)
-	ldl	$10, SWITCH_STACK_R10($sp)
-	ldi	$sp, SWITCH_STACK_SIZE($sp)
-	ret	$31, ($1), 1
-	.end undo_switch_stack
-
 /*
  * Integer register context switch
  * The callee-saved registers must be saved and restored.
diff --git a/arch/sw_64/kernel/traps.c b/arch/sw_64/kernel/traps.c
index 6cd3ade905d1..19fbf50b6ebc 100644
--- a/arch/sw_64/kernel/traps.c
+++ b/arch/sw_64/kernel/traps.c
@@ -300,9 +300,8 @@ do_entIF(unsigned long inst_type, struct pt_regs *regs)
 	case 3: /* FEN fault */
 		/*
 		 * Irritating users can call HMC_clrfen to disable the
-		 * FPU for the process. The kernel will then trap in
-		 * do_switch_stack and undo_switch_stack when we try
-		 * to save and restore the FP registers.
+		 * FPU for the process. The kernel will then trap to
+		 * save and restore the FP registers.
 
 		 * Given that GCC by default generates code that uses the
 		 * FP registers, HMC_clrfen is not useful except for DoS
-- 
Gitee


From ef588f6bce82bdeface18c414af7ffe0a97a1773 Mon Sep 17 00:00:00 2001
From: He Sheng <hesheng@wxiat.com>
Date: Thu, 28 Apr 2022 09:32:24 +0800
Subject: [PATCH 106/674] sw64: access pt_regs with regoffsets where
 appropriate

Sunway inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GF7A

--------------------------------

The struct pt_regs of sw64 is a bit weird because r16~r18, r29 and
r30 are saved at odd location by hmcode, which makes it complicated
to reference them with regno. To improve maintainability, we map
these regno(s) in an unified way.

Signed-off-by: He Sheng <hesheng@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/include/asm/extable.h |  4 ++++
 arch/sw_64/include/asm/ptrace.h  |  3 +++
 arch/sw_64/kernel/ptrace.c       | 27 +++++++++++++--------------
 arch/sw_64/kernel/traps.c        | 31 +++++++------------------------
 arch/sw_64/mm/fault.c            |  6 +-----
 5 files changed, 28 insertions(+), 43 deletions(-)

diff --git a/arch/sw_64/include/asm/extable.h b/arch/sw_64/include/asm/extable.h
index 12b50b68a0d2..ae753772a45a 100644
--- a/arch/sw_64/include/asm/extable.h
+++ b/arch/sw_64/include/asm/extable.h
@@ -52,4 +52,8 @@ struct exception_table_entry {
 		(b)->fixup.unit = (tmp).fixup.unit;		\
 	} while (0)
 
+/* Macro for exception fixup code to access integer registers. */
+extern short regoffsets[];
+#define map_regs(r) (*(unsigned long *)((char *)regs + regoffsets[r]))
+
 #endif
diff --git a/arch/sw_64/include/asm/ptrace.h b/arch/sw_64/include/asm/ptrace.h
index 74349a05b9e4..0f0de9a3f8ce 100644
--- a/arch/sw_64/include/asm/ptrace.h
+++ b/arch/sw_64/include/asm/ptrace.h
@@ -28,6 +28,9 @@
 #define force_successful_syscall_return() (current_pt_regs()->r0 = 0)
 
 #define MAX_REG_OFFSET (offsetof(struct pt_regs, r18))
+
+extern short regoffsets[];
+
 /**
  * regs_get_register() - get register value from its offset
  * @regs:       pt_regs from which register value is gotten
diff --git a/arch/sw_64/kernel/ptrace.c b/arch/sw_64/kernel/ptrace.c
index 097590b22a5a..ce41d89a54cb 100644
--- a/arch/sw_64/kernel/ptrace.c
+++ b/arch/sw_64/kernel/ptrace.c
@@ -9,6 +9,7 @@
 #include <linux/audit.h>
 
 #include <asm/reg.h>
+#include <asm/asm-offsets.h>
 
 #include "proto.h"
 
@@ -52,23 +53,21 @@ enum {
 	REG_GP = 29
 };
 
-#define PT_REG(reg) \
-	(PAGE_SIZE * 2 - sizeof(struct pt_regs) + offsetof(struct pt_regs, reg))
-
 #define FP_REG(fp_regno, vector_regno) \
 	(fp_regno * 32 + vector_regno * 8)
 
-static int regoff[] = {
-	PT_REG(r0), PT_REG(r1), PT_REG(r2), PT_REG(r3),
-	PT_REG(r4), PT_REG(r5), PT_REG(r6), PT_REG(r7),
-	PT_REG(r8), PT_REG(r9), PT_REG(r10), PT_REG(r11),
-	PT_REG(r12), PT_REG(r13), PT_REG(r14), PT_REG(r15),
-	PT_REG(r16), PT_REG(r17), PT_REG(r18), PT_REG(r19),
-	PT_REG(r20), PT_REG(r21), PT_REG(r22), PT_REG(r23),
-	PT_REG(r24), PT_REG(r25), PT_REG(r26), PT_REG(r27),
-	PT_REG(r28), PT_REG(gp), -1, -1
+#define R(x)	((size_t) &((struct pt_regs *)0)->x)
+
+short regoffsets[32] = {
+	R(r0), R(r1), R(r2), R(r3), R(r4), R(r5), R(r6), R(r7), R(r8),
+	R(r9), R(r10), R(r11), R(r12), R(r13), R(r14), R(r15),
+	R(r16), R(r17), R(r18),
+	R(r19), R(r20), R(r21), R(r22), R(r23), R(r24), R(r25), R(r26),
+	R(r27), R(r28), R(gp), 0, 0
 };
 
+#undef R
+
 #define PCB_OFF(var)	offsetof(struct pcb_struct, var)
 
 static int pcboff[] = {
@@ -104,7 +103,7 @@ get_reg_addr(struct task_struct *task, unsigned long regno)
 		addr = (void *)task_thread_info(task) + pcboff[regno];
 		break;
 	case REG_BASE ... REG_END:
-		addr = (void *)task_thread_info(task) + regoff[regno];
+		addr = (void *)task_pt_regs(task) + regoffsets[regno];
 		break;
 	case FPREG_BASE ... FPREG_END:
 		fp_regno = regno - FPREG_BASE;
@@ -128,7 +127,7 @@ get_reg_addr(struct task_struct *task, unsigned long regno)
 		addr = (void *)&task->thread.fpcr;
 		break;
 	case PC:
-		addr = (void *)task_thread_info(task) + PT_REG(pc);
+		addr = (void *)task_pt_regs(task) + PT_REGS_PC;
 		break;
 	default:
 		addr = &zero;
diff --git a/arch/sw_64/kernel/traps.c b/arch/sw_64/kernel/traps.c
index 19fbf50b6ebc..a61c851967a9 100644
--- a/arch/sw_64/kernel/traps.c
+++ b/arch/sw_64/kernel/traps.c
@@ -325,23 +325,6 @@ struct unaligned_stat {
 } unaligned[2];
 
 
-/* Macro for exception fixup code to access integer registers. */
-#define R(x)	((size_t) &((struct pt_regs *)0)->x)
-
-static int regoffsets[32] = {
-	R(r0), R(r1), R(r2), R(r3), R(r4), R(r5), R(r6), R(r7), R(r8),
-	R(r9), R(r10), R(r11), R(r12), R(r13), R(r14), R(r15),
-	R(r16), R(r17), R(r18),
-	R(r19), R(r20), R(r21), R(r22), R(r23), R(r24), R(r25), R(r26),
-	R(r27), R(r28), R(gp),
-	0, 0
-};
-
-#undef R
-
-#define una_reg(r) (*(unsigned long *)((char *)regs + regoffsets[r]))
-
-
 asmlinkage void
 do_entUna(void *va, unsigned long opcode, unsigned long reg,
 	  struct pt_regs *regs)
@@ -380,7 +363,7 @@ do_entUna(void *va, unsigned long opcode, unsigned long reg,
 
 		if (error)
 			goto got_exception;
-		una_reg(reg) = tmp1 | tmp2;
+		map_regs(reg) = tmp1 | tmp2;
 		return;
 
 	case 0x22:
@@ -401,7 +384,7 @@ do_entUna(void *va, unsigned long opcode, unsigned long reg,
 
 		if (error)
 			goto got_exception;
-		una_reg(reg) = (int)(tmp1 | tmp2);
+		map_regs(reg) = (int)(tmp1 | tmp2);
 		return;
 
 	case 0x23: /* ldl */
@@ -422,7 +405,7 @@ do_entUna(void *va, unsigned long opcode, unsigned long reg,
 
 		if (error)
 			goto got_exception;
-		una_reg(reg) = tmp1 | tmp2;
+		map_regs(reg) = tmp1 | tmp2;
 		return;
 
 	case 0x29: /* sth */
@@ -440,7 +423,7 @@ do_entUna(void *va, unsigned long opcode, unsigned long reg,
 		".previous"
 		: "=r"(error), "=&r"(tmp1), "=&r"(tmp2),
 		"=&r"(tmp3), "=&r"(tmp4)
-		: "r"(va), "r"(una_reg(reg)), "0"(0));
+		: "r"(va), "r"(map_regs(reg)), "0"(0));
 
 		if (error)
 			goto got_exception;
@@ -472,7 +455,7 @@ do_entUna(void *va, unsigned long opcode, unsigned long reg,
 		".previous"
 		: "=r"(error), "=&r"(tmp1), "=&r"(tmp2),
 		  "=&r"(tmp3), "=&r"(tmp4)
-		: "r"(va), "r"(una_reg(reg)), "0"(0));
+		: "r"(va), "r"(map_regs(reg)), "0"(0));
 
 		if (error)
 			goto got_exception;
@@ -524,7 +507,7 @@ do_entUna(void *va, unsigned long opcode, unsigned long reg,
 		".previous"
 		: "=r"(error), "=&r"(tmp1), "=&r"(tmp2), "=&r"(tmp3),
 		"=&r"(tmp4), "=&r"(tmp5), "=&r"(tmp6), "=&r"(tmp7), "=&r"(tmp8)
-		: "r"(va), "r"(una_reg(reg)), "0"(0));
+		: "r"(va), "r"(map_regs(reg)), "0"(0));
 
 		if (error)
 			goto got_exception;
@@ -543,7 +526,7 @@ do_entUna(void *va, unsigned long opcode, unsigned long reg,
 	if (fixup != 0) {
 		unsigned long newpc;
 
-		newpc = fixup_exception(una_reg, fixup, pc);
+		newpc = fixup_exception(map_regs, fixup, pc);
 		printk("Forwarding unaligned exception at %lx (%lx)\n",
 		       pc, newpc);
 
diff --git a/arch/sw_64/mm/fault.c b/arch/sw_64/mm/fault.c
index 9562bc3ba37f..9b0ad9eb5a3a 100644
--- a/arch/sw_64/mm/fault.c
+++ b/arch/sw_64/mm/fault.c
@@ -107,10 +107,6 @@ __load_new_mm_context(struct mm_struct *next_mm)
  * modify them.
  */
 
-/* Macro for exception fixup code to access integer registers.  */
-#define dpf_reg(r)							\
-	(((unsigned long *)regs)[(r) <= 8 ? (r) : (r) <= 15 ? (r)-16 :	\
-				 (r) <= 18 ? (r)+10 : (r)-10])
 unsigned long show_va_to_pa(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pgd = NULL;
@@ -294,7 +290,7 @@ do_page_fault(unsigned long address, unsigned long mmcsr,
 	if (fixup != 0) {
 		unsigned long newpc;
 
-		newpc = fixup_exception(dpf_reg, fixup, regs->pc);
+		newpc = fixup_exception(map_regs, fixup, regs->pc);
 		regs->pc = newpc;
 		return;
 	}
-- 
Gitee


From ca223ff216cdab7bb985b24221f92e1495ce4bc7 Mon Sep 17 00:00:00 2001
From: He Sheng <hesheng@wxiat.com>
Date: Sat, 14 May 2022 11:07:13 +0800
Subject: [PATCH 107/674] sw64: move struct pt_regs to kapi ptrace.h

Sunway inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GF7A

--------------------------------

This struct should be invisible to user space, and we will define
new regs struct for user in uapi later.

Signed-off-by: He Sheng <hesheng@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/include/asm/ptrace.h      | 47 +++++++++++++++++++++++++
 arch/sw_64/include/uapi/asm/ptrace.h | 52 ----------------------------
 2 files changed, 47 insertions(+), 52 deletions(-)

diff --git a/arch/sw_64/include/asm/ptrace.h b/arch/sw_64/include/asm/ptrace.h
index 0f0de9a3f8ce..85f7a47fdee3 100644
--- a/arch/sw_64/include/asm/ptrace.h
+++ b/arch/sw_64/include/asm/ptrace.h
@@ -9,6 +9,53 @@
 #include <asm/processor.h>
 #include <asm/page.h>
 
+/*
+ * This struct defines the way the registers are stored on the
+ * kernel stack during a system call or other kernel entry
+ */
+
+struct pt_regs {
+	unsigned long r0;
+	unsigned long r1;
+	unsigned long r2;
+	unsigned long r3;
+	unsigned long r4;
+	unsigned long r5;
+	unsigned long r6;
+	unsigned long r7;
+	unsigned long r8;
+	unsigned long r9;
+	unsigned long r10;
+	unsigned long r11;
+	unsigned long r12;
+	unsigned long r13;
+	unsigned long r14;
+	unsigned long r15;
+	/* r16 ~ r18 saved by hmcode */
+	unsigned long r19;
+	unsigned long r20;
+	unsigned long r21;
+	unsigned long r22;
+	unsigned long r23;
+	unsigned long r24;
+	unsigned long r25;
+	unsigned long r26;
+	unsigned long r27;
+	unsigned long r28;
+	unsigned long hae;
+/* JRP - These are the values provided to a0-a2 by HMcode */
+	unsigned long trap_a0;
+	unsigned long trap_a1;
+	unsigned long trap_a2;
+/* These are saved by HMcode: */
+	unsigned long ps;
+	unsigned long pc;
+	unsigned long gp;
+	unsigned long r16;
+	unsigned long r17;
+	unsigned long r18;
+};
+
 #define arch_has_single_step()		(1)
 #define user_mode(regs) (((regs)->ps & 8) != 0)
 #define instruction_pointer(regs) ((regs)->pc)
diff --git a/arch/sw_64/include/uapi/asm/ptrace.h b/arch/sw_64/include/uapi/asm/ptrace.h
index deb7e4096c93..ac7ce6b6510b 100644
--- a/arch/sw_64/include/uapi/asm/ptrace.h
+++ b/arch/sw_64/include/uapi/asm/ptrace.h
@@ -2,58 +2,6 @@
 #ifndef _UAPI_ASM_SW64_PTRACE_H
 #define _UAPI_ASM_SW64_PTRACE_H
 
-
-/*
- * This struct defines the way the registers are stored on the
- * kernel stack during a system call or other kernel entry
- *
- * NOTE! I want to minimize the overhead of system calls, so this
- * struct has as little information as possible.  I does not have
- * floating point regs, because the kernel doesn't change those
- */
-
-struct pt_regs {
-	unsigned long r0;
-	unsigned long r1;
-	unsigned long r2;
-	unsigned long r3;
-	unsigned long r4;
-	unsigned long r5;
-	unsigned long r6;
-	unsigned long r7;
-	unsigned long r8;
-	unsigned long r9;
-	unsigned long r10;
-	unsigned long r11;
-	unsigned long r12;
-	unsigned long r13;
-	unsigned long r14;
-	unsigned long r15;
-	/* r16 ~ r18 saved by hmcode */
-	unsigned long r19;
-	unsigned long r20;
-	unsigned long r21;
-	unsigned long r22;
-	unsigned long r23;
-	unsigned long r24;
-	unsigned long r25;
-	unsigned long r26;
-	unsigned long r27;
-	unsigned long r28;
-	unsigned long hae;
-/* JRP - These are the values provided to a0-a2 by HMcode */
-	unsigned long trap_a0;
-	unsigned long trap_a1;
-	unsigned long trap_a2;
-/* These are saved by HMcode: */
-	unsigned long ps;
-	unsigned long pc;
-	unsigned long gp;
-	unsigned long r16;
-	unsigned long r17;
-	unsigned long r18;
-};
-
 #define PTRACE_GETREGS		12	/* get general purpose registers */
 #define PTRACE_SETREGS		13	/* set general purpose registers */
 #define PTRACE_GETFPREGS	14	/* get floating-point registers */
-- 
Gitee


From 2ac1d37a4635c4fd4fb9a98889bbeb7bfe9aef87 Mon Sep 17 00:00:00 2001
From: He Sheng <hesheng@wxiat.com>
Date: Mon, 16 May 2022 10:31:57 +0800
Subject: [PATCH 108/674] sw64: avoid copying thread_struct twice

Sunway inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GF7A

--------------------------------

The thread_struct has been copied in dup_task_struct() by generic
arch_dup_task_struct(), and it's unnecessary to be copied again
in copy_thread(). This patch adds arch-specific implementation of
arch_dup_task_struct() to reduce data copying.

It does not have to save fpu state for a kernel thread, so call
fpstate_save() instead of __fpstate_save() here to save fpu state
on demand.

Signed-off-by: He Sheng <hesheng@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/kernel/process.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/arch/sw_64/kernel/process.c b/arch/sw_64/kernel/process.c
index 1f093c5a5235..6b5f0f3a8345 100644
--- a/arch/sw_64/kernel/process.c
+++ b/arch/sw_64/kernel/process.c
@@ -144,6 +144,13 @@ release_thread(struct task_struct *dead_task)
 {
 }
 
+int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
+{
+	fpstate_save(src);
+	*dst = *src;
+	return 0;
+}
+
 /*
  * Copy architecture-specific thread state
  */
@@ -162,8 +169,6 @@ copy_thread(unsigned long clone_flags, unsigned long usp,
 
 	childti->pcb.ksp = (unsigned long) childregs;
 	childti->pcb.flags = 7;	/* set FEN, clear everything else */
-	__fpstate_save(current);
-	p->thread = current->thread;
 
 	if (unlikely(p->flags & PF_KTHREAD)) {
 		/* kernel thread */
-- 
Gitee


From cb3b05c7af85291bc8fa870f542a6176ee6b126a Mon Sep 17 00:00:00 2001
From: He Chuyue <hechuyue@wxiat.com>
Date: Mon, 16 May 2022 16:34:52 +0800
Subject: [PATCH 109/674] sw64: simplify __phys_addr and __virt_addr_valid

Sunway inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I56OLG

--------------------------------

In these methods, the local variable `y` is redundant and can be
removed.

Signed-off-by: He Chuyue <hechuyue@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/mm/physaddr.c | 30 +++++++++++++-----------------
 1 file changed, 13 insertions(+), 17 deletions(-)

diff --git a/arch/sw_64/mm/physaddr.c b/arch/sw_64/mm/physaddr.c
index fbb489ae4db5..26769f0bf7bf 100644
--- a/arch/sw_64/mm/physaddr.c
+++ b/arch/sw_64/mm/physaddr.c
@@ -5,34 +5,30 @@
 
 unsigned long __phys_addr(unsigned long x)
 {
-	unsigned long y = x;
-
-	if (y >= __START_KERNEL_map) {
-		y -= __START_KERNEL_map;
-		VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE);
+	if (x >= __START_KERNEL_map) {
+		x -= __START_KERNEL_map;
+		VIRTUAL_BUG_ON(x >= KERNEL_IMAGE_SIZE);
 	} else {
-		VIRTUAL_BUG_ON(y < PAGE_OFFSET);
-		y -= PAGE_OFFSET;
-		VIRTUAL_BUG_ON(!phys_addr_valid(y));
+		VIRTUAL_BUG_ON(x < PAGE_OFFSET);
+		x -= PAGE_OFFSET;
+		VIRTUAL_BUG_ON(!phys_addr_valid(x));
 	}
-	return y;
+	return x;
 }
 EXPORT_SYMBOL(__phys_addr);
 
 bool __virt_addr_valid(unsigned long x)
 {
-	unsigned long y = x;
-
-	if (y >= __START_KERNEL_map) {
-		y -= __START_KERNEL_map;
-		if (y >= KERNEL_IMAGE_SIZE)
+	if (x >= __START_KERNEL_map) {
+		x -= __START_KERNEL_map;
+		if (x >= KERNEL_IMAGE_SIZE)
 			return false;
 	} else {
-		if (y < PAGE_OFFSET)
+		if (x < PAGE_OFFSET)
 			return false;
-		y -= PAGE_OFFSET;
+		x -= PAGE_OFFSET;
 	}
 
-	return pfn_valid(y >> PAGE_SHIFT);
+	return pfn_valid(x >> PAGE_SHIFT);
 }
 EXPORT_SYMBOL(__virt_addr_valid);
-- 
Gitee


From 5c841330a078634752deca6aaf3fd749f092a0eb Mon Sep 17 00:00:00 2001
From: He Sheng <hesheng@wxiat.com>
Date: Tue, 17 May 2022 15:05:15 +0800
Subject: [PATCH 110/674] sw64: simplify pgtable helpers

Sunway inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I56OLG

--------------------------------

This patch:
  - Remove unused macros.
  - Rename some variables and macros to make them shorter.
  - Add pfn_to_virt and virt_to_pfn to make code cleaner.
  - Map vaddr by __va().
  - Redefine mk_pte with pfn_to_pte.
  - Fix some wrong usages, like pgd_offset

Signed-off-by: He Sheng <hesheng@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/include/asm/mmu_context.h |  8 +--
 arch/sw_64/include/asm/mmzone.h      | 23 -------
 arch/sw_64/include/asm/page.h        |  3 +
 arch/sw_64/include/asm/pgalloc.h     |  2 +-
 arch/sw_64/include/asm/pgtable.h     | 91 +++++++++-------------------
 arch/sw_64/mm/fault.c                |  6 +-
 arch/sw_64/mm/init.c                 |  2 +-
 7 files changed, 40 insertions(+), 95 deletions(-)

diff --git a/arch/sw_64/include/asm/mmu_context.h b/arch/sw_64/include/asm/mmu_context.h
index e3d7ae7c873e..d6cd01d55712 100644
--- a/arch/sw_64/include/asm/mmu_context.h
+++ b/arch/sw_64/include/asm/mmu_context.h
@@ -131,7 +131,7 @@ switch_mm(struct mm_struct *prev_mm, struct mm_struct *next_mm,
 	 * Always update the PCB PTBR. If next is kernel thread, it must
 	 * update PTBR. If next is user process, it's ok to update PTBR.
 	 */
-	task_thread_info(next)->pcb.ptbr = (__pa(next_mm->pgd)) >> PAGE_SHIFT;
+	task_thread_info(next)->pcb.ptbr = virt_to_pfn(next_mm->pgd);
 	load_asn_ptbr(task_thread_info(next)->pcb.asn, task_thread_info(next)->pcb.ptbr);
 }
 
@@ -170,8 +170,7 @@ static inline int init_new_context(struct task_struct *tsk,
 	for_each_possible_cpu(i)
 		mm->context.asid[i] = 0;
 	if (tsk != current)
-		task_thread_info(tsk)->pcb.ptbr
-			= (__pa(mm->pgd)) >> PAGE_SHIFT;
+		task_thread_info(tsk)->pcb.ptbr = virt_to_pfn(mm->pgd);
 	return 0;
 }
 
@@ -183,8 +182,7 @@ static inline void destroy_context(struct mm_struct *mm)
 static inline void enter_lazy_tlb(struct mm_struct *mm,
 				  struct task_struct *tsk)
 {
-	task_thread_info(tsk)->pcb.ptbr
-		= (__pa(mm->pgd)) >> PAGE_SHIFT;
+	task_thread_info(tsk)->pcb.ptbr = virt_to_pfn(mm->pgd);
 }
 
 static inline int arch_dup_mmap(struct mm_struct *oldmm,
diff --git a/arch/sw_64/include/asm/mmzone.h b/arch/sw_64/include/asm/mmzone.h
index 924e33f6d326..4219015c5cfc 100644
--- a/arch/sw_64/include/asm/mmzone.h
+++ b/arch/sw_64/include/asm/mmzone.h
@@ -18,29 +18,6 @@ extern pg_data_t *node_data[];
 extern int pa_to_nid(unsigned long pa);
 extern int pfn_valid(unsigned long pfn);
 
-#define mk_pte(page, pgprot)							\
-({										\
-	pte_t pte;								\
-	unsigned long pfn;							\
-										\
-	pfn = page_to_pfn(page) << _PTE_FLAGS_BITS;				\
-	pte_val(pte) = pfn | pgprot_val(pgprot);				\
-										\
-	pte;									\
-})
-
-#define pte_page(x)								\
-({										\
-	unsigned long kvirt;							\
-	struct page *__xx;							\
-										\
-	kvirt = (unsigned long)__va(pte_val(x) >> (_PTE_FLAGS_BITS-PAGE_SHIFT));\
-	__xx = virt_to_page(kvirt);						\
-										\
-	__xx;									\
-})
-
-#define page_to_pa(page)	(page_to_pfn(page) << PAGE_SHIFT)
 #define pfn_to_nid(pfn)		pa_to_nid(((u64)(pfn) << PAGE_SHIFT))
 #endif /* CONFIG_DISCONTIGMEM */
 
diff --git a/arch/sw_64/include/asm/page.h b/arch/sw_64/include/asm/page.h
index 363785c9f082..dc6a89c37231 100644
--- a/arch/sw_64/include/asm/page.h
+++ b/arch/sw_64/include/asm/page.h
@@ -50,6 +50,9 @@ extern unsigned long __phys_addr(unsigned long);
 #define virt_to_page(kaddr)	pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
 #define virt_addr_valid(kaddr)	pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
 
+#define virt_to_pfn(vaddr)	(PHYS_PFN(__pa(vaddr)))
+#define pfn_to_virt(pfn)	(__va(PFN_PHYS(pfn)))
+
 #ifdef CONFIG_FLATMEM
 #define pfn_valid(pfn)		((pfn) < max_mapnr)
 #endif /* CONFIG_FLATMEM */
diff --git a/arch/sw_64/include/asm/pgalloc.h b/arch/sw_64/include/asm/pgalloc.h
index 3cfdcbef7ef8..9572b4709ff4 100644
--- a/arch/sw_64/include/asm/pgalloc.h
+++ b/arch/sw_64/include/asm/pgalloc.h
@@ -15,7 +15,7 @@
 static inline void
 pmd_populate(struct mm_struct *mm, pmd_t *pmd, pgtable_t pte)
 {
-	pmd_set(pmd, (pte_t *)(page_to_pa(pte) + PAGE_OFFSET));
+	pmd_set(pmd, (pte_t *)__va(page_to_pa(pte)));
 }
 #define pmd_pgtable(pmd) pmd_page(pmd)
 
diff --git a/arch/sw_64/include/asm/pgtable.h b/arch/sw_64/include/asm/pgtable.h
index 590f15508e28..67637b4ddf1d 100644
--- a/arch/sw_64/include/asm/pgtable.h
+++ b/arch/sw_64/include/asm/pgtable.h
@@ -119,9 +119,8 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
 #define __ACCESS_BITS	(_PAGE_ACCESSED | _PAGE_KRE | _PAGE_URE)
 
 
-#define _PFN_MASK	0xFFFFFFFFF0000000UL
-#define _PFN_BITS	36
-#define _PTE_FLAGS_BITS	(64 - _PFN_BITS)
+#define _PFN_SHIFT	28
+#define _PFN_MASK	((-1UL) << _PFN_SHIFT)
 
 #define _PAGE_TABLE	(_PAGE_VALID | __DIRTY_BITS | __ACCESS_BITS)
 #define _PAGE_CHG_MASK	(_PFN_MASK | __DIRTY_BITS | __ACCESS_BITS | _PAGE_SPECIAL)
@@ -181,53 +180,19 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
 extern struct page *empty_zero_page;
 #define ZERO_PAGE(vaddr)		(empty_zero_page)
 
-/* number of bits that fit into a memory pointer */
-#define BITS_PER_PTR			(8 * sizeof(unsigned long))
-
-/* to align the pointer to a pointer address */
-#define PTR_MASK			(~(sizeof(void *) - 1))
-
-/* sizeof(void*)==1<<SIZEOF_PTR_LOG2 */
-#define SIZEOF_PTR_LOG2			3
-
-/* to find an entry in a page-table */
-#define PAGE_PTR(address) \
-	((unsigned long)(address) >> (PAGE_SHIFT - SIZEOF_PTR_LOG2) & PTR_MASK & ~PAGE_MASK)
-
-#define PHYS_TWIDDLE(pfn)		(pfn)
-
-/*
- * Conversion functions:  convert a page and protection to a page entry,
- * and a page entry and page directory to the page they refer to.
- */
-#define page_to_pa(page)		(page_to_pfn(page) << PAGE_SHIFT)
-
-#define pmd_pfn(pmd)			(pmd_val(pmd) >> _PTE_FLAGS_BITS)
-#define pte_pfn(pte)			(pte_val(pte) >> _PTE_FLAGS_BITS)
-#ifndef CONFIG_DISCONTIGMEM
-#define pte_page(pte)			pfn_to_page(pte_pfn(pte))
-#define mk_pte(page, pgprot)						\
-({									\
-	pte_t pte;							\
-									\
-	pte_val(pte) = (page_to_pfn(page) << _PTE_FLAGS_BITS) | pgprot_val(pgprot);	\
-	pte;								\
-})
-#endif
-
-static inline pte_t pfn_pte(unsigned long physpfn, pgprot_t pgprot)
+static inline pte_t pfn_pte(unsigned long pfn, pgprot_t prot)
 {
 	pte_t pte;
 
-	pte_val(pte) = (PHYS_TWIDDLE(physpfn) << _PTE_FLAGS_BITS) | pgprot_val(pgprot);
+	pte_val(pte) = (pfn << _PFN_SHIFT) | pgprot_val(prot);
 	return pte;
 }
 
-static inline pmd_t pfn_pmd(unsigned long physpfn, pgprot_t pgprot)
+static inline pmd_t pfn_pmd(unsigned long pfn, pgprot_t prot)
 {
 	pmd_t pmd;
 
-	pmd_val(pmd) = (PHYS_TWIDDLE(physpfn) << _PTE_FLAGS_BITS) | pgprot_val(pgprot);
+	pmd_val(pmd) = (pfn << _PFN_SHIFT) | pgprot_val(prot);
 	return pmd;
 }
 
@@ -245,37 +210,48 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
 
 static inline void pmd_set(pmd_t *pmdp, pte_t *ptep)
 {
-	pmd_val(*pmdp) = _PAGE_TABLE | (__pa(ptep) << (_PTE_FLAGS_BITS - PAGE_SHIFT));
+	pmd_val(*pmdp) = _PAGE_TABLE | (virt_to_pfn(ptep) << _PFN_SHIFT);
 }
 
 static inline void pud_set(pud_t *pudp, pmd_t *pmdp)
 {
-	pud_val(*pudp) = _PAGE_TABLE | (__pa(pmdp) << (_PTE_FLAGS_BITS - PAGE_SHIFT));
+	pud_val(*pudp) = _PAGE_TABLE | (virt_to_pfn(pmdp) << _PFN_SHIFT);
 }
 
 static inline void p4d_set(p4d_t *p4dp, pud_t *pudp)
 {
-	p4d_val(*p4dp) = _PAGE_TABLE | (__pa(pudp) << (_PTE_FLAGS_BITS - PAGE_SHIFT));
+	p4d_val(*p4dp) = _PAGE_TABLE | (virt_to_pfn(pudp) << _PFN_SHIFT);
 }
 
-static inline unsigned long
-pmd_page_vaddr(pmd_t pmd)
+static inline unsigned long pmd_page_vaddr(pmd_t pmd)
 {
-	return ((pmd_val(pmd) & _PFN_MASK) >> (_PTE_FLAGS_BITS-PAGE_SHIFT)) + PAGE_OFFSET;
+	return (unsigned long)pfn_to_virt(pmd_val(pmd) >> _PFN_SHIFT);
 }
 
-#define pmd_page(pmd)		(pfn_to_page(pmd_val(pmd) >> _PTE_FLAGS_BITS))
-#define pud_page(pud)		(pfn_to_page(pud_val(pud) >> _PTE_FLAGS_BITS))
-#define p4d_page(p4d)		(pfn_to_page(p4d_val(p4d) >> _PTE_FLAGS_BITS))
+/*
+ * Conversion functions:  convert a page and protection to a page entry,
+ * and a page entry and page directory to the page they refer to.
+ */
+#define page_to_pa(page)	(page_to_pfn(page) << PAGE_SHIFT)
+
+#define pmd_pfn(pmd)		(pmd_val(pmd) >> _PFN_SHIFT)
+#define pte_pfn(pte)		(pte_val(pte) >> _PFN_SHIFT)
+
+#define pte_page(pte)		pfn_to_page(pte_pfn(pte))
+#define mk_pte(page, prot)	pfn_pte(page_to_pfn(page), prot)
+
+#define pmd_page(pmd)		(pfn_to_page(pmd_val(pmd) >> _PFN_SHIFT))
+#define pud_page(pud)		(pfn_to_page(pud_val(pud) >> _PFN_SHIFT))
+#define p4d_page(p4d)		(pfn_to_page(p4d_val(p4d) >> _PFN_SHIFT))
 
 static inline pud_t *p4d_pgtable(p4d_t p4d)
 {
-	return (pud_t *)(PAGE_OFFSET + ((p4d_val(p4d) & _PFN_MASK) >> (_PTE_FLAGS_BITS-PAGE_SHIFT)));
+	return (pud_t *)pfn_to_virt(p4d_val(p4d) >> _PFN_SHIFT);
 }
 
 static inline pmd_t *pud_pgtable(pud_t pud)
 {
-	return (pmd_t *)(PAGE_OFFSET + ((pud_val(pud) & _PFN_MASK) >> (_PTE_FLAGS_BITS-PAGE_SHIFT)));
+	return (pmd_t *)pfn_to_virt(pud_val(pud) >> _PFN_SHIFT);
 }
 
 static inline int pte_none(pte_t pte)
@@ -566,7 +542,7 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
 	set_bit(_PAGE_BIT_FOW, (unsigned long *)pmdp);
 }
 
-#define mk_pmd(page, pgprot)   pfn_pmd(page_to_pfn(page), (pgprot))
+#define mk_pmd(page, prot)	pfn_pmd(page_to_pfn(page), (prot))
 
 #define  __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
 extern int pmdp_set_access_flags(struct vm_area_struct *vma,
@@ -586,15 +562,6 @@ extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
 extern void pmdp_splitting_flush(struct vm_area_struct *vma,
 				 unsigned long addr, pmd_t *pmdp);
 
-#define PAGE_DIR_OFFSET(tsk, address) pgd_offset((tsk), (address))
-
-/* to find an entry in a kernel page-table-directory */
-#define pgd_offset_k(address) pgd_offset(&init_mm, (address))
-
-/* to find an entry in a page-table-directory. */
-#define pgd_index(address)	(((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
-#define pgd_offset(mm, address)	((mm)->pgd+pgd_index(address))
-
 extern pgd_t swapper_pg_dir[1024];
 
 /*
diff --git a/arch/sw_64/mm/fault.c b/arch/sw_64/mm/fault.c
index 9b0ad9eb5a3a..d596fc50772d 100644
--- a/arch/sw_64/mm/fault.c
+++ b/arch/sw_64/mm/fault.c
@@ -80,7 +80,7 @@ __load_new_mm_context(struct mm_struct *next_mm)
 
 	pcb = &current_thread_info()->pcb;
 	pcb->asn = mmc & HARDWARE_ASN_MASK;
-	pcb->ptbr = ((unsigned long) next_mm->pgd - PAGE_OFFSET) >> PAGE_SHIFT;
+	pcb->ptbr = virt_to_pfn(next_mm->pgd);
 
 	__reload_thread(pcb);
 }
@@ -122,7 +122,7 @@ unsigned long show_va_to_pa(struct mm_struct *mm, unsigned long addr)
 		pr_debug("addr = %#lx, pgd = %#lx\n", addr, pgd_val(*pgd));
 		goto out;
 	}
-	p4d = pgd_offset(pgd, addr);
+	p4d = p4d_offset(pgd, addr);
 	if (p4d_none(*p4d)) {
 		ret = 0;
 		pr_debug("addr = %#lx, pgd = %#lx, p4d = %#lx\n",
@@ -146,7 +146,7 @@ unsigned long show_va_to_pa(struct mm_struct *mm, unsigned long addr)
 	}
 	pte = pte_offset_map(pmd, addr);
 	if (pte_present(*pte)) {
-		ret = ((unsigned long)__va(((pte_val(*pte) >> 32)) << PAGE_SHIFT));
+		ret = (unsigned long)pfn_to_virt(pte_val(*pte) >> _PFN_SHIFT);
 		pr_debug("addr = %#lx, pgd = %#lx, pud = %#lx, pmd = %#lx, pte = %#lx, ret = %#lx\n",
 				addr, *(unsigned long *)pgd, *(unsigned long *)pud,
 				*(unsigned long *)pmd, *(unsigned long *)pte, ret);
diff --git a/arch/sw_64/mm/init.c b/arch/sw_64/mm/init.c
index 3593e4b13319..cd3c0551b8f8 100644
--- a/arch/sw_64/mm/init.c
+++ b/arch/sw_64/mm/init.c
@@ -87,7 +87,7 @@ switch_to_system_map(void)
 	 * the last slot of the L1 page table.
 	 */
 	memset(swapper_pg_dir, 0, PAGE_SIZE);
-	newptbr = __pa(swapper_pg_dir) >> PAGE_SHIFT;
+	newptbr = virt_to_pfn(swapper_pg_dir);
 
 	/* Also set up the real kernel PCB while we're at it.  */
 	init_thread_info.pcb.ptbr = newptbr;
-- 
Gitee


From 9e08dc63c00d48adde62976841bcacf25409b88b Mon Sep 17 00:00:00 2001
From: He Chuyue <hechuyue@wxiat.com>
Date: Tue, 17 May 2022 09:43:30 +0800
Subject: [PATCH 111/674] sw64: check integrity for dtb passed by BIOS

Sunway inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I56OLG

--------------------------------

If kernel image is so large that it corrupts dtb, system will break
down early to give a message.

Signed-off-by: He Chuyue <hechuyue@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/kernel/setup.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/sw_64/kernel/setup.c b/arch/sw_64/kernel/setup.c
index ca19445ac883..faae7cb3c5a1 100644
--- a/arch/sw_64/kernel/setup.c
+++ b/arch/sw_64/kernel/setup.c
@@ -565,8 +565,14 @@ static void __init setup_machine_fdt(void)
 	/* Give a chance to select kernel builtin DTB firstly */
 	if (IS_ENABLED(CONFIG_SW64_BUILTIN_DTB))
 		dt_virt = (void *)__dtb_start;
-	else
+	else {
 		dt_virt = (void *)sunway_boot_params->dtb_start;
+		if (dt_virt < (void *)__bss_stop) {
+			pr_emerg("BUG: DTB has been corrupted by kernel image!\n");
+			while (true)
+				cpu_relax();
+		}
+	}
 
 	phys_addr = __phys_addr((unsigned long)dt_virt);
 	if (!phys_addr_valid(phys_addr) ||
-- 
Gitee


From 8d384996f080b31d8e7c0f3337952bf32921aa3a Mon Sep 17 00:00:00 2001
From: He Sheng <hesheng@wxiat.com>
Date: Tue, 17 May 2022 17:32:12 +0800
Subject: [PATCH 112/674] sw64: remove discontiguous memory support

Sunway inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I56OLG

--------------------------------

Since sparse memory support works fine so far, discontiguous
memory support can be removed to avoid confusion.

Signed-off-by: He Sheng <hesheng@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/Kconfig               |  9 --------
 arch/sw_64/include/asm/mmzone.h  |  7 -------
 arch/sw_64/include/asm/pgtable.h |  7 -------
 arch/sw_64/kernel/Makefile       |  3 +--
 arch/sw_64/kernel/core.c         | 35 --------------------------------
 arch/sw_64/mm/init.c             | 12 -----------
 6 files changed, 1 insertion(+), 72 deletions(-)
 delete mode 100644 arch/sw_64/kernel/core.c

diff --git a/arch/sw_64/Kconfig b/arch/sw_64/Kconfig
index 8c5e87cb9b84..2d7721acb529 100644
--- a/arch/sw_64/Kconfig
+++ b/arch/sw_64/Kconfig
@@ -649,15 +649,6 @@ config ARCH_SPARSEMEM_ENABLE
 	depends on SMP
 	select SPARSEMEM_VMEMMAP_ENABLE
 
-config ARCH_DISCONTIGMEM_ENABLE
-	bool "Discontiguous Memory Support"
-	depends on SMP
-	help
-	  Say Y to support efficient handling of discontiguous physical memory,
-	  for architectures which are either NUMA (Non-Uniform Memory Access)
-	  or have huge holes in the physical address space for other reasons.
-	  See <file:Documentation/vm/numa> for more.
-
 config NUMA
 	bool "NUMA Support"
 	depends on SMP && !FLATMEM
diff --git a/arch/sw_64/include/asm/mmzone.h b/arch/sw_64/include/asm/mmzone.h
index 4219015c5cfc..3849d26e3890 100644
--- a/arch/sw_64/include/asm/mmzone.h
+++ b/arch/sw_64/include/asm/mmzone.h
@@ -14,11 +14,4 @@ extern pg_data_t *node_data[];
 #define NODE_DATA(nid)		(node_data[(nid)])
 #endif
 
-#ifdef CONFIG_DISCONTIGMEM
-extern int pa_to_nid(unsigned long pa);
-extern int pfn_valid(unsigned long pfn);
-
-#define pfn_to_nid(pfn)		pa_to_nid(((u64)(pfn) << PAGE_SHIFT))
-#endif /* CONFIG_DISCONTIGMEM */
-
 #endif /* _ASM_SW64_MMZONE_H */
diff --git a/arch/sw_64/include/asm/pgtable.h b/arch/sw_64/include/asm/pgtable.h
index 67637b4ddf1d..76c782baf242 100644
--- a/arch/sw_64/include/asm/pgtable.h
+++ b/arch/sw_64/include/asm/pgtable.h
@@ -596,14 +596,7 @@ extern pgd_t swapper_pg_dir[1024];
 #define __pte_to_swp_entry(pte)	((swp_entry_t) { pte_val(pte) })
 #define __swp_entry_to_pte(x)	((pte_t) { (x).val })
 
-#if defined(CONFIG_FLATMEM)
 #define kern_addr_valid(addr)	(1)
-#elif defined(CONFIG_DISCONTIGMEM)
-/* XXX: FIXME -- wli */
-#define kern_addr_valid(kaddr)	(0)
-#elif defined(CONFIG_SPARSEMEM)
-#define kern_addr_valid(addr)	(1)
-#endif
 
 #define pte_ERROR(e) \
 	pr_err("%s: %d: bad pte %016lx.\n", __FILE__, __LINE__, pte_val(e))
diff --git a/arch/sw_64/kernel/Makefile b/arch/sw_64/kernel/Makefile
index 293b360626f7..c1e461e0ac56 100644
--- a/arch/sw_64/kernel/Makefile
+++ b/arch/sw_64/kernel/Makefile
@@ -15,7 +15,7 @@ endif
 
 obj-y    := entry.o fpu.o traps.o process.o sys_sw64.o irq.o \
 	    irq_sw64.o signal.o setup.o ptrace.o time.o \
-	    systbls.o dup_print.o tc.o \
+	    systbls.o dup_print.o tc.o timer.o \
 	    insn.o early_init.o topology.o cacheinfo.o \
 	    vdso.o vdso/
 
@@ -43,7 +43,6 @@ obj-y += kvm_cma.o
 endif
 
 # Core logic support
-obj-$(CONFIG_SW64)	+= core.o timer.o
 obj-$(CONFIG_SW64_CPUFREQ) += platform.o clock.o
 obj-$(CONFIG_SW64_CPUAUTOPLUG) += cpuautoplug.o
 
diff --git a/arch/sw_64/kernel/core.c b/arch/sw_64/kernel/core.c
deleted file mode 100644
index e26b3a5faab2..000000000000
--- a/arch/sw_64/kernel/core.c
+++ /dev/null
@@ -1,35 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/types.h>
-#include <linux/memblock.h>
-
-#ifdef CONFIG_DISCONTIGMEM
-#ifdef CONFIG_NUMA
-int pa_to_nid(unsigned long pa)
-{
-	int i = 0;
-	phys_addr_t pfn_base, pfn_size, pfn;
-
-	pfn = pa >> PAGE_SHIFT;
-	for (i = 0; i < MAX_NUMNODES; i++) {
-		if (!NODE_DATA(i))
-			continue;
-
-		pfn_base = NODE_DATA(i)->node_start_pfn;
-		pfn_size = NODE_DATA(i)->node_spanned_pages;
-
-		if (pfn >= pfn_base && pfn < pfn_base + pfn_size)
-			return i;
-	}
-
-	pr_err("%s: pa %#lx does not belong to any node, return node 0\n", __func__, pa);
-	return 0;
-}
-EXPORT_SYMBOL(pa_to_nid);
-#else /* !CONFIG_NUMA */
-int pa_to_nid(unsigned long pa)
-{
-	return 0;
-}
-EXPORT_SYMBOL(pa_to_nid);
-#endif /* CONFIG_NUMA */
-#endif /* CONFIG_DISCONTIGMEM */
diff --git a/arch/sw_64/mm/init.c b/arch/sw_64/mm/init.c
index cd3c0551b8f8..16d3da7beebe 100644
--- a/arch/sw_64/mm/init.c
+++ b/arch/sw_64/mm/init.c
@@ -244,18 +244,6 @@ void vmemmap_free(unsigned long start, unsigned long end,
 }
 #endif
 
-#ifdef CONFIG_DISCONTIGMEM
-int pfn_valid(unsigned long pfn)
-{
-	phys_addr_t addr = pfn << PAGE_SHIFT;
-
-	if ((addr >> PAGE_SHIFT) != pfn)
-		return 0;
-	return memblock_is_map_memory(addr);
-}
-EXPORT_SYMBOL(pfn_valid);
-#endif
-
 #ifdef CONFIG_HAVE_MEMBLOCK
 #ifndef MIN_MEMBLOCK_ADDR
 #define MIN_MEMBLOCK_ADDR       __pa(PAGE_OFFSET)
-- 
Gitee


From d351c1bdf7563b31109bf0a01feb015233a01f2d Mon Sep 17 00:00:00 2001
From: Zhu Donghong <zhudonghong@wxiat.com>
Date: Wed, 18 May 2022 15:23:55 +0800
Subject: [PATCH 113/674] irqchip: add sw64 chip3 builtin LPC interrupt
 controller driver

Sunway inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5G7AK

--------------------------------

This adds an irqchip driver for the secondary interrupt controllers
based on sw64 chip3 cpu builtin LPC.

Signed-off-by: Zhu Donghong <zhudonghong@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 drivers/irqchip/Kconfig        |   7 ++
 drivers/irqchip/Makefile       |   1 +
 drivers/irqchip/irq-lpc.c      | 137 +++++++++++++++++++++++++++++++++
 drivers/mfd/lpc_sunway_chip3.c | 130 -------------------------------
 4 files changed, 145 insertions(+), 130 deletions(-)
 create mode 100644 drivers/irqchip/irq-lpc.c

diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig
index 375ba3b30b72..5bf6cf60999b 100644
--- a/drivers/irqchip/Kconfig
+++ b/drivers/irqchip/Kconfig
@@ -9,6 +9,13 @@ config SW64_INTC
            The INTC controls devices interrupts and connects them to each
            core's local interrupt controller.
 
+config SW64_CHIP3_LPC
+	bool "SW64 Chip3 Buildin LPC Interrupt Controller"
+	depends on SW64_CHIP3
+	help
+           This enables support for the LPC interrupt controller bultin in
+	   on chip3 series.
+
 config IRQCHIP
 	def_bool y
 	depends on OF_IRQ
diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile
index 4c78b0f64e6c..78eb12ab4d4c 100644
--- a/drivers/irqchip/Makefile
+++ b/drivers/irqchip/Makefile
@@ -28,6 +28,7 @@ obj-$(CONFIG_ARCH_SUNXI)		+= irq-sunxi-nmi.o
 obj-$(CONFIG_ARCH_SPEAR3XX)		+= spear-shirq.o
 obj-$(CONFIG_ARM_GIC)			+= irq-gic.o irq-gic-common.o
 obj-$(CONFIG_SW64_INTC)			+= irq-intc-v1.o
+obj-$(CONFIG_SW64_CHIP3_LPC)		+= irq-lpc.o
 obj-$(CONFIG_ARM_GIC_PM)		+= irq-gic-pm.o
 obj-$(CONFIG_ARCH_REALVIEW)		+= irq-gic-realview.o
 obj-$(CONFIG_ARM_GIC_V2M)		+= irq-gic-v2m.o
diff --git a/drivers/irqchip/irq-lpc.c b/drivers/irqchip/irq-lpc.c
new file mode 100644
index 000000000000..1cbf87478242
--- /dev/null
+++ b/drivers/irqchip/irq-lpc.c
@@ -0,0 +1,137 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bitops.h>
+#include <linux/irq.h>
+#include <linux/of_address.h>
+#include <linux/of_irq.h>
+#include <linux/irqchip.h>
+#include <linux/irqchip/chained_irq.h>
+#include <linux/interrupt.h>
+
+#define LPC_NR_IRQS 16
+#define	LPC_IRQ  0x4
+#define	LPC_IRQ_MASK  0x8
+
+struct lpc_intc_data {
+	struct irq_domain *domain;
+	struct irq_chip_generic *gc;
+};
+
+static void lpc_irq_mask_ack(struct irq_data *data)
+{
+	struct irq_chip_generic *gc = irq_data_get_irq_chip_data(data);
+	struct irq_chip_type *ct = irq_data_get_chip_type(data);
+	unsigned int mask = data->mask;
+
+	irq_gc_lock(gc);
+	*ct->mask_cache |= mask;
+	irq_reg_writel(gc, *ct->mask_cache, ct->regs.mask);
+	irq_reg_writel(gc, mask, ct->regs.ack);
+	irq_gc_unlock(gc);
+}
+
+static void lpc_irq_handler(struct irq_desc *desc)
+{
+	struct lpc_intc_data *b = irq_desc_get_handler_data(desc);
+	struct irq_chip *chip = irq_desc_get_chip(desc);
+	unsigned int irq;
+	u32 status;
+
+	chained_irq_enter(chip, desc);
+
+	status = irq_reg_readl(b->gc, LPC_IRQ);
+
+	if (status == 0) {
+		raw_spin_lock(&desc->lock);
+		handle_bad_irq(desc);
+		raw_spin_unlock(&desc->lock);
+		goto out;
+	}
+
+	while (status) {
+		irq = __ffs(status);
+		status &= ~BIT(irq);
+		generic_handle_irq(irq_find_mapping(b->domain, irq));
+	}
+
+out:
+	chained_irq_exit(chip, desc);
+}
+
+static int __init lpc_intc_of_init(struct device_node *np,
+				  struct device_node *parent)
+{
+	unsigned int set = IRQ_NOPROBE | IRQ_LEVEL;
+	struct lpc_intc_data *data;
+	struct irq_chip_type *ct;
+	int parent_irq, ret;
+	void __iomem *base;
+	int hwirq = 0;
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	base = of_iomap(np, 0);
+	if (!base) {
+		pr_err("failed to remap lpc intc registers\n");
+		ret = -ENOMEM;
+		goto out_free;
+	}
+
+	parent_irq = irq_of_parse_and_map(np, 0);
+	if (!parent_irq) {
+		pr_err("failed to find parent interrupt\n");
+		ret = -EINVAL;
+		goto out_unmap;
+	}
+
+	data->domain = irq_domain_add_linear(np, LPC_NR_IRQS,
+				&irq_generic_chip_ops, NULL);
+	if (!data->domain) {
+		ret = -ENOMEM;
+		goto out_unmap;
+	}
+
+	/* Allocate a single Generic IRQ chip for this node */
+	ret = irq_alloc_domain_generic_chips(data->domain, 16, 1, np->name,
+					     handle_level_irq, 0, set,
+					     IRQ_GC_INIT_MASK_CACHE);
+	if (ret) {
+		pr_err("failed to allocate generic irq chip\n");
+		goto out_free_domain;
+	}
+
+	/* Set the IRQ chaining logic */
+	irq_set_chained_handler_and_data(parent_irq,
+					 lpc_irq_handler, data);
+
+	data->gc = irq_get_domain_generic_chip(data->domain, 0);
+	data->gc->reg_base = base;
+	data->gc->private = data;
+
+	ct = data->gc->chip_types;
+
+	ct->regs.ack = LPC_IRQ;
+	ct->regs.mask = LPC_IRQ_MASK;
+	ct->chip.irq_mask = irq_gc_mask_set_bit;
+	ct->chip.irq_unmask = irq_gc_mask_clr_bit;
+	ct->chip.irq_ack = irq_gc_ack_set_bit;
+	ct->chip.irq_mask_ack = lpc_irq_mask_ack;
+
+	for (hwirq = 0 ; hwirq < 16 ; hwirq++)
+		irq_create_mapping(data->domain, hwirq);
+
+	/* Enable LPC interrupts */
+	writel(0xffffebdd, base + LPC_IRQ_MASK);
+
+	return 0;
+
+out_free_domain:
+	irq_domain_remove(data->domain);
+out_unmap:
+	iounmap(base);
+out_free:
+	kfree(data);
+	return ret;
+}
+IRQCHIP_DECLARE(sw_lpc_intc, "sw64,lpc_intc", lpc_intc_of_init);
diff --git a/drivers/mfd/lpc_sunway_chip3.c b/drivers/mfd/lpc_sunway_chip3.c
index 793b557195c2..878aff87c992 100644
--- a/drivers/mfd/lpc_sunway_chip3.c
+++ b/drivers/mfd/lpc_sunway_chip3.c
@@ -75,53 +75,16 @@ enum {
 	LPC_DMA_SWRST = 0x70,
 };
 
-enum {
-	LPC_IRQ0 = 0,		/* 8254 Timer */
-	LPC_IRQ1,		/* Keyboard */
-	LPC_IRQ2,		/* Reserved */
-	LPC_IRQ3,		/* UART */
-	LPC_IRQ4,		/* UART */
-	LPC_IRQ5,		/* LPC Parallel Port2 */
-	LPC_IRQ6,		/* FDC-Floppy Disk Controller */
-	LPC_IRQ7,		/* LPT-Parallel Port1 */
-	LPC_NR_IRQS,
-	LPC_IRQ8,		/* RTC */
-	LPC_IRQ9,		/* Undefined */
-	LPC_IRQ10,		/* Undefined */
-	LPC_IRQ11,		/* Undefined */
-	LPC_IRQ12,		/* Mouse */
-	LPC_IRQ13,		/* Undefined */
-	LPC_IRQ14,		/* Undefined */
-	LPC_IRQ15,		/* Undefined */
-};
-
 struct lpc_chip3_adapter {
 	void __iomem *hst_regs;
 	struct device *dev;
 	int irq;
-	struct irq_chip_generic *gc;
 	unsigned int features;
 };
 
 static struct resource superio_chip3_resources[] = {
 	{
 		.flags = IORESOURCE_IO,
-	}, {
-		.start = LPC_IRQ1,
-		.flags = IORESOURCE_IRQ,
-		.name = "i8042_kbd_irq",
-	}, {
-		.start = LPC_IRQ12,
-		.flags = IORESOURCE_IRQ,
-		.name = "i8042_aux_irq",
-	}, {
-		.start = LPC_IRQ5,
-		.flags = IORESOURCE_IRQ,
-		.name = "uart0_irq",
-	}, {
-		.start = LPC_IRQ4,
-		.flags = IORESOURCE_IRQ,
-		.name = "uart1_irq",
 	}
 };
 
@@ -218,75 +181,9 @@ static void lpc_fw_flash_init(struct platform_device *pdev,
 
 }
 
-static u32 lpc_do_irq(struct lpc_chip3_adapter *lpc_adapter)
-{
-	u32 irq_status = readl_relaxed(lpc_adapter->hst_regs + LPC_IRQ);
-	u32 ret = irq_status;
-
-	DBG_LPC("%s irq_status=%#x\n", __func__, irq_status);
-	while (irq_status) {
-		int hwirq = fls(irq_status) - 1;
-
-		generic_handle_irq(hwirq);
-		irq_status &= ~BIT(hwirq);
-	}
-
-	lpc_writel(lpc_adapter->hst_regs, LPC_IRQ, ret);
-	return 1;
-}
-
-static void lpc_irq_handler_mfd(struct irq_desc *desc)
-{
-	unsigned int irq = irq_desc_get_irq(desc);
-	struct lpc_chip3_adapter *lpc_adapter = irq_get_handler_data(irq);
-	u32 worked = 0;
-
-	DBG_LPC("enter %s line:%d\n", __func__, __LINE__);
-
-	worked = lpc_do_irq(lpc_adapter);
-	if (worked == IRQ_HANDLED)
-		dev_dbg(lpc_adapter->dev, "LPC irq handled.\n");
-
-	DBG_LPC("leave %s line:%d\n", __func__, __LINE__);
-}
-
-static void lpc_unmask_interrupt_all(struct lpc_chip3_adapter *lpc_adapter)
-{
-	lpc_writel(lpc_adapter->hst_regs, LPC_IRQ_MASK, 0);
-}
-
-static void lpc_irq_mask_ack(struct irq_data *d)
-{
-	struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
-	struct irq_chip_type *ct = irq_data_get_chip_type(d);
-	u32 mask = d->mask;
-
-	irq_gc_lock(gc);
-	*ct->mask_cache |= mask;
-	irq_reg_writel(gc, *ct->mask_cache, ct->regs.mask);
-	irq_reg_writel(gc, mask, ct->regs.ack);
-	irq_gc_unlock(gc);
-}
-
-static void lpc_enable_irqs(struct lpc_chip3_adapter *lpc_adapter)
-{
-	int interrupt = 0;
-
-	lpc_unmask_interrupt_all(lpc_adapter);
-
-	interrupt = lpc_readl(lpc_adapter->hst_regs, LPC_IRQ);
-
-	lpc_writel(lpc_adapter->hst_regs, LPC_CTL, 0x1600);
-	interrupt = lpc_readl(lpc_adapter->hst_regs, LPC_IRQ);
-}
-
 static int lpc_chip3_probe(struct platform_device *pdev)
 {
 	int ret;
-	int num_ct = 1;
-	int irq_base;
-	struct irq_chip_generic *gc;
-	struct irq_chip_type *ct;
 	struct lpc_chip3_adapter *lpc_adapter;
 	struct resource *mem;
 
@@ -312,32 +209,6 @@ static int lpc_chip3_probe(struct platform_device *pdev)
 	lpc_adapter->dev = &pdev->dev;
 	lpc_adapter->features = 0;
 
-	lpc_adapter->irq = platform_get_irq(pdev, 0);
-	if (lpc_adapter->irq < 0) {
-		dev_err(&pdev->dev, "no irq resource?\n");
-		return lpc_adapter->irq;	/* -ENXIO */
-	}
-
-	irq_base = LPC_IRQ0;
-	gc = irq_alloc_generic_chip("LPC_CHIP3", num_ct, irq_base,
-				    lpc_adapter->hst_regs, handle_level_irq);
-
-	ct = gc->chip_types;
-	ct->regs.mask = LPC_IRQ_MASK;
-	ct->regs.ack = LPC_IRQ;
-	ct->chip.irq_mask = irq_gc_mask_set_bit;
-	ct->chip.irq_unmask = irq_gc_mask_clr_bit;
-	ct->chip.irq_ack = irq_gc_ack_set_bit;
-	ct->chip.irq_mask_ack = lpc_irq_mask_ack;
-	irq_setup_generic_chip(gc, IRQ_MSK(LPC_NR_IRQS), 0, 0,
-			       IRQ_NOPROBE | IRQ_LEVEL);
-
-	lpc_adapter->gc = gc;
-
-	irq_set_handler_data(lpc_adapter->irq, lpc_adapter);
-	irq_set_chained_handler(lpc_adapter->irq,
-				(irq_flow_handler_t) lpc_irq_handler_mfd);
-
 	lpc_enable(lpc_adapter);
 
 	lpc_mem_flash_init(pdev, lpc_adapter);
@@ -350,7 +221,6 @@ static int lpc_chip3_probe(struct platform_device *pdev)
 		goto out_dev;
 
 	dev_info(lpc_adapter->dev, "probe succeed !\n");
-	lpc_enable_irqs(lpc_adapter);
 
 	return ret;
 
-- 
Gitee


From f920164998e475ee8dc6a3dc2be9d406c380acf3 Mon Sep 17 00:00:00 2001
From: Zhu Donghong <zhudonghong@wxiat.com>
Date: Wed, 18 May 2022 15:24:50 +0800
Subject: [PATCH 114/674] sw64: add builtin LPC interrupt controller to
 chip3.dts

Sunway inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5G7AK

--------------------------------

This adds corresponding devicetree binding for LPC interrupt
controllers on chip3 series cpu.

Signed-off-by: Zhu Donghong <zhudonghong@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/boot/dts/chip3.dts | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/arch/sw_64/boot/dts/chip3.dts b/arch/sw_64/boot/dts/chip3.dts
index be2e91ee3279..082506393ac9 100644
--- a/arch/sw_64/boot/dts/chip3.dts
+++ b/arch/sw_64/boot/dts/chip3.dts
@@ -32,12 +32,20 @@ spiclk: spiclk {
 
 		};
 
-		intc: interrupt-controller{
+		intc: interrupt-controller {
 			compatible = "sw64,sw6_irq_controller";
 			interrupt-controller;
 			#interrupt-cells = <1>;
 		};
 
+		lpc_intc: interrupt-controller@0x8037 {
+			compatible = "sw64,lpc_intc";
+			reg = <0x8037 0x40000000 0x0 0x8000>;
+			interrupt-controller;
+			#interrupt-cells = <1>;
+			interrupt-parent = <&intc>;
+			interrupts = <2>;
+		};
 
 		uart: serial0@8033 {
 			#address-cells = <2>;
@@ -176,10 +184,8 @@ partition@0 {
 		lpc: lpc@0x8037 {
 			#address-cells = <2>;
 			#size-cells = <2>;
-			compatible = "sw,sw6b_lpc";
+			compatible = "sunway,chip3_lpc";
 			reg = <0x8037 0x40000000 0x0 0x8000>;
-			interrupt-parent=<&intc>;
-			interrupts = <2>;
 			status = "okay";
 
 		};
@@ -202,6 +208,8 @@ ipmi-bt@0x8037 {
 			device_type = "ipmi";
 			compatible = "ipmi-bt";
 			reg = <0x8037 0x100000e4 0x0 0x10>;
+			interrupt-parent=<&lpc_intc>;
+			interrupts = <10>;
 			reg-size = <1>;
 			reg-spacing = <1>;
 			reg-shift = <0>;
-- 
Gitee


From 6422b5c0dbd7b5602f449ccec702266cdde2b34e Mon Sep 17 00:00:00 2001
From: He Chuyue <hechuyue@wxiat.com>
Date: Thu, 19 May 2022 09:06:17 +0800
Subject: [PATCH 115/674] sw64: fix some structs related to pt_regs

Sunway inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GF7A

--------------------------------

Because of previous modifications to struct pt_regs, this patch
fixes struct pt_regs_offset, dbg_reg_def and perf_event_sw64_regs
to keep them consistent.

Signed-off-by: He Chuyue <hechuyue@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/include/uapi/asm/perf_regs.h |  7 +++++++
 arch/sw_64/kernel/kgdb.c                | 14 +++++++-------
 arch/sw_64/kernel/ptrace.c              |  7 +++++++
 3 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/arch/sw_64/include/uapi/asm/perf_regs.h b/arch/sw_64/include/uapi/asm/perf_regs.h
index 426ae642fcc8..1378a7397951 100644
--- a/arch/sw_64/include/uapi/asm/perf_regs.h
+++ b/arch/sw_64/include/uapi/asm/perf_regs.h
@@ -13,6 +13,13 @@ enum perf_event_sw64_regs {
 	PERF_REG_SW64_R6,
 	PERF_REG_SW64_R7,
 	PERF_REG_SW64_R8,
+	PERF_REG_SW64_R9,
+	PERF_REG_SW64_R10,
+	PERF_REG_SW64_R11,
+	PERF_REG_SW64_R12,
+	PERF_REG_SW64_R13,
+	PERF_REG_SW64_R14,
+	PERF_REG_SW64_R15,
 	PERF_REG_SW64_R19,
 	PERF_REG_SW64_R20,
 	PERF_REG_SW64_R21,
diff --git a/arch/sw_64/kernel/kgdb.c b/arch/sw_64/kernel/kgdb.c
index 491f287eede9..ac2f397f1609 100644
--- a/arch/sw_64/kernel/kgdb.c
+++ b/arch/sw_64/kernel/kgdb.c
@@ -34,13 +34,13 @@ struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = {
 	{ "r7", 8, offsetof(struct pt_regs, r7)},
 	{ "r8", 8, offsetof(struct pt_regs, r8)},
 
-	{ "r9",  8, -1 },
-	{ "r10", 8, -1 },
-	{ "r11", 8, -1 },
-	{ "r12", 8, -1 },
-	{ "r13", 8, -1 },
-	{ "r14", 8, -1 },
-	{ "r15", 8, -1 },
+	{ "r9",  8, offsetof(struct pt_regs, r9)},
+	{ "r10", 8, offsetof(struct pt_regs, r10)},
+	{ "r11", 8, offsetof(struct pt_regs, r11)},
+	{ "r12", 8, offsetof(struct pt_regs, r12)},
+	{ "r13", 8, offsetof(struct pt_regs, r13)},
+	{ "r14", 8, offsetof(struct pt_regs, r14)},
+	{ "r15", 8, offsetof(struct pt_regs, r15)},
 
 	{ "r16", 8, offsetof(struct pt_regs, r16)},
 	{ "r17", 8, offsetof(struct pt_regs, r17)},
diff --git a/arch/sw_64/kernel/ptrace.c b/arch/sw_64/kernel/ptrace.c
index ce41d89a54cb..94809804e23e 100644
--- a/arch/sw_64/kernel/ptrace.c
+++ b/arch/sw_64/kernel/ptrace.c
@@ -619,6 +619,13 @@ static const struct pt_regs_offset regoffset_table[] = {
 	REG_OFFSET_NAME(r6),
 	REG_OFFSET_NAME(r7),
 	REG_OFFSET_NAME(r8),
+	REG_OFFSET_NAME(r9),
+	REG_OFFSET_NAME(r10),
+	REG_OFFSET_NAME(r11),
+	REG_OFFSET_NAME(r12),
+	REG_OFFSET_NAME(r13),
+	REG_OFFSET_NAME(r14),
+	REG_OFFSET_NAME(r15),
 	REG_OFFSET_NAME(r19),
 	REG_OFFSET_NAME(r20),
 	REG_OFFSET_NAME(r21),
-- 
Gitee


From 9fc89ea369a1953ee9ed8839a43df772eead479d Mon Sep 17 00:00:00 2001
From: He Chuyue <hechuyue@wxiat.com>
Date: Tue, 24 May 2022 16:03:40 +0800
Subject: [PATCH 116/674] sw64: perf: add exclude_user and exclude_kernel
 support

Sunway inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I56X48

--------------------------------

Although sw64 have no per-counter usr/os/guest/host bits, we can
distinguish user and kernel by sample mode.

Signed-off-by: He Chuyue <hechuyue@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/include/asm/wrperfmon.h |  4 ++++
 arch/sw_64/kernel/perf_event.c     | 20 +++++++++++++++-----
 2 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/arch/sw_64/include/asm/wrperfmon.h b/arch/sw_64/include/asm/wrperfmon.h
index eaa6735b5a25..098702573bfc 100644
--- a/arch/sw_64/include/asm/wrperfmon.h
+++ b/arch/sw_64/include/asm/wrperfmon.h
@@ -38,6 +38,10 @@
 #define PC1_MIN				0x0
 #define PC1_MAX				0x37
 
+#define SW64_PERFCTRL_KM		2
+#define SW64_PERFCTRL_UM		3
+#define SW64_PERFCTRL_AM		4
+
 /* pc0 events */
 #define PC0_INSTRUCTIONS		0x0
 #define PC0_BRANCH_INSTRUCTIONS		0x3
diff --git a/arch/sw_64/kernel/perf_event.c b/arch/sw_64/kernel/perf_event.c
index d2975e17f666..0d49224ba058 100644
--- a/arch/sw_64/kernel/perf_event.c
+++ b/arch/sw_64/kernel/perf_event.c
@@ -457,8 +457,8 @@ static void sw64_pmu_start(struct perf_event *event, int flags)
 
 	hwc->state = 0;
 
-	/* counting in all modes, for both counters */
-	wrperfmon(PERFMON_CMD_PM, 4);
+	/* counting in selected modes, for both counters */
+	wrperfmon(PERFMON_CMD_PM, hwc->config_base);
 	if (hwc->idx == PERFMON_PC0) {
 		wrperfmon(PERFMON_CMD_EVENT_PC0, hwc->event_base);
 		wrperfmon(PERFMON_CMD_ENABLE, PERFMON_ENABLE_ARGS_PC0);
@@ -519,9 +519,12 @@ static int __hw_perf_event_init(struct perf_event *event)
 	const struct sw64_perf_event *event_type;
 
 
-	/* SW64 do not have per-counter usr/os/guest/host bits */
-	if (event->attr.exclude_user || event->attr.exclude_kernel ||
-			event->attr.exclude_hv || event->attr.exclude_idle ||
+	/*
+	 * SW64 does not have per-counter usr/os/guest/host bits,
+	 * we can distinguish exclude_user and exclude_kernel by
+	 * sample mode.
+	 */
+	if (event->attr.exclude_hv || event->attr.exclude_idle ||
 			event->attr.exclude_host || event->attr.exclude_guest)
 		return -EINVAL;
 
@@ -553,6 +556,13 @@ static int __hw_perf_event_init(struct perf_event *event)
 		hwc->event_base = attr->config & 0xff;	/* event selector */
 	}
 
+	hwc->config_base = SW64_PERFCTRL_AM;
+
+	if (attr->exclude_user)
+		hwc->config_base = SW64_PERFCTRL_KM;
+	if (attr->exclude_kernel)
+		hwc->config_base = SW64_PERFCTRL_UM;
+
 	hwc->config = attr->config;
 
 	if (!is_sampling_event(event))
-- 
Gitee


From b9e3841445914e43ecf42975f2d8a8f3c8239e25 Mon Sep 17 00:00:00 2001
From: Hang Xiaoqian <hangxiaoqian@wxiat.com>
Date: Wed, 25 May 2022 10:54:25 +0800
Subject: [PATCH 117/674] sw64: change the value of physical_id in
 /proc/cpuinfo

Sunway inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I56OLG

--------------------------------

The value of physical_id in /proc/cpuinfo should be
the socket_id of the processor.

Signed-off-by: Hang Xiaoqian <hangxiaoqian@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/kernel/setup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/sw_64/kernel/setup.c b/arch/sw_64/kernel/setup.c
index faae7cb3c5a1..4d39db5fb1ef 100644
--- a/arch/sw_64/kernel/setup.c
+++ b/arch/sw_64/kernel/setup.c
@@ -907,7 +907,7 @@ show_cpuinfo(struct seq_file *f, void *slot)
 				"physical id\t: %d\n"
 				"bogomips\t: %lu.%02lu\n",
 				cpu_freq, cpu_data[i].tcache.size >> 10,
-				cpu_to_rcid(i),
+				cpu_topology[i].package_id,
 				loops_per_jiffy / (500000/HZ),
 				(loops_per_jiffy / (5000/HZ)) % 100);
 
-- 
Gitee


From 06ed664fca7dcb8963d9868472a4cbbb149edb27 Mon Sep 17 00:00:00 2001
From: Gu Zitao <guzitao@wxiat.com>
Date: Tue, 24 May 2022 15:28:27 +0800
Subject: [PATCH 118/674] sw64: add ARCH_TRACEHOOK and regset support

Sunway inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5G7NL

--------------------------------

By adding TRACEHOOK support, we can access register sets via
PTRACE_GETREGSET and PTRACE_SETREGSET request.

The user-visible regset struct user_pt_regs and user_fpsimd_state
are added in this patch, and they can be accessed with NT_PRSTATUS
and NT_PRFPREG respectively.

Besides, PTRACE_{GET,SET}REGS and PTRACE_{GET,SET}FPREGS requests
which are implemented in error have never been used on sw64, so we
remove them completely.

Signed-off-by: Gu Zitao <guzitao@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/Kconfig                   |   1 +
 arch/sw_64/include/uapi/asm/ptrace.h |  18 ++-
 arch/sw_64/kernel/ptrace.c           | 194 +++++++++++++--------------
 3 files changed, 111 insertions(+), 102 deletions(-)

diff --git a/arch/sw_64/Kconfig b/arch/sw_64/Kconfig
index 2d7721acb529..0e32fc7e1f9a 100644
--- a/arch/sw_64/Kconfig
+++ b/arch/sw_64/Kconfig
@@ -68,6 +68,7 @@ config SW64
 	select ARCH_HAS_SG_CHAIN
 	select IRQ_FORCED_THREADING
 	select GENERIC_IRQ_MIGRATION if SMP
+	select HAVE_ARCH_TRACEHOOK
 	select HAVE_FUNCTION_TRACER
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_FTRACE_MCOUNT_RECORD
diff --git a/arch/sw_64/include/uapi/asm/ptrace.h b/arch/sw_64/include/uapi/asm/ptrace.h
index ac7ce6b6510b..1169d87c4b9c 100644
--- a/arch/sw_64/include/uapi/asm/ptrace.h
+++ b/arch/sw_64/include/uapi/asm/ptrace.h
@@ -2,10 +2,20 @@
 #ifndef _UAPI_ASM_SW64_PTRACE_H
 #define _UAPI_ASM_SW64_PTRACE_H
 
-#define PTRACE_GETREGS		12	/* get general purpose registers */
-#define PTRACE_SETREGS		13	/* set general purpose registers */
-#define PTRACE_GETFPREGS	14	/* get floating-point registers */
-#define PTRACE_SETFPREGS	15	/* set floating-point registers */
+/*
+ * User structures for general purpose, floating point and debug registers.
+ */
+struct user_pt_regs {
+	__u64 regs[31];
+	__u64 pc;
+	__u64 pstate;
+};
+
+struct user_fpsimd_state {
+	__u64 vregs[124];
+	__u64 fpcr;
+};
+
 /* PTRACE_ATTACH is 16 */
 /* PTRACE_DETACH is 17 */
 
diff --git a/arch/sw_64/kernel/ptrace.c b/arch/sw_64/kernel/ptrace.c
index 94809804e23e..94fba3569781 100644
--- a/arch/sw_64/kernel/ptrace.c
+++ b/arch/sw_64/kernel/ptrace.c
@@ -7,6 +7,8 @@
 
 #include <linux/tracehook.h>
 #include <linux/audit.h>
+#include <linux/regset.h>
+#include <linux/elf.h>
 
 #include <asm/reg.h>
 #include <asm/asm-offsets.h>
@@ -270,123 +272,131 @@ void ptrace_disable(struct task_struct *child)
 	user_disable_single_step(child);
 }
 
-int ptrace_getregs(struct task_struct *child, __s64 __user *data)
+static int gpr_get(struct task_struct *target,
+			const struct user_regset *regset,
+			struct membuf to)
 {
-	int ret, retval = 0;
-	int i;
-	unsigned long regval;
+	struct pt_regs *regs;
+	struct user_pt_regs uregs;
+	int i, ret;
 
-	if (!access_ok(data, sizeof(long) * 33))
-		return -EIO;
+	regs = task_pt_regs(target);
+	for (i = 0; i < 30; i++)
+		uregs.regs[i] = *(__u64 *)((void *)regs + regoffsets[i]);
+
+	uregs.regs[30] = task_thread_info(target)->pcb.usp;
+	uregs.pc = regs->pc;
+	uregs.pstate = regs->ps;
+
+	ret = membuf_write(&to, &uregs, sizeof(uregs));
 
-	/* r0-r15 */
-	for (i = 0; i < 16; i++) {
-		regval = get_reg(child, i);
-		retval |= __put_user((long)regval, data + i);
-	}
-	/* r19-r28 */
-	for (i = 19; i < 29; i++) {
-		regval = get_reg(child, i);
-		retval |= __put_user((long)regval, data + i - 3);
-	}
-	/*SP, PS ,PC,GP*/
-	retval |= __put_user((long)(get_reg(child, REG_SP)), data + EF_SP);
-	retval |= __put_user((long)(get_reg(child, REG_PS)), data + EF_PS);
-	retval |= __put_user((long)(get_reg(child, REG_PC)), data + EF_PC);
-	retval |= __put_user((long)(get_reg(child, REG_GP)), data + EF_GP);
-	/* r16-r18 */
-	retval |= __put_user((long)(get_reg(child, 16)), data + EF_A0);
-	retval |= __put_user((long)(get_reg(child, 17)), data + EF_A1);
-	retval |= __put_user((long)(get_reg(child, 18)), data + EF_A2);
-
-	ret = retval ? -EIO : 0;
 	return ret;
 }
 
-int ptrace_setregs(struct task_struct *child, __s64 __user *data)
+static int gpr_set(struct task_struct *target,
+			const struct user_regset *regset,
+			unsigned int pos, unsigned int count,
+			const void *kbuf, const void __user *ubuf)
 {
-	int ret, retval = 0;
-	int i;
-	unsigned long regval;
+	struct pt_regs *regs;
+	struct user_pt_regs uregs;
+	int i, ret;
 
-	if (!access_ok(data, sizeof(long) * 33))
-		return -EIO;
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				&uregs, 0, sizeof(uregs));
+	if (ret)
+		return ret;
+
+	regs = task_pt_regs(target);
+	for (i = 0; i < 30; i++)
+		*(__u64 *)((void *)regs + regoffsets[i]) = uregs.regs[i];
+
+	task_thread_info(target)->pcb.usp = uregs.regs[30];
+	regs->pc = uregs.pc;
+	regs->ps = uregs.pstate;
 
-	/* r0-r15 */
-	for (i = 0; i < 16; i++) {
-		retval |= __get_user(regval, data + i);
-		ret = put_reg(child, i, regval);
-	}
-	/* r19-r28 */
-	for (i = 19; i < 29; i++) {
-		retval |= __get_user(regval, data + i - 3);
-		ret = put_reg(child, i, regval);
-	}
-	/*SP, PS ,PC,GP*/
-	retval |= __get_user(regval, data + EF_SP);
-	ret = put_reg(child, REG_SP, regval);
-	retval |= __get_user(regval, data + EF_PS);
-	ret = put_reg(child, REG_PS, regval);
-	retval |= __get_user(regval, data + EF_PC);
-	ret = put_reg(child, REG_PC, regval);
-	retval |= __get_user(regval, data + EF_GP);
-	ret = put_reg(child, REG_GP, regval);
-	/* r16-r18 */
-	retval |= __get_user(regval, data + EF_A0);
-	ret = put_reg(child, 16, regval);
-	retval |= __get_user(regval, data + EF_A1);
-	ret = put_reg(child, 17, regval);
-	retval |= __get_user(regval, data + EF_A2);
-	ret = put_reg(child, 18, regval);
-
-	ret = retval ? -EIO : 0;
 	return 0;
 }
 
-int ptrace_getfpregs(struct task_struct *child, __s64 __user *data)
+static int fpr_get(struct task_struct *target,
+			const struct user_regset *regset,
+			struct membuf to)
 {
-	int ret, retval = 0;
-	int i;
-	unsigned long regval;
+	int ret;
+	struct user_fpsimd_state uregs;
 
-	if (!access_ok(data, sizeof(long) * 32))
-		return -EIO;
+	memcpy(uregs.vregs, &target->thread.ctx_fp,
+			sizeof(struct context_fpregs));
 
-	/* fp0-fp31 */
-	for (i = 0; i < 32; i++) {
-		regval = get_reg(child, REG_F0 + i);
-		retval |= __put_user((long)regval, data + i);
-	}
+	uregs.fpcr = target->thread.fpcr;
 
-	ret = retval ? -EIO : 0;
-	return 0;
+	ret = membuf_write(&to, &uregs, sizeof(uregs));
+
+	return ret;
 }
 
-int ptrace_setfpregs(struct task_struct *child, __s64 __user *data)
+static int fpr_set(struct task_struct *target,
+			const struct user_regset *regset,
+			unsigned int pos, unsigned int count,
+			const void *kbuf, const void __user *ubuf)
 {
-	int ret, retval = 0;
-	int i;
-	unsigned long regval;
+	int ret;
+	struct user_fpsimd_state uregs;
 
-	if (!access_ok(data, sizeof(long) * 32))
-		return -EIO;
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				&uregs, 0, sizeof(uregs));
 
-	/* fp0-fp31 */
-	for (i = 0; i < 32; i++) {
-		retval |= __get_user(regval, data + i);
-		ret = put_reg(child, REG_F0 + i, regval);
-	}
+	if (ret)
+		return ret;
+
+	memcpy(&target->thread.ctx_fp, uregs.vregs,
+			sizeof(struct context_fpregs));
+
+	target->thread.fpcr = uregs.fpcr;
 
 	return ret;
 }
 
+enum sw64_regset {
+	REGSET_GPR,
+	REGSET_FPR,
+};
+
+static const struct user_regset sw64_regsets[] = {
+	[REGSET_GPR] = {
+		.core_note_type = NT_PRSTATUS,
+		.n = ELF_NGREG,
+		.size = sizeof(elf_greg_t),
+		.align = sizeof(elf_greg_t),
+		.regset_get = gpr_get,
+		.set = gpr_set
+	},
+	[REGSET_FPR] = {
+		.core_note_type = NT_PRFPREG,
+		.n = sizeof(struct user_fpsimd_state) / sizeof(u64),
+		.size = sizeof(u64),
+		.align = sizeof(u64),
+		.regset_get = fpr_get,
+		.set = fpr_set
+	},
+};
+
+static const struct user_regset_view user_sw64_view = {
+	.name = "sw64", .e_machine = EM_SW64,
+	.regsets = sw64_regsets, .n = ARRAY_SIZE(sw64_regsets)
+};
+
+const struct user_regset_view *task_user_regset_view(struct task_struct *task)
+{
+	return &user_sw64_view;
+}
+
 long arch_ptrace(struct task_struct *child, long request,
 		unsigned long addr, unsigned long data)
 {
 	unsigned long tmp;
 	size_t copied;
 	long ret;
-	void __user *datavp = (void __user *) data;
 
 	switch (request) {
 	/* When I and D space are separate, these will need to be fixed.  */
@@ -416,18 +426,6 @@ long arch_ptrace(struct task_struct *child, long request,
 	case PTRACE_POKEUSR: /* write the specified register */
 		ret = put_reg(child, addr, data);
 		break;
-	case PTRACE_GETREGS:
-		ret = ptrace_getregs(child, datavp);
-		break;
-	case PTRACE_SETREGS:
-		ret = ptrace_setregs(child, datavp);
-		break;
-	case PTRACE_GETFPREGS:
-		ret = ptrace_getfpregs(child, datavp);
-		break;
-	case PTRACE_SETFPREGS:
-		ret = ptrace_setfpregs(child, datavp);
-		break;
 	default:
 		ret = ptrace_request(child, request, addr, data);
 		break;
-- 
Gitee


From a9839e292f57d2e4f65e82e73d278fff9adff87e Mon Sep 17 00:00:00 2001
From: He Sheng <hesheng@wxiat.com>
Date: Thu, 26 May 2022 11:00:48 +0800
Subject: [PATCH 119/674] sw64: merge user_fpsimd_state into thread_struct

Sunway inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5G7NL

--------------------------------

The user_fpsimd_state struct has been defined to hold fpu state
which is also held in thread_struct. To reuse this struct, this
patch makes a little change to user_fpsimd_state, then replace
struct context_fpregs and fpcr of thread_struct.

This patch also moves definition of task_pt_regs and mm_segment_t
to appropriate headers to avoid compile errors.

Signed-off-by: He Sheng <hesheng@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/include/asm/processor.h   |  45 +--------
 arch/sw_64/include/asm/ptrace.h      |   4 -
 arch/sw_64/include/asm/thread_info.h |   5 +
 arch/sw_64/include/uapi/asm/ptrace.h |   8 +-
 arch/sw_64/kernel/asm-offsets.c      |  67 +++++++-------
 arch/sw_64/kernel/fpu.S              | 134 +++++++++++++--------------
 arch/sw_64/kernel/process.c          |   2 +-
 arch/sw_64/kernel/ptrace.c           |  51 +++-------
 arch/sw_64/kernel/signal.c           |  13 +--
 arch/sw_64/kvm/entry.S               |  40 ++++----
 10 files changed, 155 insertions(+), 214 deletions(-)

diff --git a/arch/sw_64/include/asm/processor.h b/arch/sw_64/include/asm/processor.h
index 67d3b4368987..08e8cc8e5428 100644
--- a/arch/sw_64/include/asm/processor.h
+++ b/arch/sw_64/include/asm/processor.h
@@ -9,6 +9,10 @@
 #define _ASM_SW64_PROCESSOR_H
 
 #include <linux/personality.h>	/* for ADDR_LIMIT_32BIT */
+#include <asm/ptrace.h>
+
+#define task_pt_regs(task) \
+	((struct pt_regs *) (task_stack_page(task) + 2 * PAGE_SIZE) - 1)
 
 /*
  * Returns current instruction pointer ("program counter").
@@ -37,47 +41,8 @@
 #define TASK_UNMAPPED_BASE \
 	((current->personality & ADDR_LIMIT_32BIT) ? 0x40000000 : UNMAPPED_BASE)
 
-typedef struct {
-	unsigned long seg;
-} mm_segment_t;
-
-struct context_fpregs {
-	unsigned long f0[4];
-	unsigned long f1[4];
-	unsigned long f2[4];
-	unsigned long f3[4];
-	unsigned long f4[4];
-	unsigned long f5[4];
-	unsigned long f6[4];
-	unsigned long f7[4];
-	unsigned long f8[4];
-	unsigned long f9[4];
-	unsigned long f10[4];
-	unsigned long f11[4];
-	unsigned long f12[4];
-	unsigned long f13[4];
-	unsigned long f14[4];
-	unsigned long f15[4];
-	unsigned long f16[4];
-	unsigned long f17[4];
-	unsigned long f18[4];
-	unsigned long f19[4];
-	unsigned long f20[4];
-	unsigned long f21[4];
-	unsigned long f22[4];
-	unsigned long f23[4];
-	unsigned long f24[4];
-	unsigned long f25[4];
-	unsigned long f26[4];
-	unsigned long f27[4];
-	unsigned long f28[4];
-	unsigned long f29[4];
-	unsigned long f30[4];
-} __aligned(32);	/* 256 bits aligned for simd */
-
 struct thread_struct {
-	struct context_fpregs ctx_fp;
-	unsigned long fpcr;
+	struct user_fpsimd_state fpstate;
 	/* Callee-saved registers */
 	unsigned long ra;
 	unsigned long s[7];	/* s0 ~ s6 */
diff --git a/arch/sw_64/include/asm/ptrace.h b/arch/sw_64/include/asm/ptrace.h
index 85f7a47fdee3..ac9943015663 100644
--- a/arch/sw_64/include/asm/ptrace.h
+++ b/arch/sw_64/include/asm/ptrace.h
@@ -3,10 +3,8 @@
 #define _ASM_SW64_PTRACE_H
 
 #include <uapi/asm/ptrace.h>
-#include <linux/sched/task_stack.h>
 #include <asm/hmcall.h>
 #include <asm/thread_info.h>
-#include <asm/processor.h>
 #include <asm/page.h>
 
 /*
@@ -65,8 +63,6 @@ struct pt_regs {
 #define kernel_stack_pointer(regs) (((regs->ps) >> 4) & (TASK_SIZE - 1))
 #define instruction_pointer_set(regs, val) ((regs)->pc = val)
 
-#define task_pt_regs(task) \
-	((struct pt_regs *) (task_stack_page(task) + 2 * PAGE_SIZE) - 1)
 
 #define current_pt_regs() \
 	((struct pt_regs *) ((char *)current_thread_info() + 2 * PAGE_SIZE) - 1)
diff --git a/arch/sw_64/include/asm/thread_info.h b/arch/sw_64/include/asm/thread_info.h
index cffb09fc6262..33b95f815448 100644
--- a/arch/sw_64/include/asm/thread_info.h
+++ b/arch/sw_64/include/asm/thread_info.h
@@ -9,6 +9,11 @@
 #include <asm/types.h>
 #include <asm/sysinfo.h>
 
+typedef struct {
+	unsigned long seg;
+} mm_segment_t;
+
+
 struct pcb_struct {
 	unsigned long ksp;
 	unsigned long usp;
diff --git a/arch/sw_64/include/uapi/asm/ptrace.h b/arch/sw_64/include/uapi/asm/ptrace.h
index 1169d87c4b9c..a45b10b8914a 100644
--- a/arch/sw_64/include/uapi/asm/ptrace.h
+++ b/arch/sw_64/include/uapi/asm/ptrace.h
@@ -11,9 +11,15 @@ struct user_pt_regs {
 	__u64 pstate;
 };
 
+/* 256 bits aligned for simd */
+struct fpreg {
+	__u64 v[4] __attribute__((aligned(32)));
+};
+
 struct user_fpsimd_state {
-	__u64 vregs[124];
+	struct fpreg fp[31];
 	__u64 fpcr;
+	__u64 __reserved[3];
 };
 
 /* PTRACE_ATTACH is 16 */
diff --git a/arch/sw_64/kernel/asm-offsets.c b/arch/sw_64/kernel/asm-offsets.c
index 5dc59346996f..0c7e1e26eb05 100644
--- a/arch/sw_64/kernel/asm-offsets.c
+++ b/arch/sw_64/kernel/asm-offsets.c
@@ -178,40 +178,39 @@ void foo(void)
 	DEFINE(HOST_INT_R16, offsetof(struct host_int_args, r16));
 	BLANK();
 
-	DEFINE(TASK_THREAD, offsetof(struct task_struct, thread));
-	DEFINE(THREAD_CTX_FP, offsetof(struct thread_struct, ctx_fp));
-	DEFINE(THREAD_FPCR, offsetof(struct thread_struct, fpcr));
-	DEFINE(CTX_FP_F0, offsetof(struct context_fpregs, f0));
-	DEFINE(CTX_FP_F1, offsetof(struct context_fpregs, f1));
-	DEFINE(CTX_FP_F2, offsetof(struct context_fpregs, f2));
-	DEFINE(CTX_FP_F3, offsetof(struct context_fpregs, f3));
-	DEFINE(CTX_FP_F4, offsetof(struct context_fpregs, f4));
-	DEFINE(CTX_FP_F5, offsetof(struct context_fpregs, f5));
-	DEFINE(CTX_FP_F6, offsetof(struct context_fpregs, f6));
-	DEFINE(CTX_FP_F7, offsetof(struct context_fpregs, f7));
-	DEFINE(CTX_FP_F8, offsetof(struct context_fpregs, f8));
-	DEFINE(CTX_FP_F9, offsetof(struct context_fpregs, f9));
-	DEFINE(CTX_FP_F10, offsetof(struct context_fpregs, f10));
-	DEFINE(CTX_FP_F11, offsetof(struct context_fpregs, f11));
-	DEFINE(CTX_FP_F12, offsetof(struct context_fpregs, f12));
-	DEFINE(CTX_FP_F13, offsetof(struct context_fpregs, f13));
-	DEFINE(CTX_FP_F14, offsetof(struct context_fpregs, f14));
-	DEFINE(CTX_FP_F15, offsetof(struct context_fpregs, f15));
-	DEFINE(CTX_FP_F16, offsetof(struct context_fpregs, f16));
-	DEFINE(CTX_FP_F17, offsetof(struct context_fpregs, f17));
-	DEFINE(CTX_FP_F18, offsetof(struct context_fpregs, f18));
-	DEFINE(CTX_FP_F19, offsetof(struct context_fpregs, f19));
-	DEFINE(CTX_FP_F20, offsetof(struct context_fpregs, f20));
-	DEFINE(CTX_FP_F21, offsetof(struct context_fpregs, f21));
-	DEFINE(CTX_FP_F22, offsetof(struct context_fpregs, f22));
-	DEFINE(CTX_FP_F23, offsetof(struct context_fpregs, f23));
-	DEFINE(CTX_FP_F24, offsetof(struct context_fpregs, f24));
-	DEFINE(CTX_FP_F25, offsetof(struct context_fpregs, f25));
-	DEFINE(CTX_FP_F26, offsetof(struct context_fpregs, f26));
-	DEFINE(CTX_FP_F27, offsetof(struct context_fpregs, f27));
-	DEFINE(CTX_FP_F28, offsetof(struct context_fpregs, f28));
-	DEFINE(CTX_FP_F29, offsetof(struct context_fpregs, f29));
-	DEFINE(CTX_FP_F30, offsetof(struct context_fpregs, f30));
+	OFFSET(TASK_THREAD, task_struct, thread);
+	OFFSET(TASK_THREAD_F0, task_struct, thread.fpstate.fp[0]);
+	OFFSET(TASK_THREAD_F1, task_struct, thread.fpstate.fp[1]);
+	OFFSET(TASK_THREAD_F2, task_struct, thread.fpstate.fp[2]);
+	OFFSET(TASK_THREAD_F3, task_struct, thread.fpstate.fp[3]);
+	OFFSET(TASK_THREAD_F4, task_struct, thread.fpstate.fp[4]);
+	OFFSET(TASK_THREAD_F5, task_struct, thread.fpstate.fp[5]);
+	OFFSET(TASK_THREAD_F6, task_struct, thread.fpstate.fp[6]);
+	OFFSET(TASK_THREAD_F7, task_struct, thread.fpstate.fp[7]);
+	OFFSET(TASK_THREAD_F8, task_struct, thread.fpstate.fp[8]);
+	OFFSET(TASK_THREAD_F9, task_struct, thread.fpstate.fp[9]);
+	OFFSET(TASK_THREAD_F10, task_struct, thread.fpstate.fp[10]);
+	OFFSET(TASK_THREAD_F11, task_struct, thread.fpstate.fp[11]);
+	OFFSET(TASK_THREAD_F12, task_struct, thread.fpstate.fp[12]);
+	OFFSET(TASK_THREAD_F13, task_struct, thread.fpstate.fp[13]);
+	OFFSET(TASK_THREAD_F14, task_struct, thread.fpstate.fp[14]);
+	OFFSET(TASK_THREAD_F15, task_struct, thread.fpstate.fp[15]);
+	OFFSET(TASK_THREAD_F16, task_struct, thread.fpstate.fp[16]);
+	OFFSET(TASK_THREAD_F17, task_struct, thread.fpstate.fp[17]);
+	OFFSET(TASK_THREAD_F18, task_struct, thread.fpstate.fp[18]);
+	OFFSET(TASK_THREAD_F19, task_struct, thread.fpstate.fp[19]);
+	OFFSET(TASK_THREAD_F20, task_struct, thread.fpstate.fp[20]);
+	OFFSET(TASK_THREAD_F21, task_struct, thread.fpstate.fp[21]);
+	OFFSET(TASK_THREAD_F22, task_struct, thread.fpstate.fp[22]);
+	OFFSET(TASK_THREAD_F23, task_struct, thread.fpstate.fp[23]);
+	OFFSET(TASK_THREAD_F24, task_struct, thread.fpstate.fp[24]);
+	OFFSET(TASK_THREAD_F25, task_struct, thread.fpstate.fp[25]);
+	OFFSET(TASK_THREAD_F26, task_struct, thread.fpstate.fp[26]);
+	OFFSET(TASK_THREAD_F27, task_struct, thread.fpstate.fp[27]);
+	OFFSET(TASK_THREAD_F28, task_struct, thread.fpstate.fp[28]);
+	OFFSET(TASK_THREAD_F29, task_struct, thread.fpstate.fp[29]);
+	OFFSET(TASK_THREAD_F30, task_struct, thread.fpstate.fp[30]);
+	OFFSET(TASK_THREAD_FPCR, task_struct, thread.fpstate.fpcr);
 	BLANK();
 	OFFSET(TASK_THREAD_RA, task_struct, thread.ra);
 	OFFSET(TASK_THREAD_S0, task_struct, thread.s[0]);
diff --git a/arch/sw_64/kernel/fpu.S b/arch/sw_64/kernel/fpu.S
index 25dcf4740525..3cb3bfab08e8 100644
--- a/arch/sw_64/kernel/fpu.S
+++ b/arch/sw_64/kernel/fpu.S
@@ -8,49 +8,46 @@
 	.set noat
 ENTRY(__fpstate_save)
 	/* a0: prev task */
-	ldi	a0, TASK_THREAD(a0)
-	ldi	t0, THREAD_CTX_FP(a0)
-	vstd	$f0, CTX_FP_F0(t0)
-	vstd	$f1, CTX_FP_F1(t0)
-	vstd	$f2, CTX_FP_F2(t0)
-	vstd	$f3, CTX_FP_F3(t0)
-	vstd	$f4, CTX_FP_F4(t0)
-	vstd	$f5, CTX_FP_F5(t0)
-	vstd	$f6, CTX_FP_F6(t0)
-	vstd	$f7, CTX_FP_F7(t0)
-	vstd	$f8, CTX_FP_F8(t0)
-	vstd	$f9, CTX_FP_F9(t0)
-	vstd	$f10, CTX_FP_F10(t0)
-	vstd	$f11, CTX_FP_F11(t0)
-	vstd	$f12, CTX_FP_F12(t0)
-	vstd	$f13, CTX_FP_F13(t0)
-	vstd	$f14, CTX_FP_F14(t0)
-	vstd	$f15, CTX_FP_F15(t0)
-	vstd	$f16, CTX_FP_F16(t0)
-	vstd	$f17, CTX_FP_F17(t0)
-	vstd	$f18, CTX_FP_F18(t0)
-	vstd	$f19, CTX_FP_F19(t0)
-	vstd	$f20, CTX_FP_F20(t0)
-	vstd	$f21, CTX_FP_F21(t0)
-	vstd	$f22, CTX_FP_F22(t0)
-	vstd	$f23, CTX_FP_F23(t0)
-	vstd	$f24, CTX_FP_F24(t0)
-	vstd	$f25, CTX_FP_F25(t0)
-	vstd	$f26, CTX_FP_F26(t0)
-	vstd	$f27, CTX_FP_F27(t0)
+	vstd	$f0, TASK_THREAD_F0(a0)
+	vstd	$f1, TASK_THREAD_F1(a0)
+	vstd	$f2, TASK_THREAD_F2(a0)
+	vstd	$f3, TASK_THREAD_F3(a0)
+	vstd	$f4, TASK_THREAD_F4(a0)
+	vstd	$f5, TASK_THREAD_F5(a0)
+	vstd	$f6, TASK_THREAD_F6(a0)
+	vstd	$f7, TASK_THREAD_F7(a0)
+	vstd	$f8, TASK_THREAD_F8(a0)
+	vstd	$f9, TASK_THREAD_F9(a0)
+	vstd	$f10, TASK_THREAD_F10(a0)
+	vstd	$f11, TASK_THREAD_F11(a0)
+	vstd	$f12, TASK_THREAD_F12(a0)
+	vstd	$f13, TASK_THREAD_F13(a0)
+	vstd	$f14, TASK_THREAD_F14(a0)
+	vstd	$f15, TASK_THREAD_F15(a0)
+	vstd	$f16, TASK_THREAD_F16(a0)
+	vstd	$f17, TASK_THREAD_F17(a0)
+	vstd	$f18, TASK_THREAD_F18(a0)
+	vstd	$f19, TASK_THREAD_F19(a0)
+	vstd	$f20, TASK_THREAD_F20(a0)
+	vstd	$f21, TASK_THREAD_F21(a0)
+	vstd	$f22, TASK_THREAD_F22(a0)
+	vstd	$f23, TASK_THREAD_F23(a0)
+	vstd	$f24, TASK_THREAD_F24(a0)
+	vstd	$f25, TASK_THREAD_F25(a0)
+	vstd	$f26, TASK_THREAD_F26(a0)
+	vstd	$f27, TASK_THREAD_F27(a0)
 	rfpcr	$f0
-	vstd	$f28, CTX_FP_F28(t0)
-	vstd	$f29, CTX_FP_F29(t0)
-	vstd	$f30, CTX_FP_F30(t0)
-	fstd	$f0, THREAD_FPCR(a0)
-	vldd	$f0, CTX_FP_F0(t0)
+	vstd	$f28, TASK_THREAD_F28(a0)
+	vstd	$f29, TASK_THREAD_F29(a0)
+	vstd	$f30, TASK_THREAD_F30(a0)
+	fstd	$f0, TASK_THREAD_FPCR(a0)
+	vldd	$f0, TASK_THREAD_F0(a0)
 	ret
 END(__fpstate_save)
 
 ENTRY(__fpstate_restore)
 	/* a0: next task */
-	ldi	a0, TASK_THREAD(a0)
-	fldd	$f0, THREAD_FPCR(a0)
+	fldd	$f0, TASK_THREAD_FPCR(a0)
 	wfpcr	$f0
 	fimovd	$f0, t1
 	and	t1, 0x3, t1
@@ -70,37 +67,36 @@ $setfpec_1:
 $setfpec_2:
 	setfpec2
 $setfpec_over:
-	ldi	t0, THREAD_CTX_FP(a0)
-	vldd	$f0, CTX_FP_F0(t0)
-	vldd	$f1, CTX_FP_F1(t0)
-	vldd	$f2, CTX_FP_F2(t0)
-	vldd	$f3, CTX_FP_F3(t0)
-	vldd	$f4, CTX_FP_F4(t0)
-	vldd	$f5, CTX_FP_F5(t0)
-	vldd	$f6, CTX_FP_F6(t0)
-	vldd	$f7, CTX_FP_F7(t0)
-	vldd	$f8, CTX_FP_F8(t0)
-	vldd	$f9, CTX_FP_F9(t0)
-	vldd	$f10, CTX_FP_F10(t0)
-	vldd	$f11, CTX_FP_F11(t0)
-	vldd	$f12, CTX_FP_F12(t0)
-	vldd	$f13, CTX_FP_F13(t0)
-	vldd	$f14, CTX_FP_F14(t0)
-	vldd	$f15, CTX_FP_F15(t0)
-	vldd	$f16, CTX_FP_F16(t0)
-	vldd	$f17, CTX_FP_F17(t0)
-	vldd	$f18, CTX_FP_F18(t0)
-	vldd	$f19, CTX_FP_F19(t0)
-	vldd	$f20, CTX_FP_F20(t0)
-	vldd	$f21, CTX_FP_F21(t0)
-	vldd	$f22, CTX_FP_F22(t0)
-	vldd	$f23, CTX_FP_F23(t0)
-	vldd	$f24, CTX_FP_F24(t0)
-	vldd	$f25, CTX_FP_F25(t0)
-	vldd	$f26, CTX_FP_F26(t0)
-	vldd	$f27, CTX_FP_F27(t0)
-	vldd	$f28, CTX_FP_F28(t0)
-	vldd	$f29, CTX_FP_F29(t0)
-	vldd	$f30, CTX_FP_F30(t0)
+	vldd	$f0, TASK_THREAD_F0(a0)
+	vldd	$f1, TASK_THREAD_F1(a0)
+	vldd	$f2, TASK_THREAD_F2(a0)
+	vldd	$f3, TASK_THREAD_F3(a0)
+	vldd	$f4, TASK_THREAD_F4(a0)
+	vldd	$f5, TASK_THREAD_F5(a0)
+	vldd	$f6, TASK_THREAD_F6(a0)
+	vldd	$f7, TASK_THREAD_F7(a0)
+	vldd	$f8, TASK_THREAD_F8(a0)
+	vldd	$f9, TASK_THREAD_F9(a0)
+	vldd	$f10, TASK_THREAD_F10(a0)
+	vldd	$f11, TASK_THREAD_F11(a0)
+	vldd	$f12, TASK_THREAD_F12(a0)
+	vldd	$f13, TASK_THREAD_F13(a0)
+	vldd	$f14, TASK_THREAD_F14(a0)
+	vldd	$f15, TASK_THREAD_F15(a0)
+	vldd	$f16, TASK_THREAD_F16(a0)
+	vldd	$f17, TASK_THREAD_F17(a0)
+	vldd	$f18, TASK_THREAD_F18(a0)
+	vldd	$f19, TASK_THREAD_F19(a0)
+	vldd	$f20, TASK_THREAD_F20(a0)
+	vldd	$f21, TASK_THREAD_F21(a0)
+	vldd	$f22, TASK_THREAD_F22(a0)
+	vldd	$f23, TASK_THREAD_F23(a0)
+	vldd	$f24, TASK_THREAD_F24(a0)
+	vldd	$f25, TASK_THREAD_F25(a0)
+	vldd	$f26, TASK_THREAD_F26(a0)
+	vldd	$f27, TASK_THREAD_F27(a0)
+	vldd	$f28, TASK_THREAD_F28(a0)
+	vldd	$f29, TASK_THREAD_F29(a0)
+	vldd	$f30, TASK_THREAD_F30(a0)
 	ret
 END(__fpstate_restore)
diff --git a/arch/sw_64/kernel/process.c b/arch/sw_64/kernel/process.c
index 6b5f0f3a8345..308675d29c04 100644
--- a/arch/sw_64/kernel/process.c
+++ b/arch/sw_64/kernel/process.c
@@ -256,7 +256,7 @@ EXPORT_SYMBOL(dump_elf_task);
 int
 dump_elf_task_fp(elf_fpreg_t *dest, struct task_struct *task)
 {
-	memcpy(dest, &task->thread.ctx_fp, 32 * 8);
+	memcpy(dest, &task->thread.fpstate, 32 * 8);
 	return 1;
 }
 EXPORT_SYMBOL(dump_elf_task_fp);
diff --git a/arch/sw_64/kernel/ptrace.c b/arch/sw_64/kernel/ptrace.c
index 94fba3569781..bdbd0d97a130 100644
--- a/arch/sw_64/kernel/ptrace.c
+++ b/arch/sw_64/kernel/ptrace.c
@@ -9,6 +9,7 @@
 #include <linux/audit.h>
 #include <linux/regset.h>
 #include <linux/elf.h>
+#include <linux/sched/task_stack.h>
 
 #include <asm/reg.h>
 #include <asm/asm-offsets.h>
@@ -55,9 +56,6 @@ enum {
 	REG_GP = 29
 };
 
-#define FP_REG(fp_regno, vector_regno) \
-	(fp_regno * 32 + vector_regno * 8)
-
 #define R(x)	((size_t) &((struct pt_regs *)0)->x)
 
 short regoffsets[32] = {
@@ -91,8 +89,8 @@ static unsigned long zero;
 static unsigned long *
 get_reg_addr(struct task_struct *task, unsigned long regno)
 {
-	unsigned long *addr;
-	int fp_regno, vector_regno;
+	void *addr;
+	int fno, vno;
 
 	switch (regno) {
 	case USP:
@@ -108,9 +106,8 @@ get_reg_addr(struct task_struct *task, unsigned long regno)
 		addr = (void *)task_pt_regs(task) + regoffsets[regno];
 		break;
 	case FPREG_BASE ... FPREG_END:
-		fp_regno = regno - FPREG_BASE;
-		vector_regno = 0;
-		addr = (void *)((unsigned long)&task->thread.ctx_fp + FP_REG(fp_regno, vector_regno));
+		fno = regno - FPREG_BASE;
+		addr = &task->thread.fpstate.fp[fno].v[0];
 		break;
 	case VECREG_BASE ... VECREG_END:
 		/*
@@ -121,12 +118,12 @@ get_reg_addr(struct task_struct *task, unsigned long regno)
 			addr = &zero;
 			break;
 		}
-		fp_regno = (regno - VECREG_BASE) & 0x1f;
-		vector_regno = 1 + ((regno - VECREG_BASE) >> 5);
-		addr = (void *)((unsigned long)&task->thread.ctx_fp + FP_REG(fp_regno, vector_regno));
+		fno = (regno - VECREG_BASE) & 0x1f;
+		vno = 1 + ((regno - VECREG_BASE) >> 5);
+		addr = &task->thread.fpstate.fp[fno].v[vno];
 		break;
 	case FPCR:
-		addr = (void *)&task->thread.fpcr;
+		addr = &task->thread.fpstate.fpcr;
 		break;
 	case PC:
 		addr = (void *)task_pt_regs(task) + PT_REGS_PC;
@@ -322,17 +319,9 @@ static int fpr_get(struct task_struct *target,
 			const struct user_regset *regset,
 			struct membuf to)
 {
-	int ret;
-	struct user_fpsimd_state uregs;
-
-	memcpy(uregs.vregs, &target->thread.ctx_fp,
-			sizeof(struct context_fpregs));
-
-	uregs.fpcr = target->thread.fpcr;
-
-	ret = membuf_write(&to, &uregs, sizeof(uregs));
 
-	return ret;
+	return membuf_write(&to, &target->thread.fpstate,
+			sizeof(struct user_fpsimd_state));
 }
 
 static int fpr_set(struct task_struct *target,
@@ -340,21 +329,9 @@ static int fpr_set(struct task_struct *target,
 			unsigned int pos, unsigned int count,
 			const void *kbuf, const void __user *ubuf)
 {
-	int ret;
-	struct user_fpsimd_state uregs;
-
-	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
-				&uregs, 0, sizeof(uregs));
-
-	if (ret)
-		return ret;
-
-	memcpy(&target->thread.ctx_fp, uregs.vregs,
-			sizeof(struct context_fpregs));
-
-	target->thread.fpcr = uregs.fpcr;
-
-	return ret;
+	return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				&target->thread.fpstate, 0,
+				sizeof(struct user_fpsimd_state));
 }
 
 enum sw64_regset {
diff --git a/arch/sw_64/kernel/signal.c b/arch/sw_64/kernel/signal.c
index 887326812624..6a6203ccb04f 100644
--- a/arch/sw_64/kernel/signal.c
+++ b/arch/sw_64/kernel/signal.c
@@ -100,9 +100,10 @@ restore_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs)
 	err |= __get_user(usp, sc->sc_regs+30);
 	wrusp(usp);
 	/* simd-fp */
-	err |= __copy_from_user(&current->thread.ctx_fp,
-			&sc->sc_fpregs, sizeof(struct context_fpregs));
-	err |= __get_user(current->thread.fpcr, &sc->sc_fpcr);
+	err |= __copy_from_user(&current->thread.fpstate, &sc->sc_fpregs,
+				offsetof(struct user_fpsimd_state, fpcr));
+	err |= __get_user(current->thread.fpstate.fpcr, &sc->sc_fpcr);
+
 	if (likely(!err))
 		__fpstate_restore(current);
 
@@ -230,9 +231,9 @@ setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs,
 	err |= __put_user(0, sc->sc_regs+31);
 	/* simd-fp */
 	__fpstate_save(current);
-	err |= __copy_to_user(&sc->sc_fpregs,
-			&current->thread.ctx_fp, sizeof(struct context_fpregs));
-	err |= __put_user(current->thread.fpcr, &sc->sc_fpcr);
+	err |= __copy_to_user(&sc->sc_fpregs, &current->thread.fpstate,
+				offsetof(struct user_fpsimd_state, fpcr));
+	err |= __put_user(current->thread.fpstate.fpcr, &sc->sc_fpcr);
 
 	err |= __put_user(regs->trap_a0, &sc->sc_traparg_a0);
 	err |= __put_user(regs->trap_a1, &sc->sc_traparg_a1);
diff --git a/arch/sw_64/kvm/entry.S b/arch/sw_64/kvm/entry.S
index 1e089da5378e..0c02b68ee7d0 100644
--- a/arch/sw_64/kvm/entry.S
+++ b/arch/sw_64/kvm/entry.S
@@ -15,18 +15,16 @@ ENTRY(__sw64_vcpu_run)
 
 	/* save host fpregs */
 	ldl	$1, TI_TASK($8)
-	ldi	$1, TASK_THREAD($1)
 	rfpcr	$f0
-	fstd	$f0, THREAD_FPCR($1)
-	ldi	$1, THREAD_CTX_FP($1)
-	vstd	$f2, CTX_FP_F2($1)
-	vstd	$f3, CTX_FP_F3($1)
-	vstd	$f4, CTX_FP_F4($1)
-	vstd	$f5, CTX_FP_F5($1)
-	vstd	$f6, CTX_FP_F6($1)
-	vstd	$f7, CTX_FP_F7($1)
-	vstd	$f8, CTX_FP_F8($1)
-	vstd	$f9, CTX_FP_F9($1)
+	fstd	$f0, TASK_THREAD_FPCR($1)
+	vstd	$f2, TASK_THREAD_F2($1)
+	vstd	$f3, TASK_THREAD_F3($1)
+	vstd	$f4, TASK_THREAD_F4($1)
+	vstd	$f5, TASK_THREAD_F5($1)
+	vstd	$f6, TASK_THREAD_F6($1)
+	vstd	$f7, TASK_THREAD_F7($1)
+	vstd	$f8, TASK_THREAD_F8($1)
+	vstd	$f9, TASK_THREAD_F9($1)
 
 	ldi	sp, -VCPU_RET_SIZE(sp)
 	/* r16 = guest kvm_vcpu_arch.vcb struct pointer */
@@ -213,8 +211,7 @@ $g_setfpec_over:
 	bic	sp, $8, $8
 	/* restore host fpregs */
 	ldl	$1, TI_TASK($8)
-	ldi	$1, TASK_THREAD($1)
-	fldd	$f0, THREAD_FPCR($1)
+	fldd	$f0, TASK_THREAD_FPCR($1)
 	wfpcr	$f0
 	fimovd	$f0, $2
 	and	$2, 0x3, $2
@@ -234,15 +231,14 @@ $setfpec_1:
 $setfpec_2:
 	setfpec2
 $setfpec_over:
-	ldi	$1, THREAD_CTX_FP($1)
-	vldd	$f2, CTX_FP_F2($1)
-	vldd	$f3, CTX_FP_F3($1)
-	vldd	$f4, CTX_FP_F4($1)
-	vldd	$f5, CTX_FP_F5($1)
-	vldd	$f6, CTX_FP_F6($1)
-	vldd	$f7, CTX_FP_F7($1)
-	vldd	$f8, CTX_FP_F8($1)
-	vldd	$f9, CTX_FP_F9($1)
+	vldd	$f2, TASK_THREAD_F2($1)
+	vldd	$f3, TASK_THREAD_F3($1)
+	vldd	$f4, TASK_THREAD_F4($1)
+	vldd	$f5, TASK_THREAD_F5($1)
+	vldd	$f6, TASK_THREAD_F6($1)
+	vldd	$f7, TASK_THREAD_F7($1)
+	vldd	$f8, TASK_THREAD_F8($1)
+	vldd	$f9, TASK_THREAD_F9($1)
 
 	/* if $0 > 0, handle hcall */
 	bgt	$0, $ret_to
-- 
Gitee


From 6a2490239e1f86c8009ab192e2361aa4f911da2e Mon Sep 17 00:00:00 2001
From: He Sheng <hesheng@wxiat.com>
Date: Fri, 27 May 2022 10:01:54 +0800
Subject: [PATCH 120/674] sw64: fix ptrace.h with types.h and NOT __ASSEMBLY__

Sunway inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I56OLG

--------------------------------

It may report compile errors on some user apps if linux/types.h is
nowhere to be included. And a __ASSEMBLY__ guard makes it work if
included by assembly codes.

Signed-off-by: He Sheng <hesheng@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/include/uapi/asm/ptrace.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/sw_64/include/uapi/asm/ptrace.h b/arch/sw_64/include/uapi/asm/ptrace.h
index a45b10b8914a..80bad067fc15 100644
--- a/arch/sw_64/include/uapi/asm/ptrace.h
+++ b/arch/sw_64/include/uapi/asm/ptrace.h
@@ -2,6 +2,9 @@
 #ifndef _UAPI_ASM_SW64_PTRACE_H
 #define _UAPI_ASM_SW64_PTRACE_H
 
+#include <linux/types.h>
+
+#ifndef __ASSEMBLY__
 /*
  * User structures for general purpose, floating point and debug registers.
  */
@@ -21,6 +24,7 @@ struct user_fpsimd_state {
 	__u64 fpcr;
 	__u64 __reserved[3];
 };
+#endif
 
 /* PTRACE_ATTACH is 16 */
 /* PTRACE_DETACH is 17 */
-- 
Gitee


From 9941c328d6527b408479f70e782a40c706c277d1 Mon Sep 17 00:00:00 2001
From: He Sheng <hesheng@wxiat.com>
Date: Fri, 27 May 2022 10:09:44 +0800
Subject: [PATCH 121/674] sw64: rewrite elf core copy interfaces

Sunway inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I56OLG

--------------------------------

For gpreg, simplify dump method to make it easier to maintain.

For fpreg, redefine elf_fpregset_t with struct user_fpsimd_state and
add a arch-specific dump_fpu() method.

Signed-off-by: He Sheng <hesheng@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/include/asm/elf.h | 34 +++++--------------
 arch/sw_64/kernel/process.c  | 66 ++++++++----------------------------
 2 files changed, 23 insertions(+), 77 deletions(-)

diff --git a/arch/sw_64/include/asm/elf.h b/arch/sw_64/include/asm/elf.h
index abecc5653eba..8c858cff5573 100644
--- a/arch/sw_64/include/asm/elf.h
+++ b/arch/sw_64/include/asm/elf.h
@@ -55,23 +55,18 @@
 #define EF_SW64_32BIT		1	/* All addresses are below 2GB */
 
 /*
- * ELF register definitions..
- */
-
-/*
- * The legacy version of <sys/procfs.h> makes gregset_t 46 entries long.
- * I have no idea why that is so.  For now, we just leave it at 33
- * (32 general regs + processor status word).
+ * ELF register definitions.
+ *
+ * For now, we just leave it at 33 (32 general regs + processor status word).
  */
 #define ELF_NGREG	33
-#define ELF_NFPREG	32
-
 
 typedef unsigned long elf_greg_t;
 typedef elf_greg_t elf_gregset_t[ELF_NGREG];
 
-typedef double elf_fpreg_t;
-typedef elf_fpreg_t elf_fpregset_t[ELF_NFPREG];
+/* Same with user_fpsimd_state */
+#include <uapi/asm/ptrace.h>
+typedef struct user_fpsimd_state elf_fpregset_t;
 
 /*
  * This is used to ensure we don't load something for the wrong architecture.
@@ -121,22 +116,9 @@ extern int arch_setup_additional_pages(struct linux_binprm *bprm,
 
 #ifdef __KERNEL__
 struct pt_regs;
-struct thread_info;
 struct task_struct;
-extern void dump_elf_thread(elf_greg_t *dest, struct pt_regs *pt,
-			    struct thread_info *ti);
-#define ELF_CORE_COPY_REGS(DEST, REGS) \
-	dump_elf_thread(DEST, REGS, current_thread_info());
-
-/* Similar, but for a thread other than current.  */
-
-extern int dump_elf_task(elf_greg_t *dest, struct task_struct *task);
-#define ELF_CORE_COPY_TASK_REGS(TASK, DEST) dump_elf_task(*(DEST), TASK)
-
-/* Similar, but for the FP registers.  */
-
-extern int dump_elf_task_fp(elf_fpreg_t *dest, struct task_struct *task);
-#define ELF_CORE_COPY_FPREGS(TASK, DEST) dump_elf_task_fp(*(DEST), TASK)
+extern void sw64_elf_core_copy_regs(elf_greg_t *dest, struct pt_regs *pt);
+#define ELF_CORE_COPY_REGS(DEST, REGS) sw64_elf_core_copy_regs(DEST, REGS);
 
 /*
  * This yields a mask that user programs can use to figure out what
diff --git a/arch/sw_64/kernel/process.c b/arch/sw_64/kernel/process.c
index 308675d29c04..7a7578d530c6 100644
--- a/arch/sw_64/kernel/process.c
+++ b/arch/sw_64/kernel/process.c
@@ -200,66 +200,30 @@ copy_thread(unsigned long clone_flags, unsigned long usp,
 
 /*
  * Fill in the user structure for a ELF core dump.
+ * @regs: should be signal_pt_regs() or task_pt_reg(task)
  */
-void
-dump_elf_thread(elf_greg_t *dest, struct pt_regs *pt, struct thread_info *ti)
+void sw64_elf_core_copy_regs(elf_greg_t *dest, struct pt_regs *regs)
 {
-	dest[0] = pt->r0;
-	dest[1] = pt->r1;
-	dest[2] = pt->r2;
-	dest[3] = pt->r3;
-	dest[4] = pt->r4;
-	dest[5] = pt->r5;
-	dest[6] = pt->r6;
-	dest[7] = pt->r7;
-	dest[8] = pt->r8;
-	dest[9] = pt->r9;
-	dest[10] = pt->r10;
-	dest[11] = pt->r11;
-	dest[12] = pt->r12;
-	dest[13] = pt->r13;
-	dest[14] = pt->r14;
-	dest[15] = pt->r15;
-	dest[16] = pt->r16;
-	dest[17] = pt->r17;
-	dest[18] = pt->r18;
-	dest[19] = pt->r19;
-	dest[20] = pt->r20;
-	dest[21] = pt->r21;
-	dest[22] = pt->r22;
-	dest[23] = pt->r23;
-	dest[24] = pt->r24;
-	dest[25] = pt->r25;
-	dest[26] = pt->r26;
-	dest[27] = pt->r27;
-	dest[28] = pt->r28;
-	dest[29] = pt->gp;
-	dest[30] = ti == current_thread_info() ? rdusp() : ti->pcb.usp;
-	dest[31] = pt->pc;
+	int i;
+	struct thread_info *ti;
 
-	/* Once upon a time this was the PS value.  Which is stupid
-	 * since that is always 8 for usermode.  Usurped for the more
-	 * useful value of the thread's UNIQUE field.
-	 */
-	dest[32] = ti->pcb.unique;
-}
-EXPORT_SYMBOL(dump_elf_thread);
+	ti = (void *)((__u64)regs & ~(THREAD_SIZE - 1));
 
-int
-dump_elf_task(elf_greg_t *dest, struct task_struct *task)
-{
-	dump_elf_thread(dest, task_pt_regs(task), task_thread_info(task));
-	return 1;
+	for (i = 0; i < 30; i++)
+		dest[i] = *(__u64 *)((void *)regs + regoffsets[i]);
+	dest[30] = ti == current_thread_info() ? rdusp() : ti->pcb.usp;
+	dest[31] = regs->pc;
+	dest[32] = ti->pcb.unique;
 }
-EXPORT_SYMBOL(dump_elf_task);
+EXPORT_SYMBOL(sw64_elf_core_copy_regs);
 
-int
-dump_elf_task_fp(elf_fpreg_t *dest, struct task_struct *task)
+/* Fill in the fpu structure for a core dump.  */
+int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fpu)
 {
-	memcpy(dest, &task->thread.fpstate, 32 * 8);
+	memcpy(fpu, &current->thread.fpstate, sizeof(*fpu));
 	return 1;
 }
-EXPORT_SYMBOL(dump_elf_task_fp);
+EXPORT_SYMBOL(dump_fpu);
 
 /*
  * Under heavy swap load I've seen this lose in an ugly way.  So do
-- 
Gitee


From 4c651be01d70a2c0e3c6ac2c2821eac775f85aaa Mon Sep 17 00:00:00 2001
From: Wu Liliu <wuliliu@wxiat.com>
Date: Tue, 31 May 2022 09:48:40 +0800
Subject: [PATCH 122/674] sw64: perf: add fp based stack trace support

Sunway inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I56X48

--------------------------------

It used to obtain return address of function by scanning entire
stack which lead to inaccurate result. This patch adds fp based
stack trace support to improve efficiency and accuracy.

Because toolchain is not ready yet, we take CONFIG_FRAME_POINTER
to activate this support, and it should be disabled by default.

Signed-off-by: Wu Liliu <wuliliu@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/include/asm/stacktrace.h | 72 +++++++++++++++++++++++++
 arch/sw_64/kernel/perf_event.c      | 66 +++++++++++++++++++----
 arch/sw_64/kernel/stacktrace.c      | 83 +++++++++++++++++++++++++++++
 3 files changed, 210 insertions(+), 11 deletions(-)
 create mode 100644 arch/sw_64/include/asm/stacktrace.h

diff --git a/arch/sw_64/include/asm/stacktrace.h b/arch/sw_64/include/asm/stacktrace.h
new file mode 100644
index 000000000000..813aa5e7a91d
--- /dev/null
+++ b/arch/sw_64/include/asm/stacktrace.h
@@ -0,0 +1,72 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _ASM_SW64_STACKTRACE_H
+#define _ASM_SW64_STACKTRACE_H
+
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <asm/memory.h>
+#include <asm/ptrace.h>
+
+struct stackframe {
+	unsigned long pc;
+	unsigned long fp;
+};
+
+enum stack_type {
+	STACK_TYPE_UNKNOWN,
+	STACK_TYPE_TASK,
+};
+
+struct stack_info {
+	unsigned long low;
+	unsigned long high;
+	enum stack_type type;
+};
+
+/* The form of the top of the frame on the stack */
+struct stack_frame {
+	unsigned long return_address;
+	struct stack_frame *next_frame;
+};
+
+extern int unwind_frame(struct task_struct *tsk, struct stackframe *frame);
+extern void walk_stackframe(struct task_struct *tsk, struct stackframe *frame,
+			    int (*fn)(struct stackframe *, void *), void *data);
+
+static inline bool on_task_stack(struct task_struct *tsk, unsigned long sp,
+				struct stack_info *info)
+{
+	unsigned long low = (unsigned long)task_stack_page(tsk);
+	unsigned long high = low + THREAD_SIZE;
+
+	if (sp < low || sp >= high)
+		return false;
+
+	if (info) {
+		info->low = low;
+		info->high = high;
+		info->type = STACK_TYPE_TASK;
+	}
+
+	return true;
+}
+
+/*
+ * We can only safely access per-cpu stacks from current in a non-preemptible
+ * context.
+ */
+static inline bool on_accessible_stack(struct task_struct *tsk,
+					unsigned long sp,
+					struct stack_info *info)
+{
+	if (on_task_stack(tsk, sp, info))
+		return true;
+	if (tsk != current || preemptible())
+		return false;
+
+	return false;
+}
+
+#endif	/* _ASM_SW64_STACKTRACE_H */
diff --git a/arch/sw_64/kernel/perf_event.c b/arch/sw_64/kernel/perf_event.c
index 0d49224ba058..70f1f2781016 100644
--- a/arch/sw_64/kernel/perf_event.c
+++ b/arch/sw_64/kernel/perf_event.c
@@ -6,6 +6,7 @@
  */
 
 #include <linux/perf_event.h>
+#include <asm/stacktrace.h>
 
 /* For tracking PMCs and the hw events they monitor on each CPU. */
 struct cpu_hw_events {
@@ -247,10 +248,9 @@ static const struct sw64_perf_event *core3_map_cache_event(u64 config)
  */
 static bool core3_raw_event_valid(u64 config)
 {
-	if ((config >= (PC0_RAW_BASE + PC0_MIN) && config <= (PC0_RAW_BASE + PC0_MAX)) ||
-		(config >= (PC1_RAW_BASE + PC1_MIN) && config <= (PC1_RAW_BASE + PC1_MAX))) {
+	if ((config >= PC0_RAW_BASE && config <= (PC0_RAW_BASE + PC0_MAX)) ||
+		(config >= PC1_RAW_BASE && config <= (PC1_RAW_BASE + PC1_MAX)))
 		return true;
-	}
 
 	pr_info("sw64 pmu: invalid raw event config %#llx\n", config);
 	return false;
@@ -697,6 +697,36 @@ bool valid_dy_addr(unsigned long addr)
 	return ret;
 }
 
+#ifdef CONFIG_FRAME_POINTER
+void perf_callchain_user(struct perf_callchain_entry_ctx *entry,
+		struct pt_regs *regs)
+{
+
+	struct stack_frame frame;
+	unsigned long __user *fp;
+	int err;
+
+	perf_callchain_store(entry, regs->pc);
+
+	fp = (unsigned long __user *)regs->r15;
+
+	while (entry->nr < entry->max_stack && (unsigned long)fp < current->mm->start_stack) {
+		if (!access_ok(fp, sizeof(frame)))
+			break;
+
+		pagefault_disable();
+		err =  __copy_from_user_inatomic(&frame, fp, sizeof(frame));
+		pagefault_enable();
+
+		if (err)
+			break;
+
+		if (valid_utext_addr(frame.return_address) || valid_dy_addr(frame.return_address))
+			perf_callchain_store(entry, frame.return_address);
+		fp = (void __user *)frame.next_frame;
+	}
+}
+#else /* !CONFIG_FRAME_POINTER */
 void perf_callchain_user(struct perf_callchain_entry_ctx *entry,
 		struct pt_regs *regs)
 {
@@ -709,30 +739,44 @@ void perf_callchain_user(struct perf_callchain_entry_ctx *entry,
 	while (entry->nr < entry->max_stack && usp < current->mm->start_stack) {
 		if (!access_ok(usp, 8))
 			break;
+
 		pagefault_disable();
 		err = __get_user(user_addr, (unsigned long *)usp);
 		pagefault_enable();
+
 		if (err)
 			break;
+
 		if (valid_utext_addr(user_addr) || valid_dy_addr(user_addr))
 			perf_callchain_store(entry, user_addr);
 		usp = usp + 8;
 	}
 }
+#endif/* CONFIG_FRAME_POINTER */
+
+/*
+ * Gets called by walk_stackframe() for every stackframe. This will be called
+ * whist unwinding the stackframe and is like a subroutine return so we use
+ * the PC.
+ */
+static int callchain_trace(struct stackframe *frame, void *data)
+{
+	struct perf_callchain_entry_ctx *entry = data;
+
+	perf_callchain_store(entry, frame->pc);
+
+	return 0;
+}
 
 void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
 			   struct pt_regs *regs)
 {
-	unsigned long *sp = (unsigned long *)current_thread_info()->pcb.ksp;
-	unsigned long addr;
+	struct stackframe frame;
 
-	perf_callchain_store(entry, regs->pc);
+	frame.fp = regs->r15;
+	frame.pc = regs->pc;
 
-	while (!kstack_end(sp) && entry->nr < entry->max_stack) {
-		addr = *sp++;
-		if (__kernel_text_address(addr))
-			perf_callchain_store(entry, addr);
-	}
+	walk_stackframe(current, &frame, callchain_trace, entry);
 }
 
 /*
diff --git a/arch/sw_64/kernel/stacktrace.c b/arch/sw_64/kernel/stacktrace.c
index 41cdff5b4941..2671331717ba 100644
--- a/arch/sw_64/kernel/stacktrace.c
+++ b/arch/sw_64/kernel/stacktrace.c
@@ -8,7 +8,90 @@
 #include <linux/stacktrace.h>
 #include <linux/sched/task_stack.h>
 #include <linux/sched/debug.h>
+#include <linux/ftrace.h>
+#include <linux/perf_event.h>
+#include <asm/stacktrace.h>
 
+/*
+ * sw_64 PCS assigns the frame pointer to r15.
+ *
+ * A simple function prologue looks like this:
+ *	ldi     sp,-xx(sp)
+ *	stl     ra,0(sp)
+ *	stl     fp,8(sp)
+ *	mov     sp,fp
+ *
+ * A simple function epilogue looks like this:
+ *	mov     fp,sp
+ *	ldl     ra,0(sp)
+ *	ldl     fp,8(sp)
+ *	ldi     sp,+xx(sp)
+ */
+
+#ifdef CONFIG_FRAME_POINTER
+
+int unwind_frame(struct task_struct *tsk, struct stackframe *frame)
+{
+	unsigned long fp = frame->fp;
+
+	if (fp & 0x7)
+		return -EINVAL;
+
+	if (!tsk)
+		tsk = current;
+
+	if (!on_accessible_stack(tsk, fp, NULL))
+		return -EINVAL;
+
+	frame->pc = READ_ONCE_NOCHECK(*(unsigned long *)(fp));
+	frame->fp = READ_ONCE_NOCHECK(*(unsigned long *)(fp + 8));
+
+	/*
+	 * Frames created upon entry from user have NULL FP and PC values, so
+	 * don't bother reporting these. Frames created by __noreturn functions
+	 * might have a valid FP even if PC is bogus, so only terminate where
+	 * both are NULL.
+	 */
+	if (!frame->fp && !frame->pc)
+		return -EINVAL;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(unwind_frame);
+
+void walk_stackframe(struct task_struct *tsk, struct stackframe *frame,
+		     int (*fn)(struct stackframe *, void *), void *data)
+{
+	while (1) {
+		int ret;
+
+		if (fn(frame, data))
+			break;
+		ret = unwind_frame(tsk, frame);
+		if (ret < 0)
+			break;
+	}
+}
+EXPORT_SYMBOL_GPL(walk_stackframe);
+
+#else /* !CONFIG_FRAME_POINTER */
+void walk_stackframe(struct task_struct *tsk, struct stackframe *frame,
+		     int (*fn)(struct stackframe *, void *), void *data)
+{
+	unsigned long *sp = (unsigned long *)current_thread_info()->pcb.ksp;
+	unsigned long addr;
+	struct perf_callchain_entry_ctx *entry = data;
+
+	perf_callchain_store(entry, frame->pc);
+	while (!kstack_end(sp) && entry->nr < entry->max_stack) {
+		addr = *sp++;
+		if (__kernel_text_address(addr))
+			perf_callchain_store(entry, addr);
+	}
+}
+EXPORT_SYMBOL_GPL(walk_stackframe);
+
+#endif/* CONFIG_FRAME_POINTER */
 
 /*
  * Save stack-backtrace addresses into a stack_trace buffer.
-- 
Gitee


From 42518209c039862f32068aeee497621aaac776ed Mon Sep 17 00:00:00 2001
From: Xu Chenjiao <xuchenjiao@wxiat.com>
Date: Tue, 31 May 2022 14:11:10 +0800
Subject: [PATCH 123/674] sw64: fix compile errors for NOT chip3

Sunway inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I56QAM

--------------------------------

IO register DEVINT_WKEN, DEVINTWK_INTEN, PME_ENABLE_INTD_CORE0 and
AER_ENABLE_INTD_CORE0 are only defined in chip3. This will lead to
compile errors if other chip is selected. To avoid these errors, we
define default set_devint_wken() and set_pcieport_service_irq() as
weak symbols.

Signed-off-by: Xu Chenjiao <xuchenjiao@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/chip/chip3/chip.c | 16 ++++++++++++++++
 arch/sw_64/kernel/pci.c      | 15 ++++-----------
 2 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/arch/sw_64/chip/chip3/chip.c b/arch/sw_64/chip/chip3/chip.c
index acd25fe99669..b73afc9c62ac 100644
--- a/arch/sw_64/chip/chip3/chip.c
+++ b/arch/sw_64/chip/chip3/chip.c
@@ -90,6 +90,22 @@ void setup_chip_clocksource(void)
 #endif
 }
 
+void set_devint_wken(int node)
+{
+	unsigned long val;
+
+	/* enable INTD wakeup */
+	val = 0x80;
+	sw64_io_write(node, DEVINT_WKEN, val);
+	sw64_io_write(node, DEVINTWK_INTEN, val);
+}
+
+void set_pcieport_service_irq(int node, int index)
+{
+	write_piu_ior0(node, index, PMEINTCONFIG, PME_ENABLE_INTD_CORE0);
+	write_piu_ior0(node, index, AERERRINTCONFIG, AER_ENABLE_INTD_CORE0);
+}
+
 static int chip3_get_cpu_nums(void)
 {
 	unsigned long trkmode;
diff --git a/arch/sw_64/kernel/pci.c b/arch/sw_64/kernel/pci.c
index 401ee0b781be..a004ea9fde7f 100644
--- a/arch/sw_64/kernel/pci.c
+++ b/arch/sw_64/kernel/pci.c
@@ -600,15 +600,7 @@ sw64_init_host(unsigned long node, unsigned long index)
 	}
 }
 
-static void set_devint_wken(int node)
-{
-	unsigned long val;
-
-	/* enable INTD wakeup */
-	val = 0x80;
-	sw64_io_write(node, DEVINT_WKEN, val);
-	sw64_io_write(node, DEVINTWK_INTEN, val);
-}
+void __weak set_devint_wken(int node) {}
 
 void __init sw64_init_arch(void)
 {
@@ -651,6 +643,8 @@ void __init sw64_init_arch(void)
 	}
 }
 
+void __weak set_pcieport_service_irq(int node, int index) {}
+
 static void __init sw64_init_intx(struct pci_controller *hose)
 {
 	unsigned long int_conf, node, val_node;
@@ -679,8 +673,7 @@ static void __init sw64_init_intx(struct pci_controller *hose)
 	if (sw64_chip_init->pci_init.set_intx)
 		sw64_chip_init->pci_init.set_intx(node, index, int_conf);
 
-	write_piu_ior0(node, index, PMEINTCONFIG, PME_ENABLE_INTD_CORE0);
-	write_piu_ior0(node, index, AERERRINTCONFIG, AER_ENABLE_INTD_CORE0);
+	set_pcieport_service_irq(node, index);
 }
 
 void __init sw64_init_irq(void)
-- 
Gitee


From afe00ca0c338c3a0828cc835f249e4b11906b632 Mon Sep 17 00:00:00 2001
From: Xiong Aifei <xiongaifei@wxiat.com>
Date: Mon, 6 Jun 2022 15:24:44 +0800
Subject: [PATCH 124/674] sw64: gpu: correct low-level mmio memset/memcpy
 direct calls

Sunway inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GDKC

--------------------------------

Driver codes of the direct calls, via the SIMD-optimized memset
and memcpy functions, may raise DFAULT when using RX580 or R7
Series graphics card on sw64, so work around 'memset' references
to '_memset_c_io' and 'memcpy' to 'memcpy_fromio'.

Signed-off-by: Xiong Aifei <xiongaifei@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 12 ++++++++++++
 drivers/gpu/drm/radeon/vce_v1_0.c     |  5 +++++
 2 files changed, 17 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index c36258d56b44..851d64e83166 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -1354,7 +1354,11 @@ static int gfx_v8_0_mec_init(struct amdgpu_device *adev)
 			return r;
 		}
 
+#if IS_ENABLED(CONFIG_SW64)
+		_memset_c_io(hpd, 0, mec_hpd_size);
+#else
 		memset(hpd, 0, mec_hpd_size);
+#endif
 
 		amdgpu_bo_kunmap(adev->gfx.mec.hpd_eop_obj);
 		amdgpu_bo_unreserve(adev->gfx.mec.hpd_eop_obj);
@@ -4649,7 +4653,11 @@ static int gfx_v8_0_kiq_init_queue(struct amdgpu_ring *ring)
 		vi_srbm_select(adev, 0, 0, 0, 0);
 		mutex_unlock(&adev->srbm_mutex);
 	} else {
+#if IS_ENABLED(CONFIG_SW64)
+		_memset_c_io((void *)mqd, 0, sizeof(struct vi_mqd_allocation));
+#else
 		memset((void *)mqd, 0, sizeof(struct vi_mqd_allocation));
+#endif
 		((struct vi_mqd_allocation *)mqd)->dynamic_cu_mask = 0xFFFFFFFF;
 		((struct vi_mqd_allocation *)mqd)->dynamic_rb_mask = 0xFFFFFFFF;
 		mutex_lock(&adev->srbm_mutex);
@@ -4660,7 +4668,11 @@ static int gfx_v8_0_kiq_init_queue(struct amdgpu_ring *ring)
 		mutex_unlock(&adev->srbm_mutex);
 
 		if (adev->gfx.mec.mqd_backup[mqd_idx])
+#if IS_ENABLED(CONFIG_SW64)
+			memcpy_fromio(adev->gfx.mec.mqd_backup[mqd_idx], mqd, sizeof(struct vi_mqd_allocation));
+#else
 			memcpy(adev->gfx.mec.mqd_backup[mqd_idx], mqd, sizeof(struct vi_mqd_allocation));
+#endif
 	}
 
 	return 0;
diff --git a/drivers/gpu/drm/radeon/vce_v1_0.c b/drivers/gpu/drm/radeon/vce_v1_0.c
index bd75bbcf5bf6..fbd8a5d9a691 100644
--- a/drivers/gpu/drm/radeon/vce_v1_0.c
+++ b/drivers/gpu/drm/radeon/vce_v1_0.c
@@ -193,8 +193,13 @@ int vce_v1_0_load_fw(struct radeon_device *rdev, uint32_t *data)
 	data[3] = sign->val[i].nonce[3];
 	data[4] = cpu_to_le32(le32_to_cpu(sign->len) + 64);
 
+#if IS_ENABLED(CONFIG_SW64)
+	memset_io(&data[5], 0, 44);
+	memcpy_toio(&data[16], &sign[1], rdev->vce_fw->size - sizeof(*sign));
+#else
 	memset(&data[5], 0, 44);
 	memcpy(&data[16], &sign[1], rdev->vce_fw->size - sizeof(*sign));
+#endif
 
 	data += (le32_to_cpu(sign->len) + 64) / 4;
 	data[0] = sign->val[i].sigval[0];
-- 
Gitee


From 852e6489f5d399a75ad6e755e60e9312a2e4300e Mon Sep 17 00:00:00 2001
From: Xu Chenjiao <xuchenjiao@wxiat.com>
Date: Mon, 6 Jun 2022 16:50:04 +0800
Subject: [PATCH 125/674] sw64: pcie: fix lack of PME and AER interrupt service
 routines

Sunway inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I56XUF

--------------------------------

On SW64 platform, we used to enable PME and AER interrupt by default,
however, there is a risk that no body cares the interrupt routine when
CONFIG_PCIE_PME=n or CONFIG_PCIEAER=n. So let's fix this problem.

Signed-off-by: Xu Chenjiao <xuchenjiao@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/chip/chip3/chip.c | 30 +++++++++++++++++++-----------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/arch/sw_64/chip/chip3/chip.c b/arch/sw_64/chip/chip3/chip.c
index b73afc9c62ac..2103d93a53a2 100644
--- a/arch/sw_64/chip/chip3/chip.c
+++ b/arch/sw_64/chip/chip3/chip.c
@@ -102,8 +102,11 @@ void set_devint_wken(int node)
 
 void set_pcieport_service_irq(int node, int index)
 {
-	write_piu_ior0(node, index, PMEINTCONFIG, PME_ENABLE_INTD_CORE0);
-	write_piu_ior0(node, index, AERERRINTCONFIG, AER_ENABLE_INTD_CORE0);
+	if (IS_ENABLED(CONFIG_PCIE_PME))
+		write_piu_ior0(node, index, PMEINTCONFIG, PME_ENABLE_INTD_CORE0);
+
+	if (IS_ENABLED(CONFIG_PCIEAER))
+		write_piu_ior0(node, index, AERERRINTCONFIG, AER_ENABLE_INTD_CORE0);
 }
 
 static int chip3_get_cpu_nums(void)
@@ -438,7 +441,7 @@ extern struct pci_controller *hose_head, **hose_tail;
 static void sw6_handle_intx(unsigned int offset)
 {
 	struct pci_controller *hose;
-	unsigned long value, pme_value, aer_value;
+	unsigned long value;
 
 	hose = hose_head;
 	for (hose = hose_head; hose; hose = hose->next) {
@@ -451,15 +454,20 @@ static void sw6_handle_intx(unsigned int offset)
 			write_piu_ior0(hose->node, hose->index, INTACONFIG + (offset << 7), value);
 		}
 
-		pme_value = read_piu_ior0(hose->node, hose->index, PMEINTCONFIG);
-		aer_value = read_piu_ior0(hose->node, hose->index, AERERRINTCONFIG);
-		if ((pme_value >> 63) || (aer_value >> 63)) {
-			handle_irq(hose->service_irq);
+		if (IS_ENABLED(CONFIG_PCIE_PME)) {
+			value = read_piu_ior0(hose->node, hose->index, PMEINTCONFIG);
+			if (value >> 63) {
+				handle_irq(hose->service_irq);
+				write_piu_ior0(hose->node, hose->index, PMEINTCONFIG, value);
+			}
+		}
 
-			if (pme_value >> 63)
-				write_piu_ior0(hose->node, hose->index, PMEINTCONFIG, pme_value);
-			if (aer_value >> 63)
-				write_piu_ior0(hose->node, hose->index, AERERRINTCONFIG, aer_value);
+		if (IS_ENABLED(CONFIG_PCIEAER)) {
+			value = read_piu_ior0(hose->node, hose->index, AERERRINTCONFIG);
+			if (value >> 63) {
+				handle_irq(hose->service_irq);
+				write_piu_ior0(hose->node, hose->index, AERERRINTCONFIG, value);
+			}
 		}
 
 		if (hose->iommu_enable) {
-- 
Gitee


From e41f34b561b8485031080a07c2e19f22dceb06da Mon Sep 17 00:00:00 2001
From: Zheng Chongzhen <zhengchongzhen@wxiat.com>
Date: Wed, 15 Jun 2022 16:54:25 +0800
Subject: [PATCH 126/674] sw64: fix dma features for zx200

Sunway inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I56OSP

--------------------------------

The device controllers of ZX200 chipset are all the 32-bit DMA capable
device, so we add pci quirk function hook to set dev.bus_dma_limit
to DMA_BIT_MASK(32) for all devices on the chipset, besides when
CONFIG_SUNWAY_IOMMU=n, all these devices will use swiotlb dma_ops.

Signed-off-by: Zheng Chongzhen <zhengchongzhen@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/kernel/pci.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/arch/sw_64/kernel/pci.c b/arch/sw_64/kernel/pci.c
index a004ea9fde7f..7cc8b7d2d43b 100644
--- a/arch/sw_64/kernel/pci.c
+++ b/arch/sw_64/kernel/pci.c
@@ -691,3 +691,16 @@ sw64_init_pci(void)
 {
 	common_init_pci();
 }
+
+static int setup_bus_dma_cb(struct pci_dev *pdev, void *data)
+{
+	pdev->dev.bus_dma_limit = DMA_BIT_MASK(32);
+	return 0;
+}
+
+static void fix_bus_dma_limit(struct pci_dev *dev)
+{
+	pci_walk_bus(dev->subordinate, setup_bus_dma_cb, NULL);
+	pr_info("Set zx200 bus_dma_limit to 32-bit\n");
+}
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ZHAOXIN, 0x071f, fix_bus_dma_limit);
-- 
Gitee


From aadce49276fe3cb5d65f9bd932522f5ac069564b Mon Sep 17 00:00:00 2001
From: Zheng Chongzhen <zhengchongzhen@wxiat.com>
Date: Wed, 15 Jun 2022 17:40:59 +0800
Subject: [PATCH 127/674] sw64: iommu: fix iommu interrupt handler

Sunway inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I56OSP

--------------------------------

When porting ZX200 chipset driver on SW64 platform, we meet an IOMMU
exception show as follows:

iommu_interrupt, iommu_status = 0xc8014000dffe0000, devid 0xa00, dva 0xdffe0000,
1Unable to handle kernel paging request at virtual address 0000000000000040
CPU 0 swapper/0(0): Oops 0
pc = [<ffffffff81424b80>]  ra = [<ffffffff81424b24>]  ps = 0001    Not tainted
pc is at iommu_interrupt+0x140/0x3e0
ra is at iommu_interrupt+0xe4/0x3e0
v0 = 0000000000000051  t0 = c8014000dffe0000  t1 = 0000000000000000
t2 = 0000000000000000  t3 = 0000000000000000  t4 = 0000000000000001
t5 = 0000000000000001  t6 = 0000000000000000  t7 = ffffffff82948000
s0 = fff00003ffff0400  s1 = 0000000000000001  s2 = 0000000000000a00
s3 = 0000000000000a00  s4 = 00000000dffe0000  s5 = fff0000100680e80
s6 = ffffffff8294ba70
a0 = 0000000000000001  a1 = 0000000000000001  a2 = ffffffff8294b790
a3 = ffffffff8294b7a8  a4 = 0000000000000000  a5 = ffffffff82c5fb7a
t8 = 0000000000000001  t9 = fffffffffffcac48  t10 = 0000000000000000
t11= 0000000000000000  pv = ffffffff809f4f10  at = ffffffff82bff6c0
gp = ffffffff82c1f510  sp = (____ptrval____)

The root cause is that the device which raises iommu exception is not in the
device list, then reference a null sdev will cause a page fualt. To work
around this problem, we apply this patch by just clearing IOMMUEXCPT_STATUS
and then go on.

BTW, why the device raise IOMMU exception is not a valid device ID, it's a
puzzling problem.

Signed-off-by: Zheng Chongzhen <zhengchongzhen@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 drivers/iommu/sw64/sunway_iommu.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/drivers/iommu/sw64/sunway_iommu.c b/drivers/iommu/sw64/sunway_iommu.c
index 5797adf8fbc5..b6c8f1272d28 100644
--- a/drivers/iommu/sw64/sunway_iommu.c
+++ b/drivers/iommu/sw64/sunway_iommu.c
@@ -648,10 +648,21 @@ irqreturn_t iommu_interrupt(int irq, void *dev)
 	type = (iommu_status >> 59) & 0x7;
 	devid = (iommu_status >> 37) & 0xffff;
 	dva = iommu_status & 0xffffffff;
-	sdev = search_dev_data(devid);
-	sdomain = sdev->domain;
 	pr_info("%s, iommu_status = %#lx, devid %#lx, dva %#lx, ",
 			__func__, iommu_status, devid, dva);
+
+	sdev = search_dev_data(devid);
+	if (sdev == NULL) {
+		pr_info("no such dev!!!\n");
+
+		iommu_status &= ~(1UL << 62);
+		write_piu_ior0(hose->node, hose->index,
+				IOMMUEXCPT_STATUS, iommu_status);
+
+		return IRQ_HANDLED;
+	}
+
+	sdomain = sdev->domain;
 	switch (type) {
 	case DTE_LEVEL1:
 		pr_info("invalid level1 dte, addr:%#lx, val:%#lx\n",
@@ -674,7 +685,6 @@ irqreturn_t iommu_interrupt(int irq, void *dev)
 			fetch_pte(sdomain, dva, PTE_LEVEL2_VAL));
 
 		iommu_status &= ~(1UL << 62);
-		iommu_status = iommu_status | (1UL << 63);
 		write_piu_ior0(hose->node, hose->index,
 				IOMMUEXCPT_STATUS, iommu_status);
 		break;
-- 
Gitee


From c1bf5440faf937aa402e5c08090537948dd99d60 Mon Sep 17 00:00:00 2001
From: Min Fanlei <minfanlei@wxiat.com>
Date: Tue, 7 Jun 2022 15:12:25 +0800
Subject: [PATCH 128/674] sw64: kvm: fix incorrect page_ref_count() call

Sunway inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I56WV8

--------------------------------

The last page_ref_count() call will cause kernel panic if
kvm memory pool is at the end of DRAM. This patch reorders
the checks to avoid illegal atomic_read operation.

Signed-off-by: Min Fanlei <minfanlei@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/kernel/setup.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/sw_64/kernel/setup.c b/arch/sw_64/kernel/setup.c
index 4d39db5fb1ef..e20e215dd08a 100644
--- a/arch/sw_64/kernel/setup.c
+++ b/arch/sw_64/kernel/setup.c
@@ -1026,8 +1026,7 @@ static int __init sw64_kvm_pool_init(void)
 	end_page  = pfn_to_page((kvm_mem_base + kvm_mem_size - 1) >> PAGE_SHIFT);
 
 	p = base_page;
-	while (page_ref_count(p) == 0 &&
-			(unsigned long)p <= (unsigned long)end_page) {
+	while (p <= end_page && page_ref_count(p) == 0) {
 		set_page_count(p, 1);
 		page_mapcount_reset(p);
 		SetPageReserved(p);
-- 
Gitee


From 3f5352084a8cc53cc8d555dcb20357138a37e276 Mon Sep 17 00:00:00 2001
From: Yang Qiang <yangqiang@wxiat.com>
Date: Wed, 8 Jun 2022 09:35:37 +0800
Subject: [PATCH 129/674] sw64: dtb: check address validity with physical
 address

Sunway inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GE5X

--------------------------------

Due to sw64 ABI sepecification, a large kernel image may override
the data structures of UEFI BIOS.

To solve this problem, we have expanded UEFI BIOS to 1GB to make
sure that runtime service code and data structures reside in the
high address below 1GB which may be beyond ktext map. So fix it.

Signed-off-by: Yang Qiang <yangqiang@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/kernel/setup.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/arch/sw_64/kernel/setup.c b/arch/sw_64/kernel/setup.c
index e20e215dd08a..85d172c32239 100644
--- a/arch/sw_64/kernel/setup.c
+++ b/arch/sw_64/kernel/setup.c
@@ -560,22 +560,20 @@ static void __init setup_machine_fdt(void)
 #ifdef CONFIG_USE_OF
 	void *dt_virt;
 	const char *name;
-	unsigned long phys_addr;
 
 	/* Give a chance to select kernel builtin DTB firstly */
 	if (IS_ENABLED(CONFIG_SW64_BUILTIN_DTB))
 		dt_virt = (void *)__dtb_start;
 	else {
 		dt_virt = (void *)sunway_boot_params->dtb_start;
-		if (dt_virt < (void *)__bss_stop) {
+		if (virt_to_phys(dt_virt) < virt_to_phys(__bss_stop)) {
 			pr_emerg("BUG: DTB has been corrupted by kernel image!\n");
 			while (true)
 				cpu_relax();
 		}
 	}
 
-	phys_addr = __phys_addr((unsigned long)dt_virt);
-	if (!phys_addr_valid(phys_addr) ||
+	if (!phys_addr_valid(virt_to_phys(dt_virt)) ||
 			!early_init_dt_scan(dt_virt)) {
 		pr_crit("\n"
 			"Error: invalid device tree blob at virtual address %px\n"
-- 
Gitee


From b9c01ee05c5fb17024d11bc058214f336718af3b Mon Sep 17 00:00:00 2001
From: He Chuyue <hechuyue@wxiat.com>
Date: Wed, 8 Jun 2022 09:38:24 +0800
Subject: [PATCH 130/674] sw64: perf: fix the number of supported raw events

Sunway inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I56X48

--------------------------------

According to CORE3B core software interface manual, the performance
counter PC_1 on sw64 supports event index up to 0x3d. Now fix it,
and remove some unused macros from wrperfmon.h.

Signed-off-by: He Chuyue <hechuyue@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/include/asm/wrperfmon.h | 4 +---
 arch/sw_64/kernel/perf_event.c     | 2 +-
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/arch/sw_64/include/asm/wrperfmon.h b/arch/sw_64/include/asm/wrperfmon.h
index 098702573bfc..15f7f6beb07c 100644
--- a/arch/sw_64/include/asm/wrperfmon.h
+++ b/arch/sw_64/include/asm/wrperfmon.h
@@ -33,10 +33,8 @@
 
 #define PC0_RAW_BASE			0x0
 #define PC1_RAW_BASE			0x100
-#define PC0_MIN				0x0
 #define PC0_MAX				0xF
-#define PC1_MIN				0x0
-#define PC1_MAX				0x37
+#define PC1_MAX				0x3D
 
 #define SW64_PERFCTRL_KM		2
 #define SW64_PERFCTRL_UM		3
diff --git a/arch/sw_64/kernel/perf_event.c b/arch/sw_64/kernel/perf_event.c
index 70f1f2781016..e9aae53a56f6 100644
--- a/arch/sw_64/kernel/perf_event.c
+++ b/arch/sw_64/kernel/perf_event.c
@@ -244,7 +244,7 @@ static const struct sw64_perf_event *core3_map_cache_event(u64 config)
 
 /*
  * r0xx for counter0, r1yy for counter1.
- * According to the datasheet, 00 <= xx <= 0F, 00 <= yy <= 37
+ * According to the datasheet, 00 <= xx <= 0F, 00 <= yy <= 3D
  */
 static bool core3_raw_event_valid(u64 config)
 {
-- 
Gitee


From de13548b9a24e73d988b43fbad57f32cfb4adfd1 Mon Sep 17 00:00:00 2001
From: Zhu Donghong <zhudonghong@wxiat.com>
Date: Wed, 8 Jun 2022 09:49:30 +0800
Subject: [PATCH 131/674] drivers/irqchip: add sw64 interrupt controller
 support

Sunway inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GDHC

--------------------------------

Add a driver for the SW64 implementation of the CPU interrupt
controller(INTC). In fact, the INTC is just a software simulated
module to connect global interrupt sources to the local interrupt
controller on each core.

In addition, we rename irq-lpc.c and SW64_CHIP3_LPC config.

Signed-off-by: Zhu Donghong <zhudonghong@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/Kconfig                            |   8 +-
 arch/sw_64/chip/chip3/Makefile                |   1 -
 drivers/irqchip/Kconfig                       |  33 +++---
 drivers/irqchip/Makefile                      |   4 +-
 drivers/irqchip/irq-intc-v1.c                 | 104 ------------------
 .../irqchip/irq-sw64-intc-v2.c                |  64 ++++++++++-
 .../{irq-lpc.c => irq-sw64-lpc-intc.c}        |   0
 7 files changed, 84 insertions(+), 130 deletions(-)
 delete mode 100644 drivers/irqchip/irq-intc-v1.c
 rename arch/sw_64/chip/chip3/irq_chip.c => drivers/irqchip/irq-sw64-intc-v2.c (59%)
 rename drivers/irqchip/{irq-lpc.c => irq-sw64-lpc-intc.c} (100%)

diff --git a/arch/sw_64/Kconfig b/arch/sw_64/Kconfig
index 0e32fc7e1f9a..ec6e583a5d9a 100644
--- a/arch/sw_64/Kconfig
+++ b/arch/sw_64/Kconfig
@@ -234,6 +234,7 @@ config PLATFORM_XUELANG
 	depends on SW64_CHIP3
 	select SPARSE_IRQ
 	select SYS_HAS_EARLY_PRINTK
+	select SW64_INTC_V2
 	help
 	  Sunway chip3 board chipset
 
@@ -736,15 +737,10 @@ endmenu
 
 menu "Boot options"
 
-config SW64_IRQ_CHIP
-	bool
-
 config USE_OF
 	bool "Flattened Device Tree support"
-	select GENERIC_IRQ_CHIP
-	select IRQ_DOMAIN
-	select SW64_IRQ_CHIP
 	select OF
+	select IRQ_DOMAIN
 	help
 	  Include support for flattened device tree machine descriptions.
 
diff --git a/arch/sw_64/chip/chip3/Makefile b/arch/sw_64/chip/chip3/Makefile
index 2b7b5790003f..ba0ab3f67f98 100644
--- a/arch/sw_64/chip/chip3/Makefile
+++ b/arch/sw_64/chip/chip3/Makefile
@@ -4,5 +4,4 @@ obj-y	:= chip.o i2c-lib.o
 
 obj-$(CONFIG_PCI)	+= pci-quirks.o
 obj-$(CONFIG_PCI_MSI)	+= msi.o vt_msi.o
-obj-$(CONFIG_SW64_IRQ_CHIP)	+= irq_chip.o
 obj-$(CONFIG_CPUFREQ_DEBUGFS)   += cpufreq_debugfs.o
diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig
index 5bf6cf60999b..dd35895c92f3 100644
--- a/drivers/irqchip/Kconfig
+++ b/drivers/irqchip/Kconfig
@@ -1,21 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
 menu "IRQ chip support"
 
-config SW64_INTC
-	bool "SW64 Platform-Level Interrupt Controller"
-	depends on ACPI && SW64
-	help
-           This enables support for the INTC chip found in SW systems.
-           The INTC controls devices interrupts and connects them to each
-           core's local interrupt controller.
-
-config SW64_CHIP3_LPC
-	bool "SW64 Chip3 Buildin LPC Interrupt Controller"
-	depends on SW64_CHIP3
-	help
-           This enables support for the LPC interrupt controller bultin in
-	   on chip3 series.
-
 config IRQCHIP
 	def_bool y
 	depends on OF_IRQ
@@ -26,6 +11,24 @@ config ARM_GIC
 	select GENERIC_IRQ_MULTI_HANDLER
 	select GENERIC_IRQ_EFFECTIVE_AFF_MASK
 
+config SW64_INTC_V2
+	bool "SW64 Interrupt Controller V2"
+	depends on SW64_CHIP3
+	default y
+	select GENERIC_IRQ_CHIP
+	select IRQ_DOMAIN
+	help
+           This enables support for the INTC chip found in SW CHIP3 systems.
+           The INTC controls devices interrupts and connects them to each
+           core's local interrupt controller.
+
+config SW64_LPC_INTC
+	bool "SW64 cpu builtin LPC Interrupt Controller"
+	depends on SW64_INTC_V2
+	help
+	   Say yes here to add support for the SW64 cpu builtin LPC
+	   IRQ controller.
+
 config ARM_GIC_PM
 	bool
 	depends on PM
diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile
index 78eb12ab4d4c..14a022c074ce 100644
--- a/drivers/irqchip/Makefile
+++ b/drivers/irqchip/Makefile
@@ -27,8 +27,8 @@ obj-$(CONFIG_ARCH_SUNXI)		+= irq-sun4i.o
 obj-$(CONFIG_ARCH_SUNXI)		+= irq-sunxi-nmi.o
 obj-$(CONFIG_ARCH_SPEAR3XX)		+= spear-shirq.o
 obj-$(CONFIG_ARM_GIC)			+= irq-gic.o irq-gic-common.o
-obj-$(CONFIG_SW64_INTC)			+= irq-intc-v1.o
-obj-$(CONFIG_SW64_CHIP3_LPC)		+= irq-lpc.o
+obj-$(CONFIG_SW64_INTC_V2)     		+= irq-sw64-intc-v2.o
+obj-$(CONFIG_SW64_LPC_INTC)		+= irq-sw64-lpc-intc.o
 obj-$(CONFIG_ARM_GIC_PM)		+= irq-gic-pm.o
 obj-$(CONFIG_ARCH_REALVIEW)		+= irq-gic-realview.o
 obj-$(CONFIG_ARM_GIC_V2M)		+= irq-gic-v2m.o
diff --git a/drivers/irqchip/irq-intc-v1.c b/drivers/irqchip/irq-intc-v1.c
deleted file mode 100644
index 4519e96526fb..000000000000
--- a/drivers/irqchip/irq-intc-v1.c
+++ /dev/null
@@ -1,104 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include <linux/acpi_iort.h>
-#include <linux/msi.h>
-#include <linux/acpi.h>
-#include <linux/irqdomain.h>
-#include <linux/interrupt.h>
-#include <linux/cpumask.h>
-#include <linux/io.h>
-#include <linux/percpu.h>
-#include <linux/slab.h>
-#include <linux/irqchip.h>
-#include <asm/sw64io.h>
-static void fake_irq_mask(struct irq_data *data)
-{
-}
-
-static void fake_irq_unmask(struct irq_data *data)
-{
-}
-
-static struct irq_chip onchip_intc = {
-	.name           = "SW fake Intc",
-	.irq_mask       = fake_irq_mask,
-	.irq_unmask     = fake_irq_unmask,
-};
-
-static int sw_intc_domain_map(struct irq_domain *d, unsigned int irq,
-				irq_hw_number_t hw)
-{
-	irq_set_chip_and_handler(irq, &onchip_intc, handle_level_irq);
-	irq_set_status_flags(irq, IRQ_LEVEL);
-	return 0;
-}
-
-static const struct irq_domain_ops intc_irq_domain_ops = {
-	.xlate = irq_domain_xlate_onecell,
-	.map = sw_intc_domain_map,
-};
-
-#ifdef CONFIG_ACPI
-
-static int __init
-intc_parse_madt(union acpi_subtable_headers *header,
-		       const unsigned long end)
-{
-	struct acpi_madt_io_sapic *its_entry;
-	static struct irq_domain *root_domain;
-	int intc_irqs = 8, irq_base = NR_IRQS_LEGACY;
-	irq_hw_number_t hwirq_base = 0;
-	int irq_start = -1;
-
-	its_entry = (struct acpi_madt_io_sapic *)header;
-
-	intc_irqs -= hwirq_base; /* calculate # of irqs to allocate */
-
-	irq_base = irq_alloc_descs(irq_start, 16, intc_irqs,
-			numa_node_id());
-	if (irq_base < 0) {
-		WARN(1, "Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n",
-				irq_start);
-		irq_base = irq_start;
-	}
-
-	root_domain = irq_domain_add_legacy(NULL, intc_irqs, irq_base,
-			hwirq_base, &intc_irq_domain_ops, NULL);
-
-	if (!root_domain)
-		pr_err("Failed to create irqdomain");
-
-	irq_set_default_host(root_domain);
-
-	sw64_io_write(0, MCU_DVC_INT_EN, 0xff);
-
-	return 0;
-}
-
-static int __init acpi_intc_init(void)
-{
-	int count = 0;
-
-	count = acpi_table_parse_madt(ACPI_MADT_TYPE_IO_SAPIC,
-			      intc_parse_madt, 0);
-
-	if (count <= 0) {
-		pr_err("No valid intc entries exist\n");
-		return -EINVAL;
-	}
-	return 0;
-}
-#else
-static int __init acpi_intc_init(void)
-{
-	return 0;
-}
-#endif
-
-static int __init intc_init(void)
-{
-	acpi_intc_init();
-
-	return 0;
-}
-subsys_initcall(intc_init);
diff --git a/arch/sw_64/chip/chip3/irq_chip.c b/drivers/irqchip/irq-sw64-intc-v2.c
similarity index 59%
rename from arch/sw_64/chip/chip3/irq_chip.c
rename to drivers/irqchip/irq-sw64-intc-v2.c
index 24dfa1e1a898..8640c4aa9506 100644
--- a/arch/sw_64/chip/chip3/irq_chip.c
+++ b/drivers/irqchip/irq-sw64-intc-v2.c
@@ -1,8 +1,10 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/irqdomain.h>
 #include <linux/irqchip.h>
-
-#include <asm/irq_impl.h>
+#include <linux/acpi.h>
+#include <linux/acpi_iort.h>
+#include <linux/of_irq.h>
+#include <asm/sw64io.h>
 
 static void fake_irq_mask(struct irq_data *data)
 {
@@ -32,6 +34,64 @@ static const struct irq_domain_ops sw64_intc_domain_ops = {
 	.map = sw64_intc_domain_map,
 };
 
+static int __init
+intc_parse_madt(union acpi_subtable_headers *header,
+		       const unsigned long end)
+{
+	struct acpi_madt_io_sapic *its_entry;
+	static struct irq_domain *root_domain;
+	int intc_irqs = 8, irq_base = NR_IRQS_LEGACY;
+	irq_hw_number_t hwirq_base = 0;
+	int irq_start = -1;
+
+	its_entry = (struct acpi_madt_io_sapic *)header;
+
+	intc_irqs -= hwirq_base; /* calculate # of irqs to allocate */
+
+	irq_base = irq_alloc_descs(irq_start, 16, intc_irqs,
+			numa_node_id());
+	if (irq_base < 0) {
+		WARN(1, "Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n",
+				irq_start);
+		irq_base = irq_start;
+	}
+
+	root_domain = irq_domain_add_legacy(NULL, intc_irqs, irq_base,
+			hwirq_base, &sw64_intc_domain_ops, NULL);
+
+	if (!root_domain)
+		pr_err("Failed to create irqdomain");
+
+	irq_set_default_host(root_domain);
+
+	sw64_io_write(0, MCU_DVC_INT_EN, 0xff);
+
+	return 0;
+}
+
+static int __init acpi_intc_init(void)
+{
+	int count = 0;
+
+	count = acpi_table_parse_madt(ACPI_MADT_TYPE_IO_SAPIC,
+			      intc_parse_madt, 0);
+
+	if (count <= 0) {
+		pr_err("No valid intc entries exist\n");
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static int __init intc_init(void)
+{
+	acpi_intc_init();
+
+	return 0;
+}
+
+subsys_initcall(intc_init);
+
 static struct irq_domain *root_domain;
 
 static int __init
diff --git a/drivers/irqchip/irq-lpc.c b/drivers/irqchip/irq-sw64-lpc-intc.c
similarity index 100%
rename from drivers/irqchip/irq-lpc.c
rename to drivers/irqchip/irq-sw64-lpc-intc.c
-- 
Gitee


From 5384a3a3e7337b4953636b75c681ef0b0083ac0c Mon Sep 17 00:00:00 2001
From: Mao Minkai <maominkai@wxiat.com>
Date: Thu, 23 Jun 2022 14:11:05 +0800
Subject: [PATCH 132/674] sw64: fix __csum_and_copy when dest is not 8-byte
 aligned

Sunway inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I56W9F

--------------------------------

When destination is not 8-byte aligned,
csum_partial_cfu_dest_unaligned() should be used to avoid kernel
unaligned exception.

Fixes: 2fcadd2861b4 ("sw64: optimize ip checksum calculation")
Signed-off-by: Mao Minkai <maominkai@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/lib/csum_partial_copy.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/sw_64/lib/csum_partial_copy.c b/arch/sw_64/lib/csum_partial_copy.c
index 441ae5575de5..5e5274e82b2b 100644
--- a/arch/sw_64/lib/csum_partial_copy.c
+++ b/arch/sw_64/lib/csum_partial_copy.c
@@ -128,9 +128,9 @@ static __wsum __csum_and_copy(const void __user *src, void *dst, int len)
 			(const unsigned long __user *) src,
 			(unsigned long *) dst, len-8);
 	} else {
-		checksum = csum_partial_cfu_dest_aligned(
+		checksum = csum_partial_cfu_dest_unaligned(
 			(const unsigned long __user *) src,
-			(unsigned long *) dst, len-8);
+			(unsigned long *) dst, doff, len-8);
 	}
 	return (__force __wsum)from64to16(checksum);
 }
-- 
Gitee


From 12edb9430a3d902a0e8edb8c3d1489efda8205ad Mon Sep 17 00:00:00 2001
From: Xiong Aifei <xiongaifei@wxiat.com>
Date: Wed, 15 Jun 2022 16:36:00 +0800
Subject: [PATCH 133/674] sw64: gpu: replace '_memset_c_io' by 'memset_io'

Sunway inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GDKC

--------------------------------

Signed-off-by: Xiong Aifei <xiongaifei@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 4 ++--
 drivers/gpu/drm/radeon/radeon_vce.c   | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 851d64e83166..28c4e1fe5cd4 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -1355,7 +1355,7 @@ static int gfx_v8_0_mec_init(struct amdgpu_device *adev)
 		}
 
 #if IS_ENABLED(CONFIG_SW64)
-		_memset_c_io(hpd, 0, mec_hpd_size);
+		memset_io(hpd, 0, mec_hpd_size);
 #else
 		memset(hpd, 0, mec_hpd_size);
 #endif
@@ -4654,7 +4654,7 @@ static int gfx_v8_0_kiq_init_queue(struct amdgpu_ring *ring)
 		mutex_unlock(&adev->srbm_mutex);
 	} else {
 #if IS_ENABLED(CONFIG_SW64)
-		_memset_c_io((void *)mqd, 0, sizeof(struct vi_mqd_allocation));
+		memset_io((void *)mqd, 0, sizeof(struct vi_mqd_allocation));
 #else
 		memset((void *)mqd, 0, sizeof(struct vi_mqd_allocation));
 #endif
diff --git a/drivers/gpu/drm/radeon/radeon_vce.c b/drivers/gpu/drm/radeon/radeon_vce.c
index 68cc5a347d3b..b9680d38d924 100644
--- a/drivers/gpu/drm/radeon/radeon_vce.c
+++ b/drivers/gpu/drm/radeon/radeon_vce.c
@@ -240,7 +240,7 @@ int radeon_vce_resume(struct radeon_device *rdev)
 	}
 
 #ifdef __sw_64__
-	_memset_c_io(cpu_addr, 0, radeon_bo_size(rdev->vce.vcpu_bo));
+	memset_io(cpu_addr, 0, radeon_bo_size(rdev->vce.vcpu_bo));
 #else
 	memset(cpu_addr, 0, radeon_bo_size(rdev->vce.vcpu_bo));
 #endif
-- 
Gitee


From fd62817c1d7a175e76ea65dbaeaa8bea72bfd012 Mon Sep 17 00:00:00 2001
From: He Chuyue <hechuyue@wxiat.com>
Date: Tue, 14 Jun 2022 16:40:12 +0800
Subject: [PATCH 134/674] sw64: perf: fix raw event count

Sunway inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I56X48

--------------------------------

To read raw event count, perf stat calls sw64_pmu_start() to start
the event and then calls sw64_perf_event_set_period() to set a new
period to sample over.

It used to initialize hwc.prev_count and PMC with different values
and this will result in error sample values. To fix this problem,
initialize them to 0 consistently.

Signed-off-by: He Chuyue <hechuyue@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/kernel/perf_event.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/arch/sw_64/kernel/perf_event.c b/arch/sw_64/kernel/perf_event.c
index e9aae53a56f6..52ec34e33269 100644
--- a/arch/sw_64/kernel/perf_event.c
+++ b/arch/sw_64/kernel/perf_event.c
@@ -297,31 +297,33 @@ static int sw64_perf_event_set_period(struct perf_event *event,
 {
 	long left = local64_read(&hwc->period_left);
 	long period = hwc->sample_period;
-	int ret = 0;
+	int overflow = 0;
+	unsigned long value;
 
 	if (unlikely(left <= -period)) {
 		left = period;
 		local64_set(&hwc->period_left, left);
 		hwc->last_period = period;
-		ret = 1;
+		overflow = 1;
 	}
 
 	if (unlikely(left <= 0)) {
 		left += period;
 		local64_set(&hwc->period_left, left);
 		hwc->last_period = period;
-		ret = 1;
+		overflow = 1;
 	}
 
 	if (left > (long)sw64_pmu->pmc_max_period)
 		left = sw64_pmu->pmc_max_period;
 
-	local64_set(&hwc->prev_count, (unsigned long)(-left));
-	sw64_write_pmc(idx,  (unsigned long)(sw64_pmu->pmc_max_period - left));
+	value = sw64_pmu->pmc_max_period - left;
+	local64_set(&hwc->prev_count, value);
+	sw64_write_pmc(idx, value);
 
 	perf_event_update_userpage(event);
 
-	return ret;
+	return overflow;
 }
 
 /*
-- 
Gitee


From 248fac94e07c792a921d7f76c47f87c22b414ff6 Mon Sep 17 00:00:00 2001
From: Lu Feifei <lufeifei@wxiat.com>
Date: Wed, 15 Jun 2022 14:59:23 +0800
Subject: [PATCH 135/674] sw64: add memhotplug support for guest os

Sunway inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I56WV8

--------------------------------

This patch introduces memory-hotplug support for guest os, and it
should set CONFIG_KVM_MEMHOTLPUG=y on host to enable this feature.

Currently, only 1GB memory-hotplug granularity is supported, and
multiple granularity support will be implemented in the future.

Signed-off-by: Lu Feifei <lufeifei@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/chip/chip3/chip.c      |   5 +
 arch/sw_64/include/asm/hcall.h    |   1 +
 arch/sw_64/include/asm/irq_impl.h |   1 +
 arch/sw_64/include/asm/kvm_asm.h  |   3 +
 arch/sw_64/include/asm/kvm_host.h |  11 +-
 arch/sw_64/include/asm/memory.h   |   1 +
 arch/sw_64/kvm/Kconfig            |   7 +
 arch/sw_64/kvm/handle_exit.c      |   5 +
 arch/sw_64/kvm/kvm-sw64.c         | 113 ++++++++++++-
 arch/sw_64/mm/init.c              |   9 ++
 drivers/misc/Kconfig              |   8 +
 drivers/misc/Makefile             |   1 +
 drivers/misc/sunway-ged.c         | 253 ++++++++++++++++++++++++++++++
 13 files changed, 410 insertions(+), 8 deletions(-)
 create mode 100644 drivers/misc/sunway-ged.c

diff --git a/arch/sw_64/chip/chip3/chip.c b/arch/sw_64/chip/chip3/chip.c
index 2103d93a53a2..02b369b2b37b 100644
--- a/arch/sw_64/chip/chip3/chip.c
+++ b/arch/sw_64/chip/chip3/chip.c
@@ -701,6 +701,11 @@ void handle_chip_irq(unsigned long type, unsigned long vector,
 		handle_irq(type);
 		set_irq_regs(old_regs);
 		return;
+	case INT_VT_HOTPLUG:
+		old_regs = set_irq_regs(regs);
+		handle_irq(type);
+		set_irq_regs(old_regs);
+		return;
 	case INT_PC0:
 		perf_irq(PERFMON_PC0, regs);
 		return;
diff --git a/arch/sw_64/include/asm/hcall.h b/arch/sw_64/include/asm/hcall.h
index 8117752b657e..b5438b477c87 100644
--- a/arch/sw_64/include/asm/hcall.h
+++ b/arch/sw_64/include/asm/hcall.h
@@ -18,6 +18,7 @@ enum HCALL_TYPE {
 	HCALL_SWNET		= 20,   /* guest request swnet service */
 	HCALL_SWNET_IRQ		= 21,   /* guest request swnet intr */
 	HCALL_FATAL_ERROR	= 22,   /* guest fatal error, issued by hmcode */
+	HCALL_MEMHOTPLUG	= 23,   /* guest memory hotplug event */
 	NR_HCALL
 };
 
diff --git a/arch/sw_64/include/asm/irq_impl.h b/arch/sw_64/include/asm/irq_impl.h
index 3679793d8b65..b568efef6994 100644
--- a/arch/sw_64/include/asm/irq_impl.h
+++ b/arch/sw_64/include/asm/irq_impl.h
@@ -32,6 +32,7 @@ enum sw64_irq_type {
 	INT_RTC		= 9,
 	INT_FAULT	= 10,
 	INT_VT_SERIAL	= 12,
+	INT_VT_HOTPLUG	= 13,
 	INT_DEV		= 17,
 	INT_NMI		= 18,
 	INT_LEGACY	= 31,
diff --git a/arch/sw_64/include/asm/kvm_asm.h b/arch/sw_64/include/asm/kvm_asm.h
index 4b851682188c..7e2c92ed4574 100644
--- a/arch/sw_64/include/asm/kvm_asm.h
+++ b/arch/sw_64/include/asm/kvm_asm.h
@@ -11,4 +11,7 @@
 #define SW64_KVM_EXIT_RESTART		17
 #define SW64_KVM_EXIT_FATAL_ERROR	22
 
+#ifdef CONFIG_KVM_MEMHOTPLUG
+#define SW64_KVM_EXIT_MEMHOTPLUG	23
+#endif
 #endif /* _ASM_SW64_KVM_ASM_H */
diff --git a/arch/sw_64/include/asm/kvm_host.h b/arch/sw_64/include/asm/kvm_host.h
index e4ebb993153c..6d292c086347 100644
--- a/arch/sw_64/include/asm/kvm_host.h
+++ b/arch/sw_64/include/asm/kvm_host.h
@@ -29,7 +29,7 @@
 #include <asm/kvm_mmio.h>
 
 #define KVM_MAX_VCPUS 64
-#define KVM_USER_MEM_SLOTS 512
+#define KVM_USER_MEM_SLOTS 64
 
 #define KVM_HALT_POLL_NS_DEFAULT 0
 #define KVM_IRQCHIP_NUM_PINS     256
@@ -42,12 +42,16 @@
 #define KVM_PAGES_PER_HPAGE(x)  (KVM_HPAGE_SIZE(x) / PAGE_SIZE)
 
 struct kvm_arch_memory_slot {
-
+	unsigned long host_phys_addr;
+	bool valid;
 };
 
 struct kvm_arch {
 	unsigned long host_phys_addr;
 	unsigned long size;
+
+	/* segment table */
+	unsigned long *seg_pgd;
 };
 
 
@@ -100,6 +104,9 @@ struct kvm_vcpu_stat {
 	u64 halt_poll_invalid;
 };
 
+#ifdef CONFIG_KVM_MEMHOTPLUG
+void vcpu_mem_hotplug(struct kvm_vcpu *vcpu, unsigned long start_addr);
+#endif
 int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
 		int exception_index, struct hcall_args *hargs);
 void vcpu_send_ipi(struct kvm_vcpu *vcpu, int target_vcpuid);
diff --git a/arch/sw_64/include/asm/memory.h b/arch/sw_64/include/asm/memory.h
index d3191165c7b5..b2b7492ae477 100644
--- a/arch/sw_64/include/asm/memory.h
+++ b/arch/sw_64/include/asm/memory.h
@@ -6,6 +6,7 @@
 #include <linux/numa.h>
 #endif
 
+#define MIN_MEMORY_BLOCK_SIZE_VM_MEMHP    (1UL << 30)
 #define NODE0_START	(_TEXT_START - __START_KERNEL_map)
 
 #define MAX_PHYSMEM_BITS	48
diff --git a/arch/sw_64/kvm/Kconfig b/arch/sw_64/kvm/Kconfig
index 230ac526911c..85323b48f564 100644
--- a/arch/sw_64/kvm/Kconfig
+++ b/arch/sw_64/kvm/Kconfig
@@ -42,6 +42,13 @@ config KVM_SW64_HOST
 	  Provides host support for SW64 processors.
 	  To compile this as a module, choose M here.
 
+config KVM_MEMHOTPLUG
+	bool "Memory hotplug support for guest"
+	depends on KVM
+	help
+	  Provides memory hotplug support for SW64 guest.
+
+
 source "drivers/vhost/Kconfig"
 
 endif # VIRTUALIZATION
diff --git a/arch/sw_64/kvm/handle_exit.c b/arch/sw_64/kvm/handle_exit.c
index 0d6806051fc7..5016bc0eddc2 100644
--- a/arch/sw_64/kvm/handle_exit.c
+++ b/arch/sw_64/kvm/handle_exit.c
@@ -34,6 +34,11 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
 	case SW64_KVM_EXIT_IPI:
 		vcpu_send_ipi(vcpu, hargs->arg0);
 		return 1;
+#ifdef CONFIG_KVM_MEMHOTPLUG
+	case SW64_KVM_EXIT_MEMHOTPLUG:
+		vcpu_mem_hotplug(vcpu, hargs->arg0);
+		return 1;
+#endif
 	case SW64_KVM_EXIT_FATAL_ERROR:
 		printk("Guest fatal error: Reason=[%lx], EXC_PC=[%lx], DVA=[%lx]", hargs->arg0, hargs->arg1, hargs->arg2);
 		vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
diff --git a/arch/sw_64/kvm/kvm-sw64.c b/arch/sw_64/kvm/kvm-sw64.c
index 839ee83d57d5..af29d0ca8e7f 100644
--- a/arch/sw_64/kvm/kvm-sw64.c
+++ b/arch/sw_64/kvm/kvm-sw64.c
@@ -56,10 +56,18 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq
 
 extern int __sw64_vcpu_run(struct vcpucb *vcb, struct kvm_regs *regs, struct hcall_args *args);
 
-static unsigned long get_vpcr(unsigned long machine_mem_offset, unsigned long memory_size, unsigned long vpn)
+#ifdef CONFIG_KVM_MEMHOTPLUG
+static u64 get_vpcr_memhp(u64 seg_base, u64 vpn)
 {
-	return (machine_mem_offset >> 23) | ((memory_size >> 23) << 16) | ((vpn & HARDWARE_VPN_MASK) << 44);
+	return seg_base | ((vpn & HARDWARE_VPN_MASK) << 44);
 }
+#else
+static u64 get_vpcr(u64 hpa_base, u64 mem_size, u64 vpn)
+{
+	return (hpa_base >> 23) | ((mem_size >> 23) << 16)
+				| ((vpn & HARDWARE_VPN_MASK) << 44);
+}
+#endif
 
 static unsigned long __get_new_vpn_context(struct kvm_vcpu *vcpu, long cpu)
 {
@@ -212,12 +220,38 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 
 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 {
+#ifdef CONFIG_KVM_MEMHOTPLUG
+	unsigned long *seg_pgd;
+
+	if (kvm->arch.seg_pgd != NULL) {
+		kvm_err("kvm_arch already initialized?\n");
+		return -EINVAL;
+	}
+
+	seg_pgd = alloc_pages_exact(PAGE_SIZE, GFP_KERNEL | __GFP_ZERO);
+	if (!seg_pgd)
+		return -ENOMEM;
+
+	kvm->arch.seg_pgd = seg_pgd;
+#endif
+
 	return 0;
 }
 
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
 	int i;
+#ifdef CONFIG_KVM_MEMHOTPLUG
+	void *seg_pgd = NULL;
+
+	if (kvm->arch.seg_pgd) {
+		seg_pgd = READ_ONCE(kvm->arch.seg_pgd);
+		kvm->arch.seg_pgd = NULL;
+	}
+
+	if (seg_pgd)
+		free_pages_exact(seg_pgd, PAGE_SIZE);
+#endif
 
 	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
 		if (kvm->vcpus[i]) {
@@ -227,7 +261,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
 	}
 
 	atomic_set(&kvm->online_vcpus, 0);
-
 }
 
 long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
@@ -241,6 +274,22 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
 	return 0;
 }
 
+#ifdef CONFIG_KVM_MEMHOTPLUG
+static void setup_segment_table(struct kvm *kvm,
+	struct kvm_memory_slot *memslot, unsigned long addr, size_t size)
+{
+	unsigned long *seg_pgd = kvm->arch.seg_pgd;
+	unsigned int num_of_entry = size >> 30;
+	unsigned long base_hpa = addr >> 30;
+	int i;
+
+	for (i = 0; i < num_of_entry; i++) {
+		*seg_pgd = base_hpa + i;
+		seg_pgd++;
+	}
+}
+#endif
+
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
 		struct kvm_memory_slot *memslot,
 		const struct kvm_userspace_memory_region *mem,
@@ -253,8 +302,15 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 	unsigned long ret;
 	size_t size;
 
-	if (change == KVM_MR_FLAGS_ONLY)
+	if (change == KVM_MR_FLAGS_ONLY || change == KVM_MR_DELETE)
+		return 0;
+
+#ifndef CONFIG_KVM_MEMHOTPLUG
+	if (mem->guest_phys_addr) {
+		pr_info("%s, No KVM MEMHOTPLUG support!\n", __func__);
 		return 0;
+	}
+#endif
 
 	if (test_bit(IO_MARK_BIT, &(mem->guest_phys_addr)))
 		return 0;
@@ -276,7 +332,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 	if (!vm_file) {
 		info = kzalloc(sizeof(struct vmem_info), GFP_KERNEL);
 
-		size = round_up(mem->memory_size, 8<<20);
+		size = round_up(mem->memory_size, 8 << 20);
 		addr = gen_pool_alloc(sw64_kvm_pool, size);
 		if (!addr)
 			return -ENOMEM;
@@ -291,6 +347,18 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 		if (!vma)
 			return -ENOMEM;
 
+#ifdef CONFIG_KVM_MEMHOTPLUG
+		if (memslot->base_gfn == 0x0UL) {
+			setup_segment_table(kvm, memslot, addr, size);
+			kvm->arch.host_phys_addr = (u64)addr;
+			memslot->arch.host_phys_addr = addr;
+		} else {
+			/* used for memory hotplug */
+			memslot->arch.host_phys_addr = addr;
+			memslot->arch.valid = false;
+		}
+#endif
+
 		info->start = addr;
 		info->size = size;
 		vma->vm_private_data = (void *) info;
@@ -308,8 +376,11 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 
 	pr_info("guest phys addr = %#lx, size = %#lx\n",
 			addr, vma->vm_end - vma->vm_start);
+
+#ifndef CONFIG_KVM_MEMHOTPLUG
 	kvm->arch.host_phys_addr = (u64)addr;
-	kvm->arch.size = round_up(mem->memory_size, 8<<20);
+	kvm->arch.size = round_up(mem->memory_size, 8 << 20);
+#endif
 
 	memset(__va(addr), 0, 0x2000000);
 
@@ -463,8 +534,14 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 	/* Set guest vcb */
 	/* vpn will update later when vcpu is running */
 	if (vcpu->arch.vcb.vpcr == 0) {
+#ifndef CONFIG_KVM_MEMHOTPLUG
 		vcpu->arch.vcb.vpcr
 			= get_vpcr(vcpu->kvm->arch.host_phys_addr, vcpu->kvm->arch.size, 0);
+#else
+		unsigned long seg_base = virt_to_phys(vcpu->kvm->arch.seg_pgd);
+
+		vcpu->arch.vcb.vpcr = get_vpcr_memhp(seg_base, 0);
+#endif
 		vcpu->arch.vcb.upcr = 0x7;
 	}
 
@@ -640,6 +717,30 @@ int kvm_dev_ioctl_check_extension(long ext)
 	return r;
 }
 
+#ifdef CONFIG_KVM_MEMHOTPLUG
+void vcpu_mem_hotplug(struct kvm_vcpu *vcpu, unsigned long start_addr)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_memory_slot *slot;
+	unsigned long start_pfn = start_addr >> PAGE_SHIFT;
+
+	kvm_for_each_memslot(slot, kvm_memslots(kvm)) {
+		if (start_pfn == slot->base_gfn) {
+			unsigned long *seg_pgd;
+			unsigned long num_of_entry = slot->npages >> 17;
+			unsigned long base_hpa = slot->arch.host_phys_addr;
+			int i;
+
+			seg_pgd = kvm->arch.seg_pgd + (start_pfn >> 17);
+			for (i = 0; i < num_of_entry; i++) {
+				*seg_pgd = (base_hpa >> 30) + i;
+				seg_pgd++;
+			}
+		}
+	}
+}
+#endif
+
 void vcpu_send_ipi(struct kvm_vcpu *vcpu, int target_vcpuid)
 {
 	struct kvm_vcpu *target_vcpu = kvm_get_vcpu(vcpu->kvm, target_vcpuid);
diff --git a/arch/sw_64/mm/init.c b/arch/sw_64/mm/init.c
index 16d3da7beebe..82f2414ef7f7 100644
--- a/arch/sw_64/mm/init.c
+++ b/arch/sw_64/mm/init.c
@@ -10,6 +10,7 @@
 #include <linux/memblock.h>
 #include <linux/swiotlb.h>
 #include <linux/acpi.h>
+#include <linux/memory.h>
 
 #include <asm/mmu_context.h>
 
@@ -33,6 +34,14 @@ static pud_t vmalloc_pud[1024]	__attribute__((__aligned__(PAGE_SIZE)));
 static phys_addr_t mem_start;
 static phys_addr_t mem_size_limit;
 
+unsigned long memory_block_size_bytes(void)
+{
+	if (is_in_guest())
+		return MIN_MEMORY_BLOCK_SIZE_VM_MEMHP;
+	else
+		return MIN_MEMORY_BLOCK_SIZE;
+}
+
 static int __init setup_mem_size(char *p)
 {
 	char *oldp;
diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
index fafa8b0d8099..140716083ab8 100644
--- a/drivers/misc/Kconfig
+++ b/drivers/misc/Kconfig
@@ -351,6 +351,14 @@ config HMC6352
 	  This driver provides support for the Honeywell HMC6352 compass,
 	  providing configuration and heading data via sysfs.
 
+config SUNWAY_GED
+	tristate "sunway generic device driver for memhotplug"
+	depends on SW64
+	depends on MEMORY_HOTPLUG
+	help
+	  This driver provides support for sunway generic device driver for
+	  memhotplug, providing configuration and heading data via sysfs.
+
 config DS1682
 	tristate "Dallas DS1682 Total Elapsed Time Recorder with Alarm"
 	depends on I2C
diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile
index d23231e73330..3615763234a6 100644
--- a/drivers/misc/Makefile
+++ b/drivers/misc/Makefile
@@ -34,6 +34,7 @@ obj-$(CONFIG_SENSORS_TSL2550)	+= tsl2550.o
 obj-$(CONFIG_DS1682)		+= ds1682.o
 obj-$(CONFIG_C2PORT)		+= c2port/
 obj-$(CONFIG_HMC6352)		+= hmc6352.o
+obj-$(CONFIG_SUNWAY_GED)	+= sunway-ged.o
 obj-y				+= eeprom/
 obj-y				+= cb710/
 obj-$(CONFIG_VMWARE_BALLOON)	+= vmw_balloon.o
diff --git a/drivers/misc/sunway-ged.c b/drivers/misc/sunway-ged.c
new file mode 100644
index 000000000000..b4e4ca315852
--- /dev/null
+++ b/drivers/misc/sunway-ged.c
@@ -0,0 +1,253 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/* Generic Event Device for ACPI. */
+
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/list.h>
+#include <linux/platform_device.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_platform.h>
+
+#define OFFSET_START_ADDR	0
+#define OFFSET_LENGTH		8
+#define OFFSET_STATUS		16
+#define OFFSET_SLOT		24
+
+/* Memory hotplug event */
+#define SUNWAY_MEMHOTPLUG_ADD		0x1
+#define SUNWAY_MEMHOTPLUG_REMOVE	0x2
+
+struct sunway_memory_device {
+	struct sunway_ged_device *device;
+	unsigned int state;     /* State of the memory device */
+	struct list_head list;
+
+	u64 start_addr;         /* Memory Range start physical addr */
+	u64 length;             /* Memory Range length */
+	u64 slot;		/* Memory Range slot */
+	unsigned int enabled:1;
+};
+
+struct sunway_ged_device {
+	struct device *dev;
+	void __iomem *membase;
+	void *driver_data;
+	spinlock_t lock;
+	struct list_head dev_list;
+};
+
+static int sunway_memory_enable_device(struct sunway_memory_device *mem_device)
+{
+	int num_enabled = 0;
+	int result = 0;
+
+	if (mem_device->enabled) {	/* just sanity check...*/
+		num_enabled++;
+		goto out;
+	}
+
+	/*
+	 * If the memory block size is zero, please ignore it.
+	 * Don't try to do the following memory hotplug flowchart.
+	 */
+	if (!mem_device->length)
+		goto out;
+
+	lock_device_hotplug();
+	/* suppose node = 0, fix me! */
+	result = __add_memory(0, mem_device->start_addr, mem_device->length);
+	unlock_device_hotplug();
+	/*
+	 * If the memory block has been used by the kernel, add_memory()
+	 * returns -EEXIST. If add_memory() returns the other error, it
+	 * means that this memory block is not used by the kernel.
+	 */
+	if (result && result != -EEXIST)
+		goto out;
+
+	mem_device->enabled = 1;
+
+	/*
+	 * Add num_enable even if add_memory() returns -EEXIST, so the
+	 * device is bound to this driver.
+	 */
+	num_enabled++;
+out:
+	if (!num_enabled) {
+		dev_err(mem_device->device->dev, "add_memory failed\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int sunway_memory_get_meminfo(struct sunway_memory_device *mem_device)
+{
+	struct sunway_ged_device *geddev;
+
+	if (!mem_device)
+		return -EINVAL;
+
+	if (mem_device->enabled)
+		return 0;
+
+	geddev = mem_device->device;
+
+	mem_device->start_addr = readq(geddev->membase + OFFSET_START_ADDR);
+	mem_device->length = readq(geddev->membase + OFFSET_LENGTH);
+
+	return 0;
+}
+
+static void sunway_memory_device_remove(struct sunway_ged_device *device)
+{
+	struct sunway_memory_device *mem_dev, *n;
+	unsigned long start_addr, length, slot;
+
+	if (!device)
+		return;
+
+	start_addr = readq(device->membase + OFFSET_START_ADDR);
+	length = readq(device->membase + OFFSET_LENGTH);
+	slot = readq(device->membase + OFFSET_SLOT);
+
+	list_for_each_entry_safe(mem_dev, n, &device->dev_list, list) {
+		if (!mem_dev->enabled)
+			continue;
+
+		if ((start_addr == mem_dev->start_addr) &&
+				(length == mem_dev->length)) {
+			/* suppose node = 0, fix me! */
+			remove_memory(0, start_addr, length);
+			list_del(&mem_dev->list);
+			kfree(mem_dev);
+		}
+	}
+
+	writeq(slot, device->membase + OFFSET_SLOT);
+}
+
+static int sunway_memory_device_add(struct sunway_ged_device *device)
+{
+	struct sunway_memory_device *mem_device;
+	int result;
+
+	if (!device)
+		return -EINVAL;
+
+	mem_device = kzalloc(sizeof(struct sunway_memory_device), GFP_KERNEL);
+	if (!mem_device)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&mem_device->list);
+	mem_device->device = device;
+
+	/* Get the range from the IO */
+	mem_device->start_addr = readq(device->membase + OFFSET_START_ADDR);
+	mem_device->length = readq(device->membase + OFFSET_LENGTH);
+	mem_device->slot = readq(device->membase + OFFSET_SLOT);
+
+	result = sunway_memory_enable_device(mem_device);
+	if (result) {
+		dev_err(device->dev, "sunway_memory_enable_device() error\n");
+		sunway_memory_device_remove(device);
+
+		return result;
+	}
+
+	list_add_tail(&mem_device->list, &device->dev_list);
+	dev_dbg(device->dev, "Memory device configured\n");
+
+	hcall(HCALL_MEMHOTPLUG, mem_device->start_addr, 0, 0);
+
+	return 1;
+}
+
+static irqreturn_t sunwayged_ist(int irq, void *data)
+{
+	struct sunway_ged_device *sunwayged_dev = data;
+	unsigned int status;
+
+	status = readl(sunwayged_dev->membase + OFFSET_STATUS);
+
+	/* through IO status to add or remove memory device  */
+	if (status & SUNWAY_MEMHOTPLUG_ADD)
+		sunway_memory_device_add(sunwayged_dev);
+
+	if (status & SUNWAY_MEMHOTPLUG_REMOVE)
+		sunway_memory_device_remove(sunwayged_dev);
+
+	return IRQ_HANDLED;
+}
+
+static irqreturn_t sunwayged_irq_handler(int irq, void *data)
+{
+	return IRQ_WAKE_THREAD;
+}
+
+static int sunwayged_probe(struct platform_device *pdev)
+{
+	struct resource *regs = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	int irq = platform_get_irq(pdev, 0);
+	struct sunway_ged_device *geddev;
+	struct device *dev;
+	int irqflags;
+
+	if (!regs) {
+		dev_err(dev, "no registers defined\n");
+		return -EINVAL;
+	}
+
+	geddev = devm_kzalloc(&pdev->dev, sizeof(*geddev), GFP_KERNEL);
+	if (!geddev)
+		return -ENOMEM;
+
+	spin_lock_init(&geddev->lock);
+	geddev->membase = devm_ioremap(&pdev->dev,
+					regs->start, resource_size(regs));
+	if (!geddev->membase)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&geddev->dev_list);
+	geddev->dev = &pdev->dev;
+	irqflags = IRQF_SHARED;
+
+	if (request_threaded_irq(irq, sunwayged_irq_handler, sunwayged_ist,
+				irqflags, "SUNWAY:Ged", geddev)) {
+		dev_err(dev, "failed to setup event handler for irq %u\n", irq);
+
+		return -EINVAL;
+	}
+
+	platform_set_drvdata(pdev, geddev);
+
+	return 0;
+}
+
+static int sunwayged_remove(struct platform_device *pdev)
+{
+	return 0;
+}
+
+static const struct of_device_id sunwayged_of_match[] = {
+	{.compatible = "sw6,sunway-ged", },
+	{ }
+};
+MODULE_DEVICE_TABLE(of, sunwayged_of_match);
+
+static struct platform_driver sunwayged_platform_driver = {
+	.driver = {
+		.name		= "sunway-ged",
+		.of_match_table	= sunwayged_of_match,
+	},
+	.probe			= sunwayged_probe,
+	.remove			= sunwayged_remove,
+};
+module_platform_driver(sunwayged_platform_driver);
+
+MODULE_AUTHOR("Lu Feifei");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Sunway ged driver");
-- 
Gitee


From 2d3b928eac96fa601080afba9282cac711edbcef Mon Sep 17 00:00:00 2001
From: Lu Feifei <lufeifei@wxiat.com>
Date: Wed, 15 Jun 2022 14:59:51 +0800
Subject: [PATCH 136/674] sw64: add a misc device to chip_vt.dts for
 memory-hotplug

Sunway inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I56WV8

--------------------------------

This is an emulated device used to communicate with the memory-hotplug
device on the qemu side, so the device is added to device tree for
guest os.

Signed-off-by: Lu Feifei <lufeifei@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/boot/dts/chip_vt.dts | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/arch/sw_64/boot/dts/chip_vt.dts b/arch/sw_64/boot/dts/chip_vt.dts
index f0bcf1db1d08..abad29dee97e 100644
--- a/arch/sw_64/boot/dts/chip_vt.dts
+++ b/arch/sw_64/boot/dts/chip_vt.dts
@@ -34,5 +34,17 @@ uart: serial0@8801 {
 			clock-frequency = <24000000>;
 			status = "okay";
 		};
+		misc: misc0@8036 {
+			#address-cells = <2>;
+			#size-cells = <2>;
+			compatible = "sw6,sunway-ged";
+			reg = <0x8036 0x0 0x0 0x20>;
+			interrupt-parent=<&intc>;
+			interrupts = <13>;
+			reg-shift = <0>;
+			reg-io-width = <8>;
+			clock-frequency = <24000000>;
+			status = "okay";
+		};
 	};
 };
-- 
Gitee


From edb81706ec0dc4bc13c81b7ce08febe09bf2a471 Mon Sep 17 00:00:00 2001
From: Yang Qiang <yangqiang@wxiat.com>
Date: Mon, 20 Jun 2022 14:23:56 +0800
Subject: [PATCH 137/674] sw64: pci: fix maximum bus number for pci scan

Sunway inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GDR1

--------------------------------

When PCI devices enable SRIOV function, the previous policy that uses
pci_find_bus to locate PCI bus from a given domain and bus number may
return false PCI bus number, so this patch fixes this patch.

Signed-off-by: Yang Qiang <yangqiang@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/chip/chip3/chip.c | 13 +++++++------
 arch/sw_64/kernel/pci.c      | 18 +++++++++---------
 2 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/arch/sw_64/chip/chip3/chip.c b/arch/sw_64/chip/chip3/chip.c
index 02b369b2b37b..84ca7ffcb2ef 100644
--- a/arch/sw_64/chip/chip3/chip.c
+++ b/arch/sw_64/chip/chip3/chip.c
@@ -178,18 +178,20 @@ int chip_pcie_configure(struct pci_controller *hose)
 	struct pci_bus *bus, *top;
 	struct list_head *next;
 	unsigned int max_read_size, smallest_max_payload;
-	int max_payloadsize, iov_bus = 0;
+	int max_payloadsize;
 	unsigned long rc_index, node;
 	unsigned long piuconfig0, value;
 	unsigned int pcie_caps_offset;
 	unsigned int rc_conf_value;
 	u16 devctl, new_values;
 	bool rc_ari_disabled = false, found = false;
+	unsigned char bus_max_num;
 
 	node = hose->node;
 	rc_index = hose->index;
 	smallest_max_payload = read_rc_conf(node, rc_index, RC_EXP_DEVCAP);
 	smallest_max_payload &= PCI_EXP_DEVCAP_PAYLOAD;
+	bus_max_num = hose->busn_space->start;
 
 	top = hose->bus;
 	bus = top;
@@ -200,6 +202,7 @@ int chip_pcie_configure(struct pci_controller *hose)
 			/* end of this bus, go up or finish */
 			if (bus == top)
 				break;
+
 			next = bus->self->bus_list.next;
 			bus = bus->self->bus;
 			continue;
@@ -224,10 +227,8 @@ int chip_pcie_configure(struct pci_controller *hose)
 			}
 		}
 
-#ifdef CONFIG_PCI_IOV
-		if (dev->is_physfn)
-			iov_bus += dev->sriov->max_VF_buses - dev->bus->number;
-#endif
+		if (bus->busn_res.end > bus_max_num)
+			bus_max_num = bus->busn_res.end;
 
 		/* Query device PCIe capability register  */
 		pcie_caps_offset = dev->pcie_cap;
@@ -306,7 +307,7 @@ int chip_pcie_configure(struct pci_controller *hose)
 		pci_write_config_word(dev, pcie_caps_offset + PCI_EXP_DEVCTL, devctl);
 	}
 
-	return iov_bus;
+	return bus_max_num;
 }
 
 static int chip3_check_pci_vt_linkup(unsigned long node, unsigned long index)
diff --git a/arch/sw_64/kernel/pci.c b/arch/sw_64/kernel/pci.c
index 7cc8b7d2d43b..fcc6e0f02a93 100644
--- a/arch/sw_64/kernel/pci.c
+++ b/arch/sw_64/kernel/pci.c
@@ -221,7 +221,7 @@ void __init common_init_pci(void)
 	struct pci_bus *bus;
 	unsigned int init_busnr;
 	int need_domain_info = 0;
-	int ret, iov_bus;
+	int ret;
 	unsigned long offset;
 
 	/* Scan all of the recorded PCI controllers. */
@@ -257,20 +257,20 @@ void __init common_init_pci(void)
 
 		bus = hose->bus = bridge->bus;
 		hose->need_domain_info = need_domain_info;
-		while (pci_find_bus(pci_domain_nr(bus), last_bus))
-			last_bus++;
 
 		if (is_in_host())
-			iov_bus = chip_pcie_configure(hose);
-		last_bus += iov_bus;
+			last_bus = chip_pcie_configure(hose);
+		else
+			while (pci_find_bus(pci_domain_nr(bus), last_bus))
+				last_bus++;
 
-		hose->last_busno = hose->busn_space->end = last_bus - 1;
+		hose->last_busno = hose->busn_space->end = last_bus;
 		init_busnr = read_rc_conf(hose->node, hose->index, RC_PRIMARY_BUS);
 		init_busnr &= ~(0xff << 16);
-		init_busnr |= (last_bus - 1) << 16;
+		init_busnr |= last_bus << 16;
 		write_rc_conf(hose->node, hose->index, RC_PRIMARY_BUS, init_busnr);
-		pci_bus_update_busn_res_end(bus, last_bus - 1);
-
+		pci_bus_update_busn_res_end(bus, last_bus);
+		last_bus++;
 	}
 
 	pcibios_claim_console_setup();
-- 
Gitee


From b6bc5e6d316eb6e70a88ca730d2e099a8bad1bf7 Mon Sep 17 00:00:00 2001
From: Hang Xiaoqian <hangxiaoqian@wxiat.com>
Date: Mon, 20 Jun 2022 16:54:38 +0800
Subject: [PATCH 138/674] sw64: remove unaligned count

Sunway inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GFNY

--------------------------------

In the case of highly concurrent unaligned processing, unaligned
count may cause severe cache thrashing. However, unaligned count
is not required, so remove it to reduce cache thrashing.

Signed-off-by: Hang Xiaoqian <hangxiaoqian@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/kernel/Makefile    |  2 +-
 arch/sw_64/kernel/traps.c     | 13 --------
 arch/sw_64/kernel/unaligned.c | 56 -----------------------------------
 3 files changed, 1 insertion(+), 70 deletions(-)
 delete mode 100644 arch/sw_64/kernel/unaligned.c

diff --git a/arch/sw_64/kernel/Makefile b/arch/sw_64/kernel/Makefile
index c1e461e0ac56..94b63d6a286b 100644
--- a/arch/sw_64/kernel/Makefile
+++ b/arch/sw_64/kernel/Makefile
@@ -31,7 +31,7 @@ obj-$(CONFIG_HIBERNATION) += hibernate_asm.o hibernate.o
 obj-$(CONFIG_AUDIT)     += audit.o
 obj-$(CONFIG_PCI) += pci_common.o
 obj-$(CONFIG_RELOCATABLE)   += relocate.o
-obj-$(CONFIG_DEBUG_FS)	+= unaligned.o segvdbg.o
+obj-$(CONFIG_DEBUG_FS)	+= segvdbg.o
 obj-$(CONFIG_JUMP_LABEL) += jump_label.o
 
 ifndef CONFIG_PCI
diff --git a/arch/sw_64/kernel/traps.c b/arch/sw_64/kernel/traps.c
index a61c851967a9..d656eca5f961 100644
--- a/arch/sw_64/kernel/traps.c
+++ b/arch/sw_64/kernel/traps.c
@@ -320,11 +320,6 @@ do_entIF(unsigned long inst_type, struct pt_regs *regs)
 	force_sig_fault(SIGILL, ILL_ILLOPC, (void __user *)regs->pc, 0);
 }
 
-struct unaligned_stat {
-	unsigned long count, va, pc;
-} unaligned[2];
-
-
 asmlinkage void
 do_entUna(void *va, unsigned long opcode, unsigned long reg,
 	  struct pt_regs *regs)
@@ -334,10 +329,6 @@ do_entUna(void *va, unsigned long opcode, unsigned long reg,
 	unsigned long pc = regs->pc - 4;
 	const struct exception_table_entry *fixup;
 
-	unaligned[0].count++;
-	unaligned[0].va = (unsigned long) va;
-	unaligned[0].pc = pc;
-
 	/*
 	 * We don't want to use the generic get/put unaligned macros as
 	 * we want to trap exceptions. Only if we actually get an
@@ -666,10 +657,6 @@ do_entUnaUser(void __user *va, unsigned long opcode,
 	if ((unsigned long)va >= TASK_SIZE)
 		goto give_sigsegv;
 
-	++unaligned[1].count;
-	unaligned[1].va = (unsigned long)va;
-	unaligned[1].pc = regs->pc - 4;
-
 	if ((1L << opcode) & OP_INT_MASK) {
 		/* it's an integer load/store */
 		if (reg < 30) {
diff --git a/arch/sw_64/kernel/unaligned.c b/arch/sw_64/kernel/unaligned.c
deleted file mode 100644
index a1bbdab4a266..000000000000
--- a/arch/sw_64/kernel/unaligned.c
+++ /dev/null
@@ -1,56 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-/*
- * Copyright (C) 2020 Mao Minkai
- * Author: Mao Minkai
- *
- * This code is taken from arch/mips/kernel/segment.c
- *	Copyright (C) 2013 Imagination Technologies Ltd.
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- */
-
-#include <asm/unaligned.h>
-#include <asm/debug.h>
-
-static int show_unaligned(struct seq_file *sf, void *v)
-{
-	extern struct unaligned_stat {
-		unsigned long count, va, pc;
-	} unaligned[2];
-
-	seq_printf(sf, "kernel unaligned acc\t: %ld (pc=%lx, va=%lx)\n", unaligned[0].count, unaligned[0].pc, unaligned[0].va);
-	seq_printf(sf, "user unaligned acc\t: %ld (pc=%lx, va=%lx)\n", unaligned[1].count, unaligned[1].pc, unaligned[1].va);
-
-	return 0;
-}
-
-static int unaligned_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, show_unaligned, NULL);
-}
-
-static const struct file_operations unaligned_fops = {
-	.open		= unaligned_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
-static int __init unaligned_info(void)
-{
-	struct dentry *unaligned;
-
-	if (!sw64_debugfs_dir)
-		return -ENODEV;
-
-	unaligned = debugfs_create_file("unaligned", S_IRUGO,
-				       sw64_debugfs_dir, NULL,
-				       &unaligned_fops);
-	if (!unaligned)
-		return -ENOMEM;
-	return 0;
-}
-device_initcall(unaligned_info);
-- 
Gitee


From 993a93c8c4a7aa088c0157bddb10fda388fcdccb Mon Sep 17 00:00:00 2001
From: Mao Minkai <maominkai@wxiat.com>
Date: Mon, 20 Jun 2022 16:58:09 +0800
Subject: [PATCH 139/674] sw64: fix simd version of memset

Sunway inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GFOO

--------------------------------

In commit 20b900c4b7fb ("sw64: optimize simd version of memcpy and
memset"), _nc instructions are used to improve performance, but the
position of memb instruction in memset is wrong. Fix it.

Fixes: 20b900c4b7fb ("sw64: optimize simd version of memcpy and memset")
Signed-off-by: Mao Minkai <maominkai@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/lib/deep-memset.S | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/sw_64/lib/deep-memset.S b/arch/sw_64/lib/deep-memset.S
index ed2171c56d4d..7fbd529c72a8 100644
--- a/arch/sw_64/lib/deep-memset.S
+++ b/arch/sw_64/lib/deep-memset.S
@@ -99,12 +99,11 @@ $mod32_aligned:
 	.align 5
 $mod32_loop_nc:
 	subl	$18, 64, $18
-	blt	$18, $mod32_tail
+	blt	$18, $mod32_tail_memb
 	vstd_nc	$f10, 0($16)
 	vstd_nc	$f10, 32($16)
 	addl	$16, 64, $16
 	br	$31, $mod32_loop_nc
-	memb			# required for _nc store instructions
 
 	.align 5
 $mod32_loop:
@@ -115,6 +114,8 @@ $mod32_loop:
 	addl	$16, 64, $16
 	br	$31, $mod32_loop
 
+$mod32_tail_memb:
+	memb			# required for _nc store instructions
 $mod32_tail:
 	vldd	$f10, 0($4)
 	addl	$sp, 64, $sp
-- 
Gitee


From e73f0bffb26c59b5b3ff024b4194a3c6fe34a8dd Mon Sep 17 00:00:00 2001
From: Zhu Donghong <zhudonghong@wxiat.com>
Date: Tue, 21 Jun 2022 14:47:33 +0800
Subject: [PATCH 140/674] sw64: deliver a hot reset to Root Complex with plugin
 JMicron 585 card

Sunway inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GFPA

--------------------------------

On SW64 platform, JMicron 585 SATA card directly connected to Root
Complex may raise DMA failure when reboot, so we force a hot reset
to Root Complex to fix can not access JMicron 585 SATA card.

Signed-off-by: Zhu Donghong <zhudonghong@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/platform/platform_xuelang.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/arch/sw_64/platform/platform_xuelang.c b/arch/sw_64/platform/platform_xuelang.c
index 63a4b163e43e..ae8179b53b4c 100644
--- a/arch/sw_64/platform/platform_xuelang.c
+++ b/arch/sw_64/platform/platform_xuelang.c
@@ -26,9 +26,25 @@ extern void cpld_write(uint8_t slave_addr, uint8_t reg, uint8_t data);
 
 static void xuelang_kill_arch(int mode)
 {
+	struct pci_dev *pdev;
+	struct pci_controller *hose;
+	int val;
+
 	if (is_in_host()) {
 		switch (mode) {
 		case LINUX_REBOOT_CMD_RESTART:
+			pdev = pci_get_device(PCI_VENDOR_ID_JMICRON,
+					      0x0585, NULL);
+			if (pdev) {
+				hose = (struct pci_controller *)pdev->sysdata;
+				val = read_rc_conf(hose->node, hose->index,
+						   RC_PORT_LINK_CTL);
+				write_rc_conf(hose->node, hose->index,
+					      RC_PORT_LINK_CTL, val | 0x8);
+				write_rc_conf(hose->node, hose->index,
+					      RC_PORT_LINK_CTL, val);
+			}
+
 			cpld_write(0x64, 0x00, 0xc3);
 			mb();
 			break;
-- 
Gitee


From 9e0b719ccc16ce561eb8247e87f8a15f00ffaccf Mon Sep 17 00:00:00 2001
From: He Sheng <hesheng@wxiat.com>
Date: Wed, 21 Jul 2021 13:36:27 +0800
Subject: [PATCH 141/674] sw64: rename debugfs dir sw_64 to sw64

Sunway inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I56OLG

--------------------------------

Signed-off-by: He Sheng <hesheng@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/kernel/setup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/sw_64/kernel/setup.c b/arch/sw_64/kernel/setup.c
index 85d172c32239..0e93643539d3 100644
--- a/arch/sw_64/kernel/setup.c
+++ b/arch/sw_64/kernel/setup.c
@@ -976,7 +976,7 @@ static int __init debugfs_sw64(void)
 {
 	struct dentry *d;
 
-	d = debugfs_create_dir("sw_64", NULL);
+	d = debugfs_create_dir("sw64", NULL);
 	if (!d)
 		return -ENOMEM;
 	sw64_debugfs_dir = d;
-- 
Gitee


From 0115f679cddf65c65822b61208be17a141a72cfc Mon Sep 17 00:00:00 2001
From: Wu Liliu <wuliliu@wxiat.com>
Date: Wed, 15 Jun 2022 11:00:09 +0800
Subject: [PATCH 142/674] sw64: reimplement show_stack() method

Sunway inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GFQ0

--------------------------------

It used to show stack information by SP, that means it has nothing to
do with task, which is unexpected. In order to show stack of task, we
improve the implementation of show_stack().

However, the original task struct does not have sp, so we add it into
thread_struct and do something necessary in __switch_to.

Meanwhile, walk_stackframe() can be reused, so refactor related codes
for better support.

Signed-off-by: Wu Liliu <wuliliu@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/include/asm/processor.h  |  1 +
 arch/sw_64/include/asm/stacktrace.h |  4 +-
 arch/sw_64/kernel/asm-offsets.c     |  1 +
 arch/sw_64/kernel/entry.S           |  6 +++
 arch/sw_64/kernel/perf_event.c      | 12 ++---
 arch/sw_64/kernel/process.c         |  1 +
 arch/sw_64/kernel/stacktrace.c      | 78 +++++++++++++++++++++++------
 arch/sw_64/kernel/traps.c           | 58 ++++-----------------
 8 files changed, 85 insertions(+), 76 deletions(-)

diff --git a/arch/sw_64/include/asm/processor.h b/arch/sw_64/include/asm/processor.h
index 08e8cc8e5428..886f28635dd4 100644
--- a/arch/sw_64/include/asm/processor.h
+++ b/arch/sw_64/include/asm/processor.h
@@ -45,6 +45,7 @@ struct thread_struct {
 	struct user_fpsimd_state fpstate;
 	/* Callee-saved registers */
 	unsigned long ra;
+	unsigned long sp;
 	unsigned long s[7];	/* s0 ~ s6 */
 };
 #define INIT_THREAD  { }
diff --git a/arch/sw_64/include/asm/stacktrace.h b/arch/sw_64/include/asm/stacktrace.h
index 813aa5e7a91d..ed691a72573b 100644
--- a/arch/sw_64/include/asm/stacktrace.h
+++ b/arch/sw_64/include/asm/stacktrace.h
@@ -32,8 +32,8 @@ struct stack_frame {
 };
 
 extern int unwind_frame(struct task_struct *tsk, struct stackframe *frame);
-extern void walk_stackframe(struct task_struct *tsk, struct stackframe *frame,
-			    int (*fn)(struct stackframe *, void *), void *data);
+extern void walk_stackframe(struct task_struct *tsk, struct pt_regs *regs,
+			    int (*fn)(unsigned long, void *), void *data);
 
 static inline bool on_task_stack(struct task_struct *tsk, unsigned long sp,
 				struct stack_info *info)
diff --git a/arch/sw_64/kernel/asm-offsets.c b/arch/sw_64/kernel/asm-offsets.c
index 0c7e1e26eb05..9e6c338a5edd 100644
--- a/arch/sw_64/kernel/asm-offsets.c
+++ b/arch/sw_64/kernel/asm-offsets.c
@@ -213,6 +213,7 @@ void foo(void)
 	OFFSET(TASK_THREAD_FPCR, task_struct, thread.fpstate.fpcr);
 	BLANK();
 	OFFSET(TASK_THREAD_RA, task_struct, thread.ra);
+	OFFSET(TASK_THREAD_SP, task_struct, thread.sp);
 	OFFSET(TASK_THREAD_S0, task_struct, thread.s[0]);
 	OFFSET(TASK_THREAD_S1, task_struct, thread.s[1]);
 	OFFSET(TASK_THREAD_S2, task_struct, thread.s[2]);
diff --git a/arch/sw_64/kernel/entry.S b/arch/sw_64/kernel/entry.S
index 977c774ad799..f79c9a6ddf36 100644
--- a/arch/sw_64/kernel/entry.S
+++ b/arch/sw_64/kernel/entry.S
@@ -398,6 +398,7 @@ __switch_to:
 	.prologue 0
 	/* Save context into prev->thread */
 	stl	$26, TASK_THREAD_RA($17)
+	stl	$30, TASK_THREAD_SP($17)
 	stl	$9, TASK_THREAD_S0($17)
 	stl	$10, TASK_THREAD_S1($17)
 	stl	$11, TASK_THREAD_S2($17)
@@ -415,6 +416,11 @@ __switch_to:
 	ldl	$14, TASK_THREAD_S5($18)
 	ldl	$15, TASK_THREAD_S6($18)
 	sys_call HMC_swpctx
+	/*
+	 * SP has been saved and restored by HMC_swpctx,
+	 * and restore it again here for future expansion.
+	 */
+	ldl     $30, TASK_THREAD_SP($18)
 	ldi	$8, 0x3fff
 	bic	$sp, $8, $8
 	mov	$17, $0
diff --git a/arch/sw_64/kernel/perf_event.c b/arch/sw_64/kernel/perf_event.c
index 52ec34e33269..6e344239917b 100644
--- a/arch/sw_64/kernel/perf_event.c
+++ b/arch/sw_64/kernel/perf_event.c
@@ -761,24 +761,18 @@ void perf_callchain_user(struct perf_callchain_entry_ctx *entry,
  * whist unwinding the stackframe and is like a subroutine return so we use
  * the PC.
  */
-static int callchain_trace(struct stackframe *frame, void *data)
+static int callchain_trace(unsigned long pc, void *data)
 {
 	struct perf_callchain_entry_ctx *entry = data;
 
-	perf_callchain_store(entry, frame->pc);
-
+	perf_callchain_store(entry, pc);
 	return 0;
 }
 
 void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
 			   struct pt_regs *regs)
 {
-	struct stackframe frame;
-
-	frame.fp = regs->r15;
-	frame.pc = regs->pc;
-
-	walk_stackframe(current, &frame, callchain_trace, entry);
+	walk_stackframe(NULL, regs, callchain_trace, entry);
 }
 
 /*
diff --git a/arch/sw_64/kernel/process.c b/arch/sw_64/kernel/process.c
index 7a7578d530c6..da721d4266ee 100644
--- a/arch/sw_64/kernel/process.c
+++ b/arch/sw_64/kernel/process.c
@@ -169,6 +169,7 @@ copy_thread(unsigned long clone_flags, unsigned long usp,
 
 	childti->pcb.ksp = (unsigned long) childregs;
 	childti->pcb.flags = 7;	/* set FEN, clear everything else */
+	p->thread.sp = (unsigned long) childregs;
 
 	if (unlikely(p->flags & PF_KTHREAD)) {
 		/* kernel thread */
diff --git a/arch/sw_64/kernel/stacktrace.c b/arch/sw_64/kernel/stacktrace.c
index 2671331717ba..4e9acf99aaab 100644
--- a/arch/sw_64/kernel/stacktrace.c
+++ b/arch/sw_64/kernel/stacktrace.c
@@ -10,6 +10,8 @@
 #include <linux/sched/debug.h>
 #include <linux/ftrace.h>
 #include <linux/perf_event.h>
+#include <linux/kallsyms.h>
+
 #include <asm/stacktrace.h>
 
 /*
@@ -59,40 +61,84 @@ int unwind_frame(struct task_struct *tsk, struct stackframe *frame)
 }
 EXPORT_SYMBOL_GPL(unwind_frame);
 
-void walk_stackframe(struct task_struct *tsk, struct stackframe *frame,
-		     int (*fn)(struct stackframe *, void *), void *data)
+void walk_stackframe(struct task_struct *tsk, struct pt_regs *regs,
+		     int (*fn)(unsigned long, void *), void *data)
 {
+	unsigned long pc, fp;
+
+	struct stackframe frame;
+
+	if (regs) {
+		pc = regs->pc;
+		fp = regs->r15;
+	} else if (tsk == current || tsk == NULL) {
+		fp = (unsigned long)__builtin_frame_address(0);
+		pc = (unsigned long)walk_stackframe;
+	} else {
+		fp = tsk->thread.s[6];
+		pc = tsk->thread.ra;
+	}
+
+	if (!__kernel_text_address(pc) || fn(pc, data))
+		return;
+
+	frame.pc = pc;
+	frame.fp = fp;
 	while (1) {
 		int ret;
-
-		if (fn(frame, data))
-			break;
-		ret = unwind_frame(tsk, frame);
+		ret = unwind_frame(tsk, &frame);
 		if (ret < 0)
 			break;
+
+		if (fn(frame.pc, data))
+			break;
 	}
 }
 EXPORT_SYMBOL_GPL(walk_stackframe);
 
 #else /* !CONFIG_FRAME_POINTER */
-void walk_stackframe(struct task_struct *tsk, struct stackframe *frame,
-		     int (*fn)(struct stackframe *, void *), void *data)
+void walk_stackframe(struct task_struct *tsk, struct pt_regs *regs,
+		     int (*fn)(unsigned long, void *), void *data)
 {
-	unsigned long *sp = (unsigned long *)current_thread_info()->pcb.ksp;
-	unsigned long addr;
-	struct perf_callchain_entry_ctx *entry = data;
+	unsigned long *ksp;
+	unsigned long sp, pc;
+
+	if (regs) {
+		sp = (unsigned long)(regs+1);
+		pc = regs->pc;
+	} else if (tsk == current || tsk == NULL) {
+		register unsigned long current_sp __asm__ ("$30");
+		sp = current_sp;
+		pc = (unsigned long)walk_stackframe;
+	} else {
+		sp = tsk->thread.sp;
+		pc = tsk->thread.ra;
+	}
 
-	perf_callchain_store(entry, frame->pc);
-	while (!kstack_end(sp) && entry->nr < entry->max_stack) {
-		addr = *sp++;
-		if (__kernel_text_address(addr))
-			perf_callchain_store(entry, addr);
+	ksp = (unsigned long *)sp;
+
+	while (!kstack_end(ksp)) {
+		if (__kernel_text_address(pc) && fn(pc, data))
+			break;
+		pc = (*ksp++) - 0x4;
 	}
 }
 EXPORT_SYMBOL_GPL(walk_stackframe);
 
 #endif/* CONFIG_FRAME_POINTER */
 
+static int print_address_trace(unsigned long pc, void *data)
+{
+	print_ip_sym((const char *)data, pc);
+	return 0;
+}
+
+void show_stack(struct task_struct *task, unsigned long *sp, const char *loglvl)
+{
+	pr_info("Trace:\n");
+	walk_stackframe(task, NULL, print_address_trace, (void *)loglvl);
+}
+
 /*
  * Save stack-backtrace addresses into a stack_trace buffer.
  */
diff --git a/arch/sw_64/kernel/traps.c b/arch/sw_64/kernel/traps.c
index d656eca5f961..4e95cab13daa 100644
--- a/arch/sw_64/kernel/traps.c
+++ b/arch/sw_64/kernel/traps.c
@@ -12,13 +12,20 @@
 #include <linux/extable.h>
 #include <linux/perf_event.h>
 #include <linux/kdebug.h>
+#include <linux/sched.h>
 #include <linux/kexec.h>
+#include <linux/kallsyms.h>
+#include <linux/sched/task_stack.h>
+#include <linux/sched/debug.h>
 
 #include <asm/gentrap.h>
 #include <asm/mmu_context.h>
 #include <asm/fpu.h>
 #include <asm/kprobes.h>
 #include <asm/uprobes.h>
+#include <asm/stacktrace.h>
+#include <asm/processor.h>
+#include <asm/ptrace.h>
 
 #include "proto.h"
 
@@ -68,53 +75,6 @@ dik_show_code(unsigned int *pc)
 	printk("\n");
 }
 
-static void
-dik_show_trace(unsigned long *sp, const char *loglvl)
-{
-	long i = 0;
-	unsigned long tmp;
-
-	printk("%sTrace:\n", loglvl);
-	while (0x1ff8 & (unsigned long)sp) {
-		tmp = *sp;
-		sp++;
-		if (!__kernel_text_address(tmp))
-			continue;
-		printk("%s[<%lx>] %pSR\n", loglvl, tmp, (void *)tmp);
-		if (i > 40) {
-			printk("%s ...", loglvl);
-			break;
-		}
-	}
-	printk("\n");
-}
-
-static int kstack_depth_to_print = 24;
-
-void show_stack(struct task_struct *task, unsigned long *sp, const char *loglvl)
-{
-	unsigned long *stack;
-	int i;
-
-	/*
-	 * debugging aid: "show_stack(NULL, NULL, KERN_EMERG);" prints the
-	 * back trace for this cpu.
-	 */
-	if (sp == NULL)
-		sp = (unsigned long *)&sp;
-
-	stack = sp;
-	for (i = 0; i < kstack_depth_to_print; i++) {
-		if (((long) stack & (THREAD_SIZE-1)) == 0)
-			break;
-		if (i && ((i % 4) == 0))
-			printk("%s       ", loglvl);
-		printk("%016lx ", *stack++);
-	}
-	printk("\n");
-	dik_show_trace(sp, loglvl);
-}
-
 void die_if_kernel(char *str, struct pt_regs *regs, long err)
 {
 	if (regs->ps & 8)
@@ -125,7 +85,7 @@ void die_if_kernel(char *str, struct pt_regs *regs, long err)
 	printk("%s(%d): %s %ld\n", current->comm, task_pid_nr(current), str, err);
 	dik_show_regs(regs);
 	add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE);
-	dik_show_trace((unsigned long *)(regs+1), KERN_DEFAULT);
+	show_stack(current, NULL, KERN_EMERG);
 	dik_show_code((unsigned int *)regs->pc);
 
 	if (test_and_set_thread_flag(TIF_DIE_IF_KERNEL)) {
@@ -535,7 +495,7 @@ do_entUna(void *va, unsigned long opcode, unsigned long reg,
 
 	dik_show_regs(regs);
 	dik_show_code((unsigned int *)pc);
-	dik_show_trace((unsigned long *)(regs+1), KERN_DEFAULT);
+	show_stack(current, NULL, KERN_EMERG);
 
 	if (test_and_set_thread_flag(TIF_DIE_IF_KERNEL)) {
 		printk("die_if_kernel recursion detected.\n");
-- 
Gitee


From 8cc109c4a51ea3c343c15b47eb62d99d0adae211 Mon Sep 17 00:00:00 2001
From: Wu Liliu <wuliliu@wxiat.com>
Date: Tue, 21 Jun 2022 14:19:58 +0800
Subject: [PATCH 143/674] sw64: reimplement get_wchan()

Sunway inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GFUD

--------------------------------

The get_wchan() always return 0 because `fp > sp` is false. This
patch reimplements it entirely and fixes this error.

Signed-off-by: Wu Liliu <wuliliu@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/kernel/process.c    | 51 ----------------------------------
 arch/sw_64/kernel/stacktrace.c | 22 +++++++++++++++
 2 files changed, 22 insertions(+), 51 deletions(-)

diff --git a/arch/sw_64/kernel/process.c b/arch/sw_64/kernel/process.c
index da721d4266ee..a75ae20205f3 100644
--- a/arch/sw_64/kernel/process.c
+++ b/arch/sw_64/kernel/process.c
@@ -226,57 +226,6 @@ int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fpu)
 }
 EXPORT_SYMBOL(dump_fpu);
 
-/*
- * Under heavy swap load I've seen this lose in an ugly way.  So do
- * some extra sanity checking on the ranges we expect these pointers
- * to be in so that we can fail gracefully.  This is just for ps after
- * all.  -- r~
- */
-
-unsigned long
-thread_saved_pc(struct task_struct *t)
-{
-	unsigned long top, fp, sp;
-
-	top = (unsigned long)task_stack_page(t) + 2 * PAGE_SIZE;
-	sp = task_thread_info(t)->pcb.ksp;
-	fp = t->thread.s[6];
-
-	if (fp > sp && fp < top)
-		return *(unsigned long *)fp;
-
-	return 0;
-}
-
-unsigned long
-get_wchan(struct task_struct *p)
-{
-	unsigned long schedule_frame;
-	unsigned long pc, top, sp;
-
-	if (!p || p == current || p->state == TASK_RUNNING)
-		return 0;
-	/*
-	 * This one depends on the frame size of schedule().  Do a
-	 * "disass schedule" in gdb to find the frame size.  Also, the
-	 * code assumes that sleep_on() follows immediately after
-	 * interruptible_sleep_on() and that add_timer() follows
-	 * immediately after interruptible_sleep().  Ugly, isn't it?
-	 * Maybe adding a wchan field to task_struct would be better,
-	 * after all...
-	 */
-
-	pc = thread_saved_pc(p);
-	if (in_sched_functions(pc)) {
-		top = (unsigned long)task_stack_page(p) + 2 * PAGE_SIZE;
-		sp = task_thread_info(p)->pcb.ksp;
-		schedule_frame = p->thread.s[6];
-		if (schedule_frame > sp && schedule_frame < top)
-			return ((unsigned long *)schedule_frame)[12];
-	}
-	return pc;
-}
-
 unsigned long arch_randomize_brk(struct mm_struct *mm)
 {
 	return randomize_page(mm->brk, 0x02000000);
diff --git a/arch/sw_64/kernel/stacktrace.c b/arch/sw_64/kernel/stacktrace.c
index 4e9acf99aaab..b51b1500c736 100644
--- a/arch/sw_64/kernel/stacktrace.c
+++ b/arch/sw_64/kernel/stacktrace.c
@@ -172,3 +172,25 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
 		trace->entries[trace->nr_entries++] = ULONG_MAX;
 }
 EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
+
+static int save_pc(unsigned long pc, void *data)
+{
+	unsigned long *p = data;
+	*p = 0;
+
+	if (!in_sched_functions(pc))
+		*p = pc;
+
+	return *p;
+}
+
+unsigned long get_wchan(struct task_struct *tsk)
+{
+	unsigned long pc;
+
+	if (!tsk || tsk == current || tsk->state == TASK_RUNNING)
+		return 0;
+	walk_stackframe(tsk, NULL, save_pc, &pc);
+
+	return pc;
+}
-- 
Gitee


From a3266d177538c7304143e30f9c1c9de6cda945d4 Mon Sep 17 00:00:00 2001
From: Wu Liliu <wuliliu@wxiat.com>
Date: Fri, 24 Jun 2022 14:12:33 +0800
Subject: [PATCH 144/674] sw64: reimplement save_stack_trace()

Sunway inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GFQQ

--------------------------------

It used to save stack-backtrace addresses by SP, which is inaccurate.
In order to implement this function accurately, we provide two ways
to support it.

Signed-off-by: Wu Liliu <wuliliu@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/kernel/stacktrace.c | 62 ++++++++++++++++++++++------------
 1 file changed, 40 insertions(+), 22 deletions(-)

diff --git a/arch/sw_64/kernel/stacktrace.c b/arch/sw_64/kernel/stacktrace.c
index b51b1500c736..7b5ddc78bd6d 100644
--- a/arch/sw_64/kernel/stacktrace.c
+++ b/arch/sw_64/kernel/stacktrace.c
@@ -139,40 +139,58 @@ void show_stack(struct task_struct *task, unsigned long *sp, const char *loglvl)
 	walk_stackframe(task, NULL, print_address_trace, (void *)loglvl);
 }
 
+#ifdef CONFIG_STACKTRACE
 /*
  * Save stack-backtrace addresses into a stack_trace buffer.
  */
-void save_stack_trace(struct stack_trace *trace)
+struct stack_trace_data {
+	struct stack_trace *trace;
+	unsigned int nosched;
+};
+
+int save_trace(unsigned long pc, void *d)
 {
-	save_stack_trace_tsk(current, trace);
-}
-EXPORT_SYMBOL_GPL(save_stack_trace);
+	struct stack_trace_data *data = d;
+	struct stack_trace *trace = data->trace;
+
+	if (data->nosched && in_sched_functions(pc))
+		return 0;
+	if (trace->skip > 0) {
+		trace->skip--;
+		return 0;
+	}
 
+	trace->entries[trace->nr_entries++] = pc;
+	return (trace->nr_entries >= trace->max_entries);
+}
 
-void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
+static void __save_stack_trace(struct task_struct *tsk,
+		struct stack_trace *trace, unsigned int nosched)
 {
-	unsigned long *sp = (unsigned long *)task_thread_info(tsk)->pcb.ksp;
-	unsigned long addr;
-
-	WARN_ON(trace->nr_entries || !trace->max_entries);
-
-	while (!kstack_end(sp)) {
-		addr = *sp++;
-		if (__kernel_text_address(addr) &&
-				!in_sched_functions(addr)) {
-			if (trace->skip > 0)
-				trace->skip--;
-			else
-				trace->entries[trace->nr_entries++] = addr;
-			if (trace->nr_entries >= trace->max_entries)
-				break;
-		}
-	}
+	struct stack_trace_data data;
+
+	data.trace = trace;
+	data.nosched = nosched;
+
+	walk_stackframe(tsk, NULL, save_trace, &data);
+
 	if (trace->nr_entries < trace->max_entries)
 		trace->entries[trace->nr_entries++] = ULONG_MAX;
 }
+
+void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
+{
+	__save_stack_trace(tsk, trace, 1);
+}
 EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
 
+void save_stack_trace(struct stack_trace *trace)
+{
+	__save_stack_trace(current, trace, 0);
+}
+EXPORT_SYMBOL_GPL(save_stack_trace);
+#endif
+
 static int save_pc(unsigned long pc, void *data)
 {
 	unsigned long *p = data;
-- 
Gitee


From d14ec0e7e888b5ab16142e0a71da4d99183587b6 Mon Sep 17 00:00:00 2001
From: Wang Yuanheng <wangyuanheng@wxiat.com>
Date: Mon, 27 Jun 2022 09:02:01 +0800
Subject: [PATCH 145/674] sw64: kvm: enable binding_vcpu debug dynamically

Sunway inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GFR5

--------------------------------

Add a bool debugfs file /sys/kernel/debug/sw_64/bind_vcpu, you can
echo 1/Y to enable bind vcpu, or echo 0/N to disable it. Determin which
node to bind the core according to the physical address assigned to the
guest.

Signed-off-by: Wang Yuanheng <wangyuanheng@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/kernel/Makefile   |  2 +-
 arch/sw_64/kernel/bindvcpu.c | 29 +++++++++++++++++++++++++++++
 arch/sw_64/kvm/kvm-sw64.c    | 13 ++++++++++++-
 3 files changed, 42 insertions(+), 2 deletions(-)
 create mode 100644 arch/sw_64/kernel/bindvcpu.c

diff --git a/arch/sw_64/kernel/Makefile b/arch/sw_64/kernel/Makefile
index 94b63d6a286b..d4dc9e175d67 100644
--- a/arch/sw_64/kernel/Makefile
+++ b/arch/sw_64/kernel/Makefile
@@ -31,7 +31,7 @@ obj-$(CONFIG_HIBERNATION) += hibernate_asm.o hibernate.o
 obj-$(CONFIG_AUDIT)     += audit.o
 obj-$(CONFIG_PCI) += pci_common.o
 obj-$(CONFIG_RELOCATABLE)   += relocate.o
-obj-$(CONFIG_DEBUG_FS)	+= segvdbg.o
+obj-$(CONFIG_DEBUG_FS)	+= segvdbg.o bindvcpu.o
 obj-$(CONFIG_JUMP_LABEL) += jump_label.o
 
 ifndef CONFIG_PCI
diff --git a/arch/sw_64/kernel/bindvcpu.c b/arch/sw_64/kernel/bindvcpu.c
new file mode 100644
index 000000000000..611c395c144b
--- /dev/null
+++ b/arch/sw_64/kernel/bindvcpu.c
@@ -0,0 +1,29 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2022 Wang Yuanheng
+ * Author: Wang Yuanheng
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/uaccess.h>
+#include <asm/debug.h>
+
+extern bool bind_vcpu_enabled;
+
+static int __init bind_vcpu_init(void)
+{
+	struct dentry *bindvcpu;
+
+	if (!sw64_debugfs_dir)
+		return -ENODEV;
+
+	bindvcpu = debugfs_create_bool("bind_vcpu", 0644,
+			sw64_debugfs_dir, &bind_vcpu_enabled);
+	if (!bindvcpu)
+		return -ENOMEM;
+	return 0;
+}
+late_initcall(bind_vcpu_init);
diff --git a/arch/sw_64/kvm/kvm-sw64.c b/arch/sw_64/kvm/kvm-sw64.c
index af29d0ca8e7f..de81f7efe01a 100644
--- a/arch/sw_64/kvm/kvm-sw64.c
+++ b/arch/sw_64/kvm/kvm-sw64.c
@@ -12,7 +12,7 @@
 #include <linux/sched/signal.h>
 #include <linux/kvm.h>
 #include <linux/uaccess.h>
-
+#include <linux/sched.h>
 #include <asm/kvm_timer.h>
 #include <asm/kvm_emulate.h>
 
@@ -21,6 +21,7 @@
 
 bool set_msi_flag;
 unsigned long sw64_kvm_last_vpn[NR_CPUS];
+__read_mostly bool bind_vcpu_enabled;
 #define cpu_last_vpn(cpuid) sw64_kvm_last_vpn[cpuid]
 
 #ifdef CONFIG_SUBARCH_C3B
@@ -537,6 +538,16 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 #ifndef CONFIG_KVM_MEMHOTPLUG
 		vcpu->arch.vcb.vpcr
 			= get_vpcr(vcpu->kvm->arch.host_phys_addr, vcpu->kvm->arch.size, 0);
+
+		if (unlikely(bind_vcpu_enabled)) {
+			int nid;
+			unsigned long end;
+
+			end = vcpu->kvm->arch.host_phys_addr + vcpu->kvm->arch.size;
+			nid = pfn_to_nid(PHYS_PFN(vcpu->kvm->arch.host_phys_addr));
+			if (pfn_to_nid(PHYS_PFN(end)) == nid)
+				set_cpus_allowed_ptr(vcpu->arch.tsk, node_to_cpumask_map[nid]);
+		}
 #else
 		unsigned long seg_base = virt_to_phys(vcpu->kvm->arch.seg_pgd);
 
-- 
Gitee


From 0d22be5a4f5fb31a45fb4c55dde5037bfba3c572 Mon Sep 17 00:00:00 2001
From: Gu Zitao <guzitao@wxiat.com>
Date: Tue, 28 Jun 2022 08:51:26 +0800
Subject: [PATCH 146/674] sw64: rename CONFIG_HAVE_GENERIC_GUP to
 CONFIG_HAVE_FAST_GUP

Sunway inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GFRP

--------------------------------

This patch is a backport of commit 67a929e097b7 ("mm: rename
CONFIG_HAVE_GENERIC_GUP to CONFIG_HAVE_FAST_GUP"). In order to
use the generic GUP, let's do the same.

Signed-off-by: Gu Zitao <guzitao@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/sw_64/Kconfig b/arch/sw_64/Kconfig
index ec6e583a5d9a..cf2f6f00708c 100644
--- a/arch/sw_64/Kconfig
+++ b/arch/sw_64/Kconfig
@@ -7,7 +7,7 @@ config SW64
 	select HAVE_OPROFILE
 	select HAVE_PCSPKR_PLATFORM
 	select HAVE_PERF_EVENTS
-	select HAVE_GENERIC_GUP
+	select HAVE_FAST_GUP
 	select GENERIC_CLOCKEVENTS
 	select GENERIC_IRQ_PROBE
 	select GENERIC_IRQ_LEGACY
-- 
Gitee


From ce6455155eeacae6557fe0142e69a6c288ac970e Mon Sep 17 00:00:00 2001
From: Zhou Xuemei <zhouxuemei@wxiat.com>
Date: Thu, 7 Jul 2022 17:19:10 +0800
Subject: [PATCH 147/674] sw64: fix floating point register corruption

Sunway inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GFS3

--------------------------------

When csum_partial_copy_from_user is called in an interrupt,
__copy_from_user will modify floating point register f10-f15
without restore register value. This will cause the value of
the userspace register to be corrupted. Use memcpy() instead
when called from kernel space.

Signed-off-by: Zhou Xuemei <zhouxuemei@wxiat.com>

Signed-off-by: Gu Zitao <guzitao@wxiat.com>
---
 arch/sw_64/lib/csum_partial_copy.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/arch/sw_64/lib/csum_partial_copy.c b/arch/sw_64/lib/csum_partial_copy.c
index 5e5274e82b2b..742dd63cdb70 100644
--- a/arch/sw_64/lib/csum_partial_copy.c
+++ b/arch/sw_64/lib/csum_partial_copy.c
@@ -61,7 +61,11 @@ csum_partial_cfu_dest_aligned(const unsigned long __user *src,
 	unsigned long checksum = ~0U;
 	int err = 0;
 
-	err = __copy_from_user(dst, src, len+8);
+	if (likely(!uaccess_kernel()))
+		err = __copy_from_user(dst, src, len + 8);
+	else
+		memcpy(dst, src, len + 8);
+
 	while (len > 0) {
 		word = *dst;
 		checksum += word;
@@ -89,7 +93,10 @@ csum_partial_cfu_dest_unaligned(const unsigned long __user *src,
 	unsigned long checksum = ~0U;
 	int err = 0;
 
-	err = __copy_from_user(dst, src, len+8);
+	if (likely(!uaccess_kernel()))
+		err = __copy_from_user(dst, src, len + 8);
+	else
+		memcpy(dst, src, len + 8);
 
 	dst = (unsigned long *)((unsigned long)dst & (~7UL));
 	word = *dst;
-- 
Gitee


From 510d96b0f5a1edbd9ed1e9efbfde87930a3588b3 Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@huawei.com>
Date: Tue, 12 Jul 2022 22:51:43 +0800
Subject: [PATCH 148/674] blk-throttle: fix io hung due to configuration
 updates

hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I58VE5
CVE: NA

--------------------------------

If new configuration is submitted while a bio is throttled, then new
waiting time is recaculated regardless that the bio might aready wait
for some time:

tg_conf_updated
 throtl_start_new_slice
  tg_update_disptime
  throtl_schedule_next_dispatch

Then io hung can be triggered by always submmiting new configuration
before the throttled bio is dispatched.

Fix the problem by respecting the time that throttled bio aready waited.
In order to do that, instead of start new slice in tg_conf_updated(),
just update 'bytes_disp' and 'io_disp' based on the new configuration.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 block/blk-throttle.c | 79 ++++++++++++++++++++++++++++++++++++--------
 1 file changed, 66 insertions(+), 13 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 95a13da1f343..0427c9c63e81 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1421,7 +1421,57 @@ static int tg_print_conf_uint(struct seq_file *sf, void *v)
 	return 0;
 }
 
-static void tg_conf_updated(struct throtl_grp *tg, bool global)
+static u64 throtl_update_bytes_disp(u64 dispatched, u64 new_limit,
+				    u64 old_limit)
+{
+	if (new_limit == old_limit)
+		return dispatched;
+
+	if (!dispatched)
+		return 0;
+
+	/*
+	 * In the case that multiply will overflow, just return 0. It will only
+	 * let bios to be dispatched earlier.
+	 */
+	if (div64_u64(U64_MAX, dispatched) < new_limit)
+		return 0;
+
+	dispatched *= new_limit;
+	return div64_u64(dispatched, old_limit);
+}
+
+static u32 throtl_update_io_disp(u32 dispatched, u32 new_limit, u32 old_limit)
+{
+	if (new_limit == old_limit)
+		return dispatched;
+
+	if (!dispatched)
+		return 0;
+	/*
+	 * In the case that multiply will overflow, just return 0. It will only
+	 * let bios to be dispatched earlier.
+	 */
+	if (UINT_MAX / dispatched < new_limit)
+		return 0;
+
+	dispatched *= new_limit;
+	return dispatched / old_limit;
+}
+
+static void throtl_update_slice(struct throtl_grp *tg, u64 *old_limits)
+{
+	tg->bytes_disp[READ] = throtl_update_bytes_disp(tg->bytes_disp[READ],
+			tg_bps_limit(tg, READ), old_limits[0]);
+	tg->bytes_disp[WRITE] = throtl_update_bytes_disp(tg->bytes_disp[WRITE],
+			tg_bps_limit(tg, WRITE), old_limits[1]);
+	tg->io_disp[READ] = throtl_update_io_disp(tg->io_disp[READ],
+			tg_iops_limit(tg, READ), (u32)old_limits[2]);
+	tg->io_disp[WRITE] = throtl_update_io_disp(tg->io_disp[WRITE],
+			tg_iops_limit(tg, WRITE), (u32)old_limits[3]);
+}
+
+static void tg_conf_updated(struct throtl_grp *tg, u64 *old_limits, bool global)
 {
 	struct throtl_service_queue *sq = &tg->service_queue;
 	struct cgroup_subsys_state *pos_css;
@@ -1460,16 +1510,7 @@ static void tg_conf_updated(struct throtl_grp *tg, bool global)
 				parent_tg->latency_target);
 	}
 
-	/*
-	 * We're already holding queue_lock and know @tg is valid.  Let's
-	 * apply the new config directly.
-	 *
-	 * Restart the slices for both READ and WRITES. It might happen
-	 * that a group's limit are dropped suddenly and we don't want to
-	 * account recently dispatched IO with new low rate.
-	 */
-	throtl_start_new_slice(tg, READ);
-	throtl_start_new_slice(tg, WRITE);
+	throtl_update_slice(tg, old_limits);
 
 	if (tg->flags & THROTL_TG_PENDING) {
 		tg_update_disptime(tg);
@@ -1502,6 +1543,14 @@ static inline int throtl_restart_syscall_when_busy(int errno)
 	return ret;
 }
 
+static void tg_get_limits(struct throtl_grp *tg, u64 *limits)
+{
+	limits[0] = tg_bps_limit(tg, READ);
+	limits[1] = tg_bps_limit(tg, WRITE);
+	limits[2] = tg_iops_limit(tg, READ);
+	limits[3] = tg_iops_limit(tg, WRITE);
+}
+
 static ssize_t tg_set_conf(struct kernfs_open_file *of,
 			   char *buf, size_t nbytes, loff_t off, bool is_u64)
 {
@@ -1510,6 +1559,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
 	struct throtl_grp *tg;
 	int ret;
 	u64 v;
+	u64 old_limits[4];
 
 	ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
 	if (ret)
@@ -1526,13 +1576,14 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
 		v = U64_MAX;
 
 	tg = blkg_to_tg(ctx.blkg);
+	tg_get_limits(tg, old_limits);
 
 	if (is_u64)
 		*(u64 *)((void *)tg + of_cft(of)->private) = v;
 	else
 		*(unsigned int *)((void *)tg + of_cft(of)->private) = v;
 
-	tg_conf_updated(tg, false);
+	tg_conf_updated(tg, old_limits, false);
 	ret = 0;
 out_finish:
 	blkg_conf_finish(&ctx);
@@ -1703,6 +1754,7 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
 	struct blkg_conf_ctx ctx;
 	struct throtl_grp *tg;
 	u64 v[4];
+	u64 old_limits[4];
 	unsigned long idle_time;
 	unsigned long latency_time;
 	int ret;
@@ -1721,6 +1773,7 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
 	v[1] = tg->bps_conf[WRITE][index];
 	v[2] = tg->iops_conf[READ][index];
 	v[3] = tg->iops_conf[WRITE][index];
+	tg_get_limits(tg, old_limits);
 
 	idle_time = tg->idletime_threshold_conf;
 	latency_time = tg->latency_target_conf;
@@ -1807,7 +1860,7 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
 			tg->td->limit_index = LIMIT_LOW;
 	} else
 		tg->td->limit_index = LIMIT_MAX;
-	tg_conf_updated(tg, index == LIMIT_LOW &&
+	tg_conf_updated(tg, old_limits, index == LIMIT_LOW &&
 		tg->td->limit_valid[LIMIT_LOW]);
 	ret = 0;
 out_finish:
-- 
Gitee


From badcb143e9e6bf8f66c06e08be3fb92a3abd1f11 Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@huawei.com>
Date: Tue, 12 Jul 2022 22:51:44 +0800
Subject: [PATCH 149/674] block: update nsecs[] in part_stat_show() and
 diskstats_show()

hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I57S8D
CVE: NA

--------------------------------

commit 7ec2ec682568 ("block: update io_ticks when io hang") fixed that
%util will be zero for iostat when io is hanged, however, avgqu-sz is
still zero while it represents the number of io that are hunged. On the
other hand, for some slow device, if an io is started before and done
after diskstats is read, the avgqu-sz will be miscalculated.

To fix the problem, update 'nsecs[]' when part_stat_show() or
diskstats_show() is called. In order to do that, add 'stat_time' in
struct hd_struct and 'rq_stat_time' in struct request to record the
time. And during iteration, update 'nsecs[]' for each inflight request.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 block/blk-core.c        | 15 ++++++++--
 block/blk-mq.c          | 45 ++++++++++++++++++++++++++++++
 block/blk-mq.h          |  2 ++
 block/genhd.c           | 61 ++++++++++++++++++++++++++---------------
 block/partitions/core.c |  2 ++
 include/linux/blkdev.h  |  3 +-
 include/linux/genhd.h   |  2 ++
 7 files changed, 105 insertions(+), 25 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index bbd3d4560458..f5f419dd8ab6 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1292,13 +1292,24 @@ void blk_account_io_done(struct request *req, u64 now)
 	    !(req->rq_flags & RQF_FLUSH_SEQ)) {
 		const int sgrp = op_stat_group(req_op(req));
 		struct hd_struct *part;
+		u64 stat_time;
 
 		part_stat_lock();
 		part = req->part;
-
 		update_io_ticks(part, jiffies, true);
 		part_stat_inc(part, ios[sgrp]);
-		part_stat_add(part, nsecs[sgrp], now - req->start_time_ns);
+		stat_time = READ_ONCE(req->stat_time_ns);
+		/*
+		 * This might fail if 'req->stat_time_ns' is updated
+		 * in blk_mq_check_inflight_with_stat().
+		 */
+		if (likely(cmpxchg64(&req->stat_time_ns, stat_time, now)
+			   == stat_time)) {
+			u64 duation = stat_time ? now - stat_time :
+				now - req->start_time_ns;
+
+			part_stat_add(req->part, nsecs[sgrp], duation);
+		}
 		part_stat_unlock();
 
 		hd_struct_put(part);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 83193e44aada..2cf387697a41 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -102,6 +102,50 @@ struct mq_inflight {
 	unsigned int inflight[2];
 };
 
+static bool blk_mq_check_inflight_with_stat(struct blk_mq_hw_ctx *hctx,
+					    struct request *rq, void *priv,
+					    bool reserved)
+{
+	struct mq_inflight *mi = priv;
+
+	if ((!mi->part->partno || rq->part == mi->part) &&
+	    blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) {
+		u64 stat_time;
+
+		mi->inflight[rq_data_dir(rq)]++;
+		if (!rq->part)
+			return true;
+
+		stat_time = READ_ONCE(rq->stat_time_ns);
+		/*
+		 * This might fail if 'req->stat_time_ns' is updated in
+		 * blk_account_io_done().
+		 */
+		if (likely(cmpxchg64(&rq->stat_time_ns, stat_time,
+				   rq->part->stat_time) == stat_time)) {
+			int sgrp = op_stat_group(req_op(rq));
+			u64 duation = stat_time ?
+				rq->part->stat_time - stat_time :
+				rq->part->stat_time - rq->start_time_ns;
+
+			part_stat_add(rq->part, nsecs[sgrp], duation);
+		}
+	}
+
+	return true;
+}
+
+unsigned int blk_mq_in_flight_with_stat(struct request_queue *q,
+					struct hd_struct *part)
+{
+	struct mq_inflight mi = { .part = part };
+
+	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight_with_stat, &mi);
+
+	return mi.inflight[0] + mi.inflight[1];
+}
+
+
 static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
 				  struct request *rq, void *priv,
 				  bool reserved)
@@ -328,6 +372,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 		rq->start_time_ns = ktime_get_ns();
 	else
 		rq->start_time_ns = 0;
+	rq->stat_time_ns = 0;
 	rq->io_start_time_ns = 0;
 	rq->stats_sectors = 0;
 	rq->nr_phys_segments = 0;
diff --git a/block/blk-mq.h b/block/blk-mq.h
index bb58c68b8274..6897b4c09b7c 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -187,6 +187,8 @@ static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx)
 unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part);
 void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part,
 			 unsigned int inflight[2]);
+unsigned int blk_mq_in_flight_with_stat(struct request_queue *q,
+					struct hd_struct *part);
 
 static inline void blk_mq_put_dispatch_budget(struct request_queue *q)
 {
diff --git a/block/genhd.c b/block/genhd.c
index f94152e99876..16ad881172d0 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1293,25 +1293,52 @@ ssize_t part_size_show(struct device *dev,
 		(unsigned long long)part_nr_sects_read(p));
 }
 
+static void part_set_stat_time(struct hd_struct *hd)
+{
+	u64 now = ktime_get_ns();
+
+again:
+	hd->stat_time = now;
+	if (hd->partno) {
+		hd = &part_to_disk(hd)->part0;
+		goto again;
+	}
+}
+
+static void part_get_stat_info(struct hd_struct *hd, struct disk_stats *stat,
+			       unsigned int *inflight)
+{
+	struct request_queue *q = part_to_disk(hd)->queue;
+
+	if (queue_is_mq(q)) {
+		part_stat_lock();
+		spin_lock(&hd->bd_stat_lock);
+		part_set_stat_time(hd);
+		*inflight = blk_mq_in_flight_with_stat(q, hd);
+		spin_unlock(&hd->bd_stat_lock);
+		part_stat_unlock();
+	} else {
+		*inflight = part_in_flight(hd);
+	}
+
+	if (*inflight) {
+		part_stat_lock();
+		update_io_ticks(hd, jiffies, true);
+		part_stat_unlock();
+	}
+
+	part_stat_read_all(hd, stat);
+}
+
 ssize_t part_stat_show(struct device *dev,
 		       struct device_attribute *attr, char *buf)
 {
 	struct hd_struct *p = dev_to_part(dev);
-	struct request_queue *q = part_to_disk(p)->queue;
 	struct disk_stats stat;
 	unsigned int inflight;
 
-	if (queue_is_mq(q))
-		inflight = blk_mq_in_flight(q, p);
-	else
-		inflight = part_in_flight(p);
+	part_get_stat_info(p, &stat, &inflight);
 
-	if (inflight) {
-		part_stat_lock();
-		update_io_ticks(p, jiffies, true);
-		part_stat_unlock();
-	}
-	part_stat_read_all(p, &stat);
 	return sprintf(buf,
 		"%8lu %8lu %8llu %8u "
 		"%8lu %8lu %8llu %8u "
@@ -1628,17 +1655,7 @@ static int diskstats_show(struct seq_file *seqf, void *v)
 
 	disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0);
 	while ((hd = disk_part_iter_next(&piter))) {
-		if (queue_is_mq(gp->queue))
-			inflight = blk_mq_in_flight(gp->queue, hd);
-		else
-			inflight = part_in_flight(hd);
-
-		if (inflight) {
-			part_stat_lock();
-			update_io_ticks(hd, jiffies, true);
-			part_stat_unlock();
-		}
-		part_stat_read_all(hd, &stat);
+		part_get_stat_info(hd, &stat, &inflight);
 		seq_printf(seqf, "%4d %7d %s "
 			   "%lu %lu %lu %u "
 			   "%lu %lu %lu %u "
diff --git a/block/partitions/core.c b/block/partitions/core.c
index 569b0ca9f6e1..92c723c19bb0 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -415,6 +415,8 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno,
 	p->nr_sects = len;
 	p->partno = partno;
 	p->read_only = get_disk_ro(disk) | test_bit(partno, disk->user_ro_bitmap);
+	p->stat_time = 0;
+	spin_lock_init(&p->bd_stat_lock);
 
 	if (info) {
 		struct partition_meta_info *pinfo;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 6a4b2a01a462..46feee6bd45c 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -207,7 +207,8 @@ struct request {
 	u64 start_time_ns;
 	/* Time that I/O was submitted to the device. */
 	u64 io_start_time_ns;
-
+	/* Time that I/O was counted in part_get_stat_info(). */
+	u64 stat_time_ns;
 #ifdef CONFIG_BLK_WBT
 	unsigned short wbt_flags;
 #endif
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 09da27361620..07122c79210c 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -63,6 +63,8 @@ struct hd_struct {
 	seqcount_t nr_sects_seq;
 #endif
 	unsigned long stamp;
+	spinlock_t bd_stat_lock;
+	u64 stat_time;
 	struct disk_stats __percpu *dkstats;
 	struct percpu_ref ref;
 
-- 
Gitee


From 5c250d556e1021b554c712056bc00ca9871d06f9 Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@huawei.com>
Date: Tue, 12 Jul 2022 22:51:45 +0800
Subject: [PATCH 150/674] blk-mq: fix kabi broken in struct request

hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I57S8D
CVE: NA

--------------------------------

Since there are no reserved fields, declare a wrapper to fix kabi
broken.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 block/blk-core.c       |  7 ++++---
 block/blk-mq.c         | 12 +++++++-----
 include/linux/blk-mq.h | 13 +++++++++++--
 include/linux/blkdev.h |  2 --
 4 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index f5f419dd8ab6..8b93366c76e4 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1293,17 +1293,18 @@ void blk_account_io_done(struct request *req, u64 now)
 		const int sgrp = op_stat_group(req_op(req));
 		struct hd_struct *part;
 		u64 stat_time;
+		struct request_wrapper *rq_wrapper = request_to_wrapper(req);
 
 		part_stat_lock();
 		part = req->part;
 		update_io_ticks(part, jiffies, true);
 		part_stat_inc(part, ios[sgrp]);
-		stat_time = READ_ONCE(req->stat_time_ns);
+		stat_time = READ_ONCE(rq_wrapper->stat_time_ns);
 		/*
-		 * This might fail if 'req->stat_time_ns' is updated
+		 * This might fail if 'stat_time_ns' is updated
 		 * in blk_mq_check_inflight_with_stat().
 		 */
-		if (likely(cmpxchg64(&req->stat_time_ns, stat_time, now)
+		if (likely(cmpxchg64(&rq_wrapper->stat_time_ns, stat_time, now)
 			   == stat_time)) {
 			u64 duation = stat_time ? now - stat_time :
 				now - req->start_time_ns;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 2cf387697a41..76ff229e7fb2 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -111,17 +111,19 @@ static bool blk_mq_check_inflight_with_stat(struct blk_mq_hw_ctx *hctx,
 	if ((!mi->part->partno || rq->part == mi->part) &&
 	    blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) {
 		u64 stat_time;
+		struct request_wrapper *rq_wrapper;
 
 		mi->inflight[rq_data_dir(rq)]++;
 		if (!rq->part)
 			return true;
 
-		stat_time = READ_ONCE(rq->stat_time_ns);
+		rq_wrapper = request_to_wrapper(rq);
+		stat_time = READ_ONCE(rq_wrapper->stat_time_ns);
 		/*
-		 * This might fail if 'req->stat_time_ns' is updated in
+		 * This might fail if 'stat_time_ns' is updated in
 		 * blk_account_io_done().
 		 */
-		if (likely(cmpxchg64(&rq->stat_time_ns, stat_time,
+		if (likely(cmpxchg64(&rq_wrapper->stat_time_ns, stat_time,
 				   rq->part->stat_time) == stat_time)) {
 			int sgrp = op_stat_group(req_op(rq));
 			u64 duation = stat_time ?
@@ -368,11 +370,11 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 #ifdef CONFIG_BLK_RQ_ALLOC_TIME
 	rq->alloc_time_ns = alloc_time_ns;
 #endif
+	request_to_wrapper(rq)->stat_time_ns = 0;
 	if (blk_mq_need_time_stamp(rq))
 		rq->start_time_ns = ktime_get_ns();
 	else
 		rq->start_time_ns = 0;
-	rq->stat_time_ns = 0;
 	rq->io_start_time_ns = 0;
 	rq->stats_sectors = 0;
 	rq->nr_phys_segments = 0;
@@ -2555,7 +2557,7 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
 	 * rq_size is the size of the request plus driver payload, rounded
 	 * to the cacheline size
 	 */
-	rq_size = round_up(sizeof(struct request) + set->cmd_size,
+	rq_size = round_up(sizeof(struct request_wrapper) + set->cmd_size,
 				cache_line_size());
 	left = rq_size * depth;
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index c9210fb70e4d..ac83257972a0 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -304,6 +304,15 @@ struct blk_mq_queue_data {
 	KABI_RESERVE(1)
 };
 
+struct request_wrapper {
+	struct request rq;
+
+	/* Time that I/O was counted in part_get_stat_info(). */
+	u64 stat_time_ns;
+};
+
+#define request_to_wrapper(_rq) container_of(_rq, struct request_wrapper, rq)
+
 typedef bool (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *,
 		bool);
 typedef bool (busy_tag_iter_fn)(struct request *, void *, bool);
@@ -595,7 +604,7 @@ static inline bool blk_should_fake_timeout(struct request_queue *q)
  */
 static inline struct request *blk_mq_rq_from_pdu(void *pdu)
 {
-	return pdu - sizeof(struct request);
+	return pdu - sizeof(struct request_wrapper);
 }
 
 /**
@@ -609,7 +618,7 @@ static inline struct request *blk_mq_rq_from_pdu(void *pdu)
  */
 static inline void *blk_mq_rq_to_pdu(struct request *rq)
 {
-	return rq + 1;
+	return request_to_wrapper(rq) + 1;
 }
 
 static inline struct blk_mq_hw_ctx *queue_hctx(struct request_queue *q, int id)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 46feee6bd45c..49540ce9e325 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -207,8 +207,6 @@ struct request {
 	u64 start_time_ns;
 	/* Time that I/O was submitted to the device. */
 	u64 io_start_time_ns;
-	/* Time that I/O was counted in part_get_stat_info(). */
-	u64 stat_time_ns;
 #ifdef CONFIG_BLK_WBT
 	unsigned short wbt_flags;
 #endif
-- 
Gitee


From 47fea27be7e37652e5fb324c26e1f976aef7f83c Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@huawei.com>
Date: Tue, 12 Jul 2022 22:51:46 +0800
Subject: [PATCH 151/674] block: fix kabi broken in struct hd_struct

hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I57S8D
CVE: NA

--------------------------------

Use reserved fields to fix kabi broken for field 'stat_time'. However,
for the field 'bd_stat_lock', spinlock_t can be up to 64 bytes, thus
reserved fields is not enough. And struct 'hd_struct' is internal of
other sutrct, thus declare a wrapper is infeasible. In order to fix
kabi broken for 'bd_stat_lock', use 'dev->mutex' instead.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 block/genhd.c           | 4 ++--
 block/partitions/core.c | 1 -
 include/linux/genhd.h   | 4 +---
 3 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/block/genhd.c b/block/genhd.c
index 16ad881172d0..cc114dd0265b 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1312,10 +1312,10 @@ static void part_get_stat_info(struct hd_struct *hd, struct disk_stats *stat,
 
 	if (queue_is_mq(q)) {
 		part_stat_lock();
-		spin_lock(&hd->bd_stat_lock);
+		mutex_lock(&part_to_dev(hd)->mutex);
 		part_set_stat_time(hd);
 		*inflight = blk_mq_in_flight_with_stat(q, hd);
-		spin_unlock(&hd->bd_stat_lock);
+		mutex_unlock(&part_to_dev(hd)->mutex);
 		part_stat_unlock();
 	} else {
 		*inflight = part_in_flight(hd);
diff --git a/block/partitions/core.c b/block/partitions/core.c
index 92c723c19bb0..8f32f3cd0ede 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -416,7 +416,6 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno,
 	p->partno = partno;
 	p->read_only = get_disk_ro(disk) | test_bit(partno, disk->user_ro_bitmap);
 	p->stat_time = 0;
-	spin_lock_init(&p->bd_stat_lock);
 
 	if (info) {
 		struct partition_meta_info *pinfo;
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 07122c79210c..05927a1c6b5b 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -63,8 +63,6 @@ struct hd_struct {
 	seqcount_t nr_sects_seq;
 #endif
 	unsigned long stamp;
-	spinlock_t bd_stat_lock;
-	u64 stat_time;
 	struct disk_stats __percpu *dkstats;
 	struct percpu_ref ref;
 
@@ -78,7 +76,7 @@ struct hd_struct {
 #endif
 	struct rcu_work rcu_work;
 
-	KABI_RESERVE(1)
+	KABI_USE(1, u64 stat_time)
 	KABI_RESERVE(2)
 	KABI_RESERVE(3)
 	KABI_RESERVE(4)
-- 
Gitee


From 8d1b294dcbd95a1a301245f11ee6fbdd9b23f019 Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@huawei.com>
Date: Tue, 12 Jul 2022 22:51:47 +0800
Subject: [PATCH 152/674] block: fix sleeping function called from invalid
 context in part_get_stat_info()

hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I57S8D
CVE: NA

--------------------------------

part_get_stat_info() call mutex_lock() inside part_stat_lock(), which
is wrong because part_stat_lock() disables preempt.

Fix the problem by hold mutex first.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 block/genhd.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/genhd.c b/block/genhd.c
index cc114dd0265b..fcd6210417bc 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1311,12 +1311,12 @@ static void part_get_stat_info(struct hd_struct *hd, struct disk_stats *stat,
 	struct request_queue *q = part_to_disk(hd)->queue;
 
 	if (queue_is_mq(q)) {
-		part_stat_lock();
 		mutex_lock(&part_to_dev(hd)->mutex);
+		part_stat_lock();
 		part_set_stat_time(hd);
 		*inflight = blk_mq_in_flight_with_stat(q, hd);
-		mutex_unlock(&part_to_dev(hd)->mutex);
 		part_stat_unlock();
+		mutex_unlock(&part_to_dev(hd)->mutex);
 	} else {
 		*inflight = part_in_flight(hd);
 	}
-- 
Gitee


From ae7eb31ec422f1a1c644c7c3a73feb11d8657b99 Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@huawei.com>
Date: Tue, 12 Jul 2022 22:51:48 +0800
Subject: [PATCH 153/674] block: fix that iostat can show huge wait time

hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I57S8D
CVE: NA

--------------------------------

There might be a problem that iostat can read less 'nsecs' than the
last time.

1) io is started after 'hd->stat_time' is set.

2) following concurrent scenario:

t1                              t2
                                blk_mq_end_request
				 time1 -> before hd->stat_time

				 blk_account_io_done
part_get_stat_info
 part_set_stat_time
  hd->stat_time = time2
  -> time1 < time2
 blk_mq_in_flight_with_stat
  blk_mq_check_inflight_with_stat
   cmpxchg64()
   -> set stat_time_ns to time2
				  cmpxchg64()
				  -> set stat_time to time1
				  duation = time1 - time2;
				  -> time1 < time2
				  part_stat_add(xx, nsecs, duation)
				  -> problematic

3) Similar concurrent scenario the other way around.

Fix the problem by don't add 'duation' if the calculation might underflow.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 block/blk-core.c |  3 ++-
 block/blk-mq.c   | 12 ++++++++++--
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 8b93366c76e4..715b61c239ea 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1304,7 +1304,8 @@ void blk_account_io_done(struct request *req, u64 now)
 		 * This might fail if 'stat_time_ns' is updated
 		 * in blk_mq_check_inflight_with_stat().
 		 */
-		if (likely(cmpxchg64(&rq_wrapper->stat_time_ns, stat_time, now)
+		if (likely(now > stat_time &&
+			   cmpxchg64(&rq_wrapper->stat_time_ns, stat_time, now)
 			   == stat_time)) {
 			u64 duation = stat_time ? now - stat_time :
 				now - req->start_time_ns;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 76ff229e7fb2..5031c4fc4412 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -117,14 +117,22 @@ static bool blk_mq_check_inflight_with_stat(struct blk_mq_hw_ctx *hctx,
 		if (!rq->part)
 			return true;
 
+		/*
+		 * If the request is started after 'part->stat_time' is set,
+		 * don't update 'nsces' here.
+		 */
+		if (rq->part->stat_time <= rq->start_time_ns)
+			return true;
+
 		rq_wrapper = request_to_wrapper(rq);
 		stat_time = READ_ONCE(rq_wrapper->stat_time_ns);
 		/*
 		 * This might fail if 'stat_time_ns' is updated in
 		 * blk_account_io_done().
 		 */
-		if (likely(cmpxchg64(&rq_wrapper->stat_time_ns, stat_time,
-				   rq->part->stat_time) == stat_time)) {
+		if (likely(rq->part->stat_time > stat_time &&
+			   cmpxchg64(&rq_wrapper->stat_time_ns, stat_time,
+				     rq->part->stat_time) == stat_time)) {
 			int sgrp = op_stat_group(req_op(rq));
 			u64 duation = stat_time ?
 				rq->part->stat_time - stat_time :
-- 
Gitee


From 2ed9d17cc07fcbbdfe37df9169164ebd8c547b00 Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@huawei.com>
Date: Tue, 12 Jul 2022 22:51:49 +0800
Subject: [PATCH 154/674] block: don't use cmpxchg64() on 32-bit platform

hulk inclusion
category: bugfix
bugzilla: 186921
CVE: NA

--------------------------------

Some 32-bit platform doesn't support cmpxchg64(), using it in generic
code will cause compile error.

Fixes: 4c8f034bf1e6 ("[Huawei] block: update nsecs[] in part_stat_show() and diskstats_show()")
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 block/blk-core.c | 6 ++++++
 block/blk-mq.c   | 3 ++-
 block/blk-mq.h   | 2 ++
 block/genhd.c    | 8 ++++++--
 4 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 715b61c239ea..109fb2750453 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1292,13 +1292,16 @@ void blk_account_io_done(struct request *req, u64 now)
 	    !(req->rq_flags & RQF_FLUSH_SEQ)) {
 		const int sgrp = op_stat_group(req_op(req));
 		struct hd_struct *part;
+#ifdef CONFIG_64BIT
 		u64 stat_time;
 		struct request_wrapper *rq_wrapper = request_to_wrapper(req);
+#endif
 
 		part_stat_lock();
 		part = req->part;
 		update_io_ticks(part, jiffies, true);
 		part_stat_inc(part, ios[sgrp]);
+#ifdef CONFIG_64BIT
 		stat_time = READ_ONCE(rq_wrapper->stat_time_ns);
 		/*
 		 * This might fail if 'stat_time_ns' is updated
@@ -1312,6 +1315,9 @@ void blk_account_io_done(struct request *req, u64 now)
 
 			part_stat_add(req->part, nsecs[sgrp], duation);
 		}
+#else
+		part_stat_add(part, nsecs[sgrp], now - req->start_time_ns);
+#endif
 		part_stat_unlock();
 
 		hd_struct_put(part);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 5031c4fc4412..1941ffc4db85 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -102,6 +102,7 @@ struct mq_inflight {
 	unsigned int inflight[2];
 };
 
+#ifdef CONFIG_64BIT
 static bool blk_mq_check_inflight_with_stat(struct blk_mq_hw_ctx *hctx,
 					    struct request *rq, void *priv,
 					    bool reserved)
@@ -154,7 +155,7 @@ unsigned int blk_mq_in_flight_with_stat(struct request_queue *q,
 
 	return mi.inflight[0] + mi.inflight[1];
 }
-
+#endif
 
 static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
 				  struct request *rq, void *priv,
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 6897b4c09b7c..ad2d74f887f2 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -187,8 +187,10 @@ static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx)
 unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part);
 void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part,
 			 unsigned int inflight[2]);
+#ifdef CONFIG_64BIT
 unsigned int blk_mq_in_flight_with_stat(struct request_queue *q,
 					struct hd_struct *part);
+#endif
 
 static inline void blk_mq_put_dispatch_budget(struct request_queue *q)
 {
diff --git a/block/genhd.c b/block/genhd.c
index fcd6210417bc..8b37fcfa10d1 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1293,6 +1293,7 @@ ssize_t part_size_show(struct device *dev,
 		(unsigned long long)part_nr_sects_read(p));
 }
 
+#ifdef CONFIG_64BIT
 static void part_set_stat_time(struct hd_struct *hd)
 {
 	u64 now = ktime_get_ns();
@@ -1304,12 +1305,13 @@ static void part_set_stat_time(struct hd_struct *hd)
 		goto again;
 	}
 }
+#endif
 
 static void part_get_stat_info(struct hd_struct *hd, struct disk_stats *stat,
 			       unsigned int *inflight)
 {
+#ifdef CONFIG_64BIT
 	struct request_queue *q = part_to_disk(hd)->queue;
-
 	if (queue_is_mq(q)) {
 		mutex_lock(&part_to_dev(hd)->mutex);
 		part_stat_lock();
@@ -1320,7 +1322,9 @@ static void part_get_stat_info(struct hd_struct *hd, struct disk_stats *stat,
 	} else {
 		*inflight = part_in_flight(hd);
 	}
-
+#else
+	*inflight = part_in_flight(hd);
+#endif
 	if (*inflight) {
 		part_stat_lock();
 		update_io_ticks(hd, jiffies, true);
-- 
Gitee


From 499ecade21fc377c04ec66ed7f0505f1ca74d755 Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@huawei.com>
Date: Tue, 12 Jul 2022 22:51:50 +0800
Subject: [PATCH 155/674] mm/filemap: fix that first page is not mark accessed
 in filemap_read()

hulk inclusion
category: bugfix
bugzilla: 186896, https://gitee.com/src-openeuler/kernel/issues/I5FRAP
CVE: NA

--------------------------------

In filemap_read(), first page will not mark accessed if previous page
is equal to the current page:

if (iocb->ki_pos >> PAGE_SHIFT != ra->prev_pos >> PAGE_SHIFT))
	folio_mark_accessed(fbatch.folios[0]);

However, 'prev_pos' is set to 'ki_pos + copied' during last read, which
means 'prev_pos' can be equal to 'ki_pos' in this read, thus previous
page can be miscaculated.

Fix the problem by setting 'prev_pos' to the start offset of last read,
so that 'prev_pos >> PAGE_SHIFT' will be previous page as expected.

Fixes: 06c0444290ce ("mm/filemap.c: generic_file_buffered_read() now uses find_get_pages_contig")
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 mm/filemap.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index 3958fc3280d8..9653c1e0faef 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2549,10 +2549,11 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
 				flush_dcache_page(pages[i]);
 
 			copied = copy_page_to_iter(pages[i], offset, bytes, iter);
-
-			written += copied;
-			iocb->ki_pos += copied;
-			ra->prev_pos = iocb->ki_pos;
+			if (copied) {
+				ra->prev_pos = iocb->ki_pos;
+				written += copied;
+				iocb->ki_pos += copied;
+			}
 
 			if (copied < bytes) {
 				error = -EFAULT;
-- 
Gitee


From ac90783f66b8154e56a3779206a18a1c5ee6d020 Mon Sep 17 00:00:00 2001
From: Li Huafei <lihuafei1@huawei.com>
Date: Tue, 12 Jul 2022 22:51:51 +0800
Subject: [PATCH 156/674] livepatch/ppc64: Fix several compilation errors in
 unwind_frame()

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5CJ7X

--------------------------------

Fix the spelling error of 'CONFIG_ FUNCTION_GRAPH_TRACE' and two
compilation errors:
- 'ftrace_idx' is not defined
- 'struct stackframe' does not have a member named 'ip'

Fixes: ac81d625f3da ("livepatch/ppc64: Implement livepatch without ftrace for ppc64be")
Signed-off-by: Li Huafei <lihuafei1@huawei.com>
Reviewed-by: Xu Kuohai <xukuohai@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 arch/powerpc/kernel/livepatch_64.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/livepatch_64.c b/arch/powerpc/kernel/livepatch_64.c
index f008b3beb001..2487fab569a7 100644
--- a/arch/powerpc/kernel/livepatch_64.c
+++ b/arch/powerpc/kernel/livepatch_64.c
@@ -250,6 +250,9 @@ static int unwind_frame(struct task_struct *tsk, struct stackframe *frame)
 {
 
 	unsigned long *stack;
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	int ftrace_idx = 0;
+#endif
 
 	if (!validate_sp(frame->sp, tsk, STACK_FRAME_OVERHEAD))
 		return -1;
@@ -279,12 +282,12 @@ static int unwind_frame(struct task_struct *tsk, struct stackframe *frame)
 
 	frame->sp = stack[0];
 	frame->pc = stack[STACK_FRAME_LR_SAVE];
-#ifdef CONFIG_FUNCTION_GRAPH_TRACE
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
 	/*
 	 * IMHO these tests do not belong in
 	 * arch-dependent code, they are generic.
 	 */
-	frame->pc = ftrace_graph_ret_addr(tsk, &ftrace_idx, frame->ip, stack);
+	frame->pc = ftrace_graph_ret_addr(tsk, &ftrace_idx, frame->pc, stack);
 #endif
 
 	return 0;
-- 
Gitee


From 27d852d29cd2f760d33cc4aa2244de3f6ddf5ebd Mon Sep 17 00:00:00 2001
From: Li Huafei <lihuafei1@huawei.com>
Date: Tue, 12 Jul 2022 22:51:52 +0800
Subject: [PATCH 157/674] livepatch/ppc64: Fix the stack check for exception
 frames

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5CJ7X

--------------------------------

For exception frames, we need to check NIP in addition to PC. When a
task is interrupted, NIP is the interrupt return address. The function
where the NIP is located is also on the stack.

Fixes: ec2244b5adcf ("livepatch/ppc64: only check stack top")
Fixes: 2a7c3db6e6a8 ("livepatch/powerpc64: Add arch_klp_module_check_calltrace")
Signed-off-by: Li Huafei <lihuafei1@huawei.com>
Reviewed-by: Xu Kuohai <xukuohai@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 arch/powerpc/kernel/livepatch_64.c | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/kernel/livepatch_64.c b/arch/powerpc/kernel/livepatch_64.c
index 2487fab569a7..acd3658d37a3 100644
--- a/arch/powerpc/kernel/livepatch_64.c
+++ b/arch/powerpc/kernel/livepatch_64.c
@@ -326,9 +326,14 @@ static int klp_check_jump_func(struct stackframe *frame, void *data)
 	struct walk_stackframe_args *args = data;
 	struct klp_func_list *check_funcs = args->check_funcs;
 
-	if (!check_func_list(check_funcs, &args->ret, frame->pc)) {
+	/* check the PC first */
+	if (!check_func_list(check_funcs, &args->ret, frame->pc))
 		return args->ret;
-	}
+
+	/* check NIP when the exception stack switching */
+	if (frame->nip && !check_func_list(check_funcs, &args->ret, frame->nip))
+		return args->ret;
+
 	return 0;
 }
 
@@ -430,11 +435,19 @@ static int check_module_calltrace(struct stackframe *frame, void *data)
 {
 	struct walk_stackframe_args *args = data;
 
-	if (within_module_core(frame->pc, args->mod)) {
-		pr_err("module %s is in use!\n", args->mod->name);
-		return (args->ret = -EBUSY);
-	}
+	/* check the PC first */
+	if (within_module_core(frame->pc, args->mod))
+		goto err_out;
+
+	/* check NIP when the exception stack switching */
+	if (frame->nip && within_module_core(frame->nip, args->mod))
+		goto err_out;
+
 	return 0;
+
+err_out:
+	pr_err("module %s is in use!\n", args->mod->name);
+	return (args->ret = -EBUSY);
 }
 
 int arch_klp_module_check_calltrace(void *data)
-- 
Gitee


From 7a64eb335c3b048001238d9b36bf2bedef1d5b62 Mon Sep 17 00:00:00 2001
From: Li Huafei <lihuafei1@huawei.com>
Date: Tue, 12 Jul 2022 22:51:53 +0800
Subject: [PATCH 158/674] livepatch/ppc32: Fix the stack check for exception
 frames

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5CJ7X

--------------------------------

Currently, the stack check of ppc32 does not consider exception frames,
and the interrupted functions are omitted. Similar to ppc64, special
processing of exception frames should be added.

Because the stack frame structure of ppc32 and ppc64 is the same, we can
reuse the unwind_frame() code of ppc64. Then during the stack check, the
exception frames need to check the NIP in addition to the PC, which is
the function that is interrupted.

Fixes: e22fb775aaf6 ("livepatch/ppc32: Support livepatch without ftrace")
Fixes: 1daef7b0cae5 ("livepatch/powerpc32: Add arch_klp_module_check_calltrace")
Signed-off-by: Li Huafei <lihuafei1@huawei.com>
Reviewed-by: Xu Kuohai <xukuohai@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 arch/powerpc/include/asm/livepatch.h |  7 ++++
 arch/powerpc/kernel/livepatch.c      | 47 ++++++++++++++++++++++++
 arch/powerpc/kernel/livepatch_32.c   | 47 +++++++++++-------------
 arch/powerpc/kernel/livepatch_64.c   | 55 +---------------------------
 4 files changed, 76 insertions(+), 80 deletions(-)

diff --git a/arch/powerpc/include/asm/livepatch.h b/arch/powerpc/include/asm/livepatch.h
index bafbfaba190f..39dcfc3c28ce 100644
--- a/arch/powerpc/include/asm/livepatch.h
+++ b/arch/powerpc/include/asm/livepatch.h
@@ -118,6 +118,12 @@ struct arch_klp_data {
 
 #endif	/* CONFIG_PPC64 */
 
+struct stackframe {
+	unsigned long sp;
+	unsigned long pc;
+	unsigned long nip;
+};
+
 #ifdef PPC64_ELF_ABI_v1
 struct klp_func_node;
 void arch_klp_set_brk_func(struct klp_func_node *func_node, void *new_func);
@@ -127,6 +133,7 @@ int arch_klp_add_breakpoint(struct arch_klp_data *arch_data, void *old_func);
 void arch_klp_remove_breakpoint(struct arch_klp_data *arch_data, void *old_func);
 long arch_klp_save_old_code(struct arch_klp_data *arch_data, void *old_func);
 int arch_klp_module_check_calltrace(void *data);
+int klp_unwind_frame(struct task_struct *tsk, struct stackframe *frame);
 
 #endif /* CONFIG_LIVEPATCH_FTRACE */
 
diff --git a/arch/powerpc/kernel/livepatch.c b/arch/powerpc/kernel/livepatch.c
index b8afcc7b9939..d568e8c8b16b 100644
--- a/arch/powerpc/kernel/livepatch.c
+++ b/arch/powerpc/kernel/livepatch.c
@@ -21,6 +21,7 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/module.h>
+#include <linux/ftrace.h>
 #include <linux/livepatch.h>
 #include <asm/probes.h>
 #include <asm/livepatch.h>
@@ -65,3 +66,49 @@ int klp_brk_handler(struct pt_regs *regs)
 
 	return 1;
 }
+
+int klp_unwind_frame(struct task_struct *tsk, struct stackframe *frame)
+{
+	unsigned long *stack;
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	int ftrace_idx = 0;
+#endif
+
+	if (!validate_sp(frame->sp, tsk, STACK_FRAME_OVERHEAD))
+		return -1;
+
+	if (frame->nip != 0)
+		frame->nip = 0;
+
+	stack = (unsigned long *)frame->sp;
+
+	/*
+	 * When switching to the exception stack,
+	 * we save the NIP in pt_regs
+	 *
+	 * See if this is an exception frame.
+	 * We look for the "regshere" marker in the current frame.
+	 */
+	if (validate_sp(frame->sp, tsk, STACK_INT_FRAME_SIZE)
+	    && stack[STACK_FRAME_MARKER] == STACK_FRAME_REGS_MARKER) {
+		struct pt_regs *regs = (struct pt_regs *)
+			(frame->sp + STACK_FRAME_OVERHEAD);
+		frame->nip = regs->nip;
+		pr_debug("--- interrupt: task = %d/%s, trap %lx at NIP=x%lx/%pS, LR=0x%lx/%pS\n",
+			tsk->pid, tsk->comm, regs->trap,
+			regs->nip, (void *)regs->nip,
+			regs->link, (void *)regs->link);
+	}
+
+	frame->sp = stack[0];
+	frame->pc = stack[STACK_FRAME_LR_SAVE];
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	/*
+	 * IMHO these tests do not belong in
+	 * arch-dependent code, they are generic.
+	 */
+	frame->pc = ftrace_graph_ret_addr(tsk, &ftrace_idx, frame->pc, stack);
+#endif
+
+	return 0;
+}
diff --git a/arch/powerpc/kernel/livepatch_32.c b/arch/powerpc/kernel/livepatch_32.c
index 603f1d61cc23..8f53386e7cf8 100644
--- a/arch/powerpc/kernel/livepatch_32.c
+++ b/arch/powerpc/kernel/livepatch_32.c
@@ -62,11 +62,6 @@ struct klp_func_list {
 	int force;
 };
 
-struct stackframe {
-	unsigned long sp;
-	unsigned long pc;
-};
-
 struct walk_stackframe_args {
 	int enable;
 	struct klp_func_list *check_funcs;
@@ -226,20 +221,6 @@ static int klp_check_activeness_func(struct klp_patch *patch, int enable,
 	return 0;
 }
 
-static int unwind_frame(struct task_struct *tsk, struct stackframe *frame)
-{
-
-	unsigned long *stack;
-
-	if (!validate_sp(frame->sp, tsk, STACK_FRAME_OVERHEAD))
-		return -1;
-
-	stack = (unsigned long *)frame->sp;
-	frame->sp = stack[0];
-	frame->pc = stack[STACK_FRAME_LR_SAVE];
-	return 0;
-}
-
 void notrace klp_walk_stackframe(struct stackframe *frame,
 		int (*fn)(struct stackframe *, void *),
 		struct task_struct *tsk, void *data)
@@ -249,7 +230,7 @@ void notrace klp_walk_stackframe(struct stackframe *frame,
 
 		if (fn(frame, data))
 			break;
-		ret = unwind_frame(tsk, frame);
+		ret = klp_unwind_frame(tsk, frame);
 		if (ret < 0)
 			break;
 	}
@@ -273,9 +254,14 @@ static int klp_check_jump_func(struct stackframe *frame, void *data)
 	struct walk_stackframe_args *args = data;
 	struct klp_func_list *check_funcs = args->check_funcs;
 
-	if (!check_func_list(check_funcs, &args->ret, frame->pc)) {
+	/* check the PC first */
+	if (!check_func_list(check_funcs, &args->ret, frame->pc))
 		return args->ret;
-	}
+
+	/* check NIP when the exception stack switching */
+	if (frame->nip && !check_func_list(check_funcs, &args->ret, frame->nip))
+		return args->ret;
+
 	return 0;
 }
 
@@ -334,6 +320,7 @@ static int do_check_calltrace(struct walk_stackframe_args *args,
 
 		frame.sp = (unsigned long)stack;
 		frame.pc = stack[STACK_FRAME_LR_SAVE];
+		frame.nip = 0;
 		klp_walk_stackframe(&frame, fn, t, args);
 		if (args->ret) {
 			pr_info("PID: %d Comm: %.20s\n", t->pid, t->comm);
@@ -372,11 +359,19 @@ static int check_module_calltrace(struct stackframe *frame, void *data)
 {
 	struct walk_stackframe_args *args = data;
 
-	if (within_module_core(frame->pc, args->mod)) {
-		pr_err("module %s is in use!\n", args->mod->name);
-		return (args->ret = -EBUSY);
-	}
+	/* check the PC first */
+	if (within_module_core(frame->pc, args->mod))
+		goto err_out;
+
+	/* check NIP when the exception stack switching */
+	if (frame->nip && within_module_core(frame->nip, args->mod))
+		goto err_out;
+
 	return 0;
+
+err_out:
+	pr_err("module %s is in use!\n", args->mod->name);
+	return (args->ret = -EBUSY);
 }
 
 int arch_klp_module_check_calltrace(void *data)
diff --git a/arch/powerpc/kernel/livepatch_64.c b/arch/powerpc/kernel/livepatch_64.c
index acd3658d37a3..cbb5e02cccff 100644
--- a/arch/powerpc/kernel/livepatch_64.c
+++ b/arch/powerpc/kernel/livepatch_64.c
@@ -67,12 +67,6 @@ struct klp_func_list {
 	int force;
 };
 
-struct stackframe {
-	unsigned long sp;
-	unsigned long pc;
-	unsigned long nip;
-};
-
 struct walk_stackframe_args {
 	int enable;
 	struct klp_func_list *check_funcs;
@@ -246,53 +240,6 @@ static int klp_check_activeness_func(struct klp_patch *patch, int enable,
 	return 0;
 }
 
-static int unwind_frame(struct task_struct *tsk, struct stackframe *frame)
-{
-
-	unsigned long *stack;
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-	int ftrace_idx = 0;
-#endif
-
-	if (!validate_sp(frame->sp, tsk, STACK_FRAME_OVERHEAD))
-		return -1;
-
-	if (frame->nip != 0)
-		frame->nip = 0;
-
-	stack = (unsigned long *)frame->sp;
-
-	/*
-	 * When switching to the exception stack,
-	 * we save the NIP in pt_regs
-	 *
-	 * See if this is an exception frame.
-	 * We look for the "regshere" marker in the current frame.
-	 */
-	if (validate_sp(frame->sp, tsk, STACK_INT_FRAME_SIZE)
-	    && stack[STACK_FRAME_MARKER] == STACK_FRAME_REGS_MARKER) {
-		struct pt_regs *regs = (struct pt_regs *)
-			(frame->sp + STACK_FRAME_OVERHEAD);
-		frame->nip = regs->nip;
-		pr_debug("--- interrupt: task = %d/%s, trap %lx at NIP=x%lx/%pS, LR=0x%lx/%pS\n",
-			tsk->pid, tsk->comm, regs->trap,
-			regs->nip, (void *)regs->nip,
-			regs->link, (void *)regs->link);
-	}
-
-	frame->sp = stack[0];
-	frame->pc = stack[STACK_FRAME_LR_SAVE];
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-	/*
-	 * IMHO these tests do not belong in
-	 * arch-dependent code, they are generic.
-	 */
-	frame->pc = ftrace_graph_ret_addr(tsk, &ftrace_idx, frame->pc, stack);
-#endif
-
-	return 0;
-}
-
 static void notrace klp_walk_stackframe(struct stackframe *frame,
 		int (*fn)(struct stackframe *, void *),
 		struct task_struct *tsk, void *data)
@@ -302,7 +249,7 @@ static void notrace klp_walk_stackframe(struct stackframe *frame,
 
 		if (fn(frame, data))
 			break;
-		ret = unwind_frame(tsk, frame);
+		ret = klp_unwind_frame(tsk, frame);
 		if (ret < 0)
 			break;
 	}
-- 
Gitee


From bf06a577f02ef3a1a74e138211be5639e0227178 Mon Sep 17 00:00:00 2001
From: Guo Mengqi <guomengqi3@huawei.com>
Date: Tue, 12 Jul 2022 22:51:54 +0800
Subject: [PATCH 159/674] Revert "iommu: handle page response timeout"

hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5EOOG
CVE: NA

--------------------------------

This reverts commit da76349ca8776aa7f8b186010005fb563fb163bb.
However, the iommu_fault_param and iommu_fault_event changes
are reserved to avoid KABI change.

Signed-off-by: Guo Mengqi <guomengqi3@huawei.com>
Reviewed-by: Weilong Chen <chenweilong@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/iommu/iommu.c | 55 -------------------------------------------
 include/linux/iommu.h |  4 ++--
 2 files changed, 2 insertions(+), 57 deletions(-)

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 9116c93945d0..97953fa27630 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -1084,39 +1084,6 @@ int iommu_group_unregister_notifier(struct iommu_group *group,
 }
 EXPORT_SYMBOL_GPL(iommu_group_unregister_notifier);
 
-static void iommu_dev_fault_timer_fn(struct timer_list *t)
-{
-	struct iommu_fault_param *fparam = from_timer(fparam, t, timer);
-	struct iommu_fault_event *evt;
-	struct iommu_fault_page_request *prm;
-
-	u64 now;
-
-	now = get_jiffies_64();
-
-	/* The goal is to ensure driver or guest page fault handler(via vfio)
-	 * send page response on time. Otherwise, limited queue resources
-	 * may be occupied by some irresponsive guests or drivers.
-	 * When per device pending fault list is not empty, we periodically checks
-	 * if any anticipated page response time has expired.
-	 *
-	 * TODO:
-	 * We could do the following if response time expires:
-	 * 1. send page response code FAILURE to all pending PRQ
-	 * 2. inform device driver or vfio
-	 * 3. drain in-flight page requests and responses for this device
-	 * 4. clear pending fault list such that driver can unregister fault
-	 *    handler(otherwise blocked when pending faults are present).
-	 */
-	list_for_each_entry(evt, &fparam->faults, list) {
-		prm = &evt->fault.prm;
-		if (time_after64(now, evt->expire))
-			pr_err("Page response time expired!, pasid %d gid %d exp %llu now %llu\n",
-				prm->pasid, prm->grpid, evt->expire, now);
-	}
-	mod_timer(t, now + prq_timeout);
-}
-
 /**
  * iommu_register_device_fault_handler() - Register a device fault handler
  * @dev: the device
@@ -1164,9 +1131,6 @@ int iommu_register_device_fault_handler(struct device *dev,
 	mutex_init(&param->fault_param->lock);
 	INIT_LIST_HEAD(&param->fault_param->faults);
 
-	if (prq_timeout)
-		timer_setup(&param->fault_param->timer, iommu_dev_fault_timer_fn,
-			TIMER_DEFERRABLE);
 done_unlock:
 	mutex_unlock(&param->lock);
 
@@ -1306,9 +1270,7 @@ int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt)
 	struct dev_iommu *param = dev->iommu;
 	struct iommu_fault_event *evt_pending = NULL;
 	struct iommu_fault_param *fparam;
-	struct timer_list *tmr;
 	int ret = 0;
-	u64 exp;
 
 	if (!param || !evt || WARN_ON_ONCE(!iommu_fault_valid(&evt->fault)))
 		return -EINVAL;
@@ -1329,17 +1291,7 @@ int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt)
 			ret = -ENOMEM;
 			goto done_unlock;
 		}
-		/* Keep track of response expiration time */
-		exp = get_jiffies_64() + prq_timeout;
-		evt_pending->expire = exp;
 		mutex_lock(&fparam->lock);
-		if (list_empty(&fparam->faults)) {
-			/* First pending event, start timer */
-			tmr = &fparam->timer;
-			WARN_ON(timer_pending(tmr));
-			mod_timer(tmr, exp);
-		}
-
 		list_add_tail(&evt_pending->list, &fparam->faults);
 		mutex_unlock(&fparam->lock);
 	}
@@ -1417,13 +1369,6 @@ int iommu_page_response(struct device *dev,
 		break;
 	}
 
-	/* stop response timer if no more pending request */
-	if (list_empty(&param->fault_param->faults) &&
-		timer_pending(&param->fault_param->timer)) {
-		pr_debug("no pending PRQ, stop timer\n");
-		del_timer(&param->fault_param->timer);
-	}
-
 done_unlock:
 	mutex_unlock(&param->fault_param->lock);
 	return ret;
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 8baf5ed66a84..092384b71ab2 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -393,7 +393,7 @@ struct iommu_device {
 struct iommu_fault_event {
 	struct iommu_fault fault;
 	struct list_head list;
-	u64 expire;
+	_KABI_DEPRECATE(u64, expire);
 };
 
 /**
@@ -408,7 +408,7 @@ struct iommu_fault_param {
 	iommu_dev_fault_handler_t handler;
 	void *data;
 	struct list_head faults;
-	struct timer_list timer;
+	_KABI_DEPRECATE(struct timer_list, timer);
 	struct mutex lock;
 };
 
-- 
Gitee


From 3e8545091ac1afe772c6a79e8b015bad7284129f Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Tue, 12 Jul 2022 22:51:55 +0800
Subject: [PATCH 160/674] fs: introduce a wrapper uuid_to_fsid()

mainline inclusion
from mainline-5.13-rc1
commit 9591c3a34f7722bd77f42c98d76fd5a5bad465f0
category: feature
bugzilla: 187162, https://gitee.com/openeuler/kernel/issues/I5FYKV
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=9591c3a34f7722bd77f42c98d76fd5a5bad465f0

-------------------------------------------------

Some filesystem's use a digest of their uuid for f_fsid.
Create a simple wrapper for this open coded folding.

Filesystems that have a non null uuid but use the block device
number for f_fsid may also consider using this helper.

[JK: Added missing asm/byteorder.h include]
Link: https://lore.kernel.org/r/20210322173944.449469-2-amir73il@gmail.com
Acked-by: Damien Le Moal <damien.lemoal@wdc.com>
Reviewed-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Baokun Li <libaokun1@huawei.com>
Reviewed-by: chengzhihao <chengzhihao1@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 fs/ext2/super.c        | 5 +----
 fs/ext4/super.c        | 5 +----
 fs/zonefs/super.c      | 5 +----
 include/linux/statfs.h | 8 ++++++++
 4 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index b6314d3c6a87..74e110289413 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1403,7 +1403,6 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
 	struct super_block *sb = dentry->d_sb;
 	struct ext2_sb_info *sbi = EXT2_SB(sb);
 	struct ext2_super_block *es = sbi->s_es;
-	u64 fsid;
 
 	spin_lock(&sbi->s_lock);
 
@@ -1457,9 +1456,7 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
 	buf->f_ffree = ext2_count_free_inodes(sb);
 	es->s_free_inodes_count = cpu_to_le32(buf->f_ffree);
 	buf->f_namelen = EXT2_NAME_LEN;
-	fsid = le64_to_cpup((void *)es->s_uuid) ^
-	       le64_to_cpup((void *)es->s_uuid + sizeof(u64));
-	buf->f_fsid = u64_to_fsid(fsid);
+	buf->f_fsid = uuid_to_fsid(es->s_uuid);
 	spin_unlock(&sbi->s_lock);
 	return 0;
 }
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 5e992775fc30..26c938360895 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -6246,7 +6246,6 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_super_block *es = sbi->s_es;
 	ext4_fsblk_t overhead = 0, resv_blocks;
-	u64 fsid;
 	s64 bfree;
 	resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters));
 
@@ -6267,9 +6266,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_files = le32_to_cpu(es->s_inodes_count);
 	buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
 	buf->f_namelen = EXT4_NAME_LEN;
-	fsid = le64_to_cpup((void *)es->s_uuid) ^
-	       le64_to_cpup((void *)es->s_uuid + sizeof(u64));
-	buf->f_fsid = u64_to_fsid(fsid);
+	buf->f_fsid = uuid_to_fsid(es->s_uuid);
 
 #ifdef CONFIG_QUOTA
 	if (ext4_test_inode_flag(dentry->d_inode, EXT4_INODE_PROJINHERIT) &&
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index e60759d8bb5f..2d1f5d4d12e0 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -1169,7 +1169,6 @@ static int zonefs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	struct super_block *sb = dentry->d_sb;
 	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
 	enum zonefs_ztype t;
-	u64 fsid;
 
 	buf->f_type = ZONEFS_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
@@ -1192,9 +1191,7 @@ static int zonefs_statfs(struct dentry *dentry, struct kstatfs *buf)
 
 	spin_unlock(&sbi->s_lock);
 
-	fsid = le64_to_cpup((void *)sbi->s_uuid.b) ^
-		le64_to_cpup((void *)sbi->s_uuid.b + sizeof(u64));
-	buf->f_fsid = u64_to_fsid(fsid);
+	buf->f_fsid = uuid_to_fsid(sbi->s_uuid.b);
 
 	return 0;
 }
diff --git a/include/linux/statfs.h b/include/linux/statfs.h
index 20f695b90aab..02c862686ea3 100644
--- a/include/linux/statfs.h
+++ b/include/linux/statfs.h
@@ -4,6 +4,7 @@
 
 #include <linux/types.h>
 #include <asm/statfs.h>
+#include <asm/byteorder.h>
 
 struct kstatfs {
 	long f_type;
@@ -50,4 +51,11 @@ static inline __kernel_fsid_t u64_to_fsid(u64 v)
 	return (__kernel_fsid_t){.val = {(u32)v, (u32)(v>>32)}};
 }
 
+/* Fold 16 bytes uuid to 64 bit fsid */
+static inline __kernel_fsid_t uuid_to_fsid(__u8 *uuid)
+{
+	return u64_to_fsid(le64_to_cpup((void *)uuid) ^
+		le64_to_cpup((void *)(uuid + sizeof(u64))));
+}
+
 #endif
-- 
Gitee


From a3fbc9d494df45ee9316918c4286bbe594faf6e4 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Tue, 12 Jul 2022 22:51:56 +0800
Subject: [PATCH 161/674] shmem: allow reporting fanotify events with file
 handles on tmpfs

mainline inclusion
from mainline-5.13-rc1
commit 59cda49ecf6c9a32fae4942420701b6e087204f6
category: feature
bugzilla: 187162, https://gitee.com/openeuler/kernel/issues/I5FYKV
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=59cda49ecf6c9a32fae4942420701b6e087204f6

-------------------------------------------------

Since kernel v5.1, fanotify_init(2) supports the flag FAN_REPORT_FID
for identifying objects using file handle and fsid in events.

fanotify_mark(2) fails with -ENODEV when trying to set a mark on
filesystems that report null f_fsid in stasfs(2).

Use the digest of uuid as f_fsid for tmpfs to uniquely identify tmpfs
objects as best as possible and allow setting an fanotify mark that
reports events with file handles on tmpfs.

Link: https://lore.kernel.org/r/20210322173944.449469-3-amir73il@gmail.com
Acked-by: Hugh Dickins <hughd@google.com>
Reviewed-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Baokun Li <libaokun1@huawei.com>
Reviewed-by: chengzhihao <chengzhihao1@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 mm/shmem.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/mm/shmem.c b/mm/shmem.c
index 9df016296347..201086030e72 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2869,6 +2869,9 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
 		buf->f_ffree = sbinfo->free_inodes;
 	}
 	/* else leave those fields 0 like simple_statfs */
+
+	buf->f_fsid = uuid_to_fsid(dentry->d_sb->s_uuid.b);
+
 	return 0;
 }
 
-- 
Gitee


From 756a3804723a3dac3018968ad9e0cec674418709 Mon Sep 17 00:00:00 2001
From: Liu Shixin <liushixin2@huawei.com>
Date: Tue, 12 Jul 2022 22:51:57 +0800
Subject: [PATCH 162/674] mm/filemap: fix UAF in find_lock_entries

maillist inclusion
category: bugfix
bugzilla: 186821, https://gitee.com/openeuler/kernel/issues/I5G69G

Reference: https://lore.kernel.org/all/20220707020938.2122198-1-liushixin2@huawei.com/

--------------------------------

Release refcount after xas_set to fix UAF which may cause panic like this:

 page:ffffea000491fa40 refcount:1 mapcount:0 mapping:0000000000000000 index:0x1 pfn:0x1247e9
 head:ffffea000491fa00 order:3 compound_mapcount:0 compound_pincount:0
 memcg:ffff888104f91091
 flags: 0x2fffff80010200(slab|head|node=0|zone=2|lastcpupid=0x1fffff)
...
page dumped because: VM_BUG_ON_PAGE(PageTail(page))
 ------------[ cut here ]------------
 kernel BUG at include/linux/page-flags.h:632!
 invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC KASAN
 CPU: 1 PID: 7642 Comm: sh Not tainted 5.15.51-dirty #26
...
 Call Trace:
  <TASK>
  __invalidate_mapping_pages+0xe7/0x540
  drop_pagecache_sb+0x159/0x320
  iterate_supers+0x120/0x240
  drop_caches_sysctl_handler+0xaa/0xe0
  proc_sys_call_handler+0x2b4/0x480
  new_sync_write+0x3d6/0x5c0
  vfs_write+0x446/0x7a0
  ksys_write+0x105/0x210
  do_syscall_64+0x35/0x80
  entry_SYSCALL_64_after_hwframe+0x44/0xae
 RIP: 0033:0x7f52b5733130
...

This problem has been fixed on mainline by patch 6b24ca4a1a8d ("mm: Use
multi-index entries in the page cache") since it deletes the related code.

Fixes: 5c211ba29deb ("mm: add and use find_lock_entries")
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Acked-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Conflicts:
	mm/filemap.c
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 mm/filemap.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index 9653c1e0faef..ebae261f9df9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1951,7 +1951,11 @@ unsigned find_lock_entries(struct address_space *mapping, pgoff_t start,
 
 	rcu_read_lock();
 	while ((page = find_get_entry(&xas, end, XA_PRESENT))) {
+		unsigned long next_idx = xas.xa_index + 1;
+
 		if (!xa_is_value(page)) {
+			if (PageTransHuge(page))
+				next_idx = page->index + thp_nr_pages(page);
 			if (page->index < start)
 				goto put;
 			VM_BUG_ON_PAGE(page->index != xas.xa_index, page);
@@ -1973,13 +1977,11 @@ unsigned find_lock_entries(struct address_space *mapping, pgoff_t start,
 put:
 		put_page(page);
 next:
-		if (!xa_is_value(page) && PageTransHuge(page)) {
-			unsigned int nr_pages = thp_nr_pages(page);
-
+		if (next_idx != xas.xa_index + 1) {
 			/* Final THP may cross MAX_LFS_FILESIZE on 32-bit */
-			xas_set(&xas, page->index + nr_pages);
-			if (xas.xa_index < nr_pages)
+			if (next_idx < xas.xa_index)
 				break;
+			xas_set(&xas, next_idx);
 		}
 	}
 	rcu_read_unlock();
-- 
Gitee


From 2e8eb7a44f09ec9a3da21113ad2e13efcd07e7ba Mon Sep 17 00:00:00 2001
From: Luo Meng <luomeng12@huawei.com>
Date: Tue, 12 Jul 2022 22:51:58 +0800
Subject: [PATCH 163/674] tmpfs: fix undefined-behaviour in shmem_reconfigure()

mainline inclusion
from mainline-v5.19-rc3
commit d14f5efadd846dbde561bd734318de6a9e6b26e6
category: bugfix
bugzilla: 186471 https://gitee.com/openeuler/kernel/issues/I5DRKR
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d14f5efadd846dbde561bd734318de6a9e6b26e6

-------------------------------------------------------------------------------

When shmem_reconfigure() calls __percpu_counter_compare(), the second
parameter is unsigned long long.  But in the definition of
__percpu_counter_compare(), the second parameter is s64.  So when
__percpu_counter_compare() executes abs(count - rhs), UBSAN shows the
following warning:

================================================================================
UBSAN: Undefined behaviour in lib/percpu_counter.c:209:6
signed integer overflow:
0 - -9223372036854775808 cannot be represented in type 'long long int'
CPU: 1 PID: 9636 Comm: syz-executor.2 Tainted: G                 ---------r-  - 4.18.0 #2
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
Call Trace:
 __dump_stack home/install/linux-rh-3-10/lib/dump_stack.c:77 [inline]
 dump_stack+0x125/0x1ae home/install/linux-rh-3-10/lib/dump_stack.c:117
 ubsan_epilogue+0xe/0x81 home/install/linux-rh-3-10/lib/ubsan.c:159
 handle_overflow+0x19d/0x1ec home/install/linux-rh-3-10/lib/ubsan.c:190
 __percpu_counter_compare+0x124/0x140 home/install/linux-rh-3-10/lib/percpu_counter.c:209
 percpu_counter_compare home/install/linux-rh-3-10/./include/linux/percpu_counter.h:50 [inline]
 shmem_remount_fs+0x1ce/0x6b0 home/install/linux-rh-3-10/mm/shmem.c:3530
 do_remount_sb+0x11b/0x530 home/install/linux-rh-3-10/fs/super.c:888
 do_remount home/install/linux-rh-3-10/fs/namespace.c:2344 [inline]
 do_mount+0xf8d/0x26b0 home/install/linux-rh-3-10/fs/namespace.c:2844
 ksys_mount+0xad/0x120 home/install/linux-rh-3-10/fs/namespace.c:3075
 __do_sys_mount home/install/linux-rh-3-10/fs/namespace.c:3089 [inline]
 __se_sys_mount home/install/linux-rh-3-10/fs/namespace.c:3086 [inline]
 __x64_sys_mount+0xbf/0x160 home/install/linux-rh-3-10/fs/namespace.c:3086
 do_syscall_64+0xca/0x5c0 home/install/linux-rh-3-10/arch/x86/entry/common.c:298
 entry_SYSCALL_64_after_hwframe+0x6a/0xdf
RIP: 0033:0x46b5e9
Code: 5d db fa ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 2b db fa ff c3 66 2e 0f 1f 84 00 00 00 00
RSP: 002b:00007f54d5f22c68 EFLAGS: 00000246 ORIG_RAX: 00000000000000a5
RAX: ffffffffffffffda RBX: 000000000077bf60 RCX: 000000000046b5e9
RDX: 0000000000000000 RSI: 0000000020000000 RDI: 0000000000000000
RBP: 000000000077bf60 R08: 0000000020000140 R09: 0000000000000000
R10: 00000000026740a4 R11: 0000000000000246 R12: 0000000000000000
R13: 00007ffd1fb1592f R14: 00007f54d5f239c0 R15: 000000000077bf6c
================================================================================

[akpm@linux-foundation.org: tweak error message text]
Link: https://lkml.kernel.org/r/20220513025225.2678727-1-luomeng12@huawei.com
Signed-off-by: Luo Meng <luomeng12@huawei.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Yu Kuai <yukuai3@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

Conflicts:
        mm/shmem.c

Signed-off-by: ZhaoLong Wang <wangzhaolong1@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 mm/shmem.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/mm/shmem.c b/mm/shmem.c
index 201086030e72..f043682ba567 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3553,6 +3553,10 @@ static int shmem_reconfigure(struct fs_context *fc)
 
 	spin_lock(&sbinfo->stat_lock);
 	inodes = sbinfo->max_inodes - sbinfo->free_inodes;
+	if (ctx->blocks > S64_MAX) {
+		err = "Number of blocks too large";
+		goto out;
+	}
 	if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) {
 		if (!sbinfo->max_blocks) {
 			err = "Cannot retroactively limit size";
-- 
Gitee


From 03c87a1c896c4b7c8797a8af2bbc020998e31c6d Mon Sep 17 00:00:00 2001
From: ZhaoLong Wang <wangzhaolong1@huawei.com>
Date: Tue, 12 Jul 2022 22:51:59 +0800
Subject: [PATCH 164/674] tmpfs: fix the issue that the mount and remount
 results are inconsistent.

hulk inclusion
category: bugfix
bugzilla: 187184, https://gitee.com/openeuler/kernel/issues/I5G9EK
CVE: NA

--------------------------------

An undefined-behavior issue has not been completely fixed since commit
a70846a97e7b("tmpfs: fix undefined-behaviour in shmem_reconfigure()").
In the commit, check in the shmem_reconfigure() is added in remount
process to avoid the Ubsan problem.  However, the check is not added to
the mount process.  It causes inconsistent results between mount and
remount.  The operations to reproduce the problem in user mode as follows:

If nr_blocks is set to 0x8000000000000000, the mounting is successful.

  # mount tmpfs /dev/shm/ -t tmpfs -o nr_blocks=0x8000000000000000

However, when -o remount is used, the mount fails because of the
check in the shmem_reconfigure()

  # mount tmpfs /dev/shm/ -t tmpfs -o remount,nr_blocks=0x8000000000000000
  mount: /dev/shm: mount point not mounted or bad option.

Therefore, add checks in the shmem_parse_one() function and remove the
check in shmem_reconfigure() to avoid this problem.

Signed-off-by: ZhaoLong Wang <wangzhaolong1@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 mm/shmem.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index f043682ba567..ad2d68150ed2 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3432,7 +3432,7 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
 		break;
 	case Opt_nr_blocks:
 		ctx->blocks = memparse(param->string, &rest);
-		if (*rest)
+		if (*rest || ctx->blocks > S64_MAX)
 			goto bad_value;
 		ctx->seen |= SHMEM_SEEN_BLOCKS;
 		break;
@@ -3553,10 +3553,7 @@ static int shmem_reconfigure(struct fs_context *fc)
 
 	spin_lock(&sbinfo->stat_lock);
 	inodes = sbinfo->max_inodes - sbinfo->free_inodes;
-	if (ctx->blocks > S64_MAX) {
-		err = "Number of blocks too large";
-		goto out;
-	}
+
 	if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) {
 		if (!sbinfo->max_blocks) {
 			err = "Cannot retroactively limit size";
-- 
Gitee


From a5b6f7c60d6566217f8090e3b9c2da6af5c00105 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@suse.com>
Date: Tue, 12 Jul 2022 22:52:00 +0800
Subject: [PATCH 165/674] xen-netfront: restore __skb_queue_tail() positioning
 in xennet_get_responses()

stable inclusion
from stable-v5.10.129
commit 547b7c640df545a344358ede93e491a89194cdfa
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5GBF2?from=project-issue
CVE: CVE-2022-33743

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?h=linux-5.10.y&id=547b7c640df545a344358ede93e491a89194cdfa

--------------------------------

commit f63c2c2032c2e3caad9add3b82cc6e91c376fd26 upstream.

The commit referenced below moved the invocation past the "next" label,
without any explanation. In fact this allows misbehaving backends undue
control over the domain the frontend runs in, as earlier detected errors
require the skb to not be freed (it may be retained for later processing
via xennet_move_rx_slot(), or it may simply be unsafe to have it freed).

This is CVE-2022-33743 / XSA-405.

Fixes: 6c5aa6fc4def ("xen networking: add basic XDP support for xen-netfront")
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Ziyang Xuan <william.xuanziyang@huawei.com>
Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Reviewed-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/net/xen-netfront.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
index 1a69b5246133..a7ebf17f75a3 100644
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -1057,8 +1057,10 @@ static int xennet_get_responses(struct netfront_queue *queue,
 			}
 		}
 		rcu_read_unlock();
-next:
+
 		__skb_queue_tail(list, skb);
+
+next:
 		if (!(rx->flags & XEN_NETRXF_more_data))
 			break;
 
-- 
Gitee


From 8da2810de868dd9d42b61a16cbff500f405479cf Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 12 Jul 2022 22:52:01 +0800
Subject: [PATCH 166/674] netfilter: nf_tables: stricter validation of element
 data

mainline inclusion
from mainline-v5.19-rc6
commit 7e6bc1f6cabcd30aba0b11219d8e01b952eacbb6
category: bugfix
bugzilla: 187147, https://gitee.com/src-openeuler/kernel/issues/I5GCQH
CVE: CVE-2022-34918

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=7e6bc1f6cabcd30aba0b11219d8e01b952eacbb6

--------------------------------

Make sure element data type and length do not mismatch the one specified
by the set declaration.

Fixes: 7d7402642eaf ("netfilter: nf_tables: variable sized set element keys / data")
Reported-by: Hugues ANGUELKOV <hanguelkov@randorisec.fr>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Lu Wei <luwei32@huawei.com>
Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Reviewed-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 net/netfilter/nf_tables_api.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index ea162e36e0e4..560a93aad5b6 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -4880,13 +4880,20 @@ static int nft_setelem_parse_data(struct nft_ctx *ctx, struct nft_set *set,
 				  struct nft_data *data,
 				  struct nlattr *attr)
 {
+	u32 dtype;
 	int err;
 
 	err = nft_data_init(ctx, data, NFT_DATA_VALUE_MAXLEN, desc, attr);
 	if (err < 0)
 		return err;
 
-	if (desc->type != NFT_DATA_VERDICT && desc->len != set->dlen) {
+	if (set->dtype == NFT_DATA_VERDICT)
+		dtype = NFT_DATA_VERDICT;
+	else
+		dtype = NFT_DATA_VALUE;
+
+	if (dtype != desc->type ||
+	    set->dlen != desc->len) {
 		nft_data_release(data, desc->type);
 		return -EINVAL;
 	}
-- 
Gitee


From 1938d44f19da38bd4d28bb89aaee5a75763d9ffd Mon Sep 17 00:00:00 2001
From: Roger Pau Monne <roger.pau@citrix.com>
Date: Tue, 12 Jul 2022 22:52:02 +0800
Subject: [PATCH 167/674] xen/blkfront: fix leaking data in shared pages
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

stable inclusion
from stable-v5.10.129
commit cfea428030be836d79a7690968232bb7fa4410f1
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5GLXT
CVE: CVE-2022-26365

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?h=linux-5.10.y&id=cfea428030be836d79a7690968232bb7fa4410f1

--------------------------------

commit 2f446ffe9d737e9a844b97887919c4fda18246e7 upstream.

When allocating pages to be used for shared communication with the
backend always zero them, this avoids leaking unintended data present
on the pages.

This is CVE-2022-26365, part of XSA-403.

Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: ChenXiaoSong <chenxiaosong2@huawei.com>
Reviewed-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/block/xen-blkfront.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 47d4bb23d6f3..fffb7c3118b1 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -311,7 +311,7 @@ static int fill_grant_buffer(struct blkfront_ring_info *rinfo, int num)
 			goto out_of_memory;
 
 		if (info->feature_persistent) {
-			granted_page = alloc_page(GFP_NOIO);
+			granted_page = alloc_page(GFP_NOIO | __GFP_ZERO);
 			if (!granted_page) {
 				kfree(gnt_list_entry);
 				goto out_of_memory;
@@ -1753,7 +1753,7 @@ static int setup_blkring(struct xenbus_device *dev,
 	for (i = 0; i < info->nr_ring_pages; i++)
 		rinfo->ring_ref[i] = GRANT_INVALID_REF;
 
-	sring = alloc_pages_exact(ring_size, GFP_NOIO);
+	sring = alloc_pages_exact(ring_size, GFP_NOIO | __GFP_ZERO);
 	if (!sring) {
 		xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
 		return -ENOMEM;
@@ -2293,7 +2293,8 @@ static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo)
 
 		BUG_ON(!list_empty(&rinfo->indirect_pages));
 		for (i = 0; i < num; i++) {
-			struct page *indirect_page = alloc_page(GFP_KERNEL);
+			struct page *indirect_page = alloc_page(GFP_KERNEL |
+			                                        __GFP_ZERO);
 			if (!indirect_page)
 				goto out_of_memory;
 			list_add(&indirect_page->lru, &rinfo->indirect_pages);
-- 
Gitee


From a6baeccd14246eed67ffaafa52d3902d1c9fa9bf Mon Sep 17 00:00:00 2001
From: Roger Pau Monne <roger.pau@citrix.com>
Date: Tue, 12 Jul 2022 22:52:03 +0800
Subject: [PATCH 168/674] xen/netfront: fix leaking data in shared pages
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

stable inclusion
from stable-v5.10.129
commit 728d68bfe68d92eae1407b8a9edc7817d6227404
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5GLYP
CVE: CVE-2022-33740

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?h=linux-5.10.y&id=728d68bfe68d92eae1407b8a9edc7817d6227404

--------------------------------

commit 307c8de2b02344805ebead3440d8feed28f2f010 upstream.

When allocating pages to be used for shared communication with the
backend always zero them, this avoids leaking unintended data present
on the pages.

This is CVE-2022-33740, part of XSA-403.

Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: ChenXiaoSong <chenxiaosong2@huawei.com>
Reviewed-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/net/xen-netfront.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
index a7ebf17f75a3..881bc68e5402 100644
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -273,7 +273,8 @@ static struct sk_buff *xennet_alloc_one_rx_buffer(struct netfront_queue *queue)
 	if (unlikely(!skb))
 		return NULL;
 
-	page = page_pool_dev_alloc_pages(queue->page_pool);
+	page = page_pool_alloc_pages(queue->page_pool,
+				     GFP_ATOMIC | __GFP_NOWARN | __GFP_ZERO);
 	if (unlikely(!page)) {
 		kfree_skb(skb);
 		return NULL;
-- 
Gitee


From 5d6fe8fb20024bec55c269088b085a3706a0be16 Mon Sep 17 00:00:00 2001
From: Roger Pau Monne <roger.pau@citrix.com>
Date: Tue, 12 Jul 2022 22:52:04 +0800
Subject: [PATCH 169/674] xen/netfront: force data bouncing when backend is
 untrusted
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

stable inclusion
from stable-v5.10.129
commit 4923217af5742a796821272ee03f8d6de15c0cca
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5GLZZ
CVE: CVE-2022-33741

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?h=linux-5.10.y&id=4923217af5742a796821272ee03f8d6de15c0cca

--------------------------------

commit 4491001c2e0fa69efbb748c96ec96b100a5cdb7e upstream.

Bounce all data on the skbs to be transmitted into zeroed pages if the
backend is untrusted. This avoids leaking data present in the pages
shared with the backend but not part of the skb fragments.  This
requires introducing a new helper in order to allocate skbs with a
size multiple of XEN_PAGE_SIZE so we don't leak contiguous data on the
granted pages.

Reporting whether the backend is to be trusted can be done using a
module parameter, or from the xenstore frontend path as set by the
toolstack when adding the device.

This is CVE-2022-33741, part of XSA-403.

Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Reviewed-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: ChenXiaoSong <chenxiaosong2@huawei.com>
Reviewed-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/net/xen-netfront.c | 49 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 47 insertions(+), 2 deletions(-)

diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
index 881bc68e5402..569f3c8e7b75 100644
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -66,6 +66,10 @@ module_param_named(max_queues, xennet_max_queues, uint, 0644);
 MODULE_PARM_DESC(max_queues,
 		 "Maximum number of queues per virtual interface");
 
+static bool __read_mostly xennet_trusted = true;
+module_param_named(trusted, xennet_trusted, bool, 0644);
+MODULE_PARM_DESC(trusted, "Is the backend trusted");
+
 #define XENNET_TIMEOUT  (5 * HZ)
 
 static const struct ethtool_ops xennet_ethtool_ops;
@@ -175,6 +179,9 @@ struct netfront_info {
 	/* Is device behaving sane? */
 	bool broken;
 
+	/* Should skbs be bounced into a zeroed buffer? */
+	bool bounce;
+
 	atomic_t rx_gso_checksum_fixup;
 };
 
@@ -670,6 +677,33 @@ static int xennet_xdp_xmit(struct net_device *dev, int n,
 	return n - drops;
 }
 
+struct sk_buff *bounce_skb(const struct sk_buff *skb)
+{
+	unsigned int headerlen = skb_headroom(skb);
+	/* Align size to allocate full pages and avoid contiguous data leaks */
+	unsigned int size = ALIGN(skb_end_offset(skb) + skb->data_len,
+				  XEN_PAGE_SIZE);
+	struct sk_buff *n = alloc_skb(size, GFP_ATOMIC | __GFP_ZERO);
+
+	if (!n)
+		return NULL;
+
+	if (!IS_ALIGNED((uintptr_t)n->head, XEN_PAGE_SIZE)) {
+		WARN_ONCE(1, "misaligned skb allocated\n");
+		kfree_skb(n);
+		return NULL;
+	}
+
+	/* Set the data pointer */
+	skb_reserve(n, headerlen);
+	/* Set the tail pointer and length */
+	skb_put(n, skb->len);
+
+	BUG_ON(skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len));
+
+	skb_copy_header(n, skb);
+	return n;
+}
 
 #define MAX_XEN_SKB_FRAGS (65536 / XEN_PAGE_SIZE + 1)
 
@@ -723,9 +757,13 @@ static netdev_tx_t xennet_start_xmit(struct sk_buff *skb, struct net_device *dev
 
 	/* The first req should be at least ETH_HLEN size or the packet will be
 	 * dropped by netback.
+	 *
+	 * If the backend is not trusted bounce all data to zeroed pages to
+	 * avoid exposing contiguous data on the granted page not belonging to
+	 * the skb.
 	 */
-	if (unlikely(PAGE_SIZE - offset < ETH_HLEN)) {
-		nskb = skb_copy(skb, GFP_ATOMIC);
+	if (np->bounce || unlikely(PAGE_SIZE - offset < ETH_HLEN)) {
+		nskb = bounce_skb(skb);
 		if (!nskb)
 			goto drop;
 		dev_consume_skb_any(skb);
@@ -2251,6 +2289,10 @@ static int talk_to_netback(struct xenbus_device *dev,
 
 	info->netdev->irq = 0;
 
+	/* Check if backend is trusted. */
+	info->bounce = !xennet_trusted ||
+		       !xenbus_read_unsigned(dev->nodename, "trusted", 1);
+
 	/* Check if backend supports multiple queues */
 	max_queues = xenbus_read_unsigned(info->xbdev->otherend,
 					  "multi-queue-max-queues", 1);
@@ -2417,6 +2459,9 @@ static int xennet_connect(struct net_device *dev)
 		return err;
 	if (np->netback_has_xdp_headroom)
 		pr_info("backend supports XDP headroom\n");
+	if (np->bounce)
+		dev_info(&np->xbdev->dev,
+			 "bouncing transmitted data to zeroed pages\n");
 
 	/* talk_to_netback() sets the correct number of queues */
 	num_queues = dev->real_num_tx_queues;
-- 
Gitee


From 52c7b5227826f6cbb5b2c3eb0ea19484dfa8d53d Mon Sep 17 00:00:00 2001
From: Roger Pau Monne <roger.pau@citrix.com>
Date: Tue, 12 Jul 2022 22:52:05 +0800
Subject: [PATCH 170/674] xen/blkfront: force data bouncing when backend is
 untrusted
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

stable inclusion
from stable-v5.10.129
commit cbbd2d2531539212ff090aecbea9877c996e6ce6
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5GM0S
CVE: CVE-2022-33742

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?h=linux-5.10.y&id=cbbd2d2531539212ff090aecbea9877c996e6ce6

--------------------------------

commit 2400617da7eebf9167d71a46122828bc479d64c9 upstream.

Split the current bounce buffering logic used with persistent grants
into it's own option, and allow enabling it independently of
persistent grants.  This allows to reuse the same code paths to
perform the bounce buffering required to avoid leaking contiguous data
in shared pages not part of the request fragments.

Reporting whether the backend is to be trusted can be done using a
module parameter, or from the xenstore frontend path as set by the
toolstack when adding the device.

This is CVE-2022-33742, part of XSA-403.

Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Reviewed-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: ChenXiaoSong <chenxiaosong2@huawei.com>
Reviewed-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/block/xen-blkfront.c | 49 +++++++++++++++++++++++++-----------
 1 file changed, 34 insertions(+), 15 deletions(-)

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index fffb7c3118b1..abbb68b6d9bd 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -151,6 +151,10 @@ static unsigned int xen_blkif_max_ring_order;
 module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, 0444);
 MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the shared ring");
 
+static bool __read_mostly xen_blkif_trusted = true;
+module_param_named(trusted, xen_blkif_trusted, bool, 0644);
+MODULE_PARM_DESC(trusted, "Is the backend trusted");
+
 #define BLK_RING_SIZE(info)	\
 	__CONST_RING_SIZE(blkif, XEN_PAGE_SIZE * (info)->nr_ring_pages)
 
@@ -208,6 +212,7 @@ struct blkfront_info
 	unsigned int feature_discard:1;
 	unsigned int feature_secdiscard:1;
 	unsigned int feature_persistent:1;
+	unsigned int bounce:1;
 	unsigned int discard_granularity;
 	unsigned int discard_alignment;
 	/* Number of 4KB segments handled */
@@ -310,7 +315,7 @@ static int fill_grant_buffer(struct blkfront_ring_info *rinfo, int num)
 		if (!gnt_list_entry)
 			goto out_of_memory;
 
-		if (info->feature_persistent) {
+		if (info->bounce) {
 			granted_page = alloc_page(GFP_NOIO | __GFP_ZERO);
 			if (!granted_page) {
 				kfree(gnt_list_entry);
@@ -330,7 +335,7 @@ static int fill_grant_buffer(struct blkfront_ring_info *rinfo, int num)
 	list_for_each_entry_safe(gnt_list_entry, n,
 	                         &rinfo->grants, node) {
 		list_del(&gnt_list_entry->node);
-		if (info->feature_persistent)
+		if (info->bounce)
 			__free_page(gnt_list_entry->page);
 		kfree(gnt_list_entry);
 		i--;
@@ -376,7 +381,7 @@ static struct grant *get_grant(grant_ref_t *gref_head,
 	/* Assign a gref to this page */
 	gnt_list_entry->gref = gnttab_claim_grant_reference(gref_head);
 	BUG_ON(gnt_list_entry->gref == -ENOSPC);
-	if (info->feature_persistent)
+	if (info->bounce)
 		grant_foreign_access(gnt_list_entry, info);
 	else {
 		/* Grant access to the GFN passed by the caller */
@@ -400,7 +405,7 @@ static struct grant *get_indirect_grant(grant_ref_t *gref_head,
 	/* Assign a gref to this page */
 	gnt_list_entry->gref = gnttab_claim_grant_reference(gref_head);
 	BUG_ON(gnt_list_entry->gref == -ENOSPC);
-	if (!info->feature_persistent) {
+	if (!info->bounce) {
 		struct page *indirect_page;
 
 		/* Fetch a pre-allocated page to use for indirect grefs */
@@ -715,7 +720,7 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri
 		.grant_idx = 0,
 		.segments = NULL,
 		.rinfo = rinfo,
-		.need_copy = rq_data_dir(req) && info->feature_persistent,
+		.need_copy = rq_data_dir(req) && info->bounce,
 	};
 
 	/*
@@ -1035,11 +1040,12 @@ static void xlvbd_flush(struct blkfront_info *info)
 {
 	blk_queue_write_cache(info->rq, info->feature_flush ? true : false,
 			      info->feature_fua ? true : false);
-	pr_info("blkfront: %s: %s %s %s %s %s\n",
+	pr_info("blkfront: %s: %s %s %s %s %s %s %s\n",
 		info->gd->disk_name, flush_info(info),
 		"persistent grants:", info->feature_persistent ?
 		"enabled;" : "disabled;", "indirect descriptors:",
-		info->max_indirect_segments ? "enabled;" : "disabled;");
+		info->max_indirect_segments ? "enabled;" : "disabled;",
+		"bounce buffer:", info->bounce ? "enabled" : "disabled;");
 }
 
 static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset)
@@ -1273,7 +1279,7 @@ static void blkif_free_ring(struct blkfront_ring_info *rinfo)
 	if (!list_empty(&rinfo->indirect_pages)) {
 		struct page *indirect_page, *n;
 
-		BUG_ON(info->feature_persistent);
+		BUG_ON(info->bounce);
 		list_for_each_entry_safe(indirect_page, n, &rinfo->indirect_pages, lru) {
 			list_del(&indirect_page->lru);
 			__free_page(indirect_page);
@@ -1290,7 +1296,7 @@ static void blkif_free_ring(struct blkfront_ring_info *rinfo)
 							  0, 0UL);
 				rinfo->persistent_gnts_c--;
 			}
-			if (info->feature_persistent)
+			if (info->bounce)
 				__free_page(persistent_gnt->page);
 			kfree(persistent_gnt);
 		}
@@ -1311,7 +1317,7 @@ static void blkif_free_ring(struct blkfront_ring_info *rinfo)
 		for (j = 0; j < segs; j++) {
 			persistent_gnt = rinfo->shadow[i].grants_used[j];
 			gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
-			if (info->feature_persistent)
+			if (info->bounce)
 				__free_page(persistent_gnt->page);
 			kfree(persistent_gnt);
 		}
@@ -1501,7 +1507,7 @@ static int blkif_completion(unsigned long *id,
 	data.s = s;
 	num_sg = s->num_sg;
 
-	if (bret->operation == BLKIF_OP_READ && info->feature_persistent) {
+	if (bret->operation == BLKIF_OP_READ && info->bounce) {
 		for_each_sg(s->sg, sg, num_sg, i) {
 			BUG_ON(sg->offset + sg->length > PAGE_SIZE);
 
@@ -1560,7 +1566,7 @@ static int blkif_completion(unsigned long *id,
 				 * Add the used indirect page back to the list of
 				 * available pages for indirect grefs.
 				 */
-				if (!info->feature_persistent) {
+				if (!info->bounce) {
 					indirect_page = s->indirect_grants[i]->page;
 					list_add(&indirect_page->lru, &rinfo->indirect_pages);
 				}
@@ -1857,6 +1863,10 @@ static int talk_to_blkback(struct xenbus_device *dev,
 	if (!info)
 		return -ENODEV;
 
+	/* Check if backend is trusted. */
+	info->bounce = !xen_blkif_trusted ||
+		       !xenbus_read_unsigned(dev->nodename, "trusted", 1);
+
 	max_page_order = xenbus_read_unsigned(info->xbdev->otherend,
 					      "max-ring-page-order", 0);
 	ring_page_order = min(xen_blkif_max_ring_order, max_page_order);
@@ -2283,10 +2293,10 @@ static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo)
 	if (err)
 		goto out_of_memory;
 
-	if (!info->feature_persistent && info->max_indirect_segments) {
+	if (!info->bounce && info->max_indirect_segments) {
 		/*
-		 * We are using indirect descriptors but not persistent
-		 * grants, we need to allocate a set of pages that can be
+		 * We are using indirect descriptors but don't have a bounce
+		 * buffer, we need to allocate a set of pages that can be
 		 * used for mapping indirect grefs
 		 */
 		int num = INDIRECT_GREFS(grants) * BLK_RING_SIZE(info);
@@ -2387,6 +2397,8 @@ static void blkfront_gather_backend_features(struct blkfront_info *info)
 		info->feature_persistent =
 			!!xenbus_read_unsigned(info->xbdev->otherend,
 					       "feature-persistent", 0);
+	if (info->feature_persistent)
+		info->bounce = true;
 
 	indirect_segments = xenbus_read_unsigned(info->xbdev->otherend,
 					"feature-max-indirect-segments", 0);
@@ -2760,6 +2772,13 @@ static void blkfront_delay_work(struct work_struct *work)
 	struct blkfront_info *info;
 	bool need_schedule_work = false;
 
+	/*
+	 * Note that when using bounce buffers but not persistent grants
+	 * there's no need to run blkfront_delay_work because grants are
+	 * revoked in blkif_completion or else an error is reported and the
+	 * connection is closed.
+	 */
+
 	mutex_lock(&blkfront_mutex);
 
 	list_for_each_entry(info, &info_list, info_list) {
-- 
Gitee


From d78aa4d9b77b77832dbe9fb1f03ac0054d5eaca8 Mon Sep 17 00:00:00 2001
From: Fenghua Yu <fenghua.yu@intel.com>
Date: Mon, 22 Mar 2021 13:53:23 +0000
Subject: [PATCH 171/674] x86/cpufeatures: Enumerate #DB for bus lock detection

mainline inclusion
from mainline-v5.12-rc4
commit f21d4d3b97a8603567e5d4250bd75e8ebbd520af
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5G10C
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/
commit/?id=f21d4d3b97a8603567e5d4250bd75e8ebbd520af

Intel-SIG: commit f21d4d3 x86/cpufeatures: Enumerate #DB for bus lock detection

--------------------------------

A bus lock is acquired through either a split locked access to writeback
(WB) memory or any locked access to non-WB memory. This is typically >1000
cycles slower than an atomic operation within a cache line. It also
disrupts performance on other cores.

Some CPUs have the ability to notify the kernel by a #DB trap after a user
instruction acquires a bus lock and is executed. This allows the kernel to
enforce user application throttling or mitigation. Both breakpoint and bus
lock can trigger the #DB trap in the same instruction and the ordering of
handling them is the kernel #DB handler's choice.

The CPU feature flag to be shown in /proc/cpuinfo will be "bus_lock_detect".

Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Link: https://lore.kernel.org/r/20210322135325.682257-2-fenghua.yu@intel.com

(cherry picked from commit f21d4d3b97a8603567e5d4250bd75e8ebbd520af)
Signed-off-by: Ethan Zhao <haifeng.zhao@linux.intel.com>
---
 arch/x86/include/asm/cpufeatures.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 4da419226377..a0de1f6b8e62 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -353,6 +353,7 @@
 #define X86_FEATURE_AVX512_VPOPCNTDQ	(16*32+14) /* POPCNT for vectors of DW/QW */
 #define X86_FEATURE_LA57		(16*32+16) /* 5-level page tables */
 #define X86_FEATURE_RDPID		(16*32+22) /* RDPID instruction */
+#define X86_FEATURE_BUS_LOCK_DETECT	(16*32+24) /* Bus Lock detect */
 #define X86_FEATURE_CLDEMOTE		(16*32+25) /* CLDEMOTE instruction */
 #define X86_FEATURE_MOVDIRI		(16*32+27) /* MOVDIRI instruction */
 #define X86_FEATURE_MOVDIR64B		(16*32+28) /* MOVDIR64B instruction */
-- 
Gitee


From 3f9272839c21f5b6ff3e912c45fc1b0d605c7cd6 Mon Sep 17 00:00:00 2001
From: Fenghua Yu <fenghua.yu@intel.com>
Date: Mon, 22 Mar 2021 13:53:24 +0000
Subject: [PATCH 172/674] x86/traps: Handle #DB for bus lock

mainline inclusion
from mainline-v5.12-rc4
commit ebb1064e7c2e90b56e4d40ab154ef9796060a1c3
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5G10C
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/
commit/?id=ebb1064e7c2e90b56e4d40ab154ef9796060a1c3

Intel-SIG: commit ebb1064 x86/traps: Handle #DB for bus lock

--------------------------------

Bus locks degrade performance for the whole system, not just for the CPU
that requested the bus lock. Two CPU features "#AC for split lock" and
"#DB for bus lock" provide hooks so that the operating system may choose
one of several mitigation strategies.

bus lock feature to cover additional situations with new options to
mitigate.

split_lock_detect=
		#AC for split lock		#DB for bus lock

off		Do nothing			Do nothing

warn		Kernel OOPs			Warn once per task and
		Warn once per task and		and continues to run.
		disable future checking
	 	When both features are
		supported, warn in #AC

fatal		Kernel OOPs			Send SIGBUS to user.
		Send SIGBUS to user
		When both features are
		supported, fatal in #AC

ratelimit:N	Do nothing			Limit bus lock rate to
						N per second in the
						current non-root user.

Default option is "warn".

Hardware only generates #DB for bus lock detect when CPL>0 to avoid
nested #DB from multiple bus locks while the first #DB is being handled.
So no need to handle #DB for bus lock detected in the kernel.

while #AC for split lock is enabled by split lock detection bit 29 in
TEST_CTRL MSR.

Both breakpoint and bus lock in the same instruction can trigger one #DB.
The bus lock is handled before the breakpoint in the #DB handler.

Delivery of #DB for bus lock in userspace clears DR6[11], which is set by
the #DB handler right after reading DR6.

Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Link: https://lore.kernel.org/r/20210322135325.682257-3-fenghua.yu@intel.com

(cherry picked from commit ebb1064e7c2e90b56e4d40ab154ef9796060a1c3)
Signed-off-by: Ethan Zhao <haifeng.zhao@linux.intel.com>
---
 arch/x86/include/asm/cpu.h           |   7 +-
 arch/x86/include/asm/msr-index.h     |   1 +
 arch/x86/include/uapi/asm/debugreg.h |   1 +
 arch/x86/kernel/cpu/common.c         |   2 +-
 arch/x86/kernel/cpu/intel.c          | 113 ++++++++++++++++++++++-----
 arch/x86/kernel/traps.c              |   4 +
 6 files changed, 106 insertions(+), 22 deletions(-)

diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index da78ccbd493b..0d7fc0e2bfc9 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -41,12 +41,13 @@ unsigned int x86_family(unsigned int sig);
 unsigned int x86_model(unsigned int sig);
 unsigned int x86_stepping(unsigned int sig);
 #ifdef CONFIG_CPU_SUP_INTEL
-extern void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c);
+extern void __init sld_setup(struct cpuinfo_x86 *c);
 extern void switch_to_sld(unsigned long tifn);
 extern bool handle_user_split_lock(struct pt_regs *regs, long error_code);
 extern bool handle_guest_split_lock(unsigned long ip);
+extern void handle_bus_lock(struct pt_regs *regs);
 #else
-static inline void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c) {}
+static inline void __init sld_setup(struct cpuinfo_x86 *c) {}
 static inline void switch_to_sld(unsigned long tifn) {}
 static inline bool handle_user_split_lock(struct pt_regs *regs, long error_code)
 {
@@ -57,6 +58,8 @@ static inline bool handle_guest_split_lock(unsigned long ip)
 {
 	return false;
 }
+
+static inline void handle_bus_lock(struct pt_regs *regs) {}
 #endif
 #ifdef CONFIG_IA32_FEAT_CTL
 void init_ia32_feat_ctl(struct cpuinfo_x86 *c);
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 2b0af5eb5131..e2103a35b45a 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -289,6 +289,7 @@
 #define DEBUGCTLMSR_LBR			(1UL <<  0) /* last branch recording */
 #define DEBUGCTLMSR_BTF_SHIFT		1
 #define DEBUGCTLMSR_BTF			(1UL <<  1) /* single-step on branches */
+#define DEBUGCTLMSR_BUS_LOCK_DETECT	(1UL <<  2)
 #define DEBUGCTLMSR_TR			(1UL <<  6)
 #define DEBUGCTLMSR_BTS			(1UL <<  7)
 #define DEBUGCTLMSR_BTINT		(1UL <<  8)
diff --git a/arch/x86/include/uapi/asm/debugreg.h b/arch/x86/include/uapi/asm/debugreg.h
index d95d080b30e3..0007ba077c0c 100644
--- a/arch/x86/include/uapi/asm/debugreg.h
+++ b/arch/x86/include/uapi/asm/debugreg.h
@@ -24,6 +24,7 @@
 #define DR_TRAP3	(0x8)		/* db3 */
 #define DR_TRAP_BITS	(DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)
 
+#define DR_BUS_LOCK	(0x800)		/* bus_lock */
 #define DR_STEP		(0x4000)	/* single-step */
 #define DR_SWITCH	(0x8000)	/* task switch */
 
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 4917c2698ac1..b06254f8643c 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1374,7 +1374,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
 
 	cpu_set_bug_bits(c);
 
-	cpu_set_core_cap_bits(c);
+	sld_setup(c);
 
 	fpu__init_system(c);
 
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 816fdbec795a..dfda4f42ec9b 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -43,9 +43,9 @@ enum split_lock_detect_state {
 };
 
 /*
- * Default to sld_off because most systems do not support split lock detection
- * split_lock_setup() will switch this to sld_warn on systems that support
- * split lock detect, unless there is a command line override.
+ * Default to sld_off because most systems do not support split lock detection.
+ * sld_state_setup() will switch this to sld_warn on systems that support
+ * split lock/bus lock detect, unless there is a command line override.
  */
 static enum split_lock_detect_state sld_state __ro_after_init = sld_off;
 static u64 msr_test_ctrl_cache __ro_after_init;
@@ -602,6 +602,7 @@ static void init_intel_misc_features(struct cpuinfo_x86 *c)
 }
 
 static void split_lock_init(void);
+static void bus_lock_init(void);
 
 static void init_intel(struct cpuinfo_x86 *c)
 {
@@ -719,6 +720,9 @@ static void init_intel(struct cpuinfo_x86 *c)
 		tsx_disable();
 
 	split_lock_init();
+	bus_lock_init();
+
+	intel_init_thermal(c);
 }
 
 #ifdef CONFIG_X86_32
@@ -1017,16 +1021,15 @@ static bool split_lock_verify_msr(bool on)
 	return ctrl == tmp;
 }
 
-static void __init split_lock_setup(void)
+static void __init sld_state_setup(void)
 {
 	enum split_lock_detect_state state = sld_warn;
 	char arg[20];
 	int i, ret;
 
-	if (!split_lock_verify_msr(false)) {
-		pr_info("MSR access failed: Disabled\n");
+	if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) &&
+	    !boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
 		return;
-	}
 
 	ret = cmdline_find_option(boot_command_line, "split_lock_detect",
 				  arg, sizeof(arg));
@@ -1038,17 +1041,14 @@ static void __init split_lock_setup(void)
 			}
 		}
 	}
+	sld_state = state;
+}
 
-	switch (state) {
-	case sld_off:
-		pr_info("disabled\n");
+static void __init __split_lock_setup(void)
+{
+	if (!split_lock_verify_msr(false)) {
+		pr_info("MSR access failed: Disabled\n");
 		return;
-	case sld_warn:
-		pr_info("warning about user-space split_locks\n");
-		break;
-	case sld_fatal:
-		pr_info("sending SIGBUS on user-space split_locks\n");
-		break;
 	}
 
 	rdmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache);
@@ -1058,7 +1058,9 @@ static void __init split_lock_setup(void)
 		return;
 	}
 
-	sld_state = state;
+	/* Restore the MSR to its cached value. */
+	wrmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache);
+
 	setup_force_cpu_cap(X86_FEATURE_SPLIT_LOCK_DETECT);
 }
 
@@ -1115,6 +1117,29 @@ bool handle_guest_split_lock(unsigned long ip)
 }
 EXPORT_SYMBOL_GPL(handle_guest_split_lock);
 
+static void bus_lock_init(void)
+{
+	u64 val;
+
+	/*
+	 * Warn and fatal are handled by #AC for split lock if #AC for
+	 * split lock is supported.
+	 */
+	if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) ||
+	    (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) &&
+	    (sld_state == sld_warn || sld_state == sld_fatal)) ||
+	    sld_state == sld_off)
+		return;
+
+	/*
+	 * Enable #DB for bus lock. All bus locks are handled in #DB except
+	 * split locks are handled in #AC in the fatal case.
+	 */
+	rdmsrl(MSR_IA32_DEBUGCTLMSR, val);
+	val |= DEBUGCTLMSR_BUS_LOCK_DETECT;
+	wrmsrl(MSR_IA32_DEBUGCTLMSR, val);
+}
+
 bool handle_user_split_lock(struct pt_regs *regs, long error_code)
 {
 	if ((regs->flags & X86_EFLAGS_AC) || sld_state == sld_fatal)
@@ -1123,6 +1148,21 @@ bool handle_user_split_lock(struct pt_regs *regs, long error_code)
 	return true;
 }
 
+void handle_bus_lock(struct pt_regs *regs)
+{
+	switch (sld_state) {
+	case sld_off:
+		break;
+	case sld_warn:
+		pr_warn_ratelimited("#DB: %s/%d took a bus_lock trap at address: 0x%lx\n",
+				    current->comm, current->pid, regs->ip);
+		break;
+	case sld_fatal:
+		force_sig_fault(SIGBUS, BUS_ADRALN, NULL);
+		break;
+	}
+}
+
 /*
  * This function is called only when switching between tasks with
  * different split-lock detection modes. It sets the MSR for the
@@ -1163,7 +1203,7 @@ static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = {
 	{}
 };
 
-void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c)
+static void __init split_lock_setup(struct cpuinfo_x86 *c)
 {
 	const struct x86_cpu_id *m;
 	u64 ia32_core_caps;
@@ -1190,5 +1230,40 @@ void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c)
 	}
 
 	cpu_model_supports_sld = true;
-	split_lock_setup();
+	__split_lock_setup();
+}
+
+static void sld_state_show(void)
+{
+	if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) &&
+	    !boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
+		return;
+
+	switch (sld_state) {
+	case sld_off:
+		pr_info("disabled\n");
+		break;
+	case sld_warn:
+		if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
+			pr_info("#AC: crashing the kernel on kernel split_locks and warning on user-space split_locks\n");
+		else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
+			pr_info("#DB: warning on user-space bus_locks\n");
+		break;
+	case sld_fatal:
+		if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) {
+			pr_info("#AC: crashing the kernel on kernel split_locks and sending SIGBUS on user-space split_locks\n");
+		} else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) {
+			pr_info("#DB: sending SIGBUS on user-space bus_locks%s\n",
+				boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) ?
+				" from non-WB" : "");
+		}
+		break;
+	}
+}
+
+void __init sld_setup(struct cpuinfo_x86 *c)
+{
+	split_lock_setup(c);
+	sld_state_setup();
+	sld_state_show();
 }
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 696ec85164e6..68fa31d334e3 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -1023,6 +1023,10 @@ static __always_inline void exc_debug_user(struct pt_regs *regs,
 		goto out_irq;
 	}
 
+	/* #DB for bus lock can only be triggered from userspace. */
+	if (dr6 & DR_BUS_LOCK)
+		handle_bus_lock(regs);
+
 	/* Add the virtual_dr6 bits for signals. */
 	dr6 |= current->thread.virtual_dr6;
 	if (dr6 & (DR_STEP | DR_TRAP_BITS) || icebp)
-- 
Gitee


From 82b4f86723f1bef98f128d4bd8b2c75d57260557 Mon Sep 17 00:00:00 2001
From: Kyung Min Park <kyung.min.park@intel.com>
Date: Thu, 16 Jun 2022 09:21:39 +0300
Subject: [PATCH 173/674] Intel: 5G ISA: x86: Enumerate AVX512 FP16 CPUID
 feature flag

mainline inclusion
from mainline-5.11
commit e1b35da5e624f8b09d2e98845c2e4c84b179d9a4
category: feature
feature: SPR New instructions
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596EH
CVE: N/A

Intel-SIG: commit e1b35da5e6 x86: Enumerate AVX512 FP16 CPUID feature flag
Backport for SPR core 5G ISA support.

-------------------------------------

Enumerate AVX512 Half-precision floating point (FP16) CPUID feature
flag. Compared with using FP32, using FP16 cut the number of bits
required for storage in half, reducing the exponent from 8 bits to 5,
and the mantissa from 23 bits to 10. Using FP16 also enables developers
to train and run inference on deep learning models fast when all
precision or magnitude (FP32) is not needed.

A processor supports AVX512 FP16 if CPUID.(EAX=7,ECX=0):EDX[bit 23]
is present. The AVX512 FP16 requires AVX512BW feature be implemented
since the instructions for manipulating 32bit masks are associated with
AVX512BW.

The only in-kernel usage of this is kvm passthrough. The CPU feature
flag is shown as "avx512_fp16" in /proc/cpuinfo.

Signed-off-by: Kyung Min Park kyung.min.park@intel.com
Acked-by: Dave Hansen dave.hansen@intel.com
Reviewed-by: Tony Luck tony.luck@intel.com
Message-Id: 20201208033441.28207-2-kyung.min.park@intel.com
Acked-by: Borislav Petkov bp@suse.de
Signed-off-by: Paolo Bonzini pbonzini@redhat.com
Signed-off-by: Luming Yu luming.yu@intel.com
---
 arch/x86/include/asm/cpufeatures.h | 1 +
 arch/x86/kernel/cpu/cpuid-deps.c   | 1 +
 2 files changed, 2 insertions(+)

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 4da419226377..81f0c811604e 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -376,6 +376,7 @@
 #define X86_FEATURE_TSXLDTRK		(18*32+16) /* TSX Suspend Load Address Tracking */
 #define X86_FEATURE_PCONFIG		(18*32+18) /* Intel PCONFIG */
 #define X86_FEATURE_ARCH_LBR		(18*32+19) /* Intel ARCH LBR */
+#define X86_FEATURE_AVX512_FP16		(18*32+23) /* AVX512 FP16 */
 #define X86_FEATURE_SPEC_CTRL		(18*32+26) /* "" Speculation Control (IBRS + IBPB) */
 #define X86_FEATURE_INTEL_STIBP		(18*32+27) /* "" Single Thread Indirect Branch Predictors */
 #define X86_FEATURE_FLUSH_L1D		(18*32+28) /* Flush L1D cache */
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
index d502241995a3..42af31b64c2c 100644
--- a/arch/x86/kernel/cpu/cpuid-deps.c
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -69,6 +69,7 @@ static const struct cpuid_dep cpuid_deps[] = {
 	{ X86_FEATURE_CQM_MBM_TOTAL,		X86_FEATURE_CQM_LLC   },
 	{ X86_FEATURE_CQM_MBM_LOCAL,		X86_FEATURE_CQM_LLC   },
 	{ X86_FEATURE_AVX512_BF16,		X86_FEATURE_AVX512VL  },
+	{ X86_FEATURE_AVX512_FP16,		X86_FEATURE_AVX512BW  },
 	{ X86_FEATURE_ENQCMD,			X86_FEATURE_XSAVES    },
 	{ X86_FEATURE_PER_THREAD_MBA,		X86_FEATURE_MBA       },
 	{}
-- 
Gitee


From de9dc9cae0c6df31ec992234773ce452c977834e Mon Sep 17 00:00:00 2001
From: Kyung Min Park <kyung.min.park@intel.com>
Date: Thu, 16 Jun 2022 14:56:12 +0300
Subject: [PATCH 174/674] Intel: AVX VNNI: x86: Enumerate AVX Vector Neural
 Network instructions

mainline inclusion
from mainline-5.11
commit b85a0425d8056f3bd8d0a94ecdddf2a39d32a801
category: feature
feature: SPR New instructions
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596EH
CVE: N/A

Intel-SIG: commit b85a0425d80 x86: Enumerate AVX Vector Neural Network instructions
Backport for SPR core AVX VNNI support.

----------------------------

Add AVX version of the Vector Neural Network (VNNI) Instructions.
A processor supports AVX VNNI instructions if CPUID.0x07.0x1:EAX[4] is
present. The following instructions are available when this feature is
present.
1. VPDPBUS: Multiply and Add Unsigned and Signed Bytes
2. VPDPBUSDS: Multiply and Add Unsigned and Signed Bytes with Saturation
3. VPDPWSSD: Multiply and Add Signed Word Integers
4. VPDPWSSDS: Multiply and Add Signed Integers with Saturation

The only in-kernel usage of this is kvm passthrough. The CPU feature
flag is shown as "avx_vnni" in /proc/cpuinfo.

This instruction is currently documented in the latest "extensions"
manual (ISE). It will appear in the "main" manual (SDM) in the future.

Signed-off-by: Kyung Min Park <kyung.min.park@intel.com>
Signed-off-by: Yang Zhong <yang.zhong@intel.com>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Message-Id: <20210105004909.42000-2-yang.zhong@intel.com>
Acked-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Luming Yu <luming.yu@intel.com>
---
 arch/x86/include/asm/cpufeatures.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 81f0c811604e..51e57b620461 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -293,6 +293,7 @@
 #define X86_FEATURE_PER_THREAD_MBA	(11*32+ 7) /* "" Per-thread Memory Bandwidth Allocation */
 
 /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
+#define X86_FEATURE_AVX_VNNI		(12*32+ 4) /* AVX VNNI instructions */
 #define X86_FEATURE_AVX512_BF16		(12*32+ 5) /* AVX512 BFLOAT16 instructions */
 
 /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */
-- 
Gitee


From 58276802a77ed31f164b4a97d2204a7d6ee2c4fe Mon Sep 17 00:00:00 2001
From: Fenghua Yu <fenghua.yu@intel.com>
Date: Mon, 22 Mar 2021 13:53:25 +0000
Subject: [PATCH 175/674] Documentation/admin-guide: Change doc for
 split_lock_detect parameter

mainline inclusion
from mainline-v5.12-rc4
commit ebca17707e38f2050b188d837bd4646b29a1b0c2
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5G10C
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/
commit/?id=ebca17707e38f2050b188d837bd4646b29a1b0c2

Intel-SIG: commit ebca177 Documentation/admin-guide: Change doc for split_lock_
detect parameter

--------------------------------

Since #DB for bus lock detect changes the split_lock_detect parameter,
update the documentation for the changes.

Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Acked-by: Randy Dunlap <rdunlap@infradead.org>
Link: https://lore.kernel.org/r/20210322135325.682257-4-fenghua.yu@intel.com

(cherry picked from commit ebca17707e38f2050b188d837bd4646b29a1b0c2)
Signed-off-by: Ethan Zhao <haifeng.zhao@linux.intel.com>
---
 .../admin-guide/kernel-parameters.txt         | 22 ++++++++++++++-----
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 2b04cf8fbab4..fbbe09ac2f86 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -5275,27 +5275,37 @@
 	spia_peddr=
 
 	split_lock_detect=
-			[X86] Enable split lock detection
+			[X86] Enable split lock detection or bus lock detection
 
 			When enabled (and if hardware support is present), atomic
 			instructions that access data across cache line
-			boundaries will result in an alignment check exception.
+			boundaries will result in an alignment check exception
+			for split lock detection or a debug exception for
+			bus lock detection.
 
 			off	- not enabled
 
-			warn	- the kernel will emit rate limited warnings
+			warn	- the kernel will emit rate-limited warnings
 				  about applications triggering the #AC
-				  exception. This mode is the default on CPUs
-				  that supports split lock detection.
+				  exception or the #DB exception. This mode is
+				  the default on CPUs that support split lock
+				  detection or bus lock detection. Default
+				  behavior is by #AC if both features are
+				  enabled in hardware.
 
 			fatal	- the kernel will send SIGBUS to applications
-				  that trigger the #AC exception.
+				  that trigger the #AC exception or the #DB
+				  exception. Default behavior is by #AC if
+				  both features are enabled in hardware.
 
 			If an #AC exception is hit in the kernel or in
 			firmware (i.e. not while executing in user mode)
 			the kernel will oops in either "warn" or "fatal"
 			mode.
 
+			#DB exception for bus lock is triggered only when
+			CPL > 0.
+
 	srbds=		[X86,INTEL]
 			Control the Special Register Buffer Data Sampling
 			(SRBDS) mitigation.
-- 
Gitee


From 9fbda1057252b584a3b24a507740aac38fe41c7d Mon Sep 17 00:00:00 2001
From: Fenghua Yu <fenghua.yu@intel.com>
Date: Mon, 19 Apr 2021 21:49:55 +0000
Subject: [PATCH 176/674] Documentation/x86: Add buslock.rst

mainline inclusion
from mainline-v5.12-rc4
commit 1897907cca5aa22cdfcdb7fb8f0644a6add0877d
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5G10C
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/
commit/?id=1897907cca5aa22cdfcdb7fb8f0644a6add0877d

Intel-SIG: commit 1897907 Documentation/x86: Add buslock.rst

--------------------------------

Add buslock.rst to explain bus lock problem and how to detect and
handle it.

[ tglx: Included it into index.rst and added the missing include ... ]

Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Link: https://lore.kernel.org/r/20210419214958.4035512-2-fenghua.yu@intel.com

(cherry picked from commit 1897907cca5aa22cdfcdb7fb8f0644a6add0877d)
Signed-off-by: Ethan Zhao <haifeng.zhao@linux.intel.com>
---
 Documentation/x86/buslock.rst | 104 ++++++++++++++++++++++++++++++++++
 Documentation/x86/index.rst   |   1 +
 2 files changed, 105 insertions(+)
 create mode 100644 Documentation/x86/buslock.rst

diff --git a/Documentation/x86/buslock.rst b/Documentation/x86/buslock.rst
new file mode 100644
index 000000000000..159ff6ba830e
--- /dev/null
+++ b/Documentation/x86/buslock.rst
@@ -0,0 +1,104 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+.. include:: <isonum.txt>
+
+===============================
+Bus lock detection and handling
+===============================
+
+:Copyright: |copy| 2021 Intel Corporation
+:Authors: - Fenghua Yu <fenghua.yu@intel.com>
+          - Tony Luck <tony.luck@intel.com>
+
+Problem
+=======
+
+A split lock is any atomic operation whose operand crosses two cache lines.
+Since the operand spans two cache lines and the operation must be atomic,
+the system locks the bus while the CPU accesses the two cache lines.
+
+A bus lock is acquired through either split locked access to writeback (WB)
+memory or any locked access to non-WB memory. This is typically thousands of
+cycles slower than an atomic operation within a cache line. It also disrupts
+performance on other cores and brings the whole system to its knees.
+
+Detection
+=========
+
+Intel processors may support either or both of the following hardware
+mechanisms to detect split locks and bus locks.
+
+#AC exception for split lock detection
+--------------------------------------
+
+Beginning with the Tremont Atom CPU split lock operations may raise an
+Alignment Check (#AC) exception when a split lock operation is attemped.
+
+#DB exception for bus lock detection
+------------------------------------
+
+Some CPUs have the ability to notify the kernel by an #DB trap after a user
+instruction acquires a bus lock and is executed. This allows the kernel to
+terminate the application or to enforce throttling.
+
+Software handling
+=================
+
+The kernel #AC and #DB handlers handle bus lock based on the kernel
+parameter "split_lock_detect". Here is a summary of different options:
+
++------------------+----------------------------+-----------------------+
+|split_lock_detect=|#AC for split lock		|#DB for bus lock	|
++------------------+----------------------------+-----------------------+
+|off	  	   |Do nothing			|Do nothing		|
++------------------+----------------------------+-----------------------+
+|warn		   |Kernel OOPs			|Warn once per task and |
+|(default)	   |Warn once per task and	|and continues to run.  |
+|		   |disable future checking	|			|
+|		   |When both features are	|			|
+|		   |supported, warn in #AC	|			|
++------------------+----------------------------+-----------------------+
+|fatal		   |Kernel OOPs			|Send SIGBUS to user.	|
+|		   |Send SIGBUS to user		|			|
+|		   |When both features are	|			|
+|		   |supported, fatal in #AC	|			|
++------------------+----------------------------+-----------------------+
+
+Usages
+======
+
+Detecting and handling bus lock may find usages in various areas:
+
+It is critical for real time system designers who build consolidated real
+time systems. These systems run hard real time code on some cores and run
+"untrusted" user processes on other cores. The hard real time cannot afford
+to have any bus lock from the untrusted processes to hurt real time
+performance. To date the designers have been unable to deploy these
+solutions as they have no way to prevent the "untrusted" user code from
+generating split lock and bus lock to block the hard real time code to
+access memory during bus locking.
+
+It's also useful for general computing to prevent guests or user
+applications from slowing down the overall system by executing instructions
+with bus lock.
+
+
+Guidance
+========
+off
+---
+
+Disable checking for split lock and bus lock. This option can be useful if
+there are legacy applications that trigger these events at a low rate so
+that mitigation is not needed.
+
+warn
+----
+
+A warning is emitted when a bus lock is detected which allows to identify
+the offending application. This is the default behavior.
+
+fatal
+-----
+
+In this case, the bus lock is not tolerated and the process is killed.
diff --git a/Documentation/x86/index.rst b/Documentation/x86/index.rst
index 647a570d4931..295a59cbaa69 100644
--- a/Documentation/x86/index.rst
+++ b/Documentation/x86/index.rst
@@ -29,6 +29,7 @@ x86-specific Documentation
    microcode
    resctrl_ui
    tsx_async_abort
+   buslock
    usb-legacy-support
    i386/index
    x86_64/index
-- 
Gitee


From 25f4c3c5f204a006fbcc003054c3cc08932b6e0c Mon Sep 17 00:00:00 2001
From: Fenghua Yu <fenghua.yu@intel.com>
Date: Mon, 19 Apr 2021 21:49:57 +0000
Subject: [PATCH 177/674] Documentation/admin-guide: Add bus lock ratelimit

mainline inclusion
from mainline-v5.12-rc4
commit 9d839c280b64817345c2fa462c0027a9bd742361
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5G10C
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/
commit/?id=9d839c280b64817345c2fa462c0027a9bd742361

Intel-SIG: commit 9d839c Documentation/admin-guide: Add bus lock ratelimit

--------------------------------

Since bus lock rate limit changes the split_lock_detect parameter,
update the documentation for the change.

Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Link: https://lore.kernel.org/r/20210419214958.4035512-4-fenghua.yu@intel.com

(cherry picked from commit 9d839c280b64817345c2fa462c0027a9bd742361)
Signed-off-by: Ethan Zhao <haifeng.zhao@linux.intel.com>
---
 Documentation/admin-guide/kernel-parameters.txt | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index fbbe09ac2f86..4241caac8b83 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -5298,6 +5298,14 @@
 				  exception. Default behavior is by #AC if
 				  both features are enabled in hardware.
 
+			ratelimit:N -
+				  Set system wide rate limit to N bus locks
+				  per second for bus lock detection.
+				  0 < N <= 1000.
+
+				  N/A for split lock detection.
+
+
 			If an #AC exception is hit in the kernel or in
 			firmware (i.e. not while executing in user mode)
 			the kernel will oops in either "warn" or "fatal"
-- 
Gitee


From afc7742cc862c618ea92366a49b9437e51ad387a Mon Sep 17 00:00:00 2001
From: Fenghua Yu <fenghua.yu@intel.com>
Date: Mon, 19 Apr 2021 21:49:56 +0000
Subject: [PATCH 178/674] x86/bus_lock: Set rate limit for bus lock

mainline inclusion
from mainline-v5.12-rc4
commit ef4ae6e4413159d2329a172c12e9274e2cb0a3a8
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5G10C
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/
commit/?id=ef4ae6e4413159d2329a172c12e9274e2cb0a3a8

Intel-SIG: commit ef4ae6e x86/bus_lock: Set rate limit for bus lock_

--------------------------------

A bus lock can be thousands of cycles slower than atomic operation within
one cache line. It also disrupts performance on other cores. Malicious
users can generate multiple bus locks to degrade the whole system
performance.

The current mitigation is to kill the offending process, but for certain
scenarios it's desired to identify and throttle the offending application.

Add a system wide rate limit for bus locks. When the system detects bus
locks at a rate higher than N/sec (where N can be set by the kernel boot
argument in the range [1..1000]) any task triggering a bus lock will be
forced to sleep for at least 20ms until the overall system rate of bus
locks drops below the threshold.

Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Link: https://lore.kernel.org/r/20210419214958.4035512-3-fenghua.yu@intel.com

(cherry picked from commit ef4ae6e4413159d2329a172c12e9274e2cb0a3a8)
Signed-off-by: Ethan Zhao <haifeng.zhao@linux.intel.com>
---
 arch/x86/kernel/cpu/intel.c | 42 +++++++++++++++++++++++++++++++++++--
 1 file changed, 40 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index dfda4f42ec9b..1780db8192a0 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -10,6 +10,7 @@
 #include <linux/thread_info.h>
 #include <linux/init.h>
 #include <linux/uaccess.h>
+#include <linux/delay.h>
 
 #include <asm/cpufeature.h>
 #include <asm/msr.h>
@@ -40,6 +41,7 @@ enum split_lock_detect_state {
 	sld_off = 0,
 	sld_warn,
 	sld_fatal,
+	sld_ratelimit,
 };
 
 /*
@@ -996,13 +998,30 @@ static const struct {
 	{ "off",	sld_off   },
 	{ "warn",	sld_warn  },
 	{ "fatal",	sld_fatal },
+	{ "ratelimit:", sld_ratelimit },
 };
 
+static struct ratelimit_state bld_ratelimit;
+
 static inline bool match_option(const char *arg, int arglen, const char *opt)
 {
-	int len = strlen(opt);
+	int len = strlen(opt), ratelimit;
+
+	if (strncmp(arg, opt, len))
+		return false;
+
+	/*
+	 * Min ratelimit is 1 bus lock/sec.
+	 * Max ratelimit is 1000 bus locks/sec.
+	 */
+	if (sscanf(arg, "ratelimit:%d", &ratelimit) == 1 &&
+	    ratelimit > 0 && ratelimit <= 1000) {
+		ratelimit_state_init(&bld_ratelimit, HZ, ratelimit);
+		ratelimit_set_flags(&bld_ratelimit, RATELIMIT_MSG_ON_RELEASE);
+		return true;
+	}
 
-	return len == arglen && !strncmp(arg, opt, len);
+	return len == arglen;
 }
 
 static bool split_lock_verify_msr(bool on)
@@ -1081,6 +1100,15 @@ static void sld_update_msr(bool on)
 
 static void split_lock_init(void)
 {
+	/*
+	 * #DB for bus lock handles ratelimit and #AC for split lock is
+	 * disabled.
+	 */
+	if (sld_state == sld_ratelimit) {
+		split_lock_verify_msr(false);
+		return;
+	}
+
 	if (cpu_model_supports_sld)
 		split_lock_verify_msr(sld_state != sld_off);
 }
@@ -1153,6 +1181,12 @@ void handle_bus_lock(struct pt_regs *regs)
 	switch (sld_state) {
 	case sld_off:
 		break;
+	case sld_ratelimit:
+		/* Enforce no more than bld_ratelimit bus locks/sec. */
+		while (!__ratelimit(&bld_ratelimit))
+			msleep(20);
+		/* Warn on the bus lock. */
+		fallthrough;
 	case sld_warn:
 		pr_warn_ratelimited("#DB: %s/%d took a bus_lock trap at address: 0x%lx\n",
 				    current->comm, current->pid, regs->ip);
@@ -1258,6 +1292,10 @@ static void sld_state_show(void)
 				" from non-WB" : "");
 		}
 		break;
+	case sld_ratelimit:
+		if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
+			pr_info("#DB: setting system wide bus lock rate limit to %u/sec\n", bld_ratelimit.burst);
+		break;
 	}
 }
 
-- 
Gitee


From a7a9a47d38a5e6d0e32e9167e51986b645502532 Mon Sep 17 00:00:00 2001
From: Tony Luck <tony.luck@intel.com>
Date: Thu, 10 Mar 2022 12:48:53 -0800
Subject: [PATCH 179/674] x86/split_lock: Make life miserable for split lockers

mainline inclusion
from mainline-v5.18-rc4
commit 	b041b525dab95352fbd666b14dc73ab898df465f
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5G10C
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/
commit/?id=b041b525dab95352fbd666b14dc73ab898df465f

Intel-SIG: commit b041b52 x86/split_lock: Make life miserable for split lockers

--------------------------------

In https://lore.kernel.org/all/87y22uujkm.ffs@tglx/ Thomas
said:

  Its's simply wishful thinking that stuff gets fixed because of a
  WARN_ONCE(). This has never worked. The only thing which works is to
  make stuff fail hard or slow it down in a way which makes it annoying
  enough to users to complain.

He was talking about WBINVD. But it made me think about how we use the
split lock detection feature in Linux.

Existing code has three options for applications:

 1) Don't enable split lock detection (allow arbitrary split locks)
 2) Warn once when a process uses split lock, but let the process
    keep running with split lock detection disabled
 3) Kill process that use split locks

Option 2 falls into the "wishful thinking" territory that Thomas warns does
nothing. But option 3 might not be viable in a situation with legacy
applications that need to run.

Hence make option 2 much stricter to "slow it down in a way which makes
it annoying".

Primary reason for this change is to provide better quality of service to
the rest of the applications running on the system. Internal testing shows
that even with many processes splitting locks, performance for the rest of
the system is much more responsive.

The new "warn" mode operates like this.  When an application tries to
execute a bus lock the #AC handler.

 1) Delays (interruptibly) 10 ms before moving to next step.

 2) Blocks (interruptibly) until it can get the semaphore
	If interrupted, just return. Assume the signal will either
	kill the task, or direct execution away from the instruction
	that is trying to get the bus lock.
 3) Disables split lock detection for the current core
 4) Schedules a work queue to re-enable split lock detect in 2 jiffies
 5) Returns

The work queue that re-enables split lock detection also releases the
semaphore.

There is a corner case where a CPU may be taken offline while split lock
detection is disabled. A CPU hotplug handler handles this case.

Old behaviour was to only print the split lock warning on the first
occurrence of a split lock from a task. Preserve that by adding a flag to
the task structure that suppresses subsequent split lock messages from that
task.

Signed-off-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20220310204854.31752-2-tony.luck@intel.com

(cherry picked from commit b041b525dab95352fbd666b14dc73ab898df465f)
Signed-off-by: Ethan Zhao <haifeng.zhao@linux.intel.com>
---
 arch/x86/kernel/cpu/intel.c | 63 +++++++++++++++++++++++++++++++------
 include/linux/sched.h       |  3 ++
 kernel/fork.c               |  5 +++
 3 files changed, 61 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 1780db8192a0..87f07093df53 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -7,10 +7,13 @@
 #include <linux/smp.h>
 #include <linux/sched.h>
 #include <linux/sched/clock.h>
+#include <linux/semaphore.h>
 #include <linux/thread_info.h>
 #include <linux/init.h>
 #include <linux/uaccess.h>
+#include <linux/workqueue.h>
 #include <linux/delay.h>
+#include <linux/cpuhotplug.h>
 
 #include <asm/cpufeature.h>
 #include <asm/msr.h>
@@ -1003,6 +1006,8 @@ static const struct {
 
 static struct ratelimit_state bld_ratelimit;
 
+static DEFINE_SEMAPHORE(buslock_sem);
+
 static inline bool match_option(const char *arg, int arglen, const char *opt)
 {
 	int len = strlen(opt), ratelimit;
@@ -1113,18 +1118,52 @@ static void split_lock_init(void)
 		split_lock_verify_msr(sld_state != sld_off);
 }
 
+static void __split_lock_reenable(struct work_struct *work)
+{
+	sld_update_msr(true);
+	up(&buslock_sem);
+}
+
+/*
+ * If a CPU goes offline with pending delayed work to re-enable split lock
+ * detection then the delayed work will be executed on some other CPU. That
+ * handles releasing the buslock_sem, but because it executes on a
+ * different CPU probably won't re-enable split lock detection. This is a
+ * problem on HT systems since the sibling CPU on the same core may then be
+ * left running with split lock detection disabled.
+ *
+ * Unconditionally re-enable detection here.
+ */
+static int splitlock_cpu_offline(unsigned int cpu)
+{
+	sld_update_msr(true);
+
+	return 0;
+}
+
+static DECLARE_DELAYED_WORK(split_lock_reenable, __split_lock_reenable);
+
 static void split_lock_warn(unsigned long ip)
 {
-	pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n",
-			    current->comm, current->pid, ip);
+	int cpu;
 
-	/*
-	 * Disable the split lock detection for this task so it can make
-	 * progress and set TIF_SLD so the detection is re-enabled via
-	 * switch_to_sld() when the task is scheduled out.
-	 */
+	if (!current->reported_split_lock)
+		pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n",
+				    current->comm, current->pid, ip);
+	current->reported_split_lock = 1;
+
+	/* misery factor #1, sleep 10ms before trying to execute split lock */
+	if (msleep_interruptible(10) > 0)
+		return;
+	/* Misery factor #2, only allow one buslocked disabled core at a time */
+	if (down_interruptible(&buslock_sem) == -EINTR)
+		return;
+	cpu = get_cpu();
+	schedule_delayed_work_on(cpu, &split_lock_reenable, 2);
+
+	/* Disable split lock detection on this CPU to make progress */
 	sld_update_msr(false);
-	set_tsk_thread_flag(current, TIF_SLD);
+	put_cpu();
 }
 
 bool handle_guest_split_lock(unsigned long ip)
@@ -1278,10 +1317,14 @@ static void sld_state_show(void)
 		pr_info("disabled\n");
 		break;
 	case sld_warn:
-		if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
+		if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) {
 			pr_info("#AC: crashing the kernel on kernel split_locks and warning on user-space split_locks\n");
-		else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
+			if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
+					      "x86/splitlock", NULL, splitlock_cpu_offline) < 0)
+				pr_warn("No splitlock CPU offline handler\n");
+		} else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) {
 			pr_info("#DB: warning on user-space bus_locks\n");
+		}
 		break;
 	case sld_fatal:
 		if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) {
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 47f462040f4d..764b2a927ef9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -865,6 +865,9 @@ struct task_struct {
 	/* Stalled due to lack of memory */
 	unsigned			in_memstall:1;
 #endif
+#ifdef	CONFIG_CPU_SUP_INTEL
+	unsigned			reported_split_lock:1;
+#endif
 
 	unsigned long			atomic_flags; /* Flags requiring atomic access. */
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 0fb86b65ae60..7a2395ade1d4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -946,6 +946,11 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 #ifdef CONFIG_MEMCG
 	tsk->active_memcg = NULL;
 #endif
+
+#ifdef CONFIG_CPU_SUP_INTEL
+	tsk->reported_split_lock = 0;
+#endif
+
 	return tsk;
 
 free_stack:
-- 
Gitee


From ba77b6ebe736ec84ef21efb653f06297a6e0d989 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Wed, 19 May 2021 10:51:43 +0200
Subject: [PATCH 180/674] docs: virt: api.rst: fix a pointer to SGX
 documentation

mainline inclusion
from mainline-5.14
commit 0a5fab9f085880dbd7f9b0055c74936ca8b64fc1
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit 0a5fab9f0858 docs: virt: api.rst: fix a pointer to SGX
documentation.
Backport for SGX virtualization support

--------------------------------

The document which describes the SGX kernel architecture was added at
commit 3fa97bf00126 ("Documentation/x86: Document SGX kernel architecture")

but the reference at virt/kvm/api.rst is pointing to some
non-existing document.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Link: https://lore.kernel.org/r/138c24633c6e4edf862a2b4d77033c603fc10406.1621413933.git.mchehab+huawei@kernel.org
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 Documentation/virt/kvm/api.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 671bbd90f15b..d542bd99a9a4 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -6218,7 +6218,7 @@ system fingerprint.  To prevent userspace from circumventing such restrictions
 by running an enclave in a VM, KVM prevents access to privileged attributes by
 default.
 
-See Documentation/x86/sgx/2.Kernel-internals.rst for more details.
+See Documentation/x86/sgx.rst for more details.
 
 8. Other capabilities.
 ======================
-- 
Gitee


From b7d30662370210e8b0a42ba93f9dc42df80b8b6d Mon Sep 17 00:00:00 2001
From: Tony Luck <tony.luck@intel.com>
Date: Tue, 26 Oct 2021 15:00:44 -0700
Subject: [PATCH 181/674] x86/sgx: Add new sgx_epc_page flag bit to mark free
 pages

mainline inclusion
from mainline-5.17
commit d6d261bded8a57aed4faa12d08a5b193418d3aa4
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZFM
CVE: NA

Intel-SIG: commit d6d261bded8a x86/sgx: Add new sgx_epc_page flag
bit to mark free pages.
Backport for SGX MCA recovery co-existence support

--------------------------------

SGX EPC pages go through the following life cycle:

        DIRTY ---> FREE ---> IN-USE --\
                    ^                 |
                    \-----------------/

Recovery action for poison for a DIRTY or FREE page is simple. Just
make sure never to allocate the page. IN-USE pages need some extra
handling.

Add a new flag bit SGX_EPC_PAGE_IS_FREE that is set when a page
is added to a free list and cleared when the page is allocated.

Notes:

1) These transitions are made while holding the node->lock so that
   future code that checks the flags while holding the node->lock
   can be sure that if the SGX_EPC_PAGE_IS_FREE bit is set, then the
   page is on the free list.

2) Initially while the pages are on the dirty list the
   SGX_EPC_PAGE_IS_FREE bit is cleared.

Signed-off-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Tested-by: Reinette Chatre <reinette.chatre@intel.com>
Link: https://lkml.kernel.org/r/20211026220050.697075-2-tony.luck@intel.com
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/kernel/cpu/sgx/main.c | 2 ++
 arch/x86/kernel/cpu/sgx/sgx.h  | 3 +++
 2 files changed, 5 insertions(+)

diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 379eeefa744a..e62cc1d04cd7 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -471,6 +471,7 @@ static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid)
 
 	page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list);
 	list_del_init(&page->list);
+	page->flags = 0;
 
 	spin_unlock(&node->lock);
 	atomic_long_dec(&sgx_nr_free_pages);
@@ -625,6 +626,7 @@ void sgx_free_epc_page(struct sgx_epc_page *page)
 	spin_lock(&node->lock);
 
 	list_add_tail(&page->list, &node->free_page_list);
+	page->flags = SGX_EPC_PAGE_IS_FREE;
 
 	spin_unlock(&node->lock);
 	atomic_long_inc(&sgx_nr_free_pages);
diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h
index 4628acec0009..5906471156c5 100644
--- a/arch/x86/kernel/cpu/sgx/sgx.h
+++ b/arch/x86/kernel/cpu/sgx/sgx.h
@@ -26,6 +26,9 @@
 /* Pages, which are being tracked by the page reclaimer. */
 #define SGX_EPC_PAGE_RECLAIMER_TRACKED	BIT(0)
 
+/* Pages on free list */
+#define SGX_EPC_PAGE_IS_FREE		BIT(1)
+
 struct sgx_epc_page {
 	unsigned int section;
 	unsigned int flags;
-- 
Gitee


From e87c3256335b30b3fa238c359d8481ebcb03d1f4 Mon Sep 17 00:00:00 2001
From: Tony Luck <tony.luck@intel.com>
Date: Tue, 26 Oct 2021 15:00:45 -0700
Subject: [PATCH 182/674] x86/sgx: Add infrastructure to identify SGX EPC pages

mainline inclusion
from mainline-5.17
commit 40e0e7843e23d164625b9031514f5672f8758bf4
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZFM
CVE: NA

Intel-SIG: commit 40e0e7843e23 x86/sgx: Add infrastructure to identify
SGX EPC pages.
Backport for SGX MCA recovery co-existence support

--------------------------------

X86 machine check architecture reports a physical address when there
is a memory error. Handling that error requires a method to determine
whether the physical address reported is in any of the areas reserved
for EPC pages by BIOS.

SGX EPC pages do not have Linux "struct page" associated with them.

Keep track of the mapping from ranges of EPC pages to the sections
that contain them using an xarray. N.B. adds CONFIG_XARRAY_MULTI to
the SGX dependecies. So "select" that in arch/x86/Kconfig for X86/SGX.

Create a function arch_is_platform_page() that simply reports whether an
address is an EPC page for use elsewhere in the kernel. The ACPI error
injection code needs this function and is typically built as a module,
so export it.

Note that arch_is_platform_page() will be slower than other similar
"what type is this page" functions that can simply check bits in the
"struct page".  If there is some future performance critical user of
this function it may need to be implemented in a more efficient way.

Note also that the current implementation of xarray allocates a few
hundred kilobytes for this usage on a system with 4GB of SGX EPC memory
configured. This isn't ideal, but worth it for the code simplicity.

Signed-off-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Tested-by: Reinette Chatre <reinette.chatre@intel.com>
Link: https://lkml.kernel.org/r/20211026220050.697075-3-tony.luck@intel.com
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/Kconfig               | 1 +
 arch/x86/kernel/cpu/sgx/main.c | 9 +++++++++
 2 files changed, 10 insertions(+)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 040fb77366dd..aa217be83f58 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1967,6 +1967,7 @@ config X86_SGX
 	select SRCU
 	select MMU_NOTIFIER
 	select NUMA_KEEP_MEMINFO if NUMA
+	select XARRAY_MULTI
 	help
 	  Intel(R) Software Guard eXtensions (SGX) is a set of CPU instructions
 	  that can be used by applications to set aside private regions of code
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index e62cc1d04cd7..9e51d3510732 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -20,6 +20,7 @@ struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
 static int sgx_nr_epc_sections;
 static struct task_struct *ksgxd_tsk;
 static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq);
+static DEFINE_XARRAY(sgx_epc_address_space);
 
 /*
  * These variables are part of the state of the reclaimer, and must be accessed
@@ -650,6 +651,8 @@ static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
 	}
 
 	section->phys_addr = phys_addr;
+	xa_store_range(&sgx_epc_address_space, section->phys_addr,
+		       phys_addr + size - 1, section, GFP_KERNEL);
 
 	for (i = 0; i < nr_pages; i++) {
 		section->pages[i].section = index;
@@ -661,6 +664,12 @@ static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
 	return true;
 }
 
+bool arch_is_platform_page(u64 paddr)
+{
+	return !!xa_load(&sgx_epc_address_space, paddr);
+}
+EXPORT_SYMBOL_GPL(arch_is_platform_page);
+
 /**
  * A section metric is concatenated in a way that @low bits 12-31 define the
  * bits 12-31 of the metric and @high bits 0-19 define the bits 32-51 of the
-- 
Gitee


From dc1b46e0f80bc54c0bd12eed9824214cb718f70c Mon Sep 17 00:00:00 2001
From: Tony Luck <tony.luck@intel.com>
Date: Tue, 26 Oct 2021 15:00:46 -0700
Subject: [PATCH 183/674] x86/sgx: Initial poison handling for dirty and free
 pages

mainline inclusion
from mainline-5.17
commit 992801ae92431761b3d8ec88abd5793d154d34ac
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZFM
CVE: NA

Intel-SIG: commit 992801ae9243 x86/sgx: Initial poison handling for
dirty and free pages.
Backport for SGX MCA recovery co-existence support

--------------------------------

A memory controller patrol scrubber can report poison in a page
that isn't currently being used.

Add "poison" field in the sgx_epc_page that can be set for an
sgx_epc_page. Check for it:
1) When sanitizing dirty pages
2) When freeing epc pages

Poison is a new field separated from flags to avoid having to make all
updates to flags atomic, or integrate poison state changes into some
other locking scheme to protect flags (Currently just sgx_reclaimer_lock
which protects the SGX_EPC_PAGE_RECLAIMER_TRACKED bit in page->flags).

In both cases place the poisoned page on a per-node list of poisoned
epc pages to make sure it will not be reallocated.

Signed-off-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Tested-by: Reinette Chatre <reinette.chatre@intel.com>
Link: https://lkml.kernel.org/r/20211026220050.697075-4-tony.luck@intel.com
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/kernel/cpu/sgx/main.c | 26 +++++++++++++++++++++++++-
 arch/x86/kernel/cpu/sgx/sgx.h  |  4 +++-
 2 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 9e51d3510732..2e9ac0454608 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -61,6 +61,24 @@ static void __sgx_sanitize_pages(struct list_head *dirty_page_list)
 
 		page = list_first_entry(dirty_page_list, struct sgx_epc_page, list);
 
+		/*
+		 * Checking page->poison without holding the node->lock
+		 * is racy, but losing the race (i.e. poison is set just
+		 * after the check) just means __eremove() will be uselessly
+		 * called for a page that sgx_free_epc_page() will put onto
+		 * the node->sgx_poison_page_list later.
+		 */
+		if (page->poison) {
+			struct sgx_epc_section *section = &sgx_epc_sections[page->section];
+			struct sgx_numa_node *node = section->node;
+
+			spin_lock(&node->lock);
+			list_move(&page->list, &node->sgx_poison_page_list);
+			spin_unlock(&node->lock);
+
+			continue;
+		}
+
 		ret = __eremove(sgx_get_epc_virt_addr(page));
 		if (!ret) {
 			/*
@@ -626,7 +644,11 @@ void sgx_free_epc_page(struct sgx_epc_page *page)
 
 	spin_lock(&node->lock);
 
-	list_add_tail(&page->list, &node->free_page_list);
+	page->owner = NULL;
+	if (page->poison)
+		list_add(&page->list, &node->sgx_poison_page_list);
+	else
+		list_add_tail(&page->list, &node->free_page_list);
 	page->flags = SGX_EPC_PAGE_IS_FREE;
 
 	spin_unlock(&node->lock);
@@ -658,6 +680,7 @@ static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
 		section->pages[i].section = index;
 		section->pages[i].flags = 0;
 		section->pages[i].owner = NULL;
+		section->pages[i].poison = 0;
 		list_add_tail(&section->pages[i].list, &sgx_dirty_page_list);
 	}
 
@@ -724,6 +747,7 @@ static bool __init sgx_page_cache_init(void)
 		if (!node_isset(nid, sgx_numa_mask)) {
 			spin_lock_init(&sgx_numa_nodes[nid].lock);
 			INIT_LIST_HEAD(&sgx_numa_nodes[nid].free_page_list);
+			INIT_LIST_HEAD(&sgx_numa_nodes[nid].sgx_poison_page_list);
 			node_set(nid, sgx_numa_mask);
 		}
 
diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h
index 5906471156c5..9ec3136c7800 100644
--- a/arch/x86/kernel/cpu/sgx/sgx.h
+++ b/arch/x86/kernel/cpu/sgx/sgx.h
@@ -31,7 +31,8 @@
 
 struct sgx_epc_page {
 	unsigned int section;
-	unsigned int flags;
+	u16 flags;
+	u16 poison;
 	struct sgx_encl_page *owner;
 	struct list_head list;
 };
@@ -42,6 +43,7 @@ struct sgx_epc_page {
  */
 struct sgx_numa_node {
 	struct list_head free_page_list;
+	struct list_head sgx_poison_page_list;
 	spinlock_t lock;
 };
 
-- 
Gitee


From 4fccae8c26f2571bfdde980c8b58af647e7ce008 Mon Sep 17 00:00:00 2001
From: Tony Luck <tony.luck@intel.com>
Date: Tue, 26 Oct 2021 15:00:47 -0700
Subject: [PATCH 184/674] x86/sgx: Add SGX infrastructure to recover from
 poison

mainline inclusion
from mainline-5.17
commit a495cbdffa30558b34f3c95555cecc4fd9688039
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZFM
CVE: NA

Intel-SIG: commit a495cbdffa30 x86/sgx: Add SGX infrastructure to
recover from poison.
Backport for SGX MCA recovery co-existence support

--------------------------------

Provide a recovery function sgx_memory_failure(). If the poison was
consumed synchronously then send a SIGBUS. Note that the virtual
address of the access is not included with the SIGBUS as is the case
for poison outside of SGX enclaves. This doesn't matter as addresses
of code/data inside an enclave is of little to no use to code executing
outside the (now dead) enclave.

Poison found in a free page results in the page being moved from the
free list to the per-node poison page list.

Signed-off-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Tested-by: Reinette Chatre <reinette.chatre@intel.com>
Link: https://lkml.kernel.org/r/20211026220050.697075-5-tony.luck@intel.com
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/kernel/cpu/sgx/main.c | 76 ++++++++++++++++++++++++++++++++++
 1 file changed, 76 insertions(+)

diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 2e9ac0454608..4036f50fc42c 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -693,6 +693,82 @@ bool arch_is_platform_page(u64 paddr)
 }
 EXPORT_SYMBOL_GPL(arch_is_platform_page);
 
+static struct sgx_epc_page *sgx_paddr_to_page(u64 paddr)
+{
+	struct sgx_epc_section *section;
+
+	section = xa_load(&sgx_epc_address_space, paddr);
+	if (!section)
+		return NULL;
+
+	return &section->pages[PFN_DOWN(paddr - section->phys_addr)];
+}
+
+/*
+ * Called in process context to handle a hardware reported
+ * error in an SGX EPC page.
+ * If the MF_ACTION_REQUIRED bit is set in flags, then the
+ * context is the task that consumed the poison data. Otherwise
+ * this is called from a kernel thread unrelated to the page.
+ */
+int arch_memory_failure(unsigned long pfn, int flags)
+{
+	struct sgx_epc_page *page = sgx_paddr_to_page(pfn << PAGE_SHIFT);
+	struct sgx_epc_section *section;
+	struct sgx_numa_node *node;
+
+	/*
+	 * mm/memory-failure.c calls this routine for all errors
+	 * where there isn't a "struct page" for the address. But that
+	 * includes other address ranges besides SGX.
+	 */
+	if (!page)
+		return -ENXIO;
+
+	/*
+	 * If poison was consumed synchronously. Send a SIGBUS to
+	 * the task. Hardware has already exited the SGX enclave and
+	 * will not allow re-entry to an enclave that has a memory
+	 * error. The signal may help the task understand why the
+	 * enclave is broken.
+	 */
+	if (flags & MF_ACTION_REQUIRED)
+		force_sig(SIGBUS);
+
+	section = &sgx_epc_sections[page->section];
+	node = section->node;
+
+	spin_lock(&node->lock);
+
+	/* Already poisoned? Nothing more to do */
+	if (page->poison)
+		goto out;
+
+	page->poison = 1;
+
+	/*
+	 * If the page is on a free list, move it to the per-node
+	 * poison page list.
+	 */
+	if (page->flags & SGX_EPC_PAGE_IS_FREE) {
+		list_move(&page->list, &node->sgx_poison_page_list);
+		goto out;
+	}
+
+	/*
+	 * TBD: Add additional plumbing to enable pre-emptive
+	 * action for asynchronous poison notification. Until
+	 * then just hope that the poison:
+	 * a) is not accessed - sgx_free_epc_page() will deal with it
+	 *    when the user gives it back
+	 * b) results in a recoverable machine check rather than
+	 *    a fatal one
+	 */
+out:
+	spin_unlock(&node->lock);
+	return 0;
+}
+
 /**
  * A section metric is concatenated in a way that @low bits 12-31 define the
  * bits 12-31 of the metric and @high bits 0-19 define the bits 32-51 of the
-- 
Gitee


From 5bb9e11f0474d1573ac7c25b0a2139e7ddebf2ab Mon Sep 17 00:00:00 2001
From: Tony Luck <tony.luck@intel.com>
Date: Tue, 26 Oct 2021 15:00:48 -0700
Subject: [PATCH 185/674] x86/sgx: Hook arch_memory_failure() into mainline
 code

mainline inclusion
from mainline-5.17
commit 03b122da74b22fbe7cd98184fa5657a9ce13970c
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZFM
CVE: NA

Intel-SIG: commit 03b122da74b2 x86/sgx: Hook arch_memory_failure()
into mainline code.
Backport for SGX MCA recovery co-existence support

--------------------------------

Add a call inside memory_failure() to call the arch specific code
to check if the address is an SGX EPC page and handle it.

Note the SGX EPC pages do not have a "struct page" entry, so the hook
goes in at the same point as the device mapping hook.

Pull the call to acquire the mutex earlier so the SGX errors are also
protected.

Make set_mce_nospec() skip SGX pages when trying to adjust
the 1:1 map.

Signed-off-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Reviewed-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
Tested-by: Reinette Chatre <reinette.chatre@intel.com>
Link: https://lkml.kernel.org/r/20211026220050.697075-6-tony.luck@intel.com
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/include/asm/processor.h  |  8 ++++++++
 arch/x86/include/asm/set_memory.h |  4 ++++
 include/linux/mm.h                | 13 +++++++++++++
 mm/memory-failure.c               | 19 +++++++++++++------
 4 files changed, 38 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index d428d611a43a..a3d0152a4361 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -858,4 +858,12 @@ enum mds_mitigations {
 	MDS_MITIGATION_VMWERV,
 };
 
+#ifdef CONFIG_X86_SGX
+int arch_memory_failure(unsigned long pfn, int flags);
+#define arch_memory_failure arch_memory_failure
+
+bool arch_is_platform_page(u64 paddr);
+#define arch_is_platform_page arch_is_platform_page
+#endif
+
 #endif /* _ASM_X86_PROCESSOR_H */
diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h
index 5948218f35c5..ef47246d22f5 100644
--- a/arch/x86/include/asm/set_memory.h
+++ b/arch/x86/include/asm/set_memory.h
@@ -2,6 +2,7 @@
 #ifndef _ASM_X86_SET_MEMORY_H
 #define _ASM_X86_SET_MEMORY_H
 
+#include <linux/mm.h>
 #include <asm/page.h>
 #include <asm-generic/set_memory.h>
 
@@ -97,6 +98,9 @@ static inline int set_mce_nospec(unsigned long pfn, bool unmap)
 	unsigned long decoy_addr;
 	int rc;
 
+	/* SGX pages are not in the 1:1 map */
+	if (arch_is_platform_page(pfn << PAGE_SHIFT))
+		return 0;
 	/*
 	 * We would like to just call:
 	 *      set_memory_XX((unsigned long)pfn_to_kaddr(pfn), 1);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index b0ddfac0486b..25891b581bf4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -425,6 +425,19 @@ extern unsigned int kobjsize(const void *objp);
 /* VMA basic access permission flags */
 #define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC)
 
+#ifndef arch_memory_failure
+static inline int arch_memory_failure(unsigned long pfn, int flags)
+{
+	return -ENXIO;
+}
+#endif
+
+#ifndef arch_is_platform_page
+static inline bool arch_is_platform_page(u64 paddr)
+{
+	return false;
+}
+#endif
 
 /*
  * Special vmas that are non-mergable, non-mlock()able.
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 0519f20d2b57..bfa6d1478a75 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1370,21 +1370,28 @@ int memory_failure(unsigned long pfn, int flags)
 	if (!sysctl_memory_failure_recovery)
 		panic("Memory failure on page %lx", pfn);
 
+	mutex_lock(&mf_mutex);
+
 	p = pfn_to_online_page(pfn);
 	if (!p) {
+		res = arch_memory_failure(pfn, flags);
+		if (res == 0)
+			goto unlock_mutex;
+
 		if (pfn_valid(pfn)) {
 			pgmap = get_dev_pagemap(pfn, NULL);
-			if (pgmap)
-				return memory_failure_dev_pagemap(pfn, flags,
-								  pgmap);
+			if (pgmap) {
+				res = memory_failure_dev_pagemap(pfn, flags,
+								 pgmap);
+				goto unlock_mutex;
+			}
 		}
 		pr_err("Memory failure: %#lx: memory outside kernel control\n",
 			pfn);
-		return -ENXIO;
+		res = -ENXIO;
+		goto unlock_mutex;
 	}
 
-	mutex_lock(&mf_mutex);
-
 try_again:
 	if (PageHuge(p)) {
 		res = memory_failure_hugetlb(pfn, flags);
-- 
Gitee


From 3f72e50d38c1c8ea9f6021f539348f2ba1d3a905 Mon Sep 17 00:00:00 2001
From: Tony Luck <tony.luck@intel.com>
Date: Tue, 26 Oct 2021 15:00:49 -0700
Subject: [PATCH 186/674] x86/sgx: Add hook to error injection address
 validation

mainline inclusion
from mainline-5.17
commit c6acb1e7bf4656b9434335c72b8245cc84575fde
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZFM
CVE: NA

Intel-SIG: commit c6acb1e7bf46 x86/sgx: Add hook to error injection
address validation.
Backport for SGX MCA recovery co-existence support

--------------------------------

SGX reserved memory does not appear in the standard address maps.

Add hook to call into the SGX code to check if an address is located
in SGX memory.

There are other challenges in injecting errors into SGX. Update the
documentation with a sequence of operations to inject.

Signed-off-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Tested-by: Reinette Chatre <reinette.chatre@intel.com>
Link: https://lkml.kernel.org/r/20211026220050.697075-7-tony.luck@intel.com
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 .../firmware-guide/acpi/apei/einj.rst         | 19 +++++++++++++++++++
 drivers/acpi/apei/einj.c                      |  3 ++-
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/Documentation/firmware-guide/acpi/apei/einj.rst b/Documentation/firmware-guide/acpi/apei/einj.rst
index e588bccf5158..2fa989c0a12d 100644
--- a/Documentation/firmware-guide/acpi/apei/einj.rst
+++ b/Documentation/firmware-guide/acpi/apei/einj.rst
@@ -181,5 +181,24 @@ You should see something like this in dmesg::
   [22715.834759] EDAC sbridge MC3: PROCESSOR 0:306e7 TIME 1422553404 SOCKET 0 APIC 0
   [22716.616173] EDAC MC3: 1 CE memory read error on CPU_SrcID#0_Channel#0_DIMM#0 (channel:0 slot:0 page:0x12345 offset:0x0 grain:32 syndrome:0x0 -  area:DRAM err_code:0001:0090 socket:0 channel_mask:1 rank:0)
 
+Special notes for injection into SGX enclaves:
+
+There may be a separate BIOS setup option to enable SGX injection.
+
+The injection process consists of setting some special memory controller
+trigger that will inject the error on the next write to the target
+address. But the h/w prevents any software outside of an SGX enclave
+from accessing enclave pages (even BIOS SMM mode).
+
+The following sequence can be used:
+  1) Determine physical address of enclave page
+  2) Use "notrigger=1" mode to inject (this will setup
+     the injection address, but will not actually inject)
+  3) Enter the enclave
+  4) Store data to the virtual address matching physical address from step 1
+  5) Execute CLFLUSH for that virtual address
+  6) Spin delay for 250ms
+  7) Read from the virtual address. This will trigger the error
+
 For more information about EINJ, please refer to ACPI specification
 version 4.0, section 17.5 and ACPI 5.0, section 18.6.
diff --git a/drivers/acpi/apei/einj.c b/drivers/acpi/apei/einj.c
index 133156759551..4384269ad159 100644
--- a/drivers/acpi/apei/einj.c
+++ b/drivers/acpi/apei/einj.c
@@ -544,7 +544,8 @@ static int einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2,
 	    ((region_intersects(base_addr, size, IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE)
 				!= REGION_INTERSECTS) &&
 	     (region_intersects(base_addr, size, IORESOURCE_MEM, IORES_DESC_PERSISTENT_MEMORY)
-				!= REGION_INTERSECTS)))
+				!= REGION_INTERSECTS) &&
+	     !arch_is_platform_page(base_addr)))
 		return -EINVAL;
 
 inject:
-- 
Gitee


From b8151303ac4d44e8eb3862aed5557837837be9af Mon Sep 17 00:00:00 2001
From: Tony Luck <tony.luck@intel.com>
Date: Tue, 26 Oct 2021 15:00:50 -0700
Subject: [PATCH 187/674] x86/sgx: Add check for SGX pages to
 ghes_do_memory_failure()

mainline inclusion
from mainline-5.17
commit 3ad6fd77a2d62e8f4465b429b65805eaf88e1b9e
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZFM
CVE: NA

Intel-SIG: commit 3ad6fd77a2d6 x86/sgx: Add check for SGX pages to
ghes_do_memory_failure().
Backport for SGX MCA recovery co-existence support

--------------------------------

SGX EPC pages do not have a "struct page" associated with them so the
pfn_valid() sanity check fails and results in a warning message to
the console.

Add an additional check to skip the warning if the address of the error
is in an SGX EPC page.

Signed-off-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Tested-by: Reinette Chatre <reinette.chatre@intel.com>
Link: https://lkml.kernel.org/r/20211026220050.697075-8-tony.luck@intel.com
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 drivers/acpi/apei/ghes.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 9ac1e45dbba0..9c38c2cdd2fd 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -452,7 +452,7 @@ static bool ghes_do_memory_failure(u64 physical_addr, int flags)
 		return false;
 
 	pfn = PHYS_PFN(physical_addr);
-	if (!pfn_valid(pfn)) {
+	if (!pfn_valid(pfn) && !arch_is_platform_page(physical_addr)) {
 		pr_warn_ratelimited(FW_WARN GHES_PFX
 		"Invalid address in generic error data: %#llx\n",
 		physical_addr);
-- 
Gitee


From 2f7813cc2de07c4516a117c3f80ccd928520aef4 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Thu, 14 Jul 2022 21:56:58 +0800
Subject: [PATCH 188/674] gfs2: Check for active reservation in gfs2_release

stable inclusion
from stable-v5.10.111
commit 3f53715fd55c4616fc25ac0e45ca7331d8fdf7f1
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=3f53715fd55c4616fc25ac0e45ca7331d8fdf7f1

--------------------------------

[ Upstream commit 0ec9b9ea4f83303bfd8f052a3d8b2bd179b002e1 ]

In gfs2_release, check if the inode has an active reservation to avoid
unnecessary lock taking.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 fs/gfs2/file.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index cfd9d03f604f..59318b1eaa60 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -716,10 +716,10 @@ static int gfs2_release(struct inode *inode, struct file *file)
 	kfree(file->private_data);
 	file->private_data = NULL;
 
-	if (file->f_mode & FMODE_WRITE) {
+	if (gfs2_rs_active(&ip->i_res))
 		gfs2_rs_delete(ip, &inode->i_writecount);
+	if (file->f_mode & FMODE_WRITE)
 		gfs2_qa_put(ip);
-	}
 	return 0;
 }
 
-- 
Gitee


From 9d7d0fdf041465c6390a608c28d07ff15ee3096a Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Thu, 14 Jul 2022 21:56:59 +0800
Subject: [PATCH 189/674] gfs2: Fix gfs2_release for non-writers regression

stable inclusion
from stable-v5.10.111
commit f349d7f9ee6d6c9cf9edadb0e1c5e9de6eda7466
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f349d7f9ee6d6c9cf9edadb0e1c5e9de6eda7466

--------------------------------

[ Upstream commit d3add1a9519dcacd6e644ecac741c56cf18b67f5 ]

When a file is opened for writing, the vfs code (do_dentry_open)
calls get_write_access for the inode, thus incrementing the inode's write
count. That writer normally then creates a multi-block reservation for
the inode (i_res) that can be re-used by other writers, which speeds up
writes for applications that stupidly loop on open/write/close.
When the writes are all done, the multi-block reservation should be
deleted when the file is closed by the last "writer."

Commit 0ec9b9ea4f83 broke that concept when it moved the call to
gfs2_rs_delete before the check for FMODE_WRITE.  Non-writers have no
business removing the multi-block reservations of writers. In fact, if
someone opens and closes the file for RO while a writer has a
multi-block reservation, the RO closer will delete the reservation
midway through the write, and this results in:

kernel BUG at fs/gfs2/rgrp.c:677! (or thereabouts) which is:
BUG_ON(rs->rs_requested); from function gfs2_rs_deltree.

This patch moves the check back inside the check for FMODE_WRITE.

Fixes: 0ec9b9ea4f83 ("gfs2: Check for active reservation in gfs2_release")
Cc: stable@vger.kernel.org # v5.12+
Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 fs/gfs2/file.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 59318b1eaa60..7bd7581aa682 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -716,10 +716,11 @@ static int gfs2_release(struct inode *inode, struct file *file)
 	kfree(file->private_data);
 	file->private_data = NULL;
 
-	if (gfs2_rs_active(&ip->i_res))
-		gfs2_rs_delete(ip, &inode->i_writecount);
-	if (file->f_mode & FMODE_WRITE)
+	if (file->f_mode & FMODE_WRITE) {
+		if (gfs2_rs_active(&ip->i_res))
+			gfs2_rs_delete(ip, &inode->i_writecount);
 		gfs2_qa_put(ip);
+	}
 	return 0;
 }
 
-- 
Gitee


From 63fc4fbc7733aae50b5ba196913d9960aa253594 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Thu, 14 Jul 2022 21:57:00 +0800
Subject: [PATCH 190/674] gfs2: gfs2_setattr_size error path fix

stable inclusion
from stable-v5.10.111
commit 0777fe98a44c3434a817817d3fb1f17ae855f3f2
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=0777fe98a44c3434a817817d3fb1f17ae855f3f2

--------------------------------

[ Upstream commit 7336905a89f19173bf9301cd50a24421162f417c ]

When gfs2_setattr_size() fails, it calls gfs2_rs_delete(ip, NULL) to get
rid of any reservations the inode may have.  Instead, it should pass in
the inode's write count as the second parameter to allow
gfs2_rs_delete() to figure out if the inode has any writers left.

In a next step, there are two instances of gfs2_rs_delete(ip, NULL) left
where we know that there can be no other users of the inode.  Replace
those with gfs2_rs_deltree(&ip->i_res) to avoid the unnecessary write
count check.

With that, gfs2_rs_delete() is only called with the inode's actual write
count, so get rid of the second parameter.

Fixes: a097dc7e24cb ("GFS2: Make rgrp reservations part of the gfs2_inode structure")
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 fs/gfs2/bmap.c  | 2 +-
 fs/gfs2/file.c  | 2 +-
 fs/gfs2/inode.c | 2 +-
 fs/gfs2/rgrp.c  | 7 ++++---
 fs/gfs2/rgrp.h  | 2 +-
 fs/gfs2/super.c | 2 +-
 6 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index b34c02985d9d..6c047570d6a9 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -2200,7 +2200,7 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize)
 
 	ret = do_shrink(inode, newsize);
 out:
-	gfs2_rs_delete(ip, NULL);
+	gfs2_rs_delete(ip);
 	gfs2_qa_put(ip);
 	return ret;
 }
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 7bd7581aa682..2e6f622ed428 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -718,7 +718,7 @@ static int gfs2_release(struct inode *inode, struct file *file)
 
 	if (file->f_mode & FMODE_WRITE) {
 		if (gfs2_rs_active(&ip->i_res))
-			gfs2_rs_delete(ip, &inode->i_writecount);
+			gfs2_rs_delete(ip);
 		gfs2_qa_put(ip);
 	}
 	return 0;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 65ae4fc28ede..74a6b0800e05 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -811,7 +811,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
 		if (free_vfs_inode) /* else evict will do the put for us */
 			gfs2_glock_put(ip->i_gl);
 	}
-	gfs2_rs_delete(ip, NULL);
+	gfs2_rs_deltree(&ip->i_res);
 	gfs2_qa_put(ip);
 fail_free_acls:
 	posix_acl_release(default_acl);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index eb775e93de97..dc55b029afaa 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -664,13 +664,14 @@ void gfs2_rs_deltree(struct gfs2_blkreserv *rs)
 /**
  * gfs2_rs_delete - delete a multi-block reservation
  * @ip: The inode for this reservation
- * @wcount: The inode's write count, or NULL
  *
  */
-void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount)
+void gfs2_rs_delete(struct gfs2_inode *ip)
 {
+	struct inode *inode = &ip->i_inode;
+
 	down_write(&ip->i_rw_mutex);
-	if ((wcount == NULL) || (atomic_read(wcount) <= 1))
+	if (atomic_read(&inode->i_writecount) <= 1)
 		gfs2_rs_deltree(&ip->i_res);
 	up_write(&ip->i_rw_mutex);
 }
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 9a587ada51ed..2d3c150c55bd 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -45,7 +45,7 @@ extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n,
 			     bool dinode, u64 *generation);
 
 extern void gfs2_rs_deltree(struct gfs2_blkreserv *rs);
-extern void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount);
+extern void gfs2_rs_delete(struct gfs2_inode *ip);
 extern void __gfs2_free_blocks(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
 			       u64 bstart, u32 blen, int meta);
 extern void gfs2_free_meta(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index d2b7ecbd1b15..d14b98aa1c3e 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1434,7 +1434,7 @@ static void gfs2_evict_inode(struct inode *inode)
 	truncate_inode_pages_final(&inode->i_data);
 	if (ip->i_qadata)
 		gfs2_assert_warn(sdp, ip->i_qadata->qa_ref == 0);
-	gfs2_rs_delete(ip, NULL);
+	gfs2_rs_deltree(&ip->i_res);
 	gfs2_ordered_del_inode(ip);
 	clear_inode(inode);
 	gfs2_dir_hash_inval(ip);
-- 
Gitee


From f182174d1f67f5cc7227d28aa1f52a8b1c96e1b7 Mon Sep 17 00:00:00 2001
From: Jiasheng Jiang <jiasheng@iscas.ac.cn>
Date: Thu, 14 Jul 2022 21:57:01 +0800
Subject: [PATCH 191/674] rtc: wm8350: Handle error for wm8350_register_irq

stable inclusion
from stable-v5.10.111
commit 2e52a294700bec7867450e912e5bf91e3503288e
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=2e52a294700bec7867450e912e5bf91e3503288e

--------------------------------

[ Upstream commit 43f0269b6b89c1eec4ef83c48035608f4dcdd886 ]

As the potential failure of the wm8350_register_irq(),
it should be better to check it and return error if fails.
Also, it need not free 'wm_rtc->rtc' since it will be freed
automatically.

Fixes: 077eaf5b40ec ("rtc: rtc-wm8350: add support for WM8350 RTC")
Signed-off-by: Jiasheng Jiang <jiasheng@iscas.ac.cn>
Acked-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
Link: https://lore.kernel.org/r/20220303085030.291793-1-jiasheng@iscas.ac.cn
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/rtc/rtc-wm8350.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/drivers/rtc/rtc-wm8350.c b/drivers/rtc/rtc-wm8350.c
index 2018614f258f..6eaa9321c074 100644
--- a/drivers/rtc/rtc-wm8350.c
+++ b/drivers/rtc/rtc-wm8350.c
@@ -432,14 +432,21 @@ static int wm8350_rtc_probe(struct platform_device *pdev)
 		return ret;
 	}
 
-	wm8350_register_irq(wm8350, WM8350_IRQ_RTC_SEC,
+	ret = wm8350_register_irq(wm8350, WM8350_IRQ_RTC_SEC,
 			    wm8350_rtc_update_handler, 0,
 			    "RTC Seconds", wm8350);
+	if (ret)
+		return ret;
+
 	wm8350_mask_irq(wm8350, WM8350_IRQ_RTC_SEC);
 
-	wm8350_register_irq(wm8350, WM8350_IRQ_RTC_ALM,
+	ret = wm8350_register_irq(wm8350, WM8350_IRQ_RTC_ALM,
 			    wm8350_rtc_alarm_handler, 0,
 			    "RTC Alarm", wm8350);
+	if (ret) {
+		wm8350_free_irq(wm8350, WM8350_IRQ_RTC_SEC, wm8350);
+		return ret;
+	}
 
 	return 0;
 }
-- 
Gitee


From 162443c3431f1eb02174f822c00f5548b86628ab Mon Sep 17 00:00:00 2001
From: Jim Mattson <jmattson@google.com>
Date: Thu, 14 Jul 2022 21:57:02 +0800
Subject: [PATCH 192/674] KVM: x86/svm: Clear reserved bits written to
 PerfEvtSeln MSRs

stable inclusion
from stable-v5.10.111
commit 66b0fa6b22187caf4d789bdac8dcca49940628b0
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=66b0fa6b22187caf4d789bdac8dcca49940628b0

--------------------------------

[ Upstream commit 9b026073db2f1ad0e4d8b61c83316c8497981037 ]

AMD EPYC CPUs never raise a #GP for a WRMSR to a PerfEvtSeln MSR. Some
reserved bits are cleared, and some are not. Specifically, on
Zen3/Milan, bits 19 and 42 are not cleared.

When emulating such a WRMSR, KVM should not synthesize a #GP,
regardless of which bits are set. However, undocumented bits should
not be passed through to the hardware MSR. So, rather than checking
for reserved bits and synthesizing a #GP, just clear the reserved
bits.

This may seem pedantic, but since KVM currently does not support the
"Host/Guest Only" bits (41:40), it is necessary to clear these bits
rather than synthesizing #GP, because some popular guests (e.g Linux)
will set the "Host Only" bit even on CPUs that don't support
EFER.SVME, and they don't expect a #GP.

For example,

root@Ubuntu1804:~# perf stat -e r26 -a sleep 1

 Performance counter stats for 'system wide':

                 0      r26

       1.001070977 seconds time elapsed

Feb 23 03:59:58 Ubuntu1804 kernel: [  405.379957] unchecked MSR access error: WRMSR to 0xc0010200 (tried to write 0x0000020000130026) at rIP: 0xffffffff9b276a28 (native_write_msr+0x8/0x30)
Feb 23 03:59:58 Ubuntu1804 kernel: [  405.379958] Call Trace:
Feb 23 03:59:58 Ubuntu1804 kernel: [  405.379963]  amd_pmu_disable_event+0x27/0x90

Fixes: ca724305a2b0 ("KVM: x86/vPMU: Implement AMD vPMU code for KVM")
Reported-by: Lotus Fenn <lotusf@google.com>
Signed-off-by: Jim Mattson <jmattson@google.com>
Reviewed-by: Like Xu <likexu@tencent.com>
Reviewed-by: David Dunn <daviddunn@google.com>
Message-Id: <20220226234131.2167175-1-jmattson@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 arch/x86/kvm/svm/pmu.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
index 4e7093bcb64b..0e9c2322d398 100644
--- a/arch/x86/kvm/svm/pmu.c
+++ b/arch/x86/kvm/svm/pmu.c
@@ -253,12 +253,10 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	/* MSR_EVNTSELn */
 	pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL);
 	if (pmc) {
-		if (data == pmc->eventsel)
-			return 0;
-		if (!(data & pmu->reserved_bits)) {
+		data &= ~pmu->reserved_bits;
+		if (data != pmc->eventsel)
 			reprogram_gp_counter(pmc, data);
-			return 0;
-		}
+		return 0;
 	}
 
 	return 1;
-- 
Gitee


From ab5144b67e9eecff089754b281fd6faa168cb4d4 Mon Sep 17 00:00:00 2001
From: Hou Wenlong <houwenlong.hwl@antgroup.com>
Date: Thu, 14 Jul 2022 21:57:03 +0800
Subject: [PATCH 193/674] KVM: x86/emulator: Emulate RDPID only if it is
 enabled in guest

stable inclusion
from stable-v5.10.111
commit a24479c5e9f44c2a799ca2178b1ba9fe1290706e
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=a24479c5e9f44c2a799ca2178b1ba9fe1290706e

--------------------------------

[ Upstream commit a836839cbfe60dc434c5476a7429cf2bae36415d ]

When RDTSCP is supported but RDPID is not supported in host,
RDPID emulation is available. However, __kvm_get_msr() would
only fail when RDTSCP/RDPID both are disabled in guest, so
the emulator wouldn't inject a #UD when RDPID is disabled but
RDTSCP is enabled in guest.

Fixes: fb6d4d340e05 ("KVM: x86: emulate RDPID")
Signed-off-by: Hou Wenlong <houwenlong.hwl@antgroup.com>
Message-Id: <1dfd46ae5b76d3ed87bde3154d51c64ea64c99c1.1646226788.git.houwenlong.hwl@antgroup.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 arch/x86/kvm/emulate.c     | 4 +++-
 arch/x86/kvm/kvm_emulate.h | 1 +
 arch/x86/kvm/x86.c         | 6 ++++++
 3 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index a63df19ef4da..71e1a2d39f21 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -3611,8 +3611,10 @@ static int em_rdpid(struct x86_emulate_ctxt *ctxt)
 {
 	u64 tsc_aux = 0;
 
-	if (ctxt->ops->get_msr(ctxt, MSR_TSC_AUX, &tsc_aux))
+	if (!ctxt->ops->guest_has_rdpid(ctxt))
 		return emulate_ud(ctxt);
+
+	ctxt->ops->get_msr(ctxt, MSR_TSC_AUX, &tsc_aux);
 	ctxt->dst.val = tsc_aux;
 	return X86EMUL_CONTINUE;
 }
diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
index 7d5be04dc661..aeed6da60e0c 100644
--- a/arch/x86/kvm/kvm_emulate.h
+++ b/arch/x86/kvm/kvm_emulate.h
@@ -225,6 +225,7 @@ struct x86_emulate_ops {
 	bool (*guest_has_long_mode)(struct x86_emulate_ctxt *ctxt);
 	bool (*guest_has_movbe)(struct x86_emulate_ctxt *ctxt);
 	bool (*guest_has_fxsr)(struct x86_emulate_ctxt *ctxt);
+	bool (*guest_has_rdpid)(struct x86_emulate_ctxt *ctxt);
 
 	void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked);
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4e28a6d6dd1a..1c45d9bafe0a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6961,6 +6961,11 @@ static bool emulator_guest_has_fxsr(struct x86_emulate_ctxt *ctxt)
 	return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_FXSR);
 }
 
+static bool emulator_guest_has_rdpid(struct x86_emulate_ctxt *ctxt)
+{
+	return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_RDPID);
+}
+
 static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
 {
 	return kvm_register_read(emul_to_vcpu(ctxt), reg);
@@ -7044,6 +7049,7 @@ static const struct x86_emulate_ops emulate_ops = {
 	.guest_has_long_mode = emulator_guest_has_long_mode,
 	.guest_has_movbe     = emulator_guest_has_movbe,
 	.guest_has_fxsr      = emulator_guest_has_fxsr,
+	.guest_has_rdpid     = emulator_guest_has_rdpid,
 	.set_nmi_mask        = emulator_set_nmi_mask,
 	.get_hflags          = emulator_get_hflags,
 	.set_hflags          = emulator_set_hflags,
-- 
Gitee


From c96ad29ecadf11244fb87bf9da9df16da9234034 Mon Sep 17 00:00:00 2001
From: Anisse Astier <anisse@astier.eu>
Date: Thu, 14 Jul 2022 21:57:04 +0800
Subject: [PATCH 194/674] drm: Add orientation quirk for GPD Win Max

stable inclusion
from stable-v5.10.111
commit 850c4351e89555bb89ddb6c7fee5bed7cfdcfcb2
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=850c4351e89555bb89ddb6c7fee5bed7cfdcfcb2

--------------------------------

[ Upstream commit 0b464ca3e0dd3cec65f28bc6d396d82f19080f69 ]

Panel is 800x1280, but mounted on a laptop form factor, sideways.

Signed-off-by: Anisse Astier <anisse@astier.eu>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20211229222200.53128-3-anisse@astier.eu
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/gpu/drm/drm_panel_orientation_quirks.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/drm_panel_orientation_quirks.c b/drivers/gpu/drm/drm_panel_orientation_quirks.c
index 448c2f2d803a..f5ab891731d0 100644
--- a/drivers/gpu/drm/drm_panel_orientation_quirks.c
+++ b/drivers/gpu/drm/drm_panel_orientation_quirks.c
@@ -166,6 +166,12 @@ static const struct dmi_system_id orientation_data[] = {
 		  DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "MicroPC"),
 		},
 		.driver_data = (void *)&lcd720x1280_rightside_up,
+	}, {	/* GPD Win Max */
+		.matches = {
+		  DMI_EXACT_MATCH(DMI_SYS_VENDOR, "GPD"),
+		  DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "G1619-01"),
+		},
+		.driver_data = (void *)&lcd800x1280_rightside_up,
 	}, {	/*
 		 * GPD Pocket, note that the the DMI data is less generic then
 		 * it seems, devices with a board-vendor of "AMI Corporation"
-- 
Gitee


From 6f858fc398ef1d2ac68aa3a39b8f3941e2200486 Mon Sep 17 00:00:00 2001
From: Zekun Shen <bruceshenzk@gmail.com>
Date: Thu, 14 Jul 2022 21:57:05 +0800
Subject: [PATCH 195/674] ath5k: fix OOB in ath5k_eeprom_read_pcal_info_5111

stable inclusion
from stable-v5.10.111
commit 9d7d83d0399e23d66fd431b553842a84ac10398f
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=9d7d83d0399e23d66fd431b553842a84ac10398f

--------------------------------

[ Upstream commit 564d4eceb97eaf381dd6ef6470b06377bb50c95a ]

The bug was found during fuzzing. Stacktrace locates it in
ath5k_eeprom_convert_pcal_info_5111.
When none of the curve is selected in the loop, idx can go
up to AR5K_EEPROM_N_PD_CURVES. The line makes pd out of bound.
pd = &chinfo[pier].pd_curves[idx];

There are many OOB writes using pd later in the code. So I
added a sanity check for idx. Checks for other loops involving
AR5K_EEPROM_N_PD_CURVES are not needed as the loop index is not
used outside the loops.

The patch is NOT tested with real device.

The following is the fuzzing report

BUG: KASAN: slab-out-of-bounds in ath5k_eeprom_read_pcal_info_5111+0x126a/0x1390 [ath5k]
Write of size 1 at addr ffff8880174a4d60 by task modprobe/214

CPU: 0 PID: 214 Comm: modprobe Not tainted 5.6.0 #1
Call Trace:
 dump_stack+0x76/0xa0
 print_address_description.constprop.0+0x16/0x200
 ? ath5k_eeprom_read_pcal_info_5111+0x126a/0x1390 [ath5k]
 ? ath5k_eeprom_read_pcal_info_5111+0x126a/0x1390 [ath5k]
 __kasan_report.cold+0x37/0x7c
 ? ath5k_eeprom_read_pcal_info_5111+0x126a/0x1390 [ath5k]
 kasan_report+0xe/0x20
 ath5k_eeprom_read_pcal_info_5111+0x126a/0x1390 [ath5k]
 ? apic_timer_interrupt+0xa/0x20
 ? ath5k_eeprom_init_11a_pcal_freq+0xbc0/0xbc0 [ath5k]
 ? ath5k_pci_eeprom_read+0x228/0x3c0 [ath5k]
 ath5k_eeprom_init+0x2513/0x6290 [ath5k]
 ? ath5k_eeprom_init_11a_pcal_freq+0xbc0/0xbc0 [ath5k]
 ? usleep_range+0xb8/0x100
 ? apic_timer_interrupt+0xa/0x20
 ? ath5k_eeprom_read_pcal_info_2413+0x2f20/0x2f20 [ath5k]
 ath5k_hw_init+0xb60/0x1970 [ath5k]
 ath5k_init_ah+0x6fe/0x2530 [ath5k]
 ? kasprintf+0xa6/0xe0
 ? ath5k_stop+0x140/0x140 [ath5k]
 ? _dev_notice+0xf6/0xf6
 ? apic_timer_interrupt+0xa/0x20
 ath5k_pci_probe.cold+0x29a/0x3d6 [ath5k]
 ? ath5k_pci_eeprom_read+0x3c0/0x3c0 [ath5k]
 ? mutex_lock+0x89/0xd0
 ? ath5k_pci_eeprom_read+0x3c0/0x3c0 [ath5k]
 local_pci_probe+0xd3/0x160
 pci_device_probe+0x23f/0x3e0
 ? pci_device_remove+0x280/0x280
 ? pci_device_remove+0x280/0x280
 really_probe+0x209/0x5d0

Reported-by: Brendan Dolan-Gavitt <brendandg@nyu.edu>
Signed-off-by: Zekun Shen <bruceshenzk@gmail.com>
Signed-off-by: Kalle Valo <quic_kvalo@quicinc.com>
Link: https://lore.kernel.org/r/YckvDdj3mtCkDRIt@a-10-27-26-18.dynapool.vpn.nyu.edu
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/net/wireless/ath/ath5k/eeprom.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/wireless/ath/ath5k/eeprom.c b/drivers/net/wireless/ath/ath5k/eeprom.c
index 1fbc2c19848f..d444b3d70ba2 100644
--- a/drivers/net/wireless/ath/ath5k/eeprom.c
+++ b/drivers/net/wireless/ath/ath5k/eeprom.c
@@ -746,6 +746,9 @@ ath5k_eeprom_convert_pcal_info_5111(struct ath5k_hw *ah, int mode,
 			}
 		}
 
+		if (idx == AR5K_EEPROM_N_PD_CURVES)
+			goto err_out;
+
 		ee->ee_pd_gains[mode] = 1;
 
 		pd = &chinfo[pier].pd_curves[idx];
-- 
Gitee


From f82957609b5e7de8fa5f4090a5b147efc72f98e0 Mon Sep 17 00:00:00 2001
From: Dale Zhao <dale.zhao@amd.com>
Date: Thu, 14 Jul 2022 21:57:06 +0800
Subject: [PATCH 196/674] drm/amd/display: Add signal type check when verify
 stream backends same

stable inclusion
from stable-v5.10.111
commit 85313d9bc7bd1f20d4ff92023fe731dadf0c778f
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=85313d9bc7bd1f20d4ff92023fe731dadf0c778f

--------------------------------

[ Upstream commit 047db281c026de5971cedb5bb486aa29bd16a39d ]

[Why]
For allow eDP hot-plug feature, the stream signal may change to VIRTUAL
when plug-out and back to eDP when plug-in. OS will still setPathMode
with same timing for each plugging, but eDP gets no stream update as we
don't check signal type changing back as keeping it VIRTUAL. It's also
unsafe for future cases that stream signal is switched with same timing.

[How]
Check stream signal type change include previous HDMI signal case.

Reviewed-by: Aric Cyr <Aric.Cyr@amd.com>
Acked-by: Wayne Lin <wayne.lin@amd.com>
Signed-off-by: Dale Zhao <dale.zhao@amd.com>
Tested-by: Daniel Wheeler <daniel.wheeler@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/gpu/drm/amd/display/dc/core/dc_resource.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_resource.c b/drivers/gpu/drm/amd/display/dc/core/dc_resource.c
index 5f4cdb05c4db..5c5ccbad9658 100644
--- a/drivers/gpu/drm/amd/display/dc/core/dc_resource.c
+++ b/drivers/gpu/drm/amd/display/dc/core/dc_resource.c
@@ -1674,6 +1674,9 @@ static bool are_stream_backends_same(
 	if (is_timing_changed(stream_a, stream_b))
 		return false;
 
+	if (stream_a->signal != stream_b->signal)
+		return false;
+
 	if (stream_a->dpms_off != stream_b->dpms_off)
 		return false;
 
-- 
Gitee


From 1e958ba009c6fe88dff92244c3a73c7243948b01 Mon Sep 17 00:00:00 2001
From: Xin Xiong <xiongx18@fudan.edu.cn>
Date: Thu, 14 Jul 2022 21:57:07 +0800
Subject: [PATCH 197/674] drm/amd/amdgpu/amdgpu_cs: fix refcount leak of a
 dma_fence obj
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

stable inclusion
from stable-v5.10.111
commit 927beb05aaa429c883cc0ec6adc48964b187e291
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=927beb05aaa429c883cc0ec6adc48964b187e291

--------------------------------

[ Upstream commit dfced44f122c500004a48ecc8db516bb6a295a1b ]

This issue takes place in an error path in
amdgpu_cs_fence_to_handle_ioctl(). When `info->in.what` falls into
default case, the function simply returns -EINVAL, forgetting to
decrement the reference count of a dma_fence obj, which is bumped
earlier by amdgpu_cs_get_fence(). This may result in reference count
leaks.

Fix it by decreasing the refcount of specific object before returning
the error code.

Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Xin Xiong <xiongx18@fudan.edu.cn>
Signed-off-by: Xin Tan <tanxin.ctf@gmail.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 12598a4b5c78..867fcee6b0d3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -1484,6 +1484,7 @@ int amdgpu_cs_fence_to_handle_ioctl(struct drm_device *dev, void *data,
 		return 0;
 
 	default:
+		dma_fence_put(fence);
 		return -EINVAL;
 	}
 }
-- 
Gitee


From 137ee1fdc2747db7d385bc7ae93fd1572ca3d852 Mon Sep 17 00:00:00 2001
From: Wayne Chang <waynec@nvidia.com>
Date: Thu, 14 Jul 2022 21:57:08 +0800
Subject: [PATCH 198/674] usb: gadget: tegra-xudc: Do not program SPARAM

stable inclusion
from stable-v5.10.111
commit 07971b818e18b8b61245be270acc861386f7479d
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=07971b818e18b8b61245be270acc861386f7479d

--------------------------------

[ Upstream commit 62fb61580eb48fc890b7bc9fb5fd263367baeca8 ]

According to the Tegra Technical Reference Manual, SPARAM
is a read-only register and should not be programmed in
the driver.

The change removes the wrong SPARAM usage.

Signed-off-by: Wayne Chang <waynec@nvidia.com>
Link: https://lore.kernel.org/r/20220107090443.149021-1-waynec@nvidia.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/usb/gadget/udc/tegra-xudc.c | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/drivers/usb/gadget/udc/tegra-xudc.c b/drivers/usb/gadget/udc/tegra-xudc.c
index 57ee72fead45..1dd0d50c6b56 100644
--- a/drivers/usb/gadget/udc/tegra-xudc.c
+++ b/drivers/usb/gadget/udc/tegra-xudc.c
@@ -32,9 +32,6 @@
 #include <linux/workqueue.h>
 
 /* XUSB_DEV registers */
-#define SPARAM 0x000
-#define  SPARAM_ERSTMAX_MASK GENMASK(20, 16)
-#define  SPARAM_ERSTMAX(x) (((x) << 16) & SPARAM_ERSTMAX_MASK)
 #define DB 0x004
 #define  DB_TARGET_MASK GENMASK(15, 8)
 #define  DB_TARGET(x) (((x) << 8) & DB_TARGET_MASK)
@@ -3295,11 +3292,6 @@ static void tegra_xudc_init_event_ring(struct tegra_xudc *xudc)
 	unsigned int i;
 	u32 val;
 
-	val = xudc_readl(xudc, SPARAM);
-	val &= ~(SPARAM_ERSTMAX_MASK);
-	val |= SPARAM_ERSTMAX(XUDC_NR_EVENT_RINGS);
-	xudc_writel(xudc, val, SPARAM);
-
 	for (i = 0; i < ARRAY_SIZE(xudc->event_ring); i++) {
 		memset(xudc->event_ring[i], 0, XUDC_EVENT_RING_SIZE *
 		       sizeof(*xudc->event_ring[i]));
-- 
Gitee


From 7b7537b8342690fc80888b2028bc7fa5c0f7587f Mon Sep 17 00:00:00 2001
From: Wayne Chang <waynec@nvidia.com>
Date: Thu, 14 Jul 2022 21:57:09 +0800
Subject: [PATCH 199/674] usb: gadget: tegra-xudc: Fix control endpoint's
 definitions

stable inclusion
from stable-v5.10.111
commit 9ea17b9f1dd0dd9f458df9460b36d5c7160eb3f1
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=9ea17b9f1dd0dd9f458df9460b36d5c7160eb3f1

--------------------------------

[ Upstream commit 7bd42fb95eb4f98495ccadf467ad15124208ec49 ]

According to the Tegra Technical Reference Manual, the seq_num
field of control endpoint is not [31:24] but [31:27]. Bit 24
is reserved and bit 26 is splitxstate.

The change fixes the wrong control endpoint's definitions.

Signed-off-by: Wayne Chang <waynec@nvidia.com>
Link: https://lore.kernel.org/r/20220107091349.149798-1-waynec@nvidia.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/usb/gadget/udc/tegra-xudc.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/usb/gadget/udc/tegra-xudc.c b/drivers/usb/gadget/udc/tegra-xudc.c
index 1dd0d50c6b56..de178bf264c2 100644
--- a/drivers/usb/gadget/udc/tegra-xudc.c
+++ b/drivers/usb/gadget/udc/tegra-xudc.c
@@ -272,8 +272,10 @@ BUILD_EP_CONTEXT_RW(deq_hi, deq_hi, 0, 0xffffffff)
 BUILD_EP_CONTEXT_RW(avg_trb_len, tx_info, 0, 0xffff)
 BUILD_EP_CONTEXT_RW(max_esit_payload, tx_info, 16, 0xffff)
 BUILD_EP_CONTEXT_RW(edtla, rsvd[0], 0, 0xffffff)
-BUILD_EP_CONTEXT_RW(seq_num, rsvd[0], 24, 0xff)
+BUILD_EP_CONTEXT_RW(rsvd, rsvd[0], 24, 0x1)
 BUILD_EP_CONTEXT_RW(partial_td, rsvd[0], 25, 0x1)
+BUILD_EP_CONTEXT_RW(splitxstate, rsvd[0], 26, 0x1)
+BUILD_EP_CONTEXT_RW(seq_num, rsvd[0], 27, 0x1f)
 BUILD_EP_CONTEXT_RW(cerrcnt, rsvd[1], 18, 0x3)
 BUILD_EP_CONTEXT_RW(data_offset, rsvd[2], 0, 0x1ffff)
 BUILD_EP_CONTEXT_RW(numtrbs, rsvd[2], 22, 0x1f)
@@ -1554,6 +1556,9 @@ static int __tegra_xudc_ep_set_halt(struct tegra_xudc_ep *ep, bool halt)
 		ep_reload(xudc, ep->index);
 
 		ep_ctx_write_state(ep->context, EP_STATE_RUNNING);
+		ep_ctx_write_rsvd(ep->context, 0);
+		ep_ctx_write_partial_td(ep->context, 0);
+		ep_ctx_write_splitxstate(ep->context, 0);
 		ep_ctx_write_seq_num(ep->context, 0);
 
 		ep_reload(xudc, ep->index);
@@ -2809,7 +2814,10 @@ static void tegra_xudc_reset(struct tegra_xudc *xudc)
 	xudc->setup_seq_num = 0;
 	xudc->queued_setup_packet = false;
 
-	ep_ctx_write_seq_num(ep0->context, xudc->setup_seq_num);
+	ep_ctx_write_rsvd(ep0->context, 0);
+	ep_ctx_write_partial_td(ep0->context, 0);
+	ep_ctx_write_splitxstate(ep0->context, 0);
+	ep_ctx_write_seq_num(ep0->context, 0);
 
 	deq_ptr = trb_virt_to_phys(ep0, &ep0->transfer_ring[ep0->deq_ptr]);
 
-- 
Gitee


From 7a81c512ceac75cc4f8e13effe8fcf1f5489843b Mon Sep 17 00:00:00 2001
From: Yang Guang <yang.guang5@zte.com.cn>
Date: Thu, 14 Jul 2022 21:57:10 +0800
Subject: [PATCH 200/674] ptp: replace snprintf with sysfs_emit

stable inclusion
from stable-v5.10.111
commit 02e2ee8619849f7ac1555d250698fd900ba95d2d
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=02e2ee8619849f7ac1555d250698fd900ba95d2d

--------------------------------

[ Upstream commit e2cf07654efb0fd7bbcb475c6f74be7b5755a8fd ]

coccinelle report:
./drivers/ptp/ptp_sysfs.c:17:8-16:
WARNING: use scnprintf or sprintf
./drivers/ptp/ptp_sysfs.c:390:8-16:
WARNING: use scnprintf or sprintf

Use sysfs_emit instead of scnprintf or sprintf makes more sense.

Reported-by: Zeal Robot <zealci@zte.com.cn>
Signed-off-by: Yang Guang <yang.guang5@zte.com.cn>
Signed-off-by: David Yang <davidcomponentone@gmail.com>
Acked-by: Richard Cochran <richardcochran@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/ptp/ptp_sysfs.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/ptp/ptp_sysfs.c b/drivers/ptp/ptp_sysfs.c
index be076a91e20e..8cd59e848163 100644
--- a/drivers/ptp/ptp_sysfs.c
+++ b/drivers/ptp/ptp_sysfs.c
@@ -13,7 +13,7 @@ static ssize_t clock_name_show(struct device *dev,
 			       struct device_attribute *attr, char *page)
 {
 	struct ptp_clock *ptp = dev_get_drvdata(dev);
-	return snprintf(page, PAGE_SIZE-1, "%s\n", ptp->info->name);
+	return sysfs_emit(page, "%s\n", ptp->info->name);
 }
 static DEVICE_ATTR_RO(clock_name);
 
@@ -227,7 +227,7 @@ static ssize_t ptp_pin_show(struct device *dev, struct device_attribute *attr,
 
 	mutex_unlock(&ptp->pincfg_mux);
 
-	return snprintf(page, PAGE_SIZE, "%u %u\n", func, chan);
+	return sysfs_emit(page, "%u %u\n", func, chan);
 }
 
 static ssize_t ptp_pin_store(struct device *dev, struct device_attribute *attr,
-- 
Gitee


From 52033721eb81fbd50f491b514b080ce6ee5826c2 Mon Sep 17 00:00:00 2001
From: Maxim Kiselev <bigunclemax@gmail.com>
Date: Thu, 14 Jul 2022 21:57:11 +0800
Subject: [PATCH 201/674] powerpc: dts: t104xrdb: fix phy type for FMAN 4/5

stable inclusion
from stable-v5.10.111
commit e4d2d72013564231842f87fe8fea5d3e4edecedc
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=e4d2d72013564231842f87fe8fea5d3e4edecedc

--------------------------------

[ Upstream commit 17846485dff91acce1ad47b508b633dffc32e838 ]

T1040RDB has two RTL8211E-VB phys which requires setting
of internal delays for correct work.

Changing the phy-connection-type property to `rgmii-id`
will fix this issue.

Signed-off-by: Maxim Kiselev <bigunclemax@gmail.com>
Reviewed-by: Maxim Kochetkov <fido_max@inbox.ru>
Reviewed-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20211230151123.1258321-1-bigunclemax@gmail.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 arch/powerpc/boot/dts/fsl/t104xrdb.dtsi | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/boot/dts/fsl/t104xrdb.dtsi b/arch/powerpc/boot/dts/fsl/t104xrdb.dtsi
index 099a598c74c0..bfe1ed5be337 100644
--- a/arch/powerpc/boot/dts/fsl/t104xrdb.dtsi
+++ b/arch/powerpc/boot/dts/fsl/t104xrdb.dtsi
@@ -139,12 +139,12 @@ pca9546@77 {
 		fman@400000 {
 			ethernet@e6000 {
 				phy-handle = <&phy_rgmii_0>;
-				phy-connection-type = "rgmii";
+				phy-connection-type = "rgmii-id";
 			};
 
 			ethernet@e8000 {
 				phy-handle = <&phy_rgmii_1>;
-				phy-connection-type = "rgmii";
+				phy-connection-type = "rgmii-id";
 			};
 
 			mdio0: mdio@fc000 {
-- 
Gitee


From bd1eb5982d2ce144afeaac39413bddcee4a37753 Mon Sep 17 00:00:00 2001
From: Venkateswara Naralasetty <quic_vnaralas@quicinc.com>
Date: Thu, 14 Jul 2022 21:57:12 +0800
Subject: [PATCH 202/674] ath11k: fix kernel panic during unload/load ath11k
 modules

stable inclusion
from stable-v5.10.111
commit c6a815f5abdf324108799829dd19ea62fef4bf95
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=c6a815f5abdf324108799829dd19ea62fef4bf95

--------------------------------

[ Upstream commit 22b59cb965f79ee1accf83172441c9ca0ecb632a ]

Call netif_napi_del() from ath11k_ahb_free_ext_irq() to fix
the following kernel panic when unload/load ath11k modules
for few iterations.

[  971.201365] Unable to handle kernel paging request at virtual address 6d97a208
[  971.204227] pgd = 594c2919
[  971.211478] [6d97a208] *pgd=00000000
[  971.214120] Internal error: Oops: 5 [#1] PREEMPT SMP ARM
[  971.412024] CPU: 2 PID: 4435 Comm: insmod Not tainted 5.4.89 #0
[  971.434256] Hardware name: Generic DT based system
[  971.440165] PC is at napi_by_id+0x10/0x40
[  971.445019] LR is at netif_napi_add+0x160/0x1dc

[  971.743127] (napi_by_id) from [<807d89a0>] (netif_napi_add+0x160/0x1dc)
[  971.751295] (netif_napi_add) from [<7f1209ac>] (ath11k_ahb_config_irq+0xf8/0x414 [ath11k_ahb])
[  971.759164] (ath11k_ahb_config_irq [ath11k_ahb]) from [<7f12135c>] (ath11k_ahb_probe+0x40c/0x51c [ath11k_ahb])
[  971.768567] (ath11k_ahb_probe [ath11k_ahb]) from [<80666864>] (platform_drv_probe+0x48/0x94)
[  971.779670] (platform_drv_probe) from [<80664718>] (really_probe+0x1c8/0x450)
[  971.789389] (really_probe) from [<80664cc4>] (driver_probe_device+0x15c/0x1b8)
[  971.797547] (driver_probe_device) from [<80664f60>] (device_driver_attach+0x44/0x60)
[  971.805795] (device_driver_attach) from [<806650a0>] (__driver_attach+0x124/0x140)
[  971.814822] (__driver_attach) from [<80662adc>] (bus_for_each_dev+0x58/0xa4)
[  971.823328] (bus_for_each_dev) from [<80663a2c>] (bus_add_driver+0xf0/0x1e8)
[  971.831662] (bus_add_driver) from [<806658a4>] (driver_register+0xa8/0xf0)
[  971.839822] (driver_register) from [<8030269c>] (do_one_initcall+0x78/0x1ac)
[  971.847638] (do_one_initcall) from [<80392524>] (do_init_module+0x54/0x200)
[  971.855968] (do_init_module) from [<803945b0>] (load_module+0x1e30/0x1ffc)
[  971.864126] (load_module) from [<803948b0>] (sys_init_module+0x134/0x17c)
[  971.871852] (sys_init_module) from [<80301000>] (ret_fast_syscall+0x0/0x50)

Tested-on: IPQ8074 hw2.0 AHB WLAN.HK.2.6.0.1-00760-QCAHKSWPL_SILICONZ-1

Signed-off-by: Venkateswara Naralasetty <quic_vnaralas@quicinc.com>
Signed-off-by: Kalle Valo <quic_kvalo@quicinc.com>
Link: https://lore.kernel.org/r/1642583973-21599-1-git-send-email-quic_vnaralas@quicinc.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/net/wireless/ath/ath11k/ahb.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/wireless/ath/ath11k/ahb.c b/drivers/net/wireless/ath/ath11k/ahb.c
index 9ff6e6853314..190bc5712e96 100644
--- a/drivers/net/wireless/ath/ath11k/ahb.c
+++ b/drivers/net/wireless/ath/ath11k/ahb.c
@@ -366,6 +366,8 @@ static void ath11k_ahb_free_ext_irq(struct ath11k_base *ab)
 
 		for (j = 0; j < irq_grp->num_irq; j++)
 			free_irq(ab->irq_num[irq_grp->irqs[j]], irq_grp);
+
+		netif_napi_del(&irq_grp->napi);
 	}
 }
 
-- 
Gitee


From 7658d7c5aaa3e536aa38714731f06d780041a006 Mon Sep 17 00:00:00 2001
From: Kalle Valo <quic_kvalo@quicinc.com>
Date: Thu, 14 Jul 2022 21:57:13 +0800
Subject: [PATCH 203/674] ath11k: mhi: use mhi_sync_power_up()

stable inclusion
from stable-v5.10.111
commit 339bd0b55ecdd0f7f341e9357c4cfde799de9418
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=339bd0b55ecdd0f7f341e9357c4cfde799de9418

--------------------------------

[ Upstream commit 3df6d74aedfdca919cca475d15dfdbc8b05c9e5d ]

If amss.bin was missing ath11k would crash during 'rmmod ath11k_pci'. The
reason for that was that we were using mhi_async_power_up() which does not
check any errors. But mhi_sync_power_up() on the other hand does check for
errors so let's use that to fix the crash.

I was not able to find a reason why an async version was used.
ath11k_mhi_start() (which enables state ATH11K_MHI_POWER_ON) is called from
ath11k_hif_power_up(), which can sleep. So sync version should be safe to use
here.

[  145.569731] general protection fault, probably for non-canonical address 0xdffffc0000000000: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC KASAN PTI
[  145.569789] KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007]
[  145.569843] CPU: 2 PID: 1628 Comm: rmmod Kdump: loaded Tainted: G        W         5.16.0-wt-ath+ #567
[  145.569898] Hardware name: Intel(R) Client Systems NUC8i7HVK/NUC8i7HVB, BIOS HNKBLi70.86A.0067.2021.0528.1339 05/28/2021
[  145.569956] RIP: 0010:ath11k_hal_srng_access_begin+0xb5/0x2b0 [ath11k]
[  145.570028] Code: df 48 89 fa 48 c1 ea 03 80 3c 02 00 0f 85 ec 01 00 00 48 8b ab a8 00 00 00 48 b8 00 00 00 00 00 fc ff df 48 89 ea 48 c1 ea 03 <0f> b6 14 02 48 89 e8 83 e0 07 83 c0 03 45 85 ed 75 48 38 d0 7c 08
[  145.570089] RSP: 0018:ffffc900025d7ac0 EFLAGS: 00010246
[  145.570144] RAX: dffffc0000000000 RBX: ffff88814fca2dd8 RCX: 1ffffffff50cb455
[  145.570196] RDX: 0000000000000000 RSI: ffff88814fca2dd8 RDI: ffff88814fca2e80
[  145.570252] RBP: 0000000000000000 R08: 0000000000000000 R09: ffffffffa8659497
[  145.570329] R10: fffffbfff50cb292 R11: 0000000000000001 R12: ffff88814fca0000
[  145.570410] R13: 0000000000000000 R14: ffff88814fca2798 R15: ffff88814fca2dd8
[  145.570465] FS:  00007fa399988540(0000) GS:ffff888233e00000(0000) knlGS:0000000000000000
[  145.570519] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  145.570571] CR2: 00007fa399b51421 CR3: 0000000137898002 CR4: 00000000003706e0
[  145.570623] Call Trace:
[  145.570675]  <TASK>
[  145.570727]  ? ath11k_ce_tx_process_cb+0x34b/0x860 [ath11k]
[  145.570797]  ath11k_ce_tx_process_cb+0x356/0x860 [ath11k]
[  145.570864]  ? tasklet_init+0x150/0x150
[  145.570919]  ? ath11k_ce_alloc_pipes+0x280/0x280 [ath11k]
[  145.570986]  ? tasklet_clear_sched+0x42/0xe0
[  145.571042]  ? tasklet_kill+0xe9/0x1b0
[  145.571095]  ? tasklet_clear_sched+0xe0/0xe0
[  145.571148]  ? irq_has_action+0x120/0x120
[  145.571202]  ath11k_ce_cleanup_pipes+0x45a/0x580 [ath11k]
[  145.571270]  ? ath11k_pci_stop+0x10e/0x170 [ath11k_pci]
[  145.571345]  ath11k_core_stop+0x8a/0xc0 [ath11k]
[  145.571434]  ath11k_core_deinit+0x9e/0x150 [ath11k]
[  145.571499]  ath11k_pci_remove+0xd2/0x260 [ath11k_pci]
[  145.571553]  pci_device_remove+0x9a/0x1c0
[  145.571605]  __device_release_driver+0x332/0x660
[  145.571659]  driver_detach+0x1e7/0x2c0
[  145.571712]  bus_remove_driver+0xe2/0x2d0
[  145.571772]  pci_unregister_driver+0x21/0x250
[  145.571826]  __do_sys_delete_module+0x30a/0x4b0
[  145.571879]  ? free_module+0xac0/0xac0
[  145.571933]  ? lockdep_hardirqs_on_prepare.part.0+0x18c/0x370
[  145.571986]  ? syscall_enter_from_user_mode+0x1d/0x50
[  145.572039]  ? lockdep_hardirqs_on+0x79/0x100
[  145.572097]  do_syscall_64+0x3b/0x90
[  145.572153]  entry_SYSCALL_64_after_hwframe+0x44/0xae

Tested-on: WCN6855 hw2.0 PCI WLAN.HSP.1.1-03003-QCAHSPSWPL_V1_V2_SILICONZ_LITE-2

Signed-off-by: Kalle Valo <quic_kvalo@quicinc.com>
Link: https://lore.kernel.org/r/20220127090117.2024-2-kvalo@kernel.org
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/net/wireless/ath/ath11k/mhi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/ath11k/mhi.c b/drivers/net/wireless/ath/ath11k/mhi.c
index aded9a719d51..84db9e55c3e7 100644
--- a/drivers/net/wireless/ath/ath11k/mhi.c
+++ b/drivers/net/wireless/ath/ath11k/mhi.c
@@ -402,7 +402,7 @@ static int ath11k_mhi_set_state(struct ath11k_pci *ab_pci,
 		ret = 0;
 		break;
 	case ATH11K_MHI_POWER_ON:
-		ret = mhi_async_power_up(ab_pci->mhi_ctrl);
+		ret = mhi_sync_power_up(ab_pci->mhi_ctrl);
 		break;
 	case ATH11K_MHI_POWER_OFF:
 		mhi_power_down(ab_pci->mhi_ctrl, true);
-- 
Gitee


From 4d9dfcd10e53a8fc7e5e6be9a66291b2421f2a3a Mon Sep 17 00:00:00 2001
From: Jakub Sitnicki <jakub@cloudflare.com>
Date: Thu, 14 Jul 2022 21:57:14 +0800
Subject: [PATCH 204/674] bpf: Make dst_port field in struct bpf_sock 16-bit
 wide

stable inclusion
from stable-v5.10.111
commit 995f517888687c0730bc3d2dbca424c27350eaa7
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=995f517888687c0730bc3d2dbca424c27350eaa7

--------------------------------

[ Upstream commit 4421a582718ab81608d8486734c18083b822390d ]

Menglong Dong reports that the documentation for the dst_port field in
struct bpf_sock is inaccurate and confusing. From the BPF program PoV, the
field is a zero-padded 16-bit integer in network byte order. The value
appears to the BPF user as if laid out in memory as so:

  offsetof(struct bpf_sock, dst_port) + 0  <port MSB>
                                      + 8  <port LSB>
                                      +16  0x00
                                      +24  0x00

32-, 16-, and 8-bit wide loads from the field are all allowed, but only if
the offset into the field is 0.

32-bit wide loads from dst_port are especially confusing. The loaded value,
after converting to host byte order with bpf_ntohl(dst_port), contains the
port number in the upper 16-bits.

Remove the confusion by splitting the field into two 16-bit fields. For
backward compatibility, allow 32-bit wide loads from offsetof(struct
bpf_sock, dst_port).

While at it, allow loads 8-bit loads at offset [0] and [1] from dst_port.

Reported-by: Menglong Dong <imagedong@tencent.com>
Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Link: https://lore.kernel.org/r/20220130115518.213259-2-jakub@cloudflare.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 include/uapi/linux/bpf.h |  3 ++-
 net/core/filter.c        | 10 +++++++++-
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 6bf4d010222e..8fae845d80e2 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -4195,7 +4195,8 @@ struct bpf_sock {
 	__u32 src_ip4;
 	__u32 src_ip6[4];
 	__u32 src_port;		/* host byte order */
-	__u32 dst_port;		/* network byte order */
+	__be16 dst_port;	/* network byte order */
+	__u16 :16;		/* zero padding */
 	__u32 dst_ip4;
 	__u32 dst_ip6[4];
 	__u32 state;
diff --git a/net/core/filter.c b/net/core/filter.c
index 4e06f7331914..6b619ef0dee3 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -7789,6 +7789,7 @@ bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type,
 			      struct bpf_insn_access_aux *info)
 {
 	const int size_default = sizeof(__u32);
+	int field_size;
 
 	if (off < 0 || off >= sizeof(struct bpf_sock))
 		return false;
@@ -7800,7 +7801,6 @@ bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type,
 	case offsetof(struct bpf_sock, family):
 	case offsetof(struct bpf_sock, type):
 	case offsetof(struct bpf_sock, protocol):
-	case offsetof(struct bpf_sock, dst_port):
 	case offsetof(struct bpf_sock, src_port):
 	case offsetof(struct bpf_sock, rx_queue_mapping):
 	case bpf_ctx_range(struct bpf_sock, src_ip4):
@@ -7809,6 +7809,14 @@ bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type,
 	case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]):
 		bpf_ctx_record_field_size(info, size_default);
 		return bpf_ctx_narrow_access_ok(off, size, size_default);
+	case bpf_ctx_range(struct bpf_sock, dst_port):
+		field_size = size == size_default ?
+			size_default : sizeof_field(struct bpf_sock, dst_port);
+		bpf_ctx_record_field_size(info, field_size);
+		return bpf_ctx_narrow_access_ok(off, size, field_size);
+	case offsetofend(struct bpf_sock, dst_port) ...
+	     offsetof(struct bpf_sock, dst_ip4) - 1:
+		return false;
 	}
 
 	return size == size_default;
-- 
Gitee


From ea8d0294b99d34ab6037a80c39e78eaef333e7c6 Mon Sep 17 00:00:00 2001
From: Yang Guang <yang.guang5@zte.com.cn>
Date: Thu, 14 Jul 2022 21:57:15 +0800
Subject: [PATCH 205/674] scsi: mvsas: Replace snprintf() with sysfs_emit()

stable inclusion
from stable-v5.10.111
commit ed7db959203e3ef06f7b59dfb504d392842312f3
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=ed7db959203e3ef06f7b59dfb504d392842312f3

--------------------------------

[ Upstream commit 0ad3867b0f13e45cfee5a1298bfd40eef096116c ]

coccinelle report:
./drivers/scsi/mvsas/mv_init.c:699:8-16:
WARNING: use scnprintf or sprintf
./drivers/scsi/mvsas/mv_init.c:747:8-16:
WARNING: use scnprintf or sprintf

Use sysfs_emit() instead of scnprintf() or sprintf().

Link: https://lore.kernel.org/r/c1711f7cf251730a8ceb5bdfc313bf85662b3395.1643182948.git.yang.guang5@zte.com.cn
Reported-by: Zeal Robot <zealci@zte.com.cn>
Signed-off-by: Yang Guang <yang.guang5@zte.com.cn>
Signed-off-by: David Yang <davidcomponentone@gmail.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/scsi/mvsas/mv_init.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/scsi/mvsas/mv_init.c b/drivers/scsi/mvsas/mv_init.c
index b03c0f35d7b0..0cfea7b2ab13 100644
--- a/drivers/scsi/mvsas/mv_init.c
+++ b/drivers/scsi/mvsas/mv_init.c
@@ -697,7 +697,7 @@ static ssize_t
 mvs_show_driver_version(struct device *cdev,
 		struct device_attribute *attr,  char *buffer)
 {
-	return snprintf(buffer, PAGE_SIZE, "%s\n", DRV_VERSION);
+	return sysfs_emit(buffer, "%s\n", DRV_VERSION);
 }
 
 static DEVICE_ATTR(driver_version,
@@ -749,7 +749,7 @@ mvs_store_interrupt_coalescing(struct device *cdev,
 static ssize_t mvs_show_interrupt_coalescing(struct device *cdev,
 			struct device_attribute *attr, char *buffer)
 {
-	return snprintf(buffer, PAGE_SIZE, "%d\n", interrupt_coalescing);
+	return sysfs_emit(buffer, "%d\n", interrupt_coalescing);
 }
 
 static DEVICE_ATTR(interrupt_coalescing,
-- 
Gitee


From 469563ccf55711bafcb482164c800826a801ab1f Mon Sep 17 00:00:00 2001
From: Yang Guang <yang.guang5@zte.com.cn>
Date: Thu, 14 Jul 2022 21:57:16 +0800
Subject: [PATCH 206/674] scsi: bfa: Replace snprintf() with sysfs_emit()

stable inclusion
from stable-v5.10.111
commit de9505936c47df0c1cbee112a48c883322135b54
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=de9505936c47df0c1cbee112a48c883322135b54

--------------------------------

[ Upstream commit 2245ea91fd3a04cafbe2f54911432a8657528c3b ]

coccinelle report:
./drivers/scsi/bfa/bfad_attr.c:908:8-16:
WARNING: use scnprintf or sprintf
./drivers/scsi/bfa/bfad_attr.c:860:8-16:
WARNING: use scnprintf or sprintf
./drivers/scsi/bfa/bfad_attr.c:888:8-16:
WARNING: use scnprintf or sprintf
./drivers/scsi/bfa/bfad_attr.c:853:8-16:
WARNING: use scnprintf or sprintf
./drivers/scsi/bfa/bfad_attr.c:808:8-16:
WARNING: use scnprintf or sprintf
./drivers/scsi/bfa/bfad_attr.c:728:8-16:
WARNING: use scnprintf or sprintf
./drivers/scsi/bfa/bfad_attr.c:822:8-16:
WARNING: use scnprintf or sprintf
./drivers/scsi/bfa/bfad_attr.c:927:9-17:
WARNING: use scnprintf or sprintf
./drivers/scsi/bfa/bfad_attr.c:900:8-16:
WARNING: use scnprintf or sprintf
./drivers/scsi/bfa/bfad_attr.c:874:8-16:
WARNING: use scnprintf or sprintf
./drivers/scsi/bfa/bfad_attr.c:714:8-16:
WARNING: use scnprintf or sprintf
./drivers/scsi/bfa/bfad_attr.c:839:8-16:
WARNING: use scnprintf or sprintf

Use sysfs_emit() instead of scnprintf() or sprintf().

Link: https://lore.kernel.org/r/def83ff75faec64ba592b867a8499b1367bae303.1643181468.git.yang.guang5@zte.com.cn
Reported-by: Zeal Robot <zealci@zte.com.cn>
Signed-off-by: Yang Guang <yang.guang5@zte.com.cn>
Signed-off-by: David Yang <davidcomponentone@gmail.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/scsi/bfa/bfad_attr.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/drivers/scsi/bfa/bfad_attr.c b/drivers/scsi/bfa/bfad_attr.c
index 5ae1e3f78910..e049cdb3c286 100644
--- a/drivers/scsi/bfa/bfad_attr.c
+++ b/drivers/scsi/bfa/bfad_attr.c
@@ -711,7 +711,7 @@ bfad_im_serial_num_show(struct device *dev, struct device_attribute *attr,
 	char serial_num[BFA_ADAPTER_SERIAL_NUM_LEN];
 
 	bfa_get_adapter_serial_num(&bfad->bfa, serial_num);
-	return snprintf(buf, PAGE_SIZE, "%s\n", serial_num);
+	return sysfs_emit(buf, "%s\n", serial_num);
 }
 
 static ssize_t
@@ -725,7 +725,7 @@ bfad_im_model_show(struct device *dev, struct device_attribute *attr,
 	char model[BFA_ADAPTER_MODEL_NAME_LEN];
 
 	bfa_get_adapter_model(&bfad->bfa, model);
-	return snprintf(buf, PAGE_SIZE, "%s\n", model);
+	return sysfs_emit(buf, "%s\n", model);
 }
 
 static ssize_t
@@ -805,7 +805,7 @@ bfad_im_model_desc_show(struct device *dev, struct device_attribute *attr,
 		snprintf(model_descr, BFA_ADAPTER_MODEL_DESCR_LEN,
 			"Invalid Model");
 
-	return snprintf(buf, PAGE_SIZE, "%s\n", model_descr);
+	return sysfs_emit(buf, "%s\n", model_descr);
 }
 
 static ssize_t
@@ -819,7 +819,7 @@ bfad_im_node_name_show(struct device *dev, struct device_attribute *attr,
 	u64        nwwn;
 
 	nwwn = bfa_fcs_lport_get_nwwn(port->fcs_port);
-	return snprintf(buf, PAGE_SIZE, "0x%llx\n", cpu_to_be64(nwwn));
+	return sysfs_emit(buf, "0x%llx\n", cpu_to_be64(nwwn));
 }
 
 static ssize_t
@@ -836,7 +836,7 @@ bfad_im_symbolic_name_show(struct device *dev, struct device_attribute *attr,
 	bfa_fcs_lport_get_attr(&bfad->bfa_fcs.fabric.bport, &port_attr);
 	strlcpy(symname, port_attr.port_cfg.sym_name.symname,
 			BFA_SYMNAME_MAXLEN);
-	return snprintf(buf, PAGE_SIZE, "%s\n", symname);
+	return sysfs_emit(buf, "%s\n", symname);
 }
 
 static ssize_t
@@ -850,14 +850,14 @@ bfad_im_hw_version_show(struct device *dev, struct device_attribute *attr,
 	char hw_ver[BFA_VERSION_LEN];
 
 	bfa_get_pci_chip_rev(&bfad->bfa, hw_ver);
-	return snprintf(buf, PAGE_SIZE, "%s\n", hw_ver);
+	return sysfs_emit(buf, "%s\n", hw_ver);
 }
 
 static ssize_t
 bfad_im_drv_version_show(struct device *dev, struct device_attribute *attr,
 				char *buf)
 {
-	return snprintf(buf, PAGE_SIZE, "%s\n", BFAD_DRIVER_VERSION);
+	return sysfs_emit(buf, "%s\n", BFAD_DRIVER_VERSION);
 }
 
 static ssize_t
@@ -871,7 +871,7 @@ bfad_im_optionrom_version_show(struct device *dev,
 	char optrom_ver[BFA_VERSION_LEN];
 
 	bfa_get_adapter_optrom_ver(&bfad->bfa, optrom_ver);
-	return snprintf(buf, PAGE_SIZE, "%s\n", optrom_ver);
+	return sysfs_emit(buf, "%s\n", optrom_ver);
 }
 
 static ssize_t
@@ -885,7 +885,7 @@ bfad_im_fw_version_show(struct device *dev, struct device_attribute *attr,
 	char fw_ver[BFA_VERSION_LEN];
 
 	bfa_get_adapter_fw_ver(&bfad->bfa, fw_ver);
-	return snprintf(buf, PAGE_SIZE, "%s\n", fw_ver);
+	return sysfs_emit(buf, "%s\n", fw_ver);
 }
 
 static ssize_t
@@ -897,7 +897,7 @@ bfad_im_num_of_ports_show(struct device *dev, struct device_attribute *attr,
 			(struct bfad_im_port_s *) shost->hostdata[0];
 	struct bfad_s *bfad = im_port->bfad;
 
-	return snprintf(buf, PAGE_SIZE, "%d\n",
+	return sysfs_emit(buf, "%d\n",
 			bfa_get_nports(&bfad->bfa));
 }
 
@@ -905,7 +905,7 @@ static ssize_t
 bfad_im_drv_name_show(struct device *dev, struct device_attribute *attr,
 				char *buf)
 {
-	return snprintf(buf, PAGE_SIZE, "%s\n", BFAD_DRIVER_NAME);
+	return sysfs_emit(buf, "%s\n", BFAD_DRIVER_NAME);
 }
 
 static ssize_t
@@ -924,14 +924,14 @@ bfad_im_num_of_discovered_ports_show(struct device *dev,
 	rports = kcalloc(nrports, sizeof(struct bfa_rport_qualifier_s),
 			 GFP_ATOMIC);
 	if (rports == NULL)
-		return snprintf(buf, PAGE_SIZE, "Failed\n");
+		return sysfs_emit(buf, "Failed\n");
 
 	spin_lock_irqsave(&bfad->bfad_lock, flags);
 	bfa_fcs_lport_get_rport_quals(port->fcs_port, rports, &nrports);
 	spin_unlock_irqrestore(&bfad->bfad_lock, flags);
 	kfree(rports);
 
-	return snprintf(buf, PAGE_SIZE, "%d\n", nrports);
+	return sysfs_emit(buf, "%d\n", nrports);
 }
 
 static          DEVICE_ATTR(serial_number, S_IRUGO,
-- 
Gitee


From f6cd84e06dbb2c9600dc4813556e3c7713fe786a Mon Sep 17 00:00:00 2001
From: Evgeny Boger <boger@wirenboard.com>
Date: Thu, 14 Jul 2022 21:57:17 +0800
Subject: [PATCH 207/674] power: supply: axp20x_battery: properly report
 current when discharging

stable inclusion
from stable-v5.10.111
commit b42b6d0ec358432ef13d1e3ca4d33cd837af6f8a
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=b42b6d0ec358432ef13d1e3ca4d33cd837af6f8a

--------------------------------

[ Upstream commit d4f408cdcd26921c1268cb8dcbe8ffb6faf837f3 ]

As stated in [1], negative current values are used for discharging
batteries.

AXP PMICs internally have two different ADC channels for shunt current
measurement: one used during charging and one during discharging.
The values reported by these ADCs are unsigned.
While the driver properly selects ADC channel to get the data from,
it doesn't apply negative sign when reporting discharging current.

[1] Documentation/ABI/testing/sysfs-class-power

Signed-off-by: Evgeny Boger <boger@wirenboard.com>
Acked-by: Chen-Yu Tsai <wens@csie.org>
Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/power/supply/axp20x_battery.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/drivers/power/supply/axp20x_battery.c b/drivers/power/supply/axp20x_battery.c
index e84b6e4da14a..9fda98b950ba 100644
--- a/drivers/power/supply/axp20x_battery.c
+++ b/drivers/power/supply/axp20x_battery.c
@@ -185,7 +185,6 @@ static int axp20x_battery_get_prop(struct power_supply *psy,
 				   union power_supply_propval *val)
 {
 	struct axp20x_batt_ps *axp20x_batt = power_supply_get_drvdata(psy);
-	struct iio_channel *chan;
 	int ret = 0, reg, val1;
 
 	switch (psp) {
@@ -265,12 +264,12 @@ static int axp20x_battery_get_prop(struct power_supply *psy,
 		if (ret)
 			return ret;
 
-		if (reg & AXP20X_PWR_STATUS_BAT_CHARGING)
-			chan = axp20x_batt->batt_chrg_i;
-		else
-			chan = axp20x_batt->batt_dischrg_i;
-
-		ret = iio_read_channel_processed(chan, &val->intval);
+		if (reg & AXP20X_PWR_STATUS_BAT_CHARGING) {
+			ret = iio_read_channel_processed(axp20x_batt->batt_chrg_i, &val->intval);
+		} else {
+			ret = iio_read_channel_processed(axp20x_batt->batt_dischrg_i, &val1);
+			val->intval = -val1;
+		}
 		if (ret)
 			return ret;
 
-- 
Gitee


From 585eb591b678419b3df7177cb4ea29c2152ed982 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Thu, 14 Jul 2022 21:57:18 +0800
Subject: [PATCH 208/674] mt76: dma: initialize skip_unmap in mt76_dma_rx_fill

stable inclusion
from stable-v5.10.111
commit 9a56e2b271bc65116d1e44454b244b18d289a26b
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=9a56e2b271bc65116d1e44454b244b18d289a26b

--------------------------------

[ Upstream commit 577298ec55dfc8b9aece54520f0258c3f93a6573 ]

Even if it is only a false-positive since skip_buf0/skip_buf1 are only
used in mt76_dma_tx_cleanup_idx routine, initialize skip_unmap in
mt76_dma_rx_fill in order to fix the following UBSAN report:

[   13.924906] UBSAN: invalid-load in linux-5.15.0/drivers/net/wireless/mediatek/mt76/dma.c:162:13
[   13.924909] load of value 225 is not a valid value for type '_Bool'
[   13.924912] CPU: 9 PID: 672 Comm: systemd-udevd Not tainted 5.15.0-18-generic #18-Ubuntu
[   13.924914] Hardware name: LENOVO 21A0000CMX/21A0000CMX, BIOS R1MET43W (1.13 ) 11/05/2021
[   13.924915] Call Trace:
[   13.924917]  <TASK>
[   13.924920]  show_stack+0x52/0x58
[   13.924925]  dump_stack_lvl+0x4a/0x5f
[   13.924931]  dump_stack+0x10/0x12
[   13.924932]  ubsan_epilogue+0x9/0x45
[   13.924934]  __ubsan_handle_load_invalid_value.cold+0x44/0x49
[   13.924935]  ? __iommu_dma_map+0x84/0xf0
[   13.924939]  mt76_dma_add_buf.constprop.0.cold+0x23/0x85 [mt76]
[   13.924949]  mt76_dma_rx_fill.isra.0+0x102/0x1f0 [mt76]
[   13.924954]  mt76_dma_init+0xc9/0x150 [mt76]
[   13.924959]  ? mt7921_dma_enable+0x110/0x110 [mt7921e]
[   13.924966]  mt7921_dma_init+0x1e3/0x260 [mt7921e]
[   13.924970]  mt7921_register_device+0x29d/0x510 [mt7921e]
[   13.924975]  mt7921_pci_probe.part.0+0x17f/0x1b0 [mt7921e]
[   13.924980]  mt7921_pci_probe+0x43/0x60 [mt7921e]
[   13.924984]  local_pci_probe+0x4b/0x90
[   13.924987]  pci_device_probe+0x115/0x1f0
[   13.924989]  really_probe+0x21e/0x420
[   13.924992]  __driver_probe_device+0x115/0x190
[   13.924994]  driver_probe_device+0x23/0xc0
[   13.924996]  __driver_attach+0xbd/0x1d0
[   13.924998]  ? __device_attach_driver+0x110/0x110
[   13.924999]  bus_for_each_dev+0x7e/0xc0
[   13.925001]  driver_attach+0x1e/0x20
[   13.925003]  bus_add_driver+0x135/0x200
[   13.925005]  driver_register+0x95/0xf0
[   13.925008]  ? 0xffffffffc0766000
[   13.925010]  __pci_register_driver+0x68/0x70
[   13.925011]  mt7921_pci_driver_init+0x23/0x1000 [mt7921e]
[   13.925015]  do_one_initcall+0x48/0x1d0
[   13.925019]  ? kmem_cache_alloc_trace+0x19e/0x2e0
[   13.925022]  do_init_module+0x62/0x280
[   13.925025]  load_module+0xac9/0xbb0
[   13.925027]  __do_sys_finit_module+0xbf/0x120
[   13.925029]  __x64_sys_finit_module+0x18/0x20
[   13.925030]  do_syscall_64+0x5c/0xc0
[   13.925033]  ? do_syscall_64+0x69/0xc0
[   13.925034]  ? sysvec_reschedule_ipi+0x78/0xe0
[   13.925036]  ? asm_sysvec_reschedule_ipi+0xa/0x20
[   13.925039]  entry_SYSCALL_64_after_hwframe+0x44/0xae
[   13.925040] RIP: 0033:0x7fbf2b90f94d
[   13.925045] RSP: 002b:00007ffe2ec7e5d8 EFLAGS: 00000246 ORIG_RAX: 0000000000000139
[   13.925047] RAX: ffffffffffffffda RBX: 000056106b0634e0 RCX: 00007fbf2b90f94d
[   13.925048] RDX: 0000000000000000 RSI: 00007fbf2baa3441 RDI: 0000000000000013
[   13.925049] RBP: 0000000000020000 R08: 0000000000000000 R09: 0000000000000002
[   13.925050] R10: 0000000000000013 R11: 0000000000000246 R12: 00007fbf2baa3441
[   13.925051] R13: 000056106b062620 R14: 000056106b0610c0 R15: 000056106b0640d0
[   13.925053]  </TASK>

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/net/wireless/mediatek/mt76/dma.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/wireless/mediatek/mt76/dma.c b/drivers/net/wireless/mediatek/mt76/dma.c
index 0fdfead45c77..f01b455783b2 100644
--- a/drivers/net/wireless/mediatek/mt76/dma.c
+++ b/drivers/net/wireless/mediatek/mt76/dma.c
@@ -455,6 +455,7 @@ mt76_dma_rx_fill(struct mt76_dev *dev, struct mt76_queue *q)
 
 		qbuf.addr = addr + offset;
 		qbuf.len = len - offset;
+		qbuf.skip_unmap = false;
 		mt76_dma_add_buf(dev, q, &qbuf, 1, 0, buf, NULL);
 		frames++;
 	}
-- 
Gitee


From 80972fc714726cd3abe83b6d88099ef435f5332a Mon Sep 17 00:00:00 2001
From: Avraham Stern <avraham.stern@intel.com>
Date: Thu, 14 Jul 2022 21:57:19 +0800
Subject: [PATCH 209/674] cfg80211: don't add non transmitted BSS to 6GHz
 scanned channels

stable inclusion
from stable-v5.10.111
commit 26a1e4739e442da11607f0a757cb22a653e595ec
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=26a1e4739e442da11607f0a757cb22a653e595ec

--------------------------------

[ Upstream commit 5666ee154f4696c011dfa8544aaf5591b6b87515 ]

When adding 6GHz channels to scan request based on reported
co-located APs, don't add channels that have only APs with
"non-transmitted" BSSes if they only match the wildcard SSID since
they will be found by probing the "transmitted" BSS.

Signed-off-by: Avraham Stern <avraham.stern@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Link: https://lore.kernel.org/r/iwlwifi.20220202104617.f6ddf099f934.I231e55885d3644f292d00dfe0f42653269f2559e@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 net/wireless/scan.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index fd614a5a00b4..c1b2655682a8 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -702,8 +702,12 @@ static bool cfg80211_find_ssid_match(struct cfg80211_colocated_ap *ap,
 
 	for (i = 0; i < request->n_ssids; i++) {
 		/* wildcard ssid in the scan request */
-		if (!request->ssids[i].ssid_len)
+		if (!request->ssids[i].ssid_len) {
+			if (ap->multi_bss && !ap->transmitted_bssid)
+				continue;
+
 			return true;
+		}
 
 		if (ap->ssid_len &&
 		    ap->ssid_len == request->ssids[i].ssid_len) {
@@ -830,6 +834,9 @@ static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev)
 		    !cfg80211_find_ssid_match(ap, request))
 			continue;
 
+		if (!request->n_ssids && ap->multi_bss && !ap->transmitted_bssid)
+			continue;
+
 		cfg80211_scan_req_add_chan(request, chan, true);
 		memcpy(scan_6ghz_params->bssid, ap->bssid, ETH_ALEN);
 		scan_6ghz_params->short_ssid = ap->short_ssid;
-- 
Gitee


From e2e71220c8cafedebdbd20c69f91c7f71e56332c Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 14 Jul 2022 21:57:20 +0800
Subject: [PATCH 210/674] libbpf: Fix build issue with llvm-readelf

stable inclusion
from stable-v5.10.111
commit 5baf92a2c46c543ddcc2e3b1a96cd67a10f7e7fd
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=5baf92a2c46c543ddcc2e3b1a96cd67a10f7e7fd

--------------------------------

[ Upstream commit 0908a66ad1124c1634c33847ac662106f7f2c198 ]

There are cases where clang compiler is packaged in a way
readelf is a symbolic link to llvm-readelf. In such cases,
llvm-readelf will be used instead of default binutils readelf,
and the following error will appear during libbpf build:

#  Warning: Num of global symbols in
#   /home/yhs/work/bpf-next/tools/testing/selftests/bpf/tools/build/libbpf/sharedobjs/libbpf-in.o (367)
#   does NOT match with num of versioned symbols in
#   /home/yhs/work/bpf-next/tools/testing/selftests/bpf/tools/build/libbpf/libbpf.so libbpf.map (383).
#   Please make sure all LIBBPF_API symbols are versioned in libbpf.map.
#  --- /home/yhs/work/bpf-next/tools/testing/selftests/bpf/tools/build/libbpf/libbpf_global_syms.tmp ...
#  +++ /home/yhs/work/bpf-next/tools/testing/selftests/bpf/tools/build/libbpf/libbpf_versioned_syms.tmp ...
#  @@ -324,6 +324,22 @@
#   btf__str_by_offset
#   btf__type_by_id
#   btf__type_cnt
#  +LIBBPF_0.0.1
#  +LIBBPF_0.0.2
#  +LIBBPF_0.0.3
#  +LIBBPF_0.0.4
#  +LIBBPF_0.0.5
#  +LIBBPF_0.0.6
#  +LIBBPF_0.0.7
#  +LIBBPF_0.0.8
#  +LIBBPF_0.0.9
#  +LIBBPF_0.1.0
#  +LIBBPF_0.2.0
#  +LIBBPF_0.3.0
#  +LIBBPF_0.4.0
#  +LIBBPF_0.5.0
#  +LIBBPF_0.6.0
#  +LIBBPF_0.7.0
#   libbpf_attach_type_by_name
#   libbpf_find_kernel_btf
#   libbpf_find_vmlinux_btf_id
#  make[2]: *** [Makefile:184: check_abi] Error 1
#  make[1]: *** [Makefile:140: all] Error 2

The above failure is due to different printouts for some ABS
versioned symbols. For example, with the same libbpf.so,
  $ /bin/readelf --dyn-syms --wide tools/lib/bpf/libbpf.so | grep "LIBBPF" | grep ABS
     134: 0000000000000000     0 OBJECT  GLOBAL DEFAULT  ABS LIBBPF_0.5.0
     202: 0000000000000000     0 OBJECT  GLOBAL DEFAULT  ABS LIBBPF_0.6.0
     ...
  $ /opt/llvm/bin/readelf --dyn-syms --wide tools/lib/bpf/libbpf.so | grep "LIBBPF" | grep ABS
     134: 0000000000000000     0 OBJECT  GLOBAL DEFAULT   ABS LIBBPF_0.5.0@@LIBBPF_0.5.0
     202: 0000000000000000     0 OBJECT  GLOBAL DEFAULT   ABS LIBBPF_0.6.0@@LIBBPF_0.6.0
     ...
The binutils readelf doesn't print out the symbol LIBBPF_* version and llvm-readelf does.
Such a difference caused libbpf build failure with llvm-readelf.

The proposed fix filters out all ABS symbols as they are not part of the comparison.
This works for both binutils readelf and llvm-readelf.

Reported-by: Delyan Kratunov <delyank@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20220204214355.502108-1-yhs@fb.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 tools/lib/bpf/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile
index 154b75fc1373..f2a353bba25f 100644
--- a/tools/lib/bpf/Makefile
+++ b/tools/lib/bpf/Makefile
@@ -147,7 +147,7 @@ GLOBAL_SYM_COUNT = $(shell readelf -s --wide $(BPF_IN_SHARED) | \
 			   sort -u | wc -l)
 VERSIONED_SYM_COUNT = $(shell readelf --dyn-syms --wide $(OUTPUT)libbpf.so | \
 			      sed 's/\[.*\]//' | \
-			      awk '/GLOBAL/ && /DEFAULT/ && !/UND/ {print $$NF}' | \
+			      awk '/GLOBAL/ && /DEFAULT/ && !/UND|ABS/ {print $$NF}' | \
 			      grep -Eo '[^ ]+@LIBBPF_' | cut -d@ -f1 | sort -u | wc -l)
 
 CMD_TARGETS = $(LIB_TARGET) $(PC_FILE)
@@ -216,7 +216,7 @@ check_abi: $(OUTPUT)libbpf.so $(VERSION_SCRIPT)
 		    sort -u > $(OUTPUT)libbpf_global_syms.tmp;		 \
 		readelf --dyn-syms --wide $(OUTPUT)libbpf.so |		 \
 		    sed 's/\[.*\]//' |					 \
-		    awk '/GLOBAL/ && /DEFAULT/ && !/UND/ {print $$NF}'|  \
+		    awk '/GLOBAL/ && /DEFAULT/ && !/UND|ABS/ {print $$NF}'|  \
 		    grep -Eo '[^ ]+@LIBBPF_' | cut -d@ -f1 |		 \
 		    sort -u > $(OUTPUT)libbpf_versioned_syms.tmp; 	 \
 		diff -u $(OUTPUT)libbpf_global_syms.tmp			 \
-- 
Gitee


From c04876da0c5e08d43a22fd7a60669284ac253472 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 14 Jul 2022 21:57:21 +0800
Subject: [PATCH 211/674] ipv6: make mc_forwarding atomic

stable inclusion
from stable-v5.10.111
commit fb5ac62fbe16dcc273e90bec99ce637ff2bd60c3
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=fb5ac62fbe16dcc273e90bec99ce637ff2bd60c3

--------------------------------

[ Upstream commit 145c7a793838add5e004e7d49a67654dc7eba147 ]

This fixes minor data-races in ip6_mc_input() and
batadv_mcast_mla_rtr_flags_softif_get_ipv6()

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 include/linux/ipv6.h       | 2 +-
 net/batman-adv/multicast.c | 2 +-
 net/ipv6/addrconf.c        | 4 ++--
 net/ipv6/ip6_input.c       | 2 +-
 net/ipv6/ip6mr.c           | 8 ++++----
 5 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 0e6f5d2785e9..f4e1790a5eed 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -51,7 +51,7 @@ struct ipv6_devconf {
 	__s32		use_optimistic;
 #endif
 #ifdef CONFIG_IPV6_MROUTE
-	__s32		mc_forwarding;
+	atomic_t	mc_forwarding;
 #endif
 	__s32		disable_ipv6;
 	__s32		drop_unicast_in_l2_multicast;
diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c
index 139894ca788b..c8a341cd652c 100644
--- a/net/batman-adv/multicast.c
+++ b/net/batman-adv/multicast.c
@@ -136,7 +136,7 @@ static u8 batadv_mcast_mla_rtr_flags_softif_get_ipv6(struct net_device *dev)
 {
 	struct inet6_dev *in6_dev = __in6_dev_get(dev);
 
-	if (in6_dev && in6_dev->cnf.mc_forwarding)
+	if (in6_dev && atomic_read(&in6_dev->cnf.mc_forwarding))
 		return BATADV_NO_FLAGS;
 	else
 		return BATADV_MCAST_WANT_NO_RTR6;
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 1064edea8841..4e24f3f7595c 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -546,7 +546,7 @@ static int inet6_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
 #ifdef CONFIG_IPV6_MROUTE
 	if ((all || type == NETCONFA_MC_FORWARDING) &&
 	    nla_put_s32(skb, NETCONFA_MC_FORWARDING,
-			devconf->mc_forwarding) < 0)
+			atomic_read(&devconf->mc_forwarding)) < 0)
 		goto nla_put_failure;
 #endif
 	if ((all || type == NETCONFA_PROXY_NEIGH) &&
@@ -5519,7 +5519,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
 	array[DEVCONF_USE_OPTIMISTIC] = cnf->use_optimistic;
 #endif
 #ifdef CONFIG_IPV6_MROUTE
-	array[DEVCONF_MC_FORWARDING] = cnf->mc_forwarding;
+	array[DEVCONF_MC_FORWARDING] = atomic_read(&cnf->mc_forwarding);
 #endif
 	array[DEVCONF_DISABLE_IPV6] = cnf->disable_ipv6;
 	array[DEVCONF_ACCEPT_DAD] = cnf->accept_dad;
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 06d60662717d..15ea3d082534 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -509,7 +509,7 @@ int ip6_mc_input(struct sk_buff *skb)
 	/*
 	 *      IPv6 multicast router mode is now supported ;)
 	 */
-	if (dev_net(skb->dev)->ipv6.devconf_all->mc_forwarding &&
+	if (atomic_read(&dev_net(skb->dev)->ipv6.devconf_all->mc_forwarding) &&
 	    !(ipv6_addr_type(&hdr->daddr) &
 	      (IPV6_ADDR_LOOPBACK|IPV6_ADDR_LINKLOCAL)) &&
 	    likely(!(IP6CB(skb)->flags & IP6SKB_FORWARDED))) {
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 41cb348a7c3c..5f0ac47acc74 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -740,7 +740,7 @@ static int mif6_delete(struct mr_table *mrt, int vifi, int notify,
 
 	in6_dev = __in6_dev_get(dev);
 	if (in6_dev) {
-		in6_dev->cnf.mc_forwarding--;
+		atomic_dec(&in6_dev->cnf.mc_forwarding);
 		inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF,
 					     NETCONFA_MC_FORWARDING,
 					     dev->ifindex, &in6_dev->cnf);
@@ -908,7 +908,7 @@ static int mif6_add(struct net *net, struct mr_table *mrt,
 
 	in6_dev = __in6_dev_get(dev);
 	if (in6_dev) {
-		in6_dev->cnf.mc_forwarding++;
+		atomic_inc(&in6_dev->cnf.mc_forwarding);
 		inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF,
 					     NETCONFA_MC_FORWARDING,
 					     dev->ifindex, &in6_dev->cnf);
@@ -1558,7 +1558,7 @@ static int ip6mr_sk_init(struct mr_table *mrt, struct sock *sk)
 	} else {
 		rcu_assign_pointer(mrt->mroute_sk, sk);
 		sock_set_flag(sk, SOCK_RCU_FREE);
-		net->ipv6.devconf_all->mc_forwarding++;
+		atomic_inc(&net->ipv6.devconf_all->mc_forwarding);
 	}
 	write_unlock_bh(&mrt_lock);
 
@@ -1591,7 +1591,7 @@ int ip6mr_sk_done(struct sock *sk)
 			 * so the RCU grace period before sk freeing
 			 * is guaranteed by sk_destruct()
 			 */
-			net->ipv6.devconf_all->mc_forwarding--;
+			atomic_dec(&net->ipv6.devconf_all->mc_forwarding);
 			write_unlock_bh(&mrt_lock);
 			inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
 						     NETCONFA_MC_FORWARDING,
-- 
Gitee


From 1191f146f6e5da5a9762a94facb310329c31205f Mon Sep 17 00:00:00 2001
From: Zheng Zengkai <zhengzengkai@huawei.com>
Date: Thu, 14 Jul 2022 21:57:22 +0800
Subject: [PATCH 212/674] ipv6: fix kabi for mc_forwarding in struct
 ipv6_devconf

hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z
CVE: NA

---------------------------------------

Making mc_forwarding atomic breaks the KABI of struct ipv6_devconf.
This patch uses KABI_REPLACE to fix it.

Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 include/linux/ipv6.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index f4e1790a5eed..210402b26a20 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -51,7 +51,7 @@ struct ipv6_devconf {
 	__s32		use_optimistic;
 #endif
 #ifdef CONFIG_IPV6_MROUTE
-	atomic_t	mc_forwarding;
+	KABI_REPLACE(__s32 mc_forwarding, atomic_t mc_forwarding)
 #endif
 	__s32		disable_ipv6;
 	__s32		drop_unicast_in_l2_multicast;
-- 
Gitee


From 4c98b9576c915d4d0f420f2db10f34c13a7c3d8b Mon Sep 17 00:00:00 2001
From: Sourabh Jain <sourabhjain@linux.ibm.com>
Date: Thu, 14 Jul 2022 21:57:23 +0800
Subject: [PATCH 213/674] powerpc: Set crashkernel offset to mid of RMA region

stable inclusion
from stable-v5.10.111
commit ea21eaea7f5f368d710acb3e4dd51d14410d5b49
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=ea21eaea7f5f368d710acb3e4dd51d14410d5b49

--------------------------------

[ Upstream commit 7c5ed82b800d8615cdda00729e7b62e5899f0b13 ]

On large config LPARs (having 192 and more cores), Linux fails to boot
due to insufficient memory in the first memblock. It is due to the
memory reservation for the crash kernel which starts at 128MB offset of
the first memblock. This memory reservation for the crash kernel doesn't
leave enough space in the first memblock to accommodate other essential
system resources.

The crash kernel start address was set to 128MB offset by default to
ensure that the crash kernel get some memory below the RMA region which
is used to be of size 256MB. But given that the RMA region size can be
512MB or more, setting the crash kernel offset to mid of RMA size will
leave enough space for the kernel to allocate memory for other system
resources.

Since the above crash kernel offset change is only applicable to the LPAR
platform, the LPAR feature detection is pushed before the crash kernel
reservation. The rest of LPAR specific initialization will still
be done during pseries_probe_fw_features as usual.

This patch is dependent on changes to paca allocation for boot CPU. It
expect boot CPU to discover 1T segment support which is introduced by
the patch posted here:
https://lists.ozlabs.org/pipermail/linuxppc-dev/2022-January/239175.html

Reported-by: Abdul haleem <abdhalee@linux.vnet.ibm.com>
Signed-off-by: Sourabh Jain <sourabhjain@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20220204085601.107257-1-sourabhjain@linux.ibm.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 arch/powerpc/kernel/rtas.c |  6 ++++++
 arch/powerpc/kexec/core.c  | 15 +++++++++++----
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index cccb32cf0e08..cf421eb7f90d 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -1296,6 +1296,12 @@ int __init early_init_dt_scan_rtas(unsigned long node,
 	entryp = of_get_flat_dt_prop(node, "linux,rtas-entry", NULL);
 	sizep  = of_get_flat_dt_prop(node, "rtas-size", NULL);
 
+#ifdef CONFIG_PPC64
+	/* need this feature to decide the crashkernel offset */
+	if (of_get_flat_dt_prop(node, "ibm,hypertas-functions", NULL))
+		powerpc_firmware_features |= FW_FEATURE_LPAR;
+#endif
+
 	if (basep && entryp && sizep) {
 		rtas.base = *basep;
 		rtas.entry = *entryp;
diff --git a/arch/powerpc/kexec/core.c b/arch/powerpc/kexec/core.c
index 56da5eb2b923..80c79cb5010c 100644
--- a/arch/powerpc/kexec/core.c
+++ b/arch/powerpc/kexec/core.c
@@ -147,11 +147,18 @@ void __init reserve_crashkernel(void)
 	if (!crashk_res.start) {
 #ifdef CONFIG_PPC64
 		/*
-		 * On 64bit we split the RMO in half but cap it at half of
-		 * a small SLB (128MB) since the crash kernel needs to place
-		 * itself and some stacks to be in the first segment.
+		 * On the LPAR platform place the crash kernel to mid of
+		 * RMA size (512MB or more) to ensure the crash kernel
+		 * gets enough space to place itself and some stack to be
+		 * in the first segment. At the same time normal kernel
+		 * also get enough space to allocate memory for essential
+		 * system resource in the first segment. Keep the crash
+		 * kernel starts at 128MB offset on other platforms.
 		 */
-		crashk_res.start = min(0x8000000ULL, (ppc64_rma_size / 2));
+		if (firmware_has_feature(FW_FEATURE_LPAR))
+			crashk_res.start = ppc64_rma_size / 2;
+		else
+			crashk_res.start = min(0x8000000ULL, (ppc64_rma_size / 2));
 #else
 		crashk_res.start = KDUMP_KERNELBASE;
 #endif
-- 
Gitee


From 613db224006696c7a210a0f850f37d8b4ad88fad Mon Sep 17 00:00:00 2001
From: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
Date: Thu, 14 Jul 2022 21:57:24 +0800
Subject: [PATCH 214/674] drm/amdgpu: Fix recursive locking warning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

stable inclusion
from stable-v5.10.111
commit 6694b8643bde4b940f6b410507960793b922a77d
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=6694b8643bde4b940f6b410507960793b922a77d

--------------------------------

[ Upstream commit 447c7997b62a5115ba4da846dcdee4fc12298a6a ]

Noticed the below warning while running a pytorch workload on vega10
GPUs. Change to trylock to avoid conflicts with already held reservation
locks.

[  +0.000003] WARNING: possible recursive locking detected
[  +0.000003] 5.13.0-kfd-rajneesh #1030 Not tainted
[  +0.000004] --------------------------------------------
[  +0.000002] python/4822 is trying to acquire lock:
[  +0.000004] ffff932cd9a259f8 (reservation_ww_class_mutex){+.+.}-{3:3},
at: amdgpu_bo_release_notify+0xc4/0x160 [amdgpu]
[  +0.000203]
              but task is already holding lock:
[  +0.000003] ffff932cbb7181f8 (reservation_ww_class_mutex){+.+.}-{3:3},
at: ttm_eu_reserve_buffers+0x270/0x470 [ttm]
[  +0.000017]
              other info that might help us debug this:
[  +0.000002]  Possible unsafe locking scenario:

[  +0.000003]        CPU0
[  +0.000002]        ----
[  +0.000002]   lock(reservation_ww_class_mutex);
[  +0.000004]   lock(reservation_ww_class_mutex);
[  +0.000003]
               *** DEADLOCK ***

[  +0.000002]  May be due to missing lock nesting notation

[  +0.000003] 7 locks held by python/4822:
[  +0.000003]  #0: ffff932c4ac028d0 (&process->mutex){+.+.}-{3:3}, at:
kfd_ioctl_map_memory_to_gpu+0x10b/0x320 [amdgpu]
[  +0.000232]  #1: ffff932c55e830a8 (&info->lock#2){+.+.}-{3:3}, at:
amdgpu_amdkfd_gpuvm_map_memory_to_gpu+0x64/0xf60 [amdgpu]
[  +0.000241]  #2: ffff932cc45b5e68 (&(*mem)->lock){+.+.}-{3:3}, at:
amdgpu_amdkfd_gpuvm_map_memory_to_gpu+0xdf/0xf60 [amdgpu]
[  +0.000236]  #3: ffffb2b35606fd28
(reservation_ww_class_acquire){+.+.}-{0:0}, at:
amdgpu_amdkfd_gpuvm_map_memory_to_gpu+0x232/0xf60 [amdgpu]
[  +0.000235]  #4: ffff932cbb7181f8
(reservation_ww_class_mutex){+.+.}-{3:3}, at:
ttm_eu_reserve_buffers+0x270/0x470 [ttm]
[  +0.000015]  #5: ffffffffc045f700 (*(sspp++)){....}-{0:0}, at:
drm_dev_enter+0x5/0xa0 [drm]
[  +0.000038]  #6: ffff932c52da7078 (&vm->eviction_lock){+.+.}-{3:3},
at: amdgpu_vm_bo_update_mapping+0xd5/0x4f0 [amdgpu]
[  +0.000195]
              stack backtrace:
[  +0.000003] CPU: 11 PID: 4822 Comm: python Not tainted
5.13.0-kfd-rajneesh #1030
[  +0.000005] Hardware name: GIGABYTE MZ01-CE0-00/MZ01-CE0-00, BIOS F02
08/29/2018
[  +0.000003] Call Trace:
[  +0.000003]  dump_stack+0x6d/0x89
[  +0.000010]  __lock_acquire+0xb93/0x1a90
[  +0.000009]  lock_acquire+0x25d/0x2d0
[  +0.000005]  ? amdgpu_bo_release_notify+0xc4/0x160 [amdgpu]
[  +0.000184]  ? lock_is_held_type+0xa2/0x110
[  +0.000006]  ? amdgpu_bo_release_notify+0xc4/0x160 [amdgpu]
[  +0.000184]  __ww_mutex_lock.constprop.17+0xca/0x1060
[  +0.000007]  ? amdgpu_bo_release_notify+0xc4/0x160 [amdgpu]
[  +0.000183]  ? lock_release+0x13f/0x270
[  +0.000005]  ? lock_is_held_type+0xa2/0x110
[  +0.000006]  ? amdgpu_bo_release_notify+0xc4/0x160 [amdgpu]
[  +0.000183]  amdgpu_bo_release_notify+0xc4/0x160 [amdgpu]
[  +0.000185]  ttm_bo_release+0x4c6/0x580 [ttm]
[  +0.000010]  amdgpu_bo_unref+0x1a/0x30 [amdgpu]
[  +0.000183]  amdgpu_vm_free_table+0x76/0xa0 [amdgpu]
[  +0.000189]  amdgpu_vm_free_pts+0xb8/0xf0 [amdgpu]
[  +0.000189]  amdgpu_vm_update_ptes+0x411/0x770 [amdgpu]
[  +0.000191]  amdgpu_vm_bo_update_mapping+0x324/0x4f0 [amdgpu]
[  +0.000191]  amdgpu_vm_bo_update+0x251/0x610 [amdgpu]
[  +0.000191]  update_gpuvm_pte+0xcc/0x290 [amdgpu]
[  +0.000229]  ? amdgpu_vm_bo_map+0xd7/0x130 [amdgpu]
[  +0.000190]  amdgpu_amdkfd_gpuvm_map_memory_to_gpu+0x912/0xf60
[amdgpu]
[  +0.000234]  kfd_ioctl_map_memory_to_gpu+0x182/0x320 [amdgpu]
[  +0.000218]  kfd_ioctl+0x2b9/0x600 [amdgpu]
[  +0.000216]  ? kfd_ioctl_unmap_memory_from_gpu+0x270/0x270 [amdgpu]
[  +0.000216]  ? lock_release+0x13f/0x270
[  +0.000006]  ? __fget_files+0x107/0x1e0
[  +0.000007]  __x64_sys_ioctl+0x8b/0xd0
[  +0.000007]  do_syscall_64+0x36/0x70
[  +0.000004]  entry_SYSCALL_64_after_hwframe+0x44/0xae
[  +0.000007] RIP: 0033:0x7fbff90a7317
[  +0.000004] Code: b3 66 90 48 8b 05 71 4b 2d 00 64 c7 00 26 00 00 00
48 c7 c0 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 b8 10 00 00 00 0f
05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 41 4b 2d 00 f7 d8 64 89 01 48
[  +0.000005] RSP: 002b:00007fbe301fe648 EFLAGS: 00000246 ORIG_RAX:
0000000000000010
[  +0.000006] RAX: ffffffffffffffda RBX: 00007fbcc402d820 RCX:
00007fbff90a7317
[  +0.000003] RDX: 00007fbe301fe690 RSI: 00000000c0184b18 RDI:
0000000000000004
[  +0.000003] RBP: 00007fbe301fe690 R08: 0000000000000000 R09:
00007fbcc402d880
[  +0.000003] R10: 0000000002001000 R11: 0000000000000246 R12:
00000000c0184b18
[  +0.000003] R13: 0000000000000004 R14: 00007fbf689593a0 R15:
00007fbcc402d820

Cc: Christian König <christian.koenig@amd.com>
Cc: Felix Kuehling <Felix.Kuehling@amd.com>
Cc: Alex Deucher <Alexander.Deucher@amd.com>

Reviewed-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index ad9863b84f1f..f615ecc06a22 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -1338,7 +1338,8 @@ void amdgpu_bo_release_notify(struct ttm_buffer_object *bo)
 	    !(abo->flags & AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE))
 		return;
 
-	dma_resv_lock(bo->base.resv, NULL);
+	if (WARN_ON_ONCE(!dma_resv_trylock(bo->base.resv)))
+		return;
 
 	r = amdgpu_fill_buffer(abo, AMDGPU_POISON, bo->base.resv, &fence);
 	if (!WARN_ON(r)) {
-- 
Gitee


From b23bcbc093a2826e1b387a3d9f9971581225a4ea Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pali=20Roh=C3=A1r?= <pali@kernel.org>
Date: Thu, 14 Jul 2022 21:57:25 +0800
Subject: [PATCH 215/674] PCI: aardvark: Fix support for MSI interrupts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

stable inclusion
from stable-v5.10.111
commit e07e420a00564863b45f88e3bd26860e803d87fe
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=e07e420a00564863b45f88e3bd26860e803d87fe

--------------------------------

[ Upstream commit b0b0b8b897f8e12b2368e868bd7cdc5742d5c5a9 ]

Aardvark hardware supports Multi-MSI and MSI_FLAG_MULTI_PCI_MSI is already
set for the MSI chip. But when allocating MSI interrupt numbers for
Multi-MSI, the numbers need to be properly aligned, otherwise endpoint
devices send MSI interrupt with incorrect numbers.

Fix this issue by using function bitmap_find_free_region() instead of
bitmap_find_next_zero_area().

To ensure that aligned MSI interrupt numbers are used by endpoint devices,
we cannot use Linux virtual irq numbers (as they are random and not
properly aligned). Instead we need to use the aligned hwirq numbers.

This change fixes receiving MSI interrupts on Armada 3720 boards and
allows using NVMe disks which use Multi-MSI feature with 3 interrupts.

Without this NVMe disks freeze booting as linux nvme-core.c is waiting
60s for an interrupt.

Link: https://lore.kernel.org/r/20220110015018.26359-4-kabel@kernel.org
Signed-off-by: Pali Rohár <pali@kernel.org>
Signed-off-by: Marek Behún <kabel@kernel.org>
Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/pci/controller/pci-aardvark.c | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c
index 49ff8bf10c74..af051fb88699 100644
--- a/drivers/pci/controller/pci-aardvark.c
+++ b/drivers/pci/controller/pci-aardvark.c
@@ -1186,7 +1186,7 @@ static void advk_msi_irq_compose_msi_msg(struct irq_data *data,
 
 	msg->address_lo = lower_32_bits(msi_msg);
 	msg->address_hi = upper_32_bits(msi_msg);
-	msg->data = data->irq;
+	msg->data = data->hwirq;
 }
 
 static int advk_msi_set_affinity(struct irq_data *irq_data,
@@ -1203,15 +1203,11 @@ static int advk_msi_irq_domain_alloc(struct irq_domain *domain,
 	int hwirq, i;
 
 	mutex_lock(&pcie->msi_used_lock);
-	hwirq = bitmap_find_next_zero_area(pcie->msi_used, MSI_IRQ_NUM,
-					   0, nr_irqs, 0);
-	if (hwirq >= MSI_IRQ_NUM) {
-		mutex_unlock(&pcie->msi_used_lock);
-		return -ENOSPC;
-	}
-
-	bitmap_set(pcie->msi_used, hwirq, nr_irqs);
+	hwirq = bitmap_find_free_region(pcie->msi_used, MSI_IRQ_NUM,
+					order_base_2(nr_irqs));
 	mutex_unlock(&pcie->msi_used_lock);
+	if (hwirq < 0)
+		return -ENOSPC;
 
 	for (i = 0; i < nr_irqs; i++)
 		irq_domain_set_info(domain, virq + i, hwirq + i,
@@ -1229,7 +1225,7 @@ static void advk_msi_irq_domain_free(struct irq_domain *domain,
 	struct advk_pcie *pcie = domain->host_data;
 
 	mutex_lock(&pcie->msi_used_lock);
-	bitmap_clear(pcie->msi_used, d->hwirq, nr_irqs);
+	bitmap_release_region(pcie->msi_used, d->hwirq, order_base_2(nr_irqs));
 	mutex_unlock(&pcie->msi_used_lock);
 }
 
-- 
Gitee


From 3d4a08e07a387572ebfb4d0c4093b1825e2185d0 Mon Sep 17 00:00:00 2001
From: Zhou Guanghui <zhouguanghui1@huawei.com>
Date: Thu, 14 Jul 2022 21:57:26 +0800
Subject: [PATCH 216/674] iommu/arm-smmu-v3: fix event handling soft lockup

stable inclusion
from stable-v5.10.111
commit 0b9cf0b599258dfbc8f3b4f21a0ef1faec8c455e
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=0b9cf0b599258dfbc8f3b4f21a0ef1faec8c455e

--------------------------------

[ Upstream commit 30de2b541af98179780054836b48825fcfba4408 ]

During event processing, events are read from the event queue one
by one until the queue is empty.If the master device continuously
requests address access at the same time and the SMMU generates
events, the cyclic processing of the event takes a long time and
softlockup warnings may be reported.

arm-smmu-v3 arm-smmu-v3.34.auto: event 0x0a received:
arm-smmu-v3 arm-smmu-v3.34.auto: 	0x00007f220000280a
arm-smmu-v3 arm-smmu-v3.34.auto: 	0x000010000000007e
arm-smmu-v3 arm-smmu-v3.34.auto: 	0x00000000034e8670
watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [irq/268-arm-smm:247]
Call trace:
 _dev_info+0x7c/0xa0
 arm_smmu_evtq_thread+0x1c0/0x230
 irq_thread_fn+0x30/0x80
 irq_thread+0x128/0x210
 kthread+0x134/0x138
 ret_from_fork+0x10/0x1c
Kernel panic - not syncing: softlockup: hung tasks

Fix this by calling cond_resched() after the event information is
printed.

Signed-off-by: Zhou Guanghui <zhouguanghui1@huawei.com>
Link: https://lore.kernel.org/r/20220119070754.26528-1-zhouguanghui1@huawei.com
Signed-off-by: Will Deacon <will@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index e463fd31d268..b22d0187ea8a 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -1852,6 +1852,7 @@ static irqreturn_t arm_smmu_evtq_thread(int irq, void *dev)
 				dev_info(smmu->dev, "\t0x%016llx\n",
 					 (unsigned long long)evt[i]);
 
+			cond_resched();
 		}
 
 		/*
-- 
Gitee


From 3968a04694e7cb84c5c8183ef9ee6a11c1fff9b4 Mon Sep 17 00:00:00 2001
From: Neal Liu <neal_liu@aspeedtech.com>
Date: Thu, 14 Jul 2022 21:57:27 +0800
Subject: [PATCH 217/674] usb: ehci: add pci device support for Aspeed
 platforms

stable inclusion
from stable-v5.10.111
commit 4820847e8bc2a7c964854a72f5d5d48c800d39c9
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=4820847e8bc2a7c964854a72f5d5d48c800d39c9

--------------------------------

[ Upstream commit c3c9cee592828528fd228b01d312c7526c584a42 ]

Enable Aspeed quirks in commit 7f2d73788d90 ("usb: ehci:
handshake CMD_RUN instead of STS_HALT") to support Aspeed
ehci-pci device.

Acked-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Neal Liu <neal_liu@aspeedtech.com>
Link: https://lore.kernel.org/r/20220208101657.76459-1-neal_liu@aspeedtech.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>

 Conflicts:
	drivers/usb/host/ehci-pci.c

Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/usb/host/ehci-pci.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/drivers/usb/host/ehci-pci.c b/drivers/usb/host/ehci-pci.c
index a5e27deda83a..d0d07d30e912 100644
--- a/drivers/usb/host/ehci-pci.c
+++ b/drivers/usb/host/ehci-pci.c
@@ -21,6 +21,9 @@ static const char hcd_name[] = "ehci-pci";
 /* defined here to avoid adding to pci_ids.h for single instance use */
 #define PCI_DEVICE_ID_INTEL_CE4100_USB	0x2e70
 
+#define PCI_VENDOR_ID_ASPEED		0x1a03
+#define PCI_DEVICE_ID_ASPEED_EHCI	0x2603
+
 /*-------------------------------------------------------------------------*/
 #define PCI_DEVICE_ID_INTEL_QUARK_X1000_SOC		0x0939
 static inline bool is_intel_quark_x1000(struct pci_dev *pdev)
@@ -226,6 +229,12 @@ static int ehci_pci_setup(struct usb_hcd *hcd)
 		if (pdev->device == 0x3104 && (pdev->revision & 0xf0) == 0x90)
 			ehci->zx_wakeup_clear = 1;
 		break;
+	case PCI_VENDOR_ID_ASPEED:
+		if (pdev->device == PCI_DEVICE_ID_ASPEED_EHCI) {
+			ehci_info(ehci, "applying Aspeed HC workaround\n");
+			ehci->is_aspeed = 1;
+		}
+		break;
 	}
 
 	/* optional debug port, normally in the first BAR */
-- 
Gitee


From 914f2581374890a670257478d533594d449d0b73 Mon Sep 17 00:00:00 2001
From: Hou Zhiqiang <Zhiqiang.Hou@nxp.com>
Date: Thu, 14 Jul 2022 21:57:28 +0800
Subject: [PATCH 218/674] PCI: endpoint: Fix alignment fault error in copy
 tests

stable inclusion
from stable-v5.10.111
commit b02a1a65023f46b970486be38f7d209761084a7c
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=b02a1a65023f46b970486be38f7d209761084a7c

--------------------------------

[ Upstream commit 829cc0e2ea2d61fb6c54bc3f8a17f86c56e11864 ]

The copy test uses the memcpy() to copy data between IO memory spaces.
This can trigger an alignment fault error (pasted the error logs below)
because memcpy() may use unaligned accesses on a mapped memory that is
just IO, which does not support unaligned memory accesses.

Fix it by using the correct memcpy API to copy from/to IO memory.

Alignment fault error logs:
   Unable to handle kernel paging request at virtual address ffff8000101cd3c1
   Mem abort info:
     ESR = 0x96000021
     EC = 0x25: DABT (current EL), IL = 32 bits
     SET = 0, FnV = 0
     EA = 0, S1PTW = 0
     FSC = 0x21: alignment fault
   Data abort info:
     ISV = 0, ISS = 0x00000021
     CM = 0, WnR = 0
   swapper pgtable: 4k pages, 48-bit VAs, pgdp=0000000081773000
   [ffff8000101cd3c1] pgd=1000000082410003, p4d=1000000082410003, pud=1000000082411003, pmd=1000000082412003, pte=0068004000001f13
   Internal error: Oops: 96000021 [#1] PREEMPT SMP
   Modules linked in:
   CPU: 0 PID: 6 Comm: kworker/0:0H Not tainted 5.15.0-rc1-next-20210914-dirty #2
   Hardware name: LS1012A RDB Board (DT)
   Workqueue: kpcitest pci_epf_test_cmd_handler
   pstate: 80000005 (Nzcv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--)
   pc : __memcpy+0x168/0x230
   lr : pci_epf_test_cmd_handler+0x6f0/0xa68
   sp : ffff80001003bce0
   x29: ffff80001003bce0 x28: ffff800010135000 x27: ffff8000101e5000
   x26: ffff8000101cd000 x25: ffff6cda941cf6c8 x24: 0000000000000000
   x23: ffff6cda863f2000 x22: ffff6cda9096c800 x21: ffff800010135000
   x20: ffff6cda941cf680 x19: ffffaf39fd999000 x18: 0000000000000000
   x17: 0000000000000000 x16: 0000000000000000 x15: ffffaf39fd2b6000
   x14: 0000000000000000 x13: 15f5c8fa2f984d57 x12: 604d132b60275454
   x11: 065cee5e5fb428b6 x10: aae662eb17d0cf3e x9 : 1d97c9a1b4ddef37
   x8 : 7541b65edebf928c x7 : e71937c4fc595de0 x6 : b8a0e09562430d1c
   x5 : ffff8000101e5401 x4 : ffff8000101cd401 x3 : ffff8000101e5380
   x2 : fffffffffffffff1 x1 : ffff8000101cd3c0 x0 : ffff8000101e5000
   Call trace:
    __memcpy+0x168/0x230
    process_one_work+0x1ec/0x370
    worker_thread+0x44/0x478
    kthread+0x154/0x160
    ret_from_fork+0x10/0x20
   Code: a984346c a9c4342c f1010042 54fffee8 (a97c3c8e)
   ---[ end trace 568c28c7b6336335 ]---

Link: https://lore.kernel.org/r/20211217094708.28678-1-Zhiqiang.Hou@nxp.com
Signed-off-by: Hou Zhiqiang <Zhiqiang.Hou@nxp.com>
Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Reviewed-by: Kishon Vijay Abraham I <kishon@ti.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/pci/endpoint/functions/pci-epf-test.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/endpoint/functions/pci-epf-test.c b/drivers/pci/endpoint/functions/pci-epf-test.c
index d41570715dc7..b861840e867c 100644
--- a/drivers/pci/endpoint/functions/pci-epf-test.c
+++ b/drivers/pci/endpoint/functions/pci-epf-test.c
@@ -285,7 +285,17 @@ static int pci_epf_test_copy(struct pci_epf_test *epf_test)
 		if (ret)
 			dev_err(dev, "Data transfer failed\n");
 	} else {
-		memcpy(dst_addr, src_addr, reg->size);
+		void *buf;
+
+		buf = kzalloc(reg->size, GFP_KERNEL);
+		if (!buf) {
+			ret = -ENOMEM;
+			goto err_map_addr;
+		}
+
+		memcpy_fromio(buf, src_addr, reg->size);
+		memcpy_toio(dst_addr, buf, reg->size);
+		kfree(buf);
 	}
 	ktime_get_ts64(&end);
 	pci_epf_test_print_rate("COPY", reg->size, &start, &end, use_dma);
-- 
Gitee


From b65364a8d1b914b201a6f278545459603a53086e Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Thu, 14 Jul 2022 21:57:29 +0800
Subject: [PATCH 219/674] tcp: Don't acquire inet_listen_hashbucket::lock with
 disabled BH.

stable inclusion
from stable-v5.10.111
commit b1b27b0e8d489b40607127e26af70e5571bf9201
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=b1b27b0e8d489b40607127e26af70e5571bf9201

--------------------------------

[ Upstream commit 4f9bf2a2f5aacf988e6d5e56b961ba45c5a25248 ]

Commit
   9652dc2eb9e40 ("tcp: relax listening_hash operations")

removed the need to disable bottom half while acquiring
listening_hash.lock. There are still two callers left which disable
bottom half before the lock is acquired.

On PREEMPT_RT the softirqs are preemptible and local_bh_disable() acts
as a lock to ensure that resources, that are protected by disabling
bottom halves, remain protected.
This leads to a circular locking dependency if the lock acquired with
disabled bottom halves is also acquired with enabled bottom halves
followed by disabling bottom halves. This is the reverse locking order.
It has been observed with inet_listen_hashbucket::lock:

local_bh_disable() + spin_lock(&ilb->lock):
  inet_listen()
    inet_csk_listen_start()
      sk->sk_prot->hash() := inet_hash()
	local_bh_disable()
	__inet_hash()
	  spin_lock(&ilb->lock);
	    acquire(&ilb->lock);

Reverse order: spin_lock(&ilb2->lock) + local_bh_disable():
  tcp_seq_next()
    listening_get_next()
      spin_lock(&ilb2->lock);
	acquire(&ilb2->lock);

  tcp4_seq_show()
    get_tcp4_sock()
      sock_i_ino()
	read_lock_bh(&sk->sk_callback_lock);
	  acquire(softirq_ctrl)	// <---- whoops
	  acquire(&sk->sk_callback_lock)

Drop local_bh_disable() around __inet_hash() which acquires
listening_hash->lock. Split inet_unhash() and acquire the
listen_hashbucket lock without disabling bottom halves; the inet_ehash
lock with disabled bottom halves.

Reported-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Link: https://lkml.kernel.org/r/12d6f9879a97cd56c09fb53dee343cbb14f7f1f7.camel@gmx.de
Link: https://lkml.kernel.org/r/X9CheYjuXWc75Spa@hirez.programming.kicks-ass.net
Link: https://lore.kernel.org/r/YgQOebeZ10eNx1W6@linutronix.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 net/ipv4/inet_hashtables.c  | 53 ++++++++++++++++++++++---------------
 net/ipv6/inet6_hashtables.c |  5 +---
 2 files changed, 33 insertions(+), 25 deletions(-)

diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index cf178d8eea81..2bb9ded807ee 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -637,7 +637,9 @@ int __inet_hash(struct sock *sk, struct sock *osk)
 	int err = 0;
 
 	if (sk->sk_state != TCP_LISTEN) {
+		local_bh_disable();
 		inet_ehash_nolisten(sk, osk, NULL);
+		local_bh_enable();
 		return 0;
 	}
 	WARN_ON(!sk_unhashed(sk));
@@ -669,45 +671,54 @@ int inet_hash(struct sock *sk)
 {
 	int err = 0;
 
-	if (sk->sk_state != TCP_CLOSE) {
-		local_bh_disable();
+	if (sk->sk_state != TCP_CLOSE)
 		err = __inet_hash(sk, NULL);
-		local_bh_enable();
-	}
 
 	return err;
 }
 EXPORT_SYMBOL_GPL(inet_hash);
 
-void inet_unhash(struct sock *sk)
+static void __inet_unhash(struct sock *sk, struct inet_listen_hashbucket *ilb)
 {
-	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
-	struct inet_listen_hashbucket *ilb = NULL;
-	spinlock_t *lock;
-
 	if (sk_unhashed(sk))
 		return;
 
-	if (sk->sk_state == TCP_LISTEN) {
-		ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
-		lock = &ilb->lock;
-	} else {
-		lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
-	}
-	spin_lock_bh(lock);
-	if (sk_unhashed(sk))
-		goto unlock;
-
 	if (rcu_access_pointer(sk->sk_reuseport_cb))
 		reuseport_detach_sock(sk);
 	if (ilb) {
+		struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+
 		inet_unhash2(hashinfo, sk);
 		ilb->count--;
 	}
 	__sk_nulls_del_node_init_rcu(sk);
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
-unlock:
-	spin_unlock_bh(lock);
+}
+
+void inet_unhash(struct sock *sk)
+{
+	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+
+	if (sk_unhashed(sk))
+		return;
+
+	if (sk->sk_state == TCP_LISTEN) {
+		struct inet_listen_hashbucket *ilb;
+
+		ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
+		/* Don't disable bottom halves while acquiring the lock to
+		 * avoid circular locking dependency on PREEMPT_RT.
+		 */
+		spin_lock(&ilb->lock);
+		__inet_unhash(sk, ilb);
+		spin_unlock(&ilb->lock);
+	} else {
+		spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
+
+		spin_lock_bh(lock);
+		__inet_unhash(sk, NULL);
+		spin_unlock_bh(lock);
+	}
 }
 EXPORT_SYMBOL_GPL(inet_unhash);
 
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index c9e7ecc7afd3..40203255ed88 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -333,11 +333,8 @@ int inet6_hash(struct sock *sk)
 {
 	int err = 0;
 
-	if (sk->sk_state != TCP_CLOSE) {
-		local_bh_disable();
+	if (sk->sk_state != TCP_CLOSE)
 		err = __inet_hash(sk, NULL);
-		local_bh_enable();
-	}
 
 	return err;
 }
-- 
Gitee


From 833bfdd23b887a59b51b080b94fcd82becf90520 Mon Sep 17 00:00:00 2001
From: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Date: Thu, 14 Jul 2022 21:57:30 +0800
Subject: [PATCH 220/674] PCI: pciehp: Add Qualcomm quirk for Command Completed
 erratum

stable inclusion
from stable-v5.10.111
commit c66cc0404367fe0bd3e1e458ff9a238078fd3a39
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=c66cc0404367fe0bd3e1e458ff9a238078fd3a39

--------------------------------

[ Upstream commit 9f72d4757cbe4d1ed669192f6d23817c9e437c4b ]

The Qualcomm PCI bridge device (Device ID 0x0110) found in chipsets such as
SM8450 does not set the Command Completed bit unless writes to the Slot
Command register change "Control" bits.

This results in timeouts like below:

  pcieport 0001:00:00.0: pciehp: Timeout on hotplug command 0x03c0 (issued 2020 msec ago)

Add the device to the Command Completed quirk to mark commands "completed"
immediately unless they change the "Control" bits.

Link: https://lore.kernel.org/r/20220210145003.135907-1-manivannan.sadhasivam@linaro.org
Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/pci/hotplug/pciehp_hpc.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c
index c4ce4c4e9f7a..a2639135f5d3 100644
--- a/drivers/pci/hotplug/pciehp_hpc.c
+++ b/drivers/pci/hotplug/pciehp_hpc.c
@@ -1116,6 +1116,8 @@ static void quirk_cmd_compl(struct pci_dev *pdev)
 }
 DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_INTEL, PCI_ANY_ID,
 			      PCI_CLASS_BRIDGE_PCI, 8, quirk_cmd_compl);
+DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_QCOM, 0x0110,
+			      PCI_CLASS_BRIDGE_PCI, 8, quirk_cmd_compl);
 DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_QCOM, 0x0400,
 			      PCI_CLASS_BRIDGE_PCI, 8, quirk_cmd_compl);
 DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_QCOM, 0x0401,
-- 
Gitee


From ec47a8c660e3a3a5c2ad81da50929accae89824b Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Thu, 14 Jul 2022 21:57:31 +0800
Subject: [PATCH 221/674] power: supply: axp288-charger: Set Vhold to 4.4V

stable inclusion
from stable-v5.10.111
commit 9538563d31a2ea2f5621ce46982beb1ac109ef0c
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=9538563d31a2ea2f5621ce46982beb1ac109ef0c

--------------------------------

[ Upstream commit 5ac121b81b4051e7fc83d5b3456a5e499d5bd147 ]

The AXP288's recommended and factory default Vhold value (minimum
input voltage below which the input current draw will be reduced)
is 4.4V. This lines up with other charger IC's such as the TI
bq2419x/bq2429x series which use 4.36V or 4.44V.

For some reason some BIOS-es initialize Vhold to 4.6V or even 4.7V
which combined with the typical voltage drop over typically low
wire gauge micro-USB cables leads to the input-current getting
capped below 1A (with a 2A capable dedicated charger) based on Vhold.

This leads to slow charging, or even to the device slowly discharging
if the device is in heavy use.

As the Linux AXP288 drivers use the builtin BC1.2 charger detection
and send the input-current-limit according to the detected charger
there really is no reason not to use the recommended 4.4V Vhold.

Set Vhold to 4.4V to fix the slow charging issue on various devices.

There is one exception, the special-case of the HP X2 2-in-1s which
combine this BC1.2 capable PMIC with a Type-C port and a 5V/3A factory
provided charger with a Type-C plug which does not do BC1.2. These
have their input-current-limit hardcoded to 3A (like under Windows)
and use a higher Vhold on purpose to limit the current when used
with other chargers. To avoid touching Vhold on these HP X2 laptops
the code setting Vhold is added to an else branch of the if checking
for these models.

Note this also fixes the sofar unused VBUS_ISPOUT_VHOLD_SET_MASK
define, which was wrong.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/power/supply/axp288_charger.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/power/supply/axp288_charger.c b/drivers/power/supply/axp288_charger.c
index a4df1ea92386..f65bf7b295c5 100644
--- a/drivers/power/supply/axp288_charger.c
+++ b/drivers/power/supply/axp288_charger.c
@@ -41,11 +41,11 @@
 #define VBUS_ISPOUT_CUR_LIM_1500MA	0x1	/* 1500mA */
 #define VBUS_ISPOUT_CUR_LIM_2000MA	0x2	/* 2000mA */
 #define VBUS_ISPOUT_CUR_NO_LIM		0x3	/* 2500mA */
-#define VBUS_ISPOUT_VHOLD_SET_MASK	0x31
+#define VBUS_ISPOUT_VHOLD_SET_MASK	0x38
 #define VBUS_ISPOUT_VHOLD_SET_BIT_POS	0x3
 #define VBUS_ISPOUT_VHOLD_SET_OFFSET	4000	/* 4000mV */
 #define VBUS_ISPOUT_VHOLD_SET_LSB_RES	100	/* 100mV */
-#define VBUS_ISPOUT_VHOLD_SET_4300MV	0x3	/* 4300mV */
+#define VBUS_ISPOUT_VHOLD_SET_4400MV	0x4	/* 4400mV */
 #define VBUS_ISPOUT_VBUS_PATH_DIS	BIT(7)
 
 #define CHRG_CCCV_CC_MASK		0xf		/* 4 bits */
@@ -744,6 +744,16 @@ static int charger_init_hw_regs(struct axp288_chrg_info *info)
 		ret = axp288_charger_vbus_path_select(info, true);
 		if (ret < 0)
 			return ret;
+	} else {
+		/* Set Vhold to the factory default / recommended 4.4V */
+		val = VBUS_ISPOUT_VHOLD_SET_4400MV << VBUS_ISPOUT_VHOLD_SET_BIT_POS;
+		ret = regmap_update_bits(info->regmap, AXP20X_VBUS_IPSOUT_MGMT,
+					 VBUS_ISPOUT_VHOLD_SET_MASK, val);
+		if (ret < 0) {
+			dev_err(&info->pdev->dev, "register(%x) write error(%d)\n",
+				AXP20X_VBUS_IPSOUT_MGMT, ret);
+			return ret;
+		}
 	}
 
 	/* Read current charge voltage and current limit */
-- 
Gitee


From 235f9ee8bf43d7bc04431d50052be91379d25769 Mon Sep 17 00:00:00 2001
From: Ilan Peer <ilan.peer@intel.com>
Date: Thu, 14 Jul 2022 21:57:32 +0800
Subject: [PATCH 222/674] iwlwifi: mvm: Correctly set fragmented EBS

stable inclusion
from stable-v5.10.111
commit bae03957e8ca7bde1b0a7743cde6826317803ce8
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=bae03957e8ca7bde1b0a7743cde6826317803ce8

--------------------------------

[ Upstream commit d8d4dd26b9e0469baf5017f0544d852fd4e3fb6d ]

Currently, fragmented EBS was set for a channel only if the 'hb_type'
was set to fragmented or balanced scan. However, 'hb_type' is set only
in case of CDB, and thus fragmented EBS is never set for a channel for
non-CDB devices. Fix it.

Signed-off-by: Ilan Peer <ilan.peer@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Link: https://lore.kernel.org/r/iwlwifi.20220204122220.a6165ac9b9d5.I654eafa62fd647030ae6d4f07f32c96c3171decb@changeid
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/net/wireless/intel/iwlwifi/mvm/scan.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c
index 46255d2c555b..17b992526694 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c
@@ -1706,7 +1706,10 @@ static u8 iwl_mvm_scan_umac_chan_flags_v2(struct iwl_mvm *mvm,
 			IWL_SCAN_CHANNEL_FLAG_CACHE_ADD;
 
 	/* set fragmented ebs for fragmented scan on HB channels */
-	if (iwl_mvm_is_scan_fragmented(params->hb_type))
+	if ((!iwl_mvm_is_cdb_supported(mvm) &&
+	     iwl_mvm_is_scan_fragmented(params->type)) ||
+	    (iwl_mvm_is_cdb_supported(mvm) &&
+	     iwl_mvm_is_scan_fragmented(params->hb_type)))
 		flags |= IWL_SCAN_CHANNEL_FLAG_EBS_FRAG;
 
 	return flags;
-- 
Gitee


From e415a37234e790e416dd930c8fba1268914dfdef Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Thu, 14 Jul 2022 21:57:33 +0800
Subject: [PATCH 223/674] ipv4: Invalidate neighbour for broadcast address upon
 address addition

stable inclusion
from stable-v5.10.111
commit f655b724b4407ec4284b52c6e957d3d344eb159a
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f655b724b4407ec4284b52c6e957d3d344eb159a

--------------------------------

[ Upstream commit 0c51e12e218f20b7d976158fdc18019627326f7a ]

In case user space sends a packet destined to a broadcast address when a
matching broadcast route is not configured, the kernel will create a
unicast neighbour entry that will never be resolved [1].

When the broadcast route is configured, the unicast neighbour entry will
not be invalidated and continue to linger, resulting in packets being
dropped.

Solve this by invalidating unresolved neighbour entries for broadcast
addresses after routes for these addresses are internally configured by
the kernel. This allows the kernel to create a broadcast neighbour entry
following the next route lookup.

Another possible solution that is more generic but also more complex is
to have the ARP code register a listener to the FIB notification chain
and invalidate matching neighbour entries upon the addition of broadcast
routes.

It is also possible to wave off the issue as a user space problem, but
it seems a bit excessive to expect user space to be that intimately
familiar with the inner workings of the FIB/neighbour kernel code.

[1] https://lore.kernel.org/netdev/55a04a8f-56f3-f73c-2aea-2195923f09d1@huawei.com/

Reported-by: Wang Hai <wanghai38@huawei.com>
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Tested-by: Wang Hai <wanghai38@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 include/net/arp.h       | 1 +
 net/ipv4/arp.c          | 9 +++++++--
 net/ipv4/fib_frontend.c | 5 ++++-
 3 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/include/net/arp.h b/include/net/arp.h
index 4950191f6b2b..4a23a97195f3 100644
--- a/include/net/arp.h
+++ b/include/net/arp.h
@@ -71,6 +71,7 @@ void arp_send(int type, int ptype, __be32 dest_ip,
 	      const unsigned char *src_hw, const unsigned char *th);
 int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir);
 void arp_ifdown(struct net_device *dev);
+int arp_invalidate(struct net_device *dev, __be32 ip, bool force);
 
 struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
 			   struct net_device *dev, __be32 src_ip,
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 922dd73e5740..83a47998c4b1 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -1116,13 +1116,18 @@ static int arp_req_get(struct arpreq *r, struct net_device *dev)
 	return err;
 }
 
-static int arp_invalidate(struct net_device *dev, __be32 ip)
+int arp_invalidate(struct net_device *dev, __be32 ip, bool force)
 {
 	struct neighbour *neigh = neigh_lookup(&arp_tbl, &ip, dev);
 	int err = -ENXIO;
 	struct neigh_table *tbl = &arp_tbl;
 
 	if (neigh) {
+		if ((neigh->nud_state & NUD_VALID) && !force) {
+			neigh_release(neigh);
+			return 0;
+		}
+
 		if (neigh->nud_state & ~NUD_NOARP)
 			err = neigh_update(neigh, NULL, NUD_FAILED,
 					   NEIGH_UPDATE_F_OVERRIDE|
@@ -1169,7 +1174,7 @@ static int arp_req_delete(struct net *net, struct arpreq *r,
 		if (!dev)
 			return -EINVAL;
 	}
-	return arp_invalidate(dev, ip);
+	return arp_invalidate(dev, ip, true);
 }
 
 /*
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 917ea953dfad..0df4594b49c7 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1112,9 +1112,11 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)
 		return;
 
 	/* Add broadcast address, if it is explicitly assigned. */
-	if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF))
+	if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF)) {
 		fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32,
 			  prim, 0);
+		arp_invalidate(dev, ifa->ifa_broadcast, false);
+	}
 
 	if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) &&
 	    (prefix != addr || ifa->ifa_prefixlen < 32)) {
@@ -1130,6 +1132,7 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)
 				  prim, 0);
 			fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask,
 				  32, prim, 0);
+			arp_invalidate(dev, prefix | ~mask, false);
 		}
 	}
 }
-- 
Gitee


From 8ec846488847926241fa34983380d41baece54b6 Mon Sep 17 00:00:00 2001
From: Jordy Zomer <jordy@jordyzomer.github.io>
Date: Thu, 14 Jul 2022 21:57:34 +0800
Subject: [PATCH 224/674] dm ioctl: prevent potential spectre v1 gadget

stable inclusion
from stable-v5.10.111
commit 71c8df33fd777c7628f6fbc09b14e84806c55914
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=71c8df33fd777c7628f6fbc09b14e84806c55914

--------------------------------

[ Upstream commit cd9c88da171a62c4b0f1c70e50c75845969fbc18 ]

It appears like cmd could be a Spectre v1 gadget as it's supplied by a
user and used as an array index. Prevent the contents of kernel memory
from being leaked to userspace via speculative execution by using
array_index_nospec.

Signed-off-by: Jordy Zomer <jordy@pwning.systems>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/md/dm-ioctl.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 1ca65b434f1f..b839705654d4 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -17,6 +17,7 @@
 #include <linux/dm-ioctl.h>
 #include <linux/hdreg.h>
 #include <linux/compat.h>
+#include <linux/nospec.h>
 
 #include <linux/uaccess.h>
 
@@ -1696,6 +1697,7 @@ static ioctl_fn lookup_ioctl(unsigned int cmd, int *ioctl_flags)
 	if (unlikely(cmd >= ARRAY_SIZE(_ioctls)))
 		return NULL;
 
+	cmd = array_index_nospec(cmd, ARRAY_SIZE(_ioctls));
 	*ioctl_flags = _ioctls[cmd].flags;
 	return _ioctls[cmd].fn;
 }
-- 
Gitee


From 9314c8a29370a888602502fb120cdd4f9433da58 Mon Sep 17 00:00:00 2001
From: Alex Deucher <alexander.deucher@amd.com>
Date: Thu, 14 Jul 2022 21:57:35 +0800
Subject: [PATCH 225/674] drm/amdkfd: make CRAT table missing message
 informational only

stable inclusion
from stable-v5.10.111
commit fe4b6d5a0dd7701454981ea8a7e2332a2970ef81
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=fe4b6d5a0dd7701454981ea8a7e2332a2970ef81

--------------------------------

[ Upstream commit 9dff13f9edf755a15f6507874185a3290c1ae8bb ]

The driver has a fallback so make the message informational
rather than a warning. The driver has a fallback if the
Component Resource Association Table (CRAT) is missing, so
make this informational now.

Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/1906
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
index 31d793ee0836..86b4dadf772e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
@@ -784,7 +784,7 @@ int kfd_create_crat_image_acpi(void **crat_image, size_t *size)
 	/* Fetch the CRAT table from ACPI */
 	status = acpi_get_table(CRAT_SIGNATURE, 0, &crat_table);
 	if (status == AE_NOT_FOUND) {
-		pr_warn("CRAT table not found\n");
+		pr_info("CRAT table not found\n");
 		return -ENODATA;
 	} else if (ACPI_FAILURE(status)) {
 		const char *err = acpi_format_exception(status);
-- 
Gitee


From 5b544929258d39d92fe674fc239ccdf50ad347a8 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Date: Thu, 14 Jul 2022 21:57:36 +0800
Subject: [PATCH 226/674] scsi: pm8001: Fix pm80xx_pci_mem_copy() interface

stable inclusion
from stable-v5.10.111
commit ef969095c44245b85a940e9a0ceaaf8d2f035d87
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=ef969095c44245b85a940e9a0ceaaf8d2f035d87

--------------------------------

[ Upstream commit 3762d8f6edcdb03994c919f9487fd6d336c06561 ]

The declaration of the local variable destination1 in pm80xx_pci_mem_copy()
as a pointer to a u32 results in the sparse warning:

warning: incorrect type in assignment (different base types)
    expected unsigned int [usertype]
    got restricted __le32 [usertype]

Furthermore, the destination" argument of pm80xx_pci_mem_copy() is wrongly
declared with the const attribute.

Fix both problems by changing the type of the "destination" argument to
"__le32 *" and use this argument directly inside the pm80xx_pci_mem_copy()
function, thus removing the need for the destination1 local variable.

Link: https://lore.kernel.org/r/20220220031810.738362-6-damien.lemoal@opensource.wdc.com
Reviewed-by: Jack Wang <jinpu.wang@ionos.com>
Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/scsi/pm8001/pm80xx_hwi.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/drivers/scsi/pm8001/pm80xx_hwi.c b/drivers/scsi/pm8001/pm80xx_hwi.c
index 2b3ee963e375..677ddc7bfb47 100644
--- a/drivers/scsi/pm8001/pm80xx_hwi.c
+++ b/drivers/scsi/pm8001/pm80xx_hwi.c
@@ -66,18 +66,16 @@ int pm80xx_bar4_shift(struct pm8001_hba_info *pm8001_ha, u32 shift_value)
 }
 
 static void pm80xx_pci_mem_copy(struct pm8001_hba_info  *pm8001_ha, u32 soffset,
-				const void *destination,
+				__le32 *destination,
 				u32 dw_count, u32 bus_base_number)
 {
 	u32 index, value, offset;
-	u32 *destination1;
-	destination1 = (u32 *)destination;
 
-	for (index = 0; index < dw_count; index += 4, destination1++) {
+	for (index = 0; index < dw_count; index += 4, destination++) {
 		offset = (soffset + index);
 		if (offset < (64 * 1024)) {
 			value = pm8001_cr32(pm8001_ha, bus_base_number, offset);
-			*destination1 =  cpu_to_le32(value);
+			*destination = cpu_to_le32(value);
 		}
 	}
 	return;
-- 
Gitee


From 9f47bf4e2f1810ebfabb8d291e439b41a23bad1d Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Date: Thu, 14 Jul 2022 21:57:37 +0800
Subject: [PATCH 227/674] scsi: pm8001: Fix pm8001_mpi_task_abort_resp()

stable inclusion
from stable-v5.10.111
commit 3bd9a28798ca8f19e78fca1890ad08404e53c8fe
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=3bd9a28798ca8f19e78fca1890ad08404e53c8fe

--------------------------------

[ Upstream commit 7e6b7e740addcea450041b5be8e42f0a4ceece0f ]

The call to pm8001_ccb_task_free() at the end of
pm8001_mpi_task_abort_resp() already frees the ccb tag. So when the device
NCQ_ABORT_ALL_FLAG is set, the tag should not be freed again.  Also change
the hardcoded 0xBFFFFFFF value to ~NCQ_ABORT_ALL_FLAG as it ought to be.

Link: https://lore.kernel.org/r/20220220031810.738362-19-damien.lemoal@opensource.wdc.com
Reviewed-by: Jack Wang <jinpu.wang@ionos.com>
Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/scsi/pm8001/pm8001_hwi.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/scsi/pm8001/pm8001_hwi.c b/drivers/scsi/pm8001/pm8001_hwi.c
index a8219a42c786..3a3bd0c13b5a 100644
--- a/drivers/scsi/pm8001/pm8001_hwi.c
+++ b/drivers/scsi/pm8001/pm8001_hwi.c
@@ -3669,12 +3669,11 @@ int pm8001_mpi_task_abort_resp(struct pm8001_hba_info *pm8001_ha, void *piomb)
 	mb();
 
 	if (pm8001_dev->id & NCQ_ABORT_ALL_FLAG) {
-		pm8001_tag_free(pm8001_ha, tag);
 		sas_free_task(t);
-		/* clear the flag */
-		pm8001_dev->id &= 0xBFFFFFFF;
-	} else
+		pm8001_dev->id &= ~NCQ_ABORT_ALL_FLAG;
+	} else {
 		t->task_done(t);
+	}
 
 	return 0;
 }
-- 
Gitee


From dd9e52a11360b63880cb9f3078c7eac85e42fe23 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Date: Thu, 14 Jul 2022 21:57:38 +0800
Subject: [PATCH 228/674] scsi: pm8001: Fix task leak in
 pm8001_send_abort_all()

stable inclusion
from stable-v5.10.111
commit 2051044d7901f1a9d7be95d0d32e53b88e9548f7
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=2051044d7901f1a9d7be95d0d32e53b88e9548f7

--------------------------------

[ Upstream commit f90a74892f3acf0cdec5844e90fc8686ca13e7d7 ]

In pm8001_send_abort_all(), make sure to free the allocated sas task
if pm8001_tag_alloc() or pm8001_mpi_build_cmd() fail.

Link: https://lore.kernel.org/r/20220220031810.738362-21-damien.lemoal@opensource.wdc.com
Reviewed-by: John Garry <john.garry@huawei.com>
Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/scsi/pm8001/pm8001_hwi.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/scsi/pm8001/pm8001_hwi.c b/drivers/scsi/pm8001/pm8001_hwi.c
index 3a3bd0c13b5a..c5edf6c425b5 100644
--- a/drivers/scsi/pm8001/pm8001_hwi.c
+++ b/drivers/scsi/pm8001/pm8001_hwi.c
@@ -1711,7 +1711,6 @@ static void pm8001_send_abort_all(struct pm8001_hba_info *pm8001_ha,
 	}
 
 	task = sas_alloc_slow_task(GFP_ATOMIC);
-
 	if (!task) {
 		pm8001_dbg(pm8001_ha, FAIL, "cannot allocate task\n");
 		return;
@@ -1720,8 +1719,10 @@ static void pm8001_send_abort_all(struct pm8001_hba_info *pm8001_ha,
 	task->task_done = pm8001_task_done;
 
 	res = pm8001_tag_alloc(pm8001_ha, &ccb_tag);
-	if (res)
+	if (res) {
+		sas_free_task(task);
 		return;
+	}
 
 	ccb = &pm8001_ha->ccb_info[ccb_tag];
 	ccb->device = pm8001_ha_dev;
@@ -1738,8 +1739,10 @@ static void pm8001_send_abort_all(struct pm8001_hba_info *pm8001_ha,
 
 	ret = pm8001_mpi_build_cmd(pm8001_ha, circularQ, opc, &task_abort,
 			sizeof(task_abort), 0);
-	if (ret)
+	if (ret) {
+		sas_free_task(task);
 		pm8001_tag_free(pm8001_ha, ccb_tag);
+	}
 
 }
 
-- 
Gitee


From 00bde281afd7caefecb49cd0da8c13f24532ad2e Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Date: Thu, 14 Jul 2022 21:57:39 +0800
Subject: [PATCH 229/674] scsi: pm8001: Fix tag leaks on error

stable inclusion
from stable-v5.10.111
commit a0bb65eadbf942024226241d9d99fed17168940b
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=a0bb65eadbf942024226241d9d99fed17168940b

--------------------------------

[ Upstream commit 4c8f04b1905cd4b776d0b720463c091545478ef7 ]

In pm8001_chip_set_dev_state_req(), pm8001_chip_fw_flash_update_req(),
pm80xx_chip_phy_ctl_req() and pm8001_chip_reg_dev_req() add missing calls
to pm8001_tag_free() to free the allocated tag when pm8001_mpi_build_cmd()
fails.

Similarly, in pm8001_exec_internal_task_abort(), if the chip ->task_abort
method fails, the tag allocated for the abort request task must be
freed. Add the missing call to pm8001_tag_free().

Link: https://lore.kernel.org/r/20220220031810.738362-22-damien.lemoal@opensource.wdc.com
Reviewed-by: John Garry <john.garry@huawei.com>
Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/scsi/pm8001/pm8001_hwi.c | 9 +++++++++
 drivers/scsi/pm8001/pm8001_sas.c | 2 +-
 drivers/scsi/pm8001/pm80xx_hwi.c | 9 +++++++--
 3 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/drivers/scsi/pm8001/pm8001_hwi.c b/drivers/scsi/pm8001/pm8001_hwi.c
index c5edf6c425b5..2182c977498d 100644
--- a/drivers/scsi/pm8001/pm8001_hwi.c
+++ b/drivers/scsi/pm8001/pm8001_hwi.c
@@ -4444,6 +4444,9 @@ static int pm8001_chip_reg_dev_req(struct pm8001_hba_info *pm8001_ha,
 		SAS_ADDR_SIZE);
 	rc = pm8001_mpi_build_cmd(pm8001_ha, circularQ, opc, &payload,
 			sizeof(payload), 0);
+	if (rc)
+		pm8001_tag_free(pm8001_ha, tag);
+
 	return rc;
 }
 
@@ -4856,6 +4859,9 @@ pm8001_chip_fw_flash_update_req(struct pm8001_hba_info *pm8001_ha,
 	ccb->ccb_tag = tag;
 	rc = pm8001_chip_fw_flash_update_build(pm8001_ha, &flash_update_info,
 		tag);
+	if (rc)
+		pm8001_tag_free(pm8001_ha, tag);
+
 	return rc;
 }
 
@@ -4960,6 +4966,9 @@ pm8001_chip_set_dev_state_req(struct pm8001_hba_info *pm8001_ha,
 	payload.nds = cpu_to_le32(state);
 	rc = pm8001_mpi_build_cmd(pm8001_ha, circularQ, opc, &payload,
 			sizeof(payload), 0);
+	if (rc)
+		pm8001_tag_free(pm8001_ha, tag);
+
 	return rc;
 
 }
diff --git a/drivers/scsi/pm8001/pm8001_sas.c b/drivers/scsi/pm8001/pm8001_sas.c
index ffafbc137a18..f77061396871 100644
--- a/drivers/scsi/pm8001/pm8001_sas.c
+++ b/drivers/scsi/pm8001/pm8001_sas.c
@@ -831,10 +831,10 @@ pm8001_exec_internal_task_abort(struct pm8001_hba_info *pm8001_ha,
 
 		res = PM8001_CHIP_DISP->task_abort(pm8001_ha,
 			pm8001_dev, flag, task_tag, ccb_tag);
-
 		if (res) {
 			del_timer(&task->slow_task->timer);
 			pm8001_dbg(pm8001_ha, FAIL, "Executing internal task failed\n");
+			pm8001_tag_free(pm8001_ha, ccb_tag);
 			goto ex_err;
 		}
 		wait_for_completion(&task->slow_task->completion);
diff --git a/drivers/scsi/pm8001/pm80xx_hwi.c b/drivers/scsi/pm8001/pm80xx_hwi.c
index 677ddc7bfb47..a66a06914913 100644
--- a/drivers/scsi/pm8001/pm80xx_hwi.c
+++ b/drivers/scsi/pm8001/pm80xx_hwi.c
@@ -4856,8 +4856,13 @@ static int pm80xx_chip_phy_ctl_req(struct pm8001_hba_info *pm8001_ha,
 	payload.tag = cpu_to_le32(tag);
 	payload.phyop_phyid =
 		cpu_to_le32(((phy_op & 0xFF) << 8) | (phyId & 0xFF));
-	return pm8001_mpi_build_cmd(pm8001_ha, circularQ, opc, &payload,
-			sizeof(payload), 0);
+
+	rc = pm8001_mpi_build_cmd(pm8001_ha, circularQ, opc, &payload,
+				  sizeof(payload), 0);
+	if (rc)
+		pm8001_tag_free(pm8001_ha, tag);
+
+	return rc;
 }
 
 static u32 pm80xx_chip_is_our_interrupt(struct pm8001_hba_info *pm8001_ha)
-- 
Gitee


From 61f8f11694b71e2100fad9a9fb2d9b9e57828788 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Date: Thu, 14 Jul 2022 21:57:40 +0800
Subject: [PATCH 230/674] scsi: pm8001: Fix memory leak in
 pm8001_chip_fw_flash_update_req()

stable inclusion
from stable-v5.10.111
commit d83574666bac4b1462e90df393fbed6c5f57d1a3
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=d83574666bac4b1462e90df393fbed6c5f57d1a3

--------------------------------

[ Upstream commit f792a3629f4c4aa4c3703d66b43ce1edcc3ec09a ]

In pm8001_chip_fw_flash_update_build(), if
pm8001_chip_fw_flash_update_build() fails, the struct fw_control_ex
allocated must be freed.

Link: https://lore.kernel.org/r/20220220031810.738362-23-damien.lemoal@opensource.wdc.com
Reviewed-by: Jack Wang <jinpu.wang@ionos.com>
Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/scsi/pm8001/pm8001_hwi.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/scsi/pm8001/pm8001_hwi.c b/drivers/scsi/pm8001/pm8001_hwi.c
index 2182c977498d..73e6f5d17ca7 100644
--- a/drivers/scsi/pm8001/pm8001_hwi.c
+++ b/drivers/scsi/pm8001/pm8001_hwi.c
@@ -4859,8 +4859,10 @@ pm8001_chip_fw_flash_update_req(struct pm8001_hba_info *pm8001_ha,
 	ccb->ccb_tag = tag;
 	rc = pm8001_chip_fw_flash_update_build(pm8001_ha, &flash_update_info,
 		tag);
-	if (rc)
+	if (rc) {
+		kfree(fw_control_context);
 		pm8001_tag_free(pm8001_ha, tag);
+	}
 
 	return rc;
 }
-- 
Gitee


From 6a5fbd015bac01548cd295ba59f50b88f66a87e6 Mon Sep 17 00:00:00 2001
From: Yang Li <yang.lee@linux.alibaba.com>
Date: Thu, 14 Jul 2022 21:57:41 +0800
Subject: [PATCH 231/674] mt76: mt7615: Fix assigning negative values to
 unsigned variable

stable inclusion
from stable-v5.10.111
commit 91ee8a14efb6ced3db850729cf72730aad2ae596
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=91ee8a14efb6ced3db850729cf72730aad2ae596

--------------------------------

[ Upstream commit 9273ffcc9a11942bd586bb42584337ef3962b692 ]

Smatch reports the following:
drivers/net/wireless/mediatek/mt76/mt7615/mac.c:1865
mt7615_mac_adjust_sensitivity() warn: assigning (-110) to unsigned
variable 'def_th'
drivers/net/wireless/mediatek/mt76/mt7615/mac.c:1865
mt7615_mac_adjust_sensitivity() warn: assigning (-98) to unsigned
variable 'def_th'

Reported-by: Abaci Robot <abaci@linux.alibaba.com>
Signed-off-by: Yang Li <yang.lee@linux.alibaba.com>
Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/net/wireless/mediatek/mt76/mt7615/mac.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/mac.c b/drivers/net/wireless/mediatek/mt76/mt7615/mac.c
index 424be103093c..1465a92ea3fc 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7615/mac.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7615/mac.c
@@ -1626,7 +1626,7 @@ mt7615_mac_adjust_sensitivity(struct mt7615_phy *phy,
 	struct mt7615_dev *dev = phy->dev;
 	int false_cca = ofdm ? phy->false_cca_ofdm : phy->false_cca_cck;
 	bool ext_phy = phy != &dev->phy;
-	u16 def_th = ofdm ? -98 : -110;
+	s16 def_th = ofdm ? -98 : -110;
 	bool update = false;
 	s8 *sensitivity;
 	int signal;
-- 
Gitee


From 40f234f823b4369a7e0f0858e774193ed387032f Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Thu, 14 Jul 2022 21:57:42 +0800
Subject: [PATCH 232/674] scsi: aha152x: Fix aha152x_setup() __setup handler
 return value

stable inclusion
from stable-v5.10.111
commit f49ffaa85d2c13134d4e91896198084edace387c
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f49ffaa85d2c13134d4e91896198084edace387c

--------------------------------

[ Upstream commit cc8294ec4738d25e2bb2d71f7d82a9bf7f4a157b ]

__setup() handlers should return 1 if the command line option is handled
and 0 if not (or maybe never return 0; doing so just pollutes init's
environment with strings that are not init arguments/parameters).

Return 1 from aha152x_setup() to indicate that the boot option has been
handled.

Link: lore.kernel.org/r/64644a2f-4a20-bab3-1e15-3b2cdd0defe3@omprussia.ru
Link: https://lore.kernel.org/r/20220223000623.5920-1-rdunlap@infradead.org
Cc: "Juergen E. Fischer" <fischer@norbit.de>
Cc: "James E.J. Bottomley" <jejb@linux.ibm.com>
Cc: "Martin K. Petersen" <martin.petersen@oracle.com>
Reported-by: Igor Zhbanov <i.zhbanov@omprussia.ru>
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/scsi/aha152x.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/scsi/aha152x.c b/drivers/scsi/aha152x.c
index d8e19afa7a14..c6607c4686bb 100644
--- a/drivers/scsi/aha152x.c
+++ b/drivers/scsi/aha152x.c
@@ -3367,13 +3367,11 @@ static int __init aha152x_setup(char *str)
 	setup[setup_count].synchronous = ints[0] >= 6 ? ints[6] : 1;
 	setup[setup_count].delay       = ints[0] >= 7 ? ints[7] : DELAY_DEFAULT;
 	setup[setup_count].ext_trans   = ints[0] >= 8 ? ints[8] : 0;
-	if (ints[0] > 8) {                                                /*}*/
+	if (ints[0] > 8)
 		printk(KERN_NOTICE "aha152x: usage: aha152x=<IOBASE>[,<IRQ>[,<SCSI ID>"
 		       "[,<RECONNECT>[,<PARITY>[,<SYNCHRONOUS>[,<DELAY>[,<EXT_TRANS>]]]]]]]\n");
-	} else {
+	else
 		setup_count++;
-		return 0;
-	}
 
 	return 1;
 }
-- 
Gitee


From 946188f65fa4a553bbeba5179aa4c01c35e691f9 Mon Sep 17 00:00:00 2001
From: Qi Liu <liuqi115@huawei.com>
Date: Thu, 14 Jul 2022 21:57:43 +0800
Subject: [PATCH 233/674] scsi: hisi_sas: Free irq vectors in order for v3 HW

stable inclusion
from stable-v5.10.111
commit 224903cc60d045576393c3b16907742f23e6c740
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=224903cc60d045576393c3b16907742f23e6c740

--------------------------------

[ Upstream commit 554fb72ee34f4732c7f694f56c3c6e67790352a0 ]

If the driver probe fails to request the channel IRQ or fatal IRQ, the
driver will free the IRQ vectors before freeing the IRQs in free_irq(),
and this will cause a kernel BUG like this:

------------[ cut here ]------------
kernel BUG at drivers/pci/msi.c:369!
Internal error: Oops - BUG: 0 [#1] PREEMPT SMP
Call trace:
   free_msi_irqs+0x118/0x13c
   pci_disable_msi+0xfc/0x120
   pci_free_irq_vectors+0x24/0x3c
   hisi_sas_v3_probe+0x360/0x9d0 [hisi_sas_v3_hw]
   local_pci_probe+0x44/0xb0
   work_for_cpu_fn+0x20/0x34
   process_one_work+0x1d0/0x340
   worker_thread+0x2e0/0x460
   kthread+0x180/0x190
   ret_from_fork+0x10/0x20
---[ end trace b88990335b610c11 ]---

So we use devm_add_action() to control the order in which we free the
vectors.

Link: https://lore.kernel.org/r/1645703489-87194-4-git-send-email-john.garry@huawei.com
Signed-off-by: Qi Liu <liuqi115@huawei.com>
Signed-off-by: John Garry <john.garry@huawei.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>

 Conflicts:
	drivers/scsi/hisi_sas/hisi_sas_v3_hw.c

Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/scsi/hisi_sas/hisi_sas_v3_hw.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c
index 3ecc61eb7214..fd5bdb0afa71 100644
--- a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c
+++ b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c
@@ -2389,17 +2389,25 @@ static irqreturn_t cq_interrupt_v3_hw(int irq_no, void *p)
 	return IRQ_WAKE_THREAD;
 }
 
+static void hisi_sas_v3_free_vectors(void *data)
+{
+	struct pci_dev *pdev = data;
+
+	pci_free_irq_vectors(pdev);
+}
+
 static int interrupt_preinit_v3_hw(struct hisi_hba *hisi_hba)
 {
 	int vectors;
 	int max_msi = HISI_SAS_MSI_COUNT_V3_HW, min_msi;
 	struct Scsi_Host *shost = hisi_hba->shost;
+	struct pci_dev *pdev = hisi_hba->pci_dev;
 	struct irq_affinity desc = {
 		.pre_vectors = BASE_VECTORS_V3_HW,
 	};
 
 	min_msi = MIN_AFFINE_VECTORS_V3_HW;
-	vectors = pci_alloc_irq_vectors_affinity(hisi_hba->pci_dev,
+	vectors = pci_alloc_irq_vectors_affinity(pdev,
 						 min_msi, max_msi,
 						 PCI_IRQ_MSI |
 						 PCI_IRQ_AFFINITY,
@@ -2411,6 +2419,7 @@ static int interrupt_preinit_v3_hw(struct hisi_hba *hisi_hba)
 	hisi_hba->cq_nvecs = vectors - BASE_VECTORS_V3_HW;
 	shost->nr_hw_queues = hisi_hba->cq_nvecs;
 
+	devm_add_action(&pdev->dev, hisi_sas_v3_free_vectors, pdev);
 	return 0;
 }
 
@@ -4808,7 +4817,7 @@ hisi_sas_v3_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	dev_err(dev, "%d hw queues\n", shost->nr_hw_queues);
 	rc = scsi_add_host(shost, dev);
 	if (rc)
-		goto err_out_free_irq_vectors;
+		goto err_out_debugfs;
 
 	rc = sas_register_ha(sha);
 	if (rc)
@@ -4839,8 +4848,6 @@ hisi_sas_v3_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	sas_unregister_ha(sha);
 err_out_register_ha:
 	scsi_remove_host(shost);
-err_out_free_irq_vectors:
-	pci_free_irq_vectors(pdev);
 err_out_debugfs:
 	debugfs_exit_v3_hw(hisi_hba);
 err_out_ha:
@@ -4868,7 +4875,6 @@ hisi_sas_v3_destroy_irqs(struct pci_dev *pdev, struct hisi_hba *hisi_hba)
 
 		devm_free_irq(&pdev->dev, pci_irq_vector(pdev, nr), cq);
 	}
-	pci_free_irq_vectors(pdev);
 }
 
 static void hisi_sas_v3_remove(struct pci_dev *pdev)
-- 
Gitee


From 4942cbd3edb29a2d8b62ae33b52411bfbd40e4ee Mon Sep 17 00:00:00 2001
From: Dust Li <dust.li@linux.alibaba.com>
Date: Thu, 14 Jul 2022 21:57:44 +0800
Subject: [PATCH 234/674] net/smc: correct settings of RMB window update limit

stable inclusion
from stable-v5.10.111
commit f2565cb40e9b0b71337e9bffddd2bf57c117461b
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f2565cb40e9b0b71337e9bffddd2bf57c117461b

--------------------------------

[ Upstream commit 6bf536eb5c8ca011d1ff57b5c5f7c57ceac06a37 ]

rmbe_update_limit is used to limit announcing receive
window updating too frequently. RFC7609 request a minimal
increase in the window size of 10% of the receive buffer
space. But current implementation used:

  min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2)

and SOCK_MIN_SNDBUF / 2 == 2304 Bytes, which is almost
always less then 10% of the receive buffer space.

This causes the receiver always sending CDC message to
update its consumer cursor when it consumes more then 2K
of data. And as a result, we may encounter something like
"TCP silly window syndrome" when sending 2.5~8K message.

This patch fixes this using max(rmbe_size / 10, SOCK_MIN_SNDBUF / 2).

With this patch and SMC autocorking enabled, qperf 2K/4K/8K
tcp_bw test shows 45%/75%/40% increase in throughput respectively.

Signed-off-by: Dust Li <dust.li@linux.alibaba.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 net/smc/smc_core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index d69aac6c1fce..ef2fd28999ba 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -1426,7 +1426,7 @@ static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize,
  */
 static inline int smc_rmb_wnd_update_limit(int rmbe_size)
 {
-	return min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2);
+	return max_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2);
 }
 
 /* map an rmb buf to a link */
-- 
Gitee


From 9a8d7db5e318d4c0b54d6ae2f85631d7ef4577c2 Mon Sep 17 00:00:00 2001
From: Hangyu Hua <hbh25y@gmail.com>
Date: Thu, 14 Jul 2022 21:57:45 +0800
Subject: [PATCH 235/674] mips: ralink: fix a refcount leak in
 ill_acc_of_setup()

stable inclusion
from stable-v5.10.111
commit 142ae7d4f21524acfe073e5a3da5667aa85eb970
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=142ae7d4f21524acfe073e5a3da5667aa85eb970

--------------------------------

[ Upstream commit 4a0a1436053b17e50b7c88858fb0824326641793 ]

of_node_put(np) needs to be called when pdev == NULL.

Signed-off-by: Hangyu Hua <hbh25y@gmail.com>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 arch/mips/ralink/ill_acc.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/mips/ralink/ill_acc.c b/arch/mips/ralink/ill_acc.c
index bdf53807d7c2..bea857c9da8b 100644
--- a/arch/mips/ralink/ill_acc.c
+++ b/arch/mips/ralink/ill_acc.c
@@ -61,6 +61,7 @@ static int __init ill_acc_of_setup(void)
 	pdev = of_find_device_by_node(np);
 	if (!pdev) {
 		pr_err("%pOFn: failed to lookup pdev\n", np);
+		of_node_put(np);
 		return -EINVAL;
 	}
 
-- 
Gitee


From cc2e15dd0e8dc6be0e750dec0551f9fca3405329 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Thu, 14 Jul 2022 21:57:46 +0800
Subject: [PATCH 236/674] macvtap: advertise link netns via netlink
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

stable inclusion
from stable-v5.10.111
commit 797b4ea9515eb01ec9b79f9319abc5ac4fc7087d
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=797b4ea9515eb01ec9b79f9319abc5ac4fc7087d

--------------------------------

[ Upstream commit a02192151b7dbf855084c38dca380d77c7658353 ]

Assign rtnl_link_ops->get_link_net() callback so that IFLA_LINK_NETNSID is
added to rtnetlink messages. This fixes iproute2 which otherwise resolved
the link interface to an interface in the wrong namespace.

Test commands:

  ip netns add nst
  ip link add dummy0 type dummy
  ip link add link macvtap0 link dummy0 type macvtap
  ip link set macvtap0 netns nst
  ip -netns nst link show macvtap0

Before:

  10: macvtap0@gre0: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 500
      link/ether 5e:8f:ae:1d:60:50 brd ff:ff:ff:ff:ff:ff

After:

  10: macvtap0@if2: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 500
      link/ether 5e:8f:ae:1d:60:50 brd ff:ff:ff:ff:ff:ff link-netnsid 0

Reported-by: Leonardo Mörlein <freifunk@irrelefant.net>
Signed-off-by: Sven Eckelmann <sven@narfation.org>
Link: https://lore.kernel.org/r/20220228003240.1337426-1-sven@narfation.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/net/macvtap.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 694e2f5dbbe5..39801c31e507 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -133,11 +133,17 @@ static void macvtap_setup(struct net_device *dev)
 	dev->tx_queue_len = TUN_READQ_SIZE;
 }
 
+static struct net *macvtap_link_net(const struct net_device *dev)
+{
+	return dev_net(macvlan_dev_real_dev(dev));
+}
+
 static struct rtnl_link_ops macvtap_link_ops __read_mostly = {
 	.kind		= "macvtap",
 	.setup		= macvtap_setup,
 	.newlink	= macvtap_newlink,
 	.dellink	= macvtap_dellink,
+	.get_link_net	= macvtap_link_net,
 	.priv_size      = sizeof(struct macvtap_dev),
 };
 
-- 
Gitee


From 3e3f5567fbfb97b05316cbda2254ad8ab174b56e Mon Sep 17 00:00:00 2001
From: Harold Huang <baymaxhuang@gmail.com>
Date: Thu, 14 Jul 2022 21:57:47 +0800
Subject: [PATCH 237/674] tuntap: add sanity checks about msg_controllen in
 sendmsg

stable inclusion
from stable-v5.10.111
commit 647b35aaf454271d68067d4bf05154095fe63f93
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=647b35aaf454271d68067d4bf05154095fe63f93

--------------------------------

[ Upstream commit 74a335a07a17d131b9263bfdbdcb5e40673ca9ca ]

In patch [1], tun_msg_ctl was added to allow pass batched xdp buffers to
tun_sendmsg. Although we donot use msg_controllen in this path, we should
check msg_controllen to make sure the caller pass a valid msg_ctl.

[1]: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=fe8dd45bb7556246c6b76277b1ba4296c91c2505

Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
Suggested-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Harold Huang <baymaxhuang@gmail.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Link: https://lore.kernel.org/r/20220303022441.383865-1-baymaxhuang@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/net/tap.c   | 3 ++-
 drivers/net/tun.c   | 3 ++-
 drivers/vhost/net.c | 1 +
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index f549d3a8e59c..8f7bb15206e9 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -1202,7 +1202,8 @@ static int tap_sendmsg(struct socket *sock, struct msghdr *m,
 	struct xdp_buff *xdp;
 	int i;
 
-	if (ctl && (ctl->type == TUN_MSG_PTR)) {
+	if (m->msg_controllen == sizeof(struct tun_msg_ctl) &&
+	    ctl && ctl->type == TUN_MSG_PTR) {
 		for (i = 0; i < ctl->num; i++) {
 			xdp = &((struct xdp_buff *)ctl->ptr)[i];
 			tap_get_user_xdp(q, xdp);
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index ab64f3d007c1..aef966a9dae2 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -2499,7 +2499,8 @@ static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
 	if (!tun)
 		return -EBADFD;
 
-	if (ctl && (ctl->type == TUN_MSG_PTR)) {
+	if (m->msg_controllen == sizeof(struct tun_msg_ctl) &&
+	    ctl && ctl->type == TUN_MSG_PTR) {
 		struct tun_page tpage;
 		int n = ctl->num;
 		int flush = 0;
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 379bae56040a..eba91da505e0 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -472,6 +472,7 @@ static void vhost_tx_batch(struct vhost_net *net,
 		goto signal_used;
 
 	msghdr->msg_control = &ctl;
+	msghdr->msg_controllen = sizeof(ctl);
 	err = sock->ops->sendmsg(sock, msghdr, 0);
 	if (unlikely(err < 0)) {
 		vq_err(&nvq->vq, "Fail to batch sending packets\n");
-- 
Gitee


From 886811ae87388dcbdd558e33cae1cd3d16632ce2 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Thu, 14 Jul 2022 21:57:48 +0800
Subject: [PATCH 238/674] Bluetooth: Fix not checking for valid hdev on
 bt_dev_{info,warn,err,dbg}

stable inclusion
from stable-v5.10.111
commit f9b183f1332ad0ed23317a980064d53d8028a280
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f9b183f1332ad0ed23317a980064d53d8028a280

--------------------------------

[ Upstream commit 9b392e0e0b6d026da5a62bb79a08f32e27af858e ]

This fixes attemting to print hdev->name directly which causes them to
print an error:

kernel: read_version:367: (efault): sock 000000006a3008f2

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 include/net/bluetooth/bluetooth.h | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h
index 9125effbf448..3fecc4a411a1 100644
--- a/include/net/bluetooth/bluetooth.h
+++ b/include/net/bluetooth/bluetooth.h
@@ -180,19 +180,21 @@ void bt_err_ratelimited(const char *fmt, ...);
 #define BT_DBG(fmt, ...)	pr_debug(fmt "\n", ##__VA_ARGS__)
 #endif
 
+#define bt_dev_name(hdev) ((hdev) ? (hdev)->name : "null")
+
 #define bt_dev_info(hdev, fmt, ...)				\
-	BT_INFO("%s: " fmt, (hdev)->name, ##__VA_ARGS__)
+	BT_INFO("%s: " fmt, bt_dev_name(hdev), ##__VA_ARGS__)
 #define bt_dev_warn(hdev, fmt, ...)				\
-	BT_WARN("%s: " fmt, (hdev)->name, ##__VA_ARGS__)
+	BT_WARN("%s: " fmt, bt_dev_name(hdev), ##__VA_ARGS__)
 #define bt_dev_err(hdev, fmt, ...)				\
-	BT_ERR("%s: " fmt, (hdev)->name, ##__VA_ARGS__)
+	BT_ERR("%s: " fmt, bt_dev_name(hdev), ##__VA_ARGS__)
 #define bt_dev_dbg(hdev, fmt, ...)				\
-	BT_DBG("%s: " fmt, (hdev)->name, ##__VA_ARGS__)
+	BT_DBG("%s: " fmt, bt_dev_name(hdev), ##__VA_ARGS__)
 
 #define bt_dev_warn_ratelimited(hdev, fmt, ...)			\
-	bt_warn_ratelimited("%s: " fmt, (hdev)->name, ##__VA_ARGS__)
+	bt_warn_ratelimited("%s: " fmt, bt_dev_name(hdev), ##__VA_ARGS__)
 #define bt_dev_err_ratelimited(hdev, fmt, ...)			\
-	bt_err_ratelimited("%s: " fmt, (hdev)->name, ##__VA_ARGS__)
+	bt_err_ratelimited("%s: " fmt, bt_dev_name(hdev), ##__VA_ARGS__)
 
 /* Connection and socket states */
 enum {
-- 
Gitee


From 20542a2d7ee0f75e490fa1df9b101b2e11b4e3d4 Mon Sep 17 00:00:00 2001
From: "Minghao Chi (CGEL ZTE)" <chi.minghao@zte.com.cn>
Date: Thu, 14 Jul 2022 21:57:49 +0800
Subject: [PATCH 239/674] Bluetooth: use memset avoid memory leaks

stable inclusion
from stable-v5.10.111
commit 9567d54e70ff58c2695c2cc2e53c86c67551d3e6
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=9567d54e70ff58c2695c2cc2e53c86c67551d3e6

--------------------------------

[ Upstream commit d3715b2333e9a21692ba16ef8645eda584a9515d ]

Use memset to initialize structs to prevent memory leaks
in l2cap_ecred_connect

Reported-by: Zeal Robot <zealci@zte.com.cn>
Signed-off-by: Minghao Chi (CGEL ZTE) <chi.minghao@zte.com.cn>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 net/bluetooth/l2cap_core.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index 0ddbc415ce15..012c1a0abda8 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -1438,6 +1438,7 @@ static void l2cap_ecred_connect(struct l2cap_chan *chan)
 
 	l2cap_ecred_init(chan, 0);
 
+	memset(&data, 0, sizeof(data));
 	data.pdu.req.psm     = chan->psm;
 	data.pdu.req.mtu     = cpu_to_le16(chan->imtu);
 	data.pdu.req.mps     = cpu_to_le16(chan->mps);
-- 
Gitee


From 5452a9496e8206fe5885bcb1d1dd116f1e9b5293 Mon Sep 17 00:00:00 2001
From: Michael Chan <michael.chan@broadcom.com>
Date: Thu, 14 Jul 2022 21:57:50 +0800
Subject: [PATCH 240/674] bnxt_en: Eliminate unintended link toggle during FW
 reset

stable inclusion
from stable-v5.10.111
commit 79cfc0052f3916238421330c2a70474ee2df3567
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=79cfc0052f3916238421330c2a70474ee2df3567

--------------------------------

[ Upstream commit 7c492a2530c1f05441da541307c2534230dfd59b ]

If the flow control settings have been changed, a subsequent FW reset
may cause the ethernet link to toggle unnecessarily.  This link toggle
will increase the down time by a few seconds.

The problem is caused by bnxt_update_phy_setting() detecting a false
mismatch in the flow control settings between the stored software
settings and the current FW settings after the FW reset.  This mismatch
is caused by the AUTONEG bit added to link_info->req_flow_ctrl in an
inconsistent way in bnxt_set_pauseparam() in autoneg mode.  The AUTONEG
bit should not be added to link_info->req_flow_ctrl.

Reviewed-by: Colin Winegarden <colin.winegarden@broadcom.com>
Reviewed-by: Pavan Chebbi <pavan.chebbi@broadcom.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
index 66f9d9b3de0a..d90b7b85c052 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
@@ -2049,9 +2049,7 @@ static int bnxt_set_pauseparam(struct net_device *dev,
 		}
 
 		link_info->autoneg |= BNXT_AUTONEG_FLOW_CTRL;
-		if (bp->hwrm_spec_code >= 0x10201)
-			link_info->req_flow_ctrl =
-				PORT_PHY_CFG_REQ_AUTO_PAUSE_AUTONEG_PAUSE;
+		link_info->req_flow_ctrl = 0;
 	} else {
 		/* when transition from auto pause to force pause,
 		 * force a link change
-- 
Gitee


From a3f2447127a7dd0c33d4feaa0b17afee2660c78e Mon Sep 17 00:00:00 2001
From: Li Chen <lchen@ambarella.com>
Date: Thu, 14 Jul 2022 21:57:51 +0800
Subject: [PATCH 241/674] PCI: endpoint: Fix misused goto label

stable inclusion
from stable-v5.10.111
commit 7c657c0694ff690e361a13ce41c36b9dfb433ec8
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=7c657c0694ff690e361a13ce41c36b9dfb433ec8

--------------------------------

[ Upstream commit bf8d87c076f55b8b4dfdb6bc6c6b6dc0c2ccb487 ]

Fix a misused goto label jump since that can result in a memory leak.

Link: https://lore.kernel.org/r/17e7b9b9ee6.c6d9c6a02564.4545388417402742326@zohomail.com
Signed-off-by: Li Chen <lchen@ambarella.com>
Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Acked-by: Kishon Vijay Abraham I <kishon@ti.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/pci/endpoint/functions/pci-epf-test.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/pci/endpoint/functions/pci-epf-test.c b/drivers/pci/endpoint/functions/pci-epf-test.c
index b861840e867c..262b2c4c70c9 100644
--- a/drivers/pci/endpoint/functions/pci-epf-test.c
+++ b/drivers/pci/endpoint/functions/pci-epf-test.c
@@ -451,7 +451,7 @@ static int pci_epf_test_write(struct pci_epf_test *epf_test)
 		if (!epf_test->dma_supported) {
 			dev_err(dev, "Cannot transfer data using DMA\n");
 			ret = -EINVAL;
-			goto err_map_addr;
+			goto err_dma_map;
 		}
 
 		src_phys_addr = dma_map_single(dma_dev, buf, reg->size,
-- 
Gitee


From 671a69e4246fff6155e9b73eb2dc06bef77a840c Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <alobakin@pm.me>
Date: Thu, 14 Jul 2022 21:57:52 +0800
Subject: [PATCH 242/674] MIPS: fix fortify panic when copying asm exception
 handlers

stable inclusion
from stable-v5.10.111
commit fd416c3f5a4c0e85abb443bd31b49e03d52ef07b
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=fd416c3f5a4c0e85abb443bd31b49e03d52ef07b

--------------------------------

[ Upstream commit d17b66417308996e7e64b270a3c7f3c1fbd4cfc8 ]

With KCFLAGS="-O3", I was able to trigger a fortify-source
memcpy() overflow panic on set_vi_srs_handler().
Although O3 level is not supported in the mainline, under some
conditions that may've happened with any optimization settings,
it's just a matter of inlining luck. The panic itself is correct,
more precisely, 50/50 false-positive and not at the same time.
From the one side, no real overflow happens. Exception handler
defined in asm just gets copied to some reserved places in the
memory.
But the reason behind is that C code refers to that exception
handler declares it as `char`, i.e. something of 1 byte length.
It's obvious that the asm function itself is way more than 1 byte,
so fortify logics thought we are going to past the symbol declared.
The standard way to refer to asm symbols from C code which is not
supposed to be called from C is to declare them as
`extern const u8[]`. This is fully correct from any point of view,
as any code itself is just a bunch of bytes (including 0 as it is
for syms like _stext/_etext/etc.), and the exact size is not known
at the moment of compilation.
Adjust the type of the except_vec_vi_*() and related variables.
Make set_handler() take `const` as a second argument to avoid
cast-away warnings and give a little more room for optimization.

Signed-off-by: Alexander Lobakin <alobakin@pm.me>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 arch/mips/include/asm/setup.h |  2 +-
 arch/mips/kernel/traps.c      | 22 +++++++++++-----------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/arch/mips/include/asm/setup.h b/arch/mips/include/asm/setup.h
index bb36a400203d..8c56b862fd9c 100644
--- a/arch/mips/include/asm/setup.h
+++ b/arch/mips/include/asm/setup.h
@@ -16,7 +16,7 @@ static inline void setup_8250_early_printk_port(unsigned long base,
 	unsigned int reg_shift, unsigned int timeout) {}
 #endif
 
-extern void set_handler(unsigned long offset, void *addr, unsigned long len);
+void set_handler(unsigned long offset, const void *addr, unsigned long len);
 extern void set_uncached_handler(unsigned long offset, void *addr, unsigned long len);
 
 typedef void (*vi_handler_t)(void);
diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c
index e0352958e2f7..b1fe4518bd22 100644
--- a/arch/mips/kernel/traps.c
+++ b/arch/mips/kernel/traps.c
@@ -2097,19 +2097,19 @@ static void *set_vi_srs_handler(int n, vi_handler_t addr, int srs)
 		 * If no shadow set is selected then use the default handler
 		 * that does normal register saving and standard interrupt exit
 		 */
-		extern char except_vec_vi, except_vec_vi_lui;
-		extern char except_vec_vi_ori, except_vec_vi_end;
-		extern char rollback_except_vec_vi;
-		char *vec_start = using_rollback_handler() ?
-			&rollback_except_vec_vi : &except_vec_vi;
+		extern const u8 except_vec_vi[], except_vec_vi_lui[];
+		extern const u8 except_vec_vi_ori[], except_vec_vi_end[];
+		extern const u8 rollback_except_vec_vi[];
+		const u8 *vec_start = using_rollback_handler() ?
+				      rollback_except_vec_vi : except_vec_vi;
 #if defined(CONFIG_CPU_MICROMIPS) || defined(CONFIG_CPU_BIG_ENDIAN)
-		const int lui_offset = &except_vec_vi_lui - vec_start + 2;
-		const int ori_offset = &except_vec_vi_ori - vec_start + 2;
+		const int lui_offset = except_vec_vi_lui - vec_start + 2;
+		const int ori_offset = except_vec_vi_ori - vec_start + 2;
 #else
-		const int lui_offset = &except_vec_vi_lui - vec_start;
-		const int ori_offset = &except_vec_vi_ori - vec_start;
+		const int lui_offset = except_vec_vi_lui - vec_start;
+		const int ori_offset = except_vec_vi_ori - vec_start;
 #endif
-		const int handler_len = &except_vec_vi_end - vec_start;
+		const int handler_len = except_vec_vi_end - vec_start;
 
 		if (handler_len > VECTORSPACING) {
 			/*
@@ -2317,7 +2317,7 @@ void per_cpu_trap_init(bool is_boot_cpu)
 }
 
 /* Install CPU exception handler */
-void set_handler(unsigned long offset, void *addr, unsigned long size)
+void set_handler(unsigned long offset, const void *addr, unsigned long size)
 {
 #ifdef CONFIG_CPU_MICROMIPS
 	memcpy((void *)(ebase + offset), ((unsigned char *)addr - 1), size);
-- 
Gitee


From 5365fa404c658fbd709aaf776f9617fc1170143c Mon Sep 17 00:00:00 2001
From: Hangyu Hua <hbh25y@gmail.com>
Date: Thu, 14 Jul 2022 21:57:53 +0800
Subject: [PATCH 243/674] powerpc/secvar: fix refcount leak in format_show()

stable inclusion
from stable-v5.10.111
commit 02222bf4f0a27f6eba66d1f597cdb5daadd51829
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=02222bf4f0a27f6eba66d1f597cdb5daadd51829

--------------------------------

[ Upstream commit d601fd24e6964967f115f036a840f4f28488f63f ]

Refcount leak will happen when format_show returns failure in multiple
cases. Unified management of of_node_put can fix this problem.

Signed-off-by: Hangyu Hua <hbh25y@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20220302021959.10959-1-hbh25y@gmail.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 arch/powerpc/kernel/secvar-sysfs.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/secvar-sysfs.c b/arch/powerpc/kernel/secvar-sysfs.c
index a0a78aba2083..1ee4640a2641 100644
--- a/arch/powerpc/kernel/secvar-sysfs.c
+++ b/arch/powerpc/kernel/secvar-sysfs.c
@@ -26,15 +26,18 @@ static ssize_t format_show(struct kobject *kobj, struct kobj_attribute *attr,
 	const char *format;
 
 	node = of_find_compatible_node(NULL, NULL, "ibm,secvar-backend");
-	if (!of_device_is_available(node))
-		return -ENODEV;
+	if (!of_device_is_available(node)) {
+		rc = -ENODEV;
+		goto out;
+	}
 
 	rc = of_property_read_string(node, "format", &format);
 	if (rc)
-		return rc;
+		goto out;
 
 	rc = sprintf(buf, "%s\n", format);
 
+out:
 	of_node_put(node);
 
 	return rc;
-- 
Gitee


From 17e3fdb4a43980671111eda19b77fe70610bdcfa Mon Sep 17 00:00:00 2001
From: Jianglei Nie <niejianglei2021@163.com>
Date: Thu, 14 Jul 2022 21:57:54 +0800
Subject: [PATCH 244/674] scsi: libfc: Fix use after free in
 fc_exch_abts_resp()

stable inclusion
from stable-v5.10.111
commit 1d7effe5fff9d28e45e18ac3a564067c7ddfe898
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=1d7effe5fff9d28e45e18ac3a564067c7ddfe898

--------------------------------

[ Upstream commit 271add11994ba1a334859069367e04d2be2ebdd4 ]

fc_exch_release(ep) will decrease the ep's reference count. When the
reference count reaches zero, it is freed. But ep is still used in the
following code, which will lead to a use after free.

Return after the fc_exch_release() call to avoid use after free.

Link: https://lore.kernel.org/r/20220303015115.459778-1-niejianglei2021@163.com
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Jianglei Nie <niejianglei2021@163.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/scsi/libfc/fc_exch.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/scsi/libfc/fc_exch.c b/drivers/scsi/libfc/fc_exch.c
index a50f1eef0e0c..4261380af97b 100644
--- a/drivers/scsi/libfc/fc_exch.c
+++ b/drivers/scsi/libfc/fc_exch.c
@@ -1702,6 +1702,7 @@ static void fc_exch_abts_resp(struct fc_exch *ep, struct fc_frame *fp)
 	if (cancel_delayed_work_sync(&ep->timeout_work)) {
 		FC_EXCH_DBG(ep, "Exchange timer canceled due to ABTS response\n");
 		fc_exch_release(ep);	/* release from pending timer hold */
+		return;
 	}
 
 	spin_lock_bh(&ep->ex_lock);
-- 
Gitee


From 6e87a4cc684895159cdcbf5f624d506fe38fccbd Mon Sep 17 00:00:00 2001
From: Oliver Hartkopp <socketcan@hartkopp.net>
Date: Thu, 14 Jul 2022 21:57:55 +0800
Subject: [PATCH 245/674] can: isotp: set default value for N_As to 50 micro
 seconds

stable inclusion
from stable-v5.10.111
commit 74c4d50255519a1fd53dd791239f1442f1ae291b
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=74c4d50255519a1fd53dd791239f1442f1ae291b

--------------------------------

[ Upstream commit 530e0d46c61314c59ecfdb8d3bcb87edbc0f85d3 ]

The N_As value describes the time a CAN frame needs on the wire when
transmitted by the CAN controller. Even very short CAN FD frames need
arround 100 usecs (bitrate 1Mbit/s, data bitrate 8Mbit/s).

Having N_As to be zero (the former default) leads to 'no CAN frame
separation' when STmin is set to zero by the receiving node. This 'burst
mode' should not be enabled by default as it could potentially dump a high
number of CAN frames into the netdev queue from the soft hrtimer context.
This does not affect the system stability but is just not nice and
cooperative.

With this N_As/frame_txtime value the 'burst mode' is disabled by default.

As user space applications usually do not set the frame_txtime element
of struct can_isotp_options the new in-kernel default is very likely
overwritten with zero when the sockopt() CAN_ISOTP_OPTS is invoked.
To make sure that a N_As value of zero is only set intentional the
value '0' is now interpreted as 'do not change the current value'.
When a frame_txtime of zero is required for testing purposes this
CAN_ISOTP_FRAME_TXTIME_ZERO u32 value has to be set in frame_txtime.

Link: https://lore.kernel.org/all/20220309120416.83514-2-socketcan@hartkopp.net
Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 include/uapi/linux/can/isotp.h | 28 ++++++++++++++++++++++------
 net/can/isotp.c                | 12 +++++++++++-
 2 files changed, 33 insertions(+), 7 deletions(-)

diff --git a/include/uapi/linux/can/isotp.h b/include/uapi/linux/can/isotp.h
index c55935b64ccc..590f8aea2b6d 100644
--- a/include/uapi/linux/can/isotp.h
+++ b/include/uapi/linux/can/isotp.h
@@ -137,20 +137,16 @@ struct can_isotp_ll_options {
 #define CAN_ISOTP_WAIT_TX_DONE	0x400	/* wait for tx completion */
 #define CAN_ISOTP_SF_BROADCAST	0x800	/* 1-to-N functional addressing */
 
-/* default values */
+/* protocol machine default values */
 
 #define CAN_ISOTP_DEFAULT_FLAGS		0
 #define CAN_ISOTP_DEFAULT_EXT_ADDRESS	0x00
 #define CAN_ISOTP_DEFAULT_PAD_CONTENT	0xCC /* prevent bit-stuffing */
-#define CAN_ISOTP_DEFAULT_FRAME_TXTIME	0
+#define CAN_ISOTP_DEFAULT_FRAME_TXTIME	50000 /* 50 micro seconds */
 #define CAN_ISOTP_DEFAULT_RECV_BS	0
 #define CAN_ISOTP_DEFAULT_RECV_STMIN	0x00
 #define CAN_ISOTP_DEFAULT_RECV_WFTMAX	0
 
-#define CAN_ISOTP_DEFAULT_LL_MTU	CAN_MTU
-#define CAN_ISOTP_DEFAULT_LL_TX_DL	CAN_MAX_DLEN
-#define CAN_ISOTP_DEFAULT_LL_TX_FLAGS	0
-
 /*
  * Remark on CAN_ISOTP_DEFAULT_RECV_* values:
  *
@@ -162,4 +158,24 @@ struct can_isotp_ll_options {
  * consistency and copied directly into the flow control (FC) frame.
  */
 
+/* link layer default values => make use of Classical CAN frames */
+
+#define CAN_ISOTP_DEFAULT_LL_MTU	CAN_MTU
+#define CAN_ISOTP_DEFAULT_LL_TX_DL	CAN_MAX_DLEN
+#define CAN_ISOTP_DEFAULT_LL_TX_FLAGS	0
+
+/*
+ * The CAN_ISOTP_DEFAULT_FRAME_TXTIME has become a non-zero value as
+ * it only makes sense for isotp implementation tests to run without
+ * a N_As value. As user space applications usually do not set the
+ * frame_txtime element of struct can_isotp_options the new in-kernel
+ * default is very likely overwritten with zero when the sockopt()
+ * CAN_ISOTP_OPTS is invoked.
+ * To make sure that a N_As value of zero is only set intentional the
+ * value '0' is now interpreted as 'do not change the current value'.
+ * When a frame_txtime of zero is required for testing purposes this
+ * CAN_ISOTP_FRAME_TXTIME_ZERO u32 value has to be set in frame_txtime.
+ */
+#define CAN_ISOTP_FRAME_TXTIME_ZERO	0xFFFFFFFF
+
 #endif /* !_UAPI_CAN_ISOTP_H */
diff --git a/net/can/isotp.c b/net/can/isotp.c
index 63e6e8923200..9a4a9c5a9f24 100644
--- a/net/can/isotp.c
+++ b/net/can/isotp.c
@@ -141,6 +141,7 @@ struct isotp_sock {
 	struct can_isotp_options opt;
 	struct can_isotp_fc_options rxfc, txfc;
 	struct can_isotp_ll_options ll;
+	u32 frame_txtime;
 	u32 force_tx_stmin;
 	u32 force_rx_stmin;
 	struct tpcon rx, tx;
@@ -360,7 +361,7 @@ static int isotp_rcv_fc(struct isotp_sock *so, struct canfd_frame *cf, int ae)
 
 		so->tx_gap = ktime_set(0, 0);
 		/* add transmission time for CAN frame N_As */
-		so->tx_gap = ktime_add_ns(so->tx_gap, so->opt.frame_txtime);
+		so->tx_gap = ktime_add_ns(so->tx_gap, so->frame_txtime);
 		/* add waiting time for consecutive frames N_Cs */
 		if (so->opt.flags & CAN_ISOTP_FORCE_TXSTMIN)
 			so->tx_gap = ktime_add_ns(so->tx_gap,
@@ -1245,6 +1246,14 @@ static int isotp_setsockopt_locked(struct socket *sock, int level, int optname,
 		/* no separate rx_ext_address is given => use ext_address */
 		if (!(so->opt.flags & CAN_ISOTP_RX_EXT_ADDR))
 			so->opt.rx_ext_address = so->opt.ext_address;
+
+		/* check for frame_txtime changes (0 => no changes) */
+		if (so->opt.frame_txtime) {
+			if (so->opt.frame_txtime == CAN_ISOTP_FRAME_TXTIME_ZERO)
+				so->frame_txtime = 0;
+			else
+				so->frame_txtime = so->opt.frame_txtime;
+		}
 		break;
 
 	case CAN_ISOTP_RECV_FC:
@@ -1446,6 +1455,7 @@ static int isotp_init(struct sock *sk)
 	so->opt.rxpad_content = CAN_ISOTP_DEFAULT_PAD_CONTENT;
 	so->opt.txpad_content = CAN_ISOTP_DEFAULT_PAD_CONTENT;
 	so->opt.frame_txtime = CAN_ISOTP_DEFAULT_FRAME_TXTIME;
+	so->frame_txtime = CAN_ISOTP_DEFAULT_FRAME_TXTIME;
 	so->rxfc.bs = CAN_ISOTP_DEFAULT_RECV_BS;
 	so->rxfc.stmin = CAN_ISOTP_DEFAULT_RECV_STMIN;
 	so->rxfc.wftmax = CAN_ISOTP_DEFAULT_RECV_WFTMAX;
-- 
Gitee


From c487d7a7fe5d0aeaef5b967b1c7421e5fd90c5be Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 14 Jul 2022 21:57:56 +0800
Subject: [PATCH 246/674] net: account alternate interface name memory

stable inclusion
from stable-v5.10.111
commit 423e7107f61ff6dfd4b380efc68c5acfc2546e4c
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=423e7107f61ff6dfd4b380efc68c5acfc2546e4c

--------------------------------

[ Upstream commit 5d26cff5bdbebdf98ba48217c078ff102536f134 ]

George reports that altnames can eat up kernel memory.
We should charge that memory appropriately.

Reported-by: George Shuklin <george.shuklin@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 net/core/rtnetlink.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 59284e51a2b4..857b1db8189a 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -3621,7 +3621,7 @@ static int rtnl_alt_ifname(int cmd, struct net_device *dev, struct nlattr *attr,
 	if (err)
 		return err;
 
-	alt_ifname = nla_strdup(attr, GFP_KERNEL);
+	alt_ifname = nla_strdup(attr, GFP_KERNEL_ACCOUNT);
 	if (!alt_ifname)
 		return -ENOMEM;
 
-- 
Gitee


From c005766ff8a26fa0ba8d69aad93cbc882c5dd363 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 14 Jul 2022 21:57:57 +0800
Subject: [PATCH 247/674] net: limit altnames to 64k total

stable inclusion
from stable-v5.10.111
commit 278b652f0ad9d34158d4834c6b00b95a808fd230
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=278b652f0ad9d34158d4834c6b00b95a808fd230

--------------------------------

[ Upstream commit 155fb43b70b5fce341347a77d1af2765d1e8fbb8 ]

Property list (altname is a link "property") is wrapped
in a nlattr. nlattrs length is 16bit so practically
speaking the list of properties can't be longer than
that, otherwise user space would have to interpret
broken netlink messages.

Prevent the problem from occurring by checking the length
of the property list before adding new entries.

Reported-by: George Shuklin <george.shuklin@gmail.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 net/core/rtnetlink.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 857b1db8189a..3c9c2d6e3b92 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -3615,12 +3615,23 @@ static int rtnl_alt_ifname(int cmd, struct net_device *dev, struct nlattr *attr,
 			   bool *changed, struct netlink_ext_ack *extack)
 {
 	char *alt_ifname;
+	size_t size;
 	int err;
 
 	err = nla_validate(attr, attr->nla_len, IFLA_MAX, ifla_policy, extack);
 	if (err)
 		return err;
 
+	if (cmd == RTM_NEWLINKPROP) {
+		size = rtnl_prop_list_size(dev);
+		size += nla_total_size(ALTIFNAMSIZ);
+		if (size >= U16_MAX) {
+			NL_SET_ERR_MSG(extack,
+				       "effective property list too long");
+			return -EINVAL;
+		}
+	}
+
 	alt_ifname = nla_strdup(attr, GFP_KERNEL_ACCOUNT);
 	if (!alt_ifname)
 		return -ENOMEM;
-- 
Gitee


From 3a10d8db1b6e0c5fe78342c61f77ee074911942a Mon Sep 17 00:00:00 2001
From: Michael Walle <michael@walle.cc>
Date: Thu, 14 Jul 2022 21:57:58 +0800
Subject: [PATCH 248/674] net: sfp: add 2500base-X quirk for Lantech SFP module

stable inclusion
from stable-v5.10.111
commit a4dd3e9e5ae84c2757d67933828c5d16a1e76769
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=a4dd3e9e5ae84c2757d67933828c5d16a1e76769

--------------------------------

[ Upstream commit 00eec9fe4f3b9588b4bfa8ef9dd0aae96407d5d7 ]

The Lantech 8330-262D-E module is 2500base-X capable, but it reports the
nominal bitrate as 2500MBd instead of 3125MBd. Add a quirk for the
module.

The following in an EEPROM dump of such a SFP with the serial number
redacted:

00: 03 04 07 00 00 00 01 20 40 0c 05 01 19 00 00 00    ???...? @????...
10: 1e 0f 00 00 4c 61 6e 74 65 63 68 20 20 20 20 20    ??..Lantech
20: 20 20 20 20 00 00 00 00 38 33 33 30 2d 32 36 32        ....8330-262
30: 44 2d 45 20 20 20 20 20 56 31 2e 30 03 52 00 cb    D-E     V1.0?R.?
40: 00 1a 00 00 46 43 XX XX XX XX XX XX XX XX XX XX    .?..FCXXXXXXXXXX
50: 20 20 20 20 32 32 30 32 31 34 20 20 68 b0 01 98        220214  h???
60: 45 58 54 52 45 4d 45 4c 59 20 43 4f 4d 50 41 54    EXTREMELY COMPAT
70: 49 42 4c 45 20 20 20 20 20 20 20 20 20 20 20 20    IBLE

Signed-off-by: Michael Walle <michael@walle.cc>
Link: https://lore.kernel.org/r/20220312205014.4154907-1-michael@walle.cc
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/net/phy/sfp-bus.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/phy/sfp-bus.c b/drivers/net/phy/sfp-bus.c
index a05d8372669c..850915a37f4c 100644
--- a/drivers/net/phy/sfp-bus.c
+++ b/drivers/net/phy/sfp-bus.c
@@ -74,6 +74,12 @@ static const struct sfp_quirk sfp_quirks[] = {
 		.vendor = "HUAWEI",
 		.part = "MA5671A",
 		.modes = sfp_quirk_2500basex,
+	}, {
+		// Lantech 8330-262D-E can operate at 2500base-X, but
+		// incorrectly report 2500MBd NRZ in their EEPROM
+		.vendor = "Lantech",
+		.part = "8330-262D-E",
+		.modes = sfp_quirk_2500basex,
 	}, {
 		.vendor = "UBNT",
 		.part = "UF-INSTANT",
-- 
Gitee


From acb549eedca10706368d9a9d8cdaeb4f02e3d508 Mon Sep 17 00:00:00 2001
From: "H. Nikolaus Schaller" <hns@goldelico.com>
Date: Thu, 14 Jul 2022 21:57:59 +0800
Subject: [PATCH 249/674] usb: dwc3: omap: fix "unbalanced disables for
 smps10_out1" on omap5evm

stable inclusion
from stable-v5.10.111
commit f6b9550f53674226838076139ebfc1818f5bbada
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f6b9550f53674226838076139ebfc1818f5bbada

--------------------------------

[ Upstream commit ac01df343e5a6c6bcead2ed421af1fde30f73e7e ]

Usually, the vbus_regulator (smps10 on omap5evm) boots up disabled.

Hence calling regulator_disable() indirectly through dwc3_omap_set_mailbox()
during probe leads to:

[   10.332764] WARNING: CPU: 0 PID: 1628 at drivers/regulator/core.c:2853 _regulator_disable+0x40/0x164
[   10.351919] unbalanced disables for smps10_out1
[   10.361298] Modules linked in: dwc3_omap(+) clk_twl6040 at24 gpio_twl6040 palmas_gpadc palmas_pwrbutton
industrialio snd_soc_omap_mcbsp(+) snd_soc_ti_sdma display_connector ti_tpd12s015 drm leds_gpio
drm_panel_orientation_quirks ip_tables x_tables ipv6 autofs4
[   10.387818] CPU: 0 PID: 1628 Comm: systemd-udevd Not tainted 5.17.0-rc1-letux-lpae+ #8139
[   10.405129] Hardware name: Generic OMAP5 (Flattened Device Tree)
[   10.411455]  unwind_backtrace from show_stack+0x10/0x14
[   10.416970]  show_stack from dump_stack_lvl+0x40/0x4c
[   10.422313]  dump_stack_lvl from __warn+0xb8/0x170
[   10.427377]  __warn from warn_slowpath_fmt+0x70/0x9c
[   10.432595]  warn_slowpath_fmt from _regulator_disable+0x40/0x164
[   10.439037]  _regulator_disable from regulator_disable+0x30/0x64
[   10.445382]  regulator_disable from dwc3_omap_set_mailbox+0x8c/0xf0 [dwc3_omap]
[   10.453116]  dwc3_omap_set_mailbox [dwc3_omap] from dwc3_omap_probe+0x2b8/0x394 [dwc3_omap]
[   10.467021]  dwc3_omap_probe [dwc3_omap] from platform_probe+0x58/0xa8
[   10.481762]  platform_probe from really_probe+0x168/0x2fc
[   10.481782]  really_probe from __driver_probe_device+0xc4/0xd8
[   10.481782]  __driver_probe_device from driver_probe_device+0x24/0xa4
[   10.503762]  driver_probe_device from __driver_attach+0xc4/0xd8
[   10.510018]  __driver_attach from bus_for_each_dev+0x64/0xa0
[   10.516001]  bus_for_each_dev from bus_add_driver+0x148/0x1a4
[   10.524880]  bus_add_driver from driver_register+0xb4/0xf8
[   10.530678]  driver_register from do_one_initcall+0x90/0x1c4
[   10.536661]  do_one_initcall from do_init_module+0x4c/0x200
[   10.536683]  do_init_module from load_module+0x13dc/0x1910
[   10.551159]  load_module from sys_finit_module+0xc8/0xd8
[   10.561319]  sys_finit_module from __sys_trace_return+0x0/0x18
[   10.561336] Exception stack(0xc344bfa8 to 0xc344bff0)
[   10.561341] bfa0:                   b6fb5778 b6fab8d8 00000007 b6ecfbb8 00000000 b6ed0398
[   10.561341] bfc0: b6fb5778 b6fab8d8 855c0500 0000017b 00020000 b6f9a3cc 00000000 b6fb5778
[   10.595500] bfe0: bede18f8 bede18e8 b6ec9aeb b6dda1c2
[   10.601345] ---[ end trace 0000000000000000 ]---

Fix this unnecessary warning by checking if the regulator is enabled.

Signed-off-by: H. Nikolaus Schaller <hns@goldelico.com>
Link: https://lore.kernel.org/r/af3b750dc2265d875deaabcf5f80098c9645da45.1646744616.git.hns@goldelico.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/usb/dwc3/dwc3-omap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/usb/dwc3/dwc3-omap.c b/drivers/usb/dwc3/dwc3-omap.c
index e196673f5c64..efaf0db595f4 100644
--- a/drivers/usb/dwc3/dwc3-omap.c
+++ b/drivers/usb/dwc3/dwc3-omap.c
@@ -242,7 +242,7 @@ static void dwc3_omap_set_mailbox(struct dwc3_omap *omap,
 		break;
 
 	case OMAP_DWC3_ID_FLOAT:
-		if (omap->vbus_reg)
+		if (omap->vbus_reg && regulator_is_enabled(omap->vbus_reg))
 			regulator_disable(omap->vbus_reg);
 		val = dwc3_omap_read_utmi_ctrl(omap);
 		val |= USBOTGSS_UTMI_OTG_CTRL_IDDIG;
-- 
Gitee


From 66ea27791e760aaa9d23ddba0b92fa76b550cf9d Mon Sep 17 00:00:00 2001
From: Max Filippov <jcmvbkbc@gmail.com>
Date: Thu, 14 Jul 2022 21:58:00 +0800
Subject: [PATCH 250/674] xtensa: fix DTC warning unit_address_format

stable inclusion
from stable-v5.10.111
commit 61e25021e67a0ffb34e56147590d5f1193d2eb0e
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=61e25021e67a0ffb34e56147590d5f1193d2eb0e

--------------------------------

[ Upstream commit e85d29ba4b24f68e7a78cb85c55e754362eeb2de ]

DTC issues the following warnings when building xtfpga device trees:

 /soc/flash@00000000/partition@0x0: unit name should not have leading "0x"
 /soc/flash@00000000/partition@0x6000000: unit name should not have leading "0x"
 /soc/flash@00000000/partition@0x6800000: unit name should not have leading "0x"
 /soc/flash@00000000/partition@0x7fe0000: unit name should not have leading "0x"

Drop leading 0x from flash partition unit names.

Signed-off-by: Max Filippov <jcmvbkbc@gmail.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 arch/xtensa/boot/dts/xtfpga-flash-128m.dtsi | 8 ++++----
 arch/xtensa/boot/dts/xtfpga-flash-16m.dtsi  | 8 ++++----
 arch/xtensa/boot/dts/xtfpga-flash-4m.dtsi   | 4 ++--
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/arch/xtensa/boot/dts/xtfpga-flash-128m.dtsi b/arch/xtensa/boot/dts/xtfpga-flash-128m.dtsi
index 9bf8bad1dd18..c33932568aa7 100644
--- a/arch/xtensa/boot/dts/xtfpga-flash-128m.dtsi
+++ b/arch/xtensa/boot/dts/xtfpga-flash-128m.dtsi
@@ -8,19 +8,19 @@ flash: flash@00000000 {
 			reg = <0x00000000 0x08000000>;
 			bank-width = <2>;
 			device-width = <2>;
-			partition@0x0 {
+			partition@0 {
 				label = "data";
 				reg = <0x00000000 0x06000000>;
 			};
-			partition@0x6000000 {
+			partition@6000000 {
 				label = "boot loader area";
 				reg = <0x06000000 0x00800000>;
 			};
-			partition@0x6800000 {
+			partition@6800000 {
 				label = "kernel image";
 				reg = <0x06800000 0x017e0000>;
 			};
-			partition@0x7fe0000 {
+			partition@7fe0000 {
 				label = "boot environment";
 				reg = <0x07fe0000 0x00020000>;
 			};
diff --git a/arch/xtensa/boot/dts/xtfpga-flash-16m.dtsi b/arch/xtensa/boot/dts/xtfpga-flash-16m.dtsi
index 40c2f81f7cb6..7bde2ab2d6fb 100644
--- a/arch/xtensa/boot/dts/xtfpga-flash-16m.dtsi
+++ b/arch/xtensa/boot/dts/xtfpga-flash-16m.dtsi
@@ -8,19 +8,19 @@ flash: flash@08000000 {
 			reg = <0x08000000 0x01000000>;
 			bank-width = <2>;
 			device-width = <2>;
-			partition@0x0 {
+			partition@0 {
 				label = "boot loader area";
 				reg = <0x00000000 0x00400000>;
 			};
-			partition@0x400000 {
+			partition@400000 {
 				label = "kernel image";
 				reg = <0x00400000 0x00600000>;
 			};
-			partition@0xa00000 {
+			partition@a00000 {
 				label = "data";
 				reg = <0x00a00000 0x005e0000>;
 			};
-			partition@0xfe0000 {
+			partition@fe0000 {
 				label = "boot environment";
 				reg = <0x00fe0000 0x00020000>;
 			};
diff --git a/arch/xtensa/boot/dts/xtfpga-flash-4m.dtsi b/arch/xtensa/boot/dts/xtfpga-flash-4m.dtsi
index fb8d3a9f33c2..0655b868749a 100644
--- a/arch/xtensa/boot/dts/xtfpga-flash-4m.dtsi
+++ b/arch/xtensa/boot/dts/xtfpga-flash-4m.dtsi
@@ -8,11 +8,11 @@ flash: flash@08000000 {
 			reg = <0x08000000 0x00400000>;
 			bank-width = <2>;
 			device-width = <2>;
-			partition@0x0 {
+			partition@0 {
 				label = "boot loader area";
 				reg = <0x00000000 0x003f0000>;
 			};
-			partition@0x3f0000 {
+			partition@3f0000 {
 				label = "boot environment";
 				reg = <0x003f0000 0x00010000>;
 			};
-- 
Gitee


From bdb5b2272a8b6e6ee859ee66859761f7aa171ec1 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Date: Thu, 14 Jul 2022 21:58:01 +0800
Subject: [PATCH 251/674] MIPS: ingenic: correct unit node address

stable inclusion
from stable-v5.10.111
commit 789621df196313e0b67ec1c96dbcb02bb77d35c8
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=789621df196313e0b67ec1c96dbcb02bb77d35c8

--------------------------------

[ Upstream commit 8931ddd8d6a55fcefb20f44a38ba42bb746f0b62 ]

Unit node addresses should not have leading 0x:

  Warning (unit_address_format): /nemc@13410000/efuse@d0/eth-mac-addr@0x22: unit name should not have leading "0x"

Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Reviewed-by: Paul Cercueil <paul@crapouillou.net>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 arch/mips/boot/dts/ingenic/jz4780.dtsi | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/mips/boot/dts/ingenic/jz4780.dtsi b/arch/mips/boot/dts/ingenic/jz4780.dtsi
index dfb5a7e1bb21..830e5dd3550e 100644
--- a/arch/mips/boot/dts/ingenic/jz4780.dtsi
+++ b/arch/mips/boot/dts/ingenic/jz4780.dtsi
@@ -429,7 +429,7 @@ efuse: efuse@d0 {
 			#address-cells = <1>;
 			#size-cells = <1>;
 
-			eth0_addr: eth-mac-addr@0x22 {
+			eth0_addr: eth-mac-addr@22 {
 				reg = <0x22 0x6>;
 			};
 		};
-- 
Gitee


From 5a11d67ebcc2f039779d277218b2d1dbb5d03e32 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Thu, 14 Jul 2022 21:58:02 +0800
Subject: [PATCH 252/674] Bluetooth: Fix use after free in hci_send_acl
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

stable inclusion
from stable-v5.10.111
commit 2cc803804ec9a296b3156855d6c8c4ca1c6b84be
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=2cc803804ec9a296b3156855d6c8c4ca1c6b84be

--------------------------------

[ Upstream commit f63d24baff787e13b723d86fe036f84bdbc35045 ]

This fixes the following trace caused by receiving
HCI_EV_DISCONN_PHY_LINK_COMPLETE which does call hci_conn_del without
first checking if conn->type is in fact AMP_LINK and in case it is
do properly cleanup upper layers with hci_disconn_cfm:

 ==================================================================
    BUG: KASAN: use-after-free in hci_send_acl+0xaba/0xc50
    Read of size 8 at addr ffff88800e404818 by task bluetoothd/142

    CPU: 0 PID: 142 Comm: bluetoothd Not tainted
    5.17.0-rc5-00006-gda4022eeac1a #7
    Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
    rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
    Call Trace:
     <TASK>
     dump_stack_lvl+0x45/0x59
     print_address_description.constprop.0+0x1f/0x150
     kasan_report.cold+0x7f/0x11b
     hci_send_acl+0xaba/0xc50
     l2cap_do_send+0x23f/0x3d0
     l2cap_chan_send+0xc06/0x2cc0
     l2cap_sock_sendmsg+0x201/0x2b0
     sock_sendmsg+0xdc/0x110
     sock_write_iter+0x20f/0x370
     do_iter_readv_writev+0x343/0x690
     do_iter_write+0x132/0x640
     vfs_writev+0x198/0x570
     do_writev+0x202/0x280
     do_syscall_64+0x38/0x90
     entry_SYSCALL_64_after_hwframe+0x44/0xae
    RSP: 002b:00007ffce8a099b8 EFLAGS: 00000246 ORIG_RAX: 0000000000000014
    Code: 0f 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b8 0f 1f 00 f3
    0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 14 00 00 00 0f 05
    <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 89 54 24 1c 48 89 74 24 10
    RDX: 0000000000000001 RSI: 00007ffce8a099e0 RDI: 0000000000000015
    RAX: ffffffffffffffda RBX: 00007ffce8a099e0 RCX: 00007f788fc3cf77
    R10: 00007ffce8af7080 R11: 0000000000000246 R12: 000055e4ccf75580
    RBP: 0000000000000015 R08: 0000000000000002 R09: 0000000000000001
    </TASK>
    R13: 000055e4ccf754a0 R14: 000055e4ccf75cd0 R15: 000055e4ccf4a6b0

    Allocated by task 45:
        kasan_save_stack+0x1e/0x40
        __kasan_kmalloc+0x81/0xa0
        hci_chan_create+0x9a/0x2f0
        l2cap_conn_add.part.0+0x1a/0xdc0
        l2cap_connect_cfm+0x236/0x1000
        le_conn_complete_evt+0x15a7/0x1db0
        hci_le_conn_complete_evt+0x226/0x2c0
        hci_le_meta_evt+0x247/0x450
        hci_event_packet+0x61b/0xe90
        hci_rx_work+0x4d5/0xc50
        process_one_work+0x8fb/0x15a0
        worker_thread+0x576/0x1240
        kthread+0x29d/0x340
        ret_from_fork+0x1f/0x30

    Freed by task 45:
        kasan_save_stack+0x1e/0x40
        kasan_set_track+0x21/0x30
        kasan_set_free_info+0x20/0x30
        __kasan_slab_free+0xfb/0x130
        kfree+0xac/0x350
        hci_conn_cleanup+0x101/0x6a0
        hci_conn_del+0x27e/0x6c0
        hci_disconn_phylink_complete_evt+0xe0/0x120
        hci_event_packet+0x812/0xe90
        hci_rx_work+0x4d5/0xc50
        process_one_work+0x8fb/0x15a0
        worker_thread+0x576/0x1240
        kthread+0x29d/0x340
        ret_from_fork+0x1f/0x30

    The buggy address belongs to the object at ffff88800c0f0500
    The buggy address is located 24 bytes inside of
    which belongs to the cache kmalloc-128 of size 128
    The buggy address belongs to the page:
    128-byte region [ffff88800c0f0500, ffff88800c0f0580)
    flags: 0x100000000000200(slab|node=0|zone=1)
    page:00000000fe45cd86 refcount:1 mapcount:0
    mapping:0000000000000000 index:0x0 pfn:0xc0f0
    raw: 0000000000000000 0000000080100010 00000001ffffffff
    0000000000000000
    raw: 0100000000000200 ffffea00003a2c80 dead000000000004
    ffff8880078418c0
    page dumped because: kasan: bad access detected
    ffff88800c0f0400: 00 00 00 00 00 00 00 00 00 00 00 00 00 fc fc fc
    Memory state around the buggy address:
    >ffff88800c0f0500: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
    ffff88800c0f0480: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
    ffff88800c0f0580: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
                                ^
    ==================================================================
    ffff88800c0f0600: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb

Reported-by: Sönke Huster <soenke.huster@eknoes.de>
Tested-by: Sönke Huster <soenke.huster@eknoes.de>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 net/bluetooth/hci_event.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 72b4127360c7..e926e80d9731 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -5061,8 +5061,9 @@ static void hci_disconn_phylink_complete_evt(struct hci_dev *hdev,
 	hci_dev_lock(hdev);
 
 	hcon = hci_conn_hash_lookup_handle(hdev, ev->phy_handle);
-	if (hcon) {
+	if (hcon && hcon->type == AMP_LINK) {
 		hcon->state = BT_CLOSED;
+		hci_disconn_cfm(hcon, ev->reason);
 		hci_conn_del(hcon);
 	}
 
-- 
Gitee


From 066e7ecd0ac543cba8299d889b39801aea45a3f1 Mon Sep 17 00:00:00 2001
From: Wang Yufen <wangyufen@huawei.com>
Date: Thu, 14 Jul 2022 21:58:03 +0800
Subject: [PATCH 253/674] netlabel: fix out-of-bounds memory accesses

stable inclusion
from stable-v5.10.111
commit d745512d54fd79d58e227f6f583b84e6111204a8
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=d745512d54fd79d58e227f6f583b84e6111204a8

--------------------------------

[ Upstream commit f22881de730ebd472e15bcc2c0d1d46e36a87b9c ]

In calipso_map_cat_ntoh(), in the for loop, if the return value of
netlbl_bitmap_walk() is equal to (net_clen_bits - 1), when
netlbl_bitmap_walk() is called next time, out-of-bounds memory accesses
of bitmap[byte_offset] occurs.

The bug was found during fuzzing. The following is the fuzzing report
 BUG: KASAN: slab-out-of-bounds in netlbl_bitmap_walk+0x3c/0xd0
 Read of size 1 at addr ffffff8107bf6f70 by task err_OH/252

 CPU: 7 PID: 252 Comm: err_OH Not tainted 5.17.0-rc7+ #17
 Hardware name: linux,dummy-virt (DT)
 Call trace:
  dump_backtrace+0x21c/0x230
  show_stack+0x1c/0x60
  dump_stack_lvl+0x64/0x7c
  print_address_description.constprop.0+0x70/0x2d0
  __kasan_report+0x158/0x16c
  kasan_report+0x74/0x120
  __asan_load1+0x80/0xa0
  netlbl_bitmap_walk+0x3c/0xd0
  calipso_opt_getattr+0x1a8/0x230
  calipso_sock_getattr+0x218/0x340
  calipso_sock_getattr+0x44/0x60
  netlbl_sock_getattr+0x44/0x80
  selinux_netlbl_socket_setsockopt+0x138/0x170
  selinux_socket_setsockopt+0x4c/0x60
  security_socket_setsockopt+0x4c/0x90
  __sys_setsockopt+0xbc/0x2b0
  __arm64_sys_setsockopt+0x6c/0x84
  invoke_syscall+0x64/0x190
  el0_svc_common.constprop.0+0x88/0x200
  do_el0_svc+0x88/0xa0
  el0_svc+0x128/0x1b0
  el0t_64_sync_handler+0x9c/0x120
  el0t_64_sync+0x16c/0x170

Reported-by: Hulk Robot <hulkci@huawei.com>
Signed-off-by: Wang Yufen <wangyufen@huawei.com>
Acked-by: Paul Moore <paul@paul-moore.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 net/netlabel/netlabel_kapi.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c
index 5e1239cef000..91b35b7c80d8 100644
--- a/net/netlabel/netlabel_kapi.c
+++ b/net/netlabel/netlabel_kapi.c
@@ -885,6 +885,8 @@ int netlbl_bitmap_walk(const unsigned char *bitmap, u32 bitmap_len,
 	unsigned char bitmask;
 	unsigned char byte;
 
+	if (offset >= bitmap_len)
+		return -1;
 	byte_offset = offset / 8;
 	byte = bitmap[byte_offset];
 	bit_spot = offset;
-- 
Gitee


From 58766443dd921a435c707a59129fbe75dfeecb20 Mon Sep 17 00:00:00 2001
From: Xiubo Li <xiubli@redhat.com>
Date: Thu, 14 Jul 2022 21:58:04 +0800
Subject: [PATCH 254/674] ceph: fix memory leak in ceph_readdir when
 note_last_dentry returns error

stable inclusion
from stable-v5.10.111
commit f4429786129648a8f4bb1e5faa143c4478cc5c4a
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f4429786129648a8f4bb1e5faa143c4478cc5c4a

--------------------------------

[ Upstream commit f639d9867eea647005dc824e0e24f39ffc50d4e4 ]

Reset the last_readdir at the same time, and add a comment explaining
why we don't free last_readdir when dir_emit returns false.

Signed-off-by: Xiubo Li <xiubli@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 fs/ceph/dir.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index f63c1a090139..1fddb9cd3e88 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -478,8 +478,11 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
 					2 : (fpos_off(rde->offset) + 1);
 			err = note_last_dentry(dfi, rde->name, rde->name_len,
 					       next_offset);
-			if (err)
+			if (err) {
+				ceph_mdsc_put_request(dfi->last_readdir);
+				dfi->last_readdir = NULL;
 				return err;
+			}
 		} else if (req->r_reply_info.dir_end) {
 			dfi->next_offset = 2;
 			/* keep last name */
@@ -520,6 +523,12 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
 		if (!dir_emit(ctx, rde->name, rde->name_len,
 			      ceph_present_ino(inode->i_sb, le64_to_cpu(rde->inode.in->ino)),
 			      le32_to_cpu(rde->inode.in->mode) >> 12)) {
+			/*
+			 * NOTE: Here no need to put the 'dfi->last_readdir',
+			 * because when dir_emit stops us it's most likely
+			 * doesn't have enough memory, etc. So for next readdir
+			 * it will continue.
+			 */
 			dout("filldir stopping us...\n");
 			return 0;
 		}
-- 
Gitee


From ca9570a8e3d291fabc8687f8ea0e8fe78c920a53 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Thu, 14 Jul 2022 21:58:05 +0800
Subject: [PATCH 255/674] init/main.c: return 1 from handled __setup()
 functions

stable inclusion
from stable-v5.10.111
commit 8d9efd4434e3479ea624b8e65e57e39a3ea91126
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=8d9efd4434e3479ea624b8e65e57e39a3ea91126

--------------------------------

[ Upstream commit f9a40b0890658330c83c95511f9d6b396610defc ]

initcall_blacklist() should return 1 to indicate that it handled its
cmdline arguments.

set_debug_rodata() should return 1 to indicate that it handled its
cmdline arguments.  Print a warning if the option string is invalid.

This prevents these strings from being added to the 'init' program's
environment as they are not init arguments/parameters.

Link: https://lkml.kernel.org/r/20220221050901.23985-1-rdunlap@infradead.org
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Reported-by: Igor Zhbanov <i.zhbanov@omprussia.ru>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 init/main.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/init/main.c b/init/main.c
index 11c96633c3f7..41a9ce782acc 100644
--- a/init/main.c
+++ b/init/main.c
@@ -1110,7 +1110,7 @@ static int __init initcall_blacklist(char *str)
 		}
 	} while (str_entry);
 
-	return 0;
+	return 1;
 }
 
 static bool __init_or_module initcall_blacklisted(initcall_t fn)
@@ -1373,7 +1373,9 @@ static noinline void __init kernel_init_freeable(void);
 bool rodata_enabled __ro_after_init = true;
 static int __init set_debug_rodata(char *str)
 {
-	return strtobool(str, &rodata_enabled);
+	if (strtobool(str, &rodata_enabled))
+		pr_warn("Invalid option string for rodata: '%s'\n", str);
+	return 1;
 }
 __setup("rodata=", set_debug_rodata);
 #endif
-- 
Gitee


From 0c134c80d758766c4aa1de45cd00dfcfa508b0ef Mon Sep 17 00:00:00 2001
From: Qinghua Jin <qhjin.dev@gmail.com>
Date: Thu, 14 Jul 2022 21:58:06 +0800
Subject: [PATCH 256/674] minix: fix bug when opening a file with O_DIRECT

stable inclusion
from stable-v5.10.111
commit c9cf6baabf780b60fcfd0fe8357f053184bb876e
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=c9cf6baabf780b60fcfd0fe8357f053184bb876e

--------------------------------

[ Upstream commit 9ce3c0d26c42d279b6c378a03cd6a61d828f19ca ]

Testcase:
1. create a minix file system and mount it
2. open a file on the file system with O_RDWR|O_CREAT|O_TRUNC|O_DIRECT
3. open fails with -EINVAL but leaves an empty file behind. All other
   open() failures don't leave the failed open files behind.

It is hard to check the direct_IO op before creating the inode.  Just as
ext4 and btrfs do, this patch will resolve the issue by allowing to
create the file with O_DIRECT but returning error when writing the file.

Link: https://lkml.kernel.org/r/20220107133626.413379-1-qhjin.dev@gmail.com
Signed-off-by: Qinghua Jin <qhjin.dev@gmail.com>
Reported-by: Colin Ian King <colin.king@intel.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Acked-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 fs/minix/inode.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 34f546404aa1..e938f5b1e4b9 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -446,7 +446,8 @@ static const struct address_space_operations minix_aops = {
 	.writepage = minix_writepage,
 	.write_begin = minix_write_begin,
 	.write_end = generic_write_end,
-	.bmap = minix_bmap
+	.bmap = minix_bmap,
+	.direct_IO = noop_direct_IO
 };
 
 static const struct inode_operations minix_symlink_inode_operations = {
-- 
Gitee


From e74ffd21806dfdf0a79c961786b7380a0a59673c Mon Sep 17 00:00:00 2001
From: Adam Wujek <dev_public@wujek.eu>
Date: Thu, 14 Jul 2022 21:58:07 +0800
Subject: [PATCH 257/674] clk: si5341: fix reported clk_rate when output
 divider is 2

stable inclusion
from stable-v5.10.111
commit be4ecca95819175c4a5f6f8516edeb7be8711374
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=be4ecca95819175c4a5f6f8516edeb7be8711374

--------------------------------

[ Upstream commit 2a8b539433e111c4de364237627ef219d2f6350a ]

SI5341_OUT_CFG_RDIV_FORCE2 shall be checked first to distinguish whether
a divider for a given output is set to 2 (SI5341_OUT_CFG_RDIV_FORCE2
is set) or the output is disabled (SI5341_OUT_CFG_RDIV_FORCE2 not set,
SI5341_OUT_R_REG is set 0).
Before the change, divider set to 2 (SI5341_OUT_R_REG set to 0) was
interpreted as output is disabled.

Signed-off-by: Adam Wujek <dev_public@wujek.eu>
Link: https://lore.kernel.org/r/20211203141125.2447520-1-dev_public@wujek.eu
Reviewed-by: Robert Hancock <robert.hancock@calian.com>
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/clk/clk-si5341.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/drivers/clk/clk-si5341.c b/drivers/clk/clk-si5341.c
index 772b48ad0cd7..382a0619a048 100644
--- a/drivers/clk/clk-si5341.c
+++ b/drivers/clk/clk-si5341.c
@@ -789,6 +789,15 @@ static unsigned long si5341_output_clk_recalc_rate(struct clk_hw *hw,
 	u32 r_divider;
 	u8 r[3];
 
+	err = regmap_read(output->data->regmap,
+			SI5341_OUT_CONFIG(output), &val);
+	if (err < 0)
+		return err;
+
+	/* If SI5341_OUT_CFG_RDIV_FORCE2 is set, r_divider is 2 */
+	if (val & SI5341_OUT_CFG_RDIV_FORCE2)
+		return parent_rate / 2;
+
 	err = regmap_bulk_read(output->data->regmap,
 			SI5341_OUT_R_REG(output), r, 3);
 	if (err < 0)
@@ -805,13 +814,6 @@ static unsigned long si5341_output_clk_recalc_rate(struct clk_hw *hw,
 	r_divider += 1;
 	r_divider <<= 1;
 
-	err = regmap_read(output->data->regmap,
-			SI5341_OUT_CONFIG(output), &val);
-	if (err < 0)
-		return err;
-
-	if (val & SI5341_OUT_CFG_RDIV_FORCE2)
-		r_divider = 2;
 
 	return parent_rate / r_divider;
 }
-- 
Gitee


From 7c02091ee1d74add6622c1b4e323d81fe073d88e Mon Sep 17 00:00:00 2001
From: Stefan Wahren <stefan.wahren@i2se.com>
Date: Thu, 14 Jul 2022 21:58:08 +0800
Subject: [PATCH 258/674] staging: vchiq_core: handle NULL result of
 find_service_by_handle

stable inclusion
from stable-v5.10.111
commit aa0b7296785312a4bfa8fac0ba8ad78698fd9fcf
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=aa0b7296785312a4bfa8fac0ba8ad78698fd9fcf

--------------------------------

[ Upstream commit ca225857faf237234d2fffe5d1919467dfadd822 ]

In case of an invalid handle the function find_servive_by_handle
returns NULL. So take care of this and avoid a NULL pointer dereference.

Reviewed-by: Nicolas Saenz Julienne <nsaenz@kernel.org>
Signed-off-by: Stefan Wahren <stefan.wahren@i2se.com>
Link: https://lore.kernel.org/r/1642968143-19281-18-git-send-email-stefan.wahren@i2se.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 .../staging/vc04_services/interface/vchiq_arm/vchiq_core.c  | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_core.c b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_core.c
index 38b10fd5d992..95b91fe45cb3 100644
--- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_core.c
+++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_core.c
@@ -2280,6 +2280,9 @@ void vchiq_msg_queue_push(unsigned int handle, struct vchiq_header *header)
 	struct vchiq_service *service = find_service_by_handle(handle);
 	int pos;
 
+	if (!service)
+		return;
+
 	while (service->msg_queue_write == service->msg_queue_read +
 		VCHIQ_MAX_SLOTS) {
 		if (wait_for_completion_interruptible(&service->msg_queue_pop))
@@ -2299,6 +2302,9 @@ struct vchiq_header *vchiq_msg_hold(unsigned int handle)
 	struct vchiq_header *header;
 	int pos;
 
+	if (!service)
+		return NULL;
+
 	if (service->msg_queue_write == service->msg_queue_read)
 		return NULL;
 
-- 
Gitee


From ff34587608dcb43ed859a3889b9fa57d5602165f Mon Sep 17 00:00:00 2001
From: Amjad Ouled-Ameur <aouledameur@baylibre.com>
Date: Thu, 14 Jul 2022 21:58:09 +0800
Subject: [PATCH 259/674] phy: amlogic: meson8b-usb2: Use dev_err_probe()

stable inclusion
from stable-v5.10.111
commit 8f1d24f85ffd0dd75b4220bc05693b90fc8cb59c
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=8f1d24f85ffd0dd75b4220bc05693b90fc8cb59c

--------------------------------

[ Upstream commit 6466ba1898d415b527e1013bd8551a6fdfece94c ]

Use the existing dev_err_probe() helper instead of open-coding the same
operation.

Signed-off-by: Amjad Ouled-Ameur <aouledameur@baylibre.com>
Reported-by: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
Reviewed-by: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
Acked-by: Neil Armstrong <narmstrong@baylibre.com>
Link: https://lore.kernel.org/r/20220111095255.176141-3-aouledameur@baylibre.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/phy/amlogic/phy-meson8b-usb2.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/phy/amlogic/phy-meson8b-usb2.c b/drivers/phy/amlogic/phy-meson8b-usb2.c
index 03c061dd5f0d..8f40b9342a97 100644
--- a/drivers/phy/amlogic/phy-meson8b-usb2.c
+++ b/drivers/phy/amlogic/phy-meson8b-usb2.c
@@ -261,8 +261,9 @@ static int phy_meson8b_usb2_probe(struct platform_device *pdev)
 		return PTR_ERR(priv->clk_usb);
 
 	priv->reset = devm_reset_control_get_optional_shared(&pdev->dev, NULL);
-	if (PTR_ERR(priv->reset) == -EPROBE_DEFER)
-		return PTR_ERR(priv->reset);
+	if (IS_ERR(priv->reset))
+		return dev_err_probe(&pdev->dev, PTR_ERR(priv->reset),
+				     "Failed to get the reset line");
 
 	priv->dr_mode = of_usb_get_dr_mode_by_phy(pdev->dev.of_node, -1);
 	if (priv->dr_mode == USB_DR_MODE_UNKNOWN) {
-- 
Gitee


From 3d725385b53a2630886a4ae4476fd82d7ae695a8 Mon Sep 17 00:00:00 2001
From: Xiaoke Wang <xkernel.wang@foxmail.com>
Date: Thu, 14 Jul 2022 21:58:10 +0800
Subject: [PATCH 260/674] staging: wfx: fix an error handling in
 wfx_init_common()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

stable inclusion
from stable-v5.10.111
commit 93498c6e775ae91732a8109dba1bdcd324908f84
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=93498c6e775ae91732a8109dba1bdcd324908f84

--------------------------------

[ Upstream commit 60f1d3c92dc1ef1026e5b917a329a7fa947da036 ]

One error handler of wfx_init_common() return without calling
ieee80211_free_hw(hw), which may result in memory leak. And I add
one err label to unify the error handler, which is useful for the
subsequent changes.

Suggested-by: Jérôme Pouiller <jerome.pouiller@silabs.com>
Reviewed-by: Dan Carpenter <dan.carpenter@oracle.com>
Reviewed-by: Jérôme Pouiller <jerome.pouiller@silabs.com>
Signed-off-by: Xiaoke Wang <xkernel.wang@foxmail.com>
Link: https://lore.kernel.org/r/tencent_24A24A3EFF61206ECCC4B94B1C5C1454E108@qq.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/staging/wfx/main.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/staging/wfx/main.c b/drivers/staging/wfx/main.c
index e7bc1988124a..d5dacd5583c6 100644
--- a/drivers/staging/wfx/main.c
+++ b/drivers/staging/wfx/main.c
@@ -309,7 +309,8 @@ struct wfx_dev *wfx_init_common(struct device *dev,
 	wdev->pdata.gpio_wakeup = devm_gpiod_get_optional(dev, "wakeup",
 							  GPIOD_OUT_LOW);
 	if (IS_ERR(wdev->pdata.gpio_wakeup))
-		return NULL;
+		goto err;
+
 	if (wdev->pdata.gpio_wakeup)
 		gpiod_set_consumer_name(wdev->pdata.gpio_wakeup, "wfx wakeup");
 
@@ -328,6 +329,10 @@ struct wfx_dev *wfx_init_common(struct device *dev,
 		return NULL;
 
 	return wdev;
+
+err:
+	ieee80211_free_hw(hw);
+	return NULL;
 }
 
 int wfx_probe(struct wfx_dev *wdev)
-- 
Gitee


From 317afd94832fb99d1beb89dcb654f1b2154b30c3 Mon Sep 17 00:00:00 2001
From: Lucas Denefle <lucas.denefle@converge.io>
Date: Thu, 14 Jul 2022 21:58:11 +0800
Subject: [PATCH 261/674] w1: w1_therm: fixes w1_seq for ds28ea00 sensors

stable inclusion
from stable-v5.10.111
commit 2e16895d06e6e9a0057ecb93f03531c13cc8d595
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=2e16895d06e6e9a0057ecb93f03531c13cc8d595

--------------------------------

[ Upstream commit 41a92a89eee819298f805c40187ad8b02bb53426 ]

w1_seq was failing due to several devices responding to the
CHAIN_DONE at the same time. Now properly selects the current
device in the chain with MATCH_ROM. Also acknowledgment was
read twice.

Signed-off-by: Lucas Denefle <lucas.denefle@converge.io>
Link: https://lore.kernel.org/r/20220223113558.232750-1-lucas.denefle@converge.io
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/w1/slaves/w1_therm.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/w1/slaves/w1_therm.c b/drivers/w1/slaves/w1_therm.c
index 974d02bb3a45..6546d029c7fd 100644
--- a/drivers/w1/slaves/w1_therm.c
+++ b/drivers/w1/slaves/w1_therm.c
@@ -2092,16 +2092,20 @@ static ssize_t w1_seq_show(struct device *device,
 		if (sl->reg_num.id == reg_num->id)
 			seq = i;
 
+		if (w1_reset_bus(sl->master))
+			goto error;
+
+		/* Put the device into chain DONE state */
+		w1_write_8(sl->master, W1_MATCH_ROM);
+		w1_write_block(sl->master, (u8 *)&rn, 8);
 		w1_write_8(sl->master, W1_42_CHAIN);
 		w1_write_8(sl->master, W1_42_CHAIN_DONE);
 		w1_write_8(sl->master, W1_42_CHAIN_DONE_INV);
-		w1_read_block(sl->master, &ack, sizeof(ack));
 
 		/* check for acknowledgment */
 		ack = w1_read_8(sl->master);
 		if (ack != W1_42_SUCCESS_CONFIRM_BYTE)
 			goto error;
-
 	}
 
 	/* Exit from CHAIN state */
-- 
Gitee


From a6d9fbda9f5f5f54277f445a72a2fe53374f35c0 Mon Sep 17 00:00:00 2001
From: Xin Xiong <xiongx18@fudan.edu.cn>
Date: Thu, 14 Jul 2022 21:58:12 +0800
Subject: [PATCH 262/674] NFSv4.2: fix reference count leaks in
 _nfs42_proc_copy_notify()

stable inclusion
from stable-v5.10.111
commit 9b9feec97c1fc7dd9bb69f62c4905cddf1801599
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=9b9feec97c1fc7dd9bb69f62c4905cddf1801599

--------------------------------

[ Upstream commit b7f114edd54326f730a754547e7cfb197b5bc132 ]

[You don't often get email from xiongx18@fudan.edu.cn. Learn why this is important at http://aka.ms/LearnAboutSenderIdentification.]

The reference counting issue happens in two error paths in the
function _nfs42_proc_copy_notify(). In both error paths, the function
simply returns the error code and forgets to balance the refcount of
object `ctx`, bumped by get_nfs_open_context() earlier, which may
cause refcount leaks.

Fix it by balancing refcount of the `ctx` object before the function
returns in both error paths.

Signed-off-by: Xin Xiong <xiongx18@fudan.edu.cn>
Signed-off-by: Xiyu Yang <xiyuyang19@fudan.edu.cn>
Signed-off-by: Xin Tan <tanxin.ctf@gmail.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 fs/nfs/nfs42proc.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 2587b1b8e2ef..dad32b171e67 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -567,8 +567,10 @@ static int _nfs42_proc_copy_notify(struct file *src, struct file *dst,
 
 	ctx = get_nfs_open_context(nfs_file_open_context(src));
 	l_ctx = nfs_get_lock_context(ctx);
-	if (IS_ERR(l_ctx))
-		return PTR_ERR(l_ctx);
+	if (IS_ERR(l_ctx)) {
+		status = PTR_ERR(l_ctx);
+		goto out;
+	}
 
 	status = nfs4_set_rw_stateid(&args->cna_src_stateid, ctx, l_ctx,
 				     FMODE_READ);
@@ -576,7 +578,7 @@ static int _nfs42_proc_copy_notify(struct file *src, struct file *dst,
 	if (status) {
 		if (status == -EAGAIN)
 			status = -NFS4ERR_BAD_STATEID;
-		return status;
+		goto out;
 	}
 
 	status = nfs4_call_sync(src_server->client, src_server, &msg,
@@ -584,6 +586,7 @@ static int _nfs42_proc_copy_notify(struct file *src, struct file *dst,
 	if (status == -ENOTSUPP)
 		src_server->caps &= ~NFS_CAP_COPY_NOTIFY;
 
+out:
 	put_nfs_open_context(nfs_file_open_context(src));
 	return status;
 }
-- 
Gitee


From f86e249e18079b72aa7414b102c43919f4d44a73 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Thu, 14 Jul 2022 21:58:13 +0800
Subject: [PATCH 263/674] NFSv4: Protect the state recovery thread against
 direct reclaim

stable inclusion
from stable-v5.10.111
commit 4a2544ce244bd27825a940824d1084505188695a
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=4a2544ce244bd27825a940824d1084505188695a

--------------------------------

[ Upstream commit 3e17898aca293a24dae757a440a50aa63ca29671 ]

If memory allocation triggers a direct reclaim from the state recovery
thread, then we can deadlock. Use memalloc_nofs_save/restore to ensure
that doesn't happen.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 fs/nfs/nfs4state.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index cbeec29e9f21..a8fe8f84c5ae 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -49,6 +49,7 @@
 #include <linux/workqueue.h>
 #include <linux/bitops.h>
 #include <linux/jiffies.h>
+#include <linux/sched/mm.h>
 
 #include <linux/sunrpc/clnt.h>
 
@@ -2557,9 +2558,17 @@ static void nfs4_layoutreturn_any_run(struct nfs_client *clp)
 
 static void nfs4_state_manager(struct nfs_client *clp)
 {
+	unsigned int memflags;
 	int status = 0;
 	const char *section = "", *section_sep = "";
 
+	/*
+	 * State recovery can deadlock if the direct reclaim code tries
+	 * start NFS writeback. So ensure memory allocations are all
+	 * GFP_NOFS.
+	 */
+	memflags = memalloc_nofs_save();
+
 	/* Ensure exclusive access to NFSv4 state */
 	do {
 		trace_nfs4_state_mgr(clp);
@@ -2654,6 +2663,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
 			clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state);
 		}
 
+		memalloc_nofs_restore(memflags);
 		nfs4_end_drain_session(clp);
 		nfs4_clear_state_manager_bit(clp);
 
@@ -2671,6 +2681,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
 			return;
 		if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0)
 			return;
+		memflags = memalloc_nofs_save();
 	} while (refcount_read(&clp->cl_count) > 1 && !signalled());
 	goto out_drain;
 
@@ -2683,6 +2694,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
 			clp->cl_hostname, -status);
 	ssleep(1);
 out_drain:
+	memalloc_nofs_restore(memflags);
 	nfs4_end_drain_session(clp);
 	nfs4_clear_state_manager_bit(clp);
 }
-- 
Gitee


From 39f755411208bdd8c0667901ca55b3ba48dff0f5 Mon Sep 17 00:00:00 2001
From: Dongli Zhang <dongli.zhang@oracle.com>
Date: Thu, 14 Jul 2022 21:58:14 +0800
Subject: [PATCH 264/674] xen: delay xen_hvm_init_time_ops() if kdump is boot
 on vcpu>=32

stable inclusion
from stable-v5.10.111
commit a2a0e04f6478e8c1038db64717f3fafd55de1420
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=a2a0e04f6478e8c1038db64717f3fafd55de1420

--------------------------------

[ Upstream commit eed05744322da07dd7e419432dcedf3c2e017179 ]

The sched_clock() can be used very early since commit 857baa87b642
("sched/clock: Enable sched clock early"). In addition, with commit
38669ba205d1 ("x86/xen/time: Output xen sched_clock time from 0"), kdump
kernel in Xen HVM guest may panic at very early stage when accessing
&__this_cpu_read(xen_vcpu)->time as in below:

setup_arch()
 -> init_hypervisor_platform()
     -> x86_init.hyper.init_platform = xen_hvm_guest_init()
         -> xen_hvm_init_time_ops()
             -> xen_clocksource_read()
                 -> src = &__this_cpu_read(xen_vcpu)->time;

This is because Xen HVM supports at most MAX_VIRT_CPUS=32 'vcpu_info'
embedded inside 'shared_info' during early stage until xen_vcpu_setup() is
used to allocate/relocate 'vcpu_info' for boot cpu at arbitrary address.

However, when Xen HVM guest panic on vcpu >= 32, since
xen_vcpu_info_reset(0) would set per_cpu(xen_vcpu, cpu) = NULL when
vcpu >= 32, xen_clocksource_read() on vcpu >= 32 would panic.

This patch calls xen_hvm_init_time_ops() again later in
xen_hvm_smp_prepare_boot_cpu() after the 'vcpu_info' for boot vcpu is
registered when the boot vcpu is >= 32.

This issue can be reproduced on purpose via below command at the guest
side when kdump/kexec is enabled:

"taskset -c 33 echo c > /proc/sysrq-trigger"

The bugfix for PVM is not implemented due to the lack of testing
environment.

[boris: xen_hvm_init_time_ops() returns on errors instead of jumping to end]

Cc: Joe Jin <joe.jin@oracle.com>
Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Link: https://lore.kernel.org/r/20220302164032.14569-3-dongli.zhang@oracle.com
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 arch/x86/xen/smp_hvm.c |  6 ++++++
 arch/x86/xen/time.c    | 24 +++++++++++++++++++++++-
 2 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/arch/x86/xen/smp_hvm.c b/arch/x86/xen/smp_hvm.c
index 6ff3c887e0b9..b70afdff419c 100644
--- a/arch/x86/xen/smp_hvm.c
+++ b/arch/x86/xen/smp_hvm.c
@@ -19,6 +19,12 @@ static void __init xen_hvm_smp_prepare_boot_cpu(void)
 	 */
 	xen_vcpu_setup(0);
 
+	/*
+	 * Called again in case the kernel boots on vcpu >= MAX_VIRT_CPUS.
+	 * Refer to comments in xen_hvm_init_time_ops().
+	 */
+	xen_hvm_init_time_ops();
+
 	/*
 	 * The alternative logic (which patches the unlock/lock) runs before
 	 * the smp bootup up code is activated. Hence we need to set this up
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 91f5b330dcc6..8183d17e1cf1 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -556,6 +556,11 @@ static void xen_hvm_setup_cpu_clockevents(void)
 
 void __init xen_hvm_init_time_ops(void)
 {
+	static bool hvm_time_initialized;
+
+	if (hvm_time_initialized)
+		return;
+
 	/*
 	 * vector callback is needed otherwise we cannot receive interrupts
 	 * on cpu > 0 and at this point we don't know how many cpus are
@@ -565,7 +570,22 @@ void __init xen_hvm_init_time_ops(void)
 		return;
 
 	if (!xen_feature(XENFEAT_hvm_safe_pvclock)) {
-		pr_info("Xen doesn't support pvclock on HVM, disable pv timer");
+		pr_info_once("Xen doesn't support pvclock on HVM, disable pv timer");
+		return;
+	}
+
+	/*
+	 * Only MAX_VIRT_CPUS 'vcpu_info' are embedded inside 'shared_info'.
+	 * The __this_cpu_read(xen_vcpu) is still NULL when Xen HVM guest
+	 * boots on vcpu >= MAX_VIRT_CPUS (e.g., kexec), To access
+	 * __this_cpu_read(xen_vcpu) via xen_clocksource_read() will panic.
+	 *
+	 * The xen_hvm_init_time_ops() should be called again later after
+	 * __this_cpu_read(xen_vcpu) is available.
+	 */
+	if (!__this_cpu_read(xen_vcpu)) {
+		pr_info("Delay xen_init_time_common() as kernel is running on vcpu=%d\n",
+			xen_vcpu_nr(0));
 		return;
 	}
 
@@ -577,6 +597,8 @@ void __init xen_hvm_init_time_ops(void)
 	x86_platform.calibrate_tsc = xen_tsc_khz;
 	x86_platform.get_wallclock = xen_get_wallclock;
 	x86_platform.set_wallclock = xen_set_wallclock;
+
+	hvm_time_initialized = true;
 }
 #endif
 
-- 
Gitee


From 97bb889524f242fa10776e7d0ce2df8997848cf2 Mon Sep 17 00:00:00 2001
From: Tony Lindgren <tony@atomide.com>
Date: Thu, 14 Jul 2022 21:58:15 +0800
Subject: [PATCH 265/674] clk: ti: Preserve node in ti_dt_clocks_register()

stable inclusion
from stable-v5.10.111
commit 1e9b5538cf160788c88fc2e3e74badc65b8a6bac
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=1e9b5538cf160788c88fc2e3e74badc65b8a6bac

--------------------------------

[ Upstream commit 80864594ff2ad002e2755daf97d46ff0c86faf1f ]

In preparation for making use of the clock-output-names, we want to
keep node around in ti_dt_clocks_register().

This change should not needed as a fix currently.

Signed-off-by: Tony Lindgren <tony@atomide.com>
Link: https://lore.kernel.org/r/20220204071449.16762-3-tony@atomide.com
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/clk/ti/clk.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/drivers/clk/ti/clk.c b/drivers/clk/ti/clk.c
index 3da33c786d77..29eafab4353e 100644
--- a/drivers/clk/ti/clk.c
+++ b/drivers/clk/ti/clk.c
@@ -131,7 +131,7 @@ int ti_clk_setup_ll_ops(struct ti_clk_ll_ops *ops)
 void __init ti_dt_clocks_register(struct ti_dt_clk oclks[])
 {
 	struct ti_dt_clk *c;
-	struct device_node *node, *parent;
+	struct device_node *node, *parent, *child;
 	struct clk *clk;
 	struct of_phandle_args clkspec;
 	char buf[64];
@@ -171,10 +171,13 @@ void __init ti_dt_clocks_register(struct ti_dt_clk oclks[])
 		node = of_find_node_by_name(NULL, buf);
 		if (num_args && compat_mode) {
 			parent = node;
-			node = of_get_child_by_name(parent, "clock");
-			if (!node)
-				node = of_get_child_by_name(parent, "clk");
-			of_node_put(parent);
+			child = of_get_child_by_name(parent, "clock");
+			if (!child)
+				child = of_get_child_by_name(parent, "clk");
+			if (child) {
+				of_node_put(parent);
+				node = child;
+			}
 		}
 
 		clkspec.np = node;
-- 
Gitee


From 8c27e3bcd939b56a2b323fbb15e428a0d77a09fc Mon Sep 17 00:00:00 2001
From: Maxime Ripard <maxime@cerno.tech>
Date: Thu, 14 Jul 2022 21:58:16 +0800
Subject: [PATCH 266/674] clk: Enforce that disjoints limits are invalid

stable inclusion
from stable-v5.10.111
commit e2b2542f7452672ce2511b73ccea234d51dd5827
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=e2b2542f7452672ce2511b73ccea234d51dd5827

--------------------------------

[ Upstream commit 10c46f2ea914202482d19cf80dcc9c321c9ff59b ]

If we were to have two users of the same clock, doing something like:

clk_set_rate_range(user1, 1000, 2000);
clk_set_rate_range(user2, 3000, 4000);

The second call would fail with -EINVAL, preventing from getting in a
situation where we end up with impossible limits.

However, this is never explicitly checked against and enforced, and
works by relying on an undocumented behaviour of clk_set_rate().

Indeed, on the first clk_set_rate_range will make sure the current clock
rate is within the new range, so it will be between 1000 and 2000Hz. On
the second clk_set_rate_range(), it will consider (rightfully), that our
current clock is outside of the 3000-4000Hz range, and will call
clk_core_set_rate_nolock() to set it to 3000Hz.

clk_core_set_rate_nolock() will then call clk_calc_new_rates() that will
eventually check that our rate 3000Hz rate is outside the min 3000Hz max
2000Hz range, will bail out, the error will propagate and we'll
eventually return -EINVAL.

This solely relies on the fact that clk_calc_new_rates(), and in
particular clk_core_determine_round_nolock(), won't modify the new rate
allowing the error to be reported. That assumption won't be true for all
drivers, and most importantly we'll break that assumption in a later
patch.

It can also be argued that we shouldn't even reach the point where we're
calling clk_core_set_rate_nolock().

Let's make an explicit check for disjoints range before we're doing
anything.

Signed-off-by: Maxime Ripard <maxime@cerno.tech>
Link: https://lore.kernel.org/r/20220225143534.405820-4-maxime@cerno.tech
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/clk/clk.c | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c
index 92fc084203b7..2e56cc0a3bce 100644
--- a/drivers/clk/clk.c
+++ b/drivers/clk/clk.c
@@ -631,6 +631,24 @@ static void clk_core_get_boundaries(struct clk_core *core,
 		*max_rate = min(*max_rate, clk_user->max_rate);
 }
 
+static bool clk_core_check_boundaries(struct clk_core *core,
+				      unsigned long min_rate,
+				      unsigned long max_rate)
+{
+	struct clk *user;
+
+	lockdep_assert_held(&prepare_lock);
+
+	if (min_rate > core->max_rate || max_rate < core->min_rate)
+		return false;
+
+	hlist_for_each_entry(user, &core->clks, clks_node)
+		if (min_rate > user->max_rate || max_rate < user->min_rate)
+			return false;
+
+	return true;
+}
+
 void clk_hw_set_rate_range(struct clk_hw *hw, unsigned long min_rate,
 			   unsigned long max_rate)
 {
@@ -2332,6 +2350,11 @@ int clk_set_rate_range(struct clk *clk, unsigned long min, unsigned long max)
 	clk->min_rate = min;
 	clk->max_rate = max;
 
+	if (!clk_core_check_boundaries(clk->core, min, max)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
 	rate = clk_core_get_rate_nolock(clk->core);
 	if (rate < min || rate > max) {
 		/*
@@ -2360,6 +2383,7 @@ int clk_set_rate_range(struct clk *clk, unsigned long min, unsigned long max)
 		}
 	}
 
+out:
 	if (clk->exclusive_count)
 		clk_core_rate_protect(clk->core);
 
-- 
Gitee


From 2668aa0c0e749b8b4f9404e93d6b3b9550a41508 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 14 Jul 2022 21:58:17 +0800
Subject: [PATCH 267/674] SUNRPC/call_alloc: async tasks mustn't block waiting
 for memory

stable inclusion
from stable-v5.10.111
commit f9244d31e05a26fdf458207ec88617bfae04fe8f
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f9244d31e05a26fdf458207ec88617bfae04fe8f

--------------------------------

[ Upstream commit c487216bec83b0c5a8803e5c61433d33ad7b104d ]

When memory is short, new worker threads cannot be created and we depend
on the minimum one rpciod thread to be able to handle everything.
So it must not block waiting for memory.

mempools are particularly a problem as memory can only be released back
to the mempool by an async rpc task running.  If all available
workqueue threads are waiting on the mempool, no thread is available to
return anything.

rpc_malloc() can block, and this might cause deadlocks.
So check RPC_IS_ASYNC(), rather than RPC_IS_SWAPPER() to determine if
blocking is acceptable.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 net/sunrpc/sched.c              | 4 +++-
 net/sunrpc/xprtrdma/transport.c | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index c045f63d11fa..6e4d476c6324 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -1012,8 +1012,10 @@ int rpc_malloc(struct rpc_task *task)
 	struct rpc_buffer *buf;
 	gfp_t gfp = GFP_NOFS;
 
+	if (RPC_IS_ASYNC(task))
+		gfp = GFP_NOWAIT | __GFP_NOWARN;
 	if (RPC_IS_SWAPPER(task))
-		gfp = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN;
+		gfp |= __GFP_MEMALLOC;
 
 	size += sizeof(struct rpc_buffer);
 	if (size <= RPC_BUFFER_MAXSIZE)
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 8e2368a0c2a2..fb7a0ab27899 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -572,8 +572,10 @@ xprt_rdma_allocate(struct rpc_task *task)
 	gfp_t flags;
 
 	flags = RPCRDMA_DEF_GFP;
+	if (RPC_IS_ASYNC(task))
+		flags = GFP_NOWAIT | __GFP_NOWARN;
 	if (RPC_IS_SWAPPER(task))
-		flags = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN;
+		flags |= __GFP_MEMALLOC;
 
 	if (!rpcrdma_check_regbuf(r_xprt, req->rl_sendbuf, rqst->rq_callsize,
 				  flags))
-- 
Gitee


From 621a0c7c51368d01902e41ab672a2a1960a23d36 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 14 Jul 2022 21:58:18 +0800
Subject: [PATCH 268/674] SUNRPC/xprt: async tasks mustn't block waiting for
 memory

stable inclusion
from stable-v5.10.111
commit f4fc47e71e32ac64b33e0d36e2aceeda65d46d4c
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f4fc47e71e32ac64b33e0d36e2aceeda65d46d4c

--------------------------------

[ Upstream commit a721035477fb5fb8abc738fbe410b07c12af3dc5 ]

When memory is short, new worker threads cannot be created and we depend
on the minimum one rpciod thread to be able to handle everything.  So it
must not block waiting for memory.

xprt_dynamic_alloc_slot can block indefinitely.  This can tie up all
workqueue threads and NFS can deadlock.  So when called from a
workqueue, set __GFP_NORETRY.

The rdma alloc_slot already does not block.  However it sets the error
to -EAGAIN suggesting this will trigger a sleep.  It does not.  As we
can see in call_reserveresult(), only -ENOMEM causes a sleep.  -EAGAIN
causes immediate retry.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 net/sunrpc/xprt.c               | 5 ++++-
 net/sunrpc/xprtrdma/transport.c | 2 +-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 3ea27bb3f512..cb5acd451489 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -1656,12 +1656,15 @@ static bool xprt_throttle_congested(struct rpc_xprt *xprt, struct rpc_task *task
 static struct rpc_rqst *xprt_dynamic_alloc_slot(struct rpc_xprt *xprt)
 {
 	struct rpc_rqst *req = ERR_PTR(-EAGAIN);
+	gfp_t gfp_mask = GFP_KERNEL;
 
 	if (xprt->num_reqs >= xprt->max_reqs)
 		goto out;
 	++xprt->num_reqs;
 	spin_unlock(&xprt->reserve_lock);
-	req = kzalloc(sizeof(struct rpc_rqst), GFP_NOFS);
+	if (current->flags & PF_WQ_WORKER)
+		gfp_mask |= __GFP_NORETRY | __GFP_NOWARN;
+	req = kzalloc(sizeof(*req), gfp_mask);
 	spin_lock(&xprt->reserve_lock);
 	if (req != NULL)
 		goto out;
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index fb7a0ab27899..9cf10cfb85c6 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -519,7 +519,7 @@ xprt_rdma_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task)
 	return;
 
 out_sleep:
-	task->tk_status = -EAGAIN;
+	task->tk_status = -ENOMEM;
 	xprt_add_backlog(xprt, task);
 }
 
-- 
Gitee


From be98f7ada5341938e53fbd08c2d1169fd2c57677 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 14 Jul 2022 21:58:19 +0800
Subject: [PATCH 269/674] SUNRPC: remove scheduling boost for "SWAPPER" tasks.

stable inclusion
from stable-v5.10.111
commit 4b6f122bdfdc048715d0ee630b3b20a1eea8094b
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=4b6f122bdfdc048715d0ee630b3b20a1eea8094b

--------------------------------

[ Upstream commit a80a8461868905823609be97f91776a26befe839 ]

Currently, tasks marked as "swapper" tasks get put to the front of
non-priority rpc_queues, and are sorted earlier than non-swapper tasks on
the transport's ->xmit_queue.

This is pointless as currently *all* tasks for a mount that has swap
enabled on *any* file are marked as "swapper" tasks.  So the net result
is that the non-priority rpc_queues are reverse-ordered (LIFO).

This scheduling boost is not necessary to avoid deadlocks, and hurts
fairness, so remove it.  If there were a need to expedite some requests,
the tk_priority mechanism is a more appropriate tool.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 net/sunrpc/sched.c |  7 -------
 net/sunrpc/xprt.c  | 11 -----------
 2 files changed, 18 deletions(-)

diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 6e4d476c6324..f0f55fbd1375 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -186,11 +186,6 @@ static void __rpc_add_wait_queue_priority(struct rpc_wait_queue *queue,
 
 /*
  * Add new request to wait queue.
- *
- * Swapper tasks always get inserted at the head of the queue.
- * This should avoid many nasty memory deadlocks and hopefully
- * improve overall performance.
- * Everyone else gets appended to the queue to ensure proper FIFO behavior.
  */
 static void __rpc_add_wait_queue(struct rpc_wait_queue *queue,
 		struct rpc_task *task,
@@ -199,8 +194,6 @@ static void __rpc_add_wait_queue(struct rpc_wait_queue *queue,
 	INIT_LIST_HEAD(&task->u.tk_wait.timer_list);
 	if (RPC_IS_PRIORITY(queue))
 		__rpc_add_wait_queue_priority(queue, task, queue_priority);
-	else if (RPC_IS_SWAPPER(task))
-		list_add(&task->u.tk_wait.list, &queue->tasks[0]);
 	else
 		list_add_tail(&task->u.tk_wait.list, &queue->tasks[0]);
 	task->tk_waitqueue = queue;
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index cb5acd451489..55b0c2b74933 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -1327,17 +1327,6 @@ xprt_request_enqueue_transmit(struct rpc_task *task)
 				INIT_LIST_HEAD(&req->rq_xmit2);
 				goto out;
 			}
-		} else if (RPC_IS_SWAPPER(task)) {
-			list_for_each_entry(pos, &xprt->xmit_queue, rq_xmit) {
-				if (pos->rq_cong || pos->rq_bytes_sent)
-					continue;
-				if (RPC_IS_SWAPPER(pos->rq_task))
-					continue;
-				/* Note: req is added _before_ pos */
-				list_add_tail(&req->rq_xmit, &pos->rq_xmit);
-				INIT_LIST_HEAD(&req->rq_xmit2);
-				goto out;
-			}
 		} else if (!req->rq_seqno) {
 			list_for_each_entry(pos, &xprt->xmit_queue, rq_xmit) {
 				if (pos->rq_task->tk_owner != task->tk_owner)
-- 
Gitee


From eb6fd2d61f35cb3d7e4221dc441466c3283d5547 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 14 Jul 2022 21:58:20 +0800
Subject: [PATCH 270/674] NFS: swap IO handling is slightly different for
 O_DIRECT IO

stable inclusion
from stable-v5.10.111
commit d4170a28217a5abd168e18581add641716c14057
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=d4170a28217a5abd168e18581add641716c14057

--------------------------------

[ Upstream commit 64158668ac8b31626a8ce48db4cad08496eb8340 ]

1/ Taking the i_rwsem for swap IO triggers lockdep warnings regarding
   possible deadlocks with "fs_reclaim".  These deadlocks could, I believe,
   eventuate if a buffered read on the swapfile was attempted.

   We don't need coherence with the page cache for a swap file, and
   buffered writes are forbidden anyway.  There is no other need for
   i_rwsem during direct IO.  So never take it for swap_rw()

2/ generic_write_checks() explicitly forbids writes to swap, and
   performs checks that are not needed for swap.  So bypass it
   for swap_rw().

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 fs/nfs/direct.c        | 42 ++++++++++++++++++++++++++++--------------
 fs/nfs/file.c          |  4 ++--
 include/linux/nfs_fs.h |  8 ++++----
 3 files changed, 34 insertions(+), 20 deletions(-)

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 3c0335c15a73..28afc315ec0c 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -172,8 +172,8 @@ ssize_t nfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 	VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE);
 
 	if (iov_iter_rw(iter) == READ)
-		return nfs_file_direct_read(iocb, iter);
-	return nfs_file_direct_write(iocb, iter);
+		return nfs_file_direct_read(iocb, iter, true);
+	return nfs_file_direct_write(iocb, iter, true);
 }
 
 static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
@@ -424,6 +424,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
  * nfs_file_direct_read - file direct read operation for NFS files
  * @iocb: target I/O control block
  * @iter: vector of user buffers into which to read data
+ * @swap: flag indicating this is swap IO, not O_DIRECT IO
  *
  * We use this function for direct reads instead of calling
  * generic_file_aio_read() in order to avoid gfar's check to see if
@@ -439,7 +440,8 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
  * client must read the updated atime from the server back into its
  * cache.
  */
-ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
+ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
+			     bool swap)
 {
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
@@ -481,12 +483,14 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
 	if (iter_is_iovec(iter))
 		dreq->flags = NFS_ODIRECT_SHOULD_DIRTY;
 
-	nfs_start_io_direct(inode);
+	if (!swap)
+		nfs_start_io_direct(inode);
 
 	NFS_I(inode)->read_io += count;
 	requested = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos);
 
-	nfs_end_io_direct(inode);
+	if (!swap)
+		nfs_end_io_direct(inode);
 
 	if (requested > 0) {
 		result = nfs_direct_wait(dreq);
@@ -875,6 +879,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
  * nfs_file_direct_write - file direct write operation for NFS files
  * @iocb: target I/O control block
  * @iter: vector of user buffers from which to write data
+ * @swap: flag indicating this is swap IO, not O_DIRECT IO
  *
  * We use this function for direct writes instead of calling
  * generic_file_aio_write() in order to avoid taking the inode
@@ -891,7 +896,8 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
  * Note that O_APPEND is not supported for NFS direct writes, as there
  * is no atomic O_APPEND write facility in the NFS protocol.
  */
-ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
+ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter,
+			      bool swap)
 {
 	ssize_t result, requested;
 	size_t count;
@@ -905,7 +911,11 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
 	dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n",
 		file, iov_iter_count(iter), (long long) iocb->ki_pos);
 
-	result = generic_write_checks(iocb, iter);
+	if (swap)
+		/* bypass generic checks */
+		result =  iov_iter_count(iter);
+	else
+		result = generic_write_checks(iocb, iter);
 	if (result <= 0)
 		return result;
 	count = result;
@@ -936,16 +946,20 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
 		dreq->iocb = iocb;
 	pnfs_init_ds_commit_info_ops(&dreq->ds_cinfo, inode);
 
-	nfs_start_io_direct(inode);
+	if (swap) {
+		requested = nfs_direct_write_schedule_iovec(dreq, iter, pos);
+	} else {
+		nfs_start_io_direct(inode);
 
-	requested = nfs_direct_write_schedule_iovec(dreq, iter, pos);
+		requested = nfs_direct_write_schedule_iovec(dreq, iter, pos);
 
-	if (mapping->nrpages) {
-		invalidate_inode_pages2_range(mapping,
-					      pos >> PAGE_SHIFT, end);
-	}
+		if (mapping->nrpages) {
+			invalidate_inode_pages2_range(mapping,
+						      pos >> PAGE_SHIFT, end);
+		}
 
-	nfs_end_io_direct(inode);
+		nfs_end_io_direct(inode);
+	}
 
 	if (requested > 0) {
 		result = nfs_direct_wait(dreq);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index f96367a2463e..aff1e65af6bd 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -158,7 +158,7 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to)
 	ssize_t result;
 
 	if (iocb->ki_flags & IOCB_DIRECT)
-		return nfs_file_direct_read(iocb, to);
+		return nfs_file_direct_read(iocb, to, false);
 
 	dprintk("NFS: read(%pD2, %zu@%lu)\n",
 		iocb->ki_filp,
@@ -609,7 +609,7 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
 		return result;
 
 	if (iocb->ki_flags & IOCB_DIRECT)
-		return nfs_file_direct_write(iocb, from);
+		return nfs_file_direct_write(iocb, from, false);
 
 	dprintk("NFS: write(%pD2, %zu@%Ld)\n",
 		file, iov_iter_count(from), (long long) iocb->ki_pos);
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 1e0a3497bdb4..2a17e0dfd431 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -478,10 +478,10 @@ static inline const struct cred *nfs_file_cred(struct file *file)
  * linux/fs/nfs/direct.c
  */
 extern ssize_t nfs_direct_IO(struct kiocb *, struct iov_iter *);
-extern ssize_t nfs_file_direct_read(struct kiocb *iocb,
-			struct iov_iter *iter);
-extern ssize_t nfs_file_direct_write(struct kiocb *iocb,
-			struct iov_iter *iter);
+ssize_t nfs_file_direct_read(struct kiocb *iocb,
+			     struct iov_iter *iter, bool swap);
+ssize_t nfs_file_direct_write(struct kiocb *iocb,
+			      struct iov_iter *iter, bool swap);
 
 /*
  * linux/fs/nfs/dir.c
-- 
Gitee


From 9ae11d7acbd8c4b59ebf92f2af95b053fbb55980 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 14 Jul 2022 21:58:21 +0800
Subject: [PATCH 271/674] NFS: swap-out must always use STABLE writes.

stable inclusion
from stable-v5.10.111
commit b3882e78aa0a924fa6c2c09fe74aaaa2299a34ad
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=b3882e78aa0a924fa6c2c09fe74aaaa2299a34ad

--------------------------------

[ Upstream commit c265de257f558a05c1859ee9e3fed04883b9ec0e ]

The commit handling code is not safe against memory-pressure deadlocks
when writing to swap.  In particular, nfs_commitdata_alloc() blocks
indefinitely waiting for memory, and this can consume all available
workqueue threads.

swap-out most likely uses STABLE writes anyway as COND_STABLE indicates
that a stable write should be used if the write fits in a single
request, and it normally does.  However if we ever swap with a small
wsize, or gather unusually large numbers of pages for a single write,
this might change.

For safety, make it explicit in the code that direct writes used for swap
must always use FLUSH_STABLE.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 fs/nfs/direct.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 28afc315ec0c..c220810c61d1 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -793,7 +793,7 @@ static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = {
  */
 static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
 					       struct iov_iter *iter,
-					       loff_t pos)
+					       loff_t pos, int ioflags)
 {
 	struct nfs_pageio_descriptor desc;
 	struct inode *inode = dreq->inode;
@@ -801,7 +801,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
 	size_t requested_bytes = 0;
 	size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE);
 
-	nfs_pageio_init_write(&desc, inode, FLUSH_COND_STABLE, false,
+	nfs_pageio_init_write(&desc, inode, ioflags, false,
 			      &nfs_direct_write_completion_ops);
 	desc.pg_dreq = dreq;
 	get_dreq(dreq);
@@ -947,11 +947,13 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter,
 	pnfs_init_ds_commit_info_ops(&dreq->ds_cinfo, inode);
 
 	if (swap) {
-		requested = nfs_direct_write_schedule_iovec(dreq, iter, pos);
+		requested = nfs_direct_write_schedule_iovec(dreq, iter, pos,
+							    FLUSH_STABLE);
 	} else {
 		nfs_start_io_direct(inode);
 
-		requested = nfs_direct_write_schedule_iovec(dreq, iter, pos);
+		requested = nfs_direct_write_schedule_iovec(dreq, iter, pos,
+							    FLUSH_COND_STABLE);
 
 		if (mapping->nrpages) {
 			invalidate_inode_pages2_range(mapping,
-- 
Gitee


From bca150089a21cd0d1b209e248559b06513a3ae43 Mon Sep 17 00:00:00 2001
From: Nathan Chancellor <nathan@kernel.org>
Date: Thu, 14 Jul 2022 21:58:22 +0800
Subject: [PATCH 272/674] x86/Kconfig: Do not allow CONFIG_X86_X32_ABI=y with
 llvm-objcopy

stable inclusion
from stable-v5.10.111
commit 9cb90f9ad5975ddfc90d0906b40e9c71c2eca44e
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=9cb90f9ad5975ddfc90d0906b40e9c71c2eca44e

--------------------------------

[ Upstream commit aaeed6ecc1253ce1463fa1aca0b70a4ccbc9fa75 ]

There are two outstanding issues with CONFIG_X86_X32_ABI and
llvm-objcopy, with similar root causes:

1. llvm-objcopy does not properly convert .note.gnu.property when going
   from x86_64 to x86_x32, resulting in a corrupted section when
   linking:

   https://github.com/ClangBuiltLinux/linux/issues/1141

2. llvm-objcopy produces corrupted compressed debug sections when going
   from x86_64 to x86_x32, also resulting in an error when linking:

   https://github.com/ClangBuiltLinux/linux/issues/514

After commit 41c5ef31ad71 ("x86/ibt: Base IBT bits"), the
.note.gnu.property section is always generated when
CONFIG_X86_KERNEL_IBT is enabled, which causes the first issue to become
visible with an allmodconfig build:

  ld.lld: error: arch/x86/entry/vdso/vclock_gettime-x32.o:(.note.gnu.property+0x1c): program property is too short

To avoid this error, do not allow CONFIG_X86_X32_ABI to be selected when
using llvm-objcopy. If the two issues ever get fixed in llvm-objcopy,
this can be turned into a feature check.

Signed-off-by: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20220314194842.3452-3-nathan@kernel.org
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 arch/x86/Kconfig | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 040fb77366dd..ec51ccea60c6 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2908,6 +2908,11 @@ config IA32_AOUT
 config X86_X32
 	bool "x32 ABI for 64-bit mode"
 	depends on X86_64
+	# llvm-objcopy does not convert x86_64 .note.gnu.property or
+	# compressed debug sections to x86_x32 properly:
+	# https://github.com/ClangBuiltLinux/linux/issues/514
+	# https://github.com/ClangBuiltLinux/linux/issues/1141
+	depends on $(success,$(OBJCOPY) --version | head -n1 | grep -qv llvm)
 	help
 	  Include code to run binaries for the x32 native 32-bit ABI
 	  for 64-bit processors.  An x32 process gets access to the
-- 
Gitee


From 27f1c36b60209c871375f5cd17d10473c8091d2c Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Thu, 14 Jul 2022 21:58:23 +0800
Subject: [PATCH 273/674] serial: samsung_tty: do not unlock port->lock for
 uart_write_wakeup()

stable inclusion
from stable-v5.10.111
commit 3309b322171180828564c386443eb5a62eabc862
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=3309b322171180828564c386443eb5a62eabc862

--------------------------------

[ Upstream commit 988c7c00691008ea1daaa1235680a0da49dab4e8 ]

The commit c15c3747ee32 (serial: samsung: fix potential soft lockup
during uart write) added an unlock of port->lock before
uart_write_wakeup() and a lock after it. It was always problematic to
write data from tty_ldisc_ops::write_wakeup and it was even documented
that way. We fixed the line disciplines to conform to this recently.
So if there is still a missed one, we should fix them instead of this
workaround.

On the top of that, s3c24xx_serial_tx_dma_complete() in this driver
still holds the port->lock while calling uart_write_wakeup().

So revert the wrap added by the commit above.

Cc: Thomas Abraham <thomas.abraham@linaro.org>
Cc: Kyungmin Park <kyungmin.park@samsung.com>
Cc: Hyeonkook Kim <hk619.kim@samsung.com>
Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20220308115153.4225-1-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/tty/serial/samsung_tty.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/tty/serial/samsung_tty.c b/drivers/tty/serial/samsung_tty.c
index 8ae3e03fbd8c..81faead3c4f8 100644
--- a/drivers/tty/serial/samsung_tty.c
+++ b/drivers/tty/serial/samsung_tty.c
@@ -883,11 +883,8 @@ static irqreturn_t s3c24xx_serial_tx_chars(int irq, void *id)
 		goto out;
 	}
 
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS) {
-		spin_unlock(&port->lock);
+	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
 		uart_write_wakeup(port);
-		spin_lock(&port->lock);
-	}
 
 	if (uart_circ_empty(xmit))
 		s3c24xx_serial_stop_tx(port);
-- 
Gitee


From 62a0c4b1082844d07606cb6ccfeb90f24f2d17b3 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Thu, 14 Jul 2022 21:58:24 +0800
Subject: [PATCH 274/674] virtio_console: eliminate anonymous module_init &
 module_exit

stable inclusion
from stable-v5.10.111
commit c69b442125bf009fce26e15aa5616caf8a3673c3
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=c69b442125bf009fce26e15aa5616caf8a3673c3

--------------------------------

[ Upstream commit fefb8a2a941338d871e2d83fbd65fbfa068857bd ]

Eliminate anonymous module_init() and module_exit(), which can lead to
confusion or ambiguity when reading System.map, crashes/oops/bugs,
or an initcall_debug log.

Give each of these init and exit functions unique driver-specific
names to eliminate the anonymous names.

Example 1: (System.map)
 ffffffff832fc78c t init
 ffffffff832fc79e t init
 ffffffff832fc8f8 t init

Example 2: (initcall_debug log)
 calling  init+0x0/0x12 @ 1
 initcall init+0x0/0x12 returned 0 after 15 usecs
 calling  init+0x0/0x60 @ 1
 initcall init+0x0/0x60 returned 0 after 2 usecs
 calling  init+0x0/0x9a @ 1
 initcall init+0x0/0x9a returned 0 after 74 usecs

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Reviewed-by: Amit Shah <amit@kernel.org>
Cc: virtualization@lists.linux-foundation.org
Cc: Arnd Bergmann <arnd@arndb.de>
Link: https://lore.kernel.org/r/20220316192010.19001-3-rdunlap@infradead.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/char/virtio_console.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
index 3dd4deb60adb..6d361420ffe8 100644
--- a/drivers/char/virtio_console.c
+++ b/drivers/char/virtio_console.c
@@ -2239,7 +2239,7 @@ static struct virtio_driver virtio_rproc_serial = {
 	.remove =	virtcons_remove,
 };
 
-static int __init init(void)
+static int __init virtio_console_init(void)
 {
 	int err;
 
@@ -2276,7 +2276,7 @@ static int __init init(void)
 	return err;
 }
 
-static void __exit fini(void)
+static void __exit virtio_console_fini(void)
 {
 	reclaim_dma_bufs();
 
@@ -2286,8 +2286,8 @@ static void __exit fini(void)
 	class_destroy(pdrvdata.class);
 	debugfs_remove_recursive(pdrvdata.debugfs_dir);
 }
-module_init(init);
-module_exit(fini);
+module_init(virtio_console_init);
+module_exit(virtio_console_fini);
 
 MODULE_DESCRIPTION("Virtio console driver");
 MODULE_LICENSE("GPL");
-- 
Gitee


From a44452c4120ede42648c992e7b649b94c345a1db Mon Sep 17 00:00:00 2001
From: Haimin Zhang <tcs_kernel@tencent.com>
Date: Thu, 14 Jul 2022 21:58:25 +0800
Subject: [PATCH 275/674] jfs: prevent NULL deref in diFree

stable inclusion
from stable-v5.10.111
commit b9c5ac0a15f24d63b20f899072fa6dd8c93af136
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=b9c5ac0a15f24d63b20f899072fa6dd8c93af136

--------------------------------

[ Upstream commit a53046291020ec41e09181396c1e829287b48d47 ]

Add validation check for JFS_IP(ipimap)->i_imap to prevent a NULL deref
in diFree since diFree uses it without do any validations.
When function jfs_mount calls diMount to initialize fileset inode
allocation map, it can fail and JFS_IP(ipimap)->i_imap won't be
initialized. Then it calls diFreeSpecial to close fileset inode allocation
map inode and it will flow into jfs_evict_inode. Function jfs_evict_inode
just validates JFS_SBI(inode->i_sb)->ipimap, then calls diFree. diFree use
JFS_IP(ipimap)->i_imap directly, then it will cause a NULL deref.

Reported-by: TCS Robot <tcs_robot@tencent.com>
Signed-off-by: Haimin Zhang <tcs_kernel@tencent.com>
Signed-off-by: Dave Kleikamp <dave.kleikamp@oracle.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 fs/jfs/inode.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index b0eb9c85eea0..980aa3300f10 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -146,12 +146,13 @@ void jfs_evict_inode(struct inode *inode)
 		dquot_initialize(inode);
 
 		if (JFS_IP(inode)->fileset == FILESYSTEM_I) {
+			struct inode *ipimap = JFS_SBI(inode->i_sb)->ipimap;
 			truncate_inode_pages_final(&inode->i_data);
 
 			if (test_cflag(COMMIT_Freewmap, inode))
 				jfs_free_zero_link(inode);
 
-			if (JFS_SBI(inode->i_sb)->ipimap)
+			if (ipimap && JFS_IP(ipimap)->i_imap)
 				diFree(inode);
 
 			/*
-- 
Gitee


From 625a6a2735e87773bdca55da632e2bcf686169f6 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Thu, 14 Jul 2022 21:58:26 +0800
Subject: [PATCH 276/674] SUNRPC: Fix socket waits for write buffer space

stable inclusion
from stable-v5.10.111
commit 7a506fabcfe1a58bcdf6dd18fa3fca42635235f0
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=7a506fabcfe1a58bcdf6dd18fa3fca42635235f0

--------------------------------

[ Upstream commit 7496b59f588dd52886fdbac7633608097543a0a5 ]

The socket layer requires that we use the socket lock to protect changes
to the sock->sk_write_pending field and others.

Reported-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 net/sunrpc/xprtsock.c | 54 +++++++++++++++++++++++++++++++------------
 1 file changed, 39 insertions(+), 15 deletions(-)

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index b7c262f6d555..f57ccf5ae0f2 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -754,12 +754,12 @@ xs_stream_start_connect(struct sock_xprt *transport)
 /**
  * xs_nospace - handle transmit was incomplete
  * @req: pointer to RPC request
+ * @transport: pointer to struct sock_xprt
  *
  */
-static int xs_nospace(struct rpc_rqst *req)
+static int xs_nospace(struct rpc_rqst *req, struct sock_xprt *transport)
 {
-	struct rpc_xprt *xprt = req->rq_xprt;
-	struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+	struct rpc_xprt *xprt = &transport->xprt;
 	struct sock *sk = transport->inet;
 	int ret = -EAGAIN;
 
@@ -770,25 +770,49 @@ static int xs_nospace(struct rpc_rqst *req)
 
 	/* Don't race with disconnect */
 	if (xprt_connected(xprt)) {
+		struct socket_wq *wq;
+
+		rcu_read_lock();
+		wq = rcu_dereference(sk->sk_wq);
+		set_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags);
+		rcu_read_unlock();
+
 		/* wait for more buffer space */
+		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 		sk->sk_write_pending++;
 		xprt_wait_for_buffer_space(xprt);
 	} else
 		ret = -ENOTCONN;
 
 	spin_unlock(&xprt->transport_lock);
+	return ret;
+}
 
-	/* Race breaker in case memory is freed before above code is called */
-	if (ret == -EAGAIN) {
-		struct socket_wq *wq;
+static int xs_sock_nospace(struct rpc_rqst *req)
+{
+	struct sock_xprt *transport =
+		container_of(req->rq_xprt, struct sock_xprt, xprt);
+	struct sock *sk = transport->inet;
+	int ret = -EAGAIN;
 
-		rcu_read_lock();
-		wq = rcu_dereference(sk->sk_wq);
-		set_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags);
-		rcu_read_unlock();
+	lock_sock(sk);
+	if (!sock_writeable(sk))
+		ret = xs_nospace(req, transport);
+	release_sock(sk);
+	return ret;
+}
 
-		sk->sk_write_space(sk);
-	}
+static int xs_stream_nospace(struct rpc_rqst *req)
+{
+	struct sock_xprt *transport =
+		container_of(req->rq_xprt, struct sock_xprt, xprt);
+	struct sock *sk = transport->inet;
+	int ret = -EAGAIN;
+
+	lock_sock(sk);
+	if (!sk_stream_memory_free(sk))
+		ret = xs_nospace(req, transport);
+	release_sock(sk);
 	return ret;
 }
 
@@ -878,7 +902,7 @@ static int xs_local_send_request(struct rpc_rqst *req)
 	case -ENOBUFS:
 		break;
 	case -EAGAIN:
-		status = xs_nospace(req);
+		status = xs_stream_nospace(req);
 		break;
 	default:
 		dprintk("RPC:       sendmsg returned unrecognized error %d\n",
@@ -954,7 +978,7 @@ static int xs_udp_send_request(struct rpc_rqst *req)
 		/* Should we call xs_close() here? */
 		break;
 	case -EAGAIN:
-		status = xs_nospace(req);
+		status = xs_sock_nospace(req);
 		break;
 	case -ENETUNREACH:
 	case -ENOBUFS:
@@ -1069,7 +1093,7 @@ static int xs_tcp_send_request(struct rpc_rqst *req)
 		/* Should we call xs_close() here? */
 		break;
 	case -EAGAIN:
-		status = xs_nospace(req);
+		status = xs_stream_nospace(req);
 		break;
 	case -ECONNRESET:
 	case -ECONNREFUSED:
-- 
Gitee


From 2549ae58e4f66de9341813d6c98bb5c0966f80f8 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Thu, 14 Jul 2022 21:58:27 +0800
Subject: [PATCH 277/674] NFS: nfsiod should not block forever in
 mempool_alloc()

stable inclusion
from stable-v5.10.111
commit 34681aeddcfcc0c47d45154f5d39310def87c55f
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=34681aeddcfcc0c47d45154f5d39310def87c55f

--------------------------------

[ Upstream commit 515dcdcd48736576c6f5c197814da6f81c60a21e ]

The concern is that since nfsiod is sometimes required to kick off a
commit, it can get locked up waiting forever in mempool_alloc() instead
of failing gracefully and leaving the commit until later.

Try to allocate from the slab first, with GFP_KERNEL | __GFP_NORETRY,
then fall back to a non-blocking attempt to allocate from the memory
pool.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 fs/nfs/internal.h      |  7 +++++++
 fs/nfs/pnfs_nfs.c      |  8 ++++++--
 fs/nfs/write.c         | 24 +++++++++---------------
 include/linux/nfs_fs.h |  2 +-
 4 files changed, 23 insertions(+), 18 deletions(-)

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 4c7666cd20e1..7009a8dddd45 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -588,6 +588,13 @@ nfs_write_match_verf(const struct nfs_writeverf *verf,
 		!nfs_write_verifier_cmp(&req->wb_verf, &verf->verifier);
 }
 
+static inline gfp_t nfs_io_gfp_mask(void)
+{
+	if (current->flags & PF_WQ_WORKER)
+		return GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
+	return GFP_KERNEL;
+}
+
 /* unlink.c */
 extern struct rpc_task *
 nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 7b9d701bef01..a2ad8bb87e2d 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -419,7 +419,7 @@ static struct nfs_commit_data *
 pnfs_bucket_fetch_commitdata(struct pnfs_commit_bucket *bucket,
 			     struct nfs_commit_info *cinfo)
 {
-	struct nfs_commit_data *data = nfs_commitdata_alloc(false);
+	struct nfs_commit_data *data = nfs_commitdata_alloc();
 
 	if (!data)
 		return NULL;
@@ -515,7 +515,11 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
 	unsigned int nreq = 0;
 
 	if (!list_empty(mds_pages)) {
-		data = nfs_commitdata_alloc(true);
+		data = nfs_commitdata_alloc();
+		if (!data) {
+			nfs_retry_commit(mds_pages, NULL, cinfo, -1);
+			return -ENOMEM;
+		}
 		data->ds_commit_index = -1;
 		list_splice_init(mds_pages, &data->pages);
 		list_add_tail(&data->list, &list);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index cc926e69ee9b..a97eaf4e813c 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -70,27 +70,17 @@ static mempool_t *nfs_wdata_mempool;
 static struct kmem_cache *nfs_cdata_cachep;
 static mempool_t *nfs_commit_mempool;
 
-struct nfs_commit_data *nfs_commitdata_alloc(bool never_fail)
+struct nfs_commit_data *nfs_commitdata_alloc(void)
 {
 	struct nfs_commit_data *p;
 
-	if (never_fail)
-		p = mempool_alloc(nfs_commit_mempool, GFP_NOIO);
-	else {
-		/* It is OK to do some reclaim, not no safe to wait
-		 * for anything to be returned to the pool.
-		 * mempool_alloc() cannot handle that particular combination,
-		 * so we need two separate attempts.
-		 */
+	p = kmem_cache_zalloc(nfs_cdata_cachep, nfs_io_gfp_mask());
+	if (!p) {
 		p = mempool_alloc(nfs_commit_mempool, GFP_NOWAIT);
-		if (!p)
-			p = kmem_cache_alloc(nfs_cdata_cachep, GFP_NOIO |
-					     __GFP_NOWARN | __GFP_NORETRY);
 		if (!p)
 			return NULL;
+		memset(p, 0, sizeof(*p));
 	}
-
-	memset(p, 0, sizeof(*p));
 	INIT_LIST_HEAD(&p->pages);
 	return p;
 }
@@ -1800,7 +1790,11 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how,
 	if (list_empty(head))
 		return 0;
 
-	data = nfs_commitdata_alloc(true);
+	data = nfs_commitdata_alloc();
+	if (!data) {
+		nfs_retry_commit(head, NULL, cinfo, -1);
+		return -ENOMEM;
+	}
 
 	/* Set up the argument struct */
 	nfs_init_commit(data, head, NULL, cinfo);
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 2a17e0dfd431..e39342945a80 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -551,7 +551,7 @@ extern int nfs_wb_all(struct inode *inode);
 extern int nfs_wb_page(struct inode *inode, struct page *page);
 extern int nfs_wb_page_cancel(struct inode *inode, struct page* page);
 extern int  nfs_commit_inode(struct inode *, int);
-extern struct nfs_commit_data *nfs_commitdata_alloc(bool never_fail);
+extern struct nfs_commit_data *nfs_commitdata_alloc(void);
 extern void nfs_commit_free(struct nfs_commit_data *data);
 bool nfs_commit_end(struct nfs_mds_commit_info *cinfo);
 
-- 
Gitee


From 0824355fd43f07da43bb4af8ddbfb94e8dd61d88 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Thu, 14 Jul 2022 21:58:28 +0800
Subject: [PATCH 278/674] NFS: Avoid writeback threads getting stuck in
 mempool_alloc()

stable inclusion
from stable-v5.10.111
commit c74e2f6ecc51bd08bb5b0335477dba954a50592e
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=c74e2f6ecc51bd08bb5b0335477dba954a50592e

--------------------------------

[ Upstream commit 0bae835b63c53f86cdc524f5962e39409585b22c ]

In a low memory situation, allow the NFS writeback code to fail without
getting stuck in infinite loops in mempool_alloc().

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 fs/nfs/pagelist.c | 10 +++++-----
 fs/nfs/write.c    | 10 ++++++++--
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 98b9c1ed366e..17fef6eb490c 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -90,10 +90,10 @@ void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos)
 	}
 }
 
-static inline struct nfs_page *
-nfs_page_alloc(void)
+static inline struct nfs_page *nfs_page_alloc(void)
 {
-	struct nfs_page	*p = kmem_cache_zalloc(nfs_page_cachep, GFP_KERNEL);
+	struct nfs_page *p =
+		kmem_cache_zalloc(nfs_page_cachep, nfs_io_gfp_mask());
 	if (p)
 		INIT_LIST_HEAD(&p->wb_list);
 	return p;
@@ -901,7 +901,7 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
 	struct nfs_commit_info cinfo;
 	struct nfs_page_array *pg_array = &hdr->page_array;
 	unsigned int pagecount, pageused;
-	gfp_t gfp_flags = GFP_KERNEL;
+	gfp_t gfp_flags = nfs_io_gfp_mask();
 
 	pagecount = nfs_page_array_len(mirror->pg_base, mirror->pg_count);
 	pg_array->npages = pagecount;
@@ -984,7 +984,7 @@ nfs_pageio_alloc_mirrors(struct nfs_pageio_descriptor *desc,
 	desc->pg_mirrors_dynamic = NULL;
 	if (mirror_count == 1)
 		return desc->pg_mirrors_static;
-	ret = kmalloc_array(mirror_count, sizeof(*ret), GFP_KERNEL);
+	ret = kmalloc_array(mirror_count, sizeof(*ret), nfs_io_gfp_mask());
 	if (ret != NULL) {
 		for (i = 0; i < mirror_count; i++)
 			nfs_pageio_mirror_init(&ret[i], desc->pg_bsize);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index a97eaf4e813c..5d07799513a6 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -94,9 +94,15 @@ EXPORT_SYMBOL_GPL(nfs_commit_free);
 
 static struct nfs_pgio_header *nfs_writehdr_alloc(void)
 {
-	struct nfs_pgio_header *p = mempool_alloc(nfs_wdata_mempool, GFP_KERNEL);
+	struct nfs_pgio_header *p;
 
-	memset(p, 0, sizeof(*p));
+	p = kmem_cache_zalloc(nfs_wdata_cachep, nfs_io_gfp_mask());
+	if (!p) {
+		p = mempool_alloc(nfs_wdata_mempool, GFP_NOWAIT);
+		if (!p)
+			return NULL;
+		memset(p, 0, sizeof(*p));
+	}
 	p->rw_mode = FMODE_WRITE;
 	return p;
 }
-- 
Gitee


From 2cfbd97835d65cfe05bf819b34e9c8c93c953f75 Mon Sep 17 00:00:00 2001
From: Helge Deller <deller@gmx.de>
Date: Thu, 14 Jul 2022 21:58:29 +0800
Subject: [PATCH 279/674] parisc: Fix CPU affinity for Lasi, WAX and Dino chips

stable inclusion
from stable-v5.10.111
commit f7c35220305f273bddc0bdaf1e453b4ca280f145
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f7c35220305f273bddc0bdaf1e453b4ca280f145

--------------------------------

[ Upstream commit 939fc856676c266c3bc347c1c1661872a3725c0f ]

Add the missing logic to allow Lasi, WAX and Dino to set the
CPU affinity. This fixes IRQ migration to other CPUs when a
CPU is shutdown which currently holds the IRQs for one of those
chips.

Signed-off-by: Helge Deller <deller@gmx.de>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/parisc/dino.c | 41 +++++++++++++++++++++++++++++++++--------
 drivers/parisc/gsc.c  | 31 +++++++++++++++++++++++++++++++
 drivers/parisc/gsc.h  |  1 +
 drivers/parisc/lasi.c |  7 +++----
 drivers/parisc/wax.c  |  7 +++----
 5 files changed, 71 insertions(+), 16 deletions(-)

diff --git a/drivers/parisc/dino.c b/drivers/parisc/dino.c
index 952a92504df6..e33036281327 100644
--- a/drivers/parisc/dino.c
+++ b/drivers/parisc/dino.c
@@ -142,9 +142,8 @@ struct dino_device
 {
 	struct pci_hba_data	hba;	/* 'C' inheritance - must be first */
 	spinlock_t		dinosaur_pen;
-	unsigned long		txn_addr; /* EIR addr to generate interrupt */ 
-	u32			txn_data; /* EIR data assign to each dino */ 
 	u32 			imr;	  /* IRQ's which are enabled */ 
+	struct gsc_irq		gsc_irq;
 	int			global_irq[DINO_LOCAL_IRQS]; /* map IMR bit to global irq */
 #ifdef DINO_DEBUG
 	unsigned int		dino_irr0; /* save most recent IRQ line stat */
@@ -339,14 +338,43 @@ static void dino_unmask_irq(struct irq_data *d)
 	if (tmp & DINO_MASK_IRQ(local_irq)) {
 		DBG(KERN_WARNING "%s(): IRQ asserted! (ILR 0x%x)\n",
 				__func__, tmp);
-		gsc_writel(dino_dev->txn_data, dino_dev->txn_addr);
+		gsc_writel(dino_dev->gsc_irq.txn_data, dino_dev->gsc_irq.txn_addr);
 	}
 }
 
+#ifdef CONFIG_SMP
+static int dino_set_affinity_irq(struct irq_data *d, const struct cpumask *dest,
+				bool force)
+{
+	struct dino_device *dino_dev = irq_data_get_irq_chip_data(d);
+	struct cpumask tmask;
+	int cpu_irq;
+	u32 eim;
+
+	if (!cpumask_and(&tmask, dest, cpu_online_mask))
+		return -EINVAL;
+
+	cpu_irq = cpu_check_affinity(d, &tmask);
+	if (cpu_irq < 0)
+		return cpu_irq;
+
+	dino_dev->gsc_irq.txn_addr = txn_affinity_addr(d->irq, cpu_irq);
+	eim = ((u32) dino_dev->gsc_irq.txn_addr) | dino_dev->gsc_irq.txn_data;
+	__raw_writel(eim, dino_dev->hba.base_addr+DINO_IAR0);
+
+	irq_data_update_effective_affinity(d, &tmask);
+
+	return IRQ_SET_MASK_OK;
+}
+#endif
+
 static struct irq_chip dino_interrupt_type = {
 	.name		= "GSC-PCI",
 	.irq_unmask	= dino_unmask_irq,
 	.irq_mask	= dino_mask_irq,
+#ifdef CONFIG_SMP
+	.irq_set_affinity = dino_set_affinity_irq,
+#endif
 };
 
 
@@ -806,7 +834,6 @@ static int __init dino_common_init(struct parisc_device *dev,
 {
 	int status;
 	u32 eim;
-	struct gsc_irq gsc_irq;
 	struct resource *res;
 
 	pcibios_register_hba(&dino_dev->hba);
@@ -821,10 +848,8 @@ static int __init dino_common_init(struct parisc_device *dev,
 	**   still only has 11 IRQ input lines - just map some of them
 	**   to a different processor.
 	*/
-	dev->irq = gsc_alloc_irq(&gsc_irq);
-	dino_dev->txn_addr = gsc_irq.txn_addr;
-	dino_dev->txn_data = gsc_irq.txn_data;
-	eim = ((u32) gsc_irq.txn_addr) | gsc_irq.txn_data;
+	dev->irq = gsc_alloc_irq(&dino_dev->gsc_irq);
+	eim = ((u32) dino_dev->gsc_irq.txn_addr) | dino_dev->gsc_irq.txn_data;
 
 	/* 
 	** Dino needs a PA "IRQ" to get a processor's attention.
diff --git a/drivers/parisc/gsc.c b/drivers/parisc/gsc.c
index ed9371acf37e..ec175ae99873 100644
--- a/drivers/parisc/gsc.c
+++ b/drivers/parisc/gsc.c
@@ -135,10 +135,41 @@ static void gsc_asic_unmask_irq(struct irq_data *d)
 	 */
 }
 
+#ifdef CONFIG_SMP
+static int gsc_set_affinity_irq(struct irq_data *d, const struct cpumask *dest,
+				bool force)
+{
+	struct gsc_asic *gsc_dev = irq_data_get_irq_chip_data(d);
+	struct cpumask tmask;
+	int cpu_irq;
+
+	if (!cpumask_and(&tmask, dest, cpu_online_mask))
+		return -EINVAL;
+
+	cpu_irq = cpu_check_affinity(d, &tmask);
+	if (cpu_irq < 0)
+		return cpu_irq;
+
+	gsc_dev->gsc_irq.txn_addr = txn_affinity_addr(d->irq, cpu_irq);
+	gsc_dev->eim = ((u32) gsc_dev->gsc_irq.txn_addr) | gsc_dev->gsc_irq.txn_data;
+
+	/* switch IRQ's for devices below LASI/WAX to other CPU */
+	gsc_writel(gsc_dev->eim, gsc_dev->hpa + OFFSET_IAR);
+
+	irq_data_update_effective_affinity(d, &tmask);
+
+	return IRQ_SET_MASK_OK;
+}
+#endif
+
+
 static struct irq_chip gsc_asic_interrupt_type = {
 	.name		=	"GSC-ASIC",
 	.irq_unmask	=	gsc_asic_unmask_irq,
 	.irq_mask	=	gsc_asic_mask_irq,
+#ifdef CONFIG_SMP
+	.irq_set_affinity =	gsc_set_affinity_irq,
+#endif
 };
 
 int gsc_assign_irq(struct irq_chip *type, void *data)
diff --git a/drivers/parisc/gsc.h b/drivers/parisc/gsc.h
index 86abad3fa215..73cbd0bb1975 100644
--- a/drivers/parisc/gsc.h
+++ b/drivers/parisc/gsc.h
@@ -31,6 +31,7 @@ struct gsc_asic {
 	int version;
 	int type;
 	int eim;
+	struct gsc_irq gsc_irq;
 	int global_irq[32];
 };
 
diff --git a/drivers/parisc/lasi.c b/drivers/parisc/lasi.c
index 4e4fd12c2112..6ef621adb63a 100644
--- a/drivers/parisc/lasi.c
+++ b/drivers/parisc/lasi.c
@@ -163,7 +163,6 @@ static int __init lasi_init_chip(struct parisc_device *dev)
 {
 	extern void (*chassis_power_off)(void);
 	struct gsc_asic *lasi;
-	struct gsc_irq gsc_irq;
 	int ret;
 
 	lasi = kzalloc(sizeof(*lasi), GFP_KERNEL);
@@ -185,7 +184,7 @@ static int __init lasi_init_chip(struct parisc_device *dev)
 	lasi_init_irq(lasi);
 
 	/* the IRQ lasi should use */
-	dev->irq = gsc_alloc_irq(&gsc_irq);
+	dev->irq = gsc_alloc_irq(&lasi->gsc_irq);
 	if (dev->irq < 0) {
 		printk(KERN_ERR "%s(): cannot get GSC irq\n",
 				__func__);
@@ -193,9 +192,9 @@ static int __init lasi_init_chip(struct parisc_device *dev)
 		return -EBUSY;
 	}
 
-	lasi->eim = ((u32) gsc_irq.txn_addr) | gsc_irq.txn_data;
+	lasi->eim = ((u32) lasi->gsc_irq.txn_addr) | lasi->gsc_irq.txn_data;
 
-	ret = request_irq(gsc_irq.irq, gsc_asic_intr, 0, "lasi", lasi);
+	ret = request_irq(lasi->gsc_irq.irq, gsc_asic_intr, 0, "lasi", lasi);
 	if (ret < 0) {
 		kfree(lasi);
 		return ret;
diff --git a/drivers/parisc/wax.c b/drivers/parisc/wax.c
index 5b6df1516235..73a2b01f8d9c 100644
--- a/drivers/parisc/wax.c
+++ b/drivers/parisc/wax.c
@@ -68,7 +68,6 @@ static int __init wax_init_chip(struct parisc_device *dev)
 {
 	struct gsc_asic *wax;
 	struct parisc_device *parent;
-	struct gsc_irq gsc_irq;
 	int ret;
 
 	wax = kzalloc(sizeof(*wax), GFP_KERNEL);
@@ -85,7 +84,7 @@ static int __init wax_init_chip(struct parisc_device *dev)
 	wax_init_irq(wax);
 
 	/* the IRQ wax should use */
-	dev->irq = gsc_claim_irq(&gsc_irq, WAX_GSC_IRQ);
+	dev->irq = gsc_claim_irq(&wax->gsc_irq, WAX_GSC_IRQ);
 	if (dev->irq < 0) {
 		printk(KERN_ERR "%s(): cannot get GSC irq\n",
 				__func__);
@@ -93,9 +92,9 @@ static int __init wax_init_chip(struct parisc_device *dev)
 		return -EBUSY;
 	}
 
-	wax->eim = ((u32) gsc_irq.txn_addr) | gsc_irq.txn_data;
+	wax->eim = ((u32) wax->gsc_irq.txn_addr) | wax->gsc_irq.txn_data;
 
-	ret = request_irq(gsc_irq.irq, gsc_asic_intr, 0, "wax", wax);
+	ret = request_irq(wax->gsc_irq.irq, gsc_asic_intr, 0, "wax", wax);
 	if (ret < 0) {
 		kfree(wax);
 		return ret;
-- 
Gitee


From a6f7b8d42aee21dba2a7ad39ace0ebfaf0d76bc9 Mon Sep 17 00:00:00 2001
From: John David Anglin <dave.anglin@bell.net>
Date: Thu, 14 Jul 2022 21:58:30 +0800
Subject: [PATCH 280/674] parisc: Fix patch code locking and flushing

stable inclusion
from stable-v5.10.111
commit 1753a49e266d8f0433e60d4b59f12c2b2497646a
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=1753a49e266d8f0433e60d4b59f12c2b2497646a

--------------------------------

[ Upstream commit a9fe7fa7d874a536e0540469f314772c054a0323 ]

This change fixes the following:

1) The flags variable is not initialized. Always use raw_spin_lock_irqsave
and raw_spin_unlock_irqrestore to serialize patching.

2) flush_kernel_vmap_range is primarily intended for DMA flushes. Since
__patch_text_multiple is often called with interrupts disabled, it is
better to directly call flush_kernel_dcache_range_asm and
flush_kernel_icache_range_asm. This avoids an extra call.

3) The final call to flush_icache_range is unnecessary.

Signed-off-by: John David Anglin <dave.anglin@bell.net>
Signed-off-by: Helge Deller <deller@gmx.de>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 arch/parisc/kernel/patch.c | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/arch/parisc/kernel/patch.c b/arch/parisc/kernel/patch.c
index 80a0ab372802..e59574f65e64 100644
--- a/arch/parisc/kernel/patch.c
+++ b/arch/parisc/kernel/patch.c
@@ -40,10 +40,7 @@ static void __kprobes *patch_map(void *addr, int fixmap, unsigned long *flags,
 
 	*need_unmap = 1;
 	set_fixmap(fixmap, page_to_phys(page));
-	if (flags)
-		raw_spin_lock_irqsave(&patch_lock, *flags);
-	else
-		__acquire(&patch_lock);
+	raw_spin_lock_irqsave(&patch_lock, *flags);
 
 	return (void *) (__fix_to_virt(fixmap) + (uintaddr & ~PAGE_MASK));
 }
@@ -52,10 +49,7 @@ static void __kprobes patch_unmap(int fixmap, unsigned long *flags)
 {
 	clear_fixmap(fixmap);
 
-	if (flags)
-		raw_spin_unlock_irqrestore(&patch_lock, *flags);
-	else
-		__release(&patch_lock);
+	raw_spin_unlock_irqrestore(&patch_lock, *flags);
 }
 
 void __kprobes __patch_text_multiple(void *addr, u32 *insn, unsigned int len)
@@ -67,8 +61,9 @@ void __kprobes __patch_text_multiple(void *addr, u32 *insn, unsigned int len)
 	int mapped;
 
 	/* Make sure we don't have any aliases in cache */
-	flush_kernel_vmap_range(addr, len);
-	flush_icache_range(start, end);
+	flush_kernel_dcache_range_asm(start, end);
+	flush_kernel_icache_range_asm(start, end);
+	flush_tlb_kernel_range(start, end);
 
 	p = fixmap = patch_map(addr, FIX_TEXT_POKE0, &flags, &mapped);
 
@@ -81,8 +76,10 @@ void __kprobes __patch_text_multiple(void *addr, u32 *insn, unsigned int len)
 			 * We're crossing a page boundary, so
 			 * need to remap
 			 */
-			flush_kernel_vmap_range((void *)fixmap,
-						(p-fixmap) * sizeof(*p));
+			flush_kernel_dcache_range_asm((unsigned long)fixmap,
+						      (unsigned long)p);
+			flush_tlb_kernel_range((unsigned long)fixmap,
+					       (unsigned long)p);
 			if (mapped)
 				patch_unmap(FIX_TEXT_POKE0, &flags);
 			p = fixmap = patch_map(addr, FIX_TEXT_POKE0, &flags,
@@ -90,10 +87,10 @@ void __kprobes __patch_text_multiple(void *addr, u32 *insn, unsigned int len)
 		}
 	}
 
-	flush_kernel_vmap_range((void *)fixmap, (p-fixmap) * sizeof(*p));
+	flush_kernel_dcache_range_asm((unsigned long)fixmap, (unsigned long)p);
+	flush_tlb_kernel_range((unsigned long)fixmap, (unsigned long)p);
 	if (mapped)
 		patch_unmap(FIX_TEXT_POKE0, &flags);
-	flush_icache_range(start, end);
 }
 
 void __kprobes __patch_text(void *addr, u32 insn)
-- 
Gitee


From b873d75277e79605a2350a4460fafbd851e7482e Mon Sep 17 00:00:00 2001
From: Mauricio Faria de Oliveira <mfo@canonical.com>
Date: Thu, 14 Jul 2022 21:58:31 +0800
Subject: [PATCH 281/674] mm: fix race between MADV_FREE reclaim and blkdev
 direct IO read

stable inclusion
from stable-v5.10.111
commit 3c3fbfa6dddb022e91be18902c3785a82aa4867d
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=3c3fbfa6dddb022e91be18902c3785a82aa4867d

--------------------------------

commit 6c8e2a256915a223f6289f651d6b926cd7135c9e upstream.

Problem:
Reviewed-by: Wei Li <liwei391@huawei.com>

=======

Userspace might read the zero-page instead of actual data from a direct IO
read on a block device if the buffers have been called madvise(MADV_FREE)
on earlier (this is discussed below) due to a race between page reclaim on
MADV_FREE and blkdev direct IO read.

- Race condition:
  ==============

During page reclaim, the MADV_FREE page check in try_to_unmap_one() checks
if the page is not dirty, then discards its rmap PTE(s) (vs.  remap back
if the page is dirty).

However, after try_to_unmap_one() returns to shrink_page_list(), it might
keep the page _anyway_ if page_ref_freeze() fails (it expects exactly
_one_ page reference, from the isolation for page reclaim).

Well, blkdev_direct_IO() gets references for all pages, and on READ
operations it only sets them dirty _later_.

So, if MADV_FREE'd pages (i.e., not dirty) are used as buffers for direct
IO read from block devices, and page reclaim happens during
__blkdev_direct_IO[_simple]() exactly AFTER bio_iov_iter_get_pages()
returns, but BEFORE the pages are set dirty, the situation happens.

The direct IO read eventually completes.  Now, when userspace reads the
buffers, the PTE is no longer there and the page fault handler
do_anonymous_page() services that with the zero-page, NOT the data!

A synthetic reproducer is provided.

- Page faults:
  ===========

If page reclaim happens BEFORE bio_iov_iter_get_pages() the issue doesn't
happen, because that faults-in all pages as writeable, so
do_anonymous_page() sets up a new page/rmap/PTE, and that is used by
direct IO.  The userspace reads don't fault as the PTE is there (thus
zero-page is not used/setup).

But if page reclaim happens AFTER it / BEFORE setting pages dirty, the PTE
is no longer there; the subsequent page faults can't help:

The data-read from the block device probably won't generate faults due to
DMA (no MMU) but even in the case it wouldn't use DMA, that happens on
different virtual addresses (not user-mapped addresses) because `struct
bio_vec` stores `struct page` to figure addresses out (which are different
from user-mapped addresses) for the read.

Thus userspace reads (to user-mapped addresses) still fault, then
do_anonymous_page() gets another `struct page` that would address/ map to
other memory than the `struct page` used by `struct bio_vec` for the read.
(The original `struct page` is not available, since it wasn't freed, as
page_ref_freeze() failed due to more page refs.  And even if it were
available, its data cannot be trusted anymore.)

Solution:
========

One solution is to check for the expected page reference count in
try_to_unmap_one().

There should be one reference from the isolation (that is also checked in
shrink_page_list() with page_ref_freeze()) plus one or more references
from page mapping(s) (put in discard: label).  Further references mean
that rmap/PTE cannot be unmapped/nuked.

(Note: there might be more than one reference from mapping due to
fork()/clone() without CLONE_VM, which use the same `struct page` for
references, until the copy-on-write page gets copied.)

So, additional page references (e.g., from direct IO read) now prevent the
rmap/PTE from being unmapped/dropped; similarly to the page is not freed
per shrink_page_list()/page_ref_freeze()).

- Races and Barriers:
  ==================

The new check in try_to_unmap_one() should be safe in races with
bio_iov_iter_get_pages() in get_user_pages() fast and slow paths, as it's
done under the PTE lock.

The fast path doesn't take the lock, but it checks if the PTE has changed
and if so, it drops the reference and leaves the page for the slow path
(which does take that lock).

The fast path requires synchronization w/ full memory barrier: it writes
the page reference count first then it reads the PTE later, while
try_to_unmap() writes PTE first then it reads page refcount.

And a second barrier is needed, as the page dirty flag should not be read
before the page reference count (as in __remove_mapping()).  (This can be
a load memory barrier only; no writes are involved.)

Call stack/comments:

- try_to_unmap_one()
  - page_vma_mapped_walk()
    - map_pte()			# see pte_offset_map_lock():
        pte_offset_map()
        spin_lock()

  - ptep_get_and_clear()	# write PTE
  - smp_mb()			# (new barrier) GUP fast path
  - page_ref_count()		# (new check) read refcount

  - page_vma_mapped_walk_done()	# see pte_unmap_unlock():
      pte_unmap()
      spin_unlock()

- bio_iov_iter_get_pages()
  - __bio_iov_iter_get_pages()
    - iov_iter_get_pages()
      - get_user_pages_fast()
        - internal_get_user_pages_fast()

          # fast path
          - lockless_pages_from_mm()
            - gup_{pgd,p4d,pud,pmd,pte}_range()
                ptep = pte_offset_map()		# not _lock()
                pte = ptep_get_lockless(ptep)

                page = pte_page(pte)
                try_grab_compound_head(page)	# inc refcount
                                            	# (RMW/barrier
                                             	#  on success)

                if (pte_val(pte) != pte_val(*ptep)) # read PTE
                        put_compound_head(page) # dec refcount
                        			# go slow path

          # slow path
          - __gup_longterm_unlocked()
            - get_user_pages_unlocked()
              - __get_user_pages_locked()
                - __get_user_pages()
                  - follow_{page,p4d,pud,pmd}_mask()
                    - follow_page_pte()
                        ptep = pte_offset_map_lock()
                        pte = *ptep
                        page = vm_normal_page(pte)
                        try_grab_page(page)	# inc refcount
                        pte_unmap_unlock()

- Huge Pages:
  ==========

Regarding transparent hugepages, that logic shouldn't change, as MADV_FREE
(aka lazyfree) pages are PageAnon() && !PageSwapBacked()
(madvise_free_pte_range() -> mark_page_lazyfree() -> lru_lazyfree_fn())
thus should reach shrink_page_list() -> split_huge_page_to_list() before
try_to_unmap[_one](), so it deals with normal pages only.

(And in case unlikely/TTU_SPLIT_HUGE_PMD/split_huge_pmd_address() happens,
which should not or be rare, the page refcount should be greater than
mapcount: the head page is referenced by tail pages.  That also prevents
checking the head `page` then incorrectly call page_remove_rmap(subpage)
for a tail page, that isn't even in the shrink_page_list()'s page_list (an
effect of split huge pmd/pmvw), as it might happen today in this unlikely
scenario.)

MADV_FREE'd buffers:
===================

So, back to the "if MADV_FREE pages are used as buffers" note.  The case
is arguable, and subject to multiple interpretations.

The madvise(2) manual page on the MADV_FREE advice value says:

1) 'After a successful MADV_FREE ... data will be lost when
   the kernel frees the pages.'
2) 'the free operation will be canceled if the caller writes
   into the page' / 'subsequent writes ... will succeed and
   then [the] kernel cannot free those dirtied pages'
3) 'If there is no subsequent write, the kernel can free the
   pages at any time.'

Thoughts, questions, considerations... respectively:

1) Since the kernel didn't actually free the page (page_ref_freeze()
   failed), should the data not have been lost? (on userspace read.)
2) Should writes performed by the direct IO read be able to cancel
   the free operation?
   - Should the direct IO read be considered as 'the caller' too,
     as it's been requested by 'the caller'?
   - Should the bio technique to dirty pages on return to userspace
     (bio_check_pages_dirty() is called/used by __blkdev_direct_IO())
     be considered in another/special way here?
3) Should an upcoming write from a previously requested direct IO
   read be considered as a subsequent write, so the kernel should
   not free the pages? (as it's known at the time of page reclaim.)

And lastly:

Technically, the last point would seem a reasonable consideration and
balance, as the madvise(2) manual page apparently (and fairly) seem to
assume that 'writes' are memory access from the userspace process (not
explicitly considering writes from the kernel or its corner cases; again,
fairly)..  plus the kernel fix implementation for the corner case of the
largely 'non-atomic write' encompassed by a direct IO read operation, is
relatively simple; and it helps.

Reproducer:
==========

@ test.c (simplified, but works)

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>
	#include <sys/mman.h>

	int main() {
		int fd, i;
		char *buf;

		fd = open(DEV, O_RDONLY | O_DIRECT);

		buf = mmap(NULL, BUF_SIZE, PROT_READ | PROT_WRITE,
                	   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		for (i = 0; i < BUF_SIZE; i += PAGE_SIZE)
			buf[i] = 1; // init to non-zero

		madvise(buf, BUF_SIZE, MADV_FREE);

		read(fd, buf, BUF_SIZE);

		for (i = 0; i < BUF_SIZE; i += PAGE_SIZE)
			printf("%p: 0x%x\n", &buf[i], buf[i]);

		return 0;
	}

@ block/fops.c (formerly fs/block_dev.c)

	+#include <linux/swap.h>
	...
	... __blkdev_direct_IO[_simple](...)
	{
	...
	+	if (!strcmp(current->comm, "good"))
	+		shrink_all_memory(ULONG_MAX);
	+
         	ret = bio_iov_iter_get_pages(...);
	+
	+	if (!strcmp(current->comm, "bad"))
	+		shrink_all_memory(ULONG_MAX);
	...
	}

@ shell

        # NUM_PAGES=4
        # PAGE_SIZE=$(getconf PAGE_SIZE)

        # yes | dd of=test.img bs=${PAGE_SIZE} count=${NUM_PAGES}
        # DEV=$(losetup -f --show test.img)

        # gcc -DDEV=\"$DEV\" \
              -DBUF_SIZE=$((PAGE_SIZE * NUM_PAGES)) \
              -DPAGE_SIZE=${PAGE_SIZE} \
               test.c -o test

        # od -tx1 $DEV
        0000000 79 0a 79 0a 79 0a 79 0a 79 0a 79 0a 79 0a 79 0a
        *
        0040000

        # mv test good
        # ./good
        0x7f7c10418000: 0x79
        0x7f7c10419000: 0x79
        0x7f7c1041a000: 0x79
        0x7f7c1041b000: 0x79

        # mv good bad
        # ./bad
        0x7fa1b8050000: 0x0
        0x7fa1b8051000: 0x0
        0x7fa1b8052000: 0x0
        0x7fa1b8053000: 0x0

Note: the issue is consistent on v5.17-rc3, but it's intermittent with the
support of MADV_FREE on v4.5 (60%-70% error; needs swap).  [wrap
do_direct_IO() in do_blockdev_direct_IO() @ fs/direct-io.c].

- v5.17-rc3:

        # for i in {1..1000}; do ./good; done \
            | cut -d: -f2 | sort | uniq -c
           4000  0x79

        # mv good bad
        # for i in {1..1000}; do ./bad; done \
            | cut -d: -f2 | sort | uniq -c
           4000  0x0

        # free | grep Swap
        Swap:             0           0           0

- v4.5:

        # for i in {1..1000}; do ./good; done \
            | cut -d: -f2 | sort | uniq -c
           4000  0x79

        # mv good bad
        # for i in {1..1000}; do ./bad; done \
            | cut -d: -f2 | sort | uniq -c
           2702  0x0
           1298  0x79

        # swapoff -av
        swapoff /swap

        # for i in {1..1000}; do ./bad; done \
            | cut -d: -f2 | sort | uniq -c
           4000  0x79

Ceph/TCMalloc:
=============

For documentation purposes, the use case driving the analysis/fix is Ceph
on Ubuntu 18.04, as the TCMalloc library there still uses MADV_FREE to
release unused memory to the system from the mmap'ed page heap (might be
committed back/used again; it's not munmap'ed.) - PageHeap::DecommitSpan()
-> TCMalloc_SystemRelease() -> madvise() - PageHeap::CommitSpan() ->
TCMalloc_SystemCommit() -> do nothing.

Note: TCMalloc switched back to MADV_DONTNEED a few commits after the
release in Ubuntu 18.04 (google-perftools/gperftools 2.5), so the issue
just 'disappeared' on Ceph on later Ubuntu releases but is still present
in the kernel, and can be hit by other use cases.

The observed issue seems to be the old Ceph bug #22464 [1], where checksum
mismatches are observed (and instrumentation with buffer dumps shows
zero-pages read from mmap'ed/MADV_FREE'd page ranges).

The issue in Ceph was reasonably deemed a kernel bug (comment #50) and
mostly worked around with a retry mechanism, but other parts of Ceph could
still hit that (rocksdb).  Anyway, it's less likely to be hit again as
TCMalloc switched out of MADV_FREE by default.

(Some kernel versions/reports from the Ceph bug, and relation with
the MADV_FREE introduction/changes; TCMalloc versions not checked.)
- 4.4 good
- 4.5 (madv_free: introduction)
- 4.9 bad
- 4.10 good? maybe a swapless system
- 4.12 (madv_free: no longer free instantly on swapless systems)
- 4.13 bad

[1] https://tracker.ceph.com/issues/22464

Thanks:
======

Several people contributed to analysis/discussions/tests/reproducers in
the first stages when drilling down on ceph/tcmalloc/linux kernel:

- Dan Hill
- Dan Streetman
- Dongdong Tao
- Gavin Guo
- Gerald Yang
- Heitor Alves de Siqueira
- Ioanna Alifieraki
- Jay Vosburgh
- Matthew Ruffell
- Ponnuvel Palaniyappan

Reviews, suggestions, corrections, comments:

- Minchan Kim
- Yu Zhao
- Huang, Ying
- John Hubbard
- Christoph Hellwig

[mfo@canonical.com: v4]
  Link: https://lkml.kernel.org/r/20220209202659.183418-1-mfo@canonical.comLink: https://lkml.kernel.org/r/20220131230255.789059-1-mfo@canonical.com

Fixes: 802a3a92ad7a ("mm: reclaim MADV_FREE pages")
Signed-off-by: Mauricio Faria de Oliveira <mfo@canonical.com>
Reviewed-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Yu Zhao <yuzhao@google.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Dan Hill <daniel.hill@canonical.com>
Cc: Dan Streetman <dan.streetman@canonical.com>
Cc: Dongdong Tao <dongdong.tao@canonical.com>
Cc: Gavin Guo <gavin.guo@canonical.com>
Cc: Gerald Yang <gerald.yang@canonical.com>
Cc: Heitor Alves de Siqueira <halves@canonical.com>
Cc: Ioanna Alifieraki <ioanna-maria.alifieraki@canonical.com>
Cc: Jay Vosburgh <jay.vosburgh@canonical.com>
Cc: Matthew Ruffell <matthew.ruffell@canonical.com>
Cc: Ponnuvel Palaniyappan <ponnuvel.palaniyappan@canonical.com>
Cc: <stable@vger.kernel.org>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
[mfo: backport: replace folio/test_flag with page/flag equivalents;
 real Fixes: 854e9ed09ded ("mm: support madvise(MADV_FREE)") in v4.]
Signed-off-by: Mauricio Faria de Oliveira <mfo@canonical.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 mm/rmap.c | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/mm/rmap.c b/mm/rmap.c
index a780862cd226..0dc39cf94345 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1657,7 +1657,30 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 
 			/* MADV_FREE page check */
 			if (!PageSwapBacked(page)) {
-				if (!PageDirty(page)) {
+				int ref_count, map_count;
+
+				/*
+				 * Synchronize with gup_pte_range():
+				 * - clear PTE; barrier; read refcount
+				 * - inc refcount; barrier; read PTE
+				 */
+				smp_mb();
+
+				ref_count = page_ref_count(page);
+				map_count = page_mapcount(page);
+
+				/*
+				 * Order reads for page refcount and dirty flag
+				 * (see comments in __remove_mapping()).
+				 */
+				smp_rmb();
+
+				/*
+				 * The only page refs must be one from isolation
+				 * plus the rmap(s) (dropped by discard:).
+				 */
+				if (ref_count == 1 + map_count &&
+				    !PageDirty(page)) {
 					/* Invalidate as we cleared the pte */
 					mmu_notifier_invalidate_range(mm,
 						address, address + PAGE_SIZE);
-- 
Gitee


From 91aae60ff67df941151687801cb887f9d4199f22 Mon Sep 17 00:00:00 2001
From: Sasha Levin <sashal@kernel.org>
Date: Thu, 14 Jul 2022 21:58:32 +0800
Subject: [PATCH 282/674] Revert "hv: utils: add PTP_1588_CLOCK to Kconfig to
 fix build"

stable inclusion
from stable-v5.10.111
commit 84e5dfc05f37bbb7203af4ec6bd18a17d37abacf
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=84e5dfc05f37bbb7203af4ec6bd18a17d37abacf

--------------------------------

This reverts commit c4dc584a2d4c8d74b054f09d67e0a076767bdee5.

On Sat, Apr 09, 2022 at 09:07:51AM -0700, Randy Dunlap wrote:
>According to https://bugzilla.kernel.org/show_bug.cgi?id=215823,
>c4dc584a2d4c8d74b054f09d67e0a076767bdee5 ("hv: utils: add PTP_1588_CLOCK to Kconfig to fix build")
>is a problem for 5.10 since CONFIG_PTP_1588_CLOCK_OPTIONAL does not exist in 5.10.
>This prevents the hyper-V NIC timestamping from working, so please revert that commit.

Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/hv/Kconfig | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig
index 210e532ac277..79e5356a737a 100644
--- a/drivers/hv/Kconfig
+++ b/drivers/hv/Kconfig
@@ -17,7 +17,6 @@ config HYPERV_TIMER
 config HYPERV_UTILS
 	tristate "Microsoft Hyper-V Utilities driver"
 	depends on HYPERV && CONNECTOR && NLS
-	depends on PTP_1588_CLOCK_OPTIONAL
 	help
 	  Select this option to enable the Hyper-V Utilities.
 
-- 
Gitee


From bff89179ea68b4c16d011072859f787411bd7ed5 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Thu, 14 Jul 2022 21:58:33 +0800
Subject: [PATCH 283/674] drm/amdgpu: fix off by one in
 amdgpu_gfx_kiq_acquire()

stable inclusion
from stable-v5.10.111
commit 0c122eb3a109356fb2c39f49c1768321248d0f1d
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=0c122eb3a109356fb2c39f49c1768321248d0f1d

--------------------------------

[ Upstream commit 1647b54ed55d4d48c7199d439f8834626576cbe9 ]

This post-op should be a pre-op so that we do not pass -1 as the bit
number to test_bit().  The current code will loop downwards from 63 to
-1.  After changing to a pre-op, it loops from 63 to 0.

Fixes: 71c37505e7ea ("drm/amdgpu/gfx: move more common KIQ code to amdgpu_gfx.c")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 9f9f55a2b257..f84582b70d0e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -263,7 +263,7 @@ static int amdgpu_gfx_kiq_acquire(struct amdgpu_device *adev,
 		    * adev->gfx.mec.num_pipe_per_mec
 		    * adev->gfx.mec.num_queue_per_pipe;
 
-	while (queue_bit-- >= 0) {
+	while (--queue_bit >= 0) {
 		if (test_bit(queue_bit, adev->gfx.mec.queue_bitmap))
 			continue;
 
-- 
Gitee


From 909b329794f18d8b24c4945d7d1b35b374698da3 Mon Sep 17 00:00:00 2001
From: "Guilherme G. Piccoli" <gpiccoli@igalia.com>
Date: Thu, 14 Jul 2022 21:58:34 +0800
Subject: [PATCH 284/674] Drivers: hv: vmbus: Fix potential crash on module
 unload

stable inclusion
from stable-v5.10.111
commit cf580d2e3884dbafd6b90269b03a24d661578624
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=cf580d2e3884dbafd6b90269b03a24d661578624

--------------------------------

[ Upstream commit 792f232d57ff28bbd5f9c4abe0466b23d5879dc8 ]

The vmbus driver relies on the panic notifier infrastructure to perform
some operations when a panic event is detected. Since vmbus can be built
as module, it is required that the driver handles both registering and
unregistering such panic notifier callback.

After commit 74347a99e73a ("x86/Hyper-V: Unload vmbus channel in hv panic callback")
though, the panic notifier registration is done unconditionally in the module
initialization routine whereas the unregistering procedure is conditionally
guarded and executes only if HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE capability
is set.

This patch fixes that by unconditionally unregistering the panic notifier
in the module's exit routine as well.

Fixes: 74347a99e73a ("x86/Hyper-V: Unload vmbus channel in hv panic callback")
Signed-off-by: Guilherme G. Piccoli <gpiccoli@igalia.com>
Reviewed-by: Michael Kelley <mikelley@microsoft.com>
Link: https://lore.kernel.org/r/20220315203535.682306-1-gpiccoli@igalia.com
Signed-off-by: Wei Liu <wei.liu@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/hv/vmbus_drv.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index 362da2a83b47..b9ac357e465d 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -2673,10 +2673,15 @@ static void __exit vmbus_exit(void)
 	if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) {
 		kmsg_dump_unregister(&hv_kmsg_dumper);
 		unregister_die_notifier(&hyperv_die_block);
-		atomic_notifier_chain_unregister(&panic_notifier_list,
-						 &hyperv_panic_block);
 	}
 
+	/*
+	 * The panic notifier is always registered, hence we should
+	 * also unconditionally unregister it here as well.
+	 */
+	atomic_notifier_chain_unregister(&panic_notifier_list,
+					 &hyperv_panic_block);
+
 	free_page((unsigned long)hv_panic_page);
 	unregister_sysctl_table(hv_ctl_table_hdr);
 	hv_ctl_table_hdr = NULL;
-- 
Gitee


From c10e11a7a9f2b24764288978c863bf15fb4869a7 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Thu, 14 Jul 2022 21:58:35 +0800
Subject: [PATCH 285/674] scsi: zorro7xx: Fix a resource leak in
 zorro7xx_remove_one()

stable inclusion
from stable-v5.10.111
commit c5f77b595379b5191316edd365a542f8b1526066
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=c5f77b595379b5191316edd365a542f8b1526066

--------------------------------

[ Upstream commit 16ed828b872d12ccba8f07bcc446ae89ba662f9c ]

The error handling path of the probe releases a resource that is not freed
in the remove function. In some cases, a ioremap() must be undone.

Add the missing iounmap() call in the remove function.

Link: https://lore.kernel.org/r/247066a3104d25f9a05de8b3270fc3c848763bcc.1647673264.git.christophe.jaillet@wanadoo.fr
Fixes: 45804fbb00ee ("[SCSI] 53c700: Amiga Zorro NCR53c710 SCSI")
Reviewed-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/scsi/zorro7xx.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/scsi/zorro7xx.c b/drivers/scsi/zorro7xx.c
index 27b9e2baab1a..7acf9193a9e8 100644
--- a/drivers/scsi/zorro7xx.c
+++ b/drivers/scsi/zorro7xx.c
@@ -159,6 +159,8 @@ static void zorro7xx_remove_one(struct zorro_dev *z)
 	scsi_remove_host(host);
 
 	NCR_700_release(host);
+	if (host->base > 0x01000000)
+		iounmap(hostdata->base);
 	kfree(hostdata);
 	free_irq(host->irq, host);
 	zorro_release_device(z);
-- 
Gitee


From 4052593d0d15a0ec3fe2b3fe381aebad8fd11299 Mon Sep 17 00:00:00 2001
From: Ivan Vecera <ivecera@redhat.com>
Date: Thu, 14 Jul 2022 21:58:37 +0800
Subject: [PATCH 286/674] ice: Clear default forwarding VSI during VSI release

stable inclusion
from stable-v5.10.111
commit 4be6ed03107b6e245f5ee0d9e273c868fda66154
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=4be6ed03107b6e245f5ee0d9e273c868fda66154

--------------------------------

[ Upstream commit bd8c624c0cd59de0032752ba3001c107bba97f7b ]

VSI is set as default forwarding one when promisc mode is set for
PF interface, when PF is switched to switchdev mode or when VF
driver asks to enable allmulticast or promisc mode for the VF
interface (when vf-true-promisc-support priv flag is off).
The third case is buggy because in that case VSI associated with
VF remains as default one after VF removal.

Reproducer:
1. Create VF
   echo 1 > sys/class/net/ens7f0/device/sriov_numvfs
2. Enable allmulticast or promisc mode on VF
   ip link set ens7f0v0 allmulticast on
   ip link set ens7f0v0 promisc on
3. Delete VF
   echo 0 > sys/class/net/ens7f0/device/sriov_numvfs
4. Try to enable promisc mode on PF
   ip link set ens7f0 promisc on

Although it looks that promisc mode on PF is enabled the opposite
is true because ice_vsi_sync_fltr() responsible for IFF_PROMISC
handling first checks if any other VSI is set as default forwarding
one and if so the function does not do anything. At this point
it is not possible to enable promisc mode on PF without re-probe
device.

To resolve the issue this patch clear default forwarding VSI
during ice_vsi_release() when the VSI to be released is the default
one.

Fixes: 01b5e89aab49 ("ice: Add VF promiscuous support")
Signed-off-by: Ivan Vecera <ivecera@redhat.com>
Reviewed-by: Michal Swiatkowski <michal.swiatkowski@linux.intel.com>
Reviewed-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Signed-off-by: Alice Michael <alice.michael@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/net/ethernet/intel/ice/ice_lib.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c
index 52ac6cc08e83..ec475353b620 100644
--- a/drivers/net/ethernet/intel/ice/ice_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_lib.c
@@ -2667,6 +2667,8 @@ int ice_vsi_release(struct ice_vsi *vsi)
 		}
 	}
 
+	if (ice_is_vsi_dflt_vsi(pf->first_sw, vsi))
+		ice_clear_dflt_vsi(pf->first_sw);
 	ice_fltr_remove_all(vsi);
 	ice_rm_vsi_lan_cfg(vsi->port_info, vsi->idx);
 	ice_vsi_delete(vsi);
-- 
Gitee


From 7aa1bfa2c655fbc9ecb88937a72def428e2a298a Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <razor@blackwall.org>
Date: Thu, 14 Jul 2022 21:58:38 +0800
Subject: [PATCH 287/674] net: ipv4: fix route with nexthop object delete
 warning

stable inclusion
from stable-v5.10.111
commit 63ea57478aaa3e06a597081a0f537318fc04e49f
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=63ea57478aaa3e06a597081a0f537318fc04e49f

--------------------------------

[ Upstream commit 6bf92d70e690b7ff12b24f4bfff5e5434d019b82 ]

FRR folks have hit a kernel warning[1] while deleting routes[2] which is
caused by trying to delete a route pointing to a nexthop id without
specifying nhid but matching on an interface. That is, a route is found
but we hit a warning while matching it. The warning is from
fib_info_nh() in include/net/nexthop.h because we run it on a fib_info
with nexthop object. The call chain is:
 inet_rtm_delroute -> fib_table_delete -> fib_nh_match (called with a
nexthop fib_info and also with fc_oif set thus calling fib_info_nh on
the fib_info and triggering the warning). The fix is to not do any
matching in that branch if the fi has a nexthop object because those are
managed separately. I.e. we should match when deleting without nh spec and
should fail when deleting a nexthop route with old-style nh spec because
nexthop objects are managed separately, e.g.:
 $ ip r show 1.2.3.4/32
 1.2.3.4 nhid 12 via 192.168.11.2 dev dummy0

 $ ip r del 1.2.3.4/32
 $ ip r del 1.2.3.4/32 nhid 12
 <both should work>

 $ ip r del 1.2.3.4/32 dev dummy0
 <should fail with ESRCH>

[1]
 [  523.462226] ------------[ cut here ]------------
 [  523.462230] WARNING: CPU: 14 PID: 22893 at include/net/nexthop.h:468 fib_nh_match+0x210/0x460
 [  523.462236] Modules linked in: dummy rpcsec_gss_krb5 xt_socket nf_socket_ipv4 nf_socket_ipv6 ip6table_raw iptable_raw bpf_preload xt_statistic ip_set ip_vs_sh ip_vs_wrr ip_vs_rr ip_vs xt_mark nf_tables xt_nat veth nf_conntrack_netlink nfnetlink xt_addrtype br_netfilter overlay dm_crypt nfsv3 nfs fscache netfs vhost_net vhost vhost_iotlb tap tun xt_CHECKSUM xt_MASQUERADE xt_conntrack 8021q garp mrp ipt_REJECT nf_reject_ipv4 ip6table_mangle ip6table_nat iptable_mangle iptable_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 iptable_filter bridge stp llc rfcomm snd_seq_dummy snd_hrtimer rpcrdma rdma_cm iw_cm ib_cm ib_core ip6table_filter xt_comment ip6_tables vboxnetadp(OE) vboxnetflt(OE) vboxdrv(OE) qrtr bnep binfmt_misc xfs vfat fat squashfs loop nvidia_drm(POE) nvidia_modeset(POE) nvidia_uvm(POE) nvidia(POE) intel_rapl_msr intel_rapl_common snd_hda_codec_realtek snd_hda_codec_generic ledtrig_audio snd_hda_codec_hdmi btusb btrtl iwlmvm uvcvideo btbcm snd_hda_intel edac_mce_amd
 [  523.462274]  videobuf2_vmalloc videobuf2_memops btintel snd_intel_dspcfg videobuf2_v4l2 snd_intel_sdw_acpi bluetooth snd_usb_audio snd_hda_codec mac80211 snd_usbmidi_lib joydev snd_hda_core videobuf2_common kvm_amd snd_rawmidi snd_hwdep snd_seq videodev ccp snd_seq_device libarc4 ecdh_generic mc snd_pcm kvm iwlwifi snd_timer drm_kms_helper snd cfg80211 cec soundcore irqbypass rapl wmi_bmof i2c_piix4 rfkill k10temp pcspkr acpi_cpufreq nfsd auth_rpcgss nfs_acl lockd grace sunrpc drm zram ip_tables crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel nvme sp5100_tco r8169 nvme_core wmi ipmi_devintf ipmi_msghandler fuse
 [  523.462300] CPU: 14 PID: 22893 Comm: ip Tainted: P           OE     5.16.18-200.fc35.x86_64 #1
 [  523.462302] Hardware name: Micro-Star International Co., Ltd. MS-7C37/MPG X570 GAMING EDGE WIFI (MS-7C37), BIOS 1.C0 10/29/2020
 [  523.462303] RIP: 0010:fib_nh_match+0x210/0x460
 [  523.462304] Code: 7c 24 20 48 8b b5 90 00 00 00 e8 bb ee f4 ff 48 8b 7c 24 20 41 89 c4 e8 ee eb f4 ff 45 85 e4 0f 85 2e fe ff ff e9 4c ff ff ff <0f> 0b e9 17 ff ff ff 3c 0a 0f 85 61 fe ff ff 48 8b b5 98 00 00 00
 [  523.462306] RSP: 0018:ffffaa53d4d87928 EFLAGS: 00010286
 [  523.462307] RAX: 0000000000000000 RBX: ffffaa53d4d87a90 RCX: ffffaa53d4d87bb0
 [  523.462308] RDX: ffff9e3d2ee6be80 RSI: ffffaa53d4d87a90 RDI: ffffffff920ed380
 [  523.462309] RBP: ffff9e3d2ee6be80 R08: 0000000000000064 R09: 0000000000000000
 [  523.462310] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000031
 [  523.462310] R13: 0000000000000020 R14: 0000000000000000 R15: ffff9e3d331054e0
 [  523.462311] FS:  00007f245517c1c0(0000) GS:ffff9e492ed80000(0000) knlGS:0000000000000000
 [  523.462313] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 [  523.462313] CR2: 000055e5dfdd8268 CR3: 00000003ef488000 CR4: 0000000000350ee0
 [  523.462315] Call Trace:
 [  523.462316]  <TASK>
 [  523.462320]  fib_table_delete+0x1a9/0x310
 [  523.462323]  inet_rtm_delroute+0x93/0x110
 [  523.462325]  rtnetlink_rcv_msg+0x133/0x370
 [  523.462327]  ? _copy_to_iter+0xb5/0x6f0
 [  523.462330]  ? rtnl_calcit.isra.0+0x110/0x110
 [  523.462331]  netlink_rcv_skb+0x50/0xf0
 [  523.462334]  netlink_unicast+0x211/0x330
 [  523.462336]  netlink_sendmsg+0x23f/0x480
 [  523.462338]  sock_sendmsg+0x5e/0x60
 [  523.462340]  ____sys_sendmsg+0x22c/0x270
 [  523.462341]  ? import_iovec+0x17/0x20
 [  523.462343]  ? sendmsg_copy_msghdr+0x59/0x90
 [  523.462344]  ? __mod_lruvec_page_state+0x85/0x110
 [  523.462348]  ___sys_sendmsg+0x81/0xc0
 [  523.462350]  ? netlink_seq_start+0x70/0x70
 [  523.462352]  ? __dentry_kill+0x13a/0x180
 [  523.462354]  ? __fput+0xff/0x250
 [  523.462356]  __sys_sendmsg+0x49/0x80
 [  523.462358]  do_syscall_64+0x3b/0x90
 [  523.462361]  entry_SYSCALL_64_after_hwframe+0x44/0xae
 [  523.462364] RIP: 0033:0x7f24552aa337
 [  523.462365] Code: 0e 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b9 0f 1f 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 89 54 24 1c 48 89 74 24 10
 [  523.462366] RSP: 002b:00007fff7f05a838 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
 [  523.462368] RAX: ffffffffffffffda RBX: 000000006245bf91 RCX: 00007f24552aa337
 [  523.462368] RDX: 0000000000000000 RSI: 00007fff7f05a8a0 RDI: 0000000000000003
 [  523.462369] RBP: 0000000000000000 R08: 0000000000000001 R09: 0000000000000000
 [  523.462370] R10: 0000000000000008 R11: 0000000000000246 R12: 0000000000000001
 [  523.462370] R13: 00007fff7f05ce08 R14: 0000000000000000 R15: 000055e5dfdd1040
 [  523.462373]  </TASK>
 [  523.462374] ---[ end trace ba537bc16f6bf4ed ]---

[2] https://github.com/FRRouting/frr/issues/6412

Fixes: 4c7e8084fd46 ("ipv4: Plumb support for nexthop object in a fib_info")
Signed-off-by: Nikolay Aleksandrov <razor@blackwall.org>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 net/ipv4/fib_semantics.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 838a876c168c..c8c7b76c3b2e 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -888,8 +888,13 @@ int fib_nh_match(struct net *net, struct fib_config *cfg, struct fib_info *fi,
 	}
 
 	if (cfg->fc_oif || cfg->fc_gw_family) {
-		struct fib_nh *nh = fib_info_nh(fi, 0);
+		struct fib_nh *nh;
+
+		/* cannot match on nexthop object attributes */
+		if (fi->nh)
+			return 1;
 
+		nh = fib_info_nh(fi, 0);
 		if (cfg->fc_encap) {
 			if (fib_encap_match(net, cfg->fc_encap_type,
 					    cfg->fc_encap, nh, cfg, extack))
-- 
Gitee


From 1ecc903f99b69361d17f2cc8bcb6c1cd35fcddd7 Mon Sep 17 00:00:00 2001
From: Chen-Yu Tsai <wens@csie.org>
Date: Thu, 14 Jul 2022 21:58:39 +0800
Subject: [PATCH 288/674] net: stmmac: Fix unset max_speed difference between
 DT and non-DT platforms

stable inclusion
from stable-v5.10.111
commit 02ab4abe5bbf60f6d0e020a974e0c40e42984a1a
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=02ab4abe5bbf60f6d0e020a974e0c40e42984a1a

--------------------------------

[ Upstream commit c21cabb0fd0b54b8b54235fc1ecfe1195a23bcb2 ]

In commit 9cbadf094d9d ("net: stmmac: support max-speed device tree
property"), when DT platforms don't set "max-speed", max_speed is set to
-1; for non-DT platforms, it stays the default 0.

Prior to commit eeef2f6b9f6e ("net: stmmac: Start adding phylink support"),
the check for a valid max_speed setting was to check if it was greater
than zero. This commit got it right, but subsequent patches just checked
for non-zero, which is incorrect for DT platforms.

In commit 92c3807b9ac3 ("net: stmmac: convert to phylink_get_linkmodes()")
the conversion switched completely to checking for non-zero value as a
valid value, which caused 1000base-T to stop getting advertised by
default.

Instead of trying to fix all the checks, simply leave max_speed alone if
DT property parsing fails.

Fixes: 9cbadf094d9d ("net: stmmac: support max-speed device tree property")
Fixes: 92c3807b9ac3 ("net: stmmac: convert to phylink_get_linkmodes()")
Signed-off-by: Chen-Yu Tsai <wens@csie.org>
Acked-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Link: https://lore.kernel.org/r/20220331184832.16316-1-wens@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
index 3183d8826981..b40b962055fa 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
@@ -432,8 +432,7 @@ stmmac_probe_config_dt(struct platform_device *pdev, const char **mac)
 	plat->phylink_node = np;
 
 	/* Get max speed of operation from device tree */
-	if (of_property_read_u32(np, "max-speed", &plat->max_speed))
-		plat->max_speed = -1;
+	of_property_read_u32(np, "max-speed", &plat->max_speed);
 
 	plat->bus_id = of_alias_get_id(np, "ethernet");
 	if (plat->bus_id < 0)
-- 
Gitee


From ac294abfb19a7df6a86f4b6dd087e1d3cc9bd3cd Mon Sep 17 00:00:00 2001
From: Jiasheng Jiang <jiasheng@iscas.ac.cn>
Date: Thu, 14 Jul 2022 21:58:40 +0800
Subject: [PATCH 289/674] drm/imx: imx-ldb: Check for null pointer after
 calling kmemdup

stable inclusion
from stable-v5.10.111
commit 25bc9fd4c8d13036c0656a00ebedc2cff4650a91
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=25bc9fd4c8d13036c0656a00ebedc2cff4650a91

--------------------------------

[ Upstream commit 8027a9ad9b3568c5eb49c968ad6c97f279d76730 ]

As the possible failure of the allocation, kmemdup() may return NULL
pointer.
Therefore, it should be better to check the return value of kmemdup()
and return error if fails.

Fixes: dc80d7038883 ("drm/imx-ldb: Add support to drm-bridge")
Signed-off-by: Jiasheng Jiang <jiasheng@iscas.ac.cn>
Signed-off-by: Philipp Zabel <p.zabel@pengutronix.de>
Link: https://lore.kernel.org/r/20220105074729.2363657-1-jiasheng@iscas.ac.cn
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/gpu/drm/imx/imx-ldb.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/imx/imx-ldb.c b/drivers/gpu/drm/imx/imx-ldb.c
index 75036aaa0c63..efd13e533726 100644
--- a/drivers/gpu/drm/imx/imx-ldb.c
+++ b/drivers/gpu/drm/imx/imx-ldb.c
@@ -553,6 +553,8 @@ static int imx_ldb_panel_ddc(struct device *dev,
 		edidp = of_get_property(child, "edid", &edid_len);
 		if (edidp) {
 			channel->edid = kmemdup(edidp, edid_len, GFP_KERNEL);
+			if (!channel->edid)
+				return -ENOMEM;
 		} else if (!channel->panel) {
 			/* fallback to display-timings node */
 			ret = of_get_drm_display_mode(child,
-- 
Gitee


From 861e0d903aff3a0c3188dc0c96d70c289b00a0b3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20Exp=C3=B3sito?= <jose.exposito89@gmail.com>
Date: Thu, 14 Jul 2022 21:58:41 +0800
Subject: [PATCH 290/674] drm/imx: Fix memory leak in
 imx_pd_connector_get_modes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

stable inclusion
from stable-v5.10.111
commit f8b0ef0a5889890b50482b6593bd8de544736351
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f8b0ef0a5889890b50482b6593bd8de544736351

--------------------------------

[ Upstream commit bce81feb03a20fca7bbdd1c4af16b4e9d5c0e1d3 ]

Avoid leaking the display mode variable if of_get_drm_display_mode
fails.

Fixes: 76ecd9c9fb24 ("drm/imx: parallel-display: check return code from of_get_drm_display_mode()")
Addresses-Coverity-ID: 1443943 ("Resource leak")
Signed-off-by: José Expósito <jose.exposito89@gmail.com>
Signed-off-by: Philipp Zabel <p.zabel@pengutronix.de>
Link: https://lore.kernel.org/r/20220108165230.44610-1-jose.exposito89@gmail.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/gpu/drm/imx/parallel-display.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/imx/parallel-display.c b/drivers/gpu/drm/imx/parallel-display.c
index 605ac8825a59..b61bfa84b6bb 100644
--- a/drivers/gpu/drm/imx/parallel-display.c
+++ b/drivers/gpu/drm/imx/parallel-display.c
@@ -70,8 +70,10 @@ static int imx_pd_connector_get_modes(struct drm_connector *connector)
 		ret = of_get_drm_display_mode(np, &imxpd->mode,
 					      &imxpd->bus_flags,
 					      OF_USE_NATIVE_MODE);
-		if (ret)
+		if (ret) {
+			drm_mode_destroy(connector->dev, mode);
 			return ret;
+		}
 
 		drm_mode_copy(mode, &imxpd->mode);
 		mode->type |= DRM_MODE_TYPE_DRIVER | DRM_MODE_TYPE_PREFERRED,
-- 
Gitee


From d1a5b626a1bf101918afbaf9f70e9316144c8a4f Mon Sep 17 00:00:00 2001
From: Andy Gospodarek <gospo@broadcom.com>
Date: Thu, 14 Jul 2022 21:58:42 +0800
Subject: [PATCH 291/674] bnxt_en: reserve space inside receive page for
 skb_shared_info

stable inclusion
from stable-v5.10.111
commit 0ac74169ebc34686a6ef2a033473a4913aa3e471
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=0ac74169ebc34686a6ef2a033473a4913aa3e471

--------------------------------

[ Upstream commit facc173cf700e55b2ad249ecbd3a7537f7315691 ]

Insufficient space was being reserved in the page used for packet
reception, so the interface MTU could be set too large to still have
room for the contents of the packet when doing XDP redirect.  This
resulted in the following message when redirecting a packet between
3520 and 3822 bytes with an MTU of 3822:

[311815.561880] XDP_WARN: xdp_update_frame_from_buff(line:200): Driver BUG: missing reserved tailroom

Fixes: f18c2b77b2e4 ("bnxt_en: optimized XDP_REDIRECT support")
Reviewed-by: Somnath Kotur <somnath.kotur@broadcom.com>
Reviewed-by: Pavan Chebbi <pavan.chebbi@broadcom.com>
Signed-off-by: Andy Gospodarek <gospo@broadcom.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 92f9f7f5240b..34affd1de91d 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -569,7 +569,8 @@ struct nqe_cn {
 #define BNXT_MAX_MTU		9500
 #define BNXT_MAX_PAGE_MODE_MTU	\
 	((unsigned int)PAGE_SIZE - VLAN_ETH_HLEN - NET_IP_ALIGN -	\
-	 XDP_PACKET_HEADROOM)
+	 XDP_PACKET_HEADROOM - \
+	 SKB_DATA_ALIGN((unsigned int)sizeof(struct skb_shared_info)))
 
 #define BNXT_MIN_PKT_SIZE	52
 
-- 
Gitee


From 4923c9da128bc0c53564b384ddfb181fbd92d301 Mon Sep 17 00:00:00 2001
From: Martin Habets <habetsm.xilinx@gmail.com>
Date: Thu, 14 Jul 2022 21:58:43 +0800
Subject: [PATCH 292/674] sfc: Do not free an empty page_ring

stable inclusion
from stable-v5.10.111
commit d8992b393f974b47c4a72b9bacdbfc4110e29eea
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=d8992b393f974b47c4a72b9bacdbfc4110e29eea

--------------------------------

[ Upstream commit 458f5d92df4807e2a7c803ed928369129996bf96 ]

When the page_ring is not used page_ptr_mask is 0.
Do not dereference page_ring[0] in this case.

Fixes: 2768935a4660 ("sfc: reuse pages to avoid DMA mapping/unmapping costs")
Reported-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: Martin Habets <habetsm.xilinx@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/net/ethernet/sfc/rx_common.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/sfc/rx_common.c b/drivers/net/ethernet/sfc/rx_common.c
index e423b17e2a14..2c09afac5beb 100644
--- a/drivers/net/ethernet/sfc/rx_common.c
+++ b/drivers/net/ethernet/sfc/rx_common.c
@@ -166,6 +166,9 @@ static void efx_fini_rx_recycle_ring(struct efx_rx_queue *rx_queue)
 	struct efx_nic *efx = rx_queue->efx;
 	int i;
 
+	if (unlikely(!rx_queue->page_ring))
+		return;
+
 	/* Unmap and release the pages in the recycle ring. Remove the ring. */
 	for (i = 0; i <= rx_queue->page_ptr_mask; i++) {
 		struct page *page = rx_queue->page_ring[i];
-- 
Gitee


From 3889021d68a68371300974e5d708fd11640b4723 Mon Sep 17 00:00:00 2001
From: Aharon Landau <aharonl@nvidia.com>
Date: Thu, 14 Jul 2022 21:58:44 +0800
Subject: [PATCH 293/674] RDMA/mlx5: Don't remove cache MRs when a delay is
 needed

stable inclusion
from stable-v5.10.111
commit 3a57babfb6e9871b961e3c36e616af87df4ef1e8
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=3a57babfb6e9871b961e3c36e616af87df4ef1e8

--------------------------------

[ Upstream commit 84c2362fb65d69c721fec0974556378cbb36a62b ]

Don't remove MRs from the cache if need to delay the removal.

Fixes: b9358bdbc713 ("RDMA/mlx5: Fix locking in MR cache work queue")
Link: https://lore.kernel.org/r/c3087a90ff362c8796c7eaa2715128743ce36722.1649062436.git.leonro@nvidia.com
Signed-off-by: Aharon Landau <aharonl@nvidia.com>
Reviewed-by: Shay Drory <shayd@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/infiniband/hw/mlx5/mr.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 6cd0cbd4fc9f..d827a4e44c94 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -531,8 +531,10 @@ static void __cache_work_func(struct mlx5_cache_ent *ent)
 		spin_lock_irq(&ent->lock);
 		if (ent->disabled)
 			goto out;
-		if (need_delay)
+		if (need_delay) {
 			queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ);
+			goto out;
+		}
 		remove_cache_mr_locked(ent);
 		queue_adjust_cache_locked(ent);
 	}
-- 
Gitee


From d087ba14ae40e0ffd311707621765bc33ca78a6b Mon Sep 17 00:00:00 2001
From: Niels Dossche <dossche.niels@gmail.com>
Date: Thu, 14 Jul 2022 21:58:45 +0800
Subject: [PATCH 294/674] IB/rdmavt: add lock to call to rvt_error_qp to
 prevent a race condition

stable inclusion
from stable-v5.10.111
commit 43c2d7890ecabe527448a6c391fb2d9a5e6bbfe0
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=43c2d7890ecabe527448a6c391fb2d9a5e6bbfe0

--------------------------------

[ Upstream commit 4d809f69695d4e7d1378b3a072fa9aef23123018 ]

The documentation of the function rvt_error_qp says both r_lock and s_lock
need to be held when calling that function.  It also asserts using lockdep
that both of those locks are held.  However, the commit I referenced in
Fixes accidentally makes the call to rvt_error_qp in rvt_ruc_loopback no
longer covered by r_lock.  This results in the lockdep assertion failing
and also possibly in a race condition.

Fixes: d757c60eca9b ("IB/rdmavt: Fix concurrency panics in QP post_send and modify to error")
Link: https://lore.kernel.org/r/20220228165330.41546-1-dossche.niels@gmail.com
Signed-off-by: Niels Dossche <dossche.niels@gmail.com>
Acked-by: Dennis Dalessandro <dennis.dalessandro@cornelisnetworks.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/infiniband/sw/rdmavt/qp.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c
index 09f0dbf941c0..d8d52a00a1be 100644
--- a/drivers/infiniband/sw/rdmavt/qp.c
+++ b/drivers/infiniband/sw/rdmavt/qp.c
@@ -3241,7 +3241,11 @@ void rvt_ruc_loopback(struct rvt_qp *sqp)
 	spin_lock_irqsave(&sqp->s_lock, flags);
 	rvt_send_complete(sqp, wqe, send_status);
 	if (sqp->ibqp.qp_type == IB_QPT_RC) {
-		int lastwqe = rvt_error_qp(sqp, IB_WC_WR_FLUSH_ERR);
+		int lastwqe;
+
+		spin_lock(&sqp->r_lock);
+		lastwqe = rvt_error_qp(sqp, IB_WC_WR_FLUSH_ERR);
+		spin_unlock(&sqp->r_lock);
 
 		sqp->s_flags &= ~RVT_S_BUSY;
 		spin_unlock_irqrestore(&sqp->s_lock, flags);
-- 
Gitee


From a18d2070ed2ca064484e6153586083e57b6370d9 Mon Sep 17 00:00:00 2001
From: Miaoqian Lin <linmq006@gmail.com>
Date: Thu, 14 Jul 2022 21:58:46 +0800
Subject: [PATCH 295/674] dpaa2-ptp: Fix refcount leak in dpaa2_ptp_probe

stable inclusion
from stable-v5.10.111
commit ebd1e3458dbf6643aa0272da398cb5b537d492b7
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=ebd1e3458dbf6643aa0272da398cb5b537d492b7

--------------------------------

[ Upstream commit 2b04bd4f03bba021959ca339314f6739710f0954 ]

This node pointer is returned by of_find_compatible_node() with
refcount incremented. Calling of_node_put() to aovid the refcount leak.

Fixes: d346c9e86d86 ("dpaa2-ptp: reuse ptp_qoriq driver")
Signed-off-by: Miaoqian Lin <linmq006@gmail.com>
Link: https://lore.kernel.org/r/20220404125336.13427-1-linmq006@gmail.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/net/ethernet/freescale/dpaa2/dpaa2-ptp.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-ptp.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-ptp.c
index 32b5faa87bb8..208a3459f2e2 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-ptp.c
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-ptp.c
@@ -168,7 +168,7 @@ static int dpaa2_ptp_probe(struct fsl_mc_device *mc_dev)
 	base = of_iomap(node, 0);
 	if (!base) {
 		err = -ENOMEM;
-		goto err_close;
+		goto err_put;
 	}
 
 	err = fsl_mc_allocate_irqs(mc_dev);
@@ -212,6 +212,8 @@ static int dpaa2_ptp_probe(struct fsl_mc_device *mc_dev)
 	fsl_mc_free_irqs(mc_dev);
 err_unmap:
 	iounmap(base);
+err_put:
+	of_node_put(node);
 err_close:
 	dprtc_close(mc_dev->mc_io, 0, mc_dev->mc_handle);
 err_free_mcp:
-- 
Gitee


From c5bafbbd9e3c4d96de225b74ad46a2102633219a Mon Sep 17 00:00:00 2001
From: Anatolii Gerasymenko <anatolii.gerasymenko@intel.com>
Date: Thu, 14 Jul 2022 21:58:47 +0800
Subject: [PATCH 296/674] ice: Set txq_teid to ICE_INVAL_TEID on ring creation

stable inclusion
from stable-v5.10.111
commit b003fc4913ea79c3c1458879acb474c66240489a
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=b003fc4913ea79c3c1458879acb474c66240489a

--------------------------------

[ Upstream commit ccfee1822042b87e5135d33cad8ea353e64612d2 ]

When VF is freshly created, but not brought up, ring->txq_teid
value is by default set to 0.
But 0 is a valid TEID. On some platforms the Root Node of
Tx scheduler has a TEID = 0. This can cause issues as shown below.

The proper way is to set ring->txq_teid to ICE_INVAL_TEID (0xFFFFFFFF).

Testing Hints:
echo 1 > /sys/class/net/ens785f0/device/sriov_numvfs
ip link set dev ens785f0v0 up
ip link set dev ens785f0v0 down

If we have freshly created VF and quickly turn it on and off, so there
would be no time to reach VIRTCHNL_OP_CONFIG_VSI_QUEUES stage, then
VIRTCHNL_OP_DISABLE_QUEUES stage will fail with error:
[  639.531454] disable queue 89 failed 14
[  639.532233] Failed to disable LAN Tx queues, error: ICE_ERR_AQ_ERROR
[  639.533107] ice 0000:02:00.0: Failed to stop Tx ring 0 on VSI 5

The reason for the fail is that we are trying to send AQ command to
delete queue 89, which has never been created and receive an "invalid
argument" error from firmware.

As this queue has never been created, it's teid and ring->txq_teid
have default value 0.
ice_dis_vsi_txq has a check against non-existent queues:

node = ice_sched_find_node_by_teid(pi->root, q_teids[i]);
if (!node)
	continue;

But on some platforms the Root Node of Tx scheduler has a teid = 0.
Hence, ice_sched_find_node_by_teid finds a node with teid = 0 (it is
pi->root), and we go further to submit an erroneous request to firmware.

Fixes: 37bb83901286 ("ice: Move common functions out of ice_main.c part 7/7")
Signed-off-by: Anatolii Gerasymenko <anatolii.gerasymenko@intel.com>
Reviewed-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Tested-by: Konrad Jankowski <konrad0.jankowski@intel.com>
Signed-off-by: Alice Michael <alice.michael@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/net/ethernet/intel/ice/ice_lib.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c
index ec475353b620..ea8d868c8f30 100644
--- a/drivers/net/ethernet/intel/ice/ice_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_lib.c
@@ -1265,6 +1265,7 @@ static int ice_vsi_alloc_rings(struct ice_vsi *vsi)
 		ring->vsi = vsi;
 		ring->dev = dev;
 		ring->count = vsi->num_tx_desc;
+		ring->txq_teid = ICE_INVAL_TEID;
 		WRITE_ONCE(vsi->tx_rings[i], ring);
 	}
 
-- 
Gitee


From 9708464859b169e7fbc5ad7cb53c49d137ab4902 Mon Sep 17 00:00:00 2001
From: Anatolii Gerasymenko <anatolii.gerasymenko@intel.com>
Date: Thu, 14 Jul 2022 21:58:48 +0800
Subject: [PATCH 297/674] ice: Do not skip not enabled queues in
 ice_vc_dis_qs_msg

stable inclusion
from stable-v5.10.111
commit ffce126c952e0b849cf280e0344bc087adc7256c
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=ffce126c952e0b849cf280e0344bc087adc7256c

--------------------------------

[ Upstream commit 05ef6813b234db3196f083b91db3963f040b65bb ]

Disable check for queue being enabled in ice_vc_dis_qs_msg, because
there could be a case when queues were created, but were not enabled.
We still need to delete those queues.

Normal workflow for VF looks like:
Enable path:
VIRTCHNL_OP_ADD_ETH_ADDR (opcode 10)
VIRTCHNL_OP_CONFIG_VSI_QUEUES (opcode 6)
VIRTCHNL_OP_ENABLE_QUEUES (opcode 8)

Disable path:
VIRTCHNL_OP_DISABLE_QUEUES (opcode 9)
VIRTCHNL_OP_DEL_ETH_ADDR (opcode 11)

The issue appears only in stress conditions when VF is enabled and
disabled very fast.
Eventually there will be a case, when queues are created by
VIRTCHNL_OP_CONFIG_VSI_QUEUES, but are not enabled by
VIRTCHNL_OP_ENABLE_QUEUES.
In turn, these queues are not deleted by VIRTCHNL_OP_DISABLE_QUEUES,
because there is a check whether queues are enabled in
ice_vc_dis_qs_msg.

When we bring up the VF again, we will see the "Failed to set LAN Tx queue
context" error during VIRTCHNL_OP_CONFIG_VSI_QUEUES step. This
happens because old 16 queues were not deleted and VF requests to create
16 more, but ice_sched_get_free_qparent in ice_ena_vsi_txq would fail to
find a parent node for first newly requested queue (because all nodes
are allocated to 16 old queues).

Testing Hints:

Just enable and disable VF fast enough, so it would be disabled before
reaching VIRTCHNL_OP_ENABLE_QUEUES.

while true; do
        ip link set dev ens785f0v0 up
        sleep 0.065 # adjust delay value for you machine
        ip link set dev ens785f0v0 down
done

Fixes: 77ca27c41705 ("ice: add support for virtchnl_queue_select.[tx|rx]_queues bitmap")
Signed-off-by: Anatolii Gerasymenko <anatolii.gerasymenko@intel.com>
Tested-by: Konrad Jankowski <konrad0.jankowski@intel.com>
Signed-off-by: Alice Michael <alice.michael@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
index 5134342ff70f..a980d337861d 100644
--- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
@@ -2723,9 +2723,9 @@ static int ice_vc_dis_qs_msg(struct ice_vf *vf, u8 *msg)
 				goto error_param;
 			}
 
-			/* Skip queue if not enabled */
 			if (!test_bit(vf_q_id, vf->txq_ena))
-				continue;
+				dev_dbg(ice_pf_to_dev(vsi->back), "Queue %u on VSI %u is not enabled, but stopping it anyway\n",
+					vf_q_id, vsi->vsi_num);
 
 			ice_fill_txq_meta(vsi, ring, &txq_meta);
 
-- 
Gitee


From a78dd66e9269c807cb1e8f792587597928f4ba26 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@kernel.org>
Date: Thu, 14 Jul 2022 21:58:49 +0800
Subject: [PATCH 298/674] ipv6: Fix stats accounting in ip6_pkt_drop

stable inclusion
from stable-v5.10.111
commit e3dd1202ab2ef81a69dbc20e18945b369dddea81
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=e3dd1202ab2ef81a69dbc20e18945b369dddea81

--------------------------------

[ Upstream commit 1158f79f82d437093aeed87d57df0548bdd68146 ]

VRF devices are the loopbacks for VRFs, and a loopback can not be
assigned to a VRF. Accordingly, the condition in ip6_pkt_drop should
be '||' not '&&'.

Fixes: 1d3fd8a10bed ("vrf: Use orig netdev to count Ip6InNoRoutes and a fresh route lookup when sending dest unreach")
Reported-by: Pudak, Filip <Filip.Pudak@windriver.com>
Reported-by: Xiao, Jiguang <Jiguang.Xiao@windriver.com>
Signed-off-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20220404150908.2937-1-dsahern@kernel.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 net/ipv6/route.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index b298b40cf158..a44ad9637e8a 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -4393,7 +4393,7 @@ static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
 	struct inet6_dev *idev;
 	int type;
 
-	if (netif_is_l3_master(skb->dev) &&
+	if (netif_is_l3_master(skb->dev) ||
 	    dst->dev == net->loopback_dev)
 		idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
 	else
-- 
Gitee


From 3fa6cb4bfd2c2a3948be6fc587f7cd3bcdf4eadc Mon Sep 17 00:00:00 2001
From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Date: Thu, 14 Jul 2022 21:58:50 +0800
Subject: [PATCH 299/674] ice: synchronize_rcu() when terminating rings

stable inclusion
from stable-v5.10.111
commit e54ea8fc51cae0232982f78c1b168d3b89a46ad2
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=e54ea8fc51cae0232982f78c1b168d3b89a46ad2

--------------------------------

[ Upstream commit f9124c68f05ffdb87a47e3ea6d5fae9dad7cb6eb ]

Unfortunately, the ice driver doesn't respect the RCU critical section that
XSK wakeup is surrounded with. To fix this, add synchronize_rcu() calls to
paths that destroy resources that might be in use.

This was addressed in other AF_XDP ZC enabled drivers, for reference see
for example commit b3873a5be757 ("net/i40e: Fix concurrency issues
between config flow and XSK")

Fixes: efc2214b6047 ("ice: Add support for XDP")
Fixes: 2d4238f55697 ("ice: Add support for AF_XDP")
Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Tested-by: Shwetha Nagaraju <shwetha.nagaraju@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/net/ethernet/intel/ice/ice.h      | 2 +-
 drivers/net/ethernet/intel/ice/ice_main.c | 4 +++-
 drivers/net/ethernet/intel/ice/ice_xsk.c  | 4 +++-
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h
index 6a57b41ddb54..7794703c1359 100644
--- a/drivers/net/ethernet/intel/ice/ice.h
+++ b/drivers/net/ethernet/intel/ice/ice.h
@@ -498,7 +498,7 @@ static inline struct ice_pf *ice_netdev_to_pf(struct net_device *netdev)
 
 static inline bool ice_is_xdp_ena_vsi(struct ice_vsi *vsi)
 {
-	return !!vsi->xdp_prog;
+	return !!READ_ONCE(vsi->xdp_prog);
 }
 
 static inline void ice_set_ring_xdp(struct ice_ring *ring)
diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
index 20c9d55f3adc..eb0625b52e45 100644
--- a/drivers/net/ethernet/intel/ice/ice_main.c
+++ b/drivers/net/ethernet/intel/ice/ice_main.c
@@ -2475,8 +2475,10 @@ int ice_destroy_xdp_rings(struct ice_vsi *vsi)
 
 	for (i = 0; i < vsi->num_xdp_txq; i++)
 		if (vsi->xdp_rings[i]) {
-			if (vsi->xdp_rings[i]->desc)
+			if (vsi->xdp_rings[i]->desc) {
+				synchronize_rcu();
 				ice_free_tx_ring(vsi->xdp_rings[i]);
+			}
 			kfree_rcu(vsi->xdp_rings[i], rcu);
 			vsi->xdp_rings[i] = NULL;
 		}
diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c
index 9f36f8d7a985..5733526fa245 100644
--- a/drivers/net/ethernet/intel/ice/ice_xsk.c
+++ b/drivers/net/ethernet/intel/ice/ice_xsk.c
@@ -36,8 +36,10 @@ static void ice_qp_reset_stats(struct ice_vsi *vsi, u16 q_idx)
 static void ice_qp_clean_rings(struct ice_vsi *vsi, u16 q_idx)
 {
 	ice_clean_tx_ring(vsi->tx_rings[q_idx]);
-	if (ice_is_xdp_ena_vsi(vsi))
+	if (ice_is_xdp_ena_vsi(vsi)) {
+		synchronize_rcu();
 		ice_clean_tx_ring(vsi->xdp_rings[q_idx]);
+	}
 	ice_clean_rx_ring(vsi->rx_rings[q_idx]);
 }
 
-- 
Gitee


From 25b6995796c162fbc8ddf7d510402f3eb9fe4158 Mon Sep 17 00:00:00 2001
From: Ilya Maximets <i.maximets@ovn.org>
Date: Thu, 14 Jul 2022 21:58:51 +0800
Subject: [PATCH 300/674] net: openvswitch: don't send internal clone attribute
 to the userspace.

stable inclusion
from stable-v5.10.111
commit 42ab401d22de3429f600a48acdd4c0ee5662783c
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=42ab401d22de3429f600a48acdd4c0ee5662783c

--------------------------------

[ Upstream commit 3f2a3050b4a3e7f32fc0ea3c9b0183090ae00522 ]

'OVS_CLONE_ATTR_EXEC' is an internal attribute that is used for
performance optimization inside the kernel.  It's added by the kernel
while parsing user-provided actions and should not be sent during the
flow dump as it's not part of the uAPI.

The issue doesn't cause any significant problems to the ovs-vswitchd
process, because reported actions are not really used in the
application lifecycle and only supposed to be shown to a human via
ovs-dpctl flow dump.  However, the action list is still incorrect
and causes the following error if the user wants to look at the
datapath flows:

  # ovs-dpctl add-dp system@ovs-system
  # ovs-dpctl add-flow "<flow match>" "clone(ct(commit),0)"
  # ovs-dpctl dump-flows
  <flow match>, packets:0, bytes:0, used:never,
    actions:clone(bad length 4, expected -1 for: action0(01 00 00 00),
                  ct(commit),0)

With the fix:

  # ovs-dpctl dump-flows
  <flow match>, packets:0, bytes:0, used:never,
    actions:clone(ct(commit),0)

Additionally fixed an incorrect attribute name in the comment.

Fixes: b233504033db ("openvswitch: kernel datapath clone action")
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
Acked-by: Aaron Conole <aconole@redhat.com>
Link: https://lore.kernel.org/r/20220404104150.2865736-1-i.maximets@ovn.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 net/openvswitch/actions.c      | 2 +-
 net/openvswitch/flow_netlink.c | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 525c1540f10e..6d8d70021666 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -1044,7 +1044,7 @@ static int clone(struct datapath *dp, struct sk_buff *skb,
 	int rem = nla_len(attr);
 	bool dont_clone_flow_key;
 
-	/* The first action is always 'OVS_CLONE_ATTR_ARG'. */
+	/* The first action is always 'OVS_CLONE_ATTR_EXEC'. */
 	clone_arg = nla_data(attr);
 	dont_clone_flow_key = nla_get_u32(clone_arg);
 	actions = nla_next(clone_arg, &rem);
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index 8c4bdfa627ca..c41093540b2f 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -3419,7 +3419,9 @@ static int clone_action_to_attr(const struct nlattr *attr,
 	if (!start)
 		return -EMSGSIZE;
 
-	err = ovs_nla_put_actions(nla_data(attr), rem, skb);
+	/* Skipping the OVS_CLONE_ATTR_EXEC that is always the first attribute. */
+	attr = nla_next(nla_data(attr), &rem);
+	err = ovs_nla_put_actions(attr, rem, skb);
 
 	if (err)
 		nla_nest_cancel(skb, start);
-- 
Gitee


From b6cea59998c5c91457499a93d5900efb635f1579 Mon Sep 17 00:00:00 2001
From: Ilya Maximets <i.maximets@ovn.org>
Date: Thu, 14 Jul 2022 21:58:52 +0800
Subject: [PATCH 301/674] net: openvswitch: fix leak of nested actions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

stable inclusion
from stable-v5.10.111
commit 5ae05b5eb58773cfec307ff88aff4cfd843c4cff
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=5ae05b5eb58773cfec307ff88aff4cfd843c4cff

--------------------------------

[ Upstream commit 1f30fb9166d4f15a1aa19449b9da871fe0ed4796 ]

While parsing user-provided actions, openvswitch module may dynamically
allocate memory and store pointers in the internal copy of the actions.
So this memory has to be freed while destroying the actions.

Currently there are only two such actions: ct() and set().  However,
there are many actions that can hold nested lists of actions and
ovs_nla_free_flow_actions() just jumps over them leaking the memory.

For example, removal of the flow with the following actions will lead
to a leak of the memory allocated by nf_ct_tmpl_alloc():

  actions:clone(ct(commit),0)

Non-freed set() action may also leak the 'dst' structure for the
tunnel info including device references.

Under certain conditions with a high rate of flow rotation that may
cause significant memory leak problem (2MB per second in reporter's
case).  The problem is also hard to mitigate, because the user doesn't
have direct control over the datapath flows generated by OVS.

Fix that by iterating over all the nested actions and freeing
everything that needs to be freed recursively.

New build time assertion should protect us from this problem if new
actions will be added in the future.

Unfortunately, openvswitch module doesn't use NLA_F_NESTED, so all
attributes has to be explicitly checked.  sample() and clone() actions
are mixing extra attributes into the user-provided action list.  That
prevents some code generalization too.

Fixes: 34ae932a4036 ("openvswitch: Make tunnel set action attach a metadata dst")
Link: https://mail.openvswitch.org/pipermail/ovs-dev/2022-March/392922.html
Reported-by: Stéphane Graber <stgraber@ubuntu.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
Acked-by: Aaron Conole <aconole@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 net/openvswitch/flow_netlink.c | 95 ++++++++++++++++++++++++++++++++--
 1 file changed, 90 insertions(+), 5 deletions(-)

diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index c41093540b2f..98a7e6f64ab0 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -2288,6 +2288,62 @@ static struct sw_flow_actions *nla_alloc_flow_actions(int size)
 	return sfa;
 }
 
+static void ovs_nla_free_nested_actions(const struct nlattr *actions, int len);
+
+static void ovs_nla_free_check_pkt_len_action(const struct nlattr *action)
+{
+	const struct nlattr *a;
+	int rem;
+
+	nla_for_each_nested(a, action, rem) {
+		switch (nla_type(a)) {
+		case OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL:
+		case OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER:
+			ovs_nla_free_nested_actions(nla_data(a), nla_len(a));
+			break;
+		}
+	}
+}
+
+static void ovs_nla_free_clone_action(const struct nlattr *action)
+{
+	const struct nlattr *a = nla_data(action);
+	int rem = nla_len(action);
+
+	switch (nla_type(a)) {
+	case OVS_CLONE_ATTR_EXEC:
+		/* The real list of actions follows this attribute. */
+		a = nla_next(a, &rem);
+		ovs_nla_free_nested_actions(a, rem);
+		break;
+	}
+}
+
+static void ovs_nla_free_dec_ttl_action(const struct nlattr *action)
+{
+	const struct nlattr *a = nla_data(action);
+
+	switch (nla_type(a)) {
+	case OVS_DEC_TTL_ATTR_ACTION:
+		ovs_nla_free_nested_actions(nla_data(a), nla_len(a));
+		break;
+	}
+}
+
+static void ovs_nla_free_sample_action(const struct nlattr *action)
+{
+	const struct nlattr *a = nla_data(action);
+	int rem = nla_len(action);
+
+	switch (nla_type(a)) {
+	case OVS_SAMPLE_ATTR_ARG:
+		/* The real list of actions follows this attribute. */
+		a = nla_next(a, &rem);
+		ovs_nla_free_nested_actions(a, rem);
+		break;
+	}
+}
+
 static void ovs_nla_free_set_action(const struct nlattr *a)
 {
 	const struct nlattr *ovs_key = nla_data(a);
@@ -2301,25 +2357,54 @@ static void ovs_nla_free_set_action(const struct nlattr *a)
 	}
 }
 
-void ovs_nla_free_flow_actions(struct sw_flow_actions *sf_acts)
+static void ovs_nla_free_nested_actions(const struct nlattr *actions, int len)
 {
 	const struct nlattr *a;
 	int rem;
 
-	if (!sf_acts)
+	/* Whenever new actions are added, the need to update this
+	 * function should be considered.
+	 */
+	BUILD_BUG_ON(OVS_ACTION_ATTR_MAX != 23);
+
+	if (!actions)
 		return;
 
-	nla_for_each_attr(a, sf_acts->actions, sf_acts->actions_len, rem) {
+	nla_for_each_attr(a, actions, len, rem) {
 		switch (nla_type(a)) {
-		case OVS_ACTION_ATTR_SET:
-			ovs_nla_free_set_action(a);
+		case OVS_ACTION_ATTR_CHECK_PKT_LEN:
+			ovs_nla_free_check_pkt_len_action(a);
+			break;
+
+		case OVS_ACTION_ATTR_CLONE:
+			ovs_nla_free_clone_action(a);
 			break;
+
 		case OVS_ACTION_ATTR_CT:
 			ovs_ct_free_action(a);
 			break;
+
+		case OVS_ACTION_ATTR_DEC_TTL:
+			ovs_nla_free_dec_ttl_action(a);
+			break;
+
+		case OVS_ACTION_ATTR_SAMPLE:
+			ovs_nla_free_sample_action(a);
+			break;
+
+		case OVS_ACTION_ATTR_SET:
+			ovs_nla_free_set_action(a);
+			break;
 		}
 	}
+}
+
+void ovs_nla_free_flow_actions(struct sw_flow_actions *sf_acts)
+{
+	if (!sf_acts)
+		return;
 
+	ovs_nla_free_nested_actions(sf_acts->actions, sf_acts->actions_len);
 	kfree(sf_acts);
 }
 
-- 
Gitee


From 16ee25f37dbf211c272cce1eed4ab51ba8356051 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 14 Jul 2022 21:58:53 +0800
Subject: [PATCH 302/674] rxrpc: fix a race in rxrpc_exit_net()

stable inclusion
from stable-v5.10.111
commit 08ff0e74fab517dbc44e11b8bc683dd4ecc65950
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=08ff0e74fab517dbc44e11b8bc683dd4ecc65950

--------------------------------

[ Upstream commit 1946014ca3b19be9e485e780e862c375c6f98bad ]

Current code can lead to the following race:

CPU0                                                 CPU1

rxrpc_exit_net()
                                                     rxrpc_peer_keepalive_worker()
                                                       if (rxnet->live)

  rxnet->live = false;
  del_timer_sync(&rxnet->peer_keepalive_timer);

                                                             timer_reduce(&rxnet->peer_keepalive_timer, jiffies + delay);

  cancel_work_sync(&rxnet->peer_keepalive_work);

rxrpc_exit_net() exits while peer_keepalive_timer is still armed,
leading to use-after-free.

syzbot report was:

ODEBUG: free active (active state 0) object type: timer_list hint: rxrpc_peer_keepalive_timeout+0x0/0xb0
WARNING: CPU: 0 PID: 3660 at lib/debugobjects.c:505 debug_print_object+0x16e/0x250 lib/debugobjects.c:505
Modules linked in:
CPU: 0 PID: 3660 Comm: kworker/u4:6 Not tainted 5.17.0-syzkaller-13993-g88e6c0207623 #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Workqueue: netns cleanup_net
RIP: 0010:debug_print_object+0x16e/0x250 lib/debugobjects.c:505
Code: ff df 48 89 fa 48 c1 ea 03 80 3c 02 00 0f 85 af 00 00 00 48 8b 14 dd 00 1c 26 8a 4c 89 ee 48 c7 c7 00 10 26 8a e8 b1 e7 28 05 <0f> 0b 83 05 15 eb c5 09 01 48 83 c4 18 5b 5d 41 5c 41 5d 41 5e c3
RSP: 0018:ffffc9000353fb00 EFLAGS: 00010082
RAX: 0000000000000000 RBX: 0000000000000003 RCX: 0000000000000000
RDX: ffff888029196140 RSI: ffffffff815efad8 RDI: fffff520006a7f52
RBP: 0000000000000001 R08: 0000000000000000 R09: 0000000000000000
R10: ffffffff815ea4ae R11: 0000000000000000 R12: ffffffff89ce23e0
R13: ffffffff8a2614e0 R14: ffffffff816628c0 R15: dffffc0000000000
FS:  0000000000000000(0000) GS:ffff8880b9c00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007fe1f2908924 CR3: 0000000043720000 CR4: 00000000003506f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 <TASK>
 __debug_check_no_obj_freed lib/debugobjects.c:992 [inline]
 debug_check_no_obj_freed+0x301/0x420 lib/debugobjects.c:1023
 kfree+0xd6/0x310 mm/slab.c:3809
 ops_free_list.part.0+0x119/0x370 net/core/net_namespace.c:176
 ops_free_list net/core/net_namespace.c:174 [inline]
 cleanup_net+0x591/0xb00 net/core/net_namespace.c:598
 process_one_work+0x996/0x1610 kernel/workqueue.c:2289
 worker_thread+0x665/0x1080 kernel/workqueue.c:2436
 kthread+0x2e9/0x3a0 kernel/kthread.c:376
 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:298
 </TASK>

Fixes: ace45bec6d77 ("rxrpc: Fix firewall route keepalive")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Marc Dionne <marc.dionne@auristor.com>
Cc: linux-afs@lists.infradead.org
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 net/rxrpc/net_ns.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c
index 25bbc4cc8b13..f15d6942da45 100644
--- a/net/rxrpc/net_ns.c
+++ b/net/rxrpc/net_ns.c
@@ -113,8 +113,8 @@ static __net_exit void rxrpc_exit_net(struct net *net)
 	struct rxrpc_net *rxnet = rxrpc_net(net);
 
 	rxnet->live = false;
-	del_timer_sync(&rxnet->peer_keepalive_timer);
 	cancel_work_sync(&rxnet->peer_keepalive_work);
+	del_timer_sync(&rxnet->peer_keepalive_timer);
 	rxrpc_destroy_all_calls(rxnet);
 	rxrpc_destroy_all_connections(rxnet);
 	rxrpc_destroy_all_peers(rxnet);
-- 
Gitee


From 993871ac488687874fd489ce2e9ed85b4c3ab465 Mon Sep 17 00:00:00 2001
From: Michael Walle <michael@walle.cc>
Date: Thu, 14 Jul 2022 21:58:54 +0800
Subject: [PATCH 303/674] net: phy: mscc-miim: reject clause 45 register
 accesses

stable inclusion
from stable-v5.10.111
commit b7893388bb888cb15c7db0c0f71bd58a4fb5312c
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=b7893388bb888cb15c7db0c0f71bd58a4fb5312c

--------------------------------

[ Upstream commit 8d90991e5bf7fdb9f264f5f579d18969913054b7 ]

The driver doesn't support clause 45 register access yet, but doesn't
check if the access is a c45 one either. This leads to spurious register
reads and writes. Add the check.

Fixes: 542671fe4d86 ("net: phy: mscc-miim: Add MDIO driver")
Signed-off-by: Michael Walle <michael@walle.cc>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/net/mdio/mdio-mscc-miim.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/mdio/mdio-mscc-miim.c b/drivers/net/mdio/mdio-mscc-miim.c
index 11f583fd4611..1c9232fca1e2 100644
--- a/drivers/net/mdio/mdio-mscc-miim.c
+++ b/drivers/net/mdio/mdio-mscc-miim.c
@@ -76,6 +76,9 @@ static int mscc_miim_read(struct mii_bus *bus, int mii_id, int regnum)
 	u32 val;
 	int ret;
 
+	if (regnum & MII_ADDR_C45)
+		return -EOPNOTSUPP;
+
 	ret = mscc_miim_wait_pending(bus);
 	if (ret)
 		goto out;
@@ -105,6 +108,9 @@ static int mscc_miim_write(struct mii_bus *bus, int mii_id,
 	struct mscc_miim_dev *miim = bus->priv;
 	int ret;
 
+	if (regnum & MII_ADDR_C45)
+		return -EOPNOTSUPP;
+
 	ret = mscc_miim_wait_pending(bus);
 	if (ret < 0)
 		goto out;
-- 
Gitee


From d44c28285509cde9d8d3ee017e9bab8b7de8bd8f Mon Sep 17 00:00:00 2001
From: Jamie Bainbridge <jamie.bainbridge@gmail.com>
Date: Thu, 14 Jul 2022 21:58:55 +0800
Subject: [PATCH 304/674] qede: confirm skb is allocated before using

stable inclusion
from stable-v5.10.111
commit 8928239e5e2e460d95b8a0b89f61671625e7ece0
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=8928239e5e2e460d95b8a0b89f61671625e7ece0

--------------------------------

[ Upstream commit 4e910dbe36508654a896d5735b318c0b88172570 ]

qede_build_skb() assumes build_skb() always works and goes straight
to skb_reserve(). However, build_skb() can fail under memory pressure.
This results in a kernel panic because the skb to reserve is NULL.

Add a check in case build_skb() failed to allocate and return NULL.

The NULL return is handled correctly in callers to qede_build_skb().

Fixes: 8a8633978b842 ("qede: Add build_skb() support.")
Signed-off-by: Jamie Bainbridge <jamie.bainbridge@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/net/ethernet/qlogic/qede/qede_fp.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/qlogic/qede/qede_fp.c b/drivers/net/ethernet/qlogic/qede/qede_fp.c
index 21c906200e79..d210632676d3 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_fp.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_fp.c
@@ -752,6 +752,9 @@ qede_build_skb(struct qede_rx_queue *rxq,
 	buf = page_address(bd->data) + bd->page_offset;
 	skb = build_skb(buf, rxq->rx_buf_seg_size);
 
+	if (unlikely(!skb))
+		return NULL;
+
 	skb_reserve(skb, pad);
 	skb_put(skb, len);
 
-- 
Gitee


From 8d205a67676cd45e872270911b401c05d4b26981 Mon Sep 17 00:00:00 2001
From: Kamal Dasu <kdasu.kdev@gmail.com>
Date: Thu, 14 Jul 2022 21:58:56 +0800
Subject: [PATCH 305/674] spi: bcm-qspi: fix MSPI only access with
 bcm_qspi_exec_mem_op()

stable inclusion
from stable-v5.10.111
commit 6c17f4ef3c4f662865ac99ad167a8b896ca70d2f
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=6c17f4ef3c4f662865ac99ad167a8b896ca70d2f

--------------------------------

[ Upstream commit 2c7d1b281286c46049cd22b43435cecba560edde ]

This fixes case where MSPI controller is used to access spi-nor
flash and BSPI block is not present.

Fixes: 5f195ee7d830 ("spi: bcm-qspi: Implement the spi_mem interface")
Signed-off-by: Kamal Dasu <kdasu.kdev@gmail.com>
Acked-by: Florian Fainelli <f.fainelli@gmail.com>
Link: https://lore.kernel.org/r/20220328142442.7553-1-kdasu.kdev@gmail.com
Signed-off-by: Mark Brown <broonie@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/spi/spi-bcm-qspi.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/spi/spi-bcm-qspi.c b/drivers/spi/spi-bcm-qspi.c
index 4a80f043b7b1..766b00350e39 100644
--- a/drivers/spi/spi-bcm-qspi.c
+++ b/drivers/spi/spi-bcm-qspi.c
@@ -1032,7 +1032,7 @@ static int bcm_qspi_exec_mem_op(struct spi_mem *mem,
 	addr = op->addr.val;
 	len = op->data.nbytes;
 
-	if (bcm_qspi_bspi_ver_three(qspi) == true) {
+	if (has_bspi(qspi) && bcm_qspi_bspi_ver_three(qspi) == true) {
 		/*
 		 * The address coming into this function is a raw flash offset.
 		 * But for BSPI <= V3, we need to convert it to a remapped BSPI
@@ -1051,7 +1051,7 @@ static int bcm_qspi_exec_mem_op(struct spi_mem *mem,
 	    len < 4)
 		mspi_read = true;
 
-	if (mspi_read)
+	if (!has_bspi(qspi) || mspi_read)
 		return bcm_qspi_mspi_exec_mem_op(spi, op);
 
 	ret = bcm_qspi_bspi_set_mode(qspi, op, 0);
-- 
Gitee


From 22c85679a13dc27646fe9c747598020cf8e07817 Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy <maximmi@nvidia.com>
Date: Thu, 14 Jul 2022 21:58:57 +0800
Subject: [PATCH 306/674] bpf: Support dual-stack sockets in
 bpf_tcp_check_syncookie

stable inclusion
from stable-v5.10.111
commit 970a6bb72912f2accae52e811a6618e3d4a5b0ff
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=970a6bb72912f2accae52e811a6618e3d4a5b0ff

--------------------------------

[ Upstream commit 2e8702cc0cfa1080f29fd64003c00a3e24ac38de ]

bpf_tcp_gen_syncookie looks at the IP version in the IP header and
validates the address family of the socket. It supports IPv4 packets in
AF_INET6 dual-stack sockets.

On the other hand, bpf_tcp_check_syncookie looks only at the address
family of the socket, ignoring the real IP version in headers, and
validates only the packet size. This implementation has some drawbacks:

1. Packets are not validated properly, allowing a BPF program to trick
   bpf_tcp_check_syncookie into handling an IPv6 packet on an IPv4
   socket.

2. Dual-stack sockets fail the checks on IPv4 packets. IPv4 clients end
   up receiving a SYNACK with the cookie, but the following ACK gets
   dropped.

This patch fixes these issues by changing the checks in
bpf_tcp_check_syncookie to match the ones in bpf_tcp_gen_syncookie. IP
version from the header is taken into account, and it is validated
properly with address family.

Fixes: 399040847084 ("bpf: add helper to check for a valid SYN cookie")
Signed-off-by: Maxim Mikityanskiy <maximmi@nvidia.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Acked-by: Arthur Fabre <afabre@cloudflare.com>
Link: https://lore.kernel.org/bpf/20220406124113.2795730-1-maximmi@nvidia.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 net/core/filter.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index 6b619ef0dee3..933fdf6e6a90 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -6562,24 +6562,33 @@ BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len
 	if (!th->ack || th->rst || th->syn)
 		return -ENOENT;
 
+	if (unlikely(iph_len < sizeof(struct iphdr)))
+		return -EINVAL;
+
 	if (tcp_synq_no_recent_overflow(sk))
 		return -ENOENT;
 
 	cookie = ntohl(th->ack_seq) - 1;
 
-	switch (sk->sk_family) {
-	case AF_INET:
-		if (unlikely(iph_len < sizeof(struct iphdr)))
+	/* Both struct iphdr and struct ipv6hdr have the version field at the
+	 * same offset so we can cast to the shorter header (struct iphdr).
+	 */
+	switch (((struct iphdr *)iph)->version) {
+	case 4:
+		if (sk->sk_family == AF_INET6 && ipv6_only_sock(sk))
 			return -EINVAL;
 
 		ret = __cookie_v4_check((struct iphdr *)iph, th, cookie);
 		break;
 
 #if IS_BUILTIN(CONFIG_IPV6)
-	case AF_INET6:
+	case 6:
 		if (unlikely(iph_len < sizeof(struct ipv6hdr)))
 			return -EINVAL;
 
+		if (sk->sk_family != AF_INET6)
+			return -EINVAL;
+
 		ret = __cookie_v6_check((struct ipv6hdr *)iph, th, cookie);
 		break;
 #endif /* CONFIG_IPV6 */
-- 
Gitee


From e1119e30c3433839ad8484ebc20b35107c110de8 Mon Sep 17 00:00:00 2001
From: Lv Yunlong <lyl2019@mail.ustc.edu.cn>
Date: Thu, 14 Jul 2022 21:58:58 +0800
Subject: [PATCH 307/674] drbd: Fix five use after free bugs in
 get_initial_state
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

stable inclusion
from stable-v5.10.111
commit 594205b4936771a250f9d141e7e0fff21c3dd2d9
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=594205b4936771a250f9d141e7e0fff21c3dd2d9

--------------------------------

[ Upstream commit aadb22ba2f656581b2f733deb3a467c48cc618f6 ]

In get_initial_state, it calls notify_initial_state_done(skb,..) if
cb->args[5]==1. If genlmsg_put() failed in notify_initial_state_done(),
the skb will be freed by nlmsg_free(skb).
Then get_initial_state will goto out and the freed skb will be used by
return value skb->len, which is a uaf bug.

What's worse, the same problem goes even further: skb can also be
freed in the notify_*_state_change -> notify_*_state calls below.
Thus 4 additional uaf bugs happened.

My patch lets the problem callee functions: notify_initial_state_done
and notify_*_state_change return an error code if errors happen.
So that the error codes could be propagated and the uaf bugs can be avoid.

v2 reports a compilation warning. This v3 fixed this warning and built
successfully in my local environment with no additional warnings.
v2: https://lore.kernel.org/patchwork/patch/1435218/

Fixes: a29728463b254 ("drbd: Backport the "events2" command")
Signed-off-by: Lv Yunlong <lyl2019@mail.ustc.edu.cn>
Reviewed-by: Christoph Böhmwalder <christoph.boehmwalder@linbit.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/block/drbd/drbd_int.h          |  8 ++---
 drivers/block/drbd/drbd_nl.c           | 41 ++++++++++++++++----------
 drivers/block/drbd/drbd_state.c        | 18 +++++------
 drivers/block/drbd/drbd_state_change.h |  8 ++---
 4 files changed, 42 insertions(+), 33 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 8f879e5c2f67..60b9ca53c0a3 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1644,22 +1644,22 @@ struct sib_info {
 };
 void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib);
 
-extern void notify_resource_state(struct sk_buff *,
+extern int notify_resource_state(struct sk_buff *,
 				  unsigned int,
 				  struct drbd_resource *,
 				  struct resource_info *,
 				  enum drbd_notification_type);
-extern void notify_device_state(struct sk_buff *,
+extern int notify_device_state(struct sk_buff *,
 				unsigned int,
 				struct drbd_device *,
 				struct device_info *,
 				enum drbd_notification_type);
-extern void notify_connection_state(struct sk_buff *,
+extern int notify_connection_state(struct sk_buff *,
 				    unsigned int,
 				    struct drbd_connection *,
 				    struct connection_info *,
 				    enum drbd_notification_type);
-extern void notify_peer_device_state(struct sk_buff *,
+extern int notify_peer_device_state(struct sk_buff *,
 				     unsigned int,
 				     struct drbd_peer_device *,
 				     struct peer_device_info *,
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index bf7de4c7b96c..f8d0146bf785 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -4614,7 +4614,7 @@ static int nla_put_notification_header(struct sk_buff *msg,
 	return drbd_notification_header_to_skb(msg, &nh, true);
 }
 
-void notify_resource_state(struct sk_buff *skb,
+int notify_resource_state(struct sk_buff *skb,
 			   unsigned int seq,
 			   struct drbd_resource *resource,
 			   struct resource_info *resource_info,
@@ -4656,16 +4656,17 @@ void notify_resource_state(struct sk_buff *skb,
 		if (err && err != -ESRCH)
 			goto failed;
 	}
-	return;
+	return 0;
 
 nla_put_failure:
 	nlmsg_free(skb);
 failed:
 	drbd_err(resource, "Error %d while broadcasting event. Event seq:%u\n",
 			err, seq);
+	return err;
 }
 
-void notify_device_state(struct sk_buff *skb,
+int notify_device_state(struct sk_buff *skb,
 			 unsigned int seq,
 			 struct drbd_device *device,
 			 struct device_info *device_info,
@@ -4705,16 +4706,17 @@ void notify_device_state(struct sk_buff *skb,
 		if (err && err != -ESRCH)
 			goto failed;
 	}
-	return;
+	return 0;
 
 nla_put_failure:
 	nlmsg_free(skb);
 failed:
 	drbd_err(device, "Error %d while broadcasting event. Event seq:%u\n",
 		 err, seq);
+	return err;
 }
 
-void notify_connection_state(struct sk_buff *skb,
+int notify_connection_state(struct sk_buff *skb,
 			     unsigned int seq,
 			     struct drbd_connection *connection,
 			     struct connection_info *connection_info,
@@ -4754,16 +4756,17 @@ void notify_connection_state(struct sk_buff *skb,
 		if (err && err != -ESRCH)
 			goto failed;
 	}
-	return;
+	return 0;
 
 nla_put_failure:
 	nlmsg_free(skb);
 failed:
 	drbd_err(connection, "Error %d while broadcasting event. Event seq:%u\n",
 		 err, seq);
+	return err;
 }
 
-void notify_peer_device_state(struct sk_buff *skb,
+int notify_peer_device_state(struct sk_buff *skb,
 			      unsigned int seq,
 			      struct drbd_peer_device *peer_device,
 			      struct peer_device_info *peer_device_info,
@@ -4804,13 +4807,14 @@ void notify_peer_device_state(struct sk_buff *skb,
 		if (err && err != -ESRCH)
 			goto failed;
 	}
-	return;
+	return 0;
 
 nla_put_failure:
 	nlmsg_free(skb);
 failed:
 	drbd_err(peer_device, "Error %d while broadcasting event. Event seq:%u\n",
 		 err, seq);
+	return err;
 }
 
 void notify_helper(enum drbd_notification_type type,
@@ -4861,7 +4865,7 @@ void notify_helper(enum drbd_notification_type type,
 		 err, seq);
 }
 
-static void notify_initial_state_done(struct sk_buff *skb, unsigned int seq)
+static int notify_initial_state_done(struct sk_buff *skb, unsigned int seq)
 {
 	struct drbd_genlmsghdr *dh;
 	int err;
@@ -4875,11 +4879,12 @@ static void notify_initial_state_done(struct sk_buff *skb, unsigned int seq)
 	if (nla_put_notification_header(skb, NOTIFY_EXISTS))
 		goto nla_put_failure;
 	genlmsg_end(skb, dh);
-	return;
+	return 0;
 
 nla_put_failure:
 	nlmsg_free(skb);
 	pr_err("Error %d sending event. Event seq:%u\n", err, seq);
+	return err;
 }
 
 static void free_state_changes(struct list_head *list)
@@ -4906,6 +4911,7 @@ static int get_initial_state(struct sk_buff *skb, struct netlink_callback *cb)
 	unsigned int seq = cb->args[2];
 	unsigned int n;
 	enum drbd_notification_type flags = 0;
+	int err = 0;
 
 	/* There is no need for taking notification_mutex here: it doesn't
 	   matter if the initial state events mix with later state chage
@@ -4914,32 +4920,32 @@ static int get_initial_state(struct sk_buff *skb, struct netlink_callback *cb)
 
 	cb->args[5]--;
 	if (cb->args[5] == 1) {
-		notify_initial_state_done(skb, seq);
+		err = notify_initial_state_done(skb, seq);
 		goto out;
 	}
 	n = cb->args[4]++;
 	if (cb->args[4] < cb->args[3])
 		flags |= NOTIFY_CONTINUES;
 	if (n < 1) {
-		notify_resource_state_change(skb, seq, state_change->resource,
+		err = notify_resource_state_change(skb, seq, state_change->resource,
 					     NOTIFY_EXISTS | flags);
 		goto next;
 	}
 	n--;
 	if (n < state_change->n_connections) {
-		notify_connection_state_change(skb, seq, &state_change->connections[n],
+		err = notify_connection_state_change(skb, seq, &state_change->connections[n],
 					       NOTIFY_EXISTS | flags);
 		goto next;
 	}
 	n -= state_change->n_connections;
 	if (n < state_change->n_devices) {
-		notify_device_state_change(skb, seq, &state_change->devices[n],
+		err = notify_device_state_change(skb, seq, &state_change->devices[n],
 					   NOTIFY_EXISTS | flags);
 		goto next;
 	}
 	n -= state_change->n_devices;
 	if (n < state_change->n_devices * state_change->n_connections) {
-		notify_peer_device_state_change(skb, seq, &state_change->peer_devices[n],
+		err = notify_peer_device_state_change(skb, seq, &state_change->peer_devices[n],
 						NOTIFY_EXISTS | flags);
 		goto next;
 	}
@@ -4954,7 +4960,10 @@ static int get_initial_state(struct sk_buff *skb, struct netlink_callback *cb)
 		cb->args[4] = 0;
 	}
 out:
-	return skb->len;
+	if (err)
+		return err;
+	else
+		return skb->len;
 }
 
 int drbd_adm_get_initial_state(struct sk_buff *skb, struct netlink_callback *cb)
diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c
index 0067d328f0b5..5fbaea6b77b1 100644
--- a/drivers/block/drbd/drbd_state.c
+++ b/drivers/block/drbd/drbd_state.c
@@ -1537,7 +1537,7 @@ int drbd_bitmap_io_from_worker(struct drbd_device *device,
 	return rv;
 }
 
-void notify_resource_state_change(struct sk_buff *skb,
+int notify_resource_state_change(struct sk_buff *skb,
 				  unsigned int seq,
 				  struct drbd_resource_state_change *resource_state_change,
 				  enum drbd_notification_type type)
@@ -1550,10 +1550,10 @@ void notify_resource_state_change(struct sk_buff *skb,
 		.res_susp_fen = resource_state_change->susp_fen[NEW],
 	};
 
-	notify_resource_state(skb, seq, resource, &resource_info, type);
+	return notify_resource_state(skb, seq, resource, &resource_info, type);
 }
 
-void notify_connection_state_change(struct sk_buff *skb,
+int notify_connection_state_change(struct sk_buff *skb,
 				    unsigned int seq,
 				    struct drbd_connection_state_change *connection_state_change,
 				    enum drbd_notification_type type)
@@ -1564,10 +1564,10 @@ void notify_connection_state_change(struct sk_buff *skb,
 		.conn_role = connection_state_change->peer_role[NEW],
 	};
 
-	notify_connection_state(skb, seq, connection, &connection_info, type);
+	return notify_connection_state(skb, seq, connection, &connection_info, type);
 }
 
-void notify_device_state_change(struct sk_buff *skb,
+int notify_device_state_change(struct sk_buff *skb,
 				unsigned int seq,
 				struct drbd_device_state_change *device_state_change,
 				enum drbd_notification_type type)
@@ -1577,10 +1577,10 @@ void notify_device_state_change(struct sk_buff *skb,
 		.dev_disk_state = device_state_change->disk_state[NEW],
 	};
 
-	notify_device_state(skb, seq, device, &device_info, type);
+	return notify_device_state(skb, seq, device, &device_info, type);
 }
 
-void notify_peer_device_state_change(struct sk_buff *skb,
+int notify_peer_device_state_change(struct sk_buff *skb,
 				     unsigned int seq,
 				     struct drbd_peer_device_state_change *p,
 				     enum drbd_notification_type type)
@@ -1594,7 +1594,7 @@ void notify_peer_device_state_change(struct sk_buff *skb,
 		.peer_resync_susp_dependency = p->resync_susp_dependency[NEW],
 	};
 
-	notify_peer_device_state(skb, seq, peer_device, &peer_device_info, type);
+	return notify_peer_device_state(skb, seq, peer_device, &peer_device_info, type);
 }
 
 static void broadcast_state_change(struct drbd_state_change *state_change)
@@ -1602,7 +1602,7 @@ static void broadcast_state_change(struct drbd_state_change *state_change)
 	struct drbd_resource_state_change *resource_state_change = &state_change->resource[0];
 	bool resource_state_has_changed;
 	unsigned int n_device, n_connection, n_peer_device, n_peer_devices;
-	void (*last_func)(struct sk_buff *, unsigned int, void *,
+	int (*last_func)(struct sk_buff *, unsigned int, void *,
 			  enum drbd_notification_type) = NULL;
 	void *last_arg = NULL;
 
diff --git a/drivers/block/drbd/drbd_state_change.h b/drivers/block/drbd/drbd_state_change.h
index ba80f612d6ab..d5b0479bc9a6 100644
--- a/drivers/block/drbd/drbd_state_change.h
+++ b/drivers/block/drbd/drbd_state_change.h
@@ -44,19 +44,19 @@ extern struct drbd_state_change *remember_old_state(struct drbd_resource *, gfp_
 extern void copy_old_to_new_state_change(struct drbd_state_change *);
 extern void forget_state_change(struct drbd_state_change *);
 
-extern void notify_resource_state_change(struct sk_buff *,
+extern int notify_resource_state_change(struct sk_buff *,
 					 unsigned int,
 					 struct drbd_resource_state_change *,
 					 enum drbd_notification_type type);
-extern void notify_connection_state_change(struct sk_buff *,
+extern int notify_connection_state_change(struct sk_buff *,
 					   unsigned int,
 					   struct drbd_connection_state_change *,
 					   enum drbd_notification_type type);
-extern void notify_device_state_change(struct sk_buff *,
+extern int notify_device_state_change(struct sk_buff *,
 				       unsigned int,
 				       struct drbd_device_state_change *,
 				       enum drbd_notification_type type);
-extern void notify_peer_device_state_change(struct sk_buff *,
+extern int notify_peer_device_state_change(struct sk_buff *,
 					    unsigned int,
 					    struct drbd_peer_device_state_change *,
 					    enum drbd_notification_type type);
-- 
Gitee


From 9cf5e90ac8a67144d7edcee720d7b1419d5d6be1 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 14 Jul 2022 21:58:59 +0800
Subject: [PATCH 308/674] io_uring: don't touch scm_fp_list after queueing skb

stable inclusion
from stable-v5.10.111
commit aed30a2054060e470db02eaaf352c14dc38aa611
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=aed30a2054060e470db02eaaf352c14dc38aa611

--------------------------------

[ Upstream commit a07211e3001435fe8591b992464cd8d5e3c98c5a ]

It's safer to not touch scm_fp_list after we queued an skb to which it
was assigned, there might be races lurking if we screw subtle sync
guarantees on the io_uring side.

Fixes: 6b06314c47e14 ("io_uring: add file set registration")
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 fs/io_uring.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index a1c2c04b8ca0..3da9c6f457ae 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7314,8 +7314,12 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
 		refcount_add(skb->truesize, &sk->sk_wmem_alloc);
 		skb_queue_head(&sk->sk_receive_queue, skb);
 
-		for (i = 0; i < nr_files; i++)
-			fput(fpl->fp[i]);
+		for (i = 0; i < nr; i++) {
+			struct file *file = io_file_from_index(ctx, i + offset);
+
+			if (file)
+				fput(file);
+		}
 	} else {
 		kfree_skb(skb);
 		free_uid(fpl->user);
-- 
Gitee


From 70391d566bb7f9348fd1b4c3941030f041df0ac5 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Thu, 14 Jul 2022 21:59:00 +0800
Subject: [PATCH 309/674] SUNRPC: Handle ENOMEM in call_transmit_status()

stable inclusion
from stable-v5.10.111
commit 132cbe2f182ab535e0a7fa309d2c73ed6257a246
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=132cbe2f182ab535e0a7fa309d2c73ed6257a246

--------------------------------

[ Upstream commit d3c15033b240767d0287f1c4a529cbbe2d5ded8a ]

Both call_transmit() and call_bc_transmit() can now return ENOMEM, so
let's make sure that we handle the errors gracefully.

Fixes: 0472e4766049 ("SUNRPC: Convert socket page send code to use iov_iter()")
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 net/sunrpc/clnt.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 84c8a534029c..bae42ada8c10 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -2175,6 +2175,7 @@ call_transmit_status(struct rpc_task *task)
 		 * socket just returned a connection error,
 		 * then hold onto the transport lock.
 		 */
+	case -ENOMEM:
 	case -ENOBUFS:
 		rpc_delay(task, HZ>>2);
 		fallthrough;
@@ -2258,6 +2259,7 @@ call_bc_transmit_status(struct rpc_task *task)
 	case -ENOTCONN:
 	case -EPIPE:
 		break;
+	case -ENOMEM:
 	case -ENOBUFS:
 		rpc_delay(task, HZ>>2);
 		fallthrough;
-- 
Gitee


From fa0241e9ba3fdb89bcd46fafbcf09bfd60c8d668 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Thu, 14 Jul 2022 21:59:01 +0800
Subject: [PATCH 310/674] SUNRPC: Handle low memory situations in call_status()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

stable inclusion
from stable-v5.10.111
commit 9a45e08636bba2949819cbf4d9ee0b974dd5c7a6
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=9a45e08636bba2949819cbf4d9ee0b974dd5c7a6

--------------------------------

[ Upstream commit 9d82819d5b065348ce623f196bf601028e22ed00 ]

We need to handle ENFILE, ENOBUFS, and ENOMEM, because
xprt_wake_pending_tasks() can be called with any one of these due to
socket creation failures.

Fixes: b61d59fffd3e ("SUNRPC: xs_tcp_connect_worker{4,6}: merge common code")
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 net/sunrpc/clnt.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index bae42ada8c10..c5af31312e0c 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -2342,6 +2342,11 @@ call_status(struct rpc_task *task)
 	case -EPIPE:
 	case -EAGAIN:
 		break;
+	case -ENFILE:
+	case -ENOBUFS:
+	case -ENOMEM:
+		rpc_delay(task, HZ>>2);
+		break;
 	case -EIO:
 		/* shutdown or soft timeout */
 		goto out_exit;
-- 
Gitee


From 49fd0817134311af332f0602a09650ff61298933 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Thu, 14 Jul 2022 21:59:02 +0800
Subject: [PATCH 311/674] SUNRPC: svc_tcp_sendmsg() should handle errors from
 xdr_alloc_bvec()

stable inclusion
from stable-v5.10.111
commit b3c00be2ff8bc7e211fb04ec42facf0e4c876800
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=b3c00be2ff8bc7e211fb04ec42facf0e4c876800

--------------------------------

[ Upstream commit b056fa070814897be32d83b079dbc311375588e7 ]

The allocation is done with GFP_KERNEL, but it could still fail in a low
memory situation.

Fixes: 4a85a6a3320b ("SUNRPC: Handle TCP socket sends with kernel_sendpage() again")
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 net/sunrpc/svcsock.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index eba1714bf09a..6d5bb8bfed38 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1091,7 +1091,9 @@ static int svc_tcp_sendmsg(struct socket *sock, struct msghdr *msg,
 	int flags, ret;
 
 	*sentp = 0;
-	xdr_alloc_bvec(xdr, GFP_KERNEL);
+	ret = xdr_alloc_bvec(xdr, GFP_KERNEL);
+	if (ret < 0)
+		return ret;
 
 	msg->msg_flags = MSG_MORE;
 	ret = kernel_sendmsg(sock, msg, &rm, 1, rm.iov_len);
-- 
Gitee


From cc46da9de5344020d39687c82cfa22561be7a751 Mon Sep 17 00:00:00 2001
From: Tony Lindgren <tony@atomide.com>
Date: Thu, 14 Jul 2022 21:59:03 +0800
Subject: [PATCH 312/674] iommu/omap: Fix regression in probe for NULL pointer
 dereference

stable inclusion
from stable-v5.10.111
commit bd905fed87ce01ac010011bb8f44ed0140116ceb
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=bd905fed87ce01ac010011bb8f44ed0140116ceb

--------------------------------

[ Upstream commit 71ff461c3f41f6465434b9e980c01782763e7ad8 ]

Commit 3f6634d997db ("iommu: Use right way to retrieve iommu_ops") started
triggering a NULL pointer dereference for some omap variants:

__iommu_probe_device from probe_iommu_group+0x2c/0x38
probe_iommu_group from bus_for_each_dev+0x74/0xbc
bus_for_each_dev from bus_iommu_probe+0x34/0x2e8
bus_iommu_probe from bus_set_iommu+0x80/0xc8
bus_set_iommu from omap_iommu_init+0x88/0xcc
omap_iommu_init from do_one_initcall+0x44/0x24

This is caused by omap iommu probe returning 0 instead of ERR_PTR(-ENODEV)
as noted by Jason Gunthorpe <jgg@ziepe.ca>.

Looks like the regression already happened with an earlier commit
6785eb9105e3 ("iommu/omap: Convert to probe/release_device() call-backs")
that changed the function return type and missed converting one place.

Cc: Drew Fustini <dfustini@baylibre.com>
Cc: Lu Baolu <baolu.lu@linux.intel.com>
Cc: Suman Anna <s-anna@ti.com>
Suggested-by: Jason Gunthorpe <jgg@ziepe.ca>
Fixes: 6785eb9105e3 ("iommu/omap: Convert to probe/release_device() call-backs")
Fixes: 3f6634d997db ("iommu: Use right way to retrieve iommu_ops")
Signed-off-by: Tony Lindgren <tony@atomide.com>
Tested-by: Drew Fustini <dfustini@baylibre.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/20220331062301.24269-1-tony@atomide.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/iommu/omap-iommu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c
index 71f29c0927fc..ff2c692c0db4 100644
--- a/drivers/iommu/omap-iommu.c
+++ b/drivers/iommu/omap-iommu.c
@@ -1665,7 +1665,7 @@ static struct iommu_device *omap_iommu_probe_device(struct device *dev)
 	num_iommus = of_property_count_elems_of_size(dev->of_node, "iommus",
 						     sizeof(phandle));
 	if (num_iommus < 0)
-		return 0;
+		return ERR_PTR(-ENODEV);
 
 	arch_data = kcalloc(num_iommus + 1, sizeof(*arch_data), GFP_KERNEL);
 	if (!arch_data)
-- 
Gitee


From 7495b6a600bcc5dc05096bf25e8608128b9d3330 Mon Sep 17 00:00:00 2001
From: James Clark <james.clark@arm.com>
Date: Thu, 14 Jul 2022 21:59:04 +0800
Subject: [PATCH 313/674] perf: arm-spe: Fix perf report --mem-mode

stable inclusion
from stable-v5.10.111
commit 65210fac639ea23c36187c4e4bb5a48e68dddff1
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=65210fac639ea23c36187c4e4bb5a48e68dddff1

--------------------------------

[ Upstream commit ffab487052054162b3b6c9c6005777ec6cfcea05 ]

Since commit bb30acae4c4dacfa ("perf report: Bail out --mem-mode if mem
info is not available") "perf mem report" and "perf report --mem-mode"
don't allow opening the file unless one of the events has
PERF_SAMPLE_DATA_SRC set.

SPE doesn't have this set even though synthetic memory data is generated
after it is decoded. Fix this issue by setting DATA_SRC on SPE events.
This has no effect on the data collected because the SPE driver doesn't
do anything with that flag and doesn't generate samples.

Fixes: bb30acae4c4dacfa ("perf report: Bail out --mem-mode if mem info is not available")
Signed-off-by: James Clark <james.clark@arm.com>
Tested-by: Leo Yan <leo.yan@linaro.org>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: German Gomez <german.gomez@arm.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.garry@huawei.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: linux-arm-kernel@lists.infradead.org
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
Cc: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20220408144056.1955535-1-james.clark@arm.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 tools/perf/arch/arm64/util/arm-spe.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tools/perf/arch/arm64/util/arm-spe.c b/tools/perf/arch/arm64/util/arm-spe.c
index f3aa09647606..3c0d919a73f4 100644
--- a/tools/perf/arch/arm64/util/arm-spe.c
+++ b/tools/perf/arch/arm64/util/arm-spe.c
@@ -239,6 +239,12 @@ static int arm_spe_recording_options(struct auxtrace_record *itr,
 		arm_spe_set_timestamp(itr, arm_spe_evsel);
 	}
 
+	/*
+	 * Set this only so that perf report knows that SPE generates memory info. It has no effect
+	 * on the opening of the event or the SPE data produced.
+	 */
+	evsel__set_sample_bit(arm_spe_evsel, DATA_SRC);
+
 	/* Add dummy event to keep tracking */
 	err = parse_events(evlist, "dummy:u", NULL);
 	if (err)
-- 
Gitee


From d8cb6b975a4828ba2388e63ca31f58445e393922 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Thu, 14 Jul 2022 21:59:05 +0800
Subject: [PATCH 314/674] perf tools: Fix perf's libperf_print callback

stable inclusion
from stable-v5.10.111
commit 362ced37690d2069dc8f5b556a19599c9fb0686c
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=362ced37690d2069dc8f5b556a19599c9fb0686c

--------------------------------

[ Upstream commit aeee9dc53ce405d2161f9915f553114e94e5b677 ]

eprintf() does not expect va_list as the type of the 4th parameter.

Use veprintf() because it does.

Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Fixes: 428dab813a56ce94 ("libperf: Merge libperf_set_print() into libperf_init()")
Cc: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/r/20220408132625.2451452-1-adrian.hunter@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 tools/perf/perf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/perf.c b/tools/perf/perf.c
index 27f94b0bb874..505e2a2f1872 100644
--- a/tools/perf/perf.c
+++ b/tools/perf/perf.c
@@ -433,7 +433,7 @@ void pthread__unblock_sigwinch(void)
 static int libperf_print(enum libperf_print_level level,
 			 const char *fmt, va_list ap)
 {
-	return eprintf(level, verbose, fmt, ap);
+	return veprintf(level, verbose, fmt, ap);
 }
 
 int main(int argc, const char **argv)
-- 
Gitee


From dd63ae24cebfd4583af37b813ca4170a0a5b323b Mon Sep 17 00:00:00 2001
From: Denis Nikitin <denik@chromium.org>
Date: Thu, 14 Jul 2022 21:59:06 +0800
Subject: [PATCH 315/674] perf session: Remap buf if there is no space for
 event

stable inclusion
from stable-v5.10.111
commit 4604b5738d5b0c160724b72886822c98c35ad316
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=4604b5738d5b0c160724b72886822c98c35ad316

--------------------------------

[ Upstream commit bc21e74d4775f883ae1f542c1f1dc7205b15d925 ]

If a perf event doesn't fit into remaining buffer space return NULL to
remap buf and fetch the event again.

Keep the logic to error out on inadequate input from fuzzing.

This fixes perf failing on ChromeOS (with 32b userspace):

  $ perf report -v -i perf.data
  ...
  prefetch_event: head=0x1fffff8 event->header_size=0x30, mmap_size=0x2000000: fuzzed or compressed perf.data?
  Error:
  failed to process sample

Fixes: 57fc032ad643ffd0 ("perf session: Avoid infinite loop when seeing invalid header.size")
Reviewed-by: James Clark <james.clark@arm.com>
Signed-off-by: Denis Nikitin <denik@chromium.org>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexey Budankov <alexey.budankov@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: https://lore.kernel.org/r/20220330031130.2152327-1-denik@chromium.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 tools/perf/util/session.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 9dddec19a494..354e1e04a266 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -2056,6 +2056,7 @@ prefetch_event(char *buf, u64 head, size_t mmap_size,
 	       bool needs_swap, union perf_event *error)
 {
 	union perf_event *event;
+	u16 event_size;
 
 	/*
 	 * Ensure we have enough space remaining to read
@@ -2068,15 +2069,23 @@ prefetch_event(char *buf, u64 head, size_t mmap_size,
 	if (needs_swap)
 		perf_event_header__bswap(&event->header);
 
-	if (head + event->header.size <= mmap_size)
+	event_size = event->header.size;
+	if (head + event_size <= mmap_size)
 		return event;
 
 	/* We're not fetching the event so swap back again */
 	if (needs_swap)
 		perf_event_header__bswap(&event->header);
 
-	pr_debug("%s: head=%#" PRIx64 " event->header_size=%#x, mmap_size=%#zx:"
-		 " fuzzed or compressed perf.data?\n",__func__, head, event->header.size, mmap_size);
+	/* Check if the event fits into the next mmapped buf. */
+	if (event_size <= mmap_size - head % page_size) {
+		/* Remap buf and fetch again. */
+		return NULL;
+	}
+
+	/* Invalid input. Event size should never exceed mmap_size. */
+	pr_debug("%s: head=%#" PRIx64 " event->header.size=%#x, mmap_size=%#zx:"
+		 " fuzzed or compressed perf.data?\n", __func__, head, event_size, mmap_size);
 
 	return error;
 }
-- 
Gitee


From a5b625b788104b3bf7c3201c0a75f527be93182c Mon Sep 17 00:00:00 2001
From: Chanho Park <chanho61.park@samsung.com>
Date: Thu, 14 Jul 2022 21:59:07 +0800
Subject: [PATCH 316/674] arm64: Add part number for Arm Cortex-A78AE

stable inclusion
from stable-v5.10.111
commit 9de98470db6ec5223ab47ec50b21c40f4d08655c
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=9de98470db6ec5223ab47ec50b21c40f4d08655c

--------------------------------

commit 83bea32ac7ed37bbda58733de61fc9369513f9f9 upstream.

Add the MIDR part number info for the Arm Cortex-A78AE[1] and add it to
spectre-BHB affected list[2].

[1]: https://developer.arm.com/Processors/Cortex-A78AE
[2]: https://developer.arm.com/Arm%20Security%20Center/Spectre-BHB

Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: James Morse <james.morse@arm.com>
Signed-off-by: Chanho Park <chanho61.park@samsung.com>
Link: https://lore.kernel.org/r/20220407091128.8700-1-chanho61.park@samsung.com
Signed-off-by: Will Deacon <will@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>

 Conflicts:
	arch/arm64/include/asm/cputype.h

Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 arch/arm64/include/asm/cputype.h | 2 ++
 arch/arm64/kernel/proton-pack.c  | 1 +
 2 files changed, 3 insertions(+)

diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h
index 2e0066f13ad4..662708c56397 100644
--- a/arch/arm64/include/asm/cputype.h
+++ b/arch/arm64/include/asm/cputype.h
@@ -76,6 +76,7 @@
 #define ARM_CPU_PART_CORTEX_A77		0xD0D
 #define ARM_CPU_PART_NEOVERSE_V1	0xD40
 #define ARM_CPU_PART_CORTEX_A78		0xD41
+#define ARM_CPU_PART_CORTEX_A78AE	0xD42
 #define ARM_CPU_PART_CORTEX_X1		0xD44
 #define ARM_CPU_PART_CORTEX_A78C	0xD4B
 #define ARM_CPU_PART_CORTEX_A510	0xD46
@@ -131,6 +132,7 @@
 #define MIDR_CORTEX_A77	MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A77)
 #define MIDR_NEOVERSE_V1 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_V1)
 #define MIDR_CORTEX_A78 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A78)
+#define MIDR_CORTEX_A78AE	MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A78AE)
 #define MIDR_CORTEX_X1 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X1)
 #define MIDR_CORTEX_A78C MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A78C)
 #define MIDR_CORTEX_A510 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A510)
diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c
index a3fb28e170dd..e807f77737e0 100644
--- a/arch/arm64/kernel/proton-pack.c
+++ b/arch/arm64/kernel/proton-pack.c
@@ -861,6 +861,7 @@ u8 spectre_bhb_loop_affected(int scope)
 	if (scope == SCOPE_LOCAL_CPU) {
 		static const struct midr_range spectre_bhb_k32_list[] = {
 			MIDR_ALL_VERSIONS(MIDR_CORTEX_A78),
+			MIDR_ALL_VERSIONS(MIDR_CORTEX_A78AE),
 			MIDR_ALL_VERSIONS(MIDR_CORTEX_A78C),
 			MIDR_ALL_VERSIONS(MIDR_CORTEX_X1),
 			MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1),
-- 
Gitee


From a5207284dd6072c7e9eaa0d823042528e1eb8d4f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pali=20Roh=C3=A1r?= <pali@kernel.org>
Date: Thu, 14 Jul 2022 21:59:08 +0800
Subject: [PATCH 317/674] Revert "mmc: sdhci-xenon: fix annoying 1.8V regulator
 warning"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

stable inclusion
from stable-v5.10.111
commit 41a519c05beee0ad9a0e53143fe24c6eeb677db6
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=41a519c05beee0ad9a0e53143fe24c6eeb677db6

--------------------------------

commit 7e2646ed47542123168d43916b84b954532e5386 upstream.

This reverts commit bb32e1987bc55ce1db400faf47d85891da3c9b9f.

Commit 1a3ed0dc3594 ("mmc: sdhci-xenon: fix 1.8v regulator stabilization")
contains proper fix for the issue described in commit bb32e1987bc5 ("mmc:
sdhci-xenon: fix annoying 1.8V regulator warning").

Fixes: 8d876bf472db ("mmc: sdhci-xenon: wait 5ms after set 1.8V signal enable")
Cc: stable@vger.kernel.org # 1a3ed0dc3594 ("mmc: sdhci-xenon: fix 1.8v regulator stabilization")
Signed-off-by: Pali Rohár <pali@kernel.org>
Reviewed-by: Marek Behún <kabel@kernel.org>
Reviewed-by: Marcin Wojtas <mw@semihalf.com>
Link: https://lore.kernel.org/r/20220318141441.32329-1-pali@kernel.org
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/mmc/host/sdhci-xenon.c | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/drivers/mmc/host/sdhci-xenon.c b/drivers/mmc/host/sdhci-xenon.c
index 0e5234a5ca22..d509198c00c8 100644
--- a/drivers/mmc/host/sdhci-xenon.c
+++ b/drivers/mmc/host/sdhci-xenon.c
@@ -240,16 +240,6 @@ static void xenon_voltage_switch(struct sdhci_host *host)
 {
 	/* Wait for 5ms after set 1.8V signal enable bit */
 	usleep_range(5000, 5500);
-
-	/*
-	 * For some reason the controller's Host Control2 register reports
-	 * the bit representing 1.8V signaling as 0 when read after it was
-	 * written as 1. Subsequent read reports 1.
-	 *
-	 * Since this may cause some issues, do an empty read of the Host
-	 * Control2 register here to circumvent this.
-	 */
-	sdhci_readw(host, SDHCI_HOST_CONTROL2);
 }
 
 static const struct sdhci_ops sdhci_xenon_ops = {
-- 
Gitee


From 786719f2db6e047c47741f627891993075e7a54d Mon Sep 17 00:00:00 2001
From: Yann Gautier <yann.gautier@foss.st.com>
Date: Thu, 14 Jul 2022 21:59:09 +0800
Subject: [PATCH 318/674] mmc: mmci: stm32: correctly check all elements of sg
 list

stable inclusion
from stable-v5.10.111
commit 029b4170737f03253dd5cb9a57a657ce0cc2b532
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=029b4170737f03253dd5cb9a57a657ce0cc2b532

--------------------------------

commit 0d319dd5a27183b75d984e3dc495248e59f99334 upstream.

Use sg and not data->sg when checking sg list elements. Else only the
first element alignment is checked.
The last element should be checked the same way, for_each_sg already set
sg to sg_next(sg).

Fixes: 46b723dd867d ("mmc: mmci: add stm32 sdmmc variant")
Cc: stable@vger.kernel.org
Signed-off-by: Yann Gautier <yann.gautier@foss.st.com>
Link: https://lore.kernel.org/r/20220317111944.116148-2-yann.gautier@foss.st.com
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/mmc/host/mmci_stm32_sdmmc.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/mmc/host/mmci_stm32_sdmmc.c b/drivers/mmc/host/mmci_stm32_sdmmc.c
index a75d3dd34d18..4cceb9bab036 100644
--- a/drivers/mmc/host/mmci_stm32_sdmmc.c
+++ b/drivers/mmc/host/mmci_stm32_sdmmc.c
@@ -62,8 +62,8 @@ static int sdmmc_idma_validate_data(struct mmci_host *host,
 	 * excepted the last element which has no constraint on idmasize
 	 */
 	for_each_sg(data->sg, sg, data->sg_len - 1, i) {
-		if (!IS_ALIGNED(data->sg->offset, sizeof(u32)) ||
-		    !IS_ALIGNED(data->sg->length, SDMMC_IDMA_BURST)) {
+		if (!IS_ALIGNED(sg->offset, sizeof(u32)) ||
+		    !IS_ALIGNED(sg->length, SDMMC_IDMA_BURST)) {
 			dev_err(mmc_dev(host->mmc),
 				"unaligned scatterlist: ofst:%x length:%d\n",
 				data->sg->offset, data->sg->length);
@@ -71,7 +71,7 @@ static int sdmmc_idma_validate_data(struct mmci_host *host,
 		}
 	}
 
-	if (!IS_ALIGNED(data->sg->offset, sizeof(u32))) {
+	if (!IS_ALIGNED(sg->offset, sizeof(u32))) {
 		dev_err(mmc_dev(host->mmc),
 			"unaligned last scatterlist: ofst:%x length:%d\n",
 			data->sg->offset, data->sg->length);
-- 
Gitee


From 54d20fe524573e5de33005a6fc6e362820bee749 Mon Sep 17 00:00:00 2001
From: Wolfram Sang <wsa+renesas@sang-engineering.com>
Date: Thu, 14 Jul 2022 21:59:10 +0800
Subject: [PATCH 319/674] mmc: renesas_sdhi: don't overwrite TAP settings when
 HS400 tuning is complete

stable inclusion
from stable-v5.10.111
commit 8b6f04b4c9d965c04431620d12afb5e729c7eacf
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=8b6f04b4c9d965c04431620d12afb5e729c7eacf

--------------------------------

commit 03e59b1e2f56245163b14c69e0a830c24b1a3a47 upstream.

When HS400 tuning is complete and HS400 is going to be activated, we
have to keep the current number of TAPs and should not overwrite them
with a hardcoded value. This was probably a copy&paste mistake when
upporting HS400 support from the BSP.

Fixes: 26eb2607fa28 ("mmc: renesas_sdhi: add eMMC HS400 mode support")
Reported-by: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Reviewed-by: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20220404114902.12175-1-wsa+renesas@sang-engineering.com
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/mmc/host/renesas_sdhi_core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/mmc/host/renesas_sdhi_core.c b/drivers/mmc/host/renesas_sdhi_core.c
index 782879d46ff4..ac01fb518386 100644
--- a/drivers/mmc/host/renesas_sdhi_core.c
+++ b/drivers/mmc/host/renesas_sdhi_core.c
@@ -390,10 +390,10 @@ static void renesas_sdhi_hs400_complete(struct mmc_host *mmc)
 			SH_MOBILE_SDHI_SCC_TMPPORT2_HS400OSEL) |
 			sd_scc_read32(host, priv, SH_MOBILE_SDHI_SCC_TMPPORT2));
 
-	/* Set the sampling clock selection range of HS400 mode */
 	sd_scc_write32(host, priv, SH_MOBILE_SDHI_SCC_DTCNTL,
 		       SH_MOBILE_SDHI_SCC_DTCNTL_TAPEN |
-		       0x4 << SH_MOBILE_SDHI_SCC_DTCNTL_TAPNUM_SHIFT);
+		       sd_scc_read32(host, priv,
+				     SH_MOBILE_SDHI_SCC_DTCNTL));
 
 	/* Avoid bad TAP */
 	if (bad_taps & BIT(priv->tap_set)) {
-- 
Gitee


From b55f36cef846f393fbea2977d0423c201a4c9e5f Mon Sep 17 00:00:00 2001
From: Guo Xuenan <guoxuenan@huawei.com>
Date: Thu, 14 Jul 2022 21:59:11 +0800
Subject: [PATCH 320/674] lz4: fix LZ4_decompress_safe_partial read out of
 bound

stable inclusion
from stable-v5.10.111
commit 6adc01a7aa37445dafe8846faa0610a86029b253
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=6adc01a7aa37445dafe8846faa0610a86029b253

--------------------------------

commit eafc0a02391b7b36617b36c97c4b5d6832cf5e24 upstream.

When partialDecoding, it is EOF if we've either filled the output buffer
or can't proceed with reading an offset for following match.

In some extreme corner cases when compressed data is suitably corrupted,
UAF will occur.  As reported by KASAN [1], LZ4_decompress_safe_partial
may lead to read out of bound problem during decoding.  lz4 upstream has
fixed it [2] and this issue has been disscussed here [3] before.

current decompression routine was ported from lz4 v1.8.3, bumping
lib/lz4 to v1.9.+ is certainly a huge work to be done later, so, we'd
better fix it first.

[1] https://lore.kernel.org/all/000000000000830d1205cf7f0477@google.com/
[2] https://github.com/lz4/lz4/commit/c5d6f8a8be3927c0bec91bcc58667a6cfad244ad#
[3] https://lore.kernel.org/all/CC666AE8-4CA4-4951-B6FB-A2EFDE3AC03B@fb.com/

Link: https://lkml.kernel.org/r/20211111105048.2006070-1-guoxuenan@huawei.com
Reported-by: syzbot+63d688f1d899c588fb71@syzkaller.appspotmail.com
Signed-off-by: Guo Xuenan <guoxuenan@huawei.com>
Reviewed-by: Nick Terrell <terrelln@fb.com>
Acked-by: Gao Xiang <hsiangkao@linux.alibaba.com>
Cc: Yann Collet <cyan@fb.com>
Cc: Chengyang Fan <cy.fan@huawei.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 lib/lz4/lz4_decompress.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c
index 8a7724a6ce2f..5b6705c4b2d2 100644
--- a/lib/lz4/lz4_decompress.c
+++ b/lib/lz4/lz4_decompress.c
@@ -271,8 +271,12 @@ static FORCE_INLINE int LZ4_decompress_generic(
 			ip += length;
 			op += length;
 
-			/* Necessarily EOF, due to parsing restrictions */
-			if (!partialDecoding || (cpy == oend))
+			/* Necessarily EOF when !partialDecoding.
+			 * When partialDecoding, it is EOF if we've either
+			 * filled the output buffer or
+			 * can't proceed with reading an offset for following match.
+			 */
+			if (!partialDecoding || (cpy == oend) || (ip >= (iend - 2)))
 				break;
 		} else {
 			/* may overwrite up to WILDCOPYLENGTH beyond cpy */
-- 
Gitee


From e4646733d5b0658a060d834a2043dcc314498658 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 14 Jul 2022 21:59:12 +0800
Subject: [PATCH 321/674] mmmremap.c: avoid pointless
 invalidate_range_start/end on mremap(old_size=0)

stable inclusion
from stable-v5.10.111
commit 7d659cb1763ff17d1c6ee082fa6feb4267c7a30b
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=7d659cb1763ff17d1c6ee082fa6feb4267c7a30b

--------------------------------

commit 01e67e04c28170c47700c2c226d732bbfedb1ad0 upstream.

If an mremap() syscall with old_size=0 ends up in move_page_tables(), it
will call invalidate_range_start()/invalidate_range_end() unnecessarily,
i.e.  with an empty range.

This causes a WARN in KVM's mmu_notifier.  In the past, empty ranges
have been diagnosed to be off-by-one bugs, hence the WARNing.  Given the
low (so far) number of unique reports, the benefits of detecting more
buggy callers seem to outweigh the cost of having to fix cases such as
this one, where userspace is doing something silly.  In this particular
case, an early return from move_page_tables() is enough to fix the
issue.

Link: https://lkml.kernel.org/r/20220329173155.172439-1-pbonzini@redhat.com
Reported-by: syzbot+6bde52d89cfdf9f61425@syzkaller.appspotmail.com
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Cc: Sean Christopherson <seanjc@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 mm/mremap.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/mm/mremap.c b/mm/mremap.c
index ecfca97b97ae..2f7f3494a990 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -487,6 +487,9 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
 	pmd_t *old_pmd, *new_pmd;
 	pud_t *old_pud, *new_pud;
 
+	if (!len)
+		return 0;
+
 	old_end = old_addr + len;
 	flush_cache_range(vma, old_addr, old_end);
 
-- 
Gitee


From 31a2a59fa5a7d5c8fc20ef0aec8cbd3816e0905b Mon Sep 17 00:00:00 2001
From: Miaohe Lin <linmiaohe@huawei.com>
Date: Thu, 14 Jul 2022 21:59:13 +0800
Subject: [PATCH 322/674] mm/mempolicy: fix mpol_new leak in
 shared_policy_replace

stable inclusion
from stable-v5.10.111
commit f7e183b0a7136b6dc9c7b9b2a85a608a8feba894
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f7e183b0a7136b6dc9c7b9b2a85a608a8feba894

--------------------------------

commit 4ad099559b00ac01c3726e5c95dc3108ef47d03e upstream.

If mpol_new is allocated but not used in restart loop, mpol_new will be
freed via mpol_put before returning to the caller.  But refcnt is not
initialized yet, so mpol_put could not do the right things and might
leak the unused mpol_new.  This would happen if mempolicy was updated on
the shared shmem file while the sp->lock has been dropped during the
memory allocation.

This issue could be triggered easily with the below code snippet if
there are many processes doing the below work at the same time:

  shmid = shmget((key_t)5566, 1024 * PAGE_SIZE, 0666|IPC_CREAT);
  shm = shmat(shmid, 0, 0);
  loop many times {
    mbind(shm, 1024 * PAGE_SIZE, MPOL_LOCAL, mask, maxnode, 0);
    mbind(shm + 128 * PAGE_SIZE, 128 * PAGE_SIZE, MPOL_DEFAULT, mask,
          maxnode, 0);
  }

Link: https://lkml.kernel.org/r/20220329111416.27954-1-linmiaohe@huawei.com
Fixes: 42288fe366c4 ("mm: mempolicy: Convert shared_policy mutex to spinlock")
Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: <stable@vger.kernel.org>	[3.8]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 mm/mempolicy.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index ef7eb6a06827..e7fdb3a749b5 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2690,6 +2690,7 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
 	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 	if (!mpol_new)
 		goto err_out;
+	atomic_set(&mpol_new->refcnt, 1);
 	goto restart;
 }
 
-- 
Gitee


From eb917904cf24022c23f3e55f39678545e0c45883 Mon Sep 17 00:00:00 2001
From: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
Date: Thu, 14 Jul 2022 21:59:14 +0800
Subject: [PATCH 323/674] x86/pm: Save the MSR validity status at context setup

stable inclusion
from stable-v5.10.111
commit 8c9e26c890ba130ad98137770d221dac7853540c
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=8c9e26c890ba130ad98137770d221dac7853540c

--------------------------------

commit 73924ec4d560257004d5b5116b22a3647661e364 upstream.

The mechanism to save/restore MSRs during S3 suspend/resume checks for
the MSR validity during suspend, and only restores the MSR if its a
valid MSR.  This is not optimal, as an invalid MSR will unnecessarily
throw an exception for every suspend cycle.  The more invalid MSRs,
higher the impact will be.

Check and save the MSR validity at setup.  This ensures that only valid
MSRs that are guaranteed to not throw an exception will be attempted
during suspend.

Fixes: 7a9c2dd08ead ("x86/pm: Introduce quirk framework to save/restore extra MSR registers around suspend/resume")
Suggested-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
Reviewed-by: Dave Hansen <dave.hansen@linux.intel.com>
Acked-by: Borislav Petkov <bp@suse.de>
Cc: stable@vger.kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 arch/x86/power/cpu.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index db1378c6ff26..3b42175673c2 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -40,7 +40,8 @@ static void msr_save_context(struct saved_context *ctxt)
 	struct saved_msr *end = msr + ctxt->saved_msrs.num;
 
 	while (msr < end) {
-		msr->valid = !rdmsrl_safe(msr->info.msr_no, &msr->info.reg.q);
+		if (msr->valid)
+			rdmsrl(msr->info.msr_no, msr->info.reg.q);
 		msr++;
 	}
 }
@@ -427,8 +428,10 @@ static int msr_build_context(const u32 *msr_id, const int num)
 	}
 
 	for (i = saved_msrs->num, j = 0; i < total_num; i++, j++) {
+		u64 dummy;
+
 		msr_array[i].info.msr_no	= msr_id[j];
-		msr_array[i].valid		= false;
+		msr_array[i].valid		= !rdmsrl_safe(msr_id[j], &dummy);
 		msr_array[i].info.reg.q		= 0;
 	}
 	saved_msrs->num   = total_num;
-- 
Gitee


From d362730053576be5d068beb2a76a2fae15608616 Mon Sep 17 00:00:00 2001
From: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
Date: Thu, 14 Jul 2022 21:59:15 +0800
Subject: [PATCH 324/674] x86/speculation: Restore speculation related MSRs
 during S3 resume

stable inclusion
from stable-v5.10.111
commit fc4bdaed4d4ea4209e65115bd3948a1e4ac51cbb
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=fc4bdaed4d4ea4209e65115bd3948a1e4ac51cbb

--------------------------------

commit e2a1256b17b16f9b9adf1b6fea56819e7b68e463 upstream.

After resuming from suspend-to-RAM, the MSRs that control CPU's
speculative execution behavior are not being restored on the boot CPU.

These MSRs are used to mitigate speculative execution vulnerabilities.
Not restoring them correctly may leave the CPU vulnerable.  Secondary
CPU's MSRs are correctly being restored at S3 resume by
identify_secondary_cpu().

During S3 resume, restore these MSRs for boot CPU when restoring its
processor state.

Fixes: 772439717dbf ("x86/bugs/intel: Set proper CPU features and setup RDS")
Reported-by: Neelima Krishnan <neelima.krishnan@intel.com>
Signed-off-by: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
Tested-by: Neelima Krishnan <neelima.krishnan@intel.com>
Acked-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: stable@vger.kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 arch/x86/power/cpu.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 3b42175673c2..decebcd8ee1c 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -506,10 +506,24 @@ static int pm_cpu_check(const struct x86_cpu_id *c)
 	return ret;
 }
 
+static void pm_save_spec_msr(void)
+{
+	u32 spec_msr_id[] = {
+		MSR_IA32_SPEC_CTRL,
+		MSR_IA32_TSX_CTRL,
+		MSR_TSX_FORCE_ABORT,
+		MSR_IA32_MCU_OPT_CTRL,
+		MSR_AMD64_LS_CFG,
+	};
+
+	msr_build_context(spec_msr_id, ARRAY_SIZE(spec_msr_id));
+}
+
 static int pm_check_save_msr(void)
 {
 	dmi_check_system(msr_save_dmi_table);
 	pm_cpu_check(msr_save_cpu_table);
+	pm_save_spec_msr();
 
 	return 0;
 }
-- 
Gitee


From 8fda77c8ca2585f55d6655055152994765478271 Mon Sep 17 00:00:00 2001
From: Ethan Lien <ethanlien@synology.com>
Date: Thu, 14 Jul 2022 21:59:16 +0800
Subject: [PATCH 325/674] btrfs: fix qgroup reserve overflow the qgroup limit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

stable inclusion
from stable-v5.10.111
commit 82ae73ac963cee877ce34f7c31b2b456b516e96c
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=82ae73ac963cee877ce34f7c31b2b456b516e96c

--------------------------------

commit b642b52d0b50f4d398cb4293f64992d0eed2e2ce upstream.

We use extent_changeset->bytes_changed in qgroup_reserve_data() to record
how many bytes we set for EXTENT_QGROUP_RESERVED state. Currently the
bytes_changed is set as "unsigned int", and it will overflow if we try to
fallocate a range larger than 4GiB. The result is we reserve less bytes
and eventually break the qgroup limit.

Unlike regular buffered/direct write, which we use one changeset for
each ordered extent, which can never be larger than 256M.  For
fallocate, we use one changeset for the whole range, thus it no longer
respects the 256M per extent limit, and caused the problem.

The following example test script reproduces the problem:

  $ cat qgroup-overflow.sh
  #!/bin/bash

  DEV=/dev/sdj
  MNT=/mnt/sdj

  mkfs.btrfs -f $DEV
  mount $DEV $MNT

  # Set qgroup limit to 2GiB.
  btrfs quota enable $MNT
  btrfs qgroup limit 2G $MNT

  # Try to fallocate a 3GiB file. This should fail.
  echo
  echo "Try to fallocate a 3GiB file..."
  fallocate -l 3G $MNT/3G.file

  # Try to fallocate a 5GiB file.
  echo
  echo "Try to fallocate a 5GiB file..."
  fallocate -l 5G $MNT/5G.file

  # See we break the qgroup limit.
  echo
  sync
  btrfs qgroup show -r $MNT

  umount $MNT

When running the test:

  $ ./qgroup-overflow.sh
  (...)

  Try to fallocate a 3GiB file...
  fallocate: fallocate failed: Disk quota exceeded

  Try to fallocate a 5GiB file...

  qgroupid         rfer         excl     max_rfer
  --------         ----         ----     --------
  0/5           5.00GiB      5.00GiB      2.00GiB

Since we have no control of how bytes_changed is used, it's better to
set it to u64.

CC: stable@vger.kernel.org # 4.14+
Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Ethan Lien <ethanlien@synology.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 fs/btrfs/extent_io.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index f39d02e7f7ef..16f44bc481ab 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -121,7 +121,7 @@ struct extent_buffer {
  */
 struct extent_changeset {
 	/* How many bytes are set/cleared in this operation */
-	unsigned int bytes_changed;
+	u64 bytes_changed;
 
 	/* Changed ranges */
 	struct ulist range_changed;
-- 
Gitee


From 6ecec32ac172c992aaaeaa8846ab3c96905e9ac9 Mon Sep 17 00:00:00 2001
From: Kaiwen Hu <kevinhu@synology.com>
Date: Thu, 14 Jul 2022 21:59:17 +0800
Subject: [PATCH 326/674] btrfs: prevent subvol with swapfile from being
 deleted

stable inclusion
from stable-v5.10.111
commit a044bca8ef310bd43e3c683a1b1df66f96fa9e0a
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=a044bca8ef310bd43e3c683a1b1df66f96fa9e0a

--------------------------------

commit 60021bd754c6ca0addc6817994f20290a321d8d6 upstream.

A subvolume with an active swapfile must not be deleted otherwise it
would not be possible to deactivate it.

After the subvolume is deleted, we cannot swapoff the swapfile in this
deleted subvolume because the path is unreachable.  The swapfile is
still active and holding references, the filesystem cannot be unmounted.

The test looks like this:

  mkfs.btrfs -f $dev > /dev/null
  mount $dev $mnt

  btrfs sub create $mnt/subvol
  touch $mnt/subvol/swapfile
  chmod 600 $mnt/subvol/swapfile
  chattr +C $mnt/subvol/swapfile
  dd if=/dev/zero of=$mnt/subvol/swapfile bs=1K count=4096
  mkswap $mnt/subvol/swapfile
  swapon $mnt/subvol/swapfile

  btrfs sub delete $mnt/subvol
  swapoff $mnt/subvol/swapfile  # failed: No such file or directory
  swapoff --all

  unmount $mnt                  # target is busy.

To prevent above issue, we simply check that whether the subvolume
contains any active swapfile, and stop the deleting process.  This
behavior is like snapshot ioctl dealing with a swapfile.

CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Robbie Ko <robbieko@synology.com>
Reviewed-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Kaiwen Hu <kevinhu@synology.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 fs/btrfs/inode.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1d9262a35473..f7f4ac01589b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4023,6 +4023,13 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
 			   dest->root_key.objectid);
 		return -EPERM;
 	}
+	if (atomic_read(&dest->nr_swapfiles)) {
+		spin_unlock(&dest->root_item_lock);
+		btrfs_warn(fs_info,
+			   "attempt to delete subvolume %llu with active swapfile",
+			   root->root_key.objectid);
+		return -EPERM;
+	}
 	root_flags = btrfs_root_flags(&dest->root_item);
 	btrfs_set_root_flags(&dest->root_item,
 			     root_flags | BTRFS_ROOT_SUBVOL_DEAD);
@@ -10215,8 +10222,23 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
 	 * set. We use this counter to prevent snapshots. We must increment it
 	 * before walking the extents because we don't want a concurrent
 	 * snapshot to run after we've already checked the extents.
+	 *
+	 * It is possible that subvolume is marked for deletion but still not
+	 * removed yet. To prevent this race, we check the root status before
+	 * activating the swapfile.
 	 */
+	spin_lock(&root->root_item_lock);
+	if (btrfs_root_dead(root)) {
+		spin_unlock(&root->root_item_lock);
+
+		btrfs_exclop_finish(fs_info);
+		btrfs_warn(fs_info,
+		"cannot activate swapfile because subvolume %llu is being deleted",
+			root->root_key.objectid);
+		return -EPERM;
+	}
 	atomic_inc(&root->nr_swapfiles);
+	spin_unlock(&root->root_item_lock);
 
 	isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);
 
-- 
Gitee


From 46529dec611c6d97453432ea503a0080ae79a783 Mon Sep 17 00:00:00 2001
From: Guo Ren <guoren@linux.alibaba.com>
Date: Thu, 14 Jul 2022 21:59:18 +0800
Subject: [PATCH 327/674] arm64: patch_text: Fixup last cpu should be master

stable inclusion
from stable-v5.10.111
commit 8bb41682911f8d3d0874af6646406856290565a9
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=8bb41682911f8d3d0874af6646406856290565a9

--------------------------------

commit 31a099dbd91e69fcab55eef4be15ed7a8c984918 upstream.

These patch_text implementations are using stop_machine_cpuslocked
infrastructure with atomic cpu_count. The original idea: When the
master CPU patch_text, the others should wait for it. But current
implementation is using the first CPU as master, which couldn't
guarantee the remaining CPUs are waiting. This patch changes the
last CPU as the master to solve the potential risk.

Fixes: ae16480785de ("arm64: introduce interfaces to hotpatch kernel and module code")
Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
Signed-off-by: Guo Ren <guoren@kernel.org>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
Cc: <stable@vger.kernel.org>
Link: https://lore.kernel.org/r/20220407073323.743224-2-guoren@kernel.org
Signed-off-by: Will Deacon <will@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 arch/arm64/kernel/insn.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c
index 6c0de2f60ea9..7d4fdf974542 100644
--- a/arch/arm64/kernel/insn.c
+++ b/arch/arm64/kernel/insn.c
@@ -216,8 +216,8 @@ static int __kprobes aarch64_insn_patch_text_cb(void *arg)
 	int i, ret = 0;
 	struct aarch64_insn_patch *pp = arg;
 
-	/* The first CPU becomes master */
-	if (atomic_inc_return(&pp->cpu_count) == 1) {
+	/* The last CPU becomes master */
+	if (atomic_inc_return(&pp->cpu_count) == num_online_cpus()) {
 		for (i = 0; ret == 0 && i < pp->insn_cnt; i++)
 			ret = aarch64_insn_patch_text_nosync(pp->text_addrs[i],
 							     pp->new_insns[i]);
-- 
Gitee


From f71259712007a69998415360475dfe01721c728d Mon Sep 17 00:00:00 2001
From: Douglas Miller <doug.miller@cornelisnetworks.com>
Date: Thu, 14 Jul 2022 21:59:19 +0800
Subject: [PATCH 328/674] RDMA/hfi1: Fix use-after-free bug for mm struct

stable inclusion
from stable-v5.10.111
commit 5f54364ff6cfcd14cddf5441c4a490bb28dd69f7
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=5f54364ff6cfcd14cddf5441c4a490bb28dd69f7

--------------------------------

commit 2bbac98d0930e8161b1957dc0ec99de39ade1b3c upstream.

Under certain conditions, such as MPI_Abort, the hfi1 cleanup code may
represent the last reference held on the task mm.
hfi1_mmu_rb_unregister() then drops the last reference and the mm is freed
before the final use in hfi1_release_user_pages().  A new task may
allocate the mm structure while it is still being used, resulting in
problems. One manifestation is corruption of the mmap_sem counter leading
to a hang in down_write().  Another is corruption of an mm struct that is
in use by another task.

Fixes: 3d2a9d642512 ("IB/hfi1: Ensure correct mm is used at all times")
Link: https://lore.kernel.org/r/20220408133523.122165.72975.stgit@awfm-01.cornelisnetworks.com
Cc: <stable@vger.kernel.org>
Signed-off-by: Douglas Miller <doug.miller@cornelisnetworks.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@cornelisnetworks.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/infiniband/hw/hfi1/mmu_rb.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c
index d213f65d4cdd..ed8a96ae61ce 100644
--- a/drivers/infiniband/hw/hfi1/mmu_rb.c
+++ b/drivers/infiniband/hw/hfi1/mmu_rb.c
@@ -121,6 +121,9 @@ void hfi1_mmu_rb_unregister(struct mmu_rb_handler *handler)
 	unsigned long flags;
 	struct list_head del_list;
 
+	/* Prevent freeing of mm until we are completely finished. */
+	mmgrab(handler->mn.mm);
+
 	/* Unregister first so we don't get any more notifications. */
 	mmu_notifier_unregister(&handler->mn, handler->mn.mm);
 
@@ -143,6 +146,9 @@ void hfi1_mmu_rb_unregister(struct mmu_rb_handler *handler)
 
 	do_remove(handler, &del_list);
 
+	/* Now the mm may be freed. */
+	mmdrop(handler->mn.mm);
+
 	kfree(handler);
 }
 
-- 
Gitee


From b20392ad0b9b08a9940ef5817929710e1637e9cd Mon Sep 17 00:00:00 2001
From: Shreeya Patel <shreeya.patel@collabora.com>
Date: Thu, 14 Jul 2022 21:59:20 +0800
Subject: [PATCH 329/674] gpio: Restrict usage of GPIO chip irq members before
 initialization

stable inclusion
from stable-v5.10.111
commit 7e88a50704b0c49ad3f2d11e8b963341cf68a89f
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=7e88a50704b0c49ad3f2d11e8b963341cf68a89f

--------------------------------

commit 5467801f1fcbdc46bc7298a84dbf3ca1ff2a7320 upstream.

GPIO chip irq members are exposed before they could be completely
initialized and this leads to race conditions.

One such issue was observed for the gc->irq.domain variable which
was accessed through the I2C interface in gpiochip_to_irq() before
it could be initialized by gpiochip_add_irqchip(). This resulted in
Kernel NULL pointer dereference.

Following are the logs for reference :-

kernel: Call Trace:
kernel:  gpiod_to_irq+0x53/0x70
kernel:  acpi_dev_gpio_irq_get_by+0x113/0x1f0
kernel:  i2c_acpi_get_irq+0xc0/0xd0
kernel:  i2c_device_probe+0x28a/0x2a0
kernel:  really_probe+0xf2/0x460
kernel: RIP: 0010:gpiochip_to_irq+0x47/0xc0

To avoid such scenarios, restrict usage of GPIO chip irq members before
they are completely initialized.

Signed-off-by: Shreeya Patel <shreeya.patel@collabora.com>
Cc: stable@vger.kernel.org
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Bartosz Golaszewski <brgl@bgdev.pl>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/gpio/gpiolib.c      | 19 +++++++++++++++++++
 include/linux/gpio/driver.h |  9 +++++++++
 2 files changed, 28 insertions(+)

diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index 00526fdd7691..d18078748200 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -1411,6 +1411,16 @@ static int gpiochip_to_irq(struct gpio_chip *gc, unsigned offset)
 {
 	struct irq_domain *domain = gc->irq.domain;
 
+#ifdef CONFIG_GPIOLIB_IRQCHIP
+	/*
+	 * Avoid race condition with other code, which tries to lookup
+	 * an IRQ before the irqchip has been properly registered,
+	 * i.e. while gpiochip is still being brought up.
+	 */
+	if (!gc->irq.initialized)
+		return -EPROBE_DEFER;
+#endif
+
 	if (!gpiochip_irqchip_irq_valid(gc, offset))
 		return -ENXIO;
 
@@ -1604,6 +1614,15 @@ static int gpiochip_add_irqchip(struct gpio_chip *gc,
 
 	acpi_gpiochip_request_interrupts(gc);
 
+	/*
+	 * Using barrier() here to prevent compiler from reordering
+	 * gc->irq.initialized before initialization of above
+	 * GPIO chip irq members.
+	 */
+	barrier();
+
+	gc->irq.initialized = true;
+
 	return 0;
 }
 
diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index 8e144306e262..b216899b4745 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -224,6 +224,15 @@ struct gpio_irq_chip {
 				unsigned long *valid_mask,
 				unsigned int ngpios);
 
+	/**
+	 * @initialized:
+	 *
+	 * Flag to track GPIO chip irq member's initialization.
+	 * This flag will make sure GPIO chip irq members are not used
+	 * before they are initialized.
+	 */
+	bool initialized;
+
 	/**
 	 * @valid_mask:
 	 *
-- 
Gitee


From 2a1074ace4931c8d4355fc8d897faea5e867f35d Mon Sep 17 00:00:00 2001
From: Christian Lamparter <chunkeey@gmail.com>
Date: Thu, 14 Jul 2022 21:59:21 +0800
Subject: [PATCH 330/674] ata: sata_dwc_460ex: Fix crash due to OOB write

stable inclusion
from stable-v5.10.111
commit fc629224aa62f23849cae83717932985ac51232d
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=fc629224aa62f23849cae83717932985ac51232d

--------------------------------

commit 7aa8104a554713b685db729e66511b93d989dd6a upstream.

the driver uses libata's "tag" values from in various arrays.
Since the mentioned patch bumped the ATA_TAG_INTERNAL to 32,
the value of the SATA_DWC_QCMD_MAX needs to account for that.

Otherwise ATA_TAG_INTERNAL usage cause similar crashes like
this as reported by Tice Rex on the OpenWrt Forum and
reproduced (with symbols) here:

| BUG: Kernel NULL pointer dereference at 0x00000000
| Faulting instruction address: 0xc03ed4b8
| Oops: Kernel access of bad area, sig: 11 [#1]
| BE PAGE_SIZE=4K PowerPC 44x Platform
| CPU: 0 PID: 362 Comm: scsi_eh_1 Not tainted 5.4.163 #0
| NIP:  c03ed4b8 LR: c03d27e8 CTR: c03ed36c
| REGS: cfa59950 TRAP: 0300   Not tainted  (5.4.163)
| MSR:  00021000 <CE,ME>  CR: 42000222  XER: 00000000
| DEAR: 00000000 ESR: 00000000
| GPR00: c03d27e8 cfa59a08 cfa55fe0 00000000 0fa46bc0 [...]
| [..]
| NIP [c03ed4b8] sata_dwc_qc_issue+0x14c/0x254
| LR [c03d27e8] ata_qc_issue+0x1c8/0x2dc
| Call Trace:
| [cfa59a08] [c003f4e0] __cancel_work_timer+0x124/0x194 (unreliable)
| [cfa59a78] [c03d27e8] ata_qc_issue+0x1c8/0x2dc
| [cfa59a98] [c03d2b3c] ata_exec_internal_sg+0x240/0x524
| [cfa59b08] [c03d2e98] ata_exec_internal+0x78/0xe0
| [cfa59b58] [c03d30fc] ata_read_log_page.part.38+0x1dc/0x204
| [cfa59bc8] [c03d324c] ata_identify_page_supported+0x68/0x130
| [...]

This is because sata_dwc_dma_xfer_complete() NULLs the
dma_pending's next neighbour "chan" (a *dma_chan struct) in
this '32' case right here (line ~735):
> hsdevp->dma_pending[tag] = SATA_DWC_DMA_PENDING_NONE;

Then the next time, a dma gets issued; dma_dwc_xfer_setup() passes
the NULL'd hsdevp->chan to the dmaengine_slave_config() which then
causes the crash.

With this patch, SATA_DWC_QCMD_MAX is now set to ATA_MAX_QUEUE + 1.
This avoids the OOB. But please note, there was a worthwhile discussion
on what ATA_TAG_INTERNAL and ATA_MAX_QUEUE is. And why there should not
be a "fake" 33 command-long queue size.

Ideally, the dw driver should account for the ATA_TAG_INTERNAL.
In Damien Le Moal's words: "... having looked at the driver, it
is a bigger change than just faking a 33rd "tag" that is in fact
not a command tag at all."

Fixes: 28361c403683c ("libata: add extra internal command")
Cc: stable@kernel.org # 4.18+
BugLink: https://github.com/openwrt/openwrt/issues/9505
Signed-off-by: Christian Lamparter <chunkeey@gmail.com>
Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/ata/sata_dwc_460ex.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/ata/sata_dwc_460ex.c b/drivers/ata/sata_dwc_460ex.c
index 982fe9112532..464260f66870 100644
--- a/drivers/ata/sata_dwc_460ex.c
+++ b/drivers/ata/sata_dwc_460ex.c
@@ -145,7 +145,11 @@ struct sata_dwc_device {
 #endif
 };
 
-#define SATA_DWC_QCMD_MAX	32
+/*
+ * Allow one extra special slot for commands and DMA management
+ * to account for libata internal commands.
+ */
+#define SATA_DWC_QCMD_MAX	(ATA_MAX_QUEUE + 1)
 
 struct sata_dwc_device_port {
 	struct sata_dwc_device	*hsdev;
-- 
Gitee


From 14b1b10261db1b384c4c1e3797f8618cf80cc71b Mon Sep 17 00:00:00 2001
From: Xiaomeng Tong <xiam0nd.tong@gmail.com>
Date: Thu, 14 Jul 2022 21:59:22 +0800
Subject: [PATCH 331/674] perf: qcom_l2_pmu: fix an incorrect NULL check on
 list iterator

stable inclusion
from stable-v5.10.111
commit 451214b266e949b219d27faa9f44b434086a2f92
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=451214b266e949b219d27faa9f44b434086a2f92

--------------------------------

commit 2012a9e279013933885983cbe0a5fe828052563b upstream.

The bug is here:
	return cluster;

The list iterator value 'cluster' will *always* be set and non-NULL
by list_for_each_entry(), so it is incorrect to assume that the
iterator value will be NULL if the list is empty or no element
is found.

To fix the bug, return 'cluster' when found, otherwise return NULL.

Cc: stable@vger.kernel.org
Fixes: 21bdbb7102ed ("perf: add qcom l2 cache perf events driver")
Signed-off-by: Xiaomeng Tong <xiam0nd.tong@gmail.com>
Link: https://lore.kernel.org/r/20220327055733.4070-1-xiam0nd.tong@gmail.com
Signed-off-by: Will Deacon <will@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/perf/qcom_l2_pmu.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/perf/qcom_l2_pmu.c b/drivers/perf/qcom_l2_pmu.c
index 23a0e008dafa..d58810f53727 100644
--- a/drivers/perf/qcom_l2_pmu.c
+++ b/drivers/perf/qcom_l2_pmu.c
@@ -739,7 +739,7 @@ static struct cluster_pmu *l2_cache_associate_cpu_with_cluster(
 {
 	u64 mpidr;
 	int cpu_cluster_id;
-	struct cluster_pmu *cluster = NULL;
+	struct cluster_pmu *cluster;
 
 	/*
 	 * This assumes that the cluster_id is in MPIDR[aff1] for
@@ -761,10 +761,10 @@ static struct cluster_pmu *l2_cache_associate_cpu_with_cluster(
 			 cluster->cluster_id);
 		cpumask_set_cpu(cpu, &cluster->cluster_cpus);
 		*per_cpu_ptr(l2cache_pmu->pmu_cluster, cpu) = cluster;
-		break;
+		return cluster;
 	}
 
-	return cluster;
+	return NULL;
 }
 
 static int l2cache_pmu_online_cpu(unsigned int cpu, struct hlist_node *node)
-- 
Gitee


From 0591f31b34eb378187203f76d4e94d76e30f8a06 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Thu, 14 Jul 2022 21:59:23 +0800
Subject: [PATCH 332/674] irqchip/gic-v3: Fix GICR_CTLR.RWP polling

stable inclusion
from stable-v5.10.111
commit ff24114bb08d8b90edf2aff0a4fd0689523e6c17
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=ff24114bb08d8b90edf2aff0a4fd0689523e6c17

--------------------------------

commit 0df6664531a12cdd8fc873f0cac0dcb40243d3e9 upstream.

It turns out that our polling of RWP is totally wrong when checking
for it in the redistributors, as we test the *distributor* bit index,
whereas it is a different bit number in the RDs... Oopsie boo.

This is embarassing. Not only because it is wrong, but also because
it took *8 years* to notice the blunder...

Just fix the damn thing.

Fixes: 021f653791ad ("irqchip: gic-v3: Initial support for GICv3")
Signed-off-by: Marc Zyngier <maz@kernel.org>
Cc: stable@vger.kernel.org
Reviewed-by: Andre Przywara <andre.przywara@arm.com>
Reviewed-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Link: https://lore.kernel.org/r/20220315165034.794482-2-maz@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/irqchip/irq-gic-v3.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c
index aa8facbdf407..130b19aeb679 100644
--- a/drivers/irqchip/irq-gic-v3.c
+++ b/drivers/irqchip/irq-gic-v3.c
@@ -206,11 +206,11 @@ static inline void __iomem *gic_dist_base(struct irq_data *d)
 	}
 }
 
-static void gic_do_wait_for_rwp(void __iomem *base)
+static void gic_do_wait_for_rwp(void __iomem *base, u32 bit)
 {
 	u32 count = 1000000;	/* 1s! */
 
-	while (readl_relaxed(base + GICD_CTLR) & GICD_CTLR_RWP) {
+	while (readl_relaxed(base + GICD_CTLR) & bit) {
 		count--;
 		if (!count) {
 			pr_err_ratelimited("RWP timeout, gone fishing\n");
@@ -224,13 +224,13 @@ static void gic_do_wait_for_rwp(void __iomem *base)
 /* Wait for completion of a distributor change */
 static void gic_dist_wait_for_rwp(void)
 {
-	gic_do_wait_for_rwp(gic_data.dist_base);
+	gic_do_wait_for_rwp(gic_data.dist_base, GICD_CTLR_RWP);
 }
 
 /* Wait for completion of a redistributor change */
 static void gic_redist_wait_for_rwp(void)
 {
-	gic_do_wait_for_rwp(gic_data_rdist_rd_base());
+	gic_do_wait_for_rwp(gic_data_rdist_rd_base(), GICR_CTLR_RWP);
 }
 
 #ifdef CONFIG_ARM64
-- 
Gitee


From bf55f83f44824afe860c4772d938a4a64ebd97e6 Mon Sep 17 00:00:00 2001
From: Alex Deucher <alexander.deucher@amd.com>
Date: Thu, 14 Jul 2022 21:59:24 +0800
Subject: [PATCH 333/674] drm/amdgpu/smu10: fix SoC/fclk units in auto mode

stable inclusion
from stable-v5.10.111
commit 786ae8de3a5e556c12dc548a048715453ccec20d
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=786ae8de3a5e556c12dc548a048715453ccec20d

--------------------------------

commit 2f25d8ce09b7ba5d769c132ba3d4eb84a941d2cb upstream.

SMU takes clock limits in Mhz units.  socclk and fclk were
using 10 khz units in some cases.  Switch to Mhz units.
Fixes higher than required SoC clocks.

Fixes: 97cf32996c46d9 ("drm/amd/pm: Removed fixed clock in auto mode DPM")
Reviewed-by: Paul Menzel <pmenzel@molgen.mpg.de>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Cc: stable@vger.kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu10_hwmgr.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu10_hwmgr.c b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu10_hwmgr.c
index e6f40ee9f313..9d97938bd49e 100644
--- a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu10_hwmgr.c
+++ b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu10_hwmgr.c
@@ -709,13 +709,13 @@ static int smu10_dpm_force_dpm_level(struct pp_hwmgr *hwmgr,
 		smum_send_msg_to_smc_with_parameter(hwmgr,
 						PPSMC_MSG_SetHardMinFclkByFreq,
 						hwmgr->display_config->num_display > 3 ?
-						data->clock_vol_info.vdd_dep_on_fclk->entries[0].clk :
+						(data->clock_vol_info.vdd_dep_on_fclk->entries[0].clk / 100) :
 						min_mclk,
 						NULL);
 
 		smum_send_msg_to_smc_with_parameter(hwmgr,
 						PPSMC_MSG_SetHardMinSocclkByFreq,
-						data->clock_vol_info.vdd_dep_on_socclk->entries[0].clk,
+						data->clock_vol_info.vdd_dep_on_socclk->entries[0].clk / 100,
 						NULL);
 		smum_send_msg_to_smc_with_parameter(hwmgr,
 						PPSMC_MSG_SetHardMinVcn,
@@ -728,11 +728,11 @@ static int smu10_dpm_force_dpm_level(struct pp_hwmgr *hwmgr,
 						NULL);
 		smum_send_msg_to_smc_with_parameter(hwmgr,
 						PPSMC_MSG_SetSoftMaxFclkByFreq,
-						data->clock_vol_info.vdd_dep_on_fclk->entries[index_fclk].clk,
+						data->clock_vol_info.vdd_dep_on_fclk->entries[index_fclk].clk / 100,
 						NULL);
 		smum_send_msg_to_smc_with_parameter(hwmgr,
 						PPSMC_MSG_SetSoftMaxSocclkByFreq,
-						data->clock_vol_info.vdd_dep_on_socclk->entries[index_socclk].clk,
+						data->clock_vol_info.vdd_dep_on_socclk->entries[index_socclk].clk / 100,
 						NULL);
 		smum_send_msg_to_smc_with_parameter(hwmgr,
 						PPSMC_MSG_SetSoftMaxVcn,
-- 
Gitee


From 2802838b83da289166b5a8a4e385398582fcaeaf Mon Sep 17 00:00:00 2001
From: Karol Herbst <kherbst@redhat.com>
Date: Thu, 14 Jul 2022 21:59:25 +0800
Subject: [PATCH 334/674] drm/nouveau/pmu: Add missing callbacks for Tegra
 devices

stable inclusion
from stable-v5.10.111
commit 326b408e7ec7b1715bcd81d1d0861256eebf8890
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=326b408e7ec7b1715bcd81d1d0861256eebf8890

--------------------------------

commit 38d4e5cf5b08798f093374e53c2f4609d5382dd5 upstream.

Fixes a crash booting on those platforms with nouveau.

Fixes: 4cdd2450bf73 ("drm/nouveau/pmu/gm200-: use alternate falcon reset sequence")
Cc: Ben Skeggs <bskeggs@redhat.com>
Cc: Karol Herbst <kherbst@redhat.com>
Cc: dri-devel@lists.freedesktop.org
Cc: nouveau@lists.freedesktop.org
Cc: <stable@vger.kernel.org> # v5.17+
Signed-off-by: Karol Herbst <kherbst@redhat.com>
Reviewed-by: Lyude Paul <lyude@redhat.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20220322124800.2605463-1-kherbst@redhat.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/gpu/drm/nouveau/nvkm/subdev/pmu/gm20b.c | 1 +
 drivers/gpu/drm/nouveau/nvkm/subdev/pmu/gp102.c | 2 +-
 drivers/gpu/drm/nouveau/nvkm/subdev/pmu/gp10b.c | 1 +
 drivers/gpu/drm/nouveau/nvkm/subdev/pmu/priv.h  | 1 +
 4 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/pmu/gm20b.c b/drivers/gpu/drm/nouveau/nvkm/subdev/pmu/gm20b.c
index 7938722b4da1..d82529becfdc 100644
--- a/drivers/gpu/drm/nouveau/nvkm/subdev/pmu/gm20b.c
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/pmu/gm20b.c
@@ -216,6 +216,7 @@ gm20b_pmu = {
 	.intr = gt215_pmu_intr,
 	.recv = gm20b_pmu_recv,
 	.initmsg = gm20b_pmu_initmsg,
+	.reset = gf100_pmu_reset,
 };
 
 #if IS_ENABLED(CONFIG_ARCH_TEGRA_210_SOC)
diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/pmu/gp102.c b/drivers/gpu/drm/nouveau/nvkm/subdev/pmu/gp102.c
index 3dfb3e8522f6..9f32982216b6 100644
--- a/drivers/gpu/drm/nouveau/nvkm/subdev/pmu/gp102.c
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/pmu/gp102.c
@@ -23,7 +23,7 @@
  */
 #include "priv.h"
 
-static void
+void
 gp102_pmu_reset(struct nvkm_pmu *pmu)
 {
 	struct nvkm_device *device = pmu->subdev.device;
diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/pmu/gp10b.c b/drivers/gpu/drm/nouveau/nvkm/subdev/pmu/gp10b.c
index 7f5f9d544836..0bd4b32ad863 100644
--- a/drivers/gpu/drm/nouveau/nvkm/subdev/pmu/gp10b.c
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/pmu/gp10b.c
@@ -83,6 +83,7 @@ gp10b_pmu = {
 	.intr = gt215_pmu_intr,
 	.recv = gm20b_pmu_recv,
 	.initmsg = gm20b_pmu_initmsg,
+	.reset = gp102_pmu_reset,
 };
 
 #if IS_ENABLED(CONFIG_ARCH_TEGRA_210_SOC)
diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/pmu/priv.h b/drivers/gpu/drm/nouveau/nvkm/subdev/pmu/priv.h
index b945ec320cd2..80c4cb861d40 100644
--- a/drivers/gpu/drm/nouveau/nvkm/subdev/pmu/priv.h
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/pmu/priv.h
@@ -41,6 +41,7 @@ int gt215_pmu_send(struct nvkm_pmu *, u32[2], u32, u32, u32, u32);
 
 bool gf100_pmu_enabled(struct nvkm_pmu *);
 void gf100_pmu_reset(struct nvkm_pmu *);
+void gp102_pmu_reset(struct nvkm_pmu *pmu);
 
 void gk110_pmu_pgob(struct nvkm_pmu *, bool);
 
-- 
Gitee


From 092160affba0ccae99f68cb2ddc3520cb35ee904 Mon Sep 17 00:00:00 2001
From: Lee Jones <lee.jones@linaro.org>
Date: Thu, 14 Jul 2022 21:59:26 +0800
Subject: [PATCH 335/674] drm/amdkfd: Create file descriptor after client is
 added to smi_clients list
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

stable inclusion
from stable-v5.10.111
commit 82e43950143cc92534c359f5fa42092c928930e7
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=82e43950143cc92534c359f5fa42092c928930e7

--------------------------------

commit e79a2398e1b2d47060474dca291542368183bc0f upstream.

This ensures userspace cannot prematurely clean-up the client before
it is fully initialised which has been proven to cause issues in the
past.

Cc: Felix Kuehling <Felix.Kuehling@amd.com>
Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: "Christian König" <christian.koenig@amd.com>
Cc: "Pan, Xinhui" <Xinhui.Pan@amd.com>
Cc: David Airlie <airlied@linux.ie>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: amd-gfx@lists.freedesktop.org
Cc: dri-devel@lists.freedesktop.org
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Cc: stable@vger.kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 24 +++++++++++++--------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
index 17d1736367ea..bd4caa36ab2e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
@@ -270,15 +270,6 @@ int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd)
 		return ret;
 	}
 
-	ret = anon_inode_getfd(kfd_smi_name, &kfd_smi_ev_fops, (void *)client,
-			       O_RDWR);
-	if (ret < 0) {
-		kfifo_free(&client->fifo);
-		kfree(client);
-		return ret;
-	}
-	*fd = ret;
-
 	init_waitqueue_head(&client->wait_queue);
 	spin_lock_init(&client->lock);
 	client->events = 0;
@@ -288,5 +279,20 @@ int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd)
 	list_add_rcu(&client->list, &dev->smi_clients);
 	spin_unlock(&dev->smi_lock);
 
+	ret = anon_inode_getfd(kfd_smi_name, &kfd_smi_ev_fops, (void *)client,
+			       O_RDWR);
+	if (ret < 0) {
+		spin_lock(&dev->smi_lock);
+		list_del_rcu(&client->list);
+		spin_unlock(&dev->smi_lock);
+
+		synchronize_rcu();
+
+		kfifo_free(&client->fifo);
+		kfree(client);
+		return ret;
+	}
+	*fd = ret;
+
 	return 0;
 }
-- 
Gitee


From 2695cd91c767577de371da7aadea900d6aba588e Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 14 Jul 2022 21:59:27 +0800
Subject: [PATCH 336/674] perf build: Don't use -ffat-lto-objects in the python
 feature test when building with clang-13

stable inclusion
from stable-v5.10.111
commit 79abc219bafdb27fa1b4e788757c71eed1003d6a
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=79abc219bafdb27fa1b4e788757c71eed1003d6a

--------------------------------

commit 3a8a0475861a443f02e3a9b57d044fe2a0a99291 upstream.

Using -ffat-lto-objects in the python feature test when building with
clang-13 results in:

  clang-13: error: optimization flag '-ffat-lto-objects' is not supported [-Werror,-Wignored-optimization-argument]
  error: command '/usr/sbin/clang' failed with exit code 1
  cp: cannot stat '/tmp/build/perf/python_ext_build/lib/perf*.so': No such file or directory
  make[2]: *** [Makefile.perf:639: /tmp/build/perf/python/perf.so] Error 1

Noticed when building on a docker.io/library/archlinux:base container.

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Fangrui Song <maskray@google.com>
Cc: Florian Fainelli <f.fainelli@gmail.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Keeping <john@metanate.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Michael Petlan <mpetlan@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nick Desaulniers <ndesaulniers@google.com>
Cc: Sedat Dilek <sedat.dilek@gmail.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 tools/perf/Makefile.config | 3 +++
 tools/perf/util/setup.py   | 2 ++
 2 files changed, 5 insertions(+)

diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config
index 99266f1da1b2..1e77304d1b42 100644
--- a/tools/perf/Makefile.config
+++ b/tools/perf/Makefile.config
@@ -261,6 +261,9 @@ ifdef PYTHON_CONFIG
   PYTHON_EMBED_LIBADD := $(call grep-libs,$(PYTHON_EMBED_LDOPTS)) -lutil
   PYTHON_EMBED_CCOPTS := $(shell $(PYTHON_CONFIG_SQ) --includes 2>/dev/null)
   FLAGS_PYTHON_EMBED := $(PYTHON_EMBED_CCOPTS) $(PYTHON_EMBED_LDOPTS)
+  ifeq ($(CC_NO_CLANG), 0)
+    PYTHON_EMBED_CCOPTS := $(filter-out -ffat-lto-objects, $(PYTHON_EMBED_CCOPTS))
+  endif
 endif
 
 FEATURE_CHECK_CFLAGS-libpython := $(PYTHON_EMBED_CCOPTS)
diff --git a/tools/perf/util/setup.py b/tools/perf/util/setup.py
index c5e3e9a68162..78e8b31b3651 100644
--- a/tools/perf/util/setup.py
+++ b/tools/perf/util/setup.py
@@ -23,6 +23,8 @@ if cc_is_clang:
             vars[var] = sub("-fstack-protector-strong", "", vars[var])
         if not clang_has_option("-fno-semantic-interposition"):
             vars[var] = sub("-fno-semantic-interposition", "", vars[var])
+        if not clang_has_option("-ffat-lto-objects"):
+            vars[var] = sub("-ffat-lto-objects", "", vars[var])
 
 from distutils.core import setup, Extension
 
-- 
Gitee


From 92fd030ba22b7a1ad73cf9a02c351bd9a7df3438 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 14 Jul 2022 21:59:28 +0800
Subject: [PATCH 337/674] perf python: Fix probing for some clang command line
 options

stable inclusion
from stable-v5.10.111
commit 6374faf49e89f4283c6af4f74d5928ef0a657714
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=6374faf49e89f4283c6af4f74d5928ef0a657714

--------------------------------

commit dd6e1fe91cdd52774ca642d1da75b58a86356b56 upstream.

The clang compiler complains about some options even without a source
file being available, while others require one, so use the simple
tools/build/feature/test-hello.c file.

Then check for the "is not supported" string in its output, in addition
to the "unknown argument" already being looked for.

This was noticed when building with clang-13 where -ffat-lto-objects
isn't supported and since we were looking just for "unknown argument"
and not providing a source code to clang, was mistakenly assumed as
being available and not being filtered to set of command line options
provided to clang, leading to a build failure.

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Fangrui Song <maskray@google.com>
Cc: Florian Fainelli <f.fainelli@gmail.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Keeping <john@metanate.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Michael Petlan <mpetlan@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nick Desaulniers <ndesaulniers@google.com>
Cc: Sedat Dilek <sedat.dilek@gmail.com>
Link: http://lore.kernel.org/lkml/
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 tools/perf/util/setup.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tools/perf/util/setup.py b/tools/perf/util/setup.py
index 78e8b31b3651..b670469a8124 100644
--- a/tools/perf/util/setup.py
+++ b/tools/perf/util/setup.py
@@ -1,12 +1,14 @@
-from os import getenv
+from os import getenv, path
 from subprocess import Popen, PIPE
 from re import sub
 
 cc = getenv("CC")
 cc_is_clang = b"clang version" in Popen([cc.split()[0], "-v"], stderr=PIPE).stderr.readline()
+src_feature_tests  = getenv('srctree') + '/tools/build/feature'
 
 def clang_has_option(option):
-    return [o for o in Popen([cc, option], stderr=PIPE).stderr.readlines() if b"unknown argument" in o] == [ ]
+    cc_output = Popen([cc, option, path.join(src_feature_tests, "test-hello.c") ], stderr=PIPE).stderr.readlines()
+    return [o for o in cc_output if ((b"unknown argument" in o) or (b"is not supported" in o))] == [ ]
 
 if cc_is_clang:
     from distutils.sysconfig import get_config_vars
-- 
Gitee


From b7f76fdc7a1dd722887136255282e674efab7bad Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 14 Jul 2022 21:59:29 +0800
Subject: [PATCH 338/674] tools build: Filter out options and warnings not
 supported by clang

stable inclusion
from stable-v5.10.111
commit 75c8558d410f6cedc96f6f701f7ced1c8aced6fc
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=75c8558d410f6cedc96f6f701f7ced1c8aced6fc

--------------------------------

commit 41caff459a5b956b3e23ba9ca759dd0629ad3dda upstream.

These make the feature check fail when using clang, so remove them just
like is done in tools/perf/Makefile.config to build perf itself.

Adding -Wno-compound-token-split-by-macro to tools/perf/Makefile.config
when building with clang is also necessary to avoid these warnings
turned into errors (-Werror):

    CC      /tmp/build/perf/util/scripting-engines/trace-event-perl.o
  In file included from util/scripting-engines/trace-event-perl.c:35:
  In file included from /usr/lib64/perl5/CORE/perl.h:4085:
  In file included from /usr/lib64/perl5/CORE/hv.h:659:
  In file included from /usr/lib64/perl5/CORE/hv_func.h:34:
  In file included from /usr/lib64/perl5/CORE/sbox32_hash.h:4:
  /usr/lib64/perl5/CORE/zaphod32_hash.h:150:5: error: '(' and '{' tokens introducing statement expression appear in different macro expansion contexts [-Werror,-Wcompound-token-split-by-macro]
      ZAPHOD32_SCRAMBLE32(state[0],0x9fade23b);
      ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  /usr/lib64/perl5/CORE/zaphod32_hash.h:80:38: note: expanded from macro 'ZAPHOD32_SCRAMBLE32'
  #define ZAPHOD32_SCRAMBLE32(v,prime) STMT_START {  \
                                       ^~~~~~~~~~
  /usr/lib64/perl5/CORE/perl.h:737:29: note: expanded from macro 'STMT_START'
  #   define STMT_START   (void)( /* gcc supports "({ STATEMENTS; })" */
                                ^
  /usr/lib64/perl5/CORE/zaphod32_hash.h:150:5: note: '{' token is here
      ZAPHOD32_SCRAMBLE32(state[0],0x9fade23b);
      ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  /usr/lib64/perl5/CORE/zaphod32_hash.h:80:49: note: expanded from macro 'ZAPHOD32_SCRAMBLE32'
  #define ZAPHOD32_SCRAMBLE32(v,prime) STMT_START {  \
                                                  ^
  /usr/lib64/perl5/CORE/zaphod32_hash.h:150:5: error: '}' and ')' tokens terminating statement expression appear in different macro expansion contexts [-Werror,-Wcompound-token-split-by-macro]
      ZAPHOD32_SCRAMBLE32(state[0],0x9fade23b);
      ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  /usr/lib64/perl5/CORE/zaphod32_hash.h:87:41: note: expanded from macro 'ZAPHOD32_SCRAMBLE32'
      v ^= (v>>23);                       \
                                          ^
  /usr/lib64/perl5/CORE/zaphod32_hash.h:150:5: note: ')' token is here
      ZAPHOD32_SCRAMBLE32(state[0],0x9fade23b);
      ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  /usr/lib64/perl5/CORE/zaphod32_hash.h:88:3: note: expanded from macro 'ZAPHOD32_SCRAMBLE32'
  } STMT_END
    ^~~~~~~~
  /usr/lib64/perl5/CORE/perl.h:738:21: note: expanded from macro 'STMT_END'
  #   define STMT_END     )
                          ^

Please refer to the discussion on the Link: tag below, where Nathan
clarifies the situation:

<quote>
acme> And then get to the problems at the end of this message, which seem
acme> similar to the problem described here:
acme>
acme> From  Nathan Chancellor <>
acme> Subject	[PATCH] mwifiex: Remove unnecessary braces from HostCmd_SET_SEQ_NO_BSS_INFO
acme>
acme> https://lkml.org/lkml/2020/9/1/135
acme>
acme> So perhaps in this case its better to disable that
acme> -Werror,-Wcompound-token-split-by-macro when building with clang?

Yes, I think that is probably the best solution. As far as I can tell,
at least in this file and context, the warning appears harmless, as the
"create a GNU C statement expression from two different macros" is very
much intentional, based on the presence of PERL_USE_GCC_BRACE_GROUPS.
The warning is fixed in upstream Perl by just avoiding creating GNU C
statement expressions using STMT_START and STMT_END:

  https://github.com/Perl/perl5/issues/18780
  https://github.com/Perl/perl5/pull/18984

If I am reading the source code correctly, an alternative to disabling
the warning would be specifying -DPERL_GCC_BRACE_GROUPS_FORBIDDEN but it
seems like that might end up impacting more than just this site,
according to the issue discussion above.
</quote>

Based-on-a-patch-by: Sedat Dilek <sedat.dilek@gmail.com>
Tested-by: Sedat Dilek <sedat.dilek@gmail.com> # Debian/Selfmade LLVM-14 (x86-64)
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Fangrui Song <maskray@google.com>
Cc: Florian Fainelli <f.fainelli@gmail.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Keeping <john@metanate.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Michael Petlan <mpetlan@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nick Desaulniers <ndesaulniers@google.com>
Link: http://lore.kernel.org/lkml/YkxWcYzph5pC1EK8@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 tools/build/feature/Makefile | 7 +++++++
 tools/perf/Makefile.config   | 3 +++
 2 files changed, 10 insertions(+)

diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile
index d45ec15ac383..09f47aff9012 100644
--- a/tools/build/feature/Makefile
+++ b/tools/build/feature/Makefile
@@ -217,6 +217,13 @@ PERL_EMBED_LIBADD = $(call grep-libs,$(PERL_EMBED_LDOPTS))
 PERL_EMBED_CCOPTS = `perl -MExtUtils::Embed -e ccopts 2>/dev/null`
 FLAGS_PERL_EMBED=$(PERL_EMBED_CCOPTS) $(PERL_EMBED_LDOPTS)
 
+ifeq ($(CC_NO_CLANG), 0)
+  PERL_EMBED_LDOPTS := $(filter-out -specs=%,$(PERL_EMBED_LDOPTS))
+  PERL_EMBED_CCOPTS := $(filter-out -flto=auto -ffat-lto-objects, $(PERL_EMBED_CCOPTS))
+  PERL_EMBED_CCOPTS := $(filter-out -specs=%,$(PERL_EMBED_CCOPTS))
+  FLAGS_PERL_EMBED += -Wno-compound-token-split-by-macro
+endif
+
 $(OUTPUT)test-libperl.bin:
 	$(BUILD) $(FLAGS_PERL_EMBED)
 
diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config
index 1e77304d1b42..78700c7ec7df 100644
--- a/tools/perf/Makefile.config
+++ b/tools/perf/Makefile.config
@@ -776,6 +776,9 @@ else
     LDFLAGS += $(PERL_EMBED_LDFLAGS)
     EXTLIBS += $(PERL_EMBED_LIBADD)
     CFLAGS += -DHAVE_LIBPERL_SUPPORT
+    ifeq ($(CC_NO_CLANG), 0)
+      CFLAGS += -Wno-compound-token-split-by-macro
+    endif
     $(call detected,CONFIG_LIBPERL)
   endif
 endif
-- 
Gitee


From 1a568a6171d30d1a0d1c51d724b71ed20cfb9013 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 14 Jul 2022 21:59:30 +0800
Subject: [PATCH 339/674] tools build: Use $(shell ) instead of `` to get
 embedded libperl's ccopts

stable inclusion
from stable-v5.10.111
commit 40e00885a61f6d31c76f32d05f4c87b3fc272720
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=40e00885a61f6d31c76f32d05f4c87b3fc272720

--------------------------------

commit 541f695cbcb6932c22638b06e0cbe1d56177e2e9 upstream.

Just like its done for ldopts and for both in tools/perf/Makefile.config.

Using `` to initialize PERL_EMBED_CCOPTS somehow precludes using:

  $(filter-out SOMETHING_TO_FILTER,$(PERL_EMBED_CCOPTS))

And we need to do it to allow for building with versions of clang where
some gcc options selected by distros are not available.

Tested-by: Sedat Dilek <sedat.dilek@gmail.com> # Debian/Selfmade LLVM-14 (x86-64)
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Fangrui Song <maskray@google.com>
Cc: Florian Fainelli <f.fainelli@gmail.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Keeping <john@metanate.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Michael Petlan <mpetlan@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nick Desaulniers <ndesaulniers@google.com>
Link: http://lore.kernel.org/lkml/YktYX2OnLtyobRYD@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 tools/build/feature/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile
index 09f47aff9012..db3417ff8c5f 100644
--- a/tools/build/feature/Makefile
+++ b/tools/build/feature/Makefile
@@ -214,7 +214,7 @@ strip-libs = $(filter-out -l%,$(1))
 PERL_EMBED_LDOPTS = $(shell perl -MExtUtils::Embed -e ldopts 2>/dev/null)
 PERL_EMBED_LDFLAGS = $(call strip-libs,$(PERL_EMBED_LDOPTS))
 PERL_EMBED_LIBADD = $(call grep-libs,$(PERL_EMBED_LDOPTS))
-PERL_EMBED_CCOPTS = `perl -MExtUtils::Embed -e ccopts 2>/dev/null`
+PERL_EMBED_CCOPTS = $(shell perl -MExtUtils::Embed -e ccopts 2>/dev/null)
 FLAGS_PERL_EMBED=$(PERL_EMBED_CCOPTS) $(PERL_EMBED_LDOPTS)
 
 ifeq ($(CC_NO_CLANG), 0)
-- 
Gitee


From 0092f1288694966ce9468fb9f0056ffd733c5b3e Mon Sep 17 00:00:00 2001
From: Vinod Koul <vkoul@kernel.org>
Date: Thu, 14 Jul 2022 21:59:31 +0800
Subject: [PATCH 340/674] dmaengine: Revert "dmaengine: shdma: Fix runtime PM
 imbalance on error"

stable inclusion
from stable-v5.10.111
commit 03b39bbbec8becbecde5d3f84a352633c8f2f49b
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=03b39bbbec8becbecde5d3f84a352633c8f2f49b

--------------------------------

commit d143f939a95696d38ff800ada14402fa50ebbd6c upstream.

This reverts commit 455896c53d5b ("dmaengine: shdma: Fix runtime PM
imbalance on error") as the patch wrongly reduced the count on error and
did not bail out. So drop the count by reverting the patch .

Signed-off-by: Vinod Koul <vkoul@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/dma/sh/shdma-base.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/dma/sh/shdma-base.c b/drivers/dma/sh/shdma-base.c
index 19ac95c0098f..7f72b3f4cd1a 100644
--- a/drivers/dma/sh/shdma-base.c
+++ b/drivers/dma/sh/shdma-base.c
@@ -115,10 +115,8 @@ static dma_cookie_t shdma_tx_submit(struct dma_async_tx_descriptor *tx)
 		ret = pm_runtime_get(schan->dev);
 
 		spin_unlock_irq(&schan->chan_lock);
-		if (ret < 0) {
+		if (ret < 0)
 			dev_err(schan->dev, "%s(): GET = %d\n", __func__, ret);
-			pm_runtime_put(schan->dev);
-		}
 
 		pm_runtime_barrier(schan->dev);
 
-- 
Gitee


From 3e393ad0efe336af9da4abd7ce7b5b858e9b9b0c Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 14 Jul 2022 21:59:32 +0800
Subject: [PATCH 341/674] ubsan: remove CONFIG_UBSAN_OBJECT_SIZE

stable inclusion
from stable-v5.10.111
commit 58823a9b097cf36dd59c4d463396e15a5ec4efb7
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=58823a9b097cf36dd59c4d463396e15a5ec4efb7

--------------------------------

commit 69d0db01e210e07fe915e5da91b54a867cda040f upstream.

The object-size sanitizer is redundant to -Warray-bounds, and
inappropriately performs its checks at run-time when all information
needed for the evaluation is available at compile-time, making it quite
difficult to use:

  https://bugzilla.kernel.org/show_bug.cgi?id=214861

With -Warray-bounds almost enabled globally, it doesn't make sense to
keep this around.

Link: https://lkml.kernel.org/r/20211203235346.110809-1-keescook@chromium.org
Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Marco Elver <elver@google.com>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Michal Marek <michal.lkml@markovi.net>
Cc: Nick Desaulniers <ndesaulniers@google.com>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Tadeusz Struk <tadeusz.struk@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 lib/test_ubsan.c       | 11 -----------
 scripts/Makefile.ubsan |  1 -
 2 files changed, 12 deletions(-)

diff --git a/lib/test_ubsan.c b/lib/test_ubsan.c
index 9ea10adf7a66..b1d0a6ecfe1b 100644
--- a/lib/test_ubsan.c
+++ b/lib/test_ubsan.c
@@ -89,16 +89,6 @@ static void test_ubsan_misaligned_access(void)
 	*ptr = val;
 }
 
-static void test_ubsan_object_size_mismatch(void)
-{
-	/* "((aligned(8)))" helps this not into be misaligned for ptr-access. */
-	volatile int val __aligned(8) = 4;
-	volatile long long *ptr, val2;
-
-	ptr = (long long *)&val;
-	val2 = *ptr;
-}
-
 static const test_ubsan_fp test_ubsan_array[] = {
 	test_ubsan_add_overflow,
 	test_ubsan_sub_overflow,
@@ -110,7 +100,6 @@ static const test_ubsan_fp test_ubsan_array[] = {
 	test_ubsan_load_invalid_value,
 	//test_ubsan_null_ptr_deref, /* exclude it because there is a crash */
 	test_ubsan_misaligned_access,
-	test_ubsan_object_size_mismatch,
 };
 
 static int __init test_ubsan_init(void)
diff --git a/scripts/Makefile.ubsan b/scripts/Makefile.ubsan
index 9716dab06bc7..2156e18391a3 100644
--- a/scripts/Makefile.ubsan
+++ b/scripts/Makefile.ubsan
@@ -23,7 +23,6 @@ ifdef CONFIG_UBSAN_MISC
       CFLAGS_UBSAN += $(call cc-option, -fsanitize=integer-divide-by-zero)
       CFLAGS_UBSAN += $(call cc-option, -fsanitize=unreachable)
       CFLAGS_UBSAN += $(call cc-option, -fsanitize=signed-integer-overflow)
-      CFLAGS_UBSAN += $(call cc-option, -fsanitize=object-size)
       CFLAGS_UBSAN += $(call cc-option, -fsanitize=bool)
       CFLAGS_UBSAN += $(call cc-option, -fsanitize=enum)
 endif
-- 
Gitee


From b9be96107a16c9aacfbc3d386be05e9b250c68fd Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Thu, 14 Jul 2022 21:59:33 +0800
Subject: [PATCH 342/674] mm: don't skip swap entry even if zap_details
 specified

stable inclusion
from stable-v5.10.111
commit f089471d1b754cdd386f081f6c62eec414e8e188
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f089471d1b754cdd386f081f6c62eec414e8e188

--------------------------------

commit 5abfd71d936a8aefd9f9ccd299dea7a164a5d455 upstream.

Patch series "mm: Rework zap ptes on swap entries", v5.

Patch 1 should fix a long standing bug for zap_pte_range() on
zap_details usage.  The risk is we could have some swap entries skipped
while we should have zapped them.

Migration entries are not the major concern because file backed memory
always zap in the pattern that "first time without page lock, then
re-zap with page lock" hence the 2nd zap will always make sure all
migration entries are already recovered.

However there can be issues with real swap entries got skipped
errornoously.  There's a reproducer provided in commit message of patch
1 for that.

Patch 2-4 are cleanups that are based on patch 1.  After the whole
patchset applied, we should have a very clean view of zap_pte_range().

Only patch 1 needs to be backported to stable if necessary.

This patch (of 4):

The "details" pointer shouldn't be the token to decide whether we should
skip swap entries.

For example, when the callers specified details->zap_mapping==NULL, it
means the user wants to zap all the pages (including COWed pages), then
we need to look into swap entries because there can be private COWed
pages that was swapped out.

Skipping some swap entries when details is non-NULL may lead to wrongly
leaving some of the swap entries while we should have zapped them.

A reproducer of the problem:
Reviewed-by: Wei Li <liwei391@huawei.com>

===8<===
        #define _GNU_SOURCE         /* See feature_test_macros(7) */
        #include <stdio.h>
        #include <assert.h>
        #include <unistd.h>
        #include <sys/mman.h>
        #include <sys/types.h>

        int page_size;
        int shmem_fd;
        char *buffer;

        void main(void)
        {
                int ret;
                char val;

                page_size = getpagesize();
                shmem_fd = memfd_create("test", 0);
                assert(shmem_fd >= 0);

                ret = ftruncate(shmem_fd, page_size * 2);
                assert(ret == 0);

                buffer = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE,
                                MAP_PRIVATE, shmem_fd, 0);
                assert(buffer != MAP_FAILED);

                /* Write private page, swap it out */
                buffer[page_size] = 1;
                madvise(buffer, page_size * 2, MADV_PAGEOUT);

                /* This should drop private buffer[page_size] already */
                ret = ftruncate(shmem_fd, page_size);
                assert(ret == 0);
                /* Recover the size */
                ret = ftruncate(shmem_fd, page_size * 2);
                assert(ret == 0);

                /* Re-read the data, it should be all zero */
                val = buffer[page_size];
                if (val == 0)
                        printf("Good\n");
                else
                        printf("BUG\n");
        }
===8<===

We don't need to touch up the pmd path, because pmd never had a issue with
swap entries.  For example, shmem pmd migration will always be split into
pte level, and same to swapping on anonymous.

Add another helper should_zap_cows() so that we can also check whether we
should zap private mappings when there's no page pointer specified.

This patch drops that trick, so we handle swap ptes coherently.  Meanwhile
we should do the same check upon migration entry, hwpoison entry and
genuine swap entries too.

To be explicit, we should still remember to keep the private entries if
even_cows==false, and always zap them when even_cows==true.

The issue seems to exist starting from the initial commit of git.

[peterx@redhat.com: comment tweaks]
  Link: https://lkml.kernel.org/r/20220217060746.71256-2-peterx@redhat.com

Link: https://lkml.kernel.org/r/20220217060746.71256-1-peterx@redhat.com
Link: https://lkml.kernel.org/r/20220216094810.60572-1-peterx@redhat.com
Link: https://lkml.kernel.org/r/20220216094810.60572-2-peterx@redhat.com
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Peter Xu <peterx@redhat.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Kirill A . Shutemov" <kirill@shutemov.name>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yang Shi <shy828301@gmail.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 mm/memory.c | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 8379f39dd697..6d3a07c821fe 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1204,6 +1204,17 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
 	return ret;
 }
 
+/* Whether we should zap all COWed (private) pages too */
+static inline bool should_zap_cows(struct zap_details *details)
+{
+	/* By default, zap all pages */
+	if (!details)
+		return true;
+
+	/* Or, we zap COWed pages only if the caller wants to */
+	return !details->check_mapping;
+}
+
 static unsigned long zap_pte_range(struct mmu_gather *tlb,
 				struct vm_area_struct *vma, pmd_t *pmd,
 				unsigned long addr, unsigned long end,
@@ -1295,16 +1306,18 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 			continue;
 		}
 
-		/* If details->check_mapping, we leave swap entries. */
-		if (unlikely(details))
-			continue;
-
-		if (!non_swap_entry(entry))
+		if (!non_swap_entry(entry)) {
+			/* Genuine swap entry, hence a private anon page */
+			if (!should_zap_cows(details))
+				continue;
 			rss[MM_SWAPENTS]--;
-		else if (is_migration_entry(entry)) {
+		} else if (is_migration_entry(entry)) {
 			struct page *page;
 
 			page = migration_entry_to_page(entry);
+			if (details && details->check_mapping &&
+			    details->check_mapping != page_rmapping(page))
+				continue;
 			rss[mm_counter(page)]--;
 		}
 		if (unlikely(!free_swap_and_cache(entry)))
-- 
Gitee


From 44ef621d6cc16f1830a38b570f129309f4193293 Mon Sep 17 00:00:00 2001
From: Sachin Sant <sachinp@linux.vnet.ibm.com>
Date: Thu, 14 Jul 2022 21:59:34 +0800
Subject: [PATCH 343/674] selftests/cgroup: Fix build on older distros

stable inclusion
from stable-v5.10.111
commit e74da71e66142784aa086dbaae047901074b4518
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=e74da71e66142784aa086dbaae047901074b4518

--------------------------------

commit c2e46f6b3e3551558d44c4dc518b9667cb0d5f8b upstream.

On older distros struct clone_args does not have a cgroup member,
leading to build errors:

 cgroup_util.c: In function 'clone_into_cgroup':
 cgroup_util.c:343:4: error: 'struct clone_args' has no member named 'cgroup'
 cgroup_util.c:346:33: error: invalid application of 'sizeof' to incomplete
  type 'struct clone_args'

But the selftests already have a locally defined version of the
structure which is up to date, called __clone_args.

So use __clone_args which fixes the error.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Sachin Sant <sachinp@linux.vnet.ibm.com>>
Acked-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
Signed-off-by: Ovidiu Panait <ovidiu.panait@windriver.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 tools/testing/selftests/cgroup/cgroup_util.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/cgroup/cgroup_util.c b/tools/testing/selftests/cgroup/cgroup_util.c
index 391ab5675290..f61145f98577 100644
--- a/tools/testing/selftests/cgroup/cgroup_util.c
+++ b/tools/testing/selftests/cgroup/cgroup_util.c
@@ -330,13 +330,13 @@ pid_t clone_into_cgroup(int cgroup_fd)
 #ifdef CLONE_ARGS_SIZE_VER2
 	pid_t pid;
 
-	struct clone_args args = {
+	struct __clone_args args = {
 		.flags = CLONE_INTO_CGROUP,
 		.exit_signal = SIGCHLD,
 		.cgroup = cgroup_fd,
 	};
 
-	pid = sys_clone3(&args, sizeof(struct clone_args));
+	pid = sys_clone3(&args, sizeof(struct __clone_args));
 	/*
 	 * Verify that this is a genuine test failure:
 	 * ENOSYS -> clone3() not available
-- 
Gitee


From f6e6c8df509e5f0ce1e554b1a63edeff9a862619 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 14 Jul 2022 21:59:35 +0800
Subject: [PATCH 344/674] selftests: cgroup: Make cg_create() use 0755 for
 permission instead of 0644
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

stable inclusion
from stable-v5.10.111
commit 9dd39d2c6572c7be091ecd6aced950bd91175415
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=9dd39d2c6572c7be091ecd6aced950bd91175415

--------------------------------

commit b09c2baa56347ae65795350dfcc633dedb1c2970 upstream.

0644 is an odd perm to create a cgroup which is a directory. Use the regular
0755 instead. This is necessary for euid switching test case.

Reviewed-by: Michal Koutný <mkoutny@suse.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Ovidiu Panait <ovidiu.panait@windriver.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 tools/testing/selftests/cgroup/cgroup_util.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/cgroup/cgroup_util.c b/tools/testing/selftests/cgroup/cgroup_util.c
index f61145f98577..962ced827513 100644
--- a/tools/testing/selftests/cgroup/cgroup_util.c
+++ b/tools/testing/selftests/cgroup/cgroup_util.c
@@ -212,7 +212,7 @@ int cg_find_unified_root(char *root, size_t len)
 
 int cg_create(const char *cgroup)
 {
-	return mkdir(cgroup, 0644);
+	return mkdir(cgroup, 0755);
 }
 
 int cg_wait_for_proc_count(const char *cgroup, int count)
-- 
Gitee


From 2960182013a289f79bb330f4d92043ea1e32d4e1 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 14 Jul 2022 21:59:36 +0800
Subject: [PATCH 345/674] selftests: cgroup: Test open-time credential usage
 for migration checks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

stable inclusion
from stable-v5.10.111
commit 637eca44b8f72c5eae6539fa037548c5fe7193a3
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=637eca44b8f72c5eae6539fa037548c5fe7193a3

--------------------------------

commit 613e040e4dc285367bff0f8f75ea59839bc10947 upstream.

When a task is writing to an fd opened by a different task, the perm check
should use the credentials of the latter task. Add a test for it.

Tested-by: Michal Koutný <mkoutny@suse.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Ovidiu Panait <ovidiu.panait@windriver.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 tools/testing/selftests/cgroup/test_core.c | 68 ++++++++++++++++++++++
 1 file changed, 68 insertions(+)

diff --git a/tools/testing/selftests/cgroup/test_core.c b/tools/testing/selftests/cgroup/test_core.c
index 3df648c37876..01b766506973 100644
--- a/tools/testing/selftests/cgroup/test_core.c
+++ b/tools/testing/selftests/cgroup/test_core.c
@@ -674,6 +674,73 @@ static int test_cgcore_thread_migration(const char *root)
 	return ret;
 }
 
+/*
+ * cgroup migration permission check should be performed based on the
+ * credentials at the time of open instead of write.
+ */
+static int test_cgcore_lesser_euid_open(const char *root)
+{
+	const uid_t test_euid = 65534;	/* usually nobody, any !root is fine */
+	int ret = KSFT_FAIL;
+	char *cg_test_a = NULL, *cg_test_b = NULL;
+	char *cg_test_a_procs = NULL, *cg_test_b_procs = NULL;
+	int cg_test_b_procs_fd = -1;
+	uid_t saved_uid;
+
+	cg_test_a = cg_name(root, "cg_test_a");
+	cg_test_b = cg_name(root, "cg_test_b");
+
+	if (!cg_test_a || !cg_test_b)
+		goto cleanup;
+
+	cg_test_a_procs = cg_name(cg_test_a, "cgroup.procs");
+	cg_test_b_procs = cg_name(cg_test_b, "cgroup.procs");
+
+	if (!cg_test_a_procs || !cg_test_b_procs)
+		goto cleanup;
+
+	if (cg_create(cg_test_a) || cg_create(cg_test_b))
+		goto cleanup;
+
+	if (cg_enter_current(cg_test_a))
+		goto cleanup;
+
+	if (chown(cg_test_a_procs, test_euid, -1) ||
+	    chown(cg_test_b_procs, test_euid, -1))
+		goto cleanup;
+
+	saved_uid = geteuid();
+	if (seteuid(test_euid))
+		goto cleanup;
+
+	cg_test_b_procs_fd = open(cg_test_b_procs, O_RDWR);
+
+	if (seteuid(saved_uid))
+		goto cleanup;
+
+	if (cg_test_b_procs_fd < 0)
+		goto cleanup;
+
+	if (write(cg_test_b_procs_fd, "0", 1) >= 0 || errno != EACCES)
+		goto cleanup;
+
+	ret = KSFT_PASS;
+
+cleanup:
+	cg_enter_current(root);
+	if (cg_test_b_procs_fd >= 0)
+		close(cg_test_b_procs_fd);
+	if (cg_test_b)
+		cg_destroy(cg_test_b);
+	if (cg_test_a)
+		cg_destroy(cg_test_a);
+	free(cg_test_b_procs);
+	free(cg_test_a_procs);
+	free(cg_test_b);
+	free(cg_test_a);
+	return ret;
+}
+
 #define T(x) { x, #x }
 struct corecg_test {
 	int (*fn)(const char *root);
@@ -689,6 +756,7 @@ struct corecg_test {
 	T(test_cgcore_proc_migration),
 	T(test_cgcore_thread_migration),
 	T(test_cgcore_destroy),
+	T(test_cgcore_lesser_euid_open),
 };
 #undef T
 
-- 
Gitee


From 71df1200c5804db3367e118f0cee906beed1d6d8 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 14 Jul 2022 21:59:37 +0800
Subject: [PATCH 346/674] selftests: cgroup: Test open-time cgroup namespace
 usage for migration checks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

stable inclusion
from stable-v5.10.111
commit 919823bd6738a5f4388df033a9d6795c82b97cf7
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=919823bd6738a5f4388df033a9d6795c82b97cf7

--------------------------------

commit bf35a7879f1dfb0d050fe779168bcf25c7de66f5 upstream.

When a task is writing to an fd opened by a different task, the perm check
should use the cgroup namespace of the latter task. Add a test for it.

Tested-by: Michal Koutný <mkoutny@suse.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Ovidiu Panait <ovidiu.panait@windriver.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 tools/testing/selftests/cgroup/test_core.c | 97 ++++++++++++++++++++++
 1 file changed, 97 insertions(+)

diff --git a/tools/testing/selftests/cgroup/test_core.c b/tools/testing/selftests/cgroup/test_core.c
index 01b766506973..600123503063 100644
--- a/tools/testing/selftests/cgroup/test_core.c
+++ b/tools/testing/selftests/cgroup/test_core.c
@@ -1,11 +1,14 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 
+#define _GNU_SOURCE
 #include <linux/limits.h>
+#include <linux/sched.h>
 #include <sys/types.h>
 #include <sys/mman.h>
 #include <sys/wait.h>
 #include <unistd.h>
 #include <fcntl.h>
+#include <sched.h>
 #include <stdio.h>
 #include <errno.h>
 #include <signal.h>
@@ -741,6 +744,99 @@ static int test_cgcore_lesser_euid_open(const char *root)
 	return ret;
 }
 
+struct lesser_ns_open_thread_arg {
+	const char	*path;
+	int		fd;
+	int		err;
+};
+
+static int lesser_ns_open_thread_fn(void *arg)
+{
+	struct lesser_ns_open_thread_arg *targ = arg;
+
+	targ->fd = open(targ->path, O_RDWR);
+	targ->err = errno;
+	return 0;
+}
+
+/*
+ * cgroup migration permission check should be performed based on the cgroup
+ * namespace at the time of open instead of write.
+ */
+static int test_cgcore_lesser_ns_open(const char *root)
+{
+	static char stack[65536];
+	const uid_t test_euid = 65534;	/* usually nobody, any !root is fine */
+	int ret = KSFT_FAIL;
+	char *cg_test_a = NULL, *cg_test_b = NULL;
+	char *cg_test_a_procs = NULL, *cg_test_b_procs = NULL;
+	int cg_test_b_procs_fd = -1;
+	struct lesser_ns_open_thread_arg targ = { .fd = -1 };
+	pid_t pid;
+	int status;
+
+	cg_test_a = cg_name(root, "cg_test_a");
+	cg_test_b = cg_name(root, "cg_test_b");
+
+	if (!cg_test_a || !cg_test_b)
+		goto cleanup;
+
+	cg_test_a_procs = cg_name(cg_test_a, "cgroup.procs");
+	cg_test_b_procs = cg_name(cg_test_b, "cgroup.procs");
+
+	if (!cg_test_a_procs || !cg_test_b_procs)
+		goto cleanup;
+
+	if (cg_create(cg_test_a) || cg_create(cg_test_b))
+		goto cleanup;
+
+	if (cg_enter_current(cg_test_b))
+		goto cleanup;
+
+	if (chown(cg_test_a_procs, test_euid, -1) ||
+	    chown(cg_test_b_procs, test_euid, -1))
+		goto cleanup;
+
+	targ.path = cg_test_b_procs;
+	pid = clone(lesser_ns_open_thread_fn, stack + sizeof(stack),
+		    CLONE_NEWCGROUP | CLONE_FILES | CLONE_VM | SIGCHLD,
+		    &targ);
+	if (pid < 0)
+		goto cleanup;
+
+	if (waitpid(pid, &status, 0) < 0)
+		goto cleanup;
+
+	if (!WIFEXITED(status))
+		goto cleanup;
+
+	cg_test_b_procs_fd = targ.fd;
+	if (cg_test_b_procs_fd < 0)
+		goto cleanup;
+
+	if (cg_enter_current(cg_test_a))
+		goto cleanup;
+
+	if ((status = write(cg_test_b_procs_fd, "0", 1)) >= 0 || errno != ENOENT)
+		goto cleanup;
+
+	ret = KSFT_PASS;
+
+cleanup:
+	cg_enter_current(root);
+	if (cg_test_b_procs_fd >= 0)
+		close(cg_test_b_procs_fd);
+	if (cg_test_b)
+		cg_destroy(cg_test_b);
+	if (cg_test_a)
+		cg_destroy(cg_test_a);
+	free(cg_test_b_procs);
+	free(cg_test_a_procs);
+	free(cg_test_b);
+	free(cg_test_a);
+	return ret;
+}
+
 #define T(x) { x, #x }
 struct corecg_test {
 	int (*fn)(const char *root);
@@ -757,6 +853,7 @@ struct corecg_test {
 	T(test_cgcore_thread_migration),
 	T(test_cgcore_destroy),
 	T(test_cgcore_lesser_euid_open),
+	T(test_cgcore_lesser_ns_open),
 };
 #undef T
 
-- 
Gitee


From 00c200254efd233f4a843dd3f6a763b1dfd92940 Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Thu, 14 Jul 2022 21:59:38 +0800
Subject: [PATCH 347/674] arm64: module: remove (NOLOAD) from linker script

stable inclusion
from stable-v5.10.111
commit e1f540b752cbc671e52b30184eefa9ffab0bd28e
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=e1f540b752cbc671e52b30184eefa9ffab0bd28e

--------------------------------

commit 4013e26670c590944abdab56c4fa797527b74325 upstream.

On ELF, (NOLOAD) sets the section type to SHT_NOBITS[1]. It is conceptually
inappropriate for .plt and .text.* sections which are always
SHT_PROGBITS.

In GNU ld, if PLT entries are needed, .plt will be SHT_PROGBITS anyway
and (NOLOAD) will be essentially ignored. In ld.lld, since
https://reviews.llvm.org/D118840 ("[ELF] Support (TYPE=<value>) to
customize the output section type"), ld.lld will report a `section type
mismatch` error. Just remove (NOLOAD) to fix the error.

[1] https://lld.llvm.org/ELF/linker_script.html As of today, "The
section should be marked as not loadable" on
https://sourceware.org/binutils/docs/ld/Output-Section-Type.html is
outdated for ELF.

Tested-by: Nathan Chancellor <nathan@kernel.org>
Reported-by: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Fangrui Song <maskray@google.com>
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20220218081209.354383-1-maskray@google.com
Signed-off-by: Will Deacon <will@kernel.org>
[nathan: Fix conflicts due to lack of 1cbdf60bd1b7]
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 arch/arm64/include/asm/module.lds.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/include/asm/module.lds.h b/arch/arm64/include/asm/module.lds.h
index 810045628c66..0522337d600a 100644
--- a/arch/arm64/include/asm/module.lds.h
+++ b/arch/arm64/include/asm/module.lds.h
@@ -1,7 +1,7 @@
 #ifdef CONFIG_ARM64_MODULE_PLTS
 SECTIONS {
-	.plt 0 (NOLOAD) : { BYTE(0) }
-	.init.plt 0 (NOLOAD) : { BYTE(0) }
-	.text.ftrace_trampoline 0 (NOLOAD) : { BYTE(0) }
+	.plt 0 : { BYTE(0) }
+	.init.plt 0 : { BYTE(0) }
+	.text.ftrace_trampoline 0 : { BYTE(0) }
 }
 #endif
-- 
Gitee


From 13b7a59d21a8b21e731d7e146b2100de1f106be4 Mon Sep 17 00:00:00 2001
From: "Andrea Parri (Microsoft)" <parri.andrea@gmail.com>
Date: Thu, 14 Jul 2022 21:59:39 +0800
Subject: [PATCH 348/674] Drivers: hv: vmbus: Replace smp_store_mb() with
 virt_store_mb()

stable inclusion
from stable-v5.10.111
commit 000e09462f859d71563824c413717a2d3eda4bca
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=000e09462f859d71563824c413717a2d3eda4bca

--------------------------------

commit eaa03d34535872d29004cb5cf77dc9dec1ba9a25 upstream.

Following the recommendation in Documentation/memory-barriers.txt for
virtual machine guests.

Fixes: 8b6a877c060ed ("Drivers: hv: vmbus: Replace the per-CPU channel lists with a global array of channels")
Signed-off-by: Andrea Parri (Microsoft) <parri.andrea@gmail.com>
Link: https://lore.kernel.org/r/20220328154457.100872-1-parri.andrea@gmail.com
Signed-off-by: Wei Liu <wei.liu@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/hv/channel_mgmt.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index 6476bfe193af..5dbb949b1afd 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -350,7 +350,7 @@ void vmbus_channel_map_relid(struct vmbus_channel *channel)
 	 * execute:
 	 *
 	 *  (a) In the "normal (i.e., not resuming from hibernation)" path,
-	 *      the full barrier in smp_store_mb() guarantees that the store
+	 *      the full barrier in virt_store_mb() guarantees that the store
 	 *      is propagated to all CPUs before the add_channel_work work
 	 *      is queued.  In turn, add_channel_work is queued before the
 	 *      channel's ring buffer is allocated/initialized and the
@@ -362,14 +362,14 @@ void vmbus_channel_map_relid(struct vmbus_channel *channel)
 	 *      recv_int_page before retrieving the channel pointer from the
 	 *      array of channels.
 	 *
-	 *  (b) In the "resuming from hibernation" path, the smp_store_mb()
+	 *  (b) In the "resuming from hibernation" path, the virt_store_mb()
 	 *      guarantees that the store is propagated to all CPUs before
 	 *      the VMBus connection is marked as ready for the resume event
 	 *      (cf. check_ready_for_resume_event()).  The interrupt handler
 	 *      of the VMBus driver and vmbus_chan_sched() can not run before
 	 *      vmbus_bus_resume() has completed execution (cf. resume_noirq).
 	 */
-	smp_store_mb(
+	virt_store_mb(
 		vmbus_connection.channels[channel->offermsg.child_relid],
 		channel);
 }
-- 
Gitee


From 26acbfd810c1e4b9697b9c848ba73cdbf0198019 Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@arm.com>
Date: Thu, 14 Jul 2022 21:59:40 +0800
Subject: [PATCH 349/674] irqchip/gic, gic-v3: Prevent GSI to SGI translations

stable inclusion
from stable-v5.10.111
commit 5973f7507a73878653de7f27f4a433c9a8ba2690
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=5973f7507a73878653de7f27f4a433c9a8ba2690

--------------------------------

commit 544808f7e21cb9ccdb8f3aa7de594c05b1419061 upstream.

At the moment the GIC IRQ domain translation routine happily converts
ACPI table GSI numbers below 16 to GIC SGIs (Software Generated
Interrupts aka IPIs). On the Devicetree side we explicitly forbid this
translation, actually the function will never return HWIRQs below 16 when
using a DT based domain translation.

We expect SGIs to be handled in the first part of the function, and any
further occurrence should be treated as a firmware bug, so add a check
and print to report this explicitly and avoid lengthy debug sessions.

Fixes: 64b499d8df40 ("irqchip/gic-v3: Configure SGIs as standard interrupts")
Signed-off-by: Andre Przywara <andre.przywara@arm.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20220404110842.2882446-1-andre.przywara@arm.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 drivers/irqchip/irq-gic-v3.c | 6 ++++++
 drivers/irqchip/irq-gic.c    | 6 ++++++
 2 files changed, 12 insertions(+)

diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c
index 130b19aeb679..64d7811a2b77 100644
--- a/drivers/irqchip/irq-gic-v3.c
+++ b/drivers/irqchip/irq-gic-v3.c
@@ -1560,6 +1560,12 @@ static int gic_irq_domain_translate(struct irq_domain *d,
 		if(fwspec->param_count != 2)
 			return -EINVAL;
 
+		if (fwspec->param[0] < 16) {
+			pr_err(FW_BUG "Illegal GSI%d translation request\n",
+			       fwspec->param[0]);
+			return -EINVAL;
+		}
+
 		*hwirq = fwspec->param[0];
 		*type = fwspec->param[1];
 
diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c
index 176f5f06432d..205cbd24ff20 100644
--- a/drivers/irqchip/irq-gic.c
+++ b/drivers/irqchip/irq-gic.c
@@ -1094,6 +1094,12 @@ static int gic_irq_domain_translate(struct irq_domain *d,
 		if(fwspec->param_count != 2)
 			return -EINVAL;
 
+		if (fwspec->param[0] < 16) {
+			pr_err(FW_BUG "Illegal GSI%d translation request\n",
+			       fwspec->param[0]);
+			return -EINVAL;
+		}
+
 		*hwirq = fwspec->param[0];
 		*type = fwspec->param[1];
 
-- 
Gitee


From e57838ef1deca12650ab27919d6b86bbdd2446c5 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Thu, 14 Jul 2022 21:59:41 +0800
Subject: [PATCH 350/674] mm/sparsemem: fix 'mem_section' will never be NULL
 gcc 12 warning

stable inclusion
from stable-v5.10.111
commit 5c672073bcca537a0880257f31c6d942388df046
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=5c672073bcca537a0880257f31c6d942388df046

--------------------------------

commit a431dbbc540532b7465eae4fc8b56a85a9fc7d17 upstream.

The gcc 12 compiler reports a "'mem_section' will never be NULL" warning
on the following code:

    static inline struct mem_section *__nr_to_section(unsigned long nr)
    {
    #ifdef CONFIG_SPARSEMEM_EXTREME
        if (!mem_section)
                return NULL;
    #endif
        if (!mem_section[SECTION_NR_TO_ROOT(nr)])
                return NULL;
       :

It happens with CONFIG_SPARSEMEM_EXTREME off.  The mem_section definition
is

    #ifdef CONFIG_SPARSEMEM_EXTREME
    extern struct mem_section **mem_section;
    #else
    extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT];
    #endif

In the !CONFIG_SPARSEMEM_EXTREME case, mem_section is a static
2-dimensional array and so the check "!mem_section[SECTION_NR_TO_ROOT(nr)]"
doesn't make sense.

Fix this warning by moving the "!mem_section[SECTION_NR_TO_ROOT(nr)]"
check up inside the CONFIG_SPARSEMEM_EXTREME block and adding an
explicit NR_SECTION_ROOTS check to make sure that there is no
out-of-bound array access.

Link: https://lkml.kernel.org/r/20220331180246.2746210-1-longman@redhat.com
Fixes: 3e347261a80b ("sparsemem extreme implementation")
Signed-off-by: Waiman Long <longman@redhat.com>
Reported-by: Justin Forbes <jforbes@redhat.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Rafael Aquini <aquini@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 include/linux/mmzone.h | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 7e1e09fe4e3b..13c7a048e5ab 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1282,13 +1282,16 @@ static inline unsigned long *section_to_usemap(struct mem_section *ms)
 
 static inline struct mem_section *__nr_to_section(unsigned long nr)
 {
+	unsigned long root = SECTION_NR_TO_ROOT(nr);
+
+	if (unlikely(root >= NR_SECTION_ROOTS))
+		return NULL;
+
 #ifdef CONFIG_SPARSEMEM_EXTREME
-	if (!mem_section)
+	if (!mem_section || !mem_section[root])
 		return NULL;
 #endif
-	if (!mem_section[SECTION_NR_TO_ROOT(nr)])
-		return NULL;
-	return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK];
+	return &mem_section[root][nr & SECTION_ROOT_MASK];
 }
 extern unsigned long __section_nr(struct mem_section *ms);
 extern size_t mem_section_usage_size(void);
-- 
Gitee


From 8a66ef5ba84e4dd35aa9a52a7c89e78fcd31ceb6 Mon Sep 17 00:00:00 2001
From: Zheng Zengkai <zhengzengkai@huawei.com>
Date: Thu, 14 Jul 2022 21:59:42 +0800
Subject: [PATCH 351/674] Revert "powerpc: Fix virt_addr_valid() check"

hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z
CVE: NA

--------------------------------

This reverts commit 446340627ad9171d570513e86ff5337c5d74c113.
Revert the old patch and apply the new patch from v5.10.111 LTS.

Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 arch/powerpc/include/asm/page.h | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
index 300d4c105a3a..254687258f42 100644
--- a/arch/powerpc/include/asm/page.h
+++ b/arch/powerpc/include/asm/page.h
@@ -132,10 +132,7 @@ static inline bool pfn_valid(unsigned long pfn)
 #define virt_to_page(kaddr)	pfn_to_page(virt_to_pfn(kaddr))
 #define pfn_to_kaddr(pfn)	__va((pfn) << PAGE_SHIFT)
 
-#define virt_addr_valid(vaddr)	({						\
-	unsigned long _addr = (unsigned long)vaddr;				\
-	(unsigned long)(_addr) >= PAGE_OFFSET && pfn_valid(virt_to_pfn(_addr));	\
-})
+#define virt_addr_valid(kaddr)	pfn_valid(virt_to_pfn(kaddr))
 
 /*
  * On Book-E parts we need __va to parse the device tree and we can't
-- 
Gitee


From 1377e00ff63a005f49784c3aeae0d1737a9fa007 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Thu, 14 Jul 2022 21:59:43 +0800
Subject: [PATCH 352/674] powerpc: Fix virt_addr_valid() for 64-bit Book3E &
 32-bit

stable inclusion
from stable-v5.10.111
commit d36febbcd537fcc50284e8b89609632d0146529f
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GL1Z

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=d36febbcd537fcc50284e8b89609632d0146529f

--------------------------------

commit ffa0b64e3be58519ae472ea29a1a1ad681e32f48 upstream.

mpe: On 64-bit Book3E vmalloc space starts at 0x8000000000000000.

Because of the way __pa() works we have:
  __pa(0x8000000000000000) == 0, and therefore
  virt_to_pfn(0x8000000000000000) == 0, and therefore
  virt_addr_valid(0x8000000000000000) == true

Which is wrong, virt_addr_valid() should be false for vmalloc space.
In fact all vmalloc addresses that alias with a valid PFN will return
true from virt_addr_valid(). That can cause bugs with hardened usercopy
as described below by Kefeng Wang:

  When running ethtool eth0 on 64-bit Book3E, a BUG occurred:

    usercopy: Kernel memory exposure attempt detected from SLUB object not in SLUB page?! (offset 0, size 1048)!
    kernel BUG at mm/usercopy.c:99
    ...
    usercopy_abort+0x64/0xa0 (unreliable)
    __check_heap_object+0x168/0x190
    __check_object_size+0x1a0/0x200
    dev_ethtool+0x2494/0x2b20
    dev_ioctl+0x5d0/0x770
    sock_do_ioctl+0xf0/0x1d0
    sock_ioctl+0x3ec/0x5a0
    __se_sys_ioctl+0xf0/0x160
    system_call_exception+0xfc/0x1f0
    system_call_common+0xf8/0x200

  The code shows below,

    data = vzalloc(array_size(gstrings.len, ETH_GSTRING_LEN));
    copy_to_user(useraddr, data, gstrings.len * ETH_GSTRING_LEN))

  The data is alloced by vmalloc(), virt_addr_valid(ptr) will return true
  on 64-bit Book3E, which leads to the panic.

  As commit 4dd7554a6456 ("powerpc/64: Add VIRTUAL_BUG_ON checks for __va
  and __pa addresses") does, make sure the virt addr above PAGE_OFFSET in
  the virt_addr_valid() for 64-bit, also add upper limit check to make
  sure the virt is below high_memory.

  Meanwhile, for 32-bit PAGE_OFFSET is the virtual address of the start
  of lowmem, high_memory is the upper low virtual address, the check is
  suitable for 32-bit, this will fix the issue mentioned in commit
  602946ec2f90 ("powerpc: Set max_mapnr correctly") too.

On 32-bit there is a similar problem with high memory, that was fixed in
commit 602946ec2f90 ("powerpc: Set max_mapnr correctly"), but that
commit breaks highmem and needs to be reverted.

We can't easily fix __pa(), we have code that relies on its current
behaviour. So for now add extra checks to virt_addr_valid().

For 64-bit Book3S the extra checks are not necessary, the combination of
virt_to_pfn() and pfn_valid() should yield the correct result, but they
are harmless.

Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu>
[mpe: Add additional change log detail]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20220406145802.538416-1-mpe@ellerman.id.au
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
---
 arch/powerpc/include/asm/page.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
index 254687258f42..f2c5c26869f1 100644
--- a/arch/powerpc/include/asm/page.h
+++ b/arch/powerpc/include/asm/page.h
@@ -132,7 +132,11 @@ static inline bool pfn_valid(unsigned long pfn)
 #define virt_to_page(kaddr)	pfn_to_page(virt_to_pfn(kaddr))
 #define pfn_to_kaddr(pfn)	__va((pfn) << PAGE_SHIFT)
 
-#define virt_addr_valid(kaddr)	pfn_valid(virt_to_pfn(kaddr))
+#define virt_addr_valid(vaddr)	({					\
+	unsigned long _addr = (unsigned long)vaddr;			\
+	_addr >= PAGE_OFFSET && _addr < (unsigned long)high_memory &&	\
+	pfn_valid(virt_to_pfn(_addr));					\
+})
 
 /*
  * On Book-E parts we need __va to parse the device tree and we can't
-- 
Gitee


From 70ae4a2480398d82a47cd27b093859a42190ee66 Mon Sep 17 00:00:00 2001
From: Wang Wensheng <wangwensheng4@huawei.com>
Date: Tue, 19 Jul 2022 11:36:02 +0800
Subject: [PATCH 353/674] mm/sharepool: Allow share THP to kernel

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5DS9S
CVE: NA

--------------------------------------------------

This is not used for THP but the user page table is just like THP. The
user alloc hugepages via a special driver and its vma is not marked with
VM_HUGETLB. This commit allow to share those vma to kernel.

Signed-off-by: Wang Wensheng <wangwensheng4@huawei.com>
Reviewed-by: Weilong Chen <chenweilong@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 include/linux/share_pool.h |  1 +
 mm/share_pool.c            | 44 +++++++++++++++++++++++++++++++++-----
 2 files changed, 40 insertions(+), 5 deletions(-)

diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h
index 6f294911c6af..d95084b8f624 100644
--- a/include/linux/share_pool.h
+++ b/include/linux/share_pool.h
@@ -178,6 +178,7 @@ struct sp_walk_data {
 	unsigned long uva_aligned;
 	unsigned long page_size;
 	bool is_hugepage;
+	bool is_page_type_set;
 	pmd_t *pmd;
 };
 
diff --git a/mm/share_pool.c b/mm/share_pool.c
index 76088952d0a5..60ad48e238c4 100644
--- a/mm/share_pool.c
+++ b/mm/share_pool.c
@@ -2994,9 +2994,40 @@ EXPORT_SYMBOL_GPL(mg_sp_make_share_k2u);
 static int sp_pmd_entry(pmd_t *pmd, unsigned long addr,
 			unsigned long next, struct mm_walk *walk)
 {
+	struct page *page;
 	struct sp_walk_data *sp_walk_data = walk->private;
 
+	/*
+	 * There exist a scene in DVPP where the pagetable is huge page but its
+	 * vma doesn't record it, something like THP.
+	 * So we cannot make out whether it is a hugepage map until we access the
+	 * pmd here. If mixed size of pages appear, just return an error.
+	 */
+	if (pmd_huge(*pmd)) {
+		if (!sp_walk_data->is_page_type_set) {
+			sp_walk_data->is_page_type_set = true;
+			sp_walk_data->is_hugepage = true;
+		} else if (!sp_walk_data->is_hugepage)
+			return -EFAULT;
+
+		/* To skip pte level walk */
+		walk->action = ACTION_CONTINUE;
+
+		page = pmd_page(*pmd);
+		get_page(page);
+		sp_walk_data->pages[sp_walk_data->page_count++] = page;
+
+		return 0;
+	}
+
+	if (!sp_walk_data->is_page_type_set) {
+		sp_walk_data->is_page_type_set = true;
+		sp_walk_data->is_hugepage = false;
+	} else if (sp_walk_data->is_hugepage)
+		return -EFAULT;
+
 	sp_walk_data->pmd = pmd;
+
 	return 0;
 }
 
@@ -3140,6 +3171,8 @@ static int __sp_walk_page_range(unsigned long uva, unsigned long size,
 		sp_walk.pmd_entry = sp_pmd_entry;
 	}
 
+	sp_walk_data->is_page_type_set = false;
+	sp_walk_data->page_count = 0;
 	sp_walk_data->page_size = page_size;
 	uva_aligned = ALIGN_DOWN(uva, page_size);
 	sp_walk_data->uva_aligned = uva_aligned;
@@ -3164,8 +3197,12 @@ static int __sp_walk_page_range(unsigned long uva, unsigned long size,
 
 	ret = walk_page_range(mm, uva_aligned, uva_aligned + size_aligned,
 			      &sp_walk, sp_walk_data);
-	if (ret)
+	if (ret) {
+		while (sp_walk_data->page_count--)
+			put_page(pages[sp_walk_data->page_count]);
 		kvfree(pages);
+		sp_walk_data->pages = NULL;
+	}
 
 	return ret;
 }
@@ -3201,9 +3238,7 @@ void *sp_make_share_u2k(unsigned long uva, unsigned long size, int pid)
 	int ret = 0;
 	struct mm_struct *mm = current->mm;
 	void *p = ERR_PTR(-ESRCH);
-	struct sp_walk_data sp_walk_data = {
-		.page_count = 0,
-	};
+	struct sp_walk_data sp_walk_data;
 	struct vm_struct *area;
 
 	check_interrupt_context();
@@ -3544,7 +3579,6 @@ int sp_walk_page_range(unsigned long uva, unsigned long size,
 		return -ESRCH;
 	}
 
-	sp_walk_data->page_count = 0;
 	down_write(&mm->mmap_lock);
 	if (likely(!mm->core_state))
 		ret = __sp_walk_page_range(uva, size, mm, sp_walk_data);
-- 
Gitee


From 5cc1b791067f40e729f4690bc4957ba1c57aa943 Mon Sep 17 00:00:00 2001
From: Guo Mengqi <guomengqi3@huawei.com>
Date: Tue, 19 Jul 2022 11:36:03 +0800
Subject: [PATCH 354/674] mm/sharepool: use rwsem to protect sp group exit

ascend inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5DS9S
CVE: NA

-------------------------------------------------

Fix following situation:

when the last process in a group exits, and a second process tries to add
to this group.

The second process may get a invalid spg. However the group's
use_count is increased by 1, which caused the first process failed to
free the group when it exits. And then second process called
sp_group_drop --> free_sp_group and cause a double request of rwsem.

Signed-off-by: Guo Mengqi <guomengqi3@huawei.com>
Reviewed-by: Weilong Chen <chenweilong@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 mm/share_pool.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/mm/share_pool.c b/mm/share_pool.c
index 60ad48e238c4..a8266a4a7e79 100644
--- a/mm/share_pool.c
+++ b/mm/share_pool.c
@@ -698,20 +698,25 @@ static void free_new_spg_id(bool new, int spg_id)
 		free_sp_group_id(spg_id);
 }
 
-static void free_sp_group(struct sp_group *spg)
+static void free_sp_group_locked(struct sp_group *spg)
 {
 	fput(spg->file);
 	fput(spg->file_hugetlb);
 	free_spg_stat(spg->id);
-	down_write(&sp_group_sem);
 	idr_remove(&sp_group_idr, spg->id);
-	up_write(&sp_group_sem);
 	free_sp_group_id((unsigned int)spg->id);
 	kfree(spg);
 	system_group_count--;
 	WARN(system_group_count < 0, "unexpected group count\n");
 }
 
+static void free_sp_group(struct sp_group *spg)
+{
+	down_write(&sp_group_sem);
+	free_sp_group_locked(spg);
+	up_write(&sp_group_sem);
+}
+
 static void sp_group_drop(struct sp_group *spg)
 {
 	if (atomic_dec_and_test(&spg->use_count))
@@ -4473,14 +4478,15 @@ void sp_group_post_exit(struct mm_struct *mm)
 		sp_proc_stat_drop(stat);
 	}
 
-	/* lockless traverse */
+	down_write(&sp_group_sem);
 	list_for_each_entry_safe(spg_node, tmp, &master->node_list, group_node) {
 		spg = spg_node->spg;
 		/* match with refcount inc in sp_group_add_task */
-		sp_group_drop(spg);
+		if (atomic_dec_and_test(&spg->use_count))
+			free_sp_group_locked(spg);
 		kfree(spg_node);
 	}
-
+	up_write(&sp_group_sem);
 	kfree(master);
 }
 
-- 
Gitee


From 8d06b0971a06c4cd5e6dcc645e71e749e3085856 Mon Sep 17 00:00:00 2001
From: Wang Wensheng <wangwensheng4@huawei.com>
Date: Tue, 19 Jul 2022 11:36:04 +0800
Subject: [PATCH 355/674] mm/sharepool: Accept device_id in k2u flags

ascend inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5DS9S
CVE: NA

-------------------------------------------------

We use device_id to select the correct dvpp vspace range when SP_DVPP
flag is specified.

Signed-off-by: Wang Wensheng <wangwensheng4@huawei.com>
Reviewed-by: Weilong Chen <chenweilong@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 mm/share_pool.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mm/share_pool.c b/mm/share_pool.c
index a8266a4a7e79..087e8f8cbfb3 100644
--- a/mm/share_pool.c
+++ b/mm/share_pool.c
@@ -2852,10 +2852,11 @@ static int sp_k2u_prepare(unsigned long kva, unsigned long size,
 
 	trace_sp_k2u_begin(kc);
 
-	if (sp_flags & ~SP_DVPP) {
+	if (sp_flags & ~SP_FLAG_MASK) {
 		pr_err_ratelimited("k2u sp_flags %lx error\n", sp_flags);
 		return -EINVAL;
 	}
+	sp_flags &= ~SP_HUGEPAGE;
 
 	if (!current->mm) {
 		pr_err_ratelimited("k2u: kthread is not allowed\n");
-- 
Gitee


From 5815e08af8ecbfefc844e692ed7bf597ad696253 Mon Sep 17 00:00:00 2001
From: Guo Mengqi <guomengqi3@huawei.com>
Date: Tue, 19 Jul 2022 11:36:05 +0800
Subject: [PATCH 356/674] mm/sharepool: Modify sharepool sp_mmap() page_offset

ascend inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5DS9S
CVE: NA

-----------------------------------

In sp_mmap(), if use offset = va - MMAP_BASE/DVPP_BASE, then normal
sp_alloc pgoff may have same value with DVPP pgoff, causing DVPP
and sp_alloc mapped to overlapped part of file unexpectedly.

To fix the problem, pass VA value as mmap offset, for in this scenario,
VA value in one task address space will not be same.

Signed-off-by: Guo Mengqi <guomengqi3@huawei.com>
Reviewed-by: Ding Tianhong <dingtianhong@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Reviewed-by: Weilong Chen <chenweilong@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 mm/share_pool.c | 21 +++++----------------
 1 file changed, 5 insertions(+), 16 deletions(-)

diff --git a/mm/share_pool.c b/mm/share_pool.c
index 087e8f8cbfb3..29bbe0732781 100644
--- a/mm/share_pool.c
+++ b/mm/share_pool.c
@@ -58,6 +58,11 @@
 
 #define spg_valid(spg)		((spg)->is_alive == true)
 
+/* Use spa va address as mmap offset. This can work because spa_file
+ * is setup with 64-bit address space. So va shall be well covered.
+ */
+#define addr_offset(spa)	((spa)->va_start)
+
 #define byte2kb(size)		((size) >> 10)
 #define byte2mb(size)		((size) >> 20)
 #define page2kb(page_num)	((page_num) << (PAGE_SHIFT - 10))
@@ -931,22 +936,6 @@ static bool is_device_addr(unsigned long addr)
 	return false;
 }
 
-static loff_t addr_offset(struct sp_area *spa)
-{
-	unsigned long addr;
-
-	if (unlikely(!spa)) {
-		WARN(1, "invalid spa when calculate addr offset\n");
-		return 0;
-	}
-	addr = spa->va_start;
-
-	if (!is_device_addr(addr))
-		return (loff_t)(addr - MMAP_SHARE_POOL_START);
-
-	return (loff_t)(addr - sp_dev_va_start[spa->device_id]);
-}
-
 static struct sp_group *create_spg(int spg_id)
 {
 	int ret;
-- 
Gitee


From 022b7bdfb108d3623c3ce8ab6740d18ee22ac2e4 Mon Sep 17 00:00:00 2001
From: Zhou Guanghui <zhouguanghui1@huawei.com>
Date: Tue, 19 Jul 2022 11:36:06 +0800
Subject: [PATCH 357/674] mm/sharepool: Support read-only memory allocation

ascend inclusion
category: Bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5DS9S
CVE: NA

--------------------------------

When the driver uses the shared pool memory to share the memory
with the user space, the user space is not allowed to operate
this area. This prevents users from damaging sensitive data.

When the sp_alloc and k2u processes apply for private memory,
read-only memory can be applied for.

Signed-off-by: Zhou Guanghui <zhouguanghui1@huawei.com>
Reviewed-by: Weilong Chen <chenweilong@huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 include/linux/share_pool.h |  3 ++-
 mm/share_pool.c            | 18 ++++++++++++++++--
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h
index d95084b8f624..022e61bb6ce4 100644
--- a/include/linux/share_pool.h
+++ b/include/linux/share_pool.h
@@ -15,6 +15,7 @@
 #define SP_HUGEPAGE_ONLY	(1 << 1)
 #define SP_DVPP			(1 << 2)
 #define SP_SPEC_NODE_ID		(1 << 3)
+#define SP_PROT_RO		(1 << 16)
 
 #define DEVICE_ID_BITS		4UL
 #define DEVICE_ID_MASK		((1UL << DEVICE_ID_BITS) - 1UL)
@@ -24,7 +25,7 @@
 #define NODE_ID_SHIFT		(DEVICE_ID_SHIFT + DEVICE_ID_BITS)
 
 #define SP_FLAG_MASK		(SP_HUGEPAGE | SP_HUGEPAGE_ONLY | SP_DVPP | \
-				 SP_SPEC_NODE_ID | \
+				 SP_SPEC_NODE_ID | SP_PROT_RO | \
 				(DEVICE_ID_MASK << DEVICE_ID_SHIFT) | \
 				(NODE_ID_MASK << NODE_ID_SHIFT))
 
diff --git a/mm/share_pool.c b/mm/share_pool.c
index 29bbe0732781..8be3f75d0449 100644
--- a/mm/share_pool.c
+++ b/mm/share_pool.c
@@ -2325,6 +2325,9 @@ static int sp_alloc_mmap(struct mm_struct *mm, struct sp_area *spa,
 	if (spg_node)
 		prot = spg_node->prot;
 
+	if (ac->sp_flags & SP_PROT_RO)
+		prot = PROT_READ;
+
 	/* when success, mmap_addr == spa->va_start */
 	mmap_addr = sp_mmap(mm, spa_file(spa), spa, &populate, prot);
 	if (IS_ERR_VALUE(mmap_addr)) {
@@ -2349,6 +2352,10 @@ static int sp_alloc_mmap(struct mm_struct *mm, struct sp_area *spa,
 		ret = -EINVAL;
 		goto unmap;
 	}
+
+	if (ac->sp_flags & SP_PROT_RO)
+		vma->vm_flags &= ~VM_MAYWRITE;
+
 	/* clean PTE_RDONLY flags or trigger SMMU event */
 	if (prot & PROT_WRITE)
 		vma->vm_page_prot = __pgprot(((~PTE_RDONLY) & vma->vm_page_prot.pgprot) | PTE_DIRTY);
@@ -2644,6 +2651,9 @@ static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa,
 		goto put_mm;
 	}
 
+	if (kc && kc->sp_flags & SP_PROT_RO)
+		prot = PROT_READ;
+
 	ret_addr = sp_mmap(mm, spa_file(spa), spa, &populate, prot);
 	if (IS_ERR_VALUE(ret_addr)) {
 		pr_debug("k2u mmap failed %lx\n", ret_addr);
@@ -2656,6 +2666,9 @@ static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa,
 	if (prot & PROT_WRITE)
 		vma->vm_page_prot = __pgprot(((~PTE_RDONLY) & vma->vm_page_prot.pgprot) | PTE_DIRTY);
 
+	if (kc && kc->sp_flags & SP_PROT_RO)
+		vma->vm_flags &= ~VM_MAYWRITE;
+
 	if (is_vm_hugetlb_page(vma)) {
 		ret = remap_vmalloc_hugepage_range(vma, (void *)kva, 0);
 		if (ret) {
@@ -2707,6 +2720,7 @@ static void *sp_make_share_kva_to_task(unsigned long kva, unsigned long size, un
 	struct sp_area *spa;
 	struct spg_proc_stat *stat;
 	unsigned long prot = PROT_READ | PROT_WRITE;
+	struct sp_k2u_context kc;
 
 	down_write(&sp_group_sem);
 	stat = sp_init_process_stat(current, current->mm, spg_none);
@@ -2725,8 +2739,8 @@ static void *sp_make_share_kva_to_task(unsigned long kva, unsigned long size, un
 	}
 
 	spa->kva = kva;
-
-	uva = (void *)sp_remap_kva_to_vma(kva, spa, current->mm, prot, NULL);
+	kc.sp_flags = sp_flags;
+	uva = (void *)sp_remap_kva_to_vma(kva, spa, current->mm, prot, &kc);
 	__sp_area_drop(spa);
 	if (IS_ERR(uva))
 		pr_err("remap k2u to task failed %ld\n", PTR_ERR(uva));
-- 
Gitee


From ce9fcee175901940ba7734c898392de94012270c Mon Sep 17 00:00:00 2001
From: Wang Wensheng <wangwensheng4@huawei.com>
Date: Tue, 19 Jul 2022 11:36:07 +0800
Subject: [PATCH 358/674] mm/sharepool: Fix using uninitialized sp_flag

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5DS9S
CVE: NA

-------------------------------------------------

Add the missing initialization for kc.sp_flag in
sp_make_share_kva_to_spg(). Or a random value would be used in
sp_remap_kva_to_vma().

Signed-off-by: Wang Wensheng <wangwensheng4@huawei.com>
Reviewed-by: Weilong Chen <chenweilong@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 mm/share_pool.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/share_pool.c b/mm/share_pool.c
index 8be3f75d0449..edba445d4bbf 100644
--- a/mm/share_pool.c
+++ b/mm/share_pool.c
@@ -2781,7 +2781,7 @@ static void *sp_make_share_kva_to_spg(unsigned long kva, unsigned long size,
 	}
 
 	spa->kva = kva;
-
+	kc.sp_flags = sp_flags;
 	list_for_each_entry(spg_node, &spg->procs, proc_node) {
 		mm = spg_node->master->mm;
 		kc.state = K2U_NORMAL;
-- 
Gitee


From d722c6b6843a68aaaaa1984e821e8db003df6d5c Mon Sep 17 00:00:00 2001
From: Yuan Can <yuancan@huawei.com>
Date: Tue, 19 Jul 2022 11:36:08 +0800
Subject: [PATCH 359/674] mm/sharepool: Avoid NULL pointer dereference in
 mg_sp_group_add_task

ascend inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5DS9S
CVE: NA

------------------------------------------------------

create_spg_node() may fail with NULL pointer returened, and in
the out_drop_spg_node path, the NULL pointer will be dereferenced
in free_spg_node().

Signed-off-by: Yuan Can <yuancan@huawei.com>
Reviewed-by: Weilong Chen <chenweilong@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 mm/share_pool.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/share_pool.c b/mm/share_pool.c
index edba445d4bbf..caf3e89b41c4 100644
--- a/mm/share_pool.c
+++ b/mm/share_pool.c
@@ -1283,7 +1283,7 @@ int mg_sp_group_add_task(int pid, unsigned long prot, int spg_id)
 	node = create_spg_node(mm, prot, spg);
 	if (unlikely(IS_ERR(node))) {
 		ret = PTR_ERR(node);
-		goto out_drop_spg_node;
+		goto out_drop_group;
 	}
 
 	/* per process statistics initialization */
-- 
Gitee


From 6db68bf84902025dd57fdfe94a5085166adade03 Mon Sep 17 00:00:00 2001
From: Zhou Guanghui <zhouguanghui1@huawei.com>
Date: Tue, 19 Jul 2022 11:36:09 +0800
Subject: [PATCH 360/674] mm/sharepool: Delete single-group mode

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5DS9S
CVE: NA

-------------------------------------------------

The single-group mode has no application scenario. Therefore, the
related branch is deleted.

The boot option "enable_sp_multi_group_mode" does not take effect.

Signed-off-by: Zhou Guanghui <zhouguanghui1@huawei.com>
Reviewed-by: Weilong Chen <chenweilong@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 mm/share_pool.c | 137 +++++++++---------------------------------------
 1 file changed, 25 insertions(+), 112 deletions(-)

diff --git a/mm/share_pool.c b/mm/share_pool.c
index caf3e89b41c4..076243713f83 100644
--- a/mm/share_pool.c
+++ b/mm/share_pool.c
@@ -67,9 +67,6 @@
 #define byte2mb(size)		((size) >> 20)
 #define page2kb(page_num)	((page_num) << (PAGE_SHIFT - 10))
 
-#define SINGLE_GROUP_MODE	1
-#define MULTI_GROUP_MODE	2
-
 #define MAX_GROUP_FOR_SYSTEM	50000
 #define MAX_GROUP_FOR_TASK	3000
 #define MAX_PROC_PER_GROUP	1024
@@ -98,8 +95,6 @@ int sysctl_share_pool_map_lock_enable;
 int sysctl_sp_perf_k2u;
 int sysctl_sp_perf_alloc;
 
-static int share_pool_group_mode = SINGLE_GROUP_MODE;
-
 static int system_group_count;
 
 static unsigned int sp_device_number;
@@ -1068,12 +1063,6 @@ static int mm_add_group_init(struct mm_struct *mm, struct sp_group *spg)
 	struct sp_group_master *master = mm->sp_group_master;
 	bool exist = false;
 
-	if (share_pool_group_mode == SINGLE_GROUP_MODE && master &&
-	    master->count == 1) {
-		pr_err_ratelimited("at most one sp group for a task is allowed in single mode\n");
-		return -EEXIST;
-	}
-
 	master = sp_init_group_master_locked(mm, &exist);
 	if (IS_ERR(master))
 		return PTR_ERR(master);
@@ -2211,72 +2200,30 @@ static int sp_alloc_prepare(unsigned long size, unsigned long sp_flags,
 	if (sp_flags & SP_HUGEPAGE_ONLY)
 		sp_flags |= SP_HUGEPAGE;
 
-	if (share_pool_group_mode == SINGLE_GROUP_MODE) {
-		spg = __sp_find_spg(current->pid, SPG_ID_DEFAULT);
-		if (spg) {
-			if (spg_id != SPG_ID_DEFAULT && spg->id != spg_id) {
-				sp_group_drop(spg);
-				return -ENODEV;
-			}
-
-			/* up_read will be at the end of sp_alloc */
-			down_read(&spg->rw_lock);
-			if (!spg_valid(spg)) {
-				up_read(&spg->rw_lock);
-				sp_group_drop(spg);
-				pr_err_ratelimited("allocation failed, spg is dead\n");
-				return -ENODEV;
-			}
-		} else {  /* alocation pass through scene */
-			if (enable_mdc_default_group) {
-				int ret = 0;
-
-				ret = sp_group_add_task(current->tgid, spg_id);
-				if (ret < 0) {
-					pr_err_ratelimited("add group failed in pass through\n");
-					return ret;
-				}
-
-				spg = __sp_find_spg(current->pid, SPG_ID_DEFAULT);
-
-				/* up_read will be at the end of sp_alloc */
-				down_read(&spg->rw_lock);
-				if (!spg_valid(spg)) {
-					up_read(&spg->rw_lock);
-					sp_group_drop(spg);
-					pr_err_ratelimited("pass through allocation failed, spg is dead\n");
-					return -ENODEV;
-				}
-			} else {
-				spg = spg_none;
-			}
+	if (spg_id != SPG_ID_DEFAULT) {
+		spg = __sp_find_spg(current->pid, spg_id);
+		if (!spg) {
+			pr_err_ratelimited("allocation failed, can't find group\n");
+			return -ENODEV;
 		}
-	} else {
-		if (spg_id != SPG_ID_DEFAULT) {
-			spg = __sp_find_spg(current->pid, spg_id);
-			if (!spg) {
-				pr_err_ratelimited("allocation failed, can't find group\n");
-				return -ENODEV;
-			}
 
-			/* up_read will be at the end of sp_alloc */
-			down_read(&spg->rw_lock);
-			if (!spg_valid(spg)) {
-				up_read(&spg->rw_lock);
-				sp_group_drop(spg);
-				pr_err_ratelimited("allocation failed, spg is dead\n");
-				return -ENODEV;
-			}
+		/* up_read will be at the end of sp_alloc */
+		down_read(&spg->rw_lock);
+		if (!spg_valid(spg)) {
+			up_read(&spg->rw_lock);
+			sp_group_drop(spg);
+			pr_err_ratelimited("allocation failed, spg is dead\n");
+			return -ENODEV;
+		}
 
-			if (!is_process_in_group(spg, current->mm)) {
-				up_read(&spg->rw_lock);
-				sp_group_drop(spg);
-				pr_err_ratelimited("allocation failed, task not in group\n");
-				return -ENODEV;
-			}
-		} else {  /* alocation pass through scene */
-			spg = spg_none;
+		if (!is_process_in_group(spg, current->mm)) {
+			up_read(&spg->rw_lock);
+			sp_group_drop(spg);
+			pr_err_ratelimited("allocation failed, task not in group\n");
+			return -ENODEV;
 		}
+	} else {  /* alocation pass through scene */
+		spg = spg_none;
 	}
 
 	if (sp_flags & SP_HUGEPAGE) {
@@ -2892,33 +2839,12 @@ static int sp_k2u_prepare(unsigned long kva, unsigned long size,
 	kc->size_aligned = size_aligned;
 	kc->sp_flags = sp_flags;
 	kc->spg_id = spg_id;
-	kc->to_task = false;
-	return 0;
-}
-
-static int sp_check_k2task(struct sp_k2u_context *kc)
-{
-	int ret = 0;
-	int spg_id = kc->spg_id;
-
-	if (share_pool_group_mode == SINGLE_GROUP_MODE) {
-		struct sp_group *spg = get_first_group(current->mm);
+	if (spg_id == SPG_ID_DEFAULT || spg_id == SPG_ID_NONE)
+		kc->to_task = true;
+	else
+		kc->to_task = false;
 
-		if (!spg) {
-			if (spg_id != SPG_ID_NONE && spg_id != SPG_ID_DEFAULT)
-				ret = -EINVAL;
-			else
-				kc->to_task = true;
-		} else {
-			if (spg_id != SPG_ID_DEFAULT && spg_id != spg->id)
-				ret = -EINVAL;
-			sp_group_drop(spg);
-		}
-	} else {
-		if (spg_id == SPG_ID_DEFAULT || spg_id == SPG_ID_NONE)
-			kc->to_task = true;
-	}
-	return ret;
+	return 0;
 }
 
 static void *sp_k2u_finish(void *uva, struct sp_k2u_context *kc)
@@ -2963,12 +2889,6 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size,
 	if (ret)
 		return ERR_PTR(ret);
 
-	ret = sp_check_k2task(&kc);
-	if (ret) {
-		uva = ERR_PTR(ret);
-		goto out;
-	}
-
 	if (kc.to_task)
 		uva = sp_make_share_kva_to_task(kc.kva_aligned, kc.size_aligned, kc.sp_flags);
 	else {
@@ -3735,13 +3655,6 @@ static int __init enable_share_k2u_to_group(char *s)
 }
 __setup("enable_sp_share_k2u_spg", enable_share_k2u_to_group);
 
-static int __init enable_sp_multi_group_mode(char *s)
-{
-	share_pool_group_mode = MULTI_GROUP_MODE;
-	return 1;
-}
-__setup("enable_sp_multi_group_mode", enable_sp_multi_group_mode);
-
 /*** Statistical and maintenance functions ***/
 
 static void free_process_spg_proc_stat(struct sp_proc_stat *proc_stat)
-- 
Gitee


From b31f7f2856d50d7b943d12c43852f28ef630f5ba Mon Sep 17 00:00:00 2001
From: Zhou Guanghui <zhouguanghui1@huawei.com>
Date: Tue, 19 Jul 2022 11:36:10 +0800
Subject: [PATCH 361/674] mm/sharepool: Create global normal and dvpp mapping

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5DS9S
CVE: NA

-------------------------------------------------

struct sp_mapping is used to manage the address space of a shared
pool. During the initialization of the shared pool, normal address
spaces are created to allocate the memory of the current shared pool.

Signed-off-by: Zhou Guanghui <zhouguanghui1@huawei.com>
Reviewed-by: Weilong Chen <chenweilong@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 include/linux/share_pool.h | 18 +++++++++++++
 mm/share_pool.c            | 52 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 70 insertions(+)

diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h
index 022e61bb6ce4..654dc8cc2922 100644
--- a/include/linux/share_pool.h
+++ b/include/linux/share_pool.h
@@ -101,6 +101,17 @@ struct sp_proc_stat {
 	atomic64_t k2u_size;
 };
 
+/*
+ * address space management
+ */
+struct sp_mapping {
+	unsigned long flag;
+	atomic_t user;
+	unsigned long start[MAX_DEVID];
+	unsigned long end[MAX_DEVID];
+	struct rb_root area_root;
+};
+
 /* Processes in the same sp_group can share memory.
  * Memory layout for share pool:
  *
@@ -142,6 +153,8 @@ struct sp_group {
 	atomic_t	 use_count;
 	/* protect the group internal elements, except spa_list */
 	struct rw_semaphore	rw_lock;
+	struct sp_mapping *dvpp;
+	struct sp_mapping *normal;
 };
 
 /* a per-process(per mm) struct which manages a sp_group_node list */
@@ -155,6 +168,11 @@ struct sp_group_master {
 	struct list_head node_list;
 	struct mm_struct *mm;
 	struct sp_proc_stat *stat;
+	/*
+	 * Used to apply for the shared pool memory of the current process.
+	 * For example, sp_alloc non-share memory or k2task.
+	 */
+	struct sp_group *local;
 };
 
 /*
diff --git a/mm/share_pool.c b/mm/share_pool.c
index 076243713f83..6c70ff72b7af 100644
--- a/mm/share_pool.c
+++ b/mm/share_pool.c
@@ -130,6 +130,48 @@ static DECLARE_RWSEM(sp_spg_stat_sem);
 /* for kthread buff_module_guard_work */
 static struct sp_proc_stat kthread_stat;
 
+#define SP_MAPPING_DVPP		0x1
+#define SP_MAPPING_NORMAL	0x2
+static struct sp_mapping *sp_mapping_normal;
+
+static void sp_mapping_range_init(struct sp_mapping *spm)
+{
+	int i;
+
+	for (i = 0; i < MAX_DEVID; i++) {
+		if (spm->flag & SP_MAPPING_NORMAL) {
+			spm->start[i] = MMAP_SHARE_POOL_START;
+			spm->end[i] = MMAP_SHARE_POOL_16G_START;
+			continue;
+		}
+
+		if (!is_sp_dev_addr_enabled(i)) {
+			spm->start[i] = MMAP_SHARE_POOL_16G_START +
+				i * MMAP_SHARE_POOL_16G_START;
+			spm->end[i] = spm->start[i] + MMAP_SHARE_POOL_16G_START;
+		} else {
+			spm->start[i] = sp_dev_va_start[i];
+			spm->end[i] = spm->start[i] + sp_dev_va_size[i];
+		}
+	}
+}
+
+static struct sp_mapping *sp_mapping_create(unsigned long flag)
+{
+	struct sp_mapping *spm;
+
+	spm = kzalloc(sizeof(struct sp_mapping), GFP_KERNEL);
+	if (!spm)
+		return ERR_PTR(-ENOMEM);
+
+	spm->flag = flag;
+	sp_mapping_range_init(spm);
+	atomic_set(&spm->user, 0);
+	spm->area_root = RB_ROOT;
+
+	return spm;
+}
+
 /* The caller must hold sp_group_sem */
 static struct sp_group_master *sp_init_group_master_locked(
 	struct mm_struct *mm, bool *exist)
@@ -4432,12 +4474,22 @@ static void __init sp_device_number_detect(void)
 
 static int __init share_pool_init(void)
 {
+	if (!sp_is_enabled())
+		return 0;
+
 	/* lockless, as init kthread has no sp operation else */
 	spg_none = create_spg(GROUP_NONE);
 	/* without free spg_none, not a serious problem */
 	if (IS_ERR(spg_none) || !spg_none)
 		goto fail;
 
+	sp_mapping_normal = sp_mapping_create(SP_MAPPING_NORMAL);
+	if (IS_ERR(sp_mapping_normal)) {
+		sp_group_drop(spg_none);
+		goto fail;
+	}
+	atomic_inc(&sp_mapping_normal->user);
+
 	sp_device_number_detect();
 	proc_sharepool_init();
 
-- 
Gitee


From 55f1816a3b2fd31c9a2ca8ee7c01bcc387f1700e Mon Sep 17 00:00:00 2001
From: Wang Wensheng <wangwensheng4@huawei.com>
Date: Tue, 19 Jul 2022 11:36:11 +0800
Subject: [PATCH 362/674] mm/sharepool: Fix kabi borken in sp_group_master

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5DS9S
CVE: NA

-------------------------------------------------

The sp_group_master structure is used only in sharepool subsys and no
other drivers use it.

Signed-off-by: Wang Wensheng <wangwensheng4@huawei.com>
Reviewed-by: Weilong Chen <chenweilong@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 include/linux/share_pool.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h
index 654dc8cc2922..b1b81947fa3f 100644
--- a/include/linux/share_pool.h
+++ b/include/linux/share_pool.h
@@ -10,6 +10,7 @@
 #include <linux/hashtable.h>
 #include <linux/numa.h>
 #include <linux/jump_label.h>
+#include <linux/kabi.h>
 
 #define SP_HUGEPAGE		(1 << 0)
 #define SP_HUGEPAGE_ONLY	(1 << 1)
@@ -172,7 +173,7 @@ struct sp_group_master {
 	 * Used to apply for the shared pool memory of the current process.
 	 * For example, sp_alloc non-share memory or k2task.
 	 */
-	struct sp_group *local;
+	KABI_EXTEND(struct sp_group *local)
 };
 
 /*
-- 
Gitee


From 85d9c2fe58f7fc974131b36e7c12bb1ae85edd15 Mon Sep 17 00:00:00 2001
From: Zhou Guanghui <zhouguanghui1@huawei.com>
Date: Tue, 19 Jul 2022 11:36:12 +0800
Subject: [PATCH 363/674] mm/sharepool: Address space management for sp_group

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5DS9S
CVE: NA

-------------------------------------------------

Separately manage the normal and dvpp address spaces of the
sp_group and set the normal and dvpp address spaces of the
corresponding groups when adding a group, sp_alloc, and k2task.

Signed-off-by: Zhou Guanghui <zhouguanghui1@huawei.com>
Reviewed-by: Weilong Chen <chenweilong@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 include/linux/share_pool.h |   6 +
 mm/share_pool.c            | 296 +++++++++++++++++++++++++++++--------
 2 files changed, 237 insertions(+), 65 deletions(-)

diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h
index b1b81947fa3f..289877dde2fb 100644
--- a/include/linux/share_pool.h
+++ b/include/linux/share_pool.h
@@ -40,6 +40,8 @@
 #define SPG_ID_AUTO_MIN 100000
 #define SPG_ID_AUTO_MAX 199999
 #define SPG_ID_AUTO     200000  /* generate group id automatically */
+#define SPG_ID_LOCAL_MIN	200001
+#define SPG_ID_LOCAL_MAX	299999
 
 #define MAX_DEVID 8	/* the max num of Da-vinci devices */
 
@@ -111,6 +113,10 @@ struct sp_mapping {
 	unsigned long start[MAX_DEVID];
 	unsigned long end[MAX_DEVID];
 	struct rb_root area_root;
+
+	struct rb_node *free_area_cache;
+	unsigned long cached_hole_size;
+	unsigned long cached_vstart;
 };
 
 /* Processes in the same sp_group can share memory.
diff --git a/mm/share_pool.c b/mm/share_pool.c
index 6c70ff72b7af..c3634d62e1fa 100644
--- a/mm/share_pool.c
+++ b/mm/share_pool.c
@@ -147,8 +147,8 @@ static void sp_mapping_range_init(struct sp_mapping *spm)
 
 		if (!is_sp_dev_addr_enabled(i)) {
 			spm->start[i] = MMAP_SHARE_POOL_16G_START +
-				i * MMAP_SHARE_POOL_16G_START;
-			spm->end[i] = spm->start[i] + MMAP_SHARE_POOL_16G_START;
+				i * MMAP_SHARE_POOL_16G_SIZE;
+			spm->end[i] = spm->start[i] + MMAP_SHARE_POOL_16G_SIZE;
 		} else {
 			spm->start[i] = sp_dev_va_start[i];
 			spm->end[i] = spm->start[i] + sp_dev_va_size[i];
@@ -172,10 +172,91 @@ static struct sp_mapping *sp_mapping_create(unsigned long flag)
 	return spm;
 }
 
+static void sp_mapping_destroy(struct sp_mapping *spm)
+{
+	kfree(spm);
+}
+
+static void sp_mapping_attach(struct sp_group *spg, struct sp_mapping *spm)
+{
+	atomic_inc(&spm->user);
+	if (spm->flag & SP_MAPPING_DVPP)
+		spg->dvpp = spm;
+	else if (spm->flag & SP_MAPPING_NORMAL)
+		spg->normal = spm;
+}
+
+static void sp_mapping_detach(struct sp_group *spg, struct sp_mapping *spm)
+{
+	if (spm && atomic_dec_and_test(&spm->user))
+		sp_mapping_destroy(spm);
+}
+
+/*
+ * When you set the address space of a group, the normal address space
+ * is globally unified. When processing the DVPP address space, consider
+ * the following situations:
+ * 1. If a process is added to a non-new group, the DVPP address space
+ *    must have been created. If the local group of the process also
+ *    contains the DVPP address space and they are different, this
+ *    scenario is not allowed to avoid address conflict.
+ * 2. If the DVPP address space does not exist in the local group of the
+ *    process, attach the local group of the process to the DVPP address
+ *    space of the group.
+ * 3. Add a new group. If the process has applied for the dvpp address
+ *    space (sp_alloc or k2u), attach the new group to the dvpp address
+ *    space of the current process.
+ * 4. If the process has not applied for the DVPP address space, attach
+ *    the new group and the local group of the current process to the
+ *     newly created DVPP address space.
+ *
+ * the caller must hold sp_group_sem
+ */
+static int sp_mapping_group_setup(struct mm_struct *mm, struct sp_group *spg)
+{
+	struct sp_group_master *master = mm->sp_group_master;
+	struct sp_group *local = master->local;
+	struct sp_mapping *spm;
+
+	if (!list_empty(&spg->procs)) {
+		/* 1 */
+		if (local->dvpp && local->dvpp != spg->dvpp) {
+			pr_info_ratelimited("Duplicate address space, id=%d\n",
+					spg->id);
+			return 0;
+		}
+
+		/* 2 */
+		if (!local->dvpp) {
+			sp_mapping_attach(local, spg->dvpp);
+			sp_mapping_attach(local, spg->normal);
+		}
+	} else {
+		/* 4 */
+		if (!local->dvpp) {
+			spm = sp_mapping_create(SP_MAPPING_DVPP);
+			if (IS_ERR(spm))
+				return PTR_ERR(spm);
+			sp_mapping_attach(local, spm);
+			sp_mapping_attach(local, sp_mapping_normal);
+		}
+
+		/* 3 */
+		sp_mapping_attach(spg, local->dvpp);
+		sp_mapping_attach(spg, sp_mapping_normal);
+	}
+
+	return 0;
+}
+
+static struct sp_group *create_spg(int spg_id);
+static void free_new_spg_id(bool new, int spg_id);
 /* The caller must hold sp_group_sem */
 static struct sp_group_master *sp_init_group_master_locked(
 	struct mm_struct *mm, bool *exist)
 {
+	int spg_id;
+	struct sp_group *spg;
 	struct sp_group_master *master = mm->sp_group_master;
 
 	if (master) {
@@ -187,16 +268,92 @@ static struct sp_group_master *sp_init_group_master_locked(
 	if (master == NULL)
 		return ERR_PTR(-ENOMEM);
 
+	spg_id = ida_alloc_range(&sp_group_id_ida, SPG_ID_LOCAL_MIN,
+				 SPG_ID_LOCAL_MAX, GFP_ATOMIC);
+	if (spg_id < 0) {
+		kfree(master);
+		pr_err_ratelimited("generate local group id failed %d\n", spg_id);
+		return ERR_PTR(spg_id);
+	}
+
+	spg = create_spg(spg_id);
+	if (IS_ERR(spg)) {
+		free_new_spg_id(true, spg_id);
+		kfree(master);
+		return (struct sp_group_master *)spg;
+	}
+
 	INIT_LIST_HEAD(&master->node_list);
 	master->count = 0;
 	master->stat = NULL;
 	master->mm = mm;
+	master->local = spg;
 	mm->sp_group_master = master;
 
 	*exist = false;
 	return master;
 }
 
+static inline bool is_local_group(int spg_id)
+{
+	return spg_id >= SPG_ID_LOCAL_MIN && spg_id <= SPG_ID_LOCAL_MAX;
+}
+
+/*
+ * If the process is added to a group first, the address space of the local
+ * group of the process must have been set. If the process is not added to
+ * a group, directly create or attach the process to the corresponding DVPP
+ * and normal address space.
+ */
+static int sp_mapping_group_setup_local(struct mm_struct *mm)
+{
+	struct sp_group_master *master;
+	struct sp_mapping *spm;
+	bool exist = false;
+
+	master = sp_init_group_master_locked(mm, &exist);
+	if (IS_ERR(master))
+		return PTR_ERR(master);
+
+	if (master->local->dvpp)
+		return 0;
+
+	spm = sp_mapping_create(SP_MAPPING_DVPP);
+	if (IS_ERR(spm))
+		return PTR_ERR(spm);
+	sp_mapping_attach(master->local, spm);
+	sp_mapping_attach(master->local, sp_mapping_normal);
+
+	return 0;
+}
+
+static struct sp_group *sp_get_local_group(struct mm_struct *mm)
+{
+	int ret;
+	struct sp_group_master *master;
+
+	down_read(&sp_group_sem);
+	master = mm->sp_group_master;
+	if (master && master->local) {
+		atomic_inc(&master->local->use_count);
+		up_read(&sp_group_sem);
+		return master->local;
+	}
+	up_read(&sp_group_sem);
+
+	down_write(&sp_group_sem);
+	ret = sp_mapping_group_setup_local(mm);
+	if (ret) {
+		up_write(&sp_group_sem);
+		return ERR_PTR(ret);
+	}
+	master = mm->sp_group_master;
+	atomic_inc(&master->local->use_count);
+	up_write(&sp_group_sem);
+
+	return master->local;
+}
+
 static struct sp_proc_stat *sp_get_proc_stat(struct mm_struct *mm)
 {
 	struct sp_proc_stat *stat;
@@ -580,7 +737,7 @@ static void spa_inc_usage(struct sp_area *spa)
 	case SPA_TYPE_K2TASK:
 		spa_stat.k2u_task_num += 1;
 		spa_stat.k2u_task_size += size;
-		update_spg_stat_k2u(size, true, spg_none->stat);
+		update_spg_stat_k2u(size, true, spa->spg->stat);
 		break;
 	case SPA_TYPE_K2SPG:
 		spa_stat.k2u_spg_num += 1;
@@ -603,7 +760,7 @@ static void spa_inc_usage(struct sp_area *spa)
 	spa_stat.total_num += 1;
 	spa_stat.total_size += size;
 
-	if (spa->spg != spg_none) {
+	if (!is_local_group(spa->spg->id)) {
 		atomic_inc(&sp_overall_stat.spa_total_num);
 		atomic64_add(size, &sp_overall_stat.spa_total_size);
 	}
@@ -626,7 +783,7 @@ static void spa_dec_usage(struct sp_area *spa)
 	case SPA_TYPE_K2TASK:
 		spa_stat.k2u_task_num -= 1;
 		spa_stat.k2u_task_size -= size;
-		update_spg_stat_k2u(size, false, spg_none->stat);
+		update_spg_stat_k2u(size, false, spa->spg->stat);
 		break;
 	case SPA_TYPE_K2SPG:
 		spa_stat.k2u_spg_num -= 1;
@@ -645,7 +802,7 @@ static void spa_dec_usage(struct sp_area *spa)
 	spa_stat.total_num -= 1;
 	spa_stat.total_size -= size;
 
-	if (spa->spg != spg_none) {
+	if (!is_local_group(spa->spg->id)) {
 		atomic_dec(&sp_overall_stat.spa_total_num);
 		atomic64_sub(spa->real_size, &sp_overall_stat.spa_total_size);
 	}
@@ -730,7 +887,8 @@ static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa,
 static void free_sp_group_id(int spg_id)
 {
 	/* ida operation is protected by an internal spin_lock */
-	if (spg_id >= SPG_ID_AUTO_MIN && spg_id <= SPG_ID_AUTO_MAX)
+	if ((spg_id >= SPG_ID_AUTO_MIN && spg_id <= SPG_ID_AUTO_MAX) ||
+	    (spg_id >= SPG_ID_LOCAL_MIN && spg_id <= SPG_ID_LOCAL_MAX))
 		ida_free(&sp_group_id_ida, spg_id);
 }
 
@@ -747,8 +905,11 @@ static void free_sp_group_locked(struct sp_group *spg)
 	free_spg_stat(spg->id);
 	idr_remove(&sp_group_idr, spg->id);
 	free_sp_group_id((unsigned int)spg->id);
+	sp_mapping_detach(spg, spg->dvpp);
+	sp_mapping_detach(spg, spg->normal);
+	if (!is_local_group(spg->id))
+		system_group_count--;
 	kfree(spg);
-	system_group_count--;
 	WARN(system_group_count < 0, "unexpected group count\n");
 }
 
@@ -981,7 +1142,8 @@ static struct sp_group *create_spg(int spg_id)
 	struct user_struct *user = NULL;
 	int hsize_log = MAP_HUGE_2MB >> MAP_HUGE_SHIFT;
 
-	if (unlikely(system_group_count + 1 == MAX_GROUP_FOR_SYSTEM)) {
+	if (unlikely(system_group_count + 1 == MAX_GROUP_FOR_SYSTEM &&
+		     !is_local_group(spg_id))) {
 		pr_err_ratelimited("reach system max group num\n");
 		return ERR_PTR(-ENOSPC);
 	}
@@ -1028,7 +1190,8 @@ static struct sp_group *create_spg(int spg_id)
 	if (ret < 0)
 		goto out_fput_all;
 
-	system_group_count++;
+	if (!is_local_group(spg_id))
+		system_group_count++;
 	return spg;
 
 out_fput_all:
@@ -1311,6 +1474,10 @@ int mg_sp_group_add_task(int pid, unsigned long prot, int spg_id)
 	if (ret)
 		goto out_drop_group;
 
+	ret = sp_mapping_group_setup(mm, spg);
+	if (ret)
+		goto out_drop_group;
+
 	node = create_spg_node(mm, prot, spg);
 	if (unlikely(IS_ERR(node))) {
 		ret = PTR_ERR(node);
@@ -1592,7 +1759,6 @@ static void __insert_sp_area(struct sp_area *spa)
 
 /* The sp_area cache globals are protected by sp_area_lock */
 static struct rb_node *free_sp_area_cache;
-static unsigned long cached_hole_size;
 static unsigned long cached_vstart;  /* affected by SP_DVPP and sp_config_dvpp_range() */
 
 /**
@@ -1611,11 +1777,12 @@ static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags,
 {
 	struct sp_area *spa, *first, *err;
 	struct rb_node *n;
-	unsigned long vstart = MMAP_SHARE_POOL_START;
-	unsigned long vend = MMAP_SHARE_POOL_16G_START;
+	unsigned long vstart;
+	unsigned long vend;
 	unsigned long addr;
 	unsigned long size_align = ALIGN(size, PMD_SIZE); /* va aligned to 2M */
 	int device_id, node_id;
+	struct sp_mapping *mapping;
 
 	device_id = sp_flags_device_id(flags);
 	node_id = flags & SP_SPEC_NODE_ID ? sp_flags_node_id(flags) : device_id;
@@ -1625,17 +1792,13 @@ static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags,
 		return ERR_PTR(-EINVAL);
 	}
 
-	if ((flags & SP_DVPP)) {
-		if (!is_sp_dev_addr_enabled(device_id)) {
-			vstart = MMAP_SHARE_POOL_16G_START +
-				device_id * MMAP_SHARE_POOL_16G_SIZE;
-			vend = vstart + MMAP_SHARE_POOL_16G_SIZE;
-		} else {
-			vstart = sp_dev_va_start[device_id];
-			vend = vstart + sp_dev_va_size[device_id];
-		}
-	}
+	if (flags & SP_DVPP)
+		mapping = spg->dvpp;
+	else
+		mapping = spg->normal;
 
+	vstart = mapping->start[device_id];
+	vend = mapping->end[device_id];
 	spa = __kmalloc_node(sizeof(struct sp_area), GFP_KERNEL, node_id);
 	if (unlikely(!spa))
 		return ERR_PTR(-ENOMEM);
@@ -1651,18 +1814,18 @@ static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags,
 	 * Note that sp_free_area may update free_sp_area_cache
 	 * without updating cached_hole_size.
 	 */
-	if (!free_sp_area_cache || size_align < cached_hole_size ||
-	    vstart != cached_vstart) {
-		cached_hole_size = 0;
-		free_sp_area_cache = NULL;
+	if (!mapping->free_area_cache || size_align < mapping->cached_hole_size ||
+	    vstart != mapping->cached_vstart) {
+		mapping->cached_hole_size = 0;
+		mapping->free_area_cache = NULL;
 	}
 
 	/* record if we encounter less permissive parameters */
-	cached_vstart = vstart;
+	mapping->cached_vstart = vstart;
 
 	/* find starting point for our search */
-	if (free_sp_area_cache) {
-		first = rb_entry(free_sp_area_cache, struct sp_area, rb_node);
+	if (mapping->free_area_cache) {
+		first = rb_entry(mapping->free_area_cache, struct sp_area, rb_node);
 		addr = first->va_end;
 		if (addr + size_align < addr) {
 			err = ERR_PTR(-EOVERFLOW);
@@ -1675,7 +1838,7 @@ static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags,
 			goto error;
 		}
 
-		n = sp_area_root.rb_node;
+		n = mapping->area_root.rb_node;
 		first = NULL;
 
 		while (n) {
@@ -1697,8 +1860,8 @@ static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags,
 
 	/* from the starting point, traverse areas until a suitable hole is found */
 	while (addr + size_align > first->va_start && addr + size_align <= vend) {
-		if (addr + cached_hole_size < first->va_start)
-			cached_hole_size = first->va_start - addr;
+		if (addr + mapping->cached_hole_size < first->va_start)
+			mapping->cached_hole_size = first->va_start - addr;
 		addr = first->va_end;
 		if (addr + size_align < addr) {
 			err = ERR_PTR(-EOVERFLOW);
@@ -1736,9 +1899,8 @@ static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags,
 
 	spa_inc_usage(spa);
 	__insert_sp_area(spa);
-	free_sp_area_cache = &spa->rb_node;
-	if (spa->spg != spg_none)
-		list_add_tail(&spa->link, &spg->spa_list);
+	mapping->free_area_cache = &spa->rb_node;
+	list_add_tail(&spa->link, &spg->spa_list);
 
 	spin_unlock(&sp_area_lock);
 
@@ -1829,8 +1991,7 @@ static void sp_free_area(struct sp_area *spa)
 		pr_debug("clear spa->kva %ld is not valid\n", spa->kva);
 
 	spa_dec_usage(spa);
-	if (spa->spg != spg_none)
-		list_del(&spa->link);
+	list_del(&spa->link);
 
 	rb_erase(&spa->rb_node, &sp_area_root);
 	RB_CLEAR_NODE(&spa->rb_node);
@@ -1990,7 +2151,7 @@ static void sp_fallocate(struct sp_area *spa)
 
 static void sp_free_unmap_fallocate(struct sp_area *spa)
 {
-	if (spa->spg != spg_none) {
+	if (!is_local_group(spa->spg->id)) {
 		down_read(&spa->spg->rw_lock);
 		__sp_free(spa->spg, spa->va_start, spa_size(spa), NULL);
 		sp_fallocate(spa);
@@ -2195,7 +2356,6 @@ static void trace_sp_alloc_begin(struct sp_alloc_context *ac)
 static void trace_sp_alloc_finish(struct sp_alloc_context *ac, unsigned long va)
 {
 	unsigned long cost;
-	bool is_pass_through = ac->spg == spg_none ? true : false;
 
 	if (!sysctl_sp_perf_alloc)
 		return;
@@ -2207,7 +2367,8 @@ static void trace_sp_alloc_finish(struct sp_alloc_context *ac, unsigned long va)
 	if (cost >= (unsigned long)sysctl_sp_perf_alloc) {
 		pr_err("Task %s(%d/%d) sp_alloc returns 0x%lx consumes %luus, size is %luKB, size_aligned is %luKB, sp_flags is %lx, pass through is %d\n",
 		       current->comm, current->tgid, current->pid,
-		       va, cost, byte2kb(ac->size), byte2kb(ac->size_aligned), ac->sp_flags, is_pass_through);
+		       va, cost, byte2kb(ac->size), byte2kb(ac->size_aligned), ac->sp_flags,
+		       is_local_group(ac->spg->id));
 	}
 }
 
@@ -2265,7 +2426,9 @@ static int sp_alloc_prepare(unsigned long size, unsigned long sp_flags,
 			return -ENODEV;
 		}
 	} else {  /* alocation pass through scene */
-		spg = spg_none;
+		spg = sp_get_local_group(current->mm);
+		if (IS_ERR(spg))
+			return PTR_ERR(spg);
 	}
 
 	if (sp_flags & SP_HUGEPAGE) {
@@ -2288,7 +2451,7 @@ static int sp_alloc_prepare(unsigned long size, unsigned long sp_flags,
 static void sp_alloc_unmap(struct mm_struct *mm, struct sp_area *spa,
 	struct sp_group_node *spg_node)
 {
-	if (spa->spg != spg_none)
+	if (!is_local_group(spa->spg->id))
 		__sp_free(spa->spg, spa->va_start, spa->real_size, mm);
 }
 
@@ -2353,7 +2516,7 @@ static int sp_alloc_mmap(struct mm_struct *mm, struct sp_area *spa,
 	return ret;
 
 unmap:
-	if (spa->spg != spg_none)
+	if (!is_local_group(spa->spg->id))
 		sp_alloc_unmap(list_next_entry(spg_node, proc_node)->master->mm, spa, spg_node);
 	else
 		sp_munmap(mm, spa->va_start, spa->real_size);
@@ -2456,7 +2619,7 @@ static int __sp_alloc_mmap_populate(struct mm_struct *mm, struct sp_area *spa,
 	ret = sp_alloc_populate(mm, spa, ac);
 	if (ret) {
 err:
-		if (spa->spg != spg_none)
+		if (!is_local_group(spa->spg->id))
 			sp_alloc_unmap(list_next_entry(spg_node, proc_node)->master->mm, spa, spg_node);
 		else
 			sp_munmap(mm, spa->va_start, spa->real_size);
@@ -2482,7 +2645,7 @@ static int sp_alloc_mmap_populate(struct sp_area *spa,
 	struct mm_struct *mm;
 	struct sp_group_node *spg_node;
 
-	if (spa->spg == spg_none) {
+	if (is_local_group(spa->spg->id)) {
 		ret = __sp_alloc_mmap_populate(current->mm, spa, NULL, ac);
 	} else {
 		/* create mapping for each process in the group */
@@ -2506,10 +2669,9 @@ static void sp_alloc_finish(int result, struct sp_area *spa,
 	struct sp_alloc_context *ac)
 {
 	struct sp_group *spg = ac->spg;
-	bool is_pass_through = spg == spg_none ? true : false;
 
-	/* match sp_alloc_check_prepare */
-	if (!is_pass_through)
+	/* match sp_alloc_prepare */
+	if (!is_local_group(spg->id))
 		up_read(&spg->rw_lock);
 
 	if (!result)
@@ -2521,9 +2683,7 @@ static void sp_alloc_finish(int result, struct sp_area *spa,
 		trace_sp_alloc_finish(ac, spa->va_start);
 	}
 
-	if (!is_pass_through)
-		sp_group_drop(spg);
-
+	sp_group_drop(spg);
 	sp_dump_stack();
 	sp_try_to_compact();
 }
@@ -2705,22 +2865,33 @@ static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa,
  */
 static void *sp_make_share_kva_to_task(unsigned long kva, unsigned long size, unsigned long sp_flags)
 {
+	int ret;
 	void *uva;
 	struct sp_area *spa;
 	struct spg_proc_stat *stat;
 	unsigned long prot = PROT_READ | PROT_WRITE;
 	struct sp_k2u_context kc;
+	struct sp_group *spg;
 
 	down_write(&sp_group_sem);
-	stat = sp_init_process_stat(current, current->mm, spg_none);
-	up_write(&sp_group_sem);
+	ret = sp_mapping_group_setup_local(current->mm);
+	if (ret) {
+		up_write(&sp_group_sem);
+		pr_err_ratelimited("k2u_task init local mapping failed %d\n", ret);
+		return ERR_PTR(ret);
+	}
+
+	spg = current->mm->sp_group_master->local;
+	stat = sp_init_process_stat(current, current->mm, spg);
 	if (IS_ERR(stat)) {
+		up_write(&sp_group_sem);
 		pr_err_ratelimited("k2u_task init process stat failed %lx\n",
 				PTR_ERR(stat));
 		return stat;
 	}
+	up_write(&sp_group_sem);
 
-	spa = sp_alloc_area(size, sp_flags, spg_none, SPA_TYPE_K2TASK, current->tgid);
+	spa = sp_alloc_area(size, sp_flags, spg, SPA_TYPE_K2TASK, current->tgid);
 	if (IS_ERR(spa)) {
 		pr_err_ratelimited("alloc spa failed in k2u_task (potential no enough virtual memory when -75): %ld\n",
 				PTR_ERR(spa));
@@ -3916,7 +4087,7 @@ static void rb_spa_stat_show(struct seq_file *seq)
 		atomic_inc(&spa->use_count);
 		spin_unlock(&sp_area_lock);
 
-		if (spa->spg == spg_none)  /* k2u to task */
+		if (is_local_group(spa->spg->id))  /* k2u to task */
 			seq_printf(seq, "%-10s ", "None");
 		else {
 			down_read(&spa->spg->rw_lock);
@@ -4446,6 +4617,9 @@ void sp_group_post_exit(struct mm_struct *mm)
 		kfree(spg_node);
 	}
 	up_write(&sp_group_sem);
+
+	if (master->local)
+		sp_group_drop(master->local);
 	kfree(master);
 }
 
@@ -4477,17 +4651,9 @@ static int __init share_pool_init(void)
 	if (!sp_is_enabled())
 		return 0;
 
-	/* lockless, as init kthread has no sp operation else */
-	spg_none = create_spg(GROUP_NONE);
-	/* without free spg_none, not a serious problem */
-	if (IS_ERR(spg_none) || !spg_none)
-		goto fail;
-
 	sp_mapping_normal = sp_mapping_create(SP_MAPPING_NORMAL);
-	if (IS_ERR(sp_mapping_normal)) {
-		sp_group_drop(spg_none);
+	if (IS_ERR(sp_mapping_normal))
 		goto fail;
-	}
 	atomic_inc(&sp_mapping_normal->user);
 
 	sp_device_number_detect();
-- 
Gitee


From 561d77b6ceb86ef4b7ec0d0d393657b3a3feea15 Mon Sep 17 00:00:00 2001
From: Zhou Guanghui <zhouguanghui1@huawei.com>
Date: Tue, 19 Jul 2022 11:36:13 +0800
Subject: [PATCH 364/674] mm/sharepool: Add an interface to obtain an id

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5DS9S
CVE: NA

-------------------------------------------------

The DVPP address space is per process or per sharing group.
During sp_free and unshare, you need to know which address
space the current address belongs to.

Signed-off-by: Zhou Guanghui <zhouguanghui1@huawei.com>
Reviewed-by: Weilong Chen <chenweilong@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 include/linux/share_pool.h | 12 ++++++++++++
 mm/share_pool.c            | 37 +++++++++++++++++++++++++++++++++++++
 2 files changed, 49 insertions(+)

diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h
index 289877dde2fb..077816a6fb3b 100644
--- a/include/linux/share_pool.h
+++ b/include/linux/share_pool.h
@@ -284,6 +284,9 @@ extern bool mg_is_sharepool_addr(unsigned long addr);
 extern int mg_sp_group_add_task(int pid, unsigned long prot, int spg_id);
 extern int sp_group_add_task(int pid, int spg_id);
 
+extern int sp_id_of_current(void);
+extern int mg_sp_id_of_current(void);
+
 extern void sp_area_drop(struct vm_area_struct *vma);
 extern int sp_group_exit(struct mm_struct *mm);
 extern void sp_group_post_exit(struct mm_struct *mm);
@@ -431,6 +434,15 @@ static inline int mg_sp_unshare(unsigned long va, unsigned long size)
 	return -EPERM;
 }
 
+static inline int sp_id_of_current(void)
+{
+	return -EPERM;
+}
+
+static inline int mg_sp_id_of_current(void)
+{
+	return -EPERM;
+}
 
 static inline void sp_init_mm(struct mm_struct *mm)
 {
diff --git a/mm/share_pool.c b/mm/share_pool.c
index c3634d62e1fa..39ecde90c01b 100644
--- a/mm/share_pool.c
+++ b/mm/share_pool.c
@@ -1734,6 +1734,43 @@ int sp_group_del_task(int pid, int spg_id)
 }
 EXPORT_SYMBOL_GPL(sp_group_del_task);
 
+int sp_id_of_current(void)
+{
+	int ret, spg_id;
+	struct sp_group_master *master;
+
+	if (current->flags & PF_KTHREAD || !current->mm)
+		return -EINVAL;
+
+	down_read(&sp_group_sem);
+	master = current->mm->sp_group_master;
+	if (master && master->local) {
+		spg_id = master->local->id;
+		up_read(&sp_group_sem);
+		return spg_id;
+	}
+	up_read(&sp_group_sem);
+
+	down_write(&sp_group_sem);
+	ret = sp_mapping_group_setup_local(current->mm);
+	if (ret) {
+		up_write(&sp_group_sem);
+		return ret;
+	}
+	master = current->mm->sp_group_master;
+	spg_id = master->local->id;
+	up_write(&sp_group_sem);
+
+	return spg_id;
+}
+EXPORT_SYMBOL_GPL(sp_id_of_current);
+
+int mg_sp_id_of_current(void)
+{
+	return sp_id_of_current();
+}
+EXPORT_SYMBOL_GPL(mg_sp_id_of_current);
+
 /* the caller must hold sp_area_lock */
 static void __insert_sp_area(struct sp_area *spa)
 {
-- 
Gitee


From 7de67bbc1aa0cf7b7ca7408505ae5c174fa96abc Mon Sep 17 00:00:00 2001
From: Zhou Guanghui <zhouguanghui1@huawei.com>
Date: Tue, 19 Jul 2022 11:36:14 +0800
Subject: [PATCH 365/674] mm/sharepool: Release the sp addr based on the id

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5DS9S
CVE: NA

-------------------------------------------------

The address space of the DVPP is managed by group. When releasing
the shared pool memory, you need to find the corresponding address
space based on the ID.

Signed-off-by: Zhou Guanghui <zhouguanghui1@huawei.com>
Reviewed-by: Weilong Chen <chenweilong@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 include/linux/share_pool.h |  12 +-
 mm/share_pool.c            | 235 ++++++++++++++++++-------------------
 2 files changed, 122 insertions(+), 125 deletions(-)

diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h
index 077816a6fb3b..8eb964230fd4 100644
--- a/include/linux/share_pool.h
+++ b/include/linux/share_pool.h
@@ -250,8 +250,8 @@ extern int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns,
 extern void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id);
 extern void *mg_sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id);
 
-extern int sp_free(unsigned long addr);
-extern int mg_sp_free(unsigned long addr);
+extern int sp_free(unsigned long addr, int id);
+extern int mg_sp_free(unsigned long addr, int id);
 
 extern void *sp_make_share_k2u(unsigned long kva, unsigned long size,
 			unsigned long sp_flags, int pid, int spg_id);
@@ -262,7 +262,7 @@ extern void *sp_make_share_u2k(unsigned long uva, unsigned long size, int pid);
 extern void *mg_sp_make_share_u2k(unsigned long uva, unsigned long size, int pid);
 
 extern int sp_unshare(unsigned long va, unsigned long size, int pid, int spg_id);
-extern int mg_sp_unshare(unsigned long va, unsigned long size);
+extern int mg_sp_unshare(unsigned long va, unsigned long size, int id);
 
 extern int sp_walk_page_range(unsigned long uva, unsigned long size,
 	struct task_struct *tsk, struct sp_walk_data *sp_walk_data);
@@ -392,12 +392,12 @@ static inline void *mg_sp_alloc(unsigned long size, unsigned long sp_flags, int
 	return NULL;
 }
 
-static inline int sp_free(unsigned long addr)
+static inline int sp_free(unsigned long addr, int id)
 {
 	return -EPERM;
 }
 
-static inline int mg_sp_free(unsigned long addr)
+static inline int mg_sp_free(unsigned long addr, int id)
 {
 	return -EPERM;
 }
@@ -429,7 +429,7 @@ static inline int sp_unshare(unsigned long va, unsigned long size, int pid, int
 	return -EPERM;
 }
 
-static inline int mg_sp_unshare(unsigned long va, unsigned long size)
+static inline int mg_sp_unshare(unsigned long va, unsigned long size, int id)
 {
 	return -EPERM;
 }
diff --git a/mm/share_pool.c b/mm/share_pool.c
index 39ecde90c01b..45cdb77a4299 100644
--- a/mm/share_pool.c
+++ b/mm/share_pool.c
@@ -642,12 +642,6 @@ static void free_spg_stat(int spg_id)
 	kfree(stat);
 }
 
-/*
- * Group '0' for k2u_task and pass through. No process will be actually
- * added to.
- */
-static struct sp_group *spg_none;
-
 /* statistics of all sp area, protected by sp_area_lock */
 struct sp_spa_stat {
 	unsigned int total_num;
@@ -944,26 +938,6 @@ static int get_task(int pid, struct task_struct **task)
 	return 0;
 }
 
-static struct sp_group *get_first_group(struct mm_struct *mm)
-{
-	struct sp_group *spg = NULL;
-	struct sp_group_master *master = mm->sp_group_master;
-
-	if (master && master->count >= 1) {
-		struct sp_group_node *spg_node = NULL;
-
-		spg_node = list_first_entry(&master->node_list,
-					struct sp_group_node, group_node);
-		spg = spg_node->spg;
-
-		/* don't revive a dead group */
-		if (!spg || !atomic_inc_not_zero(&spg->use_count))
-			spg = NULL;
-	}
-
-	return spg;
-}
-
 /*
  * the caller must:
  * 1. hold spg->rw_lock
@@ -988,35 +962,27 @@ static struct sp_group *__sp_find_spg_locked(int pid, int spg_id)
 	struct task_struct *tsk = NULL;
 	int ret = 0;
 
-	ret = get_task(pid, &tsk);
-	if (ret)
-		return NULL;
-
 	if (spg_id == SPG_ID_DEFAULT) {
-		/*
-		 * Once we encounter a concurrency problem here.
-		 * To fix it, we believe get_task_mm() and mmput() is too
-		 * heavy because we just get the pointer of sp_group.
-		 */
+		ret = get_task(pid, &tsk);
+		if (ret)
+			return NULL;
+
 		task_lock(tsk);
 		if (tsk->mm == NULL)
 			spg = NULL;
-		else
-			spg = get_first_group(tsk->mm);
+		else if (tsk->mm->sp_group_master)
+			spg = tsk->mm->sp_group_master->local;
 		task_unlock(tsk);
+
+		put_task_struct(tsk);
 	} else {
 		spg = idr_find(&sp_group_idr, spg_id);
-		/* don't revive a dead group */
-		if (!spg || !atomic_inc_not_zero(&spg->use_count))
-			goto fail;
 	}
 
-	put_task_struct(tsk);
-	return spg;
+	if (!spg || !atomic_inc_not_zero(&spg->use_count))
+		return NULL;
 
-fail:
-	put_task_struct(tsk);
-	return NULL;
+	return spg;
 }
 
 static struct sp_group *__sp_find_spg(int pid, int spg_id)
@@ -1772,9 +1738,9 @@ int mg_sp_id_of_current(void)
 EXPORT_SYMBOL_GPL(mg_sp_id_of_current);
 
 /* the caller must hold sp_area_lock */
-static void __insert_sp_area(struct sp_area *spa)
+static void __insert_sp_area(struct sp_mapping *spm, struct sp_area *spa)
 {
-	struct rb_node **p = &sp_area_root.rb_node;
+	struct rb_node **p = &spm->area_root.rb_node;
 	struct rb_node *parent = NULL;
 
 	while (*p) {
@@ -1791,13 +1757,9 @@ static void __insert_sp_area(struct sp_area *spa)
 	}
 
 	rb_link_node(&spa->rb_node, parent, p);
-	rb_insert_color(&spa->rb_node, &sp_area_root);
+	rb_insert_color(&spa->rb_node, &spm->area_root);
 }
 
-/* The sp_area cache globals are protected by sp_area_lock */
-static struct rb_node *free_sp_area_cache;
-static unsigned long cached_vstart;  /* affected by SP_DVPP and sp_config_dvpp_range() */
-
 /**
  * sp_alloc_area() - Allocate a region of VA from the share pool.
  * @size: the size of VA to allocate.
@@ -1845,10 +1807,10 @@ static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags,
 	/*
 	 * Invalidate cache if we have more permissive parameters.
 	 * cached_hole_size notes the largest hole noticed _below_
-	 * the sp_area cached in free_sp_area_cache: if size fits
+	 * the sp_area cached in free_area_cache: if size fits
 	 * into that hole, we want to scan from vstart to reuse
-	 * the hole instead of allocating above free_sp_area_cache.
-	 * Note that sp_free_area may update free_sp_area_cache
+	 * the hole instead of allocating above free_area_cache.
+	 * Note that sp_free_area may update free_area_cache
 	 * without updating cached_hole_size.
 	 */
 	if (!mapping->free_area_cache || size_align < mapping->cached_hole_size ||
@@ -1935,7 +1897,7 @@ static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags,
 	spa->device_id = device_id;
 
 	spa_inc_usage(spa);
-	__insert_sp_area(spa);
+	__insert_sp_area(mapping, spa);
 	mapping->free_area_cache = &spa->rb_node;
 	list_add_tail(&spa->link, &spg->spa_list);
 
@@ -1950,9 +1912,15 @@ static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags,
 }
 
 /* the caller should hold sp_area_lock */
-static struct sp_area *__find_sp_area_locked(unsigned long addr)
+static struct sp_area *__find_sp_area_locked(struct sp_group *spg,
+		unsigned long addr)
 {
-	struct rb_node *n = sp_area_root.rb_node;
+	struct rb_node *n;
+
+	if (addr >= MMAP_SHARE_POOL_START && addr < MMAP_SHARE_POOL_16G_START)
+		n = spg->normal->area_root.rb_node;
+	else
+		n = spg->dvpp->area_root.rb_node;
 
 	while (n) {
 		struct sp_area *spa;
@@ -1970,12 +1938,12 @@ static struct sp_area *__find_sp_area_locked(unsigned long addr)
 	return NULL;
 }
 
-static struct sp_area *__find_sp_area(unsigned long addr)
+static struct sp_area *__find_sp_area(struct sp_group *spg, unsigned long addr)
 {
 	struct sp_area *n;
 
 	spin_lock(&sp_area_lock);
-	n = __find_sp_area_locked(addr);
+	n = __find_sp_area_locked(spg, addr);
 	if (n)
 		atomic_inc(&n->use_count);
 	spin_unlock(&sp_area_lock);
@@ -2000,22 +1968,30 @@ static bool vmalloc_area_clr_flag(unsigned long kva, unsigned long flags)
  */
 static void sp_free_area(struct sp_area *spa)
 {
+	unsigned long addr = spa->va_start;
+	struct sp_mapping *spm;
+
 	lockdep_assert_held(&sp_area_lock);
 
-	if (free_sp_area_cache) {
+	if (addr >= MMAP_SHARE_POOL_START && addr < MMAP_SHARE_POOL_16G_START)
+		spm = spa->spg->normal;
+	else
+		spm = spa->spg->dvpp;
+
+	if (spm->free_area_cache) {
 		struct sp_area *cache;
 
-		cache = rb_entry(free_sp_area_cache, struct sp_area, rb_node);
+		cache = rb_entry(spm->free_area_cache, struct sp_area, rb_node);
 		if (spa->va_start <= cache->va_start) {
-			free_sp_area_cache = rb_prev(&spa->rb_node);
+			spm->free_area_cache = rb_prev(&spa->rb_node);
 			/*
 			 * the new cache node may be changed to another region,
 			 * i.e. from DVPP region to normal region
 			 */
-			if (free_sp_area_cache) {
-				cache = rb_entry(free_sp_area_cache,
+			if (spm->free_area_cache) {
+				cache = rb_entry(spm->free_area_cache,
 						 struct sp_area, rb_node);
-				cached_vstart = cache->region_vstart;
+				spm->cached_vstart = cache->region_vstart;
 			}
 			/*
 			 * We don't try to update cached_hole_size,
@@ -2030,7 +2006,7 @@ static void sp_free_area(struct sp_area *spa)
 	spa_dec_usage(spa);
 	list_del(&spa->link);
 
-	rb_erase(&spa->rb_node, &sp_area_root);
+	rb_erase(&spa->rb_node, &spm->area_root);
 	RB_CLEAR_NODE(&spa->rb_node);
 	kfree(spa);
 }
@@ -2072,7 +2048,7 @@ void sp_area_drop(struct vm_area_struct *vma)
 	 * an atomic operation.
 	 */
 	spin_lock(&sp_area_lock);
-	spa = __find_sp_area_locked(vma->vm_start);
+	spa = __find_sp_area_locked(vma->vm_mm->sp_group_master->local, vma->vm_start);
 	__sp_area_drop_locked(spa);
 	spin_unlock(&sp_area_lock);
 }
@@ -2204,7 +2180,7 @@ static int sp_check_caller_permission(struct sp_group *spg, struct mm_struct *mm
 	int ret = 0;
 
 	down_read(&spg->rw_lock);
-	if (!is_process_in_group(spg, mm))
+	if (!is_local_group(spg->id) && !is_process_in_group(spg, mm))
 		ret = -EPERM;
 	up_read(&spg->rw_lock);
 	return ret;
@@ -2217,6 +2193,7 @@ struct sp_free_context {
 	unsigned long addr;
 	struct sp_area *spa;
 	int state;
+	int spg_id;
 };
 
 /* when success, __sp_area_drop(spa) should be used */
@@ -2225,10 +2202,18 @@ static int sp_free_get_spa(struct sp_free_context *fc)
 	int ret = 0;
 	unsigned long addr = fc->addr;
 	struct sp_area *spa;
+	struct sp_group *spg;
+
+	spg = __sp_find_spg(current->tgid, fc->spg_id);
+	if (!spg) {
+		pr_debug("sp free get group failed %d\n", fc->spg_id);
+		return -EINVAL;
+	}
 
 	fc->state = FREE_CONT;
 
-	spa = __find_sp_area(addr);
+	spa = __find_sp_area(spg, addr);
+	sp_group_drop(spg);
 	if (!spa) {
 		pr_debug("sp free invalid input addr %lx\n", addr);
 		return -EINVAL;
@@ -2241,46 +2226,37 @@ static int sp_free_get_spa(struct sp_free_context *fc)
 	}
 	fc->spa = spa;
 
-	if (spa->spg != spg_none) {
-		/*
-		 * Access control: an sp addr can only be freed by
-		 * 1. another task in the same spg
-		 * 2. a kthread
-		 *
-		 * a passthrough addr can only be freed by the applier process
-		 */
-		if (!current->mm)
-			goto check_spa;
+	if (!current->mm)
+		goto check_spa;
 
-		ret = sp_check_caller_permission(spa->spg, current->mm);
-		if (ret < 0)
-			goto drop_spa;
+	ret = sp_check_caller_permission(spa->spg, current->mm);
+	if (ret < 0)
+		goto drop_spa;
 
 check_spa:
-		down_write(&spa->spg->rw_lock);
-		if (!spg_valid(spa->spg)) {
-			fc->state = FREE_END;
-			up_write(&spa->spg->rw_lock);
-			goto drop_spa;
-			/* we must return success(0) in this situation */
-		}
-		/* the life cycle of spa has a direct relation with sp group */
-		if (unlikely(spa->is_dead)) {
-			up_write(&spa->spg->rw_lock);
-			pr_err_ratelimited("unexpected double sp free\n");
-			dump_stack();
-			ret = -EINVAL;
-			goto drop_spa;
-		}
-		spa->is_dead = true;
-		up_write(&spa->spg->rw_lock);
+	if (is_local_group(spa->spg->id) && (current->tgid != spa->applier)) {
+		ret = -EPERM;
+		goto drop_spa;
+	}
 
-	} else {
-		if (current->tgid != spa->applier) {
-			ret = -EPERM;
-			goto drop_spa;
-		}
+	down_write(&spa->spg->rw_lock);
+	if (!spg_valid(spa->spg)) {
+		fc->state = FREE_END;
+		up_write(&spa->spg->rw_lock);
+		goto drop_spa;
+		/* we must return success(0) in this situation */
+	}
+	/* the life cycle of spa has a direct relation with sp group */
+	if (unlikely(spa->is_dead)) {
+		up_write(&spa->spg->rw_lock);
+		pr_err_ratelimited("unexpected double sp free\n");
+		dump_stack();
+		ret = -EINVAL;
+		goto drop_spa;
 	}
+	spa->is_dead = true;
+	up_write(&spa->spg->rw_lock);
+
 	return 0;
 
 drop_spa:
@@ -2291,21 +2267,26 @@ static int sp_free_get_spa(struct sp_free_context *fc)
 /**
  * sp_free() - Free the memory allocated by sp_alloc().
  * @addr: the starting VA of the memory.
+ * @id: Address space identifier, which is used to distinguish the addr.
  *
  * Return:
  * * 0		- success.
  * * -EINVAL	- the memory can't be found or was not allocted by share pool.
  * * -EPERM	- the caller has no permision to free the memory.
  */
-int sp_free(unsigned long addr)
+int sp_free(unsigned long addr, int id)
 {
 	int ret = 0;
 	struct sp_free_context fc = {
 		.addr = addr,
+		.spg_id = id,
 	};
 
 	check_interrupt_context();
 
+	if (current->flags & PF_KTHREAD)
+		return -EINVAL;
+
 	ret = sp_free_get_spa(&fc);
 	if (ret || fc.state == FREE_END)
 		goto out;
@@ -2326,9 +2307,9 @@ int sp_free(unsigned long addr)
 }
 EXPORT_SYMBOL_GPL(sp_free);
 
-int mg_sp_free(unsigned long addr)
+int mg_sp_free(unsigned long addr, int id)
 {
-	return sp_free(addr);
+	return sp_free(addr, id);
 }
 EXPORT_SYMBOL_GPL(mg_sp_free);
 
@@ -2422,6 +2403,11 @@ static int sp_alloc_prepare(unsigned long size, unsigned long sp_flags,
 	if (enable_mdc_default_group)
 		spg_id = mdc_default_group_id;
 
+	if (current->flags & PF_KTHREAD) {
+		pr_err_ratelimited("allocation failed, task is kthread\n");
+		return -EINVAL;
+	}
+
 	if (unlikely(!size || (size >> PAGE_SHIFT) > totalram_pages())) {
 		pr_err_ratelimited("allocation failed, invalid size %lu\n", size);
 		return -EINVAL;
@@ -2462,7 +2448,7 @@ static int sp_alloc_prepare(unsigned long size, unsigned long sp_flags,
 			pr_err_ratelimited("allocation failed, task not in group\n");
 			return -ENODEV;
 		}
-	} else {  /* alocation pass through scene */
+	} else {  /* allocation pass through scene */
 		spg = sp_get_local_group(current->mm);
 		if (IS_ERR(spg))
 			return PTR_ERR(spg);
@@ -3493,7 +3479,7 @@ EXPORT_SYMBOL_GPL(mg_sp_make_share_u2k);
  *
  * This also means we must trust DVPP channel destroy and guard worker code.
  */
-static int sp_unshare_uva(unsigned long uva, unsigned long size)
+static int sp_unshare_uva(unsigned long uva, unsigned long size, int group_id)
 {
 	int ret = 0;
 	struct mm_struct *mm;
@@ -3501,14 +3487,21 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size)
 	unsigned long uva_aligned;
 	unsigned long size_aligned;
 	unsigned int page_size;
+	struct sp_group *spg;
+
+	spg = __sp_find_spg(current->tgid, group_id);
+	if (!spg) {
+		pr_debug("sp unshare find group failed %d\n", group_id);
+		return -EINVAL;
+	}
 
 	/*
 	 * at first we guess it's a hugepage addr
 	 * we can tolerate at most PMD_SIZE or PAGE_SIZE which is matched in k2u
 	 */
-	spa = __find_sp_area(ALIGN_DOWN(uva, PMD_SIZE));
+	spa = __find_sp_area(spg, ALIGN_DOWN(uva, PMD_SIZE));
 	if (!spa) {
-		spa = __find_sp_area(ALIGN_DOWN(uva, PAGE_SIZE));
+		spa = __find_sp_area(spg, ALIGN_DOWN(uva, PAGE_SIZE));
 		if (!spa) {
 			ret = -EINVAL;
 			pr_debug("invalid input uva %lx in unshare uva\n", (unsigned long)uva);
@@ -3639,6 +3632,7 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size)
 out_drop_area:
 	__sp_area_drop(spa);
 out:
+	sp_group_drop(spg);
 	return ret;
 }
 
@@ -3702,9 +3696,12 @@ int sp_unshare(unsigned long va, unsigned long size, int pid, int spg_id)
 
 	check_interrupt_context();
 
+	if (current->flags & PF_KTHREAD)
+		return -EINVAL;
+
 	if (va < TASK_SIZE) {
 		/* user address */
-		ret = sp_unshare_uva(va, size);
+		ret = sp_unshare_uva(va, size, spg_id);
 	} else if (va >= PAGE_OFFSET) {
 		/* kernel address */
 		ret = sp_unshare_kva(va, size);
@@ -3718,9 +3715,9 @@ int sp_unshare(unsigned long va, unsigned long size, int pid, int spg_id)
 }
 EXPORT_SYMBOL_GPL(sp_unshare);
 
-int mg_sp_unshare(unsigned long va, unsigned long size)
+int mg_sp_unshare(unsigned long va, unsigned long size, int id)
 {
-	return sp_unshare(va, size, 0, 0);
+	return sp_unshare(va, size, 0, id);
 }
 EXPORT_SYMBOL_GPL(mg_sp_unshare);
 
@@ -3880,8 +3877,8 @@ int sp_node_id(struct vm_area_struct *vma)
 	if (!sp_is_enabled())
 		return node_id;
 
-	if (vma) {
-		spa = __find_sp_area(vma->vm_start);
+	if (vma && vma->vm_flags & VM_SHARE_POOL) {
+		spa = __find_sp_area(vma->vm_mm->sp_group_master->local, vma->vm_start);
 		if (spa) {
 			node_id = spa->node_id;
 			__sp_area_drop(spa);
@@ -4047,7 +4044,7 @@ static void print_process_prot(struct seq_file *seq, unsigned long prot)
 		seq_puts(seq, "R");
 	else if (prot == (PROT_READ | PROT_WRITE))
 		seq_puts(seq, "RW");
-	else  /* e.g. spg_none */
+	else
 		seq_puts(seq, "-");
 }
 
@@ -4448,7 +4445,7 @@ vm_fault_t sharepool_no_page(struct mm_struct *mm,
 	int node_id;
 	struct sp_area *spa;
 
-	spa = __find_sp_area(vma->vm_start);
+	spa = __find_sp_area(mm->sp_group_master->local, vma->vm_start);
 	if (!spa) {
 		pr_err("share pool: vma is invalid, not from sp mmap\n");
 		return ret;
-- 
Gitee


From c70421912716bd11b5bd2f18e3f537e7e20935a1 Mon Sep 17 00:00:00 2001
From: Zhou Guanghui <zhouguanghui1@huawei.com>
Date: Tue, 19 Jul 2022 11:36:15 +0800
Subject: [PATCH 366/674] mm/sharepool: Share pool statistics adaption

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5DS9S
CVE: NA

-------------------------------------------------

The management of the address space is adjusted, and the statistical
data processing of the shared pool needs to be adapted.

Signed-off-by: Zhou Guanghui <zhouguanghui1@huawei.com>
Signed-off-by: Zhang Jian <zhangjian210@huawei.com>
Reviewed-by: Weilong Chen <chenweilong@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 mm/share_pool.c | 69 ++++++++++++++++++++++++++++---------------------
 1 file changed, 40 insertions(+), 29 deletions(-)

diff --git a/mm/share_pool.c b/mm/share_pool.c
index 45cdb77a4299..e673b05bda5a 100644
--- a/mm/share_pool.c
+++ b/mm/share_pool.c
@@ -699,7 +699,6 @@ struct sp_area {
 	int device_id;
 };
 static DEFINE_SPINLOCK(sp_area_lock);
-static struct rb_root sp_area_root = RB_ROOT;
 
 static unsigned long spa_size(struct sp_area *spa)
 {
@@ -4106,14 +4105,13 @@ int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns,
 	return 0;
 }
 
-static void rb_spa_stat_show(struct seq_file *seq)
+static void spa_stat_of_mapping_show(struct seq_file *seq, struct sp_mapping *spm)
 {
 	struct rb_node *node;
 	struct sp_area *spa, *prev = NULL;
 
 	spin_lock(&sp_area_lock);
-
-	for (node = rb_first(&sp_area_root); node; node = rb_next(node)) {
+	for (node = rb_first(&spm->area_root); node; node = rb_next(node)) {
 		__sp_area_drop_locked(prev);
 
 		spa = rb_entry(node, struct sp_area, rb_node);
@@ -4121,16 +4119,12 @@ static void rb_spa_stat_show(struct seq_file *seq)
 		atomic_inc(&spa->use_count);
 		spin_unlock(&sp_area_lock);
 
-		if (is_local_group(spa->spg->id))  /* k2u to task */
-			seq_printf(seq, "%-10s ", "None");
-		else {
-			down_read(&spa->spg->rw_lock);
-			if (spg_valid(spa->spg))  /* k2u to group */
-				seq_printf(seq, "%-10d ", spa->spg->id);
-			else  /* spg is dead */
-				seq_printf(seq, "%-10s ", "Dead");
-			up_read(&spa->spg->rw_lock);
-		}
+		down_read(&spa->spg->rw_lock);
+		if (spg_valid(spa->spg))  /* k2u to group */
+			seq_printf(seq, "%-10d ", spa->spg->id);
+		else  /* spg is dead */
+			seq_printf(seq, "%-10s ", "Dead");
+		up_read(&spa->spg->rw_lock);
 
 		seq_printf(seq, "%2s%-14lx %2s%-14lx %-10ld ",
 			   "0x", spa->va_start,
@@ -4166,6 +4160,30 @@ static void rb_spa_stat_show(struct seq_file *seq)
 	spin_unlock(&sp_area_lock);
 }
 
+static void spa_normal_stat_show(struct seq_file *seq)
+{
+	spa_stat_of_mapping_show(seq, sp_mapping_normal);
+}
+
+static int idr_spg_dvpp_stat_show_cb(int id, void *p, void *data)
+{
+	struct sp_group *spg = p;
+	struct seq_file *seq = data;
+
+	if (!is_local_group(spg->id) || atomic_read(&spg->dvpp->user) == 1)
+		spa_stat_of_mapping_show(seq, spg->dvpp);
+
+	return 0;
+}
+
+static void spa_dvpp_stat_show(struct seq_file *seq)
+{
+	down_read(&sp_group_sem);
+	idr_for_each(&sp_group_idr, idr_spg_dvpp_stat_show_cb, seq);
+	up_read(&sp_group_sem);
+}
+
+
 void spa_overview_show(struct seq_file *seq)
 {
 	unsigned int total_num, alloc_num, k2u_task_num, k2u_spg_num;
@@ -4219,12 +4237,11 @@ static int idr_spg_stat_cb(int id, void *p, void *data)
 	struct sp_spg_stat *s = p;
 	struct seq_file *seq = data;
 
-	if (seq != NULL) {
-		if (id == 0)
-			seq_puts(seq, "Non Group ");
-		else
-			seq_printf(seq, "Group %6d ", id);
+	if (is_local_group(id) && atomic64_read(&s->size) == 0)
+		return 0;
 
+	if (seq != NULL) {
+		seq_printf(seq, "Group %6d ", id);
 		seq_printf(seq, "size: %lld KB, spa num: %d, total alloc: %lld KB, normal alloc: %lld KB, huge alloc: %lld KB\n",
 			   byte2kb(atomic64_read(&s->size)),
 			   atomic_read(&s->spa_num),
@@ -4232,11 +4249,7 @@ static int idr_spg_stat_cb(int id, void *p, void *data)
 			   byte2kb(atomic64_read(&s->alloc_nsize)),
 			   byte2kb(atomic64_read(&s->alloc_hsize)));
 	} else {
-		if (id == 0)
-			pr_info("Non Group ");
-		else
-			pr_info("Group %6d ", id);
-
+		pr_info("Group %6d ", id);
 		pr_info("size: %lld KB, spa num: %d, total alloc: %lld KB, normal alloc: %lld KB, huge alloc: %lld KB\n",
 			byte2kb(atomic64_read(&s->size)),
 			atomic_read(&s->spa_num),
@@ -4280,7 +4293,8 @@ static int spa_stat_show(struct seq_file *seq, void *offset)
 	/* print the file header */
 	seq_printf(seq, "%-10s %-16s %-16s %-10s %-7s %-5s %-8s %-8s\n",
 		   "Group ID", "va_start", "va_end", "Size(KB)", "Type", "Huge", "PID", "Ref");
-	rb_spa_stat_show(seq);
+	spa_normal_stat_show(seq);
+	spa_dvpp_stat_show(seq);
 	return 0;
 }
 
@@ -4317,10 +4331,7 @@ static int idr_proc_stat_cb(int id, void *p, void *data)
 		prot = get_process_prot_locked(id, mm);
 
 		seq_printf(seq, "%-8d ", tgid);
-		if (id == 0)
-			seq_printf(seq, "%-8c ", '-');
-		else
-			seq_printf(seq, "%-8d ", id);
+		seq_printf(seq, "%-8d ", id);
 		seq_printf(seq, "%-9ld %-9ld %-9ld %-10ld %-10ld %-8ld %-7ld %-7ld %-10ld ",
 			   get_spg_proc_alloc(spg_proc_stat),
 			   get_spg_proc_k2u(spg_proc_stat),
-- 
Gitee


From 331c3817722767fafe762073c5bec5e5cd99c5f8 Mon Sep 17 00:00:00 2001
From: Wang Wensheng <wangwensheng4@huawei.com>
Date: Tue, 19 Jul 2022 11:36:16 +0800
Subject: [PATCH 367/674] mm/sharepool: Use vm_private_data to store the spa

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5DS9S
CVE: NA

-------------------------------------------------

When we destroy a vma, we first find the spa depending on the
vma->vm_start, during which we should hold the sp_area_lock.
While we store the spa in vma, we can get the spa directly.
Don't worry if the spa exists or if it's to be freed soon, since
we have increaced the refcount for the spa when it's mappend into
a vma.

Signed-off-by: Wang Wensheng <wangwensheng4@huawei.com>
Reviewed-by: Weilong Chen <chenweilong@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 mm/share_pool.c | 45 ++++++++++++++++-----------------------------
 1 file changed, 16 insertions(+), 29 deletions(-)

diff --git a/mm/share_pool.c b/mm/share_pool.c
index e673b05bda5a..c7b185739b61 100644
--- a/mm/share_pool.c
+++ b/mm/share_pool.c
@@ -855,7 +855,7 @@ static inline bool check_aoscore_process(struct task_struct *tsk)
 
 static unsigned long sp_mmap(struct mm_struct *mm, struct file *file,
 			     struct sp_area *spa, unsigned long *populate,
-			     unsigned long prot);
+			     unsigned long prot, struct vm_area_struct **pvma);
 static void sp_munmap(struct mm_struct *mm, unsigned long addr, unsigned long size);
 
 #define K2U_NORMAL	0
@@ -1504,7 +1504,7 @@ int mg_sp_group_add_task(int pid, unsigned long prot, int spg_id)
 			break;
 		}
 
-		addr = sp_mmap(mm, file, spa, &populate, prot);
+		addr = sp_mmap(mm, file, spa, &populate, prot, NULL);
 		if (IS_ERR_VALUE(addr)) {
 			sp_munmap_task_areas(mm, spg, &spa->link);
 			up_write(&mm->mmap_lock);
@@ -2034,8 +2034,6 @@ static void __sp_area_drop(struct sp_area *spa)
 
 void sp_area_drop(struct vm_area_struct *vma)
 {
-	struct sp_area *spa;
-
 	if (!(vma->vm_flags & VM_SHARE_POOL))
 		return;
 
@@ -2047,8 +2045,7 @@ void sp_area_drop(struct vm_area_struct *vma)
 	 * an atomic operation.
 	 */
 	spin_lock(&sp_area_lock);
-	spa = __find_sp_area_locked(vma->vm_mm->sp_group_master->local, vma->vm_start);
-	__sp_area_drop_locked(spa);
+	__sp_area_drop_locked(vma->vm_private_data);
 	spin_unlock(&sp_area_lock);
 }
 
@@ -2315,7 +2312,7 @@ EXPORT_SYMBOL_GPL(mg_sp_free);
 /* wrapper of __do_mmap() and the caller must hold down_write(&mm->mmap_lock). */
 static unsigned long sp_mmap(struct mm_struct *mm, struct file *file,
 			     struct sp_area *spa, unsigned long *populate,
-			     unsigned long prot)
+			     unsigned long prot, struct vm_area_struct **pvma)
 {
 	unsigned long addr = spa->va_start;
 	unsigned long size = spa_size(spa);
@@ -2323,6 +2320,7 @@ static unsigned long sp_mmap(struct mm_struct *mm, struct file *file,
 			      MAP_SHARE_POOL;
 	unsigned long vm_flags = VM_NORESERVE | VM_SHARE_POOL | VM_DONTCOPY;
 	unsigned long pgoff = addr_offset(spa) >> PAGE_SHIFT;
+	struct vm_area_struct *vma;
 
 	/* Mark the mapped region to be locked. After the MAP_LOCKED is enable,
 	 * multiple tasks will preempt resources, causing performance loss.
@@ -2338,8 +2336,13 @@ static unsigned long sp_mmap(struct mm_struct *mm, struct file *file,
 		pr_err("do_mmap fails %ld\n", addr);
 	} else {
 		BUG_ON(addr != spa->va_start);
+		vma = find_vma(mm, addr);
+		vma->vm_private_data = spa;
+		if (pvma)
+			*pvma = vma;
 	}
 
+
 	return addr;
 }
 
@@ -2484,7 +2487,6 @@ static int sp_alloc_mmap(struct mm_struct *mm, struct sp_area *spa,
 	unsigned long mmap_addr;
 	/* pass through default permission */
 	unsigned long prot = PROT_READ | PROT_WRITE;
-	unsigned long sp_addr = spa->va_start;
 	unsigned long populate = 0;
 	struct vm_area_struct *vma;
 
@@ -2503,7 +2505,7 @@ static int sp_alloc_mmap(struct mm_struct *mm, struct sp_area *spa,
 		prot = PROT_READ;
 
 	/* when success, mmap_addr == spa->va_start */
-	mmap_addr = sp_mmap(mm, spa_file(spa), spa, &populate, prot);
+	mmap_addr = sp_mmap(mm, spa_file(spa), spa, &populate, prot, &vma);
 	if (IS_ERR_VALUE(mmap_addr)) {
 		up_write(&mm->mmap_lock);
 		sp_alloc_unmap(mm, spa, spg_node);
@@ -2519,14 +2521,6 @@ static int sp_alloc_mmap(struct mm_struct *mm, struct sp_area *spa,
 	}
 	ac->populate = populate;
 
-	vma = find_vma(mm, sp_addr);
-	if (unlikely(!vma)) {
-		up_write(&mm->mmap_lock);
-		WARN(1, "allocation failed, can't find %lx vma\n", sp_addr);
-		ret = -EINVAL;
-		goto unmap;
-	}
-
 	if (ac->sp_flags & SP_PROT_RO)
 		vma->vm_flags &= ~VM_MAYWRITE;
 
@@ -2825,15 +2819,12 @@ static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa,
 	if (kc && kc->sp_flags & SP_PROT_RO)
 		prot = PROT_READ;
 
-	ret_addr = sp_mmap(mm, spa_file(spa), spa, &populate, prot);
+	ret_addr = sp_mmap(mm, spa_file(spa), spa, &populate, prot, &vma);
 	if (IS_ERR_VALUE(ret_addr)) {
 		pr_debug("k2u mmap failed %lx\n", ret_addr);
 		goto put_mm;
 	}
-	BUG_ON(ret_addr != spa->va_start);
 
-	vma = find_vma(mm, ret_addr);
-	BUG_ON(vma == NULL);
 	if (prot & PROT_WRITE)
 		vma->vm_page_prot = __pgprot(((~PTE_RDONLY) & vma->vm_page_prot.pgprot) | PTE_DIRTY);
 
@@ -3876,12 +3867,9 @@ int sp_node_id(struct vm_area_struct *vma)
 	if (!sp_is_enabled())
 		return node_id;
 
-	if (vma && vma->vm_flags & VM_SHARE_POOL) {
-		spa = __find_sp_area(vma->vm_mm->sp_group_master->local, vma->vm_start);
-		if (spa) {
-			node_id = spa->node_id;
-			__sp_area_drop(spa);
-		}
+	if (vma && vma->vm_flags & VM_SHARE_POOL && vma->vm_private_data) {
+		spa = vma->vm_private_data;
+		node_id = spa->node_id;
 	}
 
 	return node_id;
@@ -4456,13 +4444,12 @@ vm_fault_t sharepool_no_page(struct mm_struct *mm,
 	int node_id;
 	struct sp_area *spa;
 
-	spa = __find_sp_area(mm->sp_group_master->local, vma->vm_start);
+	spa = vma->vm_private_data;
 	if (!spa) {
 		pr_err("share pool: vma is invalid, not from sp mmap\n");
 		return ret;
 	}
 	node_id = spa->node_id;
-	__sp_area_drop(spa);
 
 retry:
 	page = find_lock_page(mapping, idx);
-- 
Gitee


From 447951e3d1f83b41cb91a3fb332eacf734b6714a Mon Sep 17 00:00:00 2001
From: Wang Wensheng <wangwensheng4@huawei.com>
Date: Tue, 19 Jul 2022 11:36:17 +0800
Subject: [PATCH 368/674] mm/sharepool: Unify the memory allocation process

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5DS9S
CVE: NA

-------------------------------------------------

There are two types of memory allocated from sharepool: passthrough
memory for DVPP and shared memory. Currently, we branch to different
routines depending on the memory type, both during the allocation and
free process. Since we have already create a local group for passthrough
memory, with just one step ahead, we could drop the redundant branches
in allocation and free process and in all the fallback process when an
error occurs.

Here is the content of this patch:
1. Add erery process to its local group when initilizing its group_master.
2. Avoid to return the local group in find_sp_group_id_by_pid().
3. Delete the redundant branches during allocation and free process.

Signed-off-by: Wang Wensheng <wangwensheng4@huawei.com>
Reviewed-by: Weilong Chen <chenweilong@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 mm/share_pool.c | 139 ++++++++++++++++++++++++++++++------------------
 1 file changed, 87 insertions(+), 52 deletions(-)

diff --git a/mm/share_pool.c b/mm/share_pool.c
index c7b185739b61..b627c9347f78 100644
--- a/mm/share_pool.c
+++ b/mm/share_pool.c
@@ -249,13 +249,15 @@ static int sp_mapping_group_setup(struct mm_struct *mm, struct sp_group *spg)
 	return 0;
 }
 
+static void free_sp_group_locked(struct sp_group *spg);
+static int local_group_add_task(struct mm_struct *mm, struct sp_group *spg);
 static struct sp_group *create_spg(int spg_id);
 static void free_new_spg_id(bool new, int spg_id);
 /* The caller must hold sp_group_sem */
 static struct sp_group_master *sp_init_group_master_locked(
 	struct mm_struct *mm, bool *exist)
 {
-	int spg_id;
+	int spg_id, ret;
 	struct sp_group *spg;
 	struct sp_group_master *master = mm->sp_group_master;
 
@@ -271,16 +273,15 @@ static struct sp_group_master *sp_init_group_master_locked(
 	spg_id = ida_alloc_range(&sp_group_id_ida, SPG_ID_LOCAL_MIN,
 				 SPG_ID_LOCAL_MAX, GFP_ATOMIC);
 	if (spg_id < 0) {
-		kfree(master);
 		pr_err_ratelimited("generate local group id failed %d\n", spg_id);
-		return ERR_PTR(spg_id);
+		ret = spg_id;
+		goto free_master;
 	}
 
 	spg = create_spg(spg_id);
 	if (IS_ERR(spg)) {
-		free_new_spg_id(true, spg_id);
-		kfree(master);
-		return (struct sp_group_master *)spg;
+		ret = PTR_ERR(spg);
+		goto free_spg_id;
 	}
 
 	INIT_LIST_HEAD(&master->node_list);
@@ -290,8 +291,20 @@ static struct sp_group_master *sp_init_group_master_locked(
 	master->local = spg;
 	mm->sp_group_master = master;
 
+	ret = local_group_add_task(mm, spg);
+	if (ret < 0)
+		goto free_spg;
+
 	*exist = false;
 	return master;
+
+free_spg:
+	free_sp_group_locked(spg);
+free_spg_id:
+	free_new_spg_id(true, spg_id);
+free_master:
+	kfree(master);
+	return ERR_PTR(ret);
 }
 
 static inline bool is_local_group(int spg_id)
@@ -670,6 +683,8 @@ static struct sp_overall_stat sp_overall_stat;
 
 enum spa_type {
 	SPA_TYPE_ALLOC = 1,
+	/* NOTE: reorganize after the statisical structure is reconstructed. */
+	SPA_TYPE_ALLOC_PRIVATE = SPA_TYPE_ALLOC,
 	SPA_TYPE_K2TASK,
 	SPA_TYPE_K2SPG,
 };
@@ -1037,7 +1052,7 @@ EXPORT_SYMBOL_GPL(sp_group_id_by_pid);
  */
 int mg_sp_group_id_by_pid(int pid, int *spg_ids, int *num)
 {
-	int ret = 0;
+	int ret = 0, real_count;
 	struct sp_group_node *node;
 	struct sp_group_master *master = NULL;
 	struct task_struct *tsk;
@@ -1062,18 +1077,28 @@ int mg_sp_group_id_by_pid(int pid, int *spg_ids, int *num)
 		goto out_up_read;
 	}
 
-	if (!master->count) {
+	/*
+	 * There is a local group for each process which is used for
+	 * passthrough allocation. The local group is a internal
+	 * implementation for convenience and is not attempt to bother
+	 * the user.
+	 */
+	real_count = master->count - 1;
+	if (real_count <= 0) {
 		ret = -ENODEV;
 		goto out_up_read;
 	}
-	if ((unsigned int)*num < master->count) {
+	if ((unsigned int)*num < real_count) {
 		ret = -E2BIG;
 		goto out_up_read;
 	}
-	*num = master->count;
+	*num = real_count;
 
-	list_for_each_entry(node, &master->node_list, group_node)
+	list_for_each_entry(node, &master->node_list, group_node) {
+		if (is_local_group(node->spg->id))
+			continue;
 		*(spg_ids++) = node->spg->id;
+	}
 
 out_up_read:
 	up_read(&sp_group_sem);
@@ -1245,7 +1270,7 @@ static int mm_add_group_init(struct mm_struct *mm, struct sp_group *spg)
 		return -EEXIST;
 	}
 
-	if (master->count + 1 == MAX_GROUP_FOR_TASK) {
+	if (master->count == MAX_GROUP_FOR_TASK) {
 		pr_err("task reaches max group num\n");
 		return -ENOSPC;
 	}
@@ -1289,6 +1314,29 @@ static int insert_spg_node(struct sp_group *spg, struct sp_group_node *node)
 	return 0;
 }
 
+static int local_group_add_task(struct mm_struct *mm, struct sp_group *spg)
+{
+	struct sp_group_node *node;
+	struct spg_proc_stat *stat;
+
+	node = create_spg_node(mm, PROT_READ | PROT_WRITE, spg);
+	if (IS_ERR(node))
+		return PTR_ERR(node);
+
+	/* use current just to avoid compile error, rebuild in following patch */
+	stat = sp_init_process_stat(current, mm, spg);
+	if (IS_ERR(stat)) {
+		free_sp_group_locked(spg);
+		pr_err_ratelimited("init process stat failed %lx\n", PTR_ERR(stat));
+		return PTR_ERR(stat);
+	}
+
+	insert_spg_node(spg, node);
+	mmget(mm);
+
+	return 0;
+}
+
 /* the caller must down_write(&spg->rw_lock) */
 static void delete_spg_node(struct sp_group *spg, struct sp_group_node *node)
 {
@@ -2160,15 +2208,10 @@ static void sp_fallocate(struct sp_area *spa)
 
 static void sp_free_unmap_fallocate(struct sp_area *spa)
 {
-	if (!is_local_group(spa->spg->id)) {
-		down_read(&spa->spg->rw_lock);
-		__sp_free(spa->spg, spa->va_start, spa_size(spa), NULL);
-		sp_fallocate(spa);
-		up_read(&spa->spg->rw_lock);
-	} else {
-		sp_munmap(current->mm, spa->va_start, spa_size(spa));
-		sp_fallocate(spa);
-	}
+	down_read(&spa->spg->rw_lock);
+	__sp_free(spa->spg, spa->va_start, spa_size(spa), NULL);
+	sp_fallocate(spa);
+	up_read(&spa->spg->rw_lock);
 }
 
 static int sp_check_caller_permission(struct sp_group *spg, struct mm_struct *mm)
@@ -2176,9 +2219,10 @@ static int sp_check_caller_permission(struct sp_group *spg, struct mm_struct *mm
 	int ret = 0;
 
 	down_read(&spg->rw_lock);
-	if (!is_local_group(spg->id) && !is_process_in_group(spg, mm))
+	if (!is_process_in_group(spg, mm))
 		ret = -EPERM;
 	up_read(&spg->rw_lock);
+
 	return ret;
 }
 
@@ -2363,6 +2407,7 @@ struct sp_alloc_context {
 	struct timespec64 start;
 	struct timespec64 end;
 	bool have_mbind;
+	enum spa_type type;
 };
 
 static void trace_sp_alloc_begin(struct sp_alloc_context *ac)
@@ -2450,10 +2495,13 @@ static int sp_alloc_prepare(unsigned long size, unsigned long sp_flags,
 			pr_err_ratelimited("allocation failed, task not in group\n");
 			return -ENODEV;
 		}
+		ac->type = SPA_TYPE_ALLOC;
 	} else {  /* allocation pass through scene */
 		spg = sp_get_local_group(current->mm);
 		if (IS_ERR(spg))
 			return PTR_ERR(spg);
+		down_read(&spg->rw_lock);
+		ac->type = SPA_TYPE_ALLOC_PRIVATE;
 	}
 
 	if (sp_flags & SP_HUGEPAGE) {
@@ -2476,8 +2524,7 @@ static int sp_alloc_prepare(unsigned long size, unsigned long sp_flags,
 static void sp_alloc_unmap(struct mm_struct *mm, struct sp_area *spa,
 	struct sp_group_node *spg_node)
 {
-	if (!is_local_group(spa->spg->id))
-		__sp_free(spa->spg, spa->va_start, spa->real_size, mm);
+	__sp_free(spa->spg, spa->va_start, spa->real_size, mm);
 }
 
 static int sp_alloc_mmap(struct mm_struct *mm, struct sp_area *spa,
@@ -2532,10 +2579,7 @@ static int sp_alloc_mmap(struct mm_struct *mm, struct sp_area *spa,
 	return ret;
 
 unmap:
-	if (!is_local_group(spa->spg->id))
-		sp_alloc_unmap(list_next_entry(spg_node, proc_node)->master->mm, spa, spg_node);
-	else
-		sp_munmap(mm, spa->va_start, spa->real_size);
+	sp_alloc_unmap(list_next_entry(spg_node, proc_node)->master->mm, spa, spg_node);
 	return ret;
 }
 
@@ -2635,10 +2679,7 @@ static int __sp_alloc_mmap_populate(struct mm_struct *mm, struct sp_area *spa,
 	ret = sp_alloc_populate(mm, spa, ac);
 	if (ret) {
 err:
-		if (!is_local_group(spa->spg->id))
-			sp_alloc_unmap(list_next_entry(spg_node, proc_node)->master->mm, spa, spg_node);
-		else
-			sp_munmap(mm, spa->va_start, spa->real_size);
+		sp_alloc_unmap(list_next_entry(spg_node, proc_node)->master->mm, spa, spg_node);
 
 		if (unlikely(fatal_signal_pending(current)))
 			pr_warn_ratelimited("allocation failed, current thread is killed\n");
@@ -2661,34 +2702,30 @@ static int sp_alloc_mmap_populate(struct sp_area *spa,
 	struct mm_struct *mm;
 	struct sp_group_node *spg_node;
 
-	if (is_local_group(spa->spg->id)) {
-		ret = __sp_alloc_mmap_populate(current->mm, spa, NULL, ac);
-	} else {
-		/* create mapping for each process in the group */
-		list_for_each_entry(spg_node, &spa->spg->procs, proc_node) {
-			mm = spg_node->master->mm;
-			mmap_ret = __sp_alloc_mmap_populate(mm, spa, spg_node, ac);
-			if (mmap_ret) {
-				if (ac->state != ALLOC_COREDUMP)
-					return mmap_ret;
-				ac->state = ALLOC_NORMAL;
-				continue;
-			}
-			ret = mmap_ret;
+	/* create mapping for each process in the group */
+	list_for_each_entry(spg_node, &spa->spg->procs, proc_node) {
+		mm = spg_node->master->mm;
+		mmap_ret = __sp_alloc_mmap_populate(mm, spa, spg_node, ac);
+		if (mmap_ret) {
+			if (ac->state != ALLOC_COREDUMP)
+				return mmap_ret;
+			ac->state = ALLOC_NORMAL;
+			continue;
 		}
+		ret = mmap_ret;
 	}
+
 	return ret;
 }
 
 /* spa maybe an error pointer, so introduce variable spg */
 static void sp_alloc_finish(int result, struct sp_area *spa,
-	struct sp_alloc_context *ac)
+		struct sp_alloc_context *ac)
 {
 	struct sp_group *spg = ac->spg;
 
 	/* match sp_alloc_prepare */
-	if (!is_local_group(spg->id))
-		up_read(&spg->rw_lock);
+	up_read(&spg->rw_lock);
 
 	if (!result)
 		sp_update_process_stat(current, true, spa);
@@ -2728,7 +2765,7 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id)
 
 try_again:
 	spa = sp_alloc_area(ac.size_aligned, ac.sp_flags, ac.spg,
-			    SPA_TYPE_ALLOC, current->tgid);
+			    ac.type, current->tgid);
 	if (IS_ERR(spa)) {
 		pr_err_ratelimited("alloc spa failed in allocation(potential no enough virtual memory when -75): %ld\n",
 			PTR_ERR(spa));
@@ -4650,8 +4687,6 @@ void sp_group_post_exit(struct mm_struct *mm)
 	}
 	up_write(&sp_group_sem);
 
-	if (master->local)
-		sp_group_drop(master->local);
 	kfree(master);
 }
 
-- 
Gitee


From 283d8d877377f78b6403e7f95abc7683394390e2 Mon Sep 17 00:00:00 2001
From: Wang Wensheng <wangwensheng4@huawei.com>
Date: Tue, 19 Jul 2022 11:36:18 +0800
Subject: [PATCH 369/674] mm/sharepool: Clear the initialization of
 sp-associated structure for a process

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5DS9S
CVE: NA

-------------------------------------------------

A few structures must have been created when a process want to get into
sharepool subsystem, including allocating sharepool memory, being added
into a spg or doing k2u and so on.

Currently we create those structures just before we actually need them.
For example, we find or create a sp_spa_stat after a successful memory
allocation and before updating the statistical structure. The creation of
a new structure may fail due to oom and we should then reclaim the
memory allocated and revert all the process before. Or we just forget to
do that and a potential memory-leak occurs. This design makes it
confused when we indeed create a structure and we always worry about
potential memory-leak when we changes the code around it.

A better solution is to initialize all that structures at the same time
when a process join in sharepool subsystem. And in future, we will clear
the unnecessary statistical structures.

Signed-off-by: Wang Wensheng <wangwensheng4@huawei.com>
Reviewed-by: Weilong Chen <chenweilong@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 include/linux/share_pool.h |   4 -
 mm/share_pool.c            | 278 ++++++++++++++++---------------------
 2 files changed, 116 insertions(+), 166 deletions(-)

diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h
index 8eb964230fd4..51710669b1a7 100644
--- a/include/linux/share_pool.h
+++ b/include/linux/share_pool.h
@@ -507,10 +507,6 @@ static inline struct sp_proc_stat *sp_get_proc_stat_ref(struct mm_struct *mm)
 	return NULL;
 }
 
-static inline void sp_proc_stat_drop(struct sp_proc_stat *stat)
-{
-}
-
 static inline void spa_overview_show(struct seq_file *seq)
 {
 }
diff --git a/mm/share_pool.c b/mm/share_pool.c
index b627c9347f78..75339e9d130e 100644
--- a/mm/share_pool.c
+++ b/mm/share_pool.c
@@ -249,33 +249,22 @@ static int sp_mapping_group_setup(struct mm_struct *mm, struct sp_group *spg)
 	return 0;
 }
 
-static void free_sp_group_locked(struct sp_group *spg);
-static int local_group_add_task(struct mm_struct *mm, struct sp_group *spg);
 static struct sp_group *create_spg(int spg_id);
 static void free_new_spg_id(bool new, int spg_id);
-/* The caller must hold sp_group_sem */
-static struct sp_group_master *sp_init_group_master_locked(
-	struct mm_struct *mm, bool *exist)
+static void free_sp_group_locked(struct sp_group *spg);
+static int local_group_add_task(struct mm_struct *mm, struct sp_group *spg);
+static int init_local_group(struct mm_struct *mm)
 {
 	int spg_id, ret;
 	struct sp_group *spg;
+	struct sp_mapping *spm;
 	struct sp_group_master *master = mm->sp_group_master;
 
-	if (master) {
-		*exist = true;
-		return master;
-	}
-
-	master = kmalloc(sizeof(struct sp_group_master), GFP_KERNEL);
-	if (master == NULL)
-		return ERR_PTR(-ENOMEM);
-
 	spg_id = ida_alloc_range(&sp_group_id_ida, SPG_ID_LOCAL_MIN,
 				 SPG_ID_LOCAL_MAX, GFP_ATOMIC);
 	if (spg_id < 0) {
 		pr_err_ratelimited("generate local group id failed %d\n", spg_id);
-		ret = spg_id;
-		goto free_master;
+		return spg_id;
 	}
 
 	spg = create_spg(spg_id);
@@ -284,60 +273,73 @@ static struct sp_group_master *sp_init_group_master_locked(
 		goto free_spg_id;
 	}
 
-	INIT_LIST_HEAD(&master->node_list);
-	master->count = 0;
-	master->stat = NULL;
-	master->mm = mm;
 	master->local = spg;
-	mm->sp_group_master = master;
+	spm = sp_mapping_create(SP_MAPPING_DVPP);
+	if (IS_ERR(spm)) {
+		ret = PTR_ERR(spm);
+		goto free_spg;
+	}
+	sp_mapping_attach(master->local, spm);
+	sp_mapping_attach(master->local, sp_mapping_normal);
 
 	ret = local_group_add_task(mm, spg);
 	if (ret < 0)
+		/* The spm would be released while destroying the spg*/
 		goto free_spg;
 
-	*exist = false;
-	return master;
+	return 0;
 
 free_spg:
 	free_sp_group_locked(spg);
+	master->local = NULL;
 free_spg_id:
 	free_new_spg_id(true, spg_id);
-free_master:
-	kfree(master);
-	return ERR_PTR(ret);
-}
 
-static inline bool is_local_group(int spg_id)
-{
-	return spg_id >= SPG_ID_LOCAL_MIN && spg_id <= SPG_ID_LOCAL_MAX;
+	return ret;
 }
 
-/*
- * If the process is added to a group first, the address space of the local
- * group of the process must have been set. If the process is not added to
- * a group, directly create or attach the process to the corresponding DVPP
- * and normal address space.
- */
-static int sp_mapping_group_setup_local(struct mm_struct *mm)
+static void sp_proc_stat_drop(struct sp_proc_stat *stat);
+static int sp_init_proc_stat(struct mm_struct *mm, struct task_struct *tsk);
+/* The caller must hold sp_group_sem */
+static int sp_init_group_master_locked(struct task_struct *tsk, struct mm_struct *mm)
 {
+	int ret;
 	struct sp_group_master *master;
-	struct sp_mapping *spm;
-	bool exist = false;
-
-	master = sp_init_group_master_locked(mm, &exist);
-	if (IS_ERR(master))
-		return PTR_ERR(master);
 
-	if (master->local->dvpp)
+	if (mm->sp_group_master)
 		return 0;
 
-	spm = sp_mapping_create(SP_MAPPING_DVPP);
-	if (IS_ERR(spm))
-		return PTR_ERR(spm);
-	sp_mapping_attach(master->local, spm);
-	sp_mapping_attach(master->local, sp_mapping_normal);
+	master = kmalloc(sizeof(struct sp_group_master), GFP_KERNEL);
+	if (!master)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&master->node_list);
+	master->count = 0;
+	master->mm = mm;
+	mm->sp_group_master = master;
+
+	ret = sp_init_proc_stat(mm, tsk);
+	if (ret)
+		goto free_master;
+
+	ret = init_local_group(mm);
+	if (ret)
+		goto put_stat;
 
 	return 0;
+
+put_stat:
+	sp_proc_stat_drop(master->stat);
+free_master:
+	mm->sp_group_master = NULL;
+	kfree(master);
+
+	return ret;
+}
+
+static inline bool is_local_group(int spg_id)
+{
+	return spg_id >= SPG_ID_LOCAL_MIN && spg_id <= SPG_ID_LOCAL_MAX;
 }
 
 static struct sp_group *sp_get_local_group(struct mm_struct *mm)
@@ -355,7 +357,7 @@ static struct sp_group *sp_get_local_group(struct mm_struct *mm)
 	up_read(&sp_group_sem);
 
 	down_write(&sp_group_sem);
-	ret = sp_mapping_group_setup_local(mm);
+	ret = sp_init_group_master_locked(current, mm);
 	if (ret) {
 		up_write(&sp_group_sem);
 		return ERR_PTR(ret);
@@ -403,37 +405,29 @@ static struct sp_proc_stat *create_proc_stat(struct mm_struct *mm,
 	return stat;
 }
 
-static struct sp_proc_stat *sp_init_proc_stat(struct sp_group_master *master,
-	struct mm_struct *mm, struct task_struct *tsk)
+static int sp_init_proc_stat(struct mm_struct *mm, struct task_struct *tsk)
 {
 	struct sp_proc_stat *stat;
 	int alloc_id, tgid = tsk->tgid;
-
-	down_write(&sp_proc_stat_sem);
-	stat = master->stat;
-	if (stat) {
-		up_write(&sp_proc_stat_sem);
-		return stat;
-	}
+	struct sp_group_master *master = mm->sp_group_master;
 
 	stat = create_proc_stat(mm, tsk);
-	if (IS_ERR(stat)) {
-		up_write(&sp_proc_stat_sem);
-		return stat;
-	}
+	if (IS_ERR(stat))
+		return PTR_ERR(stat);
 
+	down_write(&sp_proc_stat_sem);
 	alloc_id = idr_alloc(&sp_proc_stat_idr, stat, tgid, tgid + 1, GFP_KERNEL);
 	if (alloc_id < 0) {
 		up_write(&sp_proc_stat_sem);
 		pr_err_ratelimited("proc stat idr alloc failed %d\n", alloc_id);
 		kfree(stat);
-		return ERR_PTR(alloc_id);
+		return alloc_id;
 	}
 
 	master->stat = stat;
 	up_write(&sp_proc_stat_sem);
 
-	return stat;
+	return 0;
 }
 
 static void update_spg_stat_alloc(unsigned long size, bool inc,
@@ -547,18 +541,14 @@ static struct spg_proc_stat *create_spg_proc_stat(int tgid, int spg_id)
 	return stat;
 }
 
-static struct spg_proc_stat *sp_init_spg_proc_stat(
-	struct sp_proc_stat *proc_stat, int tgid, struct sp_group *spg)
+static struct spg_proc_stat *sp_init_spg_proc_stat(struct sp_proc_stat *proc_stat,
+						   struct sp_group *spg)
 {
 	struct spg_proc_stat *stat;
 	int spg_id = spg->id;  /* visit spg id locklessly */
 	struct sp_spg_stat *spg_stat = spg->stat;
 
-	stat = find_spg_proc_stat(proc_stat, tgid, spg_id);
-	if (stat)
-		return stat;
-
-	stat = create_spg_proc_stat(tgid, spg_id);
+	stat = create_spg_proc_stat(proc_stat->tgid, spg_id);
 	if (IS_ERR(stat))
 		return stat;
 
@@ -575,31 +565,6 @@ static struct spg_proc_stat *sp_init_spg_proc_stat(
 	return stat;
 }
 
-/*
- * The caller must
- * 1. ensure no concurrency problem for task_struct and mm_struct.
- * 2. hold sp_group_sem for sp_group_master (pay attention to ABBA deadlock)
- */
-static struct spg_proc_stat *sp_init_process_stat(struct task_struct *tsk,
-	struct mm_struct *mm, struct sp_group *spg)
-{
-	struct sp_group_master *master;
-	bool exist;
-	struct sp_proc_stat *proc_stat;
-	struct spg_proc_stat *spg_proc_stat;
-
-	master = sp_init_group_master_locked(mm, &exist);
-	if (IS_ERR(master))
-		return (struct spg_proc_stat *)master;
-
-	proc_stat = sp_init_proc_stat(master, mm, tsk);
-	if (IS_ERR(proc_stat))
-		return (struct spg_proc_stat *)proc_stat;
-
-	spg_proc_stat = sp_init_spg_proc_stat(proc_stat, tsk->tgid, spg);
-	return spg_proc_stat;
-}
-
 static struct sp_spg_stat *create_spg_stat(int spg_id)
 {
 	struct sp_spg_stat *stat;
@@ -846,9 +811,9 @@ static void sp_update_process_stat(struct task_struct *tsk, bool inc,
 	enum spa_type type = spa->type;
 
 	down_write(&sp_group_sem);
-	stat = sp_init_process_stat(tsk, tsk->mm, spa->spg);
+	stat = find_spg_proc_stat(tsk->mm->sp_group_master->stat, tsk->tgid, spa->spg->id);
 	up_write(&sp_group_sem);
-	if (unlikely(IS_ERR(stat)))
+	if (!stat)
 		return;
 
 	update_spg_proc_stat(size, inc, stat, type);
@@ -1253,26 +1218,27 @@ static void sp_munmap_task_areas(struct mm_struct *mm, struct sp_group *spg, str
 }
 
 /* the caller must hold sp_group_sem */
-static int mm_add_group_init(struct mm_struct *mm, struct sp_group *spg)
+static int mm_add_group_init(struct task_struct *tsk, struct mm_struct *mm,
+			     struct sp_group *spg)
 {
-	struct sp_group_master *master = mm->sp_group_master;
-	bool exist = false;
-
-	master = sp_init_group_master_locked(mm, &exist);
-	if (IS_ERR(master))
-		return PTR_ERR(master);
-
-	if (!exist)
-		return 0;
+	int ret;
+	struct sp_group_master *master;
 
-	if (is_process_in_group(spg, mm)) {
-		pr_err_ratelimited("task already in target group, id=%d\n", spg->id);
-		return -EEXIST;
-	}
+	if (!mm->sp_group_master) {
+		ret = sp_init_group_master_locked(tsk, mm);
+		if (ret)
+			return ret;
+	} else {
+		if (is_process_in_group(spg, mm)) {
+			pr_err_ratelimited("task already in target group, id=%d\n", spg->id);
+			return -EEXIST;
+		}
 
-	if (master->count == MAX_GROUP_FOR_TASK) {
-		pr_err("task reaches max group num\n");
-		return -ENOSPC;
+		master = mm->sp_group_master;
+		if (master->count == MAX_GROUP_FOR_TASK) {
+			pr_err("task reaches max group num\n");
+			return -ENOSPC;
+		}
 	}
 
 	return 0;
@@ -1311,29 +1277,13 @@ static int insert_spg_node(struct sp_group *spg, struct sp_group_node *node)
 
 	spg->proc_num++;
 	list_add_tail(&node->proc_node, &spg->procs);
-	return 0;
-}
-
-static int local_group_add_task(struct mm_struct *mm, struct sp_group *spg)
-{
-	struct sp_group_node *node;
-	struct spg_proc_stat *stat;
-
-	node = create_spg_node(mm, PROT_READ | PROT_WRITE, spg);
-	if (IS_ERR(node))
-		return PTR_ERR(node);
-
-	/* use current just to avoid compile error, rebuild in following patch */
-	stat = sp_init_process_stat(current, mm, spg);
-	if (IS_ERR(stat)) {
-		free_sp_group_locked(spg);
-		pr_err_ratelimited("init process stat failed %lx\n", PTR_ERR(stat));
-		return PTR_ERR(stat);
-	}
-
-	insert_spg_node(spg, node);
-	mmget(mm);
 
+	/*
+	 * The only way where sp_init_spg_proc_stat got failed is that there is no
+	 * memory for sp_spg_stat. We will avoid this failure when we put sp_spg_stat
+	 * into sp_group_node later.
+	 */
+	sp_init_spg_proc_stat(node->master->stat, spg);
 	return 0;
 }
 
@@ -1356,6 +1306,20 @@ static void free_spg_node(struct mm_struct *mm, struct sp_group *spg,
 	kfree(spg_node);
 }
 
+static int local_group_add_task(struct mm_struct *mm, struct sp_group *spg)
+{
+	struct sp_group_node *node;
+
+	node = create_spg_node(mm, PROT_READ | PROT_WRITE, spg);
+	if (IS_ERR(node))
+		return PTR_ERR(node);
+
+	insert_spg_node(spg, node);
+	mmget(mm);
+
+	return 0;
+}
+
 /**
  * sp_group_add_task() - Add a process to an share group (sp_group).
  * @pid: the pid of the task to be added.
@@ -1380,7 +1344,6 @@ int mg_sp_group_add_task(int pid, unsigned long prot, int spg_id)
 	int ret = 0;
 	bool id_newly_generated = false;
 	struct sp_area *spa, *prev = NULL;
-	struct spg_proc_stat *stat;
 
 	check_interrupt_context();
 
@@ -1483,29 +1446,26 @@ int mg_sp_group_add_task(int pid, unsigned long prot, int spg_id)
 		}
 	}
 
-	ret = mm_add_group_init(mm, spg);
-	if (ret)
+	down_write(&spg->rw_lock);
+	ret = mm_add_group_init(tsk, mm, spg);
+	if (ret) {
+		up_write(&spg->rw_lock);
 		goto out_drop_group;
+	}
 
 	ret = sp_mapping_group_setup(mm, spg);
-	if (ret)
+	if (ret) {
+		up_write(&spg->rw_lock);
 		goto out_drop_group;
+	}
 
 	node = create_spg_node(mm, prot, spg);
 	if (unlikely(IS_ERR(node))) {
+		up_write(&spg->rw_lock);
 		ret = PTR_ERR(node);
 		goto out_drop_group;
 	}
 
-	/* per process statistics initialization */
-	stat = sp_init_process_stat(tsk, mm, spg);
-	if (IS_ERR(stat)) {
-		ret = PTR_ERR(stat);
-		pr_err_ratelimited("init process stat failed %lx\n", PTR_ERR(stat));
-		goto out_drop_spg_node;
-	}
-
-	down_write(&spg->rw_lock);
 	ret = insert_spg_node(spg, node);
 	if (unlikely(ret)) {
 		up_write(&spg->rw_lock);
@@ -1757,7 +1717,7 @@ int sp_id_of_current(void)
 
 	down_read(&sp_group_sem);
 	master = current->mm->sp_group_master;
-	if (master && master->local) {
+	if (master) {
 		spg_id = master->local->id;
 		up_read(&sp_group_sem);
 		return spg_id;
@@ -1765,7 +1725,7 @@ int sp_id_of_current(void)
 	up_read(&sp_group_sem);
 
 	down_write(&sp_group_sem);
-	ret = sp_mapping_group_setup_local(current->mm);
+	ret = sp_init_group_master_locked(current, current->mm);
 	if (ret) {
 		up_write(&sp_group_sem);
 		return ret;
@@ -2924,7 +2884,7 @@ static void *sp_make_share_kva_to_task(unsigned long kva, unsigned long size, un
 	struct sp_group *spg;
 
 	down_write(&sp_group_sem);
-	ret = sp_mapping_group_setup_local(current->mm);
+	ret = sp_init_group_master_locked(current, current->mm);
 	if (ret) {
 		up_write(&sp_group_sem);
 		pr_err_ratelimited("k2u_task init local mapping failed %d\n", ret);
@@ -2932,13 +2892,7 @@ static void *sp_make_share_kva_to_task(unsigned long kva, unsigned long size, un
 	}
 
 	spg = current->mm->sp_group_master->local;
-	stat = sp_init_process_stat(current, current->mm, spg);
-	if (IS_ERR(stat)) {
-		up_write(&sp_group_sem);
-		pr_err_ratelimited("k2u_task init process stat failed %lx\n",
-				PTR_ERR(stat));
-		return stat;
-	}
+	stat = find_spg_proc_stat(current->mm->sp_group_master->stat, current->tgid, spg->id);
 	up_write(&sp_group_sem);
 
 	spa = sp_alloc_area(size, sp_flags, spg, SPA_TYPE_K2TASK, current->tgid);
@@ -3959,7 +3913,7 @@ static void free_sp_proc_stat(struct sp_proc_stat *stat)
 }
 
 /* the caller make sure stat is not NULL */
-void sp_proc_stat_drop(struct sp_proc_stat *stat)
+static void sp_proc_stat_drop(struct sp_proc_stat *stat)
 {
 	if (atomic_dec_and_test(&stat->use_count))
 		free_sp_proc_stat(stat);
-- 
Gitee


From b632bf41c09a6e1c6587db46a1b0d4419f83ee13 Mon Sep 17 00:00:00 2001
From: Wang Wensheng <wangwensheng4@huawei.com>
Date: Tue, 19 Jul 2022 11:36:19 +0800
Subject: [PATCH 370/674] mm/sharepool: Update sp_mapping structure

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5DS9S
CVE: NA

-------------------------------------------------

1. Add a list for sp_mapping to record all the sp_groups attached to it.
2. Initialize the sp_mapping for local_group when it is created. So when
   we add a task to a group, we should merge the dvpp mapping of the
   local group.
3. Every two groups can be merged if and only if at least one of them is
   empty. Then the empty mapping would be dropped and another mapping
   would be attached to the two groups. This need to traverse all the
   groups attached to the mapping.
4. A mapping is considered empty when no spa is allocated from it and
   its address space is default.

Signed-off-by: Wang Wensheng <wangwensheng4@huawei.com>
Reviewed-by: Weilong Chen <chenweilong@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 include/linux/share_pool.h |  9 ++++--
 mm/share_pool.c            | 65 +++++++++++++++++++++++---------------
 2 files changed, 47 insertions(+), 27 deletions(-)

diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h
index 51710669b1a7..f002370ab5f8 100644
--- a/include/linux/share_pool.h
+++ b/include/linux/share_pool.h
@@ -117,6 +117,9 @@ struct sp_mapping {
 	struct rb_node *free_area_cache;
 	unsigned long cached_hole_size;
 	unsigned long cached_vstart;
+
+	/* list head for all groups attached to this mapping, dvpp mapping only */
+	struct list_head group_head;
 };
 
 /* Processes in the same sp_group can share memory.
@@ -160,8 +163,10 @@ struct sp_group {
 	atomic_t	 use_count;
 	/* protect the group internal elements, except spa_list */
 	struct rw_semaphore	rw_lock;
-	struct sp_mapping *dvpp;
-	struct sp_mapping *normal;
+	/* list node for dvpp mapping */
+	struct list_head	mnode;
+	struct sp_mapping	*dvpp;
+	struct sp_mapping	*normal;
 };
 
 /* a per-process(per mm) struct which manages a sp_group_node list */
diff --git a/mm/share_pool.c b/mm/share_pool.c
index 75339e9d130e..e38436b25efc 100644
--- a/mm/share_pool.c
+++ b/mm/share_pool.c
@@ -168,6 +168,7 @@ static struct sp_mapping *sp_mapping_create(unsigned long flag)
 	sp_mapping_range_init(spm);
 	atomic_set(&spm->user, 0);
 	spm->area_root = RB_ROOT;
+	INIT_LIST_HEAD(&spm->group_head);
 
 	return spm;
 }
@@ -180,18 +181,45 @@ static void sp_mapping_destroy(struct sp_mapping *spm)
 static void sp_mapping_attach(struct sp_group *spg, struct sp_mapping *spm)
 {
 	atomic_inc(&spm->user);
-	if (spm->flag & SP_MAPPING_DVPP)
+	if (spm->flag & SP_MAPPING_DVPP) {
 		spg->dvpp = spm;
-	else if (spm->flag & SP_MAPPING_NORMAL)
+		list_add_tail(&spg->mnode, &spm->group_head);
+	} else if (spm->flag & SP_MAPPING_NORMAL)
 		spg->normal = spm;
 }
 
 static void sp_mapping_detach(struct sp_group *spg, struct sp_mapping *spm)
 {
-	if (spm && atomic_dec_and_test(&spm->user))
+	if (!spm)
+		return;
+	if (spm->flag & SP_MAPPING_DVPP)
+		list_del(&spg->mnode);
+	if (atomic_dec_and_test(&spm->user))
 		sp_mapping_destroy(spm);
 }
 
+/* merge old mapping to new, and the old mapping would be destroyed */
+static void sp_mapping_merge(struct sp_mapping *new, struct sp_mapping *old)
+{
+	struct sp_group *spg, *tmp;
+
+	if (new == old)
+		return;
+
+	list_for_each_entry_safe(spg, tmp, &old->group_head, mnode) {
+		list_move_tail(&spg->mnode, &new->group_head);
+		spg->dvpp = new;
+	}
+
+	atomic_add(atomic_read(&old->user), &new->user);
+	sp_mapping_destroy(old);
+}
+
+static bool is_mapping_empty(struct sp_mapping *spm)
+{
+	return RB_EMPTY_ROOT(&spm->area_root);
+}
+
 /*
  * When you set the address space of a group, the normal address space
  * is globally unified. When processing the DVPP address space, consider
@@ -216,32 +244,18 @@ static int sp_mapping_group_setup(struct mm_struct *mm, struct sp_group *spg)
 {
 	struct sp_group_master *master = mm->sp_group_master;
 	struct sp_group *local = master->local;
-	struct sp_mapping *spm;
 
 	if (!list_empty(&spg->procs)) {
-		/* 1 */
-		if (local->dvpp && local->dvpp != spg->dvpp) {
-			pr_info_ratelimited("Duplicate address space, id=%d\n",
-					spg->id);
-			return 0;
-		}
-
-		/* 2 */
-		if (!local->dvpp) {
-			sp_mapping_attach(local, spg->dvpp);
-			sp_mapping_attach(local, spg->normal);
+		if (is_mapping_empty(local->dvpp))
+			sp_mapping_merge(spg->dvpp, local->dvpp);
+		else if (is_mapping_empty(spg->dvpp))
+			sp_mapping_merge(local->dvpp, spg->dvpp);
+		else {
+			pr_info_ratelimited("Duplicate address space, id=%d\n", spg->id);
+			return -EINVAL;
 		}
 	} else {
-		/* 4 */
-		if (!local->dvpp) {
-			spm = sp_mapping_create(SP_MAPPING_DVPP);
-			if (IS_ERR(spm))
-				return PTR_ERR(spm);
-			sp_mapping_attach(local, spm);
-			sp_mapping_attach(local, sp_mapping_normal);
-		}
-
-		/* 3 */
+		/* the mapping of local group is always set */
 		sp_mapping_attach(spg, local->dvpp);
 		sp_mapping_attach(spg, sp_mapping_normal);
 	}
@@ -1121,6 +1135,7 @@ static struct sp_group *create_spg(int spg_id)
 	atomic_set(&spg->use_count, 1);
 	INIT_LIST_HEAD(&spg->procs);
 	INIT_LIST_HEAD(&spg->spa_list);
+	INIT_LIST_HEAD(&spg->mnode);
 	init_rwsem(&spg->rw_lock);
 
 	sprintf(name, "sp_group_%d", spg_id);
-- 
Gitee


From 8e8888e678ff09fda4691f55954325fc6bb64dd1 Mon Sep 17 00:00:00 2001
From: Wang Wensheng <wangwensheng4@huawei.com>
Date: Tue, 19 Jul 2022 11:36:20 +0800
Subject: [PATCH 371/674] mm/sharepool: Introduce SPG_NON_DVPP flag for
 sp_group_add_task

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5DS9S
CVE: NA

-------------------------------------------------

When SPG_NOD_DVPP is specified to sp_group_add_task, we don't create a
DVPP mapping for the newly created sp_group. And the new group cannot
support allocating DVPP memory.

Signed-off-by: Wang Wensheng <wangwensheng4@huawei.com>
Reviewed-by: Weilong Chen <chenweilong@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 include/linux/share_pool.h |  7 +++--
 mm/share_pool.c            | 60 ++++++++++++++++++++------------------
 2 files changed, 36 insertions(+), 31 deletions(-)

diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h
index f002370ab5f8..1911cd35843b 100644
--- a/include/linux/share_pool.h
+++ b/include/linux/share_pool.h
@@ -43,6 +43,9 @@
 #define SPG_ID_LOCAL_MIN	200001
 #define SPG_ID_LOCAL_MAX	299999
 
+#define SPG_FLAG_NON_DVPP	(1 << 0)
+#define SPG_FLAG_MASK		(SPG_FLAG_NON_DVPP)
+
 #define MAX_DEVID 8	/* the max num of Da-vinci devices */
 
 extern int sysctl_share_pool_hugepage_enable;
@@ -146,6 +149,7 @@ struct sp_mapping {
  */
 struct sp_group {
 	int		 id;
+	unsigned long	 flag;
 	struct file	 *file;
 	struct file	 *file_hugetlb;
 	/* number of process in this group */
@@ -286,9 +290,6 @@ extern bool mg_sp_config_dvpp_range(size_t start, size_t size, int device_id, in
 extern bool is_sharepool_addr(unsigned long addr);
 extern bool mg_is_sharepool_addr(unsigned long addr);
 
-extern int mg_sp_group_add_task(int pid, unsigned long prot, int spg_id);
-extern int sp_group_add_task(int pid, int spg_id);
-
 extern int sp_id_of_current(void);
 extern int mg_sp_id_of_current(void);
 
diff --git a/mm/share_pool.c b/mm/share_pool.c
index e38436b25efc..8297635c6dac 100644
--- a/mm/share_pool.c
+++ b/mm/share_pool.c
@@ -221,31 +221,20 @@ static bool is_mapping_empty(struct sp_mapping *spm)
 }
 
 /*
- * When you set the address space of a group, the normal address space
- * is globally unified. When processing the DVPP address space, consider
- * the following situations:
- * 1. If a process is added to a non-new group, the DVPP address space
- *    must have been created. If the local group of the process also
- *    contains the DVPP address space and they are different, this
- *    scenario is not allowed to avoid address conflict.
- * 2. If the DVPP address space does not exist in the local group of the
- *    process, attach the local group of the process to the DVPP address
- *    space of the group.
- * 3. Add a new group. If the process has applied for the dvpp address
- *    space (sp_alloc or k2u), attach the new group to the dvpp address
- *    space of the current process.
- * 4. If the process has not applied for the DVPP address space, attach
- *    the new group and the local group of the current process to the
- *     newly created DVPP address space.
- *
+ * 1. The mappings of local group is set on creating.
+ * 2. This is used to setup the mapping for groups created during add_task.
+ * 3. The normal mapping exists for all groups.
+ * 4. The dvpp mappings for the new group and local group can merge _iff_ at
+ *    least one of the mapping is empty.
  * the caller must hold sp_group_sem
+ * NOTE: undo the mergeing when the later process failed.
  */
 static int sp_mapping_group_setup(struct mm_struct *mm, struct sp_group *spg)
 {
 	struct sp_group_master *master = mm->sp_group_master;
 	struct sp_group *local = master->local;
 
-	if (!list_empty(&spg->procs)) {
+	if (!list_empty(&spg->procs) && !(spg->flag & SPG_FLAG_NON_DVPP)) {
 		if (is_mapping_empty(local->dvpp))
 			sp_mapping_merge(spg->dvpp, local->dvpp);
 		else if (is_mapping_empty(spg->dvpp))
@@ -255,15 +244,17 @@ static int sp_mapping_group_setup(struct mm_struct *mm, struct sp_group *spg)
 			return -EINVAL;
 		}
 	} else {
-		/* the mapping of local group is always set */
-		sp_mapping_attach(spg, local->dvpp);
-		sp_mapping_attach(spg, sp_mapping_normal);
+		if (!(spg->flag & SPG_FLAG_NON_DVPP))
+			/* the mapping of local group is always set */
+			sp_mapping_attach(spg, local->dvpp);
+		if (!spg->normal)
+			sp_mapping_attach(spg, sp_mapping_normal);
 	}
 
 	return 0;
 }
 
-static struct sp_group *create_spg(int spg_id);
+static struct sp_group *create_spg(int spg_id, unsigned long flag);
 static void free_new_spg_id(bool new, int spg_id);
 static void free_sp_group_locked(struct sp_group *spg);
 static int local_group_add_task(struct mm_struct *mm, struct sp_group *spg);
@@ -281,7 +272,7 @@ static int init_local_group(struct mm_struct *mm)
 		return spg_id;
 	}
 
-	spg = create_spg(spg_id);
+	spg = create_spg(spg_id, 0);
 	if (IS_ERR(spg)) {
 		ret = PTR_ERR(spg);
 		goto free_spg_id;
@@ -1103,7 +1094,7 @@ static bool is_device_addr(unsigned long addr)
 	return false;
 }
 
-static struct sp_group *create_spg(int spg_id)
+static struct sp_group *create_spg(int spg_id, unsigned long flag)
 {
 	int ret;
 	struct sp_group *spg;
@@ -1117,6 +1108,11 @@ static struct sp_group *create_spg(int spg_id)
 		return ERR_PTR(-ENOSPC);
 	}
 
+	if (flag & ~SPG_FLAG_MASK) {
+		pr_err_ratelimited("invalid flag:%#lx\n", flag);
+		return ERR_PTR(-EINVAL);
+	}
+
 	spg = kzalloc(sizeof(*spg), GFP_KERNEL);
 	if (spg == NULL)
 		return ERR_PTR(-ENOMEM);
@@ -1129,6 +1125,7 @@ static struct sp_group *create_spg(int spg_id)
 	}
 
 	spg->id = spg_id;
+	spg->flag = flag;
 	spg->is_alive = true;
 	spg->proc_num = 0;
 	spg->owner = current->group_leader;
@@ -1176,14 +1173,14 @@ static struct sp_group *create_spg(int spg_id)
 }
 
 /* the caller must hold sp_group_sem */
-static struct sp_group *find_or_alloc_sp_group(int spg_id)
+static struct sp_group *find_or_alloc_sp_group(int spg_id, unsigned long flag)
 {
 	struct sp_group *spg;
 
 	spg = __sp_find_spg_locked(current->pid, spg_id);
 
 	if (!spg) {
-		spg = create_spg(spg_id);
+		spg = create_spg(spg_id, flag);
 	} else {
 		down_read(&spg->rw_lock);
 		if (!spg_valid(spg)) {
@@ -1336,10 +1333,11 @@ static int local_group_add_task(struct mm_struct *mm, struct sp_group *spg)
 }
 
 /**
- * sp_group_add_task() - Add a process to an share group (sp_group).
+ * mg_sp_group_add_task() - Add a process to an share group (sp_group).
  * @pid: the pid of the task to be added.
  * @prot: the prot of task for this spg.
  * @spg_id: the ID of the sp_group.
+ * @flag: to give some special message.
  *
  * A process can't be added to more than one sp_group in single group mode
  * and can in multiple group mode.
@@ -1352,6 +1350,7 @@ static int local_group_add_task(struct mm_struct *mm, struct sp_group *spg)
  */
 int mg_sp_group_add_task(int pid, unsigned long prot, int spg_id)
 {
+	unsigned long flag = 0;
 	struct task_struct *tsk;
 	struct mm_struct *mm;
 	struct sp_group *spg;
@@ -1445,7 +1444,7 @@ int mg_sp_group_add_task(int pid, unsigned long prot, int spg_id)
 		goto out_put_task;
 	}
 
-	spg = find_or_alloc_sp_group(spg_id);
+	spg = find_or_alloc_sp_group(spg_id, flag);
 	if (IS_ERR(spg)) {
 		up_write(&sp_group_sem);
 		ret = PTR_ERR(spg);
@@ -1818,6 +1817,11 @@ static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags,
 	else
 		mapping = spg->normal;
 
+	if (!mapping) {
+		pr_err_ratelimited("non DVPP spg, id %d\n", spg->id);
+		return ERR_PTR(-EINVAL);
+	}
+
 	vstart = mapping->start[device_id];
 	vend = mapping->end[device_id];
 	spa = __kmalloc_node(sizeof(struct sp_area), GFP_KERNEL, node_id);
-- 
Gitee


From 1e645d0e51cde5a3e7f1228249cda8f6bb2c8640 Mon Sep 17 00:00:00 2001
From: Wang Wensheng <wangwensheng4@huawei.com>
Date: Tue, 19 Jul 2022 11:36:21 +0800
Subject: [PATCH 372/674] mm/sharepool: Configure the DVPP range for process

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5DS9S
CVE: NA

-------------------------------------------------

Currently the dvpp range is global for each device. And it is
unreasonable after the reconstruction that makes the DVPP mappings
private to each process or group.

This allows to configure the dvpp range for each process.
The dvpp range for each dvpp mapping can only be configured once just
as the old version.

Signed-off-by: Wang Wensheng <wangwensheng4@huawei.com>
Reviewed-by: Weilong Chen <chenweilong@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 mm/share_pool.c | 66 +++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 58 insertions(+), 8 deletions(-)

diff --git a/mm/share_pool.c b/mm/share_pool.c
index 8297635c6dac..1c10b136f20f 100644
--- a/mm/share_pool.c
+++ b/mm/share_pool.c
@@ -220,6 +220,17 @@ static bool is_mapping_empty(struct sp_mapping *spm)
 	return RB_EMPTY_ROOT(&spm->area_root);
 }
 
+static bool can_mappings_merge(struct sp_mapping *m1, struct sp_mapping *m2)
+{
+	int i;
+
+	for (i = 0; i < sp_device_number; i++)
+		if (m1->start[i] != m2->start[i] || m1->end[i] != m2->end[i])
+			return false;
+
+	return true;
+}
+
 /*
  * 1. The mappings of local group is set on creating.
  * 2. This is used to setup the mapping for groups created during add_task.
@@ -235,6 +246,11 @@ static int sp_mapping_group_setup(struct mm_struct *mm, struct sp_group *spg)
 	struct sp_group *local = master->local;
 
 	if (!list_empty(&spg->procs) && !(spg->flag & SPG_FLAG_NON_DVPP)) {
+		if (!can_mappings_merge(local->dvpp, spg->dvpp)) {
+			pr_info_ratelimited("address space conflict, id=%d\n", spg->id);
+			return -EINVAL;
+		}
+
 		if (is_mapping_empty(local->dvpp))
 			sp_mapping_merge(spg->dvpp, local->dvpp);
 		else if (is_mapping_empty(spg->dvpp))
@@ -3825,16 +3841,50 @@ EXPORT_SYMBOL_GPL(sp_unregister_notifier);
  */
 bool sp_config_dvpp_range(size_t start, size_t size, int device_id, int pid)
 {
-	if (pid < 0 ||
-	    size <= 0 || size > MMAP_SHARE_POOL_16G_SIZE ||
-	    device_id < 0 || device_id >= sp_device_number ||
-	    !is_online_node_id(device_id) ||
-	    is_sp_dev_addr_enabled(device_id))
+	int ret;
+	bool err = false;
+	struct task_struct *tsk;
+	struct mm_struct *mm;
+	struct sp_group *spg;
+	struct sp_mapping *spm;
+	unsigned long default_start;
+
+	/* NOTE: check the start address */
+	if (pid < 0 || size <= 0 || size > MMAP_SHARE_POOL_16G_SIZE ||
+	    device_id < 0 || device_id >= sp_device_number || !is_online_node_id(device_id))
 		return false;
 
-	sp_dev_va_start[device_id] = start;
-	sp_dev_va_size[device_id] = size;
-	return true;
+	ret = get_task(pid, &tsk);
+	if (ret)
+		return false;
+
+	mm = get_task_mm(tsk->group_leader);
+	if (!mm)
+		goto put_task;
+
+	spg = sp_get_local_group(mm);
+	if (IS_ERR(spg))
+		goto put_mm;
+
+	spm = spg->dvpp;
+	default_start = MMAP_SHARE_POOL_16G_START + device_id * MMAP_SHARE_POOL_16G_SIZE;
+	/* The dvpp range of each group can be configured only once */
+	if (spm->start[device_id] != default_start)
+		goto put_spg;
+
+	spm->start[device_id] = start;
+	spm->end[device_id] = start + size;
+
+	err = true;
+
+put_spg:
+	sp_group_drop(spg);
+put_mm:
+	mmput(mm);
+put_task:
+	put_task_struct(tsk);
+
+	return err;
 }
 EXPORT_SYMBOL_GPL(sp_config_dvpp_range);
 
-- 
Gitee


From 451822b698080b9fd8d72efdcaa6c3220802fae5 Mon Sep 17 00:00:00 2001
From: Wang Wensheng <wangwensheng4@huawei.com>
Date: Tue, 19 Jul 2022 11:36:22 +0800
Subject: [PATCH 373/674] mm/sharepool: Don't check the DVPP address space
 range before merging

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5DS9S
CVE: NA

-------------------------------------------------

The user doesn't care about the start address of the dvpp range, what
is mattered is that the virtual space tagged DVPP located at in a 16G
range. So we can safely drop the dvpp address space as long as it's
empty during merging process.

Signed-off-by: Wang Wensheng <wangwensheng4@huawei.com>
Reviewed-by: Weilong Chen <chenweilong@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 mm/share_pool.c | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/mm/share_pool.c b/mm/share_pool.c
index 1c10b136f20f..b14152acbee8 100644
--- a/mm/share_pool.c
+++ b/mm/share_pool.c
@@ -246,16 +246,23 @@ static int sp_mapping_group_setup(struct mm_struct *mm, struct sp_group *spg)
 	struct sp_group *local = master->local;
 
 	if (!list_empty(&spg->procs) && !(spg->flag & SPG_FLAG_NON_DVPP)) {
-		if (!can_mappings_merge(local->dvpp, spg->dvpp)) {
-			pr_info_ratelimited("address space conflict, id=%d\n", spg->id);
-			return -EINVAL;
-		}
+		/*
+		 * Don't return an error when the mappings' address range conflict.
+		 * As long as the mapping is unused, we can drop the empty mapping.
+		 * This may change the address range for the task or group implicitly,
+		 * give a warn for it.
+		 */
+		bool is_conflict = !can_mappings_merge(local->dvpp, spg->dvpp);
 
-		if (is_mapping_empty(local->dvpp))
+		if (is_mapping_empty(local->dvpp)) {
 			sp_mapping_merge(spg->dvpp, local->dvpp);
-		else if (is_mapping_empty(spg->dvpp))
+			if (is_conflict)
+				pr_warn_ratelimited("task address space conflict, spg_id=%d\n", spg->id);
+		} else if (is_mapping_empty(spg->dvpp)) {
 			sp_mapping_merge(local->dvpp, spg->dvpp);
-		else {
+			if (is_conflict)
+				pr_warn_ratelimited("group address space conflict, spg_id=%d\n", spg->id);
+		} else {
 			pr_info_ratelimited("Duplicate address space, id=%d\n", spg->id);
 			return -EINVAL;
 		}
-- 
Gitee


From a829370d803a17014c0509be190d926641d34011 Mon Sep 17 00:00:00 2001
From: Wang Wensheng <wangwensheng4@huawei.com>
Date: Tue, 19 Jul 2022 11:36:23 +0800
Subject: [PATCH 374/674] mm/sharepool: Add a task_struct parameter for
 sp_get_local_group()

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5DS9S
CVE: NA

-------------------------------------------------

sp_get_local_group() could be invoked in kthread, where the current
process isn't the process we want. Add a parameter and let the caller to
avoid this problem.

Signed-off-by: Wang Wensheng <wangwensheng4@huawei.com>
Reviewed-by: Weilong Chen <chenweilong@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 mm/share_pool.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/mm/share_pool.c b/mm/share_pool.c
index b14152acbee8..a39bece9af96 100644
--- a/mm/share_pool.c
+++ b/mm/share_pool.c
@@ -370,7 +370,7 @@ static inline bool is_local_group(int spg_id)
 	return spg_id >= SPG_ID_LOCAL_MIN && spg_id <= SPG_ID_LOCAL_MAX;
 }
 
-static struct sp_group *sp_get_local_group(struct mm_struct *mm)
+static struct sp_group *sp_get_local_group(struct task_struct *tsk, struct mm_struct *mm)
 {
 	int ret;
 	struct sp_group_master *master;
@@ -385,7 +385,7 @@ static struct sp_group *sp_get_local_group(struct mm_struct *mm)
 	up_read(&sp_group_sem);
 
 	down_write(&sp_group_sem);
-	ret = sp_init_group_master_locked(current, mm);
+	ret = sp_init_group_master_locked(tsk, mm);
 	if (ret) {
 		up_write(&sp_group_sem);
 		return ERR_PTR(ret);
@@ -2499,7 +2499,7 @@ static int sp_alloc_prepare(unsigned long size, unsigned long sp_flags,
 		}
 		ac->type = SPA_TYPE_ALLOC;
 	} else {  /* allocation pass through scene */
-		spg = sp_get_local_group(current->mm);
+		spg = sp_get_local_group(current, current->mm);
 		if (IS_ERR(spg))
 			return PTR_ERR(spg);
 		down_read(&spg->rw_lock);
@@ -3869,7 +3869,7 @@ bool sp_config_dvpp_range(size_t start, size_t size, int device_id, int pid)
 	if (!mm)
 		goto put_task;
 
-	spg = sp_get_local_group(mm);
+	spg = sp_get_local_group(tsk, mm);
 	if (IS_ERR(spg))
 		goto put_mm;
 
-- 
Gitee


From ac4045cfdc80d4b27d5746430d9bb6976624a1bd Mon Sep 17 00:00:00 2001
From: Wang Wensheng <wangwensheng4@huawei.com>
Date: Tue, 19 Jul 2022 11:36:24 +0800
Subject: [PATCH 375/674] mm/sharepool: Check sp_is_enabled() in all exported
 interfaces

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5DS9S
CVE: NA

--------------------------------------------------

We should forbid the usage of sharepool interfaces if sharepool is not
enabled. Or undefined behaviour would panic the kernel.

Signed-off-by: Wang Wensheng <wangwensheng4@huawei.com>
Reviewed-by: Weilong Chen <chenweilong@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 mm/share_pool.c | 45 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 44 insertions(+), 1 deletion(-)

diff --git a/mm/share_pool.c b/mm/share_pool.c
index a39bece9af96..750524f1afc2 100644
--- a/mm/share_pool.c
+++ b/mm/share_pool.c
@@ -1015,6 +1015,9 @@ int sp_group_id_by_pid(int pid)
 	struct sp_group *spg;
 	int spg_id = -ENODEV;
 
+	if (!sp_is_enabled())
+		return -EOPNOTSUPP;
+
 	check_interrupt_context();
 
 	spg = __sp_find_spg(pid, SPG_ID_DEFAULT);
@@ -1050,6 +1053,9 @@ int mg_sp_group_id_by_pid(int pid, int *spg_ids, int *num)
 	struct sp_group_master *master = NULL;
 	struct task_struct *tsk;
 
+	if (!sp_is_enabled())
+		return -EOPNOTSUPP;
+
 	check_interrupt_context();
 
 	if (!spg_ids || num <= 0)
@@ -1382,6 +1388,9 @@ int mg_sp_group_add_task(int pid, unsigned long prot, int spg_id)
 	bool id_newly_generated = false;
 	struct sp_area *spa, *prev = NULL;
 
+	if (!sp_is_enabled())
+		return -EOPNOTSUPP;
+
 	check_interrupt_context();
 
 	/* only allow READ, READ | WRITE */
@@ -1658,6 +1667,9 @@ int mg_sp_group_del_task(int pid, int spg_id)
 	struct mm_struct *mm = NULL;
 	bool is_alive = true;
 
+	if (!sp_is_enabled())
+		return -EOPNOTSUPP;
+
 	if (spg_id < SPG_ID_MIN || spg_id > SPG_ID_AUTO) {
 		pr_err_ratelimited("del from group failed, invalid group id %d\n", spg_id);
 		return -EINVAL;
@@ -1749,6 +1761,9 @@ int sp_id_of_current(void)
 	int ret, spg_id;
 	struct sp_group_master *master;
 
+	if (!sp_is_enabled())
+		return -EOPNOTSUPP;
+
 	if (current->flags & PF_KTHREAD || !current->mm)
 		return -EINVAL;
 
@@ -2324,6 +2339,9 @@ int sp_free(unsigned long addr, int id)
 		.spg_id = id,
 	};
 
+	if (!sp_is_enabled())
+		return -EOPNOTSUPP;
+
 	check_interrupt_context();
 
 	if (current->flags & PF_KTHREAD)
@@ -2761,6 +2779,9 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id)
 	int ret = 0;
 	struct sp_alloc_context ac;
 
+	if (!sp_is_enabled())
+		return ERR_PTR(-EOPNOTSUPP);
+
 	ret = sp_alloc_prepare(size, sp_flags, spg_id, &ac);
 	if (ret)
 		return ERR_PTR(ret);
@@ -3142,6 +3163,9 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size,
 	int ret;
 	struct sp_k2u_context kc;
 
+	if (!sp_is_enabled())
+		return ERR_PTR(-EOPNOTSUPP);
+
 	check_interrupt_context();
 
 	ret = sp_k2u_prepare(kva, size, sp_flags, spg_id, &kc);
@@ -3429,6 +3453,9 @@ void *sp_make_share_u2k(unsigned long uva, unsigned long size, int pid)
 	struct sp_walk_data sp_walk_data;
 	struct vm_struct *area;
 
+	if (!sp_is_enabled())
+		return ERR_PTR(-EOPNOTSUPP);
+
 	check_interrupt_context();
 
 	if (mm == NULL) {
@@ -3717,6 +3744,9 @@ int sp_unshare(unsigned long va, unsigned long size, int pid, int spg_id)
 {
 	int ret = 0;
 
+	if (!sp_is_enabled())
+		return -EOPNOTSUPP;
+
 	check_interrupt_context();
 
 	if (current->flags & PF_KTHREAD)
@@ -3762,6 +3792,9 @@ int sp_walk_page_range(unsigned long uva, unsigned long size,
 	struct mm_struct *mm;
 	int ret = 0;
 
+	if (!sp_is_enabled())
+		return -EOPNOTSUPP;
+
 	check_interrupt_context();
 
 	if (unlikely(!sp_walk_data)) {
@@ -3807,6 +3840,9 @@ EXPORT_SYMBOL_GPL(mg_sp_walk_page_range);
  */
 void sp_walk_page_free(struct sp_walk_data *sp_walk_data)
 {
+	if (!sp_is_enabled())
+		return;
+
 	check_interrupt_context();
 
 	if (!sp_walk_data)
@@ -3856,6 +3892,9 @@ bool sp_config_dvpp_range(size_t start, size_t size, int device_id, int pid)
 	struct sp_mapping *spm;
 	unsigned long default_start;
 
+	if (!sp_is_enabled())
+		return false;
+
 	/* NOTE: check the start address */
 	if (pid < 0 || size <= 0 || size > MMAP_SHARE_POOL_16G_SIZE ||
 	    device_id < 0 || device_id >= sp_device_number || !is_online_node_id(device_id))
@@ -3916,7 +3955,8 @@ static bool is_sp_normal_addr(unsigned long addr)
  */
 bool is_sharepool_addr(unsigned long addr)
 {
-	return is_sp_normal_addr(addr) || is_device_addr(addr);
+	return sp_is_enabled() &&
+		(is_sp_normal_addr(addr) || is_device_addr(addr));
 }
 EXPORT_SYMBOL_GPL(is_sharepool_addr);
 
@@ -4113,6 +4153,9 @@ int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns,
 	unsigned long anon, file, shmem, total_rss, prot;
 	long sp_res, sp_res_nsize, non_sp_res, non_sp_shm;
 
+	if (!sp_is_enabled())
+		return 0;
+
 	if (!mm)
 		return 0;
 
-- 
Gitee


From e906032bf468bd9dc1f95a90feae88abd33a690a Mon Sep 17 00:00:00 2001
From: Felix Kuehling <Felix.Kuehling@amd.com>
Date: Tue, 19 Jul 2022 16:30:15 +0800
Subject: [PATCH 376/674] drm/amdkfd: Use drm_priv to pass VM from KFD to
 amdgpu

stable inclusion
from stable-v5.10.112
commit f0c31f192f38c76c2ab09469dd1e636984fb9093
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f0c31f192f38c76c2ab09469dd1e636984fb9093

--------------------------------

commit b40a6ab2cf9213923bf8e821ce7fa7f6a0a26990 upstream.

amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu needs the drm_priv to allow mmap
to access the BO through the corresponding file descriptor. The VM can
also be extracted from drm_priv, so drm_priv can replace the vm parameter
in the kfd2kgd interface.

Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Reviewed-by: Philip Yang <philip.yang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
[This is a partial cherry-pick of the upstream commit.]
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 26f8a2138377..1b4c7ced8b92 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1024,11 +1024,15 @@ int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct kgd_dev *kgd,
 					   struct dma_fence **ef)
 {
 	struct amdgpu_device *adev = get_amdgpu_device(kgd);
-	struct drm_file *drm_priv = filp->private_data;
-	struct amdgpu_fpriv *drv_priv = drm_priv->driver_priv;
-	struct amdgpu_vm *avm = &drv_priv->vm;
+	struct amdgpu_fpriv *drv_priv;
+	struct amdgpu_vm *avm;
 	int ret;
 
+	ret = amdgpu_file_to_fpriv(filp, &drv_priv);
+	if (ret)
+		return ret;
+	avm = &drv_priv->vm;
+
 	/* Already a compute VM? */
 	if (avm->process_info)
 		return -EINVAL;
-- 
Gitee


From 9be8a9dfb0c93b3e4dd447e3ce1b1f23f6b7b1ec Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Tue, 19 Jul 2022 16:30:16 +0800
Subject: [PATCH 377/674] cpuidle: PSCI: Move the `has_lpi` check to the
 beginning of the function

stable inclusion
from stable-v5.10.112
commit 503934df3108c75bb4f913c6d483df2ae2c3eff0
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=503934df3108c75bb4f913c6d483df2ae2c3eff0

--------------------------------

commit 01f6c7338ce267959975da65d86ba34f44d54220 upstream.

Currently the first thing checked is whether the PCSI cpu_suspend function
has been initialized.

Another change will be overloading `acpi_processor_ffh_lpi_probe` and
calling it sooner.  So make the `has_lpi` check the first thing checked
to prepare for that change.

Reviewed-by: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 arch/arm64/kernel/cpuidle.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/kernel/cpuidle.c b/arch/arm64/kernel/cpuidle.c
index b512b5503f6e..d4ff9ae673fa 100644
--- a/arch/arm64/kernel/cpuidle.c
+++ b/arch/arm64/kernel/cpuidle.c
@@ -54,6 +54,9 @@ static int psci_acpi_cpu_init_idle(unsigned int cpu)
 	struct acpi_lpi_state *lpi;
 	struct acpi_processor *pr = per_cpu(processors, cpu);
 
+	if (unlikely(!pr || !pr->flags.has_lpi))
+		return -EINVAL;
+
 	/*
 	 * If the PSCI cpu_suspend function hook has not been initialized
 	 * idle states must not be enabled, so bail out
@@ -61,9 +64,6 @@ static int psci_acpi_cpu_init_idle(unsigned int cpu)
 	if (!psci_ops.cpu_suspend)
 		return -EOPNOTSUPP;
 
-	if (unlikely(!pr || !pr->flags.has_lpi))
-		return -EINVAL;
-
 	count = pr->power.count - 1;
 	if (count <= 0)
 		return -ENODEV;
-- 
Gitee


From a5c4a16563c4cc73b930e407811fa8af048fdf32 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Tue, 19 Jul 2022 16:30:17 +0800
Subject: [PATCH 378/674] ACPI: processor idle: Check for architectural support
 for LPI

stable inclusion
from stable-v5.10.112
commit 5d131318bb8765839e45c0e55fd1f57483e62a50
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=5d131318bb8765839e45c0e55fd1f57483e62a50

--------------------------------

commit eb087f305919ee8169ad65665610313e74260463 upstream.

When `osc_pc_lpi_support_confirmed` is set through `_OSC` and `_LPI` is
populated then the cpuidle driver assumes that LPI is fully functional.

However currently the kernel only provides architectural support for LPI
on ARM.  This leads to high power consumption on X86 platforms that
otherwise try to enable LPI.

So probe whether or not LPI support is implemented before enabling LPI in
the kernel.  This is done by overloading `acpi_processor_ffh_lpi_probe` to
check whether it returns `-EOPNOTSUPP`. It also means that all future
implementations of `acpi_processor_ffh_lpi_probe` will need to follow
these semantics as well.

Reviewed-by: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/acpi/processor_idle.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index 8377c3ed10ff..9921b481c7ee 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -1080,6 +1080,11 @@ static int flatten_lpi_states(struct acpi_processor *pr,
 	return 0;
 }
 
+int __weak acpi_processor_ffh_lpi_probe(unsigned int cpu)
+{
+	return -EOPNOTSUPP;
+}
+
 static int acpi_processor_get_lpi_info(struct acpi_processor *pr)
 {
 	int ret, i;
@@ -1088,6 +1093,11 @@ static int acpi_processor_get_lpi_info(struct acpi_processor *pr)
 	struct acpi_device *d = NULL;
 	struct acpi_lpi_states_array info[2], *tmp, *prev, *curr;
 
+	/* make sure our architecture has support */
+	ret = acpi_processor_ffh_lpi_probe(pr->id);
+	if (ret == -EOPNOTSUPP)
+		return ret;
+
 	if (!osc_pc_lpi_support_confirmed)
 		return -EOPNOTSUPP;
 
@@ -1139,11 +1149,6 @@ static int acpi_processor_get_lpi_info(struct acpi_processor *pr)
 	return 0;
 }
 
-int __weak acpi_processor_ffh_lpi_probe(unsigned int cpu)
-{
-	return -ENODEV;
-}
-
 int __weak acpi_processor_ffh_lpi_enter(struct acpi_lpi_state *lpi)
 {
 	return -ENODEV;
-- 
Gitee


From 1ba702b21c4b2cc9b68fbef75ae931dfb442b098 Mon Sep 17 00:00:00 2001
From: Nathan Chancellor <nathan@kernel.org>
Date: Tue, 19 Jul 2022 16:30:18 +0800
Subject: [PATCH 379/674] btrfs: remove unused variable in
 btrfs_{start,write}_dirty_block_groups()

stable inclusion
from stable-v5.10.112
commit 921fdc45a08456e456ae031dfa2b0996173097db
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=921fdc45a08456e456ae031dfa2b0996173097db

--------------------------------

commit 6d4a6b515c39f1f8763093e0f828959b2fbc2f45 upstream.

Clang's version of -Wunused-but-set-variable recently gained support for
unary operations, which reveals two unused variables:

  fs/btrfs/block-group.c:2949:6: error: variable 'num_started' set but not used [-Werror,-Wunused-but-set-variable]
          int num_started = 0;
              ^
  fs/btrfs/block-group.c:3116:6: error: variable 'num_started' set but not used [-Werror,-Wunused-but-set-variable]
          int num_started = 0;
              ^
  2 errors generated.

These variables appear to be unused from their introduction, so just
remove them to silence the warnings.

Fixes: c9dc4c657850 ("Btrfs: two stage dirty block group writeout")
Fixes: 1bbc621ef284 ("Btrfs: allow block group cache writeout outside critical section in commit")
CC: stable@vger.kernel.org # 5.4+
Link: https://github.com/ClangBuiltLinux/linux/issues/1614
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 fs/btrfs/block-group.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index c99e293b50f5..e351f5319950 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -2570,7 +2570,6 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
 	struct btrfs_path *path = NULL;
 	LIST_HEAD(dirty);
 	struct list_head *io = &cur_trans->io_bgs;
-	int num_started = 0;
 	int loops = 0;
 
 	spin_lock(&cur_trans->dirty_bgs_lock);
@@ -2636,7 +2635,6 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
 			cache->io_ctl.inode = NULL;
 			ret = btrfs_write_out_cache(trans, cache, path);
 			if (ret == 0 && cache->io_ctl.inode) {
-				num_started++;
 				should_put = 0;
 
 				/*
@@ -2737,7 +2735,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
 	int should_put;
 	struct btrfs_path *path;
 	struct list_head *io = &cur_trans->io_bgs;
-	int num_started = 0;
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -2795,7 +2792,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
 			cache->io_ctl.inode = NULL;
 			ret = btrfs_write_out_cache(trans, cache, path);
 			if (ret == 0 && cache->io_ctl.inode) {
-				num_started++;
 				should_put = 0;
 				list_add_tail(&cache->io_list, io);
 			} else {
-- 
Gitee


From 7d3f6a00b77d08b8f777aeca070a39c9fcc433ec Mon Sep 17 00:00:00 2001
From: Rob Clark <robdclark@chromium.org>
Date: Tue, 19 Jul 2022 16:30:19 +0800
Subject: [PATCH 380/674] drm/msm: Add missing put_task_struct() in debugfs
 path

stable inclusion
from stable-v5.10.112
commit cb66641f81060a5a5d88f85d2aeb73860edc79df
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=cb66641f81060a5a5d88f85d2aeb73860edc79df

--------------------------------

[ Upstream commit ac3e4f42d5ec459f701743debd9c1ad2f2247402 ]

Fixes: 25faf2f2e065 ("drm/msm: Show process names in gem_describe")
Signed-off-by: Rob Clark <robdclark@chromium.org>
Link: https://lore.kernel.org/r/20220317184550.227991-1-robdclark@gmail.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/gpu/drm/msm/msm_gem.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c
index 819567e40565..9c05bf6c4551 100644
--- a/drivers/gpu/drm/msm/msm_gem.c
+++ b/drivers/gpu/drm/msm/msm_gem.c
@@ -849,6 +849,7 @@ void msm_gem_describe(struct drm_gem_object *obj, struct seq_file *m)
 					get_pid_task(aspace->pid, PIDTYPE_PID);
 				if (task) {
 					comm = kstrdup(task->comm, GFP_KERNEL);
+					put_task_struct(task);
 				} else {
 					comm = NULL;
 				}
-- 
Gitee


From dff5229263233c766939f0ba745b8e342ba70293 Mon Sep 17 00:00:00 2001
From: Miaoqian Lin <linmq006@gmail.com>
Date: Tue, 19 Jul 2022 16:30:20 +0800
Subject: [PATCH 381/674] memory: atmel-ebi: Fix missing of_node_put in
 atmel_ebi_probe

stable inclusion
from stable-v5.10.112
commit 16c628b0c6faa2fab1a75ffce80860f8106cf002
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=16c628b0c6faa2fab1a75ffce80860f8106cf002

--------------------------------

[ Upstream commit 6f296a9665ba5ac68937bf11f96214eb9de81baa ]

The device_node pointer is returned by of_parse_phandle() with refcount
incremented. We should use of_node_put() on it when done.

Fixes: 87108dc78eb8 ("memory: atmel-ebi: Enable the SMC clock if specified")
Signed-off-by: Miaoqian Lin <linmq006@gmail.com>
Reviewed-by: Claudiu Beznea <claudiu.beznea@microchip.com>
Link: https://lore.kernel.org/r/20220309110144.22412-1-linmq006@gmail.com
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/memory/atmel-ebi.c | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/drivers/memory/atmel-ebi.c b/drivers/memory/atmel-ebi.c
index c267283b01fd..e749dcb3ddea 100644
--- a/drivers/memory/atmel-ebi.c
+++ b/drivers/memory/atmel-ebi.c
@@ -544,20 +544,27 @@ static int atmel_ebi_probe(struct platform_device *pdev)
 	smc_np = of_parse_phandle(dev->of_node, "atmel,smc", 0);
 
 	ebi->smc.regmap = syscon_node_to_regmap(smc_np);
-	if (IS_ERR(ebi->smc.regmap))
-		return PTR_ERR(ebi->smc.regmap);
+	if (IS_ERR(ebi->smc.regmap)) {
+		ret = PTR_ERR(ebi->smc.regmap);
+		goto put_node;
+	}
 
 	ebi->smc.layout = atmel_hsmc_get_reg_layout(smc_np);
-	if (IS_ERR(ebi->smc.layout))
-		return PTR_ERR(ebi->smc.layout);
+	if (IS_ERR(ebi->smc.layout)) {
+		ret = PTR_ERR(ebi->smc.layout);
+		goto put_node;
+	}
 
 	ebi->smc.clk = of_clk_get(smc_np, 0);
 	if (IS_ERR(ebi->smc.clk)) {
-		if (PTR_ERR(ebi->smc.clk) != -ENOENT)
-			return PTR_ERR(ebi->smc.clk);
+		if (PTR_ERR(ebi->smc.clk) != -ENOENT) {
+			ret = PTR_ERR(ebi->smc.clk);
+			goto put_node;
+		}
 
 		ebi->smc.clk = NULL;
 	}
+	of_node_put(smc_np);
 	ret = clk_prepare_enable(ebi->smc.clk);
 	if (ret)
 		return ret;
@@ -608,6 +615,10 @@ static int atmel_ebi_probe(struct platform_device *pdev)
 	}
 
 	return of_platform_populate(np, NULL, NULL, dev);
+
+put_node:
+	of_node_put(smc_np);
+	return ret;
 }
 
 static __maybe_unused int atmel_ebi_resume(struct device *dev)
-- 
Gitee


From 85ad5f549c32641b3e7a31b359bd84bd55388a15 Mon Sep 17 00:00:00 2001
From: Cristian Marussi <cristian.marussi@arm.com>
Date: Tue, 19 Jul 2022 16:30:21 +0800
Subject: [PATCH 382/674] firmware: arm_scmi: Fix sorting of retrieved clock
 rates

stable inclusion
from stable-v5.10.112
commit 5637129712023680c1b165a758c2c4d702bcd199
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=5637129712023680c1b165a758c2c4d702bcd199

--------------------------------

[ Upstream commit 23274739a5b6166f74d8d9cb5243d7bf6b46aab9 ]

During SCMI Clock protocol initialization, after having retrieved from the
SCMI platform all the available discrete rates for a specific clock, the
clock rates array is sorted, unfortunately using a pointer to its end as
a base instead of its start, so that sorting does not work.

Fix invocation of sort() passing as base a pointer to the start of the
retrieved clock rates array.

Link: https://lore.kernel.org/r/20220318092813.49283-1-cristian.marussi@arm.com
Fixes: dccec73de91d ("firmware: arm_scmi: Keep the discrete clock rates sorted")
Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/firmware/arm_scmi/clock.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/firmware/arm_scmi/clock.c b/drivers/firmware/arm_scmi/clock.c
index 4645677d86f1..a45678cd9b74 100644
--- a/drivers/firmware/arm_scmi/clock.c
+++ b/drivers/firmware/arm_scmi/clock.c
@@ -202,7 +202,8 @@ scmi_clock_describe_rates_get(const struct scmi_handle *handle, u32 clk_id,
 
 	if (rate_discrete && rate) {
 		clk->list.num_rates = tot_rate_cnt;
-		sort(rate, tot_rate_cnt, sizeof(*rate), rate_cmp_func, NULL);
+		sort(clk->list.rates, tot_rate_cnt, sizeof(*rate),
+		     rate_cmp_func, NULL);
 	}
 
 	clk->rate_discrete = rate_discrete;
-- 
Gitee


From f274e8812df479b375a9cda1d455f35f29353333 Mon Sep 17 00:00:00 2001
From: Kyle Copperfield <kmcopper@danwin1210.me>
Date: Tue, 19 Jul 2022 16:30:22 +0800
Subject: [PATCH 383/674] media: rockchip/rga: do proper error checking in
 probe

stable inclusion
from stable-v5.10.112
commit af12dd71235cf9f06ed3e98b1c28f55951effa6d
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=af12dd71235cf9f06ed3e98b1c28f55951effa6d

--------------------------------

[ Upstream commit 6150f276073a1480030242a7e006a89e161d6cd6 ]

The latest fix for probe error handling contained a typo that causes
probing to fail with the following message:

  rockchip-rga: probe of ff680000.rga failed with error -12

This patch fixes the typo.

Fixes: e58430e1d4fd (media: rockchip/rga: fix error handling in probe)
Reviewed-by: Dragan Simic <dragan.simic@gmail.com>
Signed-off-by: Kyle Copperfield <kmcopper@danwin1210.me>
Reviewed-by: Kieran Bingham <kieran.bingham+renesas@ideasonboard.com>
Reviewed-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/media/platform/rockchip/rga/rga.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/media/platform/rockchip/rga/rga.c b/drivers/media/platform/rockchip/rga/rga.c
index 6759091b15e0..d99ea8973b67 100644
--- a/drivers/media/platform/rockchip/rga/rga.c
+++ b/drivers/media/platform/rockchip/rga/rga.c
@@ -895,7 +895,7 @@ static int rga_probe(struct platform_device *pdev)
 	}
 	rga->dst_mmu_pages =
 		(unsigned int *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 3);
-	if (rga->dst_mmu_pages) {
+	if (!rga->dst_mmu_pages) {
 		ret = -ENOMEM;
 		goto free_src_pages;
 	}
-- 
Gitee


From d22a8440aff6344482a5c03c289138a750cdd2ff Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 19 Jul 2022 16:30:23 +0800
Subject: [PATCH 384/674] SUNRPC: Fix the svc_deferred_event trace class

stable inclusion
from stable-v5.10.112
commit 85ee17ca21cf92989e8c923e3ea4514c291e9d38
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=85ee17ca21cf92989e8c923e3ea4514c291e9d38

--------------------------------

[ Upstream commit 4d5004451ab2218eab94a30e1841462c9316ba19 ]

Fix a NULL deref crash that occurs when an svc_rqst is deferred
while the sunrpc tracing subsystem is enabled. svc_revisit() sets
dr->xprt to NULL, so it can't be relied upon in the tracepoint to
provide the remote's address.

Unfortunately we can't revert the "svc_deferred_class" hunk in
commit ece200ddd54b ("sunrpc: Save remote presentation address in
svc_xprt for trace events") because there is now a specific check
of event format specifiers for unsafe dereferences. The warning
that check emits is:

  event svc_defer_recv has unsafe dereference of argument 1

A "%pISpc" format specifier with a "struct sockaddr *" is indeed
flagged by this check.

Instead, take the brute-force approach used by the svcrdma_qp_error
tracepoint. Convert the dr::addr field into a presentation address
in the TP_fast_assign() arm of the trace event, and store that as
a string. This fix can be backported to -stable kernels.

In the meantime, commit c6ced22997ad ("tracing: Update print fmt
check to handle new __get_sockaddr() macro") is now in v5.18, so
this wonky fix can be replaced with __sockaddr() and friends
properly during the v5.19 merge window.

Fixes: ece200ddd54b ("sunrpc: Save remote presentation address in svc_xprt for trace events")
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 include/trace/events/sunrpc.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h
index 9fe6cdcf2222..8220369ee610 100644
--- a/include/trace/events/sunrpc.h
+++ b/include/trace/events/sunrpc.h
@@ -1873,17 +1873,18 @@ DECLARE_EVENT_CLASS(svc_deferred_event,
 	TP_STRUCT__entry(
 		__field(const void *, dr)
 		__field(u32, xid)
-		__string(addr, dr->xprt->xpt_remotebuf)
+		__array(__u8, addr, INET6_ADDRSTRLEN + 10)
 	),
 
 	TP_fast_assign(
 		__entry->dr = dr;
 		__entry->xid = be32_to_cpu(*(__be32 *)(dr->args +
 						       (dr->xprt_hlen>>2)));
-		__assign_str(addr, dr->xprt->xpt_remotebuf);
+		snprintf(__entry->addr, sizeof(__entry->addr) - 1,
+			 "%pISpc", (struct sockaddr *)&dr->addr);
 	),
 
-	TP_printk("addr=%s dr=%p xid=0x%08x", __get_str(addr), __entry->dr,
+	TP_printk("addr=%s dr=%p xid=0x%08x", __entry->addr, __entry->dr,
 		__entry->xid)
 );
 
-- 
Gitee


From 871e0a3789244674e4db0411f059783ab1a0c272 Mon Sep 17 00:00:00 2001
From: Guillaume Nault <gnault@redhat.com>
Date: Tue, 19 Jul 2022 16:30:24 +0800
Subject: [PATCH 385/674] veth: Ensure eth header is in skb's linear part

stable inclusion
from stable-v5.10.112
commit d67c900f1947d64ba8a64f693504bcaab8d9000c
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=d67c900f1947d64ba8a64f693504bcaab8d9000c

--------------------------------

[ Upstream commit 726e2c5929de841fdcef4e2bf995680688ae1b87 ]

After feeding a decapsulated packet to a veth device with act_mirred,
skb_headlen() may be 0. But veth_xmit() calls __dev_forward_skb(),
which expects at least ETH_HLEN byte of linear data (as
__dev_forward_skb2() calls eth_type_trans(), which pulls ETH_HLEN bytes
unconditionally).

Use pskb_may_pull() to ensure veth_xmit() respects this constraint.

kernel BUG at include/linux/skbuff.h:2328!
RIP: 0010:eth_type_trans+0xcf/0x140
Call Trace:
 <IRQ>
 __dev_forward_skb2+0xe3/0x160
 veth_xmit+0x6e/0x250 [veth]
 dev_hard_start_xmit+0xc7/0x200
 __dev_queue_xmit+0x47f/0x520
 ? skb_ensure_writable+0x85/0xa0
 ? skb_mpls_pop+0x98/0x1c0
 tcf_mirred_act+0x442/0x47e [act_mirred]
 tcf_action_exec+0x86/0x140
 fl_classify+0x1d8/0x1e0 [cls_flower]
 ? dma_pte_clear_level+0x129/0x1a0
 ? dma_pte_clear_level+0x129/0x1a0
 ? prb_fill_curr_block+0x2f/0xc0
 ? skb_copy_bits+0x11a/0x220
 __tcf_classify+0x58/0x110
 tcf_classify_ingress+0x6b/0x140
 __netif_receive_skb_core.constprop.0+0x47d/0xfd0
 ? __iommu_dma_unmap_swiotlb+0x44/0x90
 __netif_receive_skb_one_core+0x3d/0xa0
 netif_receive_skb+0x116/0x170
 be_process_rx+0x22f/0x330 [be2net]
 be_poll+0x13c/0x370 [be2net]
 __napi_poll+0x2a/0x170
 net_rx_action+0x22f/0x2f0
 __do_softirq+0xca/0x2a8
 __irq_exit_rcu+0xc1/0xe0
 common_interrupt+0x83/0xa0

Fixes: e314dbdc1c0d ("[NET]: Virtual ethernet device driver.")
Signed-off-by: Guillaume Nault <gnault@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/net/veth.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index f7e3eb309a26..5be8ed910553 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -292,7 +292,7 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	rcu_read_lock();
 	rcv = rcu_dereference(priv->peer);
-	if (unlikely(!rcv)) {
+	if (unlikely(!rcv) || !pskb_may_pull(skb, ETH_HLEN)) {
 		kfree_skb(skb);
 		goto drop;
 	}
-- 
Gitee


From 959e7afb724417d91eb14dcf2944560c29ffc66a Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 19 Jul 2022 16:30:25 +0800
Subject: [PATCH 386/674] gpiolib: acpi: use correct format characters

stable inclusion
from stable-v5.10.112
commit 9709c8b5cdc88b1adb77acdbfd6e8a3f942d9af5
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=9709c8b5cdc88b1adb77acdbfd6e8a3f942d9af5

--------------------------------

[ Upstream commit 213d266ebfb1621aab79cfe63388facc520a1381 ]

When compiling with -Wformat, clang emits the following warning:

  gpiolib-acpi.c:393:4: warning: format specifies type 'unsigned char' but the argument has type 'int' [-Wformat]
                        pin);
                        ^~~

So warning that '%hhX' is paired with an 'int' is all just completely
mindless and wrong. Sadly, I can see a different bogus warning reason
why people would want to use '%02hhX'.

Again, the *sane* thing from a human perspective is to use '%02X. But
if the compiler doesn't do any range analysis at all, it could decide
that "Oh, that print format could need up to 8 bytes of space in the
result". Using '%02hhX' would cut that down to two.

And since we use

        char ev_name[5];

and currently use "_%c%02hhX" as the format string, even a compiler
that doesn't notice that "pin <= 255" test that guards this all will
go "OK, that's at most 4 bytes and the final NUL termination, so it's
fine".

While a compiler - like gcc - that only sees that the original source
of the 'pin' value is a 'unsigned short' array, and then doesn't take
the "pin <= 255" into account, will warn like this:

  gpiolib-acpi.c: In function 'acpi_gpiochip_request_interrupt':
  gpiolib-acpi.c:206:24: warning: '%02X' directive writing between 2 and 4 bytes into a region of size 3 [-Wformat-overflow=]
       sprintf(ev_name, "_%c%02X",
                            ^~~~
  gpiolib-acpi.c:206:20: note: directive argument in the range [0, 65535]

because gcc isn't being very good at that argument range analysis either.

In other words, the original use of 'hhx' was bogus to begin with, and
due to *another* compiler warning being bad, and we had that bad code
being written back in 2016 to work around _that_ compiler warning
(commit e40a3ae1f794: "gpio: acpi: work around false-positive
-Wstring-overflow warning").

Sadly, two different bad compiler warnings together does not make for
one good one.

It just makes for even more pain.

End result: I think the simplest and cleanest option is simply the
proposed change which undoes that '%hhX' change for gcc, and replaces
it with just using a slightly bigger stack allocation. It's not like
a 5-byte allocation is in any way likely to have saved any actual stack,
since all the other variables in that function are 'int' or bigger.

False-positive compiler warnings really do make people write worse
code, and that's a problem. But on a scale of bad code, I feel that
extending the buffer trivially is better than adding a pointless cast
that literally makes no sense.

At least in this case the end result isn't unreadable or buggy. We've
had several cases of bad compiler warnings that caused changes that
were actually horrendously wrong.

Fixes: e40a3ae1f794 ("gpio: acpi: work around false-positive -Wstring-overflow warning")
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/gpio/gpiolib-acpi.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpio/gpiolib-acpi.c b/drivers/gpio/gpiolib-acpi.c
index 55e4f402ec8b..44ee319da1b3 100644
--- a/drivers/gpio/gpiolib-acpi.c
+++ b/drivers/gpio/gpiolib-acpi.c
@@ -276,8 +276,8 @@ static acpi_status acpi_gpiochip_alloc_event(struct acpi_resource *ares,
 	pin = agpio->pin_table[0];
 
 	if (pin <= 255) {
-		char ev_name[5];
-		sprintf(ev_name, "_%c%02hhX",
+		char ev_name[8];
+		sprintf(ev_name, "_%c%02X",
 			agpio->triggering == ACPI_EDGE_SENSITIVE ? 'E' : 'L',
 			pin);
 		if (ACPI_SUCCESS(acpi_get_handle(handle, ev_name, &evt_handle)))
-- 
Gitee


From 90c40a7f629da238f092827e977ded885831507c Mon Sep 17 00:00:00 2001
From: Calvin Johnson <calvin.johnson@oss.nxp.com>
Date: Tue, 19 Jul 2022 16:30:26 +0800
Subject: [PATCH 387/674] net: mdio: Alphabetically sort header inclusion

stable inclusion
from stable-v5.10.112
commit 43e58e119a2b913d3b7576455b0b1f56b569ca0b
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=43e58e119a2b913d3b7576455b0b1f56b569ca0b

--------------------------------

[ Upstream commit 1bf343665057312167750509b0c48e8299293ac5 ]

Alphabetically sort header inclusion

Signed-off-by: Calvin Johnson <calvin.johnson@oss.nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/net/mdio/mdio-bcm-unimac.c      | 16 +++++++---------
 drivers/net/mdio/mdio-bitbang.c         |  4 ++--
 drivers/net/mdio/mdio-cavium.c          |  2 +-
 drivers/net/mdio/mdio-gpio.c            | 10 +++++-----
 drivers/net/mdio/mdio-ipq4019.c         |  4 ++--
 drivers/net/mdio/mdio-ipq8064.c         |  4 ++--
 drivers/net/mdio/mdio-mscc-miim.c       |  8 ++++----
 drivers/net/mdio/mdio-mux-bcm-iproc.c   | 10 +++++-----
 drivers/net/mdio/mdio-mux-gpio.c        |  8 ++++----
 drivers/net/mdio/mdio-mux-mmioreg.c     |  6 +++---
 drivers/net/mdio/mdio-mux-multiplexer.c |  2 +-
 drivers/net/mdio/mdio-mux.c             |  6 +++---
 drivers/net/mdio/mdio-octeon.c          |  8 ++++----
 drivers/net/mdio/mdio-thunder.c         | 10 +++++-----
 drivers/net/mdio/mdio-xgene.c           |  6 +++---
 drivers/net/mdio/of_mdio.c              | 10 +++++-----
 16 files changed, 56 insertions(+), 58 deletions(-)

diff --git a/drivers/net/mdio/mdio-bcm-unimac.c b/drivers/net/mdio/mdio-bcm-unimac.c
index fbd36891ee64..5d171e7f118d 100644
--- a/drivers/net/mdio/mdio-bcm-unimac.c
+++ b/drivers/net/mdio/mdio-bcm-unimac.c
@@ -5,20 +5,18 @@
  * Copyright (C) 2014-2017 Broadcom
  */
 
+#include <linux/clk.h>
+#include <linux/delay.h>
+#include <linux/io.h>
 #include <linux/kernel.h>
-#include <linux/phy.h>
-#include <linux/platform_device.h>
-#include <linux/sched.h>
 #include <linux/module.h>
-#include <linux/io.h>
-#include <linux/delay.h>
-#include <linux/clk.h>
-
 #include <linux/of.h>
-#include <linux/of_platform.h>
 #include <linux/of_mdio.h>
-
+#include <linux/of_platform.h>
+#include <linux/phy.h>
 #include <linux/platform_data/mdio-bcm-unimac.h>
+#include <linux/platform_device.h>
+#include <linux/sched.h>
 
 #define MDIO_CMD		0x00
 #define  MDIO_START_BUSY	(1 << 29)
diff --git a/drivers/net/mdio/mdio-bitbang.c b/drivers/net/mdio/mdio-bitbang.c
index 5136275c8e73..99588192cc78 100644
--- a/drivers/net/mdio/mdio-bitbang.c
+++ b/drivers/net/mdio/mdio-bitbang.c
@@ -14,10 +14,10 @@
  * Vitaly Bordug <vbordug@ru.mvista.com>
  */
 
-#include <linux/module.h>
+#include <linux/delay.h>
 #include <linux/mdio-bitbang.h>
+#include <linux/module.h>
 #include <linux/types.h>
-#include <linux/delay.h>
 
 #define MDIO_READ 2
 #define MDIO_WRITE 1
diff --git a/drivers/net/mdio/mdio-cavium.c b/drivers/net/mdio/mdio-cavium.c
index 1afd6fc1a351..95ce274c1be1 100644
--- a/drivers/net/mdio/mdio-cavium.c
+++ b/drivers/net/mdio/mdio-cavium.c
@@ -4,9 +4,9 @@
  */
 
 #include <linux/delay.h>
+#include <linux/io.h>
 #include <linux/module.h>
 #include <linux/phy.h>
-#include <linux/io.h>
 
 #include "mdio-cavium.h"
 
diff --git a/drivers/net/mdio/mdio-gpio.c b/drivers/net/mdio/mdio-gpio.c
index 1b00235d7dc5..56c8f914f893 100644
--- a/drivers/net/mdio/mdio-gpio.c
+++ b/drivers/net/mdio/mdio-gpio.c
@@ -17,15 +17,15 @@
  * Vitaly Bordug <vbordug@ru.mvista.com>
  */
 
-#include <linux/module.h>
-#include <linux/slab.h>
+#include <linux/gpio/consumer.h>
 #include <linux/interrupt.h>
-#include <linux/platform_device.h>
-#include <linux/platform_data/mdio-gpio.h>
 #include <linux/mdio-bitbang.h>
 #include <linux/mdio-gpio.h>
-#include <linux/gpio/consumer.h>
+#include <linux/module.h>
 #include <linux/of_mdio.h>
+#include <linux/platform_data/mdio-gpio.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
 
 struct mdio_gpio_info {
 	struct mdiobb_ctrl ctrl;
diff --git a/drivers/net/mdio/mdio-ipq4019.c b/drivers/net/mdio/mdio-ipq4019.c
index 25c25ea6da66..9cd71d896963 100644
--- a/drivers/net/mdio/mdio-ipq4019.c
+++ b/drivers/net/mdio/mdio-ipq4019.c
@@ -3,10 +3,10 @@
 /* Copyright (c) 2020 Sartura Ltd. */
 
 #include <linux/delay.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
 #include <linux/io.h>
 #include <linux/iopoll.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
 #include <linux/of_address.h>
 #include <linux/of_mdio.h>
 #include <linux/phy.h>
diff --git a/drivers/net/mdio/mdio-ipq8064.c b/drivers/net/mdio/mdio-ipq8064.c
index f0a6bfa61645..49d4e9aa30bb 100644
--- a/drivers/net/mdio/mdio-ipq8064.c
+++ b/drivers/net/mdio/mdio-ipq8064.c
@@ -7,12 +7,12 @@
 
 #include <linux/delay.h>
 #include <linux/kernel.h>
+#include <linux/mfd/syscon.h>
 #include <linux/module.h>
-#include <linux/regmap.h>
 #include <linux/of_mdio.h>
 #include <linux/of_address.h>
 #include <linux/platform_device.h>
-#include <linux/mfd/syscon.h>
+#include <linux/regmap.h>
 
 /* MII address register definitions */
 #define MII_ADDR_REG_ADDR                       0x10
diff --git a/drivers/net/mdio/mdio-mscc-miim.c b/drivers/net/mdio/mdio-mscc-miim.c
index 1c9232fca1e2..037649bef92e 100644
--- a/drivers/net/mdio/mdio-mscc-miim.c
+++ b/drivers/net/mdio/mdio-mscc-miim.c
@@ -6,14 +6,14 @@
  * Copyright (c) 2017 Microsemi Corporation
  */
 
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/phy.h>
-#include <linux/platform_device.h>
 #include <linux/bitops.h>
 #include <linux/io.h>
 #include <linux/iopoll.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
 #include <linux/of_mdio.h>
+#include <linux/phy.h>
+#include <linux/platform_device.h>
 
 #define MSCC_MIIM_REG_STATUS		0x0
 #define		MSCC_MIIM_STATUS_STAT_PENDING	BIT(2)
diff --git a/drivers/net/mdio/mdio-mux-bcm-iproc.c b/drivers/net/mdio/mdio-mux-bcm-iproc.c
index 42fb5f166136..641cfa41f492 100644
--- a/drivers/net/mdio/mdio-mux-bcm-iproc.c
+++ b/drivers/net/mdio/mdio-mux-bcm-iproc.c
@@ -3,14 +3,14 @@
  * Copyright 2016 Broadcom
  */
 #include <linux/clk.h>
-#include <linux/platform_device.h>
+#include <linux/delay.h>
 #include <linux/device.h>
-#include <linux/of_mdio.h>
+#include <linux/iopoll.h>
+#include <linux/mdio-mux.h>
 #include <linux/module.h>
+#include <linux/of_mdio.h>
 #include <linux/phy.h>
-#include <linux/mdio-mux.h>
-#include <linux/delay.h>
-#include <linux/iopoll.h>
+#include <linux/platform_device.h>
 
 #define MDIO_RATE_ADJ_EXT_OFFSET	0x000
 #define MDIO_RATE_ADJ_INT_OFFSET	0x004
diff --git a/drivers/net/mdio/mdio-mux-gpio.c b/drivers/net/mdio/mdio-mux-gpio.c
index 10a758fdc9e6..3c7f16f06b45 100644
--- a/drivers/net/mdio/mdio-mux-gpio.c
+++ b/drivers/net/mdio/mdio-mux-gpio.c
@@ -3,13 +3,13 @@
  * Copyright (C) 2011, 2012 Cavium, Inc.
  */
 
-#include <linux/platform_device.h>
 #include <linux/device.h>
-#include <linux/of_mdio.h>
+#include <linux/gpio/consumer.h>
+#include <linux/mdio-mux.h>
 #include <linux/module.h>
+#include <linux/of_mdio.h>
 #include <linux/phy.h>
-#include <linux/mdio-mux.h>
-#include <linux/gpio/consumer.h>
+#include <linux/platform_device.h>
 
 #define DRV_VERSION "1.1"
 #define DRV_DESCRIPTION "GPIO controlled MDIO bus multiplexer driver"
diff --git a/drivers/net/mdio/mdio-mux-mmioreg.c b/drivers/net/mdio/mdio-mux-mmioreg.c
index d1a8780e24d8..c02fb2a067ee 100644
--- a/drivers/net/mdio/mdio-mux-mmioreg.c
+++ b/drivers/net/mdio/mdio-mux-mmioreg.c
@@ -7,13 +7,13 @@
  * Copyright 2012 Freescale Semiconductor, Inc.
  */
 
-#include <linux/platform_device.h>
 #include <linux/device.h>
+#include <linux/mdio-mux.h>
+#include <linux/module.h>
 #include <linux/of_address.h>
 #include <linux/of_mdio.h>
-#include <linux/module.h>
 #include <linux/phy.h>
-#include <linux/mdio-mux.h>
+#include <linux/platform_device.h>
 
 struct mdio_mux_mmioreg_state {
 	void *mux_handle;
diff --git a/drivers/net/mdio/mdio-mux-multiplexer.c b/drivers/net/mdio/mdio-mux-multiplexer.c
index d6564381aa3e..527acfc3c045 100644
--- a/drivers/net/mdio/mdio-mux-multiplexer.c
+++ b/drivers/net/mdio/mdio-mux-multiplexer.c
@@ -4,10 +4,10 @@
  * Copyright 2019 NXP
  */
 
-#include <linux/platform_device.h>
 #include <linux/mdio-mux.h>
 #include <linux/module.h>
 #include <linux/mux/consumer.h>
+#include <linux/platform_device.h>
 
 struct mdio_mux_multiplexer_state {
 	struct mux_control *muxc;
diff --git a/drivers/net/mdio/mdio-mux.c b/drivers/net/mdio/mdio-mux.c
index ccb3ee704eb1..3dde0c2b3e09 100644
--- a/drivers/net/mdio/mdio-mux.c
+++ b/drivers/net/mdio/mdio-mux.c
@@ -3,12 +3,12 @@
  * Copyright (C) 2011, 2012 Cavium, Inc.
  */
 
-#include <linux/platform_device.h>
-#include <linux/mdio-mux.h>
-#include <linux/of_mdio.h>
 #include <linux/device.h>
+#include <linux/mdio-mux.h>
 #include <linux/module.h>
+#include <linux/of_mdio.h>
 #include <linux/phy.h>
+#include <linux/platform_device.h>
 
 #define DRV_DESCRIPTION "MDIO bus multiplexer driver"
 
diff --git a/drivers/net/mdio/mdio-octeon.c b/drivers/net/mdio/mdio-octeon.c
index 6faf39314ac9..e096e68ac667 100644
--- a/drivers/net/mdio/mdio-octeon.c
+++ b/drivers/net/mdio/mdio-octeon.c
@@ -3,13 +3,13 @@
  * Copyright (C) 2009-2015 Cavium, Inc.
  */
 
-#include <linux/platform_device.h>
+#include <linux/gfp.h>
+#include <linux/io.h>
+#include <linux/module.h>
 #include <linux/of_address.h>
 #include <linux/of_mdio.h>
-#include <linux/module.h>
-#include <linux/gfp.h>
 #include <linux/phy.h>
-#include <linux/io.h>
+#include <linux/platform_device.h>
 
 #include "mdio-cavium.h"
 
diff --git a/drivers/net/mdio/mdio-thunder.c b/drivers/net/mdio/mdio-thunder.c
index dd7430c998a2..822d2cdd2f35 100644
--- a/drivers/net/mdio/mdio-thunder.c
+++ b/drivers/net/mdio/mdio-thunder.c
@@ -3,14 +3,14 @@
  * Copyright (C) 2009-2016 Cavium, Inc.
  */
 
-#include <linux/of_address.h>
-#include <linux/of_mdio.h>
-#include <linux/module.h>
+#include <linux/acpi.h>
 #include <linux/gfp.h>
-#include <linux/phy.h>
 #include <linux/io.h>
-#include <linux/acpi.h>
+#include <linux/module.h>
+#include <linux/of_address.h>
+#include <linux/of_mdio.h>
 #include <linux/pci.h>
+#include <linux/phy.h>
 
 #include "mdio-cavium.h"
 
diff --git a/drivers/net/mdio/mdio-xgene.c b/drivers/net/mdio/mdio-xgene.c
index 461207cdf5d6..7ab4e26db08c 100644
--- a/drivers/net/mdio/mdio-xgene.c
+++ b/drivers/net/mdio/mdio-xgene.c
@@ -13,11 +13,11 @@
 #include <linux/io.h>
 #include <linux/mdio/mdio-xgene.h>
 #include <linux/module.h>
-#include <linux/of_platform.h>
-#include <linux/of_net.h>
 #include <linux/of_mdio.h>
-#include <linux/prefetch.h>
+#include <linux/of_net.h>
+#include <linux/of_platform.h>
 #include <linux/phy.h>
+#include <linux/prefetch.h>
 #include <net/ip.h>
 
 static bool xgene_mdio_status;
diff --git a/drivers/net/mdio/of_mdio.c b/drivers/net/mdio/of_mdio.c
index 4daf94bb56a5..ea0bf13e8ac3 100644
--- a/drivers/net/mdio/of_mdio.c
+++ b/drivers/net/mdio/of_mdio.c
@@ -8,17 +8,17 @@
  * out of the OpenFirmware device tree and using it to populate an mii_bus.
  */
 
-#include <linux/kernel.h>
 #include <linux/device.h>
-#include <linux/netdevice.h>
 #include <linux/err.h>
-#include <linux/phy.h>
-#include <linux/phy_fixed.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
 #include <linux/of.h>
 #include <linux/of_irq.h>
 #include <linux/of_mdio.h>
 #include <linux/of_net.h>
-#include <linux/module.h>
+#include <linux/phy.h>
+#include <linux/phy_fixed.h>
 
 #define DEFAULT_GPIO_RESET_DELAY	10	/* in microseconds */
 
-- 
Gitee


From 7245b48fdf9bfaa2208cd9cc0a52c8ba9a83e9bc Mon Sep 17 00:00:00 2001
From: Vadim Pasternak <vadimp@nvidia.com>
Date: Tue, 19 Jul 2022 16:30:27 +0800
Subject: [PATCH 388/674] mlxsw: i2c: Fix initialization error flow

stable inclusion
from stable-v5.10.112
commit 7a7cf84148416d93d5904eb9a7cf5499ab852b84
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=7a7cf84148416d93d5904eb9a7cf5499ab852b84

--------------------------------

[ Upstream commit d452088cdfd5a4ad9d96d847d2273fe958d6339b ]

Add mutex_destroy() call in driver initialization error flow.

Fixes: 6882b0aee180f ("mlxsw: Introduce support for I2C bus")
Signed-off-by: Vadim Pasternak <vadimp@nvidia.com>
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Link: https://lore.kernel.org/r/20220407070703.2421076-1-idosch@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/net/ethernet/mellanox/mlxsw/i2c.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/i2c.c b/drivers/net/ethernet/mellanox/mlxsw/i2c.c
index 939b692ffc33..ce843ea91464 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/i2c.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/i2c.c
@@ -650,6 +650,7 @@ static int mlxsw_i2c_probe(struct i2c_client *client,
 	return 0;
 
 errout:
+	mutex_destroy(&mlxsw_i2c->cmd.lock);
 	i2c_set_clientdata(client, NULL);
 
 	return err;
-- 
Gitee


From b8be64b53ebbf0643cfb6bb46bc207290acbefe3 Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Tue, 19 Jul 2022 16:30:28 +0800
Subject: [PATCH 389/674] net/sched: fix initialization order when updating
 chain 0 head

stable inclusion
from stable-v5.10.112
commit f2cc341fcc42a2507d9c36e94317f5e0834fc5fd
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f2cc341fcc42a2507d9c36e94317f5e0834fc5fd

--------------------------------

[ Upstream commit e65812fd22eba32f11abe28cb377cbd64cfb1ba0 ]

Currently, when inserting a new filter that needs to sit at the head
of chain 0, it will first update the heads pointer on all devices using
the (shared) block, and only then complete the initialization of the new
element so that it has a "next" element.

This can lead to a situation that the chain 0 head is propagated to
another CPU before the "next" initialization is done. When this race
condition is triggered, packets being matched on that CPU will simply
miss all other filters, and will flow through the stack as if there were
no other filters installed. If the system is using OVS + TC, such
packets will get handled by vswitchd via upcall, which results in much
higher latency and reordering. For other applications it may result in
packet drops.

This is reproducible with a tc only setup, but it varies from system to
system. It could be reproduced with a shared block amongst 10 veth
tunnels, and an ingress filter mirroring packets to another veth.
That's because using the last added veth tunnel to the shared block to
do the actual traffic, it makes the race window bigger and easier to
trigger.

The fix is rather simple, to just initialize the next pointer of the new
filter instance (tp) before propagating the head change.

The fixes tag is pointing to the original code though this issue should
only be observed when using it unlocked.

Fixes: 2190d1d0944f ("net: sched: introduce helpers to work with filter chains")
Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: Vlad Buslov <vladbu@nvidia.com>
Reviewed-by: Davide Caratti <dcaratti@redhat.com>
Link: https://lore.kernel.org/r/b97d5f4eaffeeb9d058155bcab63347527261abf.1649341369.git.marcelo.leitner@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 net/sched/cls_api.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 9a789a057a74..b8ffb7e4f696 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -1656,10 +1656,10 @@ static int tcf_chain_tp_insert(struct tcf_chain *chain,
 	if (chain->flushing)
 		return -EAGAIN;
 
+	RCU_INIT_POINTER(tp->next, tcf_chain_tp_prev(chain, chain_info));
 	if (*chain_info->pprev == chain->filter_chain)
 		tcf_chain0_head_change(chain, tp);
 	tcf_proto_get(tp);
-	RCU_INIT_POINTER(tp->next, tcf_chain_tp_prev(chain, chain_info));
 	rcu_assign_pointer(*chain_info->pprev, tp);
 
 	return 0;
-- 
Gitee


From 4578de3d669952439941ad0ba4fc4b50fb1e85ba Mon Sep 17 00:00:00 2001
From: Michael Walle <michael@walle.cc>
Date: Tue, 19 Jul 2022 16:30:29 +0800
Subject: [PATCH 390/674] net: dsa: felix: suppress -EPROBE_DEFER errors

stable inclusion
from stable-v5.10.112
commit 2ad9d890d850db6c55ec2d9b5b3cbb7ca5178f3f
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=2ad9d890d850db6c55ec2d9b5b3cbb7ca5178f3f

--------------------------------

[ Upstream commit e6934e4048c91502efcb21da92b7ae37cd8fa741 ]

The DSA master might not have been probed yet in which case the probe of
the felix switch fails with -EPROBE_DEFER:
[    4.435305] mscc_felix 0000:00:00.5: Failed to register DSA switch: -517

It is not an error. Use dev_err_probe() to demote this particular error
to a debug message.

Fixes: 56051948773e ("net: dsa: ocelot: add driver for Felix switch family")
Signed-off-by: Michael Walle <michael@walle.cc>
Reviewed-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Link: https://lore.kernel.org/r/20220408101521.281886-1-michael@walle.cc
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/net/dsa/ocelot/felix_vsc9959.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/dsa/ocelot/felix_vsc9959.c b/drivers/net/dsa/ocelot/felix_vsc9959.c
index cd8d9b0e0edb..c96dfc11aa6f 100644
--- a/drivers/net/dsa/ocelot/felix_vsc9959.c
+++ b/drivers/net/dsa/ocelot/felix_vsc9959.c
@@ -1466,7 +1466,7 @@ static int felix_pci_probe(struct pci_dev *pdev,
 
 	err = dsa_register_switch(ds);
 	if (err) {
-		dev_err(&pdev->dev, "Failed to register DSA switch: %d\n", err);
+		dev_err_probe(&pdev->dev, err, "Failed to register DSA switch\n");
 		goto err_register_ds;
 	}
 
-- 
Gitee


From 29da03f5d2a00383d7da4b7a600ac824f4e27f31 Mon Sep 17 00:00:00 2001
From: Dinh Nguyen <dinguyen@kernel.org>
Date: Tue, 19 Jul 2022 16:30:30 +0800
Subject: [PATCH 391/674] net: ethernet: stmmac: fix altr_tse_pcs function when
 using a fixed-link

stable inclusion
from stable-v5.10.112
commit 08d5e3e954537931c8da7428034808d202e98299
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=08d5e3e954537931c8da7428034808d202e98299

--------------------------------

[ Upstream commit a6aaa00324240967272b451bfa772547bd576ee6 ]

When using a fixed-link, the altr_tse_pcs driver crashes
due to null-pointer dereference as no phy_device is provided to
tse_pcs_fix_mac_speed function. Fix this by adding a check for
phy_dev before calling the tse_pcs_fix_mac_speed() function.

Also clean up the tse_pcs_fix_mac_speed function a bit. There is
no need to check for splitter_base and sgmii_adapter_base
because the driver will fail if these 2 variables are not
derived from the device tree.

Fixes: fb3bbdb85989 ("net: ethernet: Add TSE PCS support to dwmac-socfpga")
Signed-off-by: Dinh Nguyen <dinguyen@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/net/ethernet/stmicro/stmmac/altr_tse_pcs.c  |  8 --------
 drivers/net/ethernet/stmicro/stmmac/altr_tse_pcs.h  |  4 ++++
 drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c | 13 +++++--------
 3 files changed, 9 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/altr_tse_pcs.c b/drivers/net/ethernet/stmicro/stmmac/altr_tse_pcs.c
index cd478d2cd871..00f6d347eaf7 100644
--- a/drivers/net/ethernet/stmicro/stmmac/altr_tse_pcs.c
+++ b/drivers/net/ethernet/stmicro/stmmac/altr_tse_pcs.c
@@ -57,10 +57,6 @@
 #define TSE_PCS_USE_SGMII_ENA				BIT(0)
 #define TSE_PCS_IF_USE_SGMII				0x03
 
-#define SGMII_ADAPTER_CTRL_REG				0x00
-#define SGMII_ADAPTER_DISABLE				0x0001
-#define SGMII_ADAPTER_ENABLE				0x0000
-
 #define AUTONEGO_LINK_TIMER				20
 
 static int tse_pcs_reset(void __iomem *base, struct tse_pcs *pcs)
@@ -202,12 +198,8 @@ void tse_pcs_fix_mac_speed(struct tse_pcs *pcs, struct phy_device *phy_dev,
 			   unsigned int speed)
 {
 	void __iomem *tse_pcs_base = pcs->tse_pcs_base;
-	void __iomem *sgmii_adapter_base = pcs->sgmii_adapter_base;
 	u32 val;
 
-	writew(SGMII_ADAPTER_ENABLE,
-	       sgmii_adapter_base + SGMII_ADAPTER_CTRL_REG);
-
 	pcs->autoneg = phy_dev->autoneg;
 
 	if (phy_dev->autoneg == AUTONEG_ENABLE) {
diff --git a/drivers/net/ethernet/stmicro/stmmac/altr_tse_pcs.h b/drivers/net/ethernet/stmicro/stmmac/altr_tse_pcs.h
index 442812c0a4bd..694ac25ef426 100644
--- a/drivers/net/ethernet/stmicro/stmmac/altr_tse_pcs.h
+++ b/drivers/net/ethernet/stmicro/stmmac/altr_tse_pcs.h
@@ -10,6 +10,10 @@
 #include <linux/phy.h>
 #include <linux/timer.h>
 
+#define SGMII_ADAPTER_CTRL_REG		0x00
+#define SGMII_ADAPTER_ENABLE		0x0000
+#define SGMII_ADAPTER_DISABLE		0x0001
+
 struct tse_pcs {
 	struct device *dev;
 	void __iomem *tse_pcs_base;
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c
index f37b6d57b2fe..8bb0106cb7ea 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c
@@ -18,9 +18,6 @@
 
 #include "altr_tse_pcs.h"
 
-#define SGMII_ADAPTER_CTRL_REG                          0x00
-#define SGMII_ADAPTER_DISABLE                           0x0001
-
 #define SYSMGR_EMACGRP_CTRL_PHYSEL_ENUM_GMII_MII 0x0
 #define SYSMGR_EMACGRP_CTRL_PHYSEL_ENUM_RGMII 0x1
 #define SYSMGR_EMACGRP_CTRL_PHYSEL_ENUM_RMII 0x2
@@ -62,16 +59,14 @@ static void socfpga_dwmac_fix_mac_speed(void *priv, unsigned int speed)
 {
 	struct socfpga_dwmac *dwmac = (struct socfpga_dwmac *)priv;
 	void __iomem *splitter_base = dwmac->splitter_base;
-	void __iomem *tse_pcs_base = dwmac->pcs.tse_pcs_base;
 	void __iomem *sgmii_adapter_base = dwmac->pcs.sgmii_adapter_base;
 	struct device *dev = dwmac->dev;
 	struct net_device *ndev = dev_get_drvdata(dev);
 	struct phy_device *phy_dev = ndev->phydev;
 	u32 val;
 
-	if ((tse_pcs_base) && (sgmii_adapter_base))
-		writew(SGMII_ADAPTER_DISABLE,
-		       sgmii_adapter_base + SGMII_ADAPTER_CTRL_REG);
+	writew(SGMII_ADAPTER_DISABLE,
+	       sgmii_adapter_base + SGMII_ADAPTER_CTRL_REG);
 
 	if (splitter_base) {
 		val = readl(splitter_base + EMAC_SPLITTER_CTRL_REG);
@@ -93,7 +88,9 @@ static void socfpga_dwmac_fix_mac_speed(void *priv, unsigned int speed)
 		writel(val, splitter_base + EMAC_SPLITTER_CTRL_REG);
 	}
 
-	if (tse_pcs_base && sgmii_adapter_base)
+	writew(SGMII_ADAPTER_ENABLE,
+	       sgmii_adapter_base + SGMII_ADAPTER_CTRL_REG);
+	if (phy_dev)
 		tse_pcs_fix_mac_speed(&dwmac->pcs, phy_dev, speed);
 }
 
-- 
Gitee


From 1421e20fee73d26e077b2fc005b15d6f346416cb Mon Sep 17 00:00:00 2001
From: Benedikt Spranger <b.spranger@linutronix.de>
Date: Tue, 19 Jul 2022 16:30:31 +0800
Subject: [PATCH 392/674] net/sched: taprio: Check if socket flags are valid

stable inclusion
from stable-v5.10.112
commit a44938950e5e12dd676ec2b38701b6d3c4e1a281
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=a44938950e5e12dd676ec2b38701b6d3c4e1a281

--------------------------------

[ Upstream commit e8a64bbaaad1f6548cec5508297bc6d45e8ab69e ]

A user may set the SO_TXTIME socket option to ensure a packet is send
at a given time. The taprio scheduler has to confirm, that it is allowed
to send a packet at that given time, by a check against the packet time
schedule. The scheduler drop the packet, if the gates are closed at the
given send time.

The check, if SO_TXTIME is set, may fail since sk_flags are part of an
union and the union is used otherwise. This happen, if a socket is not
a full socket, like a request socket for example.

Add a check to verify, if the union is used for sk_flags.

Fixes: 4cfd5779bd6e ("taprio: Add support for txtime-assist mode")
Signed-off-by: Benedikt Spranger <b.spranger@linutronix.de>
Reviewed-by: Kurt Kanzenbach <kurt@linutronix.de>
Acked-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 net/sched/sch_taprio.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
index 806babdd838d..eca525791013 100644
--- a/net/sched/sch_taprio.c
+++ b/net/sched/sch_taprio.c
@@ -427,7 +427,8 @@ static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 	if (unlikely(!child))
 		return qdisc_drop(skb, sch, to_free);
 
-	if (skb->sk && sock_flag(skb->sk, SOCK_TXTIME)) {
+	/* sk_flags are only safe to use on full sockets. */
+	if (skb->sk && sk_fullsock(skb->sk) && sock_flag(skb->sk, SOCK_TXTIME)) {
 		if (!is_valid_interval(skb, sch))
 			return qdisc_drop(skb, sch, to_free);
 	} else if (TXTIME_ASSIST_IS_ENABLED(q->flags)) {
-- 
Gitee


From cf3b55065cc63242aa66207a9a7aa8abe63d6693 Mon Sep 17 00:00:00 2001
From: Rameshkumar Sundaram <quic_ramess@quicinc.com>
Date: Tue, 19 Jul 2022 16:30:32 +0800
Subject: [PATCH 393/674] cfg80211: hold bss_lock while updating nontrans_list

stable inclusion
from stable-v5.10.112
commit 5513f9a0b0684383c1bc120ea37b35ae944a709a
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=5513f9a0b0684383c1bc120ea37b35ae944a709a

--------------------------------

[ Upstream commit a5199b5626cd6913cf8776a835bc63d40e0686ad ]

Synchronize additions to nontrans_list of transmitting BSS with
bss_lock to avoid races. Also when cfg80211_add_nontrans_list() fails
__cfg80211_unlink_bss() needs bss_lock to be held (has lockdep assert
on bss_lock). So protect the whole block with bss_lock to avoid
races and warnings. Found during code review.

Fixes: 0b8fb8235be8 ("cfg80211: Parsing of Multiple BSSID information in scanning")
Signed-off-by: Rameshkumar Sundaram <quic_ramess@quicinc.com>
Link: https://lore.kernel.org/r/1649668071-9370-1-git-send-email-quic_ramess@quicinc.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 net/wireless/scan.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index c1b2655682a8..6dc9b7e22b71 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -1968,11 +1968,13 @@ cfg80211_inform_single_bss_data(struct wiphy *wiphy,
 		/* this is a nontransmitting bss, we need to add it to
 		 * transmitting bss' list if it is not there
 		 */
+		spin_lock_bh(&rdev->bss_lock);
 		if (cfg80211_add_nontrans_list(non_tx_data->tx_bss,
 					       &res->pub)) {
 			if (__cfg80211_unlink_bss(rdev, res))
 				rdev->bss_generation++;
 		}
+		spin_unlock_bh(&rdev->bss_lock);
 	}
 
 	trace_cfg80211_return_bss(&res->pub);
-- 
Gitee


From 8d0ffc27fd8f031c68c7d1fae4ea65819e71f052 Mon Sep 17 00:00:00 2001
From: Rob Clark <robdclark@chromium.org>
Date: Tue, 19 Jul 2022 16:30:33 +0800
Subject: [PATCH 394/674] drm/msm: Fix range size vs end confusion

stable inclusion
from stable-v5.10.112
commit 5f78ad93837c61ed62a6f7b10a960c0c9218382c
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=5f78ad93837c61ed62a6f7b10a960c0c9218382c

--------------------------------

[ Upstream commit 537fef808be5ea56f6fc06932162550819a3b3c3 ]

The fourth param is size, rather than range_end.

Note that we could increase the address space size if we had a way to
prevent buffers from spanning a 4G split, mostly just to avoid fw bugs
with 64b math.

Fixes: 84c31ee16f90 ("drm/msm/a6xx: Add support for per-instance pagetables")
Signed-off-by: Rob Clark <robdclark@chromium.org>
Link: https://lore.kernel.org/r/20220407202836.1211268-1-robdclark@gmail.com
Signed-off-by: Rob Clark <robdclark@chromium.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
index 9e09805575db..39563daff4a0 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
@@ -1222,7 +1222,7 @@ a6xx_create_private_address_space(struct msm_gpu *gpu)
 		return ERR_CAST(mmu);
 
 	return msm_gem_address_space_create(mmu,
-		"gpu", 0x100000000ULL, 0x1ffffffffULL);
+		"gpu", 0x100000000ULL, SZ_4G);
 }
 
 static uint32_t a6xx_get_rptr(struct msm_gpu *gpu, struct msm_ringbuffer *ring)
-- 
Gitee


From e8dcb50674b91f69cf1c54d27af9ff783b2305c1 Mon Sep 17 00:00:00 2001
From: Stephen Boyd <swboyd@chromium.org>
Date: Tue, 19 Jul 2022 16:30:34 +0800
Subject: [PATCH 395/674] drm/msm/dsi: Use connector directly in
 msm_dsi_manager_connector_init()

stable inclusion
from stable-v5.10.112
commit 98a7f6c4ada4ca97fcb0d508e097f8597df19d4d
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=98a7f6c4ada4ca97fcb0d508e097f8597df19d4d

--------------------------------

[ Upstream commit 47b7de6b88b962ef339a2427a023d2a23d161654 ]

The member 'msm_dsi->connector' isn't assigned until
msm_dsi_manager_connector_init() returns (see msm_dsi_modeset_init() and
how it assigns the return value). Therefore this pointer is going to be
NULL here. Let's use 'connector' which is what was intended.

Cc: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Cc: Sean Paul <seanpaul@chromium.org>
Fixes: 6d5e78406991 ("drm/msm/dsi: Move dsi panel init into modeset init path")
Signed-off-by: Stephen Boyd <swboyd@chromium.org>
Reviewed-by: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Patchwork: https://patchwork.freedesktop.org/patch/478693/
Link: https://lore.kernel.org/r/20220318000731.2823718-1-swboyd@chromium.org
Signed-off-by: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Signed-off-by: Rob Clark <robdclark@chromium.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/gpu/drm/msm/dsi/dsi_manager.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/msm/dsi/dsi_manager.c b/drivers/gpu/drm/msm/dsi/dsi_manager.c
index 1d28dfba2c9b..fb421ca56b3d 100644
--- a/drivers/gpu/drm/msm/dsi/dsi_manager.c
+++ b/drivers/gpu/drm/msm/dsi/dsi_manager.c
@@ -644,7 +644,7 @@ struct drm_connector *msm_dsi_manager_connector_init(u8 id)
 	return connector;
 
 fail:
-	connector->funcs->destroy(msm_dsi->connector);
+	connector->funcs->destroy(connector);
 	return ERR_PTR(ret);
 }
 
-- 
Gitee


From 0a0cbbcba4acbadb29ef77ce3cdf774688a9f533 Mon Sep 17 00:00:00 2001
From: Karsten Graul <kgraul@linux.ibm.com>
Date: Tue, 19 Jul 2022 16:30:35 +0800
Subject: [PATCH 396/674] net/smc: Fix NULL pointer dereference in
 smc_pnet_find_ib()

stable inclusion
from stable-v5.10.112
commit 35b91e49bc80ca944a8679c3b139ddaf2f8eea0f
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=35b91e49bc80ca944a8679c3b139ddaf2f8eea0f

--------------------------------

[ Upstream commit d22f4f977236f97e01255a80bca2ea93a8094fc8 ]

dev_name() was called with dev.parent as argument but without to
NULL-check it before.
Solve this by checking the pointer before the call to dev_name().

Fixes: af5f60c7e3d5 ("net/smc: allow PCI IDs as ib device names in the pnet table")
Reported-by: syzbot+03e3e228510223dabd34@syzkaller.appspotmail.com
Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 net/smc/smc_pnet.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c
index 9007c7e3bae4..30bae60d626c 100644
--- a/net/smc/smc_pnet.c
+++ b/net/smc/smc_pnet.c
@@ -310,8 +310,9 @@ static struct smc_ib_device *smc_pnet_find_ib(char *ib_name)
 	list_for_each_entry(ibdev, &smc_ib_devices.list, list) {
 		if (!strncmp(ibdev->ibdev->name, ib_name,
 			     sizeof(ibdev->ibdev->name)) ||
-		    !strncmp(dev_name(ibdev->ibdev->dev.parent), ib_name,
-			     IB_DEVICE_NAME_MAX - 1)) {
+		    (ibdev->ibdev->dev.parent &&
+		     !strncmp(dev_name(ibdev->ibdev->dev.parent), ib_name,
+			     IB_DEVICE_NAME_MAX - 1))) {
 			goto out;
 		}
 	}
-- 
Gitee


From 892570c0513e94daee3c90231e744554fd1ed370 Mon Sep 17 00:00:00 2001
From: Ajish Koshy <Ajish.Koshy@microchip.com>
Date: Tue, 19 Jul 2022 16:30:36 +0800
Subject: [PATCH 397/674] scsi: pm80xx: Mask and unmask upper interrupt vectors
 32-63

stable inclusion
from stable-v5.10.112
commit e08d26971237d1af8c3ca456f5729f3f4033890b
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=e08d26971237d1af8c3ca456f5729f3f4033890b

--------------------------------

[ Upstream commit 294080eacf92a0781e6d43663448a55001ec8c64 ]

When upper inbound and outbound queues 32-63 are enabled, we see upper
vectors 32-63 in interrupt service routine. We need corresponding registers
to handle masking and unmasking of these upper interrupts.

To achieve this, we use registers MSGU_ODMR_U(0x34) to mask and
MSGU_ODMR_CLR_U(0x3C) to unmask the interrupts. In these registers bit 0-31
represents interrupt vectors 32-63.

Link: https://lore.kernel.org/r/20220411064603.668448-2-Ajish.Koshy@microchip.com
Fixes: 05c6c029a44d ("scsi: pm80xx: Increase number of supported queues")
Reviewed-by: John Garry <john.garry@huawei.com>
Acked-by: Jack Wang <jinpu.wang@ionos.com>
Signed-off-by: Ajish Koshy <Ajish.Koshy@microchip.com>
Signed-off-by: Viswas G <Viswas.G@microchip.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/scsi/pm8001/pm80xx_hwi.c | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/drivers/scsi/pm8001/pm80xx_hwi.c b/drivers/scsi/pm8001/pm80xx_hwi.c
index a66a06914913..e1014bb9ee28 100644
--- a/drivers/scsi/pm8001/pm80xx_hwi.c
+++ b/drivers/scsi/pm8001/pm80xx_hwi.c
@@ -1682,10 +1682,11 @@ static void
 pm80xx_chip_interrupt_enable(struct pm8001_hba_info *pm8001_ha, u8 vec)
 {
 #ifdef PM8001_USE_MSIX
-	u32 mask;
-	mask = (u32)(1 << vec);
-
-	pm8001_cw32(pm8001_ha, 0, MSGU_ODMR_CLR, (u32)(mask & 0xFFFFFFFF));
+	if (vec < 32)
+		pm8001_cw32(pm8001_ha, 0, MSGU_ODMR_CLR, 1U << vec);
+	else
+		pm8001_cw32(pm8001_ha, 0, MSGU_ODMR_CLR_U,
+			    1U << (vec - 32));
 	return;
 #endif
 	pm80xx_chip_intx_interrupt_enable(pm8001_ha);
@@ -1701,12 +1702,15 @@ static void
 pm80xx_chip_interrupt_disable(struct pm8001_hba_info *pm8001_ha, u8 vec)
 {
 #ifdef PM8001_USE_MSIX
-	u32 mask;
-	if (vec == 0xFF)
-		mask = 0xFFFFFFFF;
+	if (vec == 0xFF) {
+		/* disable all vectors 0-31, 32-63 */
+		pm8001_cw32(pm8001_ha, 0, MSGU_ODMR, 0xFFFFFFFF);
+		pm8001_cw32(pm8001_ha, 0, MSGU_ODMR_U, 0xFFFFFFFF);
+	} else if (vec < 32)
+		pm8001_cw32(pm8001_ha, 0, MSGU_ODMR, 1U << vec);
 	else
-		mask = (u32)(1 << vec);
-	pm8001_cw32(pm8001_ha, 0, MSGU_ODMR, (u32)(mask & 0xFFFFFFFF));
+		pm8001_cw32(pm8001_ha, 0, MSGU_ODMR_U,
+			    1U << (vec - 32));
 	return;
 #endif
 	pm80xx_chip_intx_interrupt_disable(pm8001_ha);
-- 
Gitee


From 6ad5d4746792d911f5dde2f44071fe381bd095ed Mon Sep 17 00:00:00 2001
From: Ajish Koshy <Ajish.Koshy@microchip.com>
Date: Tue, 19 Jul 2022 16:30:37 +0800
Subject: [PATCH 398/674] scsi: pm80xx: Enable upper inbound, outbound queues

stable inclusion
from stable-v5.10.112
commit da9cf24aa739f48cdbf5b2624eacb4a6c81cf9e9
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=da9cf24aa739f48cdbf5b2624eacb4a6c81cf9e9

--------------------------------

[ Upstream commit bcd8a45223470e00b5f254018174d64a75db4bbe ]

Executing driver on servers with more than 32 CPUs were faced with command
timeouts. This is because we were not geting completions for commands
submitted on IQ32 - IQ63.

Set E64Q bit to enable upper inbound and outbound queues 32 to 63 in the
MPI main configuration table.

Added 500ms delay after successful MPI initialization as mentioned in
controller datasheet.

Link: https://lore.kernel.org/r/20220411064603.668448-3-Ajish.Koshy@microchip.com
Fixes: 05c6c029a44d ("scsi: pm80xx: Increase number of supported queues")
Reviewed-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Acked-by: Jack Wang <jinpu.wang@ionos.com>
Signed-off-by: Ajish Koshy <Ajish.Koshy@microchip.com>
Signed-off-by: Viswas G <Viswas.G@microchip.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/scsi/pm8001/pm80xx_hwi.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/drivers/scsi/pm8001/pm80xx_hwi.c b/drivers/scsi/pm8001/pm80xx_hwi.c
index e1014bb9ee28..51345f02383d 100644
--- a/drivers/scsi/pm8001/pm80xx_hwi.c
+++ b/drivers/scsi/pm8001/pm80xx_hwi.c
@@ -765,6 +765,10 @@ static void init_default_table_values(struct pm8001_hba_info *pm8001_ha)
 	pm8001_ha->main_cfg_tbl.pm80xx_tbl.pcs_event_log_severity	= 0x01;
 	pm8001_ha->main_cfg_tbl.pm80xx_tbl.fatal_err_interrupt		= 0x01;
 
+	/* Enable higher IQs and OQs, 32 to 63, bit 16 */
+	if (pm8001_ha->max_q_num > 32)
+		pm8001_ha->main_cfg_tbl.pm80xx_tbl.fatal_err_interrupt |=
+							1 << 16;
 	/* Disable end to end CRC checking */
 	pm8001_ha->main_cfg_tbl.pm80xx_tbl.crc_core_dump = (0x1 << 16);
 
@@ -1024,6 +1028,13 @@ static int mpi_init_check(struct pm8001_hba_info *pm8001_ha)
 	if (0x0000 != gst_len_mpistate)
 		return -EBUSY;
 
+	/*
+	 *  As per controller datasheet, after successful MPI
+	 *  initialization minimum 500ms delay is required before
+	 *  issuing commands.
+	 */
+	msleep(500);
+
 	return 0;
 }
 
-- 
Gitee


From c19ae4e35a05c1b9deb051f21612276f4f5c259f Mon Sep 17 00:00:00 2001
From: Mike Christie <michael.christie@oracle.com>
Date: Tue, 19 Jul 2022 16:30:38 +0800
Subject: [PATCH 399/674] scsi: iscsi: Stop queueing during ep_disconnect

stable inclusion
from stable-v5.10.112
commit 17d14456f6262b87f2ce6e749cc52ebdfa90949d
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=17d14456f6262b87f2ce6e749cc52ebdfa90949d

--------------------------------

[ Upstream commit 891e2639deae721dc43764a44fa255890dc34313 ]

During ep_disconnect we have been doing iscsi_suspend_tx/queue to block new
I/O but every driver except cxgbi and iscsi_tcp can still get I/O from
__iscsi_conn_send_pdu() if we haven't called iscsi_conn_failure() before
ep_disconnect. This could happen if we were terminating the session, and
the logout timed out before it was even sent to libiscsi.

Fix the issue by adding a helper which reverses the bind_conn call that
allows new I/O to be queued. Drivers implementing ep_disconnect can use this
to make sure new I/O is not queued to them when handling the disconnect.

Link: https://lore.kernel.org/r/20210525181821.7617-3-michael.christie@oracle.com
Reviewed-by: Lee Duncan <lduncan@suse.com>
Signed-off-by: Mike Christie <michael.christie@oracle.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/infiniband/ulp/iser/iscsi_iser.c |  1 +
 drivers/scsi/be2iscsi/be_main.c          |  1 +
 drivers/scsi/bnx2i/bnx2i_iscsi.c         |  1 +
 drivers/scsi/cxgbi/cxgb3i/cxgb3i.c       |  1 +
 drivers/scsi/cxgbi/cxgb4i/cxgb4i.c       |  1 +
 drivers/scsi/libiscsi.c                  | 70 +++++++++++++++++++++---
 drivers/scsi/qedi/qedi_iscsi.c           |  1 +
 drivers/scsi/qla4xxx/ql4_os.c            |  1 +
 drivers/scsi/scsi_transport_iscsi.c      | 10 +++-
 include/scsi/libiscsi.h                  |  1 +
 include/scsi/scsi_transport_iscsi.h      |  1 +
 11 files changed, 78 insertions(+), 11 deletions(-)

diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c
index 3690e28cc7ea..8857d8322710 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.c
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.c
@@ -988,6 +988,7 @@ static struct iscsi_transport iscsi_iser_transport = {
 	/* connection management */
 	.create_conn            = iscsi_iser_conn_create,
 	.bind_conn              = iscsi_iser_conn_bind,
+	.unbind_conn		= iscsi_conn_unbind,
 	.destroy_conn           = iscsi_conn_teardown,
 	.attr_is_visible	= iser_attr_is_visible,
 	.set_param              = iscsi_iser_set_param,
diff --git a/drivers/scsi/be2iscsi/be_main.c b/drivers/scsi/be2iscsi/be_main.c
index 987dc8135a9b..b977e039bb78 100644
--- a/drivers/scsi/be2iscsi/be_main.c
+++ b/drivers/scsi/be2iscsi/be_main.c
@@ -5810,6 +5810,7 @@ struct iscsi_transport beiscsi_iscsi_transport = {
 	.destroy_session = beiscsi_session_destroy,
 	.create_conn = beiscsi_conn_create,
 	.bind_conn = beiscsi_conn_bind,
+	.unbind_conn = iscsi_conn_unbind,
 	.destroy_conn = iscsi_conn_teardown,
 	.attr_is_visible = beiscsi_attr_is_visible,
 	.set_iface_param = beiscsi_iface_set_param,
diff --git a/drivers/scsi/bnx2i/bnx2i_iscsi.c b/drivers/scsi/bnx2i/bnx2i_iscsi.c
index 37f5b719050e..197aca76a69d 100644
--- a/drivers/scsi/bnx2i/bnx2i_iscsi.c
+++ b/drivers/scsi/bnx2i/bnx2i_iscsi.c
@@ -2276,6 +2276,7 @@ struct iscsi_transport bnx2i_iscsi_transport = {
 	.destroy_session	= bnx2i_session_destroy,
 	.create_conn		= bnx2i_conn_create,
 	.bind_conn		= bnx2i_conn_bind,
+	.unbind_conn		= iscsi_conn_unbind,
 	.destroy_conn		= bnx2i_conn_destroy,
 	.attr_is_visible	= bnx2i_attr_is_visible,
 	.set_param		= iscsi_set_param,
diff --git a/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c b/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c
index 37d99357120f..edcd3fab6973 100644
--- a/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c
+++ b/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c
@@ -117,6 +117,7 @@ static struct iscsi_transport cxgb3i_iscsi_transport = {
 	/* connection management */
 	.create_conn	= cxgbi_create_conn,
 	.bind_conn	= cxgbi_bind_conn,
+	.unbind_conn	= iscsi_conn_unbind,
 	.destroy_conn	= iscsi_tcp_conn_teardown,
 	.start_conn	= iscsi_conn_start,
 	.stop_conn	= iscsi_conn_stop,
diff --git a/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c b/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c
index 2c3491528d42..efb3e2b3398e 100644
--- a/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c
+++ b/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c
@@ -134,6 +134,7 @@ static struct iscsi_transport cxgb4i_iscsi_transport = {
 	/* connection management */
 	.create_conn	= cxgbi_create_conn,
 	.bind_conn		= cxgbi_bind_conn,
+	.unbind_conn	= iscsi_conn_unbind,
 	.destroy_conn	= iscsi_tcp_conn_teardown,
 	.start_conn		= iscsi_conn_start,
 	.stop_conn		= iscsi_conn_stop,
diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c
index 7ef5efd94422..e361856509d5 100644
--- a/drivers/scsi/libiscsi.c
+++ b/drivers/scsi/libiscsi.c
@@ -1386,23 +1386,32 @@ void iscsi_session_failure(struct iscsi_session *session,
 }
 EXPORT_SYMBOL_GPL(iscsi_session_failure);
 
-void iscsi_conn_failure(struct iscsi_conn *conn, enum iscsi_err err)
+static bool iscsi_set_conn_failed(struct iscsi_conn *conn)
 {
 	struct iscsi_session *session = conn->session;
 
-	spin_lock_bh(&session->frwd_lock);
-	if (session->state == ISCSI_STATE_FAILED) {
-		spin_unlock_bh(&session->frwd_lock);
-		return;
-	}
+	if (session->state == ISCSI_STATE_FAILED)
+		return false;
 
 	if (conn->stop_stage == 0)
 		session->state = ISCSI_STATE_FAILED;
-	spin_unlock_bh(&session->frwd_lock);
 
 	set_bit(ISCSI_SUSPEND_BIT, &conn->suspend_tx);
 	set_bit(ISCSI_SUSPEND_BIT, &conn->suspend_rx);
-	iscsi_conn_error_event(conn->cls_conn, err);
+	return true;
+}
+
+void iscsi_conn_failure(struct iscsi_conn *conn, enum iscsi_err err)
+{
+	struct iscsi_session *session = conn->session;
+	bool needs_evt;
+
+	spin_lock_bh(&session->frwd_lock);
+	needs_evt = iscsi_set_conn_failed(conn);
+	spin_unlock_bh(&session->frwd_lock);
+
+	if (needs_evt)
+		iscsi_conn_error_event(conn->cls_conn, err);
 }
 EXPORT_SYMBOL_GPL(iscsi_conn_failure);
 
@@ -2178,6 +2187,51 @@ static void iscsi_check_transport_timeouts(struct timer_list *t)
 	spin_unlock(&session->frwd_lock);
 }
 
+/**
+ * iscsi_conn_unbind - prevent queueing to conn.
+ * @cls_conn: iscsi conn ep is bound to.
+ * @is_active: is the conn in use for boot or is this for EH/termination
+ *
+ * This must be called by drivers implementing the ep_disconnect callout.
+ * It disables queueing to the connection from libiscsi in preparation for
+ * an ep_disconnect call.
+ */
+void iscsi_conn_unbind(struct iscsi_cls_conn *cls_conn, bool is_active)
+{
+	struct iscsi_session *session;
+	struct iscsi_conn *conn;
+
+	if (!cls_conn)
+		return;
+
+	conn = cls_conn->dd_data;
+	session = conn->session;
+	/*
+	 * Wait for iscsi_eh calls to exit. We don't wait for the tmf to
+	 * complete or timeout. The caller just wants to know what's running
+	 * is everything that needs to be cleaned up, and no cmds will be
+	 * queued.
+	 */
+	mutex_lock(&session->eh_mutex);
+
+	iscsi_suspend_queue(conn);
+	iscsi_suspend_tx(conn);
+
+	spin_lock_bh(&session->frwd_lock);
+	if (!is_active) {
+		/*
+		 * if logout timed out before userspace could even send a PDU
+		 * the state might still be in ISCSI_STATE_LOGGED_IN and
+		 * allowing new cmds and TMFs.
+		 */
+		if (session->state == ISCSI_STATE_LOGGED_IN)
+			iscsi_set_conn_failed(conn);
+	}
+	spin_unlock_bh(&session->frwd_lock);
+	mutex_unlock(&session->eh_mutex);
+}
+EXPORT_SYMBOL_GPL(iscsi_conn_unbind);
+
 static void iscsi_prep_abort_task_pdu(struct iscsi_task *task,
 				      struct iscsi_tm *hdr)
 {
diff --git a/drivers/scsi/qedi/qedi_iscsi.c b/drivers/scsi/qedi/qedi_iscsi.c
index f51723e2d522..8f8036ae9a56 100644
--- a/drivers/scsi/qedi/qedi_iscsi.c
+++ b/drivers/scsi/qedi/qedi_iscsi.c
@@ -1428,6 +1428,7 @@ struct iscsi_transport qedi_iscsi_transport = {
 	.destroy_session = qedi_session_destroy,
 	.create_conn = qedi_conn_create,
 	.bind_conn = qedi_conn_bind,
+	.unbind_conn = iscsi_conn_unbind,
 	.start_conn = qedi_conn_start,
 	.stop_conn = iscsi_conn_stop,
 	.destroy_conn = qedi_conn_destroy,
diff --git a/drivers/scsi/qla4xxx/ql4_os.c b/drivers/scsi/qla4xxx/ql4_os.c
index 2c23b692e318..a8b8bf118c76 100644
--- a/drivers/scsi/qla4xxx/ql4_os.c
+++ b/drivers/scsi/qla4xxx/ql4_os.c
@@ -259,6 +259,7 @@ static struct iscsi_transport qla4xxx_iscsi_transport = {
 	.start_conn             = qla4xxx_conn_start,
 	.create_conn            = qla4xxx_conn_create,
 	.bind_conn              = qla4xxx_conn_bind,
+	.unbind_conn		= iscsi_conn_unbind,
 	.stop_conn              = iscsi_conn_stop,
 	.destroy_conn           = qla4xxx_conn_destroy,
 	.set_param              = iscsi_set_param,
diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c
index a5759d0e388a..6490211b55d2 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -2957,7 +2957,7 @@ static int iscsi_if_ep_connect(struct iscsi_transport *transport,
 }
 
 static int iscsi_if_ep_disconnect(struct iscsi_transport *transport,
-				  u64 ep_handle)
+				  u64 ep_handle, bool is_active)
 {
 	struct iscsi_cls_conn *conn;
 	struct iscsi_endpoint *ep;
@@ -2974,6 +2974,8 @@ static int iscsi_if_ep_disconnect(struct iscsi_transport *transport,
 		conn->ep = NULL;
 		mutex_unlock(&conn->ep_mutex);
 		conn->state = ISCSI_CONN_FAILED;
+
+		transport->unbind_conn(conn, is_active);
 	}
 
 	transport->ep_disconnect(ep);
@@ -3005,7 +3007,8 @@ iscsi_if_transport_ep(struct iscsi_transport *transport,
 		break;
 	case ISCSI_UEVENT_TRANSPORT_EP_DISCONNECT:
 		rc = iscsi_if_ep_disconnect(transport,
-					    ev->u.ep_disconnect.ep_handle);
+					    ev->u.ep_disconnect.ep_handle,
+					    false);
 		break;
 	}
 	return rc;
@@ -3730,7 +3733,7 @@ iscsi_if_recv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, uint32_t *group)
 		conn = iscsi_conn_lookup(ev->u.b_conn.sid, ev->u.b_conn.cid);
 
 		if (conn && conn->ep)
-			iscsi_if_ep_disconnect(transport, conn->ep->id);
+			iscsi_if_ep_disconnect(transport, conn->ep->id, true);
 
 		if (!session || !conn) {
 			err = -EINVAL;
@@ -4649,6 +4652,7 @@ iscsi_register_transport(struct iscsi_transport *tt)
 	int err;
 
 	BUG_ON(!tt);
+	WARN_ON(tt->ep_disconnect && !tt->unbind_conn);
 
 	priv = iscsi_if_transport_lookup(tt);
 	if (priv)
diff --git a/include/scsi/libiscsi.h b/include/scsi/libiscsi.h
index b47428d86a4b..881e4762d626 100644
--- a/include/scsi/libiscsi.h
+++ b/include/scsi/libiscsi.h
@@ -424,6 +424,7 @@ extern int iscsi_conn_start(struct iscsi_cls_conn *);
 extern void iscsi_conn_stop(struct iscsi_cls_conn *, int);
 extern int iscsi_conn_bind(struct iscsi_cls_session *, struct iscsi_cls_conn *,
 			   int);
+extern void iscsi_conn_unbind(struct iscsi_cls_conn *cls_conn, bool is_active);
 extern void iscsi_conn_failure(struct iscsi_conn *conn, enum iscsi_err err);
 extern void iscsi_session_failure(struct iscsi_session *session,
 				  enum iscsi_err err);
diff --git a/include/scsi/scsi_transport_iscsi.h b/include/scsi/scsi_transport_iscsi.h
index f28bb20d6271..eb6ed499324d 100644
--- a/include/scsi/scsi_transport_iscsi.h
+++ b/include/scsi/scsi_transport_iscsi.h
@@ -82,6 +82,7 @@ struct iscsi_transport {
 	void (*destroy_session) (struct iscsi_cls_session *session);
 	struct iscsi_cls_conn *(*create_conn) (struct iscsi_cls_session *sess,
 				uint32_t cid);
+	void (*unbind_conn) (struct iscsi_cls_conn *conn, bool is_active);
 	int (*bind_conn) (struct iscsi_cls_session *session,
 			  struct iscsi_cls_conn *cls_conn,
 			  uint64_t transport_eph, int is_leading);
-- 
Gitee


From 46a768c953aeaf0744e3543eb51d534c4fde1cfb Mon Sep 17 00:00:00 2001
From: Mike Christie <michael.christie@oracle.com>
Date: Tue, 19 Jul 2022 16:30:39 +0800
Subject: [PATCH 400/674] scsi: iscsi: Force immediate failure during shutdown

stable inclusion
from stable-v5.10.112
commit 4029a1e992fc6a0e29ff56c74a9632317fe76022
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=4029a1e992fc6a0e29ff56c74a9632317fe76022

--------------------------------

[ Upstream commit 06c203a5566beecebb1f8838d026de8a61c8df71 ]

If the system is not up, we can just fail immediately since iscsid is not
going to ever answer our netlink events. We are already setting the
recovery_tmo to 0, but by passing stop_conn STOP_CONN_TERM we never will
block the session and start the recovery timer, because for that flag
userspace will do the unbind and destroy events which would remove the
devices and wake up and kill the eh.

Since the conn is dead and the system is going dowm this just has us use
STOP_CONN_RECOVER with recovery_tmo=0 so we fail immediately. However, if
the user has set the recovery_tmo=-1 we let the system hang like they
requested since they might have used that setting for specific reasons
(one known reason is for buggy cluster software).

Link: https://lore.kernel.org/r/20210525181821.7617-5-michael.christie@oracle.com
Signed-off-by: Mike Christie <michael.christie@oracle.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/scsi/scsi_transport_iscsi.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c
index 6490211b55d2..f93660142e35 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -2508,11 +2508,17 @@ static void stop_conn_work_fn(struct work_struct *work)
 		session = iscsi_session_lookup(sid);
 		if (session) {
 			if (system_state != SYSTEM_RUNNING) {
-				session->recovery_tmo = 0;
-				iscsi_if_stop_conn(conn, STOP_CONN_TERM);
-			} else {
-				iscsi_if_stop_conn(conn, STOP_CONN_RECOVER);
+				/*
+				 * If the user has set up for the session to
+				 * never timeout then hang like they wanted.
+				 * For all other cases fail right away since
+				 * userspace is not going to relogin.
+				 */
+				if (session->recovery_tmo > 0)
+					session->recovery_tmo = 0;
 			}
+
+			iscsi_if_stop_conn(conn, STOP_CONN_RECOVER);
 		}
 
 		list_del_init(&conn->conn_list_err);
-- 
Gitee


From 0be1155784fa93ef8777bab4219fccb588c1ed26 Mon Sep 17 00:00:00 2001
From: Mike Christie <michael.christie@oracle.com>
Date: Tue, 19 Jul 2022 16:30:40 +0800
Subject: [PATCH 401/674] scsi: iscsi: Use system_unbound_wq for destroy_work

stable inclusion
from stable-v5.10.112
commit 22608545b834b855c0a7fd9f675edc884b7d21a3
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=22608545b834b855c0a7fd9f675edc884b7d21a3

--------------------------------

[ Upstream commit b25b957d2db1585602c2c70fdf4261a5641fe6b7 ]

Use the system_unbound_wq for async session destruction. We don't need a
dedicated workqueue for async session destruction because:

 1. perf does not seem to be an issue since we only allow 1 active work.

 2. it does not have deps with other system works and we can run them in
    parallel with each other.

Link: https://lore.kernel.org/r/20210525181821.7617-6-michael.christie@oracle.com
Reviewed-by: Lee Duncan <lduncan@suse.com>
Signed-off-by: Mike Christie <michael.christie@oracle.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/scsi/scsi_transport_iscsi.c | 15 +--------------
 1 file changed, 1 insertion(+), 14 deletions(-)

diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c
index f93660142e35..ed0c7e812445 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -95,8 +95,6 @@ static DECLARE_WORK(stop_conn_work, stop_conn_work_fn);
 static atomic_t iscsi_session_nr; /* sysfs session id for next new session */
 static struct workqueue_struct *iscsi_eh_timer_workq;
 
-static struct workqueue_struct *iscsi_destroy_workq;
-
 static DEFINE_IDA(iscsi_sess_ida);
 /*
  * list of registered transports and lock that must
@@ -3717,7 +3715,7 @@ iscsi_if_recv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, uint32_t *group)
 			list_del_init(&session->sess_list);
 			spin_unlock_irqrestore(&sesslock, flags);
 
-			queue_work(iscsi_destroy_workq, &session->destroy_work);
+			queue_work(system_unbound_wq, &session->destroy_work);
 		}
 		break;
 	case ISCSI_UEVENT_UNBIND_SESSION:
@@ -4813,18 +4811,8 @@ static __init int iscsi_transport_init(void)
 		goto release_nls;
 	}
 
-	iscsi_destroy_workq = alloc_workqueue("%s",
-			WQ_SYSFS | __WQ_LEGACY | WQ_MEM_RECLAIM | WQ_UNBOUND,
-			1, "iscsi_destroy");
-	if (!iscsi_destroy_workq) {
-		err = -ENOMEM;
-		goto destroy_wq;
-	}
-
 	return 0;
 
-destroy_wq:
-	destroy_workqueue(iscsi_eh_timer_workq);
 release_nls:
 	netlink_kernel_release(nls);
 unregister_flashnode_bus:
@@ -4846,7 +4834,6 @@ static __init int iscsi_transport_init(void)
 
 static void __exit iscsi_transport_exit(void)
 {
-	destroy_workqueue(iscsi_destroy_workq);
 	destroy_workqueue(iscsi_eh_timer_workq);
 	netlink_kernel_release(nls);
 	bus_unregister(&iscsi_flashnode_bus);
-- 
Gitee


From aa1de7f19c05dba66a6b7a09e7c6984c9bd4c663 Mon Sep 17 00:00:00 2001
From: Mike Christie <michael.christie@oracle.com>
Date: Tue, 19 Jul 2022 16:30:41 +0800
Subject: [PATCH 402/674] scsi: iscsi: Rel ref after iscsi_lookup_endpoint()

stable inclusion
from stable-v5.10.112
commit 812573896711f3ffb083c374f26f5807efca3b2f
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=812573896711f3ffb083c374f26f5807efca3b2f

--------------------------------

[ Upstream commit 9e5fe1700896c85040943fdc0d3fee0dd3e0d36f ]

Subsequent commits allow the kernel to do ep_disconnect. In that case we
will have to get a proper refcount on the ep so one thread does not delete
it from under another.

Link: https://lore.kernel.org/r/20210525181821.7617-7-michael.christie@oracle.com
Reviewed-by: Lee Duncan <lduncan@suse.com>
Signed-off-by: Mike Christie <michael.christie@oracle.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/infiniband/ulp/iser/iscsi_iser.c |  1 +
 drivers/scsi/be2iscsi/be_iscsi.c         | 19 ++++++++++++------
 drivers/scsi/bnx2i/bnx2i_iscsi.c         | 23 +++++++++++++++-------
 drivers/scsi/cxgbi/libcxgbi.c            | 12 ++++++++----
 drivers/scsi/qedi/qedi_iscsi.c           | 25 +++++++++++++++++-------
 drivers/scsi/qla4xxx/ql4_os.c            |  1 +
 drivers/scsi/scsi_transport_iscsi.c      | 25 ++++++++++++++++--------
 include/scsi/scsi_transport_iscsi.h      |  1 +
 8 files changed, 75 insertions(+), 32 deletions(-)

diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c
index 8857d8322710..a16e066989fa 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.c
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.c
@@ -499,6 +499,7 @@ iscsi_iser_conn_bind(struct iscsi_cls_session *cls_session,
 	iser_conn->iscsi_conn = conn;
 
 out:
+	iscsi_put_endpoint(ep);
 	mutex_unlock(&iser_conn->state_mutex);
 	return error;
 }
diff --git a/drivers/scsi/be2iscsi/be_iscsi.c b/drivers/scsi/be2iscsi/be_iscsi.c
index a13c203ef7a9..c4881657a807 100644
--- a/drivers/scsi/be2iscsi/be_iscsi.c
+++ b/drivers/scsi/be2iscsi/be_iscsi.c
@@ -182,6 +182,7 @@ int beiscsi_conn_bind(struct iscsi_cls_session *cls_session,
 	struct beiscsi_endpoint *beiscsi_ep;
 	struct iscsi_endpoint *ep;
 	uint16_t cri_index;
+	int rc = 0;
 
 	ep = iscsi_lookup_endpoint(transport_fd);
 	if (!ep)
@@ -189,15 +190,17 @@ int beiscsi_conn_bind(struct iscsi_cls_session *cls_session,
 
 	beiscsi_ep = ep->dd_data;
 
-	if (iscsi_conn_bind(cls_session, cls_conn, is_leading))
-		return -EINVAL;
+	if (iscsi_conn_bind(cls_session, cls_conn, is_leading)) {
+		rc = -EINVAL;
+		goto put_ep;
+	}
 
 	if (beiscsi_ep->phba != phba) {
 		beiscsi_log(phba, KERN_ERR, BEISCSI_LOG_CONFIG,
 			    "BS_%d : beiscsi_ep->hba=%p not equal to phba=%p\n",
 			    beiscsi_ep->phba, phba);
-
-		return -EEXIST;
+		rc = -EEXIST;
+		goto put_ep;
 	}
 	cri_index = BE_GET_CRI_FROM_CID(beiscsi_ep->ep_cid);
 	if (phba->conn_table[cri_index]) {
@@ -209,7 +212,8 @@ int beiscsi_conn_bind(struct iscsi_cls_session *cls_session,
 				      beiscsi_ep->ep_cid,
 				      beiscsi_conn,
 				      phba->conn_table[cri_index]);
-			return -EINVAL;
+			rc = -EINVAL;
+			goto put_ep;
 		}
 	}
 
@@ -226,7 +230,10 @@ int beiscsi_conn_bind(struct iscsi_cls_session *cls_session,
 		    "BS_%d : cid %d phba->conn_table[%u]=%p\n",
 		    beiscsi_ep->ep_cid, cri_index, beiscsi_conn);
 	phba->conn_table[cri_index] = beiscsi_conn;
-	return 0;
+
+put_ep:
+	iscsi_put_endpoint(ep);
+	return rc;
 }
 
 static int beiscsi_iface_create_ipv4(struct beiscsi_hba *phba)
diff --git a/drivers/scsi/bnx2i/bnx2i_iscsi.c b/drivers/scsi/bnx2i/bnx2i_iscsi.c
index 197aca76a69d..8cf2f9a7cfdc 100644
--- a/drivers/scsi/bnx2i/bnx2i_iscsi.c
+++ b/drivers/scsi/bnx2i/bnx2i_iscsi.c
@@ -1420,17 +1420,23 @@ static int bnx2i_conn_bind(struct iscsi_cls_session *cls_session,
 	 * Forcefully terminate all in progress connection recovery at the
 	 * earliest, either in bind(), send_pdu(LOGIN), or conn_start()
 	 */
-	if (bnx2i_adapter_ready(hba))
-		return -EIO;
+	if (bnx2i_adapter_ready(hba)) {
+		ret_code = -EIO;
+		goto put_ep;
+	}
 
 	bnx2i_ep = ep->dd_data;
 	if ((bnx2i_ep->state == EP_STATE_TCP_FIN_RCVD) ||
-	    (bnx2i_ep->state == EP_STATE_TCP_RST_RCVD))
+	    (bnx2i_ep->state == EP_STATE_TCP_RST_RCVD)) {
 		/* Peer disconnect via' FIN or RST */
-		return -EINVAL;
+		ret_code = -EINVAL;
+		goto put_ep;
+	}
 
-	if (iscsi_conn_bind(cls_session, cls_conn, is_leading))
-		return -EINVAL;
+	if (iscsi_conn_bind(cls_session, cls_conn, is_leading)) {
+		ret_code = -EINVAL;
+		goto put_ep;
+	}
 
 	if (bnx2i_ep->hba != hba) {
 		/* Error - TCP connection does not belong to this device
@@ -1441,7 +1447,8 @@ static int bnx2i_conn_bind(struct iscsi_cls_session *cls_session,
 		iscsi_conn_printk(KERN_ALERT, cls_conn->dd_data,
 				  "belong to hba (%s)\n",
 				  hba->netdev->name);
-		return -EEXIST;
+		ret_code = -EEXIST;
+		goto put_ep;
 	}
 	bnx2i_ep->conn = bnx2i_conn;
 	bnx2i_conn->ep = bnx2i_ep;
@@ -1458,6 +1465,8 @@ static int bnx2i_conn_bind(struct iscsi_cls_session *cls_session,
 		bnx2i_put_rq_buf(bnx2i_conn, 0);
 
 	bnx2i_arm_cq_event_coalescing(bnx2i_conn->ep, CNIC_ARM_CQE);
+put_ep:
+	iscsi_put_endpoint(ep);
 	return ret_code;
 }
 
diff --git a/drivers/scsi/cxgbi/libcxgbi.c b/drivers/scsi/cxgbi/libcxgbi.c
index ecb134b4699f..506b561670af 100644
--- a/drivers/scsi/cxgbi/libcxgbi.c
+++ b/drivers/scsi/cxgbi/libcxgbi.c
@@ -2690,11 +2690,13 @@ int cxgbi_bind_conn(struct iscsi_cls_session *cls_session,
 	err = csk->cdev->csk_ddp_setup_pgidx(csk, csk->tid,
 					     ppm->tformat.pgsz_idx_dflt);
 	if (err < 0)
-		return err;
+		goto put_ep;
 
 	err = iscsi_conn_bind(cls_session, cls_conn, is_leading);
-	if (err)
-		return -EINVAL;
+	if (err) {
+		err = -EINVAL;
+		goto put_ep;
+	}
 
 	/*  calculate the tag idx bits needed for this conn based on cmds_max */
 	cconn->task_idx_bits = (__ilog2_u32(conn->session->cmds_max - 1)) + 1;
@@ -2715,7 +2717,9 @@ int cxgbi_bind_conn(struct iscsi_cls_session *cls_session,
 	/*  init recv engine */
 	iscsi_tcp_hdr_recv_prep(tcp_conn);
 
-	return 0;
+put_ep:
+	iscsi_put_endpoint(ep);
+	return err;
 }
 EXPORT_SYMBOL_GPL(cxgbi_bind_conn);
 
diff --git a/drivers/scsi/qedi/qedi_iscsi.c b/drivers/scsi/qedi/qedi_iscsi.c
index 8f8036ae9a56..5f7e62f19d83 100644
--- a/drivers/scsi/qedi/qedi_iscsi.c
+++ b/drivers/scsi/qedi/qedi_iscsi.c
@@ -387,6 +387,7 @@ static int qedi_conn_bind(struct iscsi_cls_session *cls_session,
 	struct qedi_ctx *qedi = iscsi_host_priv(shost);
 	struct qedi_endpoint *qedi_ep;
 	struct iscsi_endpoint *ep;
+	int rc = 0;
 
 	ep = iscsi_lookup_endpoint(transport_fd);
 	if (!ep)
@@ -394,11 +395,16 @@ static int qedi_conn_bind(struct iscsi_cls_session *cls_session,
 
 	qedi_ep = ep->dd_data;
 	if ((qedi_ep->state == EP_STATE_TCP_FIN_RCVD) ||
-	    (qedi_ep->state == EP_STATE_TCP_RST_RCVD))
-		return -EINVAL;
+	    (qedi_ep->state == EP_STATE_TCP_RST_RCVD)) {
+		rc = -EINVAL;
+		goto put_ep;
+	}
+
+	if (iscsi_conn_bind(cls_session, cls_conn, is_leading)) {
+		rc = -EINVAL;
+		goto put_ep;
+	}
 
-	if (iscsi_conn_bind(cls_session, cls_conn, is_leading))
-		return -EINVAL;
 
 	qedi_ep->conn = qedi_conn;
 	qedi_conn->ep = qedi_ep;
@@ -408,13 +414,18 @@ static int qedi_conn_bind(struct iscsi_cls_session *cls_session,
 	qedi_conn->cmd_cleanup_req = 0;
 	qedi_conn->cmd_cleanup_cmpl = 0;
 
-	if (qedi_bind_conn_to_iscsi_cid(qedi, qedi_conn))
-		return -EINVAL;
+	if (qedi_bind_conn_to_iscsi_cid(qedi, qedi_conn)) {
+		rc = -EINVAL;
+		goto put_ep;
+	}
+
 
 	spin_lock_init(&qedi_conn->tmf_work_lock);
 	INIT_LIST_HEAD(&qedi_conn->tmf_work_list);
 	init_waitqueue_head(&qedi_conn->wait_queue);
-	return 0;
+put_ep:
+	iscsi_put_endpoint(ep);
+	return rc;
 }
 
 static int qedi_iscsi_update_conn(struct qedi_ctx *qedi,
diff --git a/drivers/scsi/qla4xxx/ql4_os.c b/drivers/scsi/qla4xxx/ql4_os.c
index a8b8bf118c76..8d82d2a83059 100644
--- a/drivers/scsi/qla4xxx/ql4_os.c
+++ b/drivers/scsi/qla4xxx/ql4_os.c
@@ -3238,6 +3238,7 @@ static int qla4xxx_conn_bind(struct iscsi_cls_session *cls_session,
 	conn = cls_conn->dd_data;
 	qla_conn = conn->dd_data;
 	qla_conn->qla_ep = ep->dd_data;
+	iscsi_put_endpoint(ep);
 	return 0;
 }
 
diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c
index ed0c7e812445..dea0944ebbfa 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -266,9 +266,20 @@ void iscsi_destroy_endpoint(struct iscsi_endpoint *ep)
 }
 EXPORT_SYMBOL_GPL(iscsi_destroy_endpoint);
 
+void iscsi_put_endpoint(struct iscsi_endpoint *ep)
+{
+	put_device(&ep->dev);
+}
+EXPORT_SYMBOL_GPL(iscsi_put_endpoint);
+
+/**
+ * iscsi_lookup_endpoint - get ep from handle
+ * @handle: endpoint handle
+ *
+ * Caller must do a iscsi_put_endpoint.
+ */
 struct iscsi_endpoint *iscsi_lookup_endpoint(u64 handle)
 {
-	struct iscsi_endpoint *ep;
 	struct device *dev;
 
 	dev = class_find_device(&iscsi_endpoint_class, NULL, &handle,
@@ -276,13 +287,7 @@ struct iscsi_endpoint *iscsi_lookup_endpoint(u64 handle)
 	if (!dev)
 		return NULL;
 
-	ep = iscsi_dev_to_endpoint(dev);
-	/*
-	 * we can drop this now because the interface will prevent
-	 * removals and lookups from racing.
-	 */
-	put_device(dev);
-	return ep;
+	return iscsi_dev_to_endpoint(dev);
 }
 EXPORT_SYMBOL_GPL(iscsi_lookup_endpoint);
 
@@ -2983,6 +2988,7 @@ static int iscsi_if_ep_disconnect(struct iscsi_transport *transport,
 	}
 
 	transport->ep_disconnect(ep);
+	iscsi_put_endpoint(ep);
 	return 0;
 }
 
@@ -3008,6 +3014,7 @@ iscsi_if_transport_ep(struct iscsi_transport *transport,
 
 		ev->r.retcode = transport->ep_poll(ep,
 						   ev->u.ep_poll.timeout_ms);
+		iscsi_put_endpoint(ep);
 		break;
 	case ISCSI_UEVENT_TRANSPORT_EP_DISCONNECT:
 		rc = iscsi_if_ep_disconnect(transport,
@@ -3691,6 +3698,7 @@ iscsi_if_recv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, uint32_t *group)
 					ev->u.c_bound_session.initial_cmdsn,
 					ev->u.c_bound_session.cmds_max,
 					ev->u.c_bound_session.queue_depth);
+		iscsi_put_endpoint(ep);
 		break;
 	case ISCSI_UEVENT_DESTROY_SESSION:
 		session = iscsi_session_lookup(ev->u.d_session.sid);
@@ -3762,6 +3770,7 @@ iscsi_if_recv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, uint32_t *group)
 			mutex_lock(&conn->ep_mutex);
 			conn->ep = ep;
 			mutex_unlock(&conn->ep_mutex);
+			iscsi_put_endpoint(ep);
 		} else
 			iscsi_cls_conn_printk(KERN_ERR, conn,
 					      "Could not set ep conn "
diff --git a/include/scsi/scsi_transport_iscsi.h b/include/scsi/scsi_transport_iscsi.h
index eb6ed499324d..09992f019dc9 100644
--- a/include/scsi/scsi_transport_iscsi.h
+++ b/include/scsi/scsi_transport_iscsi.h
@@ -444,6 +444,7 @@ extern int iscsi_scan_finished(struct Scsi_Host *shost, unsigned long time);
 extern struct iscsi_endpoint *iscsi_create_endpoint(int dd_size);
 extern void iscsi_destroy_endpoint(struct iscsi_endpoint *ep);
 extern struct iscsi_endpoint *iscsi_lookup_endpoint(u64 handle);
+extern void iscsi_put_endpoint(struct iscsi_endpoint *ep);
 extern int iscsi_block_scsi_eh(struct scsi_cmnd *cmd);
 extern struct iscsi_iface *iscsi_create_iface(struct Scsi_Host *shost,
 					      struct iscsi_transport *t,
-- 
Gitee


From 3cc519b403fc2f1807a4b6c9acd76fd423494a1a Mon Sep 17 00:00:00 2001
From: Mike Christie <michael.christie@oracle.com>
Date: Tue, 19 Jul 2022 16:30:42 +0800
Subject: [PATCH 403/674] scsi: iscsi: Fix in-kernel conn failure handling

stable inclusion
from stable-v5.10.112
commit 46f37a34a53d4c12fa413b7aec7fe7c3e5cece51
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=46f37a34a53d4c12fa413b7aec7fe7c3e5cece51

--------------------------------

[ Upstream commit 23d6fefbb3f6b1cc29794427588b470ed06ff64e ]

Commit 0ab710458da1 ("scsi: iscsi: Perform connection failure entirely in
kernel space") has the following regressions/bugs that this patch fixes:

1. It can return cmds to upper layers like dm-multipath where that can
retry them. After they are successful the fs/app can send new I/O to the
same sectors, but we've left the cmds running in FW or in the net layer.
We need to be calling ep_disconnect if userspace is not up.

This patch only fixes the issue for offload drivers. iscsi_tcp will be
fixed in separate commit because it doesn't have a ep_disconnect call.

2. The drivers that implement ep_disconnect expect that it's called before
conn_stop. Besides crashes, if the cleanup_task callout is called before
ep_disconnect it might free up driver/card resources for session1 then they
could be allocated for session2. But because the driver's ep_disconnect is
not called it has not cleaned up the firmware so the card is still using
the resources for the original cmd.

3. The stop_conn_work_fn can run after userspace has done its recovery and
we are happily using the session. We will then end up with various bugs
depending on what is going on at the time.

We may also run stop_conn_work_fn late after userspace has called stop_conn
and ep_disconnect and is now going to call start/bind conn. If
stop_conn_work_fn runs after bind but before start, we would leave the conn
in a unbound but sort of started state where IO might be allowed even
though the drivers have been set in a state where they no longer expect
I/O.

4. Returning -EAGAIN in iscsi_if_destroy_conn if we haven't yet run the in
kernel stop_conn function is breaking userspace. We should have been doing
this for the caller.

Link: https://lore.kernel.org/r/20210525181821.7617-8-michael.christie@oracle.com
Fixes: 0ab710458da1 ("scsi: iscsi: Perform connection failure entirely in kernel space")
Reviewed-by: Lee Duncan <lduncan@suse.com>
Signed-off-by: Mike Christie <michael.christie@oracle.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/scsi/scsi_transport_iscsi.c | 471 ++++++++++++++++------------
 include/scsi/scsi_transport_iscsi.h |  10 +-
 2 files changed, 283 insertions(+), 198 deletions(-)

diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c
index dea0944ebbfa..7246f7b2ff03 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -86,15 +86,11 @@ struct iscsi_internal {
 	struct transport_container session_cont;
 };
 
-/* Worker to perform connection failure on unresponsive connections
- * completely in kernel space.
- */
-static void stop_conn_work_fn(struct work_struct *work);
-static DECLARE_WORK(stop_conn_work, stop_conn_work_fn);
-
 static atomic_t iscsi_session_nr; /* sysfs session id for next new session */
 static struct workqueue_struct *iscsi_eh_timer_workq;
 
+static struct workqueue_struct *iscsi_conn_cleanup_workq;
+
 static DEFINE_IDA(iscsi_sess_ida);
 /*
  * list of registered transports and lock that must
@@ -1601,12 +1597,6 @@ static DECLARE_TRANSPORT_CLASS(iscsi_connection_class,
 static struct sock *nls;
 static DEFINE_MUTEX(rx_queue_mutex);
 
-/*
- * conn_mutex protects the {start,bind,stop,destroy}_conn from racing
- * against the kernel stop_connection recovery mechanism
- */
-static DEFINE_MUTEX(conn_mutex);
-
 static LIST_HEAD(sesslist);
 static DEFINE_SPINLOCK(sesslock);
 static LIST_HEAD(connlist);
@@ -2228,6 +2218,123 @@ void iscsi_remove_session(struct iscsi_cls_session *session)
 }
 EXPORT_SYMBOL_GPL(iscsi_remove_session);
 
+static void iscsi_stop_conn(struct iscsi_cls_conn *conn, int flag)
+{
+	ISCSI_DBG_TRANS_CONN(conn, "Stopping conn.\n");
+
+	switch (flag) {
+	case STOP_CONN_RECOVER:
+		conn->state = ISCSI_CONN_FAILED;
+		break;
+	case STOP_CONN_TERM:
+		conn->state = ISCSI_CONN_DOWN;
+		break;
+	default:
+		iscsi_cls_conn_printk(KERN_ERR, conn, "invalid stop flag %d\n",
+				      flag);
+		return;
+	}
+
+	conn->transport->stop_conn(conn, flag);
+	ISCSI_DBG_TRANS_CONN(conn, "Stopping conn done.\n");
+}
+
+static int iscsi_if_stop_conn(struct iscsi_transport *transport,
+			      struct iscsi_uevent *ev)
+{
+	int flag = ev->u.stop_conn.flag;
+	struct iscsi_cls_conn *conn;
+
+	conn = iscsi_conn_lookup(ev->u.stop_conn.sid, ev->u.stop_conn.cid);
+	if (!conn)
+		return -EINVAL;
+
+	ISCSI_DBG_TRANS_CONN(conn, "iscsi if conn stop.\n");
+	/*
+	 * If this is a termination we have to call stop_conn with that flag
+	 * so the correct states get set. If we haven't run the work yet try to
+	 * avoid the extra run.
+	 */
+	if (flag == STOP_CONN_TERM) {
+		cancel_work_sync(&conn->cleanup_work);
+		iscsi_stop_conn(conn, flag);
+	} else {
+		/*
+		 * Figure out if it was the kernel or userspace initiating this.
+		 */
+		if (!test_and_set_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn->flags)) {
+			iscsi_stop_conn(conn, flag);
+		} else {
+			ISCSI_DBG_TRANS_CONN(conn,
+					     "flush kernel conn cleanup.\n");
+			flush_work(&conn->cleanup_work);
+		}
+		/*
+		 * Only clear for recovery to avoid extra cleanup runs during
+		 * termination.
+		 */
+		clear_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn->flags);
+	}
+	ISCSI_DBG_TRANS_CONN(conn, "iscsi if conn stop done.\n");
+	return 0;
+}
+
+static void iscsi_ep_disconnect(struct iscsi_cls_conn *conn, bool is_active)
+{
+	struct iscsi_cls_session *session = iscsi_conn_to_session(conn);
+	struct iscsi_endpoint *ep;
+
+	ISCSI_DBG_TRANS_CONN(conn, "disconnect ep.\n");
+	conn->state = ISCSI_CONN_FAILED;
+
+	if (!conn->ep || !session->transport->ep_disconnect)
+		return;
+
+	ep = conn->ep;
+	conn->ep = NULL;
+
+	session->transport->unbind_conn(conn, is_active);
+	session->transport->ep_disconnect(ep);
+	ISCSI_DBG_TRANS_CONN(conn, "disconnect ep done.\n");
+}
+
+static void iscsi_cleanup_conn_work_fn(struct work_struct *work)
+{
+	struct iscsi_cls_conn *conn = container_of(work, struct iscsi_cls_conn,
+						   cleanup_work);
+	struct iscsi_cls_session *session = iscsi_conn_to_session(conn);
+
+	mutex_lock(&conn->ep_mutex);
+	/*
+	 * If we are not at least bound there is nothing for us to do. Userspace
+	 * will do a ep_disconnect call if offload is used, but will not be
+	 * doing a stop since there is nothing to clean up, so we have to clear
+	 * the cleanup bit here.
+	 */
+	if (conn->state != ISCSI_CONN_BOUND && conn->state != ISCSI_CONN_UP) {
+		ISCSI_DBG_TRANS_CONN(conn, "Got error while conn is already failed. Ignoring.\n");
+		clear_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn->flags);
+		mutex_unlock(&conn->ep_mutex);
+		return;
+	}
+
+	iscsi_ep_disconnect(conn, false);
+
+	if (system_state != SYSTEM_RUNNING) {
+		/*
+		 * If the user has set up for the session to never timeout
+		 * then hang like they wanted. For all other cases fail right
+		 * away since userspace is not going to relogin.
+		 */
+		if (session->recovery_tmo > 0)
+			session->recovery_tmo = 0;
+	}
+
+	iscsi_stop_conn(conn, STOP_CONN_RECOVER);
+	mutex_unlock(&conn->ep_mutex);
+	ISCSI_DBG_TRANS_CONN(conn, "cleanup done.\n");
+}
+
 void iscsi_free_session(struct iscsi_cls_session *session)
 {
 	ISCSI_DBG_TRANS_SESSION(session, "Freeing session\n");
@@ -2267,7 +2374,7 @@ iscsi_create_conn(struct iscsi_cls_session *session, int dd_size, uint32_t cid)
 
 	mutex_init(&conn->ep_mutex);
 	INIT_LIST_HEAD(&conn->conn_list);
-	INIT_LIST_HEAD(&conn->conn_list_err);
+	INIT_WORK(&conn->cleanup_work, iscsi_cleanup_conn_work_fn);
 	conn->transport = transport;
 	conn->cid = cid;
 	conn->state = ISCSI_CONN_DOWN;
@@ -2324,7 +2431,6 @@ int iscsi_destroy_conn(struct iscsi_cls_conn *conn)
 
 	spin_lock_irqsave(&connlock, flags);
 	list_del(&conn->conn_list);
-	list_del(&conn->conn_list_err);
 	spin_unlock_irqrestore(&connlock, flags);
 
 	transport_unregister_device(&conn->dev);
@@ -2451,83 +2557,6 @@ int iscsi_offload_mesg(struct Scsi_Host *shost,
 }
 EXPORT_SYMBOL_GPL(iscsi_offload_mesg);
 
-/*
- * This can be called without the rx_queue_mutex, if invoked by the kernel
- * stop work. But, in that case, it is guaranteed not to race with
- * iscsi_destroy by conn_mutex.
- */
-static void iscsi_if_stop_conn(struct iscsi_cls_conn *conn, int flag)
-{
-	/*
-	 * It is important that this path doesn't rely on
-	 * rx_queue_mutex, otherwise, a thread doing allocation on a
-	 * start_session/start_connection could sleep waiting on a
-	 * writeback to a failed iscsi device, that cannot be recovered
-	 * because the lock is held.  If we don't hold it here, the
-	 * kernel stop_conn_work_fn has a chance to stop the broken
-	 * session and resolve the allocation.
-	 *
-	 * Still, the user invoked .stop_conn() needs to be serialized
-	 * with stop_conn_work_fn by a private mutex.  Not pretty, but
-	 * it works.
-	 */
-	mutex_lock(&conn_mutex);
-	switch (flag) {
-	case STOP_CONN_RECOVER:
-		conn->state = ISCSI_CONN_FAILED;
-		break;
-	case STOP_CONN_TERM:
-		conn->state = ISCSI_CONN_DOWN;
-		break;
-	default:
-		iscsi_cls_conn_printk(KERN_ERR, conn,
-				      "invalid stop flag %d\n", flag);
-		goto unlock;
-	}
-
-	conn->transport->stop_conn(conn, flag);
-unlock:
-	mutex_unlock(&conn_mutex);
-}
-
-static void stop_conn_work_fn(struct work_struct *work)
-{
-	struct iscsi_cls_conn *conn, *tmp;
-	unsigned long flags;
-	LIST_HEAD(recovery_list);
-
-	spin_lock_irqsave(&connlock, flags);
-	if (list_empty(&connlist_err)) {
-		spin_unlock_irqrestore(&connlock, flags);
-		return;
-	}
-	list_splice_init(&connlist_err, &recovery_list);
-	spin_unlock_irqrestore(&connlock, flags);
-
-	list_for_each_entry_safe(conn, tmp, &recovery_list, conn_list_err) {
-		uint32_t sid = iscsi_conn_get_sid(conn);
-		struct iscsi_cls_session *session;
-
-		session = iscsi_session_lookup(sid);
-		if (session) {
-			if (system_state != SYSTEM_RUNNING) {
-				/*
-				 * If the user has set up for the session to
-				 * never timeout then hang like they wanted.
-				 * For all other cases fail right away since
-				 * userspace is not going to relogin.
-				 */
-				if (session->recovery_tmo > 0)
-					session->recovery_tmo = 0;
-			}
-
-			iscsi_if_stop_conn(conn, STOP_CONN_RECOVER);
-		}
-
-		list_del_init(&conn->conn_list_err);
-	}
-}
-
 void iscsi_conn_error_event(struct iscsi_cls_conn *conn, enum iscsi_err error)
 {
 	struct nlmsghdr	*nlh;
@@ -2535,12 +2564,9 @@ void iscsi_conn_error_event(struct iscsi_cls_conn *conn, enum iscsi_err error)
 	struct iscsi_uevent *ev;
 	struct iscsi_internal *priv;
 	int len = nlmsg_total_size(sizeof(*ev));
-	unsigned long flags;
 
-	spin_lock_irqsave(&connlock, flags);
-	list_add(&conn->conn_list_err, &connlist_err);
-	spin_unlock_irqrestore(&connlock, flags);
-	queue_work(system_unbound_wq, &stop_conn_work);
+	if (!test_and_set_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn->flags))
+		queue_work(iscsi_conn_cleanup_workq, &conn->cleanup_work);
 
 	priv = iscsi_if_transport_lookup(conn->transport);
 	if (!priv)
@@ -2870,26 +2896,17 @@ static int
 iscsi_if_destroy_conn(struct iscsi_transport *transport, struct iscsi_uevent *ev)
 {
 	struct iscsi_cls_conn *conn;
-	unsigned long flags;
 
 	conn = iscsi_conn_lookup(ev->u.d_conn.sid, ev->u.d_conn.cid);
 	if (!conn)
 		return -EINVAL;
 
-	spin_lock_irqsave(&connlock, flags);
-	if (!list_empty(&conn->conn_list_err)) {
-		spin_unlock_irqrestore(&connlock, flags);
-		return -EAGAIN;
-	}
-	spin_unlock_irqrestore(&connlock, flags);
-
+	ISCSI_DBG_TRANS_CONN(conn, "Flushing cleanup during destruction\n");
+	flush_work(&conn->cleanup_work);
 	ISCSI_DBG_TRANS_CONN(conn, "Destroying transport conn\n");
 
-	mutex_lock(&conn_mutex);
 	if (transport->destroy_conn)
 		transport->destroy_conn(conn);
-	mutex_unlock(&conn_mutex);
-
 	return 0;
 }
 
@@ -2966,7 +2983,7 @@ static int iscsi_if_ep_connect(struct iscsi_transport *transport,
 }
 
 static int iscsi_if_ep_disconnect(struct iscsi_transport *transport,
-				  u64 ep_handle, bool is_active)
+				  u64 ep_handle)
 {
 	struct iscsi_cls_conn *conn;
 	struct iscsi_endpoint *ep;
@@ -2977,17 +2994,30 @@ static int iscsi_if_ep_disconnect(struct iscsi_transport *transport,
 	ep = iscsi_lookup_endpoint(ep_handle);
 	if (!ep)
 		return -EINVAL;
+
 	conn = ep->conn;
-	if (conn) {
-		mutex_lock(&conn->ep_mutex);
-		conn->ep = NULL;
+	if (!conn) {
+		/*
+		 * conn was not even bound yet, so we can't get iscsi conn
+		 * failures yet.
+		 */
+		transport->ep_disconnect(ep);
+		goto put_ep;
+	}
+
+	mutex_lock(&conn->ep_mutex);
+	/* Check if this was a conn error and the kernel took ownership */
+	if (test_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn->flags)) {
+		ISCSI_DBG_TRANS_CONN(conn, "flush kernel conn cleanup.\n");
 		mutex_unlock(&conn->ep_mutex);
-		conn->state = ISCSI_CONN_FAILED;
 
-		transport->unbind_conn(conn, is_active);
+		flush_work(&conn->cleanup_work);
+		goto put_ep;
 	}
 
-	transport->ep_disconnect(ep);
+	iscsi_ep_disconnect(conn, false);
+	mutex_unlock(&conn->ep_mutex);
+put_ep:
 	iscsi_put_endpoint(ep);
 	return 0;
 }
@@ -3018,8 +3048,7 @@ iscsi_if_transport_ep(struct iscsi_transport *transport,
 		break;
 	case ISCSI_UEVENT_TRANSPORT_EP_DISCONNECT:
 		rc = iscsi_if_ep_disconnect(transport,
-					    ev->u.ep_disconnect.ep_handle,
-					    false);
+					    ev->u.ep_disconnect.ep_handle);
 		break;
 	}
 	return rc;
@@ -3646,18 +3675,129 @@ iscsi_get_host_stats(struct iscsi_transport *transport, struct nlmsghdr *nlh)
 	return err;
 }
 
+static int iscsi_if_transport_conn(struct iscsi_transport *transport,
+				   struct nlmsghdr *nlh)
+{
+	struct iscsi_uevent *ev = nlmsg_data(nlh);
+	struct iscsi_cls_session *session;
+	struct iscsi_cls_conn *conn = NULL;
+	struct iscsi_endpoint *ep;
+	uint32_t pdu_len;
+	int err = 0;
+
+	switch (nlh->nlmsg_type) {
+	case ISCSI_UEVENT_CREATE_CONN:
+		return iscsi_if_create_conn(transport, ev);
+	case ISCSI_UEVENT_DESTROY_CONN:
+		return iscsi_if_destroy_conn(transport, ev);
+	case ISCSI_UEVENT_STOP_CONN:
+		return iscsi_if_stop_conn(transport, ev);
+	}
+
+	/*
+	 * The following cmds need to be run under the ep_mutex so in kernel
+	 * conn cleanup (ep_disconnect + unbind and conn) is not done while
+	 * these are running. They also must not run if we have just run a conn
+	 * cleanup because they would set the state in a way that might allow
+	 * IO or send IO themselves.
+	 */
+	switch (nlh->nlmsg_type) {
+	case ISCSI_UEVENT_START_CONN:
+		conn = iscsi_conn_lookup(ev->u.start_conn.sid,
+					 ev->u.start_conn.cid);
+		break;
+	case ISCSI_UEVENT_BIND_CONN:
+		conn = iscsi_conn_lookup(ev->u.b_conn.sid, ev->u.b_conn.cid);
+		break;
+	case ISCSI_UEVENT_SEND_PDU:
+		conn = iscsi_conn_lookup(ev->u.send_pdu.sid, ev->u.send_pdu.cid);
+		break;
+	}
+
+	if (!conn)
+		return -EINVAL;
+
+	mutex_lock(&conn->ep_mutex);
+	if (test_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn->flags)) {
+		mutex_unlock(&conn->ep_mutex);
+		ev->r.retcode = -ENOTCONN;
+		return 0;
+	}
+
+	switch (nlh->nlmsg_type) {
+	case ISCSI_UEVENT_BIND_CONN:
+		if (conn->ep) {
+			/*
+			 * For offload boot support where iscsid is restarted
+			 * during the pivot root stage, the ep will be intact
+			 * here when the new iscsid instance starts up and
+			 * reconnects.
+			 */
+			iscsi_ep_disconnect(conn, true);
+		}
+
+		session = iscsi_session_lookup(ev->u.b_conn.sid);
+		if (!session) {
+			err = -EINVAL;
+			break;
+		}
+
+		ev->r.retcode =	transport->bind_conn(session, conn,
+						ev->u.b_conn.transport_eph,
+						ev->u.b_conn.is_leading);
+		if (!ev->r.retcode)
+			conn->state = ISCSI_CONN_BOUND;
+
+		if (ev->r.retcode || !transport->ep_connect)
+			break;
+
+		ep = iscsi_lookup_endpoint(ev->u.b_conn.transport_eph);
+		if (ep) {
+			ep->conn = conn;
+			conn->ep = ep;
+			iscsi_put_endpoint(ep);
+		} else {
+			err = -ENOTCONN;
+			iscsi_cls_conn_printk(KERN_ERR, conn,
+					      "Could not set ep conn binding\n");
+		}
+		break;
+	case ISCSI_UEVENT_START_CONN:
+		ev->r.retcode = transport->start_conn(conn);
+		if (!ev->r.retcode)
+			conn->state = ISCSI_CONN_UP;
+		break;
+	case ISCSI_UEVENT_SEND_PDU:
+		pdu_len = nlh->nlmsg_len - sizeof(*nlh) - sizeof(*ev);
+
+		if ((ev->u.send_pdu.hdr_size > pdu_len) ||
+		    (ev->u.send_pdu.data_size > (pdu_len - ev->u.send_pdu.hdr_size))) {
+			err = -EINVAL;
+			break;
+		}
+
+		ev->r.retcode =	transport->send_pdu(conn,
+				(struct iscsi_hdr *)((char *)ev + sizeof(*ev)),
+				(char *)ev + sizeof(*ev) + ev->u.send_pdu.hdr_size,
+				ev->u.send_pdu.data_size);
+		break;
+	default:
+		err = -ENOSYS;
+	}
+
+	mutex_unlock(&conn->ep_mutex);
+	return err;
+}
 
 static int
 iscsi_if_recv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, uint32_t *group)
 {
 	int err = 0;
 	u32 portid;
-	u32 pdu_len;
 	struct iscsi_uevent *ev = nlmsg_data(nlh);
 	struct iscsi_transport *transport = NULL;
 	struct iscsi_internal *priv;
 	struct iscsi_cls_session *session;
-	struct iscsi_cls_conn *conn;
 	struct iscsi_endpoint *ep = NULL;
 
 	if (!netlink_capable(skb, CAP_SYS_ADMIN))
@@ -3734,90 +3874,16 @@ iscsi_if_recv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, uint32_t *group)
 		else
 			err = -EINVAL;
 		break;
-	case ISCSI_UEVENT_CREATE_CONN:
-		err = iscsi_if_create_conn(transport, ev);
-		break;
-	case ISCSI_UEVENT_DESTROY_CONN:
-		err = iscsi_if_destroy_conn(transport, ev);
-		break;
-	case ISCSI_UEVENT_BIND_CONN:
-		session = iscsi_session_lookup(ev->u.b_conn.sid);
-		conn = iscsi_conn_lookup(ev->u.b_conn.sid, ev->u.b_conn.cid);
-
-		if (conn && conn->ep)
-			iscsi_if_ep_disconnect(transport, conn->ep->id, true);
-
-		if (!session || !conn) {
-			err = -EINVAL;
-			break;
-		}
-
-		mutex_lock(&conn_mutex);
-		ev->r.retcode =	transport->bind_conn(session, conn,
-						ev->u.b_conn.transport_eph,
-						ev->u.b_conn.is_leading);
-		if (!ev->r.retcode)
-			conn->state = ISCSI_CONN_BOUND;
-		mutex_unlock(&conn_mutex);
-
-		if (ev->r.retcode || !transport->ep_connect)
-			break;
-
-		ep = iscsi_lookup_endpoint(ev->u.b_conn.transport_eph);
-		if (ep) {
-			ep->conn = conn;
-
-			mutex_lock(&conn->ep_mutex);
-			conn->ep = ep;
-			mutex_unlock(&conn->ep_mutex);
-			iscsi_put_endpoint(ep);
-		} else
-			iscsi_cls_conn_printk(KERN_ERR, conn,
-					      "Could not set ep conn "
-					      "binding\n");
-		break;
 	case ISCSI_UEVENT_SET_PARAM:
 		err = iscsi_set_param(transport, ev);
 		break;
-	case ISCSI_UEVENT_START_CONN:
-		conn = iscsi_conn_lookup(ev->u.start_conn.sid, ev->u.start_conn.cid);
-		if (conn) {
-			mutex_lock(&conn_mutex);
-			ev->r.retcode = transport->start_conn(conn);
-			if (!ev->r.retcode)
-				conn->state = ISCSI_CONN_UP;
-			mutex_unlock(&conn_mutex);
-		}
-		else
-			err = -EINVAL;
-		break;
+	case ISCSI_UEVENT_CREATE_CONN:
+	case ISCSI_UEVENT_DESTROY_CONN:
 	case ISCSI_UEVENT_STOP_CONN:
-		conn = iscsi_conn_lookup(ev->u.stop_conn.sid, ev->u.stop_conn.cid);
-		if (conn)
-			iscsi_if_stop_conn(conn, ev->u.stop_conn.flag);
-		else
-			err = -EINVAL;
-		break;
+	case ISCSI_UEVENT_START_CONN:
+	case ISCSI_UEVENT_BIND_CONN:
 	case ISCSI_UEVENT_SEND_PDU:
-		pdu_len = nlh->nlmsg_len - sizeof(*nlh) - sizeof(*ev);
-
-		if ((ev->u.send_pdu.hdr_size > pdu_len) ||
-		    (ev->u.send_pdu.data_size > (pdu_len - ev->u.send_pdu.hdr_size))) {
-			err = -EINVAL;
-			break;
-		}
-
-		conn = iscsi_conn_lookup(ev->u.send_pdu.sid, ev->u.send_pdu.cid);
-		if (conn) {
-			mutex_lock(&conn_mutex);
-			ev->r.retcode =	transport->send_pdu(conn,
-				(struct iscsi_hdr*)((char*)ev + sizeof(*ev)),
-				(char*)ev + sizeof(*ev) + ev->u.send_pdu.hdr_size,
-				ev->u.send_pdu.data_size);
-			mutex_unlock(&conn_mutex);
-		}
-		else
-			err = -EINVAL;
+		err = iscsi_if_transport_conn(transport, nlh);
 		break;
 	case ISCSI_UEVENT_GET_STATS:
 		err = iscsi_if_get_stats(transport, nlh);
@@ -4820,8 +4886,18 @@ static __init int iscsi_transport_init(void)
 		goto release_nls;
 	}
 
+	iscsi_conn_cleanup_workq = alloc_workqueue("%s",
+			WQ_SYSFS | WQ_MEM_RECLAIM | WQ_UNBOUND, 0,
+			"iscsi_conn_cleanup");
+	if (!iscsi_conn_cleanup_workq) {
+		err = -ENOMEM;
+		goto destroy_wq;
+	}
+
 	return 0;
 
+destroy_wq:
+	destroy_workqueue(iscsi_eh_timer_workq);
 release_nls:
 	netlink_kernel_release(nls);
 unregister_flashnode_bus:
@@ -4843,6 +4919,7 @@ static __init int iscsi_transport_init(void)
 
 static void __exit iscsi_transport_exit(void)
 {
+	destroy_workqueue(iscsi_conn_cleanup_workq);
 	destroy_workqueue(iscsi_eh_timer_workq);
 	netlink_kernel_release(nls);
 	bus_unregister(&iscsi_flashnode_bus);
diff --git a/include/scsi/scsi_transport_iscsi.h b/include/scsi/scsi_transport_iscsi.h
index 09992f019dc9..c5d7810fd792 100644
--- a/include/scsi/scsi_transport_iscsi.h
+++ b/include/scsi/scsi_transport_iscsi.h
@@ -197,15 +197,23 @@ enum iscsi_connection_state {
 	ISCSI_CONN_BOUND,
 };
 
+#define ISCSI_CLS_CONN_BIT_CLEANUP	1
+
 struct iscsi_cls_conn {
 	struct list_head conn_list;	/* item in connlist */
-	struct list_head conn_list_err;	/* item in connlist_err */
 	void *dd_data;			/* LLD private data */
 	struct iscsi_transport *transport;
 	uint32_t cid;			/* connection id */
+	/*
+	 * This protects the conn startup and binding/unbinding of the ep to
+	 * the conn. Unbinding includes ep_disconnect and stop_conn.
+	 */
 	struct mutex ep_mutex;
 	struct iscsi_endpoint *ep;
 
+	unsigned long flags;
+	struct work_struct cleanup_work;
+
 	struct device dev;		/* sysfs transport/container device */
 	enum iscsi_connection_state state;
 };
-- 
Gitee


From 7c342fdea7eb24f53c1525330344bd30c994f50e Mon Sep 17 00:00:00 2001
From: Mike Christie <michael.christie@oracle.com>
Date: Tue, 19 Jul 2022 16:30:43 +0800
Subject: [PATCH 404/674] scsi: iscsi: Move iscsi_ep_disconnect()

stable inclusion
from stable-v5.10.112
commit 699bd835c36e095dc80bea21d9a15cd7f7bc8b9f
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=699bd835c36e095dc80bea21d9a15cd7f7bc8b9f

--------------------------------

[ Upstream commit c34f95e98d8fb750eefd4f3fe58b4f8b5e89253b ]

This patch moves iscsi_ep_disconnect() so it can be called earlier in the
next patch.

Link: https://lore.kernel.org/r/20220408001314.5014-2-michael.christie@oracle.com
Tested-by: Manish Rangankar <mrangankar@marvell.com>
Reviewed-by: Lee Duncan <lduncan@suse.com>
Reviewed-by: Chris Leech <cleech@redhat.com>
Signed-off-by: Mike Christie <michael.christie@oracle.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/scsi/scsi_transport_iscsi.c | 38 ++++++++++++++---------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c
index 7246f7b2ff03..8f6dc391356d 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -2239,6 +2239,25 @@ static void iscsi_stop_conn(struct iscsi_cls_conn *conn, int flag)
 	ISCSI_DBG_TRANS_CONN(conn, "Stopping conn done.\n");
 }
 
+static void iscsi_ep_disconnect(struct iscsi_cls_conn *conn, bool is_active)
+{
+	struct iscsi_cls_session *session = iscsi_conn_to_session(conn);
+	struct iscsi_endpoint *ep;
+
+	ISCSI_DBG_TRANS_CONN(conn, "disconnect ep.\n");
+	conn->state = ISCSI_CONN_FAILED;
+
+	if (!conn->ep || !session->transport->ep_disconnect)
+		return;
+
+	ep = conn->ep;
+	conn->ep = NULL;
+
+	session->transport->unbind_conn(conn, is_active);
+	session->transport->ep_disconnect(ep);
+	ISCSI_DBG_TRANS_CONN(conn, "disconnect ep done.\n");
+}
+
 static int iscsi_if_stop_conn(struct iscsi_transport *transport,
 			      struct iscsi_uevent *ev)
 {
@@ -2279,25 +2298,6 @@ static int iscsi_if_stop_conn(struct iscsi_transport *transport,
 	return 0;
 }
 
-static void iscsi_ep_disconnect(struct iscsi_cls_conn *conn, bool is_active)
-{
-	struct iscsi_cls_session *session = iscsi_conn_to_session(conn);
-	struct iscsi_endpoint *ep;
-
-	ISCSI_DBG_TRANS_CONN(conn, "disconnect ep.\n");
-	conn->state = ISCSI_CONN_FAILED;
-
-	if (!conn->ep || !session->transport->ep_disconnect)
-		return;
-
-	ep = conn->ep;
-	conn->ep = NULL;
-
-	session->transport->unbind_conn(conn, is_active);
-	session->transport->ep_disconnect(ep);
-	ISCSI_DBG_TRANS_CONN(conn, "disconnect ep done.\n");
-}
-
 static void iscsi_cleanup_conn_work_fn(struct work_struct *work)
 {
 	struct iscsi_cls_conn *conn = container_of(work, struct iscsi_cls_conn,
-- 
Gitee


From 2f4ea143646fb62a066c48ce57ba1f9046fda321 Mon Sep 17 00:00:00 2001
From: Mike Christie <michael.christie@oracle.com>
Date: Tue, 19 Jul 2022 16:30:44 +0800
Subject: [PATCH 405/674] scsi: iscsi: Fix offload conn cleanup when iscsid
 restarts

stable inclusion
from stable-v5.10.112
commit 73805795c99ff32154c9e37c4334968c037c0f7f
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=73805795c99ff32154c9e37c4334968c037c0f7f

--------------------------------

[ Upstream commit cbd2283aaf47fef4ded4b29124b1ef3beb515f3a ]

When userspace restarts during boot or upgrades it won't know about the
offload driver's endpoint and connection mappings. iscsid will start by
cleaning up the old session by doing a stop_conn call. Later, if we are
able to create a new connection, we clean up the old endpoint during the
binding stage. The problem is that if we do stop_conn before doing the
ep_disconnect call offload, drivers can still be executing I/O. We then
might free tasks from the under the card/driver.

This moves the ep_disconnect call to before we do the stop_conn call for
this case. It will then work and look like a normal recovery/cleanup
procedure from the driver's point of view.

Link: https://lore.kernel.org/r/20220408001314.5014-3-michael.christie@oracle.com
Tested-by: Manish Rangankar <mrangankar@marvell.com>
Reviewed-by: Lee Duncan <lduncan@suse.com>
Reviewed-by: Chris Leech <cleech@redhat.com>
Signed-off-by: Mike Christie <michael.christie@oracle.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/scsi/scsi_transport_iscsi.c | 48 +++++++++++++++++------------
 1 file changed, 28 insertions(+), 20 deletions(-)

diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c
index 8f6dc391356d..38abc3a15692 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -2258,6 +2258,23 @@ static void iscsi_ep_disconnect(struct iscsi_cls_conn *conn, bool is_active)
 	ISCSI_DBG_TRANS_CONN(conn, "disconnect ep done.\n");
 }
 
+static void iscsi_if_disconnect_bound_ep(struct iscsi_cls_conn *conn,
+					 struct iscsi_endpoint *ep,
+					 bool is_active)
+{
+	/* Check if this was a conn error and the kernel took ownership */
+	if (!test_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn->flags)) {
+		iscsi_ep_disconnect(conn, is_active);
+	} else {
+		ISCSI_DBG_TRANS_CONN(conn, "flush kernel conn cleanup.\n");
+		mutex_unlock(&conn->ep_mutex);
+
+		flush_work(&conn->cleanup_work);
+
+		mutex_lock(&conn->ep_mutex);
+	}
+}
+
 static int iscsi_if_stop_conn(struct iscsi_transport *transport,
 			      struct iscsi_uevent *ev)
 {
@@ -2278,6 +2295,16 @@ static int iscsi_if_stop_conn(struct iscsi_transport *transport,
 		cancel_work_sync(&conn->cleanup_work);
 		iscsi_stop_conn(conn, flag);
 	} else {
+		/*
+		 * For offload, when iscsid is restarted it won't know about
+		 * existing endpoints so it can't do a ep_disconnect. We clean
+		 * it up here for userspace.
+		 */
+		mutex_lock(&conn->ep_mutex);
+		if (conn->ep)
+			iscsi_if_disconnect_bound_ep(conn, conn->ep, true);
+		mutex_unlock(&conn->ep_mutex);
+
 		/*
 		 * Figure out if it was the kernel or userspace initiating this.
 		 */
@@ -3006,16 +3033,7 @@ static int iscsi_if_ep_disconnect(struct iscsi_transport *transport,
 	}
 
 	mutex_lock(&conn->ep_mutex);
-	/* Check if this was a conn error and the kernel took ownership */
-	if (test_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn->flags)) {
-		ISCSI_DBG_TRANS_CONN(conn, "flush kernel conn cleanup.\n");
-		mutex_unlock(&conn->ep_mutex);
-
-		flush_work(&conn->cleanup_work);
-		goto put_ep;
-	}
-
-	iscsi_ep_disconnect(conn, false);
+	iscsi_if_disconnect_bound_ep(conn, ep, false);
 	mutex_unlock(&conn->ep_mutex);
 put_ep:
 	iscsi_put_endpoint(ep);
@@ -3726,16 +3744,6 @@ static int iscsi_if_transport_conn(struct iscsi_transport *transport,
 
 	switch (nlh->nlmsg_type) {
 	case ISCSI_UEVENT_BIND_CONN:
-		if (conn->ep) {
-			/*
-			 * For offload boot support where iscsid is restarted
-			 * during the pivot root stage, the ep will be intact
-			 * here when the new iscsid instance starts up and
-			 * reconnects.
-			 */
-			iscsi_ep_disconnect(conn, true);
-		}
-
 		session = iscsi_session_lookup(ev->u.b_conn.sid);
 		if (!session) {
 			err = -EINVAL;
-- 
Gitee


From bb9e69a1c8fe8138412cdb7d5666c33ece5d9f1c Mon Sep 17 00:00:00 2001
From: Mike Christie <michael.christie@oracle.com>
Date: Tue, 19 Jul 2022 16:30:45 +0800
Subject: [PATCH 406/674] scsi: iscsi: Fix conn cleanup and stop race during
 iscsid restart

stable inclusion
from stable-v5.10.112
commit 45226fac4d316fea68357b307cb46a4aac764ec6
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=45226fac4d316fea68357b307cb46a4aac764ec6

--------------------------------

[ Upstream commit 7c6e99c18167ed89729bf167ccb4a7e3ab3115ba ]

If iscsid is doing a stop_conn at the same time the kernel is starting
error recovery we can hit a race that allows the cleanup work to run on a
valid connection. In the race, iscsi_if_stop_conn sees the cleanup bit set,
but it calls flush_work on the clean_work before iscsi_conn_error_event has
queued it. The flush then returns before the queueing and so the
cleanup_work can run later and disconnect/stop a conn while it's in a
connected state.

The patch:

Commit 0ab710458da1 ("scsi: iscsi: Perform connection failure entirely in
kernel space")

added the late stop_conn call bug originally, and the patch:

Commit 23d6fefbb3f6 ("scsi: iscsi: Fix in-kernel conn failure handling")

attempted to fix it but only fixed the normal EH case and left the above
race for the iscsid restart case. For the normal EH case we don't hit the
race because we only signal userspace to start recovery after we have done
the queueing, so the flush will always catch the queued work or see it
completed.

For iscsid restart cases like boot, we can hit the race because iscsid will
call down to the kernel before the kernel has signaled any error, so both
code paths can be running at the same time. This adds a lock around the
setting of the cleanup bit and queueing so they happen together.

Link: https://lore.kernel.org/r/20220408001314.5014-6-michael.christie@oracle.com
Fixes: 0ab710458da1 ("scsi: iscsi: Perform connection failure entirely in kernel space")
Tested-by: Manish Rangankar <mrangankar@marvell.com>
Reviewed-by: Lee Duncan <lduncan@suse.com>
Reviewed-by: Chris Leech <cleech@redhat.com>
Signed-off-by: Mike Christie <michael.christie@oracle.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/scsi/scsi_transport_iscsi.c | 17 +++++++++++++++++
 include/scsi/scsi_transport_iscsi.h |  2 ++
 2 files changed, 19 insertions(+)

diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c
index 38abc3a15692..21cc306a0f11 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -2263,9 +2263,12 @@ static void iscsi_if_disconnect_bound_ep(struct iscsi_cls_conn *conn,
 					 bool is_active)
 {
 	/* Check if this was a conn error and the kernel took ownership */
+	spin_lock_irq(&conn->lock);
 	if (!test_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn->flags)) {
+		spin_unlock_irq(&conn->lock);
 		iscsi_ep_disconnect(conn, is_active);
 	} else {
+		spin_unlock_irq(&conn->lock);
 		ISCSI_DBG_TRANS_CONN(conn, "flush kernel conn cleanup.\n");
 		mutex_unlock(&conn->ep_mutex);
 
@@ -2308,9 +2311,12 @@ static int iscsi_if_stop_conn(struct iscsi_transport *transport,
 		/*
 		 * Figure out if it was the kernel or userspace initiating this.
 		 */
+		spin_lock_irq(&conn->lock);
 		if (!test_and_set_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn->flags)) {
+			spin_unlock_irq(&conn->lock);
 			iscsi_stop_conn(conn, flag);
 		} else {
+			spin_unlock_irq(&conn->lock);
 			ISCSI_DBG_TRANS_CONN(conn,
 					     "flush kernel conn cleanup.\n");
 			flush_work(&conn->cleanup_work);
@@ -2319,7 +2325,9 @@ static int iscsi_if_stop_conn(struct iscsi_transport *transport,
 		 * Only clear for recovery to avoid extra cleanup runs during
 		 * termination.
 		 */
+		spin_lock_irq(&conn->lock);
 		clear_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn->flags);
+		spin_unlock_irq(&conn->lock);
 	}
 	ISCSI_DBG_TRANS_CONN(conn, "iscsi if conn stop done.\n");
 	return 0;
@@ -2340,7 +2348,9 @@ static void iscsi_cleanup_conn_work_fn(struct work_struct *work)
 	 */
 	if (conn->state != ISCSI_CONN_BOUND && conn->state != ISCSI_CONN_UP) {
 		ISCSI_DBG_TRANS_CONN(conn, "Got error while conn is already failed. Ignoring.\n");
+		spin_lock_irq(&conn->lock);
 		clear_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn->flags);
+		spin_unlock_irq(&conn->lock);
 		mutex_unlock(&conn->ep_mutex);
 		return;
 	}
@@ -2400,6 +2410,7 @@ iscsi_create_conn(struct iscsi_cls_session *session, int dd_size, uint32_t cid)
 		conn->dd_data = &conn[1];
 
 	mutex_init(&conn->ep_mutex);
+	spin_lock_init(&conn->lock);
 	INIT_LIST_HEAD(&conn->conn_list);
 	INIT_WORK(&conn->cleanup_work, iscsi_cleanup_conn_work_fn);
 	conn->transport = transport;
@@ -2591,9 +2602,12 @@ void iscsi_conn_error_event(struct iscsi_cls_conn *conn, enum iscsi_err error)
 	struct iscsi_uevent *ev;
 	struct iscsi_internal *priv;
 	int len = nlmsg_total_size(sizeof(*ev));
+	unsigned long flags;
 
+	spin_lock_irqsave(&conn->lock, flags);
 	if (!test_and_set_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn->flags))
 		queue_work(iscsi_conn_cleanup_workq, &conn->cleanup_work);
+	spin_unlock_irqrestore(&conn->lock, flags);
 
 	priv = iscsi_if_transport_lookup(conn->transport);
 	if (!priv)
@@ -3736,11 +3750,14 @@ static int iscsi_if_transport_conn(struct iscsi_transport *transport,
 		return -EINVAL;
 
 	mutex_lock(&conn->ep_mutex);
+	spin_lock_irq(&conn->lock);
 	if (test_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn->flags)) {
+		spin_unlock_irq(&conn->lock);
 		mutex_unlock(&conn->ep_mutex);
 		ev->r.retcode = -ENOTCONN;
 		return 0;
 	}
+	spin_unlock_irq(&conn->lock);
 
 	switch (nlh->nlmsg_type) {
 	case ISCSI_UEVENT_BIND_CONN:
diff --git a/include/scsi/scsi_transport_iscsi.h b/include/scsi/scsi_transport_iscsi.h
index c5d7810fd792..037c77fb5dc5 100644
--- a/include/scsi/scsi_transport_iscsi.h
+++ b/include/scsi/scsi_transport_iscsi.h
@@ -211,6 +211,8 @@ struct iscsi_cls_conn {
 	struct mutex ep_mutex;
 	struct iscsi_endpoint *ep;
 
+	/* Used when accessing flags and queueing work. */
+	spinlock_t lock;
 	unsigned long flags;
 	struct work_struct cleanup_work;
 
-- 
Gitee


From 27ad469e84a31f70057c77002b6a49892c519c2d Mon Sep 17 00:00:00 2001
From: Petr Malat <oss@malat.biz>
Date: Tue, 19 Jul 2022 16:30:46 +0800
Subject: [PATCH 407/674] sctp: Initialize daddr on peeled off socket

stable inclusion
from stable-v5.10.112
commit eb8873b324d918eeaa572d08d1d05785f4fb1e0a
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=eb8873b324d918eeaa572d08d1d05785f4fb1e0a

--------------------------------

[ Upstream commit 8467dda0c26583547731e7f3ea73fc3856bae3bf ]

Function sctp_do_peeloff() wrongly initializes daddr of the original
socket instead of the peeled off socket, which makes getpeername()
return zeroes instead of the primary address. Initialize the new socket
instead.

Fixes: d570ee490fb1 ("[SCTP]: Correctly set daddr for IPv6 sockets during peeloff")
Signed-off-by: Petr Malat <oss@malat.biz>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Link: https://lore.kernel.org/r/20220409063611.673193-1-oss@malat.biz
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 net/sctp/socket.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 0a9e2c7d8e5f..e9b4ea3d934f 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -5518,7 +5518,7 @@ int sctp_do_peeloff(struct sock *sk, sctp_assoc_t id, struct socket **sockp)
 	 * Set the daddr and initialize id to something more random and also
 	 * copy over any ip options.
 	 */
-	sp->pf->to_sk_daddr(&asoc->peer.primary_addr, sk);
+	sp->pf->to_sk_daddr(&asoc->peer.primary_addr, sock->sk);
 	sp->pf->copy_ip_options(sk, sock->sk);
 
 	/* Populate the fields of the newsk from the oldsk and migrate the
-- 
Gitee


From 63bb182af29369522607c1173290629e068c3ae0 Mon Sep 17 00:00:00 2001
From: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
Date: Tue, 19 Jul 2022 16:30:47 +0800
Subject: [PATCH 408/674] testing/selftests/mqueue: Fix mq_perf_tests to free
 the allocated cpu set

stable inclusion
from stable-v5.10.112
commit 280f721edc54a782f1dfcec573ee929e92bf186a
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=280f721edc54a782f1dfcec573ee929e92bf186a

--------------------------------

[ Upstream commit ce64763c63854b4079f2e036638aa881a1fb3fbc ]

The selftest "mqueue/mq_perf_tests.c" use CPU_ALLOC to allocate
CPU set. This cpu set is used further in pthread_attr_setaffinity_np
and by pthread_create in the code. But in current code, allocated
cpu set is not freed.

Fix this issue by adding CPU_FREE in the "shutdown" function which
is called in most of the error/exit path for the cleanup. There are
few error paths which exit without using shutdown. Add a common goto
error path with CPU_FREE for these cases.

Fixes: 7820b0715b6f ("tools/selftests: add mq_perf_tests")
Signed-off-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 .../testing/selftests/mqueue/mq_perf_tests.c  | 25 +++++++++++++------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/tools/testing/selftests/mqueue/mq_perf_tests.c b/tools/testing/selftests/mqueue/mq_perf_tests.c
index b019e0b8221c..84fda3b49073 100644
--- a/tools/testing/selftests/mqueue/mq_perf_tests.c
+++ b/tools/testing/selftests/mqueue/mq_perf_tests.c
@@ -180,6 +180,9 @@ void shutdown(int exit_val, char *err_cause, int line_no)
 	if (in_shutdown++)
 		return;
 
+	/* Free the cpu_set allocated using CPU_ALLOC in main function */
+	CPU_FREE(cpu_set);
+
 	for (i = 0; i < num_cpus_to_pin; i++)
 		if (cpu_threads[i]) {
 			pthread_kill(cpu_threads[i], SIGUSR1);
@@ -551,6 +554,12 @@ int main(int argc, char *argv[])
 		perror("sysconf(_SC_NPROCESSORS_ONLN)");
 		exit(1);
 	}
+
+	if (getuid() != 0)
+		ksft_exit_skip("Not running as root, but almost all tests "
+			"require root in order to modify\nsystem settings.  "
+			"Exiting.\n");
+
 	cpus_online = min(MAX_CPUS, sysconf(_SC_NPROCESSORS_ONLN));
 	cpu_set = CPU_ALLOC(cpus_online);
 	if (cpu_set == NULL) {
@@ -589,7 +598,7 @@ int main(int argc, char *argv[])
 						cpu_set)) {
 					fprintf(stderr, "Any given CPU may "
 						"only be given once.\n");
-					exit(1);
+					goto err_code;
 				} else
 					CPU_SET_S(cpus_to_pin[cpu],
 						  cpu_set_size, cpu_set);
@@ -607,7 +616,7 @@ int main(int argc, char *argv[])
 				queue_path = malloc(strlen(option) + 2);
 				if (!queue_path) {
 					perror("malloc()");
-					exit(1);
+					goto err_code;
 				}
 				queue_path[0] = '/';
 				queue_path[1] = 0;
@@ -622,17 +631,12 @@ int main(int argc, char *argv[])
 		fprintf(stderr, "Must pass at least one CPU to continuous "
 			"mode.\n");
 		poptPrintUsage(popt_context, stderr, 0);
-		exit(1);
+		goto err_code;
 	} else if (!continuous_mode) {
 		num_cpus_to_pin = 1;
 		cpus_to_pin[0] = cpus_online - 1;
 	}
 
-	if (getuid() != 0)
-		ksft_exit_skip("Not running as root, but almost all tests "
-			"require root in order to modify\nsystem settings.  "
-			"Exiting.\n");
-
 	max_msgs = fopen(MAX_MSGS, "r+");
 	max_msgsize = fopen(MAX_MSGSIZE, "r+");
 	if (!max_msgs)
@@ -740,4 +744,9 @@ int main(int argc, char *argv[])
 			sleep(1);
 	}
 	shutdown(0, "", 0);
+
+err_code:
+	CPU_FREE(cpu_set);
+	exit(1);
+
 }
-- 
Gitee


From 861b9ebf493ff71b55d558fcb2d05ba75d25e3ca Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Tue, 19 Jul 2022 16:30:48 +0800
Subject: [PATCH 409/674] perf tools: Fix misleading add event PMU debug
 message

stable inclusion
from stable-v5.10.112
commit bfba9722cf2e801af181a56072f92f924ad7b156
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=bfba9722cf2e801af181a56072f92f924ad7b156

--------------------------------

[ Upstream commit f034fc50d3c7d9385c20d505ab4cf56b8fd18ac7 ]

Fix incorrect debug message:

   Attempting to add event pmu 'intel_pt' with '' that may result in
   non-fatal errors

which always appears with perf record -vv and intel_pt e.g.

    perf record -vv -e intel_pt//u uname

The message is incorrect because there will never be non-fatal errors.

Suppress the message if the PMU is 'selectable' i.e. meant to be
selected directly as an event.

Fixes: 4ac22b484d4c79e8 ("perf parse-events: Make add PMU verbose output clearer")
Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Link: http://lore.kernel.org/lkml/20220411061758.2458417-1-adrian.hunter@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 tools/perf/util/parse-events.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index d0408320c434..db3c85bb4fd0 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -1446,7 +1446,9 @@ int parse_events_add_pmu(struct parse_events_state *parse_state,
 	bool use_uncore_alias;
 	LIST_HEAD(config_terms);
 
-	if (verbose > 1) {
+	pmu = parse_state->fake_pmu ?: perf_pmu__find(name);
+
+	if (verbose > 1 && !(pmu && pmu->selectable)) {
 		fprintf(stderr, "Attempting to add event pmu '%s' with '",
 			name);
 		if (head_config) {
@@ -1459,7 +1461,6 @@ int parse_events_add_pmu(struct parse_events_state *parse_state,
 		fprintf(stderr, "' that may result in non-fatal errors\n");
 	}
 
-	pmu = parse_state->fake_pmu ?: perf_pmu__find(name);
 	if (!pmu) {
 		char *err_str;
 
-- 
Gitee


From 1f92c40221e98e94c159274b21e172ae137de8c1 Mon Sep 17 00:00:00 2001
From: Lin Ma <linma@zju.edu.cn>
Date: Tue, 19 Jul 2022 16:30:49 +0800
Subject: [PATCH 410/674] nfc: nci: add flush_workqueue to prevent uaf

stable inclusion
from stable-v5.10.112
commit 67677050cecbe0edfdd81cd508415e9636ba7c65
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=67677050cecbe0edfdd81cd508415e9636ba7c65

--------------------------------

[ Upstream commit ef27324e2cb7bb24542d6cb2571740eefe6b00dc ]

Our detector found a concurrent use-after-free bug when detaching an
NCI device. The main reason for this bug is the unexpected scheduling
between the used delayed mechanism (timer and workqueue).

The race can be demonstrated below:

Thread-1                           Thread-2
                                 | nci_dev_up()
                                 |   nci_open_device()
                                 |     __nci_request(nci_reset_req)
                                 |       nci_send_cmd
                                 |         queue_work(cmd_work)
nci_unregister_device()          |
  nci_close_device()             | ...
    del_timer_sync(cmd_timer)[1] |
...                              | Worker
nci_free_device()                | nci_cmd_work()
  kfree(ndev)[3]                 |   mod_timer(cmd_timer)[2]

In short, the cleanup routine thought that the cmd_timer has already
been detached by [1] but the mod_timer can re-attach the timer [2], even
it is already released [3], resulting in UAF.

This UAF is easy to trigger, crash trace by POC is like below

[   66.703713] ==================================================================
[   66.703974] BUG: KASAN: use-after-free in enqueue_timer+0x448/0x490
[   66.703974] Write of size 8 at addr ffff888009fb7058 by task kworker/u4:1/33
[   66.703974]
[   66.703974] CPU: 1 PID: 33 Comm: kworker/u4:1 Not tainted 5.18.0-rc2 #5
[   66.703974] Workqueue: nfc2_nci_cmd_wq nci_cmd_work
[   66.703974] Call Trace:
[   66.703974]  <TASK>
[   66.703974]  dump_stack_lvl+0x57/0x7d
[   66.703974]  print_report.cold+0x5e/0x5db
[   66.703974]  ? enqueue_timer+0x448/0x490
[   66.703974]  kasan_report+0xbe/0x1c0
[   66.703974]  ? enqueue_timer+0x448/0x490
[   66.703974]  enqueue_timer+0x448/0x490
[   66.703974]  __mod_timer+0x5e6/0xb80
[   66.703974]  ? mark_held_locks+0x9e/0xe0
[   66.703974]  ? try_to_del_timer_sync+0xf0/0xf0
[   66.703974]  ? lockdep_hardirqs_on_prepare+0x17b/0x410
[   66.703974]  ? queue_work_on+0x61/0x80
[   66.703974]  ? lockdep_hardirqs_on+0xbf/0x130
[   66.703974]  process_one_work+0x8bb/0x1510
[   66.703974]  ? lockdep_hardirqs_on_prepare+0x410/0x410
[   66.703974]  ? pwq_dec_nr_in_flight+0x230/0x230
[   66.703974]  ? rwlock_bug.part.0+0x90/0x90
[   66.703974]  ? _raw_spin_lock_irq+0x41/0x50
[   66.703974]  worker_thread+0x575/0x1190
[   66.703974]  ? process_one_work+0x1510/0x1510
[   66.703974]  kthread+0x2a0/0x340
[   66.703974]  ? kthread_complete_and_exit+0x20/0x20
[   66.703974]  ret_from_fork+0x22/0x30
[   66.703974]  </TASK>
[   66.703974]
[   66.703974] Allocated by task 267:
[   66.703974]  kasan_save_stack+0x1e/0x40
[   66.703974]  __kasan_kmalloc+0x81/0xa0
[   66.703974]  nci_allocate_device+0xd3/0x390
[   66.703974]  nfcmrvl_nci_register_dev+0x183/0x2c0
[   66.703974]  nfcmrvl_nci_uart_open+0xf2/0x1dd
[   66.703974]  nci_uart_tty_ioctl+0x2c3/0x4a0
[   66.703974]  tty_ioctl+0x764/0x1310
[   66.703974]  __x64_sys_ioctl+0x122/0x190
[   66.703974]  do_syscall_64+0x3b/0x90
[   66.703974]  entry_SYSCALL_64_after_hwframe+0x44/0xae
[   66.703974]
[   66.703974] Freed by task 406:
[   66.703974]  kasan_save_stack+0x1e/0x40
[   66.703974]  kasan_set_track+0x21/0x30
[   66.703974]  kasan_set_free_info+0x20/0x30
[   66.703974]  __kasan_slab_free+0x108/0x170
[   66.703974]  kfree+0xb0/0x330
[   66.703974]  nfcmrvl_nci_unregister_dev+0x90/0xd0
[   66.703974]  nci_uart_tty_close+0xdf/0x180
[   66.703974]  tty_ldisc_kill+0x73/0x110
[   66.703974]  tty_ldisc_hangup+0x281/0x5b0
[   66.703974]  __tty_hangup.part.0+0x431/0x890
[   66.703974]  tty_release+0x3a8/0xc80
[   66.703974]  __fput+0x1f0/0x8c0
[   66.703974]  task_work_run+0xc9/0x170
[   66.703974]  exit_to_user_mode_prepare+0x194/0x1a0
[   66.703974]  syscall_exit_to_user_mode+0x19/0x50
[   66.703974]  do_syscall_64+0x48/0x90
[   66.703974]  entry_SYSCALL_64_after_hwframe+0x44/0xae

To fix the UAF, this patch adds flush_workqueue() to ensure the
nci_cmd_work is finished before the following del_timer_sync.
This combination will promise the timer is actually detached.

Fixes: 6a2968aaf50c ("NFC: basic NCI protocol implementation")
Signed-off-by: Lin Ma <linma@zju.edu.cn>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 net/nfc/nci/core.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c
index e38719e2ee58..2cfff70f70e0 100644
--- a/net/nfc/nci/core.c
+++ b/net/nfc/nci/core.c
@@ -548,6 +548,10 @@ static int nci_close_device(struct nci_dev *ndev)
 	mutex_lock(&ndev->req_lock);
 
 	if (!test_and_clear_bit(NCI_UP, &ndev->flags)) {
+		/* Need to flush the cmd wq in case
+		 * there is a queued/running cmd_work
+		 */
+		flush_workqueue(ndev->cmd_wq);
 		del_timer_sync(&ndev->cmd_timer);
 		del_timer_sync(&ndev->data_timer);
 		mutex_unlock(&ndev->req_lock);
-- 
Gitee


From 6c99e853acfb83e08bdd1f0718bd2ed0140e1d91 Mon Sep 17 00:00:00 2001
From: Harshit Mogalapalli <harshit.m.mogalapalli@oracle.com>
Date: Tue, 19 Jul 2022 16:30:50 +0800
Subject: [PATCH 411/674] cifs: potential buffer overflow in handling symlinks

stable inclusion
from stable-v5.10.112
commit 4e166a41180be2f1e66bbb6d46448e80a9a5ec05
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=4e166a41180be2f1e66bbb6d46448e80a9a5ec05

--------------------------------

[ Upstream commit 64c4a37ac04eeb43c42d272f6e6c8c12bfcf4304 ]

Smatch printed a warning:
	arch/x86/crypto/poly1305_glue.c:198 poly1305_update_arch() error:
	__memcpy() 'dctx->buf' too small (16 vs u32max)

It's caused because Smatch marks 'link_len' as untrusted since it comes
from sscanf(). Add a check to ensure that 'link_len' is not larger than
the size of the 'link_str' buffer.

Fixes: c69c1b6eaea1 ("cifs: implement CIFSParseMFSymlink()")
Signed-off-by: Harshit Mogalapalli <harshit.m.mogalapalli@oracle.com>
Reviewed-by: Ronnie Sahlberg <lsahlber@redhat.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 fs/cifs/link.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 94dab4309fbb..85d30fef98a2 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -97,6 +97,9 @@ parse_mf_symlink(const u8 *buf, unsigned int buf_len, unsigned int *_link_len,
 	if (rc != 1)
 		return -EINVAL;
 
+	if (link_len > CIFS_MF_SYMLINK_LINK_MAXLEN)
+		return -EINVAL;
+
 	rc = symlink_hash(link_len, link_str, md5_hash);
 	if (rc) {
 		cifs_dbg(FYI, "%s: MD5 hash failure: %d\n", __func__, rc);
-- 
Gitee


From def148dda90b1b8026cc515078cdcae72a4a3742 Mon Sep 17 00:00:00 2001
From: Khazhismel Kumykov <khazhy@google.com>
Date: Tue, 19 Jul 2022 16:30:51 +0800
Subject: [PATCH 412/674] dm mpath: only use ktime_get_ns() in historical
 selector

stable inclusion
from stable-v5.10.112
commit 504c15f07f541f87185113ccd485a814adf31e4f
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=504c15f07f541f87185113ccd485a814adf31e4f

--------------------------------

[ Upstream commit ce40426fdc3c92acdba6b5ca74bc7277ffaa6a3d ]

Mixing sched_clock() and ktime_get_ns() usage will give bad results.

Switch hst_select_path() from using sched_clock() to ktime_get_ns().
Also rename path_service_time()'s 'sched_now' variable to 'now'.

Fixes: 2613eab11996 ("dm mpath: add Historical Service Time Path Selector")
Signed-off-by: Khazhismel Kumykov <khazhy@google.com>
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/md/dm-historical-service-time.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/md/dm-historical-service-time.c b/drivers/md/dm-historical-service-time.c
index 186f91e2752c..06fe43c13ba3 100644
--- a/drivers/md/dm-historical-service-time.c
+++ b/drivers/md/dm-historical-service-time.c
@@ -429,7 +429,7 @@ static struct dm_path *hst_select_path(struct path_selector *ps,
 {
 	struct selector *s = ps->context;
 	struct path_info *pi = NULL, *best = NULL;
-	u64 time_now = sched_clock();
+	u64 time_now = ktime_get_ns();
 	struct dm_path *ret = NULL;
 	unsigned long flags;
 
@@ -470,7 +470,7 @@ static int hst_start_io(struct path_selector *ps, struct dm_path *path,
 
 static u64 path_service_time(struct path_info *pi, u64 start_time)
 {
-	u64 sched_now = ktime_get_ns();
+	u64 now = ktime_get_ns();
 
 	/* if a previous disk request has finished after this IO was
 	 * sent to the hardware, pretend the submission happened
@@ -479,11 +479,11 @@ static u64 path_service_time(struct path_info *pi, u64 start_time)
 	if (time_after64(pi->last_finish, start_time))
 		start_time = pi->last_finish;
 
-	pi->last_finish = sched_now;
-	if (time_before64(sched_now, start_time))
+	pi->last_finish = now;
+	if (time_before64(now, start_time))
 		return 0;
 
-	return sched_now - start_time;
+	return now - start_time;
 }
 
 static int hst_end_io(struct path_selector *ps, struct dm_path *path,
-- 
Gitee


From 27806a5a06b3e9807856235d29669faf104a209c Mon Sep 17 00:00:00 2001
From: Jeremy Linton <jeremy.linton@arm.com>
Date: Tue, 19 Jul 2022 16:30:52 +0800
Subject: [PATCH 413/674] net: bcmgenet: Revert "Use stronger register
 read/writes to assure ordering"

stable inclusion
from stable-v5.10.112
commit 6f9c06501d288c96ebfb651443dd365b0aca3757
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=6f9c06501d288c96ebfb651443dd365b0aca3757

--------------------------------

[ Upstream commit 2df3fc4a84e917a422935cc5bae18f43f9955d31 ]

It turns out after digging deeper into this bug, that it was being
triggered by GCC12 failing to call the bcmgenet_enable_dma()
routine. Given that a gcc12 fix has been merged [1] and the genet
driver now works properly when built with gcc12, this commit should
be reverted.

[1]
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105160
https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=aabb9a261ef060cf24fd626713f1d7d9df81aa57

Fixes: 8d3ea3d402db ("net: bcmgenet: Use stronger register read/writes to assure ordering")
Signed-off-by: Jeremy Linton <jeremy.linton@arm.com>
Acked-by: Florian Fainelli <f.fainelli@gmail.com>
Link: https://lore.kernel.org/r/20220412210420.1129430-1-jeremy.linton@arm.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/net/ethernet/broadcom/genet/bcmgenet.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
index d676e59eb82d..5de37c33a737 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
@@ -76,7 +76,7 @@ static inline void bcmgenet_writel(u32 value, void __iomem *offset)
 	if (IS_ENABLED(CONFIG_MIPS) && IS_ENABLED(CONFIG_CPU_BIG_ENDIAN))
 		__raw_writel(value, offset);
 	else
-		writel(value, offset);
+		writel_relaxed(value, offset);
 }
 
 static inline u32 bcmgenet_readl(void __iomem *offset)
@@ -84,7 +84,7 @@ static inline u32 bcmgenet_readl(void __iomem *offset)
 	if (IS_ENABLED(CONFIG_MIPS) && IS_ENABLED(CONFIG_CPU_BIG_ENDIAN))
 		return __raw_readl(offset);
 	else
-		return readl(offset);
+		return readl_relaxed(offset);
 }
 
 static inline void dmadesc_set_length_status(struct bcmgenet_priv *priv,
-- 
Gitee


From 5a088293eeb41bc6a6282b491a354a4e6394b80f Mon Sep 17 00:00:00 2001
From: Aurabindo Pillai <aurabindo.pillai@amd.com>
Date: Tue, 19 Jul 2022 16:30:53 +0800
Subject: [PATCH 414/674] drm/amd: Add USBC connector ID

stable inclusion
from stable-v5.10.112
commit 9b5d1b3413d784fea12c387c27dbd10e3446e404
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=9b5d1b3413d784fea12c387c27dbd10e3446e404

--------------------------------

[ Upstream commit c5c948aa894a831f96fccd025e47186b1ee41615 ]

[Why&How] Add a dedicated AMDGPU specific ID for use with
newer ASICs that support USB-C output

Signed-off-by: Aurabindo Pillai <aurabindo.pillai@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/gpu/drm/amd/amdgpu/ObjectID.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/ObjectID.h b/drivers/gpu/drm/amd/amdgpu/ObjectID.h
index 5b393622f592..a0f0a17e224f 100644
--- a/drivers/gpu/drm/amd/amdgpu/ObjectID.h
+++ b/drivers/gpu/drm/amd/amdgpu/ObjectID.h
@@ -119,6 +119,7 @@
 #define CONNECTOR_OBJECT_ID_eDP                   0x14
 #define CONNECTOR_OBJECT_ID_MXM                   0x15
 #define CONNECTOR_OBJECT_ID_LVDS_eDP              0x16
+#define CONNECTOR_OBJECT_ID_USBC                  0x17
 
 /* deleted */
 
-- 
Gitee


From 3c484dfda822f1c6cf3d9ae7c9edc0814a0647f1 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <djwong@kernel.org>
Date: Tue, 19 Jul 2022 16:30:54 +0800
Subject: [PATCH 415/674] btrfs: fix fallocate to use file_modified to update
 permissions consistently

stable inclusion
from stable-v5.10.112
commit 217190dc66efdf58e04a5c1f239f330c4c47d81b
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=217190dc66efdf58e04a5c1f239f330c4c47d81b

--------------------------------

[ Upstream commit 05fd9564e9faf0f23b4676385e27d9405cef6637 ]

Since the initial introduction of (posix) fallocate back at the turn of
the century, it has been possible to use this syscall to change the
user-visible contents of files.  This can happen by extending the file
size during a preallocation, or through any of the newer modes (punch,
zero range).  Because the call can be used to change file contents, we
should treat it like we do any other modification to a file -- update
the mtime, and drop set[ug]id privileges/capabilities.

The VFS function file_modified() does all this for us if pass it a
locked inode, so let's make fallocate drop permissions correctly.

Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 fs/btrfs/file.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index f59ec55e5feb..416a1b753ff6 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2833,8 +2833,9 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
 	return ret;
 }
 
-static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
+static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
 {
+	struct inode *inode = file_inode(file);
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_state *cached_state = NULL;
@@ -2866,6 +2867,10 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 		goto out_only_mutex;
 	}
 
+	ret = file_modified(file);
+	if (ret)
+		goto out_only_mutex;
+
 	lockstart = round_up(offset, btrfs_inode_sectorsize(BTRFS_I(inode)));
 	lockend = round_down(offset + len,
 			     btrfs_inode_sectorsize(BTRFS_I(inode))) - 1;
@@ -3301,7 +3306,7 @@ static long btrfs_fallocate(struct file *file, int mode,
 		return -EOPNOTSUPP;
 
 	if (mode & FALLOC_FL_PUNCH_HOLE)
-		return btrfs_punch_hole(inode, offset, len);
+		return btrfs_punch_hole(file, offset, len);
 
 	/*
 	 * Only trigger disk allocation, don't trigger qgroup reserve
@@ -3323,6 +3328,10 @@ static long btrfs_fallocate(struct file *file, int mode,
 			goto out;
 	}
 
+	ret = file_modified(file);
+	if (ret)
+		goto out;
+
 	/*
 	 * TODO: Move these two operations after we have checked
 	 * accurate reserved space, or fallocate can still fail but
-- 
Gitee


From 66c9a7dc806a961055c7b9ede42f4d06905a1e1a Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Tue, 19 Jul 2022 16:30:55 +0800
Subject: [PATCH 416/674] btrfs: do not warn for free space inode in
 cow_file_range

stable inclusion
from stable-v5.10.112
commit 76e086ce7b2d71b0d7a1a121c5f53b5fa07c1c8c
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=76e086ce7b2d71b0d7a1a121c5f53b5fa07c1c8c

--------------------------------

[ Upstream commit a7d16d9a07bbcb7dcd5214a1bea75c808830bc0d ]

This is a long time leftover from when I originally added the free space
inode, the point was to catch cases where we weren't honoring the NOCOW
flag.  However there exists a race with relocation, if we allocate our
free space inode in a block group that is about to be relocated, we
could trigger the COW path before the relocation has the opportunity to
find the extents and delete the free space cache.  In production where
we have auto-relocation enabled we're seeing this WARN_ON_ONCE() around
5k times in a 2 week period, so not super common but enough that it's at
the top of our metrics.

We're properly handling the error here, and with us phasing out v1 space
cache anyway just drop the WARN_ON_ONCE.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 fs/btrfs/inode.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f7f4ac01589b..4a5248097d7a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -995,7 +995,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
 	int ret = 0;
 
 	if (btrfs_is_free_space_inode(inode)) {
-		WARN_ON_ONCE(1);
 		ret = -EINVAL;
 		goto out_unlock;
 	}
-- 
Gitee


From 4b68fc4d27954159a26562642360a24f6a435678 Mon Sep 17 00:00:00 2001
From: Charlene Liu <Charlene.Liu@amd.com>
Date: Tue, 19 Jul 2022 16:30:56 +0800
Subject: [PATCH 417/674] drm/amd/display: fix audio format not updated after
 edid updated

stable inclusion
from stable-v5.10.112
commit 756c61c1680f435292b7d06072a8f390f551bca6
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=756c61c1680f435292b7d06072a8f390f551bca6

--------------------------------

[ Upstream commit 5e8a71cf13bc9184fee915b2220be71b4c6cac74 ]

[why]
for the case edid change only changed audio format.
driver still need to update stream.

Reviewed-by: Alvin Lee <Alvin.Lee2@amd.com>
Reviewed-by: Aric Cyr <Aric.Cyr@amd.com>
Acked-by: Alex Hung <alex.hung@amd.com>
Signed-off-by: Charlene Liu <Charlene.Liu@amd.com>
Tested-by: Daniel Wheeler <daniel.wheeler@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/gpu/drm/amd/display/dc/core/dc_resource.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_resource.c b/drivers/gpu/drm/amd/display/dc/core/dc_resource.c
index 5c5ccbad9658..1e47afc4ccc1 100644
--- a/drivers/gpu/drm/amd/display/dc/core/dc_resource.c
+++ b/drivers/gpu/drm/amd/display/dc/core/dc_resource.c
@@ -1701,8 +1701,8 @@ bool dc_is_stream_unchanged(
 	if (old_stream->ignore_msa_timing_param != stream->ignore_msa_timing_param)
 		return false;
 
-	// Only Have Audio left to check whether it is same or not. This is a corner case for Tiled sinks
-	if (old_stream->audio_info.mode_count != stream->audio_info.mode_count)
+	/*compare audio info*/
+	if (memcmp(&old_stream->audio_info, &stream->audio_info, sizeof(stream->audio_info)) != 0)
 		return false;
 
 	return true;
-- 
Gitee


From 4b7c332d3a355b2841bd5c15e5f67be2f23baac1 Mon Sep 17 00:00:00 2001
From: Chiawen Huang <chiawen.huang@amd.com>
Date: Tue, 19 Jul 2022 16:30:57 +0800
Subject: [PATCH 418/674] drm/amd/display: FEC check in timing validation

stable inclusion
from stable-v5.10.112
commit 6906e05cf3ad2cfe704eda003de59aa0cb89b291
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=6906e05cf3ad2cfe704eda003de59aa0cb89b291

--------------------------------

[ Upstream commit 7d56a154e22ffb3613fdebf83ec34d5225a22993 ]

[Why]
disable/enable leads FEC mismatch between hw/sw FEC state.

[How]
check FEC status to fastboot on/off.

Reviewed-by: Anthony Koo <Anthony.Koo@amd.com>
Acked-by: Alex Hung <alex.hung@amd.com>
Signed-off-by: Chiawen Huang <chiawen.huang@amd.com>
Tested-by: Daniel Wheeler <daniel.wheeler@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/gpu/drm/amd/display/dc/core/dc.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c
index 93f5229c303e..ac5323596c65 100644
--- a/drivers/gpu/drm/amd/display/dc/core/dc.c
+++ b/drivers/gpu/drm/amd/display/dc/core/dc.c
@@ -1173,6 +1173,10 @@ bool dc_validate_seamless_boot_timing(const struct dc *dc,
 	if (!link->link_enc->funcs->is_dig_enabled(link->link_enc))
 		return false;
 
+	/* Check for FEC status*/
+	if (link->link_enc->funcs->fec_is_active(link->link_enc))
+		return false;
+
 	enc_inst = link->link_enc->funcs->get_dig_frontend(link->link_enc);
 
 	if (enc_inst == ENGINE_ID_UNKNOWN)
-- 
Gitee


From 12565d62e538545a7394254551290145099600ed Mon Sep 17 00:00:00 2001
From: "Leo (Hanghong) Ma" <hanghong.ma@amd.com>
Date: Tue, 19 Jul 2022 16:30:58 +0800
Subject: [PATCH 419/674] drm/amd/display: Update VTEM Infopacket definition

stable inclusion
from stable-v5.10.112
commit 7fc0610ad8182d05596e671c950bca1f3921d5e0
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=7fc0610ad8182d05596e671c950bca1f3921d5e0

--------------------------------

[ Upstream commit c9fbf6435162ed5fb7201d1d4adf6585c6a8c327 ]

[Why & How]
The latest HDMI SPEC has updated the VTEM packet structure,
so change the VTEM Infopacket defined in the driver side to align
with the SPEC.

Reviewed-by: Chris Park <Chris.Park@amd.com>
Acked-by: Alex Hung <alex.hung@amd.com>
Signed-off-by: Leo (Hanghong) Ma <hanghong.ma@amd.com>
Tested-by: Daniel Wheeler <daniel.wheeler@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 .../gpu/drm/amd/display/modules/info_packet/info_packet.c    | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/modules/info_packet/info_packet.c b/drivers/gpu/drm/amd/display/modules/info_packet/info_packet.c
index 0fdf7a3e96de..96e18050a617 100644
--- a/drivers/gpu/drm/amd/display/modules/info_packet/info_packet.c
+++ b/drivers/gpu/drm/amd/display/modules/info_packet/info_packet.c
@@ -100,7 +100,8 @@ enum vsc_packet_revision {
 //PB7 = MD0
 #define MASK_VTEM_MD0__VRR_EN         0x01
 #define MASK_VTEM_MD0__M_CONST        0x02
-#define MASK_VTEM_MD0__RESERVED2      0x0C
+#define MASK_VTEM_MD0__QMS_EN         0x04
+#define MASK_VTEM_MD0__RESERVED2      0x08
 #define MASK_VTEM_MD0__FVA_FACTOR_M1  0xF0
 
 //MD1
@@ -109,7 +110,7 @@ enum vsc_packet_revision {
 //MD2
 #define MASK_VTEM_MD2__BASE_REFRESH_RATE_98  0x03
 #define MASK_VTEM_MD2__RB                    0x04
-#define MASK_VTEM_MD2__RESERVED3             0xF8
+#define MASK_VTEM_MD2__NEXT_TFR              0xF8
 
 //MD3
 #define MASK_VTEM_MD3__BASE_REFRESH_RATE_07  0xFF
-- 
Gitee


From ec62d273d1dbdea4f1b1ce234925be2accd23128 Mon Sep 17 00:00:00 2001
From: Tushar Patel <tushar.patel@amd.com>
Date: Tue, 19 Jul 2022 16:30:59 +0800
Subject: [PATCH 420/674] drm/amdkfd: Fix Incorrect VMIDs passed to HWS

stable inclusion
from stable-v5.10.112
commit d2e0931e6d84d133a78b87b70d32d5776ee2b101
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=d2e0931e6d84d133a78b87b70d32d5776ee2b101

--------------------------------

[ Upstream commit b7dfbd2e601f3fee545bc158feceba4f340fe7cf ]

Compute-only GPUs have more than 8 VMIDs allocated to KFD. Fix
this by passing correct number of VMIDs to HWS

v2: squash in warning fix (Alex)

Signed-off-by: Tushar Patel <tushar.patel@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c |  2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_device.c | 11 +++--------
 2 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index ed13a2f76884..30659c1776e8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -632,7 +632,7 @@ MODULE_PARM_DESC(sched_policy,
  * Maximum number of processes that HWS can schedule concurrently. The maximum is the
  * number of VMIDs assigned to the HWS, which is also the default.
  */
-int hws_max_conc_proc = 8;
+int hws_max_conc_proc = -1;
 module_param(hws_max_conc_proc, int, 0444);
 MODULE_PARM_DESC(hws_max_conc_proc,
 	"Max # processes HWS can execute concurrently when sched_policy=0 (0 = no concurrency, #VMIDs for KFD = Maximum(default))");
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 84313135c2ea..148e43dee657 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -664,15 +664,10 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
 			- kfd->vm_info.first_vmid_kfd + 1;
 
 	/* Verify module parameters regarding mapped process number*/
-	if ((hws_max_conc_proc < 0)
-			|| (hws_max_conc_proc > kfd->vm_info.vmid_num_kfd)) {
-		dev_err(kfd_device,
-			"hws_max_conc_proc %d must be between 0 and %d, use %d instead\n",
-			hws_max_conc_proc, kfd->vm_info.vmid_num_kfd,
-			kfd->vm_info.vmid_num_kfd);
+	if (hws_max_conc_proc >= 0)
+		kfd->max_proc_per_quantum = min((u32)hws_max_conc_proc, kfd->vm_info.vmid_num_kfd);
+	else
 		kfd->max_proc_per_quantum = kfd->vm_info.vmid_num_kfd;
-	} else
-		kfd->max_proc_per_quantum = hws_max_conc_proc;
 
 	/* calculate max size of mqds needed for queues */
 	size = max_num_of_queues_per_device *
-- 
Gitee


From 69ce17cda16b6153a6d8cffe4004f85c00653f08 Mon Sep 17 00:00:00 2001
From: Tianci Yin <tianci.yin@amd.com>
Date: Tue, 19 Jul 2022 16:31:00 +0800
Subject: [PATCH 421/674] drm/amdgpu/vcn: improve vcn dpg stop procedure

stable inclusion
from stable-v5.10.112
commit e5afacc826a80cd82ac3f9eb26bc80e45cd14c20
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=e5afacc826a80cd82ac3f9eb26bc80e45cd14c20

--------------------------------

[ Upstream commit 6ea239adc2a712eb318f04f5c29b018ba65ea38a ]

Prior to disabling dpg, VCN need unpausing dpg mode, or VCN will hang in
S3 resuming.

Reviewed-by: James Zhu <James.Zhu@amd.com>
Signed-off-by: Tianci Yin <tianci.yin@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c
index 2099f6ebd833..bdb8e596bda6 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c
@@ -1429,8 +1429,11 @@ static int vcn_v3_0_start_sriov(struct amdgpu_device *adev)
 
 static int vcn_v3_0_stop_dpg_mode(struct amdgpu_device *adev, int inst_idx)
 {
+	struct dpg_pause_state state = {.fw_based = VCN_DPG_STATE__UNPAUSE};
 	uint32_t tmp;
 
+	vcn_v3_0_pause_dpg_mode(adev, 0, &state);
+
 	/* Wait for power status to be 1 */
 	SOC15_WAIT_ON_RREG(VCN, inst_idx, mmUVD_POWER_STATUS, 1,
 		UVD_POWER_STATUS__UVD_POWER_STATUS_MASK);
-- 
Gitee


From ea649459fc1bff1151b9dbc58fa7a502c2bb20c6 Mon Sep 17 00:00:00 2001
From: QintaoShen <unSimple1993@163.com>
Date: Tue, 19 Jul 2022 16:31:01 +0800
Subject: [PATCH 422/674] drm/amdkfd: Check for potential null return of
 kmalloc_array()

stable inclusion
from stable-v5.10.112
commit 1d7a5aae884ca727d41c7ed15d4c82fdb67c040c
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=1d7a5aae884ca727d41c7ed15d4c82fdb67c040c

--------------------------------

[ Upstream commit ebbb7bb9e80305820dc2328a371c1b35679f2667 ]

As the kmalloc_array() may return null, the 'event_waiters[i].wait' would lead to null-pointer dereference.
Therefore, it is better to check the return value of kmalloc_array() to avoid this confusion.

Signed-off-by: QintaoShen <unSimple1993@163.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_events.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index ba2c2ce0c55a..159be13ef20b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -531,6 +531,8 @@ static struct kfd_event_waiter *alloc_event_waiters(uint32_t num_events)
 	event_waiters = kmalloc_array(num_events,
 					sizeof(struct kfd_event_waiter),
 					GFP_KERNEL);
+	if (!event_waiters)
+		return NULL;
 
 	for (i = 0; (event_waiters) && (i < num_events) ; i++) {
 		init_wait(&event_waiters[i].wait);
-- 
Gitee


From aed10586a103cb44228fcbc35ed89ba073dc160a Mon Sep 17 00:00:00 2001
From: Michael Kelley <mikelley@microsoft.com>
Date: Tue, 19 Jul 2022 16:31:02 +0800
Subject: [PATCH 423/674] Drivers: hv: vmbus: Prevent load re-ordering when
 reading ring buffer

stable inclusion
from stable-v5.10.112
commit 43666798059c7cb5c374ce1f79f51e147aa5d419
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=43666798059c7cb5c374ce1f79f51e147aa5d419

--------------------------------

[ Upstream commit b6cae15b5710c8097aad26a2e5e752c323ee5348 ]

When reading a packet from a host-to-guest ring buffer, there is no
memory barrier between reading the write index (to see if there is
a packet to read) and reading the contents of the packet. The Hyper-V
host uses store-release when updating the write index to ensure that
writes of the packet data are completed first. On the guest side,
the processor can reorder and read the packet data before the write
index, and sometimes get stale packet data. Getting such stale packet
data has been observed in a reproducible case in a VM on ARM64.

Fix this by using virt_load_acquire() to read the write index,
ensuring that reads of the packet data cannot be reordered
before it. Preventing such reordering is logically correct, and
with this change, getting stale data can no longer be reproduced.

Signed-off-by: Michael Kelley <mikelley@microsoft.com>
Reviewed-by: Andrea Parri (Microsoft) <parri.andrea@gmail.com>
Link: https://lore.kernel.org/r/1648394710-33480-1-git-send-email-mikelley@microsoft.com
Signed-off-by: Wei Liu <wei.liu@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/hv/ring_buffer.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c
index 356e22159e83..769851b6e74c 100644
--- a/drivers/hv/ring_buffer.c
+++ b/drivers/hv/ring_buffer.c
@@ -378,7 +378,16 @@ int hv_ringbuffer_read(struct vmbus_channel *channel,
 static u32 hv_pkt_iter_avail(const struct hv_ring_buffer_info *rbi)
 {
 	u32 priv_read_loc = rbi->priv_read_index;
-	u32 write_loc = READ_ONCE(rbi->ring_buffer->write_index);
+	u32 write_loc;
+
+	/*
+	 * The Hyper-V host writes the packet data, then uses
+	 * store_release() to update the write_index.  Use load_acquire()
+	 * here to prevent loads of the packet data from being re-ordered
+	 * before the read of the write_index and potentially getting
+	 * stale data.
+	 */
+	write_loc = virt_load_acquire(&rbi->ring_buffer->write_index);
 
 	if (write_loc >= priv_read_loc)
 		return write_loc - priv_read_loc;
-- 
Gitee


From a10270a6d949ebe9bd6487e4a198c524de8bbb45 Mon Sep 17 00:00:00 2001
From: Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com>
Date: Tue, 19 Jul 2022 16:31:03 +0800
Subject: [PATCH 424/674] scsi: target: tcmu: Fix possible page UAF

stable inclusion
from stable-v5.10.112
commit aec36b98a1bbaa84bfd8299a306e4c12314af626
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=aec36b98a1bbaa84bfd8299a306e4c12314af626

--------------------------------

[ Upstream commit a6968f7a367f128d120447360734344d5a3d5336 ]

tcmu_try_get_data_page() looks up pages under cmdr_lock, but it does not
take refcount properly and just returns page pointer. When
tcmu_try_get_data_page() returns, the returned page may have been freed by
tcmu_blocks_release().

We need to get_page() under cmdr_lock to avoid concurrent
tcmu_blocks_release().

Link: https://lore.kernel.org/r/20220311132206.24515-1-xiaoguang.wang@linux.alibaba.com
Reviewed-by: Bodo Stroesser <bostroesser@gmail.com>
Signed-off-by: Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/target/target_core_user.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
index c6950f157b99..c283e45ac300 100644
--- a/drivers/target/target_core_user.c
+++ b/drivers/target/target_core_user.c
@@ -1676,6 +1676,7 @@ static struct page *tcmu_try_get_block_page(struct tcmu_dev *udev, uint32_t dbi)
 	mutex_lock(&udev->cmdr_lock);
 	page = tcmu_get_block_page(udev, dbi);
 	if (likely(page)) {
+		get_page(page);
 		mutex_unlock(&udev->cmdr_lock);
 		return page;
 	}
@@ -1714,6 +1715,7 @@ static vm_fault_t tcmu_vma_fault(struct vm_fault *vmf)
 		/* For the vmalloc()ed cmd area pages */
 		addr = (void *)(unsigned long)info->mem[mi].addr + offset;
 		page = vmalloc_to_page(addr);
+		get_page(page);
 	} else {
 		uint32_t dbi;
 
@@ -1724,7 +1726,6 @@ static vm_fault_t tcmu_vma_fault(struct vm_fault *vmf)
 			return VM_FAULT_SIGBUS;
 	}
 
-	get_page(page);
 	vmf->page = page;
 	return 0;
 }
-- 
Gitee


From 39621a1d6b6040950287449cf56a62ae1e415a41 Mon Sep 17 00:00:00 2001
From: James Smart <jsmart2021@gmail.com>
Date: Tue, 19 Jul 2022 16:31:04 +0800
Subject: [PATCH 425/674] scsi: lpfc: Fix queue failures when recovering from
 PCI parity error

stable inclusion
from stable-v5.10.112
commit b9a110fa755b2a144c677254938d709bdbea36ac
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=b9a110fa755b2a144c677254938d709bdbea36ac

--------------------------------

[ Upstream commit df0101197c4d9596682901631f3ee193ed354873 ]

When recovering from a pci-parity error the driver is failing to re-create
queues, causing recovery to fail. Looking deeper, it was found that the
interrupt vector count allocated on the recovery was fewer than the vectors
originally allocated. This disparity resulted in CPU map entries with stale
information. When the driver tries to re-create the queues, it attempts to
use the stale information which indicates an eq/interrupt vector that was
no longer created.

Fix by clearng the cpup map array before enabling and requesting the IRQs
in the lpfc_sli_reset_slot_s4 routine().

Link: https://lore.kernel.org/r/20220317032737.45308-4-jsmart2021@gmail.com
Co-developed-by: Justin Tee <justin.tee@broadcom.com>
Signed-off-by: Justin Tee <justin.tee@broadcom.com>
Signed-off-by: James Smart <jsmart2021@gmail.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/scsi/lpfc/lpfc_init.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c
index 1149bfc42fe6..134e4ee5dc48 100644
--- a/drivers/scsi/lpfc/lpfc_init.c
+++ b/drivers/scsi/lpfc/lpfc_init.c
@@ -13614,6 +13614,8 @@ lpfc_io_slot_reset_s4(struct pci_dev *pdev)
 	psli->sli_flag &= ~LPFC_SLI_ACTIVE;
 	spin_unlock_irq(&phba->hbalock);
 
+	/* Init cpu_map array */
+	lpfc_cpu_map_array_init(phba);
 	/* Configure and enable interrupt */
 	intr_mode = lpfc_sli4_enable_intr(phba, phba->intr_mode);
 	if (intr_mode == LPFC_INTR_ERROR) {
-- 
Gitee


From 3a7835f442a2fe8b45dd3a87e218a1bbefef5e49 Mon Sep 17 00:00:00 2001
From: Tyrel Datwyler <tyreld@linux.ibm.com>
Date: Tue, 19 Jul 2022 16:31:05 +0800
Subject: [PATCH 426/674] scsi: ibmvscsis: Increase INITIAL_SRP_LIMIT to 1024

stable inclusion
from stable-v5.10.112
commit d3478709edf2cb84a46f87f7960d555f5797a967
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=d3478709edf2cb84a46f87f7960d555f5797a967

--------------------------------

[ Upstream commit 0bade8e53279157c7cc9dd95d573b7e82223d78a ]

The adapter request_limit is hardcoded to be INITIAL_SRP_LIMIT which is
currently an arbitrary value of 800. Increase this value to 1024 which
better matches the characteristics of the typical IBMi Initiator that
supports 32 LUNs and a queue depth of 32.

This change also has the secondary benefit of being a power of two as
required by the kfifo API. Since, Commit ab9bb6318b09 ("Partially revert
"kfifo: fix kfifo_alloc() and kfifo_init()"") the size of IU pool for each
target has been rounded down to 512 when attempting to kfifo_init() those
pools with the current request_limit size of 800.

Link: https://lore.kernel.org/r/20220322194443.678433-1-tyreld@linux.ibm.com
Signed-off-by: Tyrel Datwyler <tyreld@linux.ibm.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c b/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c
index cc3908c2d2f9..a3431485def8 100644
--- a/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c
+++ b/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c
@@ -35,7 +35,7 @@
 
 #define IBMVSCSIS_VERSION	"v0.2"
 
-#define	INITIAL_SRP_LIMIT	800
+#define	INITIAL_SRP_LIMIT	1024
 #define	DEFAULT_MAX_SECTORS	256
 #define MAX_TXU			1024 * 1024
 
-- 
Gitee


From 132b9e782031f5b8a728e1550f8f422704ab36de Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Tue, 19 Jul 2022 16:31:06 +0800
Subject: [PATCH 427/674] net: micrel: fix KS8851_MLL Kconfig

stable inclusion
from stable-v5.10.112
commit 1ff5359afa5ec0dd09fe76183dc4fa24b50e4125
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=1ff5359afa5ec0dd09fe76183dc4fa24b50e4125

--------------------------------

[ Upstream commit c3efcedd272aa6dd5929e20cf902a52ddaa1197a ]

KS8851_MLL selects MICREL_PHY, which depends on PTP_1588_CLOCK_OPTIONAL,
so make KS8851_MLL also depend on PTP_1588_CLOCK_OPTIONAL since
'select' does not follow any dependency chains.

Fixes kconfig warning and build errors:

WARNING: unmet direct dependencies detected for MICREL_PHY
  Depends on [m]: NETDEVICES [=y] && PHYLIB [=y] && PTP_1588_CLOCK_OPTIONAL [=m]
  Selected by [y]:
  - KS8851_MLL [=y] && NETDEVICES [=y] && ETHERNET [=y] && NET_VENDOR_MICREL [=y] && HAS_IOMEM [=y]

ld: drivers/net/phy/micrel.o: in function `lan8814_ts_info':
micrel.c:(.text+0xb35): undefined reference to `ptp_clock_index'
ld: drivers/net/phy/micrel.o: in function `lan8814_probe':
micrel.c:(.text+0x2586): undefined reference to `ptp_clock_register'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/net/ethernet/micrel/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/micrel/Kconfig b/drivers/net/ethernet/micrel/Kconfig
index 42bc014136fe..9ceb7e1fb169 100644
--- a/drivers/net/ethernet/micrel/Kconfig
+++ b/drivers/net/ethernet/micrel/Kconfig
@@ -37,6 +37,7 @@ config KS8851
 config KS8851_MLL
 	tristate "Micrel KS8851 MLL"
 	depends on HAS_IOMEM
+	depends on PTP_1588_CLOCK_OPTIONAL
 	select MII
 	select CRC32
 	select EEPROM_93CX6
-- 
Gitee


From f568a8dd74946b9ca81f7091d4e325bec0cb4bc1 Mon Sep 17 00:00:00 2001
From: Christian Lamparter <chunkeey@gmail.com>
Date: Tue, 19 Jul 2022 16:31:07 +0800
Subject: [PATCH 428/674] ata: libata-core: Disable READ LOG DMA EXT for
 Samsung 840 EVOs

stable inclusion
from stable-v5.10.112
commit 150fe861c57c042510c81b24bddf4a9818ceb5a2
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=150fe861c57c042510c81b24bddf4a9818ceb5a2

--------------------------------

[ Upstream commit 5399752299396a3c9df6617f4b3c907d7aa4ded8 ]

Samsung' 840 EVO with the latest firmware (EXT0DB6Q) locks up with
the a message: "READ LOG DMA EXT failed, trying PIO" during boot.

Initially this was discovered because it caused a crash
with the sata_dwc_460ex controller on a WD MyBook Live DUO.

The reporter "Tice Rex" which has the unique opportunity that he
has two Samsung 840 EVO SSD! One with the older firmware "EXT0BB0Q"
which booted fine and didn't expose "READ LOG DMA EXT". But the
newer/latest firmware "EXT0DB6Q" caused the headaches.

BugLink: https://github.com/openwrt/openwrt/issues/9505
Signed-off-by: Christian Lamparter <chunkeey@gmail.com>
Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/ata/libata-core.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index d2b544bdc7b5..f963a0a7da46 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -3974,6 +3974,9 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = {
 						ATA_HORKAGE_ZERO_AFTER_TRIM, },
 	{ "Crucial_CT*MX100*",		"MU01",	ATA_HORKAGE_NO_NCQ_TRIM |
 						ATA_HORKAGE_ZERO_AFTER_TRIM, },
+	{ "Samsung SSD 840 EVO*",	NULL,	ATA_HORKAGE_NO_NCQ_TRIM |
+						ATA_HORKAGE_NO_DMA_LOG |
+						ATA_HORKAGE_ZERO_AFTER_TRIM, },
 	{ "Samsung SSD 840*",		NULL,	ATA_HORKAGE_NO_NCQ_TRIM |
 						ATA_HORKAGE_ZERO_AFTER_TRIM, },
 	{ "Samsung SSD 850*",		NULL,	ATA_HORKAGE_NO_NCQ_TRIM |
-- 
Gitee


From 40a23576d58afca5e521f319eb61fa2d346f45ae Mon Sep 17 00:00:00 2001
From: Leo Ruan <tingquan.ruan@cn.bosch.com>
Date: Tue, 19 Jul 2022 16:31:08 +0800
Subject: [PATCH 429/674] gpu: ipu-v3: Fix dev_dbg frequency output

stable inclusion
from stable-v5.10.112
commit aa8cdedaf7606fc0e7e16cdf67c1d651f2c93b5f
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=aa8cdedaf7606fc0e7e16cdf67c1d651f2c93b5f

--------------------------------

[ Upstream commit 070a88fd4a03f921b73a2059e97d55faaa447dab ]

This commit corrects the printing of the IPU clock error percentage if
it is between -0.1% to -0.9%. For example, if the pixel clock requested
is 27.2 MHz but only 27.0 MHz can be achieved the deviation is -0.8%.
But the fixed point math had a flaw and calculated error of 0.2%.

Before:
  Clocks: IPU 270000000Hz DI 24716667Hz Needed 27200000Hz
  IPU clock can give 27000000 with divider 10, error 0.2%
  Want 27200000Hz IPU 270000000Hz DI 24716667Hz using IPU, 27000000Hz

After:
  Clocks: IPU 270000000Hz DI 24716667Hz Needed 27200000Hz
  IPU clock can give 27000000 with divider 10, error -0.8%
  Want 27200000Hz IPU 270000000Hz DI 24716667Hz using IPU, 27000000Hz

Signed-off-by: Leo Ruan <tingquan.ruan@cn.bosch.com>
Signed-off-by: Mark Jonas <mark.jonas@de.bosch.com>
Reviewed-by: Philipp Zabel <p.zabel@pengutronix.de>
Signed-off-by: Philipp Zabel <p.zabel@pengutronix.de>
Link: https://lore.kernel.org/r/20220207151411.5009-1-mark.jonas@de.bosch.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/gpu/ipu-v3/ipu-di.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/ipu-v3/ipu-di.c b/drivers/gpu/ipu-v3/ipu-di.c
index b4a31d506fcc..74eca68891ad 100644
--- a/drivers/gpu/ipu-v3/ipu-di.c
+++ b/drivers/gpu/ipu-v3/ipu-di.c
@@ -451,8 +451,9 @@ static void ipu_di_config_clock(struct ipu_di *di,
 
 		error = rate / (sig->mode.pixelclock / 1000);
 
-		dev_dbg(di->ipu->dev, "  IPU clock can give %lu with divider %u, error %d.%u%%\n",
-			rate, div, (signed)(error - 1000) / 10, error % 10);
+		dev_dbg(di->ipu->dev, "  IPU clock can give %lu with divider %u, error %c%d.%d%%\n",
+			rate, div, error < 1000 ? '-' : '+',
+			abs(error - 1000) / 10, abs(error - 1000) % 10);
 
 		/* Allow a 1% error */
 		if (error < 1010 && error >= 990) {
-- 
Gitee


From d069824e2bfda41cd7fd8a3116057aec850d8744 Mon Sep 17 00:00:00 2001
From: Jonathan Bakker <xc-racer2@live.ca>
Date: Tue, 19 Jul 2022 16:31:09 +0800
Subject: [PATCH 430/674] regulator: wm8994: Add an off-on delay for WM8994
 variant

stable inclusion
from stable-v5.10.112
commit 2462faffbfa582698906a6aa5fe677cf35003851
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=2462faffbfa582698906a6aa5fe677cf35003851

--------------------------------

[ Upstream commit 92d96b603738ec4f35cde7198c303ae264dd47cb ]

As per Table 130 of the wm8994 datasheet at [1], there is an off-on
delay for LDO1 and LDO2.  In the wm8958 datasheet [2], I could not
find any reference to it.  I could not find a wm1811 datasheet to
double-check there, but as no one has complained presumably it works
without it.

This solves the issue on Samsung Aries boards with a wm8994 where
register writes fail when the device is powered off and back-on
quickly.

[1] https://statics.cirrus.com/pubs/proDatasheet/WM8994_Rev4.6.pdf
[2] https://statics.cirrus.com/pubs/proDatasheet/WM8958_v3.5.pdf

Signed-off-by: Jonathan Bakker <xc-racer2@live.ca>
Acked-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Link: https://lore.kernel.org/r/CY4PR04MB056771CFB80DC447C30D5A31CB1D9@CY4PR04MB0567.namprd04.prod.outlook.com
Signed-off-by: Mark Brown <broonie@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/regulator/wm8994-regulator.c | 42 ++++++++++++++++++++++++++--
 1 file changed, 39 insertions(+), 3 deletions(-)

diff --git a/drivers/regulator/wm8994-regulator.c b/drivers/regulator/wm8994-regulator.c
index cadea0344486..40befdd9dfa9 100644
--- a/drivers/regulator/wm8994-regulator.c
+++ b/drivers/regulator/wm8994-regulator.c
@@ -71,6 +71,35 @@ static const struct regulator_ops wm8994_ldo2_ops = {
 };
 
 static const struct regulator_desc wm8994_ldo_desc[] = {
+	{
+		.name = "LDO1",
+		.id = 1,
+		.type = REGULATOR_VOLTAGE,
+		.n_voltages = WM8994_LDO1_MAX_SELECTOR + 1,
+		.vsel_reg = WM8994_LDO_1,
+		.vsel_mask = WM8994_LDO1_VSEL_MASK,
+		.ops = &wm8994_ldo1_ops,
+		.min_uV = 2400000,
+		.uV_step = 100000,
+		.enable_time = 3000,
+		.off_on_delay = 36000,
+		.owner = THIS_MODULE,
+	},
+	{
+		.name = "LDO2",
+		.id = 2,
+		.type = REGULATOR_VOLTAGE,
+		.n_voltages = WM8994_LDO2_MAX_SELECTOR + 1,
+		.vsel_reg = WM8994_LDO_2,
+		.vsel_mask = WM8994_LDO2_VSEL_MASK,
+		.ops = &wm8994_ldo2_ops,
+		.enable_time = 3000,
+		.off_on_delay = 36000,
+		.owner = THIS_MODULE,
+	},
+};
+
+static const struct regulator_desc wm8958_ldo_desc[] = {
 	{
 		.name = "LDO1",
 		.id = 1,
@@ -172,9 +201,16 @@ static int wm8994_ldo_probe(struct platform_device *pdev)
 	 * regulator core and we need not worry about it on the
 	 * error path.
 	 */
-	ldo->regulator = devm_regulator_register(&pdev->dev,
-						 &wm8994_ldo_desc[id],
-						 &config);
+	if (ldo->wm8994->type == WM8994) {
+		ldo->regulator = devm_regulator_register(&pdev->dev,
+							 &wm8994_ldo_desc[id],
+							 &config);
+	} else {
+		ldo->regulator = devm_regulator_register(&pdev->dev,
+							 &wm8958_ldo_desc[id],
+							 &config);
+	}
+
 	if (IS_ERR(ldo->regulator)) {
 		ret = PTR_ERR(ldo->regulator);
 		dev_err(wm8994->dev, "Failed to register LDO%d: %d\n",
-- 
Gitee


From c43d31e079236b16bef0241ff118474661325aca Mon Sep 17 00:00:00 2001
From: Joey Gouly <joey.gouly@arm.com>
Date: Tue, 19 Jul 2022 16:31:10 +0800
Subject: [PATCH 431/674] arm64: alternatives: mark patch_alternative() as
 `noinstr`

stable inclusion
from stable-v5.10.112
commit 98973d2bdd4a9af53cc7ec9560ddbddf01216eba
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=98973d2bdd4a9af53cc7ec9560ddbddf01216eba

--------------------------------

[ Upstream commit a2c0b0fbe01419f8f5d1c0b9c581631f34ffce8b ]

The alternatives code must be `noinstr` such that it does not patch itself,
as the cache invalidation is only performed after all the alternatives have
been applied.

Mark patch_alternative() as `noinstr`. Mark branch_insn_requires_update()
and get_alt_insn() with `__always_inline` since they are both only called
through patch_alternative().

Booting a kernel in QEMU TCG with KCSAN=y and ARM64_USE_LSE_ATOMICS=y caused
a boot hang:
[    0.241121] CPU: All CPU(s) started at EL2

The alternatives code was patching the atomics in __tsan_read4() from LL/SC
atomics to LSE atomics.

The following fragment is using LL/SC atomics in the .text section:
  | <__tsan_unaligned_read4+304>:     ldxr    x6, [x2]
  | <__tsan_unaligned_read4+308>:     add     x6, x6, x5
  | <__tsan_unaligned_read4+312>:     stxr    w7, x6, [x2]
  | <__tsan_unaligned_read4+316>:     cbnz    w7, <__tsan_unaligned_read4+304>

This LL/SC atomic sequence was to be replaced with LSE atomics. However since
the alternatives code was instrumentable, __tsan_read4() was being called after
only the first instruction was replaced, which led to the following code in memory:
  | <__tsan_unaligned_read4+304>:     ldadd   x5, x6, [x2]
  | <__tsan_unaligned_read4+308>:     add     x6, x6, x5
  | <__tsan_unaligned_read4+312>:     stxr    w7, x6, [x2]
  | <__tsan_unaligned_read4+316>:     cbnz    w7, <__tsan_unaligned_read4+304>

This caused an infinite loop as the `stxr` instruction never completed successfully,
so `w7` was always 0.

Signed-off-by: Joey Gouly <joey.gouly@arm.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20220405104733.11476-1-joey.gouly@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 arch/arm64/kernel/alternative.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/kernel/alternative.c b/arch/arm64/kernel/alternative.c
index 73039949b5ce..5f8e4c2df53c 100644
--- a/arch/arm64/kernel/alternative.c
+++ b/arch/arm64/kernel/alternative.c
@@ -41,7 +41,7 @@ bool alternative_is_applied(u16 cpufeature)
 /*
  * Check if the target PC is within an alternative block.
  */
-static bool branch_insn_requires_update(struct alt_instr *alt, unsigned long pc)
+static __always_inline bool branch_insn_requires_update(struct alt_instr *alt, unsigned long pc)
 {
 	unsigned long replptr = (unsigned long)ALT_REPL_PTR(alt);
 	return !(pc >= replptr && pc <= (replptr + alt->alt_len));
@@ -49,7 +49,7 @@ static bool branch_insn_requires_update(struct alt_instr *alt, unsigned long pc)
 
 #define align_down(x, a)	((unsigned long)(x) & ~(((unsigned long)(a)) - 1))
 
-static u32 get_alt_insn(struct alt_instr *alt, __le32 *insnptr, __le32 *altinsnptr)
+static __always_inline u32 get_alt_insn(struct alt_instr *alt, __le32 *insnptr, __le32 *altinsnptr)
 {
 	u32 insn;
 
@@ -94,7 +94,7 @@ static u32 get_alt_insn(struct alt_instr *alt, __le32 *insnptr, __le32 *altinsnp
 	return insn;
 }
 
-static void patch_alternative(struct alt_instr *alt,
+static noinstr void patch_alternative(struct alt_instr *alt,
 			      __le32 *origptr, __le32 *updptr, int nr_inst)
 {
 	__le32 *replptr;
-- 
Gitee


From 827abe9996cbedb288fd7c07f53c5233406ca204 Mon Sep 17 00:00:00 2001
From: Steve Capper <steve.capper@arm.com>
Date: Tue, 19 Jul 2022 16:31:11 +0800
Subject: [PATCH 432/674] tlb: hugetlb: Add more sizes to
 tlb_remove_huge_tlb_entry

stable inclusion
from stable-v5.10.112
commit b643807a735e2d80eec972ad22536dcb66f79c2e
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=b643807a735e2d80eec972ad22536dcb66f79c2e

--------------------------------

[ Upstream commit 697a1d44af8ba0477ee729e632f4ade37999249a ]

tlb_remove_huge_tlb_entry only considers PMD_SIZE and PUD_SIZE when
updating the mmu_gather structure.

Unfortunately on arm64 there are two additional huge page sizes that
need to be covered: CONT_PTE_SIZE and CONT_PMD_SIZE. Where an end-user
attempts to employ contiguous huge pages, a VM_BUG_ON can be experienced
due to the fact that the tlb structure hasn't been correctly updated by
the relevant tlb_flush_p.._range() call from tlb_remove_huge_tlb_entry.

This patch adds inequality logic to the generic implementation of
tlb_remove_huge_tlb_entry s.t. CONT_PTE_SIZE and CONT_PMD_SIZE are
effectively covered on arm64. Also, as well as ptes, pmds and puds;
p4ds are now considered too.

Reported-by: David Hildenbrand <david@redhat.com>
Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/linux-mm/811c5c8e-b3a2-85d2-049c-717f17c3a03a@redhat.com/
Signed-off-by: Steve Capper <steve.capper@arm.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20220330112543.863-1-steve.capper@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 include/asm-generic/tlb.h | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 6661ee1cff47..a0c4b99d2899 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -563,10 +563,14 @@ static inline void tlb_flush_p4d_range(struct mmu_gather *tlb,
 #define tlb_remove_huge_tlb_entry(h, tlb, ptep, address)	\
 	do {							\
 		unsigned long _sz = huge_page_size(h);		\
-		if (_sz == PMD_SIZE)				\
-			tlb_flush_pmd_range(tlb, address, _sz);	\
-		else if (_sz == PUD_SIZE)			\
+		if (_sz >= P4D_SIZE)				\
+			tlb_flush_p4d_range(tlb, address, _sz);	\
+		else if (_sz >= PUD_SIZE)			\
 			tlb_flush_pud_range(tlb, address, _sz);	\
+		else if (_sz >= PMD_SIZE)			\
+			tlb_flush_pmd_range(tlb, address, _sz);	\
+		else						\
+			tlb_flush_pte_range(tlb, address, _sz);	\
 		__tlb_remove_tlb_entry(tlb, ptep, address);	\
 	} while (0)
 
-- 
Gitee


From 80804b54f0cdbced28775c066034a95cc3e3129f Mon Sep 17 00:00:00 2001
From: Andy Chiu <andy.chiu@sifive.com>
Date: Tue, 19 Jul 2022 16:31:12 +0800
Subject: [PATCH 433/674] net: axienet: setup mdio unconditionally

stable inclusion
from stable-v5.10.112
commit 9c12fcf1d864b20bdbb422804b69b2dec936839b
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=9c12fcf1d864b20bdbb422804b69b2dec936839b

--------------------------------

[ Upstream commit d1c4f93e3f0a023024a6f022a61528c06cf1daa9 ]

The call to axienet_mdio_setup should not depend on whether "phy-node"
pressents on the DT. Besides, since `lp->phy_node` is used if PHY is in
SGMII or 100Base-X modes, move it into the if statement. And the next patch
will remove `lp->phy_node` from driver's private structure and do an
of_node_put on it right away after use since it is not used elsewhere.

Signed-off-by: Andy Chiu <andy.chiu@sifive.com>
Reviewed-by: Greentime Hu <greentime.hu@sifive.com>
Reviewed-by: Robert Hancock <robert.hancock@calian.com>
Reviewed-by: Radhey Shyam Pandey <radhey.shyam.pandey@xilinx.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/net/ethernet/xilinx/xilinx_axienet_main.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
index b8e85ef7ce92..ef20184277ff 100644
--- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
+++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
@@ -2076,15 +2076,14 @@ static int axienet_probe(struct platform_device *pdev)
 	if (ret)
 		goto cleanup_clk;
 
-	lp->phy_node = of_parse_phandle(pdev->dev.of_node, "phy-handle", 0);
-	if (lp->phy_node) {
-		ret = axienet_mdio_setup(lp);
-		if (ret)
-			dev_warn(&pdev->dev,
-				 "error registering MDIO bus: %d\n", ret);
-	}
+	ret = axienet_mdio_setup(lp);
+	if (ret)
+		dev_warn(&pdev->dev,
+			 "error registering MDIO bus: %d\n", ret);
+
 	if (lp->phy_mode == PHY_INTERFACE_MODE_SGMII ||
 	    lp->phy_mode == PHY_INTERFACE_MODE_1000BASEX) {
+		lp->phy_node = of_parse_phandle(pdev->dev.of_node, "phy-handle", 0);
 		if (!lp->phy_node) {
 			dev_err(&pdev->dev, "phy-handle required for 1000BaseX/SGMII\n");
 			ret = -EINVAL;
-- 
Gitee


From b7fdd0885604d2c2d7df33e1fa7801710528c7d0 Mon Sep 17 00:00:00 2001
From: Marcin Kozlowski <marcinguy@gmail.com>
Date: Tue, 19 Jul 2022 16:31:13 +0800
Subject: [PATCH 434/674] net: usb: aqc111: Fix out-of-bounds accesses in RX
 fixup

stable inclusion
from stable-v5.10.112
commit d90df6da50c56ad8b1a132e3cf86b6cdf8f507b7
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=d90df6da50c56ad8b1a132e3cf86b6cdf8f507b7

--------------------------------

[ Upstream commit afb8e246527536848b9b4025b40e613edf776a9d ]

aqc111_rx_fixup() contains several out-of-bounds accesses that can be
triggered by a malicious (or defective) USB device, in particular:

 - The metadata array (desc_offset..desc_offset+2*pkt_count) can be out of bounds,
   causing OOB reads and (on big-endian systems) OOB endianness flips.
 - A packet can overlap the metadata array, causing a later OOB
   endianness flip to corrupt data used by a cloned SKB that has already
   been handed off into the network stack.
 - A packet SKB can be constructed whose tail is far beyond its end,
   causing out-of-bounds heap data to be considered part of the SKB's
   data.

Found doing variant analysis. Tested it with another driver (ax88179_178a), since
I don't have a aqc111 device to test it, but the code looks very similar.

Signed-off-by: Marcin Kozlowski <marcinguy@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/net/usb/aqc111.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/net/usb/aqc111.c b/drivers/net/usb/aqc111.c
index 0717c18015c9..c9c409518174 100644
--- a/drivers/net/usb/aqc111.c
+++ b/drivers/net/usb/aqc111.c
@@ -1102,10 +1102,15 @@ static int aqc111_rx_fixup(struct usbnet *dev, struct sk_buff *skb)
 	if (start_of_descs != desc_offset)
 		goto err;
 
-	/* self check desc_offset from header*/
-	if (desc_offset >= skb_len)
+	/* self check desc_offset from header and make sure that the
+	 * bounds of the metadata array are inside the SKB
+	 */
+	if (pkt_count * 2 + desc_offset >= skb_len)
 		goto err;
 
+	/* Packets must not overlap the metadata array */
+	skb_trim(skb, desc_offset);
+
 	if (pkt_count == 0)
 		goto err;
 
-- 
Gitee


From 178c28b56c0cf450c4683b59aabe81a164e0099a Mon Sep 17 00:00:00 2001
From: Xiaomeng Tong <xiam0nd.tong@gmail.com>
Date: Tue, 19 Jul 2022 16:31:14 +0800
Subject: [PATCH 435/674] myri10ge: fix an incorrect free for skb in
 myri10ge_sw_tso

stable inclusion
from stable-v5.10.112
commit fa5ee7c4232cc9976c0db483b914519639de75dc
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=fa5ee7c4232cc9976c0db483b914519639de75dc

--------------------------------

[ Upstream commit b423e54ba965b4469b48e46fd16941f1e1701697 ]

All remaining skbs should be released when myri10ge_xmit fails to
transmit a packet. Fix it within another skb_list_walk_safe.

Signed-off-by: Xiaomeng Tong <xiam0nd.tong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/net/ethernet/myricom/myri10ge/myri10ge.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c
index b5c4a64bc025..30763e966633 100644
--- a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c
+++ b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c
@@ -2901,11 +2901,9 @@ static netdev_tx_t myri10ge_sw_tso(struct sk_buff *skb,
 		status = myri10ge_xmit(curr, dev);
 		if (status != 0) {
 			dev_kfree_skb_any(curr);
-			if (segs != NULL) {
-				curr = segs;
-				segs = next;
+			skb_list_walk_safe(next, curr, next) {
 				curr->next = NULL;
-				dev_kfree_skb_any(segs);
+				dev_kfree_skb_any(curr);
 			}
 			goto drop;
 		}
-- 
Gitee


From 403e0c62a0c117a53d7e8635f93b63068b37ad95 Mon Sep 17 00:00:00 2001
From: Martin Leung <Martin.Leung@amd.com>
Date: Tue, 19 Jul 2022 16:31:15 +0800
Subject: [PATCH 436/674] drm/amd/display: Revert FEC check in validation

stable inclusion
from stable-v5.10.112
commit 34ea097fb63d116d857e6c23b97cb94ff02ccf42
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=34ea097fb63d116d857e6c23b97cb94ff02ccf42

--------------------------------

[ Upstream commit b2075fce104b88b789c15ef1ed2b91dc94198e26 ]

why and how:
causes failure on install on certain machines

Reviewed-by: George Shen <George.Shen@amd.com>
Acked-by: Alex Hung <alex.hung@amd.com>
Signed-off-by: Martin Leung <Martin.Leung@amd.com>
Tested-by: Daniel Wheeler <daniel.wheeler@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/gpu/drm/amd/display/dc/core/dc.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c
index ac5323596c65..93f5229c303e 100644
--- a/drivers/gpu/drm/amd/display/dc/core/dc.c
+++ b/drivers/gpu/drm/amd/display/dc/core/dc.c
@@ -1173,10 +1173,6 @@ bool dc_validate_seamless_boot_timing(const struct dc *dc,
 	if (!link->link_enc->funcs->is_dig_enabled(link->link_enc))
 		return false;
 
-	/* Check for FEC status*/
-	if (link->link_enc->funcs->fec_is_active(link->link_enc))
-		return false;
-
 	enc_inst = link->link_enc->funcs->get_dig_frontend(link->link_enc);
 
 	if (enc_inst == ENGINE_ID_UNKNOWN)
-- 
Gitee


From 91b2c7e8a9b175f7e497dc79208d4f6479a43304 Mon Sep 17 00:00:00 2001
From: Roman Li <Roman.Li@amd.com>
Date: Tue, 19 Jul 2022 16:31:16 +0800
Subject: [PATCH 437/674] drm/amd/display: Fix allocate_mst_payload assert on
 resume

stable inclusion
from stable-v5.10.112
commit 4b44cd5840577479ee6ed387371dd1d1252ca921
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=4b44cd5840577479ee6ed387371dd1d1252ca921

--------------------------------

[ Upstream commit f4346fb3edf7720db3f7f5e1cab1f667cd024280 ]

[Why]
On resume we do link detection for all non-MST connectors.
MST is handled separately. However the condition for telling
if connector is on mst branch is not enough for mst hub case.
Link detection for mst branch link leads to mst topology reset.
That causes assert in dc_link_allocate_mst_payload()

[How]
Use link type as indicator for mst link.

Reviewed-by: Wayne Lin <Wayne.Lin@amd.com>
Acked-by: Alex Hung <alex.hung@amd.com>
Signed-off-by: Roman Li <Roman.Li@amd.com>
Tested-by: Daniel Wheeler <daniel.wheeler@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index e828f9414ba2..7bb151283f44 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -2022,7 +2022,8 @@ static int dm_resume(void *handle)
 		 * this is the case when traversing through already created
 		 * MST connectors, should be skipped
 		 */
-		if (aconnector->mst_port)
+		if (aconnector->dc_link &&
+		    aconnector->dc_link->type == dc_connection_mst_branch)
 			continue;
 
 		mutex_lock(&aconnector->hpd_lock);
-- 
Gitee


From b1b141e7bc7c97341376d4ccfb36067d643f8737 Mon Sep 17 00:00:00 2001
From: Alexey Galakhov <agalakhov@gmail.com>
Date: Tue, 19 Jul 2022 16:31:17 +0800
Subject: [PATCH 438/674] scsi: mvsas: Add PCI ID of RocketRaid 2640

stable inclusion
from stable-v5.10.112
commit 5b7ce74b6bc8c7e6ee3cb8b22b8c5e3e27943339
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=5b7ce74b6bc8c7e6ee3cb8b22b8c5e3e27943339

--------------------------------

[ Upstream commit 5f2bce1e222028dc1c15f130109a17aa654ae6e8 ]

The HighPoint RocketRaid 2640 is a low-cost SAS controller based on Marvell
chip. The chip in question was already supported by the kernel, just the
PCI ID of this particular board was missing.

Link: https://lore.kernel.org/r/20220309212535.402987-1-agalakhov@gmail.com
Signed-off-by: Alexey Galakhov <agalakhov@gmail.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/scsi/mvsas/mv_init.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/scsi/mvsas/mv_init.c b/drivers/scsi/mvsas/mv_init.c
index 0cfea7b2ab13..85ca8421fb86 100644
--- a/drivers/scsi/mvsas/mv_init.c
+++ b/drivers/scsi/mvsas/mv_init.c
@@ -646,6 +646,7 @@ static struct pci_device_id mvs_pci_table[] = {
 	{ PCI_VDEVICE(ARECA, PCI_DEVICE_ID_ARECA_1300), chip_1300 },
 	{ PCI_VDEVICE(ARECA, PCI_DEVICE_ID_ARECA_1320), chip_1320 },
 	{ PCI_VDEVICE(ADAPTEC2, 0x0450), chip_6440 },
+	{ PCI_VDEVICE(TTI, 0x2640), chip_6440 },
 	{ PCI_VDEVICE(TTI, 0x2710), chip_9480 },
 	{ PCI_VDEVICE(TTI, 0x2720), chip_9480 },
 	{ PCI_VDEVICE(TTI, 0x2721), chip_9480 },
-- 
Gitee


From d0ed685b6c3043cadeccd631164b3042bb1582c7 Mon Sep 17 00:00:00 2001
From: Chandrakanth patil <chandrakanth.patil@broadcom.com>
Date: Tue, 19 Jul 2022 16:31:18 +0800
Subject: [PATCH 439/674] scsi: megaraid_sas: Target with invalid LUN ID is
 deleted during scan

stable inclusion
from stable-v5.10.112
commit e8cf1e4d953d7022704f57aca4e5ccce6cee14ea
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=e8cf1e4d953d7022704f57aca4e5ccce6cee14ea

--------------------------------

[ Upstream commit 56495f295d8e021f77d065b890fc0100e3f9f6d8 ]

The megaraid_sas driver supports single LUN for RAID devices. That is LUN
0. All other LUNs are unsupported. When a device scan on a logical target
with invalid LUN number is invoked through sysfs, that target ends up
getting removed.

Add LUN ID validation in the slave destroy function to avoid the target
deletion.

Link: https://lore.kernel.org/r/20220324094711.48833-1-chandrakanth.patil@broadcom.com
Signed-off-by: Chandrakanth patil <chandrakanth.patil@broadcom.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/scsi/megaraid/megaraid_sas.h      | 3 +++
 drivers/scsi/megaraid/megaraid_sas_base.c | 7 +++++++
 2 files changed, 10 insertions(+)

diff --git a/drivers/scsi/megaraid/megaraid_sas.h b/drivers/scsi/megaraid/megaraid_sas.h
index 6b8ec57e8bdf..c088a848776e 100644
--- a/drivers/scsi/megaraid/megaraid_sas.h
+++ b/drivers/scsi/megaraid/megaraid_sas.h
@@ -2554,6 +2554,9 @@ struct megasas_instance_template {
 #define MEGASAS_IS_LOGICAL(sdev)					\
 	((sdev->channel < MEGASAS_MAX_PD_CHANNELS) ? 0 : 1)
 
+#define MEGASAS_IS_LUN_VALID(sdev)					\
+	(((sdev)->lun == 0) ? 1 : 0)
+
 #define MEGASAS_DEV_INDEX(scp)						\
 	(((scp->device->channel % 2) * MEGASAS_MAX_DEV_PER_CHANNEL) +	\
 	scp->device->id)
diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c
index 1c86b67476f0..718160ca66b3 100644
--- a/drivers/scsi/megaraid/megaraid_sas_base.c
+++ b/drivers/scsi/megaraid/megaraid_sas_base.c
@@ -2116,6 +2116,9 @@ static int megasas_slave_alloc(struct scsi_device *sdev)
 			goto scan_target;
 		}
 		return -ENXIO;
+	} else if (!MEGASAS_IS_LUN_VALID(sdev)) {
+		sdev_printk(KERN_INFO, sdev, "%s: invalid LUN\n", __func__);
+		return -ENXIO;
 	}
 
 scan_target:
@@ -2146,6 +2149,10 @@ static void megasas_slave_destroy(struct scsi_device *sdev)
 	instance = megasas_lookup_instance(sdev->host->host_no);
 
 	if (MEGASAS_IS_LOGICAL(sdev)) {
+		if (!MEGASAS_IS_LUN_VALID(sdev)) {
+			sdev_printk(KERN_INFO, sdev, "%s: invalid LUN\n", __func__);
+			return;
+		}
 		ld_tgt_id = MEGASAS_TARGET_ID(sdev);
 		instance->ld_tgtid_status[ld_tgt_id] = LD_TARGET_ID_DELETED;
 		if (megasas_dbg_lvl & LD_PD_DEBUG)
-- 
Gitee


From f13ab67744d7dd85de295d2fb71fb9285c589b73 Mon Sep 17 00:00:00 2001
From: Duoming Zhou <duoming@zju.edu.cn>
Date: Tue, 19 Jul 2022 16:31:19 +0800
Subject: [PATCH 440/674] drivers: net: slip: fix NPD bug in sl_tx_timeout()

stable inclusion
from stable-v5.10.112
commit ca24c5e8f0ac3d43ec0cff29e1c861be73aff165
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=ca24c5e8f0ac3d43ec0cff29e1c861be73aff165

--------------------------------

[ Upstream commit ec4eb8a86ade4d22633e1da2a7d85a846b7d1798 ]

When a slip driver is detaching, the slip_close() will act to
cleanup necessary resources and sl->tty is set to NULL in
slip_close(). Meanwhile, the packet we transmit is blocked,
sl_tx_timeout() will be called. Although slip_close() and
sl_tx_timeout() use sl->lock to synchronize, we don`t judge
whether sl->tty equals to NULL in sl_tx_timeout() and the
null pointer dereference bug will happen.

   (Thread 1)                 |      (Thread 2)
                              | slip_close()
                              |   spin_lock_bh(&sl->lock)
                              |   ...
...                           |   sl->tty = NULL //(1)
sl_tx_timeout()               |   spin_unlock_bh(&sl->lock)
  spin_lock(&sl->lock);       |
  ...                         |   ...
  tty_chars_in_buffer(sl->tty)|
    if (tty->ops->..) //(2)   |
    ...                       |   synchronize_rcu()

We set NULL to sl->tty in position (1) and dereference sl->tty
in position (2).

This patch adds check in sl_tx_timeout(). If sl->tty equals to
NULL, sl_tx_timeout() will goto out.

Signed-off-by: Duoming Zhou <duoming@zju.edu.cn>
Reviewed-by: Jiri Slaby <jirislaby@kernel.org>
Link: https://lore.kernel.org/r/20220405132206.55291-1-duoming@zju.edu.cn
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/net/slip/slip.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/slip/slip.c b/drivers/net/slip/slip.c
index f81fb0b13a94..369bd30fed35 100644
--- a/drivers/net/slip/slip.c
+++ b/drivers/net/slip/slip.c
@@ -468,7 +468,7 @@ static void sl_tx_timeout(struct net_device *dev, unsigned int txqueue)
 	spin_lock(&sl->lock);
 
 	if (netif_queue_stopped(dev)) {
-		if (!netif_running(dev))
+		if (!netif_running(dev) || !sl->tty)
 			goto out;
 
 		/* May be we must check transmitter timeout here ?
-- 
Gitee


From de06824ed8a1447dcaa55c1689369b3d58926652 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Tue, 19 Jul 2022 16:31:20 +0800
Subject: [PATCH 441/674] perf/imx_ddr: Fix undefined behavior due to shift
 overflowing the constant
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

stable inclusion
from stable-v5.10.112
commit 000b3921b4d52399865ea398e5f6c999246437f4
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=000b3921b4d52399865ea398e5f6c999246437f4

--------------------------------

[ Upstream commit d02b4dd84e1a90f7f1444d027c0289bf355b0d5a ]

Fix:

  In file included from <command-line>:0:0:
  In function ‘ddr_perf_counter_enable’,
      inlined from ‘ddr_perf_irq_handler’ at drivers/perf/fsl_imx8_ddr_perf.c:651:2:
  ././include/linux/compiler_types.h:352:38: error: call to ‘__compiletime_assert_729’ \
	declared with attribute error: FIELD_PREP: mask is not constant
    _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__)
...

See https://lore.kernel.org/r/YkwQ6%2BtIH8GQpuct@zn.tnic for the gory
details as to why it triggers with older gccs only.

Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Frank Li <Frank.li@nxp.com>
Cc: Will Deacon <will@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Shawn Guo <shawnguo@kernel.org>
Cc: Sascha Hauer <s.hauer@pengutronix.de>
Cc: Pengutronix Kernel Team <kernel@pengutronix.de>
Cc: Fabio Estevam <festevam@gmail.com>
Cc: NXP Linux Team <linux-imx@nxp.com>
Cc: linux-arm-kernel@lists.infradead.org
Acked-by: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20220405151517.29753-10-bp@alien8.de
Signed-off-by: Will Deacon <will@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/perf/fsl_imx8_ddr_perf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/perf/fsl_imx8_ddr_perf.c b/drivers/perf/fsl_imx8_ddr_perf.c
index 7f7bc0993670..e09bbf3890c4 100644
--- a/drivers/perf/fsl_imx8_ddr_perf.c
+++ b/drivers/perf/fsl_imx8_ddr_perf.c
@@ -29,7 +29,7 @@
 #define CNTL_OVER_MASK		0xFFFFFFFE
 
 #define CNTL_CSV_SHIFT		24
-#define CNTL_CSV_MASK		(0xFF << CNTL_CSV_SHIFT)
+#define CNTL_CSV_MASK		(0xFFU << CNTL_CSV_SHIFT)
 
 #define EVENT_CYCLES_ID		0
 #define EVENT_CYCLES_COUNTER	0
-- 
Gitee


From 4558f170f76cde124ffd2ac7f038a37f786a40c4 Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Tue, 19 Jul 2022 16:31:21 +0800
Subject: [PATCH 442/674] mm, page_alloc: fix build_zonerefs_node()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

stable inclusion
from stable-v5.10.112
commit 192e507ef894b558f51e12ae87e938a959b7dfb6
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=192e507ef894b558f51e12ae87e938a959b7dfb6

--------------------------------

commit e553f62f10d93551eb883eca227ac54d1a4fad84 upstream.

Since commit 6aa303defb74 ("mm, vmscan: only allocate and reclaim from
zones with pages managed by the buddy allocator") only zones with free
memory are included in a built zonelist.  This is problematic when e.g.
all memory of a zone has been ballooned out when zonelists are being
rebuilt.

The decision whether to rebuild the zonelists when onlining new memory
is done based on populated_zone() returning 0 for the zone the memory
will be added to.  The new zone is added to the zonelists only, if it
has free memory pages (managed_zone() returns a non-zero value) after
the memory has been onlined.  This implies, that onlining memory will
always free the added pages to the allocator immediately, but this is
not true in all cases: when e.g. running as a Xen guest the onlined new
memory will be added only to the ballooned memory list, it will be freed
only when the guest is being ballooned up afterwards.

Another problem with using managed_zone() for the decision whether a
zone is being added to the zonelists is, that a zone with all memory
used will in fact be removed from all zonelists in case the zonelists
happen to be rebuilt.

Use populated_zone() when building a zonelist as it has been done before
that commit.

There was a report that QubesOS (based on Xen) is hitting this problem.
Xen has switched to use the zone device functionality in kernel 5.9 and
QubesOS wants to use memory hotplugging for guests in order to be able
to start a guest with minimal memory and expand it as needed.  This was
the report leading to the patch.

Link: https://lkml.kernel.org/r/20220407120637.9035-1-jgross@suse.com
Fixes: 6aa303defb74 ("mm, vmscan: only allocate and reclaim from zones with pages managed by the buddy allocator")
Signed-off-by: Juergen Gross <jgross@suse.com>
Reported-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com>
Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 mm/page_alloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c81ff36f4121..ec73cca1726c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5895,7 +5895,7 @@ static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs)
 	do {
 		zone_type--;
 		zone = pgdat->node_zones + zone_type;
-		if (managed_zone(zone)) {
+		if (populated_zone(zone)) {
 			zoneref_set_zone(zone, &zonerefs[nr_zones++]);
 			check_highest_zone(zone_type);
 		}
-- 
Gitee


From 4549c05d8c359e00b5a7d383b0a3c9a3a124b776 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Tue, 19 Jul 2022 16:31:22 +0800
Subject: [PATCH 443/674] mm: fix unexpected zeroed page mapping with zram swap

stable inclusion
from stable-v5.10.112
commit 20ed94f8181a25212e7404e44958e234f407624b
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=20ed94f8181a25212e7404e44958e234f407624b

--------------------------------

commit e914d8f00391520ecc4495dd0ca0124538ab7119 upstream.

Two processes under CLONE_VM cloning, user process can be corrupted by
seeing zeroed page unexpectedly.

      CPU A                        CPU B

  do_swap_page                do_swap_page
  SWP_SYNCHRONOUS_IO path     SWP_SYNCHRONOUS_IO path
  swap_readpage valid data
    swap_slot_free_notify
      delete zram entry
                              swap_readpage zeroed(invalid) data
                              pte_lock
                              map the *zero data* to userspace
                              pte_unlock
  pte_lock
  if (!pte_same)
    goto out_nomap;
  pte_unlock
  return and next refault will
  read zeroed data

The swap_slot_free_notify is bogus for CLONE_VM case since it doesn't
increase the refcount of swap slot at copy_mm so it couldn't catch up
whether it's safe or not to discard data from backing device.  In the
case, only the lock it could rely on to synchronize swap slot freeing is
page table lock.  Thus, this patch gets rid of the swap_slot_free_notify
function.  With this patch, CPU A will see correct data.

      CPU A                        CPU B

  do_swap_page                do_swap_page
  SWP_SYNCHRONOUS_IO path     SWP_SYNCHRONOUS_IO path
                              swap_readpage original data
                              pte_lock
                              map the original data
                              swap_free
                                swap_range_free
                                  bd_disk->fops->swap_slot_free_notify
  swap_readpage read zeroed data
                              pte_unlock
  pte_lock
  if (!pte_same)
    goto out_nomap;
  pte_unlock
  return
  on next refault will see mapped data by CPU B

The concern of the patch would increase memory consumption since it
could keep wasted memory with compressed form in zram as well as
uncompressed form in address space.  However, most of cases of zram uses
no readahead and do_swap_page is followed by swap_free so it will free
the compressed form from in zram quickly.

Link: https://lkml.kernel.org/r/YjTVVxIAsnKAXjTd@google.com
Fixes: 0bcac06f27d7 ("mm, swap: skip swapcache for swapin of synchronous device")
Reported-by: Ivan Babrou <ivan@cloudflare.com>
Tested-by: Ivan Babrou <ivan@cloudflare.com>
Signed-off-by: Minchan Kim <minchan@kernel.org>
Cc: Nitin Gupta <ngupta@vflare.org>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: David Hildenbrand <david@redhat.com>
Cc: <stable@vger.kernel.org>	[4.14+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 mm/page_io.c | 54 ----------------------------------------------------
 1 file changed, 54 deletions(-)

diff --git a/mm/page_io.c b/mm/page_io.c
index 21f3160d39a8..ee28c39e566e 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -69,54 +69,6 @@ void end_swap_bio_write(struct bio *bio)
 	bio_put(bio);
 }
 
-static void swap_slot_free_notify(struct page *page)
-{
-	struct swap_info_struct *sis;
-	struct gendisk *disk;
-	swp_entry_t entry;
-
-	/*
-	 * There is no guarantee that the page is in swap cache - the software
-	 * suspend code (at least) uses end_swap_bio_read() against a non-
-	 * swapcache page.  So we must check PG_swapcache before proceeding with
-	 * this optimization.
-	 */
-	if (unlikely(!PageSwapCache(page)))
-		return;
-
-	sis = page_swap_info(page);
-	if (data_race(!(sis->flags & SWP_BLKDEV)))
-		return;
-
-	/*
-	 * The swap subsystem performs lazy swap slot freeing,
-	 * expecting that the page will be swapped out again.
-	 * So we can avoid an unnecessary write if the page
-	 * isn't redirtied.
-	 * This is good for real swap storage because we can
-	 * reduce unnecessary I/O and enhance wear-leveling
-	 * if an SSD is used as the as swap device.
-	 * But if in-memory swap device (eg zram) is used,
-	 * this causes a duplicated copy between uncompressed
-	 * data in VM-owned memory and compressed data in
-	 * zram-owned memory.  So let's free zram-owned memory
-	 * and make the VM-owned decompressed page *dirty*,
-	 * so the page should be swapped out somewhere again if
-	 * we again wish to reclaim it.
-	 */
-	disk = sis->bdev->bd_disk;
-	entry.val = page_private(page);
-	if (disk->fops->swap_slot_free_notify && __swap_count(entry) == 1) {
-		unsigned long offset;
-
-		offset = swp_offset(entry);
-
-		SetPageDirty(page);
-		disk->fops->swap_slot_free_notify(sis->bdev,
-				offset);
-	}
-}
-
 static void end_swap_bio_read(struct bio *bio)
 {
 	struct page *page = bio_first_page_all(bio);
@@ -132,7 +84,6 @@ static void end_swap_bio_read(struct bio *bio)
 	}
 
 	SetPageUptodate(page);
-	swap_slot_free_notify(page);
 out:
 	unlock_page(page);
 	WRITE_ONCE(bio->bi_private, NULL);
@@ -411,11 +362,6 @@ int swap_readpage(struct page *page, bool synchronous)
 	if (sis->flags & SWP_SYNCHRONOUS_IO) {
 		ret = bdev_read_page(sis->bdev, swap_page_sector(page), page);
 		if (!ret) {
-			if (trylock_page(page)) {
-				swap_slot_free_notify(page);
-				unlock_page(page);
-			}
-
 			count_vm_event(PSWPIN);
 			goto out;
 		}
-- 
Gitee


From cad9c6583ef4ffd342a40b344ce38dc0b5f1de17 Mon Sep 17 00:00:00 2001
From: Patrick Wang <patrick.wang.shcn@gmail.com>
Date: Tue, 19 Jul 2022 16:31:23 +0800
Subject: [PATCH 444/674] mm: kmemleak: take a full lowmem check in
 kmemleak_*_phys()

stable inclusion
from stable-v5.10.112
commit 06c348fde545ec90e25de3e5bc4b814bff70ae9f
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=06c348fde545ec90e25de3e5bc4b814bff70ae9f

--------------------------------

commit 23c2d497de21f25898fbea70aeb292ab8acc8c94 upstream.

The kmemleak_*_phys() apis do not check the address for lowmem's min
boundary, while the caller may pass an address below lowmem, which will
trigger an oops:

  # echo scan > /sys/kernel/debug/kmemleak
  Unable to handle kernel paging request at virtual address ff5fffffffe00000
  Oops [#1]
  Modules linked in:
  CPU: 2 PID: 134 Comm: bash Not tainted 5.18.0-rc1-next-20220407 #33
  Hardware name: riscv-virtio,qemu (DT)
  epc : scan_block+0x74/0x15c
   ra : scan_block+0x72/0x15c
  epc : ffffffff801e5806 ra : ffffffff801e5804 sp : ff200000104abc30
   gp : ffffffff815cd4e8 tp : ff60000004cfa340 t0 : 0000000000000200
   t1 : 00aaaaaac23954cc t2 : 00000000000003ff s0 : ff200000104abc90
   s1 : ffffffff81b0ff28 a0 : 0000000000000000 a1 : ff5fffffffe01000
   a2 : ffffffff81b0ff28 a3 : 0000000000000002 a4 : 0000000000000001
   a5 : 0000000000000000 a6 : ff200000104abd7c a7 : 0000000000000005
   s2 : ff5fffffffe00ff9 s3 : ffffffff815cd998 s4 : ffffffff815d0e90
   s5 : ffffffff81b0ff28 s6 : 0000000000000020 s7 : ffffffff815d0eb0
   s8 : ffffffffffffffff s9 : ff5fffffffe00000 s10: ff5fffffffe01000
   s11: 0000000000000022 t3 : 00ffffffaa17db4c t4 : 000000000000000f
   t5 : 0000000000000001 t6 : 0000000000000000
  status: 0000000000000100 badaddr: ff5fffffffe00000 cause: 000000000000000d
    scan_gray_list+0x12e/0x1a6
    kmemleak_scan+0x2aa/0x57e
    kmemleak_write+0x32a/0x40c
    full_proxy_write+0x56/0x82
    vfs_write+0xa6/0x2a6
    ksys_write+0x6c/0xe2
    sys_write+0x22/0x2a
    ret_from_syscall+0x0/0x2

The callers may not quite know the actual address they pass(e.g. from
devicetree).  So the kmemleak_*_phys() apis should guarantee the address
they finally use is in lowmem range, so check the address for lowmem's
min boundary.

Link: https://lkml.kernel.org/r/20220413122925.33856-1-patrick.wang.shcn@gmail.com
Signed-off-by: Patrick Wang <patrick.wang.shcn@gmail.com>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 mm/kmemleak.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 90eb82299149..118be6533374 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1124,7 +1124,7 @@ EXPORT_SYMBOL(kmemleak_no_scan);
 void __ref kmemleak_alloc_phys(phys_addr_t phys, size_t size, int min_count,
 			       gfp_t gfp)
 {
-	if (!IS_ENABLED(CONFIG_HIGHMEM) || PHYS_PFN(phys) < max_low_pfn)
+	if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn)
 		kmemleak_alloc(__va(phys), size, min_count, gfp);
 }
 EXPORT_SYMBOL(kmemleak_alloc_phys);
@@ -1138,7 +1138,7 @@ EXPORT_SYMBOL(kmemleak_alloc_phys);
  */
 void __ref kmemleak_free_part_phys(phys_addr_t phys, size_t size)
 {
-	if (!IS_ENABLED(CONFIG_HIGHMEM) || PHYS_PFN(phys) < max_low_pfn)
+	if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn)
 		kmemleak_free_part(__va(phys), size);
 }
 EXPORT_SYMBOL(kmemleak_free_part_phys);
@@ -1150,7 +1150,7 @@ EXPORT_SYMBOL(kmemleak_free_part_phys);
  */
 void __ref kmemleak_not_leak_phys(phys_addr_t phys)
 {
-	if (!IS_ENABLED(CONFIG_HIGHMEM) || PHYS_PFN(phys) < max_low_pfn)
+	if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn)
 		kmemleak_not_leak(__va(phys));
 }
 EXPORT_SYMBOL(kmemleak_not_leak_phys);
@@ -1162,7 +1162,7 @@ EXPORT_SYMBOL(kmemleak_not_leak_phys);
  */
 void __ref kmemleak_ignore_phys(phys_addr_t phys)
 {
-	if (!IS_ENABLED(CONFIG_HIGHMEM) || PHYS_PFN(phys) < max_low_pfn)
+	if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn)
 		kmemleak_ignore(__va(phys));
 }
 EXPORT_SYMBOL(kmemleak_ignore_phys);
-- 
Gitee


From 85fc72e3a63a7017a2c50040f40a96baab16577d Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Tue, 19 Jul 2022 16:31:24 +0800
Subject: [PATCH 445/674] KVM: x86/mmu: Resolve nx_huge_pages when kvm.ko is
 loaded

stable inclusion
from stable-v5.10.112
commit 342454231ee5f2c2782f5510cab2e7a968486fef
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=342454231ee5f2c2782f5510cab2e7a968486fef

--------------------------------

commit 1d0e84806047f38027d7572adb4702ef7c09b317 upstream.

Resolve nx_huge_pages to true/false when kvm.ko is loaded, leaving it as
-1 is technically undefined behavior when its value is read out by
param_get_bool(), as boolean values are supposed to be '0' or '1'.

Alternatively, KVM could define a custom getter for the param, but the
auto value doesn't depend on the vendor module in any way, and printing
"auto" would be unnecessarily unfriendly to the user.

In addition to fixing the undefined behavior, resolving the auto value
also fixes the scenario where the auto value resolves to N and no vendor
module is loaded.  Previously, -1 would result in Y being printed even
though KVM would ultimately disable the mitigation.

Rename the existing MMU module init/exit helpers to clarify that they're
invoked with respect to the vendor module, and add comments to document
why KVM has two separate "module init" flows.

  =========================================================================
  UBSAN: invalid-load in kernel/params.c:320:33
  load of value 255 is not a valid value for type '_Bool'
  CPU: 6 PID: 892 Comm: tail Not tainted 5.17.0-rc3+ #799
  Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
  Call Trace:
   <TASK>
   dump_stack_lvl+0x34/0x44
   ubsan_epilogue+0x5/0x40
   __ubsan_handle_load_invalid_value.cold+0x43/0x48
   param_get_bool.cold+0xf/0x14
   param_attr_show+0x55/0x80
   module_attr_show+0x1c/0x30
   sysfs_kf_seq_show+0x93/0xc0
   seq_read_iter+0x11c/0x450
   new_sync_read+0x11b/0x1a0
   vfs_read+0xf0/0x190
   ksys_read+0x5f/0xe0
   do_syscall_64+0x3b/0xc0
   entry_SYSCALL_64_after_hwframe+0x44/0xae
   </TASK>
  =========================================================================

Fixes: b8e8c8303ff2 ("kvm: mmu: ITLB_MULTIHIT mitigation")
Cc: stable@vger.kernel.org
Reported-by: Bruno Goncalves <bgoncalv@redhat.com>
Reported-by: Jan Stancek <jstancek@redhat.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20220331221359.3912754-1-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 arch/x86/include/asm/kvm_host.h |  5 +++--
 arch/x86/kvm/mmu/mmu.c          | 20 ++++++++++++++++----
 arch/x86/kvm/x86.c              | 20 ++++++++++++++++++--
 3 files changed, 37 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 559063fd8907..3d9da4629325 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1360,8 +1360,9 @@ static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm)
 		return -ENOTSUPP;
 }
 
-int kvm_mmu_module_init(void);
-void kvm_mmu_module_exit(void);
+void kvm_mmu_x86_module_init(void);
+int kvm_mmu_vendor_module_init(void);
+void kvm_mmu_vendor_module_exit(void);
 
 void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
 int kvm_mmu_create(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 9506cfcb86be..99ae11011ed4 100755
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5878,12 +5878,24 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
 	return 0;
 }
 
-int kvm_mmu_module_init(void)
+/*
+ * nx_huge_pages needs to be resolved to true/false when kvm.ko is loaded, as
+ * its default value of -1 is technically undefined behavior for a boolean.
+ */
+void kvm_mmu_x86_module_init(void)
 {
-	int ret = -ENOMEM;
-
 	if (nx_huge_pages == -1)
 		__set_nx_huge_pages(get_nx_auto_mode());
+}
+
+/*
+ * The bulk of the MMU initialization is deferred until the vendor module is
+ * loaded as many of the masks/values may be modified by VMX or SVM, i.e. need
+ * to be reset when a potentially different vendor module is loaded.
+ */
+int kvm_mmu_vendor_module_init(void)
+{
+	int ret = -ENOMEM;
 
 	/*
 	 * MMU roles use union aliasing which is, generally speaking, an
@@ -5957,7 +5969,7 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
 	mmu_free_memory_caches(vcpu);
 }
 
-void kvm_mmu_module_exit(void)
+void kvm_mmu_vendor_module_exit(void)
 {
 	mmu_destroy_caches();
 	percpu_counter_destroy(&kvm_total_used_mmu_pages);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1c45d9bafe0a..e43760895eec 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -8098,7 +8098,7 @@ int kvm_arch_init(void *opaque)
 		goto out_free_x86_emulator_cache;
 	}
 
-	r = kvm_mmu_module_init();
+	r = kvm_mmu_vendor_module_init();
 	if (r)
 		goto out_free_percpu;
 
@@ -8158,7 +8158,7 @@ void kvm_arch_exit(void)
 	cancel_work_sync(&pvclock_gtod_work);
 #endif
 	kvm_x86_ops.hardware_enable = NULL;
-	kvm_mmu_module_exit();
+	kvm_mmu_vendor_module_exit();
 	free_percpu(user_return_msrs);
 	kmem_cache_destroy(x86_emulator_cache);
 	kmem_cache_destroy(x86_fpu_cache);
@@ -11549,3 +11549,19 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_update_request);
+
+static int __init kvm_x86_init(void)
+{
+	kvm_mmu_x86_module_init();
+	return 0;
+}
+module_init(kvm_x86_init);
+
+static void __exit kvm_x86_exit(void)
+{
+	/*
+	 * If module_init() is implemented, module_exit() must also be
+	 * implemented to allow module unload.
+	 */
+}
+module_exit(kvm_x86_exit);
-- 
Gitee


From 27c1f5f2de382058cca4a54c75348162d4d1b901 Mon Sep 17 00:00:00 2001
From: Johan Hovold <johan@kernel.org>
Date: Tue, 19 Jul 2022 16:31:25 +0800
Subject: [PATCH 446/674] memory: renesas-rpc-if: fix platform-device leak in
 error path

stable inclusion
from stable-v5.10.112
commit c089ffc846c85f200db34ad208338f4f81a6d82d
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=c089ffc846c85f200db34ad208338f4f81a6d82d

--------------------------------

commit b452dbf24d7d9a990d70118462925f6ee287d135 upstream.

Make sure to free the flash platform device in the event that
registration fails during probe.

Fixes: ca7d8b980b67 ("memory: add Renesas RPC-IF driver")
Cc: stable@vger.kernel.org      # 5.8
Cc: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
Signed-off-by: Johan Hovold <johan@kernel.org>
Link: https://lore.kernel.org/r/20220303180632.3194-1-johan@kernel.org
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/memory/renesas-rpc-if.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/memory/renesas-rpc-if.c b/drivers/memory/renesas-rpc-if.c
index 9019121a80f5..781af51e3f79 100644
--- a/drivers/memory/renesas-rpc-if.c
+++ b/drivers/memory/renesas-rpc-if.c
@@ -592,6 +592,7 @@ static int rpcif_probe(struct platform_device *pdev)
 	struct platform_device *vdev;
 	struct device_node *flash;
 	const char *name;
+	int ret;
 
 	flash = of_get_next_child(pdev->dev.of_node, NULL);
 	if (!flash) {
@@ -615,7 +616,14 @@ static int rpcif_probe(struct platform_device *pdev)
 		return -ENOMEM;
 	vdev->dev.parent = &pdev->dev;
 	platform_set_drvdata(pdev, vdev);
-	return platform_device_add(vdev);
+
+	ret = platform_device_add(vdev);
+	if (ret) {
+		platform_device_put(vdev);
+		return ret;
+	}
+
+	return 0;
 }
 
 static int rpcif_remove(struct platform_device *pdev)
-- 
Gitee


From 2722edfe52ea5b7ed9a6a2f2a818ba5d8f2f518e Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Tue, 19 Jul 2022 16:31:26 +0800
Subject: [PATCH 447/674] gcc-plugins: latent_entropy: use /dev/urandom

stable inclusion
from stable-v5.10.112
commit cc21ae932656483b07982afcec7e38a5bd6acc0c
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=cc21ae932656483b07982afcec7e38a5bd6acc0c

--------------------------------

commit c40160f2998c897231f8454bf797558d30a20375 upstream.

While the latent entropy plugin mostly doesn't derive entropy from
get_random_const() for measuring the call graph, when __latent_entropy is
applied to a constant, then it's initialized statically to output from
get_random_const(). In that case, this data is derived from a 64-bit
seed, which means a buffer of 512 bits doesn't really have that amount
of compile-time entropy.

This patch fixes that shortcoming by just buffering chunks of
/dev/urandom output and doling it out as requested.

At the same time, it's important that we don't break the use of
-frandom-seed, for people who want the runtime benefits of the latent
entropy plugin, while still having compile-time determinism. In that
case, we detect whether gcc's set_random_seed() has been called by
making a call to get_random_seed(noinit=true) in the plugin init
function, which is called after set_random_seed() is called but before
anything that calls get_random_seed(noinit=false), and seeing if it's
zero or not. If it's not zero, we're in deterministic mode, and so we
just generate numbers with a basic xorshift prng.

Note that we don't detect if -frandom-seed is being used using the
documented local_tick variable, because it's assigned via:
   local_tick = (unsigned) tv.tv_sec * 1000 + tv.tv_usec / 1000;
which may well overflow and become -1 on its own, and so isn't
reliable: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105171

[kees: The 256 byte rnd_buf size was chosen based on average (250),
 median (64), and std deviation (575) bytes of used entropy for a
 defconfig x86_64 build]

Fixes: 38addce8b600 ("gcc-plugins: Add latent_entropy plugin")
Cc: stable@vger.kernel.org
Cc: PaX Team <pageexec@freemail.hu>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20220405222815.21155-1-Jason@zx2c4.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 scripts/gcc-plugins/latent_entropy_plugin.c | 44 +++++++++++++--------
 1 file changed, 27 insertions(+), 17 deletions(-)

diff --git a/scripts/gcc-plugins/latent_entropy_plugin.c b/scripts/gcc-plugins/latent_entropy_plugin.c
index cbe1d6c4b1a5..c84bef1d2895 100644
--- a/scripts/gcc-plugins/latent_entropy_plugin.c
+++ b/scripts/gcc-plugins/latent_entropy_plugin.c
@@ -86,25 +86,31 @@ static struct plugin_info latent_entropy_plugin_info = {
 	.help		= "disable\tturn off latent entropy instrumentation\n",
 };
 
-static unsigned HOST_WIDE_INT seed;
-/*
- * get_random_seed() (this is a GCC function) generates the seed.
- * This is a simple random generator without any cryptographic security because
- * the entropy doesn't come from here.
- */
+static unsigned HOST_WIDE_INT deterministic_seed;
+static unsigned HOST_WIDE_INT rnd_buf[32];
+static size_t rnd_idx = ARRAY_SIZE(rnd_buf);
+static int urandom_fd = -1;
+
 static unsigned HOST_WIDE_INT get_random_const(void)
 {
-	unsigned int i;
-	unsigned HOST_WIDE_INT ret = 0;
-
-	for (i = 0; i < 8 * sizeof(ret); i++) {
-		ret = (ret << 1) | (seed & 1);
-		seed >>= 1;
-		if (ret & 1)
-			seed ^= 0xD800000000000000ULL;
+	if (deterministic_seed) {
+		unsigned HOST_WIDE_INT w = deterministic_seed;
+		w ^= w << 13;
+		w ^= w >> 7;
+		w ^= w << 17;
+		deterministic_seed = w;
+		return deterministic_seed;
 	}
 
-	return ret;
+	if (urandom_fd < 0) {
+		urandom_fd = open("/dev/urandom", O_RDONLY);
+		gcc_assert(urandom_fd >= 0);
+	}
+	if (rnd_idx >= ARRAY_SIZE(rnd_buf)) {
+		gcc_assert(read(urandom_fd, rnd_buf, sizeof(rnd_buf)) == sizeof(rnd_buf));
+		rnd_idx = 0;
+	}
+	return rnd_buf[rnd_idx++];
 }
 
 static tree tree_get_random_const(tree type)
@@ -549,8 +555,6 @@ static void latent_entropy_start_unit(void *gcc_data __unused,
 	tree type, id;
 	int quals;
 
-	seed = get_random_seed(false);
-
 	if (in_lto_p)
 		return;
 
@@ -585,6 +589,12 @@ __visible int plugin_init(struct plugin_name_args *plugin_info,
 	const struct plugin_argument * const argv = plugin_info->argv;
 	int i;
 
+	/*
+	 * Call get_random_seed() with noinit=true, so that this returns
+	 * 0 in the case where no seed has been passed via -frandom-seed.
+	 */
+	deterministic_seed = get_random_seed(true);
+
 	static const struct ggc_root_tab gt_ggc_r_gt_latent_entropy[] = {
 		{
 			.base = &latent_entropy_decl,
-- 
Gitee


From 7fe75e370e687ab849938c7c32f3960cb83eda28 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= <toke@toke.dk>
Date: Tue, 19 Jul 2022 16:31:27 +0800
Subject: [PATCH 448/674] ath9k: Properly clear TX status area before reporting
 to mac80211
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

stable inclusion
from stable-v5.10.112
commit 0f65cedae5009ad004dde18e66694615adfc7365
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=0f65cedae5009ad004dde18e66694615adfc7365

--------------------------------

commit 037250f0a45cf9ecf5b52d4b9ff8eadeb609c800 upstream.

The ath9k driver was not properly clearing the status area in the
ieee80211_tx_info struct before reporting TX status to mac80211. Instead,
it was manually filling in fields, which meant that fields introduced later
were left as-is.

Conveniently, mac80211 actually provides a helper to zero out the status
area, so use that to make sure we zero everything.

The last commit touching the driver function writing the status information
seems to have actually been fixing an issue that was also caused by the
area being uninitialised; but it only added clearing of a single field
instead of the whole struct. That is now redundant, though, so revert that
commit and use it as a convenient Fixes tag.

Fixes: cc591d77aba1 ("ath9k: Make sure to zero status.tx_time before reporting TX status")
Reported-by: Bagas Sanjaya <bagasdotme@gmail.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
Tested-by: Bagas Sanjaya <bagasdotme@gmail.com>
Signed-off-by: Kalle Valo <kvalo@kernel.org>
Link: https://lore.kernel.org/r/20220330164409.16645-1-toke@toke.dk
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/net/wireless/ath/ath9k/xmit.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/net/wireless/ath/ath9k/xmit.c b/drivers/net/wireless/ath/ath9k/xmit.c
index 5691bd6eb82c..d9ece2090a1d 100644
--- a/drivers/net/wireless/ath/ath9k/xmit.c
+++ b/drivers/net/wireless/ath/ath9k/xmit.c
@@ -2512,6 +2512,8 @@ static void ath_tx_rc_status(struct ath_softc *sc, struct ath_buf *bf,
 	struct ath_hw *ah = sc->sc_ah;
 	u8 i, tx_rateindex;
 
+	ieee80211_tx_info_clear_status(tx_info);
+
 	if (txok)
 		tx_info->status.ack_signal = ts->ts_rssi;
 
@@ -2554,9 +2556,6 @@ static void ath_tx_rc_status(struct ath_softc *sc, struct ath_buf *bf,
 	}
 
 	tx_info->status.rates[tx_rateindex].count = ts->ts_longretry + 1;
-
-	/* we report airtime in ath_tx_count_airtime(), don't report twice */
-	tx_info->status.tx_time = 0;
 }
 
 static void ath_tx_processq(struct ath_softc *sc, struct ath_txq *txq)
-- 
Gitee


From 0d73660552f81e9219575f5cf9b39d28aaf13ad6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= <toke@redhat.com>
Date: Tue, 19 Jul 2022 16:31:28 +0800
Subject: [PATCH 449/674] ath9k: Fix usage of driver-private space in tx_info
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

stable inclusion
from stable-v5.10.112
commit 9b7ec35253c9d23f3471e8ae6e7a956d86bb2a2c
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=9b7ec35253c9d23f3471e8ae6e7a956d86bb2a2c

--------------------------------

commit 5a6b06f5927c940fa44026695779c30b7536474c upstream.

The ieee80211_tx_info_clear_status() helper also clears the rate counts and
the driver-private part of struct ieee80211_tx_info, so using it breaks
quite a few other things. So back out of using it, and instead define a
ath-internal helper that only clears the area between the
status_driver_data and the rates info. Combined with moving the
ath_frame_info struct to status_driver_data, this avoids clearing anything
we shouldn't be, and so we can keep the existing code for handling the rate
information.

While fixing this I also noticed that the setting of
tx_info->status.rates[tx_rateindex].count on hardware underrun errors was
always immediately overridden by the normal setting of the same fields, so
rearrange the code so that the underrun detection actually takes effect.

The new helper could be generalised to a 'memset_between()' helper, but
leave it as a driver-internal helper for now since this needs to go to
stable.

Cc: stable@vger.kernel.org
Reported-by: Peter Seiderer <ps.report@gmx.net>
Fixes: 037250f0a45c ("ath9k: Properly clear TX status area before reporting to mac80211")
Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
Reviewed-by: Peter Seiderer <ps.report@gmx.net>
Tested-by: Peter Seiderer <ps.report@gmx.net>
Signed-off-by: Kalle Valo <kvalo@kernel.org>
Link: https://lore.kernel.org/r/20220404204800.2681133-1-toke@toke.dk
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/net/wireless/ath/ath9k/main.c |  2 +-
 drivers/net/wireless/ath/ath9k/xmit.c | 30 ++++++++++++++++++---------
 2 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/drivers/net/wireless/ath/ath9k/main.c b/drivers/net/wireless/ath/ath9k/main.c
index af367696fd92..ac354dfc5055 100644
--- a/drivers/net/wireless/ath/ath9k/main.c
+++ b/drivers/net/wireless/ath/ath9k/main.c
@@ -839,7 +839,7 @@ static bool ath9k_txq_list_has_key(struct list_head *txq_list, u32 keyix)
 			continue;
 
 		txinfo = IEEE80211_SKB_CB(bf->bf_mpdu);
-		fi = (struct ath_frame_info *)&txinfo->rate_driver_data[0];
+		fi = (struct ath_frame_info *)&txinfo->status.status_driver_data[0];
 		if (fi->keyix == keyix)
 			return true;
 	}
diff --git a/drivers/net/wireless/ath/ath9k/xmit.c b/drivers/net/wireless/ath/ath9k/xmit.c
index d9ece2090a1d..6555abf02f18 100644
--- a/drivers/net/wireless/ath/ath9k/xmit.c
+++ b/drivers/net/wireless/ath/ath9k/xmit.c
@@ -141,8 +141,8 @@ static struct ath_frame_info *get_frame_info(struct sk_buff *skb)
 {
 	struct ieee80211_tx_info *tx_info = IEEE80211_SKB_CB(skb);
 	BUILD_BUG_ON(sizeof(struct ath_frame_info) >
-		     sizeof(tx_info->rate_driver_data));
-	return (struct ath_frame_info *) &tx_info->rate_driver_data[0];
+		     sizeof(tx_info->status.status_driver_data));
+	return (struct ath_frame_info *) &tx_info->status.status_driver_data[0];
 }
 
 static void ath_send_bar(struct ath_atx_tid *tid, u16 seqno)
@@ -2501,6 +2501,16 @@ static void ath_tx_complete_buf(struct ath_softc *sc, struct ath_buf *bf,
 	spin_unlock_irqrestore(&sc->tx.txbuflock, flags);
 }
 
+static void ath_clear_tx_status(struct ieee80211_tx_info *tx_info)
+{
+	void *ptr = &tx_info->status;
+
+	memset(ptr + sizeof(tx_info->status.rates), 0,
+	       sizeof(tx_info->status) -
+	       sizeof(tx_info->status.rates) -
+	       sizeof(tx_info->status.status_driver_data));
+}
+
 static void ath_tx_rc_status(struct ath_softc *sc, struct ath_buf *bf,
 			     struct ath_tx_status *ts, int nframes, int nbad,
 			     int txok)
@@ -2512,7 +2522,7 @@ static void ath_tx_rc_status(struct ath_softc *sc, struct ath_buf *bf,
 	struct ath_hw *ah = sc->sc_ah;
 	u8 i, tx_rateindex;
 
-	ieee80211_tx_info_clear_status(tx_info);
+	ath_clear_tx_status(tx_info);
 
 	if (txok)
 		tx_info->status.ack_signal = ts->ts_rssi;
@@ -2528,6 +2538,13 @@ static void ath_tx_rc_status(struct ath_softc *sc, struct ath_buf *bf,
 	tx_info->status.ampdu_len = nframes;
 	tx_info->status.ampdu_ack_len = nframes - nbad;
 
+	tx_info->status.rates[tx_rateindex].count = ts->ts_longretry + 1;
+
+	for (i = tx_rateindex + 1; i < hw->max_rates; i++) {
+		tx_info->status.rates[i].count = 0;
+		tx_info->status.rates[i].idx = -1;
+	}
+
 	if ((ts->ts_status & ATH9K_TXERR_FILT) == 0 &&
 	    (tx_info->flags & IEEE80211_TX_CTL_NO_ACK) == 0) {
 		/*
@@ -2549,13 +2566,6 @@ static void ath_tx_rc_status(struct ath_softc *sc, struct ath_buf *bf,
 			tx_info->status.rates[tx_rateindex].count =
 				hw->max_rate_tries;
 	}
-
-	for (i = tx_rateindex + 1; i < hw->max_rates; i++) {
-		tx_info->status.rates[i].count = 0;
-		tx_info->status.rates[i].idx = -1;
-	}
-
-	tx_info->status.rates[tx_rateindex].count = ts->ts_longretry + 1;
 }
 
 static void ath_tx_processq(struct ath_softc *sc, struct ath_txq *txq)
-- 
Gitee


From 0263aca6a297fa8aa203747352714924f2eca0cc Mon Sep 17 00:00:00 2001
From: Jia-Ju Bai <baijiaju1990@gmail.com>
Date: Tue, 19 Jul 2022 16:31:29 +0800
Subject: [PATCH 450/674] btrfs: fix root ref counts in error handling in
 btrfs_get_root_ref

stable inclusion
from stable-v5.10.112
commit 1d2eda18f6ffbd9902594469c6e1a055014eb2ac
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=1d2eda18f6ffbd9902594469c6e1a055014eb2ac

--------------------------------

commit 168a2f776b9762f4021421008512dd7ab7474df1 upstream.

In btrfs_get_root_ref(), when btrfs_insert_fs_root() fails,
btrfs_put_root() can happen for two reasons:

- the root already exists in the tree, in that case it returns the
  reference obtained in btrfs_lookup_fs_root()

- another error so the cleanup is done in the fail label

Calling btrfs_put_root() unconditionally would lead to double decrement
of the root reference possibly freeing it in the second case.

Reported-by: TOTE Robot <oslab@tsinghua.edu.cn>
Fixes: bc44d7c4b2b1 ("btrfs: push btrfs_grab_fs_root into btrfs_get_fs_root")
CC: stable@vger.kernel.org # 5.10+
Signed-off-by: Jia-Ju Bai <baijiaju1990@gmail.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 fs/btrfs/disk-io.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a5bcad027883..87e55b024ac2 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1596,9 +1596,10 @@ static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info,
 
 	ret = btrfs_insert_fs_root(fs_info, root);
 	if (ret) {
-		btrfs_put_root(root);
-		if (ret == -EEXIST)
+		if (ret == -EEXIST) {
+			btrfs_put_root(root);
 			goto again;
+		}
 		goto fail;
 	}
 	return root;
-- 
Gitee


From ee239ac33693cabb8d0dc9c9e140bd046b467d16 Mon Sep 17 00:00:00 2001
From: Naohiro Aota <naohiro.aota@wdc.com>
Date: Tue, 19 Jul 2022 16:31:30 +0800
Subject: [PATCH 451/674] btrfs: mark resumed async balance as writing

stable inclusion
from stable-v5.10.112
commit 5e4dd1799883941cd9590415a47d967909c8d2ac
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=5e4dd1799883941cd9590415a47d967909c8d2ac

--------------------------------

commit a690e5f2db4d1dca742ce734aaff9f3112d63764 upstream.

When btrfs balance is interrupted with umount, the background balance
resumes on the next mount. There is a potential deadlock with FS freezing
here like as described in commit 26559780b953 ("btrfs: zoned: mark
relocation as writing"). Mark the process as sb_writing to avoid it.

Reviewed-by: Filipe Manana <fdmanana@suse.com>
CC: stable@vger.kernel.org # 4.9+
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 fs/btrfs/volumes.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index e462de991723..366d04763864 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4220,10 +4220,12 @@ static int balance_kthread(void *data)
 	struct btrfs_fs_info *fs_info = data;
 	int ret = 0;
 
+	sb_start_write(fs_info->sb);
 	mutex_lock(&fs_info->balance_mutex);
 	if (fs_info->balance_ctl)
 		ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
 	mutex_unlock(&fs_info->balance_mutex);
+	sb_end_write(fs_info->sb);
 
 	return ret;
 }
-- 
Gitee


From 9fe75bc0d31f056bf8ed165bdbaf08125f2d65f0 Mon Sep 17 00:00:00 2001
From: Tim Crawford <tcrawford@system76.com>
Date: Tue, 19 Jul 2022 16:31:31 +0800
Subject: [PATCH 452/674] ALSA: hda/realtek: Add quirk for Clevo PD50PNT

stable inclusion
from stable-v5.10.112
commit 163e162471308932c2f57f98a9a5fc25a8991247
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=163e162471308932c2f57f98a9a5fc25a8991247

--------------------------------

commit 9eb6f5c388060d8cef3c8b616cc31b765e022359 upstream.

Fixes speaker output and headset detection on Clevo PD50PNT.

Signed-off-by: Tim Crawford <tcrawford@system76.com>
Cc: <stable@vger.kernel.org>
Link: https://lore.kernel.org/r/20220405182029.27431-1-tcrawford@system76.com
Signed-off-by: Takashi Iwai <tiwai@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 sound/pci/hda/patch_realtek.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
index b886326ce9b9..ca0b8b9da577 100644
--- a/sound/pci/hda/patch_realtek.c
+++ b/sound/pci/hda/patch_realtek.c
@@ -2626,6 +2626,7 @@ static const struct snd_pci_quirk alc882_fixup_tbl[] = {
 	SND_PCI_QUIRK(0x1558, 0x65e1, "Clevo PB51[ED][DF]", ALC1220_FIXUP_CLEVO_PB51ED_PINS),
 	SND_PCI_QUIRK(0x1558, 0x65e5, "Clevo PC50D[PRS](?:-D|-G)?", ALC1220_FIXUP_CLEVO_PB51ED_PINS),
 	SND_PCI_QUIRK(0x1558, 0x65f1, "Clevo PC50HS", ALC1220_FIXUP_CLEVO_PB51ED_PINS),
+	SND_PCI_QUIRK(0x1558, 0x65f5, "Clevo PD50PN[NRT]", ALC1220_FIXUP_CLEVO_PB51ED_PINS),
 	SND_PCI_QUIRK(0x1558, 0x67d1, "Clevo PB71[ER][CDF]", ALC1220_FIXUP_CLEVO_PB51ED_PINS),
 	SND_PCI_QUIRK(0x1558, 0x67e1, "Clevo PB71[DE][CDF]", ALC1220_FIXUP_CLEVO_PB51ED_PINS),
 	SND_PCI_QUIRK(0x1558, 0x67e5, "Clevo PC70D[PRS](?:-D|-G)?", ALC1220_FIXUP_CLEVO_PB51ED_PINS),
-- 
Gitee


From 393019e7734575f237cf27208be878e6c32df16b Mon Sep 17 00:00:00 2001
From: Tao Jin <tao-j@outlook.com>
Date: Tue, 19 Jul 2022 16:31:32 +0800
Subject: [PATCH 453/674] ALSA: hda/realtek: add quirk for Lenovo Thinkpad X12
 speakers

stable inclusion
from stable-v5.10.112
commit 48d070ca5e7e015b7aa24d355656037863247910
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=48d070ca5e7e015b7aa24d355656037863247910

--------------------------------

commit 264fb03497ec1c7841bba872571bcd11beed57a7 upstream.

For this specific device on Lenovo Thinkpad X12 tablet, the verbs were
dumped by qemu running a guest OS that init this codec properly.
After studying the dump, it turns out that
the same quirk used by the other Lenovo devices can be reused.

The patch was tested working against the mainline kernel.

Cc: <stable@vger.kernel.org>
Signed-off-by: Tao Jin <tao-j@outlook.com>
Link: https://lore.kernel.org/r/CO6PR03MB6241CD73310B37858FE64C85E1E89@CO6PR03MB6241.namprd03.prod.outlook.com
Signed-off-by: Takashi Iwai <tiwai@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 sound/pci/hda/patch_realtek.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
index ca0b8b9da577..11d653190e6e 100644
--- a/sound/pci/hda/patch_realtek.c
+++ b/sound/pci/hda/patch_realtek.c
@@ -8995,6 +8995,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = {
 	SND_PCI_QUIRK(0x17aa, 0x505d, "Thinkpad", ALC298_FIXUP_TPT470_DOCK),
 	SND_PCI_QUIRK(0x17aa, 0x505f, "Thinkpad", ALC298_FIXUP_TPT470_DOCK),
 	SND_PCI_QUIRK(0x17aa, 0x5062, "Thinkpad", ALC298_FIXUP_TPT470_DOCK),
+	SND_PCI_QUIRK(0x17aa, 0x508b, "Thinkpad X12 Gen 1", ALC287_FIXUP_LEGION_15IMHG05_SPEAKERS),
 	SND_PCI_QUIRK(0x17aa, 0x5109, "Thinkpad", ALC269_FIXUP_LIMIT_INT_MIC_BOOST),
 	SND_PCI_QUIRK(0x17aa, 0x511e, "Thinkpad", ALC298_FIXUP_TPT470_DOCK),
 	SND_PCI_QUIRK(0x17aa, 0x511f, "Thinkpad", ALC298_FIXUP_TPT470_DOCK),
-- 
Gitee


From 5f9cced69272c308e918adc6bc33f99691b4ca2e Mon Sep 17 00:00:00 2001
From: "Fabio M. De Francesco" <fmdefrancesco@gmail.com>
Date: Tue, 19 Jul 2022 16:31:33 +0800
Subject: [PATCH 454/674] ALSA: pcm: Test for "silence" field in struct
 "pcm_format_data"

stable inclusion
from stable-v5.10.112
commit 912797e54c99a98f0722f21313e13a3938bb6dba
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=912797e54c99a98f0722f21313e13a3938bb6dba

--------------------------------

commit 2f7a26abb8241a0208c68d22815aa247c5ddacab upstream.

Syzbot reports "KASAN: null-ptr-deref Write in
snd_pcm_format_set_silence".[1]

It is due to missing validation of the "silence" field of struct
"pcm_format_data" in "pcm_formats" array.

Add a test for valid "pat" and, if it is not so, return -EINVAL.

[1] https://lore.kernel.org/lkml/000000000000d188ef05dc2c7279@google.com/

Reported-and-tested-by: syzbot+205eb15961852c2c5974@syzkaller.appspotmail.com
Signed-off-by: Fabio M. De Francesco <fmdefrancesco@gmail.com>
Cc: <stable@vger.kernel.org>
Link: https://lore.kernel.org/r/20220409012655.9399-1-fmdefrancesco@gmail.com
Signed-off-by: Takashi Iwai <tiwai@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 sound/core/pcm_misc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sound/core/pcm_misc.c b/sound/core/pcm_misc.c
index 257d412eac5d..30f0f96e0000 100644
--- a/sound/core/pcm_misc.c
+++ b/sound/core/pcm_misc.c
@@ -429,7 +429,7 @@ int snd_pcm_format_set_silence(snd_pcm_format_t format, void *data, unsigned int
 		return 0;
 	width = pcm_formats[(INT)format].phys; /* physical width */
 	pat = pcm_formats[(INT)format].silence;
-	if (! width)
+	if (!width || !pat)
 		return -EINVAL;
 	/* signed or 1 byte data */
 	if (pcm_formats[(INT)format].signd == 1 || width <= 8) {
-- 
Gitee


From bc08c8d7dc05ab89ef8a64a206f12fd80b1b681d Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 19 Jul 2022 16:31:34 +0800
Subject: [PATCH 455/674] nl80211: correctly check NL80211_ATTR_REG_ALPHA2 size

stable inclusion
from stable-v5.10.112
commit 659214603bf26d2699039099cad1bea5b13bcae0
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=659214603bf26d2699039099cad1bea5b13bcae0

--------------------------------

commit 6624bb34b4eb19f715db9908cca00122748765d7 upstream.

We need this to be at least two bytes, so we can access
alpha2[0] and alpha2[1]. It may be three in case some
userspace used NUL-termination since it was NLA_STRING
(and we also push it out with NUL-termination).

Cc: stable@vger.kernel.org
Reported-by: Lee Jones <lee.jones@linaro.org>
Link: https://lore.kernel.org/r/20220411114201.fd4a31f06541.Ie7ff4be2cf348d8cc28ed0d626fc54becf7ea799@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 net/wireless/nl80211.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 0df8b9a19952..12f44ad4e0d8 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -475,7 +475,8 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
 				   .len = IEEE80211_MAX_MESH_ID_LEN },
 	[NL80211_ATTR_MPATH_NEXT_HOP] = NLA_POLICY_ETH_ADDR_COMPAT,
 
-	[NL80211_ATTR_REG_ALPHA2] = { .type = NLA_STRING, .len = 2 },
+	/* allow 3 for NUL-termination, we used to declare this NLA_STRING */
+	[NL80211_ATTR_REG_ALPHA2] = NLA_POLICY_RANGE(NLA_BINARY, 2, 3),
 	[NL80211_ATTR_REG_RULES] = { .type = NLA_NESTED },
 
 	[NL80211_ATTR_BSS_CTS_PROT] = { .type = NLA_U8 },
-- 
Gitee


From 87c8fc794e1ad4c982a5f14d61451e4414530e88 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Tue, 19 Jul 2022 16:31:35 +0800
Subject: [PATCH 456/674] ipv6: fix panic when forwarding a pkt with no in6 dev

stable inclusion
from stable-v5.10.112
commit a263712ba8c9ded25dd9d2d5ced11bcea5b33a3e
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=a263712ba8c9ded25dd9d2d5ced11bcea5b33a3e

--------------------------------

commit e3fa461d8b0e185b7da8a101fe94dfe6dd500ac0 upstream.

kongweibin reported a kernel panic in ip6_forward() when input interface
has no in6 dev associated.

The following tc commands were used to reproduce this panic:
tc qdisc del dev vxlan100 root
tc qdisc add dev vxlan100 root netem corrupt 5%

CC: stable@vger.kernel.org
Fixes: ccd27f05ae7b ("ipv6: fix 'disable_policy' for fwd packets")
Reported-by: kongweibin <kongweibin2@huawei.com>
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 net/ipv6/ip6_output.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 2aa39ce7093d..05e19e5d6514 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -508,7 +508,7 @@ int ip6_forward(struct sk_buff *skb)
 		goto drop;
 
 	if (!net->ipv6.devconf_all->disable_policy &&
-	    !idev->cnf.disable_policy &&
+	    (!idev || !idev->cnf.disable_policy) &&
 	    !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
 		goto drop;
-- 
Gitee


From c8051bbbb4dc679a4d608a04540905776581c0d8 Mon Sep 17 00:00:00 2001
From: Melissa Wen <mwen@igalia.com>
Date: Tue, 19 Jul 2022 16:31:36 +0800
Subject: [PATCH 457/674] drm/amd/display: don't ignore alpha property on
 pre-multiplied mode

stable inclusion
from stable-v5.10.112
commit 68ae52efa132601bf11f5a3f11521846f7d0ea76
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=68ae52efa132601bf11f5a3f11521846f7d0ea76

--------------------------------

commit e4f1541caf60fcbe5a59e9d25805c0b5865e546a upstream.

"Pre-multiplied" is the default pixel blend mode for KMS/DRM, as
documented in supported_modes of drm_plane_create_blend_mode_property():
https://cgit.freedesktop.org/drm/drm-misc/tree/drivers/gpu/drm/drm_blend.c

In this mode, both 'pixel alpha' and 'plane alpha' participate in the
calculation, as described by the pixel blend mode formula in KMS/DRM
documentation:

out.rgb = plane_alpha * fg.rgb +
          (1 - (plane_alpha * fg.alpha)) * bg.rgb

Considering the blend config mechanisms we have in the driver so far,
the alpha mode that better fits this blend mode is the
_PER_PIXEL_ALPHA_COMBINED_GLOBAL_GAIN, where the value for global_gain
is the plane alpha (global_alpha).

With this change, alpha property stops to be ignored. It also addresses
Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/1734

v2:
 * keep the 8-bit value for global_alpha_value (Nicholas)
 * correct the logical ordering for combined global gain (Nicholas)
 * apply to dcn10 too (Nicholas)

Signed-off-by: Melissa Wen <mwen@igalia.com>
Tested-by: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com>
Reviewed-by: Harry Wentland <harry.wentland@amd.com>
Tested-by: Simon Ser <contact@emersion.fr>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Cc: stable@vger.kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 .../drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c  | 14 +++++++++-----
 drivers/gpu/drm/amd/display/dc/dcn20/dcn20_hwseq.c | 14 +++++++++-----
 2 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c
index 532f6a1145b5..31a13daf4289 100644
--- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c
+++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c
@@ -2387,14 +2387,18 @@ void dcn10_update_mpcc(struct dc *dc, struct pipe_ctx *pipe_ctx)
 				&blnd_cfg.black_color);
 	}
 
-	if (per_pixel_alpha)
-		blnd_cfg.alpha_mode = MPCC_ALPHA_BLEND_MODE_PER_PIXEL_ALPHA;
-	else
-		blnd_cfg.alpha_mode = MPCC_ALPHA_BLEND_MODE_GLOBAL_ALPHA;
-
 	blnd_cfg.overlap_only = false;
 	blnd_cfg.global_gain = 0xff;
 
+	if (per_pixel_alpha && pipe_ctx->plane_state->global_alpha) {
+		blnd_cfg.alpha_mode = MPCC_ALPHA_BLEND_MODE_PER_PIXEL_ALPHA_COMBINED_GLOBAL_GAIN;
+		blnd_cfg.global_gain = pipe_ctx->plane_state->global_alpha_value;
+	} else if (per_pixel_alpha) {
+		blnd_cfg.alpha_mode = MPCC_ALPHA_BLEND_MODE_PER_PIXEL_ALPHA;
+	} else {
+		blnd_cfg.alpha_mode = MPCC_ALPHA_BLEND_MODE_GLOBAL_ALPHA;
+	}
+
 	if (pipe_ctx->plane_state->global_alpha)
 		blnd_cfg.global_alpha = pipe_ctx->plane_state->global_alpha_value;
 	else
diff --git a/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_hwseq.c b/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_hwseq.c
index 79a2b9c785f0..3d778760a3b5 100644
--- a/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_hwseq.c
+++ b/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_hwseq.c
@@ -2270,14 +2270,18 @@ void dcn20_update_mpcc(struct dc *dc, struct pipe_ctx *pipe_ctx)
 				pipe_ctx, &blnd_cfg.black_color);
 	}
 
-	if (per_pixel_alpha)
-		blnd_cfg.alpha_mode = MPCC_ALPHA_BLEND_MODE_PER_PIXEL_ALPHA;
-	else
-		blnd_cfg.alpha_mode = MPCC_ALPHA_BLEND_MODE_GLOBAL_ALPHA;
-
 	blnd_cfg.overlap_only = false;
 	blnd_cfg.global_gain = 0xff;
 
+	if (per_pixel_alpha && pipe_ctx->plane_state->global_alpha) {
+		blnd_cfg.alpha_mode = MPCC_ALPHA_BLEND_MODE_PER_PIXEL_ALPHA_COMBINED_GLOBAL_GAIN;
+		blnd_cfg.global_gain = pipe_ctx->plane_state->global_alpha_value;
+	} else if (per_pixel_alpha) {
+		blnd_cfg.alpha_mode = MPCC_ALPHA_BLEND_MODE_PER_PIXEL_ALPHA;
+	} else {
+		blnd_cfg.alpha_mode = MPCC_ALPHA_BLEND_MODE_GLOBAL_ALPHA;
+	}
+
 	if (pipe_ctx->plane_state->global_alpha)
 		blnd_cfg.global_alpha = pipe_ctx->plane_state->global_alpha_value;
 	else
-- 
Gitee


From 216b1f0fcc1e079b4b946864436f608a377e61d5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tomasz=20Mo=C5=84?= <desowin@gmail.com>
Date: Tue, 19 Jul 2022 16:31:37 +0800
Subject: [PATCH 458/674] drm/amdgpu: Enable gfxoff quirk on MacBook Pro
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

stable inclusion
from stable-v5.10.112
commit 1fcfe37d170ad81d8a5432f49e76c2b2b2918274
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=1fcfe37d170ad81d8a5432f49e76c2b2b2918274

--------------------------------

commit 4593c1b6d159f1e5c35c07a7f125e79e5a864302 upstream.

Enabling gfxoff quirk results in perfectly usable graphical user
interface on MacBook Pro (15-inch, 2019) with Radeon Pro Vega 20 4 GB.

Without the quirk, X server is completely unusable as every few seconds
there is gpu reset due to ring gfx timeout.

Signed-off-by: Tomasz Moń <desowin@gmail.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Cc: stable@vger.kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 8dd7587fea26..c621ebd90031 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -1248,6 +1248,8 @@ static const struct amdgpu_gfxoff_quirk amdgpu_gfxoff_quirk_list[] = {
 	{ 0x1002, 0x15dd, 0x103c, 0x83e7, 0xd3 },
 	/* GFXOFF is unstable on C6 parts with a VBIOS 113-RAVEN-114 */
 	{ 0x1002, 0x15dd, 0x1002, 0x15dd, 0xc6 },
+	/* Apple MacBook Pro (15-inch, 2019) Radeon Pro Vega 20 4 GB */
+	{ 0x1002, 0x69af, 0x106b, 0x019a, 0xc0 },
 	{ 0, 0, 0, 0, 0 },
 };
 
-- 
Gitee


From f98299044e078934160b31fa51fdc758cb8591e5 Mon Sep 17 00:00:00 2001
From: Rei Yamamoto <yamamoto.rei@jp.fujitsu.com>
Date: Tue, 19 Jul 2022 16:31:38 +0800
Subject: [PATCH 459/674] genirq/affinity: Consider that CPUs on nodes can be
 unbalanced

stable inclusion
from stable-v5.10.112
commit 0275c75955d1cdec07bc6d4f6551dbf253c2aaeb
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=0275c75955d1cdec07bc6d4f6551dbf253c2aaeb

--------------------------------

commit 08d835dff916bfe8f45acc7b92c7af6c4081c8a7 upstream.

If CPUs on a node are offline at boot time, the number of nodes is
different when building affinity masks for present cpus and when building
affinity masks for possible cpus. This causes the following problem:

In the case that the number of vectors is less than the number of nodes
there are cases where bits of masks for present cpus are overwritten when
building masks for possible cpus.

Fix this by excluding CPUs, which are not part of the current build mask
(present/possible).

[ tglx: Massaged changelog and added comment ]

Fixes: b82592199032 ("genirq/affinity: Spread IRQs to all available NUMA nodes")
Signed-off-by: Rei Yamamoto <yamamoto.rei@jp.fujitsu.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20220331003309.10891-1-yamamoto.rei@jp.fujitsu.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 kernel/irq/affinity.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 4d89ad4fae3b..5fb78addff51 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -269,8 +269,9 @@ static int __irq_build_affinity_masks(unsigned int startvec,
 	 */
 	if (numvecs <= nodes) {
 		for_each_node_mask(n, nodemsk) {
-			cpumask_or(&masks[curvec].mask, &masks[curvec].mask,
-				   node_to_cpumask[n]);
+			/* Ensure that only CPUs which are in both masks are set */
+			cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
+			cpumask_or(&masks[curvec].mask, &masks[curvec].mask, nmsk);
 			if (++curvec == last_affv)
 				curvec = firstvec;
 		}
-- 
Gitee


From ed6e6b980f5110a34f15993debb2e1e3d7727b25 Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Tue, 19 Jul 2022 16:31:39 +0800
Subject: [PATCH 460/674] tick/nohz: Use WARN_ON_ONCE() to prevent console
 saturation

stable inclusion
from stable-v5.10.112
commit 0806f1930562c1522cef061353e3d6a8d78899d0
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=0806f1930562c1522cef061353e3d6a8d78899d0

--------------------------------

commit 40e97e42961f8c6cc7bd5fe67cc18417e02d78f1 upstream.

While running some testing on code that happened to allow the variable
tick_nohz_full_running to get set but with no "possible" NOHZ cores to
back up that setting, this warning triggered:

        if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
                WARN_ON(tick_nohz_full_running);

The console was overwhemled with an endless stream of one WARN per tick
per core and there was no way to even see what was going on w/o using a
serial console to capture it and then trace it back to this.

Change it to WARN_ON_ONCE().

Fixes: 08ae95f4fd3b ("nohz_full: Allow the boot CPU to be nohz_full")
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20211206145950.10927-3-paul.gortmaker@windriver.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 kernel/time/tick-sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 2d7899700b7a..d0440f5d5b45 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -164,7 +164,7 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
 	 */
 	if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) {
 #ifdef CONFIG_NO_HZ_FULL
-		WARN_ON(tick_nohz_full_running);
+		WARN_ON_ONCE(tick_nohz_full_running);
 #endif
 		tick_do_timer_cpu = cpu;
 	}
-- 
Gitee


From a615fdf82ac1af66bbc3533faa655348b4f1af16 Mon Sep 17 00:00:00 2001
From: Nathan Chancellor <nathan@kernel.org>
Date: Tue, 19 Jul 2022 16:31:40 +0800
Subject: [PATCH 461/674] ARM: davinci: da850-evm: Avoid NULL pointer
 dereference

stable inclusion
from stable-v5.10.112
commit 0a312ec66a03133d28570f07bc52749ccfef54da
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=0a312ec66a03133d28570f07bc52749ccfef54da

--------------------------------

commit 83a1cde5c74bfb44b49cb2a940d044bb2380f4ea upstream.

With newer versions of GCC, there is a panic in da850_evm_config_emac()
when booting multi_v5_defconfig in QEMU under the palmetto-bmc machine:

Unable to handle kernel NULL pointer dereference at virtual address 00000020
pgd = (ptrval)
[00000020] *pgd=00000000
Internal error: Oops: 5 [#1] PREEMPT ARM
Modules linked in:
CPU: 0 PID: 1 Comm: swapper Not tainted 5.15.0 #1
Hardware name: Generic DT based system
PC is at da850_evm_config_emac+0x1c/0x120
LR is at do_one_initcall+0x50/0x1e0

The emac_pdata pointer in soc_info is NULL because davinci_soc_info only
gets populated on davinci machines but da850_evm_config_emac() is called
on all machines via device_initcall().

Move the rmii_en assignment below the machine check so that it is only
dereferenced when running on a supported SoC.

Fixes: bae105879f2f ("davinci: DA850/OMAP-L138 EVM: implement autodetect of RMII PHY")
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Bartosz Golaszewski <brgl@bgdev.pl>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/YcS4xVWs6bQlQSPC@archlinux-ax161/
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 arch/arm/mach-davinci/board-da850-evm.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/arm/mach-davinci/board-da850-evm.c b/arch/arm/mach-davinci/board-da850-evm.c
index 428012687a80..7f7f6bae21c2 100644
--- a/arch/arm/mach-davinci/board-da850-evm.c
+++ b/arch/arm/mach-davinci/board-da850-evm.c
@@ -1101,11 +1101,13 @@ static int __init da850_evm_config_emac(void)
 	int ret;
 	u32 val;
 	struct davinci_soc_info *soc_info = &davinci_soc_info;
-	u8 rmii_en = soc_info->emac_pdata->rmii_en;
+	u8 rmii_en;
 
 	if (!machine_is_davinci_da850_evm())
 		return 0;
 
+	rmii_en = soc_info->emac_pdata->rmii_en;
+
 	cfg_chip3_base = DA8XX_SYSCFG0_VIRT(DA8XX_CFGCHIP3_REG);
 
 	val = __raw_readl(cfg_chip3_base);
-- 
Gitee


From df361a40bff129549f4d1a2a8120108c8c217ef7 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Tue, 19 Jul 2022 16:31:41 +0800
Subject: [PATCH 462/674] dm integrity: fix memory corruption when tag_size is
 less than digest size

stable inclusion
from stable-v5.10.112
commit cd02b2687d66f0a8e716384de4b9a0671331f1dc
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=cd02b2687d66f0a8e716384de4b9a0671331f1dc

--------------------------------

commit 08c1af8f1c13bbf210f1760132f4df24d0ed46d6 upstream.

It is possible to set up dm-integrity in such a way that the
"tag_size" parameter is less than the actual digest size. In this
situation, a part of the digest beyond tag_size is ignored.

In this case, dm-integrity would write beyond the end of the
ic->recalc_tags array and corrupt memory. The corruption happened in
integrity_recalc->integrity_sector_checksum->crypto_shash_final.

Fix this corruption by increasing the tags array so that it has enough
padding at the end to accomodate the loop in integrity_recalc() being
able to write a full digest size for the last member of the tags
array.

Cc: stable@vger.kernel.org # v4.19+
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/md/dm-integrity.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index f7471a2642dd..6f085e96c3f3 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -4232,6 +4232,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	}
 
 	if (ic->internal_hash) {
+		size_t recalc_tags_size;
 		ic->recalc_wq = alloc_workqueue("dm-integrity-recalc", WQ_MEM_RECLAIM, 1);
 		if (!ic->recalc_wq ) {
 			ti->error = "Cannot allocate workqueue";
@@ -4245,8 +4246,10 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
 			r = -ENOMEM;
 			goto bad;
 		}
-		ic->recalc_tags = kvmalloc_array(RECALC_SECTORS >> ic->sb->log2_sectors_per_block,
-						 ic->tag_size, GFP_KERNEL);
+		recalc_tags_size = (RECALC_SECTORS >> ic->sb->log2_sectors_per_block) * ic->tag_size;
+		if (crypto_shash_digestsize(ic->internal_hash) > ic->tag_size)
+			recalc_tags_size += crypto_shash_digestsize(ic->internal_hash) - ic->tag_size;
+		ic->recalc_tags = kvmalloc(recalc_tags_size, GFP_KERNEL);
 		if (!ic->recalc_tags) {
 			ti->error = "Cannot allocate tags for recalculating";
 			r = -ENOMEM;
-- 
Gitee


From 1b379510924fbac700bfa5e8de9a7c39dee4ea23 Mon Sep 17 00:00:00 2001
From: Nadav Amit <namit@vmware.com>
Date: Tue, 19 Jul 2022 16:31:42 +0800
Subject: [PATCH 463/674] smp: Fix offline cpu check in
 flush_smp_call_function_queue()

stable inclusion
from stable-v5.10.112
commit 89496d80bf847b49cb1349a7028b8b3c6e595d95
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=89496d80bf847b49cb1349a7028b8b3c6e595d95

--------------------------------

commit 9e949a3886356fe9112c6f6f34a6e23d1d35407f upstream.

The check in flush_smp_call_function_queue() for callbacks that are sent
to offline CPUs currently checks whether the queue is empty.

However, flush_smp_call_function_queue() has just deleted all the
callbacks from the queue and moved all the entries into a local list.
This checks would only be positive if some callbacks were added in the
short time after llist_del_all() was called. This does not seem to be
the intention of this check.

Change the check to look at the local list to which the entries were
moved instead of the queue from which all the callbacks were just
removed.

Fixes: 8d056c48e4862 ("CPU hotplug, smp: flush any pending IPI callbacks before CPU offline")
Signed-off-by: Nadav Amit <namit@vmware.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20220319072015.1495036-1-namit@vmware.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 kernel/smp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/smp.c b/kernel/smp.c
index 7cb03edf1735..114776d0d11e 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -376,7 +376,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
 
 	/* There shouldn't be any pending callbacks on an offline CPU. */
 	if (unlikely(warn_cpu_offline && !cpu_online(smp_processor_id()) &&
-		     !warned && !llist_empty(head))) {
+		     !warned && entry != NULL)) {
 		warned = true;
 		WARN(1, "IPI on offline CPU %d\n", smp_processor_id());
 
-- 
Gitee


From 9563c809dc9eee0873cbae6aa662ad56fd498642 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Povi=C5=A1er?= <povik+lin@cutebit.org>
Date: Tue, 19 Jul 2022 16:31:43 +0800
Subject: [PATCH 464/674] i2c: pasemi: Wait for write xfers to finish
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

stable inclusion
from stable-v5.10.112
commit 84837f43e56fc2594f07a49399667c467de527a3
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=84837f43e56fc2594f07a49399667c467de527a3

--------------------------------

commit bd8963e602c77adc76dbbbfc3417c3cf14fed76b upstream.

Wait for completion of write transfers before returning from the driver.
At first sight it may seem advantageous to leave write transfers queued
for the controller to carry out on its own time, but there's a couple of
issues with it:

 * Driver doesn't check for FIFO space.

 * The queued writes can complete while the driver is in its I2C read
   transfer path which means it will get confused by the raising of
   XEN (the 'transaction ended' signal). This can cause a spurious
   ENODATA error due to premature reading of the MRXFIFO register.

Adding the wait fixes some unreliability issues with the driver. There's
some efficiency cost to it (especially with pasemi_smb_waitready doing
its polling), but that will be alleviated once the driver receives
interrupt support.

Fixes: beb58aa39e6e ("i2c: PA Semi SMBus driver")
Signed-off-by: Martin Povišer <povik+lin@cutebit.org>
Reviewed-by: Sven Peter <sven@svenpeter.dev>
Signed-off-by: Wolfram Sang <wsa@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/i2c/busses/i2c-pasemi.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/i2c/busses/i2c-pasemi.c b/drivers/i2c/busses/i2c-pasemi.c
index 20f2772c0e79..2c909522f0f3 100644
--- a/drivers/i2c/busses/i2c-pasemi.c
+++ b/drivers/i2c/busses/i2c-pasemi.c
@@ -137,6 +137,12 @@ static int pasemi_i2c_xfer_msg(struct i2c_adapter *adapter,
 
 		TXFIFO_WR(smbus, msg->buf[msg->len-1] |
 			  (stop ? MTXFIFO_STOP : 0));
+
+		if (stop) {
+			err = pasemi_smb_waitready(smbus);
+			if (err)
+				goto reset_out;
+		}
 	}
 
 	return 0;
-- 
Gitee


From bff236a574e83e12d5825dd7b1bbdbdc8352144d Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen <anna-maria@linutronix.de>
Date: Tue, 19 Jul 2022 16:31:44 +0800
Subject: [PATCH 465/674] timers: Fix warning condition in __run_timers()

stable inclusion
from stable-v5.10.112
commit 9a5a4d23e24d39784b643e5db2e4c3a97da4a1ac
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=9a5a4d23e24d39784b643e5db2e4c3a97da4a1ac

--------------------------------

commit c54bc0fc84214b203f7a0ebfd1bd308ce2abe920 upstream.

When the timer base is empty, base::next_expiry is set to base::clk +
NEXT_TIMER_MAX_DELTA and base::next_expiry_recalc is false. When no timer
is queued until jiffies reaches base::next_expiry value, the warning for
not finding any expired timer and base::next_expiry_recalc is false in
__run_timers() triggers.

To prevent triggering the warning in this valid scenario
base::timers_pending needs to be added to the warning condition.

Fixes: 31cd0e119d50 ("timers: Recalculate next timer interrupt only when necessary")
Reported-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: Anna-Maria Behnsen <anna-maria@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lore.kernel.org/r/20220405191732.7438-3-anna-maria@linutronix.de
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 kernel/time/timer.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 351420c23064..f7d3a108e27c 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1738,11 +1738,14 @@ static inline void __run_timers(struct timer_base *base)
 	       time_after_eq(jiffies, base->next_expiry)) {
 		levels = collect_expired_timers(base, heads);
 		/*
-		 * The only possible reason for not finding any expired
-		 * timer at this clk is that all matching timers have been
-		 * dequeued.
+		 * The two possible reasons for not finding any expired
+		 * timer at this clk are that all matching timers have been
+		 * dequeued or no timer has been queued since
+		 * base::next_expiry was set to base::clk +
+		 * NEXT_TIMER_MAX_DELTA.
 		 */
-		WARN_ON_ONCE(!levels && !base->next_expiry_recalc);
+		WARN_ON_ONCE(!levels && !base->next_expiry_recalc
+			     && base->timers_pending);
 		base->clk++;
 		base->next_expiry = __next_timer_interrupt(base);
 
-- 
Gitee


From d1cd8c77cc229f0a36962b246f61efabc1e9c475 Mon Sep 17 00:00:00 2001
From: Chao Gao <chao.gao@intel.com>
Date: Tue, 19 Jul 2022 16:31:45 +0800
Subject: [PATCH 466/674] dma-direct: avoid redundant memory sync for swiotlb

stable inclusion
from stable-v5.10.112
commit 26f827e095aba4141ffd684276ef54025de27574
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=26f827e095aba4141ffd684276ef54025de27574

--------------------------------

commit 9e02977bfad006af328add9434c8bffa40e053bb upstream.

When we looked into FIO performance with swiotlb enabled in VM, we found
swiotlb_bounce() is always called one more time than expected for each DMA
read request.

It turns out that the bounce buffer is copied to original DMA buffer twice
after the completion of a DMA request (one is done by in
dma_direct_sync_single_for_cpu(), the other by swiotlb_tbl_unmap_single()).
But the content in bounce buffer actually doesn't change between the two
rounds of copy. So, one round of copy is redundant.

Pass DMA_ATTR_SKIP_CPU_SYNC flag to swiotlb_tbl_unmap_single() to
skip the memory copy in it.

This fix increases FIO 64KB sequential read throughput in a guest with
swiotlb=force by 5.6%.

Fixes: 55897af63091 ("dma-direct: merge swiotlb_dma_ops into the dma_direct code")
Reported-by: Wang Zhaoyang1 <zhaoyang1.wang@intel.com>
Reported-by: Gao Liang <liang.gao@intel.com>
Signed-off-by: Chao Gao <chao.gao@intel.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 kernel/dma/direct.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
index b98615578737..c9d380318dd8 100644
--- a/kernel/dma/direct.h
+++ b/kernel/dma/direct.h
@@ -114,6 +114,7 @@ static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
 		dma_direct_sync_single_for_cpu(dev, addr, size, dir);
 
 	if (unlikely(is_swiotlb_buffer(phys)))
-		swiotlb_tbl_unmap_single(dev, phys, size, size, dir, attrs);
+		swiotlb_tbl_unmap_single(dev, phys, size, size, dir,
+					 attrs | DMA_ATTR_SKIP_CPU_SYNC);
 }
 #endif /* _KERNEL_DMA_DIRECT_H */
-- 
Gitee


From ae2fd3a1fbc11fbd574dd44f99a99cac1a639516 Mon Sep 17 00:00:00 2001
From: Mike Christie <michael.christie@oracle.com>
Date: Tue, 19 Jul 2022 16:31:46 +0800
Subject: [PATCH 467/674] scsi: iscsi: Fix endpoint reuse regression

stable inclusion
from stable-v5.10.112
commit 129db30599bc8b4a7cd0f219feb1e0154802e3d2
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=129db30599bc8b4a7cd0f219feb1e0154802e3d2

--------------------------------

commit 0aadafb5c34403a7cced1a8d61877048dc059f70 upstream.

This patch fixes a bug where when using iSCSI offload we can free an
endpoint while userspace still thinks it's active. That then causes the
endpoint ID to be reused for a new connection's endpoint while userspace
still thinks the ID is for the original connection. Userspace will then end
up disconnecting a running connection's endpoint or trying to bind to
another connection's endpoint.

This bug is a regression added in:

Commit 23d6fefbb3f6 ("scsi: iscsi: Fix in-kernel conn failure handling")

where we added a in kernel ep_disconnect call to fix a bug in:

Commit 0ab710458da1 ("scsi: iscsi: Perform connection failure entirely in
kernel space")

where we would call stop_conn without having done ep_disconnect. This early
ep_disconnect call will then free the endpoint and it's ID while userspace
still thinks the ID is valid.

Fix the early release of the ID by having the in kernel recovery code keep
a reference to the endpoint until userspace has called into the kernel to
finish cleaning up the endpoint/connection. It requires the previous commit
"scsi: iscsi: Release endpoint ID when its freed" which moved the freeing
of the ID until when the endpoint is released.

Link: https://lore.kernel.org/r/20220408001314.5014-5-michael.christie@oracle.com
Fixes: 23d6fefbb3f6 ("scsi: iscsi: Fix in-kernel conn failure handling")
Tested-by: Manish Rangankar <mrangankar@marvell.com>
Reviewed-by: Lee Duncan <lduncan@suse.com>
Reviewed-by: Chris Leech <cleech@redhat.com>
Signed-off-by: Mike Christie <michael.christie@oracle.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/scsi/scsi_transport_iscsi.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c
index 21cc306a0f11..6877a75e1a5d 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -2273,7 +2273,11 @@ static void iscsi_if_disconnect_bound_ep(struct iscsi_cls_conn *conn,
 		mutex_unlock(&conn->ep_mutex);
 
 		flush_work(&conn->cleanup_work);
-
+		/*
+		 * Userspace is now done with the EP so we can release the ref
+		 * iscsi_cleanup_conn_work_fn took.
+		 */
+		iscsi_put_endpoint(ep);
 		mutex_lock(&conn->ep_mutex);
 	}
 }
@@ -2355,6 +2359,12 @@ static void iscsi_cleanup_conn_work_fn(struct work_struct *work)
 		return;
 	}
 
+	/*
+	 * Get a ref to the ep, so we don't release its ID until after
+	 * userspace is done referencing it in iscsi_if_disconnect_bound_ep.
+	 */
+	if (conn->ep)
+		get_device(&conn->ep->dev);
 	iscsi_ep_disconnect(conn, false);
 
 	if (system_state != SYSTEM_RUNNING) {
-- 
Gitee


From f8cb0194fef2d021f4c52d2aa65afc942a682f2a Mon Sep 17 00:00:00 2001
From: Mike Christie <michael.christie@oracle.com>
Date: Tue, 19 Jul 2022 16:31:47 +0800
Subject: [PATCH 468/674] scsi: iscsi: Fix unbound endpoint error handling

stable inclusion
from stable-v5.10.112
commit 361288633bfa0927b936ed3e1f01b605fb239994
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=361288633bfa0927b936ed3e1f01b605fb239994

--------------------------------

commit 03690d81974535f228e892a14f0d2d44404fe555 upstream.

If a driver raises a connection error before the connection is bound, we
can leave a cleanup_work queued that can later run and disconnect/stop a
connection that is logged in. The problem is that drivers can call
iscsi_conn_error_event for endpoints that are connected but not yet bound
when something like the network port they are using is brought down.
iscsi_cleanup_conn_work_fn will check for this and exit early, but if the
cleanup_work is stuck behind other works, it might not get run until after
userspace has done ep_disconnect. Because the endpoint is not yet bound
there was no way for ep_disconnect to flush the work.

The bug of leaving stop_conns queued was added in:

Commit 23d6fefbb3f6 ("scsi: iscsi: Fix in-kernel conn failure handling")

and:

Commit 0ab710458da1 ("scsi: iscsi: Perform connection failure entirely in
kernel space")

was supposed to fix it, but left this case.

This patch moves the conn state check to before we even queue the work so
we can avoid queueing.

Link: https://lore.kernel.org/r/20220408001314.5014-7-michael.christie@oracle.com
Fixes: 0ab710458da1 ("scsi: iscsi: Perform connection failure entirely in kernel space")
Tested-by: Manish Rangankar <mrangankar@marvell.com>
Reviewed-by: Lee Duncan <lduncan@@suse.com>
Reviewed-by: Chris Leech <cleech@redhat.com>
Signed-off-by: Mike Christie <michael.christie@oracle.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/scsi/scsi_transport_iscsi.c | 65 ++++++++++++++++-------------
 1 file changed, 36 insertions(+), 29 deletions(-)

diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c
index 6877a75e1a5d..ef7cd7520e7c 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -2224,10 +2224,10 @@ static void iscsi_stop_conn(struct iscsi_cls_conn *conn, int flag)
 
 	switch (flag) {
 	case STOP_CONN_RECOVER:
-		conn->state = ISCSI_CONN_FAILED;
+		WRITE_ONCE(conn->state, ISCSI_CONN_FAILED);
 		break;
 	case STOP_CONN_TERM:
-		conn->state = ISCSI_CONN_DOWN;
+		WRITE_ONCE(conn->state, ISCSI_CONN_DOWN);
 		break;
 	default:
 		iscsi_cls_conn_printk(KERN_ERR, conn, "invalid stop flag %d\n",
@@ -2245,7 +2245,7 @@ static void iscsi_ep_disconnect(struct iscsi_cls_conn *conn, bool is_active)
 	struct iscsi_endpoint *ep;
 
 	ISCSI_DBG_TRANS_CONN(conn, "disconnect ep.\n");
-	conn->state = ISCSI_CONN_FAILED;
+	WRITE_ONCE(conn->state, ISCSI_CONN_FAILED);
 
 	if (!conn->ep || !session->transport->ep_disconnect)
 		return;
@@ -2344,21 +2344,6 @@ static void iscsi_cleanup_conn_work_fn(struct work_struct *work)
 	struct iscsi_cls_session *session = iscsi_conn_to_session(conn);
 
 	mutex_lock(&conn->ep_mutex);
-	/*
-	 * If we are not at least bound there is nothing for us to do. Userspace
-	 * will do a ep_disconnect call if offload is used, but will not be
-	 * doing a stop since there is nothing to clean up, so we have to clear
-	 * the cleanup bit here.
-	 */
-	if (conn->state != ISCSI_CONN_BOUND && conn->state != ISCSI_CONN_UP) {
-		ISCSI_DBG_TRANS_CONN(conn, "Got error while conn is already failed. Ignoring.\n");
-		spin_lock_irq(&conn->lock);
-		clear_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn->flags);
-		spin_unlock_irq(&conn->lock);
-		mutex_unlock(&conn->ep_mutex);
-		return;
-	}
-
 	/*
 	 * Get a ref to the ep, so we don't release its ID until after
 	 * userspace is done referencing it in iscsi_if_disconnect_bound_ep.
@@ -2425,7 +2410,7 @@ iscsi_create_conn(struct iscsi_cls_session *session, int dd_size, uint32_t cid)
 	INIT_WORK(&conn->cleanup_work, iscsi_cleanup_conn_work_fn);
 	conn->transport = transport;
 	conn->cid = cid;
-	conn->state = ISCSI_CONN_DOWN;
+	WRITE_ONCE(conn->state, ISCSI_CONN_DOWN);
 
 	/* this is released in the dev's release function */
 	if (!get_device(&session->dev))
@@ -2613,10 +2598,30 @@ void iscsi_conn_error_event(struct iscsi_cls_conn *conn, enum iscsi_err error)
 	struct iscsi_internal *priv;
 	int len = nlmsg_total_size(sizeof(*ev));
 	unsigned long flags;
+	int state;
 
 	spin_lock_irqsave(&conn->lock, flags);
-	if (!test_and_set_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn->flags))
-		queue_work(iscsi_conn_cleanup_workq, &conn->cleanup_work);
+	/*
+	 * Userspace will only do a stop call if we are at least bound. And, we
+	 * only need to do the in kernel cleanup if in the UP state so cmds can
+	 * be released to upper layers. If in other states just wait for
+	 * userspace to avoid races that can leave the cleanup_work queued.
+	 */
+	state = READ_ONCE(conn->state);
+	switch (state) {
+	case ISCSI_CONN_BOUND:
+	case ISCSI_CONN_UP:
+		if (!test_and_set_bit(ISCSI_CLS_CONN_BIT_CLEANUP,
+				      &conn->flags)) {
+			queue_work(iscsi_conn_cleanup_workq,
+				   &conn->cleanup_work);
+		}
+		break;
+	default:
+		ISCSI_DBG_TRANS_CONN(conn, "Got conn error in state %d\n",
+				     state);
+		break;
+	}
 	spin_unlock_irqrestore(&conn->lock, flags);
 
 	priv = iscsi_if_transport_lookup(conn->transport);
@@ -2967,7 +2972,7 @@ iscsi_set_param(struct iscsi_transport *transport, struct iscsi_uevent *ev)
 	char *data = (char*)ev + sizeof(*ev);
 	struct iscsi_cls_conn *conn;
 	struct iscsi_cls_session *session;
-	int err = 0, value = 0;
+	int err = 0, value = 0, state;
 
 	if (ev->u.set_param.len > PAGE_SIZE)
 		return -EINVAL;
@@ -2984,8 +2989,8 @@ iscsi_set_param(struct iscsi_transport *transport, struct iscsi_uevent *ev)
 			session->recovery_tmo = value;
 		break;
 	default:
-		if ((conn->state == ISCSI_CONN_BOUND) ||
-			(conn->state == ISCSI_CONN_UP)) {
+		state = READ_ONCE(conn->state);
+		if (state == ISCSI_CONN_BOUND || state == ISCSI_CONN_UP) {
 			err = transport->set_param(conn, ev->u.set_param.param,
 					data, ev->u.set_param.len);
 		} else {
@@ -3781,7 +3786,7 @@ static int iscsi_if_transport_conn(struct iscsi_transport *transport,
 						ev->u.b_conn.transport_eph,
 						ev->u.b_conn.is_leading);
 		if (!ev->r.retcode)
-			conn->state = ISCSI_CONN_BOUND;
+			WRITE_ONCE(conn->state, ISCSI_CONN_BOUND);
 
 		if (ev->r.retcode || !transport->ep_connect)
 			break;
@@ -3800,7 +3805,8 @@ static int iscsi_if_transport_conn(struct iscsi_transport *transport,
 	case ISCSI_UEVENT_START_CONN:
 		ev->r.retcode = transport->start_conn(conn);
 		if (!ev->r.retcode)
-			conn->state = ISCSI_CONN_UP;
+			WRITE_ONCE(conn->state, ISCSI_CONN_UP);
+
 		break;
 	case ISCSI_UEVENT_SEND_PDU:
 		pdu_len = nlh->nlmsg_len - sizeof(*nlh) - sizeof(*ev);
@@ -4108,10 +4114,11 @@ static ssize_t show_conn_state(struct device *dev,
 {
 	struct iscsi_cls_conn *conn = iscsi_dev_to_conn(dev->parent);
 	const char *state = "unknown";
+	int conn_state = READ_ONCE(conn->state);
 
-	if (conn->state >= 0 &&
-	    conn->state < ARRAY_SIZE(connection_state_names))
-		state = connection_state_names[conn->state];
+	if (conn_state >= 0 &&
+	    conn_state < ARRAY_SIZE(connection_state_names))
+		state = connection_state_names[conn_state];
 
 	return sysfs_emit(buf, "%s\n", state);
 }
-- 
Gitee


From 9fa5f8762ef864534ce1e813298462751a3c67a3 Mon Sep 17 00:00:00 2001
From: Zhang Wensheng <zhangwensheng5@huawei.com>
Date: Tue, 19 Jul 2022 16:31:48 +0800
Subject: [PATCH 469/674] scsi: iscsi: fix kabi broken in struct iscsi_cls_conn

hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X
CVE: NA

-------------------------------

In struct iscsi_cls_conn, "lock", "flags", "cleanup_work" are
added and the kabi is broken, fix it by wrapper.

Signed-off-by: Zhang Wensheng <zhangwensheng5@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
---
 drivers/scsi/scsi_transport_iscsi.c | 71 +++++++++++++++++------------
 include/scsi/scsi_transport_iscsi.h | 16 +++++--
 2 files changed, 55 insertions(+), 32 deletions(-)

diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c
index ef7cd7520e7c..55b4ada390a6 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -2262,17 +2262,18 @@ static void iscsi_if_disconnect_bound_ep(struct iscsi_cls_conn *conn,
 					 struct iscsi_endpoint *ep,
 					 bool is_active)
 {
+	struct iscsi_cls_conn_wrapper *conn_wrapper = conn_to_wrapper(conn);
 	/* Check if this was a conn error and the kernel took ownership */
-	spin_lock_irq(&conn->lock);
-	if (!test_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn->flags)) {
-		spin_unlock_irq(&conn->lock);
+	spin_lock_irq(&conn_wrapper->lock);
+	if (!test_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn_wrapper->flags)) {
+		spin_unlock_irq(&conn_wrapper->lock);
 		iscsi_ep_disconnect(conn, is_active);
 	} else {
-		spin_unlock_irq(&conn->lock);
+		spin_unlock_irq(&conn_wrapper->lock);
 		ISCSI_DBG_TRANS_CONN(conn, "flush kernel conn cleanup.\n");
 		mutex_unlock(&conn->ep_mutex);
 
-		flush_work(&conn->cleanup_work);
+		flush_work(&conn_wrapper->cleanup_work);
 		/*
 		 * Userspace is now done with the EP so we can release the ref
 		 * iscsi_cleanup_conn_work_fn took.
@@ -2287,11 +2288,13 @@ static int iscsi_if_stop_conn(struct iscsi_transport *transport,
 {
 	int flag = ev->u.stop_conn.flag;
 	struct iscsi_cls_conn *conn;
+	struct iscsi_cls_conn_wrapper *conn_wrapper;
 
 	conn = iscsi_conn_lookup(ev->u.stop_conn.sid, ev->u.stop_conn.cid);
 	if (!conn)
 		return -EINVAL;
 
+	conn_wrapper = conn_to_wrapper(conn);
 	ISCSI_DBG_TRANS_CONN(conn, "iscsi if conn stop.\n");
 	/*
 	 * If this is a termination we have to call stop_conn with that flag
@@ -2299,7 +2302,7 @@ static int iscsi_if_stop_conn(struct iscsi_transport *transport,
 	 * avoid the extra run.
 	 */
 	if (flag == STOP_CONN_TERM) {
-		cancel_work_sync(&conn->cleanup_work);
+		cancel_work_sync(&conn_wrapper->cleanup_work);
 		iscsi_stop_conn(conn, flag);
 	} else {
 		/*
@@ -2315,23 +2318,23 @@ static int iscsi_if_stop_conn(struct iscsi_transport *transport,
 		/*
 		 * Figure out if it was the kernel or userspace initiating this.
 		 */
-		spin_lock_irq(&conn->lock);
-		if (!test_and_set_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn->flags)) {
-			spin_unlock_irq(&conn->lock);
+		spin_lock_irq(&conn_wrapper->lock);
+		if (!test_and_set_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn_wrapper->flags)) {
+			spin_unlock_irq(&conn_wrapper->lock);
 			iscsi_stop_conn(conn, flag);
 		} else {
-			spin_unlock_irq(&conn->lock);
+			spin_unlock_irq(&conn_wrapper->lock);
 			ISCSI_DBG_TRANS_CONN(conn,
 					     "flush kernel conn cleanup.\n");
-			flush_work(&conn->cleanup_work);
+			flush_work(&conn_wrapper->cleanup_work);
 		}
 		/*
 		 * Only clear for recovery to avoid extra cleanup runs during
 		 * termination.
 		 */
-		spin_lock_irq(&conn->lock);
-		clear_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn->flags);
-		spin_unlock_irq(&conn->lock);
+		spin_lock_irq(&conn_wrapper->lock);
+		clear_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn_wrapper->flags);
+		spin_unlock_irq(&conn_wrapper->lock);
 	}
 	ISCSI_DBG_TRANS_CONN(conn, "iscsi if conn stop done.\n");
 	return 0;
@@ -2339,8 +2342,10 @@ static int iscsi_if_stop_conn(struct iscsi_transport *transport,
 
 static void iscsi_cleanup_conn_work_fn(struct work_struct *work)
 {
-	struct iscsi_cls_conn *conn = container_of(work, struct iscsi_cls_conn,
+	struct iscsi_cls_conn_wrapper *conn_wrapper =
+						   container_of(work, struct iscsi_cls_conn_wrapper,
 						   cleanup_work);
+	struct iscsi_cls_conn *conn = &conn_wrapper->conn;
 	struct iscsi_cls_session *session = iscsi_conn_to_session(conn);
 
 	mutex_lock(&conn->ep_mutex);
@@ -2395,19 +2400,22 @@ iscsi_create_conn(struct iscsi_cls_session *session, int dd_size, uint32_t cid)
 {
 	struct iscsi_transport *transport = session->transport;
 	struct iscsi_cls_conn *conn;
+	struct iscsi_cls_conn_wrapper *conn_wrapper;
 	unsigned long flags;
 	int err;
 
-	conn = kzalloc(sizeof(*conn) + dd_size, GFP_KERNEL);
-	if (!conn)
+	conn_wrapper = kzalloc(sizeof(*conn_wrapper) + dd_size, GFP_KERNEL);
+	if (!conn_wrapper)
 		return NULL;
+
+	conn = &conn_wrapper->conn;
 	if (dd_size)
-		conn->dd_data = &conn[1];
+		conn->dd_data = &conn_wrapper[1];
 
 	mutex_init(&conn->ep_mutex);
-	spin_lock_init(&conn->lock);
+	spin_lock_init(&conn_wrapper->lock);
 	INIT_LIST_HEAD(&conn->conn_list);
-	INIT_WORK(&conn->cleanup_work, iscsi_cleanup_conn_work_fn);
+	INIT_WORK(&conn_wrapper->cleanup_work, iscsi_cleanup_conn_work_fn);
 	conn->transport = transport;
 	conn->cid = cid;
 	WRITE_ONCE(conn->state, ISCSI_CONN_DOWN);
@@ -2597,10 +2605,11 @@ void iscsi_conn_error_event(struct iscsi_cls_conn *conn, enum iscsi_err error)
 	struct iscsi_uevent *ev;
 	struct iscsi_internal *priv;
 	int len = nlmsg_total_size(sizeof(*ev));
+	struct iscsi_cls_conn_wrapper *conn_wrapper = conn_to_wrapper(conn);
 	unsigned long flags;
 	int state;
 
-	spin_lock_irqsave(&conn->lock, flags);
+	spin_lock_irqsave(&conn_wrapper->lock, flags);
 	/*
 	 * Userspace will only do a stop call if we are at least bound. And, we
 	 * only need to do the in kernel cleanup if in the UP state so cmds can
@@ -2612,9 +2621,9 @@ void iscsi_conn_error_event(struct iscsi_cls_conn *conn, enum iscsi_err error)
 	case ISCSI_CONN_BOUND:
 	case ISCSI_CONN_UP:
 		if (!test_and_set_bit(ISCSI_CLS_CONN_BIT_CLEANUP,
-				      &conn->flags)) {
+				      &conn_wrapper->flags)) {
 			queue_work(iscsi_conn_cleanup_workq,
-				   &conn->cleanup_work);
+				   &conn_wrapper->cleanup_work);
 		}
 		break;
 	default:
@@ -2622,7 +2631,7 @@ void iscsi_conn_error_event(struct iscsi_cls_conn *conn, enum iscsi_err error)
 				     state);
 		break;
 	}
-	spin_unlock_irqrestore(&conn->lock, flags);
+	spin_unlock_irqrestore(&conn_wrapper->lock, flags);
 
 	priv = iscsi_if_transport_lookup(conn->transport);
 	if (!priv)
@@ -2952,13 +2961,15 @@ static int
 iscsi_if_destroy_conn(struct iscsi_transport *transport, struct iscsi_uevent *ev)
 {
 	struct iscsi_cls_conn *conn;
+	struct iscsi_cls_conn_wrapper *conn_wrapper;
 
 	conn = iscsi_conn_lookup(ev->u.d_conn.sid, ev->u.d_conn.cid);
 	if (!conn)
 		return -EINVAL;
 
+	conn_wrapper = conn_to_wrapper(conn);
 	ISCSI_DBG_TRANS_CONN(conn, "Flushing cleanup during destruction\n");
-	flush_work(&conn->cleanup_work);
+	flush_work(&conn_wrapper->cleanup_work);
 	ISCSI_DBG_TRANS_CONN(conn, "Destroying transport conn\n");
 
 	if (transport->destroy_conn)
@@ -3728,6 +3739,7 @@ static int iscsi_if_transport_conn(struct iscsi_transport *transport,
 	struct iscsi_uevent *ev = nlmsg_data(nlh);
 	struct iscsi_cls_session *session;
 	struct iscsi_cls_conn *conn = NULL;
+	struct iscsi_cls_conn_wrapper *conn_wrapper;
 	struct iscsi_endpoint *ep;
 	uint32_t pdu_len;
 	int err = 0;
@@ -3764,15 +3776,16 @@ static int iscsi_if_transport_conn(struct iscsi_transport *transport,
 	if (!conn)
 		return -EINVAL;
 
+	conn_wrapper = conn_to_wrapper(conn);
 	mutex_lock(&conn->ep_mutex);
-	spin_lock_irq(&conn->lock);
-	if (test_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn->flags)) {
-		spin_unlock_irq(&conn->lock);
+	spin_lock_irq(&conn_wrapper->lock);
+	if (test_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn_wrapper->flags)) {
+		spin_unlock_irq(&conn_wrapper->lock);
 		mutex_unlock(&conn->ep_mutex);
 		ev->r.retcode = -ENOTCONN;
 		return 0;
 	}
-	spin_unlock_irq(&conn->lock);
+	spin_unlock_irq(&conn_wrapper->lock);
 
 	switch (nlh->nlmsg_type) {
 	case ISCSI_UEVENT_BIND_CONN:
diff --git a/include/scsi/scsi_transport_iscsi.h b/include/scsi/scsi_transport_iscsi.h
index 037c77fb5dc5..6f55690d3986 100644
--- a/include/scsi/scsi_transport_iscsi.h
+++ b/include/scsi/scsi_transport_iscsi.h
@@ -201,6 +201,7 @@ enum iscsi_connection_state {
 
 struct iscsi_cls_conn {
 	struct list_head conn_list;	/* item in connlist */
+	struct list_head conn_list_err; /* add back for fix kabi broken */
 	void *dd_data;			/* LLD private data */
 	struct iscsi_transport *transport;
 	uint32_t cid;			/* connection id */
@@ -211,15 +212,24 @@ struct iscsi_cls_conn {
 	struct mutex ep_mutex;
 	struct iscsi_endpoint *ep;
 
+	struct device dev;		/* sysfs transport/container device */
+	enum iscsi_connection_state state;
+};
+/*
+ *  The wrapper of iscsi_cls_conn to fix kabi while adding members.
+ */
+struct iscsi_cls_conn_wrapper {
+	struct iscsi_cls_conn conn;
+
 	/* Used when accessing flags and queueing work. */
 	spinlock_t lock;
 	unsigned long flags;
 	struct work_struct cleanup_work;
-
-	struct device dev;		/* sysfs transport/container device */
-	enum iscsi_connection_state state;
 };
 
+#define conn_to_wrapper(ic_conn) \
+	container_of(ic_conn, struct iscsi_cls_conn_wrapper, conn)
+
 #define iscsi_dev_to_conn(_dev) \
 	container_of(_dev, struct iscsi_cls_conn, dev)
 
-- 
Gitee


From 230035efb98b6edc066f3671e092a25013a131a2 Mon Sep 17 00:00:00 2001
From: Li Nan <linan122@huawei.com>
Date: Tue, 19 Jul 2022 16:31:49 +0800
Subject: [PATCH 470/674] scsi: iscsi: fix kabi broken in struct
 iscsi_transport

hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HL0X
CVE: NA

-------------------------------

In struct iscsi_transport, "unbind_conn" was added and the kabi
is broken. Fix it by using "tgt_dscvr" and caps flag.

Signed-off-by: Li Nan <linan122@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: linan <linan122@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
---
 drivers/infiniband/ulp/iser/iscsi_iser.c |  9 +++++--
 drivers/scsi/be2iscsi/be_main.c          |  9 +++++--
 drivers/scsi/bnx2i/bnx2i_iscsi.c         |  9 +++++--
 drivers/scsi/cxgbi/cxgb3i/cxgb3i.c       |  9 +++++--
 drivers/scsi/cxgbi/cxgb4i/cxgb4i.c       |  9 +++++--
 drivers/scsi/qedi/qedi_iscsi.c           |  8 ++++--
 drivers/scsi/qla4xxx/ql4_os.c            |  8 ++++--
 drivers/scsi/scsi_transport_iscsi.c      | 31 ++++++++++++++++++------
 include/scsi/iscsi_if.h                  |  1 +
 include/scsi/scsi_transport_iscsi.h      | 17 ++++++++++++-
 10 files changed, 87 insertions(+), 23 deletions(-)

diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c
index a16e066989fa..4884b122e413 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.c
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.c
@@ -979,17 +979,22 @@ static struct scsi_host_template iscsi_iser_sht = {
 	.track_queue_depth	= 1,
 };
 
+static struct iscsi_transport_expand iscsi_iser_expand = {
+	.unbind_conn            = iscsi_conn_unbind,
+};
+
 static struct iscsi_transport iscsi_iser_transport = {
 	.owner                  = THIS_MODULE,
 	.name                   = "iser",
-	.caps                   = CAP_RECOVERY_L0 | CAP_MULTI_R2T | CAP_TEXT_NEGO,
+	.caps                   = CAP_RECOVERY_L0 | CAP_MULTI_R2T | CAP_TEXT_NEGO
+					| CAP_OPS_EXPAND,
 	/* session management */
 	.create_session         = iscsi_iser_session_create,
 	.destroy_session        = iscsi_iser_session_destroy,
 	/* connection management */
 	.create_conn            = iscsi_iser_conn_create,
 	.bind_conn              = iscsi_iser_conn_bind,
-	.unbind_conn		= iscsi_conn_unbind,
+	.ops_expand             = &iscsi_iser_expand,
 	.destroy_conn           = iscsi_conn_teardown,
 	.attr_is_visible	= iser_attr_is_visible,
 	.set_param              = iscsi_iser_set_param,
diff --git a/drivers/scsi/be2iscsi/be_main.c b/drivers/scsi/be2iscsi/be_main.c
index b977e039bb78..06f697bfc49f 100644
--- a/drivers/scsi/be2iscsi/be_main.c
+++ b/drivers/scsi/be2iscsi/be_main.c
@@ -5801,16 +5801,21 @@ static struct pci_error_handlers beiscsi_eeh_handlers = {
 	.resume = beiscsi_eeh_resume,
 };
 
+struct iscsi_transport_expand beiscsi_iscsi_expand = {
+	.unbind_conn = iscsi_conn_unbind,
+};
+
 struct iscsi_transport beiscsi_iscsi_transport = {
 	.owner = THIS_MODULE,
 	.name = DRV_NAME,
 	.caps = CAP_RECOVERY_L0 | CAP_HDRDGST | CAP_TEXT_NEGO |
-		CAP_MULTI_R2T | CAP_DATADGST | CAP_DATA_PATH_OFFLOAD,
+		CAP_MULTI_R2T | CAP_DATADGST | CAP_DATA_PATH_OFFLOAD |
+		CAP_OPS_EXPAND,
 	.create_session = beiscsi_session_create,
 	.destroy_session = beiscsi_session_destroy,
 	.create_conn = beiscsi_conn_create,
 	.bind_conn = beiscsi_conn_bind,
-	.unbind_conn = iscsi_conn_unbind,
+	.ops_expand = &beiscsi_iscsi_expand,
 	.destroy_conn = iscsi_conn_teardown,
 	.attr_is_visible = beiscsi_attr_is_visible,
 	.set_iface_param = beiscsi_iface_set_param,
diff --git a/drivers/scsi/bnx2i/bnx2i_iscsi.c b/drivers/scsi/bnx2i/bnx2i_iscsi.c
index 8cf2f9a7cfdc..e13c77a76150 100644
--- a/drivers/scsi/bnx2i/bnx2i_iscsi.c
+++ b/drivers/scsi/bnx2i/bnx2i_iscsi.c
@@ -2274,18 +2274,23 @@ static struct scsi_host_template bnx2i_host_template = {
 	.track_queue_depth	= 1,
 };
 
+
+static struct iscsi_transport_expand bnx2i_iscsi_expand = {
+	.unbind_conn            = iscsi_conn_unbind,
+};
+
 struct iscsi_transport bnx2i_iscsi_transport = {
 	.owner			= THIS_MODULE,
 	.name			= "bnx2i",
 	.caps			= CAP_RECOVERY_L0 | CAP_HDRDGST |
 				  CAP_MULTI_R2T | CAP_DATADGST |
 				  CAP_DATA_PATH_OFFLOAD |
-				  CAP_TEXT_NEGO,
+				  CAP_TEXT_NEGO | CAP_OPS_EXPAND,
 	.create_session		= bnx2i_session_create,
 	.destroy_session	= bnx2i_session_destroy,
 	.create_conn		= bnx2i_conn_create,
 	.bind_conn		= bnx2i_conn_bind,
-	.unbind_conn		= iscsi_conn_unbind,
+	.ops_expand             = &bnx2i_iscsi_expand,
 	.destroy_conn		= bnx2i_conn_destroy,
 	.attr_is_visible	= bnx2i_attr_is_visible,
 	.set_param		= iscsi_set_param,
diff --git a/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c b/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c
index edcd3fab6973..65eb6230d390 100644
--- a/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c
+++ b/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c
@@ -100,13 +100,18 @@ static struct scsi_host_template cxgb3i_host_template = {
 	.track_queue_depth = 1,
 };
 
+static struct iscsi_transport_expand cxgb3i_iscsi_expand = {
+	.unbind_conn            = iscsi_conn_unbind,
+};
+
 static struct iscsi_transport cxgb3i_iscsi_transport = {
 	.owner		= THIS_MODULE,
 	.name		= DRV_MODULE_NAME,
 	/* owner and name should be set already */
 	.caps		= CAP_RECOVERY_L0 | CAP_MULTI_R2T | CAP_HDRDGST
 				| CAP_DATADGST | CAP_DIGEST_OFFLOAD |
-				CAP_PADDING_OFFLOAD | CAP_TEXT_NEGO,
+				CAP_PADDING_OFFLOAD | CAP_TEXT_NEGO |
+				CAP_OPS_EXPAND,
 	.attr_is_visible	= cxgbi_attr_is_visible,
 	.get_host_param	= cxgbi_get_host_param,
 	.set_host_param	= cxgbi_set_host_param,
@@ -117,7 +122,7 @@ static struct iscsi_transport cxgb3i_iscsi_transport = {
 	/* connection management */
 	.create_conn	= cxgbi_create_conn,
 	.bind_conn	= cxgbi_bind_conn,
-	.unbind_conn	= iscsi_conn_unbind,
+	.ops_expand     = &cxgb3i_iscsi_expand,
 	.destroy_conn	= iscsi_tcp_conn_teardown,
 	.start_conn	= iscsi_conn_start,
 	.stop_conn	= iscsi_conn_stop,
diff --git a/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c b/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c
index efb3e2b3398e..88f96964c356 100644
--- a/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c
+++ b/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c
@@ -118,12 +118,17 @@ static struct scsi_host_template cxgb4i_host_template = {
 	.track_queue_depth = 1,
 };
 
+static struct iscsi_transport_expand cxgb4i_iscsi_expand = {
+	.unbind_conn            = iscsi_conn_unbind,
+};
+
 static struct iscsi_transport cxgb4i_iscsi_transport = {
 	.owner		= THIS_MODULE,
 	.name		= DRV_MODULE_NAME,
 	.caps		= CAP_RECOVERY_L0 | CAP_MULTI_R2T | CAP_HDRDGST |
 				CAP_DATADGST | CAP_DIGEST_OFFLOAD |
-				CAP_PADDING_OFFLOAD | CAP_TEXT_NEGO,
+				CAP_PADDING_OFFLOAD | CAP_TEXT_NEGO |
+				CAP_OPS_EXPAND,
 	.attr_is_visible	= cxgbi_attr_is_visible,
 	.get_host_param	= cxgbi_get_host_param,
 	.set_host_param	= cxgbi_set_host_param,
@@ -134,7 +139,7 @@ static struct iscsi_transport cxgb4i_iscsi_transport = {
 	/* connection management */
 	.create_conn	= cxgbi_create_conn,
 	.bind_conn		= cxgbi_bind_conn,
-	.unbind_conn	= iscsi_conn_unbind,
+	.ops_expand     = &cxgb4i_iscsi_expand,
 	.destroy_conn	= iscsi_tcp_conn_teardown,
 	.start_conn		= iscsi_conn_start,
 	.stop_conn		= iscsi_conn_stop,
diff --git a/drivers/scsi/qedi/qedi_iscsi.c b/drivers/scsi/qedi/qedi_iscsi.c
index 5f7e62f19d83..ba9a22e55e32 100644
--- a/drivers/scsi/qedi/qedi_iscsi.c
+++ b/drivers/scsi/qedi/qedi_iscsi.c
@@ -1430,16 +1430,20 @@ static void qedi_cleanup_task(struct iscsi_task *task)
 	cmd->scsi_cmd = NULL;
 }
 
+static struct iscsi_transport_expand qedi_iscsi_expand = {
+	.unbind_conn            = iscsi_conn_unbind,
+};
+
 struct iscsi_transport qedi_iscsi_transport = {
 	.owner = THIS_MODULE,
 	.name = QEDI_MODULE_NAME,
 	.caps = CAP_RECOVERY_L0 | CAP_HDRDGST | CAP_MULTI_R2T | CAP_DATADGST |
-		CAP_DATA_PATH_OFFLOAD | CAP_TEXT_NEGO,
+		CAP_DATA_PATH_OFFLOAD | CAP_TEXT_NEGO | CAP_OPS_EXPAND,
 	.create_session = qedi_session_create,
 	.destroy_session = qedi_session_destroy,
 	.create_conn = qedi_conn_create,
 	.bind_conn = qedi_conn_bind,
-	.unbind_conn = iscsi_conn_unbind,
+	.ops_expand = &qedi_iscsi_expand,
 	.start_conn = qedi_conn_start,
 	.stop_conn = iscsi_conn_stop,
 	.destroy_conn = qedi_conn_destroy,
diff --git a/drivers/scsi/qla4xxx/ql4_os.c b/drivers/scsi/qla4xxx/ql4_os.c
index 8d82d2a83059..377d83762099 100644
--- a/drivers/scsi/qla4xxx/ql4_os.c
+++ b/drivers/scsi/qla4xxx/ql4_os.c
@@ -246,20 +246,24 @@ static struct scsi_host_template qla4xxx_driver_template = {
 	.vendor_id		= SCSI_NL_VID_TYPE_PCI | PCI_VENDOR_ID_QLOGIC,
 };
 
+static struct iscsi_transport_expand qla4xxx_iscsi_expand = {
+	.unbind_conn            = iscsi_conn_unbind,
+};
+
 static struct iscsi_transport qla4xxx_iscsi_transport = {
 	.owner			= THIS_MODULE,
 	.name			= DRIVER_NAME,
 	.caps			= CAP_TEXT_NEGO |
 				  CAP_DATA_PATH_OFFLOAD | CAP_HDRDGST |
 				  CAP_DATADGST | CAP_LOGIN_OFFLOAD |
-				  CAP_MULTI_R2T,
+				  CAP_MULTI_R2T | CAP_OPS_EXPAND,
 	.attr_is_visible	= qla4_attr_is_visible,
 	.create_session         = qla4xxx_session_create,
 	.destroy_session        = qla4xxx_session_destroy,
 	.start_conn             = qla4xxx_conn_start,
 	.create_conn            = qla4xxx_conn_create,
 	.bind_conn              = qla4xxx_conn_bind,
-	.unbind_conn		= iscsi_conn_unbind,
+	.ops_expand		= &qla4xxx_iscsi_expand,
 	.stop_conn              = iscsi_conn_stop,
 	.destroy_conn           = qla4xxx_conn_destroy,
 	.set_param              = iscsi_set_param,
diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c
index 55b4ada390a6..e23cb62ab216 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -2253,7 +2253,11 @@ static void iscsi_ep_disconnect(struct iscsi_cls_conn *conn, bool is_active)
 	ep = conn->ep;
 	conn->ep = NULL;
 
-	session->transport->unbind_conn(conn, is_active);
+	if (session->transport->caps & CAP_OPS_EXPAND &&
+	    session->transport->ops_expand &&
+	    session->transport->ops_expand->unbind_conn)
+		session->transport->ops_expand->unbind_conn(conn, is_active);
+
 	session->transport->ep_disconnect(ep);
 	ISCSI_DBG_TRANS_CONN(conn, "disconnect ep done.\n");
 }
@@ -3119,10 +3123,19 @@ iscsi_tgt_dscvr(struct iscsi_transport *transport,
 	struct Scsi_Host *shost;
 	struct sockaddr *dst_addr;
 	int err;
+	int (*tgt_dscvr)(struct Scsi_Host *shost, enum iscsi_tgt_dscvr type,
+			  uint32_t enable, struct sockaddr *dst_addr);
 
-	if (!transport->tgt_dscvr)
-		return -EINVAL;
-
+	if (transport->caps & CAP_OPS_EXPAND) {
+		if (!transport->ops_expand || !transport->ops_expand->tgt_dscvr)
+			return -EINVAL;
+		tgt_dscvr = transport->ops_expand->tgt_dscvr;
+	} else {
+		if (!transport->ops_expand)
+			return -EINVAL;
+		tgt_dscvr = (int (*)(struct Scsi_Host *, enum iscsi_tgt_dscvr, uint32_t,
+			     struct sockaddr *))(transport->ops_expand);
+	}
 	shost = scsi_host_lookup(ev->u.tgt_dscvr.host_no);
 	if (!shost) {
 		printk(KERN_ERR "target discovery could not find host no %u\n",
@@ -3132,8 +3145,8 @@ iscsi_tgt_dscvr(struct iscsi_transport *transport,
 
 
 	dst_addr = (struct sockaddr *)((char*)ev + sizeof(*ev));
-	err = transport->tgt_dscvr(shost, ev->u.tgt_dscvr.type,
-				   ev->u.tgt_dscvr.enable, dst_addr);
+	err = tgt_dscvr(shost, ev->u.tgt_dscvr.type,
+			ev->u.tgt_dscvr.enable, dst_addr);
 	scsi_host_put(shost);
 	return err;
 }
@@ -4786,8 +4799,10 @@ iscsi_register_transport(struct iscsi_transport *tt)
 	int err;
 
 	BUG_ON(!tt);
-	WARN_ON(tt->ep_disconnect && !tt->unbind_conn);
-
+	if (tt->caps & CAP_OPS_EXPAND) {
+		BUG_ON(!tt->ops_expand);
+		WARN_ON(tt->ep_disconnect && !tt->ops_expand->unbind_conn);
+	}
 	priv = iscsi_if_transport_lookup(tt);
 	if (priv)
 		return NULL;
diff --git a/include/scsi/iscsi_if.h b/include/scsi/iscsi_if.h
index 5225a23f2d0e..5679b9fb2b1e 100644
--- a/include/scsi/iscsi_if.h
+++ b/include/scsi/iscsi_if.h
@@ -761,6 +761,7 @@ enum iscsi_ping_status_code {
 					 and verification */
 #define CAP_LOGIN_OFFLOAD	0x4000  /* offload session login */
 
+#define CAP_OPS_EXPAND		0x8000  /* oiscsi_transport->ops_expand flag */
 /*
  * These flags describes reason of stop_conn() call
  */
diff --git a/include/scsi/scsi_transport_iscsi.h b/include/scsi/scsi_transport_iscsi.h
index 6f55690d3986..f9297176dcb8 100644
--- a/include/scsi/scsi_transport_iscsi.h
+++ b/include/scsi/scsi_transport_iscsi.h
@@ -29,6 +29,15 @@ struct bsg_job;
 struct iscsi_bus_flash_session;
 struct iscsi_bus_flash_conn;
 
+/*
+ *  The expansion of iscsi_transport to fix kabi while adding members.
+ */
+struct iscsi_transport_expand {
+	int (*tgt_dscvr)(struct Scsi_Host *shost, enum iscsi_tgt_dscvr type,
+				uint32_t enable, struct sockaddr *dst_addr);
+	void (*unbind_conn)(struct iscsi_cls_conn *conn, bool is_active);
+};
+
 /**
  * struct iscsi_transport - iSCSI Transport template
  *
@@ -82,7 +91,6 @@ struct iscsi_transport {
 	void (*destroy_session) (struct iscsi_cls_session *session);
 	struct iscsi_cls_conn *(*create_conn) (struct iscsi_cls_session *sess,
 				uint32_t cid);
-	void (*unbind_conn) (struct iscsi_cls_conn *conn, bool is_active);
 	int (*bind_conn) (struct iscsi_cls_session *session,
 			  struct iscsi_cls_conn *cls_conn,
 			  uint64_t transport_eph, int is_leading);
@@ -124,8 +132,15 @@ struct iscsi_transport {
 					      int non_blocking);
 	int (*ep_poll) (struct iscsi_endpoint *ep, int timeout_ms);
 	void (*ep_disconnect) (struct iscsi_endpoint *ep);
+#ifdef __GENKSYMS__
 	int (*tgt_dscvr) (struct Scsi_Host *shost, enum iscsi_tgt_dscvr type,
 			  uint32_t enable, struct sockaddr *dst_addr);
+#else
+	/*
+	 * onece ops_expand is used, caps must be set to CAP_OPS_EXPAND
+	 */
+	struct iscsi_transport_expand *ops_expand;
+#endif
 	int (*set_path) (struct Scsi_Host *shost, struct iscsi_path *params);
 	int (*set_iface_param) (struct Scsi_Host *shost, void *data,
 				uint32_t len);
-- 
Gitee


From bb0fba32beb4f89107920efd62bdebb367fed0ce Mon Sep 17 00:00:00 2001
From: LeoLiu-oc <LeoLiu-oc@zhaoxin.com>
Date: Tue, 19 Jul 2022 16:51:06 +0800
Subject: [PATCH 471/674] Driver for Zhaoxin CPU core temperature monitoring

zhaoxin inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I52DS7
CVE: NA

--------------------------------------------

Add support for the temperature sensor inside CPU.
Supported are all known variants of the Zhaoxin processors.

v1: Fix some character encoding mistaken.

Signed-off-by: LeoLiu-oc <LeoLiu-oc@zhaoxin.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Xiongfeng Wang <wangxiongfeng2@huawei.com>
---
 drivers/hwmon/Kconfig           |   9 +
 drivers/hwmon/Makefile          |   1 +
 drivers/hwmon/via-cputemp.c     |   1 -
 drivers/hwmon/zhaoxin-cputemp.c | 292 ++++++++++++++++++++++++++++++++
 4 files changed, 302 insertions(+), 1 deletion(-)
 create mode 100644 drivers/hwmon/zhaoxin-cputemp.c

diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig
index 45a1a5969d01..f2fe56e6f8bd 100644
--- a/drivers/hwmon/Kconfig
+++ b/drivers/hwmon/Kconfig
@@ -1899,6 +1899,15 @@ config SENSORS_VIA_CPUTEMP
 	  sensor inside your CPU. Supported are all known variants of
 	  the VIA C7 and Nano.
 
+config SENSORS_ZHAOXIN_CPUTEMP
+	tristate "Zhaoxin CPU temperature sensor"
+	depends on X86
+	select HWMON_VID
+	help
+	  If you say yes here you get support for the temperature
+	  sensor inside your CPU. Supported are all known variants of
+	  the Zhaoxin processors.
+
 config SENSORS_VIA686A
 	tristate "VIA686A"
 	depends on PCI
diff --git a/drivers/hwmon/Makefile b/drivers/hwmon/Makefile
index c22a5316bd91..95908c478b94 100644
--- a/drivers/hwmon/Makefile
+++ b/drivers/hwmon/Makefile
@@ -184,6 +184,7 @@ obj-$(CONFIG_SENSORS_TMP421)	+= tmp421.o
 obj-$(CONFIG_SENSORS_TMP513)	+= tmp513.o
 obj-$(CONFIG_SENSORS_VEXPRESS)	+= vexpress-hwmon.o
 obj-$(CONFIG_SENSORS_VIA_CPUTEMP)+= via-cputemp.o
+obj-$(CONFIG_SENSORS_ZHAOXIN_CPUTEMP)    += zhaoxin-cputemp.o
 obj-$(CONFIG_SENSORS_VIA686A)	+= via686a.o
 obj-$(CONFIG_SENSORS_VT1211)	+= vt1211.o
 obj-$(CONFIG_SENSORS_VT8231)	+= vt8231.o
diff --git a/drivers/hwmon/via-cputemp.c b/drivers/hwmon/via-cputemp.c
index e5d18dac8ee7..0a5057dbe51a 100644
--- a/drivers/hwmon/via-cputemp.c
+++ b/drivers/hwmon/via-cputemp.c
@@ -273,7 +273,6 @@ static const struct x86_cpu_id __initconst cputemp_ids[] = {
 	X86_MATCH_VENDOR_FAM_MODEL(CENTAUR, 6, X86_CENTAUR_FAM6_C7_A,	NULL),
 	X86_MATCH_VENDOR_FAM_MODEL(CENTAUR, 6, X86_CENTAUR_FAM6_C7_D,	NULL),
 	X86_MATCH_VENDOR_FAM_MODEL(CENTAUR, 6, X86_CENTAUR_FAM6_NANO,	NULL),
-	X86_MATCH_VENDOR_FAM_MODEL(CENTAUR, 7, X86_MODEL_ANY,		NULL),
 	{}
 };
 MODULE_DEVICE_TABLE(x86cpu, cputemp_ids);
diff --git a/drivers/hwmon/zhaoxin-cputemp.c b/drivers/hwmon/zhaoxin-cputemp.c
new file mode 100644
index 000000000000..39e729590eba
--- /dev/null
+++ b/drivers/hwmon/zhaoxin-cputemp.c
@@ -0,0 +1,292 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * zhaoxin-cputemp.c - Driver for Zhaoxin CPU core temperature monitoring
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/hwmon.h>
+#include <linux/hwmon-vid.h>
+#include <linux/sysfs.h>
+#include <linux/hwmon-sysfs.h>
+#include <linux/err.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/platform_device.h>
+#include <linux/cpu.h>
+#include <asm/msr.h>
+#include <asm/processor.h>
+#include <asm/cpu_device_id.h>
+
+#define DRVNAME    "zhaoxin_cputemp"
+
+enum { SHOW_TEMP, SHOW_LABEL, SHOW_NAME };
+
+/* Functions declaration */
+
+struct zhaoxin_cputemp_data {
+	struct device *hwmon_dev;
+	const char *name;
+	u8 vrm;
+	u32 id;
+	u32 msr_temp;
+	u32 msr_vid;
+};
+
+/* Sysfs stuff */
+
+static ssize_t name_show(struct device *dev, struct device_attribute *devattr,
+	char *buf)
+{
+	int ret;
+	struct sensor_device_attribute *attr = to_sensor_dev_attr(devattr);
+	struct zhaoxin_cputemp_data *data = dev_get_drvdata(dev);
+
+	if (attr->index == SHOW_NAME)
+		ret = sprintf(buf, "%s\n", data->name);
+	else    /* show label */
+		ret = sprintf(buf, "Core %d\n", data->id);
+	return ret;
+}
+
+static ssize_t temp_show(struct device *dev, struct device_attribute *devattr, char *buf)
+{
+	struct zhaoxin_cputemp_data *data = dev_get_drvdata(dev);
+	u32 eax, edx;
+	int err;
+
+	err = rdmsr_safe_on_cpu(data->id, data->msr_temp, &eax, &edx);
+	if (err)
+		return -EAGAIN;
+
+	return sprintf(buf, "%lu\n", ((unsigned long)eax & 0xffffff) * 1000);
+}
+
+static ssize_t cpu0_vid_show(struct device *dev, struct device_attribute *devattr, char *buf)
+{
+	struct zhaoxin_cputemp_data *data = dev_get_drvdata(dev);
+	u32 eax, edx;
+	int err;
+
+	err = rdmsr_safe_on_cpu(data->id, data->msr_vid, &eax, &edx);
+	if (err)
+		return -EAGAIN;
+
+	return sprintf(buf, "%d\n", vid_from_reg(~edx & 0x7f, data->vrm));
+}
+
+static SENSOR_DEVICE_ATTR_RO(temp1_input, temp, SHOW_TEMP);
+static SENSOR_DEVICE_ATTR_RO(temp1_label, name, SHOW_LABEL);
+static SENSOR_DEVICE_ATTR_RO(name, name, SHOW_NAME);
+
+static struct attribute *zhaoxin_cputemp_attributes[] = {
+	&sensor_dev_attr_name.dev_attr.attr,
+	&sensor_dev_attr_temp1_label.dev_attr.attr,
+	&sensor_dev_attr_temp1_input.dev_attr.attr,
+	NULL
+};
+
+static const struct attribute_group zhaoxin_cputemp_group = {
+	.attrs = zhaoxin_cputemp_attributes,
+};
+
+/* Optional attributes */
+static DEVICE_ATTR_RO(cpu0_vid);
+
+static int zhaoxin_cputemp_probe(struct platform_device *pdev)
+{
+	struct zhaoxin_cputemp_data *data;
+	int err;
+	u32 eax, edx;
+
+	data = devm_kzalloc(&pdev->dev, sizeof(struct zhaoxin_cputemp_data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	data->id = pdev->id;
+	data->name = "zhaoxin_cputemp";
+	data->msr_temp = 0x1423;
+
+	/* test if we can access the TEMPERATURE MSR */
+	err = rdmsr_safe_on_cpu(data->id, data->msr_temp, &eax, &edx);
+	if (err) {
+		dev_err(&pdev->dev, "Unable to access TEMPERATURE MSR, giving up\n");
+		return err;
+	}
+
+	platform_set_drvdata(pdev, data);
+
+	err = sysfs_create_group(&pdev->dev.kobj, &zhaoxin_cputemp_group);
+	if (err)
+		return err;
+
+	if (data->msr_vid)
+		data->vrm = vid_which_vrm();
+
+	if (data->vrm) {
+		err = device_create_file(&pdev->dev, &dev_attr_cpu0_vid);
+		if (err)
+			goto exit_remove;
+	}
+
+	data->hwmon_dev = hwmon_device_register(&pdev->dev);
+	if (IS_ERR(data->hwmon_dev)) {
+		err = PTR_ERR(data->hwmon_dev);
+		dev_err(&pdev->dev, "Class registration failed (%d)\n", err);
+		goto exit_remove;
+	}
+
+	return 0;
+
+exit_remove:
+	if (data->vrm)
+		device_remove_file(&pdev->dev, &dev_attr_cpu0_vid);
+	sysfs_remove_group(&pdev->dev.kobj, &zhaoxin_cputemp_group);
+	return err;
+}
+
+static int zhaoxin_cputemp_remove(struct platform_device *pdev)
+{
+	struct zhaoxin_cputemp_data *data = platform_get_drvdata(pdev);
+
+	hwmon_device_unregister(data->hwmon_dev);
+	if (data->vrm)
+		device_remove_file(&pdev->dev, &dev_attr_cpu0_vid);
+	sysfs_remove_group(&pdev->dev.kobj, &zhaoxin_cputemp_group);
+	return 0;
+}
+
+static struct platform_driver zhaoxin_cputemp_driver = {
+	.driver = {
+		.name = DRVNAME,
+	},
+	.probe = zhaoxin_cputemp_probe,
+	.remove = zhaoxin_cputemp_remove,
+};
+
+struct pdev_entry {
+	struct list_head list;
+	struct platform_device *pdev;
+	unsigned int cpu;
+};
+
+static LIST_HEAD(pdev_list);
+static DEFINE_MUTEX(pdev_list_mutex);
+
+static int zhaoxin_cputemp_online(unsigned int cpu)
+{
+	int err;
+	struct platform_device *pdev;
+	struct pdev_entry *pdev_entry;
+
+	pdev = platform_device_alloc(DRVNAME, cpu);
+	if (!pdev) {
+		err = -ENOMEM;
+		pr_err("Device allocation failed\n");
+		goto exit;
+	}
+
+	pdev_entry = kzalloc(sizeof(struct pdev_entry), GFP_KERNEL);
+	if (!pdev_entry) {
+		err = -ENOMEM;
+		goto exit_device_put;
+	}
+
+	err = platform_device_add(pdev);
+	if (err) {
+		pr_err("Device addition failed (%d)\n", err);
+		goto exit_device_free;
+	}
+
+	pdev_entry->pdev = pdev;
+	pdev_entry->cpu = cpu;
+	mutex_lock(&pdev_list_mutex);
+	list_add_tail(&pdev_entry->list, &pdev_list);
+	mutex_unlock(&pdev_list_mutex);
+
+	return 0;
+
+exit_device_free:
+	kfree(pdev_entry);
+exit_device_put:
+	platform_device_put(pdev);
+exit:
+	return err;
+}
+
+static int zhaoxin_cputemp_down_prep(unsigned int cpu)
+{
+	struct pdev_entry *p;
+
+	mutex_lock(&pdev_list_mutex);
+	list_for_each_entry(p, &pdev_list, list) {
+		if (p->cpu == cpu) {
+			platform_device_unregister(p->pdev);
+			list_del(&p->list);
+			mutex_unlock(&pdev_list_mutex);
+			kfree(p);
+			return 0;
+		}
+	}
+	mutex_unlock(&pdev_list_mutex);
+	return 0;
+}
+
+static const struct x86_cpu_id __initconst cputemp_ids[] = {
+	X86_MATCH_VENDOR_FAM_MODEL(CENTAUR, 7, X86_MODEL_ANY, NULL),
+	X86_MATCH_VENDOR_FAM_MODEL(ZHAOXIN, 7, X86_MODEL_ANY, NULL),
+	{}
+};
+MODULE_DEVICE_TABLE(x86cpu, cputemp_ids);
+
+static enum cpuhp_state zhaoxin_temp_online;
+
+static int __init zhaoxin_cputemp_init(void)
+{
+	int err;
+
+	if (!x86_match_cpu(cputemp_ids))
+		return -ENODEV;
+
+	err = platform_driver_register(&zhaoxin_cputemp_driver);
+	if (err)
+		goto exit;
+
+	err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "hwmon/zhaoxin:online",
+	zhaoxin_cputemp_online, zhaoxin_cputemp_down_prep);
+	if (err < 0)
+		goto exit_driver_unreg;
+	zhaoxin_temp_online = err;
+
+#ifndef CONFIG_HOTPLUG_CPU
+	if (list_empty(&pdev_list)) {
+		err = -ENODEV;
+		goto exit_hp_unreg;
+	}
+#endif
+	return 0;
+
+#ifndef CONFIG_HOTPLUG_CPU
+exit_hp_unreg:
+	cpuhp_remove_state_nocalls(zhaoxin_temp_online);
+#endif
+exit_driver_unreg:
+	platform_driver_unregister(&zhaoxin_cputemp_driver);
+exit:
+	return err;
+}
+
+static void __exit zhaoxin_cputemp_exit(void)
+{
+	cpuhp_remove_state(zhaoxin_temp_online);
+	platform_driver_unregister(&zhaoxin_cputemp_driver);
+}
+
+MODULE_DESCRIPTION("Zhaoxin CPU temperature monitor");
+MODULE_LICENSE("GPL");
+
+module_init(zhaoxin_cputemp_init)
+module_exit(zhaoxin_cputemp_exit)
-- 
Gitee


From d660fbb1aac606323b6623f5bfb9d4b890b3c5fa Mon Sep 17 00:00:00 2001
From: LeoLiuoc <LeoLiu-oc@zhaoxin.com>
Date: Tue, 19 Jul 2022 16:51:07 +0800
Subject: [PATCH 472/674] openeuler_defconfig: Enable SENSORS_ZHAOXIN_CPUTEMP
 as module by default

zhaoxin inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I52DS7
CVE: NA

--------------------------------------------

Set CONFIG_SENSORS_ZHAOXIN_CPUTEMP to 'm' by default in
openeuler_defconfig.

Signed-off-by: LeoLiuoc <LeoLiu-oc@zhaoxin.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Xiongfeng Wang <wangxiongfeng2@huawei.com>
---
 arch/x86/configs/openeuler_defconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig
index f9c94b618ad4..3eac70518e6f 100644
--- a/arch/x86/configs/openeuler_defconfig
+++ b/arch/x86/configs/openeuler_defconfig
@@ -4053,6 +4053,7 @@ CONFIG_SENSORS_TMP421=m
 # CONFIG_SENSORS_TMP513 is not set
 CONFIG_SENSORS_VIA_CPUTEMP=m
 CONFIG_SENSORS_VIA686A=m
+CONFIG_SENSORS_ZHAOXIN_CPUTEMP=m
 CONFIG_SENSORS_VT1211=m
 CONFIG_SENSORS_VT8231=m
 # CONFIG_SENSORS_W83773G is not set
-- 
Gitee


From 2dd89ae225574b493ce87324dacc2a5ef0f00dbd Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@huawei.com>
Date: Tue, 19 Jul 2022 16:51:08 +0800
Subject: [PATCH 473/674] nbd: call genl_unregister_family() first in
 nbd_cleanup()

mainline inclusion
from mainline-v5.19-rc1
commit 06c4da89c24e7023ea448cadf8e9daf06a0aae6e
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5G01M
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=06c4da89c24e7023ea448cadf8e9daf06a0aae6e

--------------------------------

Otherwise there may be race between module removal and the handling of
netlink command, which can lead to the oops as shown below:

  BUG: kernel NULL pointer dereference, address: 0000000000000098
  Oops: 0002 [#1] SMP PTI
  CPU: 1 PID: 31299 Comm: nbd-client Tainted: G            E     5.14.0-rc4
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996)
  RIP: 0010:down_write+0x1a/0x50
  Call Trace:
   start_creating+0x89/0x130
   debugfs_create_dir+0x1b/0x130
   nbd_start_device+0x13d/0x390 [nbd]
   nbd_genl_connect+0x42f/0x748 [nbd]
   genl_family_rcv_msg_doit.isra.0+0xec/0x150
   genl_rcv_msg+0xe5/0x1e0
   netlink_rcv_skb+0x55/0x100
   genl_rcv+0x29/0x40
   netlink_unicast+0x1a8/0x250
   netlink_sendmsg+0x21b/0x430
   ____sys_sendmsg+0x2a4/0x2d0
   ___sys_sendmsg+0x81/0xc0
   __sys_sendmsg+0x62/0xb0
   __x64_sys_sendmsg+0x1f/0x30
   do_syscall_64+0x3b/0xc0
   entry_SYSCALL_64_after_hwframe+0x44/0xae
  Modules linked in: nbd(E-)

Signed-off-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Link: https://lore.kernel.org/r/20220521073749.3146892-2-yukuai3@huawei.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/block/nbd.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 0ab548c78f24..a12daca0f92a 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -2537,6 +2537,12 @@ static void __exit nbd_cleanup(void)
 	struct nbd_device *nbd;
 	LIST_HEAD(del_list);
 
+	/*
+	 * Unregister netlink interface prior to waiting
+	 * for the completion of netlink commands.
+	 */
+	genl_unregister_family(&nbd_genl_family);
+
 	nbd_dbg_close();
 
 	mutex_lock(&nbd_index_mutex);
@@ -2552,7 +2558,6 @@ static void __exit nbd_cleanup(void)
 	}
 
 	idr_destroy(&nbd_index_idr);
-	genl_unregister_family(&nbd_genl_family);
 	unregister_blkdev(NBD_MAJOR, "nbd");
 }
 
-- 
Gitee


From b0452dec11cd872f0547ec6478779e664f2b95fe Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@huawei.com>
Date: Tue, 19 Jul 2022 16:51:09 +0800
Subject: [PATCH 474/674] nbd: fix race between nbd_alloc_config() and module
 removal

mainline inclusion
from mainline-v5.19-rc1
commit c55b2b983b0fa012942c3eb16384b2b722caa810
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5G01M
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=c55b2b983b0fa012942c3eb16384b2b722caa810

--------------------------------

When nbd module is being removing, nbd_alloc_config() may be
called concurrently by nbd_genl_connect(), although try_module_get()
will return false, but nbd_alloc_config() doesn't handle it.

The race may lead to the leak of nbd_config and its related
resources (e.g, recv_workq) and oops in nbd_read_stat() due
to the unload of nbd module as shown below:

  BUG: kernel NULL pointer dereference, address: 0000000000000040
  Oops: 0000 [#1] SMP PTI
  CPU: 5 PID: 13840 Comm: kworker/u17:33 Not tainted 5.14.0+ #1
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996)
  Workqueue: knbd16-recv recv_work [nbd]
  RIP: 0010:nbd_read_stat.cold+0x130/0x1a4 [nbd]
  Call Trace:
   recv_work+0x3b/0xb0 [nbd]
   process_one_work+0x1ed/0x390
   worker_thread+0x4a/0x3d0
   kthread+0x12a/0x150
   ret_from_fork+0x22/0x30

Fixing it by checking the return value of try_module_get()
in nbd_alloc_config(). As nbd_alloc_config() may return ERR_PTR(-ENODEV),
assign nbd->config only when nbd_alloc_config() succeeds to ensure
the value of nbd->config is binary (valid or NULL).

Also adding a debug message to check the reference counter
of nbd_config during module removal.

Signed-off-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Link: https://lore.kernel.org/r/20220521073749.3146892-3-yukuai3@huawei.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/block/nbd.c | 28 +++++++++++++++++++---------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index a12daca0f92a..873ce6f71d85 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -1530,15 +1530,20 @@ static struct nbd_config *nbd_alloc_config(void)
 {
 	struct nbd_config *config;
 
+	if (!try_module_get(THIS_MODULE))
+		return ERR_PTR(-ENODEV);
+
 	config = kzalloc(sizeof(struct nbd_config), GFP_NOFS);
-	if (!config)
-		return NULL;
+	if (!config) {
+		module_put(THIS_MODULE);
+		return ERR_PTR(-ENOMEM);
+	}
+
 	atomic_set(&config->recv_threads, 0);
 	init_waitqueue_head(&config->recv_wq);
 	init_waitqueue_head(&config->conn_wait);
 	config->blksize = NBD_DEF_BLKSIZE;
 	atomic_set(&config->live_connections, 0);
-	try_module_get(THIS_MODULE);
 	return config;
 }
 
@@ -1565,12 +1570,13 @@ static int nbd_open(struct block_device *bdev, fmode_t mode)
 			mutex_unlock(&nbd->config_lock);
 			goto out;
 		}
-		config = nbd->config = nbd_alloc_config();
-		if (!config) {
-			ret = -ENOMEM;
+		config = nbd_alloc_config();
+		if (IS_ERR(config)) {
+			ret = PTR_ERR(config);
 			mutex_unlock(&nbd->config_lock);
 			goto out;
 		}
+		nbd->config = config;
 		refcount_set(&nbd->config_refs, 1);
 		refcount_inc(&nbd->refs);
 		mutex_unlock(&nbd->config_lock);
@@ -2005,13 +2011,14 @@ static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info)
 		nbd_put(nbd);
 		return -EINVAL;
 	}
-	config = nbd->config = nbd_alloc_config();
-	if (!nbd->config) {
+	config = nbd_alloc_config();
+	if (IS_ERR(config)) {
 		mutex_unlock(&nbd->config_lock);
 		nbd_put(nbd);
 		printk(KERN_ERR "nbd: couldn't allocate config\n");
-		return -ENOMEM;
+		return PTR_ERR(config);
 	}
+	nbd->config = config;
 	refcount_set(&nbd->config_refs, 1);
 	set_bit(NBD_RT_BOUND, &config->runtime_flags);
 
@@ -2552,6 +2559,9 @@ static void __exit nbd_cleanup(void)
 	while (!list_empty(&del_list)) {
 		nbd = list_first_entry(&del_list, struct nbd_device, list);
 		list_del_init(&nbd->list);
+		if (refcount_read(&nbd->config_refs))
+			printk(KERN_ERR "nbd: possibly leaking nbd_config (ref %d)\n",
+					refcount_read(&nbd->config_refs));
 		if (refcount_read(&nbd->refs) != 1)
 			printk(KERN_ERR "nbd: possibly leaking a device\n");
 		nbd_put(nbd);
-- 
Gitee


From 353a77e50e3931a82be3cdee8c71d7ef29726672 Mon Sep 17 00:00:00 2001
From: Duoming Zhou <duoming@zju.edu.cn>
Date: Tue, 19 Jul 2022 16:51:10 +0800
Subject: [PATCH 475/674] net: rose: fix UAF bugs caused by timer handler

stable inclusion
from stable-v5.10.129
commit 8f74cb27c2b4872fd14bf046201fa7b36a46885e
category: bugfix
bugzilla: 187170 https://gitee.com/src-openeuler/kernel/issues/I5FNWN
CVE: CVE-2022-2318

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?h=linux-5.10.y&id=8f74cb27c2b4872fd14bf046201fa7b36a46885e

--------------------------------

commit 9cc02ede696272c5271a401e4f27c262359bc2f6 upstream.

There are UAF bugs in rose_heartbeat_expiry(), rose_timer_expiry()
and rose_idletimer_expiry(). The root cause is that del_timer()
could not stop the timer handler that is running and the refcount
of sock is not managed properly.

One of the UAF bugs is shown below:

    (thread 1)          |        (thread 2)
                        |  rose_bind
                        |  rose_connect
                        |    rose_start_heartbeat
rose_release            |    (wait a time)
  case ROSE_STATE_0     |
  rose_destroy_socket   |  rose_heartbeat_expiry
    rose_stop_heartbeat |
    sock_put(sk)        |    ...
  sock_put(sk) // FREE  |
                        |    bh_lock_sock(sk) // USE

The sock is deallocated by sock_put() in rose_release() and
then used by bh_lock_sock() in rose_heartbeat_expiry().

Although rose_destroy_socket() calls rose_stop_heartbeat(),
it could not stop the timer that is running.

The KASAN report triggered by POC is shown below:

BUG: KASAN: use-after-free in _raw_spin_lock+0x5a/0x110
Write of size 4 at addr ffff88800ae59098 by task swapper/3/0
...
Call Trace:
 <IRQ>
 dump_stack_lvl+0xbf/0xee
 print_address_description+0x7b/0x440
 print_report+0x101/0x230
 ? irq_work_single+0xbb/0x140
 ? _raw_spin_lock+0x5a/0x110
 kasan_report+0xed/0x120
 ? _raw_spin_lock+0x5a/0x110
 kasan_check_range+0x2bd/0x2e0
 _raw_spin_lock+0x5a/0x110
 rose_heartbeat_expiry+0x39/0x370
 ? rose_start_heartbeat+0xb0/0xb0
 call_timer_fn+0x2d/0x1c0
 ? rose_start_heartbeat+0xb0/0xb0
 expire_timers+0x1f3/0x320
 __run_timers+0x3ff/0x4d0
 run_timer_softirq+0x41/0x80
 __do_softirq+0x233/0x544
 irq_exit_rcu+0x41/0xa0
 sysvec_apic_timer_interrupt+0x8c/0xb0
 </IRQ>
 <TASK>
 asm_sysvec_apic_timer_interrupt+0x1b/0x20
RIP: 0010:default_idle+0xb/0x10
RSP: 0018:ffffc9000012fea0 EFLAGS: 00000202
RAX: 000000000000bcae RBX: ffff888006660f00 RCX: 000000000000bcae
RDX: 0000000000000001 RSI: ffffffff843a11c0 RDI: ffffffff843a1180
RBP: dffffc0000000000 R08: dffffc0000000000 R09: ffffed100da36d46
R10: dfffe9100da36d47 R11: ffffffff83cf0950 R12: 0000000000000000
R13: 1ffff11000ccc1e0 R14: ffffffff8542af28 R15: dffffc0000000000
...
Allocated by task 146:
 __kasan_kmalloc+0xc4/0xf0
 sk_prot_alloc+0xdd/0x1a0
 sk_alloc+0x2d/0x4e0
 rose_create+0x7b/0x330
 __sock_create+0x2dd/0x640
 __sys_socket+0xc7/0x270
 __x64_sys_socket+0x71/0x80
 do_syscall_64+0x43/0x90
 entry_SYSCALL_64_after_hwframe+0x46/0xb0

Freed by task 152:
 kasan_set_track+0x4c/0x70
 kasan_set_free_info+0x1f/0x40
 ____kasan_slab_free+0x124/0x190
 kfree+0xd3/0x270
 __sk_destruct+0x314/0x460
 rose_release+0x2fa/0x3b0
 sock_close+0xcb/0x230
 __fput+0x2d9/0x650
 task_work_run+0xd6/0x160
 exit_to_user_mode_loop+0xc7/0xd0
 exit_to_user_mode_prepare+0x4e/0x80
 syscall_exit_to_user_mode+0x20/0x40
 do_syscall_64+0x4f/0x90
 entry_SYSCALL_64_after_hwframe+0x46/0xb0

This patch adds refcount of sock when we use functions
such as rose_start_heartbeat() and so on to start timer,
and decreases the refcount of sock when timer is finished
or deleted by functions such as rose_stop_heartbeat()
and so on. As a result, the UAF bugs could be mitigated.

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Duoming Zhou <duoming@zju.edu.cn>
Tested-by: Duoming Zhou <duoming@zju.edu.cn>
Link: https://lore.kernel.org/r/20220629002640.5693-1-duoming@zju.edu.cn
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Xu Jia <xujia39@huawei.com>
Reviewed-by: Wei Yongjun<weiyongjun1@huawei.com>
Reviewed-by: Wang Weiyang <wangweiyang2@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 net/rose/rose_timer.c | 34 +++++++++++++++++++---------------
 1 file changed, 19 insertions(+), 15 deletions(-)

diff --git a/net/rose/rose_timer.c b/net/rose/rose_timer.c
index b3138fc2e552..f06ddbed3fed 100644
--- a/net/rose/rose_timer.c
+++ b/net/rose/rose_timer.c
@@ -31,89 +31,89 @@ static void rose_idletimer_expiry(struct timer_list *);
 
 void rose_start_heartbeat(struct sock *sk)
 {
-	del_timer(&sk->sk_timer);
+	sk_stop_timer(sk, &sk->sk_timer);
 
 	sk->sk_timer.function = rose_heartbeat_expiry;
 	sk->sk_timer.expires  = jiffies + 5 * HZ;
 
-	add_timer(&sk->sk_timer);
+	sk_reset_timer(sk, &sk->sk_timer, sk->sk_timer.expires);
 }
 
 void rose_start_t1timer(struct sock *sk)
 {
 	struct rose_sock *rose = rose_sk(sk);
 
-	del_timer(&rose->timer);
+	sk_stop_timer(sk, &rose->timer);
 
 	rose->timer.function = rose_timer_expiry;
 	rose->timer.expires  = jiffies + rose->t1;
 
-	add_timer(&rose->timer);
+	sk_reset_timer(sk, &rose->timer, rose->timer.expires);
 }
 
 void rose_start_t2timer(struct sock *sk)
 {
 	struct rose_sock *rose = rose_sk(sk);
 
-	del_timer(&rose->timer);
+	sk_stop_timer(sk, &rose->timer);
 
 	rose->timer.function = rose_timer_expiry;
 	rose->timer.expires  = jiffies + rose->t2;
 
-	add_timer(&rose->timer);
+	sk_reset_timer(sk, &rose->timer, rose->timer.expires);
 }
 
 void rose_start_t3timer(struct sock *sk)
 {
 	struct rose_sock *rose = rose_sk(sk);
 
-	del_timer(&rose->timer);
+	sk_stop_timer(sk, &rose->timer);
 
 	rose->timer.function = rose_timer_expiry;
 	rose->timer.expires  = jiffies + rose->t3;
 
-	add_timer(&rose->timer);
+	sk_reset_timer(sk, &rose->timer, rose->timer.expires);
 }
 
 void rose_start_hbtimer(struct sock *sk)
 {
 	struct rose_sock *rose = rose_sk(sk);
 
-	del_timer(&rose->timer);
+	sk_stop_timer(sk, &rose->timer);
 
 	rose->timer.function = rose_timer_expiry;
 	rose->timer.expires  = jiffies + rose->hb;
 
-	add_timer(&rose->timer);
+	sk_reset_timer(sk, &rose->timer, rose->timer.expires);
 }
 
 void rose_start_idletimer(struct sock *sk)
 {
 	struct rose_sock *rose = rose_sk(sk);
 
-	del_timer(&rose->idletimer);
+	sk_stop_timer(sk, &rose->idletimer);
 
 	if (rose->idle > 0) {
 		rose->idletimer.function = rose_idletimer_expiry;
 		rose->idletimer.expires  = jiffies + rose->idle;
 
-		add_timer(&rose->idletimer);
+		sk_reset_timer(sk, &rose->idletimer, rose->idletimer.expires);
 	}
 }
 
 void rose_stop_heartbeat(struct sock *sk)
 {
-	del_timer(&sk->sk_timer);
+	sk_stop_timer(sk, &sk->sk_timer);
 }
 
 void rose_stop_timer(struct sock *sk)
 {
-	del_timer(&rose_sk(sk)->timer);
+	sk_stop_timer(sk, &rose_sk(sk)->timer);
 }
 
 void rose_stop_idletimer(struct sock *sk)
 {
-	del_timer(&rose_sk(sk)->idletimer);
+	sk_stop_timer(sk, &rose_sk(sk)->idletimer);
 }
 
 static void rose_heartbeat_expiry(struct timer_list *t)
@@ -130,6 +130,7 @@ static void rose_heartbeat_expiry(struct timer_list *t)
 		    (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_DEAD))) {
 			bh_unlock_sock(sk);
 			rose_destroy_socket(sk);
+			sock_put(sk);
 			return;
 		}
 		break;
@@ -152,6 +153,7 @@ static void rose_heartbeat_expiry(struct timer_list *t)
 
 	rose_start_heartbeat(sk);
 	bh_unlock_sock(sk);
+	sock_put(sk);
 }
 
 static void rose_timer_expiry(struct timer_list *t)
@@ -181,6 +183,7 @@ static void rose_timer_expiry(struct timer_list *t)
 		break;
 	}
 	bh_unlock_sock(sk);
+	sock_put(sk);
 }
 
 static void rose_idletimer_expiry(struct timer_list *t)
@@ -205,4 +208,5 @@ static void rose_idletimer_expiry(struct timer_list *t)
 		sock_set_flag(sk, SOCK_DEAD);
 	}
 	bh_unlock_sock(sk);
+	sock_put(sk);
 }
-- 
Gitee


From 75b7fe6665a03a3430bf18ac4daca685b1baf5e6 Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@huawei.com>
Date: Tue, 19 Jul 2022 16:51:11 +0800
Subject: [PATCH 476/674] Revert "mm/filemap: fix that first page is not mark
 accessed in filemap_read()"

hulk inclusion
category: bugfix
bugzilla: 186896, https://gitee.com/src-openeuler/kernel/issues/I5GZC8
CVE: NA

--------------------------------

This reverts commit 499ecade21fc377c04ec66ed7f0505f1ca74d755.

Prepare to backport solution from mainline.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 mm/filemap.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index ebae261f9df9..9e209e8a3b0d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2551,11 +2551,10 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
 				flush_dcache_page(pages[i]);
 
 			copied = copy_page_to_iter(pages[i], offset, bytes, iter);
-			if (copied) {
-				ra->prev_pos = iocb->ki_pos;
-				written += copied;
-				iocb->ki_pos += copied;
-			}
+
+			written += copied;
+			iocb->ki_pos += copied;
+			ra->prev_pos = iocb->ki_pos;
 
 			if (copied < bytes) {
 				error = -EFAULT;
-- 
Gitee


From 939325bbad27dc99b8f8cea2791192e5f81b76de Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 19 Jul 2022 16:51:12 +0800
Subject: [PATCH 477/674] filemap: Correct the conditions for marking a folio
 as accessed

mainline inclusion
from mainline-5.19-rc4
commit 5ccc944dce3df5fd2fd683a7df4fd49d1068eba2
category: bugfix
bugzilla: 186896, https://gitee.com/src-openeuler/kernel/issues/I5GZC8
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=5ccc944dce3df5fd2fd683a7df4fd49d1068eba2

-------------------------------------------------

We had an off-by-one error which meant that we never marked the first page
in a read as accessed.  This was visible as a slowdown when re-reading
a file as pages were being evicted from cache too soon.  In reviewing
this code, we noticed a second bug where a multi-page folio would be
marked as accessed multiple times when doing reads that were less than
the size of the folio.

Abstract the comparison of whether two file positions are in the same
folio into a new function, fixing both of these bugs.

Reported-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>

Conflict: folios is not supported yet
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 mm/filemap.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index 9e209e8a3b0d..edb94663c5df 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2437,6 +2437,13 @@ static int generic_file_buffered_read_get_pages(struct kiocb *iocb,
 	goto find_page;
 }
 
+static inline bool pos_same_page(loff_t pos1, loff_t pos2, struct page *page)
+{
+	unsigned int shift = page_shift(page);
+
+	return (pos1 >> shift == pos2 >> shift);
+}
+
 /**
  * generic_file_buffered_read - generic file read routine
  * @iocb:	the iocb to read
@@ -2527,11 +2534,10 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
 		writably_mapped = mapping_writably_mapped(mapping);
 
 		/*
-		 * When a sequential read accesses a page several times, only
+		 * When a read accesses a page several times, only
 		 * mark it as accessed the first time.
 		 */
-		if (iocb->ki_pos >> PAGE_SHIFT !=
-		    ra->prev_pos >> PAGE_SHIFT)
+		if (pos_same_page(iocb->ki_pos, ra->prev_pos -1, pages[0]))
 			mark_page_accessed(pages[0]);
 		for (i = 1; i < pg_nr; i++)
 			mark_page_accessed(pages[i]);
-- 
Gitee


From 5399ee65708b987d36cb7ad7fbb33e5e8aff126d Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@huawei.com>
Date: Tue, 19 Jul 2022 16:51:13 +0800
Subject: [PATCH 478/674] nbd: don't clear 'NBD_CMD_INFLIGHT' flag if request
 is not completed

mainline inclusion
from mainline-v5.19-rc1
commit 2895f1831e911ca87d4efdf43e35eb72a0c7e66e
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5H32C
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=2895f1831e911ca87d4efdf43e35eb72a0c7e66e

--------------------------------

Otherwise io will hung because request will only be completed if the
cmd has the flag 'NBD_CMD_INFLIGHT'.

Fixes: 07175cb1baf4 ("nbd: make sure request completion won't concurrent")
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Link: https://lore.kernel.org/r/20220521073749.3146892-4-yukuai3@huawei.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/block/nbd.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 873ce6f71d85..f7bc67e74390 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -395,13 +395,14 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
 	if (!mutex_trylock(&cmd->lock))
 		return BLK_EH_RESET_TIMER;
 
-	if (!__test_and_clear_bit(NBD_CMD_INFLIGHT, &cmd->flags)) {
+	if (!test_bit(NBD_CMD_INFLIGHT, &cmd->flags)) {
 		mutex_unlock(&cmd->lock);
 		return BLK_EH_DONE;
 	}
 
 	if (!refcount_inc_not_zero(&nbd->config_refs)) {
 		cmd->status = BLK_STS_TIMEOUT;
+		__clear_bit(NBD_CMD_INFLIGHT, &cmd->flags);
 		mutex_unlock(&cmd->lock);
 		goto done;
 	}
@@ -470,6 +471,7 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
 	dev_err_ratelimited(nbd_to_dev(nbd), "Connection timed out\n");
 	set_bit(NBD_RT_TIMEDOUT, &config->runtime_flags);
 	cmd->status = BLK_STS_IOERR;
+	__clear_bit(NBD_CMD_INFLIGHT, &cmd->flags);
 	mutex_unlock(&cmd->lock);
 	sock_shutdown(nbd);
 	nbd_config_put(nbd);
@@ -737,7 +739,7 @@ static struct nbd_cmd *nbd_handle_reply(struct nbd_device *nbd, int index,
 	cmd = blk_mq_rq_to_pdu(req);
 
 	mutex_lock(&cmd->lock);
-	if (!__test_and_clear_bit(NBD_CMD_INFLIGHT, &cmd->flags)) {
+	if (!test_bit(NBD_CMD_INFLIGHT, &cmd->flags)) {
 		dev_err(disk_to_dev(nbd->disk), "Suspicious reply %d (status %u flags %lu)",
 			tag, cmd->status, cmd->flags);
 		ret = -ENOENT;
@@ -844,8 +846,16 @@ static void recv_work(struct work_struct *work)
 		}
 
 		rq = blk_mq_rq_from_pdu(cmd);
-		if (likely(!blk_should_fake_timeout(rq->q)))
-			blk_mq_complete_request(rq);
+		if (likely(!blk_should_fake_timeout(rq->q))) {
+			bool complete;
+
+			mutex_lock(&cmd->lock);
+			complete = __test_and_clear_bit(NBD_CMD_INFLIGHT,
+							&cmd->flags);
+			mutex_unlock(&cmd->lock);
+			if (complete)
+				blk_mq_complete_request(rq);
+		}
 		percpu_ref_put(&q->q_usage_counter);
 	}
 
-- 
Gitee


From bc5bd4749331fde731378d46148191ac6403843b Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@huawei.com>
Date: Tue, 19 Jul 2022 16:51:14 +0800
Subject: [PATCH 479/674] nbd: fix io hung while disconnecting device

mainline inclusion
from mainline-v5.19-rc1
commit 09dadb5985023e27d4740ebd17e6fea4640110e5
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5H32C
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=09dadb5985023e27d4740ebd17e6fea4640110e5

--------------------------------

In our tests, "qemu-nbd" triggers a io hung:

INFO: task qemu-nbd:11445 blocked for more than 368 seconds.
      Not tainted 5.18.0-rc3-next-20220422-00003-g2176915513ca #884
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
task:qemu-nbd        state:D stack:    0 pid:11445 ppid:     1 flags:0x00000000
Call Trace:
 <TASK>
 __schedule+0x480/0x1050
 ? _raw_spin_lock_irqsave+0x3e/0xb0
 schedule+0x9c/0x1b0
 blk_mq_freeze_queue_wait+0x9d/0xf0
 ? ipi_rseq+0x70/0x70
 blk_mq_freeze_queue+0x2b/0x40
 nbd_add_socket+0x6b/0x270 [nbd]
 nbd_ioctl+0x383/0x510 [nbd]
 blkdev_ioctl+0x18e/0x3e0
 __x64_sys_ioctl+0xac/0x120
 do_syscall_64+0x35/0x80
 entry_SYSCALL_64_after_hwframe+0x44/0xae
RIP: 0033:0x7fd8ff706577
RSP: 002b:00007fd8fcdfebf8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
RAX: ffffffffffffffda RBX: 0000000040000000 RCX: 00007fd8ff706577
RDX: 000000000000000d RSI: 000000000000ab00 RDI: 000000000000000f
RBP: 000000000000000f R08: 000000000000fbe8 R09: 000055fe497c62b0
R10: 00000002aff20000 R11: 0000000000000246 R12: 000000000000006d
R13: 0000000000000000 R14: 00007ffe82dc5e70 R15: 00007fd8fcdff9c0

"qemu-ndb -d" will call ioctl 'NBD_DISCONNECT' first, however, following
message was found:

block nbd0: Send disconnect failed -32

Which indicate that something is wrong with the server. Then,
"qemu-nbd -d" will call ioctl 'NBD_CLEAR_SOCK', however ioctl can't clear
requests after commit 2516ab1543fd("nbd: only clear the queue on device
teardown"). And in the meantime, request can't complete through timeout
because nbd_xmit_timeout() will always return 'BLK_EH_RESET_TIMER', which
means such request will never be completed in this situation.

Now that the flag 'NBD_CMD_INFLIGHT' can make sure requests won't
complete multiple times, switch back to call nbd_clear_sock() in
nbd_clear_sock_ioctl(), so that inflight requests can be cleared.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Link: https://lore.kernel.org/r/20220521073749.3146892-5-yukuai3@huawei.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/block/nbd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index f7bc67e74390..b45f4e5585b5 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -1424,7 +1424,7 @@ static int nbd_start_device_ioctl(struct nbd_device *nbd, struct block_device *b
 static void nbd_clear_sock_ioctl(struct nbd_device *nbd,
 				 struct block_device *bdev)
 {
-	sock_shutdown(nbd);
+	nbd_clear_sock(nbd);
 	__invalidate_device(bdev, true);
 	nbd_bdev_reset(bdev);
 	if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF,
-- 
Gitee


From 7a36a8c476f698de1351166d7f661e29e0404e3d Mon Sep 17 00:00:00 2001
From: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
Date: Tue, 19 Jul 2022 16:51:15 +0800
Subject: [PATCH 480/674] xen/arm: Fix race in RB-tree based P2M accounting

mainline inclusion
from mainline-v5.19-rc6
commit b75cd218274e01d026dc5240e86fdeb44bbed0c8
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5G5TV
CVE: CVE-2022-33744

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b75cd218274e01d026dc5240e86fdeb44bbed0c8

--------------------------------

xen/arm: Fix race in RB-tree based P2M accounting

During the PV driver life cycle the mappings are added to
the RB-tree by set_foreign_p2m_mapping(), which is called from
gnttab_map_refs() and are removed by clear_foreign_p2m_mapping()
which is called from gnttab_unmap_refs(). As both functions end
up calling __set_phys_to_machine_multi() which updates the RB-tree,
this function can be called concurrently.

There is already a "p2m_lock" to protect against concurrent accesses,
but the problem is that the first read of "phys_to_mach.rb_node"
in __set_phys_to_machine_multi() is not covered by it, so this might
lead to the incorrect mappings update (removing in our case) in RB-tree.

In my environment the related issue happens rarely and only when
PV net backend is running, the xen_add_phys_to_mach_entry() claims
that it cannot add new pfn <-> mfn mapping to the tree since it is
already exists which results in a failure when mapping foreign pages.

But there might be other bad consequences related to the non-protected
root reads such use-after-free, etc.

While at it, also fix the similar usage in __pfn_to_mfn(), so
initialize "struct rb_node *n" with the "p2m_lock" held in both
functions to avoid possible bad consequences.

This is CVE-2022-33744 / XSA-406.

Signed-off-by: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
Signed-off-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Zhao Wenhui <zhaowenhui8@huawei.com>
Reviewed-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Reviewed-by: Chen Hui <judy.chenhui@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 arch/arm/xen/p2m.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/arm/xen/p2m.c b/arch/arm/xen/p2m.c
index acb464547a54..4a1991a103ea 100644
--- a/arch/arm/xen/p2m.c
+++ b/arch/arm/xen/p2m.c
@@ -62,11 +62,12 @@ static int xen_add_phys_to_mach_entry(struct xen_p2m_entry *new)
 
 unsigned long __pfn_to_mfn(unsigned long pfn)
 {
-	struct rb_node *n = phys_to_mach.rb_node;
+	struct rb_node *n;
 	struct xen_p2m_entry *entry;
 	unsigned long irqflags;
 
 	read_lock_irqsave(&p2m_lock, irqflags);
+	n = phys_to_mach.rb_node;
 	while (n) {
 		entry = rb_entry(n, struct xen_p2m_entry, rbnode_phys);
 		if (entry->pfn <= pfn &&
@@ -153,10 +154,11 @@ bool __set_phys_to_machine_multi(unsigned long pfn,
 	int rc;
 	unsigned long irqflags;
 	struct xen_p2m_entry *p2m_entry;
-	struct rb_node *n = phys_to_mach.rb_node;
+	struct rb_node *n;
 
 	if (mfn == INVALID_P2M_ENTRY) {
 		write_lock_irqsave(&p2m_lock, irqflags);
+		n = phys_to_mach.rb_node;
 		while (n) {
 			p2m_entry = rb_entry(n, struct xen_p2m_entry, rbnode_phys);
 			if (p2m_entry->pfn <= pfn &&
-- 
Gitee


From 5809235f957616012f0a5a38f53a8f155588b457 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <u.kleine-koenig@pengutronix.de>
Date: Tue, 19 Jul 2022 16:51:16 +0800
Subject: [PATCH 481/674] crypto: hisilicon - use dev_driver_string() instead
 of pci_dev->driver->name
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

mainline inclusion
from mainline-v5.16-rc1
commit 1fbbcffd0ee1
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=1fbbcffd0ee1

----------------------------------------------------------------------

Replace dev->driver_name() by dev_driver_string() for the corresponding
struct device.  This is a step toward removing pci_dev->driver.

[bhelgaas: split to separate patch]
Link: https://lore.kernel.org/r/20211004125935.2300113-8-u.kleine-koenig@pengutronix.de
Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Jiangshui Yang <yangjiangshui@h-partners.com>
Reviewed-by: Yang Shen <shenyang39@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/crypto/hisilicon/qm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c
index 57de083e8967..56e19062ba44 100644
--- a/drivers/crypto/hisilicon/qm.c
+++ b/drivers/crypto/hisilicon/qm.c
@@ -3220,7 +3220,7 @@ static int qm_alloc_uacce(struct hisi_qm *qm)
 	};
 	int ret;
 
-	ret = strscpy(interface.name, pdev->driver->name,
+	ret = strscpy(interface.name, dev_driver_string(&pdev->dev),
 		      sizeof(interface.name));
 	if (ret < 0)
 		return -ENAMETOOLONG;
-- 
Gitee


From 6c0fc54930cb6f16f1d7c8b8a2d0ce65778493c4 Mon Sep 17 00:00:00 2001
From: Yang Guang <yang.guang5@zte.com.cn>
Date: Tue, 19 Jul 2022 16:51:17 +0800
Subject: [PATCH 482/674] crypto: hisilicon/hpre - use swap() to make code
 cleaner

mainline inclusion
from mainline-v5.17-rc1
commit 574c833ef3a6
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=574c833ef3a6

----------------------------------------------------------------------

Use the macro 'swap()' defined in 'include/linux/minmax.h' to avoid
opencoding it.

Reported-by: Zeal Robot <zealci@zte.com.cn>
Signed-off-by: Yang Guang <yang.guang5@zte.com.cn>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jiangshui Yang <yangjiangshui@h-partners.com>
Reviewed-by: Yang Shen <shenyang39@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/crypto/hisilicon/hpre/hpre_crypto.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/crypto/hisilicon/hpre/hpre_crypto.c b/drivers/crypto/hisilicon/hpre/hpre_crypto.c
index a032c192ef1d..0f1724d355b8 100644
--- a/drivers/crypto/hisilicon/hpre/hpre_crypto.c
+++ b/drivers/crypto/hisilicon/hpre/hpre_crypto.c
@@ -1177,13 +1177,10 @@ static void hpre_rsa_exit_tfm(struct crypto_akcipher *tfm)
 static void hpre_key_to_big_end(u8 *data, int len)
 {
 	int i, j;
-	u8 tmp;
 
 	for (i = 0; i < len / 2; i++) {
 		j = len - i - 1;
-		tmp = data[j];
-		data[j] = data[i];
-		data[i] = tmp;
+		swap(data[j], data[i]);
 	}
 }
 
-- 
Gitee


From f7093db71f3180d4abb378debc5b3206c58e0390 Mon Sep 17 00:00:00 2001
From: Kai Ye <yekai13@huawei.com>
Date: Tue, 19 Jul 2022 16:51:18 +0800
Subject: [PATCH 483/674] crypto: hisilicon - modify the value of engine type
 rate

mainline inclusion
from mainline-v5.17-rc1
commit 376a5c3cdd7c
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=376a5c3cdd7c

----------------------------------------------------------------------

Modify the value of type rate from new QM spec.

Signed-off-by: Kai Ye <yekai13@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jiangshui Yang <yangjiangshui@h-partners.com>
Reviewed-by: Yang Shen <shenyang39@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/crypto/hisilicon/hpre/hpre_main.c | 2 +-
 drivers/crypto/hisilicon/sec2/sec_main.c  | 2 +-
 drivers/crypto/hisilicon/zip/zip_main.c   | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/crypto/hisilicon/hpre/hpre_main.c b/drivers/crypto/hisilicon/hpre/hpre_main.c
index 65a641396c07..ebfab3e14499 100644
--- a/drivers/crypto/hisilicon/hpre/hpre_main.c
+++ b/drivers/crypto/hisilicon/hpre/hpre_main.c
@@ -103,7 +103,7 @@
 #define HPRE_QM_PM_FLR			BIT(11)
 #define HPRE_QM_SRIOV_FLR		BIT(12)
 
-#define HPRE_SHAPER_TYPE_RATE		128
+#define HPRE_SHAPER_TYPE_RATE		640
 #define HPRE_VIA_MSI_DSM		1
 #define HPRE_SQE_MASK_OFFSET		8
 #define HPRE_SQE_MASK_LEN		24
diff --git a/drivers/crypto/hisilicon/sec2/sec_main.c b/drivers/crypto/hisilicon/sec2/sec_main.c
index 90551bf38b52..26d3ab1d308b 100644
--- a/drivers/crypto/hisilicon/sec2/sec_main.c
+++ b/drivers/crypto/hisilicon/sec2/sec_main.c
@@ -105,7 +105,7 @@
 
 #define SEC_SQE_MASK_OFFSET		64
 #define SEC_SQE_MASK_LEN		48
-#define SEC_SHAPER_TYPE_RATE		128
+#define SEC_SHAPER_TYPE_RATE		400
 
 struct sec_hw_error {
 	u32 int_msk;
diff --git a/drivers/crypto/hisilicon/zip/zip_main.c b/drivers/crypto/hisilicon/zip/zip_main.c
index 9c4fb0af5ebd..678f8b58ec42 100644
--- a/drivers/crypto/hisilicon/zip/zip_main.c
+++ b/drivers/crypto/hisilicon/zip/zip_main.c
@@ -103,8 +103,8 @@
 #define HZIP_PREFETCH_ENABLE		(~(BIT(26) | BIT(17) | BIT(0)))
 #define HZIP_SVA_PREFETCH_DISABLE	BIT(26)
 #define HZIP_SVA_DISABLE_READY		(BIT(26) | BIT(30))
-#define HZIP_SHAPER_RATE_COMPRESS	252
-#define HZIP_SHAPER_RATE_DECOMPRESS	229
+#define HZIP_SHAPER_RATE_COMPRESS	750
+#define HZIP_SHAPER_RATE_DECOMPRESS	140
 #define HZIP_DELAY_1_US		1
 #define HZIP_POLL_TIMEOUT_US	1000
 
-- 
Gitee


From 42b23856b36ff79226eb4e9022e9ed1b1a55e56a Mon Sep 17 00:00:00 2001
From: Kai Ye <yekai13@huawei.com>
Date: Tue, 19 Jul 2022 16:51:19 +0800
Subject: [PATCH 484/674] crypto: hisilicon/qm - modify the value of qos
 initialization

mainline inclusion
from mainline-v5.17-rc1
commit ecc7169d4f73
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ecc7169d4f73

----------------------------------------------------------------------

The value of qos should be reset after flr resetting or device
resetting. So set the max of qos value for every function. Then
update the value of qos when user writing the alg_qos.

Signed-off-by: Kai Ye <yekai13@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jiangshui Yang <yangjiangshui@h-partners.com>
Reviewed-by: Yang Shen <shenyang39@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/crypto/hisilicon/qm.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c
index 56e19062ba44..fea69f7aadba 100644
--- a/drivers/crypto/hisilicon/qm.c
+++ b/drivers/crypto/hisilicon/qm.c
@@ -1257,10 +1257,10 @@ static int qm_set_vft_common(struct hisi_qm *qm, enum vft_type type,
 
 static int qm_shaper_init_vft(struct hisi_qm *qm, u32 fun_num)
 {
+	u32 qos = qm->factor[fun_num].func_qos;
 	int ret, i;
 
-	qm->factor[fun_num].func_qos = QM_QOS_MAX_VAL;
-	ret = qm_get_shaper_para(QM_QOS_MAX_VAL * QM_QOS_RATE, &qm->factor[fun_num]);
+	ret = qm_get_shaper_para(qos * QM_QOS_RATE, &qm->factor[fun_num]);
 	if (ret) {
 		dev_err(&qm->pdev->dev, "failed to calculate shaper parameter!\n");
 		return ret;
@@ -5838,13 +5838,15 @@ static int hisi_qp_alloc_memory(struct hisi_qm *qm)
 static int hisi_qm_memory_init(struct hisi_qm *qm)
 {
 	struct device *dev = &qm->pdev->dev;
-	int ret, total_vfs;
+	int ret, total_func, i;
 	size_t off = 0;
 
-	total_vfs = pci_sriov_get_totalvfs(qm->pdev);
-	qm->factor = kcalloc(total_vfs + 1, sizeof(struct qm_shaper_factor), GFP_KERNEL);
+	total_func = pci_sriov_get_totalvfs(qm->pdev) + 1;
+	qm->factor = kcalloc(total_func, sizeof(struct qm_shaper_factor), GFP_KERNEL);
 	if (!qm->factor)
 		return -ENOMEM;
+	for (i = 0; i < total_func; i++)
+		qm->factor[i].func_qos = QM_QOS_MAX_VAL;
 
 #define QM_INIT_BUF(qm, type, num) do { \
 	(qm)->type = ((qm)->qdma.va + (off)); \
-- 
Gitee


From 9412be3d72142ad991e79d082a2742305f69fcf3 Mon Sep 17 00:00:00 2001
From: Kai Ye <yekai13@huawei.com>
Date: Tue, 19 Jul 2022 16:51:20 +0800
Subject: [PATCH 485/674] crypto: hisilicon/qm - some optimizations of ths qos
 write process

mainline inclusion
from mainline-v5.17-rc1
commit 488f30d4b8b3
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=488f30d4b8b3

----------------------------------------------------------------------

1. Optimize overly long functions.
2. Fix the format symbol does not match the actual type.
3. Use the PCI_DEVFN to get the function id.

Signed-off-by: Kai Ye <yekai13@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jiangshui Yang <yangjiangshui@h-partners.com>
Reviewed-by: Yang Shen <shenyang39@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/crypto/hisilicon/qm.c | 87 ++++++++++++++++++-----------------
 1 file changed, 46 insertions(+), 41 deletions(-)

diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c
index fea69f7aadba..9e578eba295f 100644
--- a/drivers/crypto/hisilicon/qm.c
+++ b/drivers/crypto/hisilicon/qm.c
@@ -4352,66 +4352,69 @@ static ssize_t qm_qos_value_init(const char *buf, unsigned long *val)
 	return 0;
 }
 
+static ssize_t qm_get_qos_value(struct hisi_qm *qm, const char *buf,
+			       unsigned long *val,
+			       unsigned int *fun_index)
+{
+	char tbuf_bdf[QM_DBG_READ_LEN] = {0};
+	char val_buf[QM_QOS_VAL_MAX_LEN] = {0};
+	u32 tmp1, device, function;
+	int ret, bus;
+
+	ret = sscanf(buf, "%s %s", tbuf_bdf, val_buf);
+	if (ret != QM_QOS_PARAM_NUM)
+		return -EINVAL;
+
+	ret = qm_qos_value_init(val_buf, val);
+	if (*val == 0 || *val > QM_QOS_MAX_VAL || ret) {
+		pci_err(qm->pdev, "input qos value is error, please set 1~1000!\n");
+		return -EINVAL;
+	}
+
+	ret = sscanf(tbuf_bdf, "%u:%x:%u.%u", &tmp1, &bus, &device, &function);
+	if (ret != QM_QOS_BDF_PARAM_NUM) {
+		pci_err(qm->pdev, "input pci bdf value is error!\n");
+		return -EINVAL;
+	}
+
+	*fun_index = PCI_DEVFN(device, function);
+
+	return 0;
+}
+
 static ssize_t qm_algqos_write(struct file *filp, const char __user *buf,
 			       size_t count, loff_t *pos)
 {
 	struct hisi_qm *qm = filp->private_data;
 	char tbuf[QM_DBG_READ_LEN];
-	int tmp1, bus, device, function;
-	char tbuf_bdf[QM_DBG_READ_LEN] = {0};
-	char val_buf[QM_QOS_VAL_MAX_LEN] = {0};
 	unsigned int fun_index;
-	unsigned long val = 0;
+	unsigned long val;
 	int len, ret;
 
 	if (qm->fun_type == QM_HW_VF)
 		return -EINVAL;
 
-	/* Mailbox and reset cannot be operated at the same time */
-	if (test_and_set_bit(QM_RESETTING, &qm->misc_ctl)) {
-		pci_err(qm->pdev, "dev resetting, write alg qos failed!\n");
-		return -EAGAIN;
-	}
-
-	if (*pos != 0) {
-		ret = 0;
-		goto err_get_status;
-	}
+	if (*pos != 0)
+		return 0;
 
-	if (count >= QM_DBG_READ_LEN) {
-		ret = -ENOSPC;
-		goto err_get_status;
-	}
+	if (count >= QM_DBG_READ_LEN)
+		return -ENOSPC;
 
 	len = simple_write_to_buffer(tbuf, QM_DBG_READ_LEN - 1, pos, buf, count);
-	if (len < 0) {
-		ret = len;
-		goto err_get_status;
-	}
+	if (len < 0)
+		return len;
 
 	tbuf[len] = '\0';
-	ret = sscanf(tbuf, "%s %s", tbuf_bdf, val_buf);
-	if (ret != QM_QOS_PARAM_NUM) {
-		ret = -EINVAL;
-		goto err_get_status;
-	}
-
-	ret = qm_qos_value_init(val_buf, &val);
-	if (val == 0 || val > QM_QOS_MAX_VAL || ret) {
-		pci_err(qm->pdev, "input qos value is error, please set 1~1000!\n");
-		ret = -EINVAL;
-		goto err_get_status;
-	}
+	ret = qm_get_qos_value(qm, tbuf, &val, &fun_index);
+	if (ret)
+		return ret;
 
-	ret = sscanf(tbuf_bdf, "%d:%x:%d.%d", &tmp1, &bus, &device, &function);
-	if (ret != QM_QOS_BDF_PARAM_NUM) {
-		pci_err(qm->pdev, "input pci bdf value is error!\n");
-		ret = -EINVAL;
-		goto err_get_status;
+	/* Mailbox and reset cannot be operated at the same time */
+	if (test_and_set_bit(QM_RESETTING, &qm->misc_ctl)) {
+		pci_err(qm->pdev, "dev resetting, write alg qos failed!\n");
+		return -EAGAIN;
 	}
 
-	fun_index = device * 8 + function;
-
 	ret = qm_pm_get_sync(qm);
 	if (ret) {
 		ret = -EINVAL;
@@ -4425,6 +4428,8 @@ static ssize_t qm_algqos_write(struct file *filp, const char __user *buf,
 		goto err_put_sync;
 	}
 
+	pci_info(qm->pdev, "the qos value of function%u is set to %lu.\n",
+		 fun_index, val);
 	ret = count;
 
 err_put_sync:
-- 
Gitee


From 990c7e184747263ca37f0ca88419e62d7c6b4889 Mon Sep 17 00:00:00 2001
From: Kai Ye <yekai13@huawei.com>
Date: Tue, 19 Jul 2022 16:51:21 +0800
Subject: [PATCH 486/674] crypto: hisilicon/qm - simplified the calculation of
 qos shaper parameters

mainline inclusion
from mainline-v5.17-rc1
commit 13389403fe8a
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=13389403fe8a

----------------------------------------------------------------------

Some optimize for the calculation of qos shaper parameters.
and modify the comments.

Signed-off-by: Kai Ye <yekai13@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jiangshui Yang <yangjiangshui@h-partners.com>
Reviewed-by: Yang Shen <shenyang39@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/crypto/hisilicon/qm.c | 84 +++++++++++++++++++++++------------
 1 file changed, 55 insertions(+), 29 deletions(-)

diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c
index 9e578eba295f..46aeb88be4ed 100644
--- a/drivers/crypto/hisilicon/qm.c
+++ b/drivers/crypto/hisilicon/qm.c
@@ -505,10 +505,30 @@ static const char * const qp_s[] = {
 	"none", "init", "start", "stop", "close",
 };
 
-static const u32 typical_qos_val[QM_QOS_TYPICAL_NUM] = {100, 250, 500, 1000,
-						10000, 25000, 50000, 100000};
-static const u32 typical_qos_cbs_s[QM_QOS_TYPICAL_NUM] = {9, 10, 11, 12, 16,
-							 17, 18, 19};
+struct qm_typical_qos_table {
+	u32 start;
+	u32 end;
+	u32 val;
+};
+
+/* the qos step is 100 */
+static struct qm_typical_qos_table shaper_cir_s[] = {
+	{100, 100, 4},
+	{200, 200, 3},
+	{300, 500, 2},
+	{600, 1000, 1},
+	{1100, 100000, 0},
+};
+
+static struct qm_typical_qos_table shaper_cbs_s[] = {
+	{100, 200, 9},
+	{300, 500, 11},
+	{600, 1000, 12},
+	{1100, 10000, 16},
+	{10100, 25000, 17},
+	{25100, 50000, 18},
+	{50100, 100000, 19}
+};
 
 static bool qm_avail_state(struct hisi_qm *qm, enum qm_state new)
 {
@@ -1119,12 +1139,14 @@ static void qm_init_prefetch(struct hisi_qm *qm)
 }
 
 /*
+ * acc_shaper_para_calc() Get the IR value by the qos formula, the return value
+ * is the expected qos calculated.
  * the formula:
  * IR = X Mbps if ir = 1 means IR = 100 Mbps, if ir = 10000 means = 10Gbps
  *
- *		        IR_b * (2 ^ IR_u) * 8
- * IR(Mbps) * 10 ^ -3 = -------------------------
- *		        Tick * (2 ^ IR_s)
+ *		IR_b * (2 ^ IR_u) * 8000
+ * IR(Mbps) = -------------------------
+ *		  Tick * (2 ^ IR_s)
  */
 static u32 acc_shaper_para_calc(u64 cir_b, u64 cir_u, u64 cir_s)
 {
@@ -1134,17 +1156,28 @@ static u32 acc_shaper_para_calc(u64 cir_b, u64 cir_u, u64 cir_s)
 
 static u32 acc_shaper_calc_cbs_s(u32 ir)
 {
+	int table_size = ARRAY_SIZE(shaper_cbs_s);
 	int i;
 
-	if (ir < typical_qos_val[0])
-		return QM_SHAPER_MIN_CBS_S;
+	for (i = 0; i < table_size; i++) {
+		if (ir >= shaper_cbs_s[i].start && ir <= shaper_cbs_s[i].end)
+			return shaper_cbs_s[i].val;
+	}
 
-	for (i = 1; i < QM_QOS_TYPICAL_NUM; i++) {
-		if (ir >= typical_qos_val[i - 1] && ir < typical_qos_val[i])
-			return typical_qos_cbs_s[i - 1];
+	return QM_SHAPER_MIN_CBS_S;
+}
+
+static u32 acc_shaper_calc_cir_s(u32 ir)
+{
+	int table_size = ARRAY_SIZE(shaper_cir_s);
+	int i;
+
+	for (i = 0; i < table_size; i++) {
+		if (ir >= shaper_cir_s[i].start && ir <= shaper_cir_s[i].end)
+			return shaper_cir_s[i].val;
 	}
 
-	return typical_qos_cbs_s[QM_QOS_TYPICAL_NUM - 1];
+	return 0;
 }
 
 static int qm_get_shaper_para(u32 ir, struct qm_shaper_factor *factor)
@@ -1153,25 +1186,18 @@ static int qm_get_shaper_para(u32 ir, struct qm_shaper_factor *factor)
 	u32 error_rate;
 
 	factor->cbs_s = acc_shaper_calc_cbs_s(ir);
+	cir_s = acc_shaper_calc_cir_s(ir);
 
 	for (cir_b = QM_QOS_MIN_CIR_B; cir_b <= QM_QOS_MAX_CIR_B; cir_b++) {
 		for (cir_u = 0; cir_u <= QM_QOS_MAX_CIR_U; cir_u++) {
-			for (cir_s = 0; cir_s <= QM_QOS_MAX_CIR_S; cir_s++) {
-				/** the formula is changed to:
-				 *	   IR_b * (2 ^ IR_u) * DIVISOR_CLK
-				 * IR(Mbps) = -------------------------
-				 *	       768 * (2 ^ IR_s)
-				 */
-				ir_calc = acc_shaper_para_calc(cir_b, cir_u,
-							       cir_s);
-				error_rate = QM_QOS_EXPAND_RATE * (u32)abs(ir_calc - ir) / ir;
-				if (error_rate <= QM_QOS_MIN_ERROR_RATE) {
-					factor->cir_b = cir_b;
-					factor->cir_u = cir_u;
-					factor->cir_s = cir_s;
-
-					return 0;
-				}
+			ir_calc = acc_shaper_para_calc(cir_b, cir_u, cir_s);
+
+			error_rate = QM_QOS_EXPAND_RATE * (u32)abs(ir_calc - ir) / ir;
+			if (error_rate <= QM_QOS_MIN_ERROR_RATE) {
+				factor->cir_b = cir_b;
+				factor->cir_u = cir_u;
+				factor->cir_s = cir_s;
+				return 0;
 			}
 		}
 	}
-- 
Gitee


From deca7e9bccabbce816c9f6b03fe122d8785bcde3 Mon Sep 17 00:00:00 2001
From: Weili Qian <qianweili@huawei.com>
Date: Tue, 19 Jul 2022 16:51:22 +0800
Subject: [PATCH 487/674] crypto: hisilicon/qm - fix incorrect return value of
 hisi_qm_resume()

mainline inclusion
from mainline-v5.17-rc1
commit 3f9dd4c802b9
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3f9dd4c802b9

----------------------------------------------------------------------

When hisi_qm_resume() returns 0, it indicates that the device has started
successfully.  If the device fails to start, hisi_qm_resume() needs to
return the actual error code to the caller instead of 0.

Fixes: d7ea53395b72 ("crypto: hisilicon - add runtime PM ops")
Signed-off-by: Weili Qian <qianweili@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jiangshui Yang <yangjiangshui@h-partners.com>
Reviewed-by: Yang Shen <shenyang39@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/crypto/hisilicon/qm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c
index 46aeb88be4ed..23933c8bd88d 100644
--- a/drivers/crypto/hisilicon/qm.c
+++ b/drivers/crypto/hisilicon/qm.c
@@ -6170,7 +6170,7 @@ int hisi_qm_resume(struct device *dev)
 	if (ret)
 		pci_err(pdev, "failed to start qm(%d)\n", ret);
 
-	return 0;
+	return ret;
 }
 EXPORT_SYMBOL_GPL(hisi_qm_resume);
 
-- 
Gitee


From f25cf6ed884994315fbaada94ad24dac3fb911d2 Mon Sep 17 00:00:00 2001
From: Weili Qian <qianweili@huawei.com>
Date: Tue, 19 Jul 2022 16:51:23 +0800
Subject: [PATCH 488/674] crypto: hisilicon/hpre - fix memory leak in
 hpre_curve25519_src_init()

mainline inclusion
from mainline-v5.17-rc1
commit 51fa916b81e5
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=51fa916b81e5

----------------------------------------------------------------------

hpre_curve25519_src_init() allocates memory for 'ptr' before calling
memcmp(). If memcmp() returns 0, the function will return '-EINVAL'
without freeing memory.

Signed-off-by: Weili Qian <qianweili@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jiangshui Yang <yangjiangshui@h-partners.com>
Reviewed-by: Yang Shen <shenyang39@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/crypto/hisilicon/hpre/hpre_crypto.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/crypto/hisilicon/hpre/hpre_crypto.c b/drivers/crypto/hisilicon/hpre/hpre_crypto.c
index 0f1724d355b8..97d54c1465c2 100644
--- a/drivers/crypto/hisilicon/hpre/hpre_crypto.c
+++ b/drivers/crypto/hisilicon/hpre/hpre_crypto.c
@@ -1862,7 +1862,7 @@ static int hpre_curve25519_src_init(struct hpre_asym_request *hpre_req,
 	 */
 	if (memcmp(ptr, p, ctx->key_sz) == 0) {
 		dev_err(dev, "gx is p!\n");
-		return -EINVAL;
+		goto err;
 	} else if (memcmp(ptr, p, ctx->key_sz) > 0) {
 		hpre_curve25519_src_modulo_p(ptr);
 	}
-- 
Gitee


From 53bff6015473369f097159156f2ae9544db4d6b4 Mon Sep 17 00:00:00 2001
From: Weili Qian <qianweili@huawei.com>
Date: Tue, 19 Jul 2022 16:51:24 +0800
Subject: [PATCH 489/674] crypto: hisilicon/qm - disable qm clock-gating

mainline inclusion
from mainline-v5.17-rc1
commit 4cee0700cf1d
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=4cee0700cf1d

----------------------------------------------------------------------

For Kunpeng930, if qm clock-gating is enabled, rate limiter
will be inaccurate. Therefore, disable clock-gating before doing task.

Signed-off-by: Weili Qian <qianweili@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Reviewed-by: Yang Shen <shenyang39@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/crypto/hisilicon/qm.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c
index 23933c8bd88d..60ca59d11702 100644
--- a/drivers/crypto/hisilicon/qm.c
+++ b/drivers/crypto/hisilicon/qm.c
@@ -126,6 +126,8 @@
 #define QM_CQC_VFT			0x1
 #define QM_VFT_CFG			0x100060
 #define QM_VFT_CFG_OP_ENABLE		0x100054
+#define QM_PM_CTRL			0x100148
+#define QM_IDLE_DISABLE			BIT(9)
 
 #define QM_VFT_CFG_DATA_L		0x100064
 #define QM_VFT_CFG_DATA_H		0x100068
@@ -800,6 +802,19 @@ static void qm_db(struct hisi_qm *qm, u16 qn, u8 cmd, u16 index, u8 priority)
 	qm->ops->qm_db(qm, qn, cmd, index, priority);
 }
 
+static void qm_disable_clock_gate(struct hisi_qm *qm)
+{
+	u32 val;
+
+	/* if qm enables clock gating in Kunpeng930, qos will be inaccurate. */
+	if (qm->ver < QM_HW_V3)
+		return;
+
+	val = readl(qm->io_base + QM_PM_CTRL);
+	val |= QM_IDLE_DISABLE;
+	writel(val, qm->io_base +  QM_PM_CTRL);
+}
+
 static int qm_dev_mem_reset(struct hisi_qm *qm)
 {
 	u32 val;
@@ -5947,6 +5962,7 @@ int hisi_qm_init(struct hisi_qm *qm)
 	}
 
 	if (qm->fun_type == QM_HW_PF) {
+		qm_disable_clock_gate(qm);
 		ret = qm_dev_mem_reset(qm);
 		if (ret) {
 			dev_err(dev, "failed to reset device memory\n");
@@ -6111,6 +6127,7 @@ static int qm_rebuild_for_resume(struct hisi_qm *qm)
 
 	qm_cmd_init(qm);
 	hisi_qm_dev_err_init(qm);
+	qm_disable_clock_gate(qm);
 	ret = qm_dev_mem_reset(qm);
 	if (ret)
 		pci_err(pdev, "failed to reset device memory\n");
-- 
Gitee


From 3753ee5611dcd2a48df0633d0cd83210d3cea803 Mon Sep 17 00:00:00 2001
From: Tom Rix <trix@redhat.com>
Date: Tue, 19 Jul 2022 16:51:25 +0800
Subject: [PATCH 490/674] crypto: hisilicon - cleanup warning in
 qm_get_qos_value()

mainline inclusion
from mainline-v5.17-rc1
commit c5d692a2335d
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c5d692a2335d

----------------------------------------------------------------------

Building with clang static analysis returns this warning:

qm.c:4382:11: warning: The left operand of '==' is a garbage value
        if (*val == 0 || *val > QM_QOS_MAX_VAL || ret) {
            ~~~~ ^

The call to qm_qos_value_init() can return an error without setting
*val.  So check ret before checking *val.

Fixes: 72b010dc33b9 ("crypto: hisilicon/qm - supports writing QoS int the host")
Signed-off-by: Tom Rix <trix@redhat.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jiangshui Yang <yangjiangshui@h-partners.com>
Reviewed-by: Yang Shen <shenyang39@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/crypto/hisilicon/qm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c
index 60ca59d11702..4e02bbf4768a 100644
--- a/drivers/crypto/hisilicon/qm.c
+++ b/drivers/crypto/hisilicon/qm.c
@@ -4407,7 +4407,7 @@ static ssize_t qm_get_qos_value(struct hisi_qm *qm, const char *buf,
 		return -EINVAL;
 
 	ret = qm_qos_value_init(val_buf, val);
-	if (*val == 0 || *val > QM_QOS_MAX_VAL || ret) {
+	if (ret || *val == 0 || *val > QM_QOS_MAX_VAL) {
 		pci_err(qm->pdev, "input qos value is error, please set 1~1000!\n");
 		return -EINVAL;
 	}
-- 
Gitee


From 00a6d7cd188ea5d2376bd3389e888c32f8c0bf2d Mon Sep 17 00:00:00 2001
From: Kai Ye <yekai13@huawei.com>
Date: Tue, 19 Jul 2022 16:51:26 +0800
Subject: [PATCH 491/674] crypto: hisilicon/sec - use the correct print format

mainline inclusion
from mainline-v5.18-rc1
commit 498382593c7c
category: cleanup
bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=498382593c7c

----------------------------------------------------------------------

Use the correct print format. Printing an unsigned int value should
use %u instead of %d.

Signed-off-by: Kai Ye <yekai13@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Reviewed-by: Yang Shen <shenyang39@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/crypto/hisilicon/sec2/sec_crypto.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/crypto/hisilicon/sec2/sec_crypto.c b/drivers/crypto/hisilicon/sec2/sec_crypto.c
index 9d0a39f60fb2..a91635c348b5 100644
--- a/drivers/crypto/hisilicon/sec2/sec_crypto.c
+++ b/drivers/crypto/hisilicon/sec2/sec_crypto.c
@@ -240,7 +240,7 @@ static void sec_req_cb(struct hisi_qp *qp, void *resp)
 
 	if (unlikely(type != type_supported)) {
 		atomic64_inc(&dfx->err_bd_cnt);
-		pr_err("err bd type [%d]\n", type);
+		pr_err("err bd type [%u]\n", type);
 		return;
 	}
 
-- 
Gitee


From c9f4c20d3e3504f8f7f8c8ee38e81303d829e64f Mon Sep 17 00:00:00 2001
From: Kai Ye <yekai13@huawei.com>
Date: Tue, 19 Jul 2022 16:51:27 +0800
Subject: [PATCH 492/674] crypto: hisilicon/qm - cleanup warning in
 qm_vf_read_qos

mainline inclusion
from mainline-v5.18-rc1
commit 05b3bade290d
category: clean up
bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=05b3bade290d

----------------------------------------------------------------------

The kernel test rebot report this warning: Uninitialized variable: ret.
The code flow may return value of ret directly. This value is an
uninitialized variable, here is fix it.

Signed-off-by: Kai Ye <yekai13@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Reviewed-by: Yang Shen <shenyang39@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/crypto/hisilicon/qm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c
index 4e02bbf4768a..bf103c8743d3 100644
--- a/drivers/crypto/hisilicon/qm.c
+++ b/drivers/crypto/hisilicon/qm.c
@@ -4308,7 +4308,7 @@ static void qm_vf_get_qos(struct hisi_qm *qm, u32 fun_num)
 static int qm_vf_read_qos(struct hisi_qm *qm)
 {
 	int cnt = 0;
-	int ret;
+	int ret = -EINVAL;
 
 	/* reset mailbox qos val */
 	qm->mb_qos = 0;
-- 
Gitee


From 2fef6b05e7b41437e9adbe48d957791c7f2b8664 Mon Sep 17 00:00:00 2001
From: Kai Ye <yekai13@huawei.com>
Date: Tue, 19 Jul 2022 16:51:28 +0800
Subject: [PATCH 493/674] crypto: hisilicon/sec - add the register
 configuration for HW V3

mainline inclusion
from mainline-v5.18-rc1
commit aec01cc8d119
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=aec01cc8d119

----------------------------------------------------------------------

Added the register configuration of the SVA mode for HW V3.

Signed-off-by: Kai Ye <yekai13@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Reviewed-by: Yang Shen <shenyang39@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/crypto/hisilicon/sec2/sec_main.c | 51 +++++++++++++++++++-----
 1 file changed, 40 insertions(+), 11 deletions(-)

diff --git a/drivers/crypto/hisilicon/sec2/sec_main.c b/drivers/crypto/hisilicon/sec2/sec_main.c
index 26d3ab1d308b..45d2b27da9ad 100644
--- a/drivers/crypto/hisilicon/sec2/sec_main.c
+++ b/drivers/crypto/hisilicon/sec2/sec_main.c
@@ -90,6 +90,10 @@
 					SEC_USER1_WB_DATA_SSV)
 #define SEC_USER1_SMMU_SVA		(SEC_USER1_SMMU_NORMAL | SEC_USER1_SVA_SET)
 #define SEC_USER1_SMMU_MASK		(~SEC_USER1_SVA_SET)
+#define SEC_INTERFACE_USER_CTRL0_REG_V3	0x302220
+#define SEC_INTERFACE_USER_CTRL1_REG_V3	0x302224
+#define SEC_USER1_SMMU_NORMAL_V3	(BIT(23) | BIT(17) | BIT(11) | BIT(5))
+#define SEC_USER1_SMMU_MASK_V3		0xFF79E79E
 #define SEC_CORE_INT_STATUS_M_ECC	BIT(2)
 
 #define SEC_PREFETCH_CFG		0x301130
@@ -335,6 +339,41 @@ static void sec_set_endian(struct hisi_qm *qm)
 	writel_relaxed(reg, qm->io_base + SEC_CONTROL_REG);
 }
 
+static void sec_engine_sva_config(struct hisi_qm *qm)
+{
+	u32 reg;
+
+	if (qm->ver > QM_HW_V2) {
+		reg = readl_relaxed(qm->io_base +
+				SEC_INTERFACE_USER_CTRL0_REG_V3);
+		reg |= SEC_USER0_SMMU_NORMAL;
+		writel_relaxed(reg, qm->io_base +
+				SEC_INTERFACE_USER_CTRL0_REG_V3);
+
+		reg = readl_relaxed(qm->io_base +
+				SEC_INTERFACE_USER_CTRL1_REG_V3);
+		reg &= SEC_USER1_SMMU_MASK_V3;
+		reg |= SEC_USER1_SMMU_NORMAL_V3;
+		writel_relaxed(reg, qm->io_base +
+				SEC_INTERFACE_USER_CTRL1_REG_V3);
+	} else {
+		reg = readl_relaxed(qm->io_base +
+				SEC_INTERFACE_USER_CTRL0_REG);
+		reg |= SEC_USER0_SMMU_NORMAL;
+		writel_relaxed(reg, qm->io_base +
+				SEC_INTERFACE_USER_CTRL0_REG);
+		reg = readl_relaxed(qm->io_base +
+				SEC_INTERFACE_USER_CTRL1_REG);
+		reg &= SEC_USER1_SMMU_MASK;
+		if (qm->use_sva)
+			reg |= SEC_USER1_SMMU_SVA;
+		else
+			reg |= SEC_USER1_SMMU_NORMAL;
+		writel_relaxed(reg, qm->io_base +
+				SEC_INTERFACE_USER_CTRL1_REG);
+	}
+}
+
 static void sec_open_sva_prefetch(struct hisi_qm *qm)
 {
 	u32 val;
@@ -426,17 +465,7 @@ static int sec_engine_init(struct hisi_qm *qm)
 	reg |= (0x1 << SEC_TRNG_EN_SHIFT);
 	writel_relaxed(reg, qm->io_base + SEC_CONTROL_REG);
 
-	reg = readl_relaxed(qm->io_base + SEC_INTERFACE_USER_CTRL0_REG);
-	reg |= SEC_USER0_SMMU_NORMAL;
-	writel_relaxed(reg, qm->io_base + SEC_INTERFACE_USER_CTRL0_REG);
-
-	reg = readl_relaxed(qm->io_base + SEC_INTERFACE_USER_CTRL1_REG);
-	reg &= SEC_USER1_SMMU_MASK;
-	if (qm->use_sva && qm->ver == QM_HW_V2)
-		reg |= SEC_USER1_SMMU_SVA;
-	else
-		reg |= SEC_USER1_SMMU_NORMAL;
-	writel_relaxed(reg, qm->io_base + SEC_INTERFACE_USER_CTRL1_REG);
+	sec_engine_sva_config(qm);
 
 	writel(SEC_SINGLE_PORT_MAX_TRANS,
 	       qm->io_base + AM_CFG_SINGLE_PORT_MAX_TRANS);
-- 
Gitee


From abe783433177f68cee95ccf27bb1caf043049740 Mon Sep 17 00:00:00 2001
From: Kai Ye <yekai13@huawei.com>
Date: Tue, 19 Jul 2022 16:51:29 +0800
Subject: [PATCH 494/674] crypto: hisilicon/sec - not need to enable sm4 extra
 mode at HW V3

mainline inclusion
from mainline-v5.18-rc1
commit f8a265282644
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f8a265282644

----------------------------------------------------------------------

It is not need to enable sm4 extra mode in at HW V3. Here is fix it.

Signed-off-by: Kai Ye <yekai13@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Reviewed-by: Yang Shen <shenyang39@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/crypto/hisilicon/sec2/sec_main.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/crypto/hisilicon/sec2/sec_main.c b/drivers/crypto/hisilicon/sec2/sec_main.c
index 45d2b27da9ad..0b9906ff69e3 100644
--- a/drivers/crypto/hisilicon/sec2/sec_main.c
+++ b/drivers/crypto/hisilicon/sec2/sec_main.c
@@ -472,9 +472,11 @@ static int sec_engine_init(struct hisi_qm *qm)
 
 	writel(SEC_SAA_ENABLE, qm->io_base + SEC_SAA_EN_REG);
 
-	/* Enable sm4 extra mode, as ctr/ecb */
-	writel_relaxed(SEC_BD_ERR_CHK_EN0,
-		       qm->io_base + SEC_BD_ERR_CHK_EN_REG0);
+	/* HW V2 enable sm4 extra mode, as ctr/ecb */
+	if (qm->ver < QM_HW_V3)
+		writel_relaxed(SEC_BD_ERR_CHK_EN0,
+			       qm->io_base + SEC_BD_ERR_CHK_EN_REG0);
+
 	/* Enable sm4 xts mode multiple iv */
 	writel_relaxed(SEC_BD_ERR_CHK_EN1,
 		       qm->io_base + SEC_BD_ERR_CHK_EN_REG1);
-- 
Gitee


From 6af5a7bb0f0743e7142bd81e610639570283aa2c Mon Sep 17 00:00:00 2001
From: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
Date: Tue, 19 Jul 2022 16:51:30 +0800
Subject: [PATCH 495/674] crypto: hisilicon/qm: Move the QM header to
 include/linux

mainline inclusion
from mainline-v5.18-rc1
commit ff5812e00d5e
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ff5812e00d5e

----------------------------------------------------------------------

Since we are going to introduce VFIO PCI HiSilicon ACC driver for live
migration in subsequent patches, move the ACC QM header file to a
common include dir.

Acked-by: Zhou Wang <wangzhou1@hisilicon.com>
Acked-by: Longfang Liu <liulongfang@huawei.com>
Acked-by: Kai Ye <yekai13@huawei.com>
Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
Link: https://lore.kernel.org/r/20220308184902.2242-2-shameerali.kolothum.thodi@huawei.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed-by: Yang Shen <shenyang39@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/crypto/hisilicon/hpre/hpre.h                         | 2 +-
 drivers/crypto/hisilicon/qm.c                                | 2 +-
 drivers/crypto/hisilicon/sec2/sec.h                          | 2 +-
 drivers/crypto/hisilicon/sgl.c                               | 2 +-
 drivers/crypto/hisilicon/zip/zip.h                           | 2 +-
 drivers/crypto/hisilicon/qm.h => include/linux/hisi_acc_qm.h | 0
 6 files changed, 5 insertions(+), 5 deletions(-)
 rename drivers/crypto/hisilicon/qm.h => include/linux/hisi_acc_qm.h (100%)

diff --git a/drivers/crypto/hisilicon/hpre/hpre.h b/drivers/crypto/hisilicon/hpre/hpre.h
index e0b4a1982ee9..9a0558ed82f9 100644
--- a/drivers/crypto/hisilicon/hpre/hpre.h
+++ b/drivers/crypto/hisilicon/hpre/hpre.h
@@ -4,7 +4,7 @@
 #define __HISI_HPRE_H
 
 #include <linux/list.h>
-#include "../qm.h"
+#include <linux/hisi_acc_qm.h>
 
 #define HPRE_SQE_SIZE			sizeof(struct hpre_sqe)
 #define HPRE_PF_DEF_Q_NUM		64
diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c
index bf103c8743d3..f7ddf5659c43 100644
--- a/drivers/crypto/hisilicon/qm.c
+++ b/drivers/crypto/hisilicon/qm.c
@@ -15,7 +15,7 @@
 #include <linux/uacce.h>
 #include <linux/uaccess.h>
 #include <uapi/misc/uacce/hisi_qm.h>
-#include "qm.h"
+#include <linux/hisi_acc_qm.h>
 
 /* eq/aeq irq enable */
 #define QM_VF_AEQ_INT_SOURCE		0x0
diff --git a/drivers/crypto/hisilicon/sec2/sec.h b/drivers/crypto/hisilicon/sec2/sec.h
index d97cf02b1df7..c2e9b01187a7 100644
--- a/drivers/crypto/hisilicon/sec2/sec.h
+++ b/drivers/crypto/hisilicon/sec2/sec.h
@@ -4,7 +4,7 @@
 #ifndef __HISI_SEC_V2_H
 #define __HISI_SEC_V2_H
 
-#include "../qm.h"
+#include <linux/hisi_acc_qm.h>
 #include "sec_crypto.h"
 
 /* Algorithm resource per hardware SEC queue */
diff --git a/drivers/crypto/hisilicon/sgl.c b/drivers/crypto/hisilicon/sgl.c
index 057273769f26..f7efc02b065f 100644
--- a/drivers/crypto/hisilicon/sgl.c
+++ b/drivers/crypto/hisilicon/sgl.c
@@ -1,9 +1,9 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2019 HiSilicon Limited. */
 #include <linux/dma-mapping.h>
+#include <linux/hisi_acc_qm.h>
 #include <linux/module.h>
 #include <linux/slab.h>
-#include "qm.h"
 
 #define HISI_ACC_SGL_SGE_NR_MIN		1
 #define HISI_ACC_SGL_NR_MAX		256
diff --git a/drivers/crypto/hisilicon/zip/zip.h b/drivers/crypto/hisilicon/zip/zip.h
index 517fdbdff3ea..3dfd3bac5a33 100644
--- a/drivers/crypto/hisilicon/zip/zip.h
+++ b/drivers/crypto/hisilicon/zip/zip.h
@@ -7,7 +7,7 @@
 #define pr_fmt(fmt)	"hisi_zip: " fmt
 
 #include <linux/list.h>
-#include "../qm.h"
+#include <linux/hisi_acc_qm.h>
 
 enum hisi_zip_error_type {
 	/* negative compression */
diff --git a/drivers/crypto/hisilicon/qm.h b/include/linux/hisi_acc_qm.h
similarity index 100%
rename from drivers/crypto/hisilicon/qm.h
rename to include/linux/hisi_acc_qm.h
-- 
Gitee


From e073afaff8c17c31cebf51a1e9d1a0cf253e1ce2 Mon Sep 17 00:00:00 2001
From: Kai Ye <yekai13@huawei.com>
Date: Tue, 19 Jul 2022 16:51:31 +0800
Subject: [PATCH 496/674] crypto: hisilicon/qm - support the userspace task
 resetting

mainline inclusion
from mainline-v5.16-rc1
commit 8bb765271aded24ca724a39701c6e686234c7020
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=8bb765271aded24ca724a39701c6e686234c7020

----------------------------------------------------------------------

Allocate an extra memory page for qp in the qp memory initialization.
Set a qp error flag in the extra page addr when device resetting.
This error flag can be seen in the userspace. This flag can helps
users to stop tasks when device resetting. After resetting, this error
flag will be reset when this qp is created again. So app should release
the old qp and request a new one, and do the task on the new queue
again.

Signed-off-by: Kai Ye <yekai13@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jiangshui Yang <yangjiangshui@h-partners.com>
Reviewed-by: Yang Shen <shenyang39@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/crypto/hisilicon/migration/acc_vf_migration.h | 2 +-
 drivers/crypto/hisilicon/qm.c                         | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/crypto/hisilicon/migration/acc_vf_migration.h b/drivers/crypto/hisilicon/migration/acc_vf_migration.h
index 28aa7e318ca9..2f459eecb8f0 100644
--- a/drivers/crypto/hisilicon/migration/acc_vf_migration.h
+++ b/drivers/crypto/hisilicon/migration/acc_vf_migration.h
@@ -8,7 +8,7 @@
 #include <linux/pci.h>
 #include <linux/vfio.h>
 
-#include "../qm.h"
+#include <linux/hisi_acc_qm.h>
 
 #define VFIO_PCI_OFFSET_SHIFT   40
 #define VFIO_PCI_OFFSET_TO_INDEX(off)   ((off) >> VFIO_PCI_OFFSET_SHIFT)
diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c
index f7ddf5659c43..7dadf476335f 100644
--- a/drivers/crypto/hisilicon/qm.c
+++ b/drivers/crypto/hisilicon/qm.c
@@ -5492,11 +5492,14 @@ static void qm_pf_reset_vf_prepare(struct hisi_qm *qm,
 		atomic_set(&qm->status.flags, QM_STOP);
 		cmd = QM_VF_PREPARE_FAIL;
 		goto err_prepare;
+	} else {
+		goto out;
 	}
 
 err_prepare:
 	hisi_qm_set_hw_reset(qm, QM_RESET_STOP_TX_OFFSET);
 	hisi_qm_set_hw_reset(qm, QM_RESET_STOP_RX_OFFSET);
+out:
 	pci_save_state(pdev);
 	ret = qm->ops->ping_pf(qm, cmd);
 	if (ret)
-- 
Gitee


From 4a1bf8c8e36ad8a44f865f8f9ddb889a6f0d1040 Mon Sep 17 00:00:00 2001
From: Longfang Liu <liulongfang@huawei.com>
Date: Tue, 19 Jul 2022 16:51:32 +0800
Subject: [PATCH 497/674] crypto: hisilicon/qm: Move few definitions to common
 header

mainline inclusion
from mainline-v5.18-rc1
commit b4b084d71332
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b4b084d71332

----------------------------------------------------------------------

Move Doorbell and Mailbox definitions to common header file.
Also export QM mailbox functions.

This will be useful when we introduce VFIO PCI HiSilicon ACC live
migration driver.

Signed-off-by: Longfang Liu <liulongfang@huawei.com>
Acked-by: Zhou Wang <wangzhou1@hisilicon.com>
Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
Link: https://lore.kernel.org/r/20220308184902.2242-3-shameerali.kolothum.thodi@huawei.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Jiangshui Yang <yangjiangshui@h-partners.com>
Reviewed-by: Yang Shen <shenyang39@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/crypto/hisilicon/qm.c | 58 +++++++++++------------------------
 include/linux/hisi_acc_qm.h   | 38 +++++++++++++++++++++++
 2 files changed, 56 insertions(+), 40 deletions(-)

diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c
index 7dadf476335f..abeaef09b91c 100644
--- a/drivers/crypto/hisilicon/qm.c
+++ b/drivers/crypto/hisilicon/qm.c
@@ -33,23 +33,6 @@
 #define QM_ABNORMAL_EVENT_IRQ_VECTOR	3
 
 /* mailbox */
-#define QM_MB_CMD_SQC			0x0
-#define QM_MB_CMD_CQC			0x1
-#define QM_MB_CMD_EQC			0x2
-#define QM_MB_CMD_AEQC			0x3
-#define QM_MB_CMD_SQC_BT		0x4
-#define QM_MB_CMD_CQC_BT		0x5
-#define QM_MB_CMD_SQC_VFT_V2		0x6
-#define QM_MB_CMD_STOP_QP		0x8
-#define QM_MB_CMD_SRC			0xc
-#define QM_MB_CMD_DST			0xd
-
-#define QM_MB_CMD_SEND_BASE		0x300
-#define QM_MB_EVENT_SHIFT		8
-#define QM_MB_BUSY_SHIFT		13
-#define QM_MB_OP_SHIFT			14
-#define QM_MB_CMD_DATA_ADDR_L		0x304
-#define QM_MB_CMD_DATA_ADDR_H		0x308
 #define QM_MB_PING_ALL_VFS		0xffff
 #define QM_MB_CMD_DATA_SHIFT		32
 #define QM_MB_CMD_DATA_MASK		GENMASK(31, 0)
@@ -103,19 +86,12 @@
 #define QM_DB_CMD_SHIFT_V1		16
 #define QM_DB_INDEX_SHIFT_V1		32
 #define QM_DB_PRIORITY_SHIFT_V1		48
-#define QM_DOORBELL_SQ_CQ_BASE_V2	0x1000
-#define QM_DOORBELL_EQ_AEQ_BASE_V2	0x2000
 #define QM_QUE_ISO_CFG_V		0x0030
 #define QM_PAGE_SIZE			0x0034
 #define QM_QUE_ISO_EN			0x100154
 #define QM_CAPBILITY			0x100158
 #define QM_QP_NUN_MASK			GENMASK(10, 0)
 #define QM_QP_DB_INTERVAL		0x10000
-#define QM_QP_MAX_NUM_SHIFT		11
-#define QM_DB_CMD_SHIFT_V2		12
-#define QM_DB_RAND_SHIFT_V2		16
-#define QM_DB_INDEX_SHIFT_V2		32
-#define QM_DB_PRIORITY_SHIFT_V2		48
 
 #define QM_MEM_START_INIT		0x100040
 #define QM_MEM_INIT_DONE		0x100044
@@ -693,7 +669,7 @@ static void qm_mb_pre_init(struct qm_mailbox *mailbox, u8 cmd,
 }
 
 /* return 0 mailbox ready, -ETIMEDOUT hardware timeout */
-static int qm_wait_mb_ready(struct hisi_qm *qm)
+int hisi_qm_wait_mb_ready(struct hisi_qm *qm)
 {
 	u32 val;
 
@@ -701,6 +677,7 @@ static int qm_wait_mb_ready(struct hisi_qm *qm)
 					  val, !((val >> QM_MB_BUSY_SHIFT) &
 					  0x1), POLL_PERIOD, POLL_TIMEOUT);
 }
+EXPORT_SYMBOL_GPL(hisi_qm_wait_mb_ready);
 
 /* 128 bit should be written to hardware at one time to trigger a mailbox */
 static void qm_mb_write(struct hisi_qm *qm, const void *src)
@@ -726,14 +703,14 @@ static void qm_mb_write(struct hisi_qm *qm, const void *src)
 
 static int qm_mb_nolock(struct hisi_qm *qm, struct qm_mailbox *mailbox)
 {
-	if (unlikely(qm_wait_mb_ready(qm))) {
+	if (unlikely(hisi_qm_wait_mb_ready(qm))) {
 		dev_err(&qm->pdev->dev, "QM mailbox is busy to start!\n");
 		goto mb_busy;
 	}
 
 	qm_mb_write(qm, mailbox);
 
-	if (unlikely(qm_wait_mb_ready(qm))) {
+	if (unlikely(hisi_qm_wait_mb_ready(qm))) {
 		dev_err(&qm->pdev->dev, "QM mailbox operation timeout!\n");
 		goto mb_busy;
 	}
@@ -745,8 +722,8 @@ static int qm_mb_nolock(struct hisi_qm *qm, struct qm_mailbox *mailbox)
 	return -EBUSY;
 }
 
-static int qm_mb(struct hisi_qm *qm, u8 cmd, dma_addr_t dma_addr, u16 queue,
-		 bool op)
+int hisi_qm_mb(struct hisi_qm *qm, u8 cmd, dma_addr_t dma_addr, u16 queue,
+	       bool op)
 {
 	struct qm_mailbox mailbox;
 	int ret;
@@ -762,6 +739,7 @@ static int qm_mb(struct hisi_qm *qm, u8 cmd, dma_addr_t dma_addr, u16 queue,
 
 	return ret;
 }
+EXPORT_SYMBOL_GPL(hisi_qm_mb);
 
 static void qm_db_v1(struct hisi_qm *qm, u16 qn, u8 cmd, u16 index, u8 priority)
 {
@@ -1351,7 +1329,7 @@ static int qm_get_vft_v2(struct hisi_qm *qm, u32 *base, u32 *number)
 	u64 sqc_vft;
 	int ret;
 
-	ret = qm_mb(qm, QM_MB_CMD_SQC_VFT_V2, 0, 0, 1);
+	ret = hisi_qm_mb(qm, QM_MB_CMD_SQC_VFT_V2, 0, 0, 1);
 	if (ret)
 		return ret;
 
@@ -1725,12 +1703,12 @@ static int dump_show(struct hisi_qm *qm, void *info,
 
 static int qm_dump_sqc_raw(struct hisi_qm *qm, dma_addr_t dma_addr, u16 qp_id)
 {
-	return qm_mb(qm, QM_MB_CMD_SQC, dma_addr, qp_id, 1);
+	return hisi_qm_mb(qm, QM_MB_CMD_SQC, dma_addr, qp_id, 1);
 }
 
 static int qm_dump_cqc_raw(struct hisi_qm *qm, dma_addr_t dma_addr, u16 qp_id)
 {
-	return qm_mb(qm, QM_MB_CMD_CQC, dma_addr, qp_id, 1);
+	return hisi_qm_mb(qm, QM_MB_CMD_CQC, dma_addr, qp_id, 1);
 }
 
 static int qm_sqc_dump(struct hisi_qm *qm, const char *s)
@@ -1842,7 +1820,7 @@ static int qm_eqc_aeqc_dump(struct hisi_qm *qm, char *s, size_t size,
 	if (IS_ERR(xeqc))
 		return PTR_ERR(xeqc);
 
-	ret = qm_mb(qm, cmd, xeqc_dma, 0, 1);
+	ret = hisi_qm_mb(qm, cmd, xeqc_dma, 0, 1);
 	if (ret)
 		goto err_free_ctx;
 
@@ -2495,7 +2473,7 @@ static int qm_ping_pf(struct hisi_qm *qm, u64 cmd)
 
 static int qm_stop_qp(struct hisi_qp *qp)
 {
-	return qm_mb(qp->qm, QM_MB_CMD_STOP_QP, 0, qp->qp_id, 0);
+	return hisi_qm_mb(qp->qm, QM_MB_CMD_STOP_QP, 0, qp->qp_id, 0);
 }
 
 static int qm_set_msi(struct hisi_qm *qm, bool set)
@@ -2763,7 +2741,7 @@ static int qm_sq_ctx_cfg(struct hisi_qp *qp, int qp_id, u32 pasid)
 		return -ENOMEM;
 	}
 
-	ret = qm_mb(qm, QM_MB_CMD_SQC, sqc_dma, qp_id, 0);
+	ret = hisi_qm_mb(qm, QM_MB_CMD_SQC, sqc_dma, qp_id, 0);
 	dma_unmap_single(dev, sqc_dma, sizeof(struct qm_sqc), DMA_TO_DEVICE);
 	kfree(sqc);
 
@@ -2804,7 +2782,7 @@ static int qm_cq_ctx_cfg(struct hisi_qp *qp, int qp_id, u32 pasid)
 		return -ENOMEM;
 	}
 
-	ret = qm_mb(qm, QM_MB_CMD_CQC, cqc_dma, qp_id, 0);
+	ret = hisi_qm_mb(qm, QM_MB_CMD_CQC, cqc_dma, qp_id, 0);
 	dma_unmap_single(dev, cqc_dma, sizeof(struct qm_cqc), DMA_TO_DEVICE);
 	kfree(cqc);
 
@@ -3664,7 +3642,7 @@ static int qm_eq_ctx_cfg(struct hisi_qm *qm)
 		return -ENOMEM;
 	}
 
-	ret = qm_mb(qm, QM_MB_CMD_EQC, eqc_dma, 0, 0);
+	ret = hisi_qm_mb(qm, QM_MB_CMD_EQC, eqc_dma, 0, 0);
 	dma_unmap_single(dev, eqc_dma, sizeof(struct qm_eqc), DMA_TO_DEVICE);
 	kfree(eqc);
 
@@ -3693,7 +3671,7 @@ static int qm_aeq_ctx_cfg(struct hisi_qm *qm)
 		return -ENOMEM;
 	}
 
-	ret = qm_mb(qm, QM_MB_CMD_AEQC, aeqc_dma, 0, 0);
+	ret = hisi_qm_mb(qm, QM_MB_CMD_AEQC, aeqc_dma, 0, 0);
 	dma_unmap_single(dev, aeqc_dma, sizeof(struct qm_aeqc), DMA_TO_DEVICE);
 	kfree(aeqc);
 
@@ -3732,11 +3710,11 @@ static int __hisi_qm_start(struct hisi_qm *qm)
 	if (ret)
 		return ret;
 
-	ret = qm_mb(qm, QM_MB_CMD_SQC_BT, qm->sqc_dma, 0, 0);
+	ret = hisi_qm_mb(qm, QM_MB_CMD_SQC_BT, qm->sqc_dma, 0, 0);
 	if (ret)
 		return ret;
 
-	ret = qm_mb(qm, QM_MB_CMD_CQC_BT, qm->cqc_dma, 0, 0);
+	ret = hisi_qm_mb(qm, QM_MB_CMD_CQC_BT, qm->cqc_dma, 0, 0);
 	if (ret)
 		return ret;
 
diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h
index 718687dd7242..0ac0fdb6a6bd 100644
--- a/include/linux/hisi_acc_qm.h
+++ b/include/linux/hisi_acc_qm.h
@@ -34,6 +34,40 @@
 #define QM_WUSER_M_CFG_ENABLE		0x1000a8
 #define WUSER_M_CFG_ENABLE		0xffffffff
 
+/* mailbox */
+#define QM_MB_CMD_SQC                   0x0
+#define QM_MB_CMD_CQC                   0x1
+#define QM_MB_CMD_EQC                   0x2
+#define QM_MB_CMD_AEQC                  0x3
+#define QM_MB_CMD_SQC_BT                0x4
+#define QM_MB_CMD_CQC_BT                0x5
+#define QM_MB_CMD_SQC_VFT_V2            0x6
+#define QM_MB_CMD_STOP_QP               0x8
+#define QM_MB_CMD_SRC                   0xc
+#define QM_MB_CMD_DST                   0xd
+
+#define QM_MB_CMD_SEND_BASE		0x300
+#define QM_MB_EVENT_SHIFT               8
+#define QM_MB_BUSY_SHIFT		13
+#define QM_MB_OP_SHIFT			14
+#define QM_MB_CMD_DATA_ADDR_L		0x304
+#define QM_MB_CMD_DATA_ADDR_H		0x308
+#define QM_MB_MAX_WAIT_CNT		6000
+
+/* doorbell */
+#define QM_DOORBELL_CMD_SQ              0
+#define QM_DOORBELL_CMD_CQ              1
+#define QM_DOORBELL_CMD_EQ              2
+#define QM_DOORBELL_CMD_AEQ             3
+
+#define QM_DOORBELL_SQ_CQ_BASE_V2	0x1000
+#define QM_DOORBELL_EQ_AEQ_BASE_V2	0x2000
+#define QM_QP_MAX_NUM_SHIFT             11
+#define QM_DB_CMD_SHIFT_V2		12
+#define QM_DB_RAND_SHIFT_V2		16
+#define QM_DB_INDEX_SHIFT_V2		32
+#define QM_DB_PRIORITY_SHIFT_V2		48
+
 /* qm cache */
 #define QM_CACHE_CTL			0x100050
 #define SQC_CACHE_ENABLE		BIT(0)
@@ -421,6 +455,10 @@ pci_ers_result_t hisi_qm_dev_slot_reset(struct pci_dev *pdev);
 void hisi_qm_reset_prepare(struct pci_dev *pdev);
 void hisi_qm_reset_done(struct pci_dev *pdev);
 
+int hisi_qm_wait_mb_ready(struct hisi_qm *qm);
+int hisi_qm_mb(struct hisi_qm *qm, u8 cmd, dma_addr_t dma_addr, u16 queue,
+	       bool op);
+
 struct hisi_acc_sgl_pool;
 struct hisi_acc_hw_sgl *hisi_acc_sg_buf_map_to_hw_sgl(struct device *dev,
 	struct scatterlist *sgl, struct hisi_acc_sgl_pool *pool,
-- 
Gitee


From 8fd5a1b7df96e0dec7c9f613cce0c706856c986a Mon Sep 17 00:00:00 2001
From: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
Date: Tue, 19 Jul 2022 16:51:33 +0800
Subject: [PATCH 498/674] hisi_acc_qm: Move VF PCI device IDs to common header

mainline inclusion
from mainline-v5.18-rc1
commit fae74feacd2d
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=fae74feacd2d

----------------------------------------------------------------------

Move the PCI Device IDs of HiSilicon ACC VF devices to a common header
and also use a uniform naming convention.

This will be useful when we introduce the vfio PCI HiSilicon ACC live
migration driver in subsequent patches.

Cc: Bjorn Helgaas <bhelgaas@google.com>
Acked-by: Zhou Wang <wangzhou1@hisilicon.com>
Acked-by: Longfang Liu <liulongfang@huawei.com>
Acked-by: Kai Ye <yekai13@huawei.com>
Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>	# pci_ids.h
Link: https://lore.kernel.org/r/20220308184902.2242-4-shameerali.kolothum.thodi@huawei.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Jiangshui Yang <yangjiangshui@h-partners.com>
Reviewed-by: Yang Shen <shenyang39@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/crypto/hisilicon/hpre/hpre_main.c | 13 ++++++-------
 drivers/crypto/hisilicon/sec2/sec_main.c  | 15 +++++++--------
 drivers/crypto/hisilicon/zip/zip_main.c   | 11 +++++------
 include/linux/pci_ids.h                   |  3 +++
 4 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/drivers/crypto/hisilicon/hpre/hpre_main.c b/drivers/crypto/hisilicon/hpre/hpre_main.c
index ebfab3e14499..3589d8879b5e 100644
--- a/drivers/crypto/hisilicon/hpre/hpre_main.c
+++ b/drivers/crypto/hisilicon/hpre/hpre_main.c
@@ -68,8 +68,7 @@
 #define HPRE_REG_RD_INTVRL_US		10
 #define HPRE_REG_RD_TMOUT_US		1000
 #define HPRE_DBGFS_VAL_MAX_LEN		20
-#define HPRE_PCI_DEVICE_ID		0xa258
-#define HPRE_PCI_VF_DEVICE_ID		0xa259
+#define PCI_DEVICE_ID_HUAWEI_HPRE_PF	0xa258
 #define HPRE_QM_USR_CFG_MASK		GENMASK(31, 1)
 #define HPRE_QM_AXI_CFG_MASK		GENMASK(15, 0)
 #define HPRE_QM_VFG_AX_MASK		GENMASK(7, 0)
@@ -111,8 +110,8 @@
 static const char hpre_name[] = "hisi_hpre";
 static struct dentry *hpre_debugfs_root;
 static const struct pci_device_id hpre_dev_ids[] = {
-	{ PCI_DEVICE(PCI_VENDOR_ID_HUAWEI, HPRE_PCI_DEVICE_ID) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_HUAWEI, HPRE_PCI_VF_DEVICE_ID) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_HUAWEI, PCI_DEVICE_ID_HUAWEI_HPRE_PF) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_HUAWEI, PCI_DEVICE_ID_HUAWEI_HPRE_VF) },
 	{ 0, }
 };
 
@@ -242,7 +241,7 @@ MODULE_PARM_DESC(uacce_mode, UACCE_MODE_DESC);
 
 static int pf_q_num_set(const char *val, const struct kernel_param *kp)
 {
-	return q_num_set(val, kp, HPRE_PCI_DEVICE_ID);
+	return q_num_set(val, kp, PCI_DEVICE_ID_HUAWEI_HPRE_PF);
 }
 
 static const struct kernel_param_ops hpre_pf_q_num_ops = {
@@ -921,7 +920,7 @@ static int hpre_debugfs_init(struct hisi_qm *qm)
 	qm->debug.sqe_mask_len = HPRE_SQE_MASK_LEN;
 	hisi_qm_debug_init(qm);
 
-	if (qm->pdev->device == HPRE_PCI_DEVICE_ID) {
+	if (qm->pdev->device == PCI_DEVICE_ID_HUAWEI_HPRE_PF) {
 		ret = hpre_ctrl_debug_init(qm);
 		if (ret)
 			goto failed_to_create;
@@ -958,7 +957,7 @@ static int hpre_qm_init(struct hisi_qm *qm, struct pci_dev *pdev)
 	qm->sqe_size = HPRE_SQE_SIZE;
 	qm->dev_name = hpre_name;
 
-	qm->fun_type = (pdev->device == HPRE_PCI_DEVICE_ID) ?
+	qm->fun_type = (pdev->device == PCI_DEVICE_ID_HUAWEI_HPRE_PF) ?
 			QM_HW_PF : QM_HW_VF;
 	if (qm->fun_type == QM_HW_PF) {
 		qm->qp_base = HPRE_PF_DEF_Q_BASE;
diff --git a/drivers/crypto/hisilicon/sec2/sec_main.c b/drivers/crypto/hisilicon/sec2/sec_main.c
index 0b9906ff69e3..85f55d8f04cf 100644
--- a/drivers/crypto/hisilicon/sec2/sec_main.c
+++ b/drivers/crypto/hisilicon/sec2/sec_main.c
@@ -20,8 +20,7 @@
 
 #define SEC_VF_NUM			63
 #define SEC_QUEUE_NUM_V1		4096
-#define SEC_PF_PCI_DEVICE_ID		0xa255
-#define SEC_VF_PCI_DEVICE_ID		0xa256
+#define PCI_DEVICE_ID_HUAWEI_SEC_PF	0xa255
 
 #define SEC_BD_ERR_CHK_EN0		0xEFFFFFFF
 #define SEC_BD_ERR_CHK_EN1		0x7ffff7fd
@@ -229,7 +228,7 @@ static const struct debugfs_reg32 sec_dfx_regs[] = {
 
 static int sec_pf_q_num_set(const char *val, const struct kernel_param *kp)
 {
-	return q_num_set(val, kp, SEC_PF_PCI_DEVICE_ID);
+	return q_num_set(val, kp, PCI_DEVICE_ID_HUAWEI_SEC_PF);
 }
 
 static const struct kernel_param_ops sec_pf_q_num_ops = {
@@ -317,8 +316,8 @@ module_param_cb(uacce_mode, &sec_uacce_mode_ops, &uacce_mode, 0444);
 MODULE_PARM_DESC(uacce_mode, UACCE_MODE_DESC);
 
 static const struct pci_device_id sec_dev_ids[] = {
-	{ PCI_DEVICE(PCI_VENDOR_ID_HUAWEI, SEC_PF_PCI_DEVICE_ID) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_HUAWEI, SEC_VF_PCI_DEVICE_ID) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_HUAWEI, PCI_DEVICE_ID_HUAWEI_SEC_PF) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_HUAWEI, PCI_DEVICE_ID_HUAWEI_SEC_VF) },
 	{ 0, }
 };
 MODULE_DEVICE_TABLE(pci, sec_dev_ids);
@@ -748,7 +747,7 @@ static int sec_core_debug_init(struct hisi_qm *qm)
 	regset->base = qm->io_base;
 	regset->dev = dev;
 
-	if (qm->pdev->device == SEC_PF_PCI_DEVICE_ID)
+	if (qm->pdev->device == PCI_DEVICE_ID_HUAWEI_SEC_PF)
 		debugfs_create_file("regs", 0444, tmp_d, regset, &sec_regs_fops);
 
 	for (i = 0; i < ARRAY_SIZE(sec_dfx_labels); i++) {
@@ -766,7 +765,7 @@ static int sec_debug_init(struct hisi_qm *qm)
 	struct sec_dev *sec = container_of(qm, struct sec_dev, qm);
 	int i;
 
-	if (qm->pdev->device == SEC_PF_PCI_DEVICE_ID) {
+	if (qm->pdev->device == PCI_DEVICE_ID_HUAWEI_SEC_PF) {
 		for (i = SEC_CLEAR_ENABLE; i < SEC_DEBUG_FILE_NUM; i++) {
 			spin_lock_init(&sec->debug.files[i].lock);
 			sec->debug.files[i].index = i;
@@ -908,7 +907,7 @@ static int sec_qm_init(struct hisi_qm *qm, struct pci_dev *pdev)
 	qm->sqe_size = SEC_SQE_SIZE;
 	qm->dev_name = sec_name;
 
-	qm->fun_type = (pdev->device == SEC_PF_PCI_DEVICE_ID) ?
+	qm->fun_type = (pdev->device == PCI_DEVICE_ID_HUAWEI_SEC_PF) ?
 			QM_HW_PF : QM_HW_VF;
 	if (qm->fun_type == QM_HW_PF) {
 		qm->qp_base = SEC_PF_DEF_Q_BASE;
diff --git a/drivers/crypto/hisilicon/zip/zip_main.c b/drivers/crypto/hisilicon/zip/zip_main.c
index 678f8b58ec42..66decfe07282 100644
--- a/drivers/crypto/hisilicon/zip/zip_main.c
+++ b/drivers/crypto/hisilicon/zip/zip_main.c
@@ -15,8 +15,7 @@
 #include <linux/uacce.h>
 #include "zip.h"
 
-#define PCI_DEVICE_ID_ZIP_PF		0xa250
-#define PCI_DEVICE_ID_ZIP_VF		0xa251
+#define PCI_DEVICE_ID_HUAWEI_ZIP_PF	0xa250
 
 #define HZIP_QUEUE_NUM_V1		4096
 
@@ -246,7 +245,7 @@ MODULE_PARM_DESC(uacce_mode, UACCE_MODE_DESC);
 
 static int pf_q_num_set(const char *val, const struct kernel_param *kp)
 {
-	return q_num_set(val, kp, PCI_DEVICE_ID_ZIP_PF);
+	return q_num_set(val, kp, PCI_DEVICE_ID_HUAWEI_ZIP_PF);
 }
 
 static const struct kernel_param_ops pf_q_num_ops = {
@@ -268,8 +267,8 @@ module_param_cb(vfs_num, &vfs_num_ops, &vfs_num, 0444);
 MODULE_PARM_DESC(vfs_num, "Number of VFs to enable(1-63), 0(default)");
 
 static const struct pci_device_id hisi_zip_dev_ids[] = {
-	{ PCI_DEVICE(PCI_VENDOR_ID_HUAWEI, PCI_DEVICE_ID_ZIP_PF) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_HUAWEI, PCI_DEVICE_ID_ZIP_VF) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_HUAWEI, PCI_DEVICE_ID_HUAWEI_ZIP_PF) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_HUAWEI, PCI_DEVICE_ID_HUAWEI_ZIP_VF) },
 	{ 0, }
 };
 MODULE_DEVICE_TABLE(pci, hisi_zip_dev_ids);
@@ -838,7 +837,7 @@ static int hisi_zip_qm_init(struct hisi_qm *qm, struct pci_dev *pdev)
 	qm->sqe_size = HZIP_SQE_SIZE;
 	qm->dev_name = hisi_zip_name;
 
-	qm->fun_type = (pdev->device == PCI_DEVICE_ID_ZIP_PF) ?
+	qm->fun_type = (pdev->device == PCI_DEVICE_ID_HUAWEI_ZIP_PF) ?
 			QM_HW_PF : QM_HW_VF;
 	if (qm->fun_type == QM_HW_PF) {
 		qm->qp_base = HZIP_PF_DEF_Q_BASE;
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index a1069c76e9bc..f9792f9128fb 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2570,6 +2570,9 @@
 #define PCI_DEVICE_ID_KORENIX_JETCARDF3	0x17ff
 
 #define PCI_VENDOR_ID_HUAWEI		0x19e5
+#define PCI_DEVICE_ID_HUAWEI_ZIP_VF    0xa251
+#define PCI_DEVICE_ID_HUAWEI_SEC_VF    0xa256
+#define PCI_DEVICE_ID_HUAWEI_HPRE_VF   0xa259
 
 /* Hisilicon PCIe NP devices */
 #define PCIE_DEVICE_ID_HISI_5896        0x5896
-- 
Gitee


From b5592e08022f801ee8ff6db035a69924d7a21d1a Mon Sep 17 00:00:00 2001
From: Longfang Liu <liulongfang@huawei.com>
Date: Tue, 19 Jul 2022 16:51:34 +0800
Subject: [PATCH 499/674] crypto: hisilicon/qm: Set the VF QM state register

mainline inclusion
from mainline-v5.18-rc1
commit 1e459b25081d
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=1e459b25081d

----------------------------------------------------------------------

We use VF QM state register to record the status of the QM configuration
state. This will be used in the ACC migration driver to determine whether
we can safely save and restore the QM data.

Signed-off-by: Longfang Liu <liulongfang@huawei.com>
Acked-by: Zhou Wang <wangzhou1@hisilicon.com>
Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
Link: https://lore.kernel.org/r/20220308184902.2242-8-shameerali.kolothum.thodi@huawei.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Jiangshui Yang <yangjiangshui@h-partners.com>
Reviewed-by: Yang Shen <shenyang39@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/crypto/hisilicon/qm.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c
index abeaef09b91c..46918ea35cd6 100644
--- a/drivers/crypto/hisilicon/qm.c
+++ b/drivers/crypto/hisilicon/qm.c
@@ -3527,9 +3527,8 @@ void hisi_qm_uninit(struct hisi_qm *qm)
 		dma_free_coherent(dev, qm->qdma.size,
 				  qm->qdma.va, qm->qdma.dma);
 	}
-	up_write(&qm->qps_lock);
-
 	hisi_qm_set_state(qm, VF_NOT_READY);
+	up_write(&qm->qps_lock);
 
 	qm_irq_unregister(qm);
 	hisi_qm_pci_uninit(qm);
@@ -3547,7 +3546,6 @@ EXPORT_SYMBOL_GPL(hisi_qm_uninit);
  * @number: The number of queues in vft.
  *
  * We can allocate multiple queues to a qm by configuring virtual function
- * table. We get related configures by this function. Normally, we call this
  * function in VF driver to get the queue information.
  *
  * qm hw v1 does not support this interface.
-- 
Gitee


From 86bab9ccc46065903e44971ca0d81bc23593efed Mon Sep 17 00:00:00 2001
From: Hui Tang <tanghui20@huawei.com>
Date: Tue, 19 Jul 2022 16:51:35 +0800
Subject: [PATCH 500/674] crypto: hisilicon/qm - optimize the barrier operation

mainline inclusion
from mainline-v5.19-rc1
commit 4cda2f4a0ee6
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=4cda2f4a0ee6

----------------------------------------------------------------------

A 'dma_wmb' barrier is enough to guarantee previous writes
before accessing by acc device in the outer shareable domain.

A 'smp_wmb' barrier is enough to guarantee previous writes
before accessing by other cpus in the inner shareble domain.

Signed-off-by: Hui Tang <tanghui20@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jiangshui Yang <yangjiangshui@h-partners.com>
Reviewed-by: Yang Shen <shenyang39@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/crypto/hisilicon/qm.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c
index 46918ea35cd6..30e91e328859 100644
--- a/drivers/crypto/hisilicon/qm.c
+++ b/drivers/crypto/hisilicon/qm.c
@@ -687,13 +687,13 @@ static void qm_mb_write(struct hisi_qm *qm, const void *src)
 
 	if (!IS_ENABLED(CONFIG_ARM64)) {
 		memcpy_toio(fun_base, src, 16);
-		wmb();
+		dma_wmb();
 		return;
 	}
 
 	asm volatile("ldp %0, %1, %3\n"
 		     "stp %0, %1, %2\n"
-		     "dsb sy\n"
+		     "dmb oshst\n"
 		     : "=&r" (tmp0),
 		       "=&r" (tmp1),
 		       "+Q" (*((char __iomem *)fun_base))
@@ -982,7 +982,7 @@ static void qm_set_qp_disable(struct hisi_qp *qp, int offset)
 	*addr = 1;
 
 	/* make sure setup is completed */
-	mb();
+	smp_wmb();
 }
 
 static void qm_disable_qp(struct hisi_qm *qm, u32 qp_id)
-- 
Gitee


From 05666491669e9fe8d6bec2854efa00dcc48f01da Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 19 Jul 2022 16:51:36 +0800
Subject: [PATCH 501/674] kernel.h: drop inclusion in bitmap.h

mainline inclusion
from mainline-v5.13-rc1
commit 08c5188ef40ff82aed559123dc0ab2d2254b1b1c
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=1fbbcffd0ee1

----------------------------------------------------------------------

The bitmap.h header is used in a lot of code around the kernel.  Besides
that it includes kernel.h which sometimes makes a loop.

The problem here is many unneeded loops that make header hell
dependencies.  For example, how may you move bitmap_zalloc() from C-file
to the header?  Currently it's impossible.  And bitmap.h here is only the
tip of an iceberg.

kerne.h is a dump of everything that even has nothing in common at all.
We may still have it, but in my new code I prefer to include only the
headers that I want to use, without the bulk of unneeded kernel code.

Break the loop by introducing align.h, including it in kernel.h and
bitmap.h followed by replacing kernel.h with limits.h.

Link: https://lkml.kernel.org/r/20210326170347.37441-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Yury Norov <yury.norov@gmail.com>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Reviewed-by: Yang Shen <shenyang39@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 include/linux/align.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)
 create mode 100644 include/linux/align.h

diff --git a/include/linux/align.h b/include/linux/align.h
new file mode 100644
index 000000000000..2b4acec7b95a
--- /dev/null
+++ b/include/linux/align.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_ALIGN_H
+#define _LINUX_ALIGN_H
+
+#include <linux/const.h>
+
+/* @a is a power of 2 value */
+#define ALIGN(x, a)		__ALIGN_KERNEL((x), (a))
+#define ALIGN_DOWN(x, a)	__ALIGN_KERNEL((x) - ((a) - 1), (a))
+#define __ALIGN_MASK(x, mask)	__ALIGN_KERNEL_MASK((x), (mask))
+#define PTR_ALIGN(p, a)		((typeof(p))ALIGN((unsigned long)(p), (a)))
+#define PTR_ALIGN_DOWN(p, a)	((typeof(p))ALIGN_DOWN((unsigned long)(p), (a)))
+#define IS_ALIGNED(x, a)		(((x) & ((typeof(x))(a) - 1)) == 0)
+
+#endif	/* _LINUX_ALIGN_H */
-- 
Gitee


From d8df683a6f2fe0c5ce696f43fb8f31dd6c0b409d Mon Sep 17 00:00:00 2001
From: Kai Ye <yekai13@huawei.com>
Date: Tue, 19 Jul 2022 16:51:37 +0800
Subject: [PATCH 502/674] crypto: hisilicon/qm - add register checking for ACC

mainline inclusion
from mainline-v5.19-rc1
commit f1724d397c60
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f1724d397c60

----------------------------------------------------------------------

Add register detection function to accelerator. Provided a tool that
user can checking differential register through Debugfs.
e.g.
    cd /sys/kernel/debug/hisi_zip/<bdf>/zip_dfx
    cat diff_regs

Signed-off-by: Longfang Liu <liulongfang@huawei.com>
Signed-off-by: Kai Ye <yekai13@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jiangshui Yang <yangjiangshui@h-partners.com>
Reviewed-by: Yang Shen <shenyang39@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/crypto/hisilicon/qm.c | 182 +++++++++++++++++++++++++++++++++-
 include/linux/hisi_acc_qm.h   |  14 +++
 2 files changed, 195 insertions(+), 1 deletion(-)

diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c
index 30e91e328859..45f1ded0801f 100644
--- a/drivers/crypto/hisilicon/qm.c
+++ b/drivers/crypto/hisilicon/qm.c
@@ -253,7 +253,15 @@
 #define QM_QOS_MAX_CIR_U		6
 #define QM_QOS_MAX_CIR_S		11
 #define QM_QOS_VAL_MAX_LEN		32
-
+#define QM_DFX_BASE		0x0100000
+#define QM_DFX_STATE1		0x0104000
+#define QM_DFX_STATE2		0x01040C8
+#define QM_DFX_COMMON		0x0000
+#define QM_DFX_BASE_LEN		0x5A
+#define QM_DFX_STATE1_LEN		0x2E
+#define QM_DFX_STATE2_LEN		0x11
+#define QM_DFX_COMMON_LEN		0xC3
+#define QM_DFX_REGS_LEN		4UL
 #define QM_AUTOSUSPEND_DELAY		3000
 
 #define QM_MK_CQC_DW3_V1(hop_num, pg_sz, buf_sz, cqe_sz) \
@@ -467,6 +475,23 @@ static const struct hisi_qm_hw_error qm_hw_error[] = {
 	{ /* sentinel */ }
 };
 
+/* define the QM's dfx regs region and region length */
+static struct dfx_diff_registers qm_diff_regs[] = {
+	{
+		.reg_offset = QM_DFX_BASE,
+		.reg_len = QM_DFX_BASE_LEN,
+	}, {
+		.reg_offset = QM_DFX_STATE1,
+		.reg_len = QM_DFX_STATE1_LEN,
+	}, {
+		.reg_offset = QM_DFX_STATE2,
+		.reg_len = QM_DFX_STATE2_LEN,
+	}, {
+		.reg_offset = QM_DFX_COMMON,
+		.reg_len = QM_DFX_COMMON_LEN,
+	},
+};
+
 static const char * const qm_db_timeout[] = {
 	"sq", "cq", "eq", "aeq",
 };
@@ -1625,6 +1650,156 @@ static int qm_regs_show(struct seq_file *s, void *unused)
 
 DEFINE_SHOW_ATTRIBUTE(qm_regs);
 
+static struct dfx_diff_registers *dfx_regs_init(struct hisi_qm *qm,
+	const struct dfx_diff_registers *cregs, int reg_len)
+{
+	struct dfx_diff_registers *diff_regs;
+	u32 j, base_offset;
+	int i;
+
+	diff_regs = kcalloc(reg_len, sizeof(*diff_regs), GFP_KERNEL);
+	if (!diff_regs)
+		return ERR_PTR(-ENOMEM);
+
+	for (i = 0; i < reg_len; i++) {
+		if (!cregs[i].reg_len)
+			continue;
+
+		diff_regs[i].reg_offset = cregs[i].reg_offset;
+		diff_regs[i].reg_len = cregs[i].reg_len;
+		diff_regs[i].regs = kcalloc(QM_DFX_REGS_LEN, cregs[i].reg_len,
+					 GFP_KERNEL);
+		if (!diff_regs[i].regs)
+			goto alloc_error;
+
+		for (j = 0; j < diff_regs[i].reg_len; j++) {
+			base_offset = diff_regs[i].reg_offset +
+					j * QM_DFX_REGS_LEN;
+			diff_regs[i].regs[j] = readl(qm->io_base + base_offset);
+		}
+	}
+
+	return diff_regs;
+
+alloc_error:
+	while (i > 0) {
+		i--;
+		kfree(diff_regs[i].regs);
+	}
+	kfree(diff_regs);
+	return ERR_PTR(-ENOMEM);
+}
+
+static void dfx_regs_uninit(struct hisi_qm *qm,
+		struct dfx_diff_registers *dregs, int reg_len)
+{
+	int i;
+
+	/* Setting the pointer is NULL to prevent double free */
+	for (i = 0; i < reg_len; i++) {
+		kfree(dregs[i].regs);
+		dregs[i].regs = NULL;
+	}
+	kfree(dregs);
+	dregs = NULL;
+}
+
+/**
+ * hisi_qm_diff_regs_init() - Allocate memory for registers.
+ * @qm: device qm handle.
+ * @dregs: diff registers handle.
+ * @reg_len: diff registers region length.
+ */
+int hisi_qm_diff_regs_init(struct hisi_qm *qm,
+		struct dfx_diff_registers *dregs, int reg_len)
+{
+	if (!qm || !dregs || reg_len <= 0)
+		return -EINVAL;
+
+	if (qm->fun_type != QM_HW_PF)
+		return 0;
+
+	qm->debug.qm_diff_regs = dfx_regs_init(qm, qm_diff_regs,
+						ARRAY_SIZE(qm_diff_regs));
+	if (IS_ERR(qm->debug.qm_diff_regs))
+		return PTR_ERR(qm->debug.qm_diff_regs);
+
+	qm->debug.acc_diff_regs = dfx_regs_init(qm, dregs, reg_len);
+	if (IS_ERR(qm->debug.acc_diff_regs)) {
+		dfx_regs_uninit(qm, qm->debug.qm_diff_regs,
+				ARRAY_SIZE(qm_diff_regs));
+		return PTR_ERR(qm->debug.acc_diff_regs);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(hisi_qm_diff_regs_init);
+
+/**
+ * hisi_qm_diff_regs_uninit() - Free memory for registers.
+ * @qm: device qm handle.
+ * @reg_len: diff registers region length.
+ */
+void hisi_qm_diff_regs_uninit(struct hisi_qm *qm, int reg_len)
+{
+	if (!qm  || reg_len <= 0 || qm->fun_type != QM_HW_PF)
+		return;
+
+	dfx_regs_uninit(qm, qm->debug.acc_diff_regs, reg_len);
+	dfx_regs_uninit(qm, qm->debug.qm_diff_regs, ARRAY_SIZE(qm_diff_regs));
+}
+EXPORT_SYMBOL_GPL(hisi_qm_diff_regs_uninit);
+
+/**
+ * hisi_qm_acc_diff_regs_dump() - Dump registers's value.
+ * @qm: device qm handle.
+ * @s: Debugfs file handle.
+ * @dregs: diff registers handle.
+ * @regs_len: diff registers region length.
+ */
+void hisi_qm_acc_diff_regs_dump(struct hisi_qm *qm, struct seq_file *s,
+	struct dfx_diff_registers *dregs, int regs_len)
+{
+	u32 j, val, base_offset;
+	int i, ret;
+
+	if (!qm || !s || !dregs || regs_len <= 0)
+		return;
+
+	ret = hisi_qm_get_dfx_access(qm);
+	if (ret)
+		return;
+
+	down_read(&qm->qps_lock);
+	for (i = 0; i < regs_len; i++) {
+		if (!dregs[i].reg_len)
+			continue;
+
+		for (j = 0; j < dregs[i].reg_len; j++) {
+			base_offset = dregs[i].reg_offset + j * QM_DFX_REGS_LEN;
+			val = readl(qm->io_base + base_offset);
+			if (val != dregs[i].regs[j])
+				seq_printf(s, "0x%08x = 0x%08x ---> 0x%08x\n",
+					   base_offset, dregs[i].regs[j], val);
+		}
+	}
+	up_read(&qm->qps_lock);
+
+	hisi_qm_put_dfx_access(qm);
+}
+EXPORT_SYMBOL_GPL(hisi_qm_acc_diff_regs_dump);
+
+static int qm_diff_regs_show(struct seq_file *s, void *unused)
+{
+	struct hisi_qm *qm = s->private;
+
+	hisi_qm_acc_diff_regs_dump(qm, s, qm->debug.qm_diff_regs,
+					ARRAY_SIZE(qm_diff_regs));
+
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(qm_diff_regs);
+
 static ssize_t qm_cmd_read(struct file *filp, char __user *buffer,
 			   size_t count, loff_t *pos)
 {
@@ -4487,6 +4662,7 @@ static void hisi_qm_set_algqos_init(struct hisi_qm *qm)
  */
 void hisi_qm_debug_init(struct hisi_qm *qm)
 {
+	struct dfx_diff_registers *qm_regs = qm->debug.qm_diff_regs;
 	struct qm_dfx *dfx = &qm->debug.dfx;
 	struct dentry *qm_d;
 	void *data;
@@ -4502,6 +4678,10 @@ void hisi_qm_debug_init(struct hisi_qm *qm)
 			qm_create_debugfs_file(qm, qm->debug.qm_d, i);
 	}
 
+	if (qm_regs)
+		debugfs_create_file("diff_regs", 0444, qm->debug.qm_d,
+					qm, &qm_diff_regs_fops);
+
 	debugfs_create_file("regs", 0444, qm->debug.qm_d, qm, &qm_regs_fops);
 
 	debugfs_create_file("cmd", 0600, qm->debug.qm_d, qm, &qm_cmd_fops);
diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h
index 0ac0fdb6a6bd..7e318055f76b 100644
--- a/include/linux/hisi_acc_qm.h
+++ b/include/linux/hisi_acc_qm.h
@@ -137,6 +137,12 @@ enum qm_state {
 	QM_STOP,
 };
 
+struct dfx_diff_registers {
+	u32 *regs;
+	u32 reg_offset;
+	u32 reg_len;
+};
+
 enum qp_state {
 	QP_INIT = 1,
 	QP_START,
@@ -191,6 +197,8 @@ struct qm_debug {
 	struct dentry *debug_root;
 	struct dentry *qm_d;
 	struct debugfs_file files[DEBUG_FILE_NUM];
+	struct dfx_diff_registers *qm_diff_regs;
+	struct dfx_diff_registers *acc_diff_regs;
 };
 
 struct qm_shaper_factor {
@@ -449,6 +457,12 @@ int hisi_qm_sriov_disable(struct pci_dev *pdev, bool is_frozen);
 int hisi_qm_sriov_configure(struct pci_dev *pdev, int num_vfs);
 void hisi_qm_dev_err_init(struct hisi_qm *qm);
 void hisi_qm_dev_err_uninit(struct hisi_qm *qm);
+int hisi_qm_diff_regs_init(struct hisi_qm *qm,
+		struct dfx_diff_registers *dregs, int reg_len);
+void hisi_qm_diff_regs_uninit(struct hisi_qm *qm, int reg_len);
+void hisi_qm_acc_diff_regs_dump(struct hisi_qm *qm, struct seq_file *s,
+		struct dfx_diff_registers *dregs, int regs_len);
+
 pci_ers_result_t hisi_qm_dev_err_detected(struct pci_dev *pdev,
 					  pci_channel_state_t state);
 pci_ers_result_t hisi_qm_dev_slot_reset(struct pci_dev *pdev);
-- 
Gitee


From 77c39b7bdfb49e70596d03a0bb28416f28a132ae Mon Sep 17 00:00:00 2001
From: Kai Ye <yekai13@huawei.com>
Date: Tue, 19 Jul 2022 16:51:38 +0800
Subject: [PATCH 503/674] crypto: hisilicon/hpre - support register checking

mainline inclusion
from mainline-v5.19-rc1
commit 9210bdaa0d49
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=9210bdaa0d49

----------------------------------------------------------------------

The value of the register is changed after the task running. A debugfs
file node is added to help users to check the change of register values.

Signed-off-by: Longfang Liu <liulongfang@huawei.com>
Signed-off-by: Kai Ye <yekai13@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jiangshui Yang <yangjiangshui@h-partners.com>
Reviewed-by: Yang Shen <shenyang39@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/crypto/hisilicon/hpre/hpre_main.c | 90 ++++++++++++++++++-----
 1 file changed, 72 insertions(+), 18 deletions(-)

diff --git a/drivers/crypto/hisilicon/hpre/hpre_main.c b/drivers/crypto/hisilicon/hpre/hpre_main.c
index 3589d8879b5e..07222856cc84 100644
--- a/drivers/crypto/hisilicon/hpre/hpre_main.c
+++ b/drivers/crypto/hisilicon/hpre/hpre_main.c
@@ -107,6 +107,15 @@
 #define HPRE_SQE_MASK_OFFSET		8
 #define HPRE_SQE_MASK_LEN		24
 
+#define HPRE_DFX_BASE		0x301000
+#define HPRE_DFX_COMMON1		0x301400
+#define HPRE_DFX_COMMON2		0x301A00
+#define HPRE_DFX_CORE		0x302000
+#define HPRE_DFX_BASE_LEN		0x55
+#define HPRE_DFX_COMMON1_LEN		0x41
+#define HPRE_DFX_COMMON2_LEN		0xE
+#define HPRE_DFX_CORE_LEN		0x43
+
 static const char hpre_name[] = "hisi_hpre";
 static struct dentry *hpre_debugfs_root;
 static const struct pci_device_id hpre_dev_ids[] = {
@@ -226,6 +235,53 @@ static const char *hpre_dfx_files[HPRE_DFX_FILE_NUM] = {
 	"invalid_req_cnt"
 };
 
+/* define the HPRE's dfx regs region and region length */
+static struct dfx_diff_registers hpre_diff_regs[] = {
+	{
+		.reg_offset = HPRE_DFX_BASE,
+		.reg_len = HPRE_DFX_BASE_LEN,
+	}, {
+		.reg_offset = HPRE_DFX_COMMON1,
+		.reg_len = HPRE_DFX_COMMON1_LEN,
+	}, {
+		.reg_offset = HPRE_DFX_COMMON2,
+		.reg_len = HPRE_DFX_COMMON2_LEN,
+	}, {
+		.reg_offset = HPRE_DFX_CORE,
+		.reg_len = HPRE_DFX_CORE_LEN,
+	},
+};
+
+static int hpre_diff_regs_show(struct seq_file *s, void *unused)
+{
+	struct hisi_qm *qm = s->private;
+
+	hisi_qm_acc_diff_regs_dump(qm, s, qm->debug.acc_diff_regs,
+					ARRAY_SIZE(hpre_diff_regs));
+
+	return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(hpre_diff_regs);
+
+static int hpre_com_regs_show(struct seq_file *s, void *unused)
+{
+	hisi_qm_regs_dump(s, s->private);
+
+	return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(hpre_com_regs);
+
+static int hpre_cluster_regs_show(struct seq_file *s, void *unused)
+{
+	hisi_qm_regs_dump(s, s->private);
+
+	return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(hpre_cluster_regs);
+
 static const struct kernel_param_ops hpre_uacce_mode_ops = {
 	.set = uacce_mode_set,
 	.get = param_get_int,
@@ -779,24 +835,6 @@ static int hpre_debugfs_atomic64_set(void *data, u64 val)
 DEFINE_DEBUGFS_ATTRIBUTE(hpre_atomic64_ops, hpre_debugfs_atomic64_get,
 			 hpre_debugfs_atomic64_set, "%llu\n");
 
-static int hpre_com_regs_show(struct seq_file *s, void *unused)
-{
-	hisi_qm_regs_dump(s, s->private);
-
-	return 0;
-}
-
-DEFINE_SHOW_ATTRIBUTE(hpre_com_regs);
-
-static int hpre_cluster_regs_show(struct seq_file *s, void *unused)
-{
-	hisi_qm_regs_dump(s, s->private);
-
-	return 0;
-}
-
-DEFINE_SHOW_ATTRIBUTE(hpre_cluster_regs);
-
 static int hpre_create_debugfs_file(struct hisi_qm *qm, struct dentry *dir,
 				    enum hpre_ctrl_dbgfs_file type, int indx)
 {
@@ -895,6 +933,7 @@ static int hpre_ctrl_debug_init(struct hisi_qm *qm)
 
 static void hpre_dfx_debug_init(struct hisi_qm *qm)
 {
+	struct dfx_diff_registers *hpre_regs = qm->debug.acc_diff_regs;
 	struct hpre *hpre = container_of(qm, struct hpre, qm);
 	struct hpre_dfx *dfx = hpre->debug.dfx;
 	struct dentry *parent;
@@ -906,6 +945,10 @@ static void hpre_dfx_debug_init(struct hisi_qm *qm)
 		debugfs_create_file(hpre_dfx_files[i], 0644, parent, &dfx[i],
 				    &hpre_atomic64_ops);
 	}
+
+	if (qm->fun_type == QM_HW_PF && hpre_regs)
+		debugfs_create_file("diff_regs", 0444, parent,
+				      qm, &hpre_diff_regs_fops);
 }
 
 static int hpre_debugfs_init(struct hisi_qm *qm)
@@ -918,6 +961,13 @@ static int hpre_debugfs_init(struct hisi_qm *qm)
 
 	qm->debug.sqe_mask_offset = HPRE_SQE_MASK_OFFSET;
 	qm->debug.sqe_mask_len = HPRE_SQE_MASK_LEN;
+	ret = hisi_qm_diff_regs_init(qm, hpre_diff_regs,
+				ARRAY_SIZE(hpre_diff_regs));
+	if (ret) {
+		dev_warn(dev, "Failed to init HPRE diff regs!\n");
+		goto debugfs_remove;
+	}
+
 	hisi_qm_debug_init(qm);
 
 	if (qm->pdev->device == PCI_DEVICE_ID_HUAWEI_HPRE_PF) {
@@ -931,12 +981,16 @@ static int hpre_debugfs_init(struct hisi_qm *qm)
 	return 0;
 
 failed_to_create:
+	hisi_qm_diff_regs_uninit(qm, ARRAY_SIZE(hpre_diff_regs));
+debugfs_remove:
 	debugfs_remove_recursive(qm->debug.debug_root);
 	return ret;
 }
 
 static void hpre_debugfs_exit(struct hisi_qm *qm)
 {
+	hisi_qm_diff_regs_uninit(qm, ARRAY_SIZE(hpre_diff_regs));
+
 	debugfs_remove_recursive(qm->debug.debug_root);
 }
 
-- 
Gitee


From 0a1b0ca9c4ae8c5c479bfe3b2be89c19a567b584 Mon Sep 17 00:00:00 2001
From: Kai Ye <yekai13@huawei.com>
Date: Tue, 19 Jul 2022 16:51:39 +0800
Subject: [PATCH 504/674] crypto: hisilicon/sec - support register checking

mainline inclusion
from mainline-v5.19-rc1
commit 16175030bb5b
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=16175030bb5b

----------------------------------------------------------------------

The value of the register is changed after the task running. A debugfs
file node is added to help users to check the change of register values.

Signed-off-by: Longfang Liu <liulongfang@huawei.com>
Signed-off-by: Kai Ye <yekai13@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jiangshui Yang <yangjiangshui@h-partners.com>
Reviewed-by: Yang Shen <shenyang39@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/crypto/hisilicon/sec2/sec_main.c | 53 ++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/drivers/crypto/hisilicon/sec2/sec_main.c b/drivers/crypto/hisilicon/sec2/sec_main.c
index 85f55d8f04cf..0dab1fde56d1 100644
--- a/drivers/crypto/hisilicon/sec2/sec_main.c
+++ b/drivers/crypto/hisilicon/sec2/sec_main.c
@@ -110,6 +110,15 @@
 #define SEC_SQE_MASK_LEN		48
 #define SEC_SHAPER_TYPE_RATE		400
 
+#define SEC_DFX_BASE		0x301000
+#define SEC_DFX_CORE		0x302100
+#define SEC_DFX_COMMON1		0x301600
+#define SEC_DFX_COMMON2		0x301C00
+#define SEC_DFX_BASE_LEN		0x9D
+#define SEC_DFX_CORE_LEN		0x32B
+#define SEC_DFX_COMMON1_LEN		0x45
+#define SEC_DFX_COMMON2_LEN		0xBA
+
 struct sec_hw_error {
 	u32 int_msk;
 	const char *msg;
@@ -226,6 +235,34 @@ static const struct debugfs_reg32 sec_dfx_regs[] = {
 	{"SEC_BD_SAA8                   ",  0x301C40},
 };
 
+/* define the SEC's dfx regs region and region length */
+static struct dfx_diff_registers sec_diff_regs[] = {
+	{
+		.reg_offset = SEC_DFX_BASE,
+		.reg_len = SEC_DFX_BASE_LEN,
+	}, {
+		.reg_offset = SEC_DFX_COMMON1,
+		.reg_len = SEC_DFX_COMMON1_LEN,
+	}, {
+		.reg_offset = SEC_DFX_COMMON2,
+		.reg_len = SEC_DFX_COMMON2_LEN,
+	}, {
+		.reg_offset = SEC_DFX_CORE,
+		.reg_len = SEC_DFX_CORE_LEN,
+	},
+};
+
+static int sec_diff_regs_show(struct seq_file *s, void *unused)
+{
+	struct hisi_qm *qm = s->private;
+
+	hisi_qm_acc_diff_regs_dump(qm, s, qm->debug.acc_diff_regs,
+					ARRAY_SIZE(sec_diff_regs));
+
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(sec_diff_regs);
+
 static int sec_pf_q_num_set(const char *val, const struct kernel_param *kp)
 {
 	return q_num_set(val, kp, PCI_DEVICE_ID_HUAWEI_SEC_PF);
@@ -729,6 +766,7 @@ DEFINE_SHOW_ATTRIBUTE(sec_regs);
 
 static int sec_core_debug_init(struct hisi_qm *qm)
 {
+	struct dfx_diff_registers *sec_regs = qm->debug.acc_diff_regs;
 	struct sec_dev *sec = container_of(qm, struct sec_dev, qm);
 	struct device *dev = &qm->pdev->dev;
 	struct sec_dfx *dfx = &sec->debug.dfx;
@@ -749,6 +787,9 @@ static int sec_core_debug_init(struct hisi_qm *qm)
 
 	if (qm->pdev->device == PCI_DEVICE_ID_HUAWEI_SEC_PF)
 		debugfs_create_file("regs", 0444, tmp_d, regset, &sec_regs_fops);
+	if (qm->fun_type == QM_HW_PF && sec_regs)
+		debugfs_create_file("diff_regs", 0444, tmp_d,
+				      qm, &sec_diff_regs_fops);
 
 	for (i = 0; i < ARRAY_SIZE(sec_dfx_labels); i++) {
 		atomic64_t *data = (atomic64_t *)((uintptr_t)dfx +
@@ -790,6 +831,14 @@ static int sec_debugfs_init(struct hisi_qm *qm)
 						  sec_debugfs_root);
 	qm->debug.sqe_mask_offset = SEC_SQE_MASK_OFFSET;
 	qm->debug.sqe_mask_len = SEC_SQE_MASK_LEN;
+
+	ret = hisi_qm_diff_regs_init(qm, sec_diff_regs,
+				ARRAY_SIZE(sec_diff_regs));
+	if (ret) {
+		dev_warn(dev, "Failed to init SEC diff regs!\n");
+		goto debugfs_remove;
+	}
+
 	hisi_qm_debug_init(qm);
 
 	ret = sec_debug_init(qm);
@@ -799,12 +848,16 @@ static int sec_debugfs_init(struct hisi_qm *qm)
 	return 0;
 
 failed_to_create:
+	hisi_qm_diff_regs_uninit(qm, ARRAY_SIZE(sec_diff_regs));
+debugfs_remove:
 	debugfs_remove_recursive(sec_debugfs_root);
 	return ret;
 }
 
 static void sec_debugfs_exit(struct hisi_qm *qm)
 {
+	hisi_qm_diff_regs_uninit(qm, ARRAY_SIZE(sec_diff_regs));
+
 	debugfs_remove_recursive(qm->debug.debug_root);
 }
 
-- 
Gitee


From 16683f6b5cc2566f2660154788901cf40a52d690 Mon Sep 17 00:00:00 2001
From: Kai Ye <yekai13@huawei.com>
Date: Tue, 19 Jul 2022 16:51:40 +0800
Subject: [PATCH 505/674] crypto: hisilicon/zip - support register checking

mainline inclusion
from mainline-v5.19-rc1
commit 9b0c97dfc215
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=9b0c97dfc215

----------------------------------------------------------------------

The value of the register is changed after the task running. A debugfs
file node is added to help users to check the change of register values.

Signed-off-by: Longfang Liu <liulongfang@huawei.com>
Signed-off-by: Kai Ye <yekai13@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jiangshui Yang <yangjiangshui@h-partners.com>
Reviewed-by: Yang Shen <shenyang39@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/crypto/hisilicon/zip/zip_main.c | 78 ++++++++++++++++++++++---
 1 file changed, 70 insertions(+), 8 deletions(-)

diff --git a/drivers/crypto/hisilicon/zip/zip_main.c b/drivers/crypto/hisilicon/zip/zip_main.c
index 66decfe07282..ed539755eaf8 100644
--- a/drivers/crypto/hisilicon/zip/zip_main.c
+++ b/drivers/crypto/hisilicon/zip/zip_main.c
@@ -49,14 +49,18 @@
 
 #define HZIP_QM_IDEL_STATUS		0x3040e4
 
-#define HZIP_CORE_DEBUG_COMP_0		0x302000
-#define HZIP_CORE_DEBUG_COMP_1		0x303000
-#define HZIP_CORE_DEBUG_DECOMP_0	0x304000
-#define HZIP_CORE_DEBUG_DECOMP_1	0x305000
-#define HZIP_CORE_DEBUG_DECOMP_2	0x306000
-#define HZIP_CORE_DEBUG_DECOMP_3	0x307000
-#define HZIP_CORE_DEBUG_DECOMP_4	0x308000
-#define HZIP_CORE_DEBUG_DECOMP_5	0x309000
+#define HZIP_CORE_DFX_BASE		0x301000
+#define HZIP_CLOCK_GATED_CONTL		0X301004
+#define HZIP_CORE_DFX_COMP_0		0x302000
+#define HZIP_CORE_DFX_COMP_1		0x303000
+#define HZIP_CORE_DFX_DECOMP_0		0x304000
+#define HZIP_CORE_DFX_DECOMP_1		0x305000
+#define HZIP_CORE_DFX_DECOMP_2		0x306000
+#define HZIP_CORE_DFX_DECOMP_3		0x307000
+#define HZIP_CORE_DFX_DECOMP_4		0x308000
+#define HZIP_CORE_DFX_DECOMP_5		0x309000
+#define HZIP_CORE_REGS_BASE_LEN		0xB0
+#define HZIP_CORE_REGS_DFX_LEN		0x28
 
 #define HZIP_CORE_INT_SOURCE		0x3010A0
 #define HZIP_CORE_INT_MASK_REG		0x3010A4
@@ -230,6 +234,48 @@ static const struct debugfs_reg32 hzip_dfx_regs[] = {
 	{"HZIP_DECOMP_LZ77_CURR_ST       ",  0x9cull},
 };
 
+/* define the ZIP's dfx regs region and region length */
+static struct dfx_diff_registers hzip_diff_regs[] = {
+	{
+		.reg_offset = HZIP_CORE_DFX_BASE,
+		.reg_len = HZIP_CORE_REGS_BASE_LEN,
+	}, {
+		.reg_offset = HZIP_CORE_DFX_COMP_0,
+		.reg_len = HZIP_CORE_REGS_DFX_LEN,
+	}, {
+		.reg_offset = HZIP_CORE_DFX_COMP_1,
+		.reg_len = HZIP_CORE_REGS_DFX_LEN,
+	}, {
+		.reg_offset = HZIP_CORE_DFX_DECOMP_0,
+		.reg_len = HZIP_CORE_REGS_DFX_LEN,
+	}, {
+		.reg_offset = HZIP_CORE_DFX_DECOMP_1,
+		.reg_len = HZIP_CORE_REGS_DFX_LEN,
+	}, {
+		.reg_offset = HZIP_CORE_DFX_DECOMP_2,
+		.reg_len = HZIP_CORE_REGS_DFX_LEN,
+	}, {
+		.reg_offset = HZIP_CORE_DFX_DECOMP_3,
+		.reg_len = HZIP_CORE_REGS_DFX_LEN,
+	}, {
+		.reg_offset = HZIP_CORE_DFX_DECOMP_4,
+		.reg_len = HZIP_CORE_REGS_DFX_LEN,
+	}, {
+		.reg_offset = HZIP_CORE_DFX_DECOMP_5,
+		.reg_len = HZIP_CORE_REGS_DFX_LEN,
+	},
+};
+
+static int hzip_diff_regs_show(struct seq_file *s, void *unused)
+{
+	struct hisi_qm *qm = s->private;
+
+	hisi_qm_acc_diff_regs_dump(qm, s, qm->debug.acc_diff_regs,
+					ARRAY_SIZE(hzip_diff_regs));
+
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(hzip_diff_regs);
 static const struct kernel_param_ops zip_uacce_mode_ops = {
 	.set = uacce_mode_set,
 	.get = param_get_int,
@@ -621,6 +667,7 @@ static int hisi_zip_core_debug_init(struct hisi_qm *qm)
 
 static void hisi_zip_dfx_debug_init(struct hisi_qm *qm)
 {
+	struct dfx_diff_registers *hzip_regs = qm->debug.acc_diff_regs;
 	struct hisi_zip *zip = container_of(qm, struct hisi_zip, qm);
 	struct hisi_zip_dfx *dfx = &zip->dfx;
 	struct dentry *tmp_dir;
@@ -634,6 +681,10 @@ static void hisi_zip_dfx_debug_init(struct hisi_qm *qm)
 				    0644, tmp_dir, data,
 				    &zip_atomic64_ops);
 	}
+
+	if (qm->fun_type == QM_HW_PF && hzip_regs)
+		debugfs_create_file("diff_regs", 0444, tmp_dir,
+				      qm, &hzip_diff_regs_fops);
 }
 
 static int hisi_zip_ctrl_debug_init(struct hisi_qm *qm)
@@ -666,6 +717,13 @@ static int hisi_zip_debugfs_init(struct hisi_qm *qm)
 	qm->debug.sqe_mask_offset = HZIP_SQE_MASK_OFFSET;
 	qm->debug.sqe_mask_len = HZIP_SQE_MASK_LEN;
 	qm->debug.debug_root = dev_d;
+	ret = hisi_qm_diff_regs_init(qm, hzip_diff_regs,
+				ARRAY_SIZE(hzip_diff_regs));
+	if (ret) {
+		dev_warn(dev, "Failed to init ZIP diff regs!\n");
+		goto debugfs_remove;
+	}
+
 	hisi_qm_debug_init(qm);
 
 	if (qm->fun_type == QM_HW_PF) {
@@ -679,6 +737,8 @@ static int hisi_zip_debugfs_init(struct hisi_qm *qm)
 	return 0;
 
 failed_to_create:
+	hisi_qm_diff_regs_uninit(qm, ARRAY_SIZE(hzip_diff_regs));
+debugfs_remove:
 	debugfs_remove_recursive(hzip_debugfs_root);
 	return ret;
 }
@@ -703,6 +763,8 @@ static void hisi_zip_debug_regs_clear(struct hisi_qm *qm)
 
 static void hisi_zip_debugfs_exit(struct hisi_qm *qm)
 {
+	hisi_qm_diff_regs_uninit(qm, ARRAY_SIZE(hzip_diff_regs));
+
 	debugfs_remove_recursive(qm->debug.debug_root);
 
 	if (qm->fun_type == QM_HW_PF) {
-- 
Gitee


From 64ee1e58dc5b085080d90b0a7b0504240723395a Mon Sep 17 00:00:00 2001
From: Kai Ye <yekai13@huawei.com>
Date: Tue, 19 Jul 2022 16:51:41 +0800
Subject: [PATCH 506/674] crypto: hisilicon/qm - add last word dumping for ACC

mainline inclusion
from mainline-v5.19-rc1
commit a888ccd6c666
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a888ccd6c666

----------------------------------------------------------------------

Add last word dumping function during acc engines controller reset.
The last words are reported to the printed information during the
reset. The dmesg information included qm debugging registers and
engine debugging registers. It can help to improve debugging
capability.

Signed-off-by: Kai Ye <yekai13@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jiangshui Yang <yangjiangshui@h-partners.com>
Reviewed-by: Yang Shen <shenyang39@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/crypto/hisilicon/qm.c | 57 +++++++++++++++++++++++++++++++++++
 include/linux/hisi_acc_qm.h   |  4 +++
 2 files changed, 61 insertions(+)

diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c
index 45f1ded0801f..e44d0c46705a 100644
--- a/drivers/crypto/hisilicon/qm.c
+++ b/drivers/crypto/hisilicon/qm.c
@@ -3604,6 +3604,17 @@ static void hisi_qm_set_state(struct hisi_qm *qm, enum vf_state state)
 		writel(state, qm->io_base + QM_VF_STATE);
 }
 
+static void qm_last_regs_uninit(struct hisi_qm *qm)
+{
+	struct qm_debug *debug = &qm->debug;
+
+	if (qm->fun_type == QM_HW_VF || !debug->qm_last_words)
+		return;
+
+	kfree(debug->qm_last_words);
+	debug->qm_last_words = NULL;
+}
+
 static void hisi_qm_pre_init(struct hisi_qm *qm)
 {
 	struct pci_dev *pdev = qm->pdev;
@@ -3685,6 +3696,8 @@ void hisi_qm_uninit(struct hisi_qm *qm)
 	struct pci_dev *pdev = qm->pdev;
 	struct device *dev = &pdev->dev;
 
+	qm_last_regs_uninit(qm);
+
 	qm_cmd_uninit(qm);
 	kfree(qm->factor);
 	down_write(&qm->qps_lock);
@@ -5364,6 +5377,24 @@ static int qm_controller_reset_done(struct hisi_qm *qm)
 	return 0;
 }
 
+static void qm_show_last_dfx_regs(struct hisi_qm *qm)
+{
+	struct qm_debug *debug = &qm->debug;
+	struct pci_dev *pdev = qm->pdev;
+	u32 val;
+	int i;
+
+	if (qm->fun_type == QM_HW_VF || !debug->qm_last_words)
+		return;
+
+	for (i = 0; i < ARRAY_SIZE(qm_dfx_regs); i++) {
+		val = readl_relaxed(qm->io_base + qm_dfx_regs[i].offset);
+		if (debug->qm_last_words[i] != val)
+			pci_info(pdev, "%s \t= 0x%08x => 0x%08x\n",
+			qm_dfx_regs[i].name, debug->qm_last_words[i], val);
+	}
+}
+
 static int qm_controller_reset(struct hisi_qm *qm)
 {
 	struct pci_dev *pdev = qm->pdev;
@@ -5379,6 +5410,10 @@ static int qm_controller_reset(struct hisi_qm *qm)
 		return ret;
 	}
 
+	qm_show_last_dfx_regs(qm);
+	if (qm->err_ini->show_last_dfx_regs)
+		qm->err_ini->show_last_dfx_regs(qm);
+
 	ret = qm_soft_reset(qm);
 	if (ret) {
 		pci_err(pdev, "Controller reset failed (%d)\n", ret);
@@ -6091,6 +6126,26 @@ static int hisi_qm_memory_init(struct hisi_qm *qm)
 	return ret;
 }
 
+static void qm_last_regs_init(struct hisi_qm *qm)
+{
+	int dfx_regs_num = ARRAY_SIZE(qm_dfx_regs);
+	struct qm_debug *debug = &qm->debug;
+	int i;
+
+	if (qm->fun_type == QM_HW_VF)
+		return;
+
+	debug->qm_last_words = kcalloc(dfx_regs_num, sizeof(unsigned int),
+								GFP_KERNEL);
+	if (!debug->qm_last_words)
+		return;
+
+	for (i = 0; i < dfx_regs_num; i++) {
+		debug->qm_last_words[i] = readl_relaxed(qm->io_base +
+			qm_dfx_regs[i].offset);
+	}
+}
+
 /**
  * hisi_qm_init() - Initialize configures about qm.
  * @qm: The qm needing init.
@@ -6143,6 +6198,8 @@ int hisi_qm_init(struct hisi_qm *qm)
 	qm_cmd_init(qm);
 	atomic_set(&qm->status.flags, QM_INIT);
 
+	qm_last_regs_init(qm);
+
 	return 0;
 
 err_alloc_uacce:
diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h
index 7e318055f76b..677ce831949a 100644
--- a/include/linux/hisi_acc_qm.h
+++ b/include/linux/hisi_acc_qm.h
@@ -197,6 +197,9 @@ struct qm_debug {
 	struct dentry *debug_root;
 	struct dentry *qm_d;
 	struct debugfs_file files[DEBUG_FILE_NUM];
+	unsigned int *qm_last_words;
+	/* ACC engines recoreding last regs */
+	unsigned int *last_words;
 	struct dfx_diff_registers *qm_diff_regs;
 	struct dfx_diff_registers *acc_diff_regs;
 };
@@ -252,6 +255,7 @@ struct hisi_qm_err_ini {
 	void (*open_sva_prefetch)(struct hisi_qm *qm);
 	void (*close_sva_prefetch)(struct hisi_qm *qm);
 	void (*log_dev_hw_err)(struct hisi_qm *qm, u32 err_sts);
+	void (*show_last_dfx_regs)(struct hisi_qm *qm);
 	void (*err_info_init)(struct hisi_qm *qm);
 };
 
-- 
Gitee


From 1ad337a645d4ef19149608fadc651f8d0a3fb17b Mon Sep 17 00:00:00 2001
From: Kai Ye <yekai13@huawei.com>
Date: Tue, 19 Jul 2022 16:51:42 +0800
Subject: [PATCH 507/674] crypto: hisilicon/sec - support last word dumping

mainline inclusion
from mainline-v5.19-rc1
commit 8a88d0914529
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=8a88d0914529

----------------------------------------------------------------------

Add last word dumping function during sec engine controller reset.

Signed-off-by: Kai Ye <yekai13@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jiangshui Yang <yangjiangshui@h-partners.com>
Reviewed-by: Yang Shen <shenyang39@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/crypto/hisilicon/sec2/sec_main.c | 55 +++++++++++++++++++++++-
 1 file changed, 54 insertions(+), 1 deletion(-)

diff --git a/drivers/crypto/hisilicon/sec2/sec_main.c b/drivers/crypto/hisilicon/sec2/sec_main.c
index 0dab1fde56d1..eb6474d3f2cc 100644
--- a/drivers/crypto/hisilicon/sec2/sec_main.c
+++ b/drivers/crypto/hisilicon/sec2/sec_main.c
@@ -861,6 +861,53 @@ static void sec_debugfs_exit(struct hisi_qm *qm)
 	debugfs_remove_recursive(qm->debug.debug_root);
 }
 
+static int sec_show_last_regs_init(struct hisi_qm *qm)
+{
+	struct qm_debug *debug = &qm->debug;
+	int i;
+
+	debug->last_words = kcalloc(ARRAY_SIZE(sec_dfx_regs),
+					sizeof(unsigned int), GFP_KERNEL);
+	if (!debug->last_words)
+		return -ENOMEM;
+
+	for (i = 0; i < ARRAY_SIZE(sec_dfx_regs); i++)
+		debug->last_words[i] = readl_relaxed(qm->io_base +
+							sec_dfx_regs[i].offset);
+
+	return 0;
+}
+
+static void sec_show_last_regs_uninit(struct hisi_qm *qm)
+{
+	struct qm_debug *debug = &qm->debug;
+
+	if (qm->fun_type == QM_HW_VF || !debug->last_words)
+		return;
+
+	kfree(debug->last_words);
+	debug->last_words = NULL;
+}
+
+static void sec_show_last_dfx_regs(struct hisi_qm *qm)
+{
+	struct qm_debug *debug = &qm->debug;
+	struct pci_dev *pdev = qm->pdev;
+	u32 val;
+	int i;
+
+	if (qm->fun_type == QM_HW_VF || !debug->last_words)
+		return;
+
+	/* dumps last word of the debugging registers during controller reset */
+	for (i = 0; i < ARRAY_SIZE(sec_dfx_regs); i++) {
+		val = readl_relaxed(qm->io_base + sec_dfx_regs[i].offset);
+		if (val != debug->last_words[i])
+			pci_info(pdev, "%s \t= 0x%08x => 0x%08x\n",
+				sec_dfx_regs[i].name, debug->last_words[i], val);
+	}
+}
+
 static void sec_log_hw_error(struct hisi_qm *qm, u32 err_sts)
 {
 	const struct sec_hw_error *errs = sec_hw_errors;
@@ -927,6 +974,7 @@ static const struct hisi_qm_err_ini sec_err_ini = {
 	.open_axi_master_ooo	= sec_open_axi_master_ooo,
 	.open_sva_prefetch	= sec_open_sva_prefetch,
 	.close_sva_prefetch	= sec_close_sva_prefetch,
+	.show_last_dfx_regs	= sec_show_last_dfx_regs,
 	.err_info_init		= sec_err_info_init,
 };
 
@@ -945,8 +993,11 @@ static int sec_pf_probe_init(struct sec_dev *sec)
 	sec_open_sva_prefetch(qm);
 	hisi_qm_dev_err_init(qm);
 	sec_debug_regs_clear(qm);
+	ret = sec_show_last_regs_init(qm);
+	if (ret)
+		pci_err(qm->pdev, "Failed to init last word regs!\n");
 
-	return 0;
+	return ret;
 }
 
 static int sec_qm_init(struct hisi_qm *qm, struct pci_dev *pdev)
@@ -1120,6 +1171,7 @@ static int sec_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	sec_debugfs_exit(qm);
 	hisi_qm_stop(qm, QM_NORMAL);
 err_probe_uninit:
+	sec_show_last_regs_uninit(qm);
 	sec_probe_uninit(qm);
 err_qm_uninit:
 	sec_qm_uninit(qm);
@@ -1144,6 +1196,7 @@ static void sec_remove(struct pci_dev *pdev)
 
 	if (qm->fun_type == QM_HW_PF)
 		sec_debug_regs_clear(qm);
+	sec_show_last_regs_uninit(qm);
 
 	sec_probe_uninit(qm);
 
-- 
Gitee


From 5366ae3636ab81eab3d8cac3b7f2ab211b98bf76 Mon Sep 17 00:00:00 2001
From: Kai Ye <yekai13@huawei.com>
Date: Tue, 19 Jul 2022 16:51:43 +0800
Subject: [PATCH 508/674] crypto: hisilicon/hpre - support last word dumping

mainline inclusion
from mainline-v5.19-rc1
commit 42123e81fdba
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=42123e81fdba

----------------------------------------------------------------------

1. Add some debugging registers.
2. Add last word dumping function during hpre engine controller reset.

Signed-off-by: Kai Ye <yekai13@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jiangshui Yang <yangjiangshui@h-partners.com>
Reviewed-by: Yang Shen <shenyang39@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/crypto/hisilicon/hpre/hpre_main.c | 132 ++++++++++++++++++----
 1 file changed, 112 insertions(+), 20 deletions(-)

diff --git a/drivers/crypto/hisilicon/hpre/hpre_main.c b/drivers/crypto/hisilicon/hpre/hpre_main.c
index 07222856cc84..8200793aa15c 100644
--- a/drivers/crypto/hisilicon/hpre/hpre_main.c
+++ b/drivers/crypto/hisilicon/hpre/hpre_main.c
@@ -36,6 +36,12 @@
 #define HPRE_DATA_WUSER_CFG		0x301040
 #define HPRE_INT_MASK			0x301400
 #define HPRE_INT_STATUS			0x301800
+#define HPRE_HAC_INT_MSK		0x301400
+#define HPRE_HAC_RAS_CE_ENB		0x301410
+#define HPRE_HAC_RAS_NFE_ENB		0x301414
+#define HPRE_HAC_RAS_FE_ENB		0x301418
+#define HPRE_HAC_INT_SET		0x301500
+#define HPRE_RNG_TIMEOUT_NUM		0x301A34
 #define HPRE_CORE_INT_ENABLE		0
 #define HPRE_CORE_INT_DISABLE		GENMASK(21, 0)
 #define HPRE_RDCHN_INI_ST		0x301a00
@@ -201,28 +207,32 @@ static const u64 hpre_cluster_offsets[] = {
 };
 
 static const struct debugfs_reg32 hpre_cluster_dfx_regs[] = {
-	{"CORES_EN_STATUS          ",  HPRE_CORE_EN_OFFSET},
-	{"CORES_INI_CFG              ",  HPRE_CORE_INI_CFG_OFFSET},
-	{"CORES_INI_STATUS         ",  HPRE_CORE_INI_STATUS_OFFSET},
-	{"CORES_HTBT_WARN         ",  HPRE_CORE_HTBT_WARN_OFFSET},
-	{"CORES_IS_SCHD               ",  HPRE_CORE_IS_SCHD_OFFSET},
+	{"CORES_EN_STATUS     ",  HPRE_CORE_EN_OFFSET},
+	{"CORES_INI_CFG       ",  HPRE_CORE_INI_CFG_OFFSET},
+	{"CORES_INI_STATUS    ",  HPRE_CORE_INI_STATUS_OFFSET},
+	{"CORES_HTBT_WARN     ",  HPRE_CORE_HTBT_WARN_OFFSET},
+	{"CORES_IS_SCHD       ",  HPRE_CORE_IS_SCHD_OFFSET},
 };
 
 static const struct debugfs_reg32 hpre_com_dfx_regs[] = {
-	{"READ_CLR_EN          ",  HPRE_CTRL_CNT_CLR_CE},
-	{"AXQOS                   ",  HPRE_VFG_AXQOS},
-	{"AWUSR_CFG              ",  HPRE_AWUSR_FP_CFG},
-	{"QM_ARUSR_MCFG1           ",  QM_ARUSER_M_CFG_1},
-	{"QM_AWUSR_MCFG1           ",  QM_AWUSER_M_CFG_1},
-	{"BD_ENDIAN               ",  HPRE_BD_ENDIAN},
-	{"ECC_CHECK_CTRL       ",  HPRE_ECC_BYPASS},
-	{"RAS_INT_WIDTH       ",  HPRE_RAS_WIDTH_CFG},
-	{"POISON_BYPASS       ",  HPRE_POISON_BYPASS},
-	{"BD_ARUSER               ",  HPRE_BD_ARUSR_CFG},
-	{"BD_AWUSER               ",  HPRE_BD_AWUSR_CFG},
-	{"DATA_ARUSER            ",  HPRE_DATA_RUSER_CFG},
-	{"DATA_AWUSER           ",  HPRE_DATA_WUSER_CFG},
-	{"INT_STATUS               ",  HPRE_INT_STATUS},
+	{"READ_CLR_EN     ",  HPRE_CTRL_CNT_CLR_CE},
+	{"AXQOS           ",  HPRE_VFG_AXQOS},
+	{"AWUSR_CFG       ",  HPRE_AWUSR_FP_CFG},
+	{"BD_ENDIAN       ",  HPRE_BD_ENDIAN},
+	{"ECC_CHECK_CTRL  ",  HPRE_ECC_BYPASS},
+	{"RAS_INT_WIDTH   ",  HPRE_RAS_WIDTH_CFG},
+	{"POISON_BYPASS   ",  HPRE_POISON_BYPASS},
+	{"BD_ARUSER       ",  HPRE_BD_ARUSR_CFG},
+	{"BD_AWUSER       ",  HPRE_BD_AWUSR_CFG},
+	{"DATA_ARUSER     ",  HPRE_DATA_RUSER_CFG},
+	{"DATA_AWUSER     ",  HPRE_DATA_WUSER_CFG},
+	{"INT_STATUS      ",  HPRE_INT_STATUS},
+	{"INT_MASK        ",  HPRE_HAC_INT_MSK},
+	{"RAS_CE_ENB      ",  HPRE_HAC_RAS_CE_ENB},
+	{"RAS_NFE_ENB     ",  HPRE_HAC_RAS_NFE_ENB},
+	{"RAS_FE_ENB      ",  HPRE_HAC_RAS_FE_ENB},
+	{"INT_SET         ",  HPRE_HAC_INT_SET},
+	{"RNG_TIMEOUT_NUM ",  HPRE_RNG_TIMEOUT_NUM},
 };
 
 static const char *hpre_dfx_files[HPRE_DFX_FILE_NUM] = {
@@ -1023,6 +1033,82 @@ static int hpre_qm_init(struct hisi_qm *qm, struct pci_dev *pdev)
 	return hisi_qm_init(qm);
 }
 
+static int hpre_show_last_regs_init(struct hisi_qm *qm)
+{
+	int cluster_dfx_regs_num =  ARRAY_SIZE(hpre_cluster_dfx_regs);
+	int com_dfx_regs_num = ARRAY_SIZE(hpre_com_dfx_regs);
+	u8 clusters_num = hpre_cluster_num(qm);
+	struct qm_debug *debug = &qm->debug;
+	void __iomem *io_base;
+	int i, j, idx;
+
+	debug->last_words = kcalloc(cluster_dfx_regs_num * clusters_num +
+			com_dfx_regs_num, sizeof(unsigned int), GFP_KERNEL);
+	if (!debug->last_words)
+		return -ENOMEM;
+
+	for (i = 0; i < com_dfx_regs_num; i++)
+		debug->last_words[i] = readl_relaxed(qm->io_base +
+						hpre_com_dfx_regs[i].offset);
+
+	for (i = 0; i < clusters_num; i++) {
+		io_base = qm->io_base + hpre_cluster_offsets[i];
+		for (j = 0; j < cluster_dfx_regs_num; j++) {
+			idx = com_dfx_regs_num + i * cluster_dfx_regs_num + j;
+			debug->last_words[idx] = readl_relaxed(
+				io_base + hpre_cluster_dfx_regs[j].offset);
+		}
+	}
+
+	return 0;
+}
+
+static void hpre_show_last_regs_uninit(struct hisi_qm *qm)
+{
+	struct qm_debug *debug = &qm->debug;
+
+	if (qm->fun_type == QM_HW_VF || !debug->last_words)
+		return;
+
+	kfree(debug->last_words);
+	debug->last_words = NULL;
+}
+
+static void hpre_show_last_dfx_regs(struct hisi_qm *qm)
+{
+	int cluster_dfx_regs_num =  ARRAY_SIZE(hpre_cluster_dfx_regs);
+	int com_dfx_regs_num = ARRAY_SIZE(hpre_com_dfx_regs);
+	u8 clusters_num = hpre_cluster_num(qm);
+	struct qm_debug *debug = &qm->debug;
+	struct pci_dev *pdev = qm->pdev;
+	void __iomem *io_base;
+	int i, j, idx;
+	u32 val;
+
+	if (qm->fun_type == QM_HW_VF || !debug->last_words)
+		return;
+
+	/* dumps last word of the debugging registers during controller reset */
+	for (i = 0; i < com_dfx_regs_num; i++) {
+		val = readl_relaxed(qm->io_base + hpre_com_dfx_regs[i].offset);
+		if (debug->last_words[i] != val)
+			pci_info(pdev, "Common_core:%s \t= 0x%08x => 0x%08x\n",
+			  hpre_com_dfx_regs[i].name, debug->last_words[i], val);
+	}
+
+	for (i = 0; i < clusters_num; i++) {
+		io_base = qm->io_base + hpre_cluster_offsets[i];
+		for (j = 0; j <  cluster_dfx_regs_num; j++) {
+			val = readl_relaxed(io_base +
+					     hpre_cluster_dfx_regs[j].offset);
+			idx = com_dfx_regs_num + i * cluster_dfx_regs_num + j;
+			if (debug->last_words[idx] != val)
+				pci_info(pdev, "cluster-%d:%s \t= 0x%08x => 0x%08x\n",
+				i, hpre_cluster_dfx_regs[j].name, debug->last_words[idx], val);
+		}
+	}
+}
+
 static void hpre_log_hw_error(struct hisi_qm *qm, u32 err_sts)
 {
 	const struct hpre_hw_error *err = hpre_hw_errors;
@@ -1081,6 +1167,7 @@ static const struct hisi_qm_err_ini hpre_err_ini = {
 	.open_axi_master_ooo	= hpre_open_axi_master_ooo,
 	.open_sva_prefetch	= hpre_open_sva_prefetch,
 	.close_sva_prefetch	= hpre_close_sva_prefetch,
+	.show_last_dfx_regs	= hpre_show_last_dfx_regs,
 	.err_info_init		= hpre_err_info_init,
 };
 
@@ -1098,8 +1185,11 @@ static int hpre_pf_probe_init(struct hpre *hpre)
 	qm->err_ini = &hpre_err_ini;
 	qm->err_ini->err_info_init(qm);
 	hisi_qm_dev_err_init(qm);
+	ret = hpre_show_last_regs_init(qm);
+	if (ret)
+		pci_err(qm->pdev, "Failed to init last word regs!\n");
 
-	return 0;
+	return ret;
 }
 
 static int hpre_probe_init(struct hpre *hpre)
@@ -1185,6 +1275,7 @@ static int hpre_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	hisi_qm_stop(qm, QM_NORMAL);
 
 err_with_err_init:
+	hpre_show_last_regs_uninit(qm);
 	hisi_qm_dev_err_uninit(qm);
 
 err_with_qm_init:
@@ -1215,6 +1306,7 @@ static void hpre_remove(struct pci_dev *pdev)
 	if (qm->fun_type == QM_HW_PF) {
 		hpre_cnt_regs_clear(qm);
 		qm->debug.curr_qm_qp_num = 0;
+		hpre_show_last_regs_uninit(qm);
 		hisi_qm_dev_err_uninit(qm);
 	}
 
-- 
Gitee


From 29c3db94b3d218f5370bc932a32c8770c276ecfa Mon Sep 17 00:00:00 2001
From: Kai Ye <yekai13@huawei.com>
Date: Tue, 19 Jul 2022 16:51:44 +0800
Subject: [PATCH 509/674] crypto: hisilicon/zip - support last word dumping

mainline inclusion
from mainline-v5.19-rc1
commit 5bfabd50c6fa
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=5bfabd50c6fa

----------------------------------------------------------------------

1. Add some debugging registers.
2. Add last word dumping function during zip engine controller reset.

Signed-off-by: Kai Ye <yekai13@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jiangshui Yang <yangjiangshui@h-partners.com>
Reviewed-by: Yang Shen <shenyang39@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/crypto/hisilicon/zip/zip_main.c | 107 +++++++++++++++++++++++-
 1 file changed, 106 insertions(+), 1 deletion(-)

diff --git a/drivers/crypto/hisilicon/zip/zip_main.c b/drivers/crypto/hisilicon/zip/zip_main.c
index ed539755eaf8..727030e1a57e 100644
--- a/drivers/crypto/hisilicon/zip/zip_main.c
+++ b/drivers/crypto/hisilicon/zip/zip_main.c
@@ -234,6 +234,22 @@ static const struct debugfs_reg32 hzip_dfx_regs[] = {
 	{"HZIP_DECOMP_LZ77_CURR_ST       ",  0x9cull},
 };
 
+static const struct debugfs_reg32 hzip_com_dfx_regs[] = {
+	{"HZIP_CLOCK_GATE_CTRL           ",  0x301004},
+	{"HZIP_CORE_INT_RAS_CE_ENB       ",  0x301160},
+	{"HZIP_CORE_INT_RAS_NFE_ENB      ",  0x301164},
+	{"HZIP_CORE_INT_RAS_FE_ENB       ",  0x301168},
+	{"HZIP_UNCOM_ERR_RAS_CTRL        ",  0x30116C},
+};
+
+static const struct debugfs_reg32 hzip_dump_dfx_regs[] = {
+	{"HZIP_GET_BD_NUM                ",  0x00ull},
+	{"HZIP_GET_RIGHT_BD              ",  0x04ull},
+	{"HZIP_GET_ERROR_BD              ",  0x08ull},
+	{"HZIP_DONE_BD_NUM               ",  0x0cull},
+	{"HZIP_MAX_DELAY                 ",  0x20ull},
+};
+
 /* define the ZIP's dfx regs region and region length */
 static struct dfx_diff_registers hzip_diff_regs[] = {
 	{
@@ -773,6 +789,87 @@ static void hisi_zip_debugfs_exit(struct hisi_qm *qm)
 	}
 }
 
+static int hisi_zip_show_last_regs_init(struct hisi_qm *qm)
+{
+	int core_dfx_regs_num =  ARRAY_SIZE(hzip_dump_dfx_regs);
+	int com_dfx_regs_num = ARRAY_SIZE(hzip_com_dfx_regs);
+	struct qm_debug *debug = &qm->debug;
+	void __iomem *io_base;
+	int i, j, idx;
+
+	debug->last_words = kcalloc(core_dfx_regs_num * HZIP_CORE_NUM +
+			com_dfx_regs_num, sizeof(unsigned int), GFP_KERNEL);
+	if (!debug->last_words)
+		return -ENOMEM;
+
+	for (i = 0; i < com_dfx_regs_num; i++) {
+		io_base = qm->io_base + hzip_com_dfx_regs[i].offset;
+		debug->last_words[i] = readl_relaxed(io_base);
+	}
+
+	for (i = 0; i < HZIP_CORE_NUM; i++) {
+		io_base = qm->io_base + core_offsets[i];
+		for (j = 0; j < core_dfx_regs_num; j++) {
+			idx = com_dfx_regs_num + i * core_dfx_regs_num + j;
+			debug->last_words[idx] = readl_relaxed(
+				io_base + hzip_dump_dfx_regs[j].offset);
+		}
+	}
+
+	return 0;
+}
+
+static void hisi_zip_show_last_regs_uninit(struct hisi_qm *qm)
+{
+	struct qm_debug *debug = &qm->debug;
+
+	if (qm->fun_type == QM_HW_VF || !debug->last_words)
+		return;
+
+	kfree(debug->last_words);
+	debug->last_words = NULL;
+}
+
+static void hisi_zip_show_last_dfx_regs(struct hisi_qm *qm)
+{
+	int core_dfx_regs_num =  ARRAY_SIZE(hzip_dump_dfx_regs);
+	int com_dfx_regs_num = ARRAY_SIZE(hzip_com_dfx_regs);
+	struct qm_debug *debug = &qm->debug;
+	char buf[HZIP_BUF_SIZE];
+	void __iomem *base;
+	int i, j, idx;
+	u32 val;
+
+	if (qm->fun_type == QM_HW_VF || !debug->last_words)
+		return;
+
+	for (i = 0; i < com_dfx_regs_num; i++) {
+		val = readl_relaxed(qm->io_base + hzip_com_dfx_regs[i].offset);
+		if (debug->last_words[i] != val)
+			pci_info(qm->pdev, "com_dfx: %s \t= 0x%08x => 0x%08x\n",
+				hzip_com_dfx_regs[i].name, debug->last_words[i], val);
+	}
+
+	for (i = 0; i < HZIP_CORE_NUM; i++) {
+		if (i < HZIP_COMP_CORE_NUM)
+			scnprintf(buf, sizeof(buf), "Comp_core-%d", i);
+		else
+			scnprintf(buf, sizeof(buf), "Decomp_core-%d",
+				  i - HZIP_COMP_CORE_NUM);
+		base = qm->io_base + core_offsets[i];
+
+		pci_info(qm->pdev, "==>%s:\n", buf);
+		/* dump last word for dfx regs during control resetting */
+		for (j = 0; j < core_dfx_regs_num; j++) {
+			idx = com_dfx_regs_num + i * core_dfx_regs_num + j;
+			val = readl_relaxed(base + hzip_dump_dfx_regs[j].offset);
+			if (debug->last_words[idx] != val)
+				pci_info(qm->pdev, "%s \t= 0x%08x => 0x%08x\n",
+					hzip_dump_dfx_regs[j].name, debug->last_words[idx], val);
+		}
+	}
+}
+
 static void hisi_zip_log_hw_error(struct hisi_qm *qm, u32 err_sts)
 {
 	const struct hisi_zip_hw_error *err = zip_hw_error;
@@ -860,6 +957,7 @@ static const struct hisi_qm_err_ini hisi_zip_err_ini = {
 	.close_axi_master_ooo	= hisi_zip_close_axi_master_ooo,
 	.open_sva_prefetch	= hisi_zip_open_sva_prefetch,
 	.close_sva_prefetch	= hisi_zip_close_sva_prefetch,
+	.show_last_dfx_regs	= hisi_zip_show_last_dfx_regs,
 	.err_info_init		= hisi_zip_err_info_init,
 };
 
@@ -867,6 +965,7 @@ static int hisi_zip_pf_probe_init(struct hisi_zip *hisi_zip)
 {
 	struct hisi_qm *qm = &hisi_zip->qm;
 	struct hisi_zip_ctrl *ctrl;
+	int ret;
 
 	ctrl = devm_kzalloc(&qm->pdev->dev, sizeof(*ctrl), GFP_KERNEL);
 	if (!ctrl)
@@ -882,7 +981,11 @@ static int hisi_zip_pf_probe_init(struct hisi_zip *hisi_zip)
 	hisi_qm_dev_err_init(qm);
 	hisi_zip_debug_regs_clear(qm);
 
-	return 0;
+	ret = hisi_zip_show_last_regs_init(qm);
+	if (ret)
+		pci_err(qm->pdev, "Failed to init last word regs!\n");
+
+	return ret;
 }
 
 static int hisi_zip_qm_init(struct hisi_qm *qm, struct pci_dev *pdev)
@@ -1026,6 +1129,7 @@ static int hisi_zip_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	hisi_qm_stop(qm, QM_NORMAL);
 
 err_dev_err_uninit:
+	hisi_zip_show_last_regs_uninit(qm);
 	hisi_qm_dev_err_uninit(qm);
 
 err_qm_uninit:
@@ -1047,6 +1151,7 @@ static void hisi_zip_remove(struct pci_dev *pdev)
 
 	hisi_zip_debugfs_exit(qm);
 	hisi_qm_stop(qm, QM_NORMAL);
+	hisi_zip_show_last_regs_uninit(qm);
 	hisi_qm_dev_err_uninit(qm);
 	hisi_zip_qm_uninit(qm);
 }
-- 
Gitee


From e4c7bd8e82ef37fccbbb968e75ab58ec1d250a31 Mon Sep 17 00:00:00 2001
From: Yang Shen <shenyang39@huawei.com>
Date: Tue, 19 Jul 2022 16:51:45 +0800
Subject: [PATCH 510/674] crypto: hisilicon/sgl - align the hardware sgl dma
 address

mainline inclusion
from mainline-v5.19-rc1
commit 948e35f13181
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=948e35f13181

----------------------------------------------------------------------

The hardware needs aligned sgl dma address. So expend the sgl_size to
align 64 bytes.

Signed-off-by: Yang Shen <shenyang39@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jiangshui Yang <yangjiangshui@h-partners.com>
Reviewed-by: Yang Shen <shenyang39@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/crypto/hisilicon/sgl.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/crypto/hisilicon/sgl.c b/drivers/crypto/hisilicon/sgl.c
index f7efc02b065f..2b6f2281cfd6 100644
--- a/drivers/crypto/hisilicon/sgl.c
+++ b/drivers/crypto/hisilicon/sgl.c
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2019 HiSilicon Limited. */
+#include <linux/align.h>
 #include <linux/dma-mapping.h>
 #include <linux/hisi_acc_qm.h>
 #include <linux/module.h>
@@ -64,8 +65,9 @@ struct hisi_acc_sgl_pool *hisi_acc_create_sgl_pool(struct device *dev,
 	if (!dev || !count || !sge_nr || sge_nr > HISI_ACC_SGL_SGE_NR_MAX)
 		return ERR_PTR(-EINVAL);
 
-	sgl_size = sizeof(struct acc_hw_sge) * sge_nr +
-		   sizeof(struct hisi_acc_hw_sgl);
+	sgl_size = ALIGN(sizeof(struct acc_hw_sge) * sge_nr +
+			 sizeof(struct hisi_acc_hw_sgl),
+			 HISI_ACC_SGL_ALIGN_SIZE);
 
 	/*
 	 * the pool may allocate a block of memory of size PAGE_SIZE * 2^(MAX_ORDER - 1),
-- 
Gitee


From 8061d4c1d05c1e2ec061b9b98940cd18d0f004dd Mon Sep 17 00:00:00 2001
From: Weili Qian <qianweili@huawei.com>
Date: Tue, 19 Jul 2022 16:51:46 +0800
Subject: [PATCH 511/674] crypto: hisilicon/qm - remove unused function
 declaration

mainline inclusion
from mainline-v5.19-rc1
commit 0b0002315adf
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0b0002315adf

----------------------------------------------------------------------

The 'hisi_qm_get_hw_version' function is unused, so remove the function
declaration.

Signed-off-by: Weili Qian <qianweili@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jiangshui Yang <yangjiangshui@h-partners.com>
Reviewed-by: Yang Shen <shenyang39@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 include/linux/hisi_acc_qm.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h
index 677ce831949a..b84092a39429 100644
--- a/include/linux/hisi_acc_qm.h
+++ b/include/linux/hisi_acc_qm.h
@@ -454,7 +454,6 @@ int hisi_qp_send(struct hisi_qp *qp, const void *msg);
 int hisi_qm_get_free_qp_num(struct hisi_qm *qm);
 int hisi_qm_get_vft(struct hisi_qm *qm, u32 *base, u32 *number);
 void hisi_qm_debug_init(struct hisi_qm *qm);
-enum qm_hw_ver hisi_qm_get_hw_version(struct pci_dev *pdev);
 void hisi_qm_debug_regs_clear(struct hisi_qm *qm);
 int hisi_qm_sriov_enable(struct pci_dev *pdev, int max_vfs);
 int hisi_qm_sriov_disable(struct pci_dev *pdev, bool is_frozen);
-- 
Gitee


From 6362db4b8c498a64eac4f98d897b559c73dd1532 Mon Sep 17 00:00:00 2001
From: Weili Qian <qianweili@huawei.com>
Date: Tue, 19 Jul 2022 16:51:47 +0800
Subject: [PATCH 512/674] crypto: hisilicon/qm - set function with static

mainline inclusion
from mainline-v5.19-rc1
commit fb06eb9727d6
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=fb06eb9727d6

----------------------------------------------------------------------

These functions 'hisi_qm_create_qp' and 'hisi_qm_set_vft' are not
used outside qm.c, so they are marked as static.

Signed-off-by: Weili Qian <qianweili@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jiangshui Yang <yangjiangshui@h-partners.com>
Reviewed-by: Yang Shen <shenyang39@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/crypto/hisilicon/qm.c | 6 ++----
 include/linux/hisi_acc_qm.h   | 2 --
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c
index e44d0c46705a..c14c52f17ca9 100644
--- a/drivers/crypto/hisilicon/qm.c
+++ b/drivers/crypto/hisilicon/qm.c
@@ -2835,7 +2835,7 @@ static struct hisi_qp *qm_create_qp_nolock(struct hisi_qm *qm, u8 alg_type)
  * return created qp, -EBUSY if all qps in qm allocated, -ENOMEM if allocating
  * qp memory fails.
  */
-struct hisi_qp *hisi_qm_create_qp(struct hisi_qm *qm, u8 alg_type)
+static struct hisi_qp *hisi_qm_create_qp(struct hisi_qm *qm, u8 alg_type)
 {
 	struct hisi_qp *qp;
 	int ret;
@@ -2853,7 +2853,6 @@ struct hisi_qp *hisi_qm_create_qp(struct hisi_qm *qm, u8 alg_type)
 
 	return qp;
 }
-EXPORT_SYMBOL_GPL(hisi_qm_create_qp);
 
 /**
  * hisi_qm_release_qp() - Release a qp back to its qm.
@@ -3738,7 +3737,7 @@ EXPORT_SYMBOL_GPL(hisi_qm_uninit);
  *
  * qm hw v1 does not support this interface.
  */
-int hisi_qm_get_vft(struct hisi_qm *qm, u32 *base, u32 *number)
+static int hisi_qm_get_vft(struct hisi_qm *qm, u32 *base, u32 *number)
 {
 	if (!base || !number)
 		return -EINVAL;
@@ -3750,7 +3749,6 @@ int hisi_qm_get_vft(struct hisi_qm *qm, u32 *base, u32 *number)
 
 	return qm->ops->get_vft(qm, base, number);
 }
-EXPORT_SYMBOL_GPL(hisi_qm_get_vft);
 
 /**
  * hisi_qm_set_vft() - Set vft to a qm.
diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h
index b84092a39429..4ac2d64c2f32 100644
--- a/include/linux/hisi_acc_qm.h
+++ b/include/linux/hisi_acc_qm.h
@@ -446,13 +446,11 @@ int hisi_qm_init(struct hisi_qm *qm);
 void hisi_qm_uninit(struct hisi_qm *qm);
 int hisi_qm_start(struct hisi_qm *qm);
 int hisi_qm_stop(struct hisi_qm *qm, enum qm_stop_reason r);
-struct hisi_qp *hisi_qm_create_qp(struct hisi_qm *qm, u8 alg_type);
 int hisi_qm_start_qp(struct hisi_qp *qp, unsigned long arg);
 int hisi_qm_stop_qp(struct hisi_qp *qp);
 void hisi_qm_release_qp(struct hisi_qp *qp);
 int hisi_qp_send(struct hisi_qp *qp, const void *msg);
 int hisi_qm_get_free_qp_num(struct hisi_qm *qm);
-int hisi_qm_get_vft(struct hisi_qm *qm, u32 *base, u32 *number);
 void hisi_qm_debug_init(struct hisi_qm *qm);
 void hisi_qm_debug_regs_clear(struct hisi_qm *qm);
 int hisi_qm_sriov_enable(struct pci_dev *pdev, int max_vfs);
-- 
Gitee


From 80a2e816f7a5196f99d029a7387d51962a74a3fe Mon Sep 17 00:00:00 2001
From: Weili Qian <qianweili@huawei.com>
Date: Tue, 19 Jul 2022 16:51:48 +0800
Subject: [PATCH 513/674] crypto: hisilicon/qm - replace hisi_qm_release_qp()
 with hisi_qm_free_qps()

mainline inclusion
from mainline-v5.19-rc1
commit 7982996c5b08
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=7982996c5b08

----------------------------------------------------------------------

hisi_qm_free_qps() can release multiple queues in one call, and it is
already exported. So, replace hisi_qm_release_qp() with hisi_qm_free_qps()
in zip_crypto.c, and do not export hisi_qm_release_qp() outside qm.c.

Signed-off-by: Weili Qian <qianweili@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jiangshui Yang <yangjiangshui@h-partners.com>
Reviewed-by: Yang Shen <shenyang39@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/crypto/hisilicon/qm.c             | 3 +--
 drivers/crypto/hisilicon/zip/zip_crypto.c | 2 +-
 include/linux/hisi_acc_qm.h               | 1 -
 3 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c
index c14c52f17ca9..b5aedf7dacdd 100644
--- a/drivers/crypto/hisilicon/qm.c
+++ b/drivers/crypto/hisilicon/qm.c
@@ -2860,7 +2860,7 @@ static struct hisi_qp *hisi_qm_create_qp(struct hisi_qm *qm, u8 alg_type)
  *
  * This function releases the resource of a qp.
  */
-void hisi_qm_release_qp(struct hisi_qp *qp)
+static void hisi_qm_release_qp(struct hisi_qp *qp)
 {
 	struct hisi_qm *qm = qp->qm;
 
@@ -2878,7 +2878,6 @@ void hisi_qm_release_qp(struct hisi_qp *qp)
 
 	qm_pm_put_sync(qm);
 }
-EXPORT_SYMBOL_GPL(hisi_qm_release_qp);
 
 static int qm_sq_ctx_cfg(struct hisi_qp *qp, int qp_id, u32 pasid)
 {
diff --git a/drivers/crypto/hisilicon/zip/zip_crypto.c b/drivers/crypto/hisilicon/zip/zip_crypto.c
index 9520a4113c81..67869513e48c 100644
--- a/drivers/crypto/hisilicon/zip/zip_crypto.c
+++ b/drivers/crypto/hisilicon/zip/zip_crypto.c
@@ -521,7 +521,7 @@ static int hisi_zip_start_qp(struct hisi_qp *qp, struct hisi_zip_qp_ctx *ctx,
 static void hisi_zip_release_qp(struct hisi_zip_qp_ctx *ctx)
 {
 	hisi_qm_stop_qp(ctx->qp);
-	hisi_qm_release_qp(ctx->qp);
+	hisi_qm_free_qps(&ctx->qp, 1);
 }
 
 static const struct hisi_zip_sqe_ops hisi_zip_ops_v1 = {
diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h
index 4ac2d64c2f32..452d50f2c05c 100644
--- a/include/linux/hisi_acc_qm.h
+++ b/include/linux/hisi_acc_qm.h
@@ -448,7 +448,6 @@ int hisi_qm_start(struct hisi_qm *qm);
 int hisi_qm_stop(struct hisi_qm *qm, enum qm_stop_reason r);
 int hisi_qm_start_qp(struct hisi_qp *qp, unsigned long arg);
 int hisi_qm_stop_qp(struct hisi_qp *qp);
-void hisi_qm_release_qp(struct hisi_qp *qp);
 int hisi_qp_send(struct hisi_qp *qp, const void *msg);
 int hisi_qm_get_free_qp_num(struct hisi_qm *qm);
 void hisi_qm_debug_init(struct hisi_qm *qm);
-- 
Gitee


From 139dd6e79d062c15be9eaa37b0c66c3f257bba18 Mon Sep 17 00:00:00 2001
From: Weili Qian <qianweili@huawei.com>
Date: Tue, 19 Jul 2022 16:51:49 +0800
Subject: [PATCH 514/674] crypto: hisilicon/qm - remove
 hisi_qm_get_free_qp_num()

mainline inclusion
from mainline-v5.19-rc1
commit b0c42232fce4
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b0c42232fce4

----------------------------------------------------------------------

hisi_qm_get_free_qp_num() is to get the free queue number on the function.
It is a simple function and is only called by
hisi_qm_get_available_instances().

This patch modifies to get the free queue directly in
hisi_qm_get_available_instances(), and remove hisi_qm_get_free_qp_num().

Signed-off-by: Weili Qian <qianweili@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jiangshui Yang <yangjiangshui@h-partners.com>
Reviewed-by: Yang Shen <shenyang39@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/crypto/hisilicon/qm.c | 28 +++++++++-------------------
 include/linux/hisi_acc_qm.h   |  1 -
 2 files changed, 9 insertions(+), 20 deletions(-)

diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c
index b5aedf7dacdd..f75e14e64c3c 100644
--- a/drivers/crypto/hisilicon/qm.c
+++ b/drivers/crypto/hisilicon/qm.c
@@ -3227,9 +3227,17 @@ static void qm_qp_event_notifier(struct hisi_qp *qp)
 	wake_up_interruptible(&qp->uacce_q->wait);
 }
 
+ /* This function returns free number of qp in qm. */
 static int hisi_qm_get_available_instances(struct uacce_device *uacce)
 {
-	return hisi_qm_get_free_qp_num(uacce->priv);
+	struct hisi_qm *qm = uacce->priv;
+	int ret;
+
+	down_read(&qm->qps_lock);
+	ret = qm->qp_num - qm->qp_in_used;
+	up_read(&qm->qps_lock);
+
+	return ret;
 }
 
 static void hisi_qm_set_hw_reset(struct hisi_qm *qm, int offset)
@@ -3540,24 +3548,6 @@ void hisi_qm_wait_task_finish(struct hisi_qm *qm, struct hisi_qm_list *qm_list)
 }
 EXPORT_SYMBOL_GPL(hisi_qm_wait_task_finish);
 
-/**
- * hisi_qm_get_free_qp_num() - Get free number of qp in qm.
- * @qm: The qm which want to get free qp.
- *
- * This function return free number of qp in qm.
- */
-int hisi_qm_get_free_qp_num(struct hisi_qm *qm)
-{
-	int ret;
-
-	down_read(&qm->qps_lock);
-	ret = qm->qp_num - qm->qp_in_used;
-	up_read(&qm->qps_lock);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(hisi_qm_get_free_qp_num);
-
 static void hisi_qp_memory_uninit(struct hisi_qm *qm, int num)
 {
 	struct device *dev = &qm->pdev->dev;
diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h
index 452d50f2c05c..7ffdee585153 100644
--- a/include/linux/hisi_acc_qm.h
+++ b/include/linux/hisi_acc_qm.h
@@ -449,7 +449,6 @@ int hisi_qm_stop(struct hisi_qm *qm, enum qm_stop_reason r);
 int hisi_qm_start_qp(struct hisi_qp *qp, unsigned long arg);
 int hisi_qm_stop_qp(struct hisi_qp *qp);
 int hisi_qp_send(struct hisi_qp *qp, const void *msg);
-int hisi_qm_get_free_qp_num(struct hisi_qm *qm);
 void hisi_qm_debug_init(struct hisi_qm *qm);
 void hisi_qm_debug_regs_clear(struct hisi_qm *qm);
 int hisi_qm_sriov_enable(struct pci_dev *pdev, int max_vfs);
-- 
Gitee


From d6217b97b475f3ebb4c466aee52b89e4b16f9588 Mon Sep 17 00:00:00 2001
From: Kai Ye <yekai13@huawei.com>
Date: Tue, 19 Jul 2022 16:51:50 +0800
Subject: [PATCH 515/674] crypto: hisilicon/sec - add sm4 generic selection

mainline inclusion
from mainline-v5.19-rc1
commit fdbf5e46e7af
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=fdbf5e46e7af

----------------------------------------------------------------------

Add sm4 generic selection for fallback tfm in the Kconfig.

Signed-off-by: Kai Ye <yekai13@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jiangshui Yang <yangjiangshui@h-partners.com>
Reviewed-by: Yang Shen <shenyang39@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/crypto/hisilicon/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/crypto/hisilicon/Kconfig b/drivers/crypto/hisilicon/Kconfig
index ae0bd02f5c40..f9f749e96aae 100644
--- a/drivers/crypto/hisilicon/Kconfig
+++ b/drivers/crypto/hisilicon/Kconfig
@@ -26,6 +26,7 @@ config CRYPTO_DEV_HISI_SEC2
 	select CRYPTO_SHA1
 	select CRYPTO_SHA256
 	select CRYPTO_SHA512
+	select CRYPTO_SM4
 	depends on PCI && PCI_MSI
 	depends on UACCE || UACCE=n
 	depends on ARM64 || (COMPILE_TEST && 64BIT)
-- 
Gitee


From da6deb3f3174a3df57e060911f6f7fe933f64194 Mon Sep 17 00:00:00 2001
From: Kai Ye <yekai13@huawei.com>
Date: Tue, 19 Jul 2022 16:51:51 +0800
Subject: [PATCH 516/674] crypto: hisilicon/sec - delete the flag
 CRYPTO_ALG_ALLOCATES_MEMORY

mainline inclusion
from mainline-v5.19-rc1
commit 2d33f5771b51
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5AFY1
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=2d33f5771b51

----------------------------------------------------------------------

Should not to uses the CRYPTO_ALG_ALLOCATES_MEMORY in SEC2. The SEC2
driver uses the pre-allocated buffers, including the src sgl pool, dst
sgl pool and other qp ctx resources. (e.g. IV buffer, mac buffer, key
buffer). The SEC2 driver doesn't allocate memory during request processing.
The driver only maps software sgl to allocated hardware sgl during I/O. So
here is fix it.

Signed-off-by: Kai Ye <yekai13@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jiangshui Yang <yangjiangshui@h-partners.com>
Reviewed-by: Yang Shen <shenyang39@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/crypto/hisilicon/sec2/sec_crypto.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/crypto/hisilicon/sec2/sec_crypto.c b/drivers/crypto/hisilicon/sec2/sec_crypto.c
index a91635c348b5..6eebe739893c 100644
--- a/drivers/crypto/hisilicon/sec2/sec_crypto.c
+++ b/drivers/crypto/hisilicon/sec2/sec_crypto.c
@@ -2113,7 +2113,6 @@ static int sec_skcipher_decrypt(struct skcipher_request *sk_req)
 		.cra_driver_name = "hisi_sec_"sec_cra_name,\
 		.cra_priority = SEC_PRIORITY,\
 		.cra_flags = CRYPTO_ALG_ASYNC |\
-		 CRYPTO_ALG_ALLOCATES_MEMORY |\
 		 CRYPTO_ALG_NEED_FALLBACK,\
 		.cra_blocksize = blk_size,\
 		.cra_ctxsize = sizeof(struct sec_ctx),\
@@ -2366,7 +2365,6 @@ static int sec_aead_decrypt(struct aead_request *a_req)
 		.cra_driver_name = "hisi_sec_"sec_cra_name,\
 		.cra_priority = SEC_PRIORITY,\
 		.cra_flags = CRYPTO_ALG_ASYNC |\
-		 CRYPTO_ALG_ALLOCATES_MEMORY |\
 		 CRYPTO_ALG_NEED_FALLBACK,\
 		.cra_blocksize = blk_size,\
 		.cra_ctxsize = sizeof(struct sec_ctx),\
-- 
Gitee


From ce5059233c0fda81b7fd2c871b260b73499e0d75 Mon Sep 17 00:00:00 2001
From: Zhengchao Shao <shaozhengchao@huawei.com>
Date: Tue, 19 Jul 2022 22:10:05 +0800
Subject: [PATCH 517/674] crypto: hisilicon/sec - don't sleep when in softirq

mainline inclusion
from mainline-v5.19-rc6
commit 02884a4f12de11f54d4ca67a07dd1f111d96fdbd
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GEVR
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/herbert/cryptodev-2.6.git/commit/?id=68740ab505431f268dc1ee26a54b871e75f0ddaa

--------------------------------

When kunpeng920 encryption driver is used to deencrypt and decrypt
packets during the softirq, it is not allowed to use mutex lock. The
kernel will report the following error:

BUG: scheduling while atomic: swapper/57/0/0x00000300
Call trace:
dump_backtrace+0x0/0x1e4
show_stack+0x20/0x2c
dump_stack+0xd8/0x140
__schedule_bug+0x68/0x80
__schedule+0x728/0x840
schedule+0x50/0xe0
schedule_preempt_disabled+0x18/0x24
__mutex_lock.constprop.0+0x594/0x5dc
__mutex_lock_slowpath+0x1c/0x30
mutex_lock+0x50/0x60
sec_request_init+0x8c/0x1a0 [hisi_sec2]
sec_process+0x28/0x1ac [hisi_sec2]
sec_skcipher_crypto+0xf4/0x1d4 [hisi_sec2]
sec_skcipher_encrypt+0x1c/0x30 [hisi_sec2]
crypto_skcipher_encrypt+0x2c/0x40
crypto_authenc_encrypt+0xc8/0xfc [authenc]
crypto_aead_encrypt+0x2c/0x40
echainiv_encrypt+0x144/0x1a0 [echainiv]
crypto_aead_encrypt+0x2c/0x40
esp_output_tail+0x348/0x5c0 [esp4]
esp_output+0x120/0x19c [esp4]
xfrm_output_one+0x25c/0x4d4
xfrm_output_resume+0x6c/0x1fc
xfrm_output+0xac/0x3c0
xfrm4_output+0x64/0x130
ip_build_and_send_pkt+0x158/0x20c
tcp_v4_send_synack+0xdc/0x1f0
tcp_conn_request+0x7d0/0x994
tcp_v4_conn_request+0x58/0x6c
tcp_v6_conn_request+0xf0/0x100
tcp_rcv_state_process+0x1cc/0xd60
tcp_v4_do_rcv+0x10c/0x250
tcp_v4_rcv+0xfc4/0x10a4
ip_protocol_deliver_rcu+0xf4/0x200
ip_local_deliver_finish+0x58/0x70
ip_local_deliver+0x68/0x120
ip_sublist_rcv_finish+0x70/0x94
ip_list_rcv_finish.constprop.0+0x17c/0x1d0
ip_sublist_rcv+0x40/0xb0
ip_list_rcv+0x140/0x1dc
__netif_receive_skb_list_core+0x154/0x28c
__netif_receive_skb_list+0x120/0x1a0
netif_receive_skb_list_internal+0xe4/0x1f0
napi_complete_done+0x70/0x1f0
gro_cell_poll+0x9c/0xb0
napi_poll+0xcc/0x264
net_rx_action+0xd4/0x21c
__do_softirq+0x130/0x358
irq_exit+0x11c/0x13c
__handle_domain_irq+0x88/0xf0
gic_handle_irq+0x78/0x2c0
el1_irq+0xb8/0x140
arch_cpu_idle+0x18/0x40
default_idle_call+0x5c/0x1c0
cpuidle_idle_call+0x174/0x1b0
do_idle+0xc8/0x160
cpu_startup_entry+0x30/0x11c
secondary_start_kernel+0x158/0x1e4
softirq: huh, entered softirq 3 NET_RX 0000000093774ee4 with
preempt_count 00000100, exited with fffffe00?

Fixes: 416d82204df4 ("crypto: hisilicon - add HiSilicon SEC V2 driver")
Signed-off-by: Zhengchao Shao <shaozhengchao@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Signed-off-by: Zhengchao Shao <shaozhengchao@huawei.com>
Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/crypto/hisilicon/sec2/sec.h        |  2 +-
 drivers/crypto/hisilicon/sec2/sec_crypto.c | 20 ++++++++++----------
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/drivers/crypto/hisilicon/sec2/sec.h b/drivers/crypto/hisilicon/sec2/sec.h
index c2e9b01187a7..a44c8dba3cda 100644
--- a/drivers/crypto/hisilicon/sec2/sec.h
+++ b/drivers/crypto/hisilicon/sec2/sec.h
@@ -119,7 +119,7 @@ struct sec_qp_ctx {
 	struct idr req_idr;
 	struct sec_alg_res res[QM_Q_DEPTH];
 	struct sec_ctx *ctx;
-	struct mutex req_lock;
+	spinlock_t req_lock;
 	struct list_head backlog;
 	struct hisi_acc_sgl_pool *c_in_pool;
 	struct hisi_acc_sgl_pool *c_out_pool;
diff --git a/drivers/crypto/hisilicon/sec2/sec_crypto.c b/drivers/crypto/hisilicon/sec2/sec_crypto.c
index 6eebe739893c..71dfa7db6394 100644
--- a/drivers/crypto/hisilicon/sec2/sec_crypto.c
+++ b/drivers/crypto/hisilicon/sec2/sec_crypto.c
@@ -127,11 +127,11 @@ static int sec_alloc_req_id(struct sec_req *req, struct sec_qp_ctx *qp_ctx)
 {
 	int req_id;
 
-	mutex_lock(&qp_ctx->req_lock);
+	spin_lock_bh(&qp_ctx->req_lock);
 
 	req_id = idr_alloc_cyclic(&qp_ctx->req_idr, NULL,
 				  0, QM_Q_DEPTH, GFP_ATOMIC);
-	mutex_unlock(&qp_ctx->req_lock);
+	spin_unlock_bh(&qp_ctx->req_lock);
 	if (unlikely(req_id < 0)) {
 		dev_err(req->ctx->dev, "alloc req id fail!\n");
 		return req_id;
@@ -156,9 +156,9 @@ static void sec_free_req_id(struct sec_req *req)
 	qp_ctx->req_list[req_id] = NULL;
 	req->qp_ctx = NULL;
 
-	mutex_lock(&qp_ctx->req_lock);
+	spin_lock_bh(&qp_ctx->req_lock);
 	idr_remove(&qp_ctx->req_idr, req_id);
-	mutex_unlock(&qp_ctx->req_lock);
+	spin_unlock_bh(&qp_ctx->req_lock);
 }
 
 static u8 pre_parse_finished_bd(struct bd_status *status, void *resp)
@@ -273,7 +273,7 @@ static int sec_bd_send(struct sec_ctx *ctx, struct sec_req *req)
 	    !(req->flag & CRYPTO_TFM_REQ_MAY_BACKLOG))
 		return -EBUSY;
 
-	mutex_lock(&qp_ctx->req_lock);
+	spin_lock_bh(&qp_ctx->req_lock);
 	ret = hisi_qp_send(qp_ctx->qp, &req->sec_sqe);
 
 	if (ctx->fake_req_limit <=
@@ -281,10 +281,10 @@ static int sec_bd_send(struct sec_ctx *ctx, struct sec_req *req)
 		list_add_tail(&req->backlog_head, &qp_ctx->backlog);
 		atomic64_inc(&ctx->sec->debug.dfx.send_cnt);
 		atomic64_inc(&ctx->sec->debug.dfx.send_busy_cnt);
-		mutex_unlock(&qp_ctx->req_lock);
+		spin_unlock_bh(&qp_ctx->req_lock);
 		return -EBUSY;
 	}
-	mutex_unlock(&qp_ctx->req_lock);
+	spin_unlock_bh(&qp_ctx->req_lock);
 
 	if (unlikely(ret == -EBUSY))
 		return -ENOBUFS;
@@ -487,7 +487,7 @@ static int sec_create_qp_ctx(struct hisi_qm *qm, struct sec_ctx *ctx,
 
 	qp->req_cb = sec_req_cb;
 
-	mutex_init(&qp_ctx->req_lock);
+	spin_lock_init(&qp_ctx->req_lock);
 	idr_init(&qp_ctx->req_idr);
 	INIT_LIST_HEAD(&qp_ctx->backlog);
 
@@ -1382,7 +1382,7 @@ static struct sec_req *sec_back_req_clear(struct sec_ctx *ctx,
 {
 	struct sec_req *backlog_req = NULL;
 
-	mutex_lock(&qp_ctx->req_lock);
+	spin_lock_bh(&qp_ctx->req_lock);
 	if (ctx->fake_req_limit >=
 	    atomic_read(&qp_ctx->qp->qp_status.used) &&
 	    !list_empty(&qp_ctx->backlog)) {
@@ -1390,7 +1390,7 @@ static struct sec_req *sec_back_req_clear(struct sec_ctx *ctx,
 				typeof(*backlog_req), backlog_head);
 		list_del(&backlog_req->backlog_head);
 	}
-	mutex_unlock(&qp_ctx->req_lock);
+	spin_unlock_bh(&qp_ctx->req_lock);
 
 	return backlog_req;
 }
-- 
Gitee


From 21f33dad65b601629e5d4b1441aa4b3596c2ef39 Mon Sep 17 00:00:00 2001
From: GUO Zihua <guozihua@huawei.com>
Date: Tue, 19 Jul 2022 22:10:06 +0800
Subject: [PATCH 518/674] KEYS: Fix error path return value in
 pgp_generate_fingerprint

hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5GRSV
CVE: NA

--------------------------------

In function pgp_generate_fingerprint, return value is not set correctly
shall kmalloc failed, leading to the caller triggering a read out-of-bound
while reading ctx.fingerprint or ctx.raw_fingerprint.

This patch fixes this issue by setting correct return value on the error
path.

Fixes: 4006f47d4e21 ("KEYS: PGP data parser")
Signed-off-by: GUO Zihua <guozihua@huawei.com>
Reviewed-by: Roberto Sassu <roberto.sassu@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 crypto/asymmetric_keys/pgp_public_key.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/crypto/asymmetric_keys/pgp_public_key.c b/crypto/asymmetric_keys/pgp_public_key.c
index 27b9efeafc4f..33a089797e59 100644
--- a/crypto/asymmetric_keys/pgp_public_key.c
+++ b/crypto/asymmetric_keys/pgp_public_key.c
@@ -152,8 +152,10 @@ static int pgp_generate_fingerprint(struct pgp_key_data_parse_context *ctx,
 	digest_size = crypto_shash_digestsize(tfm);
 
 	raw_fingerprint = kmalloc(digest_size, GFP_KERNEL);
-	if (!raw_fingerprint)
+	if (!raw_fingerprint) {
+		ret = -ENOMEM;
 		goto cleanup_hash;
+	}
 
 	ret = crypto_shash_final(digest, raw_fingerprint);
 	if (ret < 0)
@@ -161,8 +163,10 @@ static int pgp_generate_fingerprint(struct pgp_key_data_parse_context *ctx,
 
 	ctx->fingerprint_len = digest_size * 2;
 	fingerprint = kmalloc(digest_size * 2 + 1, GFP_KERNEL);
-	if (!fingerprint)
+	if (!fingerprint) {
+		ret = -ENOMEM;
 		goto cleanup_raw_fingerprint;
+	}
 
 	offset = digest_size - 8;
 	pr_debug("offset %u/%u\n", offset, digest_size);
-- 
Gitee


From a176b3de08939eb678861bfb694b2145cf8c05b8 Mon Sep 17 00:00:00 2001
From: GUO Zihua <guozihua@huawei.com>
Date: Tue, 19 Jul 2022 22:10:07 +0800
Subject: [PATCH 519/674] KEYS: Add safe guard against faulty PGP key

hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5H4FC
CVE: NA

--------------------------------

Under normal condition, when there is a user id packet, there will
always be a public key packet in the front, meaning ctx.fingerprint
will never be NULL. However, if a malicious or faulty PGP key is
provided with only user id packet but not public key packet, a read
out-of-bound will be triggered during the generation of key
description. To make things worse, a NULL pointer deference could be
triggered in pgp_key_generate_id().

This patch adds a safe guard which prevents parsing the key further if
no public key packet is provided.

Fixes: a98cb7a4b757 ("KEYS: Provide PGP key description autogeneration")
Signed-off-by: GUO Zihua <guozihua@huawei.com>
Reviewed-by: Roberto Sassu <roberto.sassu@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 crypto/asymmetric_keys/pgp_public_key.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/crypto/asymmetric_keys/pgp_public_key.c b/crypto/asymmetric_keys/pgp_public_key.c
index 33a089797e59..98b1707a0164 100644
--- a/crypto/asymmetric_keys/pgp_public_key.c
+++ b/crypto/asymmetric_keys/pgp_public_key.c
@@ -315,6 +315,11 @@ static int pgp_key_parse(struct key_preparsed_payload *prep)
 	if (ret < 0)
 		goto error;
 
+	if (!ctx.fingerprint) {
+		ret = -EINVAL;
+		goto error;
+	}
+
 	if (ctx.user_id && ctx.user_id_len > 0) {
 		/* Propose a description for the key
 		 * (user ID without the comment)
-- 
Gitee


From 886e6e2458ccd70aee6b17f5b40638cde92aa89e Mon Sep 17 00:00:00 2001
From: GUO Zihua <guozihua@huawei.com>
Date: Tue, 19 Jul 2022 22:10:08 +0800
Subject: [PATCH 520/674] KEYS: Fix mistaken sizeof call in pgp_key_generate_id

hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5H61R
CVE: NA

--------------------------------

pgp_key_generate_id() is trying get the size of a flexible length
structure, however the sizeof() is called on the pointer itself.
Besides, considering it's trying to get the size of a flexible length
structure, use struct_size() instead.

Fixes: 4006f47d4e21 ("KEYS: PGP data parser")
Signed-off-by: GUO Zihua <guozihua@huawei.com>
Reviewed-by: Roberto Sassu <roberto.sassu@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 crypto/asymmetric_keys/pgp_public_key.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/crypto/asymmetric_keys/pgp_public_key.c b/crypto/asymmetric_keys/pgp_public_key.c
index 98b1707a0164..928029a13435 100644
--- a/crypto/asymmetric_keys/pgp_public_key.c
+++ b/crypto/asymmetric_keys/pgp_public_key.c
@@ -283,7 +283,8 @@ static struct asymmetric_key_ids *pgp_key_generate_id(
 		goto error;
 
 	kids->id[0] = kid;
-	kids->id[1] = kmemdup(kid, sizeof(kid) + fingerprint_len, GFP_KERNEL);
+	kids->id[1] = kmemdup(kid, struct_size(kid, data, fingerprint_len),
+			      GFP_KERNEL);
 	if (!kids->id[1])
 		goto error;
 
-- 
Gitee


From ad02cf0db6075088d349c2a89d564aa04648395c Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@huawei.com>
Date: Tue, 19 Jul 2022 22:10:09 +0800
Subject: [PATCH 521/674] block: fix mismatch size for flush_rq

hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HEZ8
CVE: NA

--------------------------------

commit 5c250d556e10 ("blk-mq: fix kabi broken in struct request")
intrudoce 'struct request_wrapper' to fix kabi broken in 'struct request',
it requires to allocate more size for 'struct request'. However, flush_rq
is missed for such adaptation, which will lead to following
slab-out-of-bounds:

==================================================================
BUG: KASAN: slab-out-of-bounds in sg_init_table+0x23/0x40
Write of size 4096 at addr ffff88812249a148 by task swapper/0/1

Call Trace:
 dump_stack+0xbe/0xf9
 ? sg_init_table+0x23/0x40
 print_address_description.constprop.0+0x1e/0x220
 ? _raw_spin_lock_irqsave+0x80/0xe0
 ? _raw_write_unlock_irqrestore+0x20/0x20
 ? blk_alloc_flush_queue+0xd3/0x1a0
 ? sg_init_table+0x23/0x40
 ? sg_init_table+0x23/0x40
 kasan_report.cold+0x67/0x7f
 ? sg_init_table+0x23/0x40
 check_memory_region+0x17c/0x1e0
 memset+0x20/0x40
 sg_init_table+0x23/0x40
 virtblk_init_request+0x3d/0x50
 ? virtblk_map_queues+0x40/0x40
 blk_mq_realloc_hw_ctxs+0x44d/0xb50
 blk_mq_init_allocated_queue+0x20f/0x980
 ? blk_set_default_limits+0x1ac/0x1c0
 ? blk_alloc_queue+0x3f0/0x410
 blk_mq_init_queue_data+0x58/0xa0
 virtblk_probe+0x51b/0xee0
 ? cache_type_store+0x1a0/0x1a0
 ? __sanitizer_cov_trace_switch+0x50/0x90
 ? ioread8+0x89/0xa0
 virtio_dev_probe+0x449/0x5d0
 ? virtio_features_ok.part.0+0xb0/0xb0
 really_probe+0x26d/0x8a0
 driver_probe_device+0xef/0x280
 device_driver_attach+0xaf/0xc0
 __driver_attach+0x158/0x280
 ? device_driver_attach+0xc0/0xc0
 bus_for_each_dev+0x111/0x180
 ? subsys_dev_iter_exit+0x20/0x20
 ? bus_add_driver+0xb6/0x3e0
 ? klist_node_init+0x7c/0xb0
 bus_add_driver+0x336/0x3e0
 driver_register+0x105/0x1a0
 ? nbd_init+0x273/0x273
 init+0x69/0xad
 do_one_initcall+0xcb/0x370
 ? initcall_blacklisted+0x1b0/0x1b0
 ? parameq+0x110/0x110
 ? __kasan_kmalloc.constprop.0+0xc2/0xd0
 ? kasan_unpoison_shadow+0x33/0x40
 do_initcalls+0x223/0x265
 kernel_init_freeable+0x2bb/0x302
 ? rest_init+0xea/0xea
 kernel_init+0x13/0x1f6
 ? rest_init+0xea/0xea
 ret_from_fork+0x22/0x30

Allocated by task 1:
 kasan_save_stack+0x1b/0x40
 __kasan_kmalloc.constprop.0+0xc2/0xd0
 blk_alloc_flush_queue+0xd3/0x1a0
 blk_mq_realloc_hw_ctxs+0x9fa/0xb50
 blk_mq_init_allocated_queue+0x20f/0x980
 blk_mq_init_queue_data+0x58/0xa0
 virtblk_probe+0x51b/0xee0
 virtio_dev_probe+0x449/0x5d0
 really_probe+0x26d/0x8a0
 driver_probe_device+0xef/0x280
 device_driver_attach+0xaf/0xc0
 __driver_attach+0x158/0x280
 bus_for_each_dev+0x111/0x180
 bus_add_driver+0x336/0x3e0
 driver_register+0x105/0x1a0
 init+0x69/0xad
 do_one_initcall+0xcb/0x370
 do_initcalls+0x223/0x265
 kernel_init_freeable+0x2bb/0x302
 kernel_init+0x13/0x1f6
 ret_from_fork+0x22/0x30

Fixes: 5c250d556e10 ("blk-mq: fix kabi broken in struct request")
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 block/blk-flush.c         | 2 +-
 drivers/scsi/scsi_error.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/blk-flush.c b/block/blk-flush.c
index 82919829bc4d..71faf07a626f 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -470,7 +470,7 @@ struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size,
 					      gfp_t flags)
 {
 	struct blk_flush_queue *fq;
-	int rq_sz = sizeof(struct request);
+	int rq_sz = sizeof(struct request_wrapper);
 
 	fq = kzalloc_node(sizeof(*fq), flags, node);
 	if (!fq)
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index f11f51e2465f..bcbeadb2d0f0 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -2359,7 +2359,7 @@ scsi_ioctl_reset(struct scsi_device *dev, int __user *arg)
 		return -EIO;
 
 	error = -EIO;
-	rq = kzalloc(sizeof(struct request) + sizeof(struct scsi_cmnd) +
+	rq = kzalloc(sizeof(struct request_wrapper) + sizeof(struct scsi_cmnd) +
 			shost->hostt->cmd_size, GFP_KERNEL);
 	if (!rq)
 		goto out_put_autopm_host;
-- 
Gitee


From 22706f326dde3b8d062c738904f28c4c6501f9ad Mon Sep 17 00:00:00 2001
From: Zhang Zekun <zhangzekun11@huawei.com>
Date: Tue, 19 Jul 2022 22:10:10 +0800
Subject: [PATCH 522/674] arm64: openeuler_defconfig: enable ACPI_HMAT and
 HOT_MEMREMOVE

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5E461
CVE: NA

------------------------

Enable ACPI HMAT and memory hot remove feature on arm64 by default.

For ACPI_HMAT: ACPI HMAT describe the memory attributes, such as bandwidth
and latency details, related to the System Physical Address(SPA) Memory
Ranges. HMAT is especially useful when software wants to get some information
about a certain special memory's memory attributes, such as PMEM and HBM.

For MEMORY_HOT_REMOTE: Add support for memory hot remove feature. Some special
memory, such as HBM, can be power comsuming, and will only be used in some
aimed senarios. With memory hot remove feature, User can offline the idle memory
for energy saving purpose when this special memory is unused.

As PMEM and HBM are getting more popular, ACPI_HMAT and memory hot remove feature
should be enabled as default.

The following configs should be opened with CONFIG_ACPI_HMAT by default:
1.CONFIG_EFI_SOFT_RESERVE=y
2.CONFIG_HMEM_REPORTING=y
3.CONFIG_DEV_DAX_HMEM=m
4.CONFIG_DEV_DAX_HMEM_DEVICES=y

Signed-off-by: Zhang Zekun <zhangzekun11@huawei.com>
Reviewed-by: Chao Liu <liuchao173@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Reviewed-by: Kai Liu <kai.liu@suse.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 arch/arm64/configs/openeuler_defconfig | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig
index 78a63cbc3db6..d246fd508ef6 100644
--- a/arch/arm64/configs/openeuler_defconfig
+++ b/arch/arm64/configs/openeuler_defconfig
@@ -709,7 +709,12 @@ CONFIG_ACPI_REDUCED_HARDWARE_ONLY=y
 CONFIG_ACPI_NFIT=m
 # CONFIG_NFIT_SECURITY_DEBUG is not set
 CONFIG_ACPI_NUMA=y
-# CONFIG_ACPI_HMAT is not set
+CONFIG_ACPI_HMAT=y
+CONFIG_EFI_SOFT_RESERVE=y
+# CONFIG_ZONE_DEVICE is not set
+CONFIG_HMEM_REPORTING=y
+CONFIG_DEV_DAX_HMEM=m
+CONFIG_DEV_DAX_HMEM_DEVICES=y
 CONFIG_HAVE_ACPI_APEI=y
 CONFIG_ACPI_APEI=y
 CONFIG_ACPI_APEI_GHES=y
@@ -1046,7 +1051,7 @@ CONFIG_COHERENT_DEVICE=y
 CONFIG_MEMORY_HOTPLUG=y
 CONFIG_MEMORY_HOTPLUG_SPARSE=y
 CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y
-# CONFIG_MEMORY_HOTREMOVE is not set
+CONFIG_MEMORY_HOTREMOVE=y
 CONFIG_SPLIT_PTLOCK_CPUS=4
 CONFIG_MEMORY_BALLOON=y
 CONFIG_BALLOON_COMPACTION=y
-- 
Gitee


From 4be8584d8d0305e1c6b673777f8a351481e78b73 Mon Sep 17 00:00:00 2001
From: Chen Zhongjin <chenzhongjin@huawei.com>
Date: Tue, 19 Jul 2022 22:10:11 +0800
Subject: [PATCH 523/674] ARM: module: Add all unwind tables when load module

mainline inclusion
from mainline-v5.18-rc1
commit b6f21d14f1ac1261579b691673a0c823275cbaf8
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5H8ET

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b6f21d14f1ac1261579b691673a0c823275cbaf8

--------------------------------

For EABI stack unwinding, when loading .ko module
the EXIDX sections will be added to a unwind_table list.

However not all EXIDX sections are added because EXIDX
sections are searched by hardcoded section names.

For functions in other sections such as .ref.text
or .kprobes.text, gcc generates seprated EXIDX sections
(such as .ARM.exidx.ref.text or .ARM.exidx.kprobes.text).

These extra EXIDX sections are not loaded, so when unwinding
functions in these sections, we will failed with:

	unwind: Index not found xxx

To fix that, I refactor the code for searching and adding
EXIDX sections:

- Check section type to search EXIDX tables (0x70000001)
instead of strcmp() the hardcoded names. Then find the
corresponding text sections by their section names.

- Add a unwind_table list in module->arch to save their own
unwind_table instead of the fixed-lenth array.

- Save .ARM.exidx.init.text section ptr, because it should
be cleaned after module init.

Now all EXIDX sections of .ko can be added correctly.

Signed-off-by: Chen Zhongjin <chenzhongjin@huawei.com>
Reviewed-by: Kuohai Xu <xukuohai@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 arch/arm/include/asm/module.h | 17 ++------
 arch/arm/include/asm/unwind.h |  1 +
 arch/arm/kernel/module.c      | 78 ++++++++++++++++++-----------------
 3 files changed, 45 insertions(+), 51 deletions(-)

diff --git a/arch/arm/include/asm/module.h b/arch/arm/include/asm/module.h
index 734b8fe36896..c7c5dc6f3777 100644
--- a/arch/arm/include/asm/module.h
+++ b/arch/arm/include/asm/module.h
@@ -3,20 +3,10 @@
 #define _ASM_ARM_MODULE_H
 
 #include <asm-generic/module.h>
-
-struct unwind_table;
+#include <asm/unwind.h>
 
 #ifdef CONFIG_ARM_UNWIND
-enum {
-	ARM_SEC_INIT,
-	ARM_SEC_DEVINIT,
-	ARM_SEC_CORE,
-	ARM_SEC_EXIT,
-	ARM_SEC_DEVEXIT,
-	ARM_SEC_HOT,
-	ARM_SEC_UNLIKELY,
-	ARM_SEC_MAX,
-};
+#define ELF_SECTION_UNWIND 0x70000001
 #endif
 
 #define PLT_ENT_STRIDE		L1_CACHE_BYTES
@@ -37,7 +27,8 @@ struct mod_plt_sec {
 
 struct mod_arch_specific {
 #ifdef CONFIG_ARM_UNWIND
-	struct unwind_table *unwind[ARM_SEC_MAX];
+	struct list_head unwind_list;
+	struct unwind_table *init_table;
 #endif
 #ifdef CONFIG_ARM_MODULE_PLTS
 	struct mod_plt_sec	core;
diff --git a/arch/arm/include/asm/unwind.h b/arch/arm/include/asm/unwind.h
index 0f8a3439902d..b51f85417f58 100644
--- a/arch/arm/include/asm/unwind.h
+++ b/arch/arm/include/asm/unwind.h
@@ -24,6 +24,7 @@ struct unwind_idx {
 
 struct unwind_table {
 	struct list_head list;
+	struct list_head mod_list;
 	const struct unwind_idx *start;
 	const struct unwind_idx *origin;
 	const struct unwind_idx *stop;
diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c
index 1cd09cf38c69..bfe2bc380d38 100644
--- a/arch/arm/kernel/module.c
+++ b/arch/arm/kernel/module.c
@@ -369,46 +369,40 @@ int module_finalize(const Elf32_Ehdr *hdr, const Elf_Shdr *sechdrs,
 #ifdef CONFIG_ARM_UNWIND
 	const char *secstrs = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
 	const Elf_Shdr *sechdrs_end = sechdrs + hdr->e_shnum;
-	struct mod_unwind_map maps[ARM_SEC_MAX];
-	int i;
+	struct list_head *unwind_list = &mod->arch.unwind_list;
 
-	memset(maps, 0, sizeof(maps));
+	INIT_LIST_HEAD(unwind_list);
+	mod->arch.init_table = NULL;
 
 	for (s = sechdrs; s < sechdrs_end; s++) {
 		const char *secname = secstrs + s->sh_name;
+		const char *txtname;
+		const Elf_Shdr *txt_sec;
 
-		if (!(s->sh_flags & SHF_ALLOC))
+		if (!(s->sh_flags & SHF_ALLOC) ||
+		    s->sh_type != ELF_SECTION_UNWIND)
 			continue;
 
-		if (strcmp(".ARM.exidx.init.text", secname) == 0)
-			maps[ARM_SEC_INIT].unw_sec = s;
-		else if (strcmp(".ARM.exidx", secname) == 0)
-			maps[ARM_SEC_CORE].unw_sec = s;
-		else if (strcmp(".ARM.exidx.exit.text", secname) == 0)
-			maps[ARM_SEC_EXIT].unw_sec = s;
-		else if (strcmp(".ARM.exidx.text.unlikely", secname) == 0)
-			maps[ARM_SEC_UNLIKELY].unw_sec = s;
-		else if (strcmp(".ARM.exidx.text.hot", secname) == 0)
-			maps[ARM_SEC_HOT].unw_sec = s;
-		else if (strcmp(".init.text", secname) == 0)
-			maps[ARM_SEC_INIT].txt_sec = s;
-		else if (strcmp(".text", secname) == 0)
-			maps[ARM_SEC_CORE].txt_sec = s;
-		else if (strcmp(".exit.text", secname) == 0)
-			maps[ARM_SEC_EXIT].txt_sec = s;
-		else if (strcmp(".text.unlikely", secname) == 0)
-			maps[ARM_SEC_UNLIKELY].txt_sec = s;
-		else if (strcmp(".text.hot", secname) == 0)
-			maps[ARM_SEC_HOT].txt_sec = s;
-	}
+		if (!strcmp(".ARM.exidx", secname))
+			txtname = ".text";
+		else
+			txtname = secname + strlen(".ARM.exidx");
+		txt_sec = find_mod_section(hdr, sechdrs, txtname);
+
+		if (txt_sec) {
+			struct unwind_table *table =
+				unwind_table_add(s->sh_addr,
+						s->sh_size,
+						txt_sec->sh_addr,
+						txt_sec->sh_size);
 
-	for (i = 0; i < ARM_SEC_MAX; i++)
-		if (maps[i].unw_sec && maps[i].txt_sec)
-			mod->arch.unwind[i] =
-				unwind_table_add(maps[i].unw_sec->sh_addr,
-					         maps[i].unw_sec->sh_size,
-					         maps[i].txt_sec->sh_addr,
-					         maps[i].txt_sec->sh_size);
+			list_add(&table->mod_list, unwind_list);
+
+			/* save init table for module_arch_freeing_init */
+			if (strcmp(".ARM.exidx.init.text", secname) == 0)
+				mod->arch.init_table = table;
+		}
+	}
 #endif
 #ifdef CONFIG_ARM_PATCH_PHYS_VIRT
 	s = find_mod_section(hdr, sechdrs, ".pv_table");
@@ -429,19 +423,27 @@ void
 module_arch_cleanup(struct module *mod)
 {
 #ifdef CONFIG_ARM_UNWIND
-	int i;
+	struct unwind_table *tmp;
+	struct unwind_table *n;
 
-	for (i = 0; i < ARM_SEC_MAX; i++) {
-		unwind_table_del(mod->arch.unwind[i]);
-		mod->arch.unwind[i] = NULL;
+	list_for_each_entry_safe(tmp, n,
+			&mod->arch.unwind_list, mod_list) {
+		list_del(&tmp->mod_list);
+		unwind_table_del(tmp);
 	}
+	mod->arch.init_table = NULL;
 #endif
 }
 
 void __weak module_arch_freeing_init(struct module *mod)
 {
 #ifdef CONFIG_ARM_UNWIND
-	unwind_table_del(mod->arch.unwind[ARM_SEC_INIT]);
-	mod->arch.unwind[ARM_SEC_INIT] = NULL;
+	struct unwind_table *init = mod->arch.init_table;
+
+	if (init) {
+		mod->arch.init_table = NULL;
+		list_del(&init->mod_list);
+		unwind_table_del(init);
+	}
 #endif
 }
-- 
Gitee


From 97b810c871a4912c6f62a2b92468dd2d90a266fd Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 19 Jul 2022 22:10:12 +0800
Subject: [PATCH 524/674] af_unix: take address assignment/hash insertion into
 a new helper

mainline inclusion
from mainline-v5.14-rc1
commit 185ab886d3fb283e837283c343bf539c371e26cf
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4OM1C
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=185ab886d3fb283e837283c343bf539c371e26cf

--------------------------------

Duplicated logics in all bind variants (autobind, bind-to-path,
bind-to-abstract) gets taken into a common helper.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Baisong Zhong <zhongbaisong@huawei.com>
Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 net/unix/af_unix.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index b7edca89e0ba..8c5389e4cb2c 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -262,6 +262,14 @@ static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
 	sk_add_node(sk, list);
 }
 
+static void __unix_set_addr(struct sock *sk, struct unix_address *addr,
+			    unsigned hash)
+{
+	__unix_remove_socket(sk);
+	smp_store_release(&unix_sk(sk)->addr, addr);
+	__unix_insert_socket(&unix_socket_table[hash], sk);
+}
+
 static inline void unix_remove_socket(struct sock *sk)
 {
 	spin_lock(&unix_table_lock);
@@ -935,9 +943,7 @@ static int unix_autobind(struct socket *sock)
 	}
 	addr->hash ^= sk->sk_type;
 
-	__unix_remove_socket(sk);
-	smp_store_release(&u->addr, addr);
-	__unix_insert_socket(&unix_socket_table[addr->hash], sk);
+	__unix_set_addr(sk, addr, addr->hash);
 	spin_unlock(&unix_table_lock);
 	err = 0;
 
@@ -1039,7 +1045,6 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	int err;
 	unsigned int hash;
 	struct unix_address *addr;
-	struct hlist_head *list;
 	struct path path = { };
 
 	err = -EINVAL;
@@ -1091,25 +1096,20 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 		hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1);
 		spin_lock(&unix_table_lock);
 		u->path = path;
-		list = &unix_socket_table[hash];
 	} else {
 		spin_lock(&unix_table_lock);
 		err = -EADDRINUSE;
 		if (__unix_find_socket_byname(net, sunaddr, addr_len,
 					      sk->sk_type, hash)) {
+			spin_unlock(&unix_table_lock);
 			unix_release_addr(addr);
-			goto out_unlock;
+			goto out_up;
 		}
-
-		list = &unix_socket_table[addr->hash];
+		hash = addr->hash;
 	}
 
 	err = 0;
-	__unix_remove_socket(sk);
-	smp_store_release(&u->addr, addr);
-	__unix_insert_socket(list, sk);
-
-out_unlock:
+	__unix_set_addr(sk, addr, hash);
 	spin_unlock(&unix_table_lock);
 out_up:
 	mutex_unlock(&u->bindlock);
-- 
Gitee


From 4a41ddaf9b4a55bad2642712ce5ff59d113abb0f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 19 Jul 2022 22:10:13 +0800
Subject: [PATCH 525/674] unix_bind(): allocate addr earlier

mainline inclusion
from mainline-v5.14-rc1
commit c34d4582518ff83a4848c2d33a46be82e2499a5b
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4OM1C
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c34d4582518ff83a4848c2d33a46be82e2499a5b

--------------------------------

makes it easier to massage; we do pay for that by extra work
(kmalloc+memcpy+kfree) in some error cases, but those are not
on the hot paths anyway.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Baisong Zhong <zhongbaisong@huawei.com>
Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 net/unix/af_unix.c | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 8c5389e4cb2c..8dcd342bee10 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1061,6 +1061,15 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	if (err < 0)
 		goto out;
 	addr_len = err;
+	err = -ENOMEM;
+	addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
+	if (!addr)
+		goto out;
+
+	memcpy(addr->name, sunaddr, addr_len);
+	addr->len = addr_len;
+	addr->hash = hash ^ sk->sk_type;
+	refcount_set(&addr->refcnt, 1);
 
 	if (sun_path[0]) {
 		umode_t mode = S_IFSOCK |
@@ -1069,7 +1078,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 		if (err) {
 			if (err == -EEXIST)
 				err = -EADDRINUSE;
-			goto out;
+			goto out_addr;
 		}
 	}
 
@@ -1081,16 +1090,6 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	if (u->addr)
 		goto out_up;
 
-	err = -ENOMEM;
-	addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
-	if (!addr)
-		goto out_up;
-
-	memcpy(addr->name, sunaddr, addr_len);
-	addr->len = addr_len;
-	addr->hash = hash ^ sk->sk_type;
-	refcount_set(&addr->refcnt, 1);
-
 	if (sun_path[0]) {
 		addr->hash = UNIX_HASH_SIZE;
 		hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1);
@@ -1102,20 +1101,23 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 		if (__unix_find_socket_byname(net, sunaddr, addr_len,
 					      sk->sk_type, hash)) {
 			spin_unlock(&unix_table_lock);
-			unix_release_addr(addr);
 			goto out_up;
 		}
 		hash = addr->hash;
 	}
 
-	err = 0;
 	__unix_set_addr(sk, addr, hash);
 	spin_unlock(&unix_table_lock);
+	addr = NULL;
+	err = 0;
 out_up:
 	mutex_unlock(&u->bindlock);
 out_put:
 	if (err)
 		path_put(&path);
+out_addr:
+	if (addr)
+		unix_release_addr(addr);
 out:
 	return err;
 }
-- 
Gitee


From 5d545249d66573a819ceee8c0b60b66fd34a5a6c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 19 Jul 2022 22:10:14 +0800
Subject: [PATCH 526/674] unix_bind(): separate BSD and abstract cases

mainline inclusion
from mainline-v5.14-rc1
commit aee515170576609a0aa3413dc06a7f36f05a5fe2
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4OM1C
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=aee515170576609a0aa3413dc06a7f36f05a5fe2

--------------------------------

We do get some duplication that way, but it's minor compared to
parts that are different.  What we get is an ability to change
locking in BSD case without making failure exits very hard to
follow.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Baisong Zhong <zhongbaisong@huawei.com>
Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 net/unix/af_unix.c | 55 ++++++++++++++++++++++++++++------------------
 1 file changed, 34 insertions(+), 21 deletions(-)

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 8dcd342bee10..685c190e4366 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1045,7 +1045,6 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	int err;
 	unsigned int hash;
 	struct unix_address *addr;
-	struct path path = { };
 
 	err = -EINVAL;
 	if (addr_len < offsetofend(struct sockaddr_un, sun_family) ||
@@ -1072,6 +1071,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	refcount_set(&addr->refcnt, 1);
 
 	if (sun_path[0]) {
+		struct path path = { };
 		umode_t mode = S_IFSOCK |
 		       (SOCK_INODE(sock)->i_mode & ~current_umask());
 		err = unix_mknod(sun_path, mode, &path);
@@ -1080,41 +1080,54 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 				err = -EADDRINUSE;
 			goto out_addr;
 		}
-	}
 
-	err = mutex_lock_interruptible(&u->bindlock);
-	if (err)
-		goto out_put;
+		err = mutex_lock_interruptible(&u->bindlock);
+		if (err) {
+			path_put(&path);
+			goto out_addr;
+		}
 
-	err = -EINVAL;
-	if (u->addr)
-		goto out_up;
+		err = -EINVAL;
+		if (u->addr) {
+			mutex_unlock(&u->bindlock);
+			path_put(&path);
+			goto out_addr;
+		}
 
-	if (sun_path[0]) {
 		addr->hash = UNIX_HASH_SIZE;
 		hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1);
 		spin_lock(&unix_table_lock);
 		u->path = path;
+		__unix_set_addr(sk, addr, hash);
+		spin_unlock(&unix_table_lock);
+		mutex_unlock(&u->bindlock);
+		addr = NULL;
+		err = 0;
 	} else {
+		err = mutex_lock_interruptible(&u->bindlock);
+		if (err)
+			goto out_addr;
+
+		err = -EINVAL;
+		if (u->addr) {
+			mutex_unlock(&u->bindlock);
+			goto out_addr;
+		}
+
 		spin_lock(&unix_table_lock);
 		err = -EADDRINUSE;
 		if (__unix_find_socket_byname(net, sunaddr, addr_len,
 					      sk->sk_type, hash)) {
 			spin_unlock(&unix_table_lock);
-			goto out_up;
+			mutex_unlock(&u->bindlock);
+			goto out_addr;
 		}
-		hash = addr->hash;
+		__unix_set_addr(sk, addr, addr->hash);
+		spin_unlock(&unix_table_lock);
+		mutex_unlock(&u->bindlock);
+		addr = NULL;
+		err = 0;
 	}
-
-	__unix_set_addr(sk, addr, hash);
-	spin_unlock(&unix_table_lock);
-	addr = NULL;
-	err = 0;
-out_up:
-	mutex_unlock(&u->bindlock);
-out_put:
-	if (err)
-		path_put(&path);
 out_addr:
 	if (addr)
 		unix_release_addr(addr);
-- 
Gitee


From 7fb99800c8fee521e0e98d46be4fb9634e0a56e4 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 19 Jul 2022 22:10:15 +0800
Subject: [PATCH 527/674] unix_bind(): take BSD and abstract address cases into
 new helpers

mainline inclusion
from mainline-v5.14-rc1
commit fa42d910a38ee310d5c6826563dd58a08735d5b0
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4OM1C
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=fa42d910a38ee310d5c6826563dd58a08735d5b0

--------------------------------

unix_bind_bsd() and unix_bind_abstract() respectively.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Baisong Zhong <zhongbaisong@huawei.com>
Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 net/unix/af_unix.c | 147 +++++++++++++++++++++++----------------------
 1 file changed, 74 insertions(+), 73 deletions(-)

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 685c190e4366..9172058866b4 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1035,104 +1035,105 @@ static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
 	return err;
 }
 
+static int unix_bind_bsd(struct sock *sk, struct unix_address *addr)
+{
+	struct unix_sock *u = unix_sk(sk);
+	struct path path = { };
+	umode_t mode = S_IFSOCK |
+	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
+	unsigned int hash;
+	int err;
+
+	err = unix_mknod(addr->name->sun_path, mode, &path);
+	if (err)
+		return err;
+
+	err = mutex_lock_interruptible(&u->bindlock);
+	if (err) {
+		path_put(&path);
+		return err;
+	}
+
+	if (u->addr) {
+		mutex_unlock(&u->bindlock);
+		path_put(&path);
+		return -EINVAL;
+	}
+
+	addr->hash = UNIX_HASH_SIZE;
+	hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1);
+	spin_lock(&unix_table_lock);
+	u->path = path;
+	__unix_set_addr(sk, addr, hash);
+	spin_unlock(&unix_table_lock);
+	mutex_unlock(&u->bindlock);
+	return 0;
+}
+
+static int unix_bind_abstract(struct sock *sk, unsigned hash,
+			      struct unix_address *addr)
+{
+	struct unix_sock *u = unix_sk(sk);
+	int err;
+
+	err = mutex_lock_interruptible(&u->bindlock);
+	if (err)
+		return err;
+
+	if (u->addr) {
+		mutex_unlock(&u->bindlock);
+		return -EINVAL;
+	}
+
+	spin_lock(&unix_table_lock);
+	if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len,
+				      sk->sk_type, hash)) {
+		spin_unlock(&unix_table_lock);
+		mutex_unlock(&u->bindlock);
+		return -EADDRINUSE;
+	}
+	__unix_set_addr(sk, addr, addr->hash);
+	spin_unlock(&unix_table_lock);
+	mutex_unlock(&u->bindlock);
+	return 0;
+}
+
 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 {
 	struct sock *sk = sock->sk;
-	struct net *net = sock_net(sk);
-	struct unix_sock *u = unix_sk(sk);
 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
 	char *sun_path = sunaddr->sun_path;
 	int err;
 	unsigned int hash;
 	struct unix_address *addr;
 
-	err = -EINVAL;
 	if (addr_len < offsetofend(struct sockaddr_un, sun_family) ||
 	    sunaddr->sun_family != AF_UNIX)
-		goto out;
+		return -EINVAL;
 
-	if (addr_len == sizeof(short)) {
-		err = unix_autobind(sock);
-		goto out;
-	}
+	if (addr_len == sizeof(short))
+		return unix_autobind(sock);
 
 	err = unix_mkname(sunaddr, addr_len, &hash);
 	if (err < 0)
-		goto out;
+		return err;
 	addr_len = err;
-	err = -ENOMEM;
 	addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
 	if (!addr)
-		goto out;
+		return -ENOMEM;
 
 	memcpy(addr->name, sunaddr, addr_len);
 	addr->len = addr_len;
 	addr->hash = hash ^ sk->sk_type;
 	refcount_set(&addr->refcnt, 1);
 
-	if (sun_path[0]) {
-		struct path path = { };
-		umode_t mode = S_IFSOCK |
-		       (SOCK_INODE(sock)->i_mode & ~current_umask());
-		err = unix_mknod(sun_path, mode, &path);
-		if (err) {
-			if (err == -EEXIST)
-				err = -EADDRINUSE;
-			goto out_addr;
-		}
-
-		err = mutex_lock_interruptible(&u->bindlock);
-		if (err) {
-			path_put(&path);
-			goto out_addr;
-		}
-
-		err = -EINVAL;
-		if (u->addr) {
-			mutex_unlock(&u->bindlock);
-			path_put(&path);
-			goto out_addr;
-		}
-
-		addr->hash = UNIX_HASH_SIZE;
-		hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1);
-		spin_lock(&unix_table_lock);
-		u->path = path;
-		__unix_set_addr(sk, addr, hash);
-		spin_unlock(&unix_table_lock);
-		mutex_unlock(&u->bindlock);
-		addr = NULL;
-		err = 0;
-	} else {
-		err = mutex_lock_interruptible(&u->bindlock);
-		if (err)
-			goto out_addr;
-
-		err = -EINVAL;
-		if (u->addr) {
-			mutex_unlock(&u->bindlock);
-			goto out_addr;
-		}
-
-		spin_lock(&unix_table_lock);
-		err = -EADDRINUSE;
-		if (__unix_find_socket_byname(net, sunaddr, addr_len,
-					      sk->sk_type, hash)) {
-			spin_unlock(&unix_table_lock);
-			mutex_unlock(&u->bindlock);
-			goto out_addr;
-		}
-		__unix_set_addr(sk, addr, addr->hash);
-		spin_unlock(&unix_table_lock);
-		mutex_unlock(&u->bindlock);
-		addr = NULL;
-		err = 0;
-	}
-out_addr:
-	if (addr)
+	if (sun_path[0])
+		err = unix_bind_bsd(sk, addr);
+	else
+		err = unix_bind_abstract(sk, hash, addr);
+	if (err)
 		unix_release_addr(addr);
-out:
-	return err;
+	return err == -EEXIST ? -EADDRINUSE : err;
 }
 
 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
-- 
Gitee


From 79d3eccb01d2c61ea60d9929f073fdda8b25ab00 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Date: Tue, 19 Jul 2022 22:10:16 +0800
Subject: [PATCH 528/674] af_unix: Use offsetof() instead of sizeof().

mainline inclusion
from mainline-v5.17-rc1
commit 755662ce78d14c1a9118df921c528b1f992ded2e
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4OM1C
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=755662ce78d14c1a9118df921c528b1f992ded2e

--------------------------------

The length of the AF_UNIX socket address contains an offset to the member
sun_path of struct sockaddr_un.

Currently, the preceding member is just sun_family, and its type is
sa_family_t and resolved to short.  Therefore, the offset is represented by
sizeof(short).  However, it is not clear and fragile to changes in struct
sockaddr_storage or sockaddr_un.

This commit makes it clear and robust by rewriting sizeof() with
offsetof().

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Baisong Zhong <zhongbaisong@huawei.com>
Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 net/unix/af_unix.c | 19 ++++++++++++-------
 net/unix/diag.c    |  3 ++-
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 9172058866b4..cdd2a46e2a98 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -230,7 +230,8 @@ static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp
 {
 	*hashp = 0;
 
-	if (len <= sizeof(short) || len > sizeof(*sunaddr))
+	if (len <= offsetof(struct sockaddr_un, sun_path) ||
+	    len > sizeof(*sunaddr))
 		return -EINVAL;
 	if (!sunaddr || sunaddr->sun_family != AF_UNIX)
 		return -EINVAL;
@@ -243,7 +244,8 @@ static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp
 		 * kernel address buffer.
 		 */
 		((char *)sunaddr)[len] = 0;
-		len = strlen(sunaddr->sun_path)+1+sizeof(short);
+		len = strlen(sunaddr->sun_path) +
+			offsetof(struct sockaddr_un, sun_path) + 1;
 		return len;
 	}
 
@@ -911,7 +913,8 @@ static int unix_autobind(struct socket *sock)
 		goto out;
 
 	err = -ENOMEM;
-	addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
+	addr = kzalloc(sizeof(*addr) +
+		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
 	if (!addr)
 		goto out;
 
@@ -919,7 +922,8 @@ static int unix_autobind(struct socket *sock)
 	refcount_set(&addr->refcnt, 1);
 
 retry:
-	addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
+	addr->len = sprintf(addr->name->sun_path + 1, "%05x", ordernum) +
+		offsetof(struct sockaddr_un, sun_path) + 1;
 	addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));
 
 	spin_lock(&unix_table_lock);
@@ -1111,7 +1115,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	    sunaddr->sun_family != AF_UNIX)
 		return -EINVAL;
 
-	if (addr_len == sizeof(short))
+	if (addr_len == offsetof(struct sockaddr_un, sun_path))
 		return unix_autobind(sock);
 
 	err = unix_mkname(sunaddr, addr_len, &hash);
@@ -1549,7 +1553,7 @@ static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
 	if (!addr) {
 		sunaddr->sun_family = AF_UNIX;
 		sunaddr->sun_path[0] = 0;
-		err = sizeof(short);
+		err = offsetof(struct sockaddr_un, sun_path);
 	} else {
 		err = addr->len;
 		memcpy(sunaddr, addr->name, addr->len);
@@ -2927,7 +2931,8 @@ static int unix_seq_show(struct seq_file *seq, void *v)
 			seq_putc(seq, ' ');
 
 			i = 0;
-			len = u->addr->len - sizeof(short);
+			len = u->addr->len -
+				offsetof(struct sockaddr_un, sun_path);
 			if (!UNIX_ABSTRACT(s))
 				len--;
 			else {
diff --git a/net/unix/diag.c b/net/unix/diag.c
index 9ff64f9df1f3..dd77e81b41a0 100644
--- a/net/unix/diag.c
+++ b/net/unix/diag.c
@@ -19,7 +19,8 @@ static int sk_diag_dump_name(struct sock *sk, struct sk_buff *nlskb)
 	if (!addr)
 		return 0;
 
-	return nla_put(nlskb, UNIX_DIAG_NAME, addr->len - sizeof(short),
+	return nla_put(nlskb, UNIX_DIAG_NAME,
+		       addr->len - offsetof(struct sockaddr_un, sun_path),
 		       addr->name->sun_path);
 }
 
-- 
Gitee


From a7a7a7cf78d041127a4116e088a64fb8544d2ebc Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 19 Jul 2022 22:10:17 +0800
Subject: [PATCH 529/674] __unix_find_socket_byname(): don't pass hash and type
 separately

mainline inclusion
from mainline-v5.14-rc1
commit be752283a2a2b4bfc2df512b5d9b03a34aece252
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4OM1C
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=be752283a2a2b4bfc2df512b5d9b03a34aece252

--------------------------------

We only care about exclusive or of those, so pass that directly.
Makes life simpler for callers as well...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Baisong Zhong <zhongbaisong@huawei.com>
Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 net/unix/af_unix.c | 23 ++++++++++-------------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index cdd2a46e2a98..e59f5714e02f 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -288,11 +288,11 @@ static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
 
 static struct sock *__unix_find_socket_byname(struct net *net,
 					      struct sockaddr_un *sunname,
-					      int len, int type, unsigned int hash)
+					      int len, unsigned int hash)
 {
 	struct sock *s;
 
-	sk_for_each(s, &unix_socket_table[hash ^ type]) {
+	sk_for_each(s, &unix_socket_table[hash]) {
 		struct unix_sock *u = unix_sk(s);
 
 		if (!net_eq(sock_net(s), net))
@@ -307,13 +307,12 @@ static struct sock *__unix_find_socket_byname(struct net *net,
 
 static inline struct sock *unix_find_socket_byname(struct net *net,
 						   struct sockaddr_un *sunname,
-						   int len, int type,
-						   unsigned int hash)
+						   int len, unsigned int hash)
 {
 	struct sock *s;
 
 	spin_lock(&unix_table_lock);
-	s = __unix_find_socket_byname(net, sunname, len, type, hash);
+	s = __unix_find_socket_byname(net, sunname, len, hash);
 	if (s)
 		sock_hold(s);
 	spin_unlock(&unix_table_lock);
@@ -925,12 +924,12 @@ static int unix_autobind(struct socket *sock)
 	addr->len = sprintf(addr->name->sun_path + 1, "%05x", ordernum) +
 		offsetof(struct sockaddr_un, sun_path) + 1;
 	addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));
+	addr->hash ^= sk->sk_type;
 
 	spin_lock(&unix_table_lock);
 	ordernum = (ordernum+1)&0xFFFFF;
 
-	if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
-				      addr->hash)) {
+	if (__unix_find_socket_byname(net, addr->name, addr->len, addr->hash)) {
 		spin_unlock(&unix_table_lock);
 		/*
 		 * __unix_find_socket_byname() may take long time if many names
@@ -945,7 +944,6 @@ static int unix_autobind(struct socket *sock)
 		}
 		goto retry;
 	}
-	addr->hash ^= sk->sk_type;
 
 	__unix_set_addr(sk, addr, addr->hash);
 	spin_unlock(&unix_table_lock);
@@ -992,7 +990,7 @@ static struct sock *unix_find_other(struct net *net,
 		}
 	} else {
 		err = -ECONNREFUSED;
-		u = unix_find_socket_byname(net, sunname, len, type, hash);
+		u = unix_find_socket_byname(net, sunname, len, type ^ hash);
 		if (u) {
 			struct dentry *dentry;
 			dentry = unix_sk(u)->path.dentry;
@@ -1074,8 +1072,7 @@ static int unix_bind_bsd(struct sock *sk, struct unix_address *addr)
 	return 0;
 }
 
-static int unix_bind_abstract(struct sock *sk, unsigned hash,
-			      struct unix_address *addr)
+static int unix_bind_abstract(struct sock *sk, struct unix_address *addr)
 {
 	struct unix_sock *u = unix_sk(sk);
 	int err;
@@ -1091,7 +1088,7 @@ static int unix_bind_abstract(struct sock *sk, unsigned hash,
 
 	spin_lock(&unix_table_lock);
 	if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len,
-				      sk->sk_type, hash)) {
+				      addr->hash)) {
 		spin_unlock(&unix_table_lock);
 		mutex_unlock(&u->bindlock);
 		return -EADDRINUSE;
@@ -1134,7 +1131,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	if (sun_path[0])
 		err = unix_bind_bsd(sk, addr);
 	else
-		err = unix_bind_abstract(sk, hash, addr);
+		err = unix_bind_abstract(sk, addr);
 	if (err)
 		unix_release_addr(addr);
 	return err == -EEXIST ? -EADDRINUSE : err;
-- 
Gitee


From 06b109c3b4894add5786033c766f07b04d37476a Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Date: Tue, 19 Jul 2022 22:10:18 +0800
Subject: [PATCH 530/674] af_unix: Pass struct sock to unix_autobind().

mainline inclusion
from mainline-v5.17-rc1
commit f7ed31f4615f4e1d97c0e4325c5b8a240e10073c
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4OM1C
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f7ed31f4615f4e1d97c0e4325c5b8a240e10073c

--------------------------------

We do not use struct socket in unix_autobind() and pass struct sock to
unix_bind_bsd() and unix_bind_abstract().  Let's pass it to unix_autobind()
as well.

Also, this patch fixes these errors by checkpatch.pl.

  ERROR: do not use assignment in if condition
  #1795: FILE: net/unix/af_unix.c:1795:
  +	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr

  CHECK: Logical continuations should be on the previous line
  #1796: FILE: net/unix/af_unix.c:1796:
  +	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
  +	    && (err = unix_autobind(sock)) != 0)

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Baisong Zhong <zhongbaisong@huawei.com>
Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 net/unix/af_unix.c | 36 +++++++++++++++++++++---------------
 1 file changed, 21 insertions(+), 15 deletions(-)

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index e59f5714e02f..006e35065e51 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -894,15 +894,13 @@ static int unix_release(struct socket *sock)
 	return 0;
 }
 
-static int unix_autobind(struct socket *sock)
+static int unix_autobind(struct sock *sk)
 {
-	struct sock *sk = sock->sk;
-	struct net *net = sock_net(sk);
 	struct unix_sock *u = unix_sk(sk);
-	static u32 ordernum = 1;
 	struct unix_address *addr;
-	int err;
 	unsigned int retries = 0;
+	static u32 ordernum = 1;
+	int err;
 
 	err = mutex_lock_interruptible(&u->bindlock);
 	if (err)
@@ -929,7 +927,8 @@ static int unix_autobind(struct socket *sock)
 	spin_lock(&unix_table_lock);
 	ordernum = (ordernum+1)&0xFFFFF;
 
-	if (__unix_find_socket_byname(net, addr->name, addr->len, addr->hash)) {
+	if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len,
+				      addr->hash)) {
 		spin_unlock(&unix_table_lock);
 		/*
 		 * __unix_find_socket_byname() may take long time if many names
@@ -1113,7 +1112,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 		return -EINVAL;
 
 	if (addr_len == offsetof(struct sockaddr_un, sun_path))
-		return unix_autobind(sock);
+		return unix_autobind(sk);
 
 	err = unix_mkname(sunaddr, addr_len, &hash);
 	if (err < 0)
@@ -1183,8 +1182,11 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
 		alen = err;
 
 		if (test_bit(SOCK_PASSCRED, &sock->flags) &&
-		    !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
-			goto out;
+		    !unix_sk(sk)->addr) {
+			err = unix_autobind(sk);
+			if (err)
+				goto out;
+		}
 
 restart:
 		other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
@@ -1284,9 +1286,11 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 		goto out;
 	addr_len = err;
 
-	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr &&
-	    (err = unix_autobind(sock)) != 0)
-		goto out;
+	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) {
+		err = unix_autobind(sk);
+		if (err)
+			goto out;
+	}
 
 	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
 
@@ -1737,9 +1741,11 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
 			goto out;
 	}
 
-	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
-	    && (err = unix_autobind(sock)) != 0)
-		goto out;
+	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) {
+		err = unix_autobind(sk);
+		if (err)
+			goto out;
+	}
 
 	err = -EMSGSIZE;
 	if (len > sk->sk_sndbuf - 32)
-- 
Gitee


From 75136c5f52f91a1606125965bb16136d5c4385f0 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 19 Jul 2022 22:10:19 +0800
Subject: [PATCH 531/674] fold unix_mknod() into unix_bind_bsd()

mainline inclusion
from mainline-v5.14-rc1
commit 71e6be6f7d2bada7099d79205779c4452d4fd35b
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4OM1C
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=71e6be6f7d2bada7099d79205779c4452d4fd35b

--------------------------------

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>

Conflicts:
	net/unix/af_unix.c

Signed-off-by: Baisong Zhong <zhongbaisong@huawei.com>
Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 net/unix/af_unix.c | 39 +++++++++++++++------------------------
 1 file changed, 15 insertions(+), 24 deletions(-)

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 006e35065e51..3eba0e23c512 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1007,45 +1007,36 @@ static struct sock *unix_find_other(struct net *net,
 	return NULL;
 }
 
-static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
+static int unix_bind_bsd(struct sock *sk, struct unix_address *addr)
 {
+	struct unix_sock *u = unix_sk(sk);
+	umode_t mode = S_IFSOCK |
+	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
+	struct path parent, path;
 	struct dentry *dentry;
-	struct path path;
-	int err = 0;
+	unsigned int hash;
+	int err;
+
 	/*
 	 * Get the parent directory, calculate the hash for last
 	 * component.
 	 */
-	dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
-	err = PTR_ERR(dentry);
+	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
 	if (IS_ERR(dentry))
-		return err;
+		return PTR_ERR(dentry);
 
 	/*
 	 * All right, let's create it.
 	 */
-	err = security_path_mknod(&path, dentry, mode, 0);
+	err = security_path_mknod(&parent, dentry, mode, 0);
 	if (!err) {
-		err = vfs_mknod(d_inode(path.dentry), dentry, mode, 0);
+		err = vfs_mknod(d_inode(parent.dentry), dentry, mode, 0);
 		if (!err) {
-			res->mnt = mntget(path.mnt);
-			res->dentry = dget(dentry);
+			path.mnt = mntget(parent.mnt);
+			path.dentry = dget(dentry);
 		}
 	}
-	done_path_create(&path, dentry);
-	return err;
-}
-
-static int unix_bind_bsd(struct sock *sk, struct unix_address *addr)
-{
-	struct unix_sock *u = unix_sk(sk);
-	struct path path = { };
-	umode_t mode = S_IFSOCK |
-	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
-	unsigned int hash;
-	int err;
-
-	err = unix_mknod(addr->name->sun_path, mode, &path);
+	done_path_create(&parent, dentry);
 	if (err)
 		return err;
 
-- 
Gitee


From 55929e5b1e8074f1c4cc1189ae2fd522515046f0 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Date: Tue, 19 Jul 2022 22:10:20 +0800
Subject: [PATCH 532/674] af_unix: Factorise unix_find_other() based on address
 types.

mainline inclusion
from mainline-v5.17-rc1
commit fa39ef0e472961baef49ddb0e6f7b8ebb555bd8f
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4OM1C
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=fa39ef0e472961baef49ddb0e6f7b8ebb555bd8f

--------------------------------

As done in the commit fa42d910a38e ("unix_bind(): take BSD and abstract
address cases into new helpers"), this patch moves BSD and abstract address
cases from unix_find_other() into unix_find_bsd() and unix_find_abstract().

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

Conflicts:
	net/unix/af_unix.c

Signed-off-by: Baisong Zhong <zhongbaisong@huawei.com>
Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 include/linux/fs.h |   4 ++
 net/unix/af_unix.c | 136 +++++++++++++++++++++++++++------------------
 2 files changed, 85 insertions(+), 55 deletions(-)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 18259e38dcd7..a7bc1eaa27ee 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2844,6 +2844,10 @@ static inline int bmap(struct inode *inode,  sector_t *block)
 extern int notify_change(struct dentry *, struct iattr *, struct inode **);
 extern int inode_permission(struct inode *, int);
 extern int generic_permission(struct inode *, int);
+static inline int path_permission(const struct path *path, int mask)
+{
+	return inode_permission(d_inode(path->dentry), mask);
+}
 extern int __check_sticky(struct inode *dir, struct inode *inode);
 
 static inline bool execute_ok(struct inode *inode)
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 3eba0e23c512..eafd1d62bf55 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -894,6 +894,87 @@ static int unix_release(struct socket *sock)
 	return 0;
 }
 
+static struct sock *unix_find_bsd(struct net *net, struct sockaddr_un *sunaddr,
+				  int type, int *error)
+{
+	struct inode *inode;
+	struct path path;
+	struct sock *sk;
+	int err;
+
+	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
+	if (err)
+		goto fail;
+
+	err = path_permission(&path, MAY_WRITE);
+	if (err)
+		goto path_put;
+
+	err = -ECONNREFUSED;
+	inode = d_backing_inode(path.dentry);
+	if (!S_ISSOCK(inode->i_mode))
+		goto path_put;
+
+	sk = unix_find_socket_byinode(inode);
+	if (!sk)
+		goto path_put;
+
+	err = -EPROTOTYPE;
+	if (sk->sk_type == type)
+		touch_atime(&path);
+	else
+		goto sock_put;
+
+	path_put(&path);
+
+	return sk;
+
+sock_put:
+	sock_put(sk);
+path_put:
+	path_put(&path);
+fail:
+	*error = err;
+	return NULL;
+}
+
+static struct sock *unix_find_abstract(struct net *net,
+				       struct sockaddr_un *sunaddr,
+				       int addr_len, int type,
+				       unsigned int hash, int *error)
+{
+	struct dentry *dentry;
+	struct sock *sk;
+
+	sk = unix_find_socket_byname(net, sunaddr, addr_len, type ^ hash);
+	if (!sk) {
+		*error = -ECONNREFUSED;
+		return NULL;
+	}
+
+	dentry = unix_sk(sk)->path.dentry;
+	if (dentry)
+		touch_atime(&unix_sk(sk)->path);
+
+	return sk;
+}
+
+static struct sock *unix_find_other(struct net *net,
+				    struct sockaddr_un *sunaddr,
+				    int addr_len, int type,
+				    unsigned int hash, int *error)
+{
+	struct sock *sk;
+
+	if (sunaddr->sun_path[0])
+		sk = unix_find_bsd(net, sunaddr, type, error);
+	else
+		sk = unix_find_abstract(net, sunaddr, addr_len, type, hash,
+					error);
+
+	return sk;
+}
+
 static int unix_autobind(struct sock *sk)
 {
 	struct unix_sock *u = unix_sk(sk);
@@ -952,61 +1033,6 @@ out:	mutex_unlock(&u->bindlock);
 	return err;
 }
 
-static struct sock *unix_find_other(struct net *net,
-				    struct sockaddr_un *sunname, int len,
-				    int type, unsigned int hash, int *error)
-{
-	struct sock *u;
-	struct path path;
-	int err = 0;
-
-	if (sunname->sun_path[0]) {
-		struct inode *inode;
-		err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
-		if (err)
-			goto fail;
-		inode = d_backing_inode(path.dentry);
-		err = inode_permission(inode, MAY_WRITE);
-		if (err)
-			goto put_fail;
-
-		err = -ECONNREFUSED;
-		if (!S_ISSOCK(inode->i_mode))
-			goto put_fail;
-		u = unix_find_socket_byinode(inode);
-		if (!u)
-			goto put_fail;
-
-		if (u->sk_type == type)
-			touch_atime(&path);
-
-		path_put(&path);
-
-		err = -EPROTOTYPE;
-		if (u->sk_type != type) {
-			sock_put(u);
-			goto fail;
-		}
-	} else {
-		err = -ECONNREFUSED;
-		u = unix_find_socket_byname(net, sunname, len, type ^ hash);
-		if (u) {
-			struct dentry *dentry;
-			dentry = unix_sk(u)->path.dentry;
-			if (dentry)
-				touch_atime(&unix_sk(u)->path);
-		} else
-			goto fail;
-	}
-	return u;
-
-put_fail:
-	path_put(&path);
-fail:
-	*error = err;
-	return NULL;
-}
-
 static int unix_bind_bsd(struct sock *sk, struct unix_address *addr)
 {
 	struct unix_sock *u = unix_sk(sk);
-- 
Gitee


From c05abd84ed1185f7d46f23c69513624e07897f78 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Date: Tue, 19 Jul 2022 22:10:21 +0800
Subject: [PATCH 533/674] af_unix: Return an error as a pointer in
 unix_find_other().

mainline inclusion
from mainline-v5.17-rc1
commit aed26f557bbc94f0c778f63d7dfe86af99208f68
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4OM1C
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=aed26f557bbc94f0c778f63d7dfe86af99208f68

--------------------------------

We can return an error as a pointer and need not pass an additional
argument to unix_find_other().

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Baisong Zhong <zhongbaisong@huawei.com>
Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 net/unix/af_unix.c | 40 ++++++++++++++++++++++------------------
 1 file changed, 22 insertions(+), 18 deletions(-)

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index eafd1d62bf55..e78477a4fa4b 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -895,7 +895,7 @@ static int unix_release(struct socket *sock)
 }
 
 static struct sock *unix_find_bsd(struct net *net, struct sockaddr_un *sunaddr,
-				  int type, int *error)
+				  int type)
 {
 	struct inode *inode;
 	struct path path;
@@ -934,23 +934,20 @@ static struct sock *unix_find_bsd(struct net *net, struct sockaddr_un *sunaddr,
 path_put:
 	path_put(&path);
 fail:
-	*error = err;
-	return NULL;
+	return ERR_PTR(err);
 }
 
 static struct sock *unix_find_abstract(struct net *net,
 				       struct sockaddr_un *sunaddr,
 				       int addr_len, int type,
-				       unsigned int hash, int *error)
+				       unsigned int hash)
 {
 	struct dentry *dentry;
 	struct sock *sk;
 
 	sk = unix_find_socket_byname(net, sunaddr, addr_len, type ^ hash);
-	if (!sk) {
-		*error = -ECONNREFUSED;
-		return NULL;
-	}
+	if (!sk)
+		return ERR_PTR(-ECONNREFUSED);
 
 	dentry = unix_sk(sk)->path.dentry;
 	if (dentry)
@@ -962,15 +959,14 @@ static struct sock *unix_find_abstract(struct net *net,
 static struct sock *unix_find_other(struct net *net,
 				    struct sockaddr_un *sunaddr,
 				    int addr_len, int type,
-				    unsigned int hash, int *error)
+				    unsigned int hash)
 {
 	struct sock *sk;
 
 	if (sunaddr->sun_path[0])
-		sk = unix_find_bsd(net, sunaddr, type, error);
+		sk = unix_find_bsd(net, sunaddr, type);
 	else
-		sk = unix_find_abstract(net, sunaddr, addr_len, type, hash,
-					error);
+		sk = unix_find_abstract(net, sunaddr, addr_len, type, hash);
 
 	return sk;
 }
@@ -1206,9 +1202,11 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
 		}
 
 restart:
-		other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
-		if (!other)
+		other = unix_find_other(net, sunaddr, alen, sock->type, hash);
+		if (IS_ERR(other)) {
+			err = PTR_ERR(other);
 			goto out;
+		}
 
 		unix_state_double_lock(sk, other);
 
@@ -1330,9 +1328,12 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 
 restart:
 	/*  Find listening sock. */
-	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
-	if (!other)
+	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash);
+	if (IS_ERR(other)) {
+		err = PTR_ERR(other);
+		other = NULL;
 		goto out;
+	}
 
 	/* Latch state of peer */
 	unix_state_lock(other);
@@ -1803,9 +1804,12 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
 			goto out_free;
 
 		other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
-					hash, &err);
-		if (other == NULL)
+					hash);
+		if (IS_ERR(other)) {
+			err = PTR_ERR(other);
+			other = NULL;
 			goto out_free;
+		}
 	}
 
 	if (sk_filter(other, skb) < 0) {
-- 
Gitee


From 4d0707b8fbdbc96b6b74d94f0131b43f07ccd010 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Date: Tue, 19 Jul 2022 22:10:22 +0800
Subject: [PATCH 534/674] af_unix: Cut unix_validate_addr() out of
 unix_mkname().

mainline inclusion
from mainline-v5.17-rc1
commit b8a58aa6fccc5b2940f0da18c7f02e8a1deb693a
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4OM1C
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b8a58aa6fccc5b2940f0da18c7f02e8a1deb693a

--------------------------------

unix_mkname() tests socket address length and family and does some
processing based on the address type.  It is called in the early stage,
and therefore some instructions are redundant and can end up in vain.

The address length/family tests are done twice in unix_bind().  Also, the
address type is rechecked later in unix_bind() and unix_find_other(), where
we can do the same processing.  Moreover, in the BSD address case, the hash
is set to 0 but never used and confusing.

This patch moves the address tests out of unix_mkname(), and the following
patches move the other part into appropriate places and remove
unix_mkname() finally.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Baisong Zhong <zhongbaisong@huawei.com>
Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 net/unix/af_unix.c | 40 ++++++++++++++++++++++++++++++----------
 1 file changed, 30 insertions(+), 10 deletions(-)

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index e78477a4fa4b..ef8b4fcd52fb 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -226,15 +226,22 @@ static inline void unix_release_addr(struct unix_address *addr)
  *		- if started by zero, it is abstract name.
  */
 
+static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
+{
+	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
+	    addr_len > sizeof(*sunaddr))
+		return -EINVAL;
+
+	if (sunaddr->sun_family != AF_UNIX)
+		return -EINVAL;
+
+	return 0;
+}
+
 static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
 {
 	*hashp = 0;
 
-	if (len <= offsetof(struct sockaddr_un, sun_path) ||
-	    len > sizeof(*sunaddr))
-		return -EINVAL;
-	if (!sunaddr || sunaddr->sun_family != AF_UNIX)
-		return -EINVAL;
 	if (sunaddr->sun_path[0]) {
 		/*
 		 * This may look like an off by one error but it is a bit more
@@ -1120,13 +1127,14 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	unsigned int hash;
 	struct unix_address *addr;
 
-	if (addr_len < offsetofend(struct sockaddr_un, sun_family) ||
-	    sunaddr->sun_family != AF_UNIX)
-		return -EINVAL;
-
-	if (addr_len == offsetof(struct sockaddr_un, sun_path))
+	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
+	    sunaddr->sun_family == AF_UNIX)
 		return unix_autobind(sk);
 
+	err = unix_validate_addr(sunaddr, addr_len);
+	if (err)
+		return err;
+
 	err = unix_mkname(sunaddr, addr_len, &hash);
 	if (err < 0)
 		return err;
@@ -1189,6 +1197,10 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
 		goto out;
 
 	if (addr->sa_family != AF_UNSPEC) {
+		err = unix_validate_addr(sunaddr, alen);
+		if (err)
+			goto out;
+
 		err = unix_mkname(sunaddr, alen, &hash);
 		if (err < 0)
 			goto out;
@@ -1296,6 +1308,10 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 	int err;
 	long timeo;
 
+	err = unix_validate_addr(sunaddr, addr_len);
+	if (err)
+		goto out;
+
 	err = unix_mkname(sunaddr, addr_len, &hash);
 	if (err < 0)
 		goto out;
@@ -1747,6 +1763,10 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
 		goto out;
 
 	if (msg->msg_namelen) {
+		err = unix_validate_addr(sunaddr, msg->msg_namelen);
+		if (err)
+			goto out;
+
 		err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
 		if (err < 0)
 			goto out;
-- 
Gitee


From e6a91904b5adb313f0e81a1a41bfabb2e3940905 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Date: Tue, 19 Jul 2022 22:10:23 +0800
Subject: [PATCH 535/674] af_unix: Copy unix_mkname() into
 unix_find_(bsd|abstract)().

mainline inclusion
from mainline-v5.17-rc1
commit d2d8c9fddb1c11ccfa73bf0ad2b1e6b4ea7afdaf
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4OM1C
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d2d8c9fddb1c11ccfa73bf0ad2b1e6b4ea7afdaf

--------------------------------

We should not call unix_mkname() before unix_find_other() and instead do
the same thing where necessary based on the address type:

  - terminating the address with '\0' in unix_find_bsd()
  - calculating the hash in unix_find_abstract().

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Baisong Zhong <zhongbaisong@huawei.com>
Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 net/unix/af_unix.c | 63 ++++++++++++++++++----------------------------
 1 file changed, 25 insertions(+), 38 deletions(-)

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index ef8b4fcd52fb..333b76fd830c 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -238,19 +238,25 @@ static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
 	return 0;
 }
 
+static void unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
+{
+	/* This may look like an off by one error but it is a bit more
+	 * subtle.  108 is the longest valid AF_UNIX path for a binding.
+	 * sun_path[108] doesn't as such exist.  However in kernel space
+	 * we are guaranteed that it is a valid memory location in our
+	 * kernel address buffer because syscall functions always pass
+	 * a pointer of struct sockaddr_storage which has a bigger buffer
+	 * than 108.
+	 */
+	((char *)sunaddr)[addr_len] = 0;
+}
+
 static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
 {
 	*hashp = 0;
 
 	if (sunaddr->sun_path[0]) {
-		/*
-		 * This may look like an off by one error but it is a bit more
-		 * subtle. 108 is the longest valid AF_UNIX path for a binding.
-		 * sun_path[108] doesn't as such exist.  However in kernel space
-		 * we are guaranteed that it is a valid memory location in our
-		 * kernel address buffer.
-		 */
-		((char *)sunaddr)[len] = 0;
+		unix_mkname_bsd(sunaddr, len);
 		len = strlen(sunaddr->sun_path) +
 			offsetof(struct sockaddr_un, sun_path) + 1;
 		return len;
@@ -902,13 +908,14 @@ static int unix_release(struct socket *sock)
 }
 
 static struct sock *unix_find_bsd(struct net *net, struct sockaddr_un *sunaddr,
-				  int type)
+				  int addr_len, int type)
 {
 	struct inode *inode;
 	struct path path;
 	struct sock *sk;
 	int err;
 
+	unix_mkname_bsd(sunaddr, addr_len);
 	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
 	if (err)
 		goto fail;
@@ -946,9 +953,9 @@ static struct sock *unix_find_bsd(struct net *net, struct sockaddr_un *sunaddr,
 
 static struct sock *unix_find_abstract(struct net *net,
 				       struct sockaddr_un *sunaddr,
-				       int addr_len, int type,
-				       unsigned int hash)
+				       int addr_len, int type)
 {
+	unsigned int hash = unix_hash_fold(csum_partial(sunaddr, addr_len, 0));
 	struct dentry *dentry;
 	struct sock *sk;
 
@@ -965,15 +972,14 @@ static struct sock *unix_find_abstract(struct net *net,
 
 static struct sock *unix_find_other(struct net *net,
 				    struct sockaddr_un *sunaddr,
-				    int addr_len, int type,
-				    unsigned int hash)
+				    int addr_len, int type)
 {
 	struct sock *sk;
 
 	if (sunaddr->sun_path[0])
-		sk = unix_find_bsd(net, sunaddr, type);
+		sk = unix_find_bsd(net, sunaddr, addr_len, type);
 	else
-		sk = unix_find_abstract(net, sunaddr, addr_len, type, hash);
+		sk = unix_find_abstract(net, sunaddr, addr_len, type);
 
 	return sk;
 }
@@ -1189,7 +1195,6 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
 	struct net *net = sock_net(sk);
 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
 	struct sock *other;
-	unsigned int hash;
 	int err;
 
 	err = -EINVAL;
@@ -1201,11 +1206,6 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
 		if (err)
 			goto out;
 
-		err = unix_mkname(sunaddr, alen, &hash);
-		if (err < 0)
-			goto out;
-		alen = err;
-
 		if (test_bit(SOCK_PASSCRED, &sock->flags) &&
 		    !unix_sk(sk)->addr) {
 			err = unix_autobind(sk);
@@ -1214,7 +1214,7 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
 		}
 
 restart:
-		other = unix_find_other(net, sunaddr, alen, sock->type, hash);
+		other = unix_find_other(net, sunaddr, alen, sock->type);
 		if (IS_ERR(other)) {
 			err = PTR_ERR(other);
 			goto out;
@@ -1303,7 +1303,6 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 	struct sock *newsk = NULL;
 	struct sock *other = NULL;
 	struct sk_buff *skb = NULL;
-	unsigned int hash;
 	int st;
 	int err;
 	long timeo;
@@ -1312,11 +1311,6 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 	if (err)
 		goto out;
 
-	err = unix_mkname(sunaddr, addr_len, &hash);
-	if (err < 0)
-		goto out;
-	addr_len = err;
-
 	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) {
 		err = unix_autobind(sk);
 		if (err)
@@ -1344,7 +1338,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 
 restart:
 	/*  Find listening sock. */
-	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash);
+	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
 	if (IS_ERR(other)) {
 		err = PTR_ERR(other);
 		other = NULL;
@@ -1744,9 +1738,7 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
 	struct unix_sock *u = unix_sk(sk);
 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
 	struct sock *other = NULL;
-	int namelen = 0; /* fake GCC */
 	int err;
-	unsigned int hash;
 	struct sk_buff *skb;
 	long timeo;
 	struct scm_cookie scm;
@@ -1766,11 +1758,6 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
 		err = unix_validate_addr(sunaddr, msg->msg_namelen);
 		if (err)
 			goto out;
-
-		err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
-		if (err < 0)
-			goto out;
-		namelen = err;
 	} else {
 		sunaddr = NULL;
 		err = -ENOTCONN;
@@ -1823,8 +1810,8 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
 		if (sunaddr == NULL)
 			goto out_free;
 
-		other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
-					hash);
+		other = unix_find_other(net, sunaddr, msg->msg_namelen,
+					sk->sk_type);
 		if (IS_ERR(other)) {
 			err = PTR_ERR(other);
 			other = NULL;
-- 
Gitee


From 3687543508a837e7202cdf2195bfbdebf9dbbf8b Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Date: Tue, 19 Jul 2022 22:10:24 +0800
Subject: [PATCH 536/674] af_unix: Remove unix_mkname().

mainline inclusion
from mainline-v5.17-rc1
commit 5c32a3ed64b4c87ed6d9978074db5f0a54c4cd20
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4OM1C
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=5c32a3ed64b4c87ed6d9978074db5f0a54c4cd20

--------------------------------

This patch removes unix_mkname() and postpones calculating a hash to
unix_bind_abstract().  Some BSD stuffs still remain in unix_bind()
though, the next patch packs them into unix_bind_bsd().

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Baisong Zhong <zhongbaisong@huawei.com>
Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 net/unix/af_unix.c | 34 +++++++++++-----------------------
 1 file changed, 11 insertions(+), 23 deletions(-)

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 333b76fd830c..9f1273fab014 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -251,21 +251,6 @@ static void unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
 	((char *)sunaddr)[addr_len] = 0;
 }
 
-static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
-{
-	*hashp = 0;
-
-	if (sunaddr->sun_path[0]) {
-		unix_mkname_bsd(sunaddr, len);
-		len = strlen(sunaddr->sun_path) +
-			offsetof(struct sockaddr_un, sun_path) + 1;
-		return len;
-	}
-
-	*hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
-	return len;
-}
-
 static void __unix_remove_socket(struct sock *sk)
 {
 	sk_del_node_init(sk);
@@ -1111,6 +1096,9 @@ static int unix_bind_abstract(struct sock *sk, struct unix_address *addr)
 		return -EINVAL;
 	}
 
+	addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));
+	addr->hash ^= sk->sk_type;
+
 	spin_lock(&unix_table_lock);
 	if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len,
 				      addr->hash)) {
@@ -1126,12 +1114,11 @@ static int unix_bind_abstract(struct sock *sk, struct unix_address *addr)
 
 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 {
-	struct sock *sk = sock->sk;
 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
 	char *sun_path = sunaddr->sun_path;
-	int err;
-	unsigned int hash;
+	struct sock *sk = sock->sk;
 	struct unix_address *addr;
+	int err;
 
 	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
 	    sunaddr->sun_family == AF_UNIX)
@@ -1141,17 +1128,18 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	if (err)
 		return err;
 
-	err = unix_mkname(sunaddr, addr_len, &hash);
-	if (err < 0)
-		return err;
-	addr_len = err;
+	if (sun_path[0]) {
+		unix_mkname_bsd(sunaddr, addr_len);
+		addr_len = strlen(sunaddr->sun_path) +
+			offsetof(struct sockaddr_un, sun_path) + 1;
+	}
+
 	addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
 	if (!addr)
 		return -ENOMEM;
 
 	memcpy(addr->name, sunaddr, addr_len);
 	addr->len = addr_len;
-	addr->hash = hash ^ sk->sk_type;
 	refcount_set(&addr->refcnt, 1);
 
 	if (sun_path[0])
-- 
Gitee


From e73a033cd4e3796ef6885df595658646e560c607 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 19 Jul 2022 22:10:25 +0800
Subject: [PATCH 537/674] unix_bind_bsd(): move done_path_create() call after
 dealing with ->bindlock

mainline inclusion
from mainline-v5.14-rc1
commit 56c1731b280dc71febf5df80fcac1923ea973ab8
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4OM1C
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=56c1731b280dc71febf5df80fcac1923ea973ab8

--------------------------------

Final preparations for doing unlink on failure past the successful
mknod.  We can't hold ->bindlock over ->mknod() or ->unlink(), since
either might do sb_start_write() (e.g. on overlayfs).  However, we
can do it while holding filesystem and VFS locks - doing
	kern_path_create()
	vfs_mknod()
	grab ->bindlock
	if u->addr had been set
		drop ->bindlock
		done_path_create
		return -EINVAL
	else
		assign the address to socket
		drop ->bindlock
		done_path_create
		return 0
would be deadlock-free.  Here we massage unix_bind_bsd() to that
form.  We are still doing equivalent transformations.

Next commit will *not* be an equivalent transformation - it will
add a call of vfs_unlink() before done_path_create() in "alread bound"
case.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>

Conflicts:
	net/unix/af_unix.c

Signed-off-by: Baisong Zhong <zhongbaisong@huawei.com>
Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 net/unix/af_unix.c | 26 +++++++++++---------------
 1 file changed, 11 insertions(+), 15 deletions(-)

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 9f1273fab014..fbb7dcaf7d85 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1032,7 +1032,7 @@ static int unix_bind_bsd(struct sock *sk, struct unix_address *addr)
 	struct unix_sock *u = unix_sk(sk);
 	umode_t mode = S_IFSOCK |
 	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
-	struct path parent, path;
+	struct path parent;
 	struct dentry *dentry;
 	unsigned int hash;
 	int err;
@@ -1049,36 +1049,32 @@ static int unix_bind_bsd(struct sock *sk, struct unix_address *addr)
 	 * All right, let's create it.
 	 */
 	err = security_path_mknod(&parent, dentry, mode, 0);
-	if (!err) {
+	if (!err)
 		err = vfs_mknod(d_inode(parent.dentry), dentry, mode, 0);
-		if (!err) {
-			path.mnt = mntget(parent.mnt);
-			path.dentry = dget(dentry);
-		}
-	}
-	done_path_create(&parent, dentry);
-	if (err)
+	if (err) {
+		done_path_create(&parent, dentry);
 		return err;
-
+	}
 	err = mutex_lock_interruptible(&u->bindlock);
 	if (err) {
-		path_put(&path);
+		done_path_create(&parent, dentry);
 		return err;
 	}
-
 	if (u->addr) {
 		mutex_unlock(&u->bindlock);
-		path_put(&path);
+		done_path_create(&parent, dentry);
 		return -EINVAL;
 	}
 
 	addr->hash = UNIX_HASH_SIZE;
-	hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1);
+	hash = d_backing_inode(dentry)->i_ino & (UNIX_HASH_SIZE - 1);
 	spin_lock(&unix_table_lock);
-	u->path = path;
+	u->path.mnt = mntget(parent.mnt);
+	u->path.dentry = dget(dentry);
 	__unix_set_addr(sk, addr, hash);
 	spin_unlock(&unix_table_lock);
 	mutex_unlock(&u->bindlock);
+	done_path_create(&parent, dentry);
 	return 0;
 }
 
-- 
Gitee


From ce559a0c9ae94877e3db8e1a19d2bdf2dc8a07a0 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 19 Jul 2022 22:10:26 +0800
Subject: [PATCH 538/674] unix_bind_bsd(): unlink if we fail after successful
 mknod

mainline inclusion
from mainline-v5.14-rc1
commit c0c3b8d380a8f54c75786d41f6f9efbe761dac6c
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4OM1C
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c0c3b8d380a8f54c75786d41f6f9efbe761dac6c

--------------------------------

We can do that more or less safely, since the parent is
held locked all along.  Yes, somebody might observe the
object via dcache, only to have it disappear afterwards,
but there's really no good way to prevent that.  It won't
race with other bind(2) or attempts to move the sucker
elsewhere, or put something else in its place - locked
parent prevents that.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>

Conflicts:
	net/unix/af_unix.c

Signed-off-by: Baisong Zhong <zhongbaisong@huawei.com>
Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 net/unix/af_unix.c | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index fbb7dcaf7d85..a443b8862c4c 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1051,20 +1051,13 @@ static int unix_bind_bsd(struct sock *sk, struct unix_address *addr)
 	err = security_path_mknod(&parent, dentry, mode, 0);
 	if (!err)
 		err = vfs_mknod(d_inode(parent.dentry), dentry, mode, 0);
-	if (err) {
-		done_path_create(&parent, dentry);
-		return err;
-	}
+	if (err)
+		goto out;
 	err = mutex_lock_interruptible(&u->bindlock);
-	if (err) {
-		done_path_create(&parent, dentry);
-		return err;
-	}
-	if (u->addr) {
-		mutex_unlock(&u->bindlock);
-		done_path_create(&parent, dentry);
-		return -EINVAL;
-	}
+	if (err)
+		goto out_unlink;
+	if (u->addr)
+		goto out_unlock;
 
 	addr->hash = UNIX_HASH_SIZE;
 	hash = d_backing_inode(dentry)->i_ino & (UNIX_HASH_SIZE - 1);
@@ -1076,6 +1069,16 @@ static int unix_bind_bsd(struct sock *sk, struct unix_address *addr)
 	mutex_unlock(&u->bindlock);
 	done_path_create(&parent, dentry);
 	return 0;
+
+out_unlock:
+	mutex_unlock(&u->bindlock);
+	err = -EINVAL;
+out_unlink:
+	/* failed after successful mknod?  unlink what we'd created... */
+	vfs_unlink(d_inode(parent.dentry), dentry, NULL);
+out:
+	done_path_create(&parent, dentry);
+	return err;
 }
 
 static int unix_bind_abstract(struct sock *sk, struct unix_address *addr)
-- 
Gitee


From 26b3fc648100bf91002039cf87d130b6b3389139 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Date: Tue, 19 Jul 2022 22:10:27 +0800
Subject: [PATCH 539/674] af_unix: Allocate unix_address in
 unix_bind_(bsd|abstract)().

mainline inclusion
from mainline-v5.17-rc1
commit 12f21c49ad83eba93d0485b8c9edcc28201bee93
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4OM1C
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=12f21c49ad83eba93d0485b8c9edcc28201bee93

--------------------------------

To terminate address with '\0' in unix_bind_bsd(), we add
unix_create_addr() and call it in unix_bind_bsd() and unix_bind_abstract().

Also, unix_bind_abstract() does not return -EEXIST.  Only
kern_path_create() and vfs_mknod() in unix_bind_bsd() can return it,
so we move the last error check in unix_bind() to unix_bind_bsd().

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

Conflicts:
	net/unix/af_unix.c

Signed-off-by: Baisong Zhong <zhongbaisong@huawei.com>
Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 net/unix/af_unix.c | 105 ++++++++++++++++++++++++++++-----------------
 1 file changed, 66 insertions(+), 39 deletions(-)

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index a443b8862c4c..4a25742ed4be 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -213,6 +213,22 @@ struct sock *unix_peer_get(struct sock *s)
 }
 EXPORT_SYMBOL_GPL(unix_peer_get);
 
+static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
+					     int addr_len)
+{
+	struct unix_address *addr;
+
+	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
+	if (!addr)
+		return NULL;
+
+	refcount_set(&addr->refcnt, 1);
+	addr->len = addr_len;
+	memcpy(addr->name, sunaddr, addr_len);
+
+	return addr;
+}
+
 static inline void unix_release_addr(struct unix_address *addr)
 {
 	if (refcount_dec_and_test(&addr->refcnt))
@@ -1027,23 +1043,35 @@ out:	mutex_unlock(&u->bindlock);
 	return err;
 }
 
-static int unix_bind_bsd(struct sock *sk, struct unix_address *addr)
+static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
+			 int addr_len)
 {
-	struct unix_sock *u = unix_sk(sk);
 	umode_t mode = S_IFSOCK |
 	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
-	struct path parent;
+	struct unix_sock *u = unix_sk(sk);
+	struct unix_address *addr;
 	struct dentry *dentry;
+	struct path parent;
 	unsigned int hash;
 	int err;
 
+	unix_mkname_bsd(sunaddr, addr_len);
+	addr_len = strlen(sunaddr->sun_path) +
+		offsetof(struct sockaddr_un, sun_path) + 1;
+
+	addr = unix_create_addr(sunaddr, addr_len);
+	if (!addr)
+		return -ENOMEM;
+
 	/*
 	 * Get the parent directory, calculate the hash for last
 	 * component.
 	 */
 	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
-	if (IS_ERR(dentry))
-		return PTR_ERR(dentry);
+	if (IS_ERR(dentry)) {
+		err = PTR_ERR(dentry);
+		goto out;
+	}
 
 	/*
 	 * All right, let's create it.
@@ -1052,7 +1080,7 @@ static int unix_bind_bsd(struct sock *sk, struct unix_address *addr)
 	if (!err)
 		err = vfs_mknod(d_inode(parent.dentry), dentry, mode, 0);
 	if (err)
-		goto out;
+		goto out_path;
 	err = mutex_lock_interruptible(&u->bindlock);
 	if (err)
 		goto out_unlink;
@@ -1076,47 +1104,61 @@ static int unix_bind_bsd(struct sock *sk, struct unix_address *addr)
 out_unlink:
 	/* failed after successful mknod?  unlink what we'd created... */
 	vfs_unlink(d_inode(parent.dentry), dentry, NULL);
-out:
+out_path:
 	done_path_create(&parent, dentry);
-	return err;
+out:
+	unix_release_addr(addr);
+	return err == -EEXIST ? -EADDRINUSE : err;
 }
 
-static int unix_bind_abstract(struct sock *sk, struct unix_address *addr)
+static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
+			      int addr_len)
 {
 	struct unix_sock *u = unix_sk(sk);
+	struct unix_address *addr;
 	int err;
 
+	addr = unix_create_addr(sunaddr, addr_len);
+	if (!addr)
+		return -ENOMEM;
+
 	err = mutex_lock_interruptible(&u->bindlock);
 	if (err)
-		return err;
+		goto out;
 
 	if (u->addr) {
-		mutex_unlock(&u->bindlock);
-		return -EINVAL;
+		err = -EINVAL;
+		goto out_mutex;
 	}
 
 	addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));
 	addr->hash ^= sk->sk_type;
 
 	spin_lock(&unix_table_lock);
+
 	if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len,
-				      addr->hash)) {
-		spin_unlock(&unix_table_lock);
-		mutex_unlock(&u->bindlock);
-		return -EADDRINUSE;
-	}
+				      addr->hash))
+		goto out_spin;
+
 	__unix_set_addr(sk, addr, addr->hash);
 	spin_unlock(&unix_table_lock);
 	mutex_unlock(&u->bindlock);
 	return 0;
+
+out_spin:
+	spin_unlock(&unix_table_lock);
+	err = -EADDRINUSE;
+out_mutex:
+	mutex_unlock(&u->bindlock);
+out:
+	unix_release_addr(addr);
+	return err;
 }
 
 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 {
 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
-	char *sun_path = sunaddr->sun_path;
 	struct sock *sk = sock->sk;
-	struct unix_address *addr;
 	int err;
 
 	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
@@ -1127,27 +1169,12 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	if (err)
 		return err;
 
-	if (sun_path[0]) {
-		unix_mkname_bsd(sunaddr, addr_len);
-		addr_len = strlen(sunaddr->sun_path) +
-			offsetof(struct sockaddr_un, sun_path) + 1;
-	}
-
-	addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
-	if (!addr)
-		return -ENOMEM;
-
-	memcpy(addr->name, sunaddr, addr_len);
-	addr->len = addr_len;
-	refcount_set(&addr->refcnt, 1);
-
-	if (sun_path[0])
-		err = unix_bind_bsd(sk, addr);
+	if (sunaddr->sun_path[0])
+		err = unix_bind_bsd(sk, sunaddr, addr_len);
 	else
-		err = unix_bind_abstract(sk, addr);
-	if (err)
-		unix_release_addr(addr);
-	return err == -EEXIST ? -EADDRINUSE : err;
+		err = unix_bind_abstract(sk, sunaddr, addr_len);
+
+	return err;
 }
 
 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
-- 
Gitee


From 915d7d25c0d71e35f7cac439bfe7c06fa8065c99 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Date: Tue, 19 Jul 2022 22:10:28 +0800
Subject: [PATCH 540/674] af_unix: Remove UNIX_ABSTRACT() macro and test
 sun_path[0] instead.

mainline inclusion
from mainline-v5.17-rc1
commit 5ce7ab4961a9320ca0836e06849210d088723a56
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4OM1C
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=5ce7ab4961a9320ca0836e06849210d088723a56

--------------------------------

In BSD and abstract address cases, we store sockets in the hash table with
keys between 0 and UNIX_HASH_SIZE - 1.  However, the hash saved in a socket
varies depending on its address type; sockets with BSD addresses always
have UNIX_HASH_SIZE in their unix_sk(sk)->addr->hash.

This is just for the UNIX_ABSTRACT() macro used to check the address type.
The difference of the saved hashes comes from the first byte of the address
in the first place.  So, we can test it directly.

Then we can keep a real hash in each socket and replace unix_table_lock
with per-hash locks in the later patch.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

Conflicts:
	tools/testing/selftests/bpf/progs/bpf_iter_unix.c
	tools/testing/selftests/bpf/progs/bpf_tracing_net.h
	tools/testing/selftests/bpf/progs/test_skc_to_unix_sock.c

Signed-off-by: Baisong Zhong <zhongbaisong@huawei.com>
Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 net/unix/af_unix.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 4a25742ed4be..e66551d6a43d 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -133,8 +133,6 @@ static struct hlist_head *unix_sockets_unbound(void *addr)
 	return &unix_socket_table[UNIX_HASH_SIZE + hash];
 }
 
-#define UNIX_ABSTRACT(sk)	(unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)
-
 #ifdef CONFIG_SECURITY_NETWORK
 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 {
@@ -2978,9 +2976,9 @@ static int unix_seq_show(struct seq_file *seq, void *v)
 			i = 0;
 			len = u->addr->len -
 				offsetof(struct sockaddr_un, sun_path);
-			if (!UNIX_ABSTRACT(s))
+			if (u->addr->name->sun_path[0]) {
 				len--;
-			else {
+			} else {
 				seq_putc(seq, '@');
 				i++;
 			}
-- 
Gitee


From 2ae0412dc0863a7096624c94aac0a631e13d7e49 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Date: Tue, 19 Jul 2022 22:10:29 +0800
Subject: [PATCH 541/674] af_unix: Return errno instead of NULL in
 unix_create1().

mainline inclusion
from mainline-v5.15-rc4
commit f4bd73b5a950866f6c6fc98a7b684d307c5d586a
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4OM1C
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f4bd73b5a950866f6c6fc98a7b684d307c5d586a

--------------------------------

unix_create1() returns NULL on error, and the callers assume that it never
fails for reasons other than out of memory.  So, the callers always return
-ENOMEM when unix_create1() fails.

However, it also returns NULL when the number of af_unix sockets exceeds
twice the limit controlled by sysctl: fs.file-max.  In this case, the
callers should return -ENFILE like alloc_empty_file().

This patch changes unix_create1() to return the correct error value instead
of NULL on error.

Out of curiosity, the assumption has been wrong since 1999 due to this
change introduced in 2.2.4 [0].

  diff -u --recursive --new-file v2.2.3/linux/net/unix/af_unix.c linux/net/unix/af_unix.c
  --- v2.2.3/linux/net/unix/af_unix.c	Tue Jan 19 11:32:53 1999
  +++ linux/net/unix/af_unix.c	Sun Mar 21 07:22:00 1999
  @@ -388,6 +413,9 @@
   {
   	struct sock *sk;

  +	if (atomic_read(&unix_nr_socks) >= 2*max_files)
  +		return NULL;
  +
   	MOD_INC_USE_COUNT;
   	sk = sk_alloc(PF_UNIX, GFP_KERNEL, 1);
   	if (!sk) {

[0]: https://cdn.kernel.org/pub/linux/kernel/v2.2/patch-2.2.4.gz

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Signed-off-by: David S. Miller <davem@davemloft.net>

Conflicts:
	net/unix/af_unix.c

Signed-off-by: Baisong Zhong <zhongbaisong@huawei.com>
Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 net/unix/af_unix.c | 50 ++++++++++++++++++++++++++++++----------------
 1 file changed, 33 insertions(+), 17 deletions(-)

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index e66551d6a43d..9589e54bd3c9 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -821,16 +821,22 @@ static struct proto unix_proto = {
 
 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
 {
-	struct sock *sk = NULL;
 	struct unix_sock *u;
+	struct sock *sk;
+	int err;
 
 	atomic_long_inc(&unix_nr_socks);
-	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
-		goto out;
+	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
+		err = -ENFILE;
+		goto err;
+	}
 
 	sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto, kern);
-	if (!sk)
-		goto out;
+
+	if (!sk) {
+		err = -ENOMEM;
+		goto err;
+	}
 
 	sock_init_data(sock, sk);
 
@@ -850,20 +856,23 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
 	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
 	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
 	unix_insert_socket(unix_sockets_unbound(sk), sk);
-out:
-	if (sk == NULL)
-		atomic_long_dec(&unix_nr_socks);
-	else {
-		local_bh_disable();
-		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
-		local_bh_enable();
-	}
+
+	local_bh_disable();
+	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+	local_bh_enable();
+
 	return sk;
+
+err:
+	atomic_long_dec(&unix_nr_socks);
+	return ERR_PTR(err);
 }
 
 static int unix_create(struct net *net, struct socket *sock, int protocol,
 		       int kern)
 {
+	struct sock *sk;
+
 	if (protocol && protocol != PF_UNIX)
 		return -EPROTONOSUPPORT;
 
@@ -890,7 +899,11 @@ static int unix_create(struct net *net, struct socket *sock, int protocol,
 		return -ESOCKTNOSUPPORT;
 	}
 
-	return unix_create1(net, sock, kern) ? 0 : -ENOMEM;
+	sk = unix_create1(net, sock, kern);
+	if (IS_ERR(sk))
+		return PTR_ERR(sk);
+
+	return 0;
 }
 
 static int unix_release(struct socket *sock)
@@ -1336,12 +1349,15 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 	   we will have to recheck all again in any case.
 	 */
 
-	err = -ENOMEM;
-
 	/* create new sock for complete connection */
 	newsk = unix_create1(sock_net(sk), NULL, 0);
-	if (newsk == NULL)
+	if (IS_ERR(newsk)) {
+		err = PTR_ERR(newsk);
+		newsk = NULL;
 		goto out;
+	}
+
+	err = -ENOMEM;
 
 	/* Allocate skb for sending to listening sock */
 	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
-- 
Gitee


From ff751cc10e429bd22dc113dff0912d099be193c6 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Date: Tue, 19 Jul 2022 22:10:30 +0800
Subject: [PATCH 542/674] af_unix: Add helpers to calculate hashes.

mainline inclusion
from mainline-v5.17-rc1
commit f452be496a5c8f58b1a67cde79e89b9f1cfde31c
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4OM1C
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f452be496a5c8f58b1a67cde79e89b9f1cfde31c

--------------------------------

This patch adds three helper functions that calculate hashes for unbound
sockets and bound sockets with BSD/abstract addresses.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Baisong Zhong <zhongbaisong@huawei.com>
Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 net/unix/af_unix.c | 64 +++++++++++++++++++++++++---------------------
 1 file changed, 35 insertions(+), 29 deletions(-)

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 9589e54bd3c9..114b09d22b68 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -122,15 +122,38 @@ DEFINE_SPINLOCK(unix_table_lock);
 EXPORT_SYMBOL_GPL(unix_table_lock);
 static atomic_long_t unix_nr_socks;
 
+/* SMP locking strategy:
+ *    hash table is protected with spinlock unix_table_lock
+ *    each socket state is protected by separate spin lock.
+ */
 
-static struct hlist_head *unix_sockets_unbound(void *addr)
+static unsigned int unix_unbound_hash(struct sock *sk)
 {
-	unsigned long hash = (unsigned long)addr;
+	unsigned long hash = (unsigned long)sk;
 
 	hash ^= hash >> 16;
 	hash ^= hash >> 8;
-	hash %= UNIX_HASH_SIZE;
-	return &unix_socket_table[UNIX_HASH_SIZE + hash];
+	hash ^= sk->sk_type;
+
+	return UNIX_HASH_SIZE + (hash & (UNIX_HASH_SIZE - 1));
+}
+
+static unsigned int unix_bsd_hash(struct inode *i)
+{
+	return i->i_ino & (UNIX_HASH_SIZE - 1);
+}
+
+static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
+				       int addr_len, int type)
+{
+	__wsum csum = csum_partial(sunaddr, addr_len, 0);
+	unsigned int hash;
+
+	hash = (__force unsigned int)csum_fold(csum);
+	hash ^= hash >> 8;
+	hash ^= type;
+
+	return hash & (UNIX_HASH_SIZE - 1);
 }
 
 #ifdef CONFIG_SECURITY_NETWORK
@@ -161,20 +184,6 @@ static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
 }
 #endif /* CONFIG_SECURITY_NETWORK */
 
-/*
- *  SMP locking strategy:
- *    hash table is protected with spinlock unix_table_lock
- *    each socket state is protected by separate spin lock.
- */
-
-static inline unsigned int unix_hash_fold(__wsum n)
-{
-	unsigned int hash = (__force unsigned int)csum_fold(n);
-
-	hash ^= hash>>8;
-	return hash&(UNIX_HASH_SIZE-1);
-}
-
 #define unix_peer(sk) (unix_sk(sk)->peer)
 
 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
@@ -333,11 +342,11 @@ static inline struct sock *unix_find_socket_byname(struct net *net,
 
 static struct sock *unix_find_socket_byinode(struct inode *i)
 {
+	unsigned int hash = unix_bsd_hash(i);
 	struct sock *s;
 
 	spin_lock(&unix_table_lock);
-	sk_for_each(s,
-		    &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
+	sk_for_each(s, &unix_socket_table[hash]) {
 		struct dentry *dentry = unix_sk(s)->path.dentry;
 
 		if (dentry && d_backing_inode(dentry) == i) {
@@ -855,7 +864,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
 	init_waitqueue_head(&u->peer_wait);
 	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
 	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
-	unix_insert_socket(unix_sockets_unbound(sk), sk);
+	unix_insert_socket(&unix_socket_table[unix_unbound_hash(sk)], sk);
 
 	local_bh_disable();
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
@@ -967,11 +976,11 @@ static struct sock *unix_find_abstract(struct net *net,
 				       struct sockaddr_un *sunaddr,
 				       int addr_len, int type)
 {
-	unsigned int hash = unix_hash_fold(csum_partial(sunaddr, addr_len, 0));
+	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
 	struct dentry *dentry;
 	struct sock *sk;
 
-	sk = unix_find_socket_byname(net, sunaddr, addr_len, type ^ hash);
+	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
 	if (!sk)
 		return ERR_PTR(-ECONNREFUSED);
 
@@ -1023,8 +1032,7 @@ static int unix_autobind(struct sock *sk)
 retry:
 	addr->len = sprintf(addr->name->sun_path + 1, "%05x", ordernum) +
 		offsetof(struct sockaddr_un, sun_path) + 1;
-	addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));
-	addr->hash ^= sk->sk_type;
+	addr->hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
 
 	spin_lock(&unix_table_lock);
 	ordernum = (ordernum+1)&0xFFFFF;
@@ -1099,7 +1107,7 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
 		goto out_unlock;
 
 	addr->hash = UNIX_HASH_SIZE;
-	hash = d_backing_inode(dentry)->i_ino & (UNIX_HASH_SIZE - 1);
+	hash = unix_bsd_hash(d_backing_inode(dentry));
 	spin_lock(&unix_table_lock);
 	u->path.mnt = mntget(parent.mnt);
 	u->path.dentry = dget(dentry);
@@ -1142,9 +1150,7 @@ static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
 		goto out_mutex;
 	}
 
-	addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));
-	addr->hash ^= sk->sk_type;
-
+	addr->hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
 	spin_lock(&unix_table_lock);
 
 	if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len,
-- 
Gitee


From ef9bfa8e29eaa22b13a4e2c006245484b7fb6c40 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Date: Tue, 19 Jul 2022 22:10:31 +0800
Subject: [PATCH 543/674] af_unix: Save hash in sk_hash.

mainline inclusion
from mainline-v5.17-rc1
commit e6b4b873896f0e9298f70d25726f4bb1e1b265ba
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4OM1C
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e6b4b873896f0e9298f70d25726f4bb1e1b265ba

--------------------------------

To replace unix_table_lock with per-hash locks in the next patch, we need
to save a hash in each socket because /proc/net/unix or BPF prog iterate
sockets while holding a hash table lock and release it later in a different
function.

Currently, we store a real/pseudo hash in struct unix_address.  However, we
do not allocate it to unbound sockets, nor should we do just for that.  For
this purpose, we can use sk_hash.  Then, we no longer use the hash field in
struct unix_address and can remove it.

Also, this patch does
  - rename unix_insert_socket() to unix_insert_unbound_socket()
  - remove the redundant list argument from __unix_insert_socket() and
     unix_insert_unbound_socket()
  - use 'unsigned int' instead of 'unsigned' in __unix_set_addr_hash()
  - remove 'inline' from unix_remove_socket() and
     unix_insert_unbound_socket().

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Baisong Zhong <zhongbaisong@huawei.com>
Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 include/net/af_unix.h |  2 +-
 net/unix/af_unix.c    | 42 +++++++++++++++++++++++-------------------
 2 files changed, 24 insertions(+), 20 deletions(-)

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index f42fdddecd41..beb6ef56bf85 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -26,7 +26,7 @@ extern struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
 struct unix_address {
 	refcount_t	refcnt;
 	int		len;
-	unsigned int	hash;
+	unsigned int	hash;		/* deprecated */
 	struct sockaddr_un name[];
 };
 
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 114b09d22b68..552f95c7d9e1 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -279,31 +279,33 @@ static void __unix_remove_socket(struct sock *sk)
 	sk_del_node_init(sk);
 }
 
-static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
+static void __unix_insert_socket(struct sock *sk)
 {
 	WARN_ON(!sk_unhashed(sk));
-	sk_add_node(sk, list);
+	sk_add_node(sk, &unix_socket_table[sk->sk_hash]);
 }
 
-static void __unix_set_addr(struct sock *sk, struct unix_address *addr,
-			    unsigned hash)
+static void __unix_set_addr_hash(struct sock *sk, struct unix_address *addr,
+				 unsigned int hash)
 {
 	__unix_remove_socket(sk);
 	smp_store_release(&unix_sk(sk)->addr, addr);
-	__unix_insert_socket(&unix_socket_table[hash], sk);
+
+	sk->sk_hash = hash;
+	__unix_insert_socket(sk);
 }
 
-static inline void unix_remove_socket(struct sock *sk)
+static void unix_remove_socket(struct sock *sk)
 {
 	spin_lock(&unix_table_lock);
 	__unix_remove_socket(sk);
 	spin_unlock(&unix_table_lock);
 }
 
-static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
+static void unix_insert_unbound_socket(struct sock *sk)
 {
 	spin_lock(&unix_table_lock);
-	__unix_insert_socket(list, sk);
+	__unix_insert_socket(sk);
 	spin_unlock(&unix_table_lock);
 }
 
@@ -849,6 +851,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
 
 	sock_init_data(sock, sk);
 
+	sk->sk_hash		= unix_unbound_hash(sk);
 	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
 	sk->sk_write_space	= unix_write_space;
 	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
@@ -864,7 +867,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
 	init_waitqueue_head(&u->peer_wait);
 	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
 	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
-	unix_insert_socket(&unix_socket_table[unix_unbound_hash(sk)], sk);
+	unix_insert_unbound_socket(sk);
 
 	local_bh_disable();
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
@@ -1011,6 +1014,7 @@ static int unix_autobind(struct sock *sk)
 	struct unix_address *addr;
 	unsigned int retries = 0;
 	static u32 ordernum = 1;
+	unsigned int new_hash;
 	int err;
 
 	err = mutex_lock_interruptible(&u->bindlock);
@@ -1032,13 +1036,13 @@ static int unix_autobind(struct sock *sk)
 retry:
 	addr->len = sprintf(addr->name->sun_path + 1, "%05x", ordernum) +
 		offsetof(struct sockaddr_un, sun_path) + 1;
-	addr->hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
 
+	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
 	spin_lock(&unix_table_lock);
 	ordernum = (ordernum+1)&0xFFFFF;
 
 	if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len,
-				      addr->hash)) {
+				      new_hash)) {
 		spin_unlock(&unix_table_lock);
 		/*
 		 * __unix_find_socket_byname() may take long time if many names
@@ -1054,7 +1058,7 @@ static int unix_autobind(struct sock *sk)
 		goto retry;
 	}
 
-	__unix_set_addr(sk, addr, addr->hash);
+	__unix_set_addr_hash(sk, addr, new_hash);
 	spin_unlock(&unix_table_lock);
 	err = 0;
 
@@ -1069,9 +1073,9 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
 	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
 	struct unix_sock *u = unix_sk(sk);
 	struct unix_address *addr;
+	unsigned int new_hash;
 	struct dentry *dentry;
 	struct path parent;
-	unsigned int hash;
 	int err;
 
 	unix_mkname_bsd(sunaddr, addr_len);
@@ -1106,12 +1110,11 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
 	if (u->addr)
 		goto out_unlock;
 
-	addr->hash = UNIX_HASH_SIZE;
-	hash = unix_bsd_hash(d_backing_inode(dentry));
+	new_hash = unix_bsd_hash(d_backing_inode(dentry));
 	spin_lock(&unix_table_lock);
 	u->path.mnt = mntget(parent.mnt);
 	u->path.dentry = dget(dentry);
-	__unix_set_addr(sk, addr, hash);
+	__unix_set_addr_hash(sk, addr, new_hash);
 	spin_unlock(&unix_table_lock);
 	mutex_unlock(&u->bindlock);
 	done_path_create(&parent, dentry);
@@ -1135,6 +1138,7 @@ static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
 {
 	struct unix_sock *u = unix_sk(sk);
 	struct unix_address *addr;
+	unsigned int new_hash;
 	int err;
 
 	addr = unix_create_addr(sunaddr, addr_len);
@@ -1150,14 +1154,14 @@ static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
 		goto out_mutex;
 	}
 
-	addr->hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
+	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
 	spin_lock(&unix_table_lock);
 
 	if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len,
-				      addr->hash))
+				      new_hash))
 		goto out_spin;
 
-	__unix_set_addr(sk, addr, addr->hash);
+	__unix_set_addr_hash(sk, addr, new_hash);
 	spin_unlock(&unix_table_lock);
 	mutex_unlock(&u->bindlock);
 	return 0;
-- 
Gitee


From 0a92541d3ad1cb5f2cae7605fd7ce186b544795a Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Date: Tue, 19 Jul 2022 22:10:32 +0800
Subject: [PATCH 544/674] af_unix: Replace the big lock with small locks.

mainline inclusion
from mainline-v5.17-rc1
commit afd20b9290e184c203fe22f2d6b80dc7127ba724
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4OM1C
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=afd20b9290e184c203fe22f2d6b80dc7127ba724

--------------------------------

The hash table of AF_UNIX sockets is protected by the single lock.  This
patch replaces it with per-hash locks.

The effect is noticeable when we handle multiple sockets simultaneously.
Here is a test result on an EC2 c5.24xlarge instance.  It shows latency
(under 10us only) in unix_insert_unbound_socket() while 64 CPUs creating
1024 sockets for each in parallel.

  Without this patch:

     nsec          : count     distribution
        0          : 179      |                                        |
        500        : 3021     |*********                               |
        1000       : 6271     |*******************                     |
        1500       : 6318     |*******************                     |
        2000       : 5828     |*****************                       |
        2500       : 5124     |***************                         |
        3000       : 4426     |*************                           |
        3500       : 3672     |***********                             |
        4000       : 3138     |*********                               |
        4500       : 2811     |********                                |
        5000       : 2384     |*******                                 |
        5500       : 2023     |******                                  |
        6000       : 1954     |*****                                   |
        6500       : 1737     |*****                                   |
        7000       : 1749     |*****                                   |
        7500       : 1520     |****                                    |
        8000       : 1469     |****                                    |
        8500       : 1394     |****                                    |
        9000       : 1232     |***                                     |
        9500       : 1138     |***                                     |
        10000      : 994      |***                                     |

  With this patch:

     nsec          : count     distribution
        0          : 1634     |****                                    |
        500        : 13170    |****************************************|
        1000       : 13156    |*************************************** |
        1500       : 9010     |***************************             |
        2000       : 6363     |*******************                     |
        2500       : 4443     |*************                           |
        3000       : 3240     |*********                               |
        3500       : 2549     |*******                                 |
        4000       : 1872     |*****                                   |
        4500       : 1504     |****                                    |
        5000       : 1247     |***                                     |
        5500       : 1035     |***                                     |
        6000       : 889      |**                                      |
        6500       : 744      |**                                      |
        7000       : 634      |*                                       |
        7500       : 498      |*                                       |
        8000       : 433      |*                                       |
        8500       : 355      |*                                       |
        9000       : 336      |*                                       |
        9500       : 284      |                                        |
        10000      : 243      |                                        |

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

Conflicts:
	net/unix/af_unix.c

Signed-off-by: Baisong Zhong <zhongbaisong@huawei.com>
Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 include/net/af_unix.h |  2 +-
 net/unix/af_unix.c    | 98 ++++++++++++++++++++++++++-----------------
 net/unix/diag.c       | 20 ++++-----
 3 files changed, 71 insertions(+), 49 deletions(-)

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index beb6ef56bf85..ae41d8ee970a 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -20,7 +20,7 @@ struct sock *unix_peer_get(struct sock *sk);
 #define UNIX_HASH_BITS	8
 
 extern unsigned int unix_tot_inflight;
-extern spinlock_t unix_table_lock;
+extern spinlock_t unix_table_locks[2 * UNIX_HASH_SIZE];
 extern struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
 
 struct unix_address {
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 552f95c7d9e1..b497dee9b2a4 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -116,14 +116,14 @@
 
 #include "scm.h"
 
+spinlock_t unix_table_locks[2 * UNIX_HASH_SIZE];
+EXPORT_SYMBOL_GPL(unix_table_locks);
 struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
 EXPORT_SYMBOL_GPL(unix_socket_table);
-DEFINE_SPINLOCK(unix_table_lock);
-EXPORT_SYMBOL_GPL(unix_table_lock);
 static atomic_long_t unix_nr_socks;
 
 /* SMP locking strategy:
- *    hash table is protected with spinlock unix_table_lock
+ *    hash table is protected with spinlock unix_table_locks
  *    each socket state is protected by separate spin lock.
  */
 
@@ -156,6 +156,25 @@ static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
 	return hash & (UNIX_HASH_SIZE - 1);
 }
 
+static void unix_table_double_lock(unsigned int hash1, unsigned int hash2)
+{
+	/* hash1 and hash2 is never the same because
+	 * one is between 0 and UNIX_HASH_SIZE - 1, and
+	 * another is between UNIX_HASH_SIZE and UNIX_HASH_SIZE * 2.
+	 */
+	if (hash1 > hash2)
+		swap(hash1, hash2);
+
+	spin_lock(&unix_table_locks[hash1]);
+	spin_lock_nested(&unix_table_locks[hash2], SINGLE_DEPTH_NESTING);
+}
+
+static void unix_table_double_unlock(unsigned int hash1, unsigned int hash2)
+{
+	spin_unlock(&unix_table_locks[hash1]);
+	spin_unlock(&unix_table_locks[hash2]);
+}
+
 #ifdef CONFIG_SECURITY_NETWORK
 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 {
@@ -297,16 +316,16 @@ static void __unix_set_addr_hash(struct sock *sk, struct unix_address *addr,
 
 static void unix_remove_socket(struct sock *sk)
 {
-	spin_lock(&unix_table_lock);
+	spin_lock(&unix_table_locks[sk->sk_hash]);
 	__unix_remove_socket(sk);
-	spin_unlock(&unix_table_lock);
+	spin_unlock(&unix_table_locks[sk->sk_hash]);
 }
 
 static void unix_insert_unbound_socket(struct sock *sk)
 {
-	spin_lock(&unix_table_lock);
+	spin_lock(&unix_table_locks[sk->sk_hash]);
 	__unix_insert_socket(sk);
-	spin_unlock(&unix_table_lock);
+	spin_unlock(&unix_table_locks[sk->sk_hash]);
 }
 
 static struct sock *__unix_find_socket_byname(struct net *net,
@@ -334,11 +353,11 @@ static inline struct sock *unix_find_socket_byname(struct net *net,
 {
 	struct sock *s;
 
-	spin_lock(&unix_table_lock);
+	spin_lock(&unix_table_locks[hash]);
 	s = __unix_find_socket_byname(net, sunname, len, hash);
 	if (s)
 		sock_hold(s);
-	spin_unlock(&unix_table_lock);
+	spin_unlock(&unix_table_locks[hash]);
 	return s;
 }
 
@@ -347,19 +366,18 @@ static struct sock *unix_find_socket_byinode(struct inode *i)
 	unsigned int hash = unix_bsd_hash(i);
 	struct sock *s;
 
-	spin_lock(&unix_table_lock);
+	spin_lock(&unix_table_locks[hash]);
 	sk_for_each(s, &unix_socket_table[hash]) {
 		struct dentry *dentry = unix_sk(s)->path.dentry;
 
 		if (dentry && d_backing_inode(dentry) == i) {
 			sock_hold(s);
-			goto found;
+			spin_unlock(&unix_table_locks[hash]);
+			return s;
 		}
 	}
-	s = NULL;
-found:
-	spin_unlock(&unix_table_lock);
-	return s;
+	spin_unlock(&unix_table_locks[hash]);
+	return NULL;
 }
 
 /* Support code for asymmetrically connected dgram sockets
@@ -1010,11 +1028,11 @@ static struct sock *unix_find_other(struct net *net,
 
 static int unix_autobind(struct sock *sk)
 {
+	unsigned int new_hash, old_hash = sk->sk_hash;
 	struct unix_sock *u = unix_sk(sk);
 	struct unix_address *addr;
 	unsigned int retries = 0;
 	static u32 ordernum = 1;
-	unsigned int new_hash;
 	int err;
 
 	err = mutex_lock_interruptible(&u->bindlock);
@@ -1038,12 +1056,13 @@ static int unix_autobind(struct sock *sk)
 		offsetof(struct sockaddr_un, sun_path) + 1;
 
 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
-	spin_lock(&unix_table_lock);
+	unix_table_double_lock(old_hash, new_hash);
 	ordernum = (ordernum+1)&0xFFFFF;
 
 	if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len,
 				      new_hash)) {
-		spin_unlock(&unix_table_lock);
+		unix_table_double_unlock(old_hash, new_hash);
+
 		/*
 		 * __unix_find_socket_byname() may take long time if many names
 		 * are already in use.
@@ -1059,7 +1078,7 @@ static int unix_autobind(struct sock *sk)
 	}
 
 	__unix_set_addr_hash(sk, addr, new_hash);
-	spin_unlock(&unix_table_lock);
+	unix_table_double_unlock(old_hash, new_hash);
 	err = 0;
 
 out:	mutex_unlock(&u->bindlock);
@@ -1071,9 +1090,9 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
 {
 	umode_t mode = S_IFSOCK |
 	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
+	unsigned int new_hash, old_hash = sk->sk_hash;
 	struct unix_sock *u = unix_sk(sk);
 	struct unix_address *addr;
-	unsigned int new_hash;
 	struct dentry *dentry;
 	struct path parent;
 	int err;
@@ -1111,11 +1130,11 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
 		goto out_unlock;
 
 	new_hash = unix_bsd_hash(d_backing_inode(dentry));
-	spin_lock(&unix_table_lock);
+	unix_table_double_lock(old_hash, new_hash);
 	u->path.mnt = mntget(parent.mnt);
 	u->path.dentry = dget(dentry);
 	__unix_set_addr_hash(sk, addr, new_hash);
-	spin_unlock(&unix_table_lock);
+	unix_table_double_unlock(old_hash, new_hash);
 	mutex_unlock(&u->bindlock);
 	done_path_create(&parent, dentry);
 	return 0;
@@ -1136,9 +1155,9 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
 			      int addr_len)
 {
+	unsigned int new_hash, old_hash = sk->sk_hash;
 	struct unix_sock *u = unix_sk(sk);
 	struct unix_address *addr;
-	unsigned int new_hash;
 	int err;
 
 	addr = unix_create_addr(sunaddr, addr_len);
@@ -1155,19 +1174,19 @@ static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
 	}
 
 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
-	spin_lock(&unix_table_lock);
+	unix_table_double_lock(old_hash, new_hash);
 
 	if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len,
 				      new_hash))
 		goto out_spin;
 
 	__unix_set_addr_hash(sk, addr, new_hash);
-	spin_unlock(&unix_table_lock);
+	unix_table_double_unlock(old_hash, new_hash);
 	mutex_unlock(&u->bindlock);
 	return 0;
 
 out_spin:
-	spin_unlock(&unix_table_lock);
+	unix_table_double_unlock(old_hash, new_hash);
 	err = -EADDRINUSE;
 out_mutex:
 	mutex_unlock(&u->bindlock);
@@ -1469,9 +1488,9 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 	 *
 	 * The contents of *(otheru->addr) and otheru->path
 	 * are seen fully set up here, since we have found
-	 * otheru in hash under unix_table_lock.  Insertion
+	 * otheru in hash under unix_table_locks.  Insertion
 	 * into the hash chain we'd found it in had been done
-	 * in an earlier critical area protected by unix_table_lock,
+	 * in an earlier critical area protected by unix_table_locks,
 	 * the same one where we'd set *(otheru->addr) contents,
 	 * as well as otheru->path and otheru->addr itself.
 	 *
@@ -2900,7 +2919,7 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
 
 #define get_bucket(x) ((x) >> BUCKET_SPACE)
-#define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1))
+#define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
 
 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
@@ -2924,7 +2943,7 @@ static struct sock *unix_next_socket(struct seq_file *seq,
 				     struct sock *sk,
 				     loff_t *pos)
 {
-	unsigned long bucket;
+	unsigned long bucket = get_bucket(*pos);
 
 	while (sk > (struct sock *)SEQ_START_TOKEN) {
 		sk = sk_next(sk);
@@ -2935,12 +2954,13 @@ static struct sock *unix_next_socket(struct seq_file *seq,
 	}
 
 	do {
+		spin_lock(&unix_table_locks[bucket]);
 		sk = unix_from_bucket(seq, pos);
 		if (sk)
 			return sk;
 
 next_bucket:
-		bucket = get_bucket(*pos) + 1;
+		spin_unlock(&unix_table_locks[bucket++]);
 		*pos = set_bucket_offset(bucket, 1);
 	} while (bucket < ARRAY_SIZE(unix_socket_table));
 
@@ -2948,10 +2968,7 @@ static struct sock *unix_next_socket(struct seq_file *seq,
 }
 
 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
-	__acquires(unix_table_lock)
 {
-	spin_lock(&unix_table_lock);
-
 	if (!*pos)
 		return SEQ_START_TOKEN;
 
@@ -2968,9 +2985,11 @@ static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 }
 
 static void unix_seq_stop(struct seq_file *seq, void *v)
-	__releases(unix_table_lock)
 {
-	spin_unlock(&unix_table_lock);
+	struct sock *sk = v;
+
+	if (sk)
+		spin_unlock(&unix_table_locks[sk->sk_hash]);
 }
 
 static int unix_seq_show(struct seq_file *seq, void *v)
@@ -2995,7 +3014,7 @@ static int unix_seq_show(struct seq_file *seq, void *v)
 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
 			sock_i_ino(s));
 
-		if (u->addr) {	// under unix_table_lock here
+		if (u->addr) {	// under unix_table_locks here
 			int i, len;
 			seq_putc(seq, ' ');
 
@@ -3067,10 +3086,13 @@ static struct pernet_operations unix_net_ops = {
 
 static int __init af_unix_init(void)
 {
-	int rc = -1;
+	int i, rc = -1;
 
 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
 
+	for (i = 0; i < 2 * UNIX_HASH_SIZE; i++)
+		spin_lock_init(&unix_table_locks[i]);
+
 	rc = proto_register(&unix_proto, 1);
 	if (rc != 0) {
 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
diff --git a/net/unix/diag.c b/net/unix/diag.c
index dd77e81b41a0..c522ab94d0c8 100644
--- a/net/unix/diag.c
+++ b/net/unix/diag.c
@@ -13,7 +13,7 @@
 
 static int sk_diag_dump_name(struct sock *sk, struct sk_buff *nlskb)
 {
-	/* might or might not have unix_table_lock */
+	/* might or might not have unix_table_locks */
 	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
 
 	if (!addr)
@@ -204,13 +204,13 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
 	s_slot = cb->args[0];
 	num = s_num = cb->args[1];
 
-	spin_lock(&unix_table_lock);
 	for (slot = s_slot;
 	     slot < ARRAY_SIZE(unix_socket_table);
 	     s_num = 0, slot++) {
 		struct sock *sk;
 
 		num = 0;
+		spin_lock(&unix_table_locks[slot]);
 		sk_for_each(sk, &unix_socket_table[slot]) {
 			if (!net_eq(sock_net(sk), net))
 				continue;
@@ -221,14 +221,16 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
 			if (sk_diag_dump(sk, skb, req,
 					 NETLINK_CB(cb->skb).portid,
 					 cb->nlh->nlmsg_seq,
-					 NLM_F_MULTI) < 0)
+					 NLM_F_MULTI) < 0) {
+				spin_unlock(&unix_table_locks[slot]);
 				goto done;
+			}
 next:
 			num++;
 		}
+		spin_unlock(&unix_table_locks[slot]);
 	}
 done:
-	spin_unlock(&unix_table_lock);
 	cb->args[0] = slot;
 	cb->args[1] = num;
 
@@ -237,21 +239,19 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
 
 static struct sock *unix_lookup_by_ino(unsigned int ino)
 {
-	int i;
 	struct sock *sk;
+	int i;
 
-	spin_lock(&unix_table_lock);
 	for (i = 0; i < ARRAY_SIZE(unix_socket_table); i++) {
+		spin_lock(&unix_table_locks[i]);
 		sk_for_each(sk, &unix_socket_table[i])
 			if (ino == sock_i_ino(sk)) {
 				sock_hold(sk);
-				spin_unlock(&unix_table_lock);
-
+				spin_unlock(&unix_table_locks[i]);
 				return sk;
 			}
+		spin_unlock(&unix_table_locks[i]);
 	}
-
-	spin_unlock(&unix_table_lock);
 	return NULL;
 }
 
-- 
Gitee


From 93e5fa79f21ad078d7f7c055e8be023892006d05 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Date: Tue, 19 Jul 2022 22:10:33 +0800
Subject: [PATCH 545/674] af_unix: Relax race in unix_autobind().

mainline inclusion
from mainline-v5.17-rc1
commit 9acbc584c3a4e9706703039708ec947ffc152c66
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4OM1C
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=9acbc584c3a4e9706703039708ec947ffc152c66

--------------------------------

When we bind an AF_UNIX socket without a name specified, the kernel selects
an available one from 0x00000 to 0xFFFFF.  unix_autobind() starts searching
from a number in the 'static' variable and increments it after acquiring
two locks.

If multiple processes try autobind, they obtain the same lock and check if
a socket in the hash list has the same name.  If not, one process uses it,
and all except one end up retrying the _next_ number (actually not, it may
be incremented by the other processes).  The more we autobind sockets in
parallel, the longer the latency gets.  We can avoid such a race by
searching for a name from a random number.

These show latency in unix_autobind() while 64 CPUs are simultaneously
autobind-ing 1024 sockets for each.

  Without this patch:

     usec          : count     distribution
        0          : 1176     |***                                     |
        2          : 3655     |***********                             |
        4          : 4094     |*************                           |
        6          : 3831     |************                            |
        8          : 3829     |************                            |
        10         : 3844     |************                            |
        12         : 3638     |***********                             |
        14         : 2992     |*********                               |
        16         : 2485     |*******                                 |
        18         : 2230     |*******                                 |
        20         : 2095     |******                                  |
        22         : 1853     |*****                                   |
        24         : 1827     |*****                                   |
        26         : 1677     |*****                                   |
        28         : 1473     |****                                    |
        30         : 1573     |*****                                   |
        32         : 1417     |****                                    |
        34         : 1385     |****                                    |
        36         : 1345     |****                                    |
        38         : 1344     |****                                    |
        40         : 1200     |***                                     |

  With this patch:

     usec          : count     distribution
        0          : 1855     |******                                  |
        2          : 6464     |*********************                   |
        4          : 9936     |********************************        |
        6          : 12107    |****************************************|
        8          : 10441    |**********************************      |
        10         : 7264     |***********************                 |
        12         : 4254     |**************                          |
        14         : 2538     |********                                |
        16         : 1596     |*****                                   |
        18         : 1088     |***                                     |
        20         : 800      |**                                      |
        22         : 670      |**                                      |
        24         : 601      |*                                       |
        26         : 562      |*                                       |
        28         : 525      |*                                       |
        30         : 446      |*                                       |
        32         : 378      |*                                       |
        34         : 337      |*                                       |
        36         : 317      |*                                       |
        38         : 314      |*                                       |
        40         : 298      |                                        |

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Baisong Zhong <zhongbaisong@huawei.com>
Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 net/unix/af_unix.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index b497dee9b2a4..de3a1ffac26d 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1031,8 +1031,7 @@ static int unix_autobind(struct sock *sk)
 	unsigned int new_hash, old_hash = sk->sk_hash;
 	struct unix_sock *u = unix_sk(sk);
 	struct unix_address *addr;
-	unsigned int retries = 0;
-	static u32 ordernum = 1;
+	u32 lastnum, ordernum;
 	int err;
 
 	err = mutex_lock_interruptible(&u->bindlock);
@@ -1048,32 +1047,35 @@ static int unix_autobind(struct sock *sk)
 	if (!addr)
 		goto out;
 
+	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
 	addr->name->sun_family = AF_UNIX;
 	refcount_set(&addr->refcnt, 1);
 
+	ordernum = prandom_u32();
+	lastnum = ordernum & 0xFFFFF;
 retry:
-	addr->len = sprintf(addr->name->sun_path + 1, "%05x", ordernum) +
-		offsetof(struct sockaddr_un, sun_path) + 1;
+	ordernum = (ordernum + 1) & 0xFFFFF;
+	sprintf(addr->name->sun_path + 1, "%05x", ordernum);
 
 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
 	unix_table_double_lock(old_hash, new_hash);
-	ordernum = (ordernum+1)&0xFFFFF;
 
 	if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len,
 				      new_hash)) {
 		unix_table_double_unlock(old_hash, new_hash);
 
-		/*
-		 * __unix_find_socket_byname() may take long time if many names
+		/* __unix_find_socket_byname() may take long time if many names
 		 * are already in use.
 		 */
 		cond_resched();
-		/* Give up if all names seems to be in use. */
-		if (retries++ == 0xFFFFF) {
+
+		if (ordernum == lastnum) {
+			/* Give up if all names seems to be in use. */
 			err = -ENOSPC;
-			kfree(addr);
+			unix_release_addr(addr);
 			goto out;
 		}
+
 		goto retry;
 	}
 
-- 
Gitee


From 8b038d52a6cb7e735aaa5f100cecb70cba382981 Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <naoya.horiguchi@nec.com>
Date: Wed, 13 Jul 2022 12:19:58 +0800
Subject: [PATCH 546/674] mm/hwpoison: do not lock page again when
 me_huge_page() successfully recovers

mainline inclusion
from mainline-v5.13
commit ea6d0630100b285f059d0a8d8e86f38a46407536
category: bugfix
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1
CVE: NA

Intel_SIG: commit ea6d0630100b mm/hwpoison: do not lock page again when me_huge_page() successfully recovers.
Backport for MCA recovery enhancing & bug fix.

--------------------------------

Currently me_huge_page() temporary unlocks page to perform some actions
then locks it again later.  My testcase (which calls hard-offline on
some tail page in a hugetlb, then accesses the address of the hugetlb
range) showed that page allocation code detects this page lock on buddy
page and printed out "BUG: Bad page state" message.

check_new_page_bad() does not consider a page with __PG_HWPOISON as bad
page, so this flag works as kind of filter, but this filtering doesn't
work in this case because the "bad page" is not the actual hwpoisoned
page.  So stop locking page again.  Actions to be taken depend on the
page type of the error, so page unlocking should be done in ->action()
callbacks.  So let's make it assumed and change all existing callbacks
that way.

Link: https://lkml.kernel.org/r/20210609072029.74645-1-nao.horiguchi@gmail.com
Fixes: commit 78bb920344b8 ("mm: hwpoison: dissolve in-use hugepage in unrecoverable memory error")
Signed-off-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Youquan Song <youquan.song@intel.com>
---
 mm/memory-failure.c | 44 ++++++++++++++++++++++++++++++--------------
 1 file changed, 30 insertions(+), 14 deletions(-)

diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 0519f20d2b57..74a21042845a 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -658,6 +658,7 @@ static int truncate_error_page(struct page *p, unsigned long pfn,
  */
 static int me_kernel(struct page *p, unsigned long pfn)
 {
+	unlock_page(p);
 	return MF_IGNORED;
 }
 
@@ -667,6 +668,7 @@ static int me_kernel(struct page *p, unsigned long pfn)
 static int me_unknown(struct page *p, unsigned long pfn)
 {
 	pr_err("Memory failure: %#lx: Unknown page state\n", pfn);
+	unlock_page(p);
 	return MF_FAILED;
 }
 
@@ -675,6 +677,7 @@ static int me_unknown(struct page *p, unsigned long pfn)
  */
 static int me_pagecache_clean(struct page *p, unsigned long pfn)
 {
+	int ret;
 	struct address_space *mapping;
 
 	delete_from_lru_cache(p);
@@ -683,8 +686,10 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
 	 * For anonymous pages we're done the only reference left
 	 * should be the one m_f() holds.
 	 */
-	if (PageAnon(p))
-		return MF_RECOVERED;
+	if (PageAnon(p)) {
+		ret = MF_RECOVERED;
+		goto out;
+	}
 
 	/*
 	 * Now truncate the page in the page cache. This is really
@@ -698,7 +703,8 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
 		/*
 		 * Page has been teared down in the meanwhile
 		 */
-		return MF_FAILED;
+		ret = MF_FAILED;
+		goto out;
 	}
 
 	/*
@@ -706,7 +712,10 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
 	 *
 	 * Open: to take i_mutex or not for this? Right now we don't.
 	 */
-	return truncate_error_page(p, pfn, mapping);
+	ret = truncate_error_page(p, pfn, mapping);
+out:
+	unlock_page(p);
+	return ret;
 }
 
 /*
@@ -782,24 +791,26 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
  */
 static int me_swapcache_dirty(struct page *p, unsigned long pfn)
 {
+	int ret;
+
 	ClearPageDirty(p);
 	/* Trigger EIO in shmem: */
 	ClearPageUptodate(p);
 
-	if (!delete_from_lru_cache(p))
-		return MF_DELAYED;
-	else
-		return MF_FAILED;
+	ret = delete_from_lru_cache(p) ? MF_FAILED : MF_DELAYED;
+	unlock_page(p);
+	return ret;
 }
 
 static int me_swapcache_clean(struct page *p, unsigned long pfn)
 {
+	int ret;
+
 	delete_from_swap_cache(p);
 
-	if (!delete_from_lru_cache(p))
-		return MF_RECOVERED;
-	else
-		return MF_FAILED;
+	ret = delete_from_lru_cache(p) ? MF_FAILED : MF_RECOVERED;
+	unlock_page(p);
+	return ret;
 }
 
 /*
@@ -820,6 +831,7 @@ static int me_huge_page(struct page *p, unsigned long pfn)
 	mapping = page_mapping(hpage);
 	if (mapping) {
 		res = truncate_error_page(hpage, pfn, mapping);
+		unlock_page(hpage);
 	} else {
 		res = MF_FAILED;
 		unlock_page(hpage);
@@ -834,7 +846,6 @@ static int me_huge_page(struct page *p, unsigned long pfn)
 			page_ref_inc(p);
 			res = MF_RECOVERED;
 		}
-		lock_page(hpage);
 	}
 
 	return res;
@@ -866,6 +877,8 @@ static struct page_state {
 	unsigned long mask;
 	unsigned long res;
 	enum mf_action_page_type type;
+
+	/* Callback ->action() has to unlock the relevant page inside it. */
 	int (*action)(struct page *p, unsigned long pfn);
 } error_states[] = {
 	{ reserved,	reserved,	MF_MSG_KERNEL,	me_kernel },
@@ -929,6 +942,7 @@ static int page_action(struct page_state *ps, struct page *p,
 	int result;
 	int count;
 
+	/* page p should be unlocked after returning from ps->action().  */
 	result = ps->action(p, pfn);
 
 	count = page_count(p) - 1;
@@ -1246,7 +1260,7 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)
 		goto out;
 	}
 
-	res = identify_page_state(pfn, p, page_flags);
+	return identify_page_state(pfn, p, page_flags);
 out:
 	unlock_page(head);
 	return res;
@@ -1529,6 +1543,8 @@ int memory_failure(unsigned long pfn, int flags)
 
 identify_page_state:
 	res = identify_page_state(pfn, p, page_flags);
+	mutex_unlock(&mf_mutex);
+	return res;
 unlock_page:
 	unlock_page(p);
 unlock_mutex:
-- 
Gitee


From 75a005c5418abcb879bdf3053d0853956cec7e73 Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <naoya.horiguchi@nec.com>
Date: Mon, 28 Jun 2021 19:43:14 -0700
Subject: [PATCH 547/674] mm,hwpoison: send SIGBUS with error virutal address

mainline inclusion
from mainline-v5.14-rc1
commit a3f5d80ea401ac857f2910e28b15f35b2cf902f4
category: bugfix
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1
CVE: NA

Intel-SIG: commit a3f5d80ea401 mm,hwpoison: send SIGBUS with error virutal address.
Backport for MCA recovery enhancing & bug fix.

--------------------------------

Now an action required MCE in already hwpoisoned address surely sends a
SIGBUS to current process, but the SIGBUS doesn't convey error virtual
address.  That's not optimal for hwpoison-aware applications.

To fix the issue, make memory_failure() call kill_accessing_process(),
that does pagetable walk to find the error virtual address.  It could find
multiple virtual addresses for the same error page, and it seems hard to
tell which virtual address is correct one.  But that's rare and sending
incorrect virtual address could be better than no address.  So let's
report the first found virtual address for now.

[naoya.horiguchi@nec.com: fix walk_page_range() return]
  Link: https://lkml.kernel.org/r/20210603051055.GA244241@hori.linux.bs1.fc.nec.co.jp
Link: https://lkml.kernel.org/r/20210521030156.2612074-4-nao.horiguchi@gmail.com
Signed-off-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Aili Yao <yaoaili@kingsoft.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: David Hildenbrand <david@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Jue Wang <juew@google.com>
Cc: Borislav Petkov <bp@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Youquan Song <youquan.song@intel.com>
---
 arch/x86/kernel/cpu/mce/core.c |  13 ++-
 include/linux/swapops.h        |   5 ++
 mm/memory-failure.c            | 150 ++++++++++++++++++++++++++++++++-
 3 files changed, 165 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 0ced8b250090..cdda2198deae 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -1271,6 +1271,7 @@ static void kill_me_maybe(struct callback_head *cb)
 {
 	struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me);
 	int flags = MF_ACTION_REQUIRED;
+	int ret;
 
 	p->mce_count = 0;
 	pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr);
@@ -1278,13 +1279,21 @@ static void kill_me_maybe(struct callback_head *cb)
 	if (!p->mce_ripv)
 		flags |= MF_MUST_KILL;
 
-	if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags) &&
-	    !(p->mce_kflags & MCE_IN_KERNEL_COPYIN)) {
+	ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags);
+	if (!ret && !(p->mce_kflags & MCE_IN_KERNEL_COPYIN)) {
 		set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page);
 		sync_core();
 		return;
 	}
 
+	/*
+	 * -EHWPOISON from memory_failure() means that it already sent SIGBUS
+	 * to the current process with the proper error info, so no need to
+	 * send SIGBUS here again.
+	 */
+	if (ret == -EHWPOISON)
+		return;
+
 	if (p->mce_vaddr != (void __user *)-1l) {
 		force_sig_mceerr(BUS_MCEERR_AR, p->mce_vaddr, PAGE_SHIFT);
 	} else {
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 0d429a102d41..708fbeb21dd3 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -332,6 +332,11 @@ static inline int is_hwpoison_entry(swp_entry_t entry)
 	return swp_type(entry) == SWP_HWPOISON;
 }
 
+static inline unsigned long hwpoison_entry_to_pfn(swp_entry_t entry)
+{
+	return swp_offset(entry);
+}
+
 static inline void num_poisoned_pages_inc(void)
 {
 	atomic_long_inc(&num_poisoned_pages);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 74a21042845a..0035e8ca4e49 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -56,6 +56,7 @@
 #include <linux/kfifo.h>
 #include <linux/ratelimit.h>
 #include <linux/page-isolation.h>
+#include <linux/pagewalk.h>
 #include "internal.h"
 #include "ras/ras_event.h"
 
@@ -554,6 +555,148 @@ void collect_procs(struct page *page, struct list_head *tokill,
 }
 EXPORT_SYMBOL_GPL(collect_procs);
 
+struct hwp_walk {
+	struct to_kill tk;
+	unsigned long pfn;
+	int flags;
+};
+
+static void set_to_kill(struct to_kill *tk, unsigned long addr, short shift)
+{
+	tk->addr = addr;
+	tk->size_shift = shift;
+}
+
+static int check_hwpoisoned_entry(pte_t pte, unsigned long addr, short shift,
+				unsigned long poisoned_pfn, struct to_kill *tk)
+{
+	unsigned long pfn = 0;
+
+	if (pte_present(pte)) {
+		pfn = pte_pfn(pte);
+	} else {
+		swp_entry_t swp = pte_to_swp_entry(pte);
+
+		if (is_hwpoison_entry(swp))
+			pfn = hwpoison_entry_to_pfn(swp);
+	}
+
+	if (!pfn || pfn != poisoned_pfn)
+		return 0;
+
+	set_to_kill(tk, addr, shift);
+	return 1;
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr,
+				      struct hwp_walk *hwp)
+{
+	pmd_t pmd = *pmdp;
+	unsigned long pfn;
+	unsigned long hwpoison_vaddr;
+
+	if (!pmd_present(pmd))
+		return 0;
+	pfn = pmd_pfn(pmd);
+	if (pfn <= hwp->pfn && hwp->pfn < pfn + HPAGE_PMD_NR) {
+		hwpoison_vaddr = addr + ((hwp->pfn - pfn) << PAGE_SHIFT);
+		set_to_kill(&hwp->tk, hwpoison_vaddr, PAGE_SHIFT);
+		return 1;
+	}
+	return 0;
+}
+#else
+static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr,
+				      struct hwp_walk *hwp)
+{
+	return 0;
+}
+#endif
+
+static int hwpoison_pte_range(pmd_t *pmdp, unsigned long addr,
+			      unsigned long end, struct mm_walk *walk)
+{
+	struct hwp_walk *hwp = (struct hwp_walk *)walk->private;
+	int ret = 0;
+	pte_t *ptep;
+	spinlock_t *ptl;
+
+	ptl = pmd_trans_huge_lock(pmdp, walk->vma);
+	if (ptl) {
+		ret = check_hwpoisoned_pmd_entry(pmdp, addr, hwp);
+		spin_unlock(ptl);
+		goto out;
+	}
+
+	if (pmd_trans_unstable(pmdp))
+		goto out;
+
+	ptep = pte_offset_map_lock(walk->vma->vm_mm, pmdp, addr, &ptl);
+	for (; addr != end; ptep++, addr += PAGE_SIZE) {
+		ret = check_hwpoisoned_entry(*ptep, addr, PAGE_SHIFT,
+					     hwp->pfn, &hwp->tk);
+		if (ret == 1)
+			break;
+	}
+	pte_unmap_unlock(ptep - 1, ptl);
+out:
+	cond_resched();
+	return ret;
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask,
+			    unsigned long addr, unsigned long end,
+			    struct mm_walk *walk)
+{
+	struct hwp_walk *hwp = (struct hwp_walk *)walk->private;
+	pte_t pte = huge_ptep_get(ptep);
+	struct hstate *h = hstate_vma(walk->vma);
+
+	return check_hwpoisoned_entry(pte, addr, huge_page_shift(h),
+				      hwp->pfn, &hwp->tk);
+}
+#else
+#define hwpoison_hugetlb_range	NULL
+#endif
+
+static struct mm_walk_ops hwp_walk_ops = {
+	.pmd_entry = hwpoison_pte_range,
+	.hugetlb_entry = hwpoison_hugetlb_range,
+};
+
+/*
+ * Sends SIGBUS to the current process with error info.
+ *
+ * This function is intended to handle "Action Required" MCEs on already
+ * hardware poisoned pages. They could happen, for example, when
+ * memory_failure() failed to unmap the error page at the first call, or
+ * when multiple local machine checks happened on different CPUs.
+ *
+ * MCE handler currently has no easy access to the error virtual address,
+ * so this function walks page table to find it. The returned virtual address
+ * is proper in most cases, but it could be wrong when the application
+ * process has multiple entries mapping the error page.
+ */
+static int kill_accessing_process(struct task_struct *p, unsigned long pfn,
+				  int flags)
+{
+	int ret;
+	struct hwp_walk priv = {
+		.pfn = pfn,
+	};
+	priv.tk.tsk = p;
+
+	mmap_read_lock(p->mm);
+	ret = walk_page_range(p->mm, 0, TASK_SIZE, &hwp_walk_ops,
+			      (void *)&priv);
+	if (ret == 1 && priv.tk.addr)
+		kill_proc(&priv.tk, pfn, flags);
+	mmap_read_unlock(p->mm);
+	return ret ? -EFAULT : -EHWPOISON;
+}
+
 static const char *action_name[] = {
 	[MF_IGNORED] = "Ignored",
 	[MF_FAILED] = "Failed",
@@ -1204,7 +1347,10 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)
 	if (TestSetPageHWPoison(head)) {
 		pr_err("Memory failure: %#lx: already hardware poisoned\n",
 		       pfn);
-		return -EHWPOISON;
+		res = -EHWPOISON;
+		if (flags & MF_ACTION_REQUIRED)
+			res = kill_accessing_process(current, page_to_pfn(head), flags);
+		return res;
 	}
 
 	num_poisoned_pages_inc();
@@ -1409,6 +1555,8 @@ int memory_failure(unsigned long pfn, int flags)
 		pr_err("Memory failure: %#lx: already hardware poisoned\n",
 			pfn);
 		res = -EHWPOISON;
+		if (flags & MF_ACTION_REQUIRED)
+			res = kill_accessing_process(current, pfn, flags);
 		goto unlock_mutex;
 	}
 
-- 
Gitee


From d1b9413b3213dc72557fc4cdb01ee3ab18d716f4 Mon Sep 17 00:00:00 2001
From: Tony Luck <tony.luck@intel.com>
Date: Tue, 17 Aug 2021 17:29:41 -0700
Subject: [PATCH 548/674] x86/mce: Change to not send SIGBUS error during copy
 from user

mainline inclusion
from mainline-v5.16-rc1
commit a6e3cf70b772541c2388abdb86e5a562cfe18e63
category: bugfix
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1
CVE: NA

Intel-SIG: commit a6e3cf70b772 x86/mce: Change to not send SIGBUS
error during copy from user.
Backport for MCA recovery enhance and bug fix.

--------------------------------

Sending a SIGBUS for a copy from user is not the correct semantic.
System calls should return -EFAULT (or a short count for write(2)).

Signed-off-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Link: https://lkml.kernel.org/r/20210818002942.1607544-3-tony.luck@intel.com
Signed-off-by: Youquan Song <youquan.song@intel.com>
---
 arch/x86/kernel/cpu/mce/core.c | 36 +++++++++++++++++++---------------
 1 file changed, 20 insertions(+), 16 deletions(-)

diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index cdda2198deae..8aeb56f5e577 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -1280,7 +1280,7 @@ static void kill_me_maybe(struct callback_head *cb)
 		flags |= MF_MUST_KILL;
 
 	ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags);
-	if (!ret && !(p->mce_kflags & MCE_IN_KERNEL_COPYIN)) {
+	if (!ret) {
 		set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page);
 		sync_core();
 		return;
@@ -1294,15 +1294,21 @@ static void kill_me_maybe(struct callback_head *cb)
 	if (ret == -EHWPOISON)
 		return;
 
-	if (p->mce_vaddr != (void __user *)-1l) {
-		force_sig_mceerr(BUS_MCEERR_AR, p->mce_vaddr, PAGE_SHIFT);
-	} else {
-		pr_err("Memory error not recovered");
-		kill_me_now(cb);
-	}
+	pr_err("Memory error not recovered");
+	kill_me_now(cb);
 }
 
-static void queue_task_work(struct mce *m, char *msg, int kill_current_task)
+static void kill_me_never(struct callback_head *cb)
+{
+	struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me);
+
+	p->mce_count = 0;
+	pr_err("Kernel accessed poison in user space at %llx\n", p->mce_addr);
+	if (!memory_failure(p->mce_addr >> PAGE_SHIFT, 0))
+		set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page);
+}
+
+static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *))
 {
 	int count = ++current->mce_count;
 
@@ -1312,11 +1318,7 @@ static void queue_task_work(struct mce *m, char *msg, int kill_current_task)
 		current->mce_kflags = m->kflags;
 		current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV);
 		current->mce_whole_page = whole_page(m);
-
-		if (kill_current_task)
-			current->mce_kill_me.func = kill_me_now;
-		else
-			current->mce_kill_me.func = kill_me_maybe;
+		current->mce_kill_me.func = func;
 	}
 
 	/* Ten is likely overkill. Don't expect more than two faults before task_work() */
@@ -1486,8 +1488,10 @@ noinstr void do_machine_check(struct pt_regs *regs)
 		/* If this triggers there is no way to recover. Die hard. */
 		BUG_ON(!on_thread_stack() || !user_mode(regs));
 
-		queue_task_work(&m, msg, kill_it);
-
+		if (kill_it)
+			queue_task_work(&m, msg, kill_me_now);
+		else
+			queue_task_work(&m, msg, kill_me_maybe);
 	} else {
 		/*
 		 * Handle an MCE which has happened in kernel space but from
@@ -1504,7 +1508,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
 		}
 
 		if (m.kflags & MCE_IN_KERNEL_COPYIN)
-			queue_task_work(&m, msg, kill_it);
+			queue_task_work(&m, msg, kill_me_never);
 	}
 
 	instrumentation_end();
-- 
Gitee


From 4bc1efa1413b9bdfaa10be0f049f19e2a2266253 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 31 May 2021 00:32:44 -0400
Subject: [PATCH 549/674] generic_perform_write()/iomap_write_actor(): saner
 logics for short copy

mainline inclusion
from mainline-v5.13
commit bc1bb416bbb9203e250f5c49aaf1d11b5d9c8adb
category: bugfix
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1
CVE: NA

Intel-SIG: commit bc1bb416bbb9 generic_perform_write()/iomap_write_actor():
saner logics for short copy.
Backport for MCA recovery enhance and bug fix.

--------------------------------

if we run into a short copy and ->write_end() refuses to advance at all,
use the amount we'd managed to copy for the next iteration to handle.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Youquan Song <youquan.song@intel.com>
---
 fs/iomap/buffered-io.c | 25 ++++++++++---------------
 mm/filemap.c           | 24 +++++++++---------------
 2 files changed, 19 insertions(+), 30 deletions(-)

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 3ec494f5d7ee..d5246e277f17 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -771,10 +771,6 @@ iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 		 * Otherwise there's a nasty deadlock on copying from the
 		 * same page as we're writing to, without it being marked
 		 * up-to-date.
-		 *
-		 * Not only is this an optimisation, but it is also required
-		 * to check that the address is actually valid, when atomic
-		 * usercopies are used, below.
 		 */
 		if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
 			status = -EFAULT;
@@ -791,25 +787,24 @@ iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 
 		copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
 
-		copied = iomap_write_end(inode, pos, bytes, copied, page, iomap,
+		status = iomap_write_end(inode, pos, bytes, copied, page, iomap,
 				srcmap);
 
 		cond_resched();
 
-		iov_iter_advance(i, copied);
-		if (unlikely(copied == 0)) {
+		if (unlikely(status == 0)) {
 			/*
-			 * If we were unable to copy any data at all, we must
-			 * fall back to a single segment length write.
-			 *
-			 * If we didn't fallback here, we could livelock
-			 * because not all segments in the iov can be copied at
-			 * once without a pagefault.
+			 * A short copy made iomap_write_end() reject the
+			 * thing entirely.  Might be memory poisoning
+			 * halfway through, might be a race with munmap,
+			 * might be severe memory pressure.
 			 */
-			bytes = min_t(unsigned long, PAGE_SIZE - offset,
-						iov_iter_single_seg_count(i));
+			if (copied)
+				bytes = copied;
 			goto again;
 		}
+		copied = status;
+		iov_iter_advance(i, copied);
 		pos += copied;
 		written += copied;
 		length -= copied;
diff --git a/mm/filemap.c b/mm/filemap.c
index 3958fc3280d8..80c5c418eba8 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3510,10 +3510,6 @@ ssize_t generic_perform_write(struct file *file,
 		 * Otherwise there's a nasty deadlock on copying from the
 		 * same page as we're writing to, without it being marked
 		 * up-to-date.
-		 *
-		 * Not only is this an optimisation, but it is also required
-		 * to check that the address is actually valid, when atomic
-		 * usercopies are used, below.
 		 */
 		if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
 			status = -EFAULT;
@@ -3540,24 +3536,22 @@ ssize_t generic_perform_write(struct file *file,
 						page, fsdata);
 		if (unlikely(status < 0))
 			break;
-		copied = status;
 
 		cond_resched();
 
-		iov_iter_advance(i, copied);
-		if (unlikely(copied == 0)) {
+		if (unlikely(status == 0)) {
 			/*
-			 * If we were unable to copy any data at all, we must
-			 * fall back to a single segment length write.
-			 *
-			 * If we didn't fallback here, we could livelock
-			 * because not all segments in the iov can be copied at
-			 * once without a pagefault.
+			 * A short copy made ->write_end() reject the
+			 * thing entirely.  Might be memory poisoning
+			 * halfway through, might be a race with munmap,
+			 * might be severe memory pressure.
 			 */
-			bytes = min_t(unsigned long, PAGE_SIZE - offset,
-						iov_iter_single_seg_count(i));
+			if (copied)
+				bytes = copied;
 			goto again;
 		}
+		copied = status;
+		iov_iter_advance(i, copied);
 		pos += copied;
 		written += copied;
 
-- 
Gitee


From a1728f8de83520f93310fc7409c502358ca46fdd Mon Sep 17 00:00:00 2001
From: Tony Luck <tony.luck@intel.com>
Date: Tue, 17 Aug 2021 17:29:42 -0700
Subject: [PATCH 550/674] x86/mce: Drop copyin special case for #MC

mainline inclusion
from mainline-v5.16-rc1
commit 690658471b5f28d306e6492c4585d748cb5304e8
category: bugfix
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1
CVE: NA

Intel-SIG: commit 690658471b5f x86/mce: Drop copyin special case for #MC.
Backport for MCA recovery enhancing & bug fix.

--------------------------------

Fixes to the iterator code to handle faults that are not on page
boundaries mean that the special case for machine check during copy from
user is no longer needed.

For a full list of those fixes, see the output of:

  git log --oneline v5.14 ^v5.13 -- lib/iov_iter.c

Intel-SIG: commit 690658471b5f x86/mce: Drop copyin special case for #MC.
backport for MCA recovery enhance and bug fix.

Signed-off-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Link: https://lkml.kernel.org/r/20210818002942.1607544-4-tony.luck@intel.com
Signed-off-by: Youquan Song <youquan.song@intel.com>
---
 arch/x86/lib/copy_user_64.S | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index 77b9b2a3b5c8..e0e71ca023ce 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -234,24 +234,11 @@ EXPORT_SYMBOL(copy_user_enhanced_fast_string)
  */
 SYM_CODE_START_LOCAL(.Lcopy_user_handle_tail)
 	movl %edx,%ecx
-	cmp $X86_TRAP_MC,%eax		/* check if X86_TRAP_MC */
-	je 3f
 1:	rep movsb
 2:	mov %ecx,%eax
 	ASM_CLAC
 	ret
 
-	/*
-	 * Return zero to pretend that this copy succeeded. This
-	 * is counter-intuitive, but needed to prevent the code
-	 * in lib/iov_iter.c from retrying and running back into
-	 * the poison cache line again. The machine check handler
-	 * will ensure that a SIGBUS is sent to the task.
-	 */
-3:	xorl %eax,%eax
-	ASM_CLAC
-	ret
-
 	_ASM_EXTABLE_CPY(1b, 2b)
 SYM_CODE_END(.Lcopy_user_handle_tail)
 
-- 
Gitee


From 3ea63cb74c8cf3446e0caf8c32682ae92d31a58f Mon Sep 17 00:00:00 2001
From: Youquan Song <youquan.song@intel.com>
Date: Thu, 23 Dec 2021 12:07:01 -0800
Subject: [PATCH 551/674] x86/mce: Reduce number of machine checks taken during
 recovery

mainline inclusion
from mainline-v5.17-rc1
commit 3376136300a00df9a864b88fa969177d6c3be8e5
category: bugfix
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1
CVE: NA

Intel-SIG: commit 3376136300a0 x86/mce: Reduce number of machine checks taken during
 recovery.
Backport for MCA recovery enhancing & bug fix.

--------------------------------

When any of the copy functions in arch/x86/lib/copy_user_64.S take a
fault, the fixup code copies the remaining byte count from %ecx to %edx
and unconditionally jumps to .Lcopy_user_handle_tail to continue the
copy in case any more bytes can be copied.

If the fault was #PF this may copy more bytes (because the page fault
handler might have fixed the fault). But when the fault is a machine
check the original copy code will have copied all the way to the poisoned
cache line. So .Lcopy_user_handle_tail will just take another machine
check for no good reason.

Every code path to .Lcopy_user_handle_tail comes from an exception fixup
path, so add a check there to check the trap type (in %eax) and simply
return the count of remaining bytes if the trap was a machine check.

Doing this reduces the number of machine checks taken during synthetic
tests from four to three.

As well as reducing the number of machine checks, this also allows
Skylake generation Xeons to recover some cases that currently fail. The
is because REP; MOVSB is only recoverable when source and destination
are well aligned and the byte count is large. That useless call to
.Lcopy_user_handle_tail may violate one or more of these conditions and
generate a fatal machine check.

  [ Tony: Add more details to commit message. ]
  [ bp: Fixup comment.
    Also, another tip patchset which is adding straight-line speculation
    mitigation changes the "ret" instruction to an all-caps macro "RET".
    But, since gas is case-insensitive, use "RET" in the newly added asm block
    already in order to simplify tip branch merging on its way upstream.
  ]

Signed-off-by: Youquan Song <youquan.song@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Link: https://lore.kernel.org/r/YcTW5dh8yTGucDd+@agluck-desk2.amr.corp.intel.com
Signed-off-by: Youquan Song <youquan.song@intel.com>
---
 arch/x86/lib/copy_user_64.S | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index e0e71ca023ce..403ba3eb4b84 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -225,6 +225,7 @@ EXPORT_SYMBOL(copy_user_enhanced_fast_string)
  * Don't try to copy the tail if machine check happened
  *
  * Input:
+ * eax trap number written by ex_handler_copy()
  * rdi destination
  * rsi source
  * rdx count
@@ -233,12 +234,20 @@ EXPORT_SYMBOL(copy_user_enhanced_fast_string)
  * eax uncopied bytes or 0 if successful.
  */
 SYM_CODE_START_LOCAL(.Lcopy_user_handle_tail)
+	cmp $X86_TRAP_MC,%eax
+	je 3f
+
 	movl %edx,%ecx
 1:	rep movsb
 2:	mov %ecx,%eax
 	ASM_CLAC
 	ret
 
+3:
+	movl %edx,%eax
+	ASM_CLAC
+	RET
+
 	_ASM_EXTABLE_CPY(1b, 2b)
 SYM_CODE_END(.Lcopy_user_handle_tail)
 
-- 
Gitee


From 297f05904ec5352fa767b706f70b7bf9d88ab986 Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <naoya.horiguchi@nec.com>
Date: Sun, 6 Feb 2022 15:10:14 +0800
Subject: [PATCH 552/674] mm/hwpoison: fix error page recovered but reported
 "not recovered"

mainline inclusion
from mainline-v5.18-rc1
commit 046545a661af2beec21de7b90ca0e35f05088a81
category: bugfix
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1
CVE: NA

Intel-SIG: commit 046545a661af mm/hwpoison: fix error page recovered but reported "not
 recovered".
Backport for MCA recovery enhancing & bug fix.

--------------------------------

When an uncorrected memory error is consumed there is a race between the
CMCI from the memory controller reporting an uncorrected error with a
UCNA signature, and the core reporting and SRAR signature machine check
when the data is about to be consumed.

If the CMCI wins that race, the page is marked poisoned when
uc_decode_notifier() calls memory_failure() and the machine check
processing code finds the page already poisoned.  It calls
kill_accessing_process() to make sure a SIGBUS is sent.  But returns the
wrong error code.

Console log looks like this:

  mce: Uncorrected hardware memory error in user-access at 3710b3400
  Memory failure: 0x3710b3: recovery action for dirty LRU page: Recovered
  Memory failure: 0x3710b3: already hardware poisoned
  Memory failure: 0x3710b3: Sending SIGBUS to einj_mem_uc:361438 due to hardware memory corruption
  mce: Memory error not recovered

kill_accessing_process() is supposed to return -EHWPOISON to notify that
SIGBUS is already set to the process and kill_me_maybe() doesn't have to
send it again.  But current code simply fails to do this, so fix it to
make sure to work as intended.  This change avoids the noise message
"Memory error not recovered" and skips duplicate SIGBUSs.

[tony.luck@intel.com: reword some parts of commit message]

Link: https://lkml.kernel.org/r/20220113231117.1021405-1-naoya.horiguchi@linux.dev
Fixes: a3f5d80ea401 ("mm,hwpoison: send SIGBUS with error virutal address")
Signed-off-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
Reported-by: Youquan Song <youquan.song@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Youquan Song <youquan.song@intel.com>
---
 mm/memory-failure.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 0035e8ca4e49..7c7769ff0f1d 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -693,8 +693,10 @@ static int kill_accessing_process(struct task_struct *p, unsigned long pfn,
 			      (void *)&priv);
 	if (ret == 1 && priv.tk.addr)
 		kill_proc(&priv.tk, pfn, flags);
+	else
+		ret = 0;
 	mmap_read_unlock(p->mm);
-	return ret ? -EFAULT : -EHWPOISON;
+	return ret > 0 ? -EHWPOISON : -EFAULT;
 }
 
 static const char *action_name[] = {
-- 
Gitee


From 928722d827c692d23c11f40980c7edc536a58cd9 Mon Sep 17 00:00:00 2001
From: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Date: Tue, 17 Nov 2020 20:49:52 +0800
Subject: [PATCH 553/674] EDAC: Add DDR5 new memory type

mainline inclusion
from mainline-v5.13
commit bc1c99a5971aa7571e8b9731c28fa32abe12cab8
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1
CVE: NA

Intel_SIG: commit bc1c99a5971a EDAC: Add DDR5 new memory type.
Backport for EDAC enhancing & bug fix.

--------------------------------

Add a new entry to 'enum mem_type' and a new string to
'edac_mem_types[]' for DDR5 new memory type.

Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Youquan Song <youquan.song@intel.com>
---
 drivers/edac/edac_mc.c | 1 +
 include/linux/edac.h   | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index f4eb071327be..34514c638f19 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -161,6 +161,7 @@ const char * const edac_mem_types[] = {
 	[MEM_DDR4]	= "Unbuffered-DDR4",
 	[MEM_RDDR4]	= "Registered-DDR4",
 	[MEM_LRDDR4]	= "Load-Reduced-DDR4-RAM",
+	[MEM_DDR5]	= "Unbuffered-DDR5",
 	[MEM_NVDIMM]	= "Non-volatile-RAM",
 };
 EXPORT_SYMBOL_GPL(edac_mem_types);
diff --git a/include/linux/edac.h b/include/linux/edac.h
index 15e8f3d8a895..6c4565cc6273 100644
--- a/include/linux/edac.h
+++ b/include/linux/edac.h
@@ -179,6 +179,7 @@ static inline char *mc_event_error_type(const unsigned int err_type)
  * @MEM_RDDR4:		Registered DDR4 RAM
  *			This is a variant of the DDR4 memories.
  * @MEM_LRDDR4:		Load-Reduced DDR4 memory.
+ * @MEM_DDR5:		Unbuffered DDR5 RAM
  * @MEM_NVDIMM:		Non-volatile RAM
  */
 enum mem_type {
@@ -203,6 +204,7 @@ enum mem_type {
 	MEM_DDR4,
 	MEM_RDDR4,
 	MEM_LRDDR4,
+	MEM_DDR5,
 	MEM_NVDIMM,
 };
 
@@ -226,6 +228,7 @@ enum mem_type {
 #define MEM_FLAG_DDR4           BIT(MEM_DDR4)
 #define MEM_FLAG_RDDR4          BIT(MEM_RDDR4)
 #define MEM_FLAG_LRDDR4         BIT(MEM_LRDDR4)
+#define MEM_FLAG_DDR5           BIT(MEM_DDR5)
 #define MEM_FLAG_NVDIMM         BIT(MEM_NVDIMM)
 
 /**
-- 
Gitee


From 432fad27ed99567be07b4ebfbb7aa50b8856c83e Mon Sep 17 00:00:00 2001
From: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Date: Tue, 17 Nov 2020 20:49:53 +0800
Subject: [PATCH 554/674] EDAC/i10nm: Add Intel Sapphire Rapids server support

mainline inclusion
from mainline-v5.11-rc1
commit 479f58dda25bb46daeb937f124718e8b4aea6781
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1
CVE: NA

Intel_SIG: commit 479f58dda25b EDAC/i10nm: Add Intel Sapphire
Rapids server support.
Backport for add EDAC SPR suppporting.

--------------------------------

The Sapphire Rapids CPU model shares the same memory controller
architecture with Ice Lake server. There are some configurations
different from Ice Lake server as below:
- The device ID for configuration agent.
- The size for per channel memory-mapped I/O.
- The DDR5 memory support.
So add the above configurations and the Sapphire Rapids CPU model
ID for EDAC support.

Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Youquan Song <youquan.song@intel.com>
---
 drivers/edac/i10nm_base.c | 34 +++++++++++++++++++++++++---------
 drivers/edac/skx_base.c   |  6 +++---
 drivers/edac/skx_common.c | 23 ++++++++++++++++++-----
 drivers/edac/skx_common.h | 16 ++++++++++++----
 4 files changed, 58 insertions(+), 21 deletions(-)

diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c
index 3a7362f968c9..4f7f9970a901 100644
--- a/drivers/edac/i10nm_base.c
+++ b/drivers/edac/i10nm_base.c
@@ -13,7 +13,7 @@
 #include "edac_module.h"
 #include "skx_common.h"
 
-#define I10NM_REVISION	"v0.0.3"
+#define I10NM_REVISION	"v0.0.4"
 #define EDAC_MOD_STR	"i10nm_edac"
 
 /* Debug macros */
@@ -25,11 +25,13 @@
 #define I10NM_GET_IMC_BAR(d, i, reg)	\
 	pci_read_config_dword((d)->uracu, 0xd8 + (i) * 4, &(reg))
 #define I10NM_GET_DIMMMTR(m, i, j)	\
-	readl((m)->mbase + 0x2080c + (i) * 0x4000 + (j) * 4)
+	readl((m)->mbase + 0x2080c + (i) * (m)->chan_mmio_sz + (j) * 4)
 #define I10NM_GET_MCDDRTCFG(m, i)	\
-	readl((m)->mbase + 0x20970 + (i) * 0x4000)
+	readl((m)->mbase + 0x20970 + (i) * (m)->chan_mmio_sz)
 #define I10NM_GET_MCMTR(m, i)		\
-	readl((m)->mbase + 0x20ef8 + (i) * 0x4000)
+	readl((m)->mbase + 0x20ef8 + (i) * (m)->chan_mmio_sz)
+#define I10NM_GET_AMAP(m, i)		\
+	readl((m)->mbase + 0x20814 + (i) * (m)->chan_mmio_sz)
 
 #define I10NM_GET_SCK_MMIO_BASE(reg)	(GET_BITFIELD(reg, 0, 28) << 23)
 #define I10NM_GET_IMC_MMIO_OFFSET(reg)	(GET_BITFIELD(reg, 0, 10) << 12)
@@ -129,12 +131,22 @@ static struct res_config i10nm_cfg0 = {
 	.type			= I10NM,
 	.decs_did		= 0x3452,
 	.busno_cfg_offset	= 0xcc,
+	.ddr_chan_mmio_sz	= 0x4000,
 };
 
 static struct res_config i10nm_cfg1 = {
 	.type			= I10NM,
 	.decs_did		= 0x3452,
 	.busno_cfg_offset	= 0xd0,
+	.ddr_chan_mmio_sz	= 0x4000,
+};
+
+static struct res_config spr_cfg = {
+	.type			= SPR,
+	.decs_did		= 0x3252,
+	.busno_cfg_offset	= 0xd0,
+	.ddr_chan_mmio_sz	= 0x8000,
+	.support_ddr5		= true,
 };
 
 static const struct x86_cpu_id i10nm_cpuids[] = {
@@ -143,6 +155,7 @@ static const struct x86_cpu_id i10nm_cpuids[] = {
 	X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(ICELAKE_X,		X86_STEPPINGS(0x0, 0x3), &i10nm_cfg0),
 	X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(ICELAKE_X,		X86_STEPPINGS(0x4, 0xf), &i10nm_cfg1),
 	X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(ICELAKE_D,		X86_STEPPINGS(0x0, 0xf), &i10nm_cfg1),
+	X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SAPPHIRERAPIDS_X,	X86_STEPPINGS(0x0, 0xf), &spr_cfg),
 	{}
 };
 MODULE_DEVICE_TABLE(x86cpu, i10nm_cpuids);
@@ -157,12 +170,13 @@ static bool i10nm_check_ecc(struct skx_imc *imc, int chan)
 	return !!GET_BITFIELD(mcmtr, 2, 2);
 }
 
-static int i10nm_get_dimm_config(struct mem_ctl_info *mci)
+static int i10nm_get_dimm_config(struct mem_ctl_info *mci,
+				 struct res_config *cfg)
 {
 	struct skx_pvt *pvt = mci->pvt_info;
 	struct skx_imc *imc = pvt->imc;
+	u32 mtr, amap, mcddrtcfg;
 	struct dimm_info *dimm;
-	u32 mtr, mcddrtcfg;
 	int i, j, ndimms;
 
 	for (i = 0; i < I10NM_NUM_CHANNELS; i++) {
@@ -171,6 +185,7 @@ static int i10nm_get_dimm_config(struct mem_ctl_info *mci)
 
 		ndimms = 0;
 		mcddrtcfg = I10NM_GET_MCDDRTCFG(imc, i);
+		amap = I10NM_GET_AMAP(imc, i);
 		for (j = 0; j < I10NM_NUM_DIMMS; j++) {
 			dimm = edac_get_dimm(mci, i, j, 0);
 			mtr = I10NM_GET_DIMMMTR(imc, i, j);
@@ -178,8 +193,8 @@ static int i10nm_get_dimm_config(struct mem_ctl_info *mci)
 				 mtr, mcddrtcfg, imc->mc, i, j);
 
 			if (IS_DIMM_PRESENT(mtr))
-				ndimms += skx_get_dimm_info(mtr, 0, 0, dimm,
-							    imc, i, j);
+				ndimms += skx_get_dimm_info(mtr, 0, amap, dimm,
+							    imc, i, j, cfg);
 			else if (IS_NVDIMM_PRESENT(mcddrtcfg, j))
 				ndimms += skx_get_nvdimm_info(dimm, imc, i, j,
 							      EDAC_MOD_STR);
@@ -306,10 +321,11 @@ static int __init i10nm_init(void)
 			d->imc[i].lmc = i;
 			d->imc[i].src_id  = src_id;
 			d->imc[i].node_id = node_id;
+			d->imc[i].chan_mmio_sz = cfg->ddr_chan_mmio_sz;
 
 			rc = skx_register_mci(&d->imc[i], d->imc[i].mdev,
 					      "Intel_10nm Socket", EDAC_MOD_STR,
-					      i10nm_get_dimm_config);
+					      i10nm_get_dimm_config, cfg);
 			if (rc < 0)
 				goto fail;
 		}
diff --git a/drivers/edac/skx_base.c b/drivers/edac/skx_base.c
index f887e3166651..4dbd46575bfb 100644
--- a/drivers/edac/skx_base.c
+++ b/drivers/edac/skx_base.c
@@ -174,7 +174,7 @@ static bool skx_check_ecc(u32 mcmtr)
 	return !!GET_BITFIELD(mcmtr, 2, 2);
 }
 
-static int skx_get_dimm_config(struct mem_ctl_info *mci)
+static int skx_get_dimm_config(struct mem_ctl_info *mci, struct res_config *cfg)
 {
 	struct skx_pvt *pvt = mci->pvt_info;
 	u32 mtr, mcmtr, amap, mcddrtcfg;
@@ -195,7 +195,7 @@ static int skx_get_dimm_config(struct mem_ctl_info *mci)
 			pci_read_config_dword(imc->chan[i].cdev,
 					      0x80 + 4 * j, &mtr);
 			if (IS_DIMM_PRESENT(mtr)) {
-				ndimms += skx_get_dimm_info(mtr, mcmtr, amap, dimm, imc, i, j);
+				ndimms += skx_get_dimm_info(mtr, mcmtr, amap, dimm, imc, i, j, cfg);
 			} else if (IS_NVDIMM_PRESENT(mcddrtcfg, j)) {
 				ndimms += skx_get_nvdimm_info(dimm, imc, i, j,
 							      EDAC_MOD_STR);
@@ -705,7 +705,7 @@ static int __init skx_init(void)
 			d->imc[i].node_id = node_id;
 			rc = skx_register_mci(&d->imc[i], d->imc[i].chan[0].cdev,
 					      "Skylake Socket", EDAC_MOD_STR,
-					      skx_get_dimm_config);
+					      skx_get_dimm_config, cfg);
 			if (rc < 0)
 				goto fail;
 		}
diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c
index 2b4ce8e5ac2f..81c3e2ec6f56 100644
--- a/drivers/edac/skx_common.c
+++ b/drivers/edac/skx_common.c
@@ -304,15 +304,25 @@ static int skx_get_dimm_attr(u32 reg, int lobit, int hibit, int add,
 #define numcol(reg)	skx_get_dimm_attr(reg, 0, 1, 10, 0, 2, "cols")
 
 int skx_get_dimm_info(u32 mtr, u32 mcmtr, u32 amap, struct dimm_info *dimm,
-		      struct skx_imc *imc, int chan, int dimmno)
+		      struct skx_imc *imc, int chan, int dimmno,
+		      struct res_config *cfg)
 {
-	int  banks = 16, ranks, rows, cols, npages;
+	int  banks, ranks, rows, cols, npages;
+	enum mem_type mtype;
 	u64 size;
 
 	ranks = numrank(mtr);
 	rows = numrow(mtr);
 	cols = numcol(mtr);
 
+	if (cfg->support_ddr5 && (amap & 0x8)) {
+		banks = 32;
+		mtype = MEM_DDR5;
+	} else {
+		banks = 16;
+		mtype = MEM_DDR4;
+	}
+
 	/*
 	 * Compute size in 8-byte (2^3) words, then shift to MiB (2^20)
 	 */
@@ -332,7 +342,7 @@ int skx_get_dimm_info(u32 mtr, u32 mcmtr, u32 amap, struct dimm_info *dimm,
 	dimm->nr_pages = npages;
 	dimm->grain = 32;
 	dimm->dtype = get_width(mtr);
-	dimm->mtype = MEM_DDR4;
+	dimm->mtype = mtype;
 	dimm->edac_mode = EDAC_SECDED; /* likely better than this */
 	snprintf(dimm->label, sizeof(dimm->label), "CPU_SrcID#%u_MC#%u_Chan#%u_DIMM#%u",
 		 imc->src_id, imc->lmc, chan, dimmno);
@@ -390,7 +400,8 @@ int skx_get_nvdimm_info(struct dimm_info *dimm, struct skx_imc *imc,
 
 int skx_register_mci(struct skx_imc *imc, struct pci_dev *pdev,
 		     const char *ctl_name, const char *mod_str,
-		     get_dimm_config_f get_dimm_config)
+		     get_dimm_config_f get_dimm_config,
+		     struct res_config *cfg)
 {
 	struct mem_ctl_info *mci;
 	struct edac_mc_layer layers[2];
@@ -425,13 +436,15 @@ int skx_register_mci(struct skx_imc *imc, struct pci_dev *pdev,
 	}
 
 	mci->mtype_cap = MEM_FLAG_DDR4 | MEM_FLAG_NVDIMM;
+	if (cfg->support_ddr5)
+		mci->mtype_cap |= MEM_FLAG_DDR5;
 	mci->edac_ctl_cap = EDAC_FLAG_NONE;
 	mci->edac_cap = EDAC_FLAG_NONE;
 	mci->mod_name = mod_str;
 	mci->dev_name = pci_name(pdev);
 	mci->ctl_page_to_phys = NULL;
 
-	rc = get_dimm_config(mci);
+	rc = get_dimm_config(mci, cfg);
 	if (rc < 0)
 		goto fail;
 
diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h
index 78f8c1de0b71..bf56bebff138 100644
--- a/drivers/edac/skx_common.h
+++ b/drivers/edac/skx_common.h
@@ -59,6 +59,7 @@ struct skx_dev {
 		struct mem_ctl_info *mci;
 		struct pci_dev *mdev; /* for i10nm CPU */
 		void __iomem *mbase;  /* for i10nm CPU */
+		int chan_mmio_sz;     /* for i10nm CPU */
 		u8 mc;	/* system wide mc# */
 		u8 lmc;	/* socket relative mc# */
 		u8 src_id, node_id;
@@ -82,7 +83,8 @@ struct skx_pvt {
 
 enum type {
 	SKX,
-	I10NM
+	I10NM,
+	SPR
 };
 
 enum {
@@ -118,9 +120,13 @@ struct res_config {
 	unsigned int decs_did;
 	/* Default bus number configuration register offset */
 	int busno_cfg_offset;
+	/* Per DDR channel memory-mapped I/O size */
+	int ddr_chan_mmio_sz;
+	bool support_ddr5;
 };
 
-typedef int (*get_dimm_config_f)(struct mem_ctl_info *mci);
+typedef int (*get_dimm_config_f)(struct mem_ctl_info *mci,
+				 struct res_config *cfg);
 typedef bool (*skx_decode_f)(struct decoded_addr *res);
 typedef void (*skx_show_retry_log_f)(struct decoded_addr *res, char *msg, int len);
 
@@ -136,14 +142,16 @@ int skx_get_all_bus_mappings(struct res_config *cfg, struct list_head **list);
 int skx_get_hi_lo(unsigned int did, int off[], u64 *tolm, u64 *tohm);
 
 int skx_get_dimm_info(u32 mtr, u32 mcmtr, u32 amap, struct dimm_info *dimm,
-		      struct skx_imc *imc, int chan, int dimmno);
+		      struct skx_imc *imc, int chan, int dimmno,
+		      struct res_config *cfg);
 
 int skx_get_nvdimm_info(struct dimm_info *dimm, struct skx_imc *imc,
 			int chan, int dimmno, const char *mod_str);
 
 int skx_register_mci(struct skx_imc *imc, struct pci_dev *pdev,
 		     const char *ctl_name, const char *mod_str,
-		     get_dimm_config_f get_dimm_config);
+		     get_dimm_config_f get_dimm_config,
+		     struct res_config *cfg);
 
 int skx_mce_check_error(struct notifier_block *nb, unsigned long val,
 			void *data);
-- 
Gitee


From 3dd707aa7c8e5d5cda72390ee0d22508944bdb77 Mon Sep 17 00:00:00 2001
From: Youquan Song <youquan.song@intel.com>
Date: Wed, 18 Aug 2021 10:57:01 -0700
Subject: [PATCH 555/674] EDAC/i10nm: Retrieve and print retry_rd_err_log
 registers

mainline inclusion
from mainline-v5.15-rc1
commit cf4e6d52f58399c777276172ec250502e19d5e63
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1
CVE: NA

Intel-SIG: commit cf4e6d52f583 EDAC/i10nm: Retrieve and print
retry_rd_err_log registers.
Backport for EDAC retry_rd_err_log for ICX/SPR support.

--------------------------------

Retrieve and print retry_rd_err_log registers like the earlier change:
commit e80634a75aba ("EDAC, skx: Retrieve and print retry_rd_err_log registers")

This is a little trickier than on Skylake because of potential
interference with BIOS use of the same registers. The default
behavior is to ignore these registers.

A module parameter retry_rd_err_log(default=0) controls the mode of operation:
- 0=off  : Default.
- 1=bios : Linux doesn't reset any control bits, but just reports values.
           This is "no harm" mode, but it may miss reporting some data.
- 2=linux: Linux tries to take control and resets mode bits,
           clears valid/UC bits after reading. This should be
           more reliable (especially if BIOS interference is reduced
           by disabling eMCA reporting mode in BIOS setup).

Co-developed-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Signed-off-by: Youquan Song <youquan.song@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Link: https://lore.kernel.org/r/20210818175701.1611513-3-tony.luck@intel.com
Signed-off-by: Youquan Song <youquan.song@intel.com>
---
 drivers/edac/i10nm_base.c | 146 ++++++++++++++++++++++++++++++++++++++
 drivers/edac/skx_base.c   |   3 +-
 drivers/edac/skx_common.c |   4 +-
 drivers/edac/skx_common.h |   7 +-
 4 files changed, 157 insertions(+), 3 deletions(-)

diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c
index 4f7f9970a901..7798de1e2aed 100644
--- a/drivers/edac/i10nm_base.c
+++ b/drivers/edac/i10nm_base.c
@@ -32,14 +32,137 @@
 	readl((m)->mbase + 0x20ef8 + (i) * (m)->chan_mmio_sz)
 #define I10NM_GET_AMAP(m, i)		\
 	readl((m)->mbase + 0x20814 + (i) * (m)->chan_mmio_sz)
+#define I10NM_GET_REG32(m, i, offset)	\
+	readl((m)->mbase + (i) * (m)->chan_mmio_sz + (offset))
+#define I10NM_GET_REG64(m, i, offset)	\
+	readq((m)->mbase + (i) * (m)->chan_mmio_sz + (offset))
+#define I10NM_SET_REG32(m, i, offset, v)	\
+	writel(v, (m)->mbase + (i) * (m)->chan_mmio_sz + (offset))
 
 #define I10NM_GET_SCK_MMIO_BASE(reg)	(GET_BITFIELD(reg, 0, 28) << 23)
 #define I10NM_GET_IMC_MMIO_OFFSET(reg)	(GET_BITFIELD(reg, 0, 10) << 12)
 #define I10NM_GET_IMC_MMIO_SIZE(reg)	((GET_BITFIELD(reg, 13, 23) - \
 					 GET_BITFIELD(reg, 0, 10) + 1) << 12)
 
+#define RETRY_RD_ERR_LOG_UC		BIT(1)
+#define RETRY_RD_ERR_LOG_NOOVER		BIT(14)
+#define RETRY_RD_ERR_LOG_EN		BIT(15)
+#define RETRY_RD_ERR_LOG_NOOVER_UC	(BIT(14) | BIT(1))
+#define RETRY_RD_ERR_LOG_OVER_UC_V	(BIT(2) | BIT(1) | BIT(0))
+
 static struct list_head *i10nm_edac_list;
 
+static struct res_config *res_cfg;
+static int retry_rd_err_log;
+
+static u32 offsets_scrub_icx[]  = {0x22c60, 0x22c54, 0x22c5c, 0x22c58, 0x22c28, 0x20ed8};
+static u32 offsets_scrub_spr[]  = {0x22c60, 0x22c54, 0x22f08, 0x22c58, 0x22c28, 0x20ed8};
+static u32 offsets_demand_icx[] = {0x22e54, 0x22e60, 0x22e64, 0x22e58, 0x22e5c, 0x20ee0};
+static u32 offsets_demand_spr[] = {0x22e54, 0x22e60, 0x22f10, 0x22e58, 0x22e5c, 0x20ee0};
+
+static void __enable_retry_rd_err_log(struct skx_imc *imc, int chan, bool enable)
+{
+	u32 s, d;
+
+	if (!imc->mbase)
+		return;
+
+	s = I10NM_GET_REG32(imc, chan, res_cfg->offsets_scrub[0]);
+	d = I10NM_GET_REG32(imc, chan, res_cfg->offsets_demand[0]);
+
+	if (enable) {
+		/* Save default configurations */
+		imc->chan[chan].retry_rd_err_log_s = s;
+		imc->chan[chan].retry_rd_err_log_d = d;
+
+		s &= ~RETRY_RD_ERR_LOG_NOOVER_UC;
+		s |=  RETRY_RD_ERR_LOG_EN;
+		d &= ~RETRY_RD_ERR_LOG_NOOVER_UC;
+		d |=  RETRY_RD_ERR_LOG_EN;
+	} else {
+		/* Restore default configurations */
+		if (imc->chan[chan].retry_rd_err_log_s & RETRY_RD_ERR_LOG_UC)
+			s |=  RETRY_RD_ERR_LOG_UC;
+		if (imc->chan[chan].retry_rd_err_log_s & RETRY_RD_ERR_LOG_NOOVER)
+			s |=  RETRY_RD_ERR_LOG_NOOVER;
+		if (!(imc->chan[chan].retry_rd_err_log_s & RETRY_RD_ERR_LOG_EN))
+			s &= ~RETRY_RD_ERR_LOG_EN;
+		if (imc->chan[chan].retry_rd_err_log_d & RETRY_RD_ERR_LOG_UC)
+			d |=  RETRY_RD_ERR_LOG_UC;
+		if (imc->chan[chan].retry_rd_err_log_d & RETRY_RD_ERR_LOG_NOOVER)
+			d |=  RETRY_RD_ERR_LOG_NOOVER;
+		if (!(imc->chan[chan].retry_rd_err_log_d & RETRY_RD_ERR_LOG_EN))
+			d &= ~RETRY_RD_ERR_LOG_EN;
+	}
+
+	I10NM_SET_REG32(imc, chan, res_cfg->offsets_scrub[0], s);
+	I10NM_SET_REG32(imc, chan, res_cfg->offsets_demand[0], d);
+}
+
+static void enable_retry_rd_err_log(bool enable)
+{
+	struct skx_dev *d;
+	int i, j;
+
+	edac_dbg(2, "\n");
+
+	list_for_each_entry(d, i10nm_edac_list, list)
+		for (i = 0; i < I10NM_NUM_IMC; i++)
+			for (j = 0; j < I10NM_NUM_CHANNELS; j++)
+				__enable_retry_rd_err_log(&d->imc[i], j, enable);
+}
+
+static void show_retry_rd_err_log(struct decoded_addr *res, char *msg,
+				  int len, bool scrub_err)
+{
+	struct skx_imc *imc = &res->dev->imc[res->imc];
+	u32 log0, log1, log2, log3, log4;
+	u32 corr0, corr1, corr2, corr3;
+	u64 log2a, log5;
+	u32 *offsets;
+	int n;
+
+	if (!imc->mbase)
+		return;
+
+	offsets = scrub_err ? res_cfg->offsets_scrub : res_cfg->offsets_demand;
+
+	log0 = I10NM_GET_REG32(imc, res->channel, offsets[0]);
+	log1 = I10NM_GET_REG32(imc, res->channel, offsets[1]);
+	log3 = I10NM_GET_REG32(imc, res->channel, offsets[3]);
+	log4 = I10NM_GET_REG32(imc, res->channel, offsets[4]);
+	log5 = I10NM_GET_REG64(imc, res->channel, offsets[5]);
+
+	if (res_cfg->type == SPR) {
+		log2a = I10NM_GET_REG64(imc, res->channel, offsets[2]);
+		n = snprintf(msg, len, " retry_rd_err_log[%.8x %.8x %.16llx %.8x %.8x %.16llx]",
+			     log0, log1, log2a, log3, log4, log5);
+	} else {
+		log2 = I10NM_GET_REG32(imc, res->channel, offsets[2]);
+		n = snprintf(msg, len, " retry_rd_err_log[%.8x %.8x %.8x %.8x %.8x %.16llx]",
+			     log0, log1, log2, log3, log4, log5);
+	}
+
+	corr0 = I10NM_GET_REG32(imc, res->channel, 0x22c18);
+	corr1 = I10NM_GET_REG32(imc, res->channel, 0x22c1c);
+	corr2 = I10NM_GET_REG32(imc, res->channel, 0x22c20);
+	corr3 = I10NM_GET_REG32(imc, res->channel, 0x22c24);
+
+	if (len - n > 0)
+		snprintf(msg + n, len - n,
+			 " correrrcnt[%.4x %.4x %.4x %.4x %.4x %.4x %.4x %.4x]",
+			 corr0 & 0xffff, corr0 >> 16,
+			 corr1 & 0xffff, corr1 >> 16,
+			 corr2 & 0xffff, corr2 >> 16,
+			 corr3 & 0xffff, corr3 >> 16);
+
+	/* Clear status bits */
+	if (retry_rd_err_log == 2 && (log0 & RETRY_RD_ERR_LOG_OVER_UC_V)) {
+		log0 &= ~RETRY_RD_ERR_LOG_OVER_UC_V;
+		I10NM_SET_REG32(imc, res->channel, offsets[0], log0);
+	}
+}
+
 static struct pci_dev *pci_get_dev_wrapper(int dom, unsigned int bus,
 					   unsigned int dev, unsigned int fun)
 {
@@ -132,6 +255,8 @@ static struct res_config i10nm_cfg0 = {
 	.decs_did		= 0x3452,
 	.busno_cfg_offset	= 0xcc,
 	.ddr_chan_mmio_sz	= 0x4000,
+	.offsets_scrub		= offsets_scrub_icx,
+	.offsets_demand		= offsets_demand_icx,
 };
 
 static struct res_config i10nm_cfg1 = {
@@ -139,6 +264,8 @@ static struct res_config i10nm_cfg1 = {
 	.decs_did		= 0x3452,
 	.busno_cfg_offset	= 0xd0,
 	.ddr_chan_mmio_sz	= 0x4000,
+	.offsets_scrub		= offsets_scrub_icx,
+	.offsets_demand		= offsets_demand_icx,
 };
 
 static struct res_config spr_cfg = {
@@ -147,6 +274,8 @@ static struct res_config spr_cfg = {
 	.busno_cfg_offset	= 0xd0,
 	.ddr_chan_mmio_sz	= 0x8000,
 	.support_ddr5		= true,
+	.offsets_scrub		= offsets_scrub_spr,
+	.offsets_demand		= offsets_demand_spr,
 };
 
 static const struct x86_cpu_id i10nm_cpuids[] = {
@@ -286,6 +415,7 @@ static int __init i10nm_init(void)
 		return -ENODEV;
 
 	cfg = (struct res_config *)id->driver_data;
+	res_cfg = cfg;
 
 	rc = skx_get_hi_lo(0x09a2, off, &tolm, &tohm);
 	if (rc)
@@ -339,6 +469,12 @@ static int __init i10nm_init(void)
 	mce_register_decode_chain(&i10nm_mce_dec);
 	setup_i10nm_debug();
 
+	if (retry_rd_err_log && res_cfg->offsets_scrub && res_cfg->offsets_demand) {
+		skx_set_decode(NULL, show_retry_rd_err_log);
+		if (retry_rd_err_log == 2)
+			enable_retry_rd_err_log(true);
+	}
+
 	i10nm_printk(KERN_INFO, "%s\n", I10NM_REVISION);
 
 	return 0;
@@ -350,6 +486,13 @@ static int __init i10nm_init(void)
 static void __exit i10nm_exit(void)
 {
 	edac_dbg(2, "\n");
+
+	if (retry_rd_err_log && res_cfg->offsets_scrub && res_cfg->offsets_demand) {
+		skx_set_decode(NULL, NULL);
+		if (retry_rd_err_log == 2)
+			enable_retry_rd_err_log(false);
+	}
+
 	teardown_i10nm_debug();
 	mce_unregister_decode_chain(&i10nm_mce_dec);
 	skx_adxl_put();
@@ -359,5 +502,8 @@ static void __exit i10nm_exit(void)
 module_init(i10nm_init);
 module_exit(i10nm_exit);
 
+module_param(retry_rd_err_log, int, 0444);
+MODULE_PARM_DESC(retry_rd_err_log, "retry_rd_err_log: 0=off(default), 1=bios(Linux doesn't reset any control bits, but just reports values.), 2=linux(Linux tries to take control and resets mode bits, clear valid/UC bits after reading.)");
+
 MODULE_LICENSE("GPL v2");
 MODULE_DESCRIPTION("MC Driver for Intel 10nm server processors");
diff --git a/drivers/edac/skx_base.c b/drivers/edac/skx_base.c
index 4dbd46575bfb..1abc020d49ab 100644
--- a/drivers/edac/skx_base.c
+++ b/drivers/edac/skx_base.c
@@ -230,7 +230,8 @@ static int skx_get_dimm_config(struct mem_ctl_info *mci, struct res_config *cfg)
 #define SKX_ILV_TARGET(tgt)	((tgt) & 7)
 
 static void skx_show_retry_rd_err_log(struct decoded_addr *res,
-				      char *msg, int len)
+				      char *msg, int len,
+				      bool scrub_err)
 {
 	u32 log0, log1, log2, log3, log4;
 	u32 corr0, corr1, corr2, corr3;
diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c
index 81c3e2ec6f56..05ffd39d6de9 100644
--- a/drivers/edac/skx_common.c
+++ b/drivers/edac/skx_common.c
@@ -494,6 +494,7 @@ static void skx_mce_output_error(struct mem_ctl_info *mci,
 	bool ripv = GET_BITFIELD(m->mcgstatus, 0, 0);
 	bool overflow = GET_BITFIELD(m->status, 62, 62);
 	bool uncorrected_error = GET_BITFIELD(m->status, 61, 61);
+	bool scrub_err = false;
 	bool recoverable;
 	int len;
 	u32 core_err_cnt = GET_BITFIELD(m->status, 38, 52);
@@ -545,6 +546,7 @@ static void skx_mce_output_error(struct mem_ctl_info *mci,
 			break;
 		case 4:
 			optype = "memory scrubbing error";
+			scrub_err = true;
 			break;
 		default:
 			optype = "reserved";
@@ -567,7 +569,7 @@ static void skx_mce_output_error(struct mem_ctl_info *mci,
 	}
 
 	if (skx_show_retry_rd_err_log)
-		skx_show_retry_rd_err_log(res, skx_msg + len, MSG_SIZE - len);
+		skx_show_retry_rd_err_log(res, skx_msg + len, MSG_SIZE - len, scrub_err);
 
 	edac_dbg(0, "%s\n", skx_msg);
 
diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h
index bf56bebff138..22e174c740be 100644
--- a/drivers/edac/skx_common.h
+++ b/drivers/edac/skx_common.h
@@ -66,6 +66,8 @@ struct skx_dev {
 		struct skx_channel {
 			struct pci_dev	*cdev;
 			struct pci_dev	*edev;
+			u32 retry_rd_err_log_s;
+			u32 retry_rd_err_log_d;
 			struct skx_dimm {
 				u8 close_pg;
 				u8 bank_xor_enable;
@@ -123,12 +125,15 @@ struct res_config {
 	/* Per DDR channel memory-mapped I/O size */
 	int ddr_chan_mmio_sz;
 	bool support_ddr5;
+	/* Offsets of retry_rd_err_log registers */
+	u32 *offsets_scrub;
+	u32 *offsets_demand;
 };
 
 typedef int (*get_dimm_config_f)(struct mem_ctl_info *mci,
 				 struct res_config *cfg);
 typedef bool (*skx_decode_f)(struct decoded_addr *res);
-typedef void (*skx_show_retry_log_f)(struct decoded_addr *res, char *msg, int len);
+typedef void (*skx_show_retry_log_f)(struct decoded_addr *res, char *msg, int len, bool scrub_err);
 
 int __init skx_adxl_get(void);
 void __exit skx_adxl_put(void);
-- 
Gitee


From f82dbd0e34a668a0fc38aa46107fb0f2866eb1d9 Mon Sep 17 00:00:00 2001
From: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Date: Fri, 11 Jun 2021 10:01:18 -0700
Subject: [PATCH 556/674] EDAC/skx_common: Add new ADXL components for 2-level
 memory

mainline inclusion
from mainline-v5.14-rc1
commit 2f4348e5a86198704368a699a7c4cdeb21d569f5
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1
CVE: NA

Intel-SIG: commit 2f4348e5a861 EDAC/skx_common: Add new ADXL components for 2-level
 memory.
Backport to add EDAC 2LM support.

--------------------------------

Some Intel servers may configure memory in 2 levels, using
fast "near" memory (e.g. DDR) as a cache for larger, slower,
"far" memory (e.g. 3D X-point).

In these configurations the BIOS ADXL address translation for
an address in a 2-level memory range will provide details of
both the "near" and far components.

Current exported ADXL components are only for 1-level memory
system or for 2nd level memory of 2-level memory system. So
add new ADXL components for 1st level memory of 2-level memory
system to fully support 2-level memory system and the detection
of memory error source(1st level memory or 2nd level memory).

Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Link: https://lore.kernel.org/r/20210611170123.1057025-2-tony.luck@intel.com
Signed-off-by: Youquan Song <youquan.song@intel.com>
---
 drivers/edac/skx_common.c | 67 ++++++++++++++++++++++++++++++++-------
 drivers/edac/skx_common.h | 11 +++++++
 2 files changed, 67 insertions(+), 11 deletions(-)

diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c
index 05ffd39d6de9..3d0b391a400d 100644
--- a/drivers/edac/skx_common.c
+++ b/drivers/edac/skx_common.c
@@ -23,10 +23,13 @@
 #include "skx_common.h"
 
 static const char * const component_names[] = {
-	[INDEX_SOCKET]	= "ProcessorSocketId",
-	[INDEX_MEMCTRL]	= "MemoryControllerId",
-	[INDEX_CHANNEL]	= "ChannelId",
-	[INDEX_DIMM]	= "DimmSlotId",
+	[INDEX_SOCKET]		= "ProcessorSocketId",
+	[INDEX_MEMCTRL]		= "MemoryControllerId",
+	[INDEX_CHANNEL]		= "ChannelId",
+	[INDEX_DIMM]		= "DimmSlotId",
+	[INDEX_NM_MEMCTRL]	= "NmMemoryControllerId",
+	[INDEX_NM_CHANNEL]	= "NmChannelId",
+	[INDEX_NM_DIMM]		= "NmDimmSlotId",
 };
 
 static int component_indices[ARRAY_SIZE(component_names)];
@@ -34,12 +37,14 @@ static int adxl_component_count;
 static const char * const *adxl_component_names;
 static u64 *adxl_values;
 static char *adxl_msg;
+static unsigned long adxl_nm_bitmap;
 
 static char skx_msg[MSG_SIZE];
 static skx_decode_f skx_decode;
 static skx_show_retry_log_f skx_show_retry_rd_err_log;
 static u64 skx_tolm, skx_tohm;
 static LIST_HEAD(dev_edac_list);
+static bool skx_mem_cfg_2lm;
 
 int __init skx_adxl_get(void)
 {
@@ -56,14 +61,25 @@ int __init skx_adxl_get(void)
 		for (j = 0; names[j]; j++) {
 			if (!strcmp(component_names[i], names[j])) {
 				component_indices[i] = j;
+
+				if (i >= INDEX_NM_FIRST)
+					adxl_nm_bitmap |= 1 << i;
+
 				break;
 			}
 		}
 
-		if (!names[j])
+		if (!names[j] && i < INDEX_NM_FIRST)
 			goto err;
 	}
 
+	if (skx_mem_cfg_2lm) {
+		if (!adxl_nm_bitmap)
+			skx_printk(KERN_NOTICE, "Not enough ADXL components for 2-level memory.\n");
+		else
+			edac_dbg(2, "adxl_nm_bitmap: 0x%lx\n", adxl_nm_bitmap);
+	}
+
 	adxl_component_names = names;
 	while (*names++)
 		adxl_component_count++;
@@ -99,7 +115,7 @@ void __exit skx_adxl_put(void)
 	kfree(adxl_msg);
 }
 
-static bool skx_adxl_decode(struct decoded_addr *res)
+static bool skx_adxl_decode(struct decoded_addr *res, bool error_in_1st_level_mem)
 {
 	struct skx_dev *d;
 	int i, len = 0;
@@ -116,11 +132,20 @@ static bool skx_adxl_decode(struct decoded_addr *res)
 	}
 
 	res->socket  = (int)adxl_values[component_indices[INDEX_SOCKET]];
-	res->imc     = (int)adxl_values[component_indices[INDEX_MEMCTRL]];
-	res->channel = (int)adxl_values[component_indices[INDEX_CHANNEL]];
-	res->dimm    = (int)adxl_values[component_indices[INDEX_DIMM]];
+	if (error_in_1st_level_mem) {
+		res->imc     = (adxl_nm_bitmap & BIT_NM_MEMCTRL) ?
+			       (int)adxl_values[component_indices[INDEX_NM_MEMCTRL]] : -1;
+		res->channel = (adxl_nm_bitmap & BIT_NM_CHANNEL) ?
+			       (int)adxl_values[component_indices[INDEX_NM_CHANNEL]] : -1;
+		res->dimm    = (adxl_nm_bitmap & BIT_NM_DIMM) ?
+			       (int)adxl_values[component_indices[INDEX_NM_DIMM]] : -1;
+	} else {
+		res->imc     = (int)adxl_values[component_indices[INDEX_MEMCTRL]];
+		res->channel = (int)adxl_values[component_indices[INDEX_CHANNEL]];
+		res->dimm    = (int)adxl_values[component_indices[INDEX_DIMM]];
+	}
 
-	if (res->imc > NUM_IMC - 1) {
+	if (res->imc > NUM_IMC - 1 || res->imc < 0) {
 		skx_printk(KERN_ERR, "Bad imc %d\n", res->imc);
 		return false;
 	}
@@ -151,6 +176,11 @@ static bool skx_adxl_decode(struct decoded_addr *res)
 	return true;
 }
 
+void skx_set_mem_cfg(bool mem_cfg_2lm)
+{
+	skx_mem_cfg_2lm = mem_cfg_2lm;
+}
+
 void skx_set_decode(skx_decode_f decode, skx_show_retry_log_f show_retry_log)
 {
 	skx_decode = decode;
@@ -580,6 +610,21 @@ static void skx_mce_output_error(struct mem_ctl_info *mci,
 			     optype, skx_msg);
 }
 
+static bool skx_error_in_1st_level_mem(const struct mce *m)
+{
+	u32 errcode;
+
+	if (!skx_mem_cfg_2lm)
+		return false;
+
+	errcode = GET_BITFIELD(m->status, 0, 15);
+
+	if ((errcode & 0xef80) != 0x280)
+		return false;
+
+	return true;
+}
+
 int skx_mce_check_error(struct notifier_block *nb, unsigned long val,
 			void *data)
 {
@@ -599,7 +644,7 @@ int skx_mce_check_error(struct notifier_block *nb, unsigned long val,
 	res.addr = mce->addr;
 
 	if (adxl_component_count) {
-		if (!skx_adxl_decode(&res))
+		if (!skx_adxl_decode(&res, skx_error_in_1st_level_mem(mce)))
 			return NOTIFY_DONE;
 	} else if (!skx_decode || !skx_decode(&res)) {
 		return NOTIFY_DONE;
diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h
index 22e174c740be..de9f82977f0e 100644
--- a/drivers/edac/skx_common.h
+++ b/drivers/edac/skx_common.h
@@ -9,6 +9,8 @@
 #ifndef _SKX_COMM_EDAC_H
 #define _SKX_COMM_EDAC_H
 
+#include <linux/bits.h>
+
 #define MSG_SIZE		1024
 
 /*
@@ -94,9 +96,17 @@ enum {
 	INDEX_MEMCTRL,
 	INDEX_CHANNEL,
 	INDEX_DIMM,
+	INDEX_NM_FIRST,
+	INDEX_NM_MEMCTRL = INDEX_NM_FIRST,
+	INDEX_NM_CHANNEL,
+	INDEX_NM_DIMM,
 	INDEX_MAX
 };
 
+#define BIT_NM_MEMCTRL	BIT_ULL(INDEX_NM_MEMCTRL)
+#define BIT_NM_CHANNEL	BIT_ULL(INDEX_NM_CHANNEL)
+#define BIT_NM_DIMM	BIT_ULL(INDEX_NM_DIMM)
+
 struct decoded_addr {
 	struct skx_dev *dev;
 	u64	addr;
@@ -138,6 +148,7 @@ typedef void (*skx_show_retry_log_f)(struct decoded_addr *res, char *msg, int le
 int __init skx_adxl_get(void);
 void __exit skx_adxl_put(void);
 void skx_set_decode(skx_decode_f decode, skx_show_retry_log_f show_retry_log);
+void skx_set_mem_cfg(bool mem_cfg_2lm);
 
 int skx_get_src_id(struct skx_dev *d, int off, u8 *id);
 int skx_get_node_id(struct skx_dev *d, u8 *id);
-- 
Gitee


From 6b81ee370e7b72e93ff4d3bef67c02ed4a35632b Mon Sep 17 00:00:00 2001
From: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Date: Fri, 11 Jun 2021 10:01:19 -0700
Subject: [PATCH 557/674] EDAC/i10nm: Add detection of memory levels for
 ICX/SPR servers

mainline inclusion
from mainline-v5.14-rc1
commit 4bd4d32e9a38d7ffb091b4109ab63c8f601e5678
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1
CVE: NA

Intel-SIG: commit 4bd4d32e9a38 EDAC/i10nm: Add detection of memory levels for ICX/SPR
 servers.
Backport to add EDAC 2LM support.

--------------------------------

Current i10nm_edac driver is only for system configured in 1-level
memory. If the system is configured in 2-level memory, the driver
doesn't report the 1st level memory DIMM for the error address, even
if the error occurs in the 1st level memory.

Both Ice Lake servers and Sapphire Rapids servers can be configured
in 2-level memory. Add detection of memory levels to i10nm_edac for
the two kinds of servers so that the driver can report the 2nd level
memory DIMM or the 1st level memory DIMM according to error source.

Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Link: https://lore.kernel.org/r/20210611170123.1057025-3-tony.luck@intel.com
Signed-off-by: Youquan Song <youquan.song@intel.com>
---
 drivers/edac/i10nm_base.c | 39 +++++++++++++++++++++++++++++++++++++++
 drivers/edac/skx_common.h |  3 +++
 2 files changed, 42 insertions(+)

diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c
index 7798de1e2aed..4e2a4d8396da 100644
--- a/drivers/edac/i10nm_base.c
+++ b/drivers/edac/i10nm_base.c
@@ -24,6 +24,8 @@
 	pci_read_config_dword((d)->uracu, 0xd0, &(reg))
 #define I10NM_GET_IMC_BAR(d, i, reg)	\
 	pci_read_config_dword((d)->uracu, 0xd8 + (i) * 4, &(reg))
+#define I10NM_GET_SAD(d, offset, i, reg)\
+	pci_read_config_dword((d)->sad_all, (offset) + (i) * 8, &(reg))
 #define I10NM_GET_DIMMMTR(m, i, j)	\
 	readl((m)->mbase + 0x2080c + (i) * (m)->chan_mmio_sz + (j) * 4)
 #define I10NM_GET_MCDDRTCFG(m, i)	\
@@ -50,6 +52,10 @@
 #define RETRY_RD_ERR_LOG_NOOVER_UC	(BIT(14) | BIT(1))
 #define RETRY_RD_ERR_LOG_OVER_UC_V	(BIT(2) | BIT(1) | BIT(0))
 
+#define I10NM_MAX_SAD			16
+#define I10NM_SAD_ENABLE(reg)		GET_BITFIELD(reg, 0, 0)
+#define I10NM_SAD_NM_CACHEABLE(reg)	GET_BITFIELD(reg, 5, 5)
+
 static struct list_head *i10nm_edac_list;
 
 static struct res_config *res_cfg;
@@ -186,6 +192,31 @@ static struct pci_dev *pci_get_dev_wrapper(int dom, unsigned int bus,
 	return pdev;
 }
 
+static bool i10nm_check_2lm(struct res_config *cfg)
+{
+	struct skx_dev *d;
+	u32 reg;
+	int i;
+
+	list_for_each_entry(d, i10nm_edac_list, list) {
+		d->sad_all = pci_get_dev_wrapper(d->seg, d->bus[1],
+						 PCI_SLOT(cfg->sad_all_devfn),
+						 PCI_FUNC(cfg->sad_all_devfn));
+		if (!d->sad_all)
+			continue;
+
+		for (i = 0; i < I10NM_MAX_SAD; i++) {
+			I10NM_GET_SAD(d, cfg->sad_all_offset, i, reg);
+			if (I10NM_SAD_ENABLE(reg) && I10NM_SAD_NM_CACHEABLE(reg)) {
+				edac_dbg(2, "2-level memory configuration.\n");
+				return true;
+			}
+		}
+	}
+
+	return false;
+}
+
 static int i10nm_get_all_munits(void)
 {
 	struct pci_dev *mdev;
@@ -255,6 +286,8 @@ static struct res_config i10nm_cfg0 = {
 	.decs_did		= 0x3452,
 	.busno_cfg_offset	= 0xcc,
 	.ddr_chan_mmio_sz	= 0x4000,
+	.sad_all_devfn		= PCI_DEVFN(29, 0),
+	.sad_all_offset		= 0x108,
 	.offsets_scrub		= offsets_scrub_icx,
 	.offsets_demand		= offsets_demand_icx,
 };
@@ -264,6 +297,8 @@ static struct res_config i10nm_cfg1 = {
 	.decs_did		= 0x3452,
 	.busno_cfg_offset	= 0xd0,
 	.ddr_chan_mmio_sz	= 0x4000,
+	.sad_all_devfn		= PCI_DEVFN(29, 0),
+	.sad_all_offset		= 0x108,
 	.offsets_scrub		= offsets_scrub_icx,
 	.offsets_demand		= offsets_demand_icx,
 };
@@ -274,6 +309,8 @@ static struct res_config spr_cfg = {
 	.busno_cfg_offset	= 0xd0,
 	.ddr_chan_mmio_sz	= 0x8000,
 	.support_ddr5		= true,
+	.sad_all_devfn		= PCI_DEVFN(10, 0),
+	.sad_all_offset		= 0x300,
 	.offsets_scrub		= offsets_scrub_spr,
 	.offsets_demand		= offsets_demand_spr,
 };
@@ -429,6 +466,8 @@ static int __init i10nm_init(void)
 		return -ENODEV;
 	}
 
+	skx_set_mem_cfg(i10nm_check_2lm(cfg));
+
 	rc = i10nm_get_all_munits();
 	if (rc < 0)
 		goto fail;
diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h
index de9f82977f0e..1fb7540a7092 100644
--- a/drivers/edac/skx_common.h
+++ b/drivers/edac/skx_common.h
@@ -135,6 +135,9 @@ struct res_config {
 	/* Per DDR channel memory-mapped I/O size */
 	int ddr_chan_mmio_sz;
 	bool support_ddr5;
+	/* SAD device number and function number */
+	unsigned int sad_all_devfn;
+	int sad_all_offset;
 	/* Offsets of retry_rd_err_log registers */
 	u32 *offsets_scrub;
 	u32 *offsets_demand;
-- 
Gitee


From 45875988308087e7fae3b15f217ae376765b126b Mon Sep 17 00:00:00 2001
From: root <root@DNP-HBM01.bj.intel.com>
Date: Wed, 13 Jul 2022 15:28:12 +0800
Subject: [PATCH 558/674] EDAC/i10nm: Add support for high bandwidth memory

mainline inclusion
from mainline-v5.14-rc1
commit c945088384d00e6eb61535cc4ba25bc062090909
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1
CVE: NA

Intel-SIG: commit c945088384d0 EDAC/i10nm: Add support for high bandwidth memory.
Backport to add EDAC HBM support.

--------------------------------

A future Xeon processor will include in-package HBM (high bandwidth
memory). The in-package HBM memory controller shares the same
architecture with the regular DDR memory controller.

Add the HBM memory controller devices for EDAC support.

Tested-by: Hongyu Ning <hongyu.ning@linux.intel.com>
Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Link: https://lore.kernel.org/r/20210611170123.1057025-4-tony.luck@intel.com
Signed-off-by: Youquan Song <youquan.song@intel.com>
---
 drivers/edac/i10nm_base.c | 132 ++++++++++++++++++++++++++++++++++----
 drivers/edac/skx_common.c |  15 +++--
 drivers/edac/skx_common.h |  20 +++++-
 3 files changed, 148 insertions(+), 19 deletions(-)

diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c
index 4e2a4d8396da..47d6d7282c58 100644
--- a/drivers/edac/i10nm_base.c
+++ b/drivers/edac/i10nm_base.c
@@ -13,7 +13,7 @@
 #include "edac_module.h"
 #include "skx_common.h"
 
-#define I10NM_REVISION	"v0.0.4"
+#define I10NM_REVISION	"v0.0.5"
 #define EDAC_MOD_STR	"i10nm_edac"
 
 /* Debug macros */
@@ -26,14 +26,22 @@
 	pci_read_config_dword((d)->uracu, 0xd8 + (i) * 4, &(reg))
 #define I10NM_GET_SAD(d, offset, i, reg)\
 	pci_read_config_dword((d)->sad_all, (offset) + (i) * 8, &(reg))
+#define I10NM_GET_HBM_IMC_BAR(d, reg)	\
+	pci_read_config_dword((d)->uracu, 0xd4, &(reg))
+#define I10NM_GET_CAPID3_CFG(d, reg)	\
+	pci_read_config_dword((d)->pcu_cr3, 0x90, &(reg))
 #define I10NM_GET_DIMMMTR(m, i, j)	\
-	readl((m)->mbase + 0x2080c + (i) * (m)->chan_mmio_sz + (j) * 4)
+	readl((m)->mbase + ((m)->hbm_mc ? 0x80c : 0x2080c) + \
+	(i) * (m)->chan_mmio_sz + (j) * 4)
 #define I10NM_GET_MCDDRTCFG(m, i)	\
-	readl((m)->mbase + 0x20970 + (i) * (m)->chan_mmio_sz)
+	readl((m)->mbase + ((m)->hbm_mc ? 0x970 : 0x20970) + \
+	(i) * (m)->chan_mmio_sz)
 #define I10NM_GET_MCMTR(m, i)		\
-	readl((m)->mbase + 0x20ef8 + (i) * (m)->chan_mmio_sz)
+	readl((m)->mbase + ((m)->hbm_mc ? 0xef8 : 0x20ef8) + \
+	(i) * (m)->chan_mmio_sz)
 #define I10NM_GET_AMAP(m, i)		\
-	readl((m)->mbase + 0x20814 + (i) * (m)->chan_mmio_sz)
+	readl((m)->mbase + ((m)->hbm_mc ? 0x814 : 0x20814) + \
+	(i) * (m)->chan_mmio_sz)
 #define I10NM_GET_REG32(m, i, offset)	\
 	readl((m)->mbase + (i) * (m)->chan_mmio_sz + (offset))
 #define I10NM_GET_REG64(m, i, offset)	\
@@ -45,6 +53,12 @@
 #define I10NM_GET_IMC_MMIO_OFFSET(reg)	(GET_BITFIELD(reg, 0, 10) << 12)
 #define I10NM_GET_IMC_MMIO_SIZE(reg)	((GET_BITFIELD(reg, 13, 23) - \
 					 GET_BITFIELD(reg, 0, 10) + 1) << 12)
+#define I10NM_GET_HBM_IMC_MMIO_OFFSET(reg)	\
+	((GET_BITFIELD(reg, 0, 10) << 12) + 0x140000)
+
+#define I10NM_HBM_IMC_MMIO_SIZE		0x9000
+#define I10NM_IS_HBM_PRESENT(reg)	GET_BITFIELD(reg, 27, 30)
+#define I10NM_IS_HBM_IMC(reg)		GET_BITFIELD(reg, 29, 29)
 
 #define RETRY_RD_ERR_LOG_UC		BIT(1)
 #define RETRY_RD_ERR_LOG_NOOVER		BIT(14)
@@ -217,7 +231,7 @@ static bool i10nm_check_2lm(struct res_config *cfg)
 	return false;
 }
 
-static int i10nm_get_all_munits(void)
+static int i10nm_get_ddr_munits(void)
 {
 	struct pci_dev *mdev;
 	void __iomem *mbase;
@@ -245,7 +259,7 @@ static int i10nm_get_all_munits(void)
 		edac_dbg(2, "socket%d mmio base 0x%llx (reg 0x%x)\n",
 			 j++, base, reg);
 
-		for (i = 0; i < I10NM_NUM_IMC; i++) {
+		for (i = 0; i < I10NM_NUM_DDR_IMC; i++) {
 			mdev = pci_get_dev_wrapper(d->seg, d->bus[0],
 						   12 + i, 0);
 			if (i == 0 && !mdev) {
@@ -281,6 +295,90 @@ static int i10nm_get_all_munits(void)
 	return 0;
 }
 
+static bool i10nm_check_hbm_imc(struct skx_dev *d)
+{
+	u32 reg;
+
+	if (I10NM_GET_CAPID3_CFG(d, reg)) {
+		i10nm_printk(KERN_ERR, "Failed to get capid3_cfg\n");
+		return false;
+	}
+
+	return I10NM_IS_HBM_PRESENT(reg) != 0;
+}
+
+static int i10nm_get_hbm_munits(void)
+{
+	struct pci_dev *mdev;
+	void __iomem *mbase;
+	u32 reg, off, mcmtr;
+	struct skx_dev *d;
+	int i, lmc;
+	u64 base;
+
+	list_for_each_entry(d, i10nm_edac_list, list) {
+		d->pcu_cr3 = pci_get_dev_wrapper(d->seg, d->bus[1], 30, 3);
+		if (!d->pcu_cr3)
+			return -ENODEV;
+
+		if (!i10nm_check_hbm_imc(d)) {
+			i10nm_printk(KERN_DEBUG, "No hbm memory\n");
+			return -ENODEV;
+		}
+
+		if (I10NM_GET_SCK_BAR(d, reg)) {
+			i10nm_printk(KERN_ERR, "Failed to get socket bar\n");
+			return -ENODEV;
+		}
+		base = I10NM_GET_SCK_MMIO_BASE(reg);
+
+		if (I10NM_GET_HBM_IMC_BAR(d, reg)) {
+			i10nm_printk(KERN_ERR, "Failed to get hbm mc bar\n");
+			return -ENODEV;
+		}
+		base += I10NM_GET_HBM_IMC_MMIO_OFFSET(reg);
+
+		lmc = I10NM_NUM_DDR_IMC;
+
+		for (i = 0; i < I10NM_NUM_HBM_IMC; i++) {
+			mdev = pci_get_dev_wrapper(d->seg, d->bus[0],
+						   12 + i / 4, 1 + i % 4);
+			if (i == 0 && !mdev) {
+				i10nm_printk(KERN_ERR, "No hbm mc found\n");
+				return -ENODEV;
+			}
+			if (!mdev)
+				continue;
+
+			d->imc[lmc].mdev = mdev;
+			off = i * I10NM_HBM_IMC_MMIO_SIZE;
+
+			edac_dbg(2, "hbm mc%d mmio base 0x%llx size 0x%x\n",
+				 lmc, base + off, I10NM_HBM_IMC_MMIO_SIZE);
+
+			mbase = ioremap(base + off, I10NM_HBM_IMC_MMIO_SIZE);
+			if (!mbase) {
+				i10nm_printk(KERN_ERR, "Failed to ioremap for hbm mc 0x%llx\n",
+					     base + off);
+				return -ENOMEM;
+			}
+
+			d->imc[lmc].mbase = mbase;
+			d->imc[lmc].hbm_mc = true;
+
+			mcmtr = I10NM_GET_MCMTR(&d->imc[lmc], 0);
+			if (!I10NM_IS_HBM_IMC(mcmtr)) {
+				i10nm_printk(KERN_ERR, "This isn't an hbm mc!\n");
+				return -ENODEV;
+			}
+
+			lmc++;
+		}
+	}
+
+	return 0;
+}
+
 static struct res_config i10nm_cfg0 = {
 	.type			= I10NM,
 	.decs_did		= 0x3452,
@@ -308,6 +406,7 @@ static struct res_config spr_cfg = {
 	.decs_did		= 0x3252,
 	.busno_cfg_offset	= 0xd0,
 	.ddr_chan_mmio_sz	= 0x8000,
+	.hbm_chan_mmio_sz	= 0x4000,
 	.support_ddr5		= true,
 	.sad_all_devfn		= PCI_DEVFN(10, 0),
 	.sad_all_offset		= 0x300,
@@ -345,14 +444,14 @@ static int i10nm_get_dimm_config(struct mem_ctl_info *mci,
 	struct dimm_info *dimm;
 	int i, j, ndimms;
 
-	for (i = 0; i < I10NM_NUM_CHANNELS; i++) {
+	for (i = 0; i < imc->num_channels; i++) {
 		if (!imc->mbase)
 			continue;
 
 		ndimms = 0;
 		mcddrtcfg = I10NM_GET_MCDDRTCFG(imc, i);
 		amap = I10NM_GET_AMAP(imc, i);
-		for (j = 0; j < I10NM_NUM_DIMMS; j++) {
+		for (j = 0; j < imc->num_dimms; j++) {
 			dimm = edac_get_dimm(mci, i, j, 0);
 			mtr = I10NM_GET_DIMMMTR(imc, i, j);
 			edac_dbg(1, "dimmmtr 0x%x mcddrtcfg 0x%x (mc%d ch%d dimm%d)\n",
@@ -468,8 +567,9 @@ static int __init i10nm_init(void)
 
 	skx_set_mem_cfg(i10nm_check_2lm(cfg));
 
-	rc = i10nm_get_all_munits();
-	if (rc < 0)
+	rc = i10nm_get_ddr_munits();
+
+	if (i10nm_get_hbm_munits() && rc)
 		goto fail;
 
 	list_for_each_entry(d, i10nm_edac_list, list) {
@@ -490,7 +590,15 @@ static int __init i10nm_init(void)
 			d->imc[i].lmc = i;
 			d->imc[i].src_id  = src_id;
 			d->imc[i].node_id = node_id;
-			d->imc[i].chan_mmio_sz = cfg->ddr_chan_mmio_sz;
+			if (d->imc[i].hbm_mc) {
+				d->imc[i].chan_mmio_sz = cfg->hbm_chan_mmio_sz;
+				d->imc[i].num_channels = I10NM_NUM_HBM_CHANNELS;
+				d->imc[i].num_dimms    = I10NM_NUM_HBM_DIMMS;
+			} else {
+				d->imc[i].chan_mmio_sz = cfg->ddr_chan_mmio_sz;
+				d->imc[i].num_channels = I10NM_NUM_DDR_CHANNELS;
+				d->imc[i].num_dimms    = I10NM_NUM_DDR_DIMMS;
+			}
 
 			rc = skx_register_mci(&d->imc[i], d->imc[i].mdev,
 					      "Intel_10nm Socket", EDAC_MOD_STR,
diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c
index 3d0b391a400d..0c133d32b777 100644
--- a/drivers/edac/skx_common.c
+++ b/drivers/edac/skx_common.c
@@ -343,9 +343,9 @@ int skx_get_dimm_info(u32 mtr, u32 mcmtr, u32 amap, struct dimm_info *dimm,
 
 	ranks = numrank(mtr);
 	rows = numrow(mtr);
-	cols = numcol(mtr);
+	cols = imc->hbm_mc ? 6 : numcol(mtr);
 
-	if (cfg->support_ddr5 && (amap & 0x8)) {
+	if (cfg->support_ddr5 && ((amap & 0x8) || imc->hbm_mc)) {
 		banks = 32;
 		mtype = MEM_DDR5;
 	} else {
@@ -374,8 +374,13 @@ int skx_get_dimm_info(u32 mtr, u32 mcmtr, u32 amap, struct dimm_info *dimm,
 	dimm->dtype = get_width(mtr);
 	dimm->mtype = mtype;
 	dimm->edac_mode = EDAC_SECDED; /* likely better than this */
-	snprintf(dimm->label, sizeof(dimm->label), "CPU_SrcID#%u_MC#%u_Chan#%u_DIMM#%u",
-		 imc->src_id, imc->lmc, chan, dimmno);
+
+	if (imc->hbm_mc)
+		snprintf(dimm->label, sizeof(dimm->label), "CPU_SrcID#%u_HBMC#%u_Chan#%u",
+			 imc->src_id, imc->lmc, chan);
+	else
+		snprintf(dimm->label, sizeof(dimm->label), "CPU_SrcID#%u_MC#%u_Chan#%u_DIMM#%u",
+			 imc->src_id, imc->lmc, chan, dimmno);
 
 	return 1;
 }
@@ -705,6 +710,8 @@ void skx_remove(void)
 		}
 		if (d->util_all)
 			pci_dev_put(d->util_all);
+		if (d->pcu_cr3)
+			pci_dev_put(d->pcu_cr3);
 		if (d->sad_all)
 			pci_dev_put(d->sad_all);
 		if (d->uracu)
diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h
index 1fb7540a7092..03ac067a80b9 100644
--- a/drivers/edac/skx_common.h
+++ b/drivers/edac/skx_common.h
@@ -32,9 +32,17 @@
 #define SKX_NUM_CHANNELS	3	/* Channels per memory controller */
 #define SKX_NUM_DIMMS		2	/* Max DIMMS per channel */
 
-#define I10NM_NUM_IMC		4
-#define I10NM_NUM_CHANNELS	2
-#define I10NM_NUM_DIMMS		2
+#define I10NM_NUM_DDR_IMC	4
+#define I10NM_NUM_DDR_CHANNELS	2
+#define I10NM_NUM_DDR_DIMMS	2
+
+#define I10NM_NUM_HBM_IMC	16
+#define I10NM_NUM_HBM_CHANNELS	2
+#define I10NM_NUM_HBM_DIMMS	1
+
+#define I10NM_NUM_IMC		(I10NM_NUM_DDR_IMC + I10NM_NUM_HBM_IMC)
+#define I10NM_NUM_CHANNELS	MAX(I10NM_NUM_DDR_CHANNELS, I10NM_NUM_HBM_CHANNELS)
+#define I10NM_NUM_DIMMS		MAX(I10NM_NUM_DDR_DIMMS, I10NM_NUM_HBM_DIMMS)
 
 #define MAX(a, b)	((a) > (b) ? (a) : (b))
 #define NUM_IMC		MAX(SKX_NUM_IMC, I10NM_NUM_IMC)
@@ -56,12 +64,16 @@ struct skx_dev {
 	struct pci_dev *sad_all;
 	struct pci_dev *util_all;
 	struct pci_dev *uracu; /* for i10nm CPU */
+	struct pci_dev *pcu_cr3; /* for HBM memory detection */
 	u32 mcroute;
 	struct skx_imc {
 		struct mem_ctl_info *mci;
 		struct pci_dev *mdev; /* for i10nm CPU */
 		void __iomem *mbase;  /* for i10nm CPU */
 		int chan_mmio_sz;     /* for i10nm CPU */
+		int num_channels; /* channels per memory controller */
+		int num_dimms; /* dimms per channel */
+		bool hbm_mc;
 		u8 mc;	/* system wide mc# */
 		u8 lmc;	/* socket relative mc# */
 		u8 src_id, node_id;
@@ -134,6 +146,8 @@ struct res_config {
 	int busno_cfg_offset;
 	/* Per DDR channel memory-mapped I/O size */
 	int ddr_chan_mmio_sz;
+	/* Per HBM channel memory-mapped I/O size */
+	int hbm_chan_mmio_sz;
 	bool support_ddr5;
 	/* SAD device number and function number */
 	unsigned int sad_all_devfn;
-- 
Gitee


From b0adb31550986f669e45c7f42f13876451a91d0e Mon Sep 17 00:00:00 2001
From: Naveen Krishna Chatradhi <nchatrad@amd.com>
Date: Wed, 30 Jun 2021 20:58:24 +0530
Subject: [PATCH 559/674] EDAC/mc: Add new HBM2 memory type

mainline inclusion
from mainline-v5.13
commit e1ca90b7cc5cb5d3a38321cbb65ad36a59fcb574
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1
CVE: NA

Intel-SIG: commit e1ca90b7cc5c EDAC/mc: Add new HBM2 memory type.
Backport to add EDAC HBM support.

--------------------------------

Add a new entry to 'enum mem_type' and a new string to 'edac_mem_types[]'
for HBM2 (High Bandwidth Memory Gen 2) new memory type.

Reviewed-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Muralidhara M K <muralimk@amd.com>
Signed-off-by: Naveen Krishna Chatradhi <nchatrad@amd.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Link: https://lore.kernel.org/r/20210630152828.162659-4-nchatrad@amd.com
Signed-off-by: Youquan Song <youquan.song@intel.com>
---
 drivers/edac/edac_mc.c | 1 +
 include/linux/edac.h   | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index 34514c638f19..bf4297075c22 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -163,6 +163,7 @@ const char * const edac_mem_types[] = {
 	[MEM_LRDDR4]	= "Load-Reduced-DDR4-RAM",
 	[MEM_DDR5]	= "Unbuffered-DDR5",
 	[MEM_NVDIMM]	= "Non-volatile-RAM",
+	[MEM_HBM2]	= "High-bandwidth-memory-Gen2",
 };
 EXPORT_SYMBOL_GPL(edac_mem_types);
 
diff --git a/include/linux/edac.h b/include/linux/edac.h
index 6c4565cc6273..490134495008 100644
--- a/include/linux/edac.h
+++ b/include/linux/edac.h
@@ -181,6 +181,7 @@ static inline char *mc_event_error_type(const unsigned int err_type)
  * @MEM_LRDDR4:		Load-Reduced DDR4 memory.
  * @MEM_DDR5:		Unbuffered DDR5 RAM
  * @MEM_NVDIMM:		Non-volatile RAM
+ * @MEM_HBM2:		High bandwidth Memory Gen 2.
  */
 enum mem_type {
 	MEM_EMPTY = 0,
@@ -206,6 +207,7 @@ enum mem_type {
 	MEM_LRDDR4,
 	MEM_DDR5,
 	MEM_NVDIMM,
+	MEM_HBM2,
 };
 
 #define MEM_FLAG_EMPTY		BIT(MEM_EMPTY)
@@ -230,6 +232,7 @@ enum mem_type {
 #define MEM_FLAG_LRDDR4         BIT(MEM_LRDDR4)
 #define MEM_FLAG_DDR5           BIT(MEM_DDR5)
 #define MEM_FLAG_NVDIMM         BIT(MEM_NVDIMM)
+#define MEM_FLAG_HBM2		BIT(MEM_HBM2)
 
 /**
  * enum edac-type - Error Detection and Correction capabilities and mode
-- 
Gitee


From 4a111addf4c90fc79db41b634b5b5b8b788d0f09 Mon Sep 17 00:00:00 2001
From: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Date: Tue, 20 Jul 2021 09:30:09 -0700
Subject: [PATCH 560/674] EDAC/skx_common: Set the memory type correctly for
 HBM memory

mainline inclusion
from mainline-v5.15-rc1
commit fd07a4a0d30b5468a1f4a0739e34f5f014df7d44
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1
CVE: NA

Intel-SIG: commit fd07a4a0d30b EDAC/skx_common: Set the memory type correctly for HBM
 memory.
Backport to add EDAC HBM support.

--------------------------------

Set the memory type to MEM_HBM2 if it's managed by the HBM2
memory controller.

Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Link: https://lore.kernel.org/r/20210720163009.GA1417532@agluck-desk2.amr.corp.intel.com
Signed-off-by: Youquan Song <youquan.song@intel.com>
---
 drivers/edac/skx_common.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c
index 0c133d32b777..19c17c5198c5 100644
--- a/drivers/edac/skx_common.c
+++ b/drivers/edac/skx_common.c
@@ -345,7 +345,10 @@ int skx_get_dimm_info(u32 mtr, u32 mcmtr, u32 amap, struct dimm_info *dimm,
 	rows = numrow(mtr);
 	cols = imc->hbm_mc ? 6 : numcol(mtr);
 
-	if (cfg->support_ddr5 && ((amap & 0x8) || imc->hbm_mc)) {
+	if (imc->hbm_mc) {
+		banks = 32;
+		mtype = MEM_HBM2;
+	} else if (cfg->support_ddr5 && (amap & 0x8)) {
 		banks = 32;
 		mtype = MEM_DDR5;
 	} else {
-- 
Gitee


From 028e1264e604bafb90f67c5533b2ad86b0cc8ea1 Mon Sep 17 00:00:00 2001
From: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Date: Fri, 24 Dec 2021 04:11:26 -0500
Subject: [PATCH 561/674] EDAC/i10nm: Release mdev/mbase when failing to detect
 HBM

mainline inclusion
from mainline-v5.16
commit c370baa328022cbd46c59c821d1b467a97f047be
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1
CVE: NA

Intel-SIG: commit c370baa32802 EDAC/i10nm: Release mdev/mbase when failing to detect HBM.
Backport to add EDAC HBM support.

--------------------------------

On systems without HBM (High Bandwidth Memory) mdev/mbase are not
released/unmapped.

Add the code to release mdev/mbase when failing to detect HBM.

[Tony: re-word commit message]

Cc: <stable@vger.kernel.org>
Fixes: c945088384d0 ("EDAC/i10nm: Add support for high bandwidth memory")
Reported-by: kernel test robot <lkp@intel.com>
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Link: https://lore.kernel.org/r/20211224091126.1246-1-qiuxu.zhuo@intel.com
Signed-off-by: Youquan Song <youquan.song@intel.com>
---
 drivers/edac/i10nm_base.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c
index 47d6d7282c58..d63ddc9c994d 100644
--- a/drivers/edac/i10nm_base.c
+++ b/drivers/edac/i10nm_base.c
@@ -358,6 +358,9 @@ static int i10nm_get_hbm_munits(void)
 
 			mbase = ioremap(base + off, I10NM_HBM_IMC_MMIO_SIZE);
 			if (!mbase) {
+				pci_dev_put(d->imc[lmc].mdev);
+				d->imc[lmc].mdev = NULL;
+
 				i10nm_printk(KERN_ERR, "Failed to ioremap for hbm mc 0x%llx\n",
 					     base + off);
 				return -ENOMEM;
@@ -368,6 +371,12 @@ static int i10nm_get_hbm_munits(void)
 
 			mcmtr = I10NM_GET_MCMTR(&d->imc[lmc], 0);
 			if (!I10NM_IS_HBM_IMC(mcmtr)) {
+				iounmap(d->imc[lmc].mbase);
+				d->imc[lmc].mbase = NULL;
+				d->imc[lmc].hbm_mc = false;
+				pci_dev_put(d->imc[lmc].mdev);
+				d->imc[lmc].mdev = NULL;
+
 				i10nm_printk(KERN_ERR, "This isn't an hbm mc!\n");
 				return -ENODEV;
 			}
-- 
Gitee


From 910f3dd810c9bab8332d53f2c32c55f5fe65a64c Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Tue, 26 Jul 2022 17:38:14 +0800
Subject: [PATCH 562/674] etherdevice: Adjust ether_addr* prototypes to silence
 -Wstringop-overead
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

stable inclusion
from stable-v5.10.113
commit 08ad7a770efacfecf903143f8de88d1e351a1f2d
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=08ad7a770efacfecf903143f8de88d1e351a1f2d

--------------------------------

commit 2618a0dae09ef37728dab89ff60418cbe25ae6bd upstream.

With GCC 12, -Wstringop-overread was warning about an implicit cast from
char[6] to char[8]. However, the extra 2 bytes are always thrown away,
alignment doesn't matter, and the risk of hitting the edge of unallocated
memory has been accepted, so this prototype can just be converted to a
regular char *. Silences:

net/core/dev.c: In function ‘bpf_prog_run_generic_xdp’: net/core/dev.c:4618:21: warning: ‘ether_addr_equal_64bits’ reading 8 bytes from a region of size 6 [-Wstringop-overread]
 4618 |         orig_host = ether_addr_equal_64bits(eth->h_dest, > skb->dev->dev_addr);
      |                     ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
net/core/dev.c:4618:21: note: referencing argument 1 of type ‘const u8[8]’ {aka ‘const unsigned char[8]’}
net/core/dev.c:4618:21: note: referencing argument 2 of type ‘const u8[8]’ {aka ‘const unsigned char[8]’}
In file included from net/core/dev.c:91: include/linux/etherdevice.h:375:20: note: in a call to function ‘ether_addr_equal_64bits’
  375 | static inline bool ether_addr_equal_64bits(const u8 addr1[6+2],
      |                    ^~~~~~~~~~~~~~~~~~~~~~~

Reported-by: Marc Kleine-Budde <mkl@pengutronix.de>
Tested-by: Marc Kleine-Budde <mkl@pengutronix.de>
Link: https://lore.kernel.org/netdev/20220212090811.uuzk6d76agw2vv73@pengutronix.de
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: netdev@vger.kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
Cc: Khem Raj <raj.khem@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 include/linux/etherdevice.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index 2e5debc0373c..99209f50915f 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -127,7 +127,7 @@ static inline bool is_multicast_ether_addr(const u8 *addr)
 #endif
 }
 
-static inline bool is_multicast_ether_addr_64bits(const u8 addr[6+2])
+static inline bool is_multicast_ether_addr_64bits(const u8 *addr)
 {
 #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
 #ifdef __BIG_ENDIAN
@@ -352,8 +352,7 @@ static inline bool ether_addr_equal(const u8 *addr1, const u8 *addr2)
  * Please note that alignment of addr1 & addr2 are only guaranteed to be 16 bits.
  */
 
-static inline bool ether_addr_equal_64bits(const u8 addr1[6+2],
-					   const u8 addr2[6+2])
+static inline bool ether_addr_equal_64bits(const u8 *addr1, const u8 *addr2)
 {
 #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
 	u64 fold = (*(const u64 *)addr1) ^ (*(const u64 *)addr2);
-- 
Gitee


From 0ccd441a7641338e1220f8a244da9159b653ff5b Mon Sep 17 00:00:00 2001
From: Xiongwei Song <sxwjean@gmail.com>
Date: Tue, 26 Jul 2022 17:38:15 +0800
Subject: [PATCH 563/674] mm: page_alloc: fix building error on
 -Werror=array-compare

stable inclusion
from stable-v5.10.113
commit 69848f9488bc1088bcd9a73987dff8d2cb47a060
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=69848f9488bc1088bcd9a73987dff8d2cb47a060

--------------------------------

commit ca831f29f8f25c97182e726429b38c0802200c8f upstream.

Arthur Marsh reported we would hit the error below when building kernel
with gcc-12:

  CC      mm/page_alloc.o
  mm/page_alloc.c: In function `mem_init_print_info':
  mm/page_alloc.c:8173:27: error: comparison between two arrays [-Werror=array-compare]
   8173 |                 if (start <= pos && pos < end && size > adj) \
        |

In C++20, the comparision between arrays should be warned.

Link: https://lkml.kernel.org/r/20211125130928.32465-1-sxwjean@me.com
Signed-off-by: Xiongwei Song <sxwjean@gmail.com>
Reported-by: Arthur Marsh <arthur.marsh@internode.on.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Khem Raj <raj.khem@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 mm/page_alloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ec73cca1726c..cf9c69d631f3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -7974,7 +7974,7 @@ void __init mem_init_print_info(const char *str)
 	 */
 #define adj_init_size(start, end, size, pos, adj) \
 	do { \
-		if (start <= pos && pos < end && size > adj) \
+		if (&start[0] <= &pos[0] && &pos[0] < &end[0] && size > adj) \
 			size -= adj; \
 	} while (0)
 
-- 
Gitee


From d0005c0a490428fb4f12a085d77c1bb34bf1dbf5 Mon Sep 17 00:00:00 2001
From: Daniel Bristot de Oliveira <bristot@kernel.org>
Date: Tue, 26 Jul 2022 17:38:16 +0800
Subject: [PATCH 564/674] tracing: Dump stacktrace trigger to the corresponding
 instance

mainline inclusion
from mainline-v5.17-rc6
commit ce33c845b030c9cf768370c951bc699470b09fa7
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ce33c845b030c9cf768370c951bc699470b09fa7

--------------------------------

The stacktrace event trigger is not dumping the stacktrace to the instance
where it was enabled, but to the global "instance."

Use the private_data, pointing to the trigger file, to figure out the
corresponding trace instance, and use it in the trigger action, like
snapshot_trigger does.

Link: https://lkml.kernel.org/r/afbb0b4f18ba92c276865bc97204d438473f4ebc.1645396236.git.bristot@kernel.org

Cc: stable@vger.kernel.org
Fixes: ae63b31e4d0e2 ("tracing: Separate out trace events from global variables")
Reviewed-by: Tom Zanussi <zanussi@kernel.org>
Tested-by: Tom Zanussi <zanussi@kernel.org>
Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>

[zzk: __trace_stack() was changed due to the merge of "36590c50b2d0 tracing: Merge irqflags + preempt counter",
cherry-pick from mainline ce33c845b030c9 instead of stable 5.10.y]

Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Yang Jihong <yangjihong1@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 kernel/trace/trace_events_trigger.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index d0309de2f84f..3c6229f16e81 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -1219,7 +1219,12 @@ static void
 stacktrace_trigger(struct event_trigger_data *data, void *rec,
 		   struct ring_buffer_event *event)
 {
-	trace_dump_stack(STACK_SKIP);
+	struct trace_event_file *file = data->private_data;
+
+	if (file)
+		__trace_stack(file->tr, tracing_gen_ctx(), STACK_SKIP);
+	else
+		trace_dump_stack(STACK_SKIP);
 }
 
 static void
-- 
Gitee


From 32af01826a81a3f51b3e13d211351569f762f6c2 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Tue, 26 Jul 2022 17:38:17 +0800
Subject: [PATCH 565/674] perf tools: Fix segfault accessing sample_id xyarray

stable inclusion
from stable-v5.10.113
commit 378061c9b886994fa045186390d61a5e7c696ae3
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=378061c9b886994fa045186390d61a5e7c696ae3

--------------------------------

commit a668cc07f990d2ed19424d5c1a529521a9d1cee1 upstream.

perf_evsel::sample_id is an xyarray which can cause a segfault when
accessed beyond its size. e.g.

  # perf record -e intel_pt// -C 1 sleep 1
  Segmentation fault (core dumped)
  #

That is happening because a dummy event is opened to capture text poke
events accross all CPUs, however the mmap logic is allocating according
to the number of user_requested_cpus.

In general, perf sometimes uses the evsel cpus to open events, and
sometimes the evlist user_requested_cpus. However, it is not necessary
to determine which case is which because the opened event file
descriptors are also in an xyarray, the size of whch can be used
to correctly allocate the size of the sample_id xyarray, because there
is one ID per file descriptor.

Note, in the affected code path, perf_evsel fd array is subsequently
used to get the file descriptor for the mmap, so it makes sense for the
xyarrays to be the same size there.

Fixes: d1a177595b3a824c ("libperf: Adopt perf_evlist__mmap()/munmap() from tools/perf")
Fixes: 246eba8e9041c477 ("perf tools: Add support for PERF_RECORD_TEXT_POKE")
Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Acked-by: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: stable@vger.kernel.org # 5.5+
Link: https://lore.kernel.org/r/20220413114232.26914-1-adrian.hunter@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 tools/lib/perf/evlist.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tools/lib/perf/evlist.c b/tools/lib/perf/evlist.c
index 17465d454a0e..f76b1a9d5a6e 100644
--- a/tools/lib/perf/evlist.c
+++ b/tools/lib/perf/evlist.c
@@ -571,7 +571,6 @@ int perf_evlist__mmap_ops(struct perf_evlist *evlist,
 {
 	struct perf_evsel *evsel;
 	const struct perf_cpu_map *cpus = evlist->cpus;
-	const struct perf_thread_map *threads = evlist->threads;
 
 	if (!ops || !ops->get || !ops->mmap)
 		return -EINVAL;
@@ -583,7 +582,7 @@ int perf_evlist__mmap_ops(struct perf_evlist *evlist,
 	perf_evlist__for_each_entry(evlist, evsel) {
 		if ((evsel->attr.read_format & PERF_FORMAT_ID) &&
 		    evsel->sample_id == NULL &&
-		    perf_evsel__alloc_id(evsel, perf_cpu_map__nr(cpus), threads->nr) < 0)
+		    perf_evsel__alloc_id(evsel, evsel->fd->max_x, evsel->fd->max_y) < 0)
 			return -ENOMEM;
 	}
 
-- 
Gitee


From 9f91d21694fddc986c0beb30b2680d69457e66ec Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Tue, 26 Jul 2022 17:38:18 +0800
Subject: [PATCH 566/674] gfs2: assign rgrp glock before compute_bitstructs

stable inclusion
from stable-v5.10.113
commit 04dd45d9776eb4802df8b0a0a6fcdf27842fbdd1
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=04dd45d9776eb4802df8b0a0a6fcdf27842fbdd1

--------------------------------

commit 428f651cb80b227af47fc302e4931791f2fb4741 upstream.

Before this patch, function read_rindex_entry called compute_bitstructs
before it allocated a glock for the rgrp. But if compute_bitstructs found
a problem with the rgrp, it called gfs2_consist_rgrpd, and that called
gfs2_dump_glock for rgd->rd_gl which had not yet been assigned.

read_rindex_entry
   compute_bitstructs
      gfs2_consist_rgrpd
         gfs2_dump_glock <---------rgd->rd_gl was not set.

This patch changes read_rindex_entry so it assigns an rgrp glock before
calling compute_bitstructs so gfs2_dump_glock does not reference an
unassigned pointer. If an error is discovered, the glock must also be
put, so a new goto and label were added.

Reported-by: syzbot+c6fd14145e2f62ca0784@syzkaller.appspotmail.com
Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 fs/gfs2/rgrp.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index dc55b029afaa..c5bde789a16d 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -906,15 +906,15 @@ static int read_rindex_entry(struct gfs2_inode *ip)
 	rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes);
 	spin_lock_init(&rgd->rd_rsspin);
 
-	error = compute_bitstructs(rgd);
-	if (error)
-		goto fail;
-
 	error = gfs2_glock_get(sdp, rgd->rd_addr,
 			       &gfs2_rgrp_glops, CREATE, &rgd->rd_gl);
 	if (error)
 		goto fail;
 
+	error = compute_bitstructs(rgd);
+	if (error)
+		goto fail_glock;
+
 	rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr;
 	rgd->rd_flags &= ~(GFS2_RDF_UPTODATE | GFS2_RDF_PREFERRED);
 	if (rgd->rd_data > sdp->sd_max_rg_data)
@@ -928,6 +928,7 @@ static int read_rindex_entry(struct gfs2_inode *ip)
 	}
 
 	error = 0; /* someone else read in the rgrp; free it and ignore it */
+fail_glock:
 	gfs2_glock_put(rgd->rd_gl);
 
 fail:
-- 
Gitee


From 302f6418e23bbcecc28eafb2b9b50b3fd5de7806 Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Tue, 26 Jul 2022 17:38:19 +0800
Subject: [PATCH 567/674] ALSA: usb-audio: Clear MIDI port active flag after
 draining

stable inclusion
from stable-v5.10.113
commit 8ce3820fc9d496100fb39008a952eebfb695b060
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=8ce3820fc9d496100fb39008a952eebfb695b060

--------------------------------

commit 0665886ad1392e6b5bae85d7a6ccbed48dca1522 upstream.

When a rawmidi output stream is closed, it calls the drain at first,
then does trigger-off only when the drain returns -ERESTARTSYS as a
fallback.  It implies that each driver should turn off the stream
properly after the drain.  Meanwhile, USB-audio MIDI interface didn't
change the port->active flag after the drain.  This may leave the
output work picking up the port that is closed right now, which
eventually leads to a use-after-free for the already released rawmidi
object.

This patch fixes the bug by properly clearing the port->active flag
after the output drain.

Reported-by: syzbot+70e777a39907d6d5fd0a@syzkaller.appspotmail.com
Cc: <stable@vger.kernel.org>
Link: https://lore.kernel.org/r/00000000000011555605dceaff03@google.com
Link: https://lore.kernel.org/r/20220420130247.22062-1-tiwai@suse.de
Signed-off-by: Takashi Iwai <tiwai@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 sound/usb/midi.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sound/usb/midi.c b/sound/usb/midi.c
index fa91290ad89d..84676a8fb60d 100644
--- a/sound/usb/midi.c
+++ b/sound/usb/midi.c
@@ -1210,6 +1210,7 @@ static void snd_usbmidi_output_drain(struct snd_rawmidi_substream *substream)
 		} while (drain_urbs && timeout);
 		finish_wait(&ep->drain_wait, &wait);
 	}
+	port->active = 0;
 	spin_unlock_irq(&ep->buffer_lock);
 }
 
-- 
Gitee


From 6942c4a9bb82dd73e524481428c69ffad9c9a445 Mon Sep 17 00:00:00 2001
From: Tim Crawford <tcrawford@system76.com>
Date: Tue, 26 Jul 2022 17:38:20 +0800
Subject: [PATCH 568/674] ALSA: hda/realtek: Add quirk for Clevo NP70PNP

stable inclusion
from stable-v5.10.113
commit cf9b19546494b2e5df87486752946ae71c6775db
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=cf9b19546494b2e5df87486752946ae71c6775db

--------------------------------

commit 86222af07abf1f5f07a5873cc399c29ab8a9b8b8 upstream.

Fixes headset detection on Clevo NP70PNP.

Signed-off-by: Tim Crawford <tcrawford@system76.com>
Cc: <stable@vger.kernel.org>
Link: https://lore.kernel.org/r/20220421170412.3697-1-tcrawford@system76.com
Signed-off-by: Takashi Iwai <tiwai@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 sound/pci/hda/patch_realtek.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
index 11d653190e6e..b5168959fcf6 100644
--- a/sound/pci/hda/patch_realtek.c
+++ b/sound/pci/hda/patch_realtek.c
@@ -8897,6 +8897,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = {
 	SND_PCI_QUIRK(0x1558, 0x8562, "Clevo NH[5|7][0-9]RZ[Q]", ALC269_FIXUP_DMIC),
 	SND_PCI_QUIRK(0x1558, 0x8668, "Clevo NP50B[BE]", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE),
 	SND_PCI_QUIRK(0x1558, 0x866d, "Clevo NP5[05]PN[HJK]", ALC256_FIXUP_SYSTEM76_MIC_NO_PRESENCE),
+	SND_PCI_QUIRK(0x1558, 0x867c, "Clevo NP7[01]PNP", ALC256_FIXUP_SYSTEM76_MIC_NO_PRESENCE),
 	SND_PCI_QUIRK(0x1558, 0x867d, "Clevo NP7[01]PN[HJK]", ALC256_FIXUP_SYSTEM76_MIC_NO_PRESENCE),
 	SND_PCI_QUIRK(0x1558, 0x8680, "Clevo NJ50LU", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE),
 	SND_PCI_QUIRK(0x1558, 0x8686, "Clevo NH50[CZ]U", ALC256_FIXUP_MIC_NO_PRESENCE_AND_RESUME),
-- 
Gitee


From 83ab10a7a9ff529b76e8517e74380568f7b81797 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Tue, 26 Jul 2022 17:38:21 +0800
Subject: [PATCH 569/674] ASoC: atmel: Remove system clock tree configuration
 for at91sam9g20ek

stable inclusion
from stable-v5.10.113
commit 608fc58858bfa7552a9824c2f0e4a3ab8dd4efaa
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=608fc58858bfa7552a9824c2f0e4a3ab8dd4efaa

--------------------------------

[ Upstream commit c775cbf62ed4911e4f0f23880f01815753123690 ]

The MCLK of the WM8731 on the AT91SAM9G20-EK board is connected to the
PCK0 output of the SoC, intended in the reference software to be supplied
using PLLB and programmed to 12MHz. As originally written for use with a
board file the audio driver was responsible for configuring the entire tree
but in the conversion to the common clock framework the registration of
the named pck0 and pllb clocks was removed so the driver has failed to
instantiate ever since.

Since the WM8731 driver has had support for managing a MCLK provided via
the common clock framework for some time we can simply drop all the clock
management code from the machine driver other than configuration of the
sysclk rate, the CODEC driver still respects that configuration from the
machine driver.

Fixes: ff78a189b0ae55f ("ARM: at91: remove old at91-specific clock driver")
Signed-off-by: Mark Brown <broonie@kernel.org>
Reviewed-by: Codrin Ciubotariu <codrin.ciubotariu@microchip.com>
Link: https://lore.kernel.org/r/20220325154241.1600757-2-broonie@kernel.org
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 sound/soc/atmel/sam9g20_wm8731.c | 61 --------------------------------
 1 file changed, 61 deletions(-)

diff --git a/sound/soc/atmel/sam9g20_wm8731.c b/sound/soc/atmel/sam9g20_wm8731.c
index 8a55d59a6c2a..d243de5f23dc 100644
--- a/sound/soc/atmel/sam9g20_wm8731.c
+++ b/sound/soc/atmel/sam9g20_wm8731.c
@@ -46,35 +46,6 @@
  */
 #undef ENABLE_MIC_INPUT
 
-static struct clk *mclk;
-
-static int at91sam9g20ek_set_bias_level(struct snd_soc_card *card,
-					struct snd_soc_dapm_context *dapm,
-					enum snd_soc_bias_level level)
-{
-	static int mclk_on;
-	int ret = 0;
-
-	switch (level) {
-	case SND_SOC_BIAS_ON:
-	case SND_SOC_BIAS_PREPARE:
-		if (!mclk_on)
-			ret = clk_enable(mclk);
-		if (ret == 0)
-			mclk_on = 1;
-		break;
-
-	case SND_SOC_BIAS_OFF:
-	case SND_SOC_BIAS_STANDBY:
-		if (mclk_on)
-			clk_disable(mclk);
-		mclk_on = 0;
-		break;
-	}
-
-	return ret;
-}
-
 static const struct snd_soc_dapm_widget at91sam9g20ek_dapm_widgets[] = {
 	SND_SOC_DAPM_MIC("Int Mic", NULL),
 	SND_SOC_DAPM_SPK("Ext Spk", NULL),
@@ -135,7 +106,6 @@ static struct snd_soc_card snd_soc_at91sam9g20ek = {
 	.owner = THIS_MODULE,
 	.dai_link = &at91sam9g20ek_dai,
 	.num_links = 1,
-	.set_bias_level = at91sam9g20ek_set_bias_level,
 
 	.dapm_widgets = at91sam9g20ek_dapm_widgets,
 	.num_dapm_widgets = ARRAY_SIZE(at91sam9g20ek_dapm_widgets),
@@ -148,7 +118,6 @@ static int at91sam9g20ek_audio_probe(struct platform_device *pdev)
 {
 	struct device_node *np = pdev->dev.of_node;
 	struct device_node *codec_np, *cpu_np;
-	struct clk *pllb;
 	struct snd_soc_card *card = &snd_soc_at91sam9g20ek;
 	int ret;
 
@@ -162,31 +131,6 @@ static int at91sam9g20ek_audio_probe(struct platform_device *pdev)
 		return -EINVAL;
 	}
 
-	/*
-	 * Codec MCLK is supplied by PCK0 - set it up.
-	 */
-	mclk = clk_get(NULL, "pck0");
-	if (IS_ERR(mclk)) {
-		dev_err(&pdev->dev, "Failed to get MCLK\n");
-		ret = PTR_ERR(mclk);
-		goto err;
-	}
-
-	pllb = clk_get(NULL, "pllb");
-	if (IS_ERR(pllb)) {
-		dev_err(&pdev->dev, "Failed to get PLLB\n");
-		ret = PTR_ERR(pllb);
-		goto err_mclk;
-	}
-	ret = clk_set_parent(mclk, pllb);
-	clk_put(pllb);
-	if (ret != 0) {
-		dev_err(&pdev->dev, "Failed to set MCLK parent\n");
-		goto err_mclk;
-	}
-
-	clk_set_rate(mclk, MCLK_RATE);
-
 	card->dev = &pdev->dev;
 
 	/* Parse device node info */
@@ -230,9 +174,6 @@ static int at91sam9g20ek_audio_probe(struct platform_device *pdev)
 
 	return ret;
 
-err_mclk:
-	clk_put(mclk);
-	mclk = NULL;
 err:
 	atmel_ssc_put_audio(0);
 	return ret;
@@ -242,8 +183,6 @@ static int at91sam9g20ek_audio_remove(struct platform_device *pdev)
 {
 	struct snd_soc_card *card = platform_get_drvdata(pdev);
 
-	clk_disable(mclk);
-	mclk = NULL;
 	snd_soc_unregister_card(card);
 	atmel_ssc_put_audio(0);
 
-- 
Gitee


From f7ec810a47d7f46e7b1739befe01d4850fd7ca31 Mon Sep 17 00:00:00 2001
From: Miaoqian Lin <linmq006@gmail.com>
Date: Tue, 26 Jul 2022 17:38:22 +0800
Subject: [PATCH 570/674] ASoC: msm8916-wcd-digital: Check failure for
 devm_snd_soc_register_component

stable inclusion
from stable-v5.10.113
commit b6f474cd3097b9570d5c2c30075ca3506fbbcf33
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=b6f474cd3097b9570d5c2c30075ca3506fbbcf33

--------------------------------

[ Upstream commit e927b05f3cc20de87f6b7d912a5bbe556931caca ]

devm_snd_soc_register_component() may fails, we should check the error
and do the corresponding error handling.

Fixes: 150db8c5afa1 ("ASoC: codecs: Add msm8916-wcd digital codec")
Signed-off-by: Miaoqian Lin <linmq006@gmail.com>
Link: https://lore.kernel.org/r/20220403115239.30140-1-linmq006@gmail.com
Signed-off-by: Mark Brown <broonie@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 sound/soc/codecs/msm8916-wcd-digital.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/sound/soc/codecs/msm8916-wcd-digital.c b/sound/soc/codecs/msm8916-wcd-digital.c
index 9ad7fc0baf07..20a07c92b2fc 100644
--- a/sound/soc/codecs/msm8916-wcd-digital.c
+++ b/sound/soc/codecs/msm8916-wcd-digital.c
@@ -1206,9 +1206,16 @@ static int msm8916_wcd_digital_probe(struct platform_device *pdev)
 
 	dev_set_drvdata(dev, priv);
 
-	return devm_snd_soc_register_component(dev, &msm8916_wcd_digital,
+	ret = devm_snd_soc_register_component(dev, &msm8916_wcd_digital,
 				      msm8916_wcd_digital_dai,
 				      ARRAY_SIZE(msm8916_wcd_digital_dai));
+	if (ret)
+		goto err_mclk;
+
+	return 0;
+
+err_mclk:
+	clk_disable_unprepare(priv->mclk);
 err_clk:
 	clk_disable_unprepare(priv->ahbclk);
 	return ret;
-- 
Gitee


From ecf9eaf316dc293ccd09a729f924840b09fe60db Mon Sep 17 00:00:00 2001
From: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Date: Tue, 26 Jul 2022 17:38:23 +0800
Subject: [PATCH 571/674] ASoC: codecs: wcd934x: do not switch off SIDO Buck
 when codec is in use

stable inclusion
from stable-v5.10.113
commit 12aa8021c7a72811cd8096b1e456cd5d265896d9
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=12aa8021c7a72811cd8096b1e456cd5d265896d9

--------------------------------

[ Upstream commit db6dd1bee63d1d88fbddfe07af800af5948ac28e ]

SIDO(Single-Inductor Dual-Ouput) Buck powers up both analog and digital
circuits along with internal memory, powering off this is the last thing
that codec should do when going to very low power.

Current code was powering off this Buck if there are no users of sysclk,
which is not correct. Powering off this buck will result in no register access.
This code path was never tested until recently after adding pm support
in SoundWire controller. Fix this by removing the buck poweroff when the
codec is active and also the code that is not used.

Without this patch all the read/write transactions will never complete and
results in SLIMBus Errors like:

qcom,slim-ngd qcom,slim-ngd.1: Tx:MT:0x0, MC:0x60, LA:0xcf failed:-110
wcd934x-codec wcd934x-codec.1.auto: ASoC: error at soc_component_read_no_lock
	on wcd934x-codec.1.auto for register: [0x00000d05] -110
qcom,slim-ngd-ctrl 171c0000.slim: Error Interrupt received 0x82000000

Reported-by: Amit Pundir <amit.pundir@linaro.org>
Fixes: a61f3b4f476e ("ASoC: wcd934x: add support to wcd9340/wcd9341 codec")
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Tested-by: Amit Pundir <amit.pundir@linaro.org>
Link: https://lore.kernel.org/r/20220407094313.2880-1-srinivas.kandagatla@linaro.org
Signed-off-by: Mark Brown <broonie@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 sound/soc/codecs/wcd934x.c | 26 +-------------------------
 1 file changed, 1 insertion(+), 25 deletions(-)

diff --git a/sound/soc/codecs/wcd934x.c b/sound/soc/codecs/wcd934x.c
index 8540ac230d0e..fd704df9b175 100644
--- a/sound/soc/codecs/wcd934x.c
+++ b/sound/soc/codecs/wcd934x.c
@@ -1188,29 +1188,7 @@ static int wcd934x_set_sido_input_src(struct wcd934x_codec *wcd, int sido_src)
 	if (sido_src == wcd->sido_input_src)
 		return 0;
 
-	if (sido_src == SIDO_SOURCE_INTERNAL) {
-		regmap_update_bits(wcd->regmap, WCD934X_ANA_BUCK_CTL,
-				   WCD934X_ANA_BUCK_HI_ACCU_EN_MASK, 0);
-		usleep_range(100, 110);
-		regmap_update_bits(wcd->regmap, WCD934X_ANA_BUCK_CTL,
-				   WCD934X_ANA_BUCK_HI_ACCU_PRE_ENX_MASK, 0x0);
-		usleep_range(100, 110);
-		regmap_update_bits(wcd->regmap, WCD934X_ANA_RCO,
-				   WCD934X_ANA_RCO_BG_EN_MASK, 0);
-		usleep_range(100, 110);
-		regmap_update_bits(wcd->regmap, WCD934X_ANA_BUCK_CTL,
-				   WCD934X_ANA_BUCK_PRE_EN1_MASK,
-				   WCD934X_ANA_BUCK_PRE_EN1_ENABLE);
-		usleep_range(100, 110);
-		regmap_update_bits(wcd->regmap, WCD934X_ANA_BUCK_CTL,
-				   WCD934X_ANA_BUCK_PRE_EN2_MASK,
-				   WCD934X_ANA_BUCK_PRE_EN2_ENABLE);
-		usleep_range(100, 110);
-		regmap_update_bits(wcd->regmap, WCD934X_ANA_BUCK_CTL,
-				   WCD934X_ANA_BUCK_HI_ACCU_EN_MASK,
-				   WCD934X_ANA_BUCK_HI_ACCU_ENABLE);
-		usleep_range(100, 110);
-	} else if (sido_src == SIDO_SOURCE_RCO_BG) {
+	if (sido_src == SIDO_SOURCE_RCO_BG) {
 		regmap_update_bits(wcd->regmap, WCD934X_ANA_RCO,
 				   WCD934X_ANA_RCO_BG_EN_MASK,
 				   WCD934X_ANA_RCO_BG_ENABLE);
@@ -1296,8 +1274,6 @@ static int wcd934x_disable_ana_bias_and_syclk(struct wcd934x_codec *wcd)
 	regmap_update_bits(wcd->regmap, WCD934X_CLK_SYS_MCLK_PRG,
 			   WCD934X_EXT_CLK_BUF_EN_MASK |
 			   WCD934X_MCLK_EN_MASK, 0x0);
-	wcd934x_set_sido_input_src(wcd, SIDO_SOURCE_INTERNAL);
-
 	regmap_update_bits(wcd->regmap, WCD934X_ANA_BIAS,
 			   WCD934X_ANA_BIAS_EN_MASK, 0);
 	regmap_update_bits(wcd->regmap, WCD934X_ANA_BIAS,
-- 
Gitee


From 4be6d449af6b349bdeeb388d9a40f814d8120112 Mon Sep 17 00:00:00 2001
From: Miaoqian Lin <linmq006@gmail.com>
Date: Tue, 26 Jul 2022 17:38:24 +0800
Subject: [PATCH 572/674] dmaengine: imx-sdma: Fix error checking in
 sdma_event_remap

stable inclusion
from stable-v5.10.113
commit 9bc949a181ba805b76b81d1ae1519af093328b81
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=9bc949a181ba805b76b81d1ae1519af093328b81

--------------------------------

[ Upstream commit 7104b9cb35a33ad803a1adbbfa50569b008faf15 ]

of_parse_phandle() returns NULL on errors, rather than error
pointers. Using NULL check on grp_np to fix this.

Fixes: d078cd1b4185 ("dmaengine: imx-sdma: Add imx6sx platform support")
Signed-off-by: Miaoqian Lin <linmq006@gmail.com>
Link: https://lore.kernel.org/r/20220308064952.15743-1-linmq006@gmail.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/dma/imx-sdma.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/dma/imx-sdma.c b/drivers/dma/imx-sdma.c
index 306f93e4b26a..792c91cd1608 100644
--- a/drivers/dma/imx-sdma.c
+++ b/drivers/dma/imx-sdma.c
@@ -1789,7 +1789,7 @@ static int sdma_event_remap(struct sdma_engine *sdma)
 	u32 reg, val, shift, num_map, i;
 	int ret = 0;
 
-	if (IS_ERR(np) || IS_ERR(gpr_np))
+	if (IS_ERR(np) || !gpr_np)
 		goto out;
 
 	event_remap = of_find_property(np, propname, NULL);
@@ -1837,7 +1837,7 @@ static int sdma_event_remap(struct sdma_engine *sdma)
 	}
 
 out:
-	if (!IS_ERR(gpr_np))
+	if (gpr_np)
 		of_node_put(gpr_np);
 
 	return ret;
-- 
Gitee


From dfc6a02d0e0f49da209a4e3fce9e1e52775ae7a8 Mon Sep 17 00:00:00 2001
From: zhangqilong <zhangqilong3@huawei.com>
Date: Tue, 26 Jul 2022 17:38:25 +0800
Subject: [PATCH 573/674] dmaengine: mediatek:Fix PM usage reference leak of
 mtk_uart_apdma_alloc_chan_resources

stable inclusion
from stable-v5.10.113
commit f714abf28f819849d2fd93a4a8db15cff3f8798e
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f714abf28f819849d2fd93a4a8db15cff3f8798e

--------------------------------

[ Upstream commit 545b2baac89b859180e51215468c05d85ea8465a ]

pm_runtime_get_sync will increment pm usage counter even it failed.
Forgetting to putting operation will result in reference leak here.
We fix it:
1) Replacing it with pm_runtime_resume_and_get to keep usage counter
   balanced.
2) Add putting operation before returning error.

Fixes:9135408c3ace4 ("dmaengine: mediatek: Add MediaTek UART APDMA support")
Signed-off-by: Zhang Qilong <zhangqilong3@huawei.com>
Link: https://lore.kernel.org/r/20220319022142.142709-1-zhangqilong3@huawei.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/dma/mediatek/mtk-uart-apdma.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/dma/mediatek/mtk-uart-apdma.c b/drivers/dma/mediatek/mtk-uart-apdma.c
index 375e7e647df6..a1517ef1f4a0 100644
--- a/drivers/dma/mediatek/mtk-uart-apdma.c
+++ b/drivers/dma/mediatek/mtk-uart-apdma.c
@@ -274,7 +274,7 @@ static int mtk_uart_apdma_alloc_chan_resources(struct dma_chan *chan)
 	unsigned int status;
 	int ret;
 
-	ret = pm_runtime_get_sync(mtkd->ddev.dev);
+	ret = pm_runtime_resume_and_get(mtkd->ddev.dev);
 	if (ret < 0) {
 		pm_runtime_put_noidle(chan->device->dev);
 		return ret;
@@ -288,18 +288,21 @@ static int mtk_uart_apdma_alloc_chan_resources(struct dma_chan *chan)
 	ret = readx_poll_timeout(readl, c->base + VFF_EN,
 			  status, !status, 10, 100);
 	if (ret)
-		return ret;
+		goto err_pm;
 
 	ret = request_irq(c->irq, mtk_uart_apdma_irq_handler,
 			  IRQF_TRIGGER_NONE, KBUILD_MODNAME, chan);
 	if (ret < 0) {
 		dev_err(chan->device->dev, "Can't request dma IRQ\n");
-		return -EINVAL;
+		ret = -EINVAL;
+		goto err_pm;
 	}
 
 	if (mtkd->support_33bits)
 		mtk_uart_apdma_write(c, VFF_4G_SUPPORT, VFF_4G_SUPPORT_CLR_B);
 
+err_pm:
+	pm_runtime_put_noidle(mtkd->ddev.dev);
 	return ret;
 }
 
-- 
Gitee


From 4ab150b4794d5ccbae344f03323fce6d73d66fcd Mon Sep 17 00:00:00 2001
From: Allen-KH Cheng <allen-kh.cheng@mediatek.com>
Date: Tue, 26 Jul 2022 17:38:26 +0800
Subject: [PATCH 574/674] spi: spi-mtk-nor: initialize spi controller after
 resume

stable inclusion
from stable-v5.10.113
commit 3f7914dbeacdffe4fa198dee7afefa74998b5d8d
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=3f7914dbeacdffe4fa198dee7afefa74998b5d8d

--------------------------------

[ Upstream commit 317c2045618cc1f8d38beb8c93a7bdb6ad8638c6 ]

After system resumes, the registers of nor controller are
initialized with default values. The nor controller will
not function properly.

To handle both issues above, we add mtk_nor_init() in
mtk_nor_resume after pm_runtime_force_resume().

Fixes: 3bfd9103c7af ("spi: spi-mtk-nor: Add power management support")

Signed-off-by: Allen-KH Cheng <allen-kh.cheng@mediatek.com>
Reviewed-by: Rex-BC Chen <rex-bc.chen@mediatek.com>
Link: https://lore.kernel.org/r/20220412115743.22641-1-allen-kh.cheng@mediatek.com
Signed-off-by: Mark Brown <broonie@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/spi/spi-mtk-nor.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/drivers/spi/spi-mtk-nor.c b/drivers/spi/spi-mtk-nor.c
index 288f6c2bbd57..106e3cacba4c 100644
--- a/drivers/spi/spi-mtk-nor.c
+++ b/drivers/spi/spi-mtk-nor.c
@@ -895,7 +895,17 @@ static int __maybe_unused mtk_nor_suspend(struct device *dev)
 
 static int __maybe_unused mtk_nor_resume(struct device *dev)
 {
-	return pm_runtime_force_resume(dev);
+	struct spi_controller *ctlr = dev_get_drvdata(dev);
+	struct mtk_nor *sp = spi_controller_get_devdata(ctlr);
+	int ret;
+
+	ret = pm_runtime_force_resume(dev);
+	if (ret)
+		return ret;
+
+	mtk_nor_init(sp);
+
+	return 0;
 }
 
 static const struct dev_pm_ops mtk_nor_pm_ops = {
-- 
Gitee


From 79b7e6ed7a6170a8f469c8035db58f163600a410 Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Tue, 26 Jul 2022 17:38:27 +0800
Subject: [PATCH 575/674] esp: limit skb_page_frag_refill use to a single page

stable inclusion
from stable-v5.10.113
commit c075c3ea031757f8ea2d34567565b61a868c08d5
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=c075c3ea031757f8ea2d34567565b61a868c08d5

--------------------------------

[ Upstream commit 5bd8baab087dff657e05387aee802e70304cc813 ]

Commit ebe48d368e97 ("esp: Fix possible buffer overflow in ESP
transformation") tried to fix skb_page_frag_refill usage in ESP by
capping allocsize to 32k, but that doesn't completely solve the issue,
as skb_page_frag_refill may return a single page. If that happens, we
will write out of bounds, despite the check introduced in the previous
patch.

This patch forces COW in cases where we would end up calling
skb_page_frag_refill with a size larger than a page (first in
esp_output_head with tailen, then in esp_output_tail with
skb->data_len).

Fixes: cac2661c53f3 ("esp4: Avoid skb_cow_data whenever possible")
Fixes: 03e2a30f6a27 ("esp6: Avoid skb_cow_data whenever possible")
Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 include/net/esp.h | 2 --
 net/ipv4/esp4.c   | 5 ++---
 net/ipv6/esp6.c   | 5 ++---
 3 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/include/net/esp.h b/include/net/esp.h
index 90cd02ff77ef..9c5637d41d95 100644
--- a/include/net/esp.h
+++ b/include/net/esp.h
@@ -4,8 +4,6 @@
 
 #include <linux/skbuff.h>
 
-#define ESP_SKB_FRAG_MAXSIZE (PAGE_SIZE << SKB_FRAG_PAGE_ORDER)
-
 struct ip_esp_hdr;
 
 static inline struct ip_esp_hdr *ip_esp_hdr(const struct sk_buff *skb)
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 9aae82145bc1..20d738137841 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -448,7 +448,6 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *
 	struct page *page;
 	struct sk_buff *trailer;
 	int tailen = esp->tailen;
-	unsigned int allocsz;
 
 	/* this is non-NULL only with TCP/UDP Encapsulation */
 	if (x->encap) {
@@ -458,8 +457,8 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *
 			return err;
 	}
 
-	allocsz = ALIGN(skb->data_len + tailen, L1_CACHE_BYTES);
-	if (allocsz > ESP_SKB_FRAG_MAXSIZE)
+	if (ALIGN(tailen, L1_CACHE_BYTES) > PAGE_SIZE ||
+	    ALIGN(skb->data_len, L1_CACHE_BYTES) > PAGE_SIZE)
 		goto cow;
 
 	if (!skb_cloned(skb)) {
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 20c7bef6829e..cb28f8928f9e 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -483,7 +483,6 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info
 	struct page *page;
 	struct sk_buff *trailer;
 	int tailen = esp->tailen;
-	unsigned int allocsz;
 
 	if (x->encap) {
 		int err = esp6_output_encap(x, skb, esp);
@@ -492,8 +491,8 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info
 			return err;
 	}
 
-	allocsz = ALIGN(skb->data_len + tailen, L1_CACHE_BYTES);
-	if (allocsz > ESP_SKB_FRAG_MAXSIZE)
+	if (ALIGN(tailen, L1_CACHE_BYTES) > PAGE_SIZE ||
+	    ALIGN(skb->data_len, L1_CACHE_BYTES) > PAGE_SIZE)
 		goto cow;
 
 	if (!skb_cloned(skb)) {
-- 
Gitee


From f648582b76cf544acd8efbeed6f5fa0020acf160 Mon Sep 17 00:00:00 2001
From: Sasha Neftin <sasha.neftin@intel.com>
Date: Tue, 26 Jul 2022 17:38:28 +0800
Subject: [PATCH 576/674] igc: Fix infinite loop in release_swfw_sync

stable inclusion
from stable-v5.10.113
commit 46b0e4f998ceead45f273be4d282b7fbb70f6125
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=46b0e4f998ceead45f273be4d282b7fbb70f6125

--------------------------------

[ Upstream commit 907862e9aef75bf89e2b265efcc58870be06081e ]

An infinite loop may occur if we fail to acquire the HW semaphore,
which is needed for resource release.
This will typically happen if the hardware is surprise-removed.
At this stage there is nothing to do, except log an error and quit.

Fixes: c0071c7aa5fe ("igc: Add HW initialization code")
Suggested-by: Dima Ruinskiy <dima.ruinskiy@intel.com>
Signed-off-by: Sasha Neftin <sasha.neftin@intel.com>
Tested-by: Naama Meir <naamax.meir@linux.intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/net/ethernet/intel/igc/igc_i225.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/igc/igc_i225.c b/drivers/net/ethernet/intel/igc/igc_i225.c
index 553d6bc78e6b..624236a4202e 100644
--- a/drivers/net/ethernet/intel/igc/igc_i225.c
+++ b/drivers/net/ethernet/intel/igc/igc_i225.c
@@ -156,8 +156,15 @@ void igc_release_swfw_sync_i225(struct igc_hw *hw, u16 mask)
 {
 	u32 swfw_sync;
 
-	while (igc_get_hw_semaphore_i225(hw))
-		; /* Empty */
+	/* Releasing the resource requires first getting the HW semaphore.
+	 * If we fail to get the semaphore, there is nothing we can do,
+	 * except log an error and quit. We are not allowed to hang here
+	 * indefinitely, as it may cause denial of service or system crash.
+	 */
+	if (igc_get_hw_semaphore_i225(hw)) {
+		hw_dbg("Failed to release SW_FW_SYNC.\n");
+		return;
+	}
 
 	swfw_sync = rd32(IGC_SW_FW_SYNC);
 	swfw_sync &= ~mask;
-- 
Gitee


From 39d9023b5d9c8d03091fb5c7c0781836854451bb Mon Sep 17 00:00:00 2001
From: Sasha Neftin <sasha.neftin@intel.com>
Date: Tue, 26 Jul 2022 17:38:29 +0800
Subject: [PATCH 577/674] igc: Fix BUG: scheduling while atomic

stable inclusion
from stable-v5.10.113
commit fc7116a79a86500e15520658f279b6536ff9f700
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=fc7116a79a86500e15520658f279b6536ff9f700

--------------------------------

[ Upstream commit c80a29f0fe9b6f5457e0788e27d1110577eba99b ]

Replace usleep_range() method with udelay() method to allow atomic contexts
in low-level MDIO access functions.

The following issue can be seen by doing the following:
$ modprobe -r bonding
$ modprobe -v bonding max_bonds=1 mode=1 miimon=100 use_carrier=0
$ ip link set bond0 up
$ ifenslave bond0 eth0 eth1

[  982.357308] BUG: scheduling while atomic: kworker/u64:0/9/0x00000002
[  982.364431] INFO: lockdep is turned off.
[  982.368824] Modules linked in: bonding sctp ip6_udp_tunnel udp_tunnel mlx4_ib ib_uverbs ib_core mlx4_en mlx4_core nfp tls sunrpc intel_rapl_msr iTCO_wdt iTCO_vendor_support mxm_wmi dcdbas intel_rapl_common sb_edac x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel kvm irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel rapl intel_cstate intel_uncore pcspkr lpc_ich mei_me ipmi_ssif mei ipmi_si ipmi_devintf ipmi_msghandler wmi acpi_power_meter xfs libcrc32c sr_mod cdrom sd_mod t10_pi sg mgag200 drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops drm ahci libahci crc32c_intel libata i2c_algo_bit tg3 megaraid_sas igc dm_mirror dm_region_hash dm_log dm_mod [last unloaded: bonding]
[  982.437941] CPU: 25 PID: 9 Comm: kworker/u64:0 Kdump: loaded Tainted: G        W        --------- -  - 4.18.0-348.el8.x86_64+debug #1
[  982.451333] Hardware name: Dell Inc. PowerEdge R730/0H21J3, BIOS 2.7.0 12/005/2017
[  982.459791] Workqueue: bond0 bond_mii_monitor [bonding]
[  982.465622] Call Trace:
[  982.468355]  dump_stack+0x8e/0xd0
[  982.472056]  __schedule_bug.cold.60+0x3a/0x60
[  982.476919]  __schedule+0x147b/0x1bc0
[  982.481007]  ? firmware_map_remove+0x16b/0x16b
[  982.485967]  ? hrtimer_fixup_init+0x40/0x40
[  982.490625]  schedule+0xd9/0x250
[  982.494227]  schedule_hrtimeout_range_clock+0x10d/0x2c0
[  982.500058]  ? hrtimer_nanosleep_restart+0x130/0x130
[  982.505598]  ? hrtimer_init_sleeper_on_stack+0x90/0x90
[  982.511332]  ? usleep_range+0x88/0x130
[  982.515514]  ? recalibrate_cpu_khz+0x10/0x10
[  982.520279]  ? ktime_get+0xab/0x1c0
[  982.524175]  ? usleep_range+0x88/0x130
[  982.528355]  usleep_range+0xdd/0x130
[  982.532344]  ? console_conditional_schedule+0x30/0x30
[  982.537987]  ? igc_put_hw_semaphore+0x17/0x60 [igc]
[  982.543432]  igc_read_phy_reg_gpy+0x111/0x2b0 [igc]
[  982.548887]  igc_phy_has_link+0xfa/0x260 [igc]
[  982.553847]  ? igc_get_phy_id+0x210/0x210 [igc]
[  982.558894]  ? lock_acquire+0x34d/0x890
[  982.563187]  ? lock_downgrade+0x710/0x710
[  982.567659]  ? rcu_read_unlock+0x50/0x50
[  982.572039]  igc_check_for_copper_link+0x106/0x210 [igc]
[  982.577970]  ? igc_config_fc_after_link_up+0x840/0x840 [igc]
[  982.584286]  ? rcu_read_unlock+0x50/0x50
[  982.588661]  ? lock_release+0x591/0xb80
[  982.592939]  ? lock_release+0x591/0xb80
[  982.597220]  igc_has_link+0x113/0x330 [igc]
[  982.601887]  ? lock_downgrade+0x710/0x710
[  982.606362]  igc_ethtool_get_link+0x6d/0x90 [igc]
[  982.611614]  bond_check_dev_link+0x131/0x2c0 [bonding]
[  982.617350]  ? bond_time_in_interval+0xd0/0xd0 [bonding]
[  982.623277]  ? rcu_read_lock_held+0x62/0xc0
[  982.627944]  ? rcu_read_lock_sched_held+0xe0/0xe0
[  982.633198]  bond_mii_monitor+0x314/0x2500 [bonding]
[  982.638738]  ? lock_contended+0x880/0x880
[  982.643214]  ? bond_miimon_link_change+0xa0/0xa0 [bonding]
[  982.649336]  ? lock_acquire+0x34d/0x890
[  982.653615]  ? lock_downgrade+0x710/0x710
[  982.658089]  ? debug_object_deactivate+0x221/0x340
[  982.663436]  ? rcu_read_unlock+0x50/0x50
[  982.667811]  ? debug_print_object+0x2b0/0x2b0
[  982.672672]  ? __switch_to_asm+0x41/0x70
[  982.677049]  ? __switch_to_asm+0x35/0x70
[  982.681426]  ? _raw_spin_unlock_irq+0x24/0x40
[  982.686288]  ? trace_hardirqs_on+0x20/0x195
[  982.690956]  ? _raw_spin_unlock_irq+0x24/0x40
[  982.695818]  process_one_work+0x8f0/0x1770
[  982.700390]  ? pwq_dec_nr_in_flight+0x320/0x320
[  982.705443]  ? debug_show_held_locks+0x50/0x50
[  982.710403]  worker_thread+0x87/0xb40
[  982.714489]  ? process_one_work+0x1770/0x1770
[  982.719349]  kthread+0x344/0x410
[  982.722950]  ? kthread_insert_work_sanity_check+0xd0/0xd0
[  982.728975]  ret_from_fork+0x3a/0x50

Fixes: 5586838fe9ce ("igc: Add code for PHY support")
Reported-by: Corinna Vinschen <vinschen@redhat.com>
Suggested-by: Dima Ruinskiy <dima.ruinskiy@intel.com>
Signed-off-by: Sasha Neftin <sasha.neftin@intel.com>
Tested-by: Corinna Vinschen <vinschen@redhat.com>
Tested-by: Naama Meir <naamax.meir@linux.intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/net/ethernet/intel/igc/igc_phy.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/igc/igc_phy.c b/drivers/net/ethernet/intel/igc/igc_phy.c
index e380b7a3ea63..8de4de2e5636 100644
--- a/drivers/net/ethernet/intel/igc/igc_phy.c
+++ b/drivers/net/ethernet/intel/igc/igc_phy.c
@@ -583,7 +583,7 @@ static s32 igc_read_phy_reg_mdic(struct igc_hw *hw, u32 offset, u16 *data)
 	 * the lower time out
 	 */
 	for (i = 0; i < IGC_GEN_POLL_TIMEOUT; i++) {
-		usleep_range(500, 1000);
+		udelay(50);
 		mdic = rd32(IGC_MDIC);
 		if (mdic & IGC_MDIC_READY)
 			break;
@@ -640,7 +640,7 @@ static s32 igc_write_phy_reg_mdic(struct igc_hw *hw, u32 offset, u16 data)
 	 * the lower time out
 	 */
 	for (i = 0; i < IGC_GEN_POLL_TIMEOUT; i++) {
-		usleep_range(500, 1000);
+		udelay(50);
 		mdic = rd32(IGC_MDIC);
 		if (mdic & IGC_MDIC_READY)
 			break;
-- 
Gitee


From ab18d18b75f5f270c8f87b9743007d904f37b66e Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 26 Jul 2022 17:38:30 +0800
Subject: [PATCH 578/674] rxrpc: Restore removed timer deletion

stable inclusion
from stable-v5.10.113
commit 60592f16a456d239df9060968dd9cb561d0a6fb0
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=60592f16a456d239df9060968dd9cb561d0a6fb0

--------------------------------

[ Upstream commit ee3b0826b4764f6c13ad6db67495c5a1c38e9025 ]

A recent patch[1] from Eric Dumazet flipped the order in which the
keepalive timer and the keepalive worker were cancelled in order to fix a
syzbot reported issue[2].  Unfortunately, this enables the mirror image bug
whereby the timer races with rxrpc_exit_net(), restarting the worker after
it has been cancelled:

	CPU 1		CPU 2
	===============	=====================
			if (rxnet->live)
			<INTERRUPT>
	rxnet->live = false;
 	cancel_work_sync(&rxnet->peer_keepalive_work);
			rxrpc_queue_work(&rxnet->peer_keepalive_work);
	del_timer_sync(&rxnet->peer_keepalive_timer);

Fix this by restoring the removed del_timer_sync() so that we try to remove
the timer twice.  If the timer runs again, it should see ->live == false
and not restart the worker.

Fixes: 1946014ca3b1 ("rxrpc: fix a race in rxrpc_exit_net()")
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Eric Dumazet <edumazet@google.com>
cc: Marc Dionne <marc.dionne@auristor.com>
cc: linux-afs@lists.infradead.org
Link: https://lore.kernel.org/r/20220404183439.3537837-1-eric.dumazet@gmail.com/ [1]
Link: https://syzkaller.appspot.com/bug?extid=724378c4bb58f703b09a [2]
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 net/rxrpc/net_ns.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c
index f15d6942da45..cc7e30733feb 100644
--- a/net/rxrpc/net_ns.c
+++ b/net/rxrpc/net_ns.c
@@ -113,7 +113,9 @@ static __net_exit void rxrpc_exit_net(struct net *net)
 	struct rxrpc_net *rxnet = rxrpc_net(net);
 
 	rxnet->live = false;
+	del_timer_sync(&rxnet->peer_keepalive_timer);
 	cancel_work_sync(&rxnet->peer_keepalive_work);
+	/* Remove the timer again as the worker may have restarted it. */
 	del_timer_sync(&rxnet->peer_keepalive_timer);
 	rxrpc_destroy_all_calls(rxnet);
 	rxrpc_destroy_all_connections(rxnet);
-- 
Gitee


From 27b6e01f60e73dc8944d2d18ed50a60a32e68f08 Mon Sep 17 00:00:00 2001
From: Tony Lu <tonylu@linux.alibaba.com>
Date: Tue, 26 Jul 2022 17:38:31 +0800
Subject: [PATCH 579/674] net/smc: Fix sock leak when release after
 smc_shutdown()

stable inclusion
from stable-v5.10.113
commit a499cb5f3ef9f976eac96c02adbcc505764e2b91
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=a499cb5f3ef9f976eac96c02adbcc505764e2b91

--------------------------------

[ Upstream commit 1a74e99323746353bba11562a2f2d0aa8102f402 ]

Since commit e5d5aadcf3cd ("net/smc: fix sk_refcnt underflow on linkdown
and fallback"), for a fallback connection, __smc_release() does not call
sock_put() if its state is already SMC_CLOSED.

When calling smc_shutdown() after falling back, its state is set to
SMC_CLOSED but does not call sock_put(), so this patch calls it.

Reported-and-tested-by: syzbot+6e29a053eb165bd50de5@syzkaller.appspotmail.com
Fixes: e5d5aadcf3cd ("net/smc: fix sk_refcnt underflow on linkdown and fallback")
Signed-off-by: Tony Lu <tonylu@linux.alibaba.com>
Acked-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 net/smc/af_smc.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 4f16d406ad8e..1b98f3241150 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -2144,8 +2144,10 @@ static int smc_shutdown(struct socket *sock, int how)
 	if (smc->use_fallback) {
 		rc = kernel_sock_shutdown(smc->clcsock, how);
 		sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
-		if (sk->sk_shutdown == SHUTDOWN_MASK)
+		if (sk->sk_shutdown == SHUTDOWN_MASK) {
 			sk->sk_state = SMC_CLOSED;
+			sock_put(sk);
+		}
 		goto out;
 	}
 	switch (how) {
-- 
Gitee


From 3059be689186bbdbc081b5cdc9f8b08e78487795 Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Tue, 26 Jul 2022 17:38:32 +0800
Subject: [PATCH 580/674] net/packet: fix packet_sock xmit return value
 checking

stable inclusion
from stable-v5.10.113
commit 8fb76adb89f0d576c107e05fd922bf3d04c470d9
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=8fb76adb89f0d576c107e05fd922bf3d04c470d9

--------------------------------

[ Upstream commit 29e8e659f984be00d75ec5fef4e37c88def72712 ]

packet_sock xmit could be dev_queue_xmit, which also returns negative
errors. So only checking positive errors is not enough, or userspace
sendmsg may return success while packet is not send out.

Move the net_xmit_errno() assignment in the braces as checkpatch.pl said
do not use assignment in if condition.

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Reported-by: Flavio Leitner <fbl@redhat.com>
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 net/packet/af_packet.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index d0c95d7dd292..5ee600d108a0 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2817,8 +2817,9 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 
 		status = TP_STATUS_SEND_REQUEST;
 		err = po->xmit(skb);
-		if (unlikely(err > 0)) {
-			err = net_xmit_errno(err);
+		if (unlikely(err != 0)) {
+			if (err > 0)
+				err = net_xmit_errno(err);
 			if (err && __packet_get_status(po, ph) ==
 				   TP_STATUS_AVAILABLE) {
 				/* skb was destructed already */
@@ -3019,8 +3020,12 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 		skb->no_fcs = 1;
 
 	err = po->xmit(skb);
-	if (err > 0 && (err = net_xmit_errno(err)) != 0)
-		goto out_unlock;
+	if (unlikely(err != 0)) {
+		if (err > 0)
+			err = net_xmit_errno(err);
+		if (err)
+			goto out_unlock;
+	}
 
 	dev_put(dev);
 
-- 
Gitee


From beeab1369997e4479ef51ac7c3a97460393a01e0 Mon Sep 17 00:00:00 2001
From: Peilin Ye <peilin.ye@bytedance.com>
Date: Tue, 26 Jul 2022 17:38:33 +0800
Subject: [PATCH 581/674] ip6_gre: Avoid updating tunnel->tun_hlen in
 __gre6_xmit()

stable inclusion
from stable-v5.10.113
commit 200f96ebb389788dea563e8a0b1105863a836039
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=200f96ebb389788dea563e8a0b1105863a836039

--------------------------------

[ Upstream commit f40c064e933d7787ca7411b699504d7a2664c1f5 ]

Do not update tunnel->tun_hlen in data plane code.  Use a local variable
instead, just like "tunnel_hlen" in net/ipv4/ip_gre.c:gre_fb_xmit().

Co-developed-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Peilin Ye <peilin.ye@bytedance.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 net/ipv6/ip6_gre.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 9a0263f25232..949d6fbc1ca0 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -743,6 +743,7 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb,
 		struct ip_tunnel_info *tun_info;
 		const struct ip_tunnel_key *key;
 		__be16 flags;
+		int tun_hlen;
 
 		tun_info = skb_tunnel_info_txcheck(skb);
 		if (IS_ERR(tun_info) ||
@@ -760,9 +761,9 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb,
 		dsfield = key->tos;
 		flags = key->tun_flags &
 			(TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ);
-		tunnel->tun_hlen = gre_calc_hlen(flags);
+		tun_hlen = gre_calc_hlen(flags);
 
-		gre_build_header(skb, tunnel->tun_hlen,
+		gre_build_header(skb, tun_hlen,
 				 flags, protocol,
 				 tunnel_id_to_key32(tun_info->key.tun_id),
 				 (flags & TUNNEL_SEQ) ? htonl(tunnel->o_seqno++)
-- 
Gitee


From be914f9d3ae55f43b0733a64e2b769bcd02d7d0c Mon Sep 17 00:00:00 2001
From: Peilin Ye <peilin.ye@bytedance.com>
Date: Tue, 26 Jul 2022 17:38:34 +0800
Subject: [PATCH 582/674] ip6_gre: Fix skb_under_panic in __gre6_xmit()

stable inclusion
from stable-v5.10.113
commit 93366275be72bc55cada31a2a1273147553acab8
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=93366275be72bc55cada31a2a1273147553acab8

--------------------------------

[ Upstream commit ab198e1d0dd8dc4bc7575fb50758e2cbd51e14e1 ]

Feng reported an skb_under_panic BUG triggered by running
test_ip6gretap() in tools/testing/selftests/bpf/test_tunnel.sh:

[   82.492551] skbuff: skb_under_panic: text:ffffffffb268bb8e len:403 put:12 head:ffff9997c5480000 data:ffff9997c547fff8 tail:0x18b end:0x2c0 dev:ip6gretap11
<...>
[   82.607380] Call Trace:
[   82.609389]  <TASK>
[   82.611136]  skb_push.cold.109+0x10/0x10
[   82.614289]  __gre6_xmit+0x41e/0x590
[   82.617169]  ip6gre_tunnel_xmit+0x344/0x3f0
[   82.620526]  dev_hard_start_xmit+0xf1/0x330
[   82.623882]  sch_direct_xmit+0xe4/0x250
[   82.626961]  __dev_queue_xmit+0x720/0xfe0
<...>
[   82.633431]  packet_sendmsg+0x96a/0x1cb0
[   82.636568]  sock_sendmsg+0x30/0x40
<...>

The following sequence of events caused the BUG:

1. During ip6gretap device initialization, tunnel->tun_hlen (e.g. 4) is
   calculated based on old flags (see ip6gre_calc_hlen());
2. packet_snd() reserves header room for skb A, assuming
   tunnel->tun_hlen is 4;
3. Later (in clsact Qdisc), the eBPF program sets a new tunnel key for
   skb A using bpf_skb_set_tunnel_key() (see _ip6gretap_set_tunnel());
4. __gre6_xmit() detects the new tunnel key, and recalculates
   "tun_hlen" (e.g. 12) based on new flags (e.g. TUNNEL_KEY and
   TUNNEL_SEQ);
5. gre_build_header() calls skb_push() with insufficient reserved header
   room, triggering the BUG.

As sugguested by Cong, fix it by moving the call to skb_cow_head() after
the recalculation of tun_hlen.

Reproducer:

  OBJ=$LINUX/tools/testing/selftests/bpf/test_tunnel_kern.o

  ip netns add at_ns0
  ip link add veth0 type veth peer name veth1
  ip link set veth0 netns at_ns0
  ip netns exec at_ns0 ip addr add 172.16.1.100/24 dev veth0
  ip netns exec at_ns0 ip link set dev veth0 up
  ip link set dev veth1 up mtu 1500
  ip addr add dev veth1 172.16.1.200/24

  ip netns exec at_ns0 ip addr add ::11/96 dev veth0
  ip netns exec at_ns0 ip link set dev veth0 up
  ip addr add dev veth1 ::22/96
  ip link set dev veth1 up

  ip netns exec at_ns0 \
  	ip link add dev ip6gretap00 type ip6gretap seq flowlabel 0xbcdef key 2 \
  	local ::11 remote ::22

  ip netns exec at_ns0 ip addr add dev ip6gretap00 10.1.1.100/24
  ip netns exec at_ns0 ip addr add dev ip6gretap00 fc80::100/96
  ip netns exec at_ns0 ip link set dev ip6gretap00 up

  ip link add dev ip6gretap11 type ip6gretap external
  ip addr add dev ip6gretap11 10.1.1.200/24
  ip addr add dev ip6gretap11 fc80::200/24
  ip link set dev ip6gretap11 up

  tc qdisc add dev ip6gretap11 clsact
  tc filter add dev ip6gretap11 egress bpf da obj $OBJ sec ip6gretap_set_tunnel
  tc filter add dev ip6gretap11 ingress bpf da obj $OBJ sec ip6gretap_get_tunnel

  ping6 -c 3 -w 10 -q ::11

Fixes: 6712abc168eb ("ip6_gre: add ip6 gre and gretap collect_md mode")
Reported-by: Feng Zhou <zhoufeng.zf@bytedance.com>
Co-developed-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Peilin Ye <peilin.ye@bytedance.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 net/ipv6/ip6_gre.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 949d6fbc1ca0..1f6c752f13b4 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -733,9 +733,6 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb,
 	else
 		fl6->daddr = tunnel->parms.raddr;
 
-	if (skb_cow_head(skb, dev->needed_headroom ?: tunnel->hlen))
-		return -ENOMEM;
-
 	/* Push GRE header. */
 	protocol = (dev->type == ARPHRD_ETHER) ? htons(ETH_P_TEB) : proto;
 
@@ -763,6 +760,9 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb,
 			(TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ);
 		tun_hlen = gre_calc_hlen(flags);
 
+		if (skb_cow_head(skb, dev->needed_headroom ?: tun_hlen + tunnel->encap_hlen))
+			return -ENOMEM;
+
 		gre_build_header(skb, tun_hlen,
 				 flags, protocol,
 				 tunnel_id_to_key32(tun_info->key.tun_id),
@@ -773,6 +773,9 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb,
 		if (tunnel->parms.o_flags & TUNNEL_SEQ)
 			tunnel->o_seqno++;
 
+		if (skb_cow_head(skb, dev->needed_headroom ?: tunnel->hlen))
+			return -ENOMEM;
+
 		gre_build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags,
 				 protocol, tunnel->parms.o_key,
 				 htonl(tunnel->o_seqno));
-- 
Gitee


From cf223164578f4184522fc77982b2d795305ba27e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 26 Jul 2022 17:38:35 +0800
Subject: [PATCH 583/674] net/sched: cls_u32: fix possible leak in
 u32_init_knode()

stable inclusion
from stable-v5.10.113
commit 0ac8f83d8f6471e39b3cfe8ee48560df3bf9c451
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=0ac8f83d8f6471e39b3cfe8ee48560df3bf9c451

--------------------------------

[ Upstream commit ec5b0f605b105457f257f2870acad4a5d463984b ]

While investigating a related syzbot report,
I found that whenever call to tcf_exts_init()
from u32_init_knode() is failing, we end up
with an elevated refcount on ht->refcnt

To avoid that, only increase the refcount after
all possible errors have been evaluated.

Fixes: b9a24bb76bf6 ("net_sched: properly handle failure case of tcf_exts_init()")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: Jiri Pirko <jiri@resnulli.us>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 net/sched/cls_u32.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index b61db335c49d..da042bc8b239 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -814,10 +814,6 @@ static struct tc_u_knode *u32_init_knode(struct net *net, struct tcf_proto *tp,
 	new->flags = n->flags;
 	RCU_INIT_POINTER(new->ht_down, ht);
 
-	/* bump reference count as long as we hold pointer to structure */
-	if (ht)
-		ht->refcnt++;
-
 #ifdef CONFIG_CLS_U32_PERF
 	/* Statistics may be incremented by readers during update
 	 * so we must keep them in tact. When the node is later destroyed
@@ -839,6 +835,10 @@ static struct tc_u_knode *u32_init_knode(struct net *net, struct tcf_proto *tp,
 		return NULL;
 	}
 
+	/* bump reference count as long as we hold pointer to structure */
+	if (ht)
+		ht->refcnt++;
+
 	return new;
 }
 
-- 
Gitee


From aa8ab17792740416d50918eaf63bc38ed591fbc2 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@kernel.org>
Date: Tue, 26 Jul 2022 17:38:36 +0800
Subject: [PATCH 584/674] l3mdev: l3mdev_master_upper_ifindex_by_index_rcu
 should be using netdev_master_upper_dev_get_rcu

stable inclusion
from stable-v5.10.113
commit 078d839f11acfa9bf5b549d2189abda9f2a943a0
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=078d839f11acfa9bf5b549d2189abda9f2a943a0

--------------------------------

[ Upstream commit 83daab06252ee5d0e1f4373ff28b79304945fc19 ]

Next patch uses l3mdev_master_upper_ifindex_by_index_rcu which throws
a splat with debug kernels:

[13783.087570] ------------[ cut here ]------------
[13783.093974] RTNL: assertion failed at net/core/dev.c (6702)
[13783.100761] WARNING: CPU: 3 PID: 51132 at net/core/dev.c:6702 netdev_master_upper_dev_get+0x16a/0x1a0

[13783.184226] CPU: 3 PID: 51132 Comm: kworker/3:3 Not tainted 5.17.0-custom-100090-g6f963aafb1cc #682
[13783.194788] Hardware name: Mellanox Technologies Ltd. MSN2010/SA002610, BIOS 5.6.5 08/24/2017
[13783.204755] Workqueue: mld mld_ifc_work [ipv6]
[13783.210338] RIP: 0010:netdev_master_upper_dev_get+0x16a/0x1a0
[13783.217209] Code: 0f 85 e3 fe ff ff e8 65 ac ec fe ba 2e 1a 00 00 48 c7 c6 60 6f 38 83 48 c7 c7 c0 70 38 83 c6 05 5e b5 d7 01 01 e8 c6 29 52 00 <0f> 0b e9 b8 fe ff ff e8 5a 6c 35 ff e9 1c ff ff ff 48 89 ef e8 7d
[13783.238659] RSP: 0018:ffffc9000b37f5a8 EFLAGS: 00010286
[13783.244995] RAX: 0000000000000000 RBX: ffff88812ee5c000 RCX: 0000000000000000
[13783.253379] RDX: ffff88811ce09d40 RSI: ffffffff812d0fcd RDI: fffff5200166fea7
[13783.261769] RBP: 0000000000000000 R08: 0000000000000001 R09: ffff8882375f4287
[13783.270138] R10: ffffed1046ebe850 R11: 0000000000000001 R12: dffffc0000000000
[13783.278510] R13: 0000000000000275 R14: ffffc9000b37f688 R15: ffff8881273b4af8
[13783.286870] FS:  0000000000000000(0000) GS:ffff888237400000(0000) knlGS:0000000000000000
[13783.296352] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[13783.303177] CR2: 00007ff25fc9b2e8 CR3: 0000000174d23000 CR4: 00000000001006e0
[13783.311546] Call Trace:
[13783.314660]  <TASK>
[13783.317553]  l3mdev_master_upper_ifindex_by_index_rcu+0x43/0xe0
...

Change l3mdev_master_upper_ifindex_by_index_rcu to use
netdev_master_upper_dev_get_rcu.

Fixes: 6a6d6681ac1a ("l3mdev: add function to retreive upper master")
Signed-off-by: Ido Schimmel <idosch@idosch.org>
Signed-off-by: David Ahern <dsahern@kernel.org>
Cc: Alexis Bauvin <abauvin@scaleway.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 net/l3mdev/l3mdev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c
index 864326f150e2..f2c3a61ad134 100644
--- a/net/l3mdev/l3mdev.c
+++ b/net/l3mdev/l3mdev.c
@@ -147,7 +147,7 @@ int l3mdev_master_upper_ifindex_by_index_rcu(struct net *net, int ifindex)
 
 	dev = dev_get_by_index_rcu(net, ifindex);
 	while (dev && !netif_is_l3_master(dev))
-		dev = netdev_master_upper_dev_get(dev);
+		dev = netdev_master_upper_dev_get_rcu(dev);
 
 	return dev ? dev->ifindex : 0;
 }
-- 
Gitee


From 3bd107cbe765f64dbde8b4282a01282e5a9be749 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 26 Jul 2022 17:38:37 +0800
Subject: [PATCH 585/674] ipv6: make ip6_rt_gc_expire an atomic_t

stable inclusion
from stable-v5.10.113
commit 49516e6ed91434d022a800321a8bc7d8054f62ac
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=49516e6ed91434d022a800321a8bc7d8054f62ac

--------------------------------

[ Upstream commit 9cb7c013420f98fa6fd12fc6a5dc055170c108db ]

Reads and Writes to ip6_rt_gc_expire always have been racy,
as syzbot reported lately [1]

There is a possible risk of under-flow, leading
to unexpected high value passed to fib6_run_gc(),
although I have not observed this in the field.

Hosts hitting ip6_dst_gc() very hard are under pretty bad
state anyway.

[1]
BUG: KCSAN: data-race in ip6_dst_gc / ip6_dst_gc

read-write to 0xffff888102110744 of 4 bytes by task 13165 on cpu 1:
 ip6_dst_gc+0x1f3/0x220 net/ipv6/route.c:3311
 dst_alloc+0x9b/0x160 net/core/dst.c:86
 ip6_dst_alloc net/ipv6/route.c:344 [inline]
 icmp6_dst_alloc+0xb2/0x360 net/ipv6/route.c:3261
 mld_sendpack+0x2b9/0x580 net/ipv6/mcast.c:1807
 mld_send_cr net/ipv6/mcast.c:2119 [inline]
 mld_ifc_work+0x576/0x800 net/ipv6/mcast.c:2651
 process_one_work+0x3d3/0x720 kernel/workqueue.c:2289
 worker_thread+0x618/0xa70 kernel/workqueue.c:2436
 kthread+0x1a9/0x1e0 kernel/kthread.c:376
 ret_from_fork+0x1f/0x30

read-write to 0xffff888102110744 of 4 bytes by task 11607 on cpu 0:
 ip6_dst_gc+0x1f3/0x220 net/ipv6/route.c:3311
 dst_alloc+0x9b/0x160 net/core/dst.c:86
 ip6_dst_alloc net/ipv6/route.c:344 [inline]
 icmp6_dst_alloc+0xb2/0x360 net/ipv6/route.c:3261
 mld_sendpack+0x2b9/0x580 net/ipv6/mcast.c:1807
 mld_send_cr net/ipv6/mcast.c:2119 [inline]
 mld_ifc_work+0x576/0x800 net/ipv6/mcast.c:2651
 process_one_work+0x3d3/0x720 kernel/workqueue.c:2289
 worker_thread+0x618/0xa70 kernel/workqueue.c:2436
 kthread+0x1a9/0x1e0 kernel/kthread.c:376
 ret_from_fork+0x1f/0x30

value changed: 0x00000bb3 -> 0x00000ba9

Reported by Kernel Concurrency Sanitizer on:
CPU: 0 PID: 11607 Comm: kworker/0:21 Not tainted 5.18.0-rc1-syzkaller-00037-g42e7a03d3bad-dirty #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Workqueue: mld mld_ifc_work

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20220413181333.649424-1-eric.dumazet@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>

 Conflicts:
	include/net/netns/ipv6.h
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 include/net/netns/ipv6.h |  4 ++--
 net/ipv6/route.c         | 11 ++++++-----
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index b2a28201f4fd..35e0c4a6c71f 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -79,8 +79,8 @@ struct netns_ipv6 {
 	struct dst_ops		ip6_dst_ops;
 	rwlock_t		fib6_walker_lock;
 	spinlock_t		fib6_gc_lock;
-	unsigned int		 ip6_rt_gc_expire;
-	unsigned long		 ip6_rt_last_gc;
+	atomic_t		ip6_rt_gc_expire;
+	unsigned long		ip6_rt_last_gc;
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
 	unsigned int		fib6_rules_require_fldissect;
 #endif
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index a44ad9637e8a..4ef59dc515e5 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3187,6 +3187,7 @@ static int ip6_dst_gc(struct dst_ops *ops)
 	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
 	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
 	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
+	unsigned int val;
 	int entries;
 
 	entries = dst_entries_get_fast(ops);
@@ -3197,13 +3198,13 @@ static int ip6_dst_gc(struct dst_ops *ops)
 	    entries <= rt_max_size)
 		goto out;
 
-	net->ipv6.ip6_rt_gc_expire++;
-	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
+	fib6_run_gc(atomic_inc_return(&net->ipv6.ip6_rt_gc_expire), net, true);
 	entries = dst_entries_get_slow(ops);
 	if (entries < ops->gc_thresh)
-		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
+		atomic_set(&net->ipv6.ip6_rt_gc_expire, rt_gc_timeout >> 1);
 out:
-	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
+	val = atomic_read(&net->ipv6.ip6_rt_gc_expire);
+	atomic_set(&net->ipv6.ip6_rt_gc_expire, val - (val >> rt_elasticity));
 	return entries > rt_max_size;
 }
 
@@ -6358,7 +6359,7 @@ static int __net_init ip6_route_net_init(struct net *net)
 	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
 	net->ipv6.sysctl.skip_notify_on_dev_down = 0;
 
-	net->ipv6.ip6_rt_gc_expire = 30*HZ;
+	atomic_set(&net->ipv6.ip6_rt_gc_expire, 30*HZ);
 
 	ret = 0;
 out:
-- 
Gitee


From 6007bb2484ce25d53b3df5fb1dba2aea01cef119 Mon Sep 17 00:00:00 2001
From: Xu Jia <xujia39@huawei.com>
Date: Tue, 26 Jul 2022 17:38:38 +0800
Subject: [PATCH 586/674] ipv6: fix kabi for ip6_rt_gc_expire in struct
 netns_ipv6

hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

--------------------------------

Making ip6_rt_gc_expire atomic breaks the KABI of struct netns_ipv6.
This patch uses KABI_REPLACE to fix it.

Signed-off-by: Xu Jia <xujia39@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 include/net/netns/ipv6.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index 35e0c4a6c71f..31d0cf3c7377 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -79,7 +79,7 @@ struct netns_ipv6 {
 	struct dst_ops		ip6_dst_ops;
 	rwlock_t		fib6_walker_lock;
 	spinlock_t		fib6_gc_lock;
-	atomic_t		ip6_rt_gc_expire;
+	KABI_REPLACE(unsigned int ip6_rt_gc_expire, atomic_t ip6_rt_gc_expire)
 	unsigned long		ip6_rt_last_gc;
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
 	unsigned int		fib6_rules_require_fldissect;
-- 
Gitee


From d7434e9cbc0f809add85594b97bcd8731050cfe9 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 26 Jul 2022 17:38:39 +0800
Subject: [PATCH 587/674] netlink: reset network and mac headers in
 netlink_dump()

stable inclusion
from stable-v5.10.113
commit 3d55b195747c1259c994b868a509beae947a7c6e
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=3d55b195747c1259c994b868a509beae947a7c6e

--------------------------------

[ Upstream commit 99c07327ae11e24886d552dddbe4537bfca2765d ]

netlink_dump() is allocating an skb, reserves space in it
but forgets to reset network header.

This allows a BPF program, invoked later from sk_filter()
to access uninitialized kernel memory from the reserved
space.

Theorically mac header reset could be omitted, because
it is set to a special initial value.
bpf_internal_load_pointer_neg_helper calls skb_mac_header()
without checking skb_mac_header_was_set().
Relying on skb->len not being too big seems fragile.
We also could add a sanity check in bpf_internal_load_pointer_neg_helper()
to avoid surprises in the future.

syzbot report was:

BUG: KMSAN: uninit-value in ___bpf_prog_run+0xa22b/0xb420 kernel/bpf/core.c:1637
 ___bpf_prog_run+0xa22b/0xb420 kernel/bpf/core.c:1637
 __bpf_prog_run32+0x121/0x180 kernel/bpf/core.c:1796
 bpf_dispatcher_nop_func include/linux/bpf.h:784 [inline]
 __bpf_prog_run include/linux/filter.h:626 [inline]
 bpf_prog_run include/linux/filter.h:633 [inline]
 __bpf_prog_run_save_cb+0x168/0x580 include/linux/filter.h:756
 bpf_prog_run_save_cb include/linux/filter.h:770 [inline]
 sk_filter_trim_cap+0x3bc/0x8c0 net/core/filter.c:150
 sk_filter include/linux/filter.h:905 [inline]
 netlink_dump+0xe0c/0x16c0 net/netlink/af_netlink.c:2276
 netlink_recvmsg+0x1129/0x1c80 net/netlink/af_netlink.c:2002
 sock_recvmsg_nosec net/socket.c:948 [inline]
 sock_recvmsg net/socket.c:966 [inline]
 sock_read_iter+0x5a9/0x630 net/socket.c:1039
 do_iter_readv_writev+0xa7f/0xc70
 do_iter_read+0x52c/0x14c0 fs/read_write.c:786
 vfs_readv fs/read_write.c:906 [inline]
 do_readv+0x432/0x800 fs/read_write.c:943
 __do_sys_readv fs/read_write.c:1034 [inline]
 __se_sys_readv fs/read_write.c:1031 [inline]
 __x64_sys_readv+0xe5/0x120 fs/read_write.c:1031
 do_syscall_x64 arch/x86/entry/common.c:51 [inline]
 do_syscall_64+0x54/0xd0 arch/x86/entry/common.c:81
 entry_SYSCALL_64_after_hwframe+0x44/0xae

Uninit was stored to memory at:
 ___bpf_prog_run+0x96c/0xb420 kernel/bpf/core.c:1558
 __bpf_prog_run32+0x121/0x180 kernel/bpf/core.c:1796
 bpf_dispatcher_nop_func include/linux/bpf.h:784 [inline]
 __bpf_prog_run include/linux/filter.h:626 [inline]
 bpf_prog_run include/linux/filter.h:633 [inline]
 __bpf_prog_run_save_cb+0x168/0x580 include/linux/filter.h:756
 bpf_prog_run_save_cb include/linux/filter.h:770 [inline]
 sk_filter_trim_cap+0x3bc/0x8c0 net/core/filter.c:150
 sk_filter include/linux/filter.h:905 [inline]
 netlink_dump+0xe0c/0x16c0 net/netlink/af_netlink.c:2276
 netlink_recvmsg+0x1129/0x1c80 net/netlink/af_netlink.c:2002
 sock_recvmsg_nosec net/socket.c:948 [inline]
 sock_recvmsg net/socket.c:966 [inline]
 sock_read_iter+0x5a9/0x630 net/socket.c:1039
 do_iter_readv_writev+0xa7f/0xc70
 do_iter_read+0x52c/0x14c0 fs/read_write.c:786
 vfs_readv fs/read_write.c:906 [inline]
 do_readv+0x432/0x800 fs/read_write.c:943
 __do_sys_readv fs/read_write.c:1034 [inline]
 __se_sys_readv fs/read_write.c:1031 [inline]
 __x64_sys_readv+0xe5/0x120 fs/read_write.c:1031
 do_syscall_x64 arch/x86/entry/common.c:51 [inline]
 do_syscall_64+0x54/0xd0 arch/x86/entry/common.c:81
 entry_SYSCALL_64_after_hwframe+0x44/0xae

Uninit was created at:
 slab_post_alloc_hook mm/slab.h:737 [inline]
 slab_alloc_node mm/slub.c:3244 [inline]
 __kmalloc_node_track_caller+0xde3/0x14f0 mm/slub.c:4972
 kmalloc_reserve net/core/skbuff.c:354 [inline]
 __alloc_skb+0x545/0xf90 net/core/skbuff.c:426
 alloc_skb include/linux/skbuff.h:1158 [inline]
 netlink_dump+0x30f/0x16c0 net/netlink/af_netlink.c:2242
 netlink_recvmsg+0x1129/0x1c80 net/netlink/af_netlink.c:2002
 sock_recvmsg_nosec net/socket.c:948 [inline]
 sock_recvmsg net/socket.c:966 [inline]
 sock_read_iter+0x5a9/0x630 net/socket.c:1039
 do_iter_readv_writev+0xa7f/0xc70
 do_iter_read+0x52c/0x14c0 fs/read_write.c:786
 vfs_readv fs/read_write.c:906 [inline]
 do_readv+0x432/0x800 fs/read_write.c:943
 __do_sys_readv fs/read_write.c:1034 [inline]
 __se_sys_readv fs/read_write.c:1031 [inline]
 __x64_sys_readv+0xe5/0x120 fs/read_write.c:1031
 do_syscall_x64 arch/x86/entry/common.c:51 [inline]
 do_syscall_64+0x54/0xd0 arch/x86/entry/common.c:81
 entry_SYSCALL_64_after_hwframe+0x44/0xae

CPU: 0 PID: 3470 Comm: syz-executor751 Not tainted 5.17.0-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011

Fixes: db65a3aaf29e ("netlink: Trim skb to alloc size to avoid MSG_TRUNC")
Fixes: 9063e21fb026 ("netlink: autosize skb lengthes")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Link: https://lore.kernel.org/r/20220415181442.551228-1-eric.dumazet@gmail.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 net/netlink/af_netlink.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index f37916156ca5..cbfb601c4ee9 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -2276,6 +2276,13 @@ static int netlink_dump(struct sock *sk)
 	 * single netdev. The outcome is MSG_TRUNC error.
 	 */
 	skb_reserve(skb, skb_tailroom(skb) - alloc_size);
+
+	/* Make sure malicious BPF programs can not read unitialized memory
+	 * from skb->head -> skb->data
+	 */
+	skb_reset_network_header(skb);
+	skb_reset_mac_header(skb);
+
 	netlink_skb_set_owner_r(skb, sk);
 
 	if (nlk->dump_done_errno > 0) {
-- 
Gitee


From ceaf8ddd59c3450db33261efc14a28835bafb9ce Mon Sep 17 00:00:00 2001
From: Kevin Hao <haokexin@gmail.com>
Date: Tue, 26 Jul 2022 17:38:40 +0800
Subject: [PATCH 588/674] net: stmmac: Use readl_poll_timeout_atomic() in
 atomic state

stable inclusion
from stable-v5.10.113
commit f593f49fcd174ebe0ebeebecf46fadc134593ddb
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f593f49fcd174ebe0ebeebecf46fadc134593ddb

--------------------------------

[ Upstream commit 234901de2bc6847eaa0aeb4aba62c31ffb8d3ad6 ]

The init_systime() may be invoked in atomic state. We have observed the
following call trace when running "phc_ctl /dev/ptp0 set" on a Intel
Agilex board.
  BUG: sleeping function called from invalid context at drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c:74
  in_atomic(): 1, irqs_disabled(): 128, non_block: 0, pid: 381, name: phc_ctl
  preempt_count: 1, expected: 0
  RCU nest depth: 0, expected: 0
  Preemption disabled at:
  [<ffff80000892ef78>] stmmac_set_time+0x34/0x8c
  CPU: 2 PID: 381 Comm: phc_ctl Not tainted 5.18.0-rc2-next-20220414-yocto-standard+ #567
  Hardware name: SoCFPGA Agilex SoCDK (DT)
  Call trace:
   dump_backtrace.part.0+0xc4/0xd0
   show_stack+0x24/0x40
   dump_stack_lvl+0x7c/0xa0
   dump_stack+0x18/0x34
   __might_resched+0x154/0x1c0
   __might_sleep+0x58/0x90
   init_systime+0x78/0x120
   stmmac_set_time+0x64/0x8c
   ptp_clock_settime+0x60/0x9c
   pc_clock_settime+0x6c/0xc0
   __arm64_sys_clock_settime+0x88/0xf0
   invoke_syscall+0x5c/0x130
   el0_svc_common.constprop.0+0x4c/0x100
   do_el0_svc+0x7c/0xa0
   el0_svc+0x58/0xcc
   el0t_64_sync_handler+0xa4/0x130
   el0t_64_sync+0x18c/0x190

So we should use readl_poll_timeout_atomic() here instead of
readl_poll_timeout().

Also adjust the delay time to 10us to fix a "__bad_udelay" build error
reported by "kernel test robot <lkp@intel.com>". I have tested this on
Intel Agilex and NXP S32G boards, there is no delay needed at all.
So the 10us delay should be long enough for most cases.

Fixes: ff8ed737860e ("net: stmmac: use readl_poll_timeout() function in init_systime()")
Signed-off-by: Kevin Hao <haokexin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c
index 07b1b8374cd2..53efcc9c40e2 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c
@@ -68,9 +68,9 @@ static int init_systime(void __iomem *ioaddr, u32 sec, u32 nsec)
 	writel(value, ioaddr + PTP_TCR);
 
 	/* wait for present system time initialize to complete */
-	return readl_poll_timeout(ioaddr + PTP_TCR, value,
+	return readl_poll_timeout_atomic(ioaddr + PTP_TCR, value,
 				 !(value & PTP_TCR_TSINIT),
-				 10000, 100000);
+				 10, 100000);
 }
 
 static int config_addend(void __iomem *ioaddr, u32 addend)
-- 
Gitee


From 1d394f14128d12cf5b6313d895622aad5fd5a5ba Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Tue, 26 Jul 2022 17:38:41 +0800
Subject: [PATCH 589/674] dmaengine: idxd: add RO check for wq max_batch_size
 write

stable inclusion
from stable-v5.10.113
commit 9a3c026dc3a59fa23c63d90752f07538a3c75646
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=9a3c026dc3a59fa23c63d90752f07538a3c75646

--------------------------------

[ Upstream commit 66903461ffed0b66fc3e0200082d4e09365aacdc ]

Block wq_max_batch_size_store() when the device is configured as read-only
and not configurable.

Fixes: e7184b159dd3 ("dmaengine: idxd: add support for configurable max wq batch size")
Reported-by: Bernice Zhang <bernice.zhang@intel.com>
Tested-by: Bernice Zhang <bernice.zhang@intel.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Link: https://lore.kernel.org/r/164971493551.2201159.1942042593642155209.stgit@djiang5-desk3.ch.intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/dma/idxd/sysfs.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c
index 7b41cdff1a2c..5bf4b4be64e4 100644
--- a/drivers/dma/idxd/sysfs.c
+++ b/drivers/dma/idxd/sysfs.c
@@ -1132,6 +1132,9 @@ static ssize_t wq_max_batch_size_store(struct device *dev, struct device_attribu
 	u64 batch_size;
 	int rc;
 
+	if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags))
+		return -EPERM;
+
 	if (wq->state != IDXD_WQ_DISABLED)
 		return -EPERM;
 
-- 
Gitee


From c324bed6bcd3c12392271dddbbbd03c23acddd8d Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Tue, 26 Jul 2022 17:38:42 +0800
Subject: [PATCH 590/674] dmaengine: idxd: add RO check for wq
 max_transfer_size write

stable inclusion
from stable-v5.10.113
commit 520aab8b723cb2bffef957844b634af04e96428d
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=520aab8b723cb2bffef957844b634af04e96428d

--------------------------------

[ Upstream commit 505a2d1032ae656b0a8c736be110255503941cde ]

Block wq_max_transfer_size_store() when the device is configured as
read-only and not configurable.

Fixes: d7aad5550eca ("dmaengine: idxd: add support for configurable max wq xfer size")
Reported-by: Bernice Zhang <bernice.zhang@intel.com>
Tested-by: Bernice Zhang <bernice.zhang@intel.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Link: https://lore.kernel.org/r/164971488154.2200913.10706665404118545941.stgit@djiang5-desk3.ch.intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/dma/idxd/sysfs.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c
index 5bf4b4be64e4..51af0dfc3c63 100644
--- a/drivers/dma/idxd/sysfs.c
+++ b/drivers/dma/idxd/sysfs.c
@@ -1098,6 +1098,9 @@ static ssize_t wq_max_transfer_size_store(struct device *dev, struct device_attr
 	u64 xfer_size;
 	int rc;
 
+	if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags))
+		return -EPERM;
+
 	if (wq->state != IDXD_WQ_DISABLED)
 		return -EPERM;
 
-- 
Gitee


From aec8147faa831c7bd51628f95d992afe0942c107 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Tue, 26 Jul 2022 17:38:43 +0800
Subject: [PATCH 591/674] selftests: mlxsw: vxlan_flooding: Prevent flooding of
 unwanted packets

stable inclusion
from stable-v5.10.113
commit 3bf8ca35017024fa1cad55344f798cd5cd131c16
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=3bf8ca35017024fa1cad55344f798cd5cd131c16

--------------------------------

[ Upstream commit 044011fdf162c5dd61c02841930c8f438a9adadb ]

The test verifies that packets are correctly flooded by the bridge and
the VXLAN device by matching on the encapsulated packets at the other
end. However, if packets other than those generated by the test also
ingress the bridge (e.g., MLD packets), they will be flooded as well and
interfere with the expected count.

Make the test more robust by making sure that only the packets generated
by the test can ingress the bridge. Drop all the rest using tc filters
on the egress of 'br0' and 'h1'.

In the software data path, the problem can be solved by matching on the
inner destination MAC or dropping unwanted packets at the egress of the
VXLAN device, but this is not currently supported by mlxsw.

Fixes: 94d302deae25 ("selftests: mlxsw: Add a test for VxLAN flooding")
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: Amit Cohen <amcohen@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 .../drivers/net/mlxsw/vxlan_flooding.sh         | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/tools/testing/selftests/drivers/net/mlxsw/vxlan_flooding.sh b/tools/testing/selftests/drivers/net/mlxsw/vxlan_flooding.sh
index fedcb7b35af9..af5ea50ed5c0 100755
--- a/tools/testing/selftests/drivers/net/mlxsw/vxlan_flooding.sh
+++ b/tools/testing/selftests/drivers/net/mlxsw/vxlan_flooding.sh
@@ -172,6 +172,17 @@ flooding_filters_add()
 	local lsb
 	local i
 
+	# Prevent unwanted packets from entering the bridge and interfering
+	# with the test.
+	tc qdisc add dev br0 clsact
+	tc filter add dev br0 egress protocol all pref 1 handle 1 \
+		matchall skip_hw action drop
+	tc qdisc add dev $h1 clsact
+	tc filter add dev $h1 egress protocol all pref 1 handle 1 \
+		flower skip_hw dst_mac de:ad:be:ef:13:37 action pass
+	tc filter add dev $h1 egress protocol all pref 2 handle 2 \
+		matchall skip_hw action drop
+
 	tc qdisc add dev $rp2 clsact
 
 	for i in $(eval echo {1..$num_remotes}); do
@@ -194,6 +205,12 @@ flooding_filters_del()
 	done
 
 	tc qdisc del dev $rp2 clsact
+
+	tc filter del dev $h1 egress protocol all pref 2 handle 2 matchall
+	tc filter del dev $h1 egress protocol all pref 1 handle 1 flower
+	tc qdisc del dev $h1 clsact
+	tc filter del dev br0 egress protocol all pref 1 handle 1 matchall
+	tc qdisc del dev br0 clsact
 }
 
 flooding_check_packets()
-- 
Gitee


From a260b077acedec14a79fb4dc4e06619ac344af7f Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <anshuman.khandual@arm.com>
Date: Tue, 26 Jul 2022 17:38:44 +0800
Subject: [PATCH 592/674] arm64/mm: Remove [PUD|PMD]_TABLE_BIT from
 [pud|pmd]_bad()

stable inclusion
from stable-v5.10.113
commit 18ff7a2efa4e14a6bd86aad31a6a8cebdf0dbabc
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=18ff7a2efa4e14a6bd86aad31a6a8cebdf0dbabc

--------------------------------

[ Upstream commit e377ab82311af95c99648c6424a6b888a0ccb102 ]

Semantics wise, [pud|pmd]_bad() have always implied that a given [PUD|PMD]
entry does not have a pointer to the next level page table. This had been
made clear in the commit a1c76574f345 ("arm64: mm: use *_sect to check for
section maps"). Hence explicitly check for a table entry rather than just
testing a single bit. This basically redefines [pud|pmd]_bad() in terms of
[pud|pmd]_table() making the semantics clear.

Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Link: https://lore.kernel.org/r/1620644871-26280-1-git-send-email-anshuman.khandual@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 arch/arm64/include/asm/pgtable.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index daed33697d98..19d18bca6958 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -516,13 +516,12 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
 
 #define pmd_none(pmd)		(!pmd_val(pmd))
 
-#define pmd_bad(pmd)		(!(pmd_val(pmd) & PMD_TABLE_BIT))
-
 #define pmd_table(pmd)		((pmd_val(pmd) & PMD_TYPE_MASK) == \
 				 PMD_TYPE_TABLE)
 #define pmd_sect(pmd)		((pmd_val(pmd) & PMD_TYPE_MASK) == \
 				 PMD_TYPE_SECT)
 #define pmd_leaf(pmd)		pmd_sect(pmd)
+#define pmd_bad(pmd)		(!pmd_table(pmd))
 
 #if defined(CONFIG_ARM64_64K_PAGES) || CONFIG_PGTABLE_LEVELS < 3
 static inline bool pud_sect(pud_t pud) { return false; }
@@ -606,7 +605,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
 	pr_err("%s:%d: bad pmd %016llx.\n", __FILE__, __LINE__, pmd_val(e))
 
 #define pud_none(pud)		(!pud_val(pud))
-#define pud_bad(pud)		(!(pud_val(pud) & PUD_TABLE_BIT))
+#define pud_bad(pud)		(!pud_table(pud))
 #define pud_present(pud)	pte_present(pud_pte(pud))
 #define pud_leaf(pud)		pud_sect(pud)
 #define pud_valid(pud)		pte_valid(pud_pte(pud))
-- 
Gitee


From 286861f72d40a9d40e51210f620f52c090483b8a Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Tue, 26 Jul 2022 17:38:45 +0800
Subject: [PATCH 593/674] arm64: mm: fix p?d_leaf()

stable inclusion
from stable-v5.10.113
commit 052e4a661f90d27586b4364764dcfb2c022b00ee
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=052e4a661f90d27586b4364764dcfb2c022b00ee

--------------------------------

[ Upstream commit 23bc8f69f0eceecbb87c3801d2e48827d2dca92b ]

The pmd_leaf() is used to test a leaf mapped PMD, however, it misses
the PROT_NONE mapped PMD on arm64.  Fix it.  A real world issue [1]
caused by this was reported by Qian Cai. Also fix pud_leaf().

Link: https://patchwork.kernel.org/comment/24798260/ [1]
Fixes: 8aa82df3c123 ("arm64: mm: add p?d_leaf() definitions")
Reported-by: Qian Cai <quic_qiancai@quicinc.com>
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Link: https://lore.kernel.org/r/20220422060033.48711-1-songmuchun@bytedance.com
Signed-off-by: Will Deacon <will@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 arch/arm64/include/asm/pgtable.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 19d18bca6958..ab2443900f4e 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -520,7 +520,7 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
 				 PMD_TYPE_TABLE)
 #define pmd_sect(pmd)		((pmd_val(pmd) & PMD_TYPE_MASK) == \
 				 PMD_TYPE_SECT)
-#define pmd_leaf(pmd)		pmd_sect(pmd)
+#define pmd_leaf(pmd)		(pmd_present(pmd) && !pmd_table(pmd))
 #define pmd_bad(pmd)		(!pmd_table(pmd))
 
 #if defined(CONFIG_ARM64_64K_PAGES) || CONFIG_PGTABLE_LEVELS < 3
@@ -607,7 +607,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
 #define pud_none(pud)		(!pud_val(pud))
 #define pud_bad(pud)		(!pud_table(pud))
 #define pud_present(pud)	pte_present(pud_pte(pud))
-#define pud_leaf(pud)		pud_sect(pud)
+#define pud_leaf(pud)		(pud_present(pud) && !pud_table(pud))
 #define pud_valid(pud)		pte_valid(pud_pte(pud))
 
 static inline void set_pud(pud_t *pudp, pud_t pud)
-- 
Gitee


From be31be301f9aa6457bb1cd1ac8ba374db3ecba79 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Tue, 26 Jul 2022 17:38:46 +0800
Subject: [PATCH 594/674] ARM: vexpress/spc: Avoid negative array index when
 !SMP

stable inclusion
from stable-v5.10.113
commit d513ea9b7ef822b638857c7114d26af108e4bab1
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=d513ea9b7ef822b638857c7114d26af108e4bab1

--------------------------------

[ Upstream commit b3f1dd52c991d79118f35e6d1bf4d7cb09882e38 ]

When building multi_v7_defconfig+CONFIG_SMP=n, -Warray-bounds exposes
a couple negative array index accesses:

arch/arm/mach-vexpress/spc.c: In function 've_spc_clk_init':
arch/arm/mach-vexpress/spc.c:583:21: warning: array subscript -1 is below array bounds of 'bool[2]' {aka '_Bool[2]'} [-Warray-bounds]
  583 |   if (init_opp_table[cluster])
      |       ~~~~~~~~~~~~~~^~~~~~~~~
arch/arm/mach-vexpress/spc.c:556:7: note: while referencing 'init_opp_table'
  556 |  bool init_opp_table[MAX_CLUSTERS] = { false };
      |       ^~~~~~~~~~~~~~
arch/arm/mach-vexpress/spc.c:592:18: warning: array subscript -1 is below array bounds of 'bool[2]' {aka '_Bool[2]'} [-Warray-bounds]
  592 |    init_opp_table[cluster] = true;
      |    ~~~~~~~~~~~~~~^~~~~~~~~
arch/arm/mach-vexpress/spc.c:556:7: note: while referencing 'init_opp_table'
  556 |  bool init_opp_table[MAX_CLUSTERS] = { false };
      |       ^~~~~~~~~~~~~~

Skip this logic when built !SMP.

Link: https://lore.kernel.org/r/20220331190443.851661-1-keescook@chromium.org
Cc: Liviu Dudau <liviu.dudau@arm.com>
Cc: Sudeep Holla <sudeep.holla@arm.com>
Cc: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Cc: Russell King <linux@armlinux.org.uk>
Cc: linux-arm-kernel@lists.infradead.org
Acked-by: Liviu Dudau <liviu.dudau@arm.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 arch/arm/mach-vexpress/spc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm/mach-vexpress/spc.c b/arch/arm/mach-vexpress/spc.c
index 1da11bdb1dfb..1c6500c4e6a1 100644
--- a/arch/arm/mach-vexpress/spc.c
+++ b/arch/arm/mach-vexpress/spc.c
@@ -580,7 +580,7 @@ static int __init ve_spc_clk_init(void)
 		}
 
 		cluster = topology_physical_package_id(cpu_dev->id);
-		if (init_opp_table[cluster])
+		if (cluster < 0 || init_opp_table[cluster])
 			continue;
 
 		if (ve_init_opp_table(cpu_dev))
-- 
Gitee


From c8a5122e572911c692240f7a894883779217a520 Mon Sep 17 00:00:00 2001
From: Sameer Pujar <spujar@nvidia.com>
Date: Tue, 26 Jul 2022 17:38:47 +0800
Subject: [PATCH 595/674] reset: tegra-bpmp: Restore Handle errors in BPMP
 response

stable inclusion
from stable-v5.10.113
commit cb17b56a9b4de46c057691b524c692c10e8d9d51
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=cb17b56a9b4de46c057691b524c692c10e8d9d51

--------------------------------

[ Upstream commit d1da1052ffad63aa5181b69f20a6952e31f339c2 ]

This reverts following commit 69125b4b9440 ("reset: tegra-bpmp: Revert
Handle errors in BPMP response").

The Tegra194 HDA reset failure is fixed by commit d278dc9151a0 ("ALSA:
hda/tegra: Fix Tegra194 HDA reset failure"). The temporary revert of
original commit c045ceb5a145 ("reset: tegra-bpmp: Handle errors in BPMP
response") can be removed now.

Signed-off-by: Sameer Pujar <spujar@nvidia.com>
Tested-by: Jon Hunter <jonathanh@nvidia.com>
Reviewed-by: Jon Hunter <jonathanh@nvidia.com>
Acked-by: Thierry Reding <treding@nvidia.com>
Signed-off-by: Philipp Zabel <p.zabel@pengutronix.de>
Link: https://lore.kernel.org/r/1641995806-15245-1-git-send-email-spujar@nvidia.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/reset/tegra/reset-bpmp.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/reset/tegra/reset-bpmp.c b/drivers/reset/tegra/reset-bpmp.c
index 24d3395964cc..4c5bba52b105 100644
--- a/drivers/reset/tegra/reset-bpmp.c
+++ b/drivers/reset/tegra/reset-bpmp.c
@@ -20,6 +20,7 @@ static int tegra_bpmp_reset_common(struct reset_controller_dev *rstc,
 	struct tegra_bpmp *bpmp = to_tegra_bpmp(rstc);
 	struct mrq_reset_request request;
 	struct tegra_bpmp_message msg;
+	int err;
 
 	memset(&request, 0, sizeof(request));
 	request.cmd = command;
@@ -30,7 +31,13 @@ static int tegra_bpmp_reset_common(struct reset_controller_dev *rstc,
 	msg.tx.data = &request;
 	msg.tx.size = sizeof(request);
 
-	return tegra_bpmp_transfer(bpmp, &msg);
+	err = tegra_bpmp_transfer(bpmp, &msg);
+	if (err)
+		return err;
+	if (msg.rx.ret)
+		return -EINVAL;
+
+	return 0;
 }
 
 static int tegra_bpmp_reset_module(struct reset_controller_dev *rstc,
-- 
Gitee


From f0894049e9ec662a3fbb60d84f3124663a1efb9b Mon Sep 17 00:00:00 2001
From: Jiapeng Chong <jiapeng.chong@linux.alibaba.com>
Date: Tue, 26 Jul 2022 17:38:48 +0800
Subject: [PATCH 596/674] platform/x86: samsung-laptop: Fix an unsigned
 comparison which can never be negative

stable inclusion
from stable-v5.10.113
commit 490815f0b50e42238a22ea4b2dc64105f5a59e6f
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=490815f0b50e42238a22ea4b2dc64105f5a59e6f

--------------------------------

[ Upstream commit 0284d4d1be753f648f28b77bdfbe6a959212af5c ]

Eliminate the follow smatch warnings:

drivers/platform/x86/samsung-laptop.c:1124 kbd_led_set() warn: unsigned
'value' is never less than zero.

Reported-by: Abaci Robot <abaci@linux.alibaba.com>
Signed-off-by: Jiapeng Chong <jiapeng.chong@linux.alibaba.com>
Link: https://lore.kernel.org/r/20220322061830.105579-1-jiapeng.chong@linux.alibaba.com
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/platform/x86/samsung-laptop.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/platform/x86/samsung-laptop.c b/drivers/platform/x86/samsung-laptop.c
index d5cec6e35bb8..0e456c39a603 100644
--- a/drivers/platform/x86/samsung-laptop.c
+++ b/drivers/platform/x86/samsung-laptop.c
@@ -1121,8 +1121,6 @@ static void kbd_led_set(struct led_classdev *led_cdev,
 
 	if (value > samsung->kbd_led.max_brightness)
 		value = samsung->kbd_led.max_brightness;
-	else if (value < 0)
-		value = 0;
 
 	samsung->kbd_led_wk = value;
 	queue_work(samsung->led_workqueue, &samsung->kbd_led_work);
-- 
Gitee


From 45f7650be7ed083d437c26a46a5148ee50c5641b Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Tue, 26 Jul 2022 17:38:49 +0800
Subject: [PATCH 597/674] ALSA: usb-audio: Fix undefined behavior due to shift
 overflowing the constant
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

stable inclusion
from stable-v5.10.113
commit cd227ac03f2aa918ee3672fe2a5960dd0e7be331
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=cd227ac03f2aa918ee3672fe2a5960dd0e7be331

--------------------------------

[ Upstream commit 1ef8715975de8bd481abbd0839ed4f49d9e5b0ff ]

Fix:

  sound/usb/midi.c: In function ‘snd_usbmidi_out_endpoint_create’:
  sound/usb/midi.c:1389:2: error: case label does not reduce to an integer constant
    case USB_ID(0xfc08, 0x0101): /* Unknown vendor Cable */
    ^~~~

See https://lore.kernel.org/r/YkwQ6%2BtIH8GQpuct@zn.tnic for the gory
details as to why it triggers with older gccs only.

[ A slight correction with parentheses around the argument by tiwai ]

Signed-off-by: Borislav Petkov <bp@suse.de>
Link: https://lore.kernel.org/r/20220405151517.29753-3-bp@alien8.de
Signed-off-by: Takashi Iwai <tiwai@suse.de>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 sound/usb/usbaudio.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sound/usb/usbaudio.h b/sound/usb/usbaudio.h
index e54a98f46549..d8e31ee03b9d 100644
--- a/sound/usb/usbaudio.h
+++ b/sound/usb/usbaudio.h
@@ -8,7 +8,7 @@
  */
 
 /* handling of USB vendor/product ID pairs as 32-bit numbers */
-#define USB_ID(vendor, product) (((vendor) << 16) | (product))
+#define USB_ID(vendor, product) (((unsigned int)(vendor) << 16) | (product))
 #define USB_ID_VENDOR(id) ((id) >> 16)
 #define USB_ID_PRODUCT(id) ((u16)(id))
 
-- 
Gitee


From bade8d4a2dbd0d43173f01e1a0ec525814c13f44 Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Tue, 26 Jul 2022 17:38:50 +0800
Subject: [PATCH 598/674] arm64: dts: imx: Fix imx8*-var-som touchscreen
 property sizes

stable inclusion
from stable-v5.10.113
commit 8e7ea11364758d43e577b7835b8e98f27927d56c
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=8e7ea11364758d43e577b7835b8e98f27927d56c

--------------------------------

[ Upstream commit 1bc12d301594eafde0a8529d28d459af81053b3a ]

The common touchscreen properties are all 32-bit, not 16-bit. These
properties must not be too important as they are all ignored in case of an
error reading them.

Signed-off-by: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/Yk3moe6Hz8ELM0iS@robh.at.kernel.org'
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 arch/arm64/boot/dts/freescale/imx8mm-var-som.dtsi | 8 ++++----
 arch/arm64/boot/dts/freescale/imx8mn-var-som.dtsi | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/arch/arm64/boot/dts/freescale/imx8mm-var-som.dtsi b/arch/arm64/boot/dts/freescale/imx8mm-var-som.dtsi
index 49082529764f..0fac1f3f7f47 100644
--- a/arch/arm64/boot/dts/freescale/imx8mm-var-som.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx8mm-var-som.dtsi
@@ -89,12 +89,12 @@ touchscreen@0 {
 		pendown-gpio = <&gpio1 3 GPIO_ACTIVE_LOW>;
 
 		ti,x-min = /bits/ 16 <125>;
-		touchscreen-size-x = /bits/ 16 <4008>;
+		touchscreen-size-x = <4008>;
 		ti,y-min = /bits/ 16 <282>;
-		touchscreen-size-y = /bits/ 16 <3864>;
+		touchscreen-size-y = <3864>;
 		ti,x-plate-ohms = /bits/ 16 <180>;
-		touchscreen-max-pressure = /bits/ 16 <255>;
-		touchscreen-average-samples = /bits/ 16 <10>;
+		touchscreen-max-pressure = <255>;
+		touchscreen-average-samples = <10>;
 		ti,debounce-tol = /bits/ 16 <3>;
 		ti,debounce-rep = /bits/ 16 <1>;
 		ti,settle-delay-usec = /bits/ 16 <150>;
diff --git a/arch/arm64/boot/dts/freescale/imx8mn-var-som.dtsi b/arch/arm64/boot/dts/freescale/imx8mn-var-som.dtsi
index 7f356edf9f91..f6287f174355 100644
--- a/arch/arm64/boot/dts/freescale/imx8mn-var-som.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx8mn-var-som.dtsi
@@ -70,12 +70,12 @@ touchscreen@0 {
 		pendown-gpio = <&gpio1 3 GPIO_ACTIVE_LOW>;
 
 		ti,x-min = /bits/ 16 <125>;
-		touchscreen-size-x = /bits/ 16 <4008>;
+		touchscreen-size-x = <4008>;
 		ti,y-min = /bits/ 16 <282>;
-		touchscreen-size-y = /bits/ 16 <3864>;
+		touchscreen-size-y = <3864>;
 		ti,x-plate-ohms = /bits/ 16 <180>;
-		touchscreen-max-pressure = /bits/ 16 <255>;
-		touchscreen-average-samples = /bits/ 16 <10>;
+		touchscreen-max-pressure = <255>;
+		touchscreen-average-samples = <10>;
 		ti,debounce-tol = /bits/ 16 <3>;
 		ti,debounce-rep = /bits/ 16 <1>;
 		ti,settle-delay-usec = /bits/ 16 <150>;
-- 
Gitee


From b9a0a6d062e89d410cd33f03b17ed7840e096128 Mon Sep 17 00:00:00 2001
From: Hongbin Wang <wh_bin@126.com>
Date: Tue, 26 Jul 2022 17:38:51 +0800
Subject: [PATCH 599/674] vxlan: fix error return code in vxlan_fdb_append

stable inclusion
from stable-v5.10.113
commit e129c55153c8aba74e4c73451399f12dacf6347d
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=e129c55153c8aba74e4c73451399f12dacf6347d

--------------------------------

[ Upstream commit 7cea5560bf656b84f9ed01c0cc829d4eecd0640b ]

When kmalloc and dst_cache_init failed,
should return ENOMEM rather than ENOBUFS.

Signed-off-by: Hongbin Wang <wh_bin@126.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/net/vxlan.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index c362a6ac94c4..73953b0141ba 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -710,11 +710,11 @@ static int vxlan_fdb_append(struct vxlan_fdb *f,
 
 	rd = kmalloc(sizeof(*rd), GFP_ATOMIC);
 	if (rd == NULL)
-		return -ENOBUFS;
+		return -ENOMEM;
 
 	if (dst_cache_init(&rd->dst_cache, GFP_ATOMIC)) {
 		kfree(rd);
-		return -ENOBUFS;
+		return -ENOMEM;
 	}
 
 	rd->remote_ip = *ip;
-- 
Gitee


From 62f154bc19641ca00a8b926c5d28d3d1271eea1c Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 26 Jul 2022 17:38:52 +0800
Subject: [PATCH 600/674] cifs: Check the IOCB_DIRECT flag, not O_DIRECT

stable inclusion
from stable-v5.10.113
commit 5bef9fc38ffa3e0a712fcd1e07f4f123ba160242
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=5bef9fc38ffa3e0a712fcd1e07f4f123ba160242

--------------------------------

[ Upstream commit 994fd530a512597ffcd713b0f6d5bc916c5698f0 ]

Use the IOCB_DIRECT indicator flag on the I/O context rather than checking to
see if the file was opened O_DIRECT.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Steve French <sfrench@samba.org>
cc: Shyam Prasad N <nspmangalore@gmail.com>
cc: Rohith Surabattula <rohiths.msft@gmail.com>
cc: linux-cifs@vger.kernel.org
Signed-off-by: Steve French <stfrench@microsoft.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 fs/cifs/cifsfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index aa5a4d759ca2..370188b2a55d 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -898,7 +898,7 @@ cifs_loose_read_iter(struct kiocb *iocb, struct iov_iter *iter)
 	ssize_t rc;
 	struct inode *inode = file_inode(iocb->ki_filp);
 
-	if (iocb->ki_filp->f_flags & O_DIRECT)
+	if (iocb->ki_flags & IOCB_DIRECT)
 		return cifs_user_readv(iocb, iter);
 
 	rc = cifs_revalidate_mapping(inode);
-- 
Gitee


From 652cce7aeaa6bf232f67c210d878f48733c34045 Mon Sep 17 00:00:00 2001
From: Kai-Heng Feng <kai.heng.feng@canonical.com>
Date: Tue, 26 Jul 2022 17:38:53 +0800
Subject: [PATCH 601/674] net: atlantic: Avoid out-of-bounds indexing

stable inclusion
from stable-v5.10.113
commit 0de9c104d04aed5e464bd788abb423ed333980c7
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=0de9c104d04aed5e464bd788abb423ed333980c7

--------------------------------

[ Upstream commit 8d3a6c37d50d5a0504c126c932cc749e6dd9c78f ]

UBSAN warnings are observed on atlantic driver:
[ 294.432996] UBSAN: array-index-out-of-bounds in /build/linux-Qow4fL/linux-5.15.0/drivers/net/ethernet/aquantia/atlantic/aq_nic.c:484:48
[ 294.433695] index 8 is out of range for type 'aq_vec_s *[8]'

The ring is dereferenced right before breaking out the loop, to prevent
that from happening, only use the index in the loop to fix the issue.

BugLink: https://bugs.launchpad.net/bugs/1958770
Tested-by: Mario Limonciello <mario.limonciello@amd.com>
Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
Reviewed-by: Igor Russkikh <irusskikh@marvell.com>
Link: https://lore.kernel.org/r/20220408022204.16815-1-kai.heng.feng@canonical.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 .../net/ethernet/aquantia/atlantic/aq_nic.c   |  8 +++----
 .../net/ethernet/aquantia/atlantic/aq_vec.c   | 24 +++++++++----------
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
index 0cf8ae8aeac8..2fb4126ae8d8 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
@@ -480,8 +480,8 @@ int aq_nic_start(struct aq_nic_s *self)
 	if (err < 0)
 		goto err_exit;
 
-	for (i = 0U, aq_vec = self->aq_vec[0];
-		self->aq_vecs > i; ++i, aq_vec = self->aq_vec[i]) {
+	for (i = 0U; self->aq_vecs > i; ++i) {
+		aq_vec = self->aq_vec[i];
 		err = aq_vec_start(aq_vec);
 		if (err < 0)
 			goto err_exit;
@@ -511,8 +511,8 @@ int aq_nic_start(struct aq_nic_s *self)
 		mod_timer(&self->polling_timer, jiffies +
 			  AQ_CFG_POLLING_TIMER_INTERVAL);
 	} else {
-		for (i = 0U, aq_vec = self->aq_vec[0];
-			self->aq_vecs > i; ++i, aq_vec = self->aq_vec[i]) {
+		for (i = 0U; self->aq_vecs > i; ++i) {
+			aq_vec = self->aq_vec[i];
 			err = aq_pci_func_alloc_irq(self, i, self->ndev->name,
 						    aq_vec_isr, aq_vec,
 						    aq_vec_get_affinity_mask(aq_vec));
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_vec.c b/drivers/net/ethernet/aquantia/atlantic/aq_vec.c
index f4774cf051c9..6ab1f3212d24 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_vec.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_vec.c
@@ -43,8 +43,8 @@ static int aq_vec_poll(struct napi_struct *napi, int budget)
 	if (!self) {
 		err = -EINVAL;
 	} else {
-		for (i = 0U, ring = self->ring[0];
-			self->tx_rings > i; ++i, ring = self->ring[i]) {
+		for (i = 0U; self->tx_rings > i; ++i) {
+			ring = self->ring[i];
 			u64_stats_update_begin(&ring[AQ_VEC_RX_ID].stats.rx.syncp);
 			ring[AQ_VEC_RX_ID].stats.rx.polls++;
 			u64_stats_update_end(&ring[AQ_VEC_RX_ID].stats.rx.syncp);
@@ -182,8 +182,8 @@ int aq_vec_init(struct aq_vec_s *self, const struct aq_hw_ops *aq_hw_ops,
 	self->aq_hw_ops = aq_hw_ops;
 	self->aq_hw = aq_hw;
 
-	for (i = 0U, ring = self->ring[0];
-		self->tx_rings > i; ++i, ring = self->ring[i]) {
+	for (i = 0U; self->tx_rings > i; ++i) {
+		ring = self->ring[i];
 		err = aq_ring_init(&ring[AQ_VEC_TX_ID], ATL_RING_TX);
 		if (err < 0)
 			goto err_exit;
@@ -224,8 +224,8 @@ int aq_vec_start(struct aq_vec_s *self)
 	unsigned int i = 0U;
 	int err = 0;
 
-	for (i = 0U, ring = self->ring[0];
-		self->tx_rings > i; ++i, ring = self->ring[i]) {
+	for (i = 0U; self->tx_rings > i; ++i) {
+		ring = self->ring[i];
 		err = self->aq_hw_ops->hw_ring_tx_start(self->aq_hw,
 							&ring[AQ_VEC_TX_ID]);
 		if (err < 0)
@@ -248,8 +248,8 @@ void aq_vec_stop(struct aq_vec_s *self)
 	struct aq_ring_s *ring = NULL;
 	unsigned int i = 0U;
 
-	for (i = 0U, ring = self->ring[0];
-		self->tx_rings > i; ++i, ring = self->ring[i]) {
+	for (i = 0U; self->tx_rings > i; ++i) {
+		ring = self->ring[i];
 		self->aq_hw_ops->hw_ring_tx_stop(self->aq_hw,
 						 &ring[AQ_VEC_TX_ID]);
 
@@ -268,8 +268,8 @@ void aq_vec_deinit(struct aq_vec_s *self)
 	if (!self)
 		goto err_exit;
 
-	for (i = 0U, ring = self->ring[0];
-		self->tx_rings > i; ++i, ring = self->ring[i]) {
+	for (i = 0U; self->tx_rings > i; ++i) {
+		ring = self->ring[i];
 		aq_ring_tx_clean(&ring[AQ_VEC_TX_ID]);
 		aq_ring_rx_deinit(&ring[AQ_VEC_RX_ID]);
 	}
@@ -297,8 +297,8 @@ void aq_vec_ring_free(struct aq_vec_s *self)
 	if (!self)
 		goto err_exit;
 
-	for (i = 0U, ring = self->ring[0];
-		self->tx_rings > i; ++i, ring = self->ring[i]) {
+	for (i = 0U; self->tx_rings > i; ++i) {
+		ring = self->ring[i];
 		aq_ring_free(&ring[AQ_VEC_TX_ID]);
 		if (i < self->rx_rings)
 			aq_ring_free(&ring[AQ_VEC_RX_ID]);
-- 
Gitee


From e10c83e7cbb797c3b2d7dbe6c704fa84cebf6fc6 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Tue, 26 Jul 2022 17:38:54 +0800
Subject: [PATCH 602/674] mt76: Fix undefined behavior due to shift overflowing
 the constant
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

stable inclusion
from stable-v5.10.113
commit 202748f441488e08c730736bf5def1ae08f16811
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=202748f441488e08c730736bf5def1ae08f16811

--------------------------------

[ Upstream commit dbc2b1764734857d68425468ffa8486e97ab89df ]

Fix:

  drivers/net/wireless/mediatek/mt76/mt76x2/pci.c: In function ‘mt76x2e_probe’:
  ././include/linux/compiler_types.h:352:38: error: call to ‘__compiletime_assert_946’ \
	declared with attribute error: FIELD_PREP: mask is not constant
    _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__)

See https://lore.kernel.org/r/YkwQ6%2BtIH8GQpuct@zn.tnic for the gory
details as to why it triggers with older gccs only.

Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Felix Fietkau <nbd@nbd.name>
Cc: Lorenzo Bianconi <lorenzo.bianconi83@gmail.com>
Cc: Ryder Lee <ryder.lee@mediatek.com>
Cc: Shayne Chen <shayne.chen@mediatek.com>
Cc: Sean Wang <sean.wang@mediatek.com>
Cc: Kalle Valo <kvalo@kernel.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: linux-wireless@vger.kernel.org
Cc: netdev@vger.kernel.org
Signed-off-by: Kalle Valo <kvalo@kernel.org>
Link: https://lore.kernel.org/r/20220405151517.29753-9-bp@alien8.de
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/net/wireless/mediatek/mt76/mt76x2/pci.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2/pci.c b/drivers/net/wireless/mediatek/mt76/mt76x2/pci.c
index ecaf85b483ac..e57e49a722dc 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76x2/pci.c
+++ b/drivers/net/wireless/mediatek/mt76/mt76x2/pci.c
@@ -80,7 +80,7 @@ mt76x2e_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	mt76_rmw_field(dev, 0x15a10, 0x1f << 16, 0x9);
 
 	/* RG_SSUSB_G1_CDR_BIC_LTR = 0xf */
-	mt76_rmw_field(dev, 0x15a0c, 0xf << 28, 0xf);
+	mt76_rmw_field(dev, 0x15a0c, 0xfU << 28, 0xf);
 
 	/* RG_SSUSB_CDR_BR_PE1D = 0x3 */
 	mt76_rmw_field(dev, 0x15c58, 0x3 << 6, 0x3);
-- 
Gitee


From d89078f497c0d04cb4ec777eca7bc4a451c68b00 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@alien8.de>
Date: Tue, 26 Jul 2022 17:38:55 +0800
Subject: [PATCH 603/674] brcmfmac: sdio: Fix undefined behavior due to shift
 overflowing the constant
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

stable inclusion
from stable-v5.10.113
commit b3afe5a7fd7548fa3a818f8dd20e22db4b8db019
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=b3afe5a7fd7548fa3a818f8dd20e22db4b8db019

--------------------------------

[ Upstream commit 6fb3a5868b2117611f41e421e10e6a8c2a13039a ]

Fix:

  drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c: In function ‘brcmf_sdio_drivestrengthinit’:
  drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c:3798:2: error: case label does not reduce to an integer constant
    case SDIOD_DRVSTR_KEY(BRCM_CC_43143_CHIP_ID, 17):
    ^~~~
  drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c:3809:2: error: case label does not reduce to an integer constant
    case SDIOD_DRVSTR_KEY(BRCM_CC_43362_CHIP_ID, 13):
    ^~~~

See https://lore.kernel.org/r/YkwQ6%2BtIH8GQpuct@zn.tnic for the gory
details as to why it triggers with older gccs only.

Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Arend van Spriel <aspriel@gmail.com>
Cc: Franky Lin <franky.lin@broadcom.com>
Cc: Hante Meuleman <hante.meuleman@broadcom.com>
Cc: Kalle Valo <kvalo@kernel.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: brcm80211-dev-list.pdl@broadcom.com
Cc: netdev@vger.kernel.org
Acked-by: Arend van Spriel <arend.vanspriel@broadcom.com>
Signed-off-by: Kalle Valo <kvalo@kernel.org>
Link: https://lore.kernel.org/r/Ykx0iRlvtBnKqtbG@zn.tnic
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c
index 6d5d5c39c635..9929e90866f0 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c
@@ -557,7 +557,7 @@ enum brcmf_sdio_frmtype {
 	BRCMF_SDIO_FT_SUB,
 };
 
-#define SDIOD_DRVSTR_KEY(chip, pmu)     (((chip) << 16) | (pmu))
+#define SDIOD_DRVSTR_KEY(chip, pmu)     (((unsigned int)(chip) << 16) | (pmu))
 
 /* SDIO Pad drive strength to select value mappings */
 struct sdiod_drive_str {
-- 
Gitee


From d1c00ee05775fa5b9a20a1392fe710e785fc68c9 Mon Sep 17 00:00:00 2001
From: Lv Ruyi <lv.ruyi@zte.com.cn>
Date: Tue, 26 Jul 2022 17:38:56 +0800
Subject: [PATCH 604/674] dpaa_eth: Fix missing of_node_put in
 dpaa_get_ts_info()

stable inclusion
from stable-v5.10.113
commit 8d71edabb0abe6c8e3e1601e73f63f81783b6cc0
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=8d71edabb0abe6c8e3e1601e73f63f81783b6cc0

--------------------------------

[ Upstream commit 1a7eb80d170c28be2928433702256fe2a0bd1e0f ]

Both of of_get_parent() and of_parse_phandle() return node pointer with
refcount incremented, use of_node_put() on it to decrease refcount
when done.

Reported-by: Zeal Robot <zealci@zte.com.cn>
Signed-off-by: Lv Ruyi <lv.ruyi@zte.com.cn>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c b/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c
index 763d2c7b5fb1..5750f9a56393 100644
--- a/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c
+++ b/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c
@@ -489,11 +489,15 @@ static int dpaa_get_ts_info(struct net_device *net_dev,
 	info->phc_index = -1;
 
 	fman_node = of_get_parent(mac_node);
-	if (fman_node)
+	if (fman_node) {
 		ptp_node = of_parse_phandle(fman_node, "ptimer-handle", 0);
+		of_node_put(fman_node);
+	}
 
-	if (ptp_node)
+	if (ptp_node) {
 		ptp_dev = of_find_device_by_node(ptp_node);
+		of_node_put(ptp_node);
+	}
 
 	if (ptp_dev)
 		ptp = platform_get_drvdata(ptp_dev);
-- 
Gitee


From 63fc6600e0de84218fd9587b1037e42025fb27c9 Mon Sep 17 00:00:00 2001
From: Xiaoke Wang <xkernel.wang@foxmail.com>
Date: Tue, 26 Jul 2022 17:38:57 +0800
Subject: [PATCH 605/674] drm/msm/mdp5: check the return of kzalloc()

stable inclusion
from stable-v5.10.113
commit 9581e07b549bd911ce3af12b57ea2646b4995ebf
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=9581e07b549bd911ce3af12b57ea2646b4995ebf

--------------------------------

[ Upstream commit 047ae665577776b7feb11bd4f81f46627cff95e7 ]

kzalloc() is a memory allocation function which can return NULL when
some internal memory errors happen. So it is better to check it to
prevent potential wrong memory access.

Besides, since mdp5_plane_reset() is void type, so we should better
set `plane-state` to NULL after releasing it.

Signed-off-by: Xiaoke Wang <xkernel.wang@foxmail.com>
Reviewed-by: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Patchwork: https://patchwork.freedesktop.org/patch/481055/
Link: https://lore.kernel.org/r/tencent_8E2A1C78140EE1784AB2FF4B2088CC0AB908@qq.com
Signed-off-by: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Signed-off-by: Rob Clark <robdclark@chromium.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/gpu/drm/msm/disp/mdp5/mdp5_plane.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/msm/disp/mdp5/mdp5_plane.c b/drivers/gpu/drm/msm/disp/mdp5/mdp5_plane.c
index 83423092de2f..da0799333970 100644
--- a/drivers/gpu/drm/msm/disp/mdp5/mdp5_plane.c
+++ b/drivers/gpu/drm/msm/disp/mdp5/mdp5_plane.c
@@ -179,7 +179,10 @@ static void mdp5_plane_reset(struct drm_plane *plane)
 		drm_framebuffer_put(plane->state->fb);
 
 	kfree(to_mdp5_plane_state(plane->state));
+	plane->state = NULL;
 	mdp5_state = kzalloc(sizeof(*mdp5_state), GFP_KERNEL);
+	if (!mdp5_state)
+		return;
 
 	/* assign default blend parameters */
 	mdp5_state->alpha = 255;
-- 
Gitee


From 6f2cb02318c126edf72a06c12938b3bc93a7de1a Mon Sep 17 00:00:00 2001
From: Tomas Melin <tomas.melin@vaisala.com>
Date: Tue, 26 Jul 2022 17:38:58 +0800
Subject: [PATCH 606/674] net: macb: Restart tx only if queue pointer is
 lagging

stable inclusion
from stable-v5.10.113
commit a284cca3d81ae5e5f5f23c09b8298339188953a5
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=a284cca3d81ae5e5f5f23c09b8298339188953a5

--------------------------------

[ Upstream commit 5ad7f18cd82cee8e773d40cc7a1465a526f2615c ]

commit 4298388574da ("net: macb: restart tx after tx used bit read")
added support for restarting transmission. Restarting tx does not work
in case controller asserts TXUBR interrupt and TQBP is already at the end
of the tx queue. In that situation, restarting tx will immediately cause
assertion of another TXUBR interrupt. The driver will end up in an infinite
interrupt loop which it cannot break out of.

For cases where TQBP is at the end of the tx queue, instead
only clear TX_USED interrupt. As more data gets pushed to the queue,
transmission will resume.

This issue was observed on a Xilinx Zynq-7000 based board.
During stress test of the network interface,
driver would get stuck on interrupt loop within seconds or minutes
causing CPU to stall.

Signed-off-by: Tomas Melin <tomas.melin@vaisala.com>
Tested-by: Claudiu Beznea <claudiu.beznea@microchip.com>
Reviewed-by: Claudiu Beznea <claudiu.beznea@microchip.com>
Link: https://lore.kernel.org/r/20220407161659.14532-1-tomas.melin@vaisala.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/net/ethernet/cadence/macb_main.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c
index 78c6d133f54f..3244f69555f7 100644
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@@ -1531,6 +1531,7 @@ static void macb_tx_restart(struct macb_queue *queue)
 	unsigned int head = queue->tx_head;
 	unsigned int tail = queue->tx_tail;
 	struct macb *bp = queue->bp;
+	unsigned int head_idx, tbqp;
 
 	if (bp->caps & MACB_CAPS_ISR_CLEAR_ON_WRITE)
 		queue_writel(queue, ISR, MACB_BIT(TXUBR));
@@ -1538,6 +1539,13 @@ static void macb_tx_restart(struct macb_queue *queue)
 	if (head == tail)
 		return;
 
+	tbqp = queue_readl(queue, TBQP) / macb_dma_desc_get_size(bp);
+	tbqp = macb_adj_dma_desc_idx(bp, macb_tx_ring_wrap(bp, tbqp));
+	head_idx = macb_adj_dma_desc_idx(bp, macb_tx_ring_wrap(bp, head));
+
+	if (tbqp == head_idx)
+		return;
+
 	macb_writel(bp, NCR, macb_readl(bp, NCR) | MACB_BIT(TSTART));
 }
 
-- 
Gitee


From 0389f1ff99ad7375ff5980b964ab00267a72af25 Mon Sep 17 00:00:00 2001
From: Mike Christie <michael.christie@oracle.com>
Date: Tue, 26 Jul 2022 17:38:59 +0800
Subject: [PATCH 607/674] scsi: qedi: Fix failed disconnect handling

stable inclusion
from stable-v5.10.113
commit bf28bba3041055759ce6afe5e61bc4fd709ea6e4
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=bf28bba3041055759ce6afe5e61bc4fd709ea6e4

--------------------------------

[ Upstream commit 857b06527f707f5df634b854898a191b5c1d0272 ]

We set the qedi_ep state to EP_STATE_OFLDCONN_START when the ep is
created. Then in qedi_set_path we kick off the offload work. If userspace
times out the connection and calls ep_disconnect, qedi will only flush the
offload work if the qedi_ep state has transitioned away from
EP_STATE_OFLDCONN_START. If we can't connect we will not have transitioned
state and will leave the offload work running, and we will free the qedi_ep
from under it.

This patch just has us init the work when we create the ep, then always
flush it.

Link: https://lore.kernel.org/r/20220408001314.5014-10-michael.christie@oracle.com
Tested-by: Manish Rangankar <mrangankar@marvell.com>
Reviewed-by: Lee Duncan <lduncan@suse.com>
Reviewed-by: Chris Leech <cleech@redhat.com>
Acked-by: Manish Rangankar <mrangankar@marvell.com>
Signed-off-by: Mike Christie <michael.christie@oracle.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/scsi/qedi/qedi_iscsi.c | 69 +++++++++++++++++-----------------
 1 file changed, 34 insertions(+), 35 deletions(-)

diff --git a/drivers/scsi/qedi/qedi_iscsi.c b/drivers/scsi/qedi/qedi_iscsi.c
index ba9a22e55e32..8003b3519b95 100644
--- a/drivers/scsi/qedi/qedi_iscsi.c
+++ b/drivers/scsi/qedi/qedi_iscsi.c
@@ -828,6 +828,37 @@ static int qedi_task_xmit(struct iscsi_task *task)
 	return qedi_iscsi_send_ioreq(task);
 }
 
+static void qedi_offload_work(struct work_struct *work)
+{
+	struct qedi_endpoint *qedi_ep =
+		container_of(work, struct qedi_endpoint, offload_work);
+	struct qedi_ctx *qedi;
+	int wait_delay = 5 * HZ;
+	int ret;
+
+	qedi = qedi_ep->qedi;
+
+	ret = qedi_iscsi_offload_conn(qedi_ep);
+	if (ret) {
+		QEDI_ERR(&qedi->dbg_ctx,
+			 "offload error: iscsi_cid=%u, qedi_ep=%p, ret=%d\n",
+			 qedi_ep->iscsi_cid, qedi_ep, ret);
+		qedi_ep->state = EP_STATE_OFLDCONN_FAILED;
+		return;
+	}
+
+	ret = wait_event_interruptible_timeout(qedi_ep->tcp_ofld_wait,
+					       (qedi_ep->state ==
+					       EP_STATE_OFLDCONN_COMPL),
+					       wait_delay);
+	if (ret <= 0 || qedi_ep->state != EP_STATE_OFLDCONN_COMPL) {
+		qedi_ep->state = EP_STATE_OFLDCONN_FAILED;
+		QEDI_ERR(&qedi->dbg_ctx,
+			 "Offload conn TIMEOUT iscsi_cid=%u, qedi_ep=%p\n",
+			 qedi_ep->iscsi_cid, qedi_ep);
+	}
+}
+
 static struct iscsi_endpoint *
 qedi_ep_connect(struct Scsi_Host *shost, struct sockaddr *dst_addr,
 		int non_blocking)
@@ -876,6 +907,7 @@ qedi_ep_connect(struct Scsi_Host *shost, struct sockaddr *dst_addr,
 	}
 	qedi_ep = ep->dd_data;
 	memset(qedi_ep, 0, sizeof(struct qedi_endpoint));
+	INIT_WORK(&qedi_ep->offload_work, qedi_offload_work);
 	qedi_ep->state = EP_STATE_IDLE;
 	qedi_ep->iscsi_cid = (u32)-1;
 	qedi_ep->qedi = qedi;
@@ -1026,12 +1058,11 @@ static void qedi_ep_disconnect(struct iscsi_endpoint *ep)
 	qedi_ep = ep->dd_data;
 	qedi = qedi_ep->qedi;
 
+	flush_work(&qedi_ep->offload_work);
+
 	if (qedi_ep->state == EP_STATE_OFLDCONN_START)
 		goto ep_exit_recover;
 
-	if (qedi_ep->state != EP_STATE_OFLDCONN_NONE)
-		flush_work(&qedi_ep->offload_work);
-
 	if (qedi_ep->conn) {
 		qedi_conn = qedi_ep->conn;
 		conn = qedi_conn->cls_conn->dd_data;
@@ -1196,37 +1227,6 @@ static int qedi_data_avail(struct qedi_ctx *qedi, u16 vlanid)
 	return rc;
 }
 
-static void qedi_offload_work(struct work_struct *work)
-{
-	struct qedi_endpoint *qedi_ep =
-		container_of(work, struct qedi_endpoint, offload_work);
-	struct qedi_ctx *qedi;
-	int wait_delay = 5 * HZ;
-	int ret;
-
-	qedi = qedi_ep->qedi;
-
-	ret = qedi_iscsi_offload_conn(qedi_ep);
-	if (ret) {
-		QEDI_ERR(&qedi->dbg_ctx,
-			 "offload error: iscsi_cid=%u, qedi_ep=%p, ret=%d\n",
-			 qedi_ep->iscsi_cid, qedi_ep, ret);
-		qedi_ep->state = EP_STATE_OFLDCONN_FAILED;
-		return;
-	}
-
-	ret = wait_event_interruptible_timeout(qedi_ep->tcp_ofld_wait,
-					       (qedi_ep->state ==
-					       EP_STATE_OFLDCONN_COMPL),
-					       wait_delay);
-	if ((ret <= 0) || (qedi_ep->state != EP_STATE_OFLDCONN_COMPL)) {
-		qedi_ep->state = EP_STATE_OFLDCONN_FAILED;
-		QEDI_ERR(&qedi->dbg_ctx,
-			 "Offload conn TIMEOUT iscsi_cid=%u, qedi_ep=%p\n",
-			 qedi_ep->iscsi_cid, qedi_ep);
-	}
-}
-
 static int qedi_set_path(struct Scsi_Host *shost, struct iscsi_path *path_data)
 {
 	struct qedi_ctx *qedi;
@@ -1342,7 +1342,6 @@ static int qedi_set_path(struct Scsi_Host *shost, struct iscsi_path *path_data)
 			  qedi_ep->dst_addr, qedi_ep->dst_port);
 	}
 
-	INIT_WORK(&qedi_ep->offload_work, qedi_offload_work);
 	queue_work(qedi->offload_thread, &qedi_ep->offload_work);
 
 	ret = 0;
-- 
Gitee


From 7bec28503270e64c6bd754f2d882beeef7c3278a Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Tue, 26 Jul 2022 17:39:00 +0800
Subject: [PATCH 608/674] stat: fix inconsistency between struct stat and
 struct compat_stat

stable inclusion
from stable-v5.10.113
commit 76101c8e0c31938e59bbcb4beed47c8d8b89d39a
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=76101c8e0c31938e59bbcb4beed47c8d8b89d39a

--------------------------------

[ Upstream commit 932aba1e169090357a77af18850a10c256b50819 ]

struct stat (defined in arch/x86/include/uapi/asm/stat.h) has 32-bit
st_dev and st_rdev; struct compat_stat (defined in
arch/x86/include/asm/compat.h) has 16-bit st_dev and st_rdev followed by
a 16-bit padding.

This patch fixes struct compat_stat to match struct stat.

[ Historical note: the old x86 'struct stat' did have that 16-bit field
  that the compat layer had kept around, but it was changes back in 2003
  by "struct stat - support larger dev_t":

    https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git/commit/?id=e95b2065677fe32512a597a79db94b77b90c968d

  and back in those days, the x86_64 port was still new, and separate
  from the i386 code, and had already picked up the old version with a
  16-bit st_dev field ]

Note that we can't change compat_dev_t because it is used by
compat_loop_info.

Also, if the st_dev and st_rdev values are 32-bit, we don't have to use
old_valid_dev to test if the value fits into them.  This fixes
-EOVERFLOW on filesystems that are on NVMe because NVMe uses the major
number 259.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Cc: Andreas Schwab <schwab@linux-m68k.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 arch/x86/include/asm/compat.h |  6 ++----
 fs/stat.c                     | 19 ++++++++++---------
 2 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index 0e327a01f50f..46a067bd7e0b 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -29,15 +29,13 @@ typedef u32		compat_caddr_t;
 typedef __kernel_fsid_t	compat_fsid_t;
 
 struct compat_stat {
-	compat_dev_t	st_dev;
-	u16		__pad1;
+	u32		st_dev;
 	compat_ino_t	st_ino;
 	compat_mode_t	st_mode;
 	compat_nlink_t	st_nlink;
 	__compat_uid_t	st_uid;
 	__compat_gid_t	st_gid;
-	compat_dev_t	st_rdev;
-	u16		__pad2;
+	u32		st_rdev;
 	u32		st_size;
 	u32		st_blksize;
 	u32		st_blocks;
diff --git a/fs/stat.c b/fs/stat.c
index 1196af4d1ea0..04550c0ba540 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -306,9 +306,6 @@ SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, stat
 #  define choose_32_64(a,b) b
 #endif
 
-#define valid_dev(x)  choose_32_64(old_valid_dev(x),true)
-#define encode_dev(x) choose_32_64(old_encode_dev,new_encode_dev)(x)
-
 #ifndef INIT_STRUCT_STAT_PADDING
 #  define INIT_STRUCT_STAT_PADDING(st) memset(&st, 0, sizeof(st))
 #endif
@@ -317,7 +314,9 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf)
 {
 	struct stat tmp;
 
-	if (!valid_dev(stat->dev) || !valid_dev(stat->rdev))
+	if (sizeof(tmp.st_dev) < 4 && !old_valid_dev(stat->dev))
+		return -EOVERFLOW;
+	if (sizeof(tmp.st_rdev) < 4 && !old_valid_dev(stat->rdev))
 		return -EOVERFLOW;
 #if BITS_PER_LONG == 32
 	if (stat->size > MAX_NON_LFS)
@@ -325,7 +324,7 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf)
 #endif
 
 	INIT_STRUCT_STAT_PADDING(tmp);
-	tmp.st_dev = encode_dev(stat->dev);
+	tmp.st_dev = new_encode_dev(stat->dev);
 	tmp.st_ino = stat->ino;
 	if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino)
 		return -EOVERFLOW;
@@ -335,7 +334,7 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf)
 		return -EOVERFLOW;
 	SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid));
 	SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid));
-	tmp.st_rdev = encode_dev(stat->rdev);
+	tmp.st_rdev = new_encode_dev(stat->rdev);
 	tmp.st_size = stat->size;
 	tmp.st_atime = stat->atime.tv_sec;
 	tmp.st_mtime = stat->mtime.tv_sec;
@@ -616,11 +615,13 @@ static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf)
 {
 	struct compat_stat tmp;
 
-	if (!old_valid_dev(stat->dev) || !old_valid_dev(stat->rdev))
+	if (sizeof(tmp.st_dev) < 4 && !old_valid_dev(stat->dev))
+		return -EOVERFLOW;
+	if (sizeof(tmp.st_rdev) < 4 && !old_valid_dev(stat->rdev))
 		return -EOVERFLOW;
 
 	memset(&tmp, 0, sizeof(tmp));
-	tmp.st_dev = old_encode_dev(stat->dev);
+	tmp.st_dev = new_encode_dev(stat->dev);
 	tmp.st_ino = stat->ino;
 	if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino)
 		return -EOVERFLOW;
@@ -630,7 +631,7 @@ static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf)
 		return -EOVERFLOW;
 	SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid));
 	SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid));
-	tmp.st_rdev = old_encode_dev(stat->rdev);
+	tmp.st_rdev = new_encode_dev(stat->rdev);
 	if ((u64) stat->size > MAX_NON_LFS)
 		return -EOVERFLOW;
 	tmp.st_size = stat->size;
-- 
Gitee


From 47590400345a095df3f6c386c59924849fc0fe28 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 26 Jul 2022 17:39:01 +0800
Subject: [PATCH 609/674] nvme: add a quirk to disable namespace identifiers

stable inclusion
from stable-v5.10.113
commit 316bd86c2261fffa37577c3d35721a9337248e54
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=316bd86c2261fffa37577c3d35721a9337248e54

--------------------------------

[ Upstream commit 00ff400e6deee00f7b15e200205b2708b63b8cf6 ]

Add a quirk to disable using and exporting namespace identifiers for
controllers where they are broken beyond repair.

The most directly visible problem with non-unique namespace identifiers
is that they break the /dev/disk/by-id/ links, with the link for a
supposedly unique identifier now pointing to one of multiple possible
namespaces that share the same ID, and a somewhat random selection of
which one actually shows up.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/nvme/host/core.c | 24 ++++++++++++++++++------
 drivers/nvme/host/nvme.h |  5 +++++
 2 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index dcc047f01a07..274635c0c02a 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1250,6 +1250,8 @@ static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids,
 				 warn_str, cur->nidl);
 			return -1;
 		}
+		if (ctrl->quirks & NVME_QUIRK_BOGUS_NID)
+			return NVME_NIDT_EUI64_LEN;
 		memcpy(ids->eui64, data + sizeof(*cur), NVME_NIDT_EUI64_LEN);
 		return NVME_NIDT_EUI64_LEN;
 	case NVME_NIDT_NGUID:
@@ -1258,6 +1260,8 @@ static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids,
 				 warn_str, cur->nidl);
 			return -1;
 		}
+		if (ctrl->quirks & NVME_QUIRK_BOGUS_NID)
+			return NVME_NIDT_NGUID_LEN;
 		memcpy(ids->nguid, data + sizeof(*cur), NVME_NIDT_NGUID_LEN);
 		return NVME_NIDT_NGUID_LEN;
 	case NVME_NIDT_UUID:
@@ -1266,6 +1270,8 @@ static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids,
 				 warn_str, cur->nidl);
 			return -1;
 		}
+		if (ctrl->quirks & NVME_QUIRK_BOGUS_NID)
+			return NVME_NIDT_UUID_LEN;
 		uuid_copy(&ids->uuid, data + sizeof(*cur));
 		return NVME_NIDT_UUID_LEN;
 	case NVME_NIDT_CSI:
@@ -1361,12 +1367,18 @@ static int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
 	if ((*id)->ncap == 0) /* namespace not allocated or attached */
 		goto out_free_id;
 
-	if (ctrl->vs >= NVME_VS(1, 1, 0) &&
-	    !memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
-		memcpy(ids->eui64, (*id)->eui64, sizeof(ids->eui64));
-	if (ctrl->vs >= NVME_VS(1, 2, 0) &&
-	    !memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
-		memcpy(ids->nguid, (*id)->nguid, sizeof(ids->nguid));
+
+	if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) {
+		dev_info(ctrl->device,
+			 "Ignoring bogus Namespace Identifiers\n");
+	} else {
+		if (ctrl->vs >= NVME_VS(1, 1, 0) &&
+		    !memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
+			memcpy(ids->eui64, (*id)->eui64, sizeof(ids->eui64));
+		if (ctrl->vs >= NVME_VS(1, 2, 0) &&
+		    !memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
+			memcpy(ids->nguid, (*id)->nguid, sizeof(ids->nguid));
+	}
 
 	return 0;
 
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 94cee2c566d3..11d3cc2890f9 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -151,6 +151,11 @@ enum nvme_quirks {
 	 * encoding the generation sequence number.
 	 */
 	NVME_QUIRK_SKIP_CID_GEN			= (1 << 17),
+
+	/*
+	 * Reports garbage in the namespace identifiers (eui64, nguid, uuid).
+	 */
+	NVME_QUIRK_BOGUS_NID			= (1 << 18),
 };
 
 /*
-- 
Gitee


From 983009806cf6c01a28ac7890c08a83cc38e75bc8 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 26 Jul 2022 17:39:02 +0800
Subject: [PATCH 610/674] nvme-pci: disable namespace identifiers for Qemu
 controllers

stable inclusion
from stable-v5.10.113
commit 7ec6e06ee405756f90226b946790a5a7a65795b7
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=7ec6e06ee405756f90226b946790a5a7a65795b7

--------------------------------

[ Upstream commit 66dd346b84d79fde20832ed691a54f4881eac20d ]

Qemu unconditionally reports a UUID, which depending on the qemu version
is either all-null (which is incorrect but harmless) or contains a single
bit set for all controllers.  In addition it can also optionally report
a eui64 which needs to be manually set.  Disable namespace identifiers
for Qemu controlles entirely even if in some cases they could be set
correctly through manual intervention.

Reported-by: Luis Chamberlain <mcgrof@kernel.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/nvme/host/pci.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index f435ab0809fb..e0a3d03198a2 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -3212,7 +3212,10 @@ static const struct pci_device_id nvme_id_table[] = {
 		.driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, },
 	{ PCI_VDEVICE(INTEL, 0x5845),	/* Qemu emulated controller */
 		.driver_data = NVME_QUIRK_IDENTIFY_CNS |
-				NVME_QUIRK_DISABLE_WRITE_ZEROES, },
+				NVME_QUIRK_DISABLE_WRITE_ZEROES |
+				NVME_QUIRK_BOGUS_NID, },
+	{ PCI_VDEVICE(REDHAT, 0x0010),	/* Qemu emulated controller */
+		.driver_data = NVME_QUIRK_BOGUS_NID, },
 	{ PCI_DEVICE(0x126f, 0x2263),	/* Silicon Motion unidentified */
 		.driver_data = NVME_QUIRK_NO_NS_DESC_LIST, },
 	{ PCI_DEVICE(0x1bb1, 0x0100),   /* Seagate Nytro Flash Storage */
-- 
Gitee


From 3d0cf64034acd4253f9d7e4e54c0ece407faa42c Mon Sep 17 00:00:00 2001
From: Shubhrajyoti Datta <shubhrajyoti.datta@xilinx.com>
Date: Tue, 26 Jul 2022 17:39:03 +0800
Subject: [PATCH 611/674] EDAC/synopsys: Read the error count from the correct
 register

stable inclusion
from stable-v5.10.113
commit 50cbc583fa838a63f8447251b88da569c3d36ba6
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=50cbc583fa838a63f8447251b88da569c3d36ba6

--------------------------------

commit e2932d1f6f055b2af2114c7e64a26dc1b5593d0c upstream.

Currently, the error count is read wrongly from the status register. Read
the count from the proper error count register (ERRCNT).

  [ bp: Massage. ]

Fixes: b500b4a029d5 ("EDAC, synopsys: Add ECC support for ZynqMP DDR controller")
Signed-off-by: Shubhrajyoti Datta <shubhrajyoti.datta@xilinx.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Michal Simek <michal.simek@xilinx.com>
Cc: <stable@vger.kernel.org>
Link: https://lore.kernel.org/r/20220414102813.4468-1-shubhrajyoti.datta@xilinx.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/edac/synopsys_edac.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/drivers/edac/synopsys_edac.c b/drivers/edac/synopsys_edac.c
index 92906b56b1a2..fea44dc0484b 100644
--- a/drivers/edac/synopsys_edac.c
+++ b/drivers/edac/synopsys_edac.c
@@ -163,6 +163,11 @@
 #define ECC_STAT_CECNT_SHIFT		8
 #define ECC_STAT_BITNUM_MASK		0x7F
 
+/* ECC error count register definitions */
+#define ECC_ERRCNT_UECNT_MASK		0xFFFF0000
+#define ECC_ERRCNT_UECNT_SHIFT		16
+#define ECC_ERRCNT_CECNT_MASK		0xFFFF
+
 /* DDR QOS Interrupt register definitions */
 #define DDR_QOS_IRQ_STAT_OFST		0x20200
 #define DDR_QOSUE_MASK			0x4
@@ -418,15 +423,16 @@ static int zynqmp_get_error_info(struct synps_edac_priv *priv)
 	base = priv->baseaddr;
 	p = &priv->stat;
 
+	regval = readl(base + ECC_ERRCNT_OFST);
+	p->ce_cnt = regval & ECC_ERRCNT_CECNT_MASK;
+	p->ue_cnt = (regval & ECC_ERRCNT_UECNT_MASK) >> ECC_ERRCNT_UECNT_SHIFT;
+	if (!p->ce_cnt)
+		goto ue_err;
+
 	regval = readl(base + ECC_STAT_OFST);
 	if (!regval)
 		return 1;
 
-	p->ce_cnt = (regval & ECC_STAT_CECNT_MASK) >> ECC_STAT_CECNT_SHIFT;
-	p->ue_cnt = (regval & ECC_STAT_UECNT_MASK) >> ECC_STAT_UECNT_SHIFT;
-	if (!p->ce_cnt)
-		goto ue_err;
-
 	p->ceinfo.bitpos = (regval & ECC_STAT_BITNUM_MASK);
 
 	regval = readl(base + ECC_CEADDR0_OFST);
-- 
Gitee


From eb179e5cfb436d79cb432f855a26e7596e24531e Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@csgroup.eu>
Date: Tue, 26 Jul 2022 17:39:04 +0800
Subject: [PATCH 612/674] mm, hugetlb: allow for "high" userspace addresses

stable inclusion
from stable-v5.10.113
commit 6b932920b96fc3002352fe8225ec63a1cd1717ec
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=6b932920b96fc3002352fe8225ec63a1cd1717ec

--------------------------------

commit 5f24d5a579d1eace79d505b148808a850b417d4c upstream.

This is a fix for commit f6795053dac8 ("mm: mmap: Allow for "high"
userspace addresses") for hugetlb.

This patch adds support for "high" userspace addresses that are
optionally supported on the system and have to be requested via a hint
mechanism ("high" addr parameter to mmap).

Architectures such as powerpc and x86 achieve this by making changes to
their architectural versions of hugetlb_get_unmapped_area() function.
However, arm64 uses the generic version of that function.

So take into account arch_get_mmap_base() and arch_get_mmap_end() in
hugetlb_get_unmapped_area().  To allow that, move those two macros out
of mm/mmap.c into include/linux/sched/mm.h

If these macros are not defined in architectural code then they default
to (TASK_SIZE) and (base) so should not introduce any behavioural
changes to architectures that do not define them.

For the time being, only ARM64 is affected by this change.

Catalin (ARM64) said
 "We should have fixed hugetlb_get_unmapped_area() as well when we added
  support for 52-bit VA. The reason for commit f6795053dac8 was to
  prevent normal mmap() from returning addresses above 48-bit by default
  as some user-space had hard assumptions about this.

  It's a slight ABI change if you do this for hugetlb_get_unmapped_area()
  but I doubt anyone would notice. It's more likely that the current
  behaviour would cause issues, so I'd rather have them consistent.

  Basically when arm64 gained support for 52-bit addresses we did not
  want user-space calling mmap() to suddenly get such high addresses,
  otherwise we could have inadvertently broken some programs (similar
  behaviour to x86 here). Hence we added commit f6795053dac8. But we
  missed hugetlbfs which could still get such high mmap() addresses. So
  in theory that's a potential regression that should have bee addressed
  at the same time as commit f6795053dac8 (and before arm64 enabled
  52-bit addresses)"

Link: https://lkml.kernel.org/r/ab847b6edb197bffdfe189e70fb4ac76bfe79e0d.1650033747.git.christophe.leroy@csgroup.eu
Fixes: f6795053dac8 ("mm: mmap: Allow for "high" userspace addresses")
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Steve Capper <steve.capper@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: <stable@vger.kernel.org>	[5.0.x]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>

 Conflicts:
	fs/hugetlbfs/inode.c
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 fs/hugetlbfs/inode.c     | 9 +++++----
 include/linux/sched/mm.h | 8 ++++++++
 mm/mmap.c                | 8 --------
 3 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 6f2943465bff..8a87d1b43387 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -252,7 +252,7 @@ hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr,
 	info.flags = 0;
 	info.length = len;
 	info.low_limit = current->mm->mmap_base;
-	info.high_limit = TASK_SIZE;
+	info.high_limit = arch_get_mmap_end(addr);
 	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
 	info.align_offset = 0;
 
@@ -272,7 +272,7 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr,
 	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
 	info.length = len;
 	info.low_limit = max(PAGE_SIZE, mmap_min_addr);
-	info.high_limit = current->mm->mmap_base;
+	info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base);
 	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
 	info.align_offset = 0;
 
@@ -291,7 +291,7 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr,
 		VM_BUG_ON(addr != -ENOMEM);
 		info.flags = 0;
 		info.low_limit = current->mm->mmap_base;
-		info.high_limit = TASK_SIZE;
+		info.high_limit = arch_get_mmap_end(addr);
 
 		if (enable_mmap_dvpp)
 			dvpp_mmap_get_area(&info, flags);
@@ -309,6 +309,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma;
 	struct hstate *h = hstate_file(file);
+	const unsigned long mmap_end = arch_get_mmap_end(addr);
 
 	if (len & ~huge_page_mask(h))
 		return -EINVAL;
@@ -328,7 +329,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 			return -ENOMEM;
 
 		vma = find_vma(mm, addr);
-		if (TASK_SIZE - len >= addr &&
+		if (mmap_end - len >= addr &&
 		    (!vma || addr + len <= vm_start_gap(vma)))
 			return addr;
 	}
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index dc1f4dcd9a82..e3e5e149b00e 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -106,6 +106,14 @@ static inline void mm_update_next_owner(struct mm_struct *mm)
 #endif /* CONFIG_MEMCG */
 
 #ifdef CONFIG_MMU
+#ifndef arch_get_mmap_end
+#define arch_get_mmap_end(addr)	(TASK_SIZE)
+#endif
+
+#ifndef arch_get_mmap_base
+#define arch_get_mmap_base(addr, base) (base)
+#endif
+
 extern void arch_pick_mmap_layout(struct mm_struct *mm,
 				  struct rlimit *rlim_stack);
 extern unsigned long
diff --git a/mm/mmap.c b/mm/mmap.c
index 5ad32537604a..5489d70db84e 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2404,14 +2404,6 @@ unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info)
 	return addr;
 }
 
-#ifndef arch_get_mmap_end
-#define arch_get_mmap_end(addr)	(TASK_SIZE)
-#endif
-
-#ifndef arch_get_mmap_base
-#define arch_get_mmap_base(addr, base) (base)
-#endif
-
 /* Get an address range which is currently unmapped.
  * For shmat() with addr=0.
  *
-- 
Gitee


From ccc63440f2c0b4d4473bfecbaf1a19e3c14afce7 Mon Sep 17 00:00:00 2001
From: Alistair Popple <apopple@nvidia.com>
Date: Tue, 26 Jul 2022 17:39:05 +0800
Subject: [PATCH 613/674] mm/mmu_notifier.c: fix race in
 mmu_interval_notifier_remove()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

stable inclusion
from stable-v5.10.113
commit 9ca66d79143980260be615b964b8dc1504a5d0c6
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=9ca66d79143980260be615b964b8dc1504a5d0c6

--------------------------------

commit 319561669a59d8e9206ab311ae5433ef92fd79d1 upstream.

In some cases it is possible for mmu_interval_notifier_remove() to race
with mn_tree_inv_end() allowing it to return while the notifier data
structure is still in use.  Consider the following sequence:

  CPU0 - mn_tree_inv_end()            CPU1 - mmu_interval_notifier_remove()
  ----------------------------------- ------------------------------------
                                      spin_lock(subscriptions->lock);
                                      seq = subscriptions->invalidate_seq;
  spin_lock(subscriptions->lock);     spin_unlock(subscriptions->lock);
  subscriptions->invalidate_seq++;
                                      wait_event(invalidate_seq != seq);
                                      return;
  interval_tree_remove(interval_sub); kfree(interval_sub);
  spin_unlock(subscriptions->lock);
  wake_up_all();

As the wait_event() condition is true it will return immediately.  This
can lead to use-after-free type errors if the caller frees the data
structure containing the interval notifier subscription while it is
still on a deferred list.  Fix this by taking the appropriate lock when
reading invalidate_seq to ensure proper synchronisation.

I observed this whilst running stress testing during some development.
You do have to be pretty unlucky, but it leads to the usual problems of
use-after-free (memory corruption, kernel crash, difficult to diagnose
WARN_ON, etc).

Link: https://lkml.kernel.org/r/20220420043734.476348-1-apopple@nvidia.com
Fixes: 99cb252f5e68 ("mm/mmu_notifier: add an interval tree notifier")
Signed-off-by: Alistair Popple <apopple@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Cc: Christian König <christian.koenig@amd.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 mm/mmu_notifier.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 07f42a7a6065..9165ca619c8c 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -1043,6 +1043,18 @@ int mmu_interval_notifier_insert_locked(
 }
 EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert_locked);
 
+static bool
+mmu_interval_seq_released(struct mmu_notifier_subscriptions *subscriptions,
+			  unsigned long seq)
+{
+	bool ret;
+
+	spin_lock(&subscriptions->lock);
+	ret = subscriptions->invalidate_seq != seq;
+	spin_unlock(&subscriptions->lock);
+	return ret;
+}
+
 /**
  * mmu_interval_notifier_remove - Remove a interval notifier
  * @interval_sub: Interval subscription to unregister
@@ -1090,7 +1102,7 @@ void mmu_interval_notifier_remove(struct mmu_interval_notifier *interval_sub)
 	lock_map_release(&__mmu_notifier_invalidate_range_start_map);
 	if (seq)
 		wait_event(subscriptions->wq,
-			   READ_ONCE(subscriptions->invalidate_seq) != seq);
+			   mmu_interval_seq_released(subscriptions, seq));
 
 	/* pairs with mmgrab in mmu_interval_notifier_insert() */
 	mmdrop(mm);
-- 
Gitee


From 0affb0e98c917096fe752a041443b815bb2b61ba Mon Sep 17 00:00:00 2001
From: Zheyu Ma <zheyuma97@gmail.com>
Date: Tue, 26 Jul 2022 17:39:06 +0800
Subject: [PATCH 614/674] ata: pata_marvell: Check the 'bmdma_addr' beforing
 reading

stable inclusion
from stable-v5.10.113
commit cf23a960c5c62ddd8283a855a742977ee983a62a
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=cf23a960c5c62ddd8283a855a742977ee983a62a

--------------------------------

commit aafa9f958342db36c17ac2a7f1b841032c96feb4 upstream.

Before detecting the cable type on the dma bar, the driver should check
whether the 'bmdma_addr' is zero, which means the adapter does not
support DMA, otherwise we will get the following error:

[    5.146634] Bad IO access at port 0x1 (return inb(port))
[    5.147206] WARNING: CPU: 2 PID: 303 at lib/iomap.c:44 ioread8+0x4a/0x60
[    5.150856] RIP: 0010:ioread8+0x4a/0x60
[    5.160238] Call Trace:
[    5.160470]  <TASK>
[    5.160674]  marvell_cable_detect+0x6e/0xc0 [pata_marvell]
[    5.161728]  ata_eh_recover+0x3520/0x6cc0
[    5.168075]  ata_do_eh+0x49/0x3c0

Signed-off-by: Zheyu Ma <zheyuma97@gmail.com>
Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/ata/pata_marvell.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/ata/pata_marvell.c b/drivers/ata/pata_marvell.c
index b066809ba9a1..c56f4043b0cc 100644
--- a/drivers/ata/pata_marvell.c
+++ b/drivers/ata/pata_marvell.c
@@ -83,6 +83,8 @@ static int marvell_cable_detect(struct ata_port *ap)
 	switch(ap->port_no)
 	{
 	case 0:
+		if (!ap->ioaddr.bmdma_addr)
+			return ATA_CBL_PATA_UNK;
 		if (ioread8(ap->ioaddr.bmdma_addr + 1) & 1)
 			return ATA_CBL_PATA40;
 		return ATA_CBL_PATA80;
-- 
Gitee


From d5e4b0822bc88bcd32070ab02176509d7a9eb06a Mon Sep 17 00:00:00 2001
From: Xiaomeng Tong <xiam0nd.tong@gmail.com>
Date: Tue, 26 Jul 2022 17:39:07 +0800
Subject: [PATCH 615/674] dma: at_xdmac: fix a missing check on list iterator

stable inclusion
from stable-v5.10.113
commit 358a3846f6a950b8db803262e74081ad4ab235d4
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=358a3846f6a950b8db803262e74081ad4ab235d4

--------------------------------

commit 206680c4e46b62fd8909385e0874a36952595b85 upstream.

The bug is here:
	__func__, desc, &desc->tx_dma_desc.phys, ret, cookie, residue);

The list iterator 'desc' will point to a bogus position containing
HEAD if the list is empty or no element is found. To avoid dev_dbg()
prints a invalid address, use a new variable 'iter' as the list
iterator, while use the origin variable 'desc' as a dedicated
pointer to point to the found element.

Cc: stable@vger.kernel.org
Fixes: 82e2424635f4c ("dmaengine: xdmac: fix print warning on dma_addr_t variable")
Signed-off-by: Xiaomeng Tong <xiam0nd.tong@gmail.com>
Link: https://lore.kernel.org/r/20220327061154.4867-1-xiam0nd.tong@gmail.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/dma/at_xdmac.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c
index 90afba0b36fe..47552db6b8dc 100644
--- a/drivers/dma/at_xdmac.c
+++ b/drivers/dma/at_xdmac.c
@@ -1390,7 +1390,7 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
 {
 	struct at_xdmac_chan	*atchan = to_at_xdmac_chan(chan);
 	struct at_xdmac		*atxdmac = to_at_xdmac(atchan->chan.device);
-	struct at_xdmac_desc	*desc, *_desc;
+	struct at_xdmac_desc	*desc, *_desc, *iter;
 	struct list_head	*descs_list;
 	enum dma_status		ret;
 	int			residue, retry;
@@ -1505,11 +1505,13 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
 	 * microblock.
 	 */
 	descs_list = &desc->descs_list;
-	list_for_each_entry_safe(desc, _desc, descs_list, desc_node) {
-		dwidth = at_xdmac_get_dwidth(desc->lld.mbr_cfg);
-		residue -= (desc->lld.mbr_ubc & 0xffffff) << dwidth;
-		if ((desc->lld.mbr_nda & 0xfffffffc) == cur_nda)
+	list_for_each_entry_safe(iter, _desc, descs_list, desc_node) {
+		dwidth = at_xdmac_get_dwidth(iter->lld.mbr_cfg);
+		residue -= (iter->lld.mbr_ubc & 0xffffff) << dwidth;
+		if ((iter->lld.mbr_nda & 0xfffffffc) == cur_nda) {
+			desc = iter;
 			break;
+		}
 	}
 	residue += cur_ubc << dwidth;
 
-- 
Gitee


From 2d19822cbd21ac865ee8a7dc3376e640a457ee1f Mon Sep 17 00:00:00 2001
From: Manuel Ullmann <labre@posteo.de>
Date: Tue, 26 Jul 2022 17:39:08 +0800
Subject: [PATCH 616/674] net: atlantic: invert deep par in pm functions,
 preventing null derefs

stable inclusion
from stable-v5.10.113
commit ba2716da233618c3f361b8ece818a0e91de7a8f0
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=ba2716da233618c3f361b8ece818a0e91de7a8f0

--------------------------------

commit cbe6c3a8f8f4315b96e46e1a1c70393c06d95a4c upstream.

This will reset deeply on freeze and thaw instead of suspend and
resume and prevent null pointer dereferences of the uninitialized ring
0 buffer while thawing.

The impact is an indefinitely hanging kernel. You can't switch
consoles after this and the only possible user interaction is SysRq.

BUG: kernel NULL pointer dereference
RIP: 0010:aq_ring_rx_fill+0xcf/0x210 [atlantic]
aq_vec_init+0x85/0xe0 [atlantic]
aq_nic_init+0xf7/0x1d0 [atlantic]
atl_resume_common+0x4f/0x100 [atlantic]
pci_pm_thaw+0x42/0xa0

resolves in aq_ring.o to

```
0000000000000ae0 <aq_ring_rx_fill>:
{
/* ... */
 baf:	48 8b 43 08          	mov    0x8(%rbx),%rax
 		buff->flags = 0U; /* buff is NULL */
```

The bug has been present since the introduction of the new pm code in
8aaa112a57c1 ("net: atlantic: refactoring pm logic") and was hidden
until 8ce84271697a ("net: atlantic: changes for multi-TC support"),
which refactored the aq_vec_{free,alloc} functions into
aq_vec_{,ring}_{free,alloc}, but is technically not wrong. The
original functions just always reinitialized the buffers on S3/S4. If
the interface is down before freezing, the bug does not occur. It does
not matter, whether the initrd contains and loads the module before
thawing.

So the fix is to invert the boolean parameter deep in all pm function
calls, which was clearly intended to be set like that.

First report was on Github [1], which you have to guess from the
resume logs in the posted dmesg snippet. Recently I posted one on
Bugzilla [2], since I did not have an AQC device so far.

#regzbot introduced: 8ce84271697a
#regzbot from: koo5 <kolman.jindrich@gmail.com>
#regzbot monitor: https://github.com/Aquantia/AQtion/issues/32

Fixes: 8aaa112a57c1 ("net: atlantic: refactoring pm logic")
Link: https://github.com/Aquantia/AQtion/issues/32 [1]
Link: https://bugzilla.kernel.org/show_bug.cgi?id=215798 [2]
Cc: stable@vger.kernel.org
Reported-by: koo5 <kolman.jindrich@gmail.com>
Signed-off-by: Manuel Ullmann <labre@posteo.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c b/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c
index 1826253f97dc..bdfd462c74db 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c
@@ -450,22 +450,22 @@ static int atl_resume_common(struct device *dev, bool deep)
 
 static int aq_pm_freeze(struct device *dev)
 {
-	return aq_suspend_common(dev, false);
+	return aq_suspend_common(dev, true);
 }
 
 static int aq_pm_suspend_poweroff(struct device *dev)
 {
-	return aq_suspend_common(dev, true);
+	return aq_suspend_common(dev, false);
 }
 
 static int aq_pm_thaw(struct device *dev)
 {
-	return atl_resume_common(dev, false);
+	return atl_resume_common(dev, true);
 }
 
 static int aq_pm_resume_restore(struct device *dev)
 {
-	return atl_resume_common(dev, true);
+	return atl_resume_common(dev, false);
 }
 
 static const struct dev_pm_ops aq_pm_ops = {
-- 
Gitee


From e85467a78402f38cdbc4a15730731610dd8aa195 Mon Sep 17 00:00:00 2001
From: Guo Ren <guoren@linux.alibaba.com>
Date: Tue, 26 Jul 2022 17:39:09 +0800
Subject: [PATCH 617/674] xtensa: patch_text: Fixup last cpu should be master

stable inclusion
from stable-v5.10.113
commit f399ab11dd6cc0a6927029371fbe2ec027181598
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f399ab11dd6cc0a6927029371fbe2ec027181598

--------------------------------

commit ee69d4be8fd064cd08270b4808d2dfece3614ee0 upstream.

These patch_text implementations are using stop_machine_cpuslocked
infrastructure with atomic cpu_count. The original idea: When the
master CPU patch_text, the others should wait for it. But current
implementation is using the first CPU as master, which couldn't
guarantee the remaining CPUs are waiting. This patch changes the
last CPU as the master to solve the potential risk.

Fixes: 64711f9a47d4 ("xtensa: implement jump_label support")
Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
Signed-off-by: Guo Ren <guoren@kernel.org>
Reviewed-by: Max Filippov <jcmvbkbc@gmail.com>
Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
Cc: <stable@vger.kernel.org>
Message-Id: <20220407073323.743224-4-guoren@kernel.org>
Signed-off-by: Max Filippov <jcmvbkbc@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 arch/xtensa/kernel/jump_label.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/xtensa/kernel/jump_label.c b/arch/xtensa/kernel/jump_label.c
index 0dde21e0d3de..ad1841cecdfb 100644
--- a/arch/xtensa/kernel/jump_label.c
+++ b/arch/xtensa/kernel/jump_label.c
@@ -40,7 +40,7 @@ static int patch_text_stop_machine(void *data)
 {
 	struct patch *patch = data;
 
-	if (atomic_inc_return(&patch->cpu_count) == 1) {
+	if (atomic_inc_return(&patch->cpu_count) == num_online_cpus()) {
 		local_patch_text(patch->addr, patch->data, patch->sz);
 		atomic_inc(&patch->cpu_count);
 	} else {
-- 
Gitee


From 90fa436f8a1afa90f01292c96ed7813fe10371d5 Mon Sep 17 00:00:00 2001
From: Max Filippov <jcmvbkbc@gmail.com>
Date: Tue, 26 Jul 2022 17:39:10 +0800
Subject: [PATCH 618/674] xtensa: fix a7 clobbering in coprocessor context
 load/store

stable inclusion
from stable-v5.10.113
commit 19f6dcb1f0f0f8523976c8aa1800856c9b4f35c3
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=19f6dcb1f0f0f8523976c8aa1800856c9b4f35c3

--------------------------------

commit 839769c35477d4acc2369e45000ca7b0b6af39a7 upstream.

Fast coprocessor exception handler saves a3..a6, but coprocessor context
load/store code uses a4..a7 as temporaries, potentially clobbering a7.
'Potentially' because coprocessor state load/store macros may not use
all four temporary registers (and neither FPU nor HiFi macros do).
Use a3..a6 as intended.

Cc: stable@vger.kernel.org
Fixes: c658eac628aa ("[XTENSA] Add support for configurable registers and coprocessors")
Signed-off-by: Max Filippov <jcmvbkbc@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 arch/xtensa/kernel/coprocessor.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/xtensa/kernel/coprocessor.S b/arch/xtensa/kernel/coprocessor.S
index 45cc0ae0af6f..c7b9f12896f2 100644
--- a/arch/xtensa/kernel/coprocessor.S
+++ b/arch/xtensa/kernel/coprocessor.S
@@ -29,7 +29,7 @@
 	.if XTENSA_HAVE_COPROCESSOR(x);					\
 		.align 4;						\
 	.Lsave_cp_regs_cp##x:						\
-		xchal_cp##x##_store a2 a4 a5 a6 a7;			\
+		xchal_cp##x##_store a2 a3 a4 a5 a6;			\
 		jx	a0;						\
 	.endif
 
@@ -46,7 +46,7 @@
 	.if XTENSA_HAVE_COPROCESSOR(x);					\
 		.align 4;						\
 	.Lload_cp_regs_cp##x:						\
-		xchal_cp##x##_load a2 a4 a5 a6 a7;			\
+		xchal_cp##x##_load a2 a3 a4 a5 a6;			\
 		jx	a0;						\
 	.endif
 
-- 
Gitee


From 490085c27ad981285d7f65b677a78bd578084e98 Mon Sep 17 00:00:00 2001
From: Paolo Valerio <pvalerio@redhat.com>
Date: Tue, 26 Jul 2022 17:39:11 +0800
Subject: [PATCH 619/674] openvswitch: fix OOB access in reserve_sfa_size()

stable inclusion
from stable-v5.10.113
commit 0837ff17d052b7d755d5086208c3445867aaff82
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=0837ff17d052b7d755d5086208c3445867aaff82

--------------------------------

commit cefa91b2332d7009bc0be5d951d6cbbf349f90f8 upstream.

Given a sufficiently large number of actions, while copying and
reserving memory for a new action of a new flow, if next_offset is
greater than MAX_ACTIONS_BUFSIZE, the function reserve_sfa_size() does
not return -EMSGSIZE as expected, but it allocates MAX_ACTIONS_BUFSIZE
bytes increasing actions_len by req_size. This can then lead to an OOB
write access, especially when further actions need to be copied.

Fix it by rearranging the flow action size check.

KASAN splat below:
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>

==================================================================
BUG: KASAN: slab-out-of-bounds in reserve_sfa_size+0x1ba/0x380 [openvswitch]
Write of size 65360 at addr ffff888147e4001c by task handler15/836

CPU: 1 PID: 836 Comm: handler15 Not tainted 5.18.0-rc1+ #27
...
Call Trace:
 <TASK>
 dump_stack_lvl+0x45/0x5a
 print_report.cold+0x5e/0x5db
 ? __lock_text_start+0x8/0x8
 ? reserve_sfa_size+0x1ba/0x380 [openvswitch]
 kasan_report+0xb5/0x130
 ? reserve_sfa_size+0x1ba/0x380 [openvswitch]
 kasan_check_range+0xf5/0x1d0
 memcpy+0x39/0x60
 reserve_sfa_size+0x1ba/0x380 [openvswitch]
 __add_action+0x24/0x120 [openvswitch]
 ovs_nla_add_action+0xe/0x20 [openvswitch]
 ovs_ct_copy_action+0x29d/0x1130 [openvswitch]
 ? __kernel_text_address+0xe/0x30
 ? unwind_get_return_address+0x56/0xa0
 ? create_prof_cpu_mask+0x20/0x20
 ? ovs_ct_verify+0xf0/0xf0 [openvswitch]
 ? prep_compound_page+0x198/0x2a0
 ? __kasan_check_byte+0x10/0x40
 ? kasan_unpoison+0x40/0x70
 ? ksize+0x44/0x60
 ? reserve_sfa_size+0x75/0x380 [openvswitch]
 __ovs_nla_copy_actions+0xc26/0x2070 [openvswitch]
 ? __zone_watermark_ok+0x420/0x420
 ? validate_set.constprop.0+0xc90/0xc90 [openvswitch]
 ? __alloc_pages+0x1a9/0x3e0
 ? __alloc_pages_slowpath.constprop.0+0x1da0/0x1da0
 ? unwind_next_frame+0x991/0x1e40
 ? __mod_node_page_state+0x99/0x120
 ? __mod_lruvec_page_state+0x2e3/0x470
 ? __kasan_kmalloc_large+0x90/0xe0
 ovs_nla_copy_actions+0x1b4/0x2c0 [openvswitch]
 ovs_flow_cmd_new+0x3cd/0xb10 [openvswitch]
 ...

Cc: stable@vger.kernel.org
Fixes: f28cd2af22a0 ("openvswitch: fix flow actions reallocation")
Signed-off-by: Paolo Valerio <pvalerio@redhat.com>
Acked-by: Eelco Chaudron <echaudro@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 net/openvswitch/flow_netlink.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index 98a7e6f64ab0..293a798e89f4 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -2436,7 +2436,7 @@ static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa,
 	new_acts_size = max(next_offset + req_size, ksize(*sfa) * 2);
 
 	if (new_acts_size > MAX_ACTIONS_BUFSIZE) {
-		if ((MAX_ACTIONS_BUFSIZE - next_offset) < req_size) {
+		if ((next_offset + req_size) > MAX_ACTIONS_BUFSIZE) {
 			OVS_NLERR(log, "Flow action size exceeds max %u",
 				  MAX_ACTIONS_BUFSIZE);
 			return ERR_PTR(-EMSGSIZE);
-- 
Gitee


From 140cdaa1ebf8c1c01a60b1f56522154c58dc71ce Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Tue, 26 Jul 2022 17:39:12 +0800
Subject: [PATCH 620/674] gpio: Request interrupts after IRQ is initialized
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

stable inclusion
from stable-v5.10.113
commit 54e6180c8c2d71b6f1fab0d1420f65299b26953e
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=54e6180c8c2d71b6f1fab0d1420f65299b26953e

--------------------------------

commit 06fb4ecfeac7e00d6704fa5ed19299f2fefb3cc9 upstream.

Commit 5467801f1fcb ("gpio: Restrict usage of GPIO chip irq members
before initialization") attempted to fix a race condition that lead to a
NULL pointer, but in the process caused a regression for _AEI/_EVT
declared GPIOs.

This manifests in messages showing deferred probing while trying to
allocate IRQs like so:

  amd_gpio AMDI0030:00: Failed to translate GPIO pin 0x0000 to IRQ, err -517
  amd_gpio AMDI0030:00: Failed to translate GPIO pin 0x002C to IRQ, err -517
  amd_gpio AMDI0030:00: Failed to translate GPIO pin 0x003D to IRQ, err -517
  [ .. more of the same .. ]

The code for walking _AEI doesn't handle deferred probing and so this
leads to non-functional GPIO interrupts.

Fix this issue by moving the call to `acpi_gpiochip_request_interrupts`
to occur after gc->irc.initialized is set.

Fixes: 5467801f1fcb ("gpio: Restrict usage of GPIO chip irq members before initialization")
Link: https://lore.kernel.org/linux-gpio/BL1PR12MB51577A77F000A008AA694675E2EF9@BL1PR12MB5157.namprd12.prod.outlook.com/
Link: https://bugzilla.suse.com/show_bug.cgi?id=1198697
Link: https://bugzilla.kernel.org/show_bug.cgi?id=215850
Link: https://gitlab.freedesktop.org/drm/amd/-/issues/1979
Link: https://gitlab.freedesktop.org/drm/amd/-/issues/1976
Reported-by: Mario Limonciello <mario.limonciello@amd.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Reviewed-by: Shreeya Patel <shreeya.patel@collabora.com>
Tested-By: Samuel Čavoj <samuel@cavoj.net>
Tested-By: lukeluk498@gmail.com Link:
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-and-tested-by: Takashi Iwai <tiwai@suse.de>
Cc: Shreeya Patel <shreeya.patel@collabora.com>
Cc: stable@vger.kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/gpio/gpiolib.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index d18078748200..59d8affad343 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -1612,8 +1612,6 @@ static int gpiochip_add_irqchip(struct gpio_chip *gc,
 
 	gpiochip_set_irq_hooks(gc);
 
-	acpi_gpiochip_request_interrupts(gc);
-
 	/*
 	 * Using barrier() here to prevent compiler from reordering
 	 * gc->irq.initialized before initialization of above
@@ -1623,6 +1621,8 @@ static int gpiochip_add_irqchip(struct gpio_chip *gc,
 
 	gc->irq.initialized = true;
 
+	acpi_gpiochip_request_interrupts(gc);
+
 	return 0;
 }
 
-- 
Gitee


From 9d7cda5f4e3dc4ab3acf27cd75d78b156a0cc099 Mon Sep 17 00:00:00 2001
From: Xiaomeng Tong <xiam0nd.tong@gmail.com>
Date: Tue, 26 Jul 2022 17:39:13 +0800
Subject: [PATCH 621/674] ASoC: soc-dapm: fix two incorrect uses of list
 iterator

stable inclusion
from stable-v5.10.113
commit 43a2a3734aa3d25ef11b90f14f18bf31fb0a7feb
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=43a2a3734aa3d25ef11b90f14f18bf31fb0a7feb

--------------------------------

commit f730a46b931d894816af34a0ff8e4ad51565b39f upstream.

These two bug are here:
	list_for_each_entry_safe_continue(w, n, list,
					power_list);
	list_for_each_entry_safe_continue(w, n, list,
					power_list);

After the list_for_each_entry_safe_continue() exits, the list iterator
will always be a bogus pointer which point to an invalid struct objdect
containing HEAD member. The funciton poniter 'w->event' will be a
invalid value which can lead to a control-flow hijack if the 'w' can be
controlled.

The original intention was to continue the outer list_for_each_entry_safe()
loop with the same entry if w->event is NULL, but misunderstanding the
meaning of list_for_each_entry_safe_continue().

So just add a 'continue;' to fix the bug.

Cc: stable@vger.kernel.org
Fixes: 163cac061c973 ("ASoC: Factor out DAPM sequence execution")
Signed-off-by: Xiaomeng Tong <xiam0nd.tong@gmail.com>
Link: https://lore.kernel.org/r/20220329012134.9375-1-xiam0nd.tong@gmail.com
Signed-off-by: Mark Brown <broonie@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 sound/soc/soc-dapm.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c
index 2924d89bf0da..417732bdf286 100644
--- a/sound/soc/soc-dapm.c
+++ b/sound/soc/soc-dapm.c
@@ -1683,8 +1683,7 @@ static void dapm_seq_run(struct snd_soc_card *card,
 		switch (w->id) {
 		case snd_soc_dapm_pre:
 			if (!w->event)
-				list_for_each_entry_safe_continue(w, n, list,
-								  power_list);
+				continue;
 
 			if (event == SND_SOC_DAPM_STREAM_START)
 				ret = w->event(w,
@@ -1696,8 +1695,7 @@ static void dapm_seq_run(struct snd_soc_card *card,
 
 		case snd_soc_dapm_post:
 			if (!w->event)
-				list_for_each_entry_safe_continue(w, n, list,
-								  power_list);
+				continue;
 
 			if (event == SND_SOC_DAPM_STREAM_START)
 				ret = w->event(w,
-- 
Gitee


From 2e5f28ee0f8c9876e8af1a9347aee57594f8478d Mon Sep 17 00:00:00 2001
From: Sasha Neftin <sasha.neftin@intel.com>
Date: Tue, 26 Jul 2022 17:39:14 +0800
Subject: [PATCH 622/674] e1000e: Fix possible overflow in LTR decoding

stable inclusion
from stable-v5.10.113
commit 7082650eb8265b583d0aea0515bcc6f65b0e8755
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=7082650eb8265b583d0aea0515bcc6f65b0e8755

--------------------------------

commit 04ebaa1cfddae5f240cc7404f009133bb0389a47 upstream.

When we decode the latency and the max_latency, u16 value may not fit
the required size and could lead to the wrong LTR representation.

Scaling is represented as:
scale 0 - 1         (2^(5*0)) = 2^0
scale 1 - 32        (2^(5 *1))= 2^5
scale 2 - 1024      (2^(5 *2)) =2^10
scale 3 - 32768     (2^(5 *3)) =2^15
scale 4 - 1048576   (2^(5 *4)) = 2^20
scale 5 - 33554432  (2^(5 *4)) = 2^25
scale 4 and scale 5 required 20 and 25 bits respectively.
scale 6 reserved.

Replace the u16 type with the u32 type and allow corrected LTR
representation.

Cc: stable@vger.kernel.org
Fixes: 44a13a5d99c7 ("e1000e: Fix the max snoop/no-snoop latency for 10M")
Reported-by: James Hutchinson <jahutchinson99@googlemail.com>
Link: https://bugzilla.kernel.org/show_bug.cgi?id=215689
Suggested-by: Dima Ruinskiy <dima.ruinskiy@intel.com>
Signed-off-by: Sasha Neftin <sasha.neftin@intel.com>
Tested-by: Naama Meir <naamax.meir@linux.intel.com>
Tested-by: James Hutchinson <jahutchinson99@googlemail.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/net/ethernet/intel/e1000e/ich8lan.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.c b/drivers/net/ethernet/intel/e1000e/ich8lan.c
index 15b1503d5b6c..1f51252b465a 100644
--- a/drivers/net/ethernet/intel/e1000e/ich8lan.c
+++ b/drivers/net/ethernet/intel/e1000e/ich8lan.c
@@ -1006,8 +1006,8 @@ static s32 e1000_platform_pm_pch_lpt(struct e1000_hw *hw, bool link)
 {
 	u32 reg = link << (E1000_LTRV_REQ_SHIFT + E1000_LTRV_NOSNOOP_SHIFT) |
 	    link << E1000_LTRV_REQ_SHIFT | E1000_LTRV_SEND;
-	u16 max_ltr_enc_d = 0;	/* maximum LTR decoded by platform */
-	u16 lat_enc_d = 0;	/* latency decoded */
+	u32 max_ltr_enc_d = 0;	/* maximum LTR decoded by platform */
+	u32 lat_enc_d = 0;	/* latency decoded */
 	u16 lat_enc = 0;	/* latency encoded */
 
 	if (link) {
-- 
Gitee


From d51577c959b12c0ff173292fd7ebc9090ed20a29 Mon Sep 17 00:00:00 2001
From: Sergey Matyukevich <sergey.matyukevich@synopsys.com>
Date: Tue, 26 Jul 2022 17:39:15 +0800
Subject: [PATCH 623/674] ARC: entry: fix syscall_trace_exit argument

stable inclusion
from stable-v5.10.113
commit 5580b974a84b30f6da90a3a562ea0dbfb0038110
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=5580b974a84b30f6da90a3a562ea0dbfb0038110

--------------------------------

commit b1c6ecfdd06907554518ec384ce8e99889d15193 upstream.

Function syscall_trace_exit expects pointer to pt_regs. However
r0 is also used to keep syscall return value. Restore pointer
to pt_regs before calling syscall_trace_exit.

Cc: <stable@vger.kernel.org>
Signed-off-by: Sergey Matyukevich <sergey.matyukevich@synopsys.com>
Signed-off-by: Vineet Gupta <vgupta@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 arch/arc/kernel/entry.S | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/arc/kernel/entry.S b/arch/arc/kernel/entry.S
index ae656bfc31c3..301ade4d0b94 100644
--- a/arch/arc/kernel/entry.S
+++ b/arch/arc/kernel/entry.S
@@ -199,6 +199,7 @@ tracesys_exit:
 	st  r0, [sp, PT_r0]     ; sys call return value in pt_regs
 
 	;POST Sys Call Ptrace Hook
+	mov r0, sp		; pt_regs needed
 	bl  @syscall_trace_exit
 	b   ret_from_exception ; NOT ret_from_system_call at is saves r0 which
 	; we'd done before calling post hook above
-- 
Gitee


From 1e6c493076318775eed92b3e3fe887ab4dfde197 Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Tue, 26 Jul 2022 17:39:16 +0800
Subject: [PATCH 624/674] arm_pmu: Validate single/group leader events

stable inclusion
from stable-v5.10.113
commit c55327bc3712ca96dc05bc4c6aab805b6fa8bb74
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=c55327bc3712ca96dc05bc4c6aab805b6fa8bb74

--------------------------------

commit e5c23779f93d45e39a52758ca593bd7e62e9b4be upstream.

In the case where there is only a cycle counter available (i.e.
PMCR_EL0.N is 0) and an event other than CPU cycles is opened, the open
should fail as the event can never possibly be scheduled. However, the
event validation when an event is opened is skipped when the group
leader is opened. Fix this by always validating the group leader events.

Reported-by: Al Grant <al.grant@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Rob Herring <robh@kernel.org>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Link: https://lore.kernel.org/r/20220408203330.4014015-1-robh@kernel.org
Cc: <stable@vger.kernel.org>
Signed-off-by: Will Deacon <will@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/perf/arm_pmu.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
index fe075d9f95e2..c87faafbdba2 100644
--- a/drivers/perf/arm_pmu.c
+++ b/drivers/perf/arm_pmu.c
@@ -398,6 +398,9 @@ validate_group(struct perf_event *event)
 	if (!validate_event(event->pmu, &fake_pmu, leader))
 		return -EINVAL;
 
+	if (event == leader)
+		return 0;
+
 	for_each_sibling_event(sibling, leader) {
 		if (!validate_event(event->pmu, &fake_pmu, sibling))
 			return -EINVAL;
@@ -487,12 +490,7 @@ __hw_perf_event_init(struct perf_event *event)
 		local64_set(&hwc->period_left, hwc->sample_period);
 	}
 
-	if (event->group_leader != event) {
-		if (validate_group(event) != 0)
-			return -EINVAL;
-	}
-
-	return 0;
+	return validate_group(event);
 }
 
 static int armpmu_event_init(struct perf_event *event)
-- 
Gitee


From 8109e1614b619b8cb7791a76ca932a6e9da496d8 Mon Sep 17 00:00:00 2001
From: kuyo chang <kuyo.chang@mediatek.com>
Date: Tue, 26 Jul 2022 17:39:17 +0800
Subject: [PATCH 625/674] sched/pelt: Fix attach_entity_load_avg() corner case

stable inclusion
from stable-v5.10.113
commit 88fcfd6ee6c5a617e712b346e9c15fc3057e532e
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=88fcfd6ee6c5a617e712b346e9c15fc3057e532e

--------------------------------

[ Upstream commit 40f5aa4c5eaebfeaca4566217cb9c468e28ed682 ]

The warning in cfs_rq_is_decayed() triggered:

    SCHED_WARN_ON(cfs_rq->avg.load_avg ||
		  cfs_rq->avg.util_avg ||
		  cfs_rq->avg.runnable_avg)

There exists a corner case in attach_entity_load_avg() which will
cause load_sum to be zero while load_avg will not be.

Consider se_weight is 88761 as per the sched_prio_to_weight[] table.
Further assume the get_pelt_divider() is 47742, this gives:
se->avg.load_avg is 1.

However, calculating load_sum:

  se->avg.load_sum = div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
  se->avg.load_sum = 1*47742/88761 = 0.

Then enqueue_load_avg() adds this to the cfs_rq totals:

  cfs_rq->avg.load_avg += se->avg.load_avg;
  cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;

Resulting in load_avg being 1 with load_sum is 0, which will trigger
the WARN.

Fixes: f207934fb79d ("sched/fair: Align PELT windows between cfs_rq and its se")
Signed-off-by: kuyo chang <kuyo.chang@mediatek.com>
[peterz: massage changelog]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Link: https://lkml.kernel.org/r/20220414090229.342-1-kuyo.chang@mediatek.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 kernel/sched/fair.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 50d457979db6..09f002f1fe5d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3771,11 +3771,11 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 
 	se->avg.runnable_sum = se->avg.runnable_avg * divider;
 
-	se->avg.load_sum = divider;
-	if (se_weight(se)) {
-		se->avg.load_sum =
-			div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
-	}
+	se->avg.load_sum = se->avg.load_avg * divider;
+	if (se_weight(se) < se->avg.load_sum)
+		se->avg.load_sum = div_u64(se->avg.load_sum, se_weight(se));
+	else
+		se->avg.load_sum = 1;
 
 	enqueue_load_avg(cfs_rq, se);
 	cfs_rq->avg.util_avg += se->avg.util_avg;
-- 
Gitee


From 47edbfbf9e4c8b189615ea817ffc52e0dc011aed Mon Sep 17 00:00:00 2001
From: Zhipeng Xie <xiezhipeng1@huawei.com>
Date: Tue, 26 Jul 2022 17:39:18 +0800
Subject: [PATCH 626/674] perf/core: Fix perf_mmap fail when
 CONFIG_PERF_USE_VMALLOC enabled

stable inclusion
from stable-v5.10.113
commit 51d9cbbb0f5a175e5b4c4a25d2ec995363304860
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=51d9cbbb0f5a175e5b4c4a25d2ec995363304860

--------------------------------

[ Upstream commit 60490e7966659b26d74bf1fa4aa8693d9a94ca88 ]

This problem can be reproduced with CONFIG_PERF_USE_VMALLOC enabled on
both x86_64 and aarch64 arch when using sysdig -B(using ebpf)[1].
sysdig -B works fine after rebuilding the kernel with
CONFIG_PERF_USE_VMALLOC disabled.

I tracked it down to the if condition event->rb->nr_pages != nr_pages
in perf_mmap is true when CONFIG_PERF_USE_VMALLOC is enabled where
event->rb->nr_pages = 1 and nr_pages = 2048 resulting perf_mmap to
return -EINVAL. This is because when CONFIG_PERF_USE_VMALLOC is
enabled, rb->nr_pages is always equal to 1.

Arch with CONFIG_PERF_USE_VMALLOC enabled by default:
	arc/arm/csky/mips/sh/sparc/xtensa

Arch with CONFIG_PERF_USE_VMALLOC disabled by default:
	x86_64/aarch64/...

Fix this problem by using data_page_nr()

[1] https://github.com/draios/sysdig

Fixes: 906010b2134e ("perf_event: Provide vmalloc() based mmap() backing")
Signed-off-by: Zhipeng Xie <xiezhipeng1@huawei.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20220209145417.6495-1-xiezhipeng1@huawei.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 kernel/events/core.c        | 2 +-
 kernel/events/internal.h    | 5 +++++
 kernel/events/ring_buffer.c | 5 -----
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4bd9dd6c3b72..68dc8a8e7990 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6174,7 +6174,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 again:
 	mutex_lock(&event->mmap_mutex);
 	if (event->rb) {
-		if (event->rb->nr_pages != nr_pages) {
+		if (data_page_nr(event->rb) != nr_pages) {
 			ret = -EINVAL;
 			goto unlock;
 		}
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 228801e20788..aa23ffdaf819 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -116,6 +116,11 @@ static inline int page_order(struct perf_buffer *rb)
 }
 #endif
 
+static inline int data_page_nr(struct perf_buffer *rb)
+{
+	return rb->nr_pages << page_order(rb);
+}
+
 static inline unsigned long perf_data_size(struct perf_buffer *rb)
 {
 	return rb->nr_pages << (PAGE_SHIFT + page_order(rb));
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index ef91ae75ca56..4032cd475000 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -856,11 +856,6 @@ void rb_free(struct perf_buffer *rb)
 }
 
 #else
-static int data_page_nr(struct perf_buffer *rb)
-{
-	return rb->nr_pages << page_order(rb);
-}
-
 static struct page *
 __perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff)
 {
-- 
Gitee


From 5f70f8f020405ec27343ea6c8c7f580922443b4a Mon Sep 17 00:00:00 2001
From: Dave Stevenson <dave.stevenson@raspberrypi.com>
Date: Tue, 26 Jul 2022 17:39:19 +0800
Subject: [PATCH 627/674] drm/panel/raspberrypi-touchscreen: Avoid NULL deref
 if not initialised

stable inclusion
from stable-v5.10.113
commit 231381f5211620cec836b921f1c7a2cf702b3e8a
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=231381f5211620cec836b921f1c7a2cf702b3e8a

--------------------------------

[ Upstream commit f92055ae0acb035891e988ce345d6b81a0316423 ]

If a call to rpi_touchscreen_i2c_write from rpi_touchscreen_probe
fails before mipi_dsi_device_register_full is called, then
in trying to log the error message if uses ts->dsi->dev when
it is still NULL.

Use ts->i2c->dev instead, which is initialised earlier in probe.

Fixes: 2f733d6194bd ("drm/panel: Add support for the Raspberry Pi 7" Touchscreen.")
Signed-off-by: Dave Stevenson <dave.stevenson@raspberrypi.com>
Signed-off-by: Stefan Wahren <stefan.wahren@i2se.com>
Signed-off-by: Maxime Ripard <maxime@cerno.tech>
Link: https://patchwork.freedesktop.org/patch/msgid/20220415162513.42190-2-stefan.wahren@i2se.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c b/drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c
index bbdd086be7f5..90487df62480 100644
--- a/drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c
+++ b/drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c
@@ -229,7 +229,7 @@ static void rpi_touchscreen_i2c_write(struct rpi_touchscreen *ts,
 
 	ret = i2c_smbus_write_byte_data(ts->i2c, reg, val);
 	if (ret)
-		dev_err(&ts->dsi->dev, "I2C write failed: %d\n", ret);
+		dev_err(&ts->i2c->dev, "I2C write failed: %d\n", ret);
 }
 
 static int rpi_touchscreen_write(struct rpi_touchscreen *ts, u16 reg, u32 val)
-- 
Gitee


From 27f402d6441a30ef1541f3bca32429aceb1ce8de Mon Sep 17 00:00:00 2001
From: Dave Stevenson <dave.stevenson@raspberrypi.com>
Date: Tue, 26 Jul 2022 17:39:20 +0800
Subject: [PATCH 628/674] drm/panel/raspberrypi-touchscreen: Initialise the
 bridge in prepare

stable inclusion
from stable-v5.10.113
commit 405d98427416849cf37c84c0c70bd5008b686a1e
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=405d98427416849cf37c84c0c70bd5008b686a1e

--------------------------------

[ Upstream commit 5f18c0782b99e26121efa93d20b76c19e17aa1dd ]

The panel has a prepare call which is before video starts, and an
enable call which is after.
The Toshiba bridge should be configured before video, so move
the relevant power and initialisation calls to prepare.

Fixes: 2f733d6194bd ("drm/panel: Add support for the Raspberry Pi 7" Touchscreen.")
Signed-off-by: Dave Stevenson <dave.stevenson@raspberrypi.com>
Signed-off-by: Stefan Wahren <stefan.wahren@i2se.com>
Signed-off-by: Maxime Ripard <maxime@cerno.tech>
Link: https://patchwork.freedesktop.org/patch/msgid/20220415162513.42190-3-stefan.wahren@i2se.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c b/drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c
index 90487df62480..4b92c6341490 100644
--- a/drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c
+++ b/drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c
@@ -265,7 +265,7 @@ static int rpi_touchscreen_noop(struct drm_panel *panel)
 	return 0;
 }
 
-static int rpi_touchscreen_enable(struct drm_panel *panel)
+static int rpi_touchscreen_prepare(struct drm_panel *panel)
 {
 	struct rpi_touchscreen *ts = panel_to_ts(panel);
 	int i;
@@ -295,6 +295,13 @@ static int rpi_touchscreen_enable(struct drm_panel *panel)
 	rpi_touchscreen_write(ts, DSI_STARTDSI, 0x01);
 	msleep(100);
 
+	return 0;
+}
+
+static int rpi_touchscreen_enable(struct drm_panel *panel)
+{
+	struct rpi_touchscreen *ts = panel_to_ts(panel);
+
 	/* Turn on the backlight. */
 	rpi_touchscreen_i2c_write(ts, REG_PWM, 255);
 
@@ -349,7 +356,7 @@ static int rpi_touchscreen_get_modes(struct drm_panel *panel,
 static const struct drm_panel_funcs rpi_touchscreen_funcs = {
 	.disable = rpi_touchscreen_disable,
 	.unprepare = rpi_touchscreen_noop,
-	.prepare = rpi_touchscreen_noop,
+	.prepare = rpi_touchscreen_prepare,
 	.enable = rpi_touchscreen_enable,
 	.get_modes = rpi_touchscreen_get_modes,
 };
-- 
Gitee


From d4248c33b8a3403ca22cce4319412763975d9b9f Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Tue, 26 Jul 2022 17:39:21 +0800
Subject: [PATCH 629/674] KVM: PPC: Fix TCE handling for VFIO

stable inclusion
from stable-v5.10.113
commit f8f8b3124b899867a18a3f63e538c791e21252ac
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f8f8b3124b899867a18a3f63e538c791e21252ac

--------------------------------

[ Upstream commit 26a62b750a4e6364b0393562f66759b1494c3a01 ]

The LoPAPR spec defines a guest visible IOMMU with a variable page size.
Currently QEMU advertises 4K, 64K, 2M, 16MB pages, a Linux VM picks
the biggest (16MB). In the case of a passed though PCI device, there is
a hardware IOMMU which does not support all pages sizes from the above -
P8 cannot do 2MB and P9 cannot do 16MB. So for each emulated
16M IOMMU page we may create several smaller mappings ("TCEs") in
the hardware IOMMU.

The code wrongly uses the emulated TCE index instead of hardware TCE
index in error handling. The problem is easier to see on POWER8 with
multi-level TCE tables (when only the first level is preallocated)
as hash mode uses real mode TCE hypercalls handlers.
The kernel starts using indirect tables when VMs get bigger than 128GB
(depends on the max page order).
The very first real mode hcall is going to fail with H_TOO_HARD as
in the real mode we cannot allocate memory for TCEs (we can in the virtual
mode) but on the way out the code attempts to clear hardware TCEs using
emulated TCE indexes which corrupts random kernel memory because
it_offset==1<<59 is subtracted from those indexes and the resulting index
is out of the TCE table bounds.

This fixes kvmppc_clear_tce() to use the correct TCE indexes.

While at it, this fixes TCE cache invalidation which uses emulated TCE
indexes instead of the hardware ones. This went unnoticed as 64bit DMA
is used these days and VMs map all RAM in one go and only then do DMA
and this is when the TCE cache gets populated.

Potentially this could slow down mapping, however normally 16MB
emulated pages are backed by 64K hardware pages so it is one write to
the "TCE Kill" per 256 updates which is not that bad considering the size
of the cache (1024 TCEs or so).

Fixes: ca1fc489cfa0 ("KVM: PPC: Book3S: Allow backing bigger guest IOMMU pages with smaller physical pages")

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Tested-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Frederic Barrat <fbarrat@linux.ibm.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20220420050840.328223-1-aik@ozlabs.ru
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 arch/powerpc/kvm/book3s_64_vio.c    | 45 +++++++++++++++--------------
 arch/powerpc/kvm/book3s_64_vio_hv.c | 44 ++++++++++++++--------------
 2 files changed, 45 insertions(+), 44 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index 8da93fdfa59e..c640053ab03f 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -421,13 +421,19 @@ static void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt,
 	tbl[idx % TCES_PER_PAGE] = tce;
 }
 
-static void kvmppc_clear_tce(struct mm_struct *mm, struct iommu_table *tbl,
-		unsigned long entry)
+static void kvmppc_clear_tce(struct mm_struct *mm, struct kvmppc_spapr_tce_table *stt,
+		struct iommu_table *tbl, unsigned long entry)
 {
-	unsigned long hpa = 0;
-	enum dma_data_direction dir = DMA_NONE;
+	unsigned long i;
+	unsigned long subpages = 1ULL << (stt->page_shift - tbl->it_page_shift);
+	unsigned long io_entry = entry << (stt->page_shift - tbl->it_page_shift);
+
+	for (i = 0; i < subpages; ++i) {
+		unsigned long hpa = 0;
+		enum dma_data_direction dir = DMA_NONE;
 
-	iommu_tce_xchg_no_kill(mm, tbl, entry, &hpa, &dir);
+		iommu_tce_xchg_no_kill(mm, tbl, io_entry + i, &hpa, &dir);
+	}
 }
 
 static long kvmppc_tce_iommu_mapped_dec(struct kvm *kvm,
@@ -486,6 +492,8 @@ static long kvmppc_tce_iommu_unmap(struct kvm *kvm,
 			break;
 	}
 
+	iommu_tce_kill(tbl, io_entry, subpages);
+
 	return ret;
 }
 
@@ -545,6 +553,8 @@ static long kvmppc_tce_iommu_map(struct kvm *kvm,
 			break;
 	}
 
+	iommu_tce_kill(tbl, io_entry, subpages);
+
 	return ret;
 }
 
@@ -591,10 +601,9 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
 			ret = kvmppc_tce_iommu_map(vcpu->kvm, stt, stit->tbl,
 					entry, ua, dir);
 
-		iommu_tce_kill(stit->tbl, entry, 1);
 
 		if (ret != H_SUCCESS) {
-			kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl, entry);
+			kvmppc_clear_tce(vcpu->kvm->mm, stt, stit->tbl, entry);
 			goto unlock_exit;
 		}
 	}
@@ -670,13 +679,13 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
 		 */
 		if (get_user(tce, tces + i)) {
 			ret = H_TOO_HARD;
-			goto invalidate_exit;
+			goto unlock_exit;
 		}
 		tce = be64_to_cpu(tce);
 
 		if (kvmppc_tce_to_ua(vcpu->kvm, tce, &ua)) {
 			ret = H_PARAMETER;
-			goto invalidate_exit;
+			goto unlock_exit;
 		}
 
 		list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
@@ -685,19 +694,15 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
 					iommu_tce_direction(tce));
 
 			if (ret != H_SUCCESS) {
-				kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl,
-						entry);
-				goto invalidate_exit;
+				kvmppc_clear_tce(vcpu->kvm->mm, stt, stit->tbl,
+						 entry + i);
+				goto unlock_exit;
 			}
 		}
 
 		kvmppc_tce_put(stt, entry + i, tce);
 	}
 
-invalidate_exit:
-	list_for_each_entry_lockless(stit, &stt->iommu_tables, next)
-		iommu_tce_kill(stit->tbl, entry, npages);
-
 unlock_exit:
 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
 
@@ -736,20 +741,16 @@ long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu,
 				continue;
 
 			if (ret == H_TOO_HARD)
-				goto invalidate_exit;
+				return ret;
 
 			WARN_ON_ONCE(1);
-			kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl, entry);
+			kvmppc_clear_tce(vcpu->kvm->mm, stt, stit->tbl, entry + i);
 		}
 	}
 
 	for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift))
 		kvmppc_tce_put(stt, ioba >> stt->page_shift, tce_value);
 
-invalidate_exit:
-	list_for_each_entry_lockless(stit, &stt->iommu_tables, next)
-		iommu_tce_kill(stit->tbl, ioba >> stt->page_shift, npages);
-
 	return ret;
 }
 EXPORT_SYMBOL_GPL(kvmppc_h_stuff_tce);
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
index e5ba96c41f3f..57af53a6a2d8 100644
--- a/arch/powerpc/kvm/book3s_64_vio_hv.c
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -247,13 +247,19 @@ static void iommu_tce_kill_rm(struct iommu_table *tbl,
 		tbl->it_ops->tce_kill(tbl, entry, pages, true);
 }
 
-static void kvmppc_rm_clear_tce(struct kvm *kvm, struct iommu_table *tbl,
-		unsigned long entry)
+static void kvmppc_rm_clear_tce(struct kvm *kvm, struct kvmppc_spapr_tce_table *stt,
+		struct iommu_table *tbl, unsigned long entry)
 {
-	unsigned long hpa = 0;
-	enum dma_data_direction dir = DMA_NONE;
+	unsigned long i;
+	unsigned long subpages = 1ULL << (stt->page_shift - tbl->it_page_shift);
+	unsigned long io_entry = entry << (stt->page_shift - tbl->it_page_shift);
+
+	for (i = 0; i < subpages; ++i) {
+		unsigned long hpa = 0;
+		enum dma_data_direction dir = DMA_NONE;
 
-	iommu_tce_xchg_no_kill_rm(kvm->mm, tbl, entry, &hpa, &dir);
+		iommu_tce_xchg_no_kill_rm(kvm->mm, tbl, io_entry + i, &hpa, &dir);
+	}
 }
 
 static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm,
@@ -316,6 +322,8 @@ static long kvmppc_rm_tce_iommu_unmap(struct kvm *kvm,
 			break;
 	}
 
+	iommu_tce_kill_rm(tbl, io_entry, subpages);
+
 	return ret;
 }
 
@@ -379,6 +387,8 @@ static long kvmppc_rm_tce_iommu_map(struct kvm *kvm,
 			break;
 	}
 
+	iommu_tce_kill_rm(tbl, io_entry, subpages);
+
 	return ret;
 }
 
@@ -424,10 +434,8 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
 			ret = kvmppc_rm_tce_iommu_map(vcpu->kvm, stt,
 					stit->tbl, entry, ua, dir);
 
-		iommu_tce_kill_rm(stit->tbl, entry, 1);
-
 		if (ret != H_SUCCESS) {
-			kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry);
+			kvmppc_rm_clear_tce(vcpu->kvm, stt, stit->tbl, entry);
 			return ret;
 		}
 	}
@@ -569,7 +577,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
 		ua = 0;
 		if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce, &ua)) {
 			ret = H_PARAMETER;
-			goto invalidate_exit;
+			goto unlock_exit;
 		}
 
 		list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
@@ -578,19 +586,15 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
 					iommu_tce_direction(tce));
 
 			if (ret != H_SUCCESS) {
-				kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl,
-						entry);
-				goto invalidate_exit;
+				kvmppc_rm_clear_tce(vcpu->kvm, stt, stit->tbl,
+						entry + i);
+				goto unlock_exit;
 			}
 		}
 
 		kvmppc_rm_tce_put(stt, entry + i, tce);
 	}
 
-invalidate_exit:
-	list_for_each_entry_lockless(stit, &stt->iommu_tables, next)
-		iommu_tce_kill_rm(stit->tbl, entry, npages);
-
 unlock_exit:
 	if (!prereg)
 		arch_spin_unlock(&kvm->mmu_lock.rlock.raw_lock);
@@ -632,20 +636,16 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu,
 				continue;
 
 			if (ret == H_TOO_HARD)
-				goto invalidate_exit;
+				return ret;
 
 			WARN_ON_ONCE_RM(1);
-			kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry);
+			kvmppc_rm_clear_tce(vcpu->kvm, stt, stit->tbl, entry + i);
 		}
 	}
 
 	for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift))
 		kvmppc_rm_tce_put(stt, ioba >> stt->page_shift, tce_value);
 
-invalidate_exit:
-	list_for_each_entry_lockless(stit, &stt->iommu_tables, next)
-		iommu_tce_kill_rm(stit->tbl, ioba >> stt->page_shift, npages);
-
 	return ret;
 }
 
-- 
Gitee


From c0d651a41c25108ae1ada52d0b1f1a3e3020d53f Mon Sep 17 00:00:00 2001
From: Miaoqian Lin <linmq006@gmail.com>
Date: Tue, 26 Jul 2022 17:39:22 +0800
Subject: [PATCH 630/674] drm/vc4: Use pm_runtime_resume_and_get to fix
 pm_runtime_get_sync() usage

stable inclusion
from stable-v5.10.113
commit 0a2cef65b32919af7df4df979c5eede5f7825f17
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=0a2cef65b32919af7df4df979c5eede5f7825f17

--------------------------------

[ Upstream commit 3d0b93d92a2790337aa9d18cb332d02356a24126 ]

If the device is already in a runtime PM enabled state
pm_runtime_get_sync() will return 1.

Also, we need to call pm_runtime_put_noidle() when pm_runtime_get_sync()
fails, so use pm_runtime_resume_and_get() instead. this function
will handle this.

Fixes: 4078f5757144 ("drm/vc4: Add DSI driver")
Signed-off-by: Miaoqian Lin <linmq006@gmail.com>
Signed-off-by: Maxime Ripard <maxime@cerno.tech>
Link: https://patchwork.freedesktop.org/patch/msgid/20220420135008.2757-1-linmq006@gmail.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/gpu/drm/vc4/vc4_dsi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/vc4/vc4_dsi.c b/drivers/gpu/drm/vc4/vc4_dsi.c
index eaf276978ee7..ad84b56f4091 100644
--- a/drivers/gpu/drm/vc4/vc4_dsi.c
+++ b/drivers/gpu/drm/vc4/vc4_dsi.c
@@ -835,7 +835,7 @@ static void vc4_dsi_encoder_enable(struct drm_encoder *encoder)
 	unsigned long phy_clock;
 	int ret;
 
-	ret = pm_runtime_get_sync(dev);
+	ret = pm_runtime_resume_and_get(dev);
 	if (ret) {
 		DRM_ERROR("Failed to runtime PM enable on DSI%d\n", dsi->port);
 		return;
-- 
Gitee


From a0820bde54b9a6f88f03400828dd5f0fdfa2051f Mon Sep 17 00:00:00 2001
From: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
Date: Tue, 26 Jul 2022 17:39:23 +0800
Subject: [PATCH 631/674] powerpc/perf: Fix power9 event alternatives

stable inclusion
from stable-v5.10.113
commit e012f9d1af54ca3c24ca0e9ec03a1a212972771c
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=e012f9d1af54ca3c24ca0e9ec03a1a212972771c

--------------------------------

[ Upstream commit 0dcad700bb2776e3886fe0a645a4bf13b1e747cd ]

When scheduling a group of events, there are constraint checks done to
make sure all events can go in a group. Example, one of the criteria is
that events in a group cannot use the same PMC. But platform specific
PMU supports alternative event for some of the event codes. During
perf_event_open(), if any event group doesn't match constraint check
criteria, further lookup is done to find alternative event.

By current design, the array of alternatives events in PMU code is
expected to be sorted by column 0. This is because in
find_alternative() the return criteria is based on event code
comparison. ie. "event < ev_alt[i][0])". This optimisation is there
since find_alternative() can be called multiple times. In power9 PMU
code, the alternative event array is not sorted properly and hence there
is breakage in finding alternative events.

To work with existing logic, fix the alternative event array to be
sorted by column 0 for power9-pmu.c

Results:

With alternative events, multiplexing can be avoided. That is, for
example, in power9 PM_LD_MISS_L1 (0x3e054) has alternative event,
PM_LD_MISS_L1_ALT (0x400f0). This is an identical event which can be
programmed in a different PMC.

Before:

 # perf stat -e r3e054,r300fc

 Performance counter stats for 'system wide':

           1057860      r3e054              (50.21%)
               379      r300fc              (49.79%)

       0.944329741 seconds time elapsed

Since both the events are using PMC3 in this case, they are
multiplexed here.

After:

 # perf stat -e r3e054,r300fc

 Performance counter stats for 'system wide':

           1006948      r3e054
               182      r300fc

Fixes: 91e0bd1e6251 ("powerpc/perf: Add PM_LD_MISS_L1 and PM_BR_2PATH to power9 event list")
Signed-off-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
Reviewed-by: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20220419114828.89843-1-atrajeev@linux.vnet.ibm.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 arch/powerpc/perf/power9-pmu.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/perf/power9-pmu.c b/arch/powerpc/perf/power9-pmu.c
index 2a57e93a79dc..7245355bee28 100644
--- a/arch/powerpc/perf/power9-pmu.c
+++ b/arch/powerpc/perf/power9-pmu.c
@@ -133,11 +133,11 @@ int p9_dd22_bl_ev[] = {
 
 /* Table of alternatives, sorted by column 0 */
 static const unsigned int power9_event_alternatives[][MAX_ALT] = {
-	{ PM_INST_DISP,			PM_INST_DISP_ALT },
-	{ PM_RUN_CYC_ALT,		PM_RUN_CYC },
-	{ PM_RUN_INST_CMPL_ALT,		PM_RUN_INST_CMPL },
-	{ PM_LD_MISS_L1,		PM_LD_MISS_L1_ALT },
 	{ PM_BR_2PATH,			PM_BR_2PATH_ALT },
+	{ PM_INST_DISP,			PM_INST_DISP_ALT },
+	{ PM_RUN_CYC_ALT,               PM_RUN_CYC },
+	{ PM_LD_MISS_L1,                PM_LD_MISS_L1_ALT },
+	{ PM_RUN_INST_CMPL_ALT,         PM_RUN_INST_CMPL },
 };
 
 static int power9_get_alternatives(u64 event, unsigned int flags, u64 alt[])
-- 
Gitee


From ce66add4b99dde3edf99491d41ea364b6005683c Mon Sep 17 00:00:00 2001
From: Leo Yan <leo.yan@linaro.org>
Date: Tue, 26 Jul 2022 17:39:24 +0800
Subject: [PATCH 632/674] perf report: Set PERF_SAMPLE_DATA_SRC bit for Arm SPE
 event

stable inclusion
from stable-v5.10.113
commit 19590bbc691d81f03d2a24a3ec30c399ebe071e0
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=19590bbc691d81f03d2a24a3ec30c399ebe071e0

--------------------------------

[ Upstream commit ccb17caecfbd542f49a2a79ae088136ba8bfb794 ]

Since commit bb30acae4c4dacfa ("perf report: Bail out --mem-mode if mem
info is not available") "perf mem report" and "perf report --mem-mode"
don't report result if the PERF_SAMPLE_DATA_SRC bit is missed in sample
type.

The commit ffab487052054162 ("perf: arm-spe: Fix perf report
--mem-mode") partially fixes the issue.  It adds PERF_SAMPLE_DATA_SRC
bit for Arm SPE event, this allows the perf data file generated by
kernel v5.18-rc1 or later version can be reported properly.

On the other hand, perf tool still fails to be backward compatibility
for a data file recorded by an older version's perf which contains Arm
SPE trace data.  This patch is a workaround in reporting phase, when
detects ARM SPE PMU event and without PERF_SAMPLE_DATA_SRC bit, it will
force to set the bit in the sample type and give a warning info.

Fixes: bb30acae4c4dacfa ("perf report: Bail out --mem-mode if mem info is not available")
Reviewed-by: James Clark <james.clark@arm.com>
Signed-off-by: Leo Yan <leo.yan@linaro.org>
Tested-by: German Gomez <german.gomez@arm.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
Link: https://lore.kernel.org/r/20220414123201.842754-1-leo.yan@linaro.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 tools/perf/builtin-report.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 91cab5cdfbc1..b55ee073c2f7 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -340,6 +340,7 @@ static int report__setup_sample_type(struct report *rep)
 	struct perf_session *session = rep->session;
 	u64 sample_type = evlist__combined_sample_type(session->evlist);
 	bool is_pipe = perf_data__is_pipe(session->data);
+	struct evsel *evsel;
 
 	if (session->itrace_synth_opts->callchain ||
 	    session->itrace_synth_opts->add_callchain ||
@@ -394,6 +395,19 @@ static int report__setup_sample_type(struct report *rep)
 	}
 
 	if (sort__mode == SORT_MODE__MEMORY) {
+		/*
+		 * FIXUP: prior to kernel 5.18, Arm SPE missed to set
+		 * PERF_SAMPLE_DATA_SRC bit in sample type.  For backward
+		 * compatibility, set the bit if it's an old perf data file.
+		 */
+		evlist__for_each_entry(session->evlist, evsel) {
+			if (strstr(evsel->name, "arm_spe") &&
+				!(sample_type & PERF_SAMPLE_DATA_SRC)) {
+				evsel->core.attr.sample_type |= PERF_SAMPLE_DATA_SRC;
+				sample_type |= PERF_SAMPLE_DATA_SRC;
+			}
+		}
+
 		if (!is_pipe && !(sample_type & PERF_SAMPLE_DATA_SRC)) {
 			ui__error("Selected --mem-mode but no mem data. "
 				  "Did you call perf record without -d?\n");
-- 
Gitee


From 97cada6b51f5737aba63d7d3f69cf413a3c30a1b Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <djwong@kernel.org>
Date: Tue, 26 Jul 2022 17:39:25 +0800
Subject: [PATCH 633/674] ext4: fix fallocate to use file_modified to update
 permissions consistently

stable inclusion
from stable-v5.10.113
commit f6038d43b25bba1cd50d2a77e207f6550aee9954
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f6038d43b25bba1cd50d2a77e207f6550aee9954

--------------------------------

commit ad5cd4f4ee4d5fcdb1bfb7a0c073072961e70783 upstream.

Since the initial introduction of (posix) fallocate back at the turn of
the century, it has been possible to use this syscall to change the
user-visible contents of files.  This can happen by extending the file
size during a preallocation, or through any of the newer modes (punch,
zero, collapse, insert range).  Because the call can be used to change
file contents, we should treat it like we do any other modification to a
file -- update the mtime, and drop set[ug]id privileges/capabilities.

The VFS function file_modified() does all this for us if pass it a
locked inode, so let's make fallocate drop permissions correctly.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Link: https://lore.kernel.org/r/20220308185043.GA117678@magnolia
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 fs/ext4/ext4.h    |  2 +-
 fs/ext4/extents.c | 32 +++++++++++++++++++++++++-------
 fs/ext4/inode.c   |  7 ++++++-
 3 files changed, 32 insertions(+), 9 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 277f89d5de03..df43cda8fc2f 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2908,7 +2908,7 @@ extern int ext4_inode_attach_jinode(struct inode *inode);
 extern int ext4_can_truncate(struct inode *inode);
 extern int ext4_truncate(struct inode *);
 extern int ext4_break_layouts(struct inode *);
-extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
+extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
 extern void ext4_set_inode_flags(struct inode *, bool init);
 extern int ext4_alloc_da_blocks(struct inode *inode);
 extern void ext4_set_aops(struct inode *inode);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 4323186bae78..9d06695c04ab 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4510,9 +4510,9 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
 	return ret > 0 ? ret2 : ret;
 }
 
-static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);
+static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len);
 
-static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len);
+static int ext4_insert_range(struct file *file, loff_t offset, loff_t len);
 
 static long ext4_zero_range(struct file *file, loff_t offset,
 			    loff_t len, int mode)
@@ -4583,6 +4583,10 @@ static long ext4_zero_range(struct file *file, loff_t offset,
 	/* Wait all existing dio workers, newcomers will block on i_mutex */
 	inode_dio_wait(inode);
 
+	ret = file_modified(file);
+	if (ret)
+		goto out_mutex;
+
 	/* Preallocate the range including the unaligned edges */
 	if (partial_begin || partial_end) {
 		ret = ext4_alloc_file_blocks(file,
@@ -4707,17 +4711,17 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 		goto exit;
 
 	if (mode & FALLOC_FL_PUNCH_HOLE) {
-		ret = ext4_punch_hole(inode, offset, len);
+		ret = ext4_punch_hole(file, offset, len);
 		goto exit;
 	}
 
 	if (mode & FALLOC_FL_COLLAPSE_RANGE) {
-		ret = ext4_collapse_range(inode, offset, len);
+		ret = ext4_collapse_range(file, offset, len);
 		goto exit;
 	}
 
 	if (mode & FALLOC_FL_INSERT_RANGE) {
-		ret = ext4_insert_range(inode, offset, len);
+		ret = ext4_insert_range(file, offset, len);
 		goto exit;
 	}
 
@@ -4753,6 +4757,10 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 	/* Wait all existing dio workers, newcomers will block on i_mutex */
 	inode_dio_wait(inode);
 
+	ret = file_modified(file);
+	if (ret)
+		goto out;
+
 	ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags);
 	if (ret)
 		goto out;
@@ -5255,8 +5263,9 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
  * This implements the fallocate's collapse range functionality for ext4
  * Returns: 0 and non-zero on error.
  */
-static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
+static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
 {
+	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
 	ext4_lblk_t punch_start, punch_stop;
 	handle_t *handle;
@@ -5307,6 +5316,10 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
 	/* Wait for existing dio to complete */
 	inode_dio_wait(inode);
 
+	ret = file_modified(file);
+	if (ret)
+		goto out_mutex;
+
 	/*
 	 * Prevent page faults from reinstantiating pages we have released from
 	 * page cache.
@@ -5401,8 +5414,9 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
  * by len bytes.
  * Returns 0 on success, error otherwise.
  */
-static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
+static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
 {
+	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
 	handle_t *handle;
 	struct ext4_ext_path *path;
@@ -5458,6 +5472,10 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
 	/* Wait for existing dio to complete */
 	inode_dio_wait(inode);
 
+	ret = file_modified(file);
+	if (ret)
+		goto out_mutex;
+
 	/*
 	 * Prevent page faults from reinstantiating pages we have released from
 	 * page cache.
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index e85c238edd85..69fcf82f2c81 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3972,8 +3972,9 @@ int ext4_break_layouts(struct inode *inode)
  * Returns: 0 on success or negative on failure
  */
 
-int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
+int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
 {
+	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
 	ext4_lblk_t first_block, stop_block;
 	struct address_space *mapping = inode->i_mapping;
@@ -4026,6 +4027,10 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
 	/* Wait all existing dio workers, newcomers will block on i_mutex */
 	inode_dio_wait(inode);
 
+	ret = file_modified(file);
+	if (ret)
+		goto out_mutex;
+
 	/*
 	 * Prevent page faults from reinstantiating pages we have released from
 	 * page cache.
-- 
Gitee


From aaf9e2faf5bb4ea97264d9f45f6c6e35f73f17e8 Mon Sep 17 00:00:00 2001
From: Tadeusz Struk <tadeusz.struk@linaro.org>
Date: Tue, 26 Jul 2022 17:39:26 +0800
Subject: [PATCH 634/674] ext4: limit length to bitmap_maxbytes - blocksize in
 punch_hole

stable inclusion
from stable-v5.10.113
commit 22c450d39f8922ae26de459cf4f83b2b294f207e
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=22c450d39f8922ae26de459cf4f83b2b294f207e

--------------------------------

commit 2da376228a2427501feb9d15815a45dbdbdd753e upstream.

Syzbot found an issue [1] in ext4_fallocate().
The C reproducer [2] calls fallocate(), passing size 0xffeffeff000ul,
and offset 0x1000000ul, which, when added together exceed the
bitmap_maxbytes for the inode. This triggers a BUG in
ext4_ind_remove_space(). According to the comments in this function
the 'end' parameter needs to be one block after the last block to be
removed. In the case when the BUG is triggered it points to the last
block. Modify the ext4_punch_hole() function and add constraint that
caps the length to satisfy the one before laster block requirement.

LINK: [1] https://syzkaller.appspot.com/bug?id=b80bd9cf348aac724a4f4dff251800106d721331
LINK: [2] https://syzkaller.appspot.com/text?tag=ReproC&x=14ba0238700000

Fixes: a4bb6b64e39a ("ext4: enable "punch hole" functionality")
Reported-by: syzbot+7a806094edd5d07ba029@syzkaller.appspotmail.com
Signed-off-by: Tadeusz Struk <tadeusz.struk@linaro.org>
Link: https://lore.kernel.org/r/20220331200515.153214-1-tadeusz.struk@linaro.org
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 fs/ext4/inode.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 69fcf82f2c81..f4e0c7cc4820 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3978,7 +3978,8 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
 	struct super_block *sb = inode->i_sb;
 	ext4_lblk_t first_block, stop_block;
 	struct address_space *mapping = inode->i_mapping;
-	loff_t first_block_offset, last_block_offset;
+	loff_t first_block_offset, last_block_offset, max_length;
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	handle_t *handle;
 	unsigned int credits;
 	int ret = 0, ret2 = 0;
@@ -4012,6 +4013,14 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
 		   offset;
 	}
 
+	/*
+	 * For punch hole the length + offset needs to be within one block
+	 * before last range. Adjust the length if it goes beyond that limit.
+	 */
+	max_length = sbi->s_bitmap_maxbytes - inode->i_sb->s_blocksize;
+	if (offset + length > max_length)
+		length = max_length - offset;
+
 	if (offset & (sb->s_blocksize - 1) ||
 	    (offset + length) & (sb->s_blocksize - 1)) {
 		/*
-- 
Gitee


From cf4de22f4fd1ee1f6367a12848686f71d1293010 Mon Sep 17 00:00:00 2001
From: "wangjianjian (C)" <wangjianjian3@huawei.com>
Date: Tue, 26 Jul 2022 17:39:27 +0800
Subject: [PATCH 635/674] ext4, doc: fix incorrect h_reserved size

stable inclusion
from stable-v5.10.113
commit 0c54b093766becb9c83317232c93290cf612b8ff
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=0c54b093766becb9c83317232c93290cf612b8ff

--------------------------------

commit 7102ffe4c166ca0f5e35137e9f9de83768c2d27d upstream.

According to document and code, ext4_xattr_header's size is 32 bytes, so
h_reserved size should be 3.

Signed-off-by: Wang Jianjian <wangjianjian3@huawei.com>
Link: https://lore.kernel.org/r/92fcc3a6-7d77-8c09-4126-377fcb4c46a5@huawei.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 Documentation/filesystems/ext4/attributes.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/filesystems/ext4/attributes.rst b/Documentation/filesystems/ext4/attributes.rst
index 54386a010a8d..871d2da7a0a9 100644
--- a/Documentation/filesystems/ext4/attributes.rst
+++ b/Documentation/filesystems/ext4/attributes.rst
@@ -76,7 +76,7 @@ The beginning of an extended attribute block is in
      - Checksum of the extended attribute block.
    * - 0x14
      - \_\_u32
-     - h\_reserved[2]
+     - h\_reserved[3]
      - Zero.
 
 The checksum is calculated against the FS UUID, the 64-bit block number
-- 
Gitee


From 27d40cc66fb275b8c38be0766f63cd14118d4423 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Tue, 26 Jul 2022 17:39:28 +0800
Subject: [PATCH 636/674] ext4: fix overhead calculation to account for the
 reserved gdt blocks

stable inclusion
from stable-v5.10.113
commit 4789149b9ea2a1893c62d816742f1a76514fc901
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=4789149b9ea2a1893c62d816742f1a76514fc901

--------------------------------

commit 10b01ee92df52c8d7200afead4d5e5f55a5c58b1 upstream.

The kernel calculation was underestimating the overhead by not taking
into account the reserved gdt blocks.  With this change, the overhead
calculated by the kernel matches the overhead calculation in mke2fs.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 fs/ext4/super.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 26c938360895..34e4e00445da 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3912,9 +3912,11 @@ static int count_overhead(struct super_block *sb, ext4_group_t grp,
 	ext4_fsblk_t		first_block, last_block, b;
 	ext4_group_t		i, ngroups = ext4_get_groups_count(sb);
 	int			s, j, count = 0;
+	int			has_super = ext4_bg_has_super(sb, grp);
 
 	if (!ext4_has_feature_bigalloc(sb))
-		return (ext4_bg_has_super(sb, grp) + ext4_bg_num_gdb(sb, grp) +
+		return (has_super + ext4_bg_num_gdb(sb, grp) +
+			(has_super ? le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0) +
 			sbi->s_itb_per_group + 2);
 
 	first_block = le32_to_cpu(sbi->s_es->s_first_data_block) +
-- 
Gitee


From fd9c2ff88c62fbb32351bc8165f473a54e91817c Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Tue, 26 Jul 2022 17:39:29 +0800
Subject: [PATCH 637/674] ext4: force overhead calculation if the
 s_overhead_cluster makes no sense

stable inclusion
from stable-v5.10.113
commit e1e96e37272156d691203a3725b876787f38c8f2
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=e1e96e37272156d691203a3725b876787f38c8f2

--------------------------------

commit 85d825dbf4899a69407338bae462a59aa9a37326 upstream.

If the file system does not use bigalloc, calculating the overhead is
cheap, so force the recalculation of the overhead so we don't have to
trust the precalculated overhead in the superblock.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 fs/ext4/super.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 34e4e00445da..706a159817ee 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -4977,9 +4977,18 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	 * Get the # of file system overhead blocks from the
 	 * superblock if present.
 	 */
-	if (es->s_overhead_clusters)
-		sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters);
-	else {
+	sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters);
+	/* ignore the precalculated value if it is ridiculous */
+	if (sbi->s_overhead > ext4_blocks_count(es))
+		sbi->s_overhead = 0;
+	/*
+	 * If the bigalloc feature is not enabled recalculating the
+	 * overhead doesn't take long, so we might as well just redo
+	 * it to make sure we are using the correct value.
+	 */
+	if (!ext4_has_feature_bigalloc(sb))
+		sbi->s_overhead = 0;
+	if (sbi->s_overhead == 0) {
 		err = ext4_calculate_overhead(sb);
 		if (err)
 			goto failed_mount_wq;
-- 
Gitee


From f120da4c1f854aadb526a8fd86e3caf0c9fdbe5e Mon Sep 17 00:00:00 2001
From: Oliver Hartkopp <socketcan@hartkopp.net>
Date: Tue, 26 Jul 2022 17:39:30 +0800
Subject: [PATCH 638/674] can: isotp: stop timeout monitoring when no first
 frame was sent

stable inclusion
from stable-v5.10.113
commit 50aac44273600cb0ae1efd010bb1de7701444a41
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=50aac44273600cb0ae1efd010bb1de7701444a41

--------------------------------

commit d73497081710c876c3c61444445512989e102152 upstream.

The first attempt to fix a the 'impossible' WARN_ON_ONCE(1) in
isotp_tx_timer_handler() focussed on the identical CAN IDs created by
the syzbot reproducer and lead to upstream fix/commit 3ea566422cbd
("can: isotp: sanitize CAN ID checks in isotp_bind()"). But this did
not catch the root cause of the wrong tx.state in the tx_timer handler.

In the isotp 'first frame' case a timeout monitoring needs to be started
before the 'first frame' is send. But when this sending failed the timeout
monitoring for this specific frame has to be disabled too.

Otherwise the tx_timer is fired with the 'warn me' tx.state of ISOTP_IDLE.

Fixes: e057dd3fc20f ("can: add ISO 15765-2:2016 transport protocol")
Link: https://lore.kernel.org/all/20220405175112.2682-1-socketcan@hartkopp.net
Reported-by: syzbot+2339c27f5c66c652843e@syzkaller.appspotmail.com
Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 net/can/isotp.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/net/can/isotp.c b/net/can/isotp.c
index 9a4a9c5a9f24..c515bbd46c67 100644
--- a/net/can/isotp.c
+++ b/net/can/isotp.c
@@ -864,6 +864,7 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
 	struct canfd_frame *cf;
 	int ae = (so->opt.flags & CAN_ISOTP_EXTEND_ADDR) ? 1 : 0;
 	int wait_tx_done = (so->opt.flags & CAN_ISOTP_WAIT_TX_DONE) ? 1 : 0;
+	s64 hrtimer_sec = 0;
 	int off;
 	int err;
 
@@ -962,7 +963,9 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
 		isotp_create_fframe(cf, so, ae);
 
 		/* start timeout for FC */
-		hrtimer_start(&so->txtimer, ktime_set(1, 0), HRTIMER_MODE_REL_SOFT);
+		hrtimer_sec = 1;
+		hrtimer_start(&so->txtimer, ktime_set(hrtimer_sec, 0),
+			      HRTIMER_MODE_REL_SOFT);
 	}
 
 	/* send the first or only CAN frame */
@@ -975,6 +978,11 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
 	if (err) {
 		pr_notice_once("can-isotp: %s: can_send_ret %d\n",
 			       __func__, err);
+
+		/* no transmission -> no timeout monitoring */
+		if (hrtimer_sec)
+			hrtimer_cancel(&so->txtimer);
+
 		goto err_out_drop;
 	}
 
-- 
Gitee


From 0c5ab04a2f48e4ea5814c339410bc679062b41ea Mon Sep 17 00:00:00 2001
From: Tudor Ambarus <tudor.ambarus@microchip.com>
Date: Tue, 26 Jul 2022 17:39:31 +0800
Subject: [PATCH 639/674] spi: atmel-quadspi: Fix the buswidth adjustment
 between spi-mem and controller

stable inclusion
from stable-v5.10.113
commit dccee748af17fc087ff5017152e532ef8e18c8c0
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=dccee748af17fc087ff5017152e532ef8e18c8c0

--------------------------------

commit 8c235cc25087495c4288d94f547e9d3061004991 upstream.

Use the spi_mem_default_supports_op() core helper in order to take into
account the buswidth specified by the user in device tree.

Cc: <stable@vger.kernel.org>
Fixes: 0e6aae08e9ae ("spi: Add QuadSPI driver for Atmel SAMA5D2")
Signed-off-by: Tudor Ambarus <tudor.ambarus@microchip.com>
Link: https://lore.kernel.org/r/20220406133604.455356-1-tudor.ambarus@microchip.com
Signed-off-by: Mark Brown <broonie@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/spi/atmel-quadspi.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/spi/atmel-quadspi.c b/drivers/spi/atmel-quadspi.c
index 1e63fd4821f9..8aa89d93db11 100644
--- a/drivers/spi/atmel-quadspi.c
+++ b/drivers/spi/atmel-quadspi.c
@@ -277,6 +277,9 @@ static int atmel_qspi_find_mode(const struct spi_mem_op *op)
 static bool atmel_qspi_supports_op(struct spi_mem *mem,
 				   const struct spi_mem_op *op)
 {
+	if (!spi_mem_default_supports_op(mem, op))
+		return false;
+
 	if (atmel_qspi_find_mode(op) < 0)
 		return false;
 
-- 
Gitee


From dd76b5fe66faa3615f8d284d1fd07f1cccabf125 Mon Sep 17 00:00:00 2001
From: Lee Jones <lee.jones@linaro.org>
Date: Tue, 26 Jul 2022 17:39:32 +0800
Subject: [PATCH 640/674] staging: ion: Prevent incorrect reference counting
 behavour

stable inclusion
from stable-v5.10.113
commit fea24b07edfc348c67a019b6e17b39c0698e631f
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=fea24b07edfc348c67a019b6e17b39c0698e631f

--------------------------------

Supply additional check in order to prevent unexpected results.

Fixes: b892bf75b2034 ("ion: Switch ion to use dma-buf")
Suggested-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/staging/android/ion/ion.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/staging/android/ion/ion.c b/drivers/staging/android/ion/ion.c
index e1fe03ceb7f1..e6d4a3ee6cda 100644
--- a/drivers/staging/android/ion/ion.c
+++ b/drivers/staging/android/ion/ion.c
@@ -114,6 +114,9 @@ static void *ion_buffer_kmap_get(struct ion_buffer *buffer)
 	void *vaddr;
 
 	if (buffer->kmap_cnt) {
+		if (buffer->kmap_cnt == INT_MAX)
+			return ERR_PTR(-EOVERFLOW);
+
 		buffer->kmap_cnt++;
 		return buffer->vaddr;
 	}
-- 
Gitee


From 9a1fc0d8feaf050654872afaa9e66323f12e4a11 Mon Sep 17 00:00:00 2001
From: Khazhismel Kumykov <khazhy@google.com>
Date: Tue, 26 Jul 2022 17:39:33 +0800
Subject: [PATCH 641/674] block/compat_ioctl: fix range check in BLKGETSIZE

stable inclusion
from stable-v5.10.113
commit 8bedbc8f7f35f533000b347644b6bf1f62524676
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=8bedbc8f7f35f533000b347644b6bf1f62524676

--------------------------------

commit ccf16413e520164eb718cf8b22a30438da80ff23 upstream.

kernel ulong and compat_ulong_t may not be same width. Use type directly
to eliminate mismatches.

This would result in truncation rather than EFBIG for 32bit mode for
large disks.

Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Khazhismel Kumykov <khazhy@google.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Link: https://lore.kernel.org/r/20220414224056.2875681-1-khazhy@google.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 block/ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/ioctl.c b/block/ioctl.c
index 8171858dc8a9..7c578f84a4fd 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -683,7 +683,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 			       (bdev->bd_bdi->ra_pages * PAGE_SIZE) / 512);
 	case BLKGETSIZE:
 		size = i_size_read(bdev->bd_inode);
-		if ((size >> 9) > ~0UL)
+		if ((size >> 9) > ~(compat_ulong_t)0)
 			return -EFBIG;
 		return compat_put_ulong(argp, size >> 9);
 
-- 
Gitee


From 156b297acb77278b3f11077a9e8f92e81c87aebe Mon Sep 17 00:00:00 2001
From: Marek Vasut <marex@denx.de>
Date: Tue, 26 Jul 2022 17:39:34 +0800
Subject: [PATCH 642/674] Revert "net: micrel: fix KS8851_MLL Kconfig"

stable inclusion
from stable-v5.10.113
commit 7992fdb045fbc7cb0e34eba464b73044585c0638
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=7992fdb045fbc7cb0e34eba464b73044585c0638

--------------------------------

This reverts commit 1ff5359afa5ec0dd09fe76183dc4fa24b50e4125 which is
commit c3efcedd272aa6dd5929e20cf902a52ddaa1197a upstream.

The upstream commit c3efcedd272a ("net: micrel: fix KS8851_MLL Kconfig")
depends on e5f31552674e ("ethernet: fix PTP_1588_CLOCK dependencies")
which is not part of Linux 5.10.y . Revert the aforementioned commit to
prevent breakage in 5.10.y .

Signed-off-by: Marek Vasut <marex@denx.de>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Sasha Levin <sashal@kernel.org>
Cc: <stable@vger.kernel.org> # 5.10.x
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/net/ethernet/micrel/Kconfig | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/net/ethernet/micrel/Kconfig b/drivers/net/ethernet/micrel/Kconfig
index 9ceb7e1fb169..42bc014136fe 100644
--- a/drivers/net/ethernet/micrel/Kconfig
+++ b/drivers/net/ethernet/micrel/Kconfig
@@ -37,7 +37,6 @@ config KS8851
 config KS8851_MLL
 	tristate "Micrel KS8851 MLL"
 	depends on HAS_IOMEM
-	depends on PTP_1588_CLOCK_OPTIONAL
 	select MII
 	select CRC32
 	select EEPROM_93CX6
-- 
Gitee


From 03a99552eb74a789de7de2a13b0ea9b31dcf2dfa Mon Sep 17 00:00:00 2001
From: Zhengchao Shao <shaozhengchao@huawei.com>
Date: Thu, 28 Jul 2022 18:06:31 +0800
Subject: [PATCH 643/674] bpf: Don't redirect packets with invalid pkt_len

mainline inclusion
from mainline-v5.19-rc6
commit fd1894224407c484f652ad456e1ce423e89bb3eb
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HWKR
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/commit/?id=fd1894224407

--------------------------------

Syzbot found an issue [1]: fq_codel_drop() try to drop a flow whitout any
skbs, that is, the flow->head is null.
The root cause, as the [2] says, is because that bpf_prog_test_run_skb()
run a bpf prog which redirects empty skbs.
So we should determine whether the length of the packet modified by bpf
prog or others like bpf_prog_test is valid before forwarding it directly.

LINK: [1] https://syzkaller.appspot.com/bug?id=0b84da80c2917757915afa89f7738a9d16ec96c5
LINK: [2] https://www.spinics.net/lists/netdev/msg777503.html

Reported-by: syzbot+7a12909485b94426aceb@syzkaller.appspotmail.com
Signed-off-by: Zhengchao Shao <shaozhengchao@huawei.com>
Reviewed-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/r/20220715115559.139691-1-shaozhengchao@huawei.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Wei Yongjun <weiyongjun1@huawei.com>
Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 include/linux/skbuff.h | 8 ++++++++
 net/bpf/test_run.c     | 3 +++
 net/core/dev.c         | 1 +
 3 files changed, 12 insertions(+)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 68efccc15a87..72cfb047fd2c 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2251,6 +2251,14 @@ static inline void skb_set_tail_pointer(struct sk_buff *skb, const int offset)
 
 #endif /* NET_SKBUFF_DATA_USES_OFFSET */
 
+static inline void skb_assert_len(struct sk_buff *skb)
+{
+#ifdef CONFIG_DEBUG_NET
+	if (WARN_ONCE(!skb->len, "%s\n", __func__))
+		DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false);
+#endif /* CONFIG_DEBUG_NET */
+}
+
 /*
  *	Add data to an sk_buff
  */
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 99712d35e535..f266a9453c8e 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -398,6 +398,9 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb)
 {
 	struct qdisc_skb_cb *cb = (struct qdisc_skb_cb *)skb->cb;
 
+	if (!skb->len)
+		return -EINVAL;
+
 	if (!__skb)
 		return 0;
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 12089c484b30..8e4de36eede8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4094,6 +4094,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
 	bool again = false;
 
 	skb_reset_mac_header(skb);
+	skb_assert_len(skb);
 
 	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
 		__skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
-- 
Gitee


From 92561effc05f39fbf33d80fd0f9e5978fb7b3a71 Mon Sep 17 00:00:00 2001
From: Sven Schnelle <svens@linux.ibm.com>
Date: Thu, 28 Jul 2022 18:06:32 +0800
Subject: [PATCH 644/674] ucounts: add missing data type changes

mainline inclusion
from mainline-v5.14-rc6
commit f153c2246783ba210493054d99c66353f56423c9
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5IDIC
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f153c2246783ba210493054d99c66353f56423c9

--------------------------------

commit f9c82a4ea89c3 ("Increase size of ucounts to atomic_long_t")
changed the data type of ucounts/ucounts_max to long, but missed to
adjust a few other places. This is noticeable on big endian platforms
from user space because the /proc/sys/user/max_*_names files all
contain 0.

v4 - Made the min and max constants long so the sysctl values
     are actually settable on little endian machines.
     -- EWB

Fixes: f9c82a4ea89c ("Increase size of ucounts to atomic_long_t")
Signed-off-by: Sven Schnelle <svens@linux.ibm.com>
Tested-by: Nathan Chancellor <nathan@kernel.org>
Tested-by: Linux Kernel Functional Testing <lkft@linaro.org>
Acked-by: Alexey Gladkov <legion@kernel.org>
v1: https://lkml.kernel.org/r/20210721115800.910778-1-svens@linux.ibm.com
v2: https://lkml.kernel.org/r/20210721125233.1041429-1-svens@linux.ibm.com
v3: https://lkml.kernel.org/r/20210730062854.3601635-1-svens@linux.ibm.com
Link: https://lkml.kernel.org/r/8735rijqlv.fsf_-_@disp2133
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>

Conflict:
  fs/notify/fanotify/fanotify_user.c

Signed-off-by: Li Nan <linan122@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 fs/notify/inotify/inotify_user.c | 17 +++++++++++------
 kernel/ucount.c                  | 19 +++++++++++--------
 2 files changed, 22 insertions(+), 14 deletions(-)

diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 5f6c6bf65909..f13b64729d08 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -46,22 +46,27 @@ struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
 
 #include <linux/sysctl.h>
 
+static long it_zero = 0;
+static long it_int_max = INT_MAX;
+
 struct ctl_table inotify_table[] = {
 	{
 		.procname	= "max_user_instances",
 		.data		= &init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES],
-		.maxlen		= sizeof(int),
+		.maxlen		= sizeof(long),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= SYSCTL_ZERO,
+		.proc_handler	= proc_doulongvec_minmax,
+		.extra1		= &it_zero,
+		.extra2		= &it_int_max,
 	},
 	{
 		.procname	= "max_user_watches",
 		.data		= &init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES],
-		.maxlen		= sizeof(int),
+		.maxlen		= sizeof(long),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= SYSCTL_ZERO,
+		.proc_handler	= proc_doulongvec_minmax,
+		.extra1		= &it_zero,
+		.extra2		= &it_int_max,
 	},
 	{
 		.procname	= "max_queued_events",
diff --git a/kernel/ucount.c b/kernel/ucount.c
index dff1d9b739d2..1f5825b674d8 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -52,14 +52,17 @@ static struct ctl_table_root set_root = {
 	.permissions = set_permissions,
 };
 
-#define UCOUNT_ENTRY(name)				\
-	{						\
-		.procname	= name,			\
-		.maxlen		= sizeof(int),		\
-		.mode		= 0644,			\
-		.proc_handler	= proc_dointvec_minmax,	\
-		.extra1		= SYSCTL_ZERO,		\
-		.extra2		= SYSCTL_INT_MAX,	\
+static long ue_zero = 0;
+static long ue_int_max = INT_MAX;
+
+#define UCOUNT_ENTRY(name)					\
+	{							\
+		.procname	= name,				\
+		.maxlen		= sizeof(long),			\
+		.mode		= 0644,				\
+		.proc_handler	= proc_doulongvec_minmax,	\
+		.extra1		= &ue_zero,			\
+		.extra2		= &ue_int_max,			\
 	}
 static struct ctl_table user_table[] = {
 	UCOUNT_ENTRY("max_user_namespaces"),
-- 
Gitee


From a51eb65ba1c5c5fcf7c026732d5ace1f9cbaf8f9 Mon Sep 17 00:00:00 2001
From: Luo Meng <luomeng12@huawei.com>
Date: Thu, 28 Jul 2022 18:06:33 +0800
Subject: [PATCH 645/674] block: Fix warning in bd_link_disk_holder()

hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ETAB
CVE: NA

--------------------------------

Warning reports as follows:

 WARNING: CPU: 3 PID: 674 at fs/block_dev.c:1272 bd_link_disk_holder+0xcd/0x270
 Modules linked in: null_blk(+)
 CPU: 3 PID: 674 Comm: dmsetup Not tainted 5.10.0-16691-gf6076432827d-dirty #158
 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-4
 RIP: 0010:bd_link_disk_holder+0xcd/0x270
 Code: 69 73 ee 00 44 89 e8 5b 48 83 05 c5 bf 6d 0c 01 5d 41 5c 41 5d 41 5e 41 8
 RSP: 0018:ffffc9000049bbb8 EFLAGS: 00010202
 RAX: ffff888104e39038 RBX: ffff888104185000 RCX: 0000000000000000
 RDX: 0000000000000001 RSI: ffffffffaa085692 RDI: 0000000000000000
 RBP: ffff88810cc2ae00 R08: ffffffffa853659b R09: 0000000000000000
 R10: ffffc9000049bbb0 R11: 720030626c6c756e R12: ffff88810e800000
 R13: ffff88810e800090 R14: ffff888103570c98 R15: ffff888103570c80
 FS:  00007fb49dc13dc0(0000) GS:ffff88813bd00000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 00007ff994ebde70 CR3: 000000010d54a000 CR4: 00000000000006e0
 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
 Call Trace:
  dm_get_table_device+0x175/0x300
  dm_get_device+0x238/0x360
  linear_ctr+0xee/0x170
  dm_table_add_target+0x199/0x4b0
  table_load+0x18c/0x480
  ? table_clear+0x190/0x190
  ctl_ioctl+0x21d/0x640
  ? check_preemption_disabled+0x140/0x150
  dm_ctl_ioctl+0x12/0x20
  __se_sys_ioctl+0xb1/0x100
  __x64_sys_ioctl+0x1e/0x30
  do_syscall_64+0x45/0x70
  entry_SYSCALL_64_after_hwframe+0x44/0xa9

This can reproduce by concurrent operations:
	1. modprobe null_blk
	2. echo -e "0 10000 linear /dev/nullb0 0" > table
	   dmsetup create xxx table

t1: create disk a                   |     t2: dm setup
                                    |
device_add_disk                     |
 dev->devt = devt                   |
                        	    | dm_get_table_device
                        	    | open_table_device
                        	    | blkdev_get_by_dev -> succeed
				    | bd_link_disk_holder
                        	    |  -> holder_dir is still NULL
 register_disk -> create holder_dir
  kobject_create_and_add

device_add_disk() will set devt before creating holder_dir, which
leaves a window that dm_get_table_device() can find the disk by
devt while it's holder_dir is NULL.

So move GENHD_FL_UP in blk_register_queue() to avoid this warning and
fix a NULL-ptr in  __blk_mq_sched_bio_merge().

Signed-off-by: Luo Meng <luomeng12@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 block/blk-sysfs.c | 6 ++++++
 block/genhd.c     | 2 --
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 0a4fcbda8ab4..aff53c3ae836 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -910,6 +910,12 @@ int blk_register_queue(struct gendisk *disk)
 		kobject_uevent(&q->elevator->kobj, KOBJ_ADD);
 	mutex_unlock(&q->sysfs_lock);
 
+
+	/*
+	 * Set the flag at last, so that block devcie can't be opened
+	 * before it's registration is done.
+	 */
+	disk->flags |= GENHD_FL_UP;
 	ret = 0;
 unlock:
 	mutex_unlock(&q->sysfs_dir_lock);
diff --git a/block/genhd.c b/block/genhd.c
index 8b37fcfa10d1..9d91f880ea95 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -799,8 +799,6 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk,
 	WARN_ON(!disk->minors &&
 		!(disk->flags & (GENHD_FL_EXT_DEVT | GENHD_FL_HIDDEN)));
 
-	disk->flags |= GENHD_FL_UP;
-
 	retval = blk_alloc_devt(&disk->part0, &devt);
 	if (retval) {
 		WARN_ON(1);
-- 
Gitee


From d7c2ddc8456d91991dbc87da304742d60d577b41 Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@huawei.com>
Date: Thu, 28 Jul 2022 18:06:34 +0800
Subject: [PATCH 646/674] block: fix that part scan is disabled in
 device_add_disk()

hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ETAB
CVE: NA

--------------------------------

Patch ("block: Fix warning in bd_link_disk_holder()") moves the
setting of flag 'GENHD_FL_UP' behind blkdev_get, which will
disabled part scan:

devcie_add_disk
 register_disk
  blkdev_get
   __blkdev_get
    bdev_get_gendisk
     get_gendisk -> failed because 'GENHD_FL_UP' is not set

And this will cause tests block/017, block/018 and scsi/004 to fail.

Fix the problem by moving part scan as well.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 block/blk-sysfs.c | 33 +++++++++++++++++++++++++++++++++
 block/genhd.c     | 27 ---------------------------
 2 files changed, 33 insertions(+), 27 deletions(-)

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index aff53c3ae836..f7cd16cec0ed 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -821,6 +821,38 @@ struct kobj_type blk_queue_ktype = {
 	.release	= blk_release_queue,
 };
 
+static void disk_scan_partitions(struct gendisk *disk)
+{
+	struct block_device *bdev;
+
+	if (!get_capacity(disk) || !disk_part_scan_enabled(disk))
+		return;
+
+	set_bit(GD_NEED_PART_SCAN, &disk->state);
+	bdev = blkdev_get_by_dev(disk_devt(disk), FMODE_READ, NULL);
+	if (!IS_ERR(bdev))
+		blkdev_put(bdev, FMODE_READ);
+}
+
+static void disk_init_partition(struct gendisk *disk)
+{
+	struct device *ddev = disk_to_dev(disk);
+	struct disk_part_iter piter;
+	struct hd_struct *part;
+
+	disk_scan_partitions(disk);
+
+	/* announce disk after possible partitions are created */
+	dev_set_uevent_suppress(ddev, 0);
+	kobject_uevent(&ddev->kobj, KOBJ_ADD);
+
+	/* announce possible partitions */
+	disk_part_iter_init(&piter, disk, 0);
+	while ((part = disk_part_iter_next(&piter)))
+		kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD);
+	disk_part_iter_exit(&piter);
+}
+
 /**
  * blk_register_queue - register a block layer queue with sysfs
  * @disk: Disk of which the request queue should be registered with sysfs.
@@ -916,6 +948,7 @@ int blk_register_queue(struct gendisk *disk)
 	 * before it's registration is done.
 	 */
 	disk->flags |= GENHD_FL_UP;
+	disk_init_partition(disk);
 	ret = 0;
 unlock:
 	mutex_unlock(&q->sysfs_dir_lock);
diff --git a/block/genhd.c b/block/genhd.c
index 9d91f880ea95..021c9c2d7231 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -687,25 +687,10 @@ static int exact_lock(dev_t devt, void *data)
 	return 0;
 }
 
-static void disk_scan_partitions(struct gendisk *disk)
-{
-	struct block_device *bdev;
-
-	if (!get_capacity(disk) || !disk_part_scan_enabled(disk))
-		return;
-
-	set_bit(GD_NEED_PART_SCAN, &disk->state);
-	bdev = blkdev_get_by_dev(disk_devt(disk), FMODE_READ, NULL);
-	if (!IS_ERR(bdev))
-		blkdev_put(bdev, FMODE_READ);
-}
-
 static void register_disk(struct device *parent, struct gendisk *disk,
 			  const struct attribute_group **groups)
 {
 	struct device *ddev = disk_to_dev(disk);
-	struct disk_part_iter piter;
-	struct hd_struct *part;
 	int err;
 
 	ddev->parent = parent;
@@ -743,18 +728,6 @@ static void register_disk(struct device *parent, struct gendisk *disk,
 	if (disk->flags & GENHD_FL_HIDDEN)
 		return;
 
-	disk_scan_partitions(disk);
-
-	/* announce disk after possible partitions are created */
-	dev_set_uevent_suppress(ddev, 0);
-	kobject_uevent(&ddev->kobj, KOBJ_ADD);
-
-	/* announce possible partitions */
-	disk_part_iter_init(&piter, disk, 0);
-	while ((part = disk_part_iter_next(&piter)))
-		kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD);
-	disk_part_iter_exit(&piter);
-
 	if (disk->queue->backing_dev_info->dev) {
 		err = sysfs_create_link(&ddev->kobj,
 			  &disk->queue->backing_dev_info->dev->kobj,
-- 
Gitee


From 67b9d2775f9adafd9a4f5d8a7cc05f9cce884f1a Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@huawei.com>
Date: Thu, 28 Jul 2022 18:06:35 +0800
Subject: [PATCH 647/674] block: prevent lockdep false positive warning about
 'bd_mutex'

hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5ETAB
CVE: NA

--------------------------------

Patch ("block: fix that part scan is disabled in device_add_disk()")
confuse lockdep to produce following warning:

=====================================================
WARNING: possible circular locking dependency detected
4.18.0+ #2 Tainted: G                 ---------r-  -
------------------------------------------------------
syz-executor.0/4652 is trying to acquire lock:
00000000ad5f5a19 (&mddev->open_mutex){+.+.}, at: md_open+0x13a/0x260 home/install/linux-rh-3-10/drivers/md/md.c:7626

but task is already holding lock:
000000005c3a3fea (&bdev->bd_mutex){+.+.}, at: __blkdev_get+0x156/0x1490 home/install/linux-rh-3-10/fs/block_dev.c:1583

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #2 (&bdev->bd_mutex){+.+.}:
       __mutex_lock_common home/install/linux-rh-3-10/kernel/locking/mutex.c:925 [inline]
       __mutex_lock+0x105/0x1270 home/install/linux-rh-3-10/kernel/locking/mutex.c:1072
       __blkdev_get+0x156/0x1490 home/install/linux-rh-3-10/fs/block_dev.c:1583
       blkdev_get+0x33c/0xac0 home/install/linux-rh-3-10/fs/block_dev.c:1735
       disk_init_partition home/install/linux-rh-3-10/block/blk-sysfs.c:972 [inline]
       blk_register_queue+0x5ed/0x6c0 home/install/linux-rh-3-10/block/blk-sysfs.c:1055
       __device_add_disk+0xab5/0xd70 home/install/linux-rh-3-10/block/genhd.c:729
       sd_probe_async+0x447/0x852 home/install/linux-rh-3-10/drivers/scsi/sd.c:3249
       async_run_entry_fn+0xe1/0x700 home/install/linux-rh-3-10/kernel/async.c:127
       process_one_work+0x9cf/0x1940 home/install/linux-rh-3-10/kernel/workqueue.c:2175
       worker_thread+0x91/0xc50 home/install/linux-rh-3-10/kernel/workqueue.c:2321
       kthread+0x33a/0x400 home/install/linux-rh-3-10/kernel/kthread.c:257
       ret_from_fork+0x3a/0x50 home/install/linux-rh-3-10/arch/x86/entry/entry_64.S:355

-> #1 (&q->sysfs_dir_lock){+.+.}:
       __mutex_lock_common home/install/linux-rh-3-10/kernel/locking/mutex.c:925 [inline]
       __mutex_lock+0x105/0x1270 home/install/linux-rh-3-10/kernel/locking/mutex.c:1072
       blk_register_queue+0x143/0x6c0 home/install/linux-rh-3-10/block/blk-sysfs.c:1010
       __device_add_disk+0xab5/0xd70 home/install/linux-rh-3-10/block/genhd.c:729
       add_disk home/install/linux-rh-3-10/./include/linux/genhd.h:447 [inline]
       md_alloc+0xb06/0x10d0 home/install/linux-rh-3-10/drivers/md/md.c:5525
       md_probe+0x32/0x60 home/install/linux-rh-3-10/drivers/md/md.c:5554
       kobj_lookup+0x2d2/0x450 home/install/linux-rh-3-10/drivers/base/map.c:152
       get_gendisk+0x3b/0x360 home/install/linux-rh-3-10/block/genhd.c:860
       bdev_get_gendisk home/install/linux-rh-3-10/fs/block_dev.c:1181 [inline]
       __blkdev_get+0x3b6/0x1490 home/install/linux-rh-3-10/fs/block_dev.c:1578
       blkdev_get+0x33c/0xac0 home/install/linux-rh-3-10/fs/block_dev.c:1735
       blkdev_open+0x1c2/0x250 home/install/linux-rh-3-10/fs/block_dev.c:1923
       do_dentry_open+0x686/0xf50 home/install/linux-rh-3-10/fs/open.c:777
       do_last home/install/linux-rh-3-10/fs/namei.c:3449 [inline]
       path_openat+0x92f/0x28c0 home/install/linux-rh-3-10/fs/namei.c:3578
       do_filp_open+0x1aa/0x2b0 home/install/linux-rh-3-10/fs/namei.c:3613
       do_sys_open+0x307/0x490 home/install/linux-rh-3-10/fs/open.c:1075
       do_syscall_64+0xca/0x5c0 home/install/linux-rh-3-10/arch/x86/entry/common.c:298
       entry_SYSCALL_64_after_hwframe+0x6a/0xdf

-> #0 (&mddev->open_mutex){+.+.}:
       lock_acquire+0x10b/0x3a0 home/install/linux-rh-3-10/kernel/locking/lockdep.c:3868
       __mutex_lock_common home/install/linux-rh-3-10/kernel/locking/mutex.c:925 [inline]
       __mutex_lock+0x105/0x1270 home/install/linux-rh-3-10/kernel/locking/mutex.c:1072
       md_open+0x13a/0x260 home/install/linux-rh-3-10/drivers/md/md.c:7626
       __blkdev_get+0x2dc/0x1490 home/install/linux-rh-3-10/fs/block_dev.c:1599
       blkdev_get+0x33c/0xac0 home/install/linux-rh-3-10/fs/block_dev.c:1735
       blkdev_open+0x1c2/0x250 home/install/linux-rh-3-10/fs/block_dev.c:1923
       do_dentry_open+0x686/0xf50 home/install/linux-rh-3-10/fs/open.c:777
       do_last home/install/linux-rh-3-10/fs/namei.c:3449 [inline]
       path_openat+0x92f/0x28c0 home/install/linux-rh-3-10/fs/namei.c:3578
       do_filp_open+0x1aa/0x2b0 home/install/linux-rh-3-10/fs/namei.c:3613
       do_sys_open+0x307/0x490 home/install/linux-rh-3-10/fs/open.c:1075
       do_syscall_64+0xca/0x5c0 home/install/linux-rh-3-10/arch/x86/entry/common.c:298
       entry_SYSCALL_64_after_hwframe+0x6a/0xdf

other info that might help us debug this:

Chain exists of:
  &mddev->open_mutex --> &q->sysfs_dir_lock --> &bdev->bd_mutex

 Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(&bdev->bd_mutex);
                               lock(&q->sysfs_dir_lock);
                               lock(&bdev->bd_mutex);
  lock(&mddev->open_mutex);

 *** DEADLOCK ***

Since 'bd_mutex' and 'sysfs_dir_lock' is different is for each device,
deadlock between md_open() and sd_probe_async() is impossible. However,
lockdep is treating 'bd_mutex' and 'sysfs_dir_lock' from different devices
the same, and patch "block: fix that part scan is disabled in
device_add_disk()" is holding 'bd_mutex' inside 'sysfs_dir_lock',
which causes the false positive warning.

Fix the false positive warning by don't grab 'bd_mutex' inside
'sysfs_dir_lock'.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 block/blk-sysfs.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index f7cd16cec0ed..548d758365c6 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -948,10 +948,17 @@ int blk_register_queue(struct gendisk *disk)
 	 * before it's registration is done.
 	 */
 	disk->flags |= GENHD_FL_UP;
-	disk_init_partition(disk);
 	ret = 0;
 unlock:
 	mutex_unlock(&q->sysfs_dir_lock);
+	/*
+	 * Init partitions after releasing 'sysfs_dir_lock', otherwise lockdep
+	 * will be confused because it will treat 'bd_mutex' from different
+	 * devices as the same lock.
+	 */
+	if (!ret)
+		disk_init_partition(disk);
+
 	return ret;
 }
 EXPORT_SYMBOL_GPL(blk_register_queue);
-- 
Gitee


From 77eabe0b93e9dd68b400d52c9caa9282ca6094df Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Thu, 28 Jul 2022 18:06:36 +0800
Subject: [PATCH 648/674] inotify: show inotify mask flags in proc fdinfo

mainline inclusion
from mainline-v5.19-rc1
commit a32e697cda27679a0327ae2cafdad8c7170f548f
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5IHD1
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=a32e697cda27679a0327ae2cafdad8c7170f548f

--------------------------------

The inotify mask flags IN_ONESHOT and IN_EXCL_UNLINK are not "internal
to kernel" and should be exposed in procfs fdinfo so CRIU can restore
them.

Fixes: 6933599697c9 ("inotify: hide internal kernel bits from fdinfo")
Link: https://lore.kernel.org/r/20220422120327.3459282-2-amir73il@gmail.com
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Li Nan <linan122@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 fs/notify/fdinfo.c               | 11 ++---------
 fs/notify/inotify/inotify.h      | 12 ++++++++++++
 fs/notify/inotify/inotify_user.c |  2 +-
 3 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index f0d6b54be412..765b50aeadd2 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -83,16 +83,9 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
 	inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark);
 	inode = igrab(fsnotify_conn_inode(mark->connector));
 	if (inode) {
-		/*
-		 * IN_ALL_EVENTS represents all of the mask bits
-		 * that we expose to userspace.  There is at
-		 * least one bit (FS_EVENT_ON_CHILD) which is
-		 * used only internally to the kernel.
-		 */
-		u32 mask = mark->mask & IN_ALL_EVENTS;
-		seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:%x ",
+		seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:0 ",
 			   inode_mark->wd, inode->i_ino, inode->i_sb->s_dev,
-			   mask, mark->ignored_mask);
+			   inotify_mark_user_mask(mark));
 		show_mark_fhandle(m, inode);
 		seq_putc(m, '\n');
 		iput(inode);
diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
index 2007e3711916..8f00151eb731 100644
--- a/fs/notify/inotify/inotify.h
+++ b/fs/notify/inotify/inotify.h
@@ -22,6 +22,18 @@ static inline struct inotify_event_info *INOTIFY_E(struct fsnotify_event *fse)
 	return container_of(fse, struct inotify_event_info, fse);
 }
 
+/*
+ * INOTIFY_USER_FLAGS represents all of the mask bits that we expose to
+ * userspace.  There is at least one bit (FS_EVENT_ON_CHILD) which is
+ * used only internally to the kernel.
+ */
+#define INOTIFY_USER_MASK (IN_ALL_EVENTS | IN_ONESHOT | IN_EXCL_UNLINK)
+
+static inline __u32 inotify_mark_user_mask(struct fsnotify_mark *fsn_mark)
+{
+	return fsn_mark->mask & INOTIFY_USER_MASK;
+}
+
 extern void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
 					   struct fsnotify_group *group);
 extern int inotify_handle_inode_event(struct fsnotify_mark *inode_mark,
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index f13b64729d08..3986f1877457 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -93,7 +93,7 @@ static inline __u32 inotify_arg_to_mask(struct inode *inode, u32 arg)
 		mask |= FS_EVENT_ON_CHILD;
 
 	/* mask off the flags used to open the fd */
-	mask |= (arg & (IN_ALL_EVENTS | IN_ONESHOT | IN_EXCL_UNLINK));
+	mask |= (arg & INOTIFY_USER_MASK);
 
 	return mask;
 }
-- 
Gitee


From c248fb22ed21b7bf4308bfcdf962d11b4f73f62c Mon Sep 17 00:00:00 2001
From: Helge Deller <deller@gmx.de>
Date: Thu, 28 Jul 2022 18:06:37 +0800
Subject: [PATCH 649/674] fbcon: Disallow setting font bigger than screen size

stable inclusion
from stable-v5.10.130
commit b727561ddc9360de9631af2d970d8ffed676a750
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5IQ4M
CVE: CVE-2021-33655

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=b727561ddc9360de9631af2d970d8ffed676a750

--------------------------------

commit 65a01e601dbba8b7a51a2677811f70f783766682 upstream.

Prevent that users set a font size which is bigger than the physical screen.
It's unlikely this may happen (because screens are usually much larger than the
fonts and each font char is limited to 32x32 pixels), but it may happen on
smaller screens/LCD displays.

Signed-off-by: Helge Deller <deller@gmx.de>
Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Reviewed-by: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: stable@vger.kernel.org # v4.14+
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Chen Jun <chenjun102@huawei.com>
Reviewed-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Reviewed-by: Weilong Chen <chenweilong@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/video/fbdev/core/fbcon.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c
index f102519ccefb..8d81e9321cf7 100644
--- a/drivers/video/fbdev/core/fbcon.c
+++ b/drivers/video/fbdev/core/fbcon.c
@@ -2510,6 +2510,11 @@ static int fbcon_set_font(struct vc_data *vc, struct console_font *font,
 	if (charcount != 256 && charcount != 512)
 		return -EINVAL;
 
+	/* font bigger than screen resolution ? */
+	if (w > FBCON_SWAP(info->var.rotate, info->var.xres, info->var.yres) ||
+	    h > FBCON_SWAP(info->var.rotate, info->var.yres, info->var.xres))
+		return -EINVAL;
+
 	/* Make sure drawing engine can handle the font */
 	if (!(info->pixmap.blit_x & (1 << (font->width - 1))) ||
 	    !(info->pixmap.blit_y & (1 << (font->height - 1))))
-- 
Gitee


From e664e9804a489f76e72be7601f53654b7d29014f Mon Sep 17 00:00:00 2001
From: Helge Deller <deller@gmx.de>
Date: Thu, 28 Jul 2022 18:06:38 +0800
Subject: [PATCH 650/674] fbcon: Prevent that screen size is smaller than font
 size

stable inclusion
from stable-v5.10.130
commit cecb806c766c78e1be62b6b7b1483ef59bbaeabe
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5IQ4M
CVE: CVE-2021-33655

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=cecb806c766c78e1be62b6b7b1483ef59bbaeabe

--------------------------------

commit e64242caef18b4a5840b0e7a9bff37abd4f4f933 upstream.

We need to prevent that users configure a screen size which is smaller than the
currently selected font size. Otherwise rendering chars on the screen will
access memory outside the graphics memory region.

This patch adds a new function fbcon_modechange_possible() which
implements this check and which later may be extended with other checks
if necessary.  The new function is called from the FBIOPUT_VSCREENINFO
ioctl handler in fbmem.c, which will return -EINVAL if userspace asked
for a too small screen size.

Signed-off-by: Helge Deller <deller@gmx.de>
Reviewed-by: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: stable@vger.kernel.org # v5.4+
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Chen Jun <chenjun102@huawei.com>
Reviewed-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Reviewed-by: Weilong Chen <chenweilong@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/video/fbdev/core/fbcon.c | 28 ++++++++++++++++++++++++++++
 drivers/video/fbdev/core/fbmem.c |  4 +++-
 include/linux/fbcon.h            |  4 ++++
 3 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c
index 8d81e9321cf7..b4260a830e78 100644
--- a/drivers/video/fbdev/core/fbcon.c
+++ b/drivers/video/fbdev/core/fbcon.c
@@ -2776,6 +2776,34 @@ void fbcon_update_vcs(struct fb_info *info, bool all)
 }
 EXPORT_SYMBOL(fbcon_update_vcs);
 
+/* let fbcon check if it supports a new screen resolution */
+int fbcon_modechange_possible(struct fb_info *info, struct fb_var_screeninfo *var)
+{
+	struct fbcon_ops *ops = info->fbcon_par;
+	struct vc_data *vc;
+	unsigned int i;
+
+	WARN_CONSOLE_UNLOCKED();
+
+	if (!ops)
+		return 0;
+
+	/* prevent setting a screen size which is smaller than font size */
+	for (i = first_fb_vc; i <= last_fb_vc; i++) {
+		vc = vc_cons[i].d;
+		if (!vc || vc->vc_mode != KD_TEXT ||
+			   registered_fb[con2fb_map[i]] != info)
+			continue;
+
+		if (vc->vc_font.width  > FBCON_SWAP(var->rotate, var->xres, var->yres) ||
+		    vc->vc_font.height > FBCON_SWAP(var->rotate, var->yres, var->xres))
+			return -EINVAL;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(fbcon_modechange_possible);
+
 int fbcon_mode_deleted(struct fb_info *info,
 		       struct fb_videomode *mode)
 {
diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c
index 00939ca2065a..d2cffaf3fda8 100644
--- a/drivers/video/fbdev/core/fbmem.c
+++ b/drivers/video/fbdev/core/fbmem.c
@@ -1109,7 +1109,9 @@ static long do_fb_ioctl(struct fb_info *info, unsigned int cmd,
 			return -EFAULT;
 		console_lock();
 		lock_fb_info(info);
-		ret = fb_set_var(info, &var);
+		ret = fbcon_modechange_possible(info, &var);
+		if (!ret)
+			ret = fb_set_var(info, &var);
 		if (!ret)
 			fbcon_update_vcs(info, var.activate & FB_ACTIVATE_ALL);
 		unlock_fb_info(info);
diff --git a/include/linux/fbcon.h b/include/linux/fbcon.h
index ff5596dd30f8..2382dec6d6ab 100644
--- a/include/linux/fbcon.h
+++ b/include/linux/fbcon.h
@@ -15,6 +15,8 @@ void fbcon_new_modelist(struct fb_info *info);
 void fbcon_get_requirement(struct fb_info *info,
 			   struct fb_blit_caps *caps);
 void fbcon_fb_blanked(struct fb_info *info, int blank);
+int  fbcon_modechange_possible(struct fb_info *info,
+			       struct fb_var_screeninfo *var);
 void fbcon_update_vcs(struct fb_info *info, bool all);
 void fbcon_remap_all(struct fb_info *info);
 int fbcon_set_con2fb_map_ioctl(void __user *argp);
@@ -33,6 +35,8 @@ static inline void fbcon_new_modelist(struct fb_info *info) {}
 static inline void fbcon_get_requirement(struct fb_info *info,
 					 struct fb_blit_caps *caps) {}
 static inline void fbcon_fb_blanked(struct fb_info *info, int blank) {}
+static inline int  fbcon_modechange_possible(struct fb_info *info,
+				struct fb_var_screeninfo *var) { return 0; }
 static inline void fbcon_update_vcs(struct fb_info *info, bool all) {}
 static inline void fbcon_remap_all(struct fb_info *info) {}
 static inline int fbcon_set_con2fb_map_ioctl(void __user *argp) { return 0; }
-- 
Gitee


From 9ba05bbca9335251bfdaa95a344034c31969c8a0 Mon Sep 17 00:00:00 2001
From: Helge Deller <deller@gmx.de>
Date: Thu, 28 Jul 2022 18:06:39 +0800
Subject: [PATCH 651/674] fbmem: Check virtual screen sizes in fb_set_var()

stable inclusion
from stable-v5.10.130
commit b81212828ad19ab3eccf00626cd04099215060bf
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5IQ4M
CVE: CVE-2021-33655

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=b81212828ad19ab3eccf00626cd04099215060bf

--------------------------------

commit 6c11df58fd1ac0aefcb3b227f72769272b939e56 upstream.

Verify that the fbdev or drm driver correctly adjusted the virtual
screen sizes. On failure report the failing driver and reject the screen
size change.

Signed-off-by: Helge Deller <deller@gmx.de>
Reviewed-by: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: stable@vger.kernel.org # v5.4+
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Chen Jun <chenjun102@huawei.com>
Reviewed-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Reviewed-by: Weilong Chen <chenweilong@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/video/fbdev/core/fbmem.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c
index d2cffaf3fda8..3b3ccb235522 100644
--- a/drivers/video/fbdev/core/fbmem.c
+++ b/drivers/video/fbdev/core/fbmem.c
@@ -1019,6 +1019,16 @@ fb_set_var(struct fb_info *info, struct fb_var_screeninfo *var)
 	if (ret)
 		return ret;
 
+	/* verify that virtual resolution >= physical resolution */
+	if (var->xres_virtual < var->xres ||
+	    var->yres_virtual < var->yres) {
+		pr_warn("WARNING: fbcon: Driver '%s' missed to adjust virtual screen size (%ux%u vs. %ux%u)\n",
+			info->fix.id,
+			var->xres_virtual, var->yres_virtual,
+			var->xres, var->yres);
+		return -EINVAL;
+	}
+
 	if ((var->activate & FB_ACTIVATE_MASK) != FB_ACTIVATE_NOW)
 		return 0;
 
-- 
Gitee


From 89e9ad6dfaef94bddf0aa0dcd1e2c0bd69dcbb57 Mon Sep 17 00:00:00 2001
From: Eric Snowberg <eric.snowberg@oracle.com>
Date: Thu, 28 Jul 2022 18:06:40 +0800
Subject: [PATCH 652/674] lockdown: Fix kexec lockdown bypass with ima policy

mainline inclusion
from mainline-v5.19-rc8
commit 543ce63b664e2c2f9533d089a4664b559c3e6b5b
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5I0FP
CVE: CVE-2022-21505

Reference: https://seclists.org/oss-sec/2022/q3/57
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=543ce63b664e2c2f9533d089a4664b559c3e6b5b

--------------------------------

The lockdown LSM is primarily used in conjunction with UEFI Secure Boot.
This LSM may also be used on machines without UEFI.  It can also be
enabled when UEFI Secure Boot is disabled.  One of lockdown's features
is to prevent kexec from loading untrusted kernels.  Lockdown can be
enabled through a bootparam or after the kernel has booted through
securityfs.

If IMA appraisal is used with the "ima_appraise=log" boot param,
lockdown can be defeated with kexec on any machine when Secure Boot is
disabled or unavailable.  IMA prevents setting "ima_appraise=log" from
the boot param when Secure Boot is enabled, but this does not cover
cases where lockdown is used without Secure Boot.

To defeat lockdown, boot without Secure Boot and add ima_appraise=log to
the kernel command line; then:

  $ echo "integrity" > /sys/kernel/security/lockdown
  $ echo "appraise func=KEXEC_KERNEL_CHECK appraise_type=imasig" > \
    /sys/kernel/security/ima/policy
  $ kexec -ls unsigned-kernel

Add a call to verify ima appraisal is set to "enforce" whenever lockdown
is enabled.  This fixes CVE-2022-21505.

Cc: stable@vger.kernel.org
Fixes: 29d3c1c8dfe7 ("kexec: Allow kexec_file() with appropriate IMA policy when locked down")
Signed-off-by: Eric Snowberg <eric.snowberg@oracle.com>
Acked-by: Mimi Zohar <zohar@linux.ibm.com>
Reviewed-by: John Haxby <john.haxby@oracle.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: GUO Zihua <guozihua@huawei.com>
Reviewed-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Reviewed-by: Wang Weiyang <wangweiyang2@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 security/integrity/ima/ima_policy.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c
index 5a06050729a7..b1ab4b3d99fb 100644
--- a/security/integrity/ima/ima_policy.c
+++ b/security/integrity/ima/ima_policy.c
@@ -1900,6 +1900,10 @@ bool ima_appraise_signature(enum kernel_read_file_id id)
 	if (id >= READING_MAX_ID)
 		return false;
 
+	if (id == READING_KEXEC_IMAGE && !(ima_appraise & IMA_APPRAISE_ENFORCE)
+	    && security_locked_down(LOCKDOWN_KEXEC))
+		return false;
+
 	func = read_idmap[id] ?: FILE_CHECK;
 
 	rcu_read_lock();
-- 
Gitee


From 6658b33b26920a82df3203c6ecefd00b5820e921 Mon Sep 17 00:00:00 2001
From: ZhaoLong Wang <wangzhaolong1@huawei.com>
Date: Thu, 28 Jul 2022 18:06:41 +0800
Subject: [PATCH 653/674] ubifs: Fix the issue that UBIFS be read-only due to
 truncate in the encrypted directory

hulk inclusion
category: bugfix
bugzilla: 187163, https://gitee.com/openeuler/kernel/issues/I5GBC4
CVE: NA

--------------------------------

The ubifs_compress() function does not compress the data When the
data length is short than 128 bytes or the compressed data length
is not ideal.It cause that the compressed length of the truncated
data in the truncate_data_node() function may be greater than the
length of the raw data read from the flash.

The above two lengths are transferred to the ubifs_encrypt()
function as parameters. This may lead to assertion fails and then
the file system becomes read-only.

This patch use the actual length of the data in the memory as the
input parameter for assert comparison, which avoids the problem.

Signed-off-by: ZhaoLong Wang <wangzhaolong1@huawei.com>
Reviewed-by: zhihao Cheng <chengzhihao1@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 fs/ubifs/crypto.c  | 11 +++++++++++
 fs/ubifs/journal.c | 29 +++++++++++++++++------------
 2 files changed, 28 insertions(+), 12 deletions(-)

diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c
index 22be7aeb96c4..1dc22cf29c65 100644
--- a/fs/ubifs/crypto.c
+++ b/fs/ubifs/crypto.c
@@ -24,6 +24,17 @@ static bool ubifs_crypt_empty_dir(struct inode *inode)
 	return ubifs_check_dir_empty(inode) == 0;
 }
 
+/**
+ * ubifs_encrypt - Encrypt data.
+ * @inode: inode which refers to the data node
+ * @dn: data node to encrypt
+ * @in_len: length of data to be compressed
+ * @out_len: allocated memory size for the data area of @dn
+ * @block: logical block number of the block
+ *
+ * This function encrypt a possibly-compressed data in the data node.
+ * The encrypted data length will store in @out_len.
+ */
 int ubifs_encrypt(const struct inode *inode, struct ubifs_data_node *dn,
 		  unsigned int in_len, unsigned int *out_len, int block)
 {
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 72586512e51f..ee9888087983 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -1472,23 +1472,25 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
  * @block: data block number
  * @dn: data node to re-compress
  * @new_len: new length
+ * @dn_size: size of the data node @dn in memory
  *
  * This function is used when an inode is truncated and the last data node of
  * the inode has to be re-compressed/encrypted and re-written.
  */
 static int truncate_data_node(const struct ubifs_info *c, const struct inode *inode,
 			      unsigned int block, struct ubifs_data_node *dn,
-			      int *new_len)
+			      int *new_len, int dn_size)
 {
 	void *buf;
-	int err, dlen, compr_type, out_len, old_dlen;
+	int err, dlen, compr_type, out_len, data_size;
 
 	out_len = le32_to_cpu(dn->size);
 	buf = kmalloc_array(out_len, WORST_COMPR_FACTOR, GFP_NOFS);
 	if (!buf)
 		return -ENOMEM;
 
-	dlen = old_dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ;
+	dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ;
+	data_size = dn_size - UBIFS_DATA_NODE_SZ;
 	compr_type = le16_to_cpu(dn->compr_type);
 
 	if (IS_ENCRYPTED(inode)) {
@@ -1508,11 +1510,11 @@ static int truncate_data_node(const struct ubifs_info *c, const struct inode *in
 	}
 
 	if (IS_ENCRYPTED(inode)) {
-		err = ubifs_encrypt(inode, dn, out_len, &old_dlen, block);
+		err = ubifs_encrypt(inode, dn, out_len, &data_size, block);
 		if (err)
 			goto out;
 
-		out_len = old_dlen;
+		out_len = data_size;
 	} else {
 		dn->compr_size = 0;
 	}
@@ -1549,7 +1551,7 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
 	struct ubifs_ino_node *ino;
 	struct ubifs_trun_node *trun;
 	struct ubifs_data_node *dn;
-	int err, dlen, len, lnum, offs, bit, sz, sync = IS_SYNC(inode);
+	int err, dlen, len, lnum, offs, bit, sz, dn_size, sync = IS_SYNC(inode);
 	struct ubifs_inode *ui = ubifs_inode(inode);
 	ino_t inum = inode->i_ino;
 	unsigned int blk;
@@ -1562,10 +1564,13 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
 	ubifs_assert(c, S_ISREG(inode->i_mode));
 	ubifs_assert(c, mutex_is_locked(&ui->ui_mutex));
 
-	sz = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ +
-	     UBIFS_MAX_DATA_NODE_SZ * WORST_COMPR_FACTOR;
+	dn_size = COMPRESSED_DATA_NODE_BUF_SZ;
+
+	if (IS_ENCRYPTED(inode))
+		dn_size += UBIFS_CIPHER_BLOCK_SIZE;
 
-	sz += ubifs_auth_node_sz(c);
+	sz = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ +
+	     dn_size + ubifs_auth_node_sz(c);
 
 	ino = kmalloc(sz, GFP_NOFS);
 	if (!ino)
@@ -1596,15 +1601,15 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
 			if (dn_len <= 0 || dn_len > UBIFS_BLOCK_SIZE) {
 				ubifs_err(c, "bad data node (block %u, inode %lu)",
 					  blk, inode->i_ino);
-				ubifs_dump_node(c, dn, sz - UBIFS_INO_NODE_SZ -
-						UBIFS_TRUN_NODE_SZ);
+				ubifs_dump_node(c, dn, dn_size);
 				goto out_free;
 			}
 
 			if (dn_len <= dlen)
 				dlen = 0; /* Nothing to do */
 			else {
-				err = truncate_data_node(c, inode, blk, dn, &dlen);
+				err = truncate_data_node(c, inode, blk, dn,
+						&dlen, dn_size);
 				if (err)
 					goto out_free;
 			}
-- 
Gitee


From baf97eb4ed59819bb8e2d59570057de458a351d1 Mon Sep 17 00:00:00 2001
From: Zhihao Cheng <chengzhihao1@huawei.com>
Date: Thu, 28 Jul 2022 18:06:42 +0800
Subject: [PATCH 654/674] ubifs: Fix AA deadlock when setting xattr for
 encrypted file

hulk inclusion
category: bugfix
bugzilla: 187250, https://gitee.com/openeuler/kernel/issues/I5HSMS
CVE: NA

-------------------------------------------------

Following process:
vfs_setxattr(host)
  ubifs_xattr_set
    down_write(host_ui->xattr_sem)   <- lock first time
      create_xattr
        ubifs_new_inode(host)
          fscrypt_prepare_new_inode(host)
            fscrypt_policy_to_inherit(host)
              if (IS_ENCRYPTED(inode))
                fscrypt_require_key(host)
                  fscrypt_get_encryption_info(host)
                    ubifs_xattr_get(host)
                      down_read(host_ui->xattr_sem) <- AA deadlock

, which may trigger an AA deadlock problem:

[  102.620871] INFO: task setfattr:1599 blocked for more than 10 seconds.
[  102.625298]       Not tainted 5.19.0-rc7-00001-gb666b6823ce0-dirty #711
[  102.628732] task:setfattr        state:D stack:    0 pid: 1599
[  102.628749] Call Trace:
[  102.628753]  <TASK>
[  102.628776]  __schedule+0x482/0x1060
[  102.629964]  schedule+0x92/0x1a0
[  102.629976]  rwsem_down_read_slowpath+0x287/0x8c0
[  102.629996]  down_read+0x84/0x170
[  102.630585]  ubifs_xattr_get+0xd1/0x370 [ubifs]
[  102.630730]  ubifs_crypt_get_context+0x1f/0x30 [ubifs]
[  102.630791]  fscrypt_get_encryption_info+0x7d/0x1c0
[  102.630810]  fscrypt_policy_to_inherit+0x56/0xc0
[  102.630817]  fscrypt_prepare_new_inode+0x35/0x160
[  102.630830]  ubifs_new_inode+0xcc/0x4b0 [ubifs]
[  102.630873]  ubifs_xattr_set+0x591/0x9f0 [ubifs]
[  102.630961]  xattr_set+0x8c/0x3e0 [ubifs]
[  102.631003]  __vfs_setxattr+0x71/0xc0
[  102.631026]  vfs_setxattr+0x105/0x270
[  102.631034]  do_setxattr+0x6d/0x110
[  102.631041]  setxattr+0xa0/0xd0
[  102.631087]  __x64_sys_setxattr+0x2f/0x40

Fetch a reproducer in [Link].

Just like ext4 does, which skips encrypting for inode with
EXT4_EA_INODE_FL flag. Stop encypting xattr inode for ubifs.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=216260
Fixes: f4e3634a3b64222 ("ubifs: Fix races between xattr_{set|get} ...")
Fixes: d475a507457b5ca ("ubifs: Add skeleton for fscrypto")
Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 fs/ubifs/dir.c   | 25 ++++++++++++++-----------
 fs/ubifs/ubifs.h |  2 +-
 fs/ubifs/xattr.c |  2 +-
 3 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index f5777f59a101..acd7e83a35e4 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -68,13 +68,14 @@ static int inherit_flags(const struct inode *dir, umode_t mode)
  * @c: UBIFS file-system description object
  * @dir: parent directory inode
  * @mode: inode mode flags
+ * @is_xattr: whether the inode is xattr inode
  *
  * This function finds an unused inode number, allocates new inode and
  * initializes it. Returns new inode in case of success and an error code in
  * case of failure.
  */
 struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir,
-			      umode_t mode)
+			      umode_t mode, bool is_xattr)
 {
 	int err;
 	struct inode *inode;
@@ -99,10 +100,12 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir,
 			 current_time(inode);
 	inode->i_mapping->nrpages = 0;
 
-	err = fscrypt_prepare_new_inode(dir, inode, &encrypted);
-	if (err) {
-		ubifs_err(c, "fscrypt_prepare_new_inode failed: %i", err);
-		goto out_iput;
+	if (!is_xattr) {
+		err = fscrypt_prepare_new_inode(dir, inode, &encrypted);
+		if (err) {
+			ubifs_err(c, "fscrypt_prepare_new_inode failed: %i", err);
+			goto out_iput;
+		}
 	}
 
 	switch (mode & S_IFMT) {
@@ -308,7 +311,7 @@ static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 
 	sz_change = CALC_DENT_SIZE(fname_len(&nm));
 
-	inode = ubifs_new_inode(c, dir, mode);
+	inode = ubifs_new_inode(c, dir, mode, false);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
 		goto out_fname;
@@ -369,7 +372,7 @@ static struct inode *create_whiteout(struct inode *dir, struct dentry *dentry)
 	if (err)
 		return ERR_PTR(err);
 
-	inode = ubifs_new_inode(c, dir, mode);
+	inode = ubifs_new_inode(c, dir, mode, false);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
 		goto out_free;
@@ -461,7 +464,7 @@ static int ubifs_tmpfile(struct inode *dir, struct dentry *dentry,
 		return err;
 	}
 
-	inode = ubifs_new_inode(c, dir, mode);
+	inode = ubifs_new_inode(c, dir, mode, false);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
 		goto out_budg;
@@ -1002,7 +1005,7 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 
 	sz_change = CALC_DENT_SIZE(fname_len(&nm));
 
-	inode = ubifs_new_inode(c, dir, S_IFDIR | mode);
+	inode = ubifs_new_inode(c, dir, S_IFDIR | mode, false);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
 		goto out_fname;
@@ -1089,7 +1092,7 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry,
 
 	sz_change = CALC_DENT_SIZE(fname_len(&nm));
 
-	inode = ubifs_new_inode(c, dir, mode);
+	inode = ubifs_new_inode(c, dir, mode, false);
 	if (IS_ERR(inode)) {
 		kfree(dev);
 		err = PTR_ERR(inode);
@@ -1171,7 +1174,7 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry,
 
 	sz_change = CALC_DENT_SIZE(fname_len(&nm));
 
-	inode = ubifs_new_inode(c, dir, S_IFLNK | S_IRWXUGO);
+	inode = ubifs_new_inode(c, dir, S_IFLNK | S_IRWXUGO, false);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
 		goto out_fname;
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 9a4a3191ed07..15cfee7c1125 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1996,7 +1996,7 @@ int ubifs_update_time(struct inode *inode, struct timespec64 *time, int flags);
 
 /* dir.c */
 struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir,
-			      umode_t mode);
+			      umode_t mode, bool is_xattr);
 int ubifs_getattr(const struct path *path, struct kstat *stat,
 		  u32 request_mask, unsigned int flags);
 int ubifs_check_dir_empty(struct inode *dir);
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 258b97939ba2..81efe638acd5 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -110,7 +110,7 @@ static int create_xattr(struct ubifs_info *c, struct inode *host,
 	if (err)
 		return err;
 
-	inode = ubifs_new_inode(c, host, S_IFREG | S_IRWXUGO);
+	inode = ubifs_new_inode(c, host, S_IFREG | S_IRWXUGO, true);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
 		goto out_budg;
-- 
Gitee


From 156e60a42ffffe594f7cf38df51ee07b7f0fc99c Mon Sep 17 00:00:00 2001
From: Liu Shixin <liushixin2@huawei.com>
Date: Thu, 28 Jul 2022 18:06:43 +0800
Subject: [PATCH 655/674] Revert "mm/dynamic_hugetlb: disable dynamic hugetlb
 if hugetlb_vmemmap is enabled"

hulk inclusion
category: bugfix
bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO
CVE: NA

--------------------------------

Will disable hugetlb_vmemmap when dynamic hugetlb is enabled in later patch.

This reverts commit c7ae7c0dd37aa112f9cb3e23878f4c5fe67f86bc.

Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 mm/dynamic_hugetlb.c | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/mm/dynamic_hugetlb.c b/mm/dynamic_hugetlb.c
index eb9b528b73de..f8ebc8ab7d60 100644
--- a/mm/dynamic_hugetlb.c
+++ b/mm/dynamic_hugetlb.c
@@ -1133,17 +1133,6 @@ void __init dynamic_hugetlb_init(void)
 	if (!enable_dhugetlb)
 		return;
 
-	/*
-	 * The dynamic_hugetlb feature need to split and merge pages frequently.
-	 * hugetlb_vmemmap will affects the perforemance of page split and merge.
-	 * If want to use dynamic hugetlb, please disable hugetlb_vmemmap.
-	 */
-	if (hugetlb_free_vmemmap_enabled) {
-		enable_dhugetlb = false;
-		pr_info("Please set hugetlb_free_vmemmap=off if want to enable dynamic hugetlb\n");
-		return;
-	}
-
 	count = max(hugepage_index(max_pfn), (unsigned long)DEFAULT_PAGELIST_COUNT);
 	size = sizeof(struct dhugetlb_pagelist) + count * sizeof(struct dhugetlb_pool *);
 	dhugetlb_pagelist_t = kzalloc(size, GFP_KERNEL);
-- 
Gitee


From 2706bcc0550ce47604e18a68f6bd8d0f78f0136d Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 28 Jul 2022 18:06:44 +0800
Subject: [PATCH 656/674] jump_label: Provide CONFIG-driven build state
 defaults

mainline inclusion
from mainline-v5.13-rc1
commit 0d66ccc1627013c95f1e7ef10b95b8451cd7834e
category: feature
bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0d66ccc1627013c95f1e7ef10b95b8451cd7834e

--------------------------------

As shown in the comment in jump_label.h, choosing the initial state of
static branches changes the assembly layout. If the condition is expected
to be likely it's inline, and if unlikely it is out of line via a jump.

A few places in the kernel use (or could be using) a CONFIG to choose the
default state, which would give a small performance benefit to their
compile-time declared default. Provide the infrastructure to do this.

Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20210401232347.2791257-2-keescook@chromium.org
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 include/linux/jump_label.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index 7e1dce5670fc..d01c23025af0 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -388,6 +388,21 @@ struct static_key_false {
 		[0 ... (count) - 1] = STATIC_KEY_FALSE_INIT,	\
 	}
 
+#define _DEFINE_STATIC_KEY_1(name)	DEFINE_STATIC_KEY_TRUE(name)
+#define _DEFINE_STATIC_KEY_0(name)	DEFINE_STATIC_KEY_FALSE(name)
+#define DEFINE_STATIC_KEY_MAYBE(cfg, name)			\
+	__PASTE(_DEFINE_STATIC_KEY_, IS_ENABLED(cfg))(name)
+
+#define _DEFINE_STATIC_KEY_RO_1(name)	DEFINE_STATIC_KEY_TRUE_RO(name)
+#define _DEFINE_STATIC_KEY_RO_0(name)	DEFINE_STATIC_KEY_FALSE_RO(name)
+#define DEFINE_STATIC_KEY_MAYBE_RO(cfg, name)			\
+	__PASTE(_DEFINE_STATIC_KEY_RO_, IS_ENABLED(cfg))(name)
+
+#define _DECLARE_STATIC_KEY_1(name)	DECLARE_STATIC_KEY_TRUE(name)
+#define _DECLARE_STATIC_KEY_0(name)	DECLARE_STATIC_KEY_FALSE(name)
+#define DECLARE_STATIC_KEY_MAYBE(cfg, name)			\
+	__PASTE(_DECLARE_STATIC_KEY_, IS_ENABLED(cfg))(name)
+
 extern bool ____wrong_branch_error(void);
 
 #define static_key_enabled(x)							\
@@ -488,6 +503,10 @@ extern bool ____wrong_branch_error(void);
 
 #endif /* CONFIG_JUMP_LABEL */
 
+#define static_branch_maybe(config, x)					\
+	(IS_ENABLED(config) ? static_branch_likely(x)			\
+			    : static_branch_unlikely(x))
+
 /*
  * Advanced usage; refcount, branch is enabled when: count != 0
  */
-- 
Gitee


From 2722aa58c0fe3bd8c73b80786b1120e317c41365 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 28 Jul 2022 18:06:45 +0800
Subject: [PATCH 657/674] mm: make compound_head const-preserving

mainline inclusion
from mainline-v5.14-rc1
commit 0f2317e34e2c7b97efd4600122115410795ebeea
category: feature
bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0f2317e34e2c7b97efd4600122115410795ebeea

--------------------------------

If you pass a const pointer to compound_head(), you get a const pointer
back; if you pass a mutable pointer, you get a mutable pointer back.  Also
remove an unnecessary forward definition of struct page; we're about to
dereference page->compound_head, so it must already have been defined.

Link: https://lkml.kernel.org/r/20210416231531.2521383-5-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 include/linux/page-flags.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 65e1cbe1d1ce..b40b4e0ada31 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -194,17 +194,17 @@ enum pageflags {
 
 #ifndef __GENERATING_BOUNDS_H
 
-struct page;	/* forward declaration */
-
-static inline struct page *compound_head(struct page *page)
+static inline unsigned long _compound_head(const struct page *page)
 {
 	unsigned long head = READ_ONCE(page->compound_head);
 
 	if (unlikely(head & 1))
-		return (struct page *) (head - 1);
-	return page;
+		return head - 1;
+	return (unsigned long)page;
 }
 
+#define compound_head(page)	((typeof(page))_compound_head(page))
+
 static __always_inline int PageTail(struct page *page)
 {
 	return READ_ONCE(page->compound_head) & 1;
-- 
Gitee


From fe0aed0230fbd8246fb028d126548fa69b4f9bf4 Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Thu, 28 Jul 2022 18:06:46 +0800
Subject: [PATCH 658/674] mm: hugetlb: free the 2nd vmemmap page associated
 with each HugeTLB page

mainline inclusion
from mainline-v5.18-rc1
commit e7d324850bfcb30df563d144c0363cc44595277d
category: feature
bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e7d324850bfcb30df563d144c0363cc44595277d

--------------------------------

Patch series "Free the 2nd vmemmap page associated with each HugeTLB
page", v7.

This series can minimize the overhead of struct page for 2MB HugeTLB
pages significantly.  It further reduces the overhead of struct page by
12.5% for a 2MB HugeTLB compared to the previous approach, which means
2GB per 1TB HugeTLB.  It is a nice gain.  Comments and reviews are
welcome.  Thanks.

The main implementation and details can refer to the commit log of patch
1.  In this series, I have changed the following four helpers, the
following table shows the impact of the overhead of those helpers.

	+------------------+-----------------------+
	|       APIs       | head page | tail page |
	+------------------+-----------+-----------+
	|    PageHead()    |     Y     |     N     |
	+------------------+-----------+-----------+
	|    PageTail()    |     Y     |     N     |
	+------------------+-----------+-----------+
	|  PageCompound()  |     N     |     N     |
	+------------------+-----------+-----------+
	|  compound_head() |     Y     |     N     |
	+------------------+-----------+-----------+

	Y: Overhead is increased.
	N: Overhead is _NOT_ increased.

It shows that the overhead of those helpers on a tail page don't change
between "hugetlb_free_vmemmap=on" and "hugetlb_free_vmemmap=off".  But the
overhead on a head page will be increased when "hugetlb_free_vmemmap=on"
(except PageCompound()).  So I believe that Matthew Wilcox's folio series
will help with this.

The users of PageHead() and PageTail() are much less than compound_head()
and most users of PageTail() are VM_BUG_ON(), so I have done some tests
about the overhead of compound_head() on head pages.

I have tested the overhead of calling compound_head() on a head page,
which is 2.11ns (Measure the call time of 10 million times
compound_head(), and then average).

For a head page whose address is not aligned with PAGE_SIZE or a
non-compound page, the overhead of compound_head() is 2.54ns which is
increased by 20%.  For a head page whose address is aligned with
PAGE_SIZE, the overhead of compound_head() is 2.97ns which is increased by
40%.  Most pages are the former.  I do not think the overhead is
significant since the overhead of compound_head() itself is low.

This patch (of 5):

This patch minimizes the overhead of struct page for 2MB HugeTLB pages
significantly.  It further reduces the overhead of struct page by 12.5%
for a 2MB HugeTLB compared to the previous approach, which means 2GB per
1TB HugeTLB (2MB type).

After the feature of "Free sonme vmemmap pages of HugeTLB page" is
enabled, the mapping of the vmemmap addresses associated with a 2MB
HugeTLB page becomes the figure below.

     HugeTLB                    struct pages(8 pages)         page frame(8 pages)
 +-----------+ ---virt_to_page---> +-----------+   mapping to   +-----------+---> PG_head
 |           |                     |     0     | -------------> |     0     |
 |           |                     +-----------+                +-----------+
 |           |                     |     1     | -------------> |     1     |
 |           |                     +-----------+                +-----------+
 |           |                     |     2     | ----------------^ ^ ^ ^ ^ ^
 |           |                     +-----------+                   | | | | |
 |           |                     |     3     | ------------------+ | | | |
 |           |                     +-----------+                     | | | |
 |           |                     |     4     | --------------------+ | | |
 |    2MB    |                     +-----------+                       | | |
 |           |                     |     5     | ----------------------+ | |
 |           |                     +-----------+                         | |
 |           |                     |     6     | ------------------------+ |
 |           |                     +-----------+                           |
 |           |                     |     7     | --------------------------+
 |           |                     +-----------+
 |           |
 |           |
 |           |
 +-----------+

As we can see, the 2nd vmemmap page frame (indexed by 1) is reused and
remaped. However, the 2nd vmemmap page frame is also can be freed to
the buddy allocator, then we can change the mapping from the figure
above to the figure below.

    HugeTLB                    struct pages(8 pages)         page frame(8 pages)
 +-----------+ ---virt_to_page---> +-----------+   mapping to   +-----------+---> PG_head
 |           |                     |     0     | -------------> |     0     |
 |           |                     +-----------+                +-----------+
 |           |                     |     1     | ---------------^ ^ ^ ^ ^ ^ ^
 |           |                     +-----------+                  | | | | | |
 |           |                     |     2     | -----------------+ | | | | |
 |           |                     +-----------+                    | | | | |
 |           |                     |     3     | -------------------+ | | | |
 |           |                     +-----------+                      | | | |
 |           |                     |     4     | ---------------------+ | | |
 |    2MB    |                     +-----------+                        | | |
 |           |                     |     5     | -----------------------+ | |
 |           |                     +-----------+                          | |
 |           |                     |     6     | -------------------------+ |
 |           |                     +-----------+                            |
 |           |                     |     7     | ---------------------------+
 |           |                     +-----------+
 |           |
 |           |
 |           |
 +-----------+

After we do this, all tail vmemmap pages (1-7) are mapped to the head
vmemmap page frame (0).  In other words, there are more than one page
struct with PG_head associated with each HugeTLB page.  We __know__ that
there is only one head page struct, the tail page structs with PG_head are
fake head page structs.  We need an approach to distinguish between those
two different types of page structs so that compound_head(), PageHead()
and PageTail() can work properly if the parameter is the tail page struct
but with PG_head.

The following code snippet describes how to distinguish between real and
fake head page struct.

	if (test_bit(PG_head, &page->flags)) {
		unsigned long head = READ_ONCE(page[1].compound_head);

		if (head & 1) {
			if (head == (unsigned long)page + 1)
				==> head page struct
			else
				==> tail page struct
		} else
			==> head page struct
	}

We can safely access the field of the @page[1] with PG_head because the
@page is a compound page composed with at least two contiguous pages.

[songmuchun@bytedance.com: restore lost comment changes]

Link: https://lkml.kernel.org/r/20211101031651.75851-1-songmuchun@bytedance.com
Link: https://lkml.kernel.org/r/20211101031651.75851-2-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Reviewed-by: Barry Song <song.bao.hua@hisilicon.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Chen Huang <chenhuang5@huawei.com>
Cc: Bodeddula Balasubramaniam <bodeddub@amazon.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Cc: Fam Zheng <fam.zheng@bytedance.com>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Conflicts:
	include/linux/page-flags.h
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 .../admin-guide/kernel-parameters.txt         |  2 +-
 include/linux/page-flags.h                    | 73 ++++++++++++++++++-
 mm/hugetlb_vmemmap.c                          | 62 +++++++++-------
 mm/sparse-vmemmap.c                           | 21 ++++++
 4 files changed, 125 insertions(+), 33 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 2b04cf8fbab4..e0957b73f63d 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1596,7 +1596,7 @@
 			[KNL] Reguires CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
 			enabled.
 			Allows heavy hugetlb users to free up some more
-			memory (6 * PAGE_SIZE for each 2MB hugetlb page).
+			memory (7 * PAGE_SIZE for each 2MB hugetlb page).
 			Format: { on | off (default) }
 
 			on:  enable the feature
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index b40b4e0ada31..7b31a81be7e1 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -194,25 +194,82 @@ enum pageflags {
 
 #ifndef __GENERATING_BOUNDS_H
 
+#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
+extern bool hugetlb_free_vmemmap_enabled;
+
+/*
+ * If the feature of freeing some vmemmap pages associated with each HugeTLB
+ * page is enabled, the head vmemmap page frame is reused and all of the tail
+ * vmemmap addresses map to the head vmemmap page frame (furture details can
+ * refer to the figure at the head of the mm/hugetlb_vmemmap.c).  In other
+ * words, there are more than one page struct with PG_head associated with each
+ * HugeTLB page.  We __know__ that there is only one head page struct, the tail
+ * page structs with PG_head are fake head page structs.  We need an approach
+ * to distinguish between those two different types of page structs so that
+ * compound_head() can return the real head page struct when the parameter is
+ * the tail page struct but with PG_head.
+ *
+ * The page_fixed_fake_head() returns the real head page struct if the @page is
+ * fake page head, otherwise, returns @page which can either be a true page
+ * head or tail.
+ */
+static __always_inline const struct page *page_fixed_fake_head(const struct page *page)
+{
+	if (!hugetlb_free_vmemmap_enabled)
+		return page;
+
+	/*
+	 * Only addresses aligned with PAGE_SIZE of struct page may be fake head
+	 * struct page. The alignment check aims to avoid access the fields (
+	 * e.g. compound_head) of the @page[1]. It can avoid touch a (possibly)
+	 * cold cacheline in some cases.
+	 */
+	if (IS_ALIGNED((unsigned long)page, PAGE_SIZE) &&
+	    test_bit(PG_head, &page->flags)) {
+		/*
+		 * We can safely access the field of the @page[1] with PG_head
+		 * because the @page is a compound page composed with at least
+		 * two contiguous pages.
+		 */
+		unsigned long head = READ_ONCE(page[1].compound_head);
+
+		if (likely(head & 1))
+			return (const struct page *)(head - 1);
+	}
+	return page;
+}
+#else
+static inline const struct page *page_fixed_fake_head(const struct page *page)
+{
+	return page;
+}
+#endif
+
+static __always_inline int page_is_fake_head(struct page *page)
+{
+	return page_fixed_fake_head(page) != page;
+}
+
 static inline unsigned long _compound_head(const struct page *page)
 {
 	unsigned long head = READ_ONCE(page->compound_head);
 
 	if (unlikely(head & 1))
 		return head - 1;
-	return (unsigned long)page;
+	return (unsigned long)page_fixed_fake_head(page);
 }
 
 #define compound_head(page)	((typeof(page))_compound_head(page))
 
 static __always_inline int PageTail(struct page *page)
 {
-	return READ_ONCE(page->compound_head) & 1;
+	return READ_ONCE(page->compound_head) & 1 || page_is_fake_head(page);
 }
 
 static __always_inline int PageCompound(struct page *page)
 {
-	return test_bit(PG_head, &page->flags) || PageTail(page);
+	return test_bit(PG_head, &page->flags) ||
+	       READ_ONCE(page->compound_head) & 1;
 }
 
 #define	PAGE_POISON_PATTERN	-1l
@@ -600,7 +657,15 @@ static inline void set_page_writeback_keepwrite(struct page *page)
 	test_set_page_writeback_keepwrite(page);
 }
 
-__PAGEFLAG(Head, head, PF_ANY) CLEARPAGEFLAG(Head, head, PF_ANY)
+static __always_inline int PageHead(struct page *page)
+{
+	PF_POISONED_CHECK(page);
+	return test_bit(PG_head, &page->flags) && !page_is_fake_head(page);
+}
+
+__SETPAGEFLAG(Head, head, PF_ANY)
+__CLEARPAGEFLAG(Head, head, PF_ANY)
+CLEARPAGEFLAG(Head, head, PF_ANY)
 
 static __always_inline void set_compound_head(struct page *page, struct page *head)
 {
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index c540c21e26f5..4977f5a520c2 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -124,9 +124,9 @@
  * page of page structs (page 0) associated with the HugeTLB page contains the 4
  * page structs necessary to describe the HugeTLB. The only use of the remaining
  * pages of page structs (page 1 to page 7) is to point to page->compound_head.
- * Therefore, we can remap pages 2 to 7 to page 1. Only 2 pages of page structs
+ * Therefore, we can remap pages 1 to 7 to page 0. Only 1 page of page structs
  * will be used for each HugeTLB page. This will allow us to free the remaining
- * 6 pages to the buddy allocator.
+ * 7 pages to the buddy allocator.
  *
  * Here is how things look after remapping.
  *
@@ -134,30 +134,30 @@
  * +-----------+ ---virt_to_page---> +-----------+   mapping to   +-----------+
  * |           |                     |     0     | -------------> |     0     |
  * |           |                     +-----------+                +-----------+
- * |           |                     |     1     | -------------> |     1     |
- * |           |                     +-----------+                +-----------+
- * |           |                     |     2     | ----------------^ ^ ^ ^ ^ ^
- * |           |                     +-----------+                   | | | | |
- * |           |                     |     3     | ------------------+ | | | |
- * |           |                     +-----------+                     | | | |
- * |           |                     |     4     | --------------------+ | | |
- * |    PMD    |                     +-----------+                       | | |
- * |   level   |                     |     5     | ----------------------+ | |
- * |  mapping  |                     +-----------+                         | |
- * |           |                     |     6     | ------------------------+ |
- * |           |                     +-----------+                           |
- * |           |                     |     7     | --------------------------+
+ * |           |                     |     1     | ---------------^ ^ ^ ^ ^ ^ ^
+ * |           |                     +-----------+                  | | | | | |
+ * |           |                     |     2     | -----------------+ | | | | |
+ * |           |                     +-----------+                    | | | | |
+ * |           |                     |     3     | -------------------+ | | | |
+ * |           |                     +-----------+                      | | | |
+ * |           |                     |     4     | ---------------------+ | | |
+ * |    PMD    |                     +-----------+                        | | |
+ * |   level   |                     |     5     | -----------------------+ | |
+ * |  mapping  |                     +-----------+                          | |
+ * |           |                     |     6     | -------------------------+ |
+ * |           |                     +-----------+                            |
+ * |           |                     |     7     | ---------------------------+
  * |           |                     +-----------+
  * |           |
  * |           |
  * |           |
  * +-----------+
  *
- * When a HugeTLB is freed to the buddy system, we should allocate 6 pages for
+ * When a HugeTLB is freed to the buddy system, we should allocate 7 pages for
  * vmemmap pages and restore the previous mapping relationship.
  *
  * For the HugeTLB page of the pud level mapping. It is similar to the former.
- * We also can use this approach to free (PAGE_SIZE - 2) vmemmap pages.
+ * We also can use this approach to free (PAGE_SIZE - 1) vmemmap pages.
  *
  * Apart from the HugeTLB page of the pmd/pud level mapping, some architectures
  * (e.g. aarch64) provides a contiguous bit in the translation table entries
@@ -166,7 +166,13 @@
  *
  * The contiguous bit is used to increase the mapping size at the pmd and pte
  * (last) level. So this type of HugeTLB page can be optimized only when its
- * size of the struct page structs is greater than 2 pages.
+ * size of the struct page structs is greater than 1 page.
+ *
+ * Notice: The head vmemmap page is not freed to the buddy allocator and all
+ * tail vmemmap pages are mapped to the head vmemmap page frame. So we can see
+ * more than one struct page struct with PG_head (e.g. 8 per 2 MB HugeTLB page)
+ * associated with each HugeTLB page. The compound_head() can handle this
+ * correctly (more details refer to the comment above compound_head()).
  */
 #define pr_fmt(fmt)	"HugeTLB: " fmt
 
@@ -175,19 +181,21 @@
 /*
  * There are a lot of struct page structures associated with each HugeTLB page.
  * For tail pages, the value of compound_head is the same. So we can reuse first
- * page of tail page structures. We map the virtual addresses of the remaining
- * pages of tail page structures to the first tail page struct, and then free
- * these page frames. Therefore, we need to reserve two pages as vmemmap areas.
+ * page of head page structures. We map the virtual addresses of all the pages
+ * of tail page structures to the head page struct, and then free these page
+ * frames. Therefore, we need to reserve one pages as vmemmap areas.
  */
-#define RESERVE_VMEMMAP_NR		2U
+#define RESERVE_VMEMMAP_NR		1U
 #define RESERVE_VMEMMAP_SIZE		(RESERVE_VMEMMAP_NR << PAGE_SHIFT)
 
-bool hugetlb_free_vmemmap_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON);
+bool hugetlb_free_vmemmap_enabled __read_mostly =
+	IS_ENABLED(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON);
+EXPORT_SYMBOL(hugetlb_free_vmemmap_enabled);
 
 static int __init early_hugetlb_free_vmemmap_param(char *buf)
 {
 	/* We cannot optimize if a "struct page" crosses page boundaries. */
-	if ((!is_power_of_2(sizeof(struct page)))) {
+	if (!is_power_of_2(sizeof(struct page))) {
 		pr_warn("cannot free vmemmap pages because \"struct page\" crosses page boundaries\n");
 		return 0;
 	}
@@ -236,7 +244,6 @@ int alloc_huge_page_vmemmap(struct hstate *h, struct page *head)
 	 */
 	ret = vmemmap_remap_alloc(vmemmap_addr, vmemmap_end, vmemmap_reuse,
 				  GFP_KERNEL | __GFP_NORETRY | __GFP_THISNODE);
-
 	if (!ret)
 		ClearHPageVmemmapOptimized(head);
 
@@ -282,9 +289,8 @@ void __init hugetlb_vmemmap_init(struct hstate *h)
 
 	vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT;
 	/*
-	 * The head page and the first tail page are not to be freed to buddy
-	 * allocator, the other pages will map to the first tail page, so they
-	 * can be freed.
+	 * The head page is not to be freed to buddy allocator, the other tail
+	 * pages will map to the head page, so they can be freed.
 	 *
 	 * Could RESERVE_VMEMMAP_NR be greater than @vmemmap_pages? It is true
 	 * on some architectures (e.g. aarch64). See Documentation/arm64/
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 396a49462894..39bdbf0b28dd 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -245,6 +245,26 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
 	set_pte_at(&init_mm, addr, pte, entry);
 }
 
+/*
+ * How many struct page structs need to be reset. When we reuse the head
+ * struct page, the special metadata (e.g. page->flags or page->mapping)
+ * cannot copy to the tail struct page structs. The invalid value will be
+ * checked in the free_tail_pages_check(). In order to avoid the message
+ * of "corrupted mapping in tail page". We need to reset at least 3 (one
+ * head struct page struct and two tail struct page structs) struct page
+ * structs.
+ */
+#define NR_RESET_STRUCT_PAGE		3
+
+static inline void reset_struct_pages(struct page *start)
+{
+	int i;
+	struct page *from = start + NR_RESET_STRUCT_PAGE;
+
+	for (i = 0; i < NR_RESET_STRUCT_PAGE; i++)
+		memcpy(start + i, from, sizeof(*from));
+}
+
 static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
 				struct vmemmap_remap_walk *walk)
 {
@@ -258,6 +278,7 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
 	list_del(&page->lru);
 	to = page_to_virt(page);
 	copy_page(to, (void *)walk->reuse_addr);
+	reset_struct_pages(to);
 
 	set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
 }
-- 
Gitee


From 9b4aa808afd36ebad9567413eddf5a0db8b8441f Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Thu, 28 Jul 2022 18:06:47 +0800
Subject: [PATCH 659/674] mm: hugetlb: replace hugetlb_free_vmemmap_enabled
 with a static_key

mainline inclusion
from mainline-v5.18-rc1
commit a6b40850c442bf996e729e1d441d3dbc37cea171
category: feature
bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a6b40850c442bf996e729e1d441d3dbc37cea171

--------------------------------

The page_fixed_fake_head() is used throughout memory management and the
conditional check requires checking a global variable, although the
overhead of this check may be small, it increases when the memory cache
comes under pressure.  Also, the global variable will not be modified
after system boot, so it is very appropriate to use static key machanism.

Link: https://lkml.kernel.org/r/20211101031651.75851-3-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Reviewed-by: Barry Song <song.bao.hua@hisilicon.com>
Cc: Bodeddula Balasubramaniam <bodeddub@amazon.com>
Cc: Chen Huang <chenhuang5@huawei.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Fam Zheng <fam.zheng@bytedance.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Conflicts:
	mm/memory_hotplug.c
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 include/linux/hugetlb.h    |  6 ------
 include/linux/page-flags.h | 16 ++++++++++++++--
 mm/hugetlb_vmemmap.c       | 12 ++++++------
 3 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 634630ebc8a7..fd1dc8d29436 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -1103,12 +1103,6 @@ static inline int hugetlb_insert_hugepage_pte_by_pa(struct mm_struct *mm,
 }
 #endif	/* CONFIG_HUGETLB_PAGE */
 
-#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
-extern bool hugetlb_free_vmemmap_enabled;
-#else
-#define hugetlb_free_vmemmap_enabled	false
-#endif
-
 static inline spinlock_t *huge_pte_lock(struct hstate *h,
 					struct mm_struct *mm, pte_t *pte)
 {
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 7b31a81be7e1..81061122af68 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -195,7 +195,14 @@ enum pageflags {
 #ifndef __GENERATING_BOUNDS_H
 
 #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
-extern bool hugetlb_free_vmemmap_enabled;
+DECLARE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON,
+			 hugetlb_free_vmemmap_enabled_key);
+
+static __always_inline bool hugetlb_free_vmemmap_enabled(void)
+{
+	return static_branch_maybe(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON,
+				   &hugetlb_free_vmemmap_enabled_key);
+}
 
 /*
  * If the feature of freeing some vmemmap pages associated with each HugeTLB
@@ -215,7 +222,7 @@ extern bool hugetlb_free_vmemmap_enabled;
  */
 static __always_inline const struct page *page_fixed_fake_head(const struct page *page)
 {
-	if (!hugetlb_free_vmemmap_enabled)
+	if (!hugetlb_free_vmemmap_enabled())
 		return page;
 
 	/*
@@ -243,6 +250,11 @@ static inline const struct page *page_fixed_fake_head(const struct page *page)
 {
 	return page;
 }
+
+static inline bool hugetlb_free_vmemmap_enabled(void)
+{
+	return false;
+}
 #endif
 
 static __always_inline int page_is_fake_head(struct page *page)
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 4977f5a520c2..791626983c2e 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -188,9 +188,9 @@
 #define RESERVE_VMEMMAP_NR		1U
 #define RESERVE_VMEMMAP_SIZE		(RESERVE_VMEMMAP_NR << PAGE_SHIFT)
 
-bool hugetlb_free_vmemmap_enabled __read_mostly =
-	IS_ENABLED(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON);
-EXPORT_SYMBOL(hugetlb_free_vmemmap_enabled);
+DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON,
+			hugetlb_free_vmemmap_enabled_key);
+EXPORT_SYMBOL(hugetlb_free_vmemmap_enabled_key);
 
 static int __init early_hugetlb_free_vmemmap_param(char *buf)
 {
@@ -204,9 +204,9 @@ static int __init early_hugetlb_free_vmemmap_param(char *buf)
 		return -EINVAL;
 
 	if (!strcmp(buf, "on"))
-		hugetlb_free_vmemmap_enabled = true;
+		static_branch_enable(&hugetlb_free_vmemmap_enabled_key);
 	else if (!strcmp(buf, "off"))
-		hugetlb_free_vmemmap_enabled = false;
+		static_branch_disable(&hugetlb_free_vmemmap_enabled_key);
 	else
 		return -EINVAL;
 
@@ -284,7 +284,7 @@ void __init hugetlb_vmemmap_init(struct hstate *h)
 	BUILD_BUG_ON(__NR_USED_SUBPAGE >=
 		     RESERVE_VMEMMAP_SIZE / sizeof(struct page));
 
-	if (!hugetlb_free_vmemmap_enabled)
+	if (!hugetlb_free_vmemmap_enabled())
 		return;
 
 	vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT;
-- 
Gitee


From b31b84bf5c6d282b2e3bdcf8dc6df4f395efcb19 Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Thu, 28 Jul 2022 18:06:48 +0800
Subject: [PATCH 660/674] mm: sparsemem: use page table lock to protect kernel
 pmd operations

mainline inclusion
from mainline-v5.18-rc1
commit d8d55f5616cf3b900a23a72dd24e7b07211e7859
category: feature
bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d8d55f5616cf3b900a23a72dd24e7b07211e7859

--------------------------------

The init_mm.page_table_lock is used to protect kernel page tables, we
can use it to serialize splitting vmemmap PMD mappings instead of mmap
write lock, which can increase the concurrency of vmemmap_remap_free().

Actually, It increase the concurrency between allocations of HugeTLB
pages.  But it is not the only benefit.  There are a lot of users of
mmap read lock of init_mm.  The mmap write lock is holding through
vmemmap_remap_free(), removing mmap write lock usage to make it does not
affect other users of mmap read lock.  It is not making anything worse
and always a win to move.

Now the kernel page table walker does not hold the page_table_lock when
walking pmd entries.  There may be consistency issue of a pmd entry,
because pmd entry might change from a huge pmd entry to a PTE page
table.  There is only one user of kernel page table walker, namely
ptdump.  The ptdump already considers the consistency, which use a local
variable to cache the value of pmd entry.  But we also need to update
->action to ACTION_CONTINUE to make sure the walker does not walk every
pte entry again when concurrent thread has split the huge pmd.

Link: https://lkml.kernel.org/r/20211101031651.75851-4-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Cc: Barry Song <song.bao.hua@hisilicon.com>
Cc: Bodeddula Balasubramaniam <bodeddub@amazon.com>
Cc: Chen Huang <chenhuang5@huawei.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Fam Zheng <fam.zheng@bytedance.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Conflicts:
	mm/sparse-vmemmap.c
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 mm/ptdump.c         | 16 +++++++++++----
 mm/sparse-vmemmap.c | 47 ++++++++++++++++++++++++++++++---------------
 2 files changed, 43 insertions(+), 20 deletions(-)

diff --git a/mm/ptdump.c b/mm/ptdump.c
index 93f2f63dc52d..43661863096b 100644
--- a/mm/ptdump.c
+++ b/mm/ptdump.c
@@ -39,8 +39,10 @@ static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr,
 	if (st->effective_prot)
 		st->effective_prot(st, 0, pgd_val(val));
 
-	if (pgd_leaf(val))
+	if (pgd_leaf(val)) {
 		st->note_page(st, addr, 0, pgd_val(val));
+		walk->action = ACTION_CONTINUE;
+	}
 
 	return 0;
 }
@@ -59,8 +61,10 @@ static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr,
 	if (st->effective_prot)
 		st->effective_prot(st, 1, p4d_val(val));
 
-	if (p4d_leaf(val))
+	if (p4d_leaf(val)) {
 		st->note_page(st, addr, 1, p4d_val(val));
+		walk->action = ACTION_CONTINUE;
+	}
 
 	return 0;
 }
@@ -79,8 +83,10 @@ static int ptdump_pud_entry(pud_t *pud, unsigned long addr,
 	if (st->effective_prot)
 		st->effective_prot(st, 2, pud_val(val));
 
-	if (pud_leaf(val))
+	if (pud_leaf(val)) {
 		st->note_page(st, addr, 2, pud_val(val));
+		walk->action = ACTION_CONTINUE;
+	}
 
 	return 0;
 }
@@ -98,8 +104,10 @@ static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr,
 
 	if (st->effective_prot)
 		st->effective_prot(st, 3, pmd_val(val));
-	if (pmd_leaf(val))
+	if (pmd_leaf(val)) {
 		st->note_page(st, addr, 3, pmd_val(val));
+		walk->action = ACTION_CONTINUE;
+	}
 
 	return 0;
 }
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 39bdbf0b28dd..eef9053f948f 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -53,8 +53,7 @@ struct vmemmap_remap_walk {
 	struct list_head *vmemmap_pages;
 };
 
-static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start,
-				  struct vmemmap_remap_walk *walk)
+static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
 {
 	pmd_t __pmd;
 	int i;
@@ -76,15 +75,34 @@ static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start,
 		set_pte_at(&init_mm, addr, pte, entry);
 	}
 
-	/* Make pte visible before pmd. See comment in __pte_alloc(). */
-	smp_wmb();
-	pmd_populate_kernel(&init_mm, pmd, pgtable);
-
-	flush_tlb_kernel_range(start, start + PMD_SIZE);
+	spin_lock(&init_mm.page_table_lock);
+	if (likely(pmd_leaf(*pmd))) {
+		/* Make pte visible before pmd. See comment in __pte_alloc(). */
+		smp_wmb();
+		pmd_populate_kernel(&init_mm, pmd, pgtable);
+		flush_tlb_kernel_range(start, start + PMD_SIZE);
+	} else {
+		pte_free_kernel(&init_mm, pgtable);
+	}
+	spin_unlock(&init_mm.page_table_lock);
 
 	return 0;
 }
 
+static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
+{
+	int leaf;
+
+	spin_lock(&init_mm.page_table_lock);
+	leaf = pmd_leaf(*pmd);
+	spin_unlock(&init_mm.page_table_lock);
+
+	if (!leaf)
+		return 0;
+
+	return __split_vmemmap_huge_pmd(pmd, start);
+}
+
 static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
 			      unsigned long end,
 			      struct vmemmap_remap_walk *walk)
@@ -121,13 +139,12 @@ static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
 
 	pmd = pmd_offset(pud, addr);
 	do {
-		if (pmd_leaf(*pmd)) {
-			int ret;
+		int ret;
+
+		ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK);
+		if (ret)
+			return ret;
 
-			ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK, walk);
-			if (ret)
-				return ret;
-		}
 		next = pmd_addr_end(addr, end);
 		vmemmap_pte_range(pmd, addr, next, walk);
 	} while (pmd++, addr = next, addr != end);
@@ -321,10 +338,8 @@ int vmemmap_remap_free(unsigned long start, unsigned long end,
 	 */
 	BUG_ON(start - reuse != PAGE_SIZE);
 
-	mmap_write_lock(&init_mm);
+	mmap_read_lock(&init_mm);
 	ret = vmemmap_remap_range(reuse, end, &walk);
-	mmap_write_downgrade(&init_mm);
-
 	if (ret && walk.nr_walked) {
 		end = reuse + walk.nr_walked * PAGE_SIZE;
 		/*
-- 
Gitee


From 7d9a273f5bc8369c07ac2ba082984cb1d1e9b182 Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Thu, 28 Jul 2022 18:06:49 +0800
Subject: [PATCH 661/674] selftests: vm: add a hugetlb test case

mainline inclusion
from mainline-v5.18-rc1
commit b147c89cd429321a59147368378c8aba17c8480f
category: feature
bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b147c89cd429321a59147368378c8aba17c8480f

--------------------------------

Since the head vmemmap page frame associated with each HugeTLB page is
reused, we should hide the PG_head flag of tail struct page from the
user.  Add a tese case to check whether it is work properly.  The test
steps are as follows.

  1) alloc 2MB hugeTLB
  2) get each page frame
  3) apply those APIs in each page frame
  4) Those APIs work completely the same as before.

Reading the flags of a page by /proc/kpageflags is done in
stable_page_flags(), which has invoked PageHead(), PageTail(),
PageCompound() and compound_head().

If those APIs work properly, the head page must have 15 and 17 bits set.
And tail pages must have 16 and 17 bits set but 15 bit unset.  Those
flags are checked in check_page_flags().

Link: https://lkml.kernel.org/r/20211101031651.75851-5-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Reviewed-by: Barry Song <song.bao.hua@hisilicon.com>
Cc: Bodeddula Balasubramaniam <bodeddub@amazon.com>
Cc: Chen Huang <chenhuang5@huawei.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Fam Zheng <fam.zheng@bytedance.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Conflicts:
	tools/testing/selftests/vm/Makefile
	tools/testing/selftests/vm/run_vmtests.sh
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 tools/testing/selftests/vm/.gitignore         |   1 +
 tools/testing/selftests/vm/Makefile           |   1 +
 tools/testing/selftests/vm/hugepage-vmemmap.c | 144 ++++++++++++++++++
 tools/testing/selftests/vm/run_vmtests        |  11 ++
 4 files changed, 157 insertions(+)
 create mode 100644 tools/testing/selftests/vm/hugepage-vmemmap.c

diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore
index b3a183c36cb5..905dc4efa879 100644
--- a/tools/testing/selftests/vm/.gitignore
+++ b/tools/testing/selftests/vm/.gitignore
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0-only
 hugepage-mmap
 hugepage-shm
+hugepage-vmemmap
 khugepaged
 map_hugetlb
 map_populate
diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
index b150cc837177..549761bc7193 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -27,6 +27,7 @@ TEST_GEN_FILES += gup_benchmark
 TEST_GEN_FILES += hmm-tests
 TEST_GEN_FILES += hugepage-mmap
 TEST_GEN_FILES += hugepage-shm
+TEST_GEN_FILES += hugepage-vmemmap
 TEST_GEN_FILES += map_hugetlb
 TEST_GEN_FILES += map_fixed_noreplace
 TEST_GEN_FILES += map_populate
diff --git a/tools/testing/selftests/vm/hugepage-vmemmap.c b/tools/testing/selftests/vm/hugepage-vmemmap.c
new file mode 100644
index 000000000000..557bdbd4f87e
--- /dev/null
+++ b/tools/testing/selftests/vm/hugepage-vmemmap.c
@@ -0,0 +1,144 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * A test case of using hugepage memory in a user application using the
+ * mmap system call with MAP_HUGETLB flag.  Before running this program
+ * make sure the administrator has allocated enough default sized huge
+ * pages to cover the 2 MB allocation.
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+
+#define MAP_LENGTH		(2UL * 1024 * 1024)
+
+#ifndef MAP_HUGETLB
+#define MAP_HUGETLB		0x40000	/* arch specific */
+#endif
+
+#define PAGE_SIZE		4096
+
+#define PAGE_COMPOUND_HEAD	(1UL << 15)
+#define PAGE_COMPOUND_TAIL	(1UL << 16)
+#define PAGE_HUGE		(1UL << 17)
+
+#define HEAD_PAGE_FLAGS		(PAGE_COMPOUND_HEAD | PAGE_HUGE)
+#define TAIL_PAGE_FLAGS		(PAGE_COMPOUND_TAIL | PAGE_HUGE)
+
+#define PM_PFRAME_BITS		55
+#define PM_PFRAME_MASK		~((1UL << PM_PFRAME_BITS) - 1)
+
+/*
+ * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages.
+ * That means the addresses starting with 0x800000... will need to be
+ * specified.  Specifying a fixed address is not required on ppc64, i386
+ * or x86_64.
+ */
+#ifdef __ia64__
+#define MAP_ADDR		(void *)(0x8000000000000000UL)
+#define MAP_FLAGS		(MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED)
+#else
+#define MAP_ADDR		NULL
+#define MAP_FLAGS		(MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB)
+#endif
+
+static void write_bytes(char *addr, size_t length)
+{
+	unsigned long i;
+
+	for (i = 0; i < length; i++)
+		*(addr + i) = (char)i;
+}
+
+static unsigned long virt_to_pfn(void *addr)
+{
+	int fd;
+	unsigned long pagemap;
+
+	fd = open("/proc/self/pagemap", O_RDONLY);
+	if (fd < 0)
+		return -1UL;
+
+	lseek(fd, (unsigned long)addr / PAGE_SIZE * sizeof(pagemap), SEEK_SET);
+	read(fd, &pagemap, sizeof(pagemap));
+	close(fd);
+
+	return pagemap & ~PM_PFRAME_MASK;
+}
+
+static int check_page_flags(unsigned long pfn)
+{
+	int fd, i;
+	unsigned long pageflags;
+
+	fd = open("/proc/kpageflags", O_RDONLY);
+	if (fd < 0)
+		return -1;
+
+	lseek(fd, pfn * sizeof(pageflags), SEEK_SET);
+
+	read(fd, &pageflags, sizeof(pageflags));
+	if ((pageflags & HEAD_PAGE_FLAGS) != HEAD_PAGE_FLAGS) {
+		close(fd);
+		printf("Head page flags (%lx) is invalid\n", pageflags);
+		return -1;
+	}
+
+	/*
+	 * pages other than the first page must be tail and shouldn't be head;
+	 * this also verifies kernel has correctly set the fake page_head to tail
+	 * while hugetlb_free_vmemmap is enabled.
+	 */
+	for (i = 1; i < MAP_LENGTH / PAGE_SIZE; i++) {
+		read(fd, &pageflags, sizeof(pageflags));
+		if ((pageflags & TAIL_PAGE_FLAGS) != TAIL_PAGE_FLAGS ||
+		    (pageflags & HEAD_PAGE_FLAGS) == HEAD_PAGE_FLAGS) {
+			close(fd);
+			printf("Tail page flags (%lx) is invalid\n", pageflags);
+			return -1;
+		}
+	}
+
+	close(fd);
+
+	return 0;
+}
+
+int main(int argc, char **argv)
+{
+	void *addr;
+	unsigned long pfn;
+
+	addr = mmap(MAP_ADDR, MAP_LENGTH, PROT_READ | PROT_WRITE, MAP_FLAGS, -1, 0);
+	if (addr == MAP_FAILED) {
+		perror("mmap");
+		exit(1);
+	}
+
+	/* Trigger allocation of HugeTLB page. */
+	write_bytes(addr, MAP_LENGTH);
+
+	pfn = virt_to_pfn(addr);
+	if (pfn == -1UL) {
+		munmap(addr, MAP_LENGTH);
+		perror("virt_to_pfn");
+		exit(1);
+	}
+
+	printf("Returned address is %p whose pfn is %lx\n", addr, pfn);
+
+	if (check_page_flags(pfn) < 0) {
+		munmap(addr, MAP_LENGTH);
+		perror("check_page_flags");
+		exit(1);
+	}
+
+	/* munmap() length of MAP_HUGETLB memory must be hugepage aligned */
+	if (munmap(addr, MAP_LENGTH)) {
+		perror("munmap");
+		exit(1);
+	}
+
+	return 0;
+}
diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests
index d578ad831813..949c71fe5951 100755
--- a/tools/testing/selftests/vm/run_vmtests
+++ b/tools/testing/selftests/vm/run_vmtests
@@ -108,6 +108,17 @@ else
 	echo "[PASS]"
 fi
 
+echo "------------------------"
+echo "running hugepage-vmemmap"
+echo "------------------------"
+./hugepage-vmemmap
+if [ $? -ne 0 ]; then
+	echo "[FAIL]"
+	exitcode=1
+else
+	echo "[PASS]"
+fi
+
 echo "NOTE: The above hugetlb tests provide minimal coverage.  Use"
 echo "      https://github.com/libhugetlbfs/libhugetlbfs.git for"
 echo "      hugetlb regression testing."
-- 
Gitee


From 22f969cffb0be543692d6942f41cf3735fb66790 Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Thu, 28 Jul 2022 18:06:50 +0800
Subject: [PATCH 662/674] mm: sparsemem: move vmemmap related to HugeTLB to
 CONFIG_HUGETLB_PAGE_FREE_VMEMMAP

mainline inclusion
from mainline-v5.18-rc1
commit e54084173487804f5e2f23facf107fd9336e637e
category: feature
bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e54084173487804f5e2f23facf107fd9336e637e

--------------------------------

The vmemmap_remap_free/alloc are relevant to HugeTLB, so move those
functiongs to the scope of CONFIG_HUGETLB_PAGE_FREE_VMEMMAP.

Link: https://lkml.kernel.org/r/20211101031651.75851-6-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Reviewed-by: Barry Song <song.bao.hua@hisilicon.com>
Cc: Bodeddula Balasubramaniam <bodeddub@amazon.com>
Cc: Chen Huang <chenhuang5@huawei.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Fam Zheng <fam.zheng@bytedance.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 include/linux/mm.h  | 2 ++
 mm/sparse-vmemmap.c | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 25891b581bf4..efa3972a23bf 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3104,10 +3104,12 @@ static inline void print_vma_addr(char *prefix, unsigned long rip)
 }
 #endif
 
+#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
 int vmemmap_remap_free(unsigned long start, unsigned long end,
 		       unsigned long reuse);
 int vmemmap_remap_alloc(unsigned long start, unsigned long end,
 			unsigned long reuse, gfp_t gfp_mask);
+#endif
 
 void *sparse_buffer_alloc(unsigned long size);
 struct page * __populate_section_memmap(unsigned long pfn,
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index eef9053f948f..e5ed2680ec57 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -34,6 +34,7 @@
 #include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 
+#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
 /**
  * struct vmemmap_remap_walk - walk vmemmap page table
  *
@@ -419,6 +420,7 @@ int vmemmap_remap_alloc(unsigned long start, unsigned long end,
 
 	return 0;
 }
+#endif /* CONFIG_HUGETLB_PAGE_FREE_VMEMMAP */
 
 /*
  * Allocate a block of memory to be used to back the virtual memory map
-- 
Gitee


From 0a5d68bb279164b7ec023203dc9f0d6a59fc17f1 Mon Sep 17 00:00:00 2001
From: Liu Shixin <liushixin2@huawei.com>
Date: Thu, 28 Jul 2022 18:06:51 +0800
Subject: [PATCH 663/674] Revert "arm64: mm: hugetlb: add support for free
 vmemmap pages of HugeTLB"

hulk inclusion
category: feature
bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO

--------------------------------

There is a formal solution to support hugetlb vmemmap feature on arm64.

This reverts commit 5838d235a395e221968e1f024bfaf8650278a522.

Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 fs/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/Kconfig b/fs/Kconfig
index 6e723c90a506..37a8895a3254 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -239,7 +239,7 @@ config HUGETLB_PAGE
 
 config HUGETLB_PAGE_FREE_VMEMMAP
 	def_bool HUGETLB_PAGE
-	depends on X86_64 || ARM64
+	depends on X86_64
 	depends on SPARSEMEM_VMEMMAP
 
 config HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON
-- 
Gitee


From e457e43c459e8ecc1679f5f356f11c84ada79bbf Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Thu, 28 Jul 2022 18:06:52 +0800
Subject: [PATCH 664/674] mm: hugetlb_vmemmap: introduce
 ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP

mainline inclusion
from mainline-v5.19-rc1
commit 2e4ec02bbcc05b8905d65c763ebde6bc85508e90
category: feature
bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=2e4ec02bbcc05b8905d65c763ebde6bc85508e90

--------------------------------

The feature of minimizing overhead of struct page associated with each
HugeTLB page is implemented on x86_64, however, the infrastructure of this
feature is already there, we could easily enable it for other
architectures.  Introduce ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP for other
architectures to be easily enabled.  Just select this config if they want
to enable this feature.

Link: https://lkml.kernel.org/r/20220331065640.5777-1-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Suggested-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Barry Song <baohua@kernel.org>
Tested-by: Barry Song <baohua@kernel.org>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Bodeddula Balasubramaniam <bodeddub@amazon.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Fam Zheng <fam.zheng@bytedance.com>
Cc: James Morse <james.morse@arm.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Will Deacon <will@kernel.org>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 arch/x86/Kconfig |  1 +
 fs/Kconfig       | 10 +++++++++-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d17396ef4323..661e05e1f762 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -103,6 +103,7 @@ config X86
 	select ARCH_WANT_DEFAULT_BPF_JIT	if X86_64
 	select ARCH_WANTS_DYNAMIC_TASK_STRUCT
 	select ARCH_WANT_HUGE_PMD_SHARE
+	select ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP	if X86_64
 	select ARCH_WANT_LD_ORPHAN_WARN
 	select ARCH_WANTS_THP_SWAP		if X86_64
 	select BUILDTIME_TABLE_SORT
diff --git a/fs/Kconfig b/fs/Kconfig
index 37a8895a3254..b60a7614cb16 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -237,9 +237,17 @@ config HUGETLBFS
 config HUGETLB_PAGE
 	def_bool HUGETLBFS
 
+#
+# Select this config option from the architecture Kconfig, if it is preferred
+# to enable the feature of minimizing overhead of struct page associated with
+# each HugeTLB page.
+#
+config ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP
+	bool
+
 config HUGETLB_PAGE_FREE_VMEMMAP
 	def_bool HUGETLB_PAGE
-	depends on X86_64
+	depends on ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP
 	depends on SPARSEMEM_VMEMMAP
 
 config HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON
-- 
Gitee


From de126e063c1e89c698fe2304d0e590f69157ac66 Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Thu, 28 Jul 2022 18:06:53 +0800
Subject: [PATCH 665/674] arm64: mm: hugetlb: enable HUGETLB_PAGE_FREE_VMEMMAP
 for arm64

mainline inclusion
from mainline-v5.19-rc1
commit 1e63ac088f20f7a4425c430c31ecd3cf167fb3f2
category: feature
bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=1e63ac088f20f7a4425c430c31ecd3cf167fb3f2

--------------------------------

The feature of minimizing overhead of struct page associated with each
HugeTLB page aims to free its vmemmap pages (used as struct page) to save
memory, where is ~14GB/16GB per 1TB HugeTLB pages (2MB/1GB type).  In
short, when a HugeTLB page is allocated or freed, the vmemmap array
representing the range associated with the page will need to be remapped.
When a page is allocated, vmemmap pages are freed after remapping.  When a
page is freed, previously discarded vmemmap pages must be allocated before
remapping.  More implementations and details can be found here [1].

The infrastructure of freeing vmemmap pages associated with each HugeTLB
page is already there, we can easily enable HUGETLB_PAGE_FREE_VMEMMAP for
arm64, the only thing to be fixed is flush_dcache_page() .

flush_dcache_page() need to be adapted to operate on the head page's flags
since the tail vmemmap pages are mapped with read-only after the feature
is enabled (clear operation is not permitted).

There was some discussions about this in the thread [2], but there was no
conclusion in the end.  And I copied the concern proposed by Anshuman to
here and explain why those concern is superfluous.  It is safe to enable
it for x86_64 as well as arm64.

1st concern:
'''
But what happens when a hot remove section's vmemmap area (which is
being teared down) is nearby another vmemmap area which is either created
or being destroyed for HugeTLB alloc/free purpose. As you mentioned
HugeTLB pages inside the hot remove section might be safe. But what about
other HugeTLB areas whose vmemmap area shares page table entries with
vmemmap entries for a section being hot removed ? Massive HugeTLB alloc
/use/free test cycle using memory just adjacent to a memory hotplug area,
which is always added and removed periodically, should be able to expose
this problem.
'''

Answer: At the time memory is removed, all HugeTLB pages either have been
migrated away or dissolved.  So there is no race between memory hot remove
and free_huge_page_vmemmap().  Therefore, HugeTLB pages inside the hot
remove section is safe.  Let's talk your question "what about other
HugeTLB areas whose vmemmap area shares page table entries with vmemmap
entries for a section being hot removed ?", the question is not
established.  The minimal granularity size of hotplug memory 128MB (on
arm64, 4k base page), any HugeTLB smaller than 128MB is within a section,
then, there is no share PTE page tables between HugeTLB in this section
and ones in other sections and a HugeTLB page could not cross two
sections.  In this case, the section cannot be freed.  Any HugeTLB bigger
than 128MB (section size) whose vmemmap pages is an integer multiple of
2MB (PMD-mapped).  As long as:

  1) HugeTLBs are naturally aligned, power-of-two sizes
  2) The HugeTLB size >= the section size
  3) The HugeTLB size >= the vmemmap leaf mapping size

Then a HugeTLB will not share any leaf page table entries with *anything
else*, but will share intermediate entries.  In this case, at the time
memory is removed, all HugeTLB pages either have been migrated away or
dissolved.  So there is also no race between memory hot remove and
free_huge_page_vmemmap().

2nd concern:
'''
differently, not sure if ptdump would require any synchronization.

Dumping an wrong value is probably okay but crashing because a page table
entry is being freed after ptdump acquired the pointer is bad. On arm64,
ptdump() is protected against hotremove via [get|put]_online_mems().
'''

Answer: The ptdump should be fine since vmemmap_remap_free() only
exchanges PTEs or splits the PMD entry (which means allocating a PTE page
table).  Both operations do not free any page tables (PTE), so ptdump
cannot run into a UAF on any page tables.  The worst case is just dumping
an wrong value.

[1] https://lore.kernel.org/all/20210510030027.56044-1-songmuchun@bytedance.com/
[2] https://lore.kernel.org/all/20210518091826.36937-1-songmuchun@bytedance.com/

[songmuchun@bytedance.com: restructure the code comment inside flush_dcache_page()]
  Link: https://lkml.kernel.org/r/20220414072646.21910-1-songmuchun@bytedance.com
Link: https://lkml.kernel.org/r/20220331065640.5777-2-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Reviewed-by: Barry Song <baohua@kernel.org>
Tested-by: Barry Song <baohua@kernel.org>
Cc: Will Deacon <will@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Bodeddula Balasubramaniam <bodeddub@amazon.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: James Morse <james.morse@arm.com>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Cc: Fam Zheng <fam.zheng@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 arch/arm64/Kconfig    |  1 +
 arch/arm64/mm/flush.c | 14 ++++++++++++++
 2 files changed, 15 insertions(+)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index e253fdba1249..d9da5c4f91e0 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -81,6 +81,7 @@ config ARM64
 	select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
 	select ARCH_WANT_FRAME_POINTERS
 	select ARCH_WANT_HUGE_PMD_SHARE if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36)
+	select ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP
 	select ARCH_WANT_LD_ORPHAN_WARN
 	select ARCH_WANT_RESERVE_CRASH_KERNEL if KEXEC_CORE
 	select ARCH_HAS_UBSAN_SANITIZE_ALL
diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c
index 145fe60c5e68..2bb6defad92f 100644
--- a/arch/arm64/mm/flush.c
+++ b/arch/arm64/mm/flush.c
@@ -69,6 +69,20 @@ EXPORT_SYMBOL_GPL(__sync_icache_dcache);
  */
 void flush_dcache_page(struct page *page)
 {
+	/*
+	 * Only the head page's flags of HugeTLB can be cleared since the tail
+	 * vmemmap pages associated with each HugeTLB page are mapped with
+	 * read-only when CONFIG_HUGETLB_PAGE_FREE_VMEMMAP is enabled (more
+	 * details can refer to vmemmap_remap_pte()).  Although
+	 * __sync_icache_dcache() only set PG_dcache_clean flag on the head
+	 * page struct, there is more than one page struct with PG_dcache_clean
+	 * associated with the HugeTLB page since the head vmemmap page frame
+	 * is reused (more details can refer to the comments above
+	 * page_fixed_fake_head()).
+	 */
+	if (hugetlb_free_vmemmap_enabled() && PageHuge(page))
+		page = compound_head(page);
+
 	if (test_bit(PG_dcache_clean, &page->flags))
 		clear_bit(PG_dcache_clean, &page->flags);
 }
-- 
Gitee


From b8dfc94bd51821c5467f622ffec51c5bfd37c342 Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Thu, 28 Jul 2022 18:06:54 +0800
Subject: [PATCH 666/674] mm: hugetlb_vmemmap: cleanup hugetlb_vmemmap related
 functions

mainline inclusion
from mainline-v5.19-rc1
commit 5981611d0a006472d367d7a8e6ead8afaecf17c7
category: feature
bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=5981611d0a006472d367d7a8e6ead8afaecf17c7

--------------------------------

Patch series "cleanup hugetlb_vmemmap".

The word of "free" is not expressive enough to express the feature of
optimizing vmemmap pages associated with each HugeTLB, rename this keywork
to "optimize" is more clear.  In this series, cheanup related codes to
make it more clear and expressive.  This is suggested by David.

This patch (of 3):

The word of "free" is not expressive enough to express the feature of
optimizing vmemmap pages associated with each HugeTLB, rename this keywork
to "optimize".  And some function names are prefixed with "huge_page"
instead of "hugetlb", it is easily to be confused with THP.  In this
patch, cheanup related functions to make code more clear and expressive.

Link: https://lkml.kernel.org/r/20220404074652.68024-1-songmuchun@bytedance.com
Link: https://lkml.kernel.org/r/20220404074652.68024-2-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Conflicts:
	mm/hugetlb.c
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 include/linux/hugetlb.h |  2 +-
 mm/hugetlb.c            |  8 ++++----
 mm/hugetlb_vmemmap.c    | 42 ++++++++++++++++++++---------------------
 mm/hugetlb_vmemmap.h    | 20 ++++++++++----------
 4 files changed, 35 insertions(+), 37 deletions(-)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index fd1dc8d29436..218fc150d7f8 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -593,7 +593,7 @@ struct hstate {
 	unsigned int surplus_huge_pages_node[MAX_NUMNODES];
 	unsigned int resv_huge_pages_node[MAX_NUMNODES];
 #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
-	unsigned int nr_free_vmemmap_pages;
+	unsigned int optimize_vmemmap_pages;
 #endif
 #ifdef CONFIG_CGROUP_HUGETLB
 	/* cgroup control files */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a8c815386ecc..c5168c7f282a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1448,7 +1448,7 @@ static void __update_and_free_page(struct hstate *h, struct page *page)
 	if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
 		return;
 
-	if (alloc_huge_page_vmemmap(h, page)) {
+	if (hugetlb_vmemmap_alloc(h, page)) {
 		spin_lock_irq(&hugetlb_lock);
 		/*
 		 * If we cannot allocate vmemmap pages, just refuse to free the
@@ -1519,7 +1519,7 @@ static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
 
 static inline void flush_free_hpage_work(struct hstate *h)
 {
-	if (free_vmemmap_pages_per_hpage(h))
+	if (hugetlb_optimize_vmemmap_pages(h))
 		flush_work(&free_hpage_work);
 }
 
@@ -1642,7 +1642,7 @@ void free_huge_page(struct page *page)
 
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
 {
-	free_huge_page_vmemmap(h, page);
+	hugetlb_vmemmap_free(h, page);
 	INIT_LIST_HEAD(&page->lru);
 	set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
 	hugetlb_set_page_subpool(page, NULL);
@@ -2066,7 +2066,7 @@ int dissolve_free_huge_page(struct page *page)
 		 * Attempt to allocate vmemmmap here so that we can take
 		 * appropriate action on failure.
 		 */
-		rc = alloc_huge_page_vmemmap(h, head);
+		rc = hugetlb_vmemmap_alloc(h, head);
 		if (!rc) {
 			/*
 			 * Move PageHWPoison flag from head page to the raw
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 791626983c2e..91b79b9d9e25 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * Free some vmemmap pages of HugeTLB
+ * Optimize vmemmap pages associated with HugeTLB
  *
  * Copyright (c) 2020, Bytedance. All rights reserved.
  *
@@ -192,7 +192,7 @@ DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON,
 			hugetlb_free_vmemmap_enabled_key);
 EXPORT_SYMBOL(hugetlb_free_vmemmap_enabled_key);
 
-static int __init early_hugetlb_free_vmemmap_param(char *buf)
+static int __init hugetlb_vmemmap_early_param(char *buf)
 {
 	/* We cannot optimize if a "struct page" crosses page boundaries. */
 	if (!is_power_of_2(sizeof(struct page))) {
@@ -212,29 +212,26 @@ static int __init early_hugetlb_free_vmemmap_param(char *buf)
 
 	return 0;
 }
-early_param("hugetlb_free_vmemmap", early_hugetlb_free_vmemmap_param);
-
-static inline unsigned long free_vmemmap_pages_size_per_hpage(struct hstate *h)
-{
-	return (unsigned long)free_vmemmap_pages_per_hpage(h) << PAGE_SHIFT;
-}
+early_param("hugetlb_free_vmemmap", hugetlb_vmemmap_early_param);
 
 /*
  * Previously discarded vmemmap pages will be allocated and remapping
  * after this function returns zero.
  */
-int alloc_huge_page_vmemmap(struct hstate *h, struct page *head)
+int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head)
 {
 	int ret;
 	unsigned long vmemmap_addr = (unsigned long)head;
-	unsigned long vmemmap_end, vmemmap_reuse;
+	unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages;
 
 	if (!HPageVmemmapOptimized(head))
 		return 0;
 
-	vmemmap_addr += RESERVE_VMEMMAP_SIZE;
-	vmemmap_end = vmemmap_addr + free_vmemmap_pages_size_per_hpage(h);
-	vmemmap_reuse = vmemmap_addr - PAGE_SIZE;
+	vmemmap_addr	+= RESERVE_VMEMMAP_SIZE;
+	vmemmap_pages	= hugetlb_optimize_vmemmap_pages(h);
+	vmemmap_end	= vmemmap_addr + (vmemmap_pages << PAGE_SHIFT);
+	vmemmap_reuse	= vmemmap_addr - PAGE_SIZE;
+
 	/*
 	 * The pages which the vmemmap virtual address range [@vmemmap_addr,
 	 * @vmemmap_end) are mapped to are freed to the buddy allocator, and
@@ -250,17 +247,18 @@ int alloc_huge_page_vmemmap(struct hstate *h, struct page *head)
 	return ret;
 }
 
-void free_huge_page_vmemmap(struct hstate *h, struct page *head)
+void hugetlb_vmemmap_free(struct hstate *h, struct page *head)
 {
 	unsigned long vmemmap_addr = (unsigned long)head;
-	unsigned long vmemmap_end, vmemmap_reuse;
+	unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages;
 
-	if (!free_vmemmap_pages_per_hpage(h))
+	vmemmap_pages = hugetlb_optimize_vmemmap_pages(h);
+	if (!vmemmap_pages)
 		return;
 
-	vmemmap_addr += RESERVE_VMEMMAP_SIZE;
-	vmemmap_end = vmemmap_addr + free_vmemmap_pages_size_per_hpage(h);
-	vmemmap_reuse = vmemmap_addr - PAGE_SIZE;
+	vmemmap_addr	+= RESERVE_VMEMMAP_SIZE;
+	vmemmap_end	= vmemmap_addr + (vmemmap_pages << PAGE_SHIFT);
+	vmemmap_reuse	= vmemmap_addr - PAGE_SIZE;
 
 	/*
 	 * Remap the vmemmap virtual address range [@vmemmap_addr, @vmemmap_end)
@@ -297,8 +295,8 @@ void __init hugetlb_vmemmap_init(struct hstate *h)
 	 * hugetlbpage.rst for more details.
 	 */
 	if (likely(vmemmap_pages > RESERVE_VMEMMAP_NR))
-		h->nr_free_vmemmap_pages = vmemmap_pages - RESERVE_VMEMMAP_NR;
+		h->optimize_vmemmap_pages = vmemmap_pages - RESERVE_VMEMMAP_NR;
 
-	pr_info("can free %d vmemmap pages for %s\n", h->nr_free_vmemmap_pages,
-		h->name);
+	pr_info("can optimize %d vmemmap pages for %s\n",
+		h->optimize_vmemmap_pages, h->name);
 }
diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
index cb2bef8f9e73..9760537849b5 100644
--- a/mm/hugetlb_vmemmap.h
+++ b/mm/hugetlb_vmemmap.h
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * Free some vmemmap pages of HugeTLB
+ * Optimize vmemmap pages associated with HugeTLB
  *
  * Copyright (c) 2020, Bytedance. All rights reserved.
  *
@@ -11,25 +11,25 @@
 #include <linux/hugetlb.h>
 
 #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
-int alloc_huge_page_vmemmap(struct hstate *h, struct page *head);
-void free_huge_page_vmemmap(struct hstate *h, struct page *head);
+int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head);
+void hugetlb_vmemmap_free(struct hstate *h, struct page *head);
 void hugetlb_vmemmap_init(struct hstate *h);
 
 /*
- * How many vmemmap pages associated with a HugeTLB page that can be freed
- * to the buddy allocator.
+ * How many vmemmap pages associated with a HugeTLB page that can be
+ * optimized and freed to the buddy allocator.
  */
-static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h)
+static inline unsigned int hugetlb_optimize_vmemmap_pages(struct hstate *h)
 {
-	return h->nr_free_vmemmap_pages;
+	return h->optimize_vmemmap_pages;
 }
 #else
-static inline int alloc_huge_page_vmemmap(struct hstate *h, struct page *head)
+static inline int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head)
 {
 	return 0;
 }
 
-static inline void free_huge_page_vmemmap(struct hstate *h, struct page *head)
+static inline void hugetlb_vmemmap_free(struct hstate *h, struct page *head)
 {
 }
 
@@ -37,7 +37,7 @@ static inline void hugetlb_vmemmap_init(struct hstate *h)
 {
 }
 
-static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h)
+static inline unsigned int hugetlb_optimize_vmemmap_pages(struct hstate *h)
 {
 	return 0;
 }
-- 
Gitee


From 142ca734a4a1aab74f8c37818b634f996ef49e08 Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Thu, 28 Jul 2022 18:06:55 +0800
Subject: [PATCH 667/674] mm: hugetlb_vmemmap: cleanup
 hugetlb_free_vmemmap_enabled*

mainline inclusion
from mainline-v5.19-rc1
commit f10f1442c309ccef7a80ba3dc4abde0978e86fb4
category: feature
bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f10f1442c309ccef7a80ba3dc4abde0978e86fb4

--------------------------------

The word of "free" is not expressive enough to express the feature of
optimizing vmemmap pages associated with each HugeTLB, rename this keywork
to "optimize".  In this patch , cheanup the static key and
hugetlb_free_vmemmap_enabled() to make code more expressive.

Link: https://lkml.kernel.org/r/20220404074652.68024-3-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Conflicts:
	mm/memory_hotplug.c
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 arch/arm64/mm/flush.c      |  2 +-
 include/linux/page-flags.h | 12 ++++++------
 mm/hugetlb_vmemmap.c       | 10 +++++-----
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c
index 2bb6defad92f..892e53e9c788 100644
--- a/arch/arm64/mm/flush.c
+++ b/arch/arm64/mm/flush.c
@@ -80,7 +80,7 @@ void flush_dcache_page(struct page *page)
 	 * is reused (more details can refer to the comments above
 	 * page_fixed_fake_head()).
 	 */
-	if (hugetlb_free_vmemmap_enabled() && PageHuge(page))
+	if (hugetlb_optimize_vmemmap_enabled() && PageHuge(page))
 		page = compound_head(page);
 
 	if (test_bit(PG_dcache_clean, &page->flags))
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 81061122af68..d1cf17a71a12 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -196,16 +196,16 @@ enum pageflags {
 
 #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
 DECLARE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON,
-			 hugetlb_free_vmemmap_enabled_key);
+			 hugetlb_optimize_vmemmap_key);
 
-static __always_inline bool hugetlb_free_vmemmap_enabled(void)
+static __always_inline bool hugetlb_optimize_vmemmap_enabled(void)
 {
 	return static_branch_maybe(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON,
-				   &hugetlb_free_vmemmap_enabled_key);
+				   &hugetlb_optimize_vmemmap_key);
 }
 
 /*
- * If the feature of freeing some vmemmap pages associated with each HugeTLB
+ * If the feature of optimizing vmemmap pages associated with each HugeTLB
  * page is enabled, the head vmemmap page frame is reused and all of the tail
  * vmemmap addresses map to the head vmemmap page frame (furture details can
  * refer to the figure at the head of the mm/hugetlb_vmemmap.c).  In other
@@ -222,7 +222,7 @@ static __always_inline bool hugetlb_free_vmemmap_enabled(void)
  */
 static __always_inline const struct page *page_fixed_fake_head(const struct page *page)
 {
-	if (!hugetlb_free_vmemmap_enabled())
+	if (!hugetlb_optimize_vmemmap_enabled())
 		return page;
 
 	/*
@@ -251,7 +251,7 @@ static inline const struct page *page_fixed_fake_head(const struct page *page)
 	return page;
 }
 
-static inline bool hugetlb_free_vmemmap_enabled(void)
+static inline bool hugetlb_optimize_vmemmap_enabled(void)
 {
 	return false;
 }
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 91b79b9d9e25..f25294973398 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -189,8 +189,8 @@
 #define RESERVE_VMEMMAP_SIZE		(RESERVE_VMEMMAP_NR << PAGE_SHIFT)
 
 DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON,
-			hugetlb_free_vmemmap_enabled_key);
-EXPORT_SYMBOL(hugetlb_free_vmemmap_enabled_key);
+			hugetlb_optimize_vmemmap_key);
+EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);
 
 static int __init hugetlb_vmemmap_early_param(char *buf)
 {
@@ -204,9 +204,9 @@ static int __init hugetlb_vmemmap_early_param(char *buf)
 		return -EINVAL;
 
 	if (!strcmp(buf, "on"))
-		static_branch_enable(&hugetlb_free_vmemmap_enabled_key);
+		static_branch_enable(&hugetlb_optimize_vmemmap_key);
 	else if (!strcmp(buf, "off"))
-		static_branch_disable(&hugetlb_free_vmemmap_enabled_key);
+		static_branch_disable(&hugetlb_optimize_vmemmap_key);
 	else
 		return -EINVAL;
 
@@ -282,7 +282,7 @@ void __init hugetlb_vmemmap_init(struct hstate *h)
 	BUILD_BUG_ON(__NR_USED_SUBPAGE >=
 		     RESERVE_VMEMMAP_SIZE / sizeof(struct page));
 
-	if (!hugetlb_free_vmemmap_enabled())
+	if (!hugetlb_optimize_vmemmap_enabled())
 		return;
 
 	vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT;
-- 
Gitee


From f1f5face0c06f0526c21122eb601f4e5482d612b Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Thu, 28 Jul 2022 18:06:56 +0800
Subject: [PATCH 668/674] mm: hugetlb_vmemmap: cleanup
 CONFIG_HUGETLB_PAGE_FREE_VMEMMAP*

mainline inclusion
from mainline-v5.19-rc1
commit 47010c040dec8af6347ec6259104fc13f7e7e30a
category: feature
bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=47010c040dec8af6347ec6259104fc13f7e7e30a

--------------------------------

The word of "free" is not expressive enough to express the feature of
optimizing vmemmap pages associated with each HugeTLB, rename this keywork
to "optimize".  In this patch , cheanup configs to make code more
expressive.

Link: https://lkml.kernel.org/r/20220404074652.68024-4-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Conflicts:
	arch/arm64/configs/openeuler_defconfig
	arch/x86/configs/openeuler_defconfig
	Documentation/admin-guide/kernel-parameters.txt
	include/linux/hugetlb.h
	mm/Makefile
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 Documentation/admin-guide/kernel-parameters.txt |  5 ++++-
 Documentation/admin-guide/mm/hugetlbpage.rst    |  2 +-
 arch/arm64/Kconfig                              |  2 +-
 arch/arm64/configs/openeuler_defconfig          |  4 ++--
 arch/arm64/mm/flush.c                           |  2 +-
 arch/x86/Kconfig                                |  2 +-
 arch/x86/configs/openeuler_defconfig            |  4 ++--
 arch/x86/mm/init_64.c                           |  2 +-
 fs/Kconfig                                      | 16 ++++++++--------
 include/linux/hugetlb.h                         |  2 +-
 include/linux/mm.h                              |  2 +-
 include/linux/page-flags.h                      |  6 +++---
 mm/Makefile                                     |  2 +-
 mm/hugetlb_vmemmap.c                            |  4 ++--
 mm/hugetlb_vmemmap.h                            |  4 ++--
 mm/sparse-vmemmap.c                             |  4 ++--
 16 files changed, 33 insertions(+), 30 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index e0957b73f63d..86a7ea4a9964 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1593,7 +1593,7 @@
 			Format: size[KMG]
 
 	hugetlb_free_vmemmap=
-			[KNL] Reguires CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
+			[KNL] Reguires CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
 			enabled.
 			Allows heavy hugetlb users to free up some more
 			memory (7 * PAGE_SIZE for each 2MB hugetlb page).
@@ -1602,6 +1602,9 @@
 			on:  enable the feature
 			off: disable the feature
 
+			Built with CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON=y,
+			the default is on.
+
 	hugevmalloc	[KNL,PPC,ARM64,X86] Requires CONFIG_HAVE_ARCH_HUGE_VMALLOC
 			Format: { on | off }
 			Default set by CONFIG_HUGE_VMALLOC_DEFAULT_ENABLED.
diff --git a/Documentation/admin-guide/mm/hugetlbpage.rst b/Documentation/admin-guide/mm/hugetlbpage.rst
index d70828c07658..0f8acc4a6cf0 100644
--- a/Documentation/admin-guide/mm/hugetlbpage.rst
+++ b/Documentation/admin-guide/mm/hugetlbpage.rst
@@ -164,7 +164,7 @@ default_hugepagesz
 	will all result in 256 2M huge pages being allocated.  Valid default
 	huge page size is architecture dependent.
 hugetlb_free_vmemmap
-	When CONFIG_HUGETLB_PAGE_FREE_VMEMMAP is set, this enables freeing
+	When CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP is set, this enables optimizing
 	unused vmemmap pages associated with each HugeTLB page.
 
 When multiple huge page sizes are supported, ``/proc/sys/vm/nr_hugepages``
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index d9da5c4f91e0..c4f6c80ea976 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -81,7 +81,7 @@ config ARM64
 	select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
 	select ARCH_WANT_FRAME_POINTERS
 	select ARCH_WANT_HUGE_PMD_SHARE if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36)
-	select ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP
+	select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
 	select ARCH_WANT_LD_ORPHAN_WARN
 	select ARCH_WANT_RESERVE_CRASH_KERNEL if KEXEC_CORE
 	select ARCH_HAS_UBSAN_SANITIZE_ALL
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig
index d246fd508ef6..593ac67497e3 100644
--- a/arch/arm64/configs/openeuler_defconfig
+++ b/arch/arm64/configs/openeuler_defconfig
@@ -6281,8 +6281,8 @@ CONFIG_TMPFS_XATTR=y
 # CONFIG_TMPFS_INODE64 is not set
 CONFIG_HUGETLBFS=y
 CONFIG_HUGETLB_PAGE=y
-CONFIG_HUGETLB_PAGE_FREE_VMEMMAP=y
-# CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON is not set
+CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP=y
+# CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON is not set
 CONFIG_MEMFD_CREATE=y
 CONFIG_ARCH_HAS_GIGANTIC_PAGE=y
 CONFIG_CONFIGFS_FS=y
diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c
index 892e53e9c788..c7678e7df53a 100644
--- a/arch/arm64/mm/flush.c
+++ b/arch/arm64/mm/flush.c
@@ -72,7 +72,7 @@ void flush_dcache_page(struct page *page)
 	/*
 	 * Only the head page's flags of HugeTLB can be cleared since the tail
 	 * vmemmap pages associated with each HugeTLB page are mapped with
-	 * read-only when CONFIG_HUGETLB_PAGE_FREE_VMEMMAP is enabled (more
+	 * read-only when CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP is enabled (more
 	 * details can refer to vmemmap_remap_pte()).  Although
 	 * __sync_icache_dcache() only set PG_dcache_clean flag on the head
 	 * page struct, there is more than one page struct with PG_dcache_clean
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 661e05e1f762..f39cfb5a6535 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -103,7 +103,7 @@ config X86
 	select ARCH_WANT_DEFAULT_BPF_JIT	if X86_64
 	select ARCH_WANTS_DYNAMIC_TASK_STRUCT
 	select ARCH_WANT_HUGE_PMD_SHARE
-	select ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP	if X86_64
+	select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP	if X86_64
 	select ARCH_WANT_LD_ORPHAN_WARN
 	select ARCH_WANTS_THP_SWAP		if X86_64
 	select BUILDTIME_TABLE_SORT
diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig
index 3eac70518e6f..f013c7b95881 100644
--- a/arch/x86/configs/openeuler_defconfig
+++ b/arch/x86/configs/openeuler_defconfig
@@ -7370,8 +7370,8 @@ CONFIG_TMPFS_XATTR=y
 # CONFIG_TMPFS_INODE64 is not set
 CONFIG_HUGETLBFS=y
 CONFIG_HUGETLB_PAGE=y
-CONFIG_HUGETLB_PAGE_FREE_VMEMMAP=y
-# CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON is not set
+CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP=y
+# CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON is not set
 CONFIG_DYNAMIC_HUGETLB=y
 CONFIG_MEMFD_CREATE=y
 CONFIG_ARCH_HAS_GIGANTIC_PAGE=y
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 1eed5eaee41f..b1d5c05aeca8 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1225,7 +1225,7 @@ static struct kcore_list kcore_vsyscall;
 
 static void __init register_page_bootmem_info(void)
 {
-#if defined(CONFIG_NUMA) || defined(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP)
+#if defined(CONFIG_NUMA) || defined(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP)
 	int i;
 
 	for_each_online_node(i)
diff --git a/fs/Kconfig b/fs/Kconfig
index b60a7614cb16..aa097ca64ef6 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -242,22 +242,22 @@ config HUGETLB_PAGE
 # to enable the feature of minimizing overhead of struct page associated with
 # each HugeTLB page.
 #
-config ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP
+config ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
 	bool
 
-config HUGETLB_PAGE_FREE_VMEMMAP
+config HUGETLB_PAGE_OPTIMIZE_VMEMMAP
 	def_bool HUGETLB_PAGE
-	depends on ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP
+	depends on ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
 	depends on SPARSEMEM_VMEMMAP
 
-config HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON
-	bool "Default freeing vmemmap pages of HugeTLB to on"
+config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON
+	bool "Default optimizing vmemmap pages of HugeTLB to on"
 	default n
-	depends on HUGETLB_PAGE_FREE_VMEMMAP
+	depends on HUGETLB_PAGE_OPTIMIZE_VMEMMAP
 	help
-	  When using HUGETLB_PAGE_FREE_VMEMMAP, the freeing unused vmemmap
+	  When using HUGETLB_PAGE_OPTIMIZE_VMEMMAP, the optimizing unused vmemmap
 	  pages associated with each HugeTLB page is default off. Say Y here
-	  to enable freeing vmemmap pages of HugeTLB by default. It can then
+	  to enable optimizing vmemmap pages of HugeTLB by default. It can then
 	  be disabled on the command line via hugetlb_free_vmemmap=off.
 
 config DYNAMIC_HUGETLB
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 218fc150d7f8..0dfe08439095 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -592,7 +592,7 @@ struct hstate {
 	unsigned int free_huge_pages_node[MAX_NUMNODES];
 	unsigned int surplus_huge_pages_node[MAX_NUMNODES];
 	unsigned int resv_huge_pages_node[MAX_NUMNODES];
-#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
+#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
 	unsigned int optimize_vmemmap_pages;
 #endif
 #ifdef CONFIG_CGROUP_HUGETLB
diff --git a/include/linux/mm.h b/include/linux/mm.h
index efa3972a23bf..1ae73cc4b806 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3104,7 +3104,7 @@ static inline void print_vma_addr(char *prefix, unsigned long rip)
 }
 #endif
 
-#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
+#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
 int vmemmap_remap_free(unsigned long start, unsigned long end,
 		       unsigned long reuse);
 int vmemmap_remap_alloc(unsigned long start, unsigned long end,
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index d1cf17a71a12..26b36cac9307 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -194,13 +194,13 @@ enum pageflags {
 
 #ifndef __GENERATING_BOUNDS_H
 
-#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
-DECLARE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON,
+#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
+DECLARE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON,
 			 hugetlb_optimize_vmemmap_key);
 
 static __always_inline bool hugetlb_optimize_vmemmap_enabled(void)
 {
-	return static_branch_maybe(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON,
+	return static_branch_maybe(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON,
 				   &hugetlb_optimize_vmemmap_key);
 }
 
diff --git a/mm/Makefile b/mm/Makefile
index d2a6a786f915..e83233177c7a 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -71,7 +71,7 @@ obj-$(CONFIG_FRONTSWAP)	+= frontswap.o
 obj-$(CONFIG_ZSWAP)	+= zswap.o
 obj-$(CONFIG_HAS_DMA)	+= dmapool.o
 obj-$(CONFIG_HUGETLBFS)	+= hugetlb.o
-obj-$(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP)	+= hugetlb_vmemmap.o
+obj-$(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP)	+= hugetlb_vmemmap.o
 obj-$(CONFIG_DYNAMIC_HUGETLB) += dynamic_hugetlb.o
 obj-$(CONFIG_NUMA) 	+= mempolicy.o
 obj-$(CONFIG_SPARSEMEM)	+= sparse.o
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index f25294973398..2655434a946b 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -188,7 +188,7 @@
 #define RESERVE_VMEMMAP_NR		1U
 #define RESERVE_VMEMMAP_SIZE		(RESERVE_VMEMMAP_NR << PAGE_SHIFT)
 
-DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON,
+DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON,
 			hugetlb_optimize_vmemmap_key);
 EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);
 
@@ -276,7 +276,7 @@ void __init hugetlb_vmemmap_init(struct hstate *h)
 
 	/*
 	 * There are only (RESERVE_VMEMMAP_SIZE / sizeof(struct page)) struct
-	 * page structs that can be used when CONFIG_HUGETLB_PAGE_FREE_VMEMMAP,
+	 * page structs that can be used when CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP,
 	 * so add a BUILD_BUG_ON to catch invalid usage of the tail struct page.
 	 */
 	BUILD_BUG_ON(__NR_USED_SUBPAGE >=
diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
index 9760537849b5..109b0a53b6fe 100644
--- a/mm/hugetlb_vmemmap.h
+++ b/mm/hugetlb_vmemmap.h
@@ -10,7 +10,7 @@
 #define _LINUX_HUGETLB_VMEMMAP_H
 #include <linux/hugetlb.h>
 
-#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
+#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
 int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head);
 void hugetlb_vmemmap_free(struct hstate *h, struct page *head);
 void hugetlb_vmemmap_init(struct hstate *h);
@@ -41,5 +41,5 @@ static inline unsigned int hugetlb_optimize_vmemmap_pages(struct hstate *h)
 {
 	return 0;
 }
-#endif /* CONFIG_HUGETLB_PAGE_FREE_VMEMMAP */
+#endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */
 #endif /* _LINUX_HUGETLB_VMEMMAP_H */
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index e5ed2680ec57..5b40a7473dc8 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -34,7 +34,7 @@
 #include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 
-#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
+#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
 /**
  * struct vmemmap_remap_walk - walk vmemmap page table
  *
@@ -420,7 +420,7 @@ int vmemmap_remap_alloc(unsigned long start, unsigned long end,
 
 	return 0;
 }
-#endif /* CONFIG_HUGETLB_PAGE_FREE_VMEMMAP */
+#endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */
 
 /*
  * Allocate a block of memory to be used to back the virtual memory map
-- 
Gitee


From 301b00fff3e70b91a9bc47af2cabce76f48cdb75 Mon Sep 17 00:00:00 2001
From: Xiaoming Ni <nixiaoming@huawei.com>
Date: Thu, 28 Jul 2022 18:06:57 +0800
Subject: [PATCH 669/674] sysctl: add a new register_sysctl_init() interface

mainline inclusion
from mainline-v5.17-rc1
commit 3ddd9a808cee7284931312f2f3e854c9617f44b2
category: feature
bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3ddd9a808cee7284931312f2f3e854c9617f44b2

--------------------------------

Patch series "sysctl: first set of kernel/sysctl cleanups", v2.

Finally had time to respin the series of the work we had started last
year on cleaning up the kernel/sysct.c kitchen sink.  People keeps
stuffing their sysctls in that file and this creates a maintenance
burden.  So this effort is aimed at placing sysctls where they actually
belong.

I'm going to split patches up into series as there is quite a bit of
work.

This first set adds register_sysctl_init() for uses of registerting a
sysctl on the init path, adds const where missing to a few places,
generalizes common values so to be more easy to share, and starts the
move of a few kernel/sysctl.c out where they belong.

The majority of rework on v2 in this first patch set is 0-day fixes.
Eric Biederman's feedback is later addressed in subsequent patch sets.

I'll only post the first two patch sets for now.  We can address the
rest once the first two patch sets get completely reviewed / Acked.

This patch (of 9):

The kernel/sysctl.c is a kitchen sink where everyone leaves their dirty
dishes, this makes it very difficult to maintain.

To help with this maintenance let's start by moving sysctls to places
where they actually belong.  The proc sysctl maintainers do not want to
know what sysctl knobs you wish to add for your own piece of code, we
just care about the core logic.

Today though folks heavily rely on tables on kernel/sysctl.c so they can
easily just extend this table with their needed sysctls.  In order to
help users move their sysctls out we need to provide a helper which can
be used during code initialization.

We special-case the initialization use of register_sysctl() since it
*is* safe to fail, given all that sysctls do is provide a dynamic
interface to query or modify at runtime an existing variable.  So the
use case of register_sysctl() on init should *not* stop if the sysctls
don't end up getting registered.  It would be counter productive to stop
boot if a simple sysctl registration failed.

Provide a helper for init then, and document the recommended init levels
to use for callers of this routine.  We will later use this in
subsequent patches to start slimming down kernel/sysctl.c tables and
moving sysctl registration to the code which actually needs these
sysctls.

[mcgrof@kernel.org: major commit log and documentation rephrasing also moved to fs/proc/proc_sysctl.c                  ]

Link: https://lkml.kernel.org/r/20211123202347.818157-1-mcgrof@kernel.org
Link: https://lkml.kernel.org/r/20211123202347.818157-2-mcgrof@kernel.org
Signed-off-by: Xiaoming Ni <nixiaoming@huawei.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Cc: Iurii Zaikin <yzaikin@google.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Paul Turner <pjt@google.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Sebastian Reichel <sre@kernel.org>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Qing Wang <wangqing@vivo.com>
Cc: Benjamin LaHaise <bcrl@kvack.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Jan Kara <jack@suse.cz>
Cc: Amir Goldstein <amir73il@gmail.com>
Cc: Stephen Kitt <steve@sk2.org>
Cc: Antti Palosaari <crope@iki.fi>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Clemens Ladisch <clemens@ladisch.de>
Cc: David Airlie <airlied@linux.ie>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Julia Lawall <julia.lawall@inria.fr>
Cc: Lukas Middendorf <kernel@tuxforce.de>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Phillip Potter <phil@philpotter.co.uk>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Douglas Gilbert <dgilbert@interlog.com>
Cc: James E.J. Bottomley <jejb@linux.ibm.com>
Cc: Jani Nikula <jani.nikula@intel.com>
Cc: John Ogness <john.ogness@linutronix.de>
Cc: Martin K. Petersen <martin.petersen@oracle.com>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: Steven Rostedt (VMware) <rostedt@goodmis.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 fs/proc/proc_sysctl.c  | 33 +++++++++++++++++++++++++++++++++
 include/linux/sysctl.h |  3 +++
 2 files changed, 36 insertions(+)

diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index ffed75f833b7..df435cd91a5b 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -16,6 +16,7 @@
 #include <linux/module.h>
 #include <linux/bpf-cgroup.h>
 #include <linux/mount.h>
+#include <linux/kmemleak.h>
 #include "internal.h"
 
 static const struct dentry_operations proc_sys_dentry_operations;
@@ -1380,6 +1381,38 @@ struct ctl_table_header *register_sysctl(const char *path, struct ctl_table *tab
 }
 EXPORT_SYMBOL(register_sysctl);
 
+/**
+ * __register_sysctl_init() - register sysctl table to path
+ * @path: path name for sysctl base
+ * @table: This is the sysctl table that needs to be registered to the path
+ * @table_name: The name of sysctl table, only used for log printing when
+ *              registration fails
+ *
+ * The sysctl interface is used by userspace to query or modify at runtime
+ * a predefined value set on a variable. These variables however have default
+ * values pre-set. Code which depends on these variables will always work even
+ * if register_sysctl() fails. If register_sysctl() fails you'd just loose the
+ * ability to query or modify the sysctls dynamically at run time. Chances of
+ * register_sysctl() failing on init are extremely low, and so for both reasons
+ * this function does not return any error as it is used by initialization code.
+ *
+ * Context: Can only be called after your respective sysctl base path has been
+ * registered. So for instance, most base directories are registered early on
+ * init before init levels are processed through proc_sys_init() and
+ * sysctl_init().
+ */
+void __init __register_sysctl_init(const char *path, struct ctl_table *table,
+				 const char *table_name)
+{
+	struct ctl_table_header *hdr = register_sysctl(path, table);
+
+	if (unlikely(!hdr)) {
+		pr_err("failed when register_sysctl %s to %s\n", table_name, path);
+		return;
+	}
+	kmemleak_not_leak(hdr);
+}
+
 static char *append_path(const char *path, char *pos, const char *name)
 {
 	int namelen;
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 51298a4f4623..161eba9fd912 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -195,6 +195,9 @@ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
 void unregister_sysctl_table(struct ctl_table_header * table);
 
 extern int sysctl_init(void);
+extern void __register_sysctl_init(const char *path, struct ctl_table *table,
+				 const char *table_name);
+#define register_sysctl_init(path, table) __register_sysctl_init(path, table, #table)
 void do_sysctl_args(void);
 
 extern int pwrsw_enabled;
-- 
Gitee


From 9e38aa2f71a1de04192e4c55f788f9c36f27d0a0 Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Thu, 28 Jul 2022 18:06:58 +0800
Subject: [PATCH 670/674] mm: hugetlb_vmemmap: disable hugetlb_optimize_vmemmap
 when struct page crosses page boundaries

mainline inclusion
from mainline-v5.19-rc1
commit 0effdf461c5789be02d40c1868c70cc02ea24627
category: feature
bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0effdf461c5789be02d40c1868c70cc02ea24627

--------------------------------

Patch series "add hugetlb_optimize_vmemmap sysctl", v11.

This series aims to add hugetlb_optimize_vmemmap sysctl to enable or
disable the feature of optimizing vmemmap pages associated with HugeTLB
pages.

This patch (of 4):

If the size of "struct page" is not the power of two but with the feature
of minimizing overhead of struct page associated with each HugeTLB is
enabled, then the vmemmap pages of HugeTLB will be corrupted after
remapping (panic is about to happen in theory).  But this only exists when
!CONFIG_MEMCG && !CONFIG_SLUB on x86_64.  However, it is not a
conventional configuration nowadays.  So it is not a real word issue, just
the result of a code review.

But we cannot prevent anyone from configuring that combined configure.
This hugetlb_optimize_vmemmap should be disable in this case to fix this
issue.

Link: https://lkml.kernel.org/r/20220512041142.39501-1-songmuchun@bytedance.com
Link: https://lkml.kernel.org/r/20220512041142.39501-2-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Iurii Zaikin <yzaikin@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kees Cook <keescook@chromium.org>
Cc: Luis Chamberlain <mcgrof@kernel.org>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 mm/hugetlb_vmemmap.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 2655434a946b..4b8e59eeb971 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -194,12 +194,6 @@ EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);
 
 static int __init hugetlb_vmemmap_early_param(char *buf)
 {
-	/* We cannot optimize if a "struct page" crosses page boundaries. */
-	if (!is_power_of_2(sizeof(struct page))) {
-		pr_warn("cannot free vmemmap pages because \"struct page\" crosses page boundaries\n");
-		return 0;
-	}
-
 	if (!buf)
 		return -EINVAL;
 
@@ -285,6 +279,12 @@ void __init hugetlb_vmemmap_init(struct hstate *h)
 	if (!hugetlb_optimize_vmemmap_enabled())
 		return;
 
+	if (!is_power_of_2(sizeof(struct page))) {
+		pr_warn_once("cannot optimize vmemmap pages because \"struct page\" crosses page boundaries\n");
+		static_branch_disable(&hugetlb_optimize_vmemmap_key);
+		return;
+	}
+
 	vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT;
 	/*
 	 * The head page is not to be freed to buddy allocator, the other tail
-- 
Gitee


From 6233471f1e8239cde7f7e49a4d9545b17cab51ac Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Thu, 28 Jul 2022 18:06:59 +0800
Subject: [PATCH 671/674] mm: hugetlb_vmemmap: use kstrtobool for
 hugetlb_vmemmap param parsing

mainline inclusion
from mainline-v5.19-rc1
commit 9c54c522bb76cbef480722bd44059e2ba8304bd2
category: feature
bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=9c54c522bb76cbef480722bd44059e2ba8304bd2

--------------------------------

Use kstrtobool rather than open coding "on" and "off" parsing in
mm/hugetlb_vmemmap.c, which is more powerful to handle all kinds of
parameters like 'Yy1Nn0' or [oO][NnFf] for "on" and "off".

Link: https://lkml.kernel.org/r/20220512041142.39501-4-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Iurii Zaikin <yzaikin@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kees Cook <keescook@chromium.org>
Cc: Luis Chamberlain <mcgrof@kernel.org>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 Documentation/admin-guide/kernel-parameters.txt |  6 +++---
 mm/hugetlb_vmemmap.c                            | 10 +++++-----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 86a7ea4a9964..247acf7fc837 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1597,10 +1597,10 @@
 			enabled.
 			Allows heavy hugetlb users to free up some more
 			memory (7 * PAGE_SIZE for each 2MB hugetlb page).
-			Format: { on | off (default) }
+			Format: { [oO][Nn]/Y/y/1 | [oO][Ff]/N/n/0 (default) }
 
-			on:  enable the feature
-			off: disable the feature
+			[oO][Nn]/Y/y/1: enable the feature
+			[oO][Ff]/N/n/0: disable the feature
 
 			Built with CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON=y,
 			the default is on.
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 4b8e59eeb971..112e74504905 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -194,15 +194,15 @@ EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);
 
 static int __init hugetlb_vmemmap_early_param(char *buf)
 {
-	if (!buf)
+	bool enable;
+
+	if (kstrtobool(buf, &enable))
 		return -EINVAL;
 
-	if (!strcmp(buf, "on"))
+	if (enable)
 		static_branch_enable(&hugetlb_optimize_vmemmap_key);
-	else if (!strcmp(buf, "off"))
-		static_branch_disable(&hugetlb_optimize_vmemmap_key);
 	else
-		return -EINVAL;
+		static_branch_disable(&hugetlb_optimize_vmemmap_key);
 
 	return 0;
 }
-- 
Gitee


From 6b2239c22b56bddcdefd09fd58b6672f2b1f1566 Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Thu, 28 Jul 2022 18:07:00 +0800
Subject: [PATCH 672/674] mm: hugetlb_vmemmap: add hugetlb_optimize_vmemmap
 sysctl

mainline inclusion
from mainline-v5.19-rc1
commit 78f39084b41d287aedb2ea55f2c1895cfa11d61a
category: feature
bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=78f39084b41d287aedb2ea55f2c1895cfa11d61a

--------------------------------

We must add hugetlb_free_vmemmap=on (or "off") to the boot cmdline and
reboot the server to enable or disable the feature of optimizing vmemmap
pages associated with HugeTLB pages.  However, rebooting usually takes a
long time.  So add a sysctl to enable or disable the feature at runtime
without rebooting.  Why we need this?  There are 3 use cases.

1) The feature of minimizing overhead of struct page associated with
   each HugeTLB is disabled by default without passing
   "hugetlb_free_vmemmap=on" to the boot cmdline.  When we (ByteDance)
   deliver the servers to the users who want to enable this feature, they
   have to configure the grub (change boot cmdline) and reboot the
   servers, whereas rebooting usually takes a long time (we have thousands
   of servers).  It's a very bad experience for the users.  So we need a
   approach to enable this feature after rebooting.  This is a use case in
   our practical environment.

2) Some use cases are that HugeTLB pages are allocated 'on the fly'
   instead of being pulled from the HugeTLB pool, those workloads would be
   affected with this feature enabled.  Those workloads could be
   identified by the characteristics of they never explicitly allocating
   huge pages with 'nr_hugepages' but only set 'nr_overcommit_hugepages'
   and then let the pages be allocated from the buddy allocator at fault
   time.  We can confirm it is a real use case from the commit
   099730d67417.  For those workloads, the page fault time could be ~2x
   slower than before.  We suspect those users want to disable this
   feature if the system has enabled this before and they don't think the
   memory savings benefit is enough to make up for the performance drop.

3) If the workload which wants vmemmap pages to be optimized and the
   workload which wants to set 'nr_overcommit_hugepages' and does not want
   the extera overhead at fault time when the overcommitted pages be
   allocated from the buddy allocator are deployed in the same server.
   The user could enable this feature and set 'nr_hugepages' and
   'nr_overcommit_hugepages', then disable the feature.  In this case, the
   overcommited HugeTLB pages will not encounter the extra overhead at
   fault time.

Link: https://lkml.kernel.org/r/20220512041142.39501-5-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Luis Chamberlain <mcgrof@kernel.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Iurii Zaikin <yzaikin@google.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: David Hildenbrand <david@redhat.com>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Conflicts:
	include/linux/memory_hotplug.h
	mm/hugetlb_vmemmap.c
	mm/memory_hotplug.c
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 Documentation/admin-guide/sysctl/vm.rst | 39 +++++++++++
 mm/hugetlb_vmemmap.c                    | 92 ++++++++++++++++++++++---
 2 files changed, 122 insertions(+), 9 deletions(-)

diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
index a5fbef4740c2..5de629b932ae 100644
--- a/Documentation/admin-guide/sysctl/vm.rst
+++ b/Documentation/admin-guide/sysctl/vm.rst
@@ -560,6 +560,45 @@ Change the minimum size of the hugepage pool.
 See Documentation/admin-guide/mm/hugetlbpage.rst
 
 
+hugetlb_optimize_vmemmap
+========================
+
+This knob is not available when memory_hotplug.memmap_on_memory (kernel parameter)
+is configured or the size of 'struct page' (a structure defined in
+include/linux/mm_types.h) is not power of two (an unusual system config could
+result in this).
+
+Enable (set to 1) or disable (set to 0) the feature of optimizing vmemmap pages
+associated with each HugeTLB page.
+
+Once enabled, the vmemmap pages of subsequent allocation of HugeTLB pages from
+buddy allocator will be optimized (7 pages per 2MB HugeTLB page and 4095 pages
+per 1GB HugeTLB page), whereas already allocated HugeTLB pages will not be
+optimized.  When those optimized HugeTLB pages are freed from the HugeTLB pool
+to the buddy allocator, the vmemmap pages representing that range needs to be
+remapped again and the vmemmap pages discarded earlier need to be rellocated
+again.  If your use case is that HugeTLB pages are allocated 'on the fly' (e.g.
+never explicitly allocating HugeTLB pages with 'nr_hugepages' but only set
+'nr_overcommit_hugepages', those overcommitted HugeTLB pages are allocated 'on
+the fly') instead of being pulled from the HugeTLB pool, you should weigh the
+benefits of memory savings against the more overhead (~2x slower than before)
+of allocation or freeing HugeTLB pages between the HugeTLB pool and the buddy
+allocator.  Another behavior to note is that if the system is under heavy memory
+pressure, it could prevent the user from freeing HugeTLB pages from the HugeTLB
+pool to the buddy allocator since the allocation of vmemmap pages could be
+failed, you have to retry later if your system encounter this situation.
+
+Once disabled, the vmemmap pages of subsequent allocation of HugeTLB pages from
+buddy allocator will not be optimized meaning the extra overhead at allocation
+time from buddy allocator disappears, whereas already optimized HugeTLB pages
+will not be affected.  If you want to make sure there are no optimized HugeTLB
+pages, you can set "nr_hugepages" to 0 first and then disable this.  Note that
+writing 0 to nr_hugepages will make any "in use" HugeTLB pages become surplus
+pages.  So, those surplus pages are still optimized until they are no longer
+in use.  You would need to wait for those surplus pages to be released before
+there are no optimized pages in the system.
+
+
 nr_hugepages_mempolicy
 ======================
 
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 112e74504905..4340bf6c5551 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -188,21 +188,40 @@
 #define RESERVE_VMEMMAP_NR		1U
 #define RESERVE_VMEMMAP_SIZE		(RESERVE_VMEMMAP_NR << PAGE_SHIFT)
 
+enum vmemmap_optimize_mode {
+	VMEMMAP_OPTIMIZE_OFF,
+	VMEMMAP_OPTIMIZE_ON,
+};
+
 DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON,
 			hugetlb_optimize_vmemmap_key);
 EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);
 
+static enum vmemmap_optimize_mode vmemmap_optimize_mode =
+	IS_ENABLED(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON);
+
+static void vmemmap_optimize_mode_switch(enum vmemmap_optimize_mode to)
+{
+	if (vmemmap_optimize_mode == to)
+		return;
+
+	if (to == VMEMMAP_OPTIMIZE_OFF)
+		static_branch_dec(&hugetlb_optimize_vmemmap_key);
+	else
+		static_branch_inc(&hugetlb_optimize_vmemmap_key);
+	WRITE_ONCE(vmemmap_optimize_mode, to);
+}
+
 static int __init hugetlb_vmemmap_early_param(char *buf)
 {
 	bool enable;
+	enum vmemmap_optimize_mode mode;
 
 	if (kstrtobool(buf, &enable))
 		return -EINVAL;
 
-	if (enable)
-		static_branch_enable(&hugetlb_optimize_vmemmap_key);
-	else
-		static_branch_disable(&hugetlb_optimize_vmemmap_key);
+	mode = enable ? VMEMMAP_OPTIMIZE_ON : VMEMMAP_OPTIMIZE_OFF;
+	vmemmap_optimize_mode_switch(mode);
 
 	return 0;
 }
@@ -235,8 +254,10 @@ int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head)
 	 */
 	ret = vmemmap_remap_alloc(vmemmap_addr, vmemmap_end, vmemmap_reuse,
 				  GFP_KERNEL | __GFP_NORETRY | __GFP_THISNODE);
-	if (!ret)
+	if (!ret) {
 		ClearHPageVmemmapOptimized(head);
+		static_branch_dec(&hugetlb_optimize_vmemmap_key);
+	}
 
 	return ret;
 }
@@ -250,6 +271,11 @@ void hugetlb_vmemmap_free(struct hstate *h, struct page *head)
 	if (!vmemmap_pages)
 		return;
 
+	if (READ_ONCE(vmemmap_optimize_mode) == VMEMMAP_OPTIMIZE_OFF)
+		return;
+
+	static_branch_inc(&hugetlb_optimize_vmemmap_key);
+
 	vmemmap_addr	+= RESERVE_VMEMMAP_SIZE;
 	vmemmap_end	= vmemmap_addr + (vmemmap_pages << PAGE_SHIFT);
 	vmemmap_reuse	= vmemmap_addr - PAGE_SIZE;
@@ -259,7 +285,9 @@ void hugetlb_vmemmap_free(struct hstate *h, struct page *head)
 	 * to the page which @vmemmap_reuse is mapped to, then free the pages
 	 * which the range [@vmemmap_addr, @vmemmap_end] is mapped to.
 	 */
-	if (!vmemmap_remap_free(vmemmap_addr, vmemmap_end, vmemmap_reuse))
+	if (vmemmap_remap_free(vmemmap_addr, vmemmap_end, vmemmap_reuse))
+		static_branch_dec(&hugetlb_optimize_vmemmap_key);
+	else
 		SetHPageVmemmapOptimized(head);
 }
 
@@ -276,9 +304,6 @@ void __init hugetlb_vmemmap_init(struct hstate *h)
 	BUILD_BUG_ON(__NR_USED_SUBPAGE >=
 		     RESERVE_VMEMMAP_SIZE / sizeof(struct page));
 
-	if (!hugetlb_optimize_vmemmap_enabled())
-		return;
-
 	if (!is_power_of_2(sizeof(struct page))) {
 		pr_warn_once("cannot optimize vmemmap pages because \"struct page\" crosses page boundaries\n");
 		static_branch_disable(&hugetlb_optimize_vmemmap_key);
@@ -300,3 +325,52 @@ void __init hugetlb_vmemmap_init(struct hstate *h)
 	pr_info("can optimize %d vmemmap pages for %s\n",
 		h->optimize_vmemmap_pages, h->name);
 }
+
+#ifdef CONFIG_PROC_SYSCTL
+static int hugetlb_optimize_vmemmap_handler(struct ctl_table *table, int write,
+					    void *buffer, size_t *length,
+					    loff_t *ppos)
+{
+	int ret;
+	enum vmemmap_optimize_mode mode;
+	static DEFINE_MUTEX(sysctl_mutex);
+
+	if (write && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	mutex_lock(&sysctl_mutex);
+	mode = vmemmap_optimize_mode;
+	table->data = &mode;
+	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
+	if (write && !ret)
+		vmemmap_optimize_mode_switch(mode);
+	mutex_unlock(&sysctl_mutex);
+
+	return ret;
+}
+
+static struct ctl_table hugetlb_vmemmap_sysctls[] = {
+	{
+		.procname	= "hugetlb_optimize_vmemmap",
+		.maxlen		= sizeof(enum vmemmap_optimize_mode),
+		.mode		= 0644,
+		.proc_handler	= hugetlb_optimize_vmemmap_handler,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
+	{ }
+};
+
+static __init int hugetlb_vmemmap_sysctls_init(void)
+{
+	/*
+	 * If "memory_hotplug.memmap_on_memory" is enabled or "struct page"
+	 * crosses page boundaries, the vmemmap pages cannot be optimized.
+	 */
+	if (is_power_of_2(sizeof(struct page)))
+		register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
+
+	return 0;
+}
+late_initcall(hugetlb_vmemmap_sysctls_init);
+#endif /* CONFIG_PROC_SYSCTL */
-- 
Gitee


From 1d2bbc77596574b078cd2b8dec8725d8c38617ea Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Thu, 28 Jul 2022 18:07:01 +0800
Subject: [PATCH 673/674] mm: hugetlb_vmemmap: fix
 CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON

mainline inclusion
from mainline-v5.19-rc1
commit 0111def915b280c64c05f73f01b59ca404255aa3
category: feature
bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0111def915b280c64c05f73f01b59ca404255aa3

--------------------------------

The following:

  commit 47010c040dec ("mm: hugetlb_vmemmap: cleanup CONFIG_HUGETLB_PAGE_FREE_VMEMMAP*")

forgot to update CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON used in
vmemmap_optimize_mode to CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON.
The result is we cannot enable hugetlb_optimize_vmemmap at boot time when
we configure CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON.  Fix it.

Link: https://lkml.kernel.org/r/20220527081948.68832-1-songmuchun@bytedance.com
Fixes: 47010c040dec ("mm: hugetlb_vmemmap: cleanup CONFIG_HUGETLB_PAGE_FREE_VMEMMAP*")
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Reported-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 mm/hugetlb_vmemmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 4340bf6c5551..e9f63cb9e3d4 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -198,7 +198,7 @@ DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON,
 EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);
 
 static enum vmemmap_optimize_mode vmemmap_optimize_mode =
-	IS_ENABLED(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON);
+	IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);
 
 static void vmemmap_optimize_mode_switch(enum vmemmap_optimize_mode to)
 {
-- 
Gitee


From f1cfb6175ca81adf5ad5448ea4ad8cd205acf96b Mon Sep 17 00:00:00 2001
From: Liu Shixin <liushixin2@huawei.com>
Date: Thu, 28 Jul 2022 18:07:02 +0800
Subject: [PATCH 674/674] mm: hugetlb_vmemmap: disable hugetlb_vmemmap when
 dynamic hugetlb is enabled

hulk inclusion
category: feature
bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO
CVE: NA

--------------------------------

Disable hugetlb_vmemmap when dynamic hugetlb is enabled.
By the way, fix a similar spelling error.

Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 mm/dynamic_hugetlb.c |  4 ++--
 mm/huge_memory.c     |  2 +-
 mm/hugetlb_vmemmap.c | 11 ++++++++++-
 3 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/mm/dynamic_hugetlb.c b/mm/dynamic_hugetlb.c
index f8ebc8ab7d60..c1b968a7e668 100644
--- a/mm/dynamic_hugetlb.c
+++ b/mm/dynamic_hugetlb.c
@@ -1150,6 +1150,6 @@ static int __init dynamic_hugetlb_setup(char *s)
 {
 	if (!strcmp(s, "on"))
 		enable_dhugetlb = true;
-	return 1;
+	return 0;
 }
-__setup("dynamic_hugetlb=", dynamic_hugetlb_setup);
+early_param("dynamic_hugetlb", dynamic_hugetlb_setup);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index bfe079e294cb..79c855b5adad 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -401,7 +401,7 @@ static int __init hugepage_init(void)
 	 */
 	if (enable_dhugetlb) {
 		transparent_hugepage_flags = 0;
-		pr_info("transparent hugepage is disabled due to confilct with dynamic hugetlb\n");
+		pr_info("transparent hugepage is disabled due to conflict with dynamic hugetlb\n");
 		return -EINVAL;
 	}
 
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index e9f63cb9e3d4..7ec8560d267d 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -176,6 +176,7 @@
  */
 #define pr_fmt(fmt)	"HugeTLB: " fmt
 
+#include <linux/dynamic_hugetlb.h>
 #include "hugetlb_vmemmap.h"
 
 /*
@@ -304,6 +305,12 @@ void __init hugetlb_vmemmap_init(struct hstate *h)
 	BUILD_BUG_ON(__NR_USED_SUBPAGE >=
 		     RESERVE_VMEMMAP_SIZE / sizeof(struct page));
 
+	if (enable_dhugetlb) {
+		pr_warn_once("cannot optimize vmemmap pages due to conflict with dynamic hugetlb\n");
+		static_branch_disable(&hugetlb_optimize_vmemmap_key);
+		return;
+	}
+
 	if (!is_power_of_2(sizeof(struct page))) {
 		pr_warn_once("cannot optimize vmemmap pages because \"struct page\" crosses page boundaries\n");
 		static_branch_disable(&hugetlb_optimize_vmemmap_key);
@@ -366,8 +373,10 @@ static __init int hugetlb_vmemmap_sysctls_init(void)
 	/*
 	 * If "memory_hotplug.memmap_on_memory" is enabled or "struct page"
 	 * crosses page boundaries, the vmemmap pages cannot be optimized.
+	 * If "dynamic hugetlb" is enabled, the vmemmap pages cannot be
+	 * optimized.
 	 */
-	if (is_power_of_2(sizeof(struct page)))
+	if (is_power_of_2(sizeof(struct page)) && !enable_dhugetlb)
 		register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
 
 	return 0;
-- 
Gitee