From 1557661f6f84d7a9efc88c83742ceeb3c9b2e7f6 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 15 Nov 2023 11:54:48 -0800 Subject: [PATCH 01/33] x86/mce: Remove old CMCI storm mitigation code commit 3ed57b41a4125609e9fd03e32228aec61d95fe1f upstream When a "storm" of corrected machine check interrupts (CMCI) is detected this code mitigates by disabling CMCI interrupt signalling from all of the banks owned by the CPU that saw the storm. There are problems with this approach: 1) It is very coarse grained. In all likelihood only one of the banks was generating the interrupts, but CMCI is disabled for all. This means Linux may delay seeing and processing errors logged from other banks. 2) Although CMCI stands for Corrected Machine Check Interrupt, it is also used to signal when an uncorrected error is logged. This is a problem because these errors should be handled in a timely manner. Delete all this code in preparation for a finer grained solution. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Yazen Ghannam Tested-by: Yazen Ghannam Link: https://lore.kernel.org/r/20231115195450.12963-2-tony.luck@intel.com Signed-off-by: Malathi Signed-off-by: PvsNarasimha --- arch/x86/kernel/cpu/mce/core.c | 20 +--- arch/x86/kernel/cpu/mce/intel.c | 145 ----------------------------- arch/x86/kernel/cpu/mce/internal.h | 6 -- 3 files changed, 1 insertion(+), 170 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index ba05002df262..c56993a55b68 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1631,13 +1631,6 @@ static unsigned long check_interval = INITIAL_CHECK_INTERVAL; static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */ static DEFINE_PER_CPU(struct timer_list, mce_timer); -static unsigned long mce_adjust_timer_default(unsigned long interval) -{ - return interval; -} - -static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default; - static void __start_timer(struct timer_list *t, unsigned long interval) { unsigned long when = jiffies + interval; @@ -1667,15 +1660,9 @@ static void mce_timer_fn(struct timer_list *t) iv = __this_cpu_read(mce_next_interval); - if (mce_available(this_cpu_ptr(&cpu_info))) { + if (mce_available(this_cpu_ptr(&cpu_info))) mc_poll_banks(); - if (mce_intel_cmci_poll()) { - iv = mce_adjust_timer(iv); - goto done; - } - } - /* * Alert userspace if needed. If we logged an MCE, reduce the polling * interval, otherwise increase the polling interval. 
@@ -1685,7 +1672,6 @@ static void mce_timer_fn(struct timer_list *t) else iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); -done: __this_cpu_write(mce_next_interval, iv); __start_timer(t, iv); } @@ -2012,7 +1998,6 @@ static void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c) intel_init_cmci(); intel_init_lmce(); - mce_adjust_timer = cmci_intel_adjust_timer; } static void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c) @@ -2025,7 +2010,6 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) switch (c->x86_vendor) { case X86_VENDOR_INTEL: mce_intel_feature_init(c); - mce_adjust_timer = cmci_intel_adjust_timer; break; case X86_VENDOR_AMD: { @@ -2685,8 +2669,6 @@ static void mce_reenable_cpu(void) static int mce_cpu_dead(unsigned int cpu) { - mce_intel_hcpu_update(cpu); - /* intentionally ignoring frozen here */ if (!cpuhp_tasks_frozen) cmci_rediscover(); diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index ef059fe73e81..c56203c65ef5 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -41,15 +41,6 @@ */ static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); -/* - * CMCI storm detection backoff counter - * - * During storm, we reset this counter to INITIAL_CHECK_INTERVAL in case we've - * encountered an error. If not, we decrement it by one. We signal the end of - * the CMCI storm when it reaches 0. - */ -static DEFINE_PER_CPU(int, cmci_backoff_cnt); - /* * cmci_discover_lock protects against parallel discovery attempts * which could race against each other. @@ -64,21 +55,6 @@ static DEFINE_RAW_SPINLOCK(cmci_discover_lock); static DEFINE_SPINLOCK(cmci_poll_lock); #define CMCI_THRESHOLD 1 -#define CMCI_POLL_INTERVAL (30 * HZ) -#define CMCI_STORM_INTERVAL (HZ) -#define CMCI_STORM_THRESHOLD 15 - -static DEFINE_PER_CPU(unsigned long, cmci_time_stamp); -static DEFINE_PER_CPU(unsigned int, cmci_storm_cnt); -static DEFINE_PER_CPU(unsigned int, cmci_storm_state); - -enum { - CMCI_STORM_NONE, - CMCI_STORM_ACTIVE, - CMCI_STORM_SUBSIDED, -}; - -static atomic_t cmci_storm_on_cpus; static int cmci_supported(int *banks) { @@ -135,124 +111,6 @@ static bool lmce_supported(void) return tmp & FEAT_CTL_LMCE_ENABLED; } -bool mce_intel_cmci_poll(void) -{ - if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE) - return false; - - /* - * Reset the counter if we've logged an error in the last poll - * during the storm. 
- */ - if (machine_check_poll(0, this_cpu_ptr(&mce_banks_owned))) - this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL); - else - this_cpu_dec(cmci_backoff_cnt); - - return true; -} - -void mce_intel_hcpu_update(unsigned long cpu) -{ - if (per_cpu(cmci_storm_state, cpu) == CMCI_STORM_ACTIVE) - atomic_dec(&cmci_storm_on_cpus); - - per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE; -} - -static void cmci_toggle_interrupt_mode(bool on) -{ - unsigned long flags, *owned; - int bank; - u64 val; - - raw_spin_lock_irqsave(&cmci_discover_lock, flags); - owned = this_cpu_ptr(mce_banks_owned); - for_each_set_bit(bank, owned, MAX_NR_BANKS) { - rdmsrl(MSR_IA32_MCx_CTL2(bank), val); - - if (on) - val |= MCI_CTL2_CMCI_EN; - else - val &= ~MCI_CTL2_CMCI_EN; - - wrmsrl(MSR_IA32_MCx_CTL2(bank), val); - } - raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); -} - -unsigned long cmci_intel_adjust_timer(unsigned long interval) -{ - if ((this_cpu_read(cmci_backoff_cnt) > 0) && - (__this_cpu_read(cmci_storm_state) == CMCI_STORM_ACTIVE)) { - mce_notify_irq(); - return CMCI_STORM_INTERVAL; - } - - switch (__this_cpu_read(cmci_storm_state)) { - case CMCI_STORM_ACTIVE: - - /* - * We switch back to interrupt mode once the poll timer has - * silenced itself. That means no events recorded and the timer - * interval is back to our poll interval. - */ - __this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED); - if (!atomic_sub_return(1, &cmci_storm_on_cpus)) - pr_notice("CMCI storm subsided: switching to interrupt mode\n"); - - fallthrough; - - case CMCI_STORM_SUBSIDED: - /* - * We wait for all CPUs to go back to SUBSIDED state. When that - * happens we switch back to interrupt mode. - */ - if (!atomic_read(&cmci_storm_on_cpus)) { - __this_cpu_write(cmci_storm_state, CMCI_STORM_NONE); - cmci_toggle_interrupt_mode(true); - cmci_recheck(); - } - return CMCI_POLL_INTERVAL; - default: - - /* We have shiny weather. Let the poll do whatever it thinks. */ - return interval; - } -} - -static bool cmci_storm_detect(void) -{ - unsigned int cnt = __this_cpu_read(cmci_storm_cnt); - unsigned long ts = __this_cpu_read(cmci_time_stamp); - unsigned long now = jiffies; - int r; - - if (__this_cpu_read(cmci_storm_state) != CMCI_STORM_NONE) - return true; - - if (time_before_eq(now, ts + CMCI_STORM_INTERVAL)) { - cnt++; - } else { - cnt = 1; - __this_cpu_write(cmci_time_stamp, now); - } - __this_cpu_write(cmci_storm_cnt, cnt); - - if (cnt <= CMCI_STORM_THRESHOLD) - return false; - - cmci_toggle_interrupt_mode(false); - __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE); - r = atomic_add_return(1, &cmci_storm_on_cpus); - mce_timer_kick(CMCI_STORM_INTERVAL); - this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL); - - if (r == 1) - pr_notice("CMCI storm detected: switching to poll mode\n"); - return true; -} - /* * The interrupt handler. This is called on every event. * Just call the poller directly to log any events. 
@@ -261,9 +119,6 @@ static bool cmci_storm_detect(void) */ static void intel_threshold_interrupt(void) { - if (cmci_storm_detect()) - return; - machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)); } diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index bcf1b3c66c9c..616732ec16f7 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -41,18 +41,12 @@ struct dentry *mce_get_debugfs_dir(void); extern mce_banks_t mce_banks_ce_disabled; #ifdef CONFIG_X86_MCE_INTEL -unsigned long cmci_intel_adjust_timer(unsigned long interval); -bool mce_intel_cmci_poll(void); -void mce_intel_hcpu_update(unsigned long cpu); void cmci_disable_bank(int bank); void intel_init_cmci(void); void intel_init_lmce(void); void intel_clear_lmce(void); bool intel_filter_mce(struct mce *m); #else -# define cmci_intel_adjust_timer mce_adjust_timer_default -static inline bool mce_intel_cmci_poll(void) { return false; } -static inline void mce_intel_hcpu_update(unsigned long cpu) { } static inline void cmci_disable_bank(int bank) { } static inline void intel_init_cmci(void) { } static inline void intel_init_lmce(void) { } -- Gitee From e291cc9e0ea2b1d55c22289af1e2d8445c87fcd7 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 15 Nov 2023 11:54:49 -0800 Subject: [PATCH 02/33] x86/mce: Add per-bank CMCI storm mitigation commit 7eae17c4add5de46efcca45356388f480103e6d9 upstream This is the core functionality to track CMCI storms at the machine check bank granularity. Subsequent patches will add the vendor specific hooks to supply input to the storm detection and take actions on the start/end of a storm. machine_check_poll() is called both by the CMCI interrupt code, and for periodic polls from a timer. Add a hook in this routine to maintain a bitmap history for each bank showing whether the bank logged a corrected error or not each time it is polled. In normal operation the interval between polls of these banks determines how far to shift the history. The 64 bit width corresponds to about one second. When a storm is observed, a CPU vendor specific action is taken to reduce or stop CMCI from the bank that is the source of the storm. The bank is added to the bitmap of banks for this CPU to poll. The polling rate is increased to once per second. During a storm each bit in the history indicates the status of the bank each time it is polled. Thus the history covers just over a minute. Declare a storm for that bank if the number of corrected interrupts seen in that history is above some threshold (defined as 5 in this series, could be tuned later if there is data to suggest a better value). A storm on a bank ends if enough consecutive polls of the bank show no corrected errors (defined as 30, may also change). That calls the CPU vendor specific function to revert to normal operational mode, and changes the polling rate back to the default. [ bp: Massage.
] Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20231115195450.12963-3-tony.luck@intel.com Signed-off-by: Malathi Signed-off-by: PvsNarasimha --- arch/x86/kernel/cpu/mce/core.c | 33 ++++++-- arch/x86/kernel/cpu/mce/internal.h | 58 +++++++++++++- arch/x86/kernel/cpu/mce/threshold.c | 112 ++++++++++++++++++++++++++++ 3 files changed, 194 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index c56993a55b68..858804b990e9 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -698,6 +698,16 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) barrier(); m.status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS)); + /* + * Update storm tracking here, before checking for the + * MCI_STATUS_VAL bit. Valid corrected errors count + * towards declaring, or maintaining, storm status. No + * error in a bank counts towards avoiding, or ending, + * storm status. + */ + if (!mca_cfg.cmci_disabled) + mce_track_storm(&m); + /* If this entry is not valid, ignore it */ if (!(m.status & MCI_STATUS_VAL)) continue; @@ -1672,22 +1682,29 @@ static void mce_timer_fn(struct timer_list *t) else iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); - __this_cpu_write(mce_next_interval, iv); - __start_timer(t, iv); + if (mce_get_storm_mode()) { + __start_timer(t, HZ); + } else { + __this_cpu_write(mce_next_interval, iv); + __start_timer(t, iv); + } } /* - * Ensure that the timer is firing in @interval from now. + * When a storm starts on any bank on this CPU, switch to polling + * once per second. When the storm ends, revert to the default + * polling interval. */ -void mce_timer_kick(unsigned long interval) +void mce_timer_kick(bool storm) { struct timer_list *t = this_cpu_ptr(&mce_timer); - unsigned long iv = __this_cpu_read(mce_next_interval); - __start_timer(t, interval); + mce_set_storm_mode(storm); - if (interval < iv) - __this_cpu_write(mce_next_interval, interval); + if (storm) + __start_timer(t, HZ); + else + __this_cpu_write(mce_next_interval, check_interval * HZ); } /* Must not be called in IRQ context where del_timer_sync() can deadlock */ diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index 616732ec16f7..c60129eb62c2 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -54,7 +54,63 @@ static inline void intel_clear_lmce(void) { } static inline bool intel_filter_mce(struct mce *m) { return false; } #endif -void mce_timer_kick(unsigned long interval); +void mce_timer_kick(bool storm); + +#ifdef CONFIG_X86_MCE_THRESHOLD +void cmci_storm_begin(unsigned int bank); +void cmci_storm_end(unsigned int bank); +void mce_track_storm(struct mce *mce); +void mce_inherit_storm(unsigned int bank); +bool mce_get_storm_mode(void); +void mce_set_storm_mode(bool storm); +#else +static inline void cmci_storm_begin(unsigned int bank) {} +static inline void cmci_storm_end(unsigned int bank) {} +static inline void mce_track_storm(struct mce *mce) {} +static inline void mce_inherit_storm(unsigned int bank) {} +static inline bool mce_get_storm_mode(void) { return false; } +static inline void mce_set_storm_mode(bool storm) {} +#endif + +/* + * history: Bitmask tracking errors occurrence. Each set bit + * represents an error seen. + * + * timestamp: Last time (in jiffies) that the bank was polled. + * in_storm_mode: Is this bank in storm mode? + * poll_only: Bank does not support CMCI, skip storm tracking. 
+ */ +struct storm_bank { + u64 history; + u64 timestamp; + bool in_storm_mode; + bool poll_only; +}; + +#define NUM_HISTORY_BITS (sizeof(u64) * BITS_PER_BYTE) + +/* How many errors within the history buffer mark the start of a storm. */ +#define STORM_BEGIN_THRESHOLD 5 + +/* + * How many polls of machine check bank without an error before declaring + * the storm is over. Since it is tracked by the bitmasks in the history + * field of struct storm_bank the mask is 30 bits [0 ... 29]. + */ +#define STORM_END_POLL_THRESHOLD 29 + +/* + * banks: per-cpu, per-bank details + * stormy_bank_count: count of MC banks in storm state + * poll_mode: CPU is in poll mode + */ +struct mca_storm_desc { + struct storm_bank banks[MAX_NR_BANKS]; + u8 stormy_bank_count; + bool poll_mode; +}; + +DECLARE_PER_CPU(struct mca_storm_desc, storm_desc); #ifdef CONFIG_ACPI_APEI int apei_write_mce(struct mce *m); diff --git a/arch/x86/kernel/cpu/mce/threshold.c b/arch/x86/kernel/cpu/mce/threshold.c index ef4e7bb5fd88..0e1988468ee4 100644 --- a/arch/x86/kernel/cpu/mce/threshold.c +++ b/arch/x86/kernel/cpu/mce/threshold.c @@ -29,3 +29,115 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_threshold) trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR); apic_eoi(); } + +DEFINE_PER_CPU(struct mca_storm_desc, storm_desc); + +void mce_inherit_storm(unsigned int bank) +{ + struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc); + + /* + * Previous CPU owning this bank had put it into storm mode, + * but the precise history of that storm is unknown. Assume + * the worst (all recent polls of the bank found a valid error + * logged). This will avoid the new owner prematurely declaring + * the storm has ended. + */ + storm->banks[bank].history = ~0ull; + storm->banks[bank].timestamp = jiffies; +} + +bool mce_get_storm_mode(void) +{ + return __this_cpu_read(storm_desc.poll_mode); +} + +void mce_set_storm_mode(bool storm) +{ + __this_cpu_write(storm_desc.poll_mode, storm); +} + +static void mce_handle_storm(unsigned int bank, bool on) +{ + switch (boot_cpu_data.x86_vendor) { + } +} + +void cmci_storm_begin(unsigned int bank) +{ + struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc); + + __set_bit(bank, this_cpu_ptr(mce_poll_banks)); + storm->banks[bank].in_storm_mode = true; + + /* + * If this is the first bank on this CPU to enter storm mode + * start polling. + */ + if (++storm->stormy_bank_count == 1) + mce_timer_kick(true); +} + +void cmci_storm_end(unsigned int bank) +{ + struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc); + + __clear_bit(bank, this_cpu_ptr(mce_poll_banks)); + storm->banks[bank].history = 0; + storm->banks[bank].in_storm_mode = false; + + /* If no banks left in storm mode, stop polling. */ + if (!this_cpu_dec_return(storm_desc.stormy_bank_count)) + mce_timer_kick(false); +} + +void mce_track_storm(struct mce *mce) +{ + struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc); + unsigned long now = jiffies, delta; + unsigned int shift = 1; + u64 history = 0; + + /* No tracking needed for banks that do not support CMCI */ + if (storm->banks[mce->bank].poll_only) + return; + + /* + * When a bank is in storm mode it is polled once per second and + * the history mask will record about the last minute of poll results. + * If it is not in storm mode, then the bank is only checked when + * there is a CMCI interrupt. Check how long it has been since + * this bank was last checked, and adjust the amount of "shift" + * to apply to history. 
+ */ + if (!storm->banks[mce->bank].in_storm_mode) { + delta = now - storm->banks[mce->bank].timestamp; + shift = (delta + HZ) / HZ; + } + + /* If it has been a long time since the last poll, clear history. */ + if (shift < NUM_HISTORY_BITS) + history = storm->banks[mce->bank].history << shift; + + storm->banks[mce->bank].timestamp = now; + + /* History keeps track of corrected errors. VAL=1 && UC=0 */ + if ((mce->status & MCI_STATUS_VAL) && mce_is_correctable(mce)) + history |= 1; + + storm->banks[mce->bank].history = history; + + if (storm->banks[mce->bank].in_storm_mode) { + if (history & GENMASK_ULL(STORM_END_POLL_THRESHOLD, 0)) + return; + printk_deferred(KERN_NOTICE "CPU%d BANK%d CMCI storm subsided\n", smp_processor_id(), mce->bank); + mce_handle_storm(mce->bank, false); + cmci_storm_end(mce->bank); + } else { + if (hweight64(history) < STORM_BEGIN_THRESHOLD) + return; + printk_deferred(KERN_NOTICE "CPU%d BANK%d CMCI storm detected\n", smp_processor_id(), mce->bank); + mce_handle_storm(mce->bank, true); + cmci_storm_begin(mce->bank); + } +} -- Gitee From 9a0b5acb8346342a7df1e436643a1c95ad6b0693 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Tue, 24 Jun 2025 14:15:57 +0000 Subject: [PATCH 03/33] x86/mce: Ensure user polling settings are honored when restarting timer commit 00c092de6f28ebd32208aef83b02d61af2229b60 upstream Users can disable MCA polling by setting the "ignore_ce" parameter or by setting "check_interval=0". This tells the kernel to *not* start the MCE timer on a CPU. If the user did not disable CMCI, then storms can occur. When these happen, the MCE timer will be started with a fixed interval. After the storm subsides, the timer's next interval is set to check_interval. This disregards the user's input through "ignore_ce" and "check_interval". Furthermore, if "check_interval=0", then the new timer will run faster than expected. Create a new helper to check these conditions and use it when a CMCI storm ends. [ bp: Massage. 
] Fixes: 7eae17c4add5 ("x86/mce: Add per-bank CMCI storm mitigation") Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Cc: stable@vger.kernel.org Link: https://lore.kernel.org/20250624-wip-mca-updates-v4-2-236dd74f645f@amd.com Signed-off-by: Malathi Signed-off-by: PvsNarasimha --- arch/x86/kernel/cpu/mce/core.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 858804b990e9..b6169ddd4077 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1661,6 +1661,11 @@ static void mc_poll_banks_default(void) void (*mc_poll_banks)(void) = mc_poll_banks_default; +static bool should_enable_timer(unsigned long iv) +{ + return !mca_cfg.ignore_ce && iv; +} + static void mce_timer_fn(struct timer_list *t) { struct timer_list *cpu_t = this_cpu_ptr(&mce_timer); @@ -1684,7 +1689,7 @@ static void mce_timer_fn(struct timer_list *t) if (mce_get_storm_mode()) { __start_timer(t, HZ); - } else { + } else if (should_enable_timer(iv)) { __this_cpu_write(mce_next_interval, iv); __start_timer(t, iv); } @@ -2069,11 +2074,10 @@ static void mce_start_timer(struct timer_list *t) { unsigned long iv = check_interval * HZ; - if (mca_cfg.ignore_ce || !iv) - return; - - this_cpu_write(mce_next_interval, iv); - __start_timer(t, iv); + if (should_enable_timer(iv)) { + this_cpu_write(mce_next_interval, iv); + __start_timer(t, iv); + } } static void __mcheck_cpu_setup_timer(void) -- Gitee From 10d743c3e23ad3f56344cb52d433cbd776db15e2 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Tue, 24 Jun 2025 14:16:00 +0000 Subject: [PATCH 04/33] x86/mce/amd: Rename threshold restart function commit 9af8b441cf6953f683b825fbf241a979ea7521e8 upstream It operates per block rather than per bank. So rename it for clarity. No functional changes. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250624-wip-mca-updates-v4-5-236dd74f645f@amd.com Signed-off-by: Malathi Signed-off-by: PvsNarasimha --- arch/x86/kernel/cpu/mce/amd.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index b88f4fb7b7d9..d610eef1112b 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -417,8 +417,8 @@ static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) return 1; }; -/* Reprogram MCx_MISC MSR behind this threshold bank. */ -static void threshold_restart_bank(void *_tr) +/* Reprogram MCx_MISC MSR behind this threshold block. */ +static void threshold_restart_block(void *_tr) { struct thresh_restart *tr = _tr; u32 hi, lo; @@ -476,7 +476,7 @@ static void mce_threshold_block_init(struct threshold_block *b, int offset) }; b->threshold_limit = THRESHOLD_MAX; - threshold_restart_bank(&tr); + threshold_restart_block(&tr); }; static int setup_APIC_mce_threshold(int reserved, int new) @@ -857,7 +857,7 @@ static void log_and_reset_block(struct threshold_block *block) /* Reset threshold block after logging error. 
*/ memset(&tr, 0, sizeof(tr)); tr.b = block; - threshold_restart_bank(&tr); + threshold_restart_block(&tr); } /* @@ -931,7 +931,7 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size) memset(&tr, 0, sizeof(tr)); tr.b = b; - if (smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1)) + if (smp_call_function_single(b->cpu, threshold_restart_block, &tr, 1)) return -ENODEV; return size; @@ -956,7 +956,7 @@ store_threshold_limit(struct threshold_block *b, const char *buf, size_t size) b->threshold_limit = new; tr.b = b; - if (smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1)) + if (smp_call_function_single(b->cpu, threshold_restart_block, &tr, 1)) return -ENODEV; return size; -- Gitee From aba60ad5f0fb97481f9fe89ab1551285d7f7e190 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Tue, 24 Jun 2025 14:16:01 +0000 Subject: [PATCH 05/33] x86/mce/amd: Remove return value for mce_threshold_{create,remove}_device() commit 4d2161b9e8ba64076f520ec2f00eefb00722c15e upstream The return values are not checked, so set return type to 'void'. Also, move function declarations to internal.h, since these functions are only used within the MCE subsystem. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Nikolay Borisov Link: https://lore.kernel.org/20250624-wip-mca-updates-v4-6-236dd74f645f@amd.com Signed-off-by: Malathi Signed-off-by: PvsNarasimha --- arch/x86/include/asm/mce.h | 6 ------ arch/x86/kernel/cpu/mce/amd.c | 22 ++++++++++------------ arch/x86/kernel/cpu/mce/internal.h | 4 ++++ 3 files changed, 14 insertions(+), 18 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index a02d2215a79f..81213c1e799f 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -342,15 +342,9 @@ enum smca_bank_types { extern const char *smca_get_long_name(enum smca_bank_types t); extern bool amd_mce_is_memory_error(struct mce *m); -extern int mce_threshold_create_device(unsigned int cpu); -extern int mce_threshold_remove_device(unsigned int cpu); - void mce_amd_feature_init(struct cpuinfo_x86 *c); enum smca_bank_types smca_get_bank_type(unsigned int cpu, unsigned int bank); #else - -static inline int mce_threshold_create_device(unsigned int cpu) { return 0; }; -static inline int mce_threshold_remove_device(unsigned int cpu) { return 0; }; static inline bool amd_mce_is_memory_error(struct mce *m) { return false; }; static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { } #endif diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index d610eef1112b..bc033ac51f22 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -1317,12 +1317,12 @@ static void __threshold_remove_device(struct threshold_bank **bp) kfree(bp); } -int mce_threshold_remove_device(unsigned int cpu) +void mce_threshold_remove_device(unsigned int cpu) { struct threshold_bank **bp = this_cpu_read(threshold_banks); if (!bp) - return 0; + return; /* * Clear the pointer before cleaning up, so that the interrupt won't @@ -1331,7 +1331,7 @@ int mce_threshold_remove_device(unsigned int cpu) this_cpu_write(threshold_banks, NULL); __threshold_remove_device(bp); - return 0; + return; } /** @@ -1345,36 +1345,34 @@ int mce_threshold_remove_device(unsigned int cpu) * thread running on @cpu. The callback is invoked on all CPUs which are * online when the callback is installed or during a real hotplug event. 
*/ -int mce_threshold_create_device(unsigned int cpu) +void mce_threshold_create_device(unsigned int cpu) { unsigned int numbanks, bank; struct threshold_bank **bp; - int err; if (!mce_flags.amd_threshold) - return 0; + return; bp = this_cpu_read(threshold_banks); if (bp) - return 0; + return; numbanks = this_cpu_read(mce_num_banks); bp = kcalloc(numbanks, sizeof(*bp), GFP_KERNEL); if (!bp) - return -ENOMEM; + return; for (bank = 0; bank < numbanks; ++bank) { if (!(this_cpu_read(bank_map) & BIT_ULL(bank))) continue; - err = threshold_create_bank(bp, cpu, bank); - if (err) { + if (threshold_create_bank(bp, cpu, bank)) { __threshold_remove_device(bp); - return err; + return; } } this_cpu_write(threshold_banks, bp); if (thresholding_irq_en) mce_threshold_vector = amd_threshold_interrupt; - return 0; + return; } diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index c60129eb62c2..9529349e90f2 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -259,6 +259,8 @@ enum mca_msr { extern bool filter_mce(struct mce *m); #ifdef CONFIG_X86_MCE_AMD +void mce_threshold_create_device(unsigned int cpu); +void mce_threshold_remove_device(unsigned int cpu); extern bool amd_filter_mce(struct mce *m); /* @@ -286,6 +288,8 @@ static __always_inline void smca_extract_err_addr(struct mce *m) } #else +static inline void mce_threshold_create_device(unsigned int cpu) { } +static inline void mce_threshold_remove_device(unsigned int cpu) { } static inline bool amd_filter_mce(struct mce *m) { return false; } static inline void smca_extract_err_addr(struct mce *m) { } #endif -- Gitee From b860b1e14fbbf005a253f7f7fa3d9122205b0f06 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 25 Aug 2025 17:33:00 +0000 Subject: [PATCH 06/33] x86/mce/amd: Remove smca_banks_map commit b249288abde5190bb113ea5acef8af4ceac4957c upstream The MCx_MISC0[BlkPtr] field was used on legacy systems to hold a register offset for the next MCx_MISC* register. In this way, an implementation-specific number of registers can be discovered at runtime. The MCAX/SMCA register space simplifies this by always including the MCx_MISC[1-4] registers. The MCx_MISC0[BlkPtr] field is used to indicate (true/false) whether any MCx_MISC[1-4] registers are present. Currently, MCx_MISC0[BlkPtr] is checked early and cached to be used during sysfs init later. This is unnecessary as the MCx_MISC0 register is read again later anyway. Remove the smca_banks_map variable as it is effectively redundant, and use a direct register/bit check instead. [ bp: Zap smca_get_block_address() too. ] Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Qiuxu Zhuo Reviewed-by: Tony Luck Tested-by: Tony Luck Link: https://lore.kernel.org/20250825-wip-mca-updates-v5-3-865768a2eef8@amd.com Signed-off-by: Malathi Signed-off-by: PvsNarasimha --- arch/x86/kernel/cpu/mce/amd.c | 50 +++++++---------------------------- 1 file changed, 9 insertions(+), 41 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index bc033ac51f22..c6d5a255e0a5 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -237,9 +237,6 @@ static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); */ static DEFINE_PER_CPU(u64, bank_map); -/* Map of banks that have more than MCA_MISC0 available. 
*/ -static DEFINE_PER_CPU(u64, smca_misc_banks_map); - static void amd_threshold_interrupt(void); static void amd_deferred_error_interrupt(void); @@ -249,28 +246,6 @@ static void default_deferred_error_interrupt(void) } void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt; -static void smca_set_misc_banks_map(unsigned int bank, unsigned int cpu) -{ - u32 low, high; - - /* - * For SMCA enabled processors, BLKPTR field of the first MISC register - * (MCx_MISC0) indicates presence of additional MISC regs set (MISC1-4). - */ - if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high)) - return; - - if (!(low & MCI_CONFIG_MCAX)) - return; - - if (rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high)) - return; - - if (low & MASK_BLKPTR_LO) - per_cpu(smca_misc_banks_map, cpu) |= BIT_ULL(bank); - -} - static void smca_configure(unsigned int bank, unsigned int cpu) { u8 *bank_counts = this_cpu_ptr(smca_bank_counts); @@ -311,8 +286,6 @@ static void smca_configure(unsigned int bank, unsigned int cpu) wrmsr(smca_config, low, high); } - smca_set_misc_banks_map(bank, cpu); - if (rdmsr_safe(MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) { pr_warn("Failed to read MCA_IPID for bank %d\n", bank); return; @@ -523,18 +496,6 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c) wrmsr(MSR_CU_DEF_ERR, low, high); } -static u32 smca_get_block_address(unsigned int bank, unsigned int block, - unsigned int cpu) -{ - if (!block) - return MSR_AMD64_SMCA_MCx_MISC(bank); - - if (!(per_cpu(smca_misc_banks_map, cpu) & BIT_ULL(bank))) - return 0; - - return MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1); -} - static u32 get_block_address(u32 current_addr, u32 low, u32 high, unsigned int bank, unsigned int block, unsigned int cpu) @@ -544,8 +505,15 @@ static u32 get_block_address(u32 current_addr, u32 low, u32 high, if ((bank >= per_cpu(mce_num_banks, cpu)) || (block >= NR_BLOCKS)) return addr; - if (mce_flags.smca) - return smca_get_block_address(bank, block, cpu); + if (mce_flags.smca) { + if (!block) + return MSR_AMD64_SMCA_MCx_MISC(bank); + + if (!(low & MASK_BLKPTR_LO)) + return 0; + + return MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1); + } /* Fall back to method we used for older processors: */ switch (block) { -- Gitee From af2c5e1b9a4fb1665a5f4bc4f10e1a5235e1aa9f Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Fri, 6 Dec 2024 16:11:54 +0000 Subject: [PATCH 07/33] x86/mce/amd: Remove shared threshold bank plumbing commit d35fb3121a36170bba951c529847a630440e4174 upstream Legacy AMD systems include an integrated Northbridge that is represented by MCA bank 4. This is the only non-core MCA bank in legacy systems. The Northbridge is physically shared by all the CPUs within an AMD "Node". However, in practice the "shared" MCA bank can only by managed by a single CPU within that AMD Node. This is known as the "Node Base Core" (NBC). For example, only the NBC will be able to read the MCA bank 4 registers; they will be Read-as-Zero for other CPUs. Also, the MCA Thresholding interrupt will only signal the NBC; the other CPUs will not receive it. This is enforced by hardware, and it should not be managed by software. The current AMD Thresholding code attempts to deal with the "shared" MCA bank by micromanaging the bank's sysfs kobjects. However, this does not follow the intended kobject use cases. It is also fragile, and it has caused bugs in the past. Modern AMD systems do not need this shared MCA bank support, and it should not be needed on legacy systems either. 
Remove the shared threshold bank code. Also, move the threshold struct definitions to mce/amd.c, since they are no longer needed in amd_nb.c. [Backport Changes] 1. In arch/x86/include/asm/amd_nb.h, the upstream patch removes the refcount.h include, but this header is already removed in the current source tree. Therefore, the removal step was skipped since the expected change is already reflected in the existing code. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20241206161210.163701-2-yazen.ghannam@amd.com Signed-off-by: Malathi Signed-off-by: PvsNarasimha --- arch/x86/Kconfig | 2 +- arch/x86/include/asm/amd_nb.h | 30 -------- arch/x86/kernel/cpu/mce/amd.c | 127 +++++++--------------------------- 3 files changed, 27 insertions(+), 132 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 8a54a79180cb..7c9809925f19 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1176,7 +1176,7 @@ config X86_MCE_INTEL config X86_MCE_AMD def_bool y prompt "AMD MCE features" - depends on X86_MCE && X86_LOCAL_APIC && AMD_NB + depends on X86_MCE && X86_LOCAL_APIC help Additional support for AMD specific MCE features such as the DRAM Error Threshold. diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index 259883d42f4b..288101025fb4 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -26,41 +26,11 @@ struct amd_l3_cache { u8 subcaches[4]; }; -struct threshold_block { - unsigned int block; /* Number within bank */ - unsigned int bank; /* MCA bank the block belongs to */ - unsigned int cpu; /* CPU which controls MCA bank */ - u32 address; /* MSR address for the block */ - u16 interrupt_enable; /* Enable/Disable APIC interrupt */ - bool interrupt_capable; /* Bank can generate an interrupt. */ - - u16 threshold_limit; /* - * Value upon which threshold - * interrupt is generated. - */ - - struct kobject kobj; /* sysfs object */ - struct list_head miscj; /* - * List of threshold blocks - * within a bank. - */ -}; - -struct threshold_bank { - struct kobject *kobj; - struct threshold_block *blocks; - - /* initialized to the number of CPUs on the node sharing this bank */ - refcount_t cpus; - unsigned int shared; -}; - struct amd_northbridge { struct pci_dev *root; struct pci_dev *misc; struct pci_dev *link; struct amd_l3_cache l3_cache; - struct threshold_bank *bank4; }; struct amd_northbridge_info { diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index c6d5a255e0a5..ca1b2924df98 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -4,8 +4,6 @@ * * Written by Jacob Shin - AMD, Inc. * Maintained by: Borislav Petkov - * - * All MC4_MISCi registers are shared between cores on a node. */ #include #include @@ -20,7 +18,6 @@ #include #include -#include #include #include #include @@ -229,6 +226,32 @@ static const struct smca_hwid smca_hwid_mcatypes[] = { #define MAX_MCATYPE_NAME_LEN 30 static char buf_mcatype[MAX_MCATYPE_NAME_LEN]; +struct threshold_block { + /* This block's number within its bank. */ + unsigned int block; + /* MCA bank number that contains this block. */ + unsigned int bank; + /* CPU which controls this block's MCA bank. */ + unsigned int cpu; + /* MCA_MISC MSR address for this block. */ + u32 address; + /* Enable/Disable APIC interrupt. */ + bool interrupt_enable; + /* Bank can generate an interrupt. */ + bool interrupt_capable; + /* Value upon which threshold interrupt is generated. 
*/ + u16 threshold_limit; + /* sysfs object */ + struct kobject kobj; + /* List of threshold blocks within this block's MCA bank. */ + struct list_head miscj; +}; + +struct threshold_bank { + struct kobject *kobj; + struct threshold_block *blocks; +}; + static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); /* @@ -313,19 +336,6 @@ struct thresh_restart { u16 old_limit; }; -static inline bool is_shared_bank(int bank) -{ - /* - * Scalable MCA provides for only one core to have access to the MSRs of - * a shared bank. - */ - if (mce_flags.smca) - return false; - - /* Bank 4 is for northbridge reporting and is thus shared */ - return (bank == 4); -} - static const char *bank4_names(const struct threshold_block *b) { switch (b->address) { @@ -1118,35 +1128,10 @@ static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb return err; } -static int __threshold_add_blocks(struct threshold_bank *b) -{ - struct list_head *head = &b->blocks->miscj; - struct threshold_block *pos = NULL; - struct threshold_block *tmp = NULL; - int err = 0; - - err = kobject_add(&b->blocks->kobj, b->kobj, b->blocks->kobj.name); - if (err) - return err; - - list_for_each_entry_safe(pos, tmp, head, miscj) { - - err = kobject_add(&pos->kobj, b->kobj, pos->kobj.name); - if (err) { - list_for_each_entry_safe_reverse(pos, tmp, head, miscj) - kobject_del(&pos->kobj); - - return err; - } - } - return err; -} - static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu, unsigned int bank) { struct device *dev = this_cpu_read(mce_device); - struct amd_northbridge *nb = NULL; struct threshold_bank *b = NULL; const char *name = get_name(cpu, bank, NULL); int err = 0; @@ -1154,26 +1139,6 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu, if (!dev) return -ENODEV; - if (is_shared_bank(bank)) { - nb = node_to_amd_nb(topology_amd_node_id(cpu)); - - /* threshold descriptor already initialized on this node? */ - if (nb && nb->bank4) { - /* yes, use it */ - b = nb->bank4; - err = kobject_add(b->kobj, &dev->kobj, name); - if (err) - goto out; - - bp[bank] = b; - refcount_inc(&b->cpus); - - err = __threshold_add_blocks(b); - - goto out; - } - } - b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL); if (!b) { err = -ENOMEM; @@ -1187,17 +1152,6 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu, goto out_free; } - if (is_shared_bank(bank)) { - b->shared = 1; - refcount_set(&b->cpus, 1); - - /* nb is already initialized, see above */ - if (nb) { - WARN_ON(nb->bank4); - nb->bank4 = b; - } - } - err = allocate_threshold_blocks(cpu, b, bank, 0, mca_msr_reg(bank, MCA_MISC)); if (err) goto out_kobj; @@ -1230,40 +1184,11 @@ static void deallocate_threshold_blocks(struct threshold_bank *bank) kobject_put(&bank->blocks->kobj); } -static void __threshold_remove_blocks(struct threshold_bank *b) -{ - struct threshold_block *pos = NULL; - struct threshold_block *tmp = NULL; - - kobject_put(b->kobj); - - list_for_each_entry_safe(pos, tmp, &b->blocks->miscj, miscj) - kobject_put(b->kobj); -} - static void threshold_remove_bank(struct threshold_bank *bank) { - struct amd_northbridge *nb; - if (!bank->blocks) goto out_free; - if (!bank->shared) - goto out_dealloc; - - if (!refcount_dec_and_test(&bank->cpus)) { - __threshold_remove_blocks(bank); - return; - } else { - /* - * The last CPU on this node using the shared bank is going - * away, remove that bank now. 
- */ - nb = node_to_amd_nb(topology_amd_node_id(smp_processor_id())); - nb->bank4 = NULL; - } - -out_dealloc: deallocate_threshold_blocks(bank); out_free: -- Gitee From 97a79e8fc54abf03fb827e9effa72435f940e313 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Tue, 24 Jun 2025 14:16:03 +0000 Subject: [PATCH 08/33] x86/mce/amd: Put list_head in threshold_bank commit c4bac5c640e3782bf30c07c4d82042d0202fe224 upstream The threshold_bank structure is a container for one or more threshold_block structures. Currently, the container has a single pointer to the 'first' threshold_block structure which then has a linked list of the remaining threshold_block structures. This results in an extra level of indirection where the 'first' block is checked before iterating over the remaining blocks. Remove the indirection by including the head of the block list in the threshold_bank structure which already acts as a container for all the bank's thresholding blocks. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Qiuxu Zhuo Reviewed-by: Tony Luck Tested-by: Tony Luck Link: https://lore.kernel.org/20250624-wip-mca-updates-v4-8-236dd74f645f@amd.com Signed-off-by: chaithanyaLagisetty Signed-off-by: PvsNarasimha --- arch/x86/kernel/cpu/mce/amd.c | 43 ++++++++++------------------------- 1 file changed, 12 insertions(+), 31 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index ca1b2924df98..6f893a51bc69 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -249,7 +249,8 @@ struct threshold_block { struct threshold_bank { struct kobject *kobj; - struct threshold_block *blocks; + /* List of threshold blocks within this MCA bank. */ + struct list_head miscj; }; static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); @@ -844,9 +845,9 @@ static void log_and_reset_block(struct threshold_block *block) */ static void amd_threshold_interrupt(void) { - struct threshold_block *first_block = NULL, *block = NULL, *tmp = NULL; - struct threshold_bank **bp = this_cpu_read(threshold_banks); + struct threshold_bank **bp = this_cpu_read(threshold_banks), *thr_bank; unsigned int bank, cpu = smp_processor_id(); + struct threshold_block *block, *tmp; /* * Validate that the threshold bank has been initialized already. The @@ -860,16 +861,11 @@ static void amd_threshold_interrupt(void) if (!(per_cpu(bank_map, cpu) & BIT_ULL(bank))) continue; - first_block = bp[bank]->blocks; - if (!first_block) + thr_bank = bp[bank]; + if (!thr_bank) continue; - /* - * The first block is also the head of the list. Check it first - * before iterating over the rest. 
- */ - log_and_reset_block(first_block); - list_for_each_entry_safe(block, tmp, &first_block->miscj, miscj) + list_for_each_entry_safe(block, tmp, &thr_bank->miscj, miscj) log_and_reset_block(block); } } @@ -1095,13 +1091,7 @@ static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb default_attrs[2] = NULL; } - INIT_LIST_HEAD(&b->miscj); - - /* This is safe as @tb is not visible yet */ - if (tb->blocks) - list_add(&b->miscj, &tb->blocks->miscj); - else - tb->blocks = b; + list_add(&b->miscj, &tb->miscj); err = kobject_init_and_add(&b->kobj, &threshold_ktype, tb->kobj, get_name(cpu, bank, b)); if (err) @@ -1152,6 +1142,8 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu, goto out_free; } + INIT_LIST_HEAD(&b->miscj); + err = allocate_threshold_blocks(cpu, b, bank, 0, mca_msr_reg(bank, MCA_MISC)); if (err) goto out_kobj; @@ -1172,26 +1164,15 @@ static void threshold_block_release(struct kobject *kobj) kfree(to_block(kobj)); } -static void deallocate_threshold_blocks(struct threshold_bank *bank) +static void threshold_remove_bank(struct threshold_bank *bank) { struct threshold_block *pos, *tmp; - list_for_each_entry_safe(pos, tmp, &bank->blocks->miscj, miscj) { + list_for_each_entry_safe(pos, tmp, &bank->miscj, miscj) { list_del(&pos->miscj); kobject_put(&pos->kobj); } - kobject_put(&bank->blocks->kobj); -} - -static void threshold_remove_bank(struct threshold_bank *bank) -{ - if (!bank->blocks) - goto out_free; - - deallocate_threshold_blocks(bank); - -out_free: kobject_put(bank->kobj); kfree(bank); } -- Gitee From 36892a1c2a7f5b87abfb20ea8fb67edfd00a357a Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 25 Aug 2025 17:33:03 +0000 Subject: [PATCH 09/33] x86/mce: Remove __mcheck_cpu_init_early() commit 9f34032ec0deef58bd0eb7475f1981adfa998648 upstream The __mcheck_cpu_init_early() function was introduced so that some vendor-specific features are detected before the first MCA polling event done in __mcheck_cpu_init_generic(). Currently, __mcheck_cpu_init_early() is only used on AMD-based systems and additional code will be needed to support various system configurations. However, the current and future vendor-specific code should be done during vendor init. This keeps all the vendor code in a common location and simplifies the generic init flow. Move all the __mcheck_cpu_init_early() code into mce_amd_feature_init(). 
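For reference, a minimal sketch (illustrative only; it simply mirrors the amd.c hunk below) of where that detection now lives, at the top of the AMD vendor init hook:

  /* Sketch: feature-flag detection folded into mce_amd_feature_init(). */
  void mce_amd_feature_init(struct cpuinfo_x86 *c)
  {
  	mce_flags.overflow_recov = cpu_feature_enabled(X86_FEATURE_OVERFLOW_RECOV);
  	mce_flags.succor = cpu_feature_enabled(X86_FEATURE_SUCCOR);
  	mce_flags.smca = cpu_feature_enabled(X86_FEATURE_SMCA);
  	mce_flags.amd_threshold = 1;

  	/* ... existing per-bank thresholding setup continues here ... */
  }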
Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Qiuxu Zhuo Reviewed-by: Tony Luck Reviewed-by: Nikolay Borisov Tested-by: Tony Luck Link: https://lore.kernel.org/20250825-wip-mca-updates-v5-6-865768a2eef8@amd.com Signed-off-by: chaithanyaLagisetty Signed-off-by: PvsNarasimha --- arch/x86/kernel/cpu/mce/amd.c | 4 ++++ arch/x86/kernel/cpu/mce/core.c | 14 -------------- 2 files changed, 4 insertions(+), 14 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 6f893a51bc69..c3110a4dfecd 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -661,6 +661,10 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) u32 low = 0, high = 0, address = 0; int offset = -1; + mce_flags.overflow_recov = cpu_feature_enabled(X86_FEATURE_OVERFLOW_RECOV); + mce_flags.succor = cpu_feature_enabled(X86_FEATURE_SUCCOR); + mce_flags.smca = cpu_feature_enabled(X86_FEATURE_SMCA); + mce_flags.amd_threshold = 1; for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) { if (mce_flags.smca) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index b6169ddd4077..ef4ca9a166b2 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1986,19 +1986,6 @@ static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) return 0; } -/* - * Init basic CPU features needed for early decoding of MCEs. - */ -static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c) -{ - if (c->x86_vendor == X86_VENDOR_AMD || c->x86_vendor == X86_VENDOR_HYGON) { - mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV); - mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR); - mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA); - mce_flags.amd_threshold = 1; - } -} - static void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c) { struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); @@ -2201,7 +2188,6 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) mca_cfg.initialized = 1; - __mcheck_cpu_init_early(c); __mcheck_cpu_init_generic(); __mcheck_cpu_init_vendor(c); __mcheck_cpu_init_clear_banks(); -- Gitee From 5f8cd5162f2062986ef27abbd978f15029519c84 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 8 Sep 2025 15:40:30 +0000 Subject: [PATCH 10/33] x86/mce: Set CR4.MCE last during init commit cfffcf97997bd35f4a59e035523d1762568bdbad upstream Set the CR4.MCE bit as the last step during init. This brings the MCA init order closer to what is described in the x86 docs.

  x86 docs:

	AMD			Intel
				MCG_CTL
	MCA_CONFIG		MCG_EXT_CTL
	MCi_CTL			MCi_CTL
	MCG_CTL
	CR4.MCE			CR4.MCE

  Current Linux:

	AMD			Intel
	CR4.MCE			CR4.MCE
	MCG_CTL			MCG_CTL
	MCA_CONFIG		MCG_EXT_CTL
	MCi_CTL			MCi_CTL

  Updated Linux:

	AMD			Intel
	MCG_CTL			MCG_CTL
	MCA_CONFIG		MCG_EXT_CTL
	MCi_CTL			MCi_CTL
	CR4.MCE			CR4.MCE

The new init flow will match Intel's docs, but there will still be a mismatch for AMD regarding MCG_CTL. However, there is no known issue with this ordering, so leave it for now.
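Concretely (mirroring the core.c hunks below), the tail of mcheck_cpu_init() after this change reads roughly:

  /* Sketch of the reordered init tail: CR4.MCE is set only after all
   * banks and the polling timer have been configured. */
  __mcheck_cpu_init_generic();
  __mcheck_cpu_init_vendor(c);
  __mcheck_cpu_init_clear_banks();
  __mcheck_cpu_check_banks();
  __mcheck_cpu_setup_timer();
  cr4_set_bits(X86_CR4_MCE);	/* previously the first step */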
Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Nikolay Borisov Link: https://lore.kernel.org/20250908-wip-mca-updates-v6-0-eef5d6c74b9c@amd.com Signed-off-by: chaithanyaLagisetty Signed-off-by: PvsNarasimha --- arch/x86/kernel/cpu/mce/core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index ef4ca9a166b2..185578817c75 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1809,8 +1809,6 @@ static void __mcheck_cpu_init_generic(void) bitmap_fill(all_banks, MAX_NR_BANKS); machine_check_poll(MCP_UC | MCP_QUEUE_LOG | m_fl, &all_banks); - cr4_set_bits(X86_CR4_MCE); - rdmsrl(MSR_IA32_MCG_CAP, cap); if (cap & MCG_CTL_P) wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); @@ -2193,6 +2191,7 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) __mcheck_cpu_init_clear_banks(); __mcheck_cpu_check_banks(); __mcheck_cpu_setup_timer(); + cr4_set_bits(X86_CR4_MCE); } /* @@ -2361,6 +2360,7 @@ static void mce_syscore_resume(void) __mcheck_cpu_init_generic(); __mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info)); __mcheck_cpu_init_clear_banks(); + cr4_set_bits(X86_CR4_MCE); } static struct syscore_ops mce_syscore_ops = { @@ -2380,6 +2380,7 @@ static void mce_cpu_restart(void *data) __mcheck_cpu_init_generic(); __mcheck_cpu_init_clear_banks(); __mcheck_cpu_init_timer(); + cr4_set_bits(X86_CR4_MCE); } /* Reinit MCEs after user configuration changes */ -- Gitee From 82cd0eea96ed398645abfb61376ba4d60b74b832 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 8 Sep 2025 15:40:31 +0000 Subject: [PATCH 11/33] x86/mce: Define BSP-only init commit 669ce4984b729ad5b4c6249d4a8721ae52398bfb upstream Currently, MCA initialization is executed identically on each CPU as they are brought online. However, a number of MCA initialization tasks only need to be done once. Define a function to collect all 'global' init tasks and call this from the BSP only. Start with CPU features. [Backport Changes] 1. In file arch/x86/kernel/cpu/mce/core.c, within the newly added function mca_bsp_init(), the call to rdmsrq() was replaced with the existing equivalent call rdmsrl() because the upstream commit that globally renamed rdmsrl() to rdmsrq() is not available yet in the current source tree. 
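To make the substitution concrete, a short sketch of the backported read; rdmsrl() here is semantically identical to upstream's rdmsrq(), only the helper name differs in this tree:

  u64 cap;

  /* Upstream spells this rdmsrq(); this tree still uses rdmsrl(). */
  rdmsrl(MSR_IA32_MCG_CAP, cap);

  /* Use accurate RIP reporting if available. */
  if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
  	mca_cfg.rip_msr = MSR_IA32_MCG_EIP;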
Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Qiuxu Zhuo Reviewed-by: Tony Luck Reviewed-by: Nikolay Borisov Tested-by: Tony Luck Link: https://lore.kernel.org/20250908-wip-mca-updates-v6-0-eef5d6c74b9c@amd.com Signed-off-by: chaithanyaLagisetty Signed-off-by: PvsNarasimha --- arch/x86/include/asm/mce.h | 2 ++ arch/x86/kernel/cpu/common.c | 1 + arch/x86/kernel/cpu/mce/amd.c | 3 --- arch/x86/kernel/cpu/mce/core.c | 28 +++++++++++++++++++++------- 4 files changed, 24 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 81213c1e799f..64a32a973f11 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -207,12 +207,14 @@ struct cper_ia_proc_ctx; #ifdef CONFIG_X86_MCE int mcheck_init(void); +void mca_bsp_init(struct cpuinfo_x86 *c); void mcheck_cpu_init(struct cpuinfo_x86 *c); void mcheck_cpu_clear(struct cpuinfo_x86 *c); int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id); #else static inline int mcheck_init(void) { return 0; } +static inline void mca_bsp_init(struct cpuinfo_x86 *c) {} static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {} static inline void mcheck_cpu_clear(struct cpuinfo_x86 *c) {} static inline int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 4018fb35165c..7b640c720c10 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1798,6 +1798,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) setup_clear_cpu_cap(X86_FEATURE_LA57); detect_nopl(); + mca_bsp_init(c); } void __init init_cpu_devs(void) diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index c3110a4dfecd..d28b5facc812 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -661,9 +661,6 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) u32 low = 0, high = 0, address = 0; int offset = -1; - mce_flags.overflow_recov = cpu_feature_enabled(X86_FEATURE_OVERFLOW_RECOV); - mce_flags.succor = cpu_feature_enabled(X86_FEATURE_SUCCOR); - mce_flags.smca = cpu_feature_enabled(X86_FEATURE_SMCA); mce_flags.amd_threshold = 1; for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) { diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 185578817c75..548edd18c80e 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1783,13 +1783,6 @@ static void __mcheck_cpu_cap_init(void) this_cpu_write(mce_num_banks, b); __mcheck_cpu_mce_banks_init(); - - /* Use accurate RIP reporting if available. */ - if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) - mca_cfg.rip_msr = MSR_IA32_MCG_EIP; - - if (cap & MCG_SER_P) - mca_cfg.ser = 1; } static void __mcheck_cpu_init_generic(void) @@ -2156,6 +2149,27 @@ DEFINE_IDTENTRY_RAW(exc_machine_check) } #endif +void mca_bsp_init(struct cpuinfo_x86 *c) +{ + u64 cap; + + if (!mce_available(c)) + return; + + mce_flags.overflow_recov = cpu_feature_enabled(X86_FEATURE_OVERFLOW_RECOV); + mce_flags.succor = cpu_feature_enabled(X86_FEATURE_SUCCOR); + mce_flags.smca = cpu_feature_enabled(X86_FEATURE_SMCA); + + rdmsrl(MSR_IA32_MCG_CAP, cap); + + /* Use accurate RIP reporting if available. */ + if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) + mca_cfg.rip_msr = MSR_IA32_MCG_EIP; + + if (cap & MCG_SER_P) + mca_cfg.ser = 1; +} + /* * Called for each booted CPU to set up machine checks. 
* Must be called with preempt off: -- Gitee From c6c7de62bf39c233e3bb78a452644758d6d516b0 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 8 Sep 2025 15:40:32 +0000 Subject: [PATCH 12/33] x86/mce: Define BSP-only SMCA init commit c6e465b8d45a1bc717d196ee769ee5a9060de8e2 upstream Currently, on AMD systems, MCA interrupt handler functions are set during CPU init. However, the functions only need to be set once for the whole system. Assign the handlers only during BSP init. Do so only for SMCA systems to maintain the old behavior for legacy systems. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Nikolay Borisov Link: https://lore.kernel.org/20250908-wip-mca-updates-v6-0-eef5d6c74b9c@amd.com Signed-off-by: Malathi Signed-off-by: PvsNarasimha --- arch/x86/kernel/cpu/mce/amd.c | 6 ++++++ arch/x86/kernel/cpu/mce/core.c | 3 +++ arch/x86/kernel/cpu/mce/internal.h | 2 ++ 3 files changed, 11 insertions(+) diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index d28b5facc812..95fb9829d2e1 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -692,6 +692,12 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) deferred_error_interrupt_enable(c); } +void smca_bsp_init(void) +{ + mce_threshold_vector = amd_threshold_interrupt; + deferred_error_int_vector = amd_deferred_error_interrupt; +} + bool amd_mce_is_memory_error(struct mce *m) { enum smca_bank_types bank_type; diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 548edd18c80e..a9d90aed6689 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -2160,6 +2160,9 @@ void mca_bsp_init(struct cpuinfo_x86 *c) mce_flags.succor = cpu_feature_enabled(X86_FEATURE_SUCCOR); mce_flags.smca = cpu_feature_enabled(X86_FEATURE_SMCA); + if (mce_flags.smca) + smca_bsp_init(); + rdmsrl(MSR_IA32_MCG_CAP, cap); /* Use accurate RIP reporting if available. 
*/ diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index 9529349e90f2..322c16d49809 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -287,11 +287,13 @@ static __always_inline void smca_extract_err_addr(struct mce *m) m->addr &= GENMASK_ULL(55, lsb); } +void smca_bsp_init(void); #else static inline void mce_threshold_create_device(unsigned int cpu) { } static inline void mce_threshold_remove_device(unsigned int cpu) { } static inline bool amd_filter_mce(struct mce *m) { return false; } static inline void smca_extract_err_addr(struct mce *m) { } +static inline void smca_bsp_init(void) { } #endif #ifdef CONFIG_X86_ANCIENT_MCE -- Gitee From ff3250313491de3844e03dbc2773459d4aefbfd8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 29 Mar 2024 08:11:27 +0100 Subject: [PATCH 13/33] x86/mce: Clean up TP_printk() output line of the 'mce_record' tracepoint commit ac5e80e94f5c67d7053f50fc3faddab931707f0f upstream - Only capitalize entries where that makes sense - Print separate values separately - Rename 'PROCESSOR' to vendor & CPUID Signed-off-by: Ingo Molnar Cc: Avadhut Naik Cc: "Tony Luck" Link: https://lore.kernel.org/r/ZgZpn/zbCJWYdL5y@gmail.com Signed-off-by: Malathi Signed-off-by: PvsNarasimha --- include/trace/events/mce.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/include/trace/events/mce.h b/include/trace/events/mce.h index 1391ada0da3b..9c4e12163996 100644 --- a/include/trace/events/mce.h +++ b/include/trace/events/mce.h @@ -55,15 +55,18 @@ TRACE_EVENT(mce_record, __entry->cpuvendor = m->cpuvendor; ), - TP_printk("CPU: %d, MCGc/s: %llx/%llx, MC%d: %016Lx, IPID: %016Lx, ADDR/MISC/SYND: %016Lx/%016Lx/%016Lx, RIP: %02x:<%016Lx>, TSC: %llx, PROCESSOR: %u:%x, TIME: %llu, SOCKET: %u, APIC: %x", + TP_printk("CPU: %d, MCGc/s: %llx/%llx, MC%d: %016Lx, IPID: %016Lx, ADDR: %016Lx, MISC: %016Lx, SYND: %016Lx, RIP: %02x:<%016Lx>, TSC: %llx, vendor: %u, CPUID: %x, time: %llu, socket: %u, APIC: %x", __entry->cpu, __entry->mcgcap, __entry->mcgstatus, __entry->bank, __entry->status, __entry->ipid, - __entry->addr, __entry->misc, __entry->synd, + __entry->addr, + __entry->misc, + __entry->synd, __entry->cs, __entry->ip, __entry->tsc, - __entry->cpuvendor, __entry->cpuid, + __entry->cpuvendor, + __entry->cpuid, __entry->walltime, __entry->socketid, __entry->apicid) -- Gitee From 26495988f1559e8082e5968e3765d0ba05b80fd1 Mon Sep 17 00:00:00 2001 From: Avadhut Naik Date: Mon, 1 Apr 2024 12:14:54 -0500 Subject: [PATCH 14/33] tracing: Add the ::ppin field to the mce_record tracepoint commit 98430645e383404e5f6f784cabbb08ebb4ac5499 upstream Machine Check Error information from 'struct mce' is exposed to userspace through the mce_record tracepoint. Currently, however, the PPIN (Protected Processor Inventory Number) field of 'struct mce' is not exposed. Add a PPIN field to the tracepoint as it provides a unique identifier for the system (or socket in case of multi-socket systems) on which the MCE has been received. Also, add a comment explaining the kind of information that can be and should be added to the tracepoint. 
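As a hedged usage sketch (not part of the patch; it assumes tracefs is mounted at /sys/kernel/tracing and root privileges), the extended record can be observed from userspace once the tracepoint fires:

  /* Minimal consumer: enable mce:mce_record and stream the formatted
   * records, which now carry a "PPIN: <hex>" token from TP_printk(). */
  #include <stdio.h>

  int main(void)
  {
  	FILE *en = fopen("/sys/kernel/tracing/events/mce/mce_record/enable", "w");
  	FILE *tp;
  	char line[4096];

  	if (!en)
  		return 1;
  	fputs("1", en);
  	fclose(en);

  	tp = fopen("/sys/kernel/tracing/trace_pipe", "r");
  	if (!tp)
  		return 1;
  	while (fgets(line, sizeof(line), tp))
  		fputs(line, stdout);
  	return 0;
  }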
Signed-off-by: Avadhut Naik Signed-off-by: Ingo Molnar Reviewed-by: Sohil Mehta Reviewed-by: Steven Rostedt (Google) Reviewed-by: Tony Luck Link: https://lore.kernel.org/r/20240401171455.1737976-2-avadhut.naik@amd.com Signed-off-by: Malathi Signed-off-by: PvsNarasimha --- include/trace/events/mce.h | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/include/trace/events/mce.h b/include/trace/events/mce.h index 9c4e12163996..294fccc329c1 100644 --- a/include/trace/events/mce.h +++ b/include/trace/events/mce.h @@ -9,6 +9,14 @@ #include #include +/* + * MCE Event Record. + * + * Only very relevant and transient information which cannot be + * gathered from a system by any other means or which can only be + * acquired arduously should be added to this record. + */ + TRACE_EVENT(mce_record, TP_PROTO(struct mce *m), @@ -25,6 +33,7 @@ TRACE_EVENT(mce_record, __field( u64, ipid ) __field( u64, ip ) __field( u64, tsc ) + __field( u64, ppin ) __field( u64, walltime ) __field( u32, cpu ) __field( u32, cpuid ) @@ -45,6 +54,7 @@ TRACE_EVENT(mce_record, __entry->ipid = m->ipid; __entry->ip = m->ip; __entry->tsc = m->tsc; + __entry->ppin = m->ppin; __entry->walltime = m->time; __entry->cpu = m->extcpu; __entry->cpuid = m->cpuid; @@ -55,7 +65,7 @@ TRACE_EVENT(mce_record, __entry->cpuvendor = m->cpuvendor; ), - TP_printk("CPU: %d, MCGc/s: %llx/%llx, MC%d: %016Lx, IPID: %016Lx, ADDR: %016Lx, MISC: %016Lx, SYND: %016Lx, RIP: %02x:<%016Lx>, TSC: %llx, vendor: %u, CPUID: %x, time: %llu, socket: %u, APIC: %x", + TP_printk("CPU: %d, MCGc/s: %llx/%llx, MC%d: %016Lx, IPID: %016Lx, ADDR: %016Lx, MISC: %016Lx, SYND: %016Lx, RIP: %02x:<%016Lx>, TSC: %llx, PPIN: %llx, vendor: %u, CPUID: %x, time: %llu, socket: %u, APIC: %x", __entry->cpu, __entry->mcgcap, __entry->mcgstatus, __entry->bank, __entry->status, @@ -65,6 +75,7 @@ TRACE_EVENT(mce_record, __entry->synd, __entry->cs, __entry->ip, __entry->tsc, + __entry->ppin, __entry->cpuvendor, __entry->cpuid, __entry->walltime, -- Gitee From 0ff0e42346bb634d9dd9a65667d9ed3b1a0eaee6 Mon Sep 17 00:00:00 2001 From: Avadhut Naik Date: Mon, 1 Apr 2024 12:14:55 -0500 Subject: [PATCH 15/33] tracing: Add the ::microcode field to the mce_record tracepoint commit 186d7ef52c1f0c41450dedbdf6d6325d0a84e4c5 upstream Currently, the microcode field (Microcode Revision) of 'struct mce' is not exposed to userspace through the mce_record tracepoint. Knowing the microcode version on which the MCE was received is critical information for debugging. If the version is not recorded, later attempts to acquire the version might result in discrepancies since it can be changed at runtime. Add microcode version to the tracepoint to prevent ambiguity over the active version on the system when the MCE was received. 
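To make the runtime-discrepancy point concrete: the revision recorded at logging time can differ from whatever a later lookup returns. A hedged kernel-style sketch follows; mce_microcode_changed() is a hypothetical helper used only to illustrate why the value must be captured when the error is logged, not something this patch adds:

    /*
     * Hypothetical helper: detect whether the microcode revision changed
     * between the time the MCE was logged and the time it is decoded.
     * cpu_data(m->extcpu).microcode is the revision currently cached for
     * the reporting CPU; m->microcode is the revision recorded in the MCE.
     */
    static bool mce_microcode_changed(struct mce *m)
    {
            return cpu_data(m->extcpu).microcode != m->microcode;
    }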
Signed-off-by: Avadhut Naik Signed-off-by: Ingo Molnar Reviewed-by: Sohil Mehta Reviewed-by: Steven Rostedt (Google) Reviewed-by: Tony Luck Link: https://lore.kernel.org/r/20240401171455.1737976-3-avadhut.naik@amd.com Signed-off-by: Malathi Signed-off-by: PvsNarasimha --- include/trace/events/mce.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/include/trace/events/mce.h b/include/trace/events/mce.h index 294fccc329c1..f0f7b3cb2041 100644 --- a/include/trace/events/mce.h +++ b/include/trace/events/mce.h @@ -42,6 +42,7 @@ TRACE_EVENT(mce_record, __field( u8, cs ) __field( u8, bank ) __field( u8, cpuvendor ) + __field( u32, microcode ) ), TP_fast_assign( @@ -63,9 +64,10 @@ TRACE_EVENT(mce_record, __entry->cs = m->cs; __entry->bank = m->bank; __entry->cpuvendor = m->cpuvendor; + __entry->microcode = m->microcode; ), - TP_printk("CPU: %d, MCGc/s: %llx/%llx, MC%d: %016Lx, IPID: %016Lx, ADDR: %016Lx, MISC: %016Lx, SYND: %016Lx, RIP: %02x:<%016Lx>, TSC: %llx, PPIN: %llx, vendor: %u, CPUID: %x, time: %llu, socket: %u, APIC: %x", + TP_printk("CPU: %d, MCGc/s: %llx/%llx, MC%d: %016Lx, IPID: %016Lx, ADDR: %016Lx, MISC: %016Lx, SYND: %016Lx, RIP: %02x:<%016Lx>, TSC: %llx, PPIN: %llx, vendor: %u, CPUID: %x, time: %llu, socket: %u, APIC: %x, microcode: %x", __entry->cpu, __entry->mcgcap, __entry->mcgstatus, __entry->bank, __entry->status, @@ -80,7 +82,8 @@ TRACE_EVENT(mce_record, __entry->cpuid, __entry->walltime, __entry->socketid, - __entry->apicid) + __entry->apicid, + __entry->microcode) ); #endif /* _TRACE_MCE_H */ -- Gitee From 861bce30e5c9d7fc719bc64bff8534628371fbfb Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Tue, 30 Jul 2024 13:29:56 -0500 Subject: [PATCH 16/33] x86/mce: Rename mce_setup() to mce_prep_record() commit 5ad21a2497329c2b090d5b02b95394a1316bef53 upstream There is no MCE "setup" done in mce_setup(). Rather, this function initializes and prepares an MCE record. Rename the function to highlight what it does. No functional change is intended. [Backport Changes] The current upstream commit renames mce_setup() to mce_prep_record() globally as part of the MCE rework. However, the current Opencloudos 6.6 tree contains additional, non-upstream functions in file arch/x86/kernel/cpu/mce/apei.c introduced to support Zhaoxin processors. Therefore, the same upstream change was applied to these Opencloudos-specific functions, replacing mce_setup() with mce_prep_record() to maintain consistency with the upstream changes. The modified functions in arch/x86/kernel/cpu/mce/apei.c are: - zx_apei_mce_report_mem_error() - zx_apei_mce_report_pcie_error() - zx_apei_mce_report_zdi_error() As noted in the upstream commit, this change has no functional impact.
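Since the rename is purely mechanical, every call site changes the same way. A representative before/after of the pattern applied throughout, shown as a sketch rather than an excerpt from the diff:

    struct mce m;

    /* Before: the name hinted at hardware "setup" that never happens. */
    mce_setup(&m);

    /* After: the name states what the function does - prepare a record. */
    mce_prep_record(&m);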
Suggested-by: Borislav Petkov Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Nikolay Borisov Link: https://lore.kernel.org/r/20240730182958.4117158-2-yazen.ghannam@amd.com Signed-off-by: Borislav Petkov (AMD) Signed-off-by: Malathi Signed-off-by: PvsNarasimha --- arch/x86/include/asm/mce.h | 2 +- arch/x86/kernel/cpu/mce/amd.c | 2 +- arch/x86/kernel/cpu/mce/apei.c | 10 +++++----- arch/x86/kernel/cpu/mce/core.c | 6 +++--- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 64a32a973f11..9d56005d5151 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -221,7 +221,7 @@ static inline int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id) { return -EINVAL; } #endif -void mce_setup(struct mce *m); +void mce_prep_record(struct mce *m); void mce_log(struct mce *m); DECLARE_PER_CPU(struct device *, mce_device); diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 95fb9829d2e1..06515618ff24 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -715,7 +715,7 @@ static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc) { struct mce m; - mce_setup(&m); + mce_prep_record(&m); m.status = status; m.misc = misc; diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c index 24098bad6724..9e571e34c4d9 100644 --- a/arch/x86/kernel/cpu/mce/apei.c +++ b/arch/x86/kernel/cpu/mce/apei.c @@ -44,7 +44,7 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) else lsb = PAGE_SHIFT; - mce_setup(&m); + mce_prep_record(&m); m.bank = -1; /* Fake a memory read error with unknown channel */ m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | MCI_STATUS_MISCV | 0x9f; @@ -74,7 +74,7 @@ void zx_apei_mce_report_mem_error(struct cper_sec_mem_err *mem_err) if (!(mem_err->validation_bits & CPER_MEM_VALID_PA)) return; - mce_setup(&m); + mce_prep_record(&m); m.misc = 0; m.misc = mem_err->module; m.addr = mem_err->physical_addr; @@ -137,7 +137,7 @@ void zx_apei_mce_report_pcie_error(int severity, struct cper_sec_pcie *pcie_err) if (boot_cpu_data.x86 != 7 || boot_cpu_data.x86_model != 91) return; - mce_setup(&m); + mce_prep_record(&m); m.addr = 0; m.misc = 0; m.misc |= (u64)pcie_err->device_id.segment << 32; @@ -173,7 +173,7 @@ void zx_apei_mce_report_zdi_error(struct cper_sec_proc_generic *zdi_err) if (boot_cpu_data.x86 != 7 || boot_cpu_data.x86_model != 91) return; - mce_setup(&m); + mce_prep_record(&m); m.misc = 0; m.misc |= (zdi_err->requestor_id & 0xff) << 19; m.misc |= ((zdi_err->requestor_id & 0xff00) >> 8) >> 24; @@ -264,7 +264,7 @@ int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id) if (ctx_info->reg_arr_size < 48) return -EINVAL; - mce_setup(&m); + mce_prep_record(&m); m.extcpu = -1; m.socketid = -1; diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index a9d90aed6689..ee5bedf3d7d2 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -116,7 +116,7 @@ static struct irq_work mce_irq_work; BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain); /* Do initial initialization of a struct mce */ -void mce_setup(struct mce *m) +void mce_prep_record(struct mce *m) { memset(m, 0, sizeof(struct mce)); m->cpu = m->extcpu = smp_processor_id(); @@ -420,11 +420,11 @@ static noinstr void mce_wrmsrl(u32 msr, u64 v) static noinstr void mce_gather_info(struct mce *m, struct pt_regs 
*regs) { /* - * Enable instrumentation around mce_setup() which calls external + * Enable instrumentation around mce_prep_record() which calls external * facilities. */ instrumentation_begin(); - mce_setup(m); + mce_prep_record(m); instrumentation_end(); m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); -- Gitee From 518ad7fd91ba1e81e94ac096585d55d1e93c78cd Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Tue, 30 Jul 2024 13:29:57 -0500 Subject: [PATCH 17/33] x86/mce: Define mce_prep_record() helpers for common and per-CPU fields commit f9bbb8ad0c8b2f37e3d474b8693f563e4a29e92e upstream Generally, MCA information for an error is gathered on the CPU that reported the error. In this case, CPU-specific information from the running CPU will be correct. However, this will be incorrect if the MCA information is gathered while running on a CPU that didn't report the error. One example is creating an MCA record using mce_prep_record() for errors reported from ACPI. Split mce_prep_record() so that there is a helper function to gather common, i.e. not CPU-specific, information and another helper for CPU-specific information. Leave mce_prep_record() defined as-is for the common case when running on the reporting CPU. Get MCG_CAP in the global helper even though the register is per-CPU. This value is not already cached per-CPU like other values. And it does not assist with any per-CPU decoding or handling. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Nikolay Borisov Link: https://lore.kernel.org/r/20240730182958.4117158-3-yazen.ghannam@amd.com Signed-off-by: Borislav Petkov (AMD) Signed-off-by: Malathi Signed-off-by: PvsNarasimha --- arch/x86/kernel/cpu/mce/core.c | 34 ++++++++++++++++++++---------- arch/x86/kernel/cpu/mce/internal.h | 2 ++ 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index ee5bedf3d7d2..452ce1ab9d0e 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -115,20 +115,32 @@ static struct irq_work mce_irq_work; */ BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain); -/* Do initial initialization of a struct mce */ -void mce_prep_record(struct mce *m) +void mce_prep_record_common(struct mce *m) { memset(m, 0, sizeof(struct mce)); - m->cpu = m->extcpu = smp_processor_id(); + + m->cpuid = cpuid_eax(1); + m->cpuvendor = boot_cpu_data.x86_vendor; + m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP); /* need the internal __ version to avoid deadlocks */ - m->time = __ktime_get_real_seconds(); - m->cpuvendor = boot_cpu_data.x86_vendor; - m->cpuid = cpuid_eax(1); - m->socketid = cpu_data(m->extcpu).topo.pkg_id; - m->apicid = cpu_data(m->extcpu).topo.initial_apicid; - m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP); - m->ppin = cpu_data(m->extcpu).ppin; - m->microcode = boot_cpu_data.microcode; + m->time = __ktime_get_real_seconds(); +} + +void mce_prep_record_per_cpu(unsigned int cpu, struct mce *m) +{ + m->cpu = cpu; + m->extcpu = cpu; + m->apicid = cpu_data(cpu).topo.initial_apicid; + m->microcode = cpu_data(cpu).microcode; + m->ppin = topology_ppin(cpu); + m->socketid = topology_physical_package_id(cpu); +} + +/* Do initial initialization of a struct mce */ +void mce_prep_record(struct mce *m) +{ + mce_prep_record_common(m); + mce_prep_record_per_cpu(smp_processor_id(), m); } DEFINE_PER_CPU(struct mce, injectm); diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index 322c16d49809..8f82eeaa616d 100644 --- a/arch/x86/kernel/cpu/mce/internal.h 
+++ b/arch/x86/kernel/cpu/mce/internal.h @@ -257,6 +257,8 @@ enum mca_msr { /* Decide whether to add MCE record to MCE event pool or filter it out. */ extern bool filter_mce(struct mce *m); +void mce_prep_record_common(struct mce *m); +void mce_prep_record_per_cpu(unsigned int cpu, struct mce *m); #ifdef CONFIG_X86_MCE_AMD void mce_threshold_create_device(unsigned int cpu); -- Gitee From ac9ca558bc70ef7f4224b27f7b20026728fbb072 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Mon, 8 Apr 2024 11:09:44 -0700 Subject: [PATCH 18/33] x86/mce: Implement recovery for errors in TDX/SEAM non-root mode commit 7911f145de5fecbee1d67f27f73bec12f0fbc472 upstream Machine check SMIs (MSMI) signaled during SEAM operation (typically inside TDX guests), on a system with Intel eMCA enabled, might eventually be reported to the kernel #MC handler with the saved RIP on the stack pointing to the instruction in kernel code after the SEAMCALL instruction that entered the SEAM operation. Linux currently says that is a fatal error and shuts down. There is a new bit in IA32_MCG_STATUS that, when set to 1, indicates that the machine check didn't originally occur at that saved RIP, but during SEAM non-root operation. Add new entries to the severity table to detect this for both data load and instruction fetch that set the severity to "AR" (action required). Increase the width of the mcgmask/mcgres fields in "struct severity" from unsigned char to unsigned short since the new bit is in position 12. The action required for these errors is simply to mark the page as poisoned and return from the machine check handler. HW ABI notes: ============= The SEAM_NR bit in IA32_MCG_STATUS hasn't yet made it into the Intel Software Developers' Manual. But it is described in section 16.5.2 of "Intel(R) Trust Domain Extensions (Intel(R) TDX) Module Base Architecture Specification" downloadable from: https://cdrdv2.intel.com/v1/dl/getContent/733575 Backport notes: =============== Little value in backporting this patch to stable or LTS kernels as this is only relevant with support for TDX, which I assume won't be backported.
But for anyone taking this to v6.1 or older, you also need commit: a51cbd0d86d3 ("x86/mce: Use severity table to handle uncorrected errors in kernel") Signed-off-by: Tony Luck Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20240408180944.44638-1-tony.luck@intel.com Signed-off-by: Malathi Signed-off-by: PvsNarasimha --- arch/x86/include/asm/mce.h | 2 ++ arch/x86/kernel/cpu/mce/core.c | 18 ++++++++++++++++++ arch/x86/kernel/cpu/mce/severity.c | 16 ++++++++++++++-- 3 files changed, 34 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 9d56005d5151..75af8b41800f 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -13,6 +13,7 @@ #define MCG_CTL_P BIT_ULL(8) /* MCG_CTL register available */ #define MCG_EXT_P BIT_ULL(9) /* Extended registers available */ #define MCG_CMCI_P BIT_ULL(10) /* CMCI supported */ +#define MCG_SEAM_NR BIT_ULL(12) /* MCG_STATUS_SEAM_NR supported */ #define MCG_EXT_CNT_MASK 0xff0000 /* Number of Extended registers */ #define MCG_EXT_CNT_SHIFT 16 #define MCG_EXT_CNT(c) (((c) & MCG_EXT_CNT_MASK) >> MCG_EXT_CNT_SHIFT) @@ -25,6 +26,7 @@ #define MCG_STATUS_EIPV BIT_ULL(1) /* ip points to correct instruction */ #define MCG_STATUS_MCIP BIT_ULL(2) /* machine check in progress */ #define MCG_STATUS_LMCES BIT_ULL(3) /* LMCE signaled */ +#define MCG_STATUS_SEAM_NR BIT_ULL(12) /* Machine check inside SEAM non-root mode */ /* MCG_EXT_CTL register defines */ #define MCG_EXT_CTL_LMCE_EN BIT_ULL(0) /* Enable LMCE */ diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 452ce1ab9d0e..6dcb9ebd69cd 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1603,6 +1603,24 @@ noinstr void do_machine_check(struct pt_regs *regs) else queue_task_work(&m, msg, kill_me_maybe); + } else if (m.mcgstatus & MCG_STATUS_SEAM_NR) { + /* + * Saved RIP on stack makes it look like the machine check + * was taken in the kernel on the instruction following + * the entry to SEAM mode. But MCG_STATUS_SEAM_NR indicates + * that the machine check was taken inside SEAM non-root + * mode. CPU core has already marked that guest as dead. + * It is OK for the kernel to resume execution at the + * apparent point of the machine check as the fault did + * not occur there. Mark the page as poisoned so it won't + * be added to free list when the guest is terminated. 
+ */ + if (mce_usable_address(&m)) { + struct page *p = pfn_to_online_page(m.addr >> PAGE_SHIFT); + + if (p) + SetPageHWPoison(p); + } } else { /* * Handle an MCE which has happened in kernel space but from diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c index 9c5754229d6e..ce5593607f65 100644 --- a/arch/x86/kernel/cpu/mce/severity.c +++ b/arch/x86/kernel/cpu/mce/severity.c @@ -39,8 +39,8 @@ static struct severity { u64 mask; u64 result; unsigned char sev; - unsigned char mcgmask; - unsigned char mcgres; + unsigned short mcgmask; + unsigned short mcgres; unsigned char ser; unsigned char context; unsigned char excp; @@ -173,6 +173,18 @@ static struct severity { SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR), USER ), + MCESEV( + AR, "Data load error in SEAM non-root mode", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), + MCGMASK(MCG_STATUS_SEAM_NR, MCG_STATUS_SEAM_NR), + KERNEL + ), + MCESEV( + AR, "Instruction fetch error in SEAM non-root mode", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR), + MCGMASK(MCG_STATUS_SEAM_NR, MCG_STATUS_SEAM_NR), + KERNEL + ), MCESEV( PANIC, "Data load in unrecoverable area of kernel", SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), -- Gitee From 8c8c7f7f5ae4762af15c85b188830563c87e1cf6 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Tue, 30 Jul 2024 13:29:58 -0500 Subject: [PATCH 19/33] x86/mce: Use mce_prep_record() helpers for apei_smca_report_x86_error() commit 793aa4bf192d0ad07cca001a596f955d121f5c10 upstream Current AMD systems can report MCA errors using the ACPI Boot Error Record Table (BERT). The BERT entries for MCA errors will be an x86 Common Platform Error Record (CPER) with an MSR register context that matches the MCAX/SMCA register space. However, the BERT will not necessarily be processed on the CPU that reported the MCA errors. Therefore, the correct CPU number needs to be determined and the information saved in struct mce. Use the newly defined mce_prep_record_*() helpers to get the correct data. Also, add an explicit check to verify that a valid CPU number was found from the APIC ID search. 
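The lookup described above is a linear scan over the possible CPUs that bails out when no APIC ID matches. A minimal sketch of that pattern, mirroring the hunk below rather than quoting it verbatim:

    unsigned int cpu;
    bool apicid_found = false;

    for_each_possible_cpu(cpu) {
            if (cpu_data(cpu).topo.initial_apicid == lapic_id) {
                    apicid_found = true;
                    break;
            }
    }

    /* The BERT named an APIC ID this kernel does not know about. */
    if (!apicid_found)
            return -EINVAL;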
Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Nikolay Borisov Link: https://lore.kernel.org/r/20240730182958.4117158-4-yazen.ghannam@amd.com Signed-off-by: Borislav Petkov (AMD) Signed-off-by: Malathi Signed-off-by: PvsNarasimha --- arch/x86/kernel/cpu/mce/apei.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c index 9e571e34c4d9..420fd3a7d046 100644 --- a/arch/x86/kernel/cpu/mce/apei.c +++ b/arch/x86/kernel/cpu/mce/apei.c @@ -233,6 +233,7 @@ EXPORT_SYMBOL_GPL(zx_apei_mce_report_zdi_error); int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id) { const u64 *i_mce = ((const u64 *) (ctx_info + 1)); + bool apicid_found = false; unsigned int cpu; struct mce m; @@ -264,20 +265,19 @@ int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id) if (ctx_info->reg_arr_size < 48) return -EINVAL; - mce_prep_record(&m); - - m.extcpu = -1; - m.socketid = -1; - for_each_possible_cpu(cpu) { if (cpu_data(cpu).topo.initial_apicid == lapic_id) { - m.extcpu = cpu; - m.socketid = cpu_data(m.extcpu).topo.pkg_id; + apicid_found = true; break; } } - m.apicid = lapic_id; + if (!apicid_found) + return -EINVAL; + + mce_prep_record_common(&m); + mce_prep_record_per_cpu(cpu, &m); + m.bank = (ctx_info->msr_addr >> 4) & 0xFF; m.status = *i_mce; m.addr = *(i_mce + 1); -- Gitee From d898e7c11266064637b6474ea7acd232b91f1a35 Mon Sep 17 00:00:00 2001 From: Avadhut Naik Date: Tue, 22 Oct 2024 19:36:27 +0000 Subject: [PATCH 20/33] x86/mce: Add wrapper for struct mce to export vendor specific info commit 750fd23926f1507cc826b5a4fdd4bfc7283e7723 upstream Currently, exporting new additional machine check error information involves adding new fields for the same at the end of the struct mce. This additional information can then be consumed through mcelog or the tracepoint. However, as new MSRs are being added (and will be added in the future) by CPU vendors on their newer CPUs with additional machine check error information to be exported, the size of struct mce will balloon on some CPUs, unnecessarily, since those fields are vendor-specific. Moreover, different CPU vendors may export the additional information in varying sizes. The problem particularly intensifies since struct mce is exposed to userspace as part of UAPI. Its bloating through vendor-specific data should be avoided to limit the information being sent out to userspace. Add a new structure mce_hw_err to wrap the existing struct mce. The same will prevent its ballooning since vendor-specific data, if any, can now be exported through a union within the wrapper structure and through __dynamic_array in the mce_record tracepoint. Furthermore, new internal kernel fields can be added to the wrapper struct without impacting the user space API. [ bp: Restore reverse x-mas tree order of function vars declarations. ] [Backport Changes] 1. The current upstream commit reworks the MCE infrastructure by introducing the wrapper struct mce_hw_err for the existing struct mce. However, the Opencloudos 6.6 kernel tree contains additional, non-upstream functions in arch/x86/kernel/cpu/mce/apei.c that were introduced to support Zhaoxin processors. To align with upstream, the following Opencloudos-specific functions were updated in the current backport to use the wrapper struct mce_hw_err: - zx_apei_mce_report_mem_error() - zx_apei_mce_report_pcie_error() - zx_apei_mce_report_zdi_error() 2.
Pointer usage in some non-upstream code was also updated to use 'm->' instead of 'm.' on the following lines: - Line 'if (m->status & MCI_STATUS_UC)' in do_machine_check() - Line 'm->cpuvendor == X86_VENDOR_ZHAOXIN' in machine_check_poll() The above changes are limited to adapting the new wrapper; they introduce no functional behavior changes and are consistent with the upstream changes. Suggested-by: Borislav Petkov (AMD) Signed-off-by: Avadhut Naik Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Qiuxu Zhuo Link: https://lore.kernel.org/r/20241022194158.110073-2-avadhut.naik@amd.com Signed-off-by: Malathi Signed-off-by: PvsNarasimha --- arch/x86/include/asm/mce.h | 14 +- arch/x86/kernel/cpu/mce/amd.c | 27 ++-- arch/x86/kernel/cpu/mce/apei.c | 174 +++++++++++++----------- arch/x86/kernel/cpu/mce/core.c | 207 ++++++++++++++++------------- arch/x86/kernel/cpu/mce/genpool.c | 18 +-- arch/x86/kernel/cpu/mce/inject.c | 6 +- arch/x86/kernel/cpu/mce/internal.h | 4 +- include/trace/events/mce.h | 42 +++--- 8 files changed, 271 insertions(+), 221 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 75af8b41800f..2796cb89eb68 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -187,6 +187,16 @@ enum mce_notifier_prios { MCE_PRIO_HIGHEST = MCE_PRIO_CEC }; +/** + * struct mce_hw_err - Hardware Error Record. + * @m: Machine Check record. + */ +struct mce_hw_err { + struct mce m; +}; + +#define to_mce_hw_err(mce) container_of(mce, struct mce_hw_err, m) + struct notifier_block; extern void mce_register_decode_chain(struct notifier_block *nb); extern void mce_unregister_decode_chain(struct notifier_block *nb); @@ -223,8 +233,8 @@ static inline int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id) { return -EINVAL; } #endif -void mce_prep_record(struct mce *m); -void mce_log(struct mce *m); +void mce_prep_record(struct mce_hw_err *err); +void mce_log(struct mce_hw_err *err); DECLARE_PER_CPU(struct device *, mce_device); /* Maximum number of MCA banks per CPU.
*/ diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 06515618ff24..8c016d365964 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -713,29 +713,30 @@ bool amd_mce_is_memory_error(struct mce *m) static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc) { - struct mce m; + struct mce_hw_err err; + struct mce *m = &err.m; - mce_prep_record(&m); + mce_prep_record(&err); - m.status = status; - m.misc = misc; - m.bank = bank; - m.tsc = rdtsc(); + m->status = status; + m->misc = misc; + m->bank = bank; + m->tsc = rdtsc(); - if (m.status & MCI_STATUS_ADDRV) { - m.addr = addr; + if (m->status & MCI_STATUS_ADDRV) { + m->addr = addr; - smca_extract_err_addr(&m); + smca_extract_err_addr(m); } if (mce_flags.smca) { - rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m.ipid); + rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m->ipid); - if (m.status & MCI_STATUS_SYNDV) - rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m.synd); + if (m->status & MCI_STATUS_SYNDV) + rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m->synd); } - mce_log(&m); + mce_log(&err); } DEFINE_IDTENTRY_SYSVEC(sysvec_deferred_error) diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c index 420fd3a7d046..d1f77a50e994 100644 --- a/arch/x86/kernel/cpu/mce/apei.c +++ b/arch/x86/kernel/cpu/mce/apei.c @@ -28,7 +28,8 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) { - struct mce m; + struct mce_hw_err err; + struct mce *m; int lsb; if (!(mem_err->validation_bits & CPER_MEM_VALID_PA)) @@ -44,28 +45,31 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) else lsb = PAGE_SHIFT; - mce_prep_record(&m); - m.bank = -1; + mce_prep_record(&err); + m = &err.m; + m->bank = -1; /* Fake a memory read error with unknown channel */ - m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | MCI_STATUS_MISCV | 0x9f; - m.misc = (MCI_MISC_ADDR_PHYS << 6) | lsb; + m->status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | MCI_STATUS_MISCV | 0x9f; + m->misc = (MCI_MISC_ADDR_PHYS << 6) | lsb; if (severity >= GHES_SEV_RECOVERABLE) - m.status |= MCI_STATUS_UC; + m->status |= MCI_STATUS_UC; if (severity >= GHES_SEV_PANIC) { - m.status |= MCI_STATUS_PCC; - m.tsc = rdtsc(); + m->status |= MCI_STATUS_PCC; + m->tsc = rdtsc(); } - m.addr = mem_err->physical_addr; - mce_log(&m); + m->addr = mem_err->physical_addr; + mce_log(&err); } EXPORT_SYMBOL_GPL(apei_mce_report_mem_error); void zx_apei_mce_report_mem_error(struct cper_sec_mem_err *mem_err) { - struct mce m; + struct mce_hw_err err; + struct mce *m; + int apei_error = 0; if (boot_cpu_data.x86 != 7 || boot_cpu_data.x86_model != 91) @@ -74,159 +78,166 @@ void zx_apei_mce_report_mem_error(struct cper_sec_mem_err *mem_err) if (!(mem_err->validation_bits & CPER_MEM_VALID_PA)) return; - mce_prep_record(&m); - m.misc = 0; - m.misc = mem_err->module; - m.addr = mem_err->physical_addr; + mce_prep_record(&err); + m = &err.m; + m->misc = 0; + m->misc = mem_err->module; + m->addr = mem_err->physical_addr; if (mem_err->card == 0) - m.bank = 9; + m->bank = 9; else - m.bank = 10; + m->bank = 10; switch (mem_err->error_type) { case 2: - m.status = 0x9c20004000010080; + m->status = 0x9c20004000010080; break; case 3: - m.status = 0xbe40000000020090; - apei_error = apei_write_mce(&m); + m->status = 0xbe40000000020090; + apei_error = apei_write_mce(m); break; case 8: if (mem_err->requestor_id == 2) { - m.status = 0x98200040000400b0; + m->status = 0x98200040000400b0; } else if (mem_err->requestor_id == 3) 
{ - m.status = 0xba400000000600a0; - apei_error = apei_write_mce(&m); + m->status = 0xba400000000600a0; + apei_error = apei_write_mce(m); } else if (mem_err->requestor_id == 4) { - m.status = 0x98200100000300b0; + m->status = 0x98200100000300b0; } else if (mem_err->requestor_id == 5) { - m.status = 0xba000000000500b0; - apei_error = apei_write_mce(&m); + m->status = 0xba000000000500b0; + apei_error = apei_write_mce(m); } else { pr_info("Undefined Parity error\n"); } break; case 10: if (mem_err->requestor_id == 6) { - m.status = 0xba400000000700a0; - apei_error = apei_write_mce(&m); + m->status = 0xba400000000700a0; + apei_error = apei_write_mce(m); } else if (mem_err->requestor_id == 7) { - m.status = 0xba000000000800b0; - apei_error = apei_write_mce(&m); + m->status = 0xba000000000800b0; + apei_error = apei_write_mce(m); } else { pr_info("Undefined dvad error\n"); } break; case 13: - m.status = 0x9c200040000100c0; + m->status = 0x9c200040000100c0; break; case 14: - m.status = 0xbd000000000200c0; - apei_error = apei_write_mce(&m); + m->status = 0xbd000000000200c0; + apei_error = apei_write_mce(m); break; } - mce_log(&m); + mce_log(&err); } EXPORT_SYMBOL_GPL(zx_apei_mce_report_mem_error); void zx_apei_mce_report_pcie_error(int severity, struct cper_sec_pcie *pcie_err) { - struct mce m; + struct mce_hw_err err; + struct mce *m; + int apei_error = 0; if (boot_cpu_data.x86 != 7 || boot_cpu_data.x86_model != 91) return; - mce_prep_record(&m); - m.addr = 0; - m.misc = 0; - m.misc |= (u64)pcie_err->device_id.segment << 32; - m.misc |= pcie_err->device_id.bus << 24; - m.misc |= pcie_err->device_id.device << 19; - m.misc |= pcie_err->device_id.function << 16; - m.bank = 6; + mce_prep_record(&err); + m = &err.m; + m->addr = 0; + m->misc = 0; + m->misc |= (u64)pcie_err->device_id.segment << 32; + m->misc |= pcie_err->device_id.bus << 24; + m->misc |= pcie_err->device_id.device << 19; + m->misc |= pcie_err->device_id.function << 16; + m->bank = 6; switch (severity) { case 1: - m.status = 0x9820004000020e0b; + m->status = 0x9820004000020e0b; break; case 2: - m.status = 0xba20000000010e0b; + m->status = 0xba20000000010e0b; break; case 3: - m.status = 0xbd20000000000e0b; - apei_error = apei_write_mce(&m); + m->status = 0xbd20000000000e0b; + apei_error = apei_write_mce(m); break; default: pr_info("Undefine pcie error\n"); break; } - mce_log(&m); + mce_log(&err); } EXPORT_SYMBOL_GPL(zx_apei_mce_report_pcie_error); void zx_apei_mce_report_zdi_error(struct cper_sec_proc_generic *zdi_err) { - struct mce m; + struct mce_hw_err err; + struct mce *m; + int apei_error = 0; if (boot_cpu_data.x86 != 7 || boot_cpu_data.x86_model != 91) return; - mce_prep_record(&m); - m.misc = 0; - m.misc |= (zdi_err->requestor_id & 0xff) << 19; - m.misc |= ((zdi_err->requestor_id & 0xff00) >> 8) >> 24; - m.bank = 5; + mce_prep_record(&err); + m = &err.m; + m->misc = 0; + m->misc |= (zdi_err->requestor_id & 0xff) << 19; + m->misc |= ((zdi_err->requestor_id & 0xff00) >> 8) >> 24; + m->bank = 5; switch (zdi_err->responder_id) { case 2: - m.status = 0xba00000000040e0f; - apei_error = apei_write_mce(&m); + m->status = 0xba00000000040e0f; + apei_error = apei_write_mce(m); break; case 3: - m.status = 0xba00000000030e0f; - apei_error = apei_write_mce(&m); + m->status = 0xba00000000030e0f; + apei_error = apei_write_mce(m); break; case 4: - m.status = 0xba00000000020e0f; - apei_error = apei_write_mce(&m); + m->status = 0xba00000000020e0f; + apei_error = apei_write_mce(m); break; case 5: - m.status = 0xba00000000010e0f; - apei_error = 
apei_write_mce(&m); + m->status = 0xba00000000010e0f; + apei_error = apei_write_mce(m); break; case 6: - m.status = 0x9820004000090e0f; + m->status = 0x9820004000090e0f; break; case 7: - m.status = 0x9820004000080e0f; + m->status = 0x9820004000080e0f; break; case 8: - m.status = 0x9820004000070e0f; + m->status = 0x9820004000070e0f; break; case 9: - m.status = 0x9820004000060e0f; + m->status = 0x9820004000060e0f; break; case 10: - m.status = 0x9820004000050e0f; + m->status = 0x9820004000050e0f; break; case 11: case 12: case 13: case 14: case 15: - m.status = 0x98200040000b0e0f; + m->status = 0x98200040000b0e0f; break; case 16: case 17: case 18: - m.status = 0x98200040000c0e0f; + m->status = 0x98200040000c0e0f; break; default: pr_info("Undefined ZDI Error\n"); break; } - mce_log(&m); + mce_log(&err); } EXPORT_SYMBOL_GPL(zx_apei_mce_report_zdi_error); @@ -234,8 +245,9 @@ int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id) { const u64 *i_mce = ((const u64 *) (ctx_info + 1)); bool apicid_found = false; + struct mce_hw_err err; unsigned int cpu; - struct mce m; + struct mce *m; if (!boot_cpu_has(X86_FEATURE_SMCA)) return -EINVAL; @@ -275,18 +287,20 @@ int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id) if (!apicid_found) return -EINVAL; - mce_prep_record_common(&m); - mce_prep_record_per_cpu(cpu, &m); + m = &err.m; + memset(&err, 0, sizeof(struct mce_hw_err)); + mce_prep_record_common(m); + mce_prep_record_per_cpu(cpu, m); - m.bank = (ctx_info->msr_addr >> 4) & 0xFF; - m.status = *i_mce; - m.addr = *(i_mce + 1); - m.misc = *(i_mce + 2); + m->bank = (ctx_info->msr_addr >> 4) & 0xFF; + m->status = *i_mce; + m->addr = *(i_mce + 1); + m->misc = *(i_mce + 2); /* Skipping MCA_CONFIG */ - m.ipid = *(i_mce + 4); - m.synd = *(i_mce + 5); + m->ipid = *(i_mce + 4); + m->synd = *(i_mce + 5); - mce_log(&m); + mce_log(&err); return 0; } diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 6dcb9ebd69cd..411d9cc83c87 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -86,7 +86,7 @@ struct mca_config mca_cfg __read_mostly = { .monarch_timeout = -1 }; -static DEFINE_PER_CPU(struct mce, mces_seen); +static DEFINE_PER_CPU(struct mce_hw_err, hw_errs_seen); static unsigned long mce_need_notify; /* @@ -117,8 +117,6 @@ BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain); void mce_prep_record_common(struct mce *m) { - memset(m, 0, sizeof(struct mce)); - m->cpuid = cpuid_eax(1); m->cpuvendor = boot_cpu_data.x86_vendor; m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP); @@ -136,9 +134,12 @@ void mce_prep_record_per_cpu(unsigned int cpu, struct mce *m) m->socketid = topology_physical_package_id(cpu); } -/* Do initial initialization of a struct mce */ -void mce_prep_record(struct mce *m) +/* Do initial initialization of struct mce_hw_err */ +void mce_prep_record(struct mce_hw_err *err) { + struct mce *m = &err->m; + + memset(err, 0, sizeof(struct mce_hw_err)); mce_prep_record_common(m); mce_prep_record_per_cpu(smp_processor_id(), m); } @@ -146,9 +147,9 @@ void mce_prep_record(struct mce *m) DEFINE_PER_CPU(struct mce, injectm); EXPORT_PER_CPU_SYMBOL_GPL(injectm); -void mce_log(struct mce *m) +void mce_log(struct mce_hw_err *err) { - if (!mce_gen_pool_add(m)) + if (!mce_gen_pool_add(err)) irq_work_queue(&mce_irq_work); } EXPORT_SYMBOL_GPL(mce_log); @@ -169,8 +170,10 @@ void mce_unregister_decode_chain(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(mce_unregister_decode_chain); -static void __print_mce(struct mce *m) +static 
void __print_mce(struct mce_hw_err *err) { + struct mce *m = &err->m; + pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n", m->extcpu, (m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""), @@ -212,9 +215,11 @@ static void __print_mce(struct mce *m) m->microcode); } -static void print_mce(struct mce *m) +static void print_mce(struct mce_hw_err *err) { - __print_mce(m); + struct mce *m = &err->m; + + __print_mce(err); if (m->cpuvendor != X86_VENDOR_AMD && m->cpuvendor != X86_VENDOR_HYGON) pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n"); @@ -241,7 +246,7 @@ static void wait_for_panic(void) panic("Panicing machine check CPU died"); } -static noinstr void mce_panic(const char *msg, struct mce *final, char *exp) +static noinstr void mce_panic(const char *msg, struct mce_hw_err *final, char *exp) { struct llist_node *pending; struct mce_evt_llist *l; @@ -272,20 +277,22 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp) pending = mce_gen_pool_prepare_records(); /* First print corrected ones that are still unlogged */ llist_for_each_entry(l, pending, llnode) { - struct mce *m = &l->mce; + struct mce_hw_err *err = &l->err; + struct mce *m = &err->m; if (!(m->status & MCI_STATUS_UC)) { - print_mce(m); + print_mce(err); if (!apei_err) apei_err = apei_write_mce(m); } } /* Now print uncorrected but with the final one last */ llist_for_each_entry(l, pending, llnode) { - struct mce *m = &l->mce; + struct mce_hw_err *err = &l->err; + struct mce *m = &err->m; if (!(m->status & MCI_STATUS_UC)) continue; - if (!final || mce_cmp(m, final)) { - print_mce(m); + if (!final || mce_cmp(m, &final->m)) { + print_mce(err); if (!apei_err) apei_err = apei_write_mce(m); } @@ -293,7 +300,7 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp) if (final) { print_mce(final); if (!apei_err) - apei_err = apei_write_mce(final); + apei_err = apei_write_mce(&final->m); } if (exp) pr_emerg(HW_ERR "Machine check: %s\n", exp); @@ -308,8 +315,8 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp) * panic. */ if (kexec_crash_loaded()) { - if (final && (final->status & MCI_STATUS_ADDRV)) { - p = pfn_to_online_page(final->addr >> PAGE_SHIFT); + if (final && (final->m.status & MCI_STATUS_ADDRV)) { + p = pfn_to_online_page(final->m.addr >> PAGE_SHIFT); if (p) SetPageHWPoison(p); } @@ -429,16 +436,18 @@ static noinstr void mce_wrmsrl(u32 msr, u64 v) * check into our "mce" struct so that we can use it later to assess * the severity of the problem as we read per-bank specific details. */ -static noinstr void mce_gather_info(struct mce *m, struct pt_regs *regs) +static noinstr void mce_gather_info(struct mce_hw_err *err, struct pt_regs *regs) { + struct mce *m; /* * Enable instrumentation around mce_prep_record() which calls external * facilities. 
*/ instrumentation_begin(); - mce_prep_record(m); + mce_prep_record(err); instrumentation_end(); + m = &err->m; m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); if (regs) { /* @@ -570,13 +579,13 @@ EXPORT_SYMBOL_GPL(mce_is_correctable); static int mce_early_notifier(struct notifier_block *nb, unsigned long val, void *data) { - struct mce *m = (struct mce *)data; + struct mce_hw_err *err = to_mce_hw_err(data); - if (!m) + if (!err) return NOTIFY_DONE; /* Emit the trace record: */ - trace_mce_record(m); + trace_mce_record(err); set_bit(0, &mce_need_notify); @@ -620,13 +629,13 @@ static struct notifier_block mce_uc_nb = { static int mce_default_notifier(struct notifier_block *nb, unsigned long val, void *data) { - struct mce *m = (struct mce *)data; + struct mce_hw_err *err = to_mce_hw_err(data); - if (!m) + if (!err) return NOTIFY_DONE; - if (mca_cfg.print_all || !m->kflags) - __print_mce(m); + if (mca_cfg.print_all || !(err->m.kflags)) + __print_mce(err); return NOTIFY_DONE; } @@ -640,8 +649,10 @@ static struct notifier_block mce_default_nb = { /* * Read ADDR and MISC registers. */ -static noinstr void mce_read_aux(struct mce *m, int i) +static noinstr void mce_read_aux(struct mce_hw_err *err, int i) { + struct mce *m = &err->m; + if (m->status & MCI_STATUS_MISCV) m->misc = mce_rdmsrl(mca_msr_reg(i, MCA_MISC)); @@ -689,26 +700,28 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) { struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); bool error_seen = false; - struct mce m; + struct mce_hw_err err; + struct mce *m; int i; this_cpu_inc(mce_poll_count); - mce_gather_info(&m, NULL); + mce_gather_info(&err, NULL); + m = &err.m; if (flags & MCP_TIMESTAMP) - m.tsc = rdtsc(); + m->tsc = rdtsc(); for (i = 0; i < this_cpu_read(mce_num_banks); i++) { if (!mce_banks[i].ctl || !test_bit(i, *b)) continue; - m.misc = 0; - m.addr = 0; - m.bank = i; + m->misc = 0; + m->addr = 0; + m->bank = i; barrier(); - m.status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS)); + m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS)); /* * Update storm tracking here, before checking for the @@ -718,17 +731,17 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) * storm status. */ if (!mca_cfg.cmci_disabled) - mce_track_storm(&m); + mce_track_storm(m); /* If this entry is not valid, ignore it */ - if (!(m.status & MCI_STATUS_VAL)) + if (!(m->status & MCI_STATUS_VAL)) continue; /* * If we are logging everything (at CPU online) or this * is a corrected error, then we must log it. */ - if ((flags & MCP_UC) || !(m.status & MCI_STATUS_UC)) + if ((flags & MCP_UC) || !(m->status & MCI_STATUS_UC)) goto log_it; /* @@ -738,20 +751,20 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) * everything else. 
*/ if (!mca_cfg.ser) { - if (m.status & MCI_STATUS_UC) + if (m->status & MCI_STATUS_UC) continue; goto log_it; } /* Log "not enabled" (speculative) errors */ - if (!(m.status & MCI_STATUS_EN)) + if (!(m->status & MCI_STATUS_EN)) goto log_it; /* * Log UCNA (SDM: 15.6.3 "UCR Error Classification") * UC == 1 && PCC == 0 && S == 0 */ - if (!(m.status & MCI_STATUS_PCC) && !(m.status & MCI_STATUS_S)) + if (!(m->status & MCI_STATUS_PCC) && !(m->status & MCI_STATUS_S)) goto log_it; /* @@ -767,20 +780,20 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) if (flags & MCP_DONTLOG) goto clear_it; - mce_read_aux(&m, i); - m.severity = mce_severity(&m, NULL, NULL, false); + mce_read_aux(&err, i); + m->severity = mce_severity(m, NULL, NULL, false); /* * Don't get the IP here because it's unlikely to * have anything to do with the actual error location. */ - if (mca_cfg.dont_log_ce && !mce_usable_address(&m)) + if (mca_cfg.dont_log_ce && !mce_usable_address(m)) goto clear_it; if (flags & MCP_QUEUE_LOG) - mce_gen_pool_add(&m); + mce_gen_pool_add(&err); else - mce_log(&m); + mce_log(&err); clear_it: /* @@ -906,9 +919,10 @@ static __always_inline void quirk_zen_ifu(int bank, struct mce *m, struct pt_reg * Do a quick check if any of the events requires a panic. * This decides if we keep the events around or clear them. */ -static __always_inline int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, +static __always_inline int mce_no_way_out(struct mce_hw_err *err, char **msg, unsigned long *validp, struct pt_regs *regs) { + struct mce *m = &err->m; char *tmp = *msg; int i; @@ -926,7 +940,7 @@ static __always_inline int mce_no_way_out(struct mce *m, char **msg, unsigned lo m->bank = i; if (mce_severity(m, regs, &tmp, true) >= MCE_PANIC_SEVERITY) { - mce_read_aux(m, i); + mce_read_aux(err, i); *msg = tmp; return 1; } @@ -1017,10 +1031,11 @@ static noinstr int mce_timed_out(u64 *t, const char *msg) */ static void mce_reign(void) { - int cpu; + struct mce_hw_err *err = NULL; struct mce *m = NULL; int global_worst = 0; char *msg = NULL; + int cpu; /* * This CPU is the Monarch and the other CPUs have run @@ -1028,11 +1043,13 @@ static void mce_reign(void) * Grade the severity of the errors of all the CPUs. */ for_each_possible_cpu(cpu) { - struct mce *mtmp = &per_cpu(mces_seen, cpu); + struct mce_hw_err *etmp = &per_cpu(hw_errs_seen, cpu); + struct mce *mtmp = &etmp->m; if (mtmp->severity > global_worst) { global_worst = mtmp->severity; - m = &per_cpu(mces_seen, cpu); + err = &per_cpu(hw_errs_seen, cpu); + m = &err->m; } } @@ -1044,7 +1061,7 @@ static void mce_reign(void) if (m && global_worst >= MCE_PANIC_SEVERITY) { /* call mce_severity() to get "msg" for panic */ mce_severity(m, NULL, &msg, true); - mce_panic("Fatal machine check", m, msg); + mce_panic("Fatal machine check", err, msg); } /* @@ -1061,11 +1078,11 @@ static void mce_reign(void) mce_panic("Fatal machine check from unknown source", NULL, NULL); /* - * Now clear all the mces_seen so that they don't reappear on + * Now clear all the hw_errs_seen so that they don't reappear on * the next mce. 
*/ for_each_possible_cpu(cpu) - memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce)); + memset(&per_cpu(hw_errs_seen, cpu), 0, sizeof(struct mce_hw_err)); } static atomic_t global_nwo; @@ -1270,13 +1287,14 @@ static noinstr bool mce_check_crashing_cpu(void) } static __always_inline int -__mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final, - unsigned long *toclear, unsigned long *valid_banks, int no_way_out, - int *worst) +__mc_scan_banks(struct mce_hw_err *err, struct pt_regs *regs, + struct mce_hw_err *final, unsigned long *toclear, + unsigned long *valid_banks, int no_way_out, int *worst) { struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); struct mca_config *cfg = &mca_cfg; int severity, i, taint = 0; + struct mce *m = &err->m; for (i = 0; i < this_cpu_read(mce_num_banks); i++) { arch___clear_bit(i, toclear); @@ -1321,7 +1339,7 @@ __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final, if (severity == MCE_NO_SEVERITY) continue; - mce_read_aux(m, i); + mce_read_aux(err, i); /* assuming valid severity level != 0 */ m->severity = severity; @@ -1331,17 +1349,17 @@ __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final, * done in #MC context, where instrumentation is disabled. */ instrumentation_begin(); - mce_log(m); + mce_log(err); instrumentation_end(); if (severity > *worst) { - *final = *m; + *final = *err; *worst = severity; } } /* mce_clear_state will clear *final, save locally for use later */ - *m = *final; + *err = *final; return taint; } @@ -1401,9 +1419,10 @@ static void kill_me_never(struct callback_head *cb) set_mce_nospec(pfn); } -static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *)) +static void queue_task_work(struct mce_hw_err *err, char *msg, void (*func)(struct callback_head *)) { int count = ++current->mce_count; + struct mce *m = &err->m; /* First call, save all the details */ if (count == 1) { @@ -1416,11 +1435,12 @@ static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callba /* Ten is likely overkill. 
Don't expect more than two faults before task_work() */ if (count > 10) - mce_panic("Too many consecutive machine checks while accessing user data", m, msg); + mce_panic("Too many consecutive machine checks while accessing user data", + err, msg); /* Second or later call, make sure page address matches the one from first call */ if (count > 1 && (current->mce_addr >> PAGE_SHIFT) != (m->addr >> PAGE_SHIFT)) - mce_panic("Consecutive machine checks to different user pages", m, msg); + mce_panic("Consecutive machine checks to different user pages", err, msg); /* Do not call task_work_add() more than once */ if (count > 1) @@ -1469,8 +1489,10 @@ noinstr void do_machine_check(struct pt_regs *regs) int worst = 0, order, no_way_out, kill_current_task, lmce, taint = 0; DECLARE_BITMAP(valid_banks, MAX_NR_BANKS) = { 0 }; DECLARE_BITMAP(toclear, MAX_NR_BANKS) = { 0 }; - struct mce m, *final; + struct mce_hw_err *final; + struct mce_hw_err err; char *msg = NULL; + struct mce *m; if (unlikely(mce_flags.p5)) return pentium_machine_check(regs); @@ -1508,13 +1530,14 @@ noinstr void do_machine_check(struct pt_regs *regs) this_cpu_inc(mce_exception_count); - mce_gather_info(&m, regs); - m.tsc = rdtsc(); + mce_gather_info(&err, regs); + m = &err.m; + m->tsc = rdtsc(); - final = this_cpu_ptr(&mces_seen); - *final = m; + final = this_cpu_ptr(&hw_errs_seen); + *final = err; - no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs); + no_way_out = mce_no_way_out(&err, &msg, valid_banks, regs); barrier(); @@ -1523,16 +1546,16 @@ noinstr void do_machine_check(struct pt_regs *regs) * Assume the worst for now, but if we find the * severity is MCE_AR_SEVERITY we have other options. */ - if (!(m.mcgstatus & MCG_STATUS_RIPV)) + if (!(m->mcgstatus & MCG_STATUS_RIPV)) kill_current_task = 1; /* * Check if this MCE is signaled to only this logical processor, * on Intel, Zhaoxin only. */ - if (m.cpuvendor == X86_VENDOR_INTEL || - m.cpuvendor == X86_VENDOR_CENTAUR || - m.cpuvendor == X86_VENDOR_ZHAOXIN) - lmce = m.mcgstatus & MCG_STATUS_LMCES; + if (m->cpuvendor == X86_VENDOR_INTEL || + m->cpuvendor == X86_VENDOR_CENTAUR || + m->cpuvendor == X86_VENDOR_ZHAOXIN) + lmce = m->mcgstatus & MCG_STATUS_LMCES; /* * Local machine check may already know that we have to panic. @@ -1543,12 +1566,12 @@ noinstr void do_machine_check(struct pt_regs *regs) */ if (lmce) { if (no_way_out) - mce_panic("Fatal local machine check", &m, msg); + mce_panic("Fatal local machine check", &err, msg); } else { order = mce_start(&no_way_out); } - taint = __mc_scan_banks(&m, regs, final, toclear, valid_banks, no_way_out, &worst); + taint = __mc_scan_banks(&err, regs, final, toclear, valid_banks, no_way_out, &worst); if (!no_way_out) mce_clear_state(toclear); @@ -1563,7 +1586,7 @@ noinstr void do_machine_check(struct pt_regs *regs) no_way_out = worst >= MCE_PANIC_SEVERITY; if (no_way_out) - mce_panic("Fatal machine check on current CPU", &m, msg); + mce_panic("Fatal machine check on current CPU", &err, msg); } } else { /* @@ -1575,8 +1598,8 @@ noinstr void do_machine_check(struct pt_regs *regs) * make sure we have the right "msg". 
*/ if (worst >= MCE_PANIC_SEVERITY) { - mce_severity(&m, regs, &msg, true); - mce_panic("Local fatal machine check!", &m, msg); + mce_severity(m, regs, &msg, true); + mce_panic("Local fatal machine check!", &err, msg); } } @@ -1594,16 +1617,16 @@ noinstr void do_machine_check(struct pt_regs *regs) goto out; /* Fault was in user mode and we need to take some action */ - if ((m.cs & 3) == 3) { + if ((m->cs & 3) == 3) { /* If this triggers there is no way to recover. Die hard. */ BUG_ON(!on_thread_stack() || !user_mode(regs)); - if (!mce_usable_address(&m)) - queue_task_work(&m, msg, kill_me_now); + if (!mce_usable_address(m)) + queue_task_work(&err, msg, kill_me_now); else - queue_task_work(&m, msg, kill_me_maybe); + queue_task_work(&err, msg, kill_me_maybe); - } else if (m.mcgstatus & MCG_STATUS_SEAM_NR) { + } else if (m->mcgstatus & MCG_STATUS_SEAM_NR) { /* * Saved RIP on stack makes it look like the machine check * was taken in the kernel on the instruction following @@ -1615,8 +1638,8 @@ noinstr void do_machine_check(struct pt_regs *regs) * not occur there. Mark the page as poisoned so it won't * be added to free list when the guest is terminated. */ - if (mce_usable_address(&m)) { - struct page *p = pfn_to_online_page(m.addr >> PAGE_SHIFT); + if (mce_usable_address(m)) { + struct page *p = pfn_to_online_page(m->addr >> PAGE_SHIFT); if (p) SetPageHWPoison(p); @@ -1631,13 +1654,13 @@ noinstr void do_machine_check(struct pt_regs *regs) * corresponding exception handler which would do that is the * proper one. */ - if (m.kflags & MCE_IN_KERNEL_RECOV) { + if (m->kflags & MCE_IN_KERNEL_RECOV) { if (!fixup_exception(regs, X86_TRAP_MC, 0, 0)) - mce_panic("Failed kernel mode recovery", &m, msg); + mce_panic("Failed kernel mode recovery", &err, msg); } - if (m.kflags & MCE_IN_KERNEL_COPYIN) - queue_task_work(&m, msg, kill_me_never); + if (m->kflags & MCE_IN_KERNEL_COPYIN) + queue_task_work(&err, msg, kill_me_never); } out: diff --git a/arch/x86/kernel/cpu/mce/genpool.c b/arch/x86/kernel/cpu/mce/genpool.c index fbe8b61c3413..159beb39f12b 100644 --- a/arch/x86/kernel/cpu/mce/genpool.c +++ b/arch/x86/kernel/cpu/mce/genpool.c @@ -31,15 +31,15 @@ static char gen_pool_buf[MCE_POOLSZ]; */ static bool is_duplicate_mce_record(struct mce_evt_llist *t, struct mce_evt_llist *l) { + struct mce_hw_err *err1, *err2; struct mce_evt_llist *node; - struct mce *m1, *m2; - m1 = &t->mce; + err1 = &t->err; llist_for_each_entry(node, &l->llnode, llnode) { - m2 = &node->mce; + err2 = &node->err; - if (!mce_cmp(m1, m2)) + if (!mce_cmp(&err1->m, &err2->m)) return true; } return false; @@ -73,8 +73,8 @@ struct llist_node *mce_gen_pool_prepare_records(void) void mce_gen_pool_process(struct work_struct *__unused) { - struct llist_node *head; struct mce_evt_llist *node, *tmp; + struct llist_node *head; struct mce *mce; head = llist_del_all(&mce_event_llist); @@ -83,7 +83,7 @@ void mce_gen_pool_process(struct work_struct *__unused) head = llist_reverse_order(head); llist_for_each_entry_safe(node, tmp, head, llnode) { - mce = &node->mce; + mce = &node->err.m; blocking_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); gen_pool_free(mce_evt_pool, (unsigned long)node, sizeof(*node)); } @@ -94,11 +94,11 @@ bool mce_gen_pool_empty(void) return llist_empty(&mce_event_llist); } -int mce_gen_pool_add(struct mce *mce) +int mce_gen_pool_add(struct mce_hw_err *err) { struct mce_evt_llist *node; - if (filter_mce(mce)) + if (filter_mce(&err->m)) return -EINVAL; if (!mce_evt_pool) @@ -110,7 +110,7 @@ int mce_gen_pool_add(struct mce *mce) 
return -ENOMEM; } - memcpy(&node->mce, mce, sizeof(*mce)); + memcpy(&node->err, err, sizeof(*err)); llist_add(&node->llnode, &mce_event_llist); return 0; diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c index 94953d749475..c374e5aeba5e 100644 --- a/arch/x86/kernel/cpu/mce/inject.c +++ b/arch/x86/kernel/cpu/mce/inject.c @@ -498,8 +498,9 @@ static void prepare_msrs(void *info) static void do_inject(void) { - u64 mcg_status = 0; unsigned int cpu = i_mce.extcpu; + struct mce_hw_err err; + u64 mcg_status = 0; u8 b = i_mce.bank; i_mce.tsc = rdtsc_ordered(); @@ -513,7 +514,8 @@ static void do_inject(void) i_mce.status |= MCI_STATUS_SYNDV; if (inj_type == SW_INJ) { - mce_log(&i_mce); + err.m = i_mce; + mce_log(&err); return; } diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index 8f82eeaa616d..e96f29e7a71e 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -26,12 +26,12 @@ extern struct blocking_notifier_head x86_mce_decoder_chain; struct mce_evt_llist { struct llist_node llnode; - struct mce mce; + struct mce_hw_err err; }; void mce_gen_pool_process(struct work_struct *__unused); bool mce_gen_pool_empty(void); -int mce_gen_pool_add(struct mce *mce); +int mce_gen_pool_add(struct mce_hw_err *err); int mce_gen_pool_init(void); struct llist_node *mce_gen_pool_prepare_records(void); diff --git a/include/trace/events/mce.h b/include/trace/events/mce.h index f0f7b3cb2041..65aba1afcd07 100644 --- a/include/trace/events/mce.h +++ b/include/trace/events/mce.h @@ -19,9 +19,9 @@ TRACE_EVENT(mce_record, - TP_PROTO(struct mce *m), + TP_PROTO(struct mce_hw_err *err), - TP_ARGS(m), + TP_ARGS(err), TP_STRUCT__entry( __field( u64, mcgcap ) @@ -46,25 +46,25 @@ TRACE_EVENT(mce_record, ), TP_fast_assign( - __entry->mcgcap = m->mcgcap; - __entry->mcgstatus = m->mcgstatus; - __entry->status = m->status; - __entry->addr = m->addr; - __entry->misc = m->misc; - __entry->synd = m->synd; - __entry->ipid = m->ipid; - __entry->ip = m->ip; - __entry->tsc = m->tsc; - __entry->ppin = m->ppin; - __entry->walltime = m->time; - __entry->cpu = m->extcpu; - __entry->cpuid = m->cpuid; - __entry->apicid = m->apicid; - __entry->socketid = m->socketid; - __entry->cs = m->cs; - __entry->bank = m->bank; - __entry->cpuvendor = m->cpuvendor; - __entry->microcode = m->microcode; + __entry->mcgcap = err->m.mcgcap; + __entry->mcgstatus = err->m.mcgstatus; + __entry->status = err->m.status; + __entry->addr = err->m.addr; + __entry->misc = err->m.misc; + __entry->synd = err->m.synd; + __entry->ipid = err->m.ipid; + __entry->ip = err->m.ip; + __entry->tsc = err->m.tsc; + __entry->ppin = err->m.ppin; + __entry->walltime = err->m.time; + __entry->cpu = err->m.extcpu; + __entry->cpuid = err->m.cpuid; + __entry->apicid = err->m.apicid; + __entry->socketid = err->m.socketid; + __entry->cs = err->m.cs; + __entry->bank = err->m.bank; + __entry->cpuvendor = err->m.cpuvendor; + __entry->microcode = err->m.microcode; ), TP_printk("CPU: %d, MCGc/s: %llx/%llx, MC%d: %016Lx, IPID: %016Lx, ADDR: %016Lx, MISC: %016Lx, SYND: %016Lx, RIP: %02x:<%016Lx>, TSC: %llx, PPIN: %llx, vendor: %u, CPUID: %x, time: %llu, socket: %u, APIC: %x, microcode: %x", -- Gitee From a0cf13a3fc9585dab8be3c6e482e3090b90622ee Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Thu, 7 Mar 2024 11:27:04 -0800 Subject: [PATCH 21/33] x86/mce: Dynamically size space for machine check records commit 108c6494bdf1dfeaefc0a506e2f471aa92fafdd6 upstream Systems with a large number of CPUs may 
generate a large number of machine check records when things go seriously wrong. But Linux has a fixed-size buffer that can only capture a few dozen errors. Allocate space based on the number of CPUs (with a minimum value based on the historical fixed buffer that could store 80 records). [ bp: Rename local var from tmpp to something more telling: gpool. ] Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Sohil Mehta Reviewed-by: Avadhut Naik Link: https://lore.kernel.org/r/20240307192704.37213-1-tony.luck@intel.com Signed-off-by: Malathi Signed-off-by: PvsNarasimha --- arch/x86/kernel/cpu/mce/genpool.c | 40 +++++++++++++++++++------------ 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/genpool.c b/arch/x86/kernel/cpu/mce/genpool.c index 159beb39f12b..d0be6dda0c14 100644 --- a/arch/x86/kernel/cpu/mce/genpool.c +++ b/arch/x86/kernel/cpu/mce/genpool.c @@ -16,14 +16,14 @@ * used to save error information organized in a lock-less list. * * This memory pool is only to be used to save MCE records in MCE context. - * MCE events are rare, so a fixed size memory pool should be enough. Use - * 2 pages to save MCE events for now (~80 MCE records at most). + * MCE events are rare, so a fixed size memory pool should be enough. + * Allocate on a sliding scale based on number of CPUs. */ -#define MCE_POOLSZ (2 * PAGE_SIZE) +#define MCE_MIN_ENTRIES 80 +#define MCE_PER_CPU 2 static struct gen_pool *mce_evt_pool; static LLIST_HEAD(mce_event_llist); -static char gen_pool_buf[MCE_POOLSZ]; /* * Compare the record "t" with each of the records on list "l" to see if @@ -118,22 +118,32 @@ int mce_gen_pool_add(struct mce_hw_err *err) static int mce_gen_pool_create(void) { - struct gen_pool *tmpp; + int mce_numrecords, mce_poolsz, order; + struct gen_pool *gpool; int ret = -ENOMEM; - - tmpp = gen_pool_create(ilog2(sizeof(struct mce_evt_llist)), -1); - if (!tmpp) - goto out; - - ret = gen_pool_add(tmpp, (unsigned long)gen_pool_buf, MCE_POOLSZ, -1); + void *mce_pool; + + order = order_base_2(sizeof(struct mce_evt_llist)); + gpool = gen_pool_create(order, -1); + if (!gpool) + return ret; + + mce_numrecords = max(MCE_MIN_ENTRIES, num_possible_cpus() * MCE_PER_CPU); + mce_poolsz = mce_numrecords * (1 << order); + mce_pool = kmalloc(mce_poolsz, GFP_KERNEL); + if (!mce_pool) { + gen_pool_destroy(gpool); + return ret; + } + ret = gen_pool_add(gpool, (unsigned long)mce_pool, mce_poolsz, -1); if (ret) { - gen_pool_destroy(tmpp); - goto out; + gen_pool_destroy(gpool); + kfree(mce_pool); + return ret; } - mce_evt_pool = tmpp; + mce_evt_pool = gpool; -out: return ret; } -- Gitee From b7cd9438181e27cdb6942dadddf2e5ec91a1659a Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Thu, 12 Dec 2024 22:00:59 +0800 Subject: [PATCH 22/33] x86/mce: Make four functions return bool commit c46945c9cac8437a674edb9d8fbe71511fb4acee upstream Make those functions whose callers only care about success or failure return a boolean value for better readability. Also, update the call sites accordingly as the polarities of all the return values have been flipped. No functional changes. 
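One consequence worth spelling out: with 0-means-success replaced by true-means-success, every call site's test flips polarity. A before/after sketch of the calling convention, using mce_gen_pool_init() as in the hunks below:

    /* Before: a non-zero return signaled failure. */
    if (mce_gen_pool_init()) {
            mca_cfg.disabled = 1;
            return;
    }

    /* After: a false return signals failure, so the test is negated. */
    if (!mce_gen_pool_init()) {
            mca_cfg.disabled = 1;
            return;
    }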
Suggested-by: Thomas Gleixner Signed-off-by: Qiuxu Zhuo Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Sohil Mehta Reviewed-by: Yazen Ghannam Link: https://lore.kernel.org/r/20241212140103.66964-4-qiuxu.zhuo@intel.com Signed-off-by: Malathi Signed-off-by: PvsNarasimha --- arch/x86/kernel/cpu/mce/core.c | 12 ++++++------ arch/x86/kernel/cpu/mce/genpool.c | 29 ++++++++++++++--------------- arch/x86/kernel/cpu/mce/internal.h | 4 ++-- 3 files changed, 22 insertions(+), 23 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 411d9cc83c87..b456bd22f7ad 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -149,7 +149,7 @@ EXPORT_PER_CPU_SYMBOL_GPL(injectm); void mce_log(struct mce_hw_err *err) { - if (!mce_gen_pool_add(err)) + if (mce_gen_pool_add(err)) irq_work_queue(&mce_irq_work); } EXPORT_SYMBOL_GPL(mce_log); @@ -1903,14 +1903,14 @@ static void __mcheck_cpu_check_banks(void) } /* Add per CPU specific workarounds here */ -static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) +static bool __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) { struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); struct mca_config *cfg = &mca_cfg; if (c->x86_vendor == X86_VENDOR_UNKNOWN) { pr_info("unknown CPU type - not enabling MCE support\n"); - return -EOPNOTSUPP; + return false; } /* This should be disabled by the BIOS, but isn't always */ @@ -2006,7 +2006,7 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) if (cfg->bootlog != 0) cfg->panic_timeout = 30; - return 0; + return true; } static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) @@ -2243,12 +2243,12 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) __mcheck_cpu_cap_init(); - if (__mcheck_cpu_apply_quirks(c) < 0) { + if (!__mcheck_cpu_apply_quirks(c)) { mca_cfg.disabled = 1; return; } - if (mce_gen_pool_init()) { + if (!mce_gen_pool_init()) { mca_cfg.disabled = 1; pr_emerg("Couldn't allocate MCE records pool!\n"); return; diff --git a/arch/x86/kernel/cpu/mce/genpool.c b/arch/x86/kernel/cpu/mce/genpool.c index d0be6dda0c14..3ca9c007a666 100644 --- a/arch/x86/kernel/cpu/mce/genpool.c +++ b/arch/x86/kernel/cpu/mce/genpool.c @@ -94,64 +94,63 @@ bool mce_gen_pool_empty(void) return llist_empty(&mce_event_llist); } -int mce_gen_pool_add(struct mce_hw_err *err) +bool mce_gen_pool_add(struct mce_hw_err *err) { struct mce_evt_llist *node; if (filter_mce(&err->m)) - return -EINVAL; + return false; if (!mce_evt_pool) - return -EINVAL; + return false; node = (void *)gen_pool_alloc(mce_evt_pool, sizeof(*node)); if (!node) { pr_warn_ratelimited("MCE records pool full!\n"); - return -ENOMEM; + return false; } memcpy(&node->err, err, sizeof(*err)); llist_add(&node->llnode, &mce_event_llist); - return 0; + return true; } -static int mce_gen_pool_create(void) +static bool mce_gen_pool_create(void) { int mce_numrecords, mce_poolsz, order; struct gen_pool *gpool; - int ret = -ENOMEM; void *mce_pool; order = order_base_2(sizeof(struct mce_evt_llist)); gpool = gen_pool_create(order, -1); if (!gpool) - return ret; + return false; mce_numrecords = max(MCE_MIN_ENTRIES, num_possible_cpus() * MCE_PER_CPU); mce_poolsz = mce_numrecords * (1 << order); mce_pool = kmalloc(mce_poolsz, GFP_KERNEL); if (!mce_pool) { gen_pool_destroy(gpool); - return ret; + return false; } - ret = gen_pool_add(gpool, (unsigned long)mce_pool, mce_poolsz, -1); - if (ret) { + + if (gen_pool_add(gpool, (unsigned long)mce_pool, mce_poolsz, -1)) { gen_pool_destroy(gpool); kfree(mce_pool); - return ret; + 
return false; } mce_evt_pool = gpool; - return ret; + return true; } -int mce_gen_pool_init(void) +bool mce_gen_pool_init(void) { /* Just init mce_gen_pool once. */ if (mce_evt_pool) - return 0; + return true; return mce_gen_pool_create(); } diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index e96f29e7a71e..5baca5a58775 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -31,8 +31,8 @@ struct mce_evt_llist { void mce_gen_pool_process(struct work_struct *__unused); bool mce_gen_pool_empty(void); -int mce_gen_pool_add(struct mce_hw_err *err); -int mce_gen_pool_init(void); +bool mce_gen_pool_add(struct mce_hw_err *err); +bool mce_gen_pool_init(void); struct llist_node *mce_gen_pool_prepare_records(void); int mce_severity(struct mce *a, struct pt_regs *regs, char **msg, bool is_excp); -- Gitee From 1fc3a3669e18cef7644eb3d8db659940add0fa76 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 24 Apr 2024 11:15:11 -0700 Subject: [PATCH 23/33] x86/mce: Switch to new Intel CPU model defines commit 4a5f2dd162fd68f588784eb9b0a927e3b328736d upstream New CPU #defines encode vendor and family as well as model. [ bp: Squash *three* mce patches into one, fold in fix: https://lore.kernel.org/r/20240429022051.63360-1-tony.luck@intel.com ] Signed-off-by: Tony Luck Signed-off-by: Dave Hansen Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/all/20240424181511.41772-1-tony.luck%40intel.com Signed-off-by: Malathi Signed-off-by: PvsNarasimha --- arch/x86/kernel/cpu/mce/core.c | 6 +++--- arch/x86/kernel/cpu/mce/intel.c | 21 ++++++++++----------- arch/x86/kernel/cpu/mce/severity.c | 10 +++++----- 3 files changed, 18 insertions(+), 19 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index b456bd22f7ad..dbb204161e8e 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -46,7 +46,7 @@ #include #include -#include +#include #include #include #include @@ -1977,14 +1977,14 @@ static bool __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0) cfg->bootlog = 0; - if (c->x86 == 6 && c->x86_model == 45) + if (c->x86_vfm == INTEL_SANDYBRIDGE_X) mce_flags.snb_ifu_quirk = 1; /* * Skylake, Cascacde Lake and Cooper Lake require a quirk on * rep movs. 
*/ - if (c->x86 == 6 && c->x86_model == INTEL_FAM6_SKYLAKE_X) + if (c->x86_vfm == INTEL_SKYLAKE_X) mce_flags.skx_repmov_quirk = 1; } diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index c56203c65ef5..8599af1c5a39 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include #include @@ -351,10 +351,10 @@ static void intel_imc_init(struct cpuinfo_x86 *c) { u64 error_control; - switch (c->x86_model) { - case INTEL_FAM6_SANDYBRIDGE_X: - case INTEL_FAM6_IVYBRIDGE_X: - case INTEL_FAM6_HASWELL_X: + switch (c->x86_vfm) { + case INTEL_SANDYBRIDGE_X: + case INTEL_IVYBRIDGE_X: + case INTEL_HASWELL_X: if (rdmsrl_safe(MSR_ERROR_CONTROL, &error_control)) return; error_control |= 2; @@ -381,12 +381,11 @@ bool intel_filter_mce(struct mce *m) struct cpuinfo_x86 *c = &boot_cpu_data; /* MCE errata HSD131, HSM142, HSW131, BDM48, HSM142 and SKX37 */ - if ((c->x86 == 6) && - ((c->x86_model == INTEL_FAM6_HASWELL) || - (c->x86_model == INTEL_FAM6_HASWELL_L) || - (c->x86_model == INTEL_FAM6_BROADWELL) || - (c->x86_model == INTEL_FAM6_HASWELL_G) || - (c->x86_model == INTEL_FAM6_SKYLAKE_X)) && + if ((c->x86_vfm == INTEL_HASWELL || + c->x86_vfm == INTEL_HASWELL_L || + c->x86_vfm == INTEL_BROADWELL || + c->x86_vfm == INTEL_HASWELL_G || + c->x86_vfm == INTEL_SKYLAKE_X) && (m->bank == 0) && ((m->status & 0xa0000000ffffffff) == 0x80000000000f0005)) return true; diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c index ce5593607f65..2235a7477436 100644 --- a/arch/x86/kernel/cpu/mce/severity.c +++ b/arch/x86/kernel/cpu/mce/severity.c @@ -12,7 +12,7 @@ #include #include -#include +#include #include #include #include @@ -45,14 +45,14 @@ static struct severity { unsigned char context; unsigned char excp; unsigned char covered; - unsigned char cpu_model; + unsigned int cpu_vfm; unsigned char cpu_minstepping; unsigned char bank_lo, bank_hi; char *msg; } severities[] = { #define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c } #define BANK_RANGE(l, h) .bank_lo = l, .bank_hi = h -#define MODEL_STEPPING(m, s) .cpu_model = m, .cpu_minstepping = s +#define VFM_STEPPING(m, s) .cpu_vfm = m, .cpu_minstepping = s #define KERNEL .context = IN_KERNEL #define USER .context = IN_USER #define KERNEL_RECOV .context = IN_KERNEL_RECOV @@ -128,7 +128,7 @@ static struct severity { MCESEV( AO, "Uncorrected Patrol Scrub Error", SER, MASK(MCI_STATUS_UC|MCI_ADDR|0xffffeff0, MCI_ADDR|0x001000c0), - MODEL_STEPPING(INTEL_FAM6_SKYLAKE_X, 4), BANK_RANGE(13, 18) + VFM_STEPPING(INTEL_SKYLAKE_X, 4), BANK_RANGE(13, 18) ), /* ignore OVER for UCNA */ @@ -396,7 +396,7 @@ static noinstr int mce_severity_intel(struct mce *m, struct pt_regs *regs, char continue; if (s->excp && excp != s->excp) continue; - if (s->cpu_model && boot_cpu_data.x86_model != s->cpu_model) + if (s->cpu_vfm && boot_cpu_data.x86_vfm != s->cpu_vfm) continue; if (s->cpu_minstepping && boot_cpu_data.x86_stepping < s->cpu_minstepping) continue; -- Gitee From 4147ab2484d49f5d52960bbb00afe4690e18d9af Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Thu, 12 Dec 2024 22:01:00 +0800 Subject: [PATCH 24/33] x86/mce: Break up __mcheck_cpu_apply_quirks() commit 51a12c28bb9a043e9444db5bd214b00ec161a639 upstream Split each vendor specific part into its own helper function. 
[Backport Changes] In arch/x86/kernel/cpu/mce/core.c, within __mcheck_cpu_apply_quirks(), the vendor check for X86_VENDOR_CENTAUR and X86_VENDOR_ZHAOXIN was rewritten from an if condition to a switch-case style, while preserving the exact same vendor conditions and behavior. This is a structural change only and does not alter the existing behavior. Signed-off-by: Tony Luck Signed-off-by: Qiuxu Zhuo Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Sohil Mehta Reviewed-by: Yazen Ghannam Tested-by: Qiuxu Zhuo Link: https://lore.kernel.org/r/20241212140103.66964-5-qiuxu.zhuo@intel.com Signed-off-by: Malathi Signed-off-by: PvsNarasimha --- arch/x86/kernel/cpu/mce/core.c | 170 ++++++++++++++++++--------------- 1 file changed, 92 insertions(+), 78 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index dbb204161e8e..9d04b852d783 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1902,103 +1902,117 @@ static void __mcheck_cpu_check_banks(void) } } -/* Add per CPU specific workarounds here */ -static bool __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) +static void apply_quirks_amd(struct cpuinfo_x86 *c) { struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); - struct mca_config *cfg = &mca_cfg; - - if (c->x86_vendor == X86_VENDOR_UNKNOWN) { - pr_info("unknown CPU type - not enabling MCE support\n"); - return false; - } /* This should be disabled by the BIOS, but isn't always */ - if (c->x86_vendor == X86_VENDOR_AMD) { - if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) { - /* - * disable GART TBL walk error reporting, which - * trips off incorrectly with the IOMMU & 3ware - * & Cerberus: - */ - clear_bit(10, (unsigned long *)&mce_banks[4].ctl); - } - if (c->x86 < 0x11 && cfg->bootlog < 0) { - /* - * Lots of broken BIOS around that don't clear them - * by default and leave crap in there. Don't log: - */ - cfg->bootlog = 0; - } + if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) { /* - * Various K7s with broken bank 0 around. Always disable - * by default. + * disable GART TBL walk error reporting, which + * trips off incorrectly with the IOMMU & 3ware + * & Cerberus: */ - if (c->x86 == 6 && this_cpu_read(mce_num_banks) > 0) - mce_banks[0].ctl = 0; + clear_bit(10, (unsigned long *)&mce_banks[4].ctl); + } + if (c->x86 < 0x11 && mca_cfg.bootlog < 0) { /* - * overflow_recov is supported for F15h Models 00h-0fh - * even though we don't have a CPUID bit for it. + * Lots of broken BIOS around that don't clear them + * by default and leave crap in there. Don't log: */ - if (c->x86 == 0x15 && c->x86_model <= 0xf) - mce_flags.overflow_recov = 1; + mca_cfg.bootlog = 0; + } - if (c->x86 >= 0x17 && c->x86 <= 0x1A) - mce_flags.zen_ifu_quirk = 1; + /* + * Various K7s with broken bank 0 around. Always disable + * by default. + */ + if (c->x86 == 6 && this_cpu_read(mce_num_banks) > 0) + mce_banks[0].ctl = 0; - } + /* + * overflow_recov is supported for F15h Models 00h-0fh + * even though we don't have a CPUID bit for it. + */ + if (c->x86 == 0x15 && c->x86_model <= 0xf) + mce_flags.overflow_recov = 1; - if (c->x86_vendor == X86_VENDOR_INTEL) { - /* - * SDM documents that on family 6 bank 0 should not be written - * because it aliases to another special BIOS controlled - * register. - * But it's not aliased anymore on model 0x1a+ - * Don't ignore bank 0 completely because there could be a - * valid event later, merely don't write CTL0. 
- */ + if (c->x86 >= 0x17 && c->x86 <= 0x1A) + mce_flags.zen_ifu_quirk = 1; +} - if (c->x86 == 6 && c->x86_model < 0x1A && this_cpu_read(mce_num_banks) > 0) - mce_banks[0].init = false; +static void apply_quirks_intel(struct cpuinfo_x86 *c) +{ + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); - /* - * All newer Intel systems support MCE broadcasting. Enable - * synchronization with a one second timeout. - */ - if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && - cfg->monarch_timeout < 0) - cfg->monarch_timeout = USEC_PER_SEC; + /* + * SDM documents that on family 6 bank 0 should not be written + * because it aliases to another special BIOS controlled + * register. + * But it's not aliased anymore on model 0x1a+ + * Don't ignore bank 0 completely because there could be a + * valid event later, merely don't write CTL0. + */ + if (c->x86 == 6 && c->x86_model < 0x1A && this_cpu_read(mce_num_banks) > 0) + mce_banks[0].init = false; - /* - * There are also broken BIOSes on some Pentium M and - * earlier systems: - */ - if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0) - cfg->bootlog = 0; + /* + * All newer Intel systems support MCE broadcasting. Enable + * synchronization with a one second timeout. + */ + if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && + mca_cfg.monarch_timeout < 0) + mca_cfg.monarch_timeout = USEC_PER_SEC; - if (c->x86_vfm == INTEL_SANDYBRIDGE_X) - mce_flags.snb_ifu_quirk = 1; + /* + * There are also broken BIOSes on some Pentium M and + * earlier systems: + */ + if (c->x86 == 6 && c->x86_model <= 13 && mca_cfg.bootlog < 0) + mca_cfg.bootlog = 0; - /* - * Skylake, Cascacde Lake and Cooper Lake require a quirk on - * rep movs. - */ - if (c->x86_vfm == INTEL_SKYLAKE_X) - mce_flags.skx_repmov_quirk = 1; + if (c->x86_vfm == INTEL_SANDYBRIDGE_X) + mce_flags.snb_ifu_quirk = 1; + + /* + * Skylake, Cascacde Lake and Cooper Lake require a quirk on + * rep movs. + */ + if (c->x86_vfm == INTEL_SKYLAKE_X) + mce_flags.skx_repmov_quirk = 1; +} + +static void apply_quirks_zhaoxin(struct cpuinfo_x86 *c) +{ + /* + * All newer Zhaoxin CPUs support MCE broadcasting. Enable + * synchronization with a one second timeout. + */ + if (c->x86 > 6 || (c->x86_model == 0x19 || c->x86_model == 0x1f)) { + if (mca_cfg.monarch_timeout < 0) + mca_cfg.monarch_timeout = USEC_PER_SEC; } +} - if (c->x86_vendor == X86_VENDOR_CENTAUR || - c->x86_vendor == X86_VENDOR_ZHAOXIN) { - /* - * All newer Zhaoxin CPUs support MCE broadcasting. Enable - * synchronization with a one second timeout. 
- */ - if (c->x86 > 6 || (c->x86_model == 0x19 || c->x86_model == 0x1f)) { - if (cfg->monarch_timeout < 0) - cfg->monarch_timeout = USEC_PER_SEC; - } - mca_cfg.bios_cmci_threshold = 1; +/* Add per CPU specific workarounds here */ +static bool __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) +{ + struct mca_config *cfg = &mca_cfg; + + switch (c->x86_vendor) { + case X86_VENDOR_UNKNOWN: + pr_info("unknown CPU type - not enabling MCE support\n"); + return false; + case X86_VENDOR_AMD: + apply_quirks_amd(c); + break; + case X86_VENDOR_INTEL: + apply_quirks_intel(c); + break; + case X86_VENDOR_ZHAOXIN: + apply_quirks_zhaoxin(c); + break; } if (cfg->monarch_timeout < 0) -- Gitee From 024c710b8d7b747f164de2bef9d89a854935b676 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 8 Sep 2025 15:40:33 +0000 Subject: [PATCH 25/33] x86/mce: Do 'UNKNOWN' vendor check early commit a46b2bbe1e36e7faab5010f68324b7d191c5c09f upstream The 'UNKNOWN' vendor check is handled as a quirk that is run on each online CPU. However, all CPUs are expected to have the same vendor. Move the 'UNKNOWN' vendor check to the BSP-only init so it is done early and once. Remove the unnecessary return value from the quirks check. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Qiuxu Zhuo Reviewed-by: Tony Luck Reviewed-by: Nikolay Borisov Tested-by: Tony Luck Link: https://lore.kernel.org/20250908-wip-mca-updates-v6-0-eef5d6c74b9c@amd.com Signed-off-by: Malathi Signed-off-by: PvsNarasimha --- arch/x86/kernel/cpu/mce/core.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 9d04b852d783..460bd1f1eb98 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1996,14 +1996,11 @@ static void apply_quirks_zhaoxin(struct cpuinfo_x86 *c) } /* Add per CPU specific workarounds here */ -static bool __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) +static void __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) { struct mca_config *cfg = &mca_cfg; switch (c->x86_vendor) { - case X86_VENDOR_UNKNOWN: - pr_info("unknown CPU type - not enabling MCE support\n"); - return false; case X86_VENDOR_AMD: apply_quirks_amd(c); break; @@ -2019,8 +2016,6 @@ static bool __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) cfg->monarch_timeout = 0; if (cfg->bootlog != 0) cfg->panic_timeout = 30; - - return true; } static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) @@ -2223,6 +2218,12 @@ void mca_bsp_init(struct cpuinfo_x86 *c) if (!mce_available(c)) return; + if (c->x86_vendor == X86_VENDOR_UNKNOWN) { + mca_cfg.disabled = 1; + pr_info("unknown CPU type - not enabling MCE support\n"); + return; + } + mce_flags.overflow_recov = cpu_feature_enabled(X86_FEATURE_OVERFLOW_RECOV); mce_flags.succor = cpu_feature_enabled(X86_FEATURE_SUCCOR); mce_flags.smca = cpu_feature_enabled(X86_FEATURE_SMCA); @@ -2257,10 +2258,7 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) __mcheck_cpu_cap_init(); - if (!__mcheck_cpu_apply_quirks(c)) { - mca_cfg.disabled = 1; - return; - } + __mcheck_cpu_apply_quirks(c); if (!mce_gen_pool_init()) { mca_cfg.disabled = 1; -- Gitee From 3176938e549fa84bf827677e5a40dfb267c06cff Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 29 Aug 2024 15:00:42 -0700 Subject: [PATCH 26/33] x86/cpu/intel: Replace PAT erratum model/family magic numbers with symbolic IFM references commit fd82221a59fa5ce9dc7523e11c5e995104a28cb0 upstream There's an erratum that prevents the PAT from working 
correctly: https://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/pentium-dual-core-specification-update.pdf # Document 316515 Version 010 The kernel currently disables PAT support on those CPUs, but it does it with some magic numbers. Replace the magic numbers with the new "IFM" macros. Make the check refer to the last affected CPU (INTEL_CORE_YONAH) rather than the first fixed one. This makes it easier to find the documentation of the erratum since Intel documents where it is broken and not where it is fixed. I don't think the Pentium Pro (or Pentium II) is actually affected. But the old check included them, so it can't hurt to keep doing the same. I'm also not completely sure about the "Pentium M" CPUs (models 0x9 and 0xd). But, again, they were included in the old checks and were close Pentium III derivatives, so are likely affected. While we're at it, revise the comment to refer to the erratum by name and make sure it quotes the language from the actual errata doc. That should make it easier to find in the future when the URL inevitably changes. Why bother with this in the first place? It actually gets rid of one of the very few remaining direct references to c->x86{,_model}. No change in functionality intended. Signed-off-by: Dave Hansen Signed-off-by: Ingo Molnar Cc: Len Brown Link: https://lore.kernel.org/r/20240829220042.1007820-1-dave.hansen@linux.intel.com Signed-off-by: Arukonda Rahul Signed-off-by: PvsNarasimha --- arch/x86/include/asm/intel-family.h | 2 ++ arch/x86/kernel/cpu/intel.c | 18 ++++++++++-------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index 9646542c9757..f74b89a93a4e 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -47,6 +47,8 @@ /* Wildcard match for FAM6 so X86_MATCH_VFM(ANY) works */ #define INTEL_ANY IFM(X86_FAMILY_ANY, X86_MODEL_ANY) +#define INTEL_PENTIUM_PRO IFM(6, 0x01) + #define INTEL_FAM6_CORE_YONAH 0x0E #define INTEL_CORE_YONAH IFM(6, 0x0E) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 8851f981693b..09ae0a18f441 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -358,16 +358,18 @@ static void early_init_intel(struct cpuinfo_x86 *c) } /* - * There is a known erratum on Pentium III and Core Solo - * and Core Duo CPUs. - * " Page with PAT set to WC while associated MTRR is UC - * may consolidate to UC " - * Because of this erratum, it is better to stick with - * setting WC in MTRR rather than using PAT on these CPUs. + * PAT is broken on early family 6 CPUs, the last of which + * is "Yonah" where the erratum is named "AN7": * - * Enable PAT WC only on P4, Core 2 or later CPUs. + * Page with PAT (Page Attribute Table) Set to USWC + * (Uncacheable Speculative Write Combine) While + * Associated MTRR (Memory Type Range Register) Is UC + * (Uncacheable) May Consolidate to UC + * + * Disable PAT and fall back to MTRR on these CPUs.
*/ - if (c->x86 == 6 && c->x86_model < 15) + if (c->x86_vfm >= INTEL_PENTIUM_PRO && + c->x86_vfm <= INTEL_CORE_YONAH) clear_cpu_cap(c, X86_FEATURE_PAT); /* -- Gitee From 415095117f60f0f21d10ac7bcf80b81ccd7b21a9 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Thu, 12 Dec 2024 22:01:01 +0800 Subject: [PATCH 27/33] x86/mce: Convert family/model mixed checks to VFM-based checks commit 359d7a98e3e3f88dbf45411427b284bb3bbbaea5 upstream Convert family/model mixed checks to VFM-based checks to make the code more compact. Simplify. [ bp: Drop the "what" from the commit message - it should be visible from the diff alone. ] Suggested-by: Sohil Mehta Suggested-by: Dave Hansen Signed-off-by: Qiuxu Zhuo Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tony Luck Reviewed-by: Sohil Mehta Reviewed-by: Yazen Ghannam Link: https://lore.kernel.org/r/20241212140103.66964-6-qiuxu.zhuo@intel.com Signed-off-by: Arukonda Rahul Signed-off-by: PvsNarasimha --- arch/x86/kernel/cpu/mce/core.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 460bd1f1eb98..82b0b62110ba 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1928,7 +1928,7 @@ static void apply_quirks_amd(struct cpuinfo_x86 *c) * Various K7s with broken bank 0 around. Always disable * by default. */ - if (c->x86 == 6 && this_cpu_read(mce_num_banks) > 0) + if (c->x86 == 6 && this_cpu_read(mce_num_banks)) mce_banks[0].ctl = 0; /* @@ -1946,6 +1946,10 @@ static void apply_quirks_intel(struct cpuinfo_x86 *c) { struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); + /* Older CPUs (prior to family 6) don't need quirks. */ + if (c->x86_vfm < INTEL_PENTIUM_PRO) + return; + /* * SDM documents that on family 6 bank 0 should not be written * because it aliases to another special BIOS controlled @@ -1954,22 +1958,21 @@ static void apply_quirks_intel(struct cpuinfo_x86 *c) * Don't ignore bank 0 completely because there could be a * valid event later, merely don't write CTL0. */ - if (c->x86 == 6 && c->x86_model < 0x1A && this_cpu_read(mce_num_banks) > 0) + if (c->x86_vfm < INTEL_NEHALEM_EP && this_cpu_read(mce_num_banks)) mce_banks[0].init = false; /* * All newer Intel systems support MCE broadcasting. Enable * synchronization with a one second timeout. */ - if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && - mca_cfg.monarch_timeout < 0) + if (c->x86_vfm >= INTEL_CORE_YONAH && mca_cfg.monarch_timeout < 0) mca_cfg.monarch_timeout = USEC_PER_SEC; /* * There are also broken BIOSes on some Pentium M and * earlier systems: */ - if (c->x86 == 6 && c->x86_model <= 13 && mca_cfg.bootlog < 0) + if (c->x86_vfm < INTEL_CORE_YONAH && mca_cfg.bootlog < 0) mca_cfg.bootlog = 0; if (c->x86_vfm == INTEL_SANDYBRIDGE_X) -- Gitee From 7167ffcb3d98e86e660874d82292d629c371ae99 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 8 Sep 2025 15:40:34 +0000 Subject: [PATCH 28/33] x86/mce: Separate global and per-CPU quirks commit 7eee1e92684507f64ec6a75fecbd27e37174b888 upstream Many quirks are global configuration settings and a handful apply to each CPU. Move the per-CPU quirks to vendor init to execute them on each online CPU. Set the global quirks during BSP-only init so they're only executed once and early. 
Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Qiuxu Zhuo Reviewed-by: Tony Luck Reviewed-by: Nikolay Borisov Tested-by: Tony Luck Link: https://lore.kernel.org/20250908-wip-mca-updates-v6-0-eef5d6c74b9c@amd.com Signed-off-by: Arukonda Rahul Signed-off-by: PvsNarasimha --- arch/x86/kernel/cpu/mce/amd.c | 24 ++++++++++ arch/x86/kernel/cpu/mce/core.c | 85 +++++++++------------------------ arch/x86/kernel/cpu/mce/intel.c | 18 +++++++ 3 files changed, 65 insertions(+), 62 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 8c016d365964..cef067947456 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -654,6 +654,28 @@ static void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank) wrmsrl(MSR_K7_HWCR, hwcr); } +static void amd_apply_cpu_quirks(struct cpuinfo_x86 *c) +{ + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); + + /* This should be disabled by the BIOS, but isn't always */ + if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) { + /* + * disable GART TBL walk error reporting, which + * trips off incorrectly with the IOMMU & 3ware + * & Cerberus: + */ + clear_bit(10, (unsigned long *)&mce_banks[4].ctl); + } + + /* + * Various K7s with broken bank 0 around. Always disable + * by default. + */ + if (c->x86 == 6 && this_cpu_read(mce_num_banks)) + mce_banks[0].ctl = 0; +} + /* cpu init entry point, called from mce.c with preempt off */ void mce_amd_feature_init(struct cpuinfo_x86 *c) { @@ -661,6 +683,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) u32 low = 0, high = 0, address = 0; int offset = -1; + amd_apply_cpu_quirks(c); + mce_flags.amd_threshold = 1; for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) { diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 82b0b62110ba..97432aa6935d 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1806,8 +1806,9 @@ static void __mcheck_cpu_mce_banks_init(void) struct mce_bank *b = &mce_banks[i]; /* - * Init them all, __mcheck_cpu_apply_quirks() is going to apply - * the required vendor quirks before + * Init them all by default. + * + * The required vendor quirks will be applied before * __mcheck_cpu_init_clear_banks() does the final bank setup. */ b->ctl = -1ULL; @@ -1902,20 +1903,8 @@ static void __mcheck_cpu_check_banks(void) } } -static void apply_quirks_amd(struct cpuinfo_x86 *c) +static void amd_apply_global_quirks(struct cpuinfo_x86 *c) { - struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); - - /* This should be disabled by the BIOS, but isn't always */ - if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) { - /* - * disable GART TBL walk error reporting, which - * trips off incorrectly with the IOMMU & 3ware - * & Cerberus: - */ - clear_bit(10, (unsigned long *)&mce_banks[4].ctl); - } - if (c->x86 < 0x11 && mca_cfg.bootlog < 0) { /* * Lots of broken BIOS around that don't clear them @@ -1924,13 +1913,6 @@ static void apply_quirks_amd(struct cpuinfo_x86 *c) mca_cfg.bootlog = 0; } - /* - * Various K7s with broken bank 0 around. Always disable - * by default. - */ - if (c->x86 == 6 && this_cpu_read(mce_num_banks)) - mce_banks[0].ctl = 0; - /* * overflow_recov is supported for F15h Models 00h-0fh * even though we don't have a CPUID bit for it. 
@@ -1942,25 +1924,12 @@ static void apply_quirks_amd(struct cpuinfo_x86 *c) mce_flags.zen_ifu_quirk = 1; } -static void apply_quirks_intel(struct cpuinfo_x86 *c) +static void intel_apply_global_quirks(struct cpuinfo_x86 *c) { - struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); - /* Older CPUs (prior to family 6) don't need quirks. */ if (c->x86_vfm < INTEL_PENTIUM_PRO) return; - /* - * SDM documents that on family 6 bank 0 should not be written - * because it aliases to another special BIOS controlled - * register. - * But it's not aliased anymore on model 0x1a+ - * Don't ignore bank 0 completely because there could be a - * valid event later, merely don't write CTL0. - */ - if (c->x86_vfm < INTEL_NEHALEM_EP && this_cpu_read(mce_num_banks)) - mce_banks[0].init = false; - /* * All newer Intel systems support MCE broadcasting. Enable * synchronization with a one second timeout. @@ -1986,7 +1955,7 @@ static void apply_quirks_intel(struct cpuinfo_x86 *c) mce_flags.skx_repmov_quirk = 1; } -static void apply_quirks_zhaoxin(struct cpuinfo_x86 *c) +static void zhaoxin_apply_global_quirks(struct cpuinfo_x86 *c) { /* * All newer Zhaoxin CPUs support MCE broadcasting. Enable @@ -1998,29 +1967,6 @@ static void apply_quirks_zhaoxin(struct cpuinfo_x86 *c) } } -/* Add per CPU specific workarounds here */ -static void __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) -{ - struct mca_config *cfg = &mca_cfg; - - switch (c->x86_vendor) { - case X86_VENDOR_AMD: - apply_quirks_amd(c); - break; - case X86_VENDOR_INTEL: - apply_quirks_intel(c); - break; - case X86_VENDOR_ZHAOXIN: - apply_quirks_zhaoxin(c); - break; - } - - if (cfg->monarch_timeout < 0) - cfg->monarch_timeout = 0; - if (cfg->bootlog != 0) - cfg->panic_timeout = 30; -} - static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) { if (c->x86 != 5) @@ -2242,6 +2188,23 @@ void mca_bsp_init(struct cpuinfo_x86 *c) if (cap & MCG_SER_P) mca_cfg.ser = 1; + + switch (c->x86_vendor) { + case X86_VENDOR_AMD: + amd_apply_global_quirks(c); + break; + case X86_VENDOR_INTEL: + intel_apply_global_quirks(c); + break; + case X86_VENDOR_ZHAOXIN: + zhaoxin_apply_global_quirks(c); + break; + } + + if (mca_cfg.monarch_timeout < 0) + mca_cfg.monarch_timeout = 0; + if (mca_cfg.bootlog != 0) + mca_cfg.panic_timeout = 30; } /* @@ -2261,8 +2224,6 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) __mcheck_cpu_cap_init(); - __mcheck_cpu_apply_quirks(c); - if (!mce_gen_pool_init()) { mca_cfg.disabled = 1; pr_emerg("Couldn't allocate MCE records pool!\n"); diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index 8599af1c5a39..0f4605518770 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -363,8 +363,26 @@ static void intel_imc_init(struct cpuinfo_x86 *c) } } +static void intel_apply_cpu_quirks(struct cpuinfo_x86 *c) +{ + /* + * SDM documents that on family 6 bank 0 should not be written + * because it aliases to another special BIOS controlled + * register. + * But it's not aliased anymore on model 0x1a+ + * Don't ignore bank 0 completely because there could be a + * valid event later, merely don't write CTL0. + * + * Older CPUs (prior to family 6) can't reach this point and already + * return early due to the check of __mcheck_cpu_ancient_init(). 
+ */ + if (c->x86_vfm < INTEL_NEHALEM_EP && this_cpu_read(mce_num_banks)) + this_cpu_ptr(mce_banks_array)[0].init = false; +} + void mce_intel_feature_init(struct cpuinfo_x86 *c) { + intel_apply_cpu_quirks(c); intel_init_cmci(); intel_init_lmce(); intel_imc_init(c); -- Gitee From 705d25b112038cd97fe6366e2027393dd50018ae Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Thu, 23 May 2024 10:56:34 -0500 Subject: [PATCH 29/33] x86/mce: Remove unused variable and return value in machine_check_poll() commit 5b9d292ea87c836ec47483f98344cb0e7add82fe upstream The recent CMCI storm handling rework removed the last case that checks the return value of machine_check_poll(). Therefore the "error_seen" variable is no longer used, so remove it. Fixes: 3ed57b41a412 ("x86/mce: Remove old CMCI storm mitigation code") Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20240523155641.2805411-3-yazen.ghannam@amd.com Signed-off-by: Borislav Petkov (AMD) Signed-off-by: Arukonda Rahul Signed-off-by: PvsNarasimha --- arch/x86/include/asm/mce.h | 3 ++- arch/x86/kernel/cpu/mce/core.c | 7 +------ 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 2796cb89eb68..15a0505a627e 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -273,7 +273,8 @@ enum mcp_flags { MCP_DONTLOG = BIT(2), /* only clear, don't log */ MCP_QUEUE_LOG = BIT(3), /* only queue to genpool */ }; -bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b); + +void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); int mce_notify_irq(void); diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 97432aa6935d..928a90660a5a 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -696,10 +696,9 @@ DEFINE_PER_CPU(unsigned, mce_poll_count); * is already totally * confused. In this case it's likely it will * not fully execute the machine check handler either. */ -bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) +void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) { struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); - bool error_seen = false; struct mce_hw_err err; struct mce *m; int i; @@ -775,8 +774,6 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) continue; log_it: - error_seen = true; - if (flags & MCP_DONTLOG) goto clear_it; @@ -808,8 +805,6 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) */ sync_core(); - - return error_seen; } EXPORT_SYMBOL_GPL(machine_check_poll); -- Gitee From 0b371cfa44f8a75ae1e7ba3b2efaea08c6c1a477 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 25 Aug 2025 17:33:02 +0000 Subject: [PATCH 30/33] x86/mce: Cleanup bank processing on init commit 0f134c53246366c00664b640f9edc9be5db255b3 upstream Unify the bank preparation into __mcheck_cpu_init_clear_banks(), rename that function to what it does now - prepares banks. Do this so that generic and vendor banks init goes first so that settings done during that init can take effect before the first bank polling takes place. Move __mcheck_cpu_check_banks() into __mcheck_cpu_init_prepare_banks() as it already loops over the banks. The MCP_DONTLOG flag is no longer needed, since the MCA polling function is now called only if boot-time logging should be done. [Backport Changes] 1. 
The commit 78255eb ("Rename 'wrmsrl()' to 'wrmsrq()'"), which renamed wrmsrl() to wrmsrq() globally, is not available in the current source tree, and backporting it would introduce large, unrelated changes. The changes intended for wrmsrq() are therefore applied to the functionally equivalent wrmsrl() calls in this backport. Accordingly, in arch/x86/kernel/cpu/mce/core.c, wrmsrl() calls are removed and added as needed to follow the upstream changes while preserving the original behavior in the following functions: 1. __mcheck_cpu_init_prepare_banks() Signed-off-by: Borislav Petkov Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Yazen Ghannam Reviewed-by: Qiuxu Zhuo Reviewed-by: Tony Luck Tested-by: Tony Luck Link: https://lore.kernel.org/20250825-wip-mca-updates-v5-5-865768a2eef8@amd.com Signed-off-by: Arukonda Rahul Signed-off-by: PvsNarasimha --- arch/x86/include/asm/mce.h | 3 +- arch/x86/kernel/cpu/mce/core.c | 63 ++++++++++------------------ 2 files changed, 19 insertions(+), 47 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 15a0505a627e..ece1d6572ff0 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -270,8 +270,7 @@ DECLARE_PER_CPU(mce_banks_t, mce_poll_banks); enum mcp_flags { MCP_TIMESTAMP = BIT(0), /* log time stamp */ MCP_UC = BIT(1), /* log uncorrected errors */ - MCP_DONTLOG = BIT(2), /* only clear, don't log */ - MCP_QUEUE_LOG = BIT(3), /* only queue to genpool */ + MCP_QUEUE_LOG = BIT(2), /* only queue to genpool */ }; void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 928a90660a5a..6f1e78915e58 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -774,9 +774,6 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) continue; log_it: - if (flags & MCP_DONTLOG) - goto clear_it; - mce_read_aux(&err, i); m->severity = mce_severity(m, NULL, NULL, false); /* @@ -1804,7 +1801,7 @@ static void __mcheck_cpu_mce_banks_init(void) * Init them all by default. * * The required vendor quirks will be applied before - * __mcheck_cpu_init_clear_banks() does the final bank setup. + * __mcheck_cpu_init_prepare_banks() does the final bank setup. */ b->ctl = -1ULL; b->init = true; @@ -1836,56 +1833,30 @@ static void __mcheck_cpu_cap_init(void) static void __mcheck_cpu_init_generic(void) { - enum mcp_flags m_fl = 0; - mce_banks_t all_banks; u64 cap; - if (!mca_cfg.bootlog) - m_fl = MCP_DONTLOG; - - /* - * Log the machine checks left over from the previous reset. Log them - * only, do not start processing them. That will happen in mcheck_late_init() - * when all consumers have been registered on the notifier chain.
+ */ + if (mca_cfg.bootlog) { + mce_banks_t all_banks; - if (!b->init) - continue; - wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl); - wrmsrl(mca_msr_reg(i, MCA_STATUS), 0); + bitmap_fill(all_banks, MAX_NR_BANKS); + machine_check_poll(MCP_UC | MCP_QUEUE_LOG, &all_banks); } -} - -/* - * Do a final check to see if there are any unused/RAZ banks. - * - * This must be done after the banks have been initialized and any quirks have - * been applied. - * - * Do not call this from any user-initiated flows, e.g. CPU hotplug or sysfs. - * Otherwise, a user who disables a bank will not be able to re-enable it - * without a system reboot. - */ -static void __mcheck_cpu_check_banks(void) -{ - struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); - u64 msrval; - int i; for (i = 0; i < this_cpu_read(mce_num_banks); i++) { struct mce_bank *b = &mce_banks[i]; @@ -1894,6 +1865,9 @@ static void __mcheck_cpu_check_banks(void) continue; rdmsrl(mca_msr_reg(i, MCA_CTL), msrval); + wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl); + wrmsrl(mca_msr_reg(i, MCA_STATUS), 0); + b->init = !!msrval; } } @@ -2229,8 +2203,7 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) __mcheck_cpu_init_generic(); __mcheck_cpu_init_vendor(c); - __mcheck_cpu_init_clear_banks(); - __mcheck_cpu_check_banks(); + __mcheck_cpu_init_prepare_banks(); __mcheck_cpu_setup_timer(); cr4_set_bits(X86_CR4_MCE); } @@ -2400,7 +2373,7 @@ static void mce_syscore_resume(void) { __mcheck_cpu_init_generic(); __mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info)); - __mcheck_cpu_init_clear_banks(); + __mcheck_cpu_init_prepare_banks(); cr4_set_bits(X86_CR4_MCE); } @@ -2419,7 +2392,7 @@ static void mce_cpu_restart(void *data) if (!mce_available(raw_cpu_ptr(&cpu_info))) return; __mcheck_cpu_init_generic(); - __mcheck_cpu_init_clear_banks(); + __mcheck_cpu_init_prepare_banks(); __mcheck_cpu_init_timer(); cr4_set_bits(X86_CR4_MCE); } -- Gitee From f19b331ebe3dd8e2acc1fc6d372e5bc0b628979e Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 8 Sep 2025 15:40:35 +0000 Subject: [PATCH 31/33] x86/mce: Move machine_check_poll() status checks to helper functions commit 91af6842e9945d064401ed2d6e91539a619760d1 upstream There are a number of generic and vendor-specific status checks in machine_check_poll(). These are used to determine if an error should be skipped. Move these into helper functions. Future vendor-specific checks will be added to the helpers. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Qiuxu Zhuo Reviewed-by: Tony Luck Reviewed-by: Nikolay Borisov Tested-by: Tony Luck Link: https://lore.kernel.org/20250908-wip-mca-updates-v6-0-eef5d6c74b9c@amd.com Signed-off-by: Arukonda Rahul Signed-off-by: PvsNarasimha --- arch/x86/kernel/cpu/mce/core.c | 88 ++++++++++++++++++---------------- 1 file changed, 48 insertions(+), 40 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 6f1e78915e58..67dcfe876354 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -681,6 +681,52 @@ static noinstr void mce_read_aux(struct mce_hw_err *err, int i) DEFINE_PER_CPU(unsigned, mce_poll_count); +/* + * Newer Intel systems that support software error + * recovery need to make additional checks. Other + * CPUs should skip over uncorrected errors, but log + * everything else. 
+ */ +static bool ser_should_log_poll_error(struct mce *m) +{ + /* Log "not enabled" (speculative) errors */ + if (!(m->status & MCI_STATUS_EN)) + return true; + + /* + * Log UCNA (SDM: 15.6.3 "UCR Error Classification") + * UC == 1 && PCC == 0 && S == 0 + */ + if (!(m->status & MCI_STATUS_PCC) && !(m->status & MCI_STATUS_S)) + return true; + + return false; +} + +static bool should_log_poll_error(enum mcp_flags flags, struct mce_hw_err *err) +{ + struct mce *m = &err->m; + + /* If this entry is not valid, ignore it. */ + if (!(m->status & MCI_STATUS_VAL)) + return false; + + /* + * If we are logging everything (at CPU online) or this + * is a corrected error, then we must log it. + */ + if ((flags & MCP_UC) || !(m->status & MCI_STATUS_UC)) + return true; + + if (mca_cfg.ser) + return ser_should_log_poll_error(m); + + if (m->status & MCI_STATUS_UC) + return false; + + return true; +} + /* * Poll for corrected events or events that happened before reset. * Those are just logged through /dev/mcelog. @@ -732,48 +778,10 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) if (!mca_cfg.cmci_disabled) mce_track_storm(m); - /* If this entry is not valid, ignore it */ - if (!(m->status & MCI_STATUS_VAL)) + /* Verify that the error should be logged based on hardware conditions. */ + if (!should_log_poll_error(flags, &err)) continue; - /* - * If we are logging everything (at CPU online) or this - * is a corrected error, then we must log it. - */ - if ((flags & MCP_UC) || !(m->status & MCI_STATUS_UC)) - goto log_it; - - /* - * Newer Intel systems that support software error - * recovery need to make additional checks. Other - * CPUs should skip over uncorrected errors, but log - * everything else. - */ - if (!mca_cfg.ser) { - if (m->status & MCI_STATUS_UC) - continue; - goto log_it; - } - - /* Log "not enabled" (speculative) errors */ - if (!(m->status & MCI_STATUS_EN)) - goto log_it; - - /* - * Log UCNA (SDM: 15.6.3 "UCR Error Classification") - * UC == 1 && PCC == 0 && S == 0 - */ - if (!(m->status & MCI_STATUS_PCC) && !(m->status & MCI_STATUS_S)) - goto log_it; - - /* - * Skip anything else. Presumption is that our read of this - * bank is racing with a machine check. Leave the log alone - * for do_machine_check() to deal with it. - */ - continue; - -log_it: mce_read_aux(&err, i); m->severity = mce_severity(m, NULL, NULL, false); /* -- Gitee From 598a6f2f0a9bcf5ab0e73784c8de83f470b167dd Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 9 Apr 2025 22:29:03 +0200 Subject: [PATCH 32/33] x86/msr: Rename 'mce_wrmsrl()' to 'mce_wrmsrq()' commit 8e44e83f57c3289a41507eb79a315400629978ae upstream Suggested-by: "H. Peter Anvin" Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra (Intel) Cc: Juergen Gross Cc: Dave Hansen Cc: Xin Li Cc: Linus Torvalds Signed-off-by: Arukonda Rahul Signed-off-by: PvsNarasimha --- arch/x86/kernel/cpu/mce/core.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 67dcfe876354..d5dcdd85552a 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -403,7 +403,7 @@ noinstr u64 mce_rdmsrl(u32 msr) return EAX_EDX_VAL(val, low, high); } -static noinstr void mce_wrmsrl(u32 msr, u64 v) +static noinstr void mce_wrmsrq(u32 msr, u64 v) { u32 low, high; @@ -801,7 +801,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) /* * Clear state for this bank. 
*/ - mce_wrmsrl(mca_msr_reg(i, MCA_STATUS), 0); + mce_wrmsrq(mca_msr_reg(i, MCA_STATUS), 0); } /* @@ -882,8 +882,8 @@ static noinstr bool quirk_skylake_repmov(void) MCI_STATUS_ADDRV | MCI_STATUS_MISCV | MCI_STATUS_AR | MCI_STATUS_S)) { misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING; - mce_wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable); - mce_wrmsrl(MSR_IA32_MCx_STATUS(1), 0); + mce_wrmsrq(MSR_IA32_MISC_ENABLE, misc_enable); + mce_wrmsrq(MSR_IA32_MCx_STATUS(1), 0); instrumentation_begin(); pr_err_once("Erratum detected, disable fast string copy instructions.\n"); @@ -1246,7 +1246,7 @@ static __always_inline void mce_clear_state(unsigned long *toclear) for (i = 0; i < this_cpu_read(mce_num_banks); i++) { if (arch_test_bit(i, toclear)) - mce_wrmsrl(mca_msr_reg(i, MCA_STATUS), 0); + mce_wrmsrq(mca_msr_reg(i, MCA_STATUS), 0); } } @@ -1667,7 +1667,7 @@ noinstr void do_machine_check(struct pt_regs *regs) instrumentation_end(); clear: - mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); + mce_wrmsrq(MSR_IA32_MCG_STATUS, 0); } EXPORT_SYMBOL_GPL(do_machine_check); -- Gitee From 892d172c610e1b81c4150a3215449d9d70c183c3 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 8 Sep 2025 15:40:36 +0000 Subject: [PATCH 33/33] x86/mce: Add a clear_bank() helper commit 5c6f123c419b6e20f84ac1683089a52f449273aa upstream Add a helper at the end of the MCA polling function to collect vendor and/or feature actions. Start with a basic skeleton for now. Actions for AMD thresholding and deferred errors will be added later. [ bp: Drop the obvious comment too. ] Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Nikolay Borisov Link: https://lore.kernel.org/20250908-wip-mca-updates-v6-0-eef5d6c74b9c@amd.com Signed-off-by: Arukonda Rahul Signed-off-by: PvsNarasimha --- arch/x86/kernel/cpu/mce/amd.c | 5 +++++ arch/x86/kernel/cpu/mce/core.c | 15 ++++++++++----- arch/x86/kernel/cpu/mce/internal.h | 3 +++ 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index cef067947456..09da81198d33 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -902,6 +902,11 @@ static void amd_threshold_interrupt(void) } } +void amd_clear_bank(struct mce *m) +{ + mce_wrmsrq(mca_msr_reg(m->bank, MCA_STATUS), 0); +} + /* * Sysfs Interface */ diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index d5dcdd85552a..947adbc6ed2b 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -403,7 +403,7 @@ noinstr u64 mce_rdmsrl(u32 msr) return EAX_EDX_VAL(val, low, high); } -static noinstr void mce_wrmsrq(u32 msr, u64 v) +noinstr void mce_wrmsrq(u32 msr, u64 v) { u32 low, high; @@ -727,6 +727,14 @@ static bool should_log_poll_error(enum mcp_flags flags, struct mce_hw_err *err) return true; } +static void clear_bank(struct mce *m) +{ + if (m->cpuvendor == X86_VENDOR_AMD) + return amd_clear_bank(m); + + mce_wrmsrq(mca_msr_reg(m->bank, MCA_STATUS), 0); +} + /* * Poll for corrected events or events that happened before reset. * Those are just logged through /dev/mcelog. @@ -798,10 +806,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) mce_log(&err); clear_it: - /* - * Clear state for this bank. 
- */ - mce_wrmsrq(mca_msr_reg(i, MCA_STATUS), 0); + clear_bank(m); } /* diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index 5baca5a58775..bdfae89c5a40 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -264,6 +264,7 @@ void mce_prep_record_per_cpu(unsigned int cpu, struct mce *m); void mce_threshold_create_device(unsigned int cpu); void mce_threshold_remove_device(unsigned int cpu); extern bool amd_filter_mce(struct mce *m); +void amd_clear_bank(struct mce *m); /* * If MCA_CONFIG[McaLsbInStatusSupported] is set, extract ErrAddr in bits @@ -294,6 +295,7 @@ void smca_bsp_init(void); static inline void mce_threshold_create_device(unsigned int cpu) { } static inline void mce_threshold_remove_device(unsigned int cpu) { } static inline bool amd_filter_mce(struct mce *m) { return false; } +static inline void amd_clear_bank(struct mce *m) { } static inline void smca_extract_err_addr(struct mce *m) { } static inline void smca_bsp_init(void) { } #endif @@ -313,6 +315,7 @@ static __always_inline void winchip_machine_check(struct pt_regs *regs) {} #endif noinstr u64 mce_rdmsrl(u32 msr); +noinstr void mce_wrmsrq(u32 msr, u64 v); static __always_inline u32 mca_msr_reg(int bank, enum mca_msr reg) { -- Gitee