diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 8a54a79180cb3e55e7f1d6e4cbe274873d4f6705..7c9809925f199317b9d658f73526e131adb525f9 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1176,7 +1176,7 @@ config X86_MCE_INTEL config X86_MCE_AMD def_bool y prompt "AMD MCE features" - depends on X86_MCE && X86_LOCAL_APIC && AMD_NB + depends on X86_MCE && X86_LOCAL_APIC help Additional support for AMD specific MCE features such as the DRAM Error Threshold. diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index 259883d42f4b8669a70e1a0d5d5dd343d6a671b3..288101025fb4c0121d34a1f35ff461ba92e34d88 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -26,41 +26,11 @@ struct amd_l3_cache { u8 subcaches[4]; }; -struct threshold_block { - unsigned int block; /* Number within bank */ - unsigned int bank; /* MCA bank the block belongs to */ - unsigned int cpu; /* CPU which controls MCA bank */ - u32 address; /* MSR address for the block */ - u16 interrupt_enable; /* Enable/Disable APIC interrupt */ - bool interrupt_capable; /* Bank can generate an interrupt. */ - - u16 threshold_limit; /* - * Value upon which threshold - * interrupt is generated. - */ - - struct kobject kobj; /* sysfs object */ - struct list_head miscj; /* - * List of threshold blocks - * within a bank. - */ -}; - -struct threshold_bank { - struct kobject *kobj; - struct threshold_block *blocks; - - /* initialized to the number of CPUs on the node sharing this bank */ - refcount_t cpus; - unsigned int shared; -}; - struct amd_northbridge { struct pci_dev *root; struct pci_dev *misc; struct pci_dev *link; struct amd_l3_cache l3_cache; - struct threshold_bank *bank4; }; struct amd_northbridge_info { diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index 9646542c97574704312e12ae4a5d2464d540e5ed..f74b89a93a4e131f3bf7883f43c0189545dcdfc8 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -47,6 +47,8 @@ /* Wildcard match for FAM6 so X86_MATCH_VFM(ANY) works */ #define INTEL_ANY IFM(X86_FAMILY_ANY, X86_MODEL_ANY) +#define INTEL_PENTIUM_PRO IFM(6, 0x01) + #define INTEL_FAM6_CORE_YONAH 0x0E #define INTEL_CORE_YONAH IFM(6, 0x0E) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index a02d2215a79f12ef40e88a658475960879d064a9..ece1d6572ff0638fb845a3c1f7d1e3285ea7b5ef 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -13,6 +13,7 @@ #define MCG_CTL_P BIT_ULL(8) /* MCG_CTL register available */ #define MCG_EXT_P BIT_ULL(9) /* Extended registers available */ #define MCG_CMCI_P BIT_ULL(10) /* CMCI supported */ +#define MCG_SEAM_NR BIT_ULL(12) /* MCG_STATUS_SEAM_NR supported */ #define MCG_EXT_CNT_MASK 0xff0000 /* Number of Extended registers */ #define MCG_EXT_CNT_SHIFT 16 #define MCG_EXT_CNT(c) (((c) & MCG_EXT_CNT_MASK) >> MCG_EXT_CNT_SHIFT) @@ -25,6 +26,7 @@ #define MCG_STATUS_EIPV BIT_ULL(1) /* ip points to correct instruction */ #define MCG_STATUS_MCIP BIT_ULL(2) /* machine check in progress */ #define MCG_STATUS_LMCES BIT_ULL(3) /* LMCE signaled */ +#define MCG_STATUS_SEAM_NR BIT_ULL(12) /* Machine check inside SEAM non-root mode */ /* MCG_EXT_CTL register defines */ #define MCG_EXT_CTL_LMCE_EN BIT_ULL(0) /* Enable LMCE */ @@ -185,6 +187,16 @@ enum mce_notifier_prios { MCE_PRIO_HIGHEST = MCE_PRIO_CEC }; +/** + * struct mce_hw_err - Hardware Error Record. + * @m: Machine Check record. 
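+ *
+ * The wrapper allows vendor-specific error information to be carried
+ * alongside the architectural record without touching every user of
+ * struct mce. to_mce_hw_err() recovers the wrapper from an embedded
+ * &struct mce via container_of(). Minimal usage, modelled on
+ * __log_error() in mce/amd.c below:
+ *
+ *	struct mce_hw_err err;
+ *
+ *	mce_prep_record(&err);
+ *	err.m.status = status;
+ *	err.m.bank = bank;
+ *	mce_log(&err);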
+ */
+struct mce_hw_err {
+	struct mce m;
+};
+
+#define to_mce_hw_err(mce) container_of(mce, struct mce_hw_err, m)
+
 struct notifier_block;
 extern void mce_register_decode_chain(struct notifier_block *nb);
 extern void mce_unregister_decode_chain(struct notifier_block *nb);
@@ -207,20 +219,22 @@ struct cper_ia_proc_ctx;
 
 #ifdef CONFIG_X86_MCE
 int mcheck_init(void);
+void mca_bsp_init(struct cpuinfo_x86 *c);
 void mcheck_cpu_init(struct cpuinfo_x86 *c);
 void mcheck_cpu_clear(struct cpuinfo_x86 *c);
 int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id);
 #else
 static inline int mcheck_init(void) { return 0; }
+static inline void mca_bsp_init(struct cpuinfo_x86 *c) {}
 static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {}
 static inline void mcheck_cpu_clear(struct cpuinfo_x86 *c) {}
 static inline int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id) { return -EINVAL; }
 #endif
 
-void mce_setup(struct mce *m);
-void mce_log(struct mce *m);
+void mce_prep_record(struct mce_hw_err *err);
+void mce_log(struct mce_hw_err *err);
 DECLARE_PER_CPU(struct device *, mce_device);
 
 /* Maximum number of MCA banks per CPU. */
@@ -256,10 +270,10 @@ DECLARE_PER_CPU(mce_banks_t, mce_poll_banks);
 enum mcp_flags {
 	MCP_TIMESTAMP	= BIT(0),	/* log time stamp */
 	MCP_UC		= BIT(1),	/* log uncorrected errors */
-	MCP_DONTLOG	= BIT(2),	/* only clear, don't log */
-	MCP_QUEUE_LOG	= BIT(3),	/* only queue to genpool */
+	MCP_QUEUE_LOG	= BIT(2),	/* only queue to genpool */
 };
-bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
+
+void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
 
 int mce_notify_irq(void);
 
@@ -342,15 +356,9 @@ enum smca_bank_types {
 extern const char *smca_get_long_name(enum smca_bank_types t);
 extern bool amd_mce_is_memory_error(struct mce *m);
 
-extern int mce_threshold_create_device(unsigned int cpu);
-extern int mce_threshold_remove_device(unsigned int cpu);
-
 void mce_amd_feature_init(struct cpuinfo_x86 *c);
 enum smca_bank_types smca_get_bank_type(unsigned int cpu, unsigned int bank);
 #else
-
-static inline int mce_threshold_create_device(unsigned int cpu) { return 0; };
-static inline int mce_threshold_remove_device(unsigned int cpu) { return 0; };
 static inline bool amd_mce_is_memory_error(struct mce *m) { return false; };
 static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { }
 #endif
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 4018fb35165cdb30d6f8f1948fec87786a9962ae..7b640c720c107d6a88952a003d5b735d63ae9be2 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1798,6 +1798,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
 		setup_clear_cpu_cap(X86_FEATURE_LA57);
 
 	detect_nopl();
+	mca_bsp_init(c);
 }
 
 void __init init_cpu_devs(void)
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 8851f981693b884c2222c6b4e36ed7f5dfffa318..09ae0a18f4414e80bb7ce0294013363a18ea21a1 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -358,16 +358,18 @@ static void early_init_intel(struct cpuinfo_x86 *c)
 	}
 
 	/*
-	 * There is a known erratum on Pentium III and Core Solo
-	 * and Core Duo CPUs.
-	 * " Page with PAT set to WC while associated MTRR is UC
-	 *   may consolidate to UC "
-	 * Because of this erratum, it is better to stick with
-	 * setting WC in MTRR rather than using PAT on these CPUs.
+ * PAT is broken on early family 6 CPUs, the last of which + * is "Yonah" where the erratum is named "AN7": * - * Enable PAT WC only on P4, Core 2 or later CPUs. + * Page with PAT (Page Attribute Table) Set to USWC + * (Uncacheable Speculative Write Combine) While + * Associated MTRR (Memory Type Range Register) Is UC + * (Uncacheable) May Consolidate to UC + * + * Disable PAT and fall back to MTRR on these CPUs. */ - if (c->x86 == 6 && c->x86_model < 15) + if (c->x86_vfm >= INTEL_PENTIUM_PRO && + c->x86_vfm <= INTEL_CORE_YONAH) clear_cpu_cap(c, X86_FEATURE_PAT); /* diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index b88f4fb7b7d947e8076517fe2d22fa0f39fbe7e3..09da81198d33110e51858f123c924440524f8dce 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -4,8 +4,6 @@ * * Written by Jacob Shin - AMD, Inc. * Maintained by: Borislav Petkov - * - * All MC4_MISCi registers are shared between cores on a node. */ #include #include @@ -20,7 +18,6 @@ #include #include -#include #include #include #include @@ -229,6 +226,33 @@ static const struct smca_hwid smca_hwid_mcatypes[] = { #define MAX_MCATYPE_NAME_LEN 30 static char buf_mcatype[MAX_MCATYPE_NAME_LEN]; +struct threshold_block { + /* This block's number within its bank. */ + unsigned int block; + /* MCA bank number that contains this block. */ + unsigned int bank; + /* CPU which controls this block's MCA bank. */ + unsigned int cpu; + /* MCA_MISC MSR address for this block. */ + u32 address; + /* Enable/Disable APIC interrupt. */ + bool interrupt_enable; + /* Bank can generate an interrupt. */ + bool interrupt_capable; + /* Value upon which threshold interrupt is generated. */ + u16 threshold_limit; + /* sysfs object */ + struct kobject kobj; + /* List of threshold blocks within this block's MCA bank. */ + struct list_head miscj; +}; + +struct threshold_bank { + struct kobject *kobj; + /* List of threshold blocks within this MCA bank. */ + struct list_head miscj; +}; + static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); /* @@ -237,9 +261,6 @@ static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); */ static DEFINE_PER_CPU(u64, bank_map); -/* Map of banks that have more than MCA_MISC0 available. */ -static DEFINE_PER_CPU(u64, smca_misc_banks_map); - static void amd_threshold_interrupt(void); static void amd_deferred_error_interrupt(void); @@ -249,28 +270,6 @@ static void default_deferred_error_interrupt(void) } void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt; -static void smca_set_misc_banks_map(unsigned int bank, unsigned int cpu) -{ - u32 low, high; - - /* - * For SMCA enabled processors, BLKPTR field of the first MISC register - * (MCx_MISC0) indicates presence of additional MISC regs set (MISC1-4). 
- */ - if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high)) - return; - - if (!(low & MCI_CONFIG_MCAX)) - return; - - if (rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high)) - return; - - if (low & MASK_BLKPTR_LO) - per_cpu(smca_misc_banks_map, cpu) |= BIT_ULL(bank); - -} - static void smca_configure(unsigned int bank, unsigned int cpu) { u8 *bank_counts = this_cpu_ptr(smca_bank_counts); @@ -311,8 +310,6 @@ static void smca_configure(unsigned int bank, unsigned int cpu) wrmsr(smca_config, low, high); } - smca_set_misc_banks_map(bank, cpu); - if (rdmsr_safe(MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) { pr_warn("Failed to read MCA_IPID for bank %d\n", bank); return; @@ -340,19 +337,6 @@ struct thresh_restart { u16 old_limit; }; -static inline bool is_shared_bank(int bank) -{ - /* - * Scalable MCA provides for only one core to have access to the MSRs of - * a shared bank. - */ - if (mce_flags.smca) - return false; - - /* Bank 4 is for northbridge reporting and is thus shared */ - return (bank == 4); -} - static const char *bank4_names(const struct threshold_block *b) { switch (b->address) { @@ -417,8 +401,8 @@ static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) return 1; }; -/* Reprogram MCx_MISC MSR behind this threshold bank. */ -static void threshold_restart_bank(void *_tr) +/* Reprogram MCx_MISC MSR behind this threshold block. */ +static void threshold_restart_block(void *_tr) { struct thresh_restart *tr = _tr; u32 hi, lo; @@ -476,7 +460,7 @@ static void mce_threshold_block_init(struct threshold_block *b, int offset) }; b->threshold_limit = THRESHOLD_MAX; - threshold_restart_bank(&tr); + threshold_restart_block(&tr); }; static int setup_APIC_mce_threshold(int reserved, int new) @@ -523,18 +507,6 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c) wrmsr(MSR_CU_DEF_ERR, low, high); } -static u32 smca_get_block_address(unsigned int bank, unsigned int block, - unsigned int cpu) -{ - if (!block) - return MSR_AMD64_SMCA_MCx_MISC(bank); - - if (!(per_cpu(smca_misc_banks_map, cpu) & BIT_ULL(bank))) - return 0; - - return MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1); -} - static u32 get_block_address(u32 current_addr, u32 low, u32 high, unsigned int bank, unsigned int block, unsigned int cpu) @@ -544,8 +516,15 @@ static u32 get_block_address(u32 current_addr, u32 low, u32 high, if ((bank >= per_cpu(mce_num_banks, cpu)) || (block >= NR_BLOCKS)) return addr; - if (mce_flags.smca) - return smca_get_block_address(bank, block, cpu); + if (mce_flags.smca) { + if (!block) + return MSR_AMD64_SMCA_MCx_MISC(bank); + + if (!(low & MASK_BLKPTR_LO)) + return 0; + + return MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1); + } /* Fall back to method we used for older processors: */ switch (block) { @@ -675,6 +654,28 @@ static void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank) wrmsrl(MSR_K7_HWCR, hwcr); } +static void amd_apply_cpu_quirks(struct cpuinfo_x86 *c) +{ + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); + + /* This should be disabled by the BIOS, but isn't always */ + if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) { + /* + * disable GART TBL walk error reporting, which + * trips off incorrectly with the IOMMU & 3ware + * & Cerberus: + */ + clear_bit(10, (unsigned long *)&mce_banks[4].ctl); + } + + /* + * Various K7s with broken bank 0 around. Always disable + * by default. 
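+	 *
+	 * mce_banks[0].ctl shadows the MCA_CTL MSR of bank 0: leaving it
+	 * at 0 keeps every error source in that bank masked when the
+	 * control MSRs are programmed later during per-CPU bank setup.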
+ */ + if (c->x86 == 6 && this_cpu_read(mce_num_banks)) + mce_banks[0].ctl = 0; +} + /* cpu init entry point, called from mce.c with preempt off */ void mce_amd_feature_init(struct cpuinfo_x86 *c) { @@ -682,6 +683,9 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) u32 low = 0, high = 0, address = 0; int offset = -1; + amd_apply_cpu_quirks(c); + + mce_flags.amd_threshold = 1; for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) { if (mce_flags.smca) @@ -712,6 +716,12 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) deferred_error_interrupt_enable(c); } +void smca_bsp_init(void) +{ + mce_threshold_vector = amd_threshold_interrupt; + deferred_error_int_vector = amd_deferred_error_interrupt; +} + bool amd_mce_is_memory_error(struct mce *m) { enum smca_bank_types bank_type; @@ -727,29 +737,30 @@ bool amd_mce_is_memory_error(struct mce *m) static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc) { - struct mce m; + struct mce_hw_err err; + struct mce *m = &err.m; - mce_setup(&m); + mce_prep_record(&err); - m.status = status; - m.misc = misc; - m.bank = bank; - m.tsc = rdtsc(); + m->status = status; + m->misc = misc; + m->bank = bank; + m->tsc = rdtsc(); - if (m.status & MCI_STATUS_ADDRV) { - m.addr = addr; + if (m->status & MCI_STATUS_ADDRV) { + m->addr = addr; - smca_extract_err_addr(&m); + smca_extract_err_addr(m); } if (mce_flags.smca) { - rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m.ipid); + rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m->ipid); - if (m.status & MCI_STATUS_SYNDV) - rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m.synd); + if (m->status & MCI_STATUS_SYNDV) + rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m->synd); } - mce_log(&m); + mce_log(&err); } DEFINE_IDTENTRY_SYSVEC(sysvec_deferred_error) @@ -857,7 +868,7 @@ static void log_and_reset_block(struct threshold_block *block) /* Reset threshold block after logging error. */ memset(&tr, 0, sizeof(tr)); tr.b = block; - threshold_restart_bank(&tr); + threshold_restart_block(&tr); } /* @@ -866,9 +877,9 @@ static void log_and_reset_block(struct threshold_block *block) */ static void amd_threshold_interrupt(void) { - struct threshold_block *first_block = NULL, *block = NULL, *tmp = NULL; - struct threshold_bank **bp = this_cpu_read(threshold_banks); + struct threshold_bank **bp = this_cpu_read(threshold_banks), *thr_bank; unsigned int bank, cpu = smp_processor_id(); + struct threshold_block *block, *tmp; /* * Validate that the threshold bank has been initialized already. The @@ -882,20 +893,20 @@ static void amd_threshold_interrupt(void) if (!(per_cpu(bank_map, cpu) & BIT_ULL(bank))) continue; - first_block = bp[bank]->blocks; - if (!first_block) + thr_bank = bp[bank]; + if (!thr_bank) continue; - /* - * The first block is also the head of the list. Check it first - * before iterating over the rest. 
- */ - log_and_reset_block(first_block); - list_for_each_entry_safe(block, tmp, &first_block->miscj, miscj) + list_for_each_entry_safe(block, tmp, &thr_bank->miscj, miscj) log_and_reset_block(block); } } +void amd_clear_bank(struct mce *m) +{ + mce_wrmsrq(mca_msr_reg(m->bank, MCA_STATUS), 0); +} + /* * Sysfs Interface */ @@ -931,7 +942,7 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size) memset(&tr, 0, sizeof(tr)); tr.b = b; - if (smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1)) + if (smp_call_function_single(b->cpu, threshold_restart_block, &tr, 1)) return -ENODEV; return size; @@ -956,7 +967,7 @@ store_threshold_limit(struct threshold_block *b, const char *buf, size_t size) b->threshold_limit = new; tr.b = b; - if (smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1)) + if (smp_call_function_single(b->cpu, threshold_restart_block, &tr, 1)) return -ENODEV; return size; @@ -1117,13 +1128,7 @@ static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb default_attrs[2] = NULL; } - INIT_LIST_HEAD(&b->miscj); - - /* This is safe as @tb is not visible yet */ - if (tb->blocks) - list_add(&b->miscj, &tb->blocks->miscj); - else - tb->blocks = b; + list_add(&b->miscj, &tb->miscj); err = kobject_init_and_add(&b->kobj, &threshold_ktype, tb->kobj, get_name(cpu, bank, b)); if (err) @@ -1150,35 +1155,10 @@ static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb return err; } -static int __threshold_add_blocks(struct threshold_bank *b) -{ - struct list_head *head = &b->blocks->miscj; - struct threshold_block *pos = NULL; - struct threshold_block *tmp = NULL; - int err = 0; - - err = kobject_add(&b->blocks->kobj, b->kobj, b->blocks->kobj.name); - if (err) - return err; - - list_for_each_entry_safe(pos, tmp, head, miscj) { - - err = kobject_add(&pos->kobj, b->kobj, pos->kobj.name); - if (err) { - list_for_each_entry_safe_reverse(pos, tmp, head, miscj) - kobject_del(&pos->kobj); - - return err; - } - } - return err; -} - static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu, unsigned int bank) { struct device *dev = this_cpu_read(mce_device); - struct amd_northbridge *nb = NULL; struct threshold_bank *b = NULL; const char *name = get_name(cpu, bank, NULL); int err = 0; @@ -1186,26 +1166,6 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu, if (!dev) return -ENODEV; - if (is_shared_bank(bank)) { - nb = node_to_amd_nb(topology_amd_node_id(cpu)); - - /* threshold descriptor already initialized on this node? 
*/ - if (nb && nb->bank4) { - /* yes, use it */ - b = nb->bank4; - err = kobject_add(b->kobj, &dev->kobj, name); - if (err) - goto out; - - bp[bank] = b; - refcount_inc(&b->cpus); - - err = __threshold_add_blocks(b); - - goto out; - } - } - b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL); if (!b) { err = -ENOMEM; @@ -1219,16 +1179,7 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu, goto out_free; } - if (is_shared_bank(bank)) { - b->shared = 1; - refcount_set(&b->cpus, 1); - - /* nb is already initialized, see above */ - if (nb) { - WARN_ON(nb->bank4); - nb->bank4 = b; - } - } + INIT_LIST_HEAD(&b->miscj); err = allocate_threshold_blocks(cpu, b, bank, 0, mca_msr_reg(bank, MCA_MISC)); if (err) @@ -1250,55 +1201,15 @@ static void threshold_block_release(struct kobject *kobj) kfree(to_block(kobj)); } -static void deallocate_threshold_blocks(struct threshold_bank *bank) +static void threshold_remove_bank(struct threshold_bank *bank) { struct threshold_block *pos, *tmp; - list_for_each_entry_safe(pos, tmp, &bank->blocks->miscj, miscj) { + list_for_each_entry_safe(pos, tmp, &bank->miscj, miscj) { list_del(&pos->miscj); kobject_put(&pos->kobj); } - kobject_put(&bank->blocks->kobj); -} - -static void __threshold_remove_blocks(struct threshold_bank *b) -{ - struct threshold_block *pos = NULL; - struct threshold_block *tmp = NULL; - - kobject_put(b->kobj); - - list_for_each_entry_safe(pos, tmp, &b->blocks->miscj, miscj) - kobject_put(b->kobj); -} - -static void threshold_remove_bank(struct threshold_bank *bank) -{ - struct amd_northbridge *nb; - - if (!bank->blocks) - goto out_free; - - if (!bank->shared) - goto out_dealloc; - - if (!refcount_dec_and_test(&bank->cpus)) { - __threshold_remove_blocks(bank); - return; - } else { - /* - * The last CPU on this node using the shared bank is going - * away, remove that bank now. - */ - nb = node_to_amd_nb(topology_amd_node_id(smp_processor_id())); - nb->bank4 = NULL; - } - -out_dealloc: - deallocate_threshold_blocks(bank); - -out_free: kobject_put(bank->kobj); kfree(bank); } @@ -1317,12 +1228,12 @@ static void __threshold_remove_device(struct threshold_bank **bp) kfree(bp); } -int mce_threshold_remove_device(unsigned int cpu) +void mce_threshold_remove_device(unsigned int cpu) { struct threshold_bank **bp = this_cpu_read(threshold_banks); if (!bp) - return 0; + return; /* * Clear the pointer before cleaning up, so that the interrupt won't @@ -1331,7 +1242,7 @@ int mce_threshold_remove_device(unsigned int cpu) this_cpu_write(threshold_banks, NULL); __threshold_remove_device(bp); - return 0; + return; } /** @@ -1345,36 +1256,34 @@ int mce_threshold_remove_device(unsigned int cpu) * thread running on @cpu. The callback is invoked on all CPUs which are * online when the callback is installed or during a real hotplug event. 
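+ *
+ * A failure to create the sysfs hierarchy is not fatal to onlining:
+ * the partially created banks are torn down and the CPU comes up
+ * without the threshold sysfs interface.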
*/ -int mce_threshold_create_device(unsigned int cpu) +void mce_threshold_create_device(unsigned int cpu) { unsigned int numbanks, bank; struct threshold_bank **bp; - int err; if (!mce_flags.amd_threshold) - return 0; + return; bp = this_cpu_read(threshold_banks); if (bp) - return 0; + return; numbanks = this_cpu_read(mce_num_banks); bp = kcalloc(numbanks, sizeof(*bp), GFP_KERNEL); if (!bp) - return -ENOMEM; + return; for (bank = 0; bank < numbanks; ++bank) { if (!(this_cpu_read(bank_map) & BIT_ULL(bank))) continue; - err = threshold_create_bank(bp, cpu, bank); - if (err) { + if (threshold_create_bank(bp, cpu, bank)) { __threshold_remove_device(bp); - return err; + return; } } this_cpu_write(threshold_banks, bp); if (thresholding_irq_en) mce_threshold_vector = amd_threshold_interrupt; - return 0; + return; } diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c index 24098bad6724240bdefcaf3509a91f96f9e77cd6..d1f77a50e99456520dc5572e14102ebdf8eedb95 100644 --- a/arch/x86/kernel/cpu/mce/apei.c +++ b/arch/x86/kernel/cpu/mce/apei.c @@ -28,7 +28,8 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) { - struct mce m; + struct mce_hw_err err; + struct mce *m; int lsb; if (!(mem_err->validation_bits & CPER_MEM_VALID_PA)) @@ -44,28 +45,31 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) else lsb = PAGE_SHIFT; - mce_setup(&m); - m.bank = -1; + mce_prep_record(&err); + m = &err.m; + m->bank = -1; /* Fake a memory read error with unknown channel */ - m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | MCI_STATUS_MISCV | 0x9f; - m.misc = (MCI_MISC_ADDR_PHYS << 6) | lsb; + m->status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | MCI_STATUS_MISCV | 0x9f; + m->misc = (MCI_MISC_ADDR_PHYS << 6) | lsb; if (severity >= GHES_SEV_RECOVERABLE) - m.status |= MCI_STATUS_UC; + m->status |= MCI_STATUS_UC; if (severity >= GHES_SEV_PANIC) { - m.status |= MCI_STATUS_PCC; - m.tsc = rdtsc(); + m->status |= MCI_STATUS_PCC; + m->tsc = rdtsc(); } - m.addr = mem_err->physical_addr; - mce_log(&m); + m->addr = mem_err->physical_addr; + mce_log(&err); } EXPORT_SYMBOL_GPL(apei_mce_report_mem_error); void zx_apei_mce_report_mem_error(struct cper_sec_mem_err *mem_err) { - struct mce m; + struct mce_hw_err err; + struct mce *m; + int apei_error = 0; if (boot_cpu_data.x86 != 7 || boot_cpu_data.x86_model != 91) @@ -74,167 +78,176 @@ void zx_apei_mce_report_mem_error(struct cper_sec_mem_err *mem_err) if (!(mem_err->validation_bits & CPER_MEM_VALID_PA)) return; - mce_setup(&m); - m.misc = 0; - m.misc = mem_err->module; - m.addr = mem_err->physical_addr; + mce_prep_record(&err); + m = &err.m; + m->misc = 0; + m->misc = mem_err->module; + m->addr = mem_err->physical_addr; if (mem_err->card == 0) - m.bank = 9; + m->bank = 9; else - m.bank = 10; + m->bank = 10; switch (mem_err->error_type) { case 2: - m.status = 0x9c20004000010080; + m->status = 0x9c20004000010080; break; case 3: - m.status = 0xbe40000000020090; - apei_error = apei_write_mce(&m); + m->status = 0xbe40000000020090; + apei_error = apei_write_mce(m); break; case 8: if (mem_err->requestor_id == 2) { - m.status = 0x98200040000400b0; + m->status = 0x98200040000400b0; } else if (mem_err->requestor_id == 3) { - m.status = 0xba400000000600a0; - apei_error = apei_write_mce(&m); + m->status = 0xba400000000600a0; + apei_error = apei_write_mce(m); } else if (mem_err->requestor_id == 4) { - m.status = 0x98200100000300b0; + m->status = 0x98200100000300b0; } else if 
(mem_err->requestor_id == 5) { - m.status = 0xba000000000500b0; - apei_error = apei_write_mce(&m); + m->status = 0xba000000000500b0; + apei_error = apei_write_mce(m); } else { pr_info("Undefined Parity error\n"); } break; case 10: if (mem_err->requestor_id == 6) { - m.status = 0xba400000000700a0; - apei_error = apei_write_mce(&m); + m->status = 0xba400000000700a0; + apei_error = apei_write_mce(m); } else if (mem_err->requestor_id == 7) { - m.status = 0xba000000000800b0; - apei_error = apei_write_mce(&m); + m->status = 0xba000000000800b0; + apei_error = apei_write_mce(m); } else { pr_info("Undefined dvad error\n"); } break; case 13: - m.status = 0x9c200040000100c0; + m->status = 0x9c200040000100c0; break; case 14: - m.status = 0xbd000000000200c0; - apei_error = apei_write_mce(&m); + m->status = 0xbd000000000200c0; + apei_error = apei_write_mce(m); break; } - mce_log(&m); + mce_log(&err); } EXPORT_SYMBOL_GPL(zx_apei_mce_report_mem_error); void zx_apei_mce_report_pcie_error(int severity, struct cper_sec_pcie *pcie_err) { - struct mce m; + struct mce_hw_err err; + struct mce *m; + int apei_error = 0; if (boot_cpu_data.x86 != 7 || boot_cpu_data.x86_model != 91) return; - mce_setup(&m); - m.addr = 0; - m.misc = 0; - m.misc |= (u64)pcie_err->device_id.segment << 32; - m.misc |= pcie_err->device_id.bus << 24; - m.misc |= pcie_err->device_id.device << 19; - m.misc |= pcie_err->device_id.function << 16; - m.bank = 6; + mce_prep_record(&err); + m = &err.m; + m->addr = 0; + m->misc = 0; + m->misc |= (u64)pcie_err->device_id.segment << 32; + m->misc |= pcie_err->device_id.bus << 24; + m->misc |= pcie_err->device_id.device << 19; + m->misc |= pcie_err->device_id.function << 16; + m->bank = 6; switch (severity) { case 1: - m.status = 0x9820004000020e0b; + m->status = 0x9820004000020e0b; break; case 2: - m.status = 0xba20000000010e0b; + m->status = 0xba20000000010e0b; break; case 3: - m.status = 0xbd20000000000e0b; - apei_error = apei_write_mce(&m); + m->status = 0xbd20000000000e0b; + apei_error = apei_write_mce(m); break; default: pr_info("Undefine pcie error\n"); break; } - mce_log(&m); + mce_log(&err); } EXPORT_SYMBOL_GPL(zx_apei_mce_report_pcie_error); void zx_apei_mce_report_zdi_error(struct cper_sec_proc_generic *zdi_err) { - struct mce m; + struct mce_hw_err err; + struct mce *m; + int apei_error = 0; if (boot_cpu_data.x86 != 7 || boot_cpu_data.x86_model != 91) return; - mce_setup(&m); - m.misc = 0; - m.misc |= (zdi_err->requestor_id & 0xff) << 19; - m.misc |= ((zdi_err->requestor_id & 0xff00) >> 8) >> 24; - m.bank = 5; + mce_prep_record(&err); + m = &err.m; + m->misc = 0; + m->misc |= (zdi_err->requestor_id & 0xff) << 19; + m->misc |= ((zdi_err->requestor_id & 0xff00) >> 8) >> 24; + m->bank = 5; switch (zdi_err->responder_id) { case 2: - m.status = 0xba00000000040e0f; - apei_error = apei_write_mce(&m); + m->status = 0xba00000000040e0f; + apei_error = apei_write_mce(m); break; case 3: - m.status = 0xba00000000030e0f; - apei_error = apei_write_mce(&m); + m->status = 0xba00000000030e0f; + apei_error = apei_write_mce(m); break; case 4: - m.status = 0xba00000000020e0f; - apei_error = apei_write_mce(&m); + m->status = 0xba00000000020e0f; + apei_error = apei_write_mce(m); break; case 5: - m.status = 0xba00000000010e0f; - apei_error = apei_write_mce(&m); + m->status = 0xba00000000010e0f; + apei_error = apei_write_mce(m); break; case 6: - m.status = 0x9820004000090e0f; + m->status = 0x9820004000090e0f; break; case 7: - m.status = 0x9820004000080e0f; + m->status = 0x9820004000080e0f; break; case 8: - 
m.status = 0x9820004000070e0f; + m->status = 0x9820004000070e0f; break; case 9: - m.status = 0x9820004000060e0f; + m->status = 0x9820004000060e0f; break; case 10: - m.status = 0x9820004000050e0f; + m->status = 0x9820004000050e0f; break; case 11: case 12: case 13: case 14: case 15: - m.status = 0x98200040000b0e0f; + m->status = 0x98200040000b0e0f; break; case 16: case 17: case 18: - m.status = 0x98200040000c0e0f; + m->status = 0x98200040000c0e0f; break; default: pr_info("Undefined ZDI Error\n"); break; } - mce_log(&m); + mce_log(&err); } EXPORT_SYMBOL_GPL(zx_apei_mce_report_zdi_error); int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id) { const u64 *i_mce = ((const u64 *) (ctx_info + 1)); + bool apicid_found = false; + struct mce_hw_err err; unsigned int cpu; - struct mce m; + struct mce *m; if (!boot_cpu_has(X86_FEATURE_SMCA)) return -EINVAL; @@ -264,29 +277,30 @@ int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id) if (ctx_info->reg_arr_size < 48) return -EINVAL; - mce_setup(&m); - - m.extcpu = -1; - m.socketid = -1; - for_each_possible_cpu(cpu) { if (cpu_data(cpu).topo.initial_apicid == lapic_id) { - m.extcpu = cpu; - m.socketid = cpu_data(m.extcpu).topo.pkg_id; + apicid_found = true; break; } } - m.apicid = lapic_id; - m.bank = (ctx_info->msr_addr >> 4) & 0xFF; - m.status = *i_mce; - m.addr = *(i_mce + 1); - m.misc = *(i_mce + 2); + if (!apicid_found) + return -EINVAL; + + m = &err.m; + memset(&err, 0, sizeof(struct mce_hw_err)); + mce_prep_record_common(m); + mce_prep_record_per_cpu(cpu, m); + + m->bank = (ctx_info->msr_addr >> 4) & 0xFF; + m->status = *i_mce; + m->addr = *(i_mce + 1); + m->misc = *(i_mce + 2); /* Skipping MCA_CONFIG */ - m.ipid = *(i_mce + 4); - m.synd = *(i_mce + 5); + m->ipid = *(i_mce + 4); + m->synd = *(i_mce + 5); - mce_log(&m); + mce_log(&err); return 0; } diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index ba05002df26242b3d9d045f6724b81e00606ce8a..947adbc6ed2b497160e552cbed51bd286fd6cf3e 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -46,7 +46,7 @@ #include #include -#include +#include #include #include #include @@ -86,7 +86,7 @@ struct mca_config mca_cfg __read_mostly = { .monarch_timeout = -1 }; -static DEFINE_PER_CPU(struct mce, mces_seen); +static DEFINE_PER_CPU(struct mce_hw_err, hw_errs_seen); static unsigned long mce_need_notify; /* @@ -115,28 +115,41 @@ static struct irq_work mce_irq_work; */ BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain); -/* Do initial initialization of a struct mce */ -void mce_setup(struct mce *m) +void mce_prep_record_common(struct mce *m) { - memset(m, 0, sizeof(struct mce)); - m->cpu = m->extcpu = smp_processor_id(); + m->cpuid = cpuid_eax(1); + m->cpuvendor = boot_cpu_data.x86_vendor; + m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP); /* need the internal __ version to avoid deadlocks */ - m->time = __ktime_get_real_seconds(); - m->cpuvendor = boot_cpu_data.x86_vendor; - m->cpuid = cpuid_eax(1); - m->socketid = cpu_data(m->extcpu).topo.pkg_id; - m->apicid = cpu_data(m->extcpu).topo.initial_apicid; - m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP); - m->ppin = cpu_data(m->extcpu).ppin; - m->microcode = boot_cpu_data.microcode; + m->time = __ktime_get_real_seconds(); +} + +void mce_prep_record_per_cpu(unsigned int cpu, struct mce *m) +{ + m->cpu = cpu; + m->extcpu = cpu; + m->apicid = cpu_data(cpu).topo.initial_apicid; + m->microcode = cpu_data(cpu).microcode; + m->ppin = topology_ppin(cpu); + m->socketid = 
topology_physical_package_id(cpu); +} + +/* Do initial initialization of struct mce_hw_err */ +void mce_prep_record(struct mce_hw_err *err) +{ + struct mce *m = &err->m; + + memset(err, 0, sizeof(struct mce_hw_err)); + mce_prep_record_common(m); + mce_prep_record_per_cpu(smp_processor_id(), m); } DEFINE_PER_CPU(struct mce, injectm); EXPORT_PER_CPU_SYMBOL_GPL(injectm); -void mce_log(struct mce *m) +void mce_log(struct mce_hw_err *err) { - if (!mce_gen_pool_add(m)) + if (mce_gen_pool_add(err)) irq_work_queue(&mce_irq_work); } EXPORT_SYMBOL_GPL(mce_log); @@ -157,8 +170,10 @@ void mce_unregister_decode_chain(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(mce_unregister_decode_chain); -static void __print_mce(struct mce *m) +static void __print_mce(struct mce_hw_err *err) { + struct mce *m = &err->m; + pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n", m->extcpu, (m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""), @@ -200,9 +215,11 @@ static void __print_mce(struct mce *m) m->microcode); } -static void print_mce(struct mce *m) +static void print_mce(struct mce_hw_err *err) { - __print_mce(m); + struct mce *m = &err->m; + + __print_mce(err); if (m->cpuvendor != X86_VENDOR_AMD && m->cpuvendor != X86_VENDOR_HYGON) pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n"); @@ -229,7 +246,7 @@ static void wait_for_panic(void) panic("Panicing machine check CPU died"); } -static noinstr void mce_panic(const char *msg, struct mce *final, char *exp) +static noinstr void mce_panic(const char *msg, struct mce_hw_err *final, char *exp) { struct llist_node *pending; struct mce_evt_llist *l; @@ -260,20 +277,22 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp) pending = mce_gen_pool_prepare_records(); /* First print corrected ones that are still unlogged */ llist_for_each_entry(l, pending, llnode) { - struct mce *m = &l->mce; + struct mce_hw_err *err = &l->err; + struct mce *m = &err->m; if (!(m->status & MCI_STATUS_UC)) { - print_mce(m); + print_mce(err); if (!apei_err) apei_err = apei_write_mce(m); } } /* Now print uncorrected but with the final one last */ llist_for_each_entry(l, pending, llnode) { - struct mce *m = &l->mce; + struct mce_hw_err *err = &l->err; + struct mce *m = &err->m; if (!(m->status & MCI_STATUS_UC)) continue; - if (!final || mce_cmp(m, final)) { - print_mce(m); + if (!final || mce_cmp(m, &final->m)) { + print_mce(err); if (!apei_err) apei_err = apei_write_mce(m); } @@ -281,7 +300,7 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp) if (final) { print_mce(final); if (!apei_err) - apei_err = apei_write_mce(final); + apei_err = apei_write_mce(&final->m); } if (exp) pr_emerg(HW_ERR "Machine check: %s\n", exp); @@ -296,8 +315,8 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp) * panic. */ if (kexec_crash_loaded()) { - if (final && (final->status & MCI_STATUS_ADDRV)) { - p = pfn_to_online_page(final->addr >> PAGE_SHIFT); + if (final && (final->m.status & MCI_STATUS_ADDRV)) { + p = pfn_to_online_page(final->m.addr >> PAGE_SHIFT); if (p) SetPageHWPoison(p); } @@ -384,7 +403,7 @@ noinstr u64 mce_rdmsrl(u32 msr) return EAX_EDX_VAL(val, low, high); } -static noinstr void mce_wrmsrl(u32 msr, u64 v) +noinstr void mce_wrmsrq(u32 msr, u64 v) { u32 low, high; @@ -417,16 +436,18 @@ static noinstr void mce_wrmsrl(u32 msr, u64 v) * check into our "mce" struct so that we can use it later to assess * the severity of the problem as we read per-bank specific details. 
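+ *
+ * The data is gathered into a struct mce_hw_err wrapper so the same
+ * record can later be handed to mce_log() and the decoder chain.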
*/ -static noinstr void mce_gather_info(struct mce *m, struct pt_regs *regs) +static noinstr void mce_gather_info(struct mce_hw_err *err, struct pt_regs *regs) { + struct mce *m; /* - * Enable instrumentation around mce_setup() which calls external + * Enable instrumentation around mce_prep_record() which calls external * facilities. */ instrumentation_begin(); - mce_setup(m); + mce_prep_record(err); instrumentation_end(); + m = &err->m; m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); if (regs) { /* @@ -558,13 +579,13 @@ EXPORT_SYMBOL_GPL(mce_is_correctable); static int mce_early_notifier(struct notifier_block *nb, unsigned long val, void *data) { - struct mce *m = (struct mce *)data; + struct mce_hw_err *err = to_mce_hw_err(data); - if (!m) + if (!err) return NOTIFY_DONE; /* Emit the trace record: */ - trace_mce_record(m); + trace_mce_record(err); set_bit(0, &mce_need_notify); @@ -608,13 +629,13 @@ static struct notifier_block mce_uc_nb = { static int mce_default_notifier(struct notifier_block *nb, unsigned long val, void *data) { - struct mce *m = (struct mce *)data; + struct mce_hw_err *err = to_mce_hw_err(data); - if (!m) + if (!err) return NOTIFY_DONE; - if (mca_cfg.print_all || !m->kflags) - __print_mce(m); + if (mca_cfg.print_all || !(err->m.kflags)) + __print_mce(err); return NOTIFY_DONE; } @@ -628,8 +649,10 @@ static struct notifier_block mce_default_nb = { /* * Read ADDR and MISC registers. */ -static noinstr void mce_read_aux(struct mce *m, int i) +static noinstr void mce_read_aux(struct mce_hw_err *err, int i) { + struct mce *m = &err->m; + if (m->status & MCI_STATUS_MISCV) m->misc = mce_rdmsrl(mca_msr_reg(i, MCA_MISC)); @@ -658,6 +681,60 @@ static noinstr void mce_read_aux(struct mce *m, int i) DEFINE_PER_CPU(unsigned, mce_poll_count); +/* + * Newer Intel systems that support software error + * recovery need to make additional checks. Other + * CPUs should skip over uncorrected errors, but log + * everything else. + */ +static bool ser_should_log_poll_error(struct mce *m) +{ + /* Log "not enabled" (speculative) errors */ + if (!(m->status & MCI_STATUS_EN)) + return true; + + /* + * Log UCNA (SDM: 15.6.3 "UCR Error Classification") + * UC == 1 && PCC == 0 && S == 0 + */ + if (!(m->status & MCI_STATUS_PCC) && !(m->status & MCI_STATUS_S)) + return true; + + return false; +} + +static bool should_log_poll_error(enum mcp_flags flags, struct mce_hw_err *err) +{ + struct mce *m = &err->m; + + /* If this entry is not valid, ignore it. */ + if (!(m->status & MCI_STATUS_VAL)) + return false; + + /* + * If we are logging everything (at CPU online) or this + * is a corrected error, then we must log it. + */ + if ((flags & MCP_UC) || !(m->status & MCI_STATUS_UC)) + return true; + + if (mca_cfg.ser) + return ser_should_log_poll_error(m); + + if (m->status & MCI_STATUS_UC) + return false; + + return true; +} + +static void clear_bank(struct mce *m) +{ + if (m->cpuvendor == X86_VENDOR_AMD) + return amd_clear_bank(m); + + mce_wrmsrq(mca_msr_reg(m->bank, MCA_STATUS), 0); +} + /* * Poll for corrected events or events that happened before reset. * Those are just logged through /dev/mcelog. @@ -673,98 +750,63 @@ DEFINE_PER_CPU(unsigned, mce_poll_count); * is already totally * confused. In this case it's likely it will * not fully execute the machine check handler either. 
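+ *
+ * CMCI storm tracking is fed from here as well: unless CMCI is
+ * disabled, every scanned bank is reported to mce_track_storm(),
+ * whether or not it holds a valid error.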
 */
-bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
+void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 {
 	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
-	bool error_seen = false;
-	struct mce m;
+	struct mce_hw_err err;
+	struct mce *m;
 	int i;
 
 	this_cpu_inc(mce_poll_count);
 
-	mce_gather_info(&m, NULL);
+	mce_gather_info(&err, NULL);
+	m = &err.m;
 
 	if (flags & MCP_TIMESTAMP)
-		m.tsc = rdtsc();
+		m->tsc = rdtsc();
 
 	for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
 		if (!mce_banks[i].ctl || !test_bit(i, *b))
 			continue;
 
-		m.misc = 0;
-		m.addr = 0;
-		m.bank = i;
+		m->misc = 0;
+		m->addr = 0;
+		m->bank = i;
 
 		barrier();
-		m.status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS));
-
-		/* If this entry is not valid, ignore it */
-		if (!(m.status & MCI_STATUS_VAL))
-			continue;
-
-		/*
-		 * If we are logging everything (at CPU online) or this
-		 * is a corrected error, then we must log it.
-		 */
-		if ((flags & MCP_UC) || !(m.status & MCI_STATUS_UC))
-			goto log_it;
-
-		/*
-		 * Newer Intel systems that support software error
-		 * recovery need to make additional checks. Other
-		 * CPUs should skip over uncorrected errors, but log
-		 * everything else.
-		 */
-		if (!mca_cfg.ser) {
-			if (m.status & MCI_STATUS_UC)
-				continue;
-			goto log_it;
-		}
-
-		/* Log "not enabled" (speculative) errors */
-		if (!(m.status & MCI_STATUS_EN))
-			goto log_it;
-
-		/*
-		 * Log UCNA (SDM: 15.6.3 "UCR Error Classification")
-		 * UC == 1 && PCC == 0 && S == 0
-		 */
-		if (!(m.status & MCI_STATUS_PCC) && !(m.status & MCI_STATUS_S))
-			goto log_it;
+		m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS));
 
 		/*
-		 * Skip anything else. Presumption is that our read of this
-		 * bank is racing with a machine check. Leave the log alone
-		 * for do_machine_check() to deal with it.
+		 * Update storm tracking here, before checking for the
+		 * MCI_STATUS_VAL bit. Valid corrected errors count
+		 * towards declaring, or maintaining, storm status. No
+		 * error in a bank counts towards avoiding, or ending,
+		 * storm status.
 		 */
-		continue;
+		if (!mca_cfg.cmci_disabled)
+			mce_track_storm(m);
 
-log_it:
-		error_seen = true;
-
-		if (flags & MCP_DONTLOG)
-			goto clear_it;
+		/* Verify that the error should be logged based on hardware conditions. */
+		if (!should_log_poll_error(flags, &err))
+			continue;
 
-		mce_read_aux(&m, i);
-		m.severity = mce_severity(&m, NULL, NULL, false);
+		mce_read_aux(&err, i);
+		m->severity = mce_severity(m, NULL, NULL, false);
 
 		/*
 		 * Don't get the IP here because it's unlikely to
 		 * have anything to do with the actual error location.
 		 */
-		if (mca_cfg.dont_log_ce && !mce_usable_address(&m))
+		if (mca_cfg.dont_log_ce && !mce_usable_address(m))
 			goto clear_it;
 
 		if (flags & MCP_QUEUE_LOG)
-			mce_gen_pool_add(&m);
+			mce_gen_pool_add(&err);
 		else
-			mce_log(&m);
+			mce_log(&err);
 
 clear_it:
-		/*
-		 * Clear state for this bank.
- */ - mce_wrmsrl(mca_msr_reg(i, MCA_STATUS), 0); + clear_bank(m); } /* @@ -773,8 +815,6 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) */ sync_core(); - - return error_seen; } EXPORT_SYMBOL_GPL(machine_check_poll); @@ -847,8 +887,8 @@ static noinstr bool quirk_skylake_repmov(void) MCI_STATUS_ADDRV | MCI_STATUS_MISCV | MCI_STATUS_AR | MCI_STATUS_S)) { misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING; - mce_wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable); - mce_wrmsrl(MSR_IA32_MCx_STATUS(1), 0); + mce_wrmsrq(MSR_IA32_MISC_ENABLE, misc_enable); + mce_wrmsrq(MSR_IA32_MCx_STATUS(1), 0); instrumentation_begin(); pr_err_once("Erratum detected, disable fast string copy instructions.\n"); @@ -884,9 +924,10 @@ static __always_inline void quirk_zen_ifu(int bank, struct mce *m, struct pt_reg * Do a quick check if any of the events requires a panic. * This decides if we keep the events around or clear them. */ -static __always_inline int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, +static __always_inline int mce_no_way_out(struct mce_hw_err *err, char **msg, unsigned long *validp, struct pt_regs *regs) { + struct mce *m = &err->m; char *tmp = *msg; int i; @@ -904,7 +945,7 @@ static __always_inline int mce_no_way_out(struct mce *m, char **msg, unsigned lo m->bank = i; if (mce_severity(m, regs, &tmp, true) >= MCE_PANIC_SEVERITY) { - mce_read_aux(m, i); + mce_read_aux(err, i); *msg = tmp; return 1; } @@ -995,10 +1036,11 @@ static noinstr int mce_timed_out(u64 *t, const char *msg) */ static void mce_reign(void) { - int cpu; + struct mce_hw_err *err = NULL; struct mce *m = NULL; int global_worst = 0; char *msg = NULL; + int cpu; /* * This CPU is the Monarch and the other CPUs have run @@ -1006,11 +1048,13 @@ static void mce_reign(void) * Grade the severity of the errors of all the CPUs. */ for_each_possible_cpu(cpu) { - struct mce *mtmp = &per_cpu(mces_seen, cpu); + struct mce_hw_err *etmp = &per_cpu(hw_errs_seen, cpu); + struct mce *mtmp = &etmp->m; if (mtmp->severity > global_worst) { global_worst = mtmp->severity; - m = &per_cpu(mces_seen, cpu); + err = &per_cpu(hw_errs_seen, cpu); + m = &err->m; } } @@ -1022,7 +1066,7 @@ static void mce_reign(void) if (m && global_worst >= MCE_PANIC_SEVERITY) { /* call mce_severity() to get "msg" for panic */ mce_severity(m, NULL, &msg, true); - mce_panic("Fatal machine check", m, msg); + mce_panic("Fatal machine check", err, msg); } /* @@ -1039,11 +1083,11 @@ static void mce_reign(void) mce_panic("Fatal machine check from unknown source", NULL, NULL); /* - * Now clear all the mces_seen so that they don't reappear on + * Now clear all the hw_errs_seen so that they don't reappear on * the next mce. 
*/ for_each_possible_cpu(cpu) - memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce)); + memset(&per_cpu(hw_errs_seen, cpu), 0, sizeof(struct mce_hw_err)); } static atomic_t global_nwo; @@ -1207,7 +1251,7 @@ static __always_inline void mce_clear_state(unsigned long *toclear) for (i = 0; i < this_cpu_read(mce_num_banks); i++) { if (arch_test_bit(i, toclear)) - mce_wrmsrl(mca_msr_reg(i, MCA_STATUS), 0); + mce_wrmsrq(mca_msr_reg(i, MCA_STATUS), 0); } } @@ -1248,13 +1292,14 @@ static noinstr bool mce_check_crashing_cpu(void) } static __always_inline int -__mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final, - unsigned long *toclear, unsigned long *valid_banks, int no_way_out, - int *worst) +__mc_scan_banks(struct mce_hw_err *err, struct pt_regs *regs, + struct mce_hw_err *final, unsigned long *toclear, + unsigned long *valid_banks, int no_way_out, int *worst) { struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); struct mca_config *cfg = &mca_cfg; int severity, i, taint = 0; + struct mce *m = &err->m; for (i = 0; i < this_cpu_read(mce_num_banks); i++) { arch___clear_bit(i, toclear); @@ -1299,7 +1344,7 @@ __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final, if (severity == MCE_NO_SEVERITY) continue; - mce_read_aux(m, i); + mce_read_aux(err, i); /* assuming valid severity level != 0 */ m->severity = severity; @@ -1309,17 +1354,17 @@ __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final, * done in #MC context, where instrumentation is disabled. */ instrumentation_begin(); - mce_log(m); + mce_log(err); instrumentation_end(); if (severity > *worst) { - *final = *m; + *final = *err; *worst = severity; } } /* mce_clear_state will clear *final, save locally for use later */ - *m = *final; + *err = *final; return taint; } @@ -1379,9 +1424,10 @@ static void kill_me_never(struct callback_head *cb) set_mce_nospec(pfn); } -static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *)) +static void queue_task_work(struct mce_hw_err *err, char *msg, void (*func)(struct callback_head *)) { int count = ++current->mce_count; + struct mce *m = &err->m; /* First call, save all the details */ if (count == 1) { @@ -1394,11 +1440,12 @@ static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callba /* Ten is likely overkill. 
Don't expect more than two faults before task_work() */ if (count > 10) - mce_panic("Too many consecutive machine checks while accessing user data", m, msg); + mce_panic("Too many consecutive machine checks while accessing user data", + err, msg); /* Second or later call, make sure page address matches the one from first call */ if (count > 1 && (current->mce_addr >> PAGE_SHIFT) != (m->addr >> PAGE_SHIFT)) - mce_panic("Consecutive machine checks to different user pages", m, msg); + mce_panic("Consecutive machine checks to different user pages", err, msg); /* Do not call task_work_add() more than once */ if (count > 1) @@ -1447,8 +1494,10 @@ noinstr void do_machine_check(struct pt_regs *regs) int worst = 0, order, no_way_out, kill_current_task, lmce, taint = 0; DECLARE_BITMAP(valid_banks, MAX_NR_BANKS) = { 0 }; DECLARE_BITMAP(toclear, MAX_NR_BANKS) = { 0 }; - struct mce m, *final; + struct mce_hw_err *final; + struct mce_hw_err err; char *msg = NULL; + struct mce *m; if (unlikely(mce_flags.p5)) return pentium_machine_check(regs); @@ -1486,13 +1535,14 @@ noinstr void do_machine_check(struct pt_regs *regs) this_cpu_inc(mce_exception_count); - mce_gather_info(&m, regs); - m.tsc = rdtsc(); + mce_gather_info(&err, regs); + m = &err.m; + m->tsc = rdtsc(); - final = this_cpu_ptr(&mces_seen); - *final = m; + final = this_cpu_ptr(&hw_errs_seen); + *final = err; - no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs); + no_way_out = mce_no_way_out(&err, &msg, valid_banks, regs); barrier(); @@ -1501,16 +1551,16 @@ noinstr void do_machine_check(struct pt_regs *regs) * Assume the worst for now, but if we find the * severity is MCE_AR_SEVERITY we have other options. */ - if (!(m.mcgstatus & MCG_STATUS_RIPV)) + if (!(m->mcgstatus & MCG_STATUS_RIPV)) kill_current_task = 1; /* * Check if this MCE is signaled to only this logical processor, * on Intel, Zhaoxin only. */ - if (m.cpuvendor == X86_VENDOR_INTEL || - m.cpuvendor == X86_VENDOR_CENTAUR || - m.cpuvendor == X86_VENDOR_ZHAOXIN) - lmce = m.mcgstatus & MCG_STATUS_LMCES; + if (m->cpuvendor == X86_VENDOR_INTEL || + m->cpuvendor == X86_VENDOR_CENTAUR || + m->cpuvendor == X86_VENDOR_ZHAOXIN) + lmce = m->mcgstatus & MCG_STATUS_LMCES; /* * Local machine check may already know that we have to panic. @@ -1521,12 +1571,12 @@ noinstr void do_machine_check(struct pt_regs *regs) */ if (lmce) { if (no_way_out) - mce_panic("Fatal local machine check", &m, msg); + mce_panic("Fatal local machine check", &err, msg); } else { order = mce_start(&no_way_out); } - taint = __mc_scan_banks(&m, regs, final, toclear, valid_banks, no_way_out, &worst); + taint = __mc_scan_banks(&err, regs, final, toclear, valid_banks, no_way_out, &worst); if (!no_way_out) mce_clear_state(toclear); @@ -1541,7 +1591,7 @@ noinstr void do_machine_check(struct pt_regs *regs) no_way_out = worst >= MCE_PANIC_SEVERITY; if (no_way_out) - mce_panic("Fatal machine check on current CPU", &m, msg); + mce_panic("Fatal machine check on current CPU", &err, msg); } } else { /* @@ -1553,8 +1603,8 @@ noinstr void do_machine_check(struct pt_regs *regs) * make sure we have the right "msg". 
*/ if (worst >= MCE_PANIC_SEVERITY) { - mce_severity(&m, regs, &msg, true); - mce_panic("Local fatal machine check!", &m, msg); + mce_severity(m, regs, &msg, true); + mce_panic("Local fatal machine check!", &err, msg); } } @@ -1572,15 +1622,33 @@ noinstr void do_machine_check(struct pt_regs *regs) goto out; /* Fault was in user mode and we need to take some action */ - if ((m.cs & 3) == 3) { + if ((m->cs & 3) == 3) { /* If this triggers there is no way to recover. Die hard. */ BUG_ON(!on_thread_stack() || !user_mode(regs)); - if (!mce_usable_address(&m)) - queue_task_work(&m, msg, kill_me_now); + if (!mce_usable_address(m)) + queue_task_work(&err, msg, kill_me_now); else - queue_task_work(&m, msg, kill_me_maybe); + queue_task_work(&err, msg, kill_me_maybe); + } else if (m->mcgstatus & MCG_STATUS_SEAM_NR) { + /* + * Saved RIP on stack makes it look like the machine check + * was taken in the kernel on the instruction following + * the entry to SEAM mode. But MCG_STATUS_SEAM_NR indicates + * that the machine check was taken inside SEAM non-root + * mode. CPU core has already marked that guest as dead. + * It is OK for the kernel to resume execution at the + * apparent point of the machine check as the fault did + * not occur there. Mark the page as poisoned so it won't + * be added to free list when the guest is terminated. + */ + if (mce_usable_address(m)) { + struct page *p = pfn_to_online_page(m->addr >> PAGE_SHIFT); + + if (p) + SetPageHWPoison(p); + } } else { /* * Handle an MCE which has happened in kernel space but from @@ -1591,20 +1659,20 @@ noinstr void do_machine_check(struct pt_regs *regs) * corresponding exception handler which would do that is the * proper one. */ - if (m.kflags & MCE_IN_KERNEL_RECOV) { + if (m->kflags & MCE_IN_KERNEL_RECOV) { if (!fixup_exception(regs, X86_TRAP_MC, 0, 0)) - mce_panic("Failed kernel mode recovery", &m, msg); + mce_panic("Failed kernel mode recovery", &err, msg); } - if (m.kflags & MCE_IN_KERNEL_COPYIN) - queue_task_work(&m, msg, kill_me_never); + if (m->kflags & MCE_IN_KERNEL_COPYIN) + queue_task_work(&err, msg, kill_me_never); } out: instrumentation_end(); clear: - mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); + mce_wrmsrq(MSR_IA32_MCG_STATUS, 0); } EXPORT_SYMBOL_GPL(do_machine_check); @@ -1631,13 +1699,6 @@ static unsigned long check_interval = INITIAL_CHECK_INTERVAL; static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */ static DEFINE_PER_CPU(struct timer_list, mce_timer); -static unsigned long mce_adjust_timer_default(unsigned long interval) -{ - return interval; -} - -static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default; - static void __start_timer(struct timer_list *t, unsigned long interval) { unsigned long when = jiffies + interval; @@ -1658,6 +1719,11 @@ static void mc_poll_banks_default(void) void (*mc_poll_banks)(void) = mc_poll_banks_default; +static bool should_enable_timer(unsigned long iv) +{ + return !mca_cfg.ignore_ce && iv; +} + static void mce_timer_fn(struct timer_list *t) { struct timer_list *cpu_t = this_cpu_ptr(&mce_timer); @@ -1667,15 +1733,9 @@ static void mce_timer_fn(struct timer_list *t) iv = __this_cpu_read(mce_next_interval); - if (mce_available(this_cpu_ptr(&cpu_info))) { + if (mce_available(this_cpu_ptr(&cpu_info))) mc_poll_banks(); - if (mce_intel_cmci_poll()) { - iv = mce_adjust_timer(iv); - goto done; - } - } - /* * Alert userspace if needed. If we logged an MCE, reduce the polling * interval, otherwise increase the polling interval. 
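	 *
	 * Storm mode bypasses this adaptive interval: while a storm is in
	 * progress the timer is re-armed at a fixed one second period
	 * until the storm is declared over.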
@@ -1685,23 +1745,29 @@ static void mce_timer_fn(struct timer_list *t) else iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); -done: - __this_cpu_write(mce_next_interval, iv); - __start_timer(t, iv); + if (mce_get_storm_mode()) { + __start_timer(t, HZ); + } else if (should_enable_timer(iv)) { + __this_cpu_write(mce_next_interval, iv); + __start_timer(t, iv); + } } /* - * Ensure that the timer is firing in @interval from now. + * When a storm starts on any bank on this CPU, switch to polling + * once per second. When the storm ends, revert to the default + * polling interval. */ -void mce_timer_kick(unsigned long interval) +void mce_timer_kick(bool storm) { struct timer_list *t = this_cpu_ptr(&mce_timer); - unsigned long iv = __this_cpu_read(mce_next_interval); - __start_timer(t, interval); + mce_set_storm_mode(storm); - if (interval < iv) - __this_cpu_write(mce_next_interval, interval); + if (storm) + __start_timer(t, HZ); + else + __this_cpu_write(mce_next_interval, check_interval * HZ); } /* Must not be called in IRQ context where del_timer_sync() can deadlock */ @@ -1745,9 +1811,10 @@ static void __mcheck_cpu_mce_banks_init(void) struct mce_bank *b = &mce_banks[i]; /* - * Init them all, __mcheck_cpu_apply_quirks() is going to apply - * the required vendor quirks before - * __mcheck_cpu_init_clear_banks() does the final bank setup. + * Init them all by default. + * + * The required vendor quirks will be applied before + * __mcheck_cpu_init_prepare_banks() does the final bank setup. */ b->ctl = -1ULL; b->init = true; @@ -1775,69 +1842,34 @@ static void __mcheck_cpu_cap_init(void) this_cpu_write(mce_num_banks, b); __mcheck_cpu_mce_banks_init(); - - /* Use accurate RIP reporting if available. */ - if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) - mca_cfg.rip_msr = MSR_IA32_MCG_EIP; - - if (cap & MCG_SER_P) - mca_cfg.ser = 1; } static void __mcheck_cpu_init_generic(void) { - enum mcp_flags m_fl = 0; - mce_banks_t all_banks; u64 cap; - if (!mca_cfg.bootlog) - m_fl = MCP_DONTLOG; - - /* - * Log the machine checks left over from the previous reset. Log them - * only, do not start processing them. That will happen in mcheck_late_init() - * when all consumers have been registered on the notifier chain. - */ - bitmap_fill(all_banks, MAX_NR_BANKS); - machine_check_poll(MCP_UC | MCP_QUEUE_LOG | m_fl, &all_banks); - - cr4_set_bits(X86_CR4_MCE); - rdmsrl(MSR_IA32_MCG_CAP, cap); if (cap & MCG_CTL_P) wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); } -static void __mcheck_cpu_init_clear_banks(void) +static void __mcheck_cpu_init_prepare_banks(void) { struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); + u64 msrval; int i; - for (i = 0; i < this_cpu_read(mce_num_banks); i++) { - struct mce_bank *b = &mce_banks[i]; + /* + * Log the machine checks left over from the previous reset. Log them + * only, do not start processing them. That will happen in mcheck_late_init() + * when all consumers have been registered on the notifier chain. + */ + if (mca_cfg.bootlog) { + mce_banks_t all_banks; - if (!b->init) - continue; - wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl); - wrmsrl(mca_msr_reg(i, MCA_STATUS), 0); + bitmap_fill(all_banks, MAX_NR_BANKS); + machine_check_poll(MCP_UC | MCP_QUEUE_LOG, &all_banks); } -} - -/* - * Do a final check to see if there are any unused/RAZ banks. - * - * This must be done after the banks have been initialized and any quirks have - * been applied. - * - * Do not call this from any user-initiated flows, e.g. CPU hotplug or sysfs. 
- * Otherwise, a user who disables a bank will not be able to re-enable it - * without a system reboot. - */ -static void __mcheck_cpu_check_banks(void) -{ - struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); - u64 msrval; - int i; for (i = 0; i < this_cpu_read(mce_num_banks); i++) { struct mce_bank *b = &mce_banks[i]; @@ -1846,115 +1878,75 @@ static void __mcheck_cpu_check_banks(void) continue; rdmsrl(mca_msr_reg(i, MCA_CTL), msrval); + wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl); + wrmsrl(mca_msr_reg(i, MCA_STATUS), 0); + b->init = !!msrval; } } -/* Add per CPU specific workarounds here */ -static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) +static void amd_apply_global_quirks(struct cpuinfo_x86 *c) { - struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); - struct mca_config *cfg = &mca_cfg; - - if (c->x86_vendor == X86_VENDOR_UNKNOWN) { - pr_info("unknown CPU type - not enabling MCE support\n"); - return -EOPNOTSUPP; - } - - /* This should be disabled by the BIOS, but isn't always */ - if (c->x86_vendor == X86_VENDOR_AMD) { - if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) { - /* - * disable GART TBL walk error reporting, which - * trips off incorrectly with the IOMMU & 3ware - * & Cerberus: - */ - clear_bit(10, (unsigned long *)&mce_banks[4].ctl); - } - if (c->x86 < 0x11 && cfg->bootlog < 0) { - /* - * Lots of broken BIOS around that don't clear them - * by default and leave crap in there. Don't log: - */ - cfg->bootlog = 0; - } - /* - * Various K7s with broken bank 0 around. Always disable - * by default. - */ - if (c->x86 == 6 && this_cpu_read(mce_num_banks) > 0) - mce_banks[0].ctl = 0; - + if (c->x86 < 0x11 && mca_cfg.bootlog < 0) { /* - * overflow_recov is supported for F15h Models 00h-0fh - * even though we don't have a CPUID bit for it. + * Lots of broken BIOS around that don't clear them + * by default and leave crap in there. Don't log: */ - if (c->x86 == 0x15 && c->x86_model <= 0xf) - mce_flags.overflow_recov = 1; - - if (c->x86 >= 0x17 && c->x86 <= 0x1A) - mce_flags.zen_ifu_quirk = 1; - + mca_cfg.bootlog = 0; } - if (c->x86_vendor == X86_VENDOR_INTEL) { - /* - * SDM documents that on family 6 bank 0 should not be written - * because it aliases to another special BIOS controlled - * register. - * But it's not aliased anymore on model 0x1a+ - * Don't ignore bank 0 completely because there could be a - * valid event later, merely don't write CTL0. - */ - - if (c->x86 == 6 && c->x86_model < 0x1A && this_cpu_read(mce_num_banks) > 0) - mce_banks[0].init = false; + /* + * overflow_recov is supported for F15h Models 00h-0fh + * even though we don't have a CPUID bit for it. + */ + if (c->x86 == 0x15 && c->x86_model <= 0xf) + mce_flags.overflow_recov = 1; - /* - * All newer Intel systems support MCE broadcasting. Enable - * synchronization with a one second timeout. - */ - if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && - cfg->monarch_timeout < 0) - cfg->monarch_timeout = USEC_PER_SEC; + if (c->x86 >= 0x17 && c->x86 <= 0x1A) + mce_flags.zen_ifu_quirk = 1; +} - /* - * There are also broken BIOSes on some Pentium M and - * earlier systems: - */ - if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0) - cfg->bootlog = 0; +static void intel_apply_global_quirks(struct cpuinfo_x86 *c) +{ + /* Older CPUs (prior to family 6) don't need quirks. */ + if (c->x86_vfm < INTEL_PENTIUM_PRO) + return; - if (c->x86 == 6 && c->x86_model == 45) - mce_flags.snb_ifu_quirk = 1; + /* + * All newer Intel systems support MCE broadcasting. 
Enable + * synchronization with a one second timeout. + */ + if (c->x86_vfm >= INTEL_CORE_YONAH && mca_cfg.monarch_timeout < 0) + mca_cfg.monarch_timeout = USEC_PER_SEC; - /* - * Skylake, Cascacde Lake and Cooper Lake require a quirk on - * rep movs. - */ - if (c->x86 == 6 && c->x86_model == INTEL_FAM6_SKYLAKE_X) - mce_flags.skx_repmov_quirk = 1; - } + /* + * There are also broken BIOSes on some Pentium M and + * earlier systems: + */ + if (c->x86_vfm < INTEL_CORE_YONAH && mca_cfg.bootlog < 0) + mca_cfg.bootlog = 0; - if (c->x86_vendor == X86_VENDOR_CENTAUR || - c->x86_vendor == X86_VENDOR_ZHAOXIN) { - /* - * All newer Zhaoxin CPUs support MCE broadcasting. Enable - * synchronization with a one second timeout. - */ - if (c->x86 > 6 || (c->x86_model == 0x19 || c->x86_model == 0x1f)) { - if (cfg->monarch_timeout < 0) - cfg->monarch_timeout = USEC_PER_SEC; - } - mca_cfg.bios_cmci_threshold = 1; - } + if (c->x86_vfm == INTEL_SANDYBRIDGE_X) + mce_flags.snb_ifu_quirk = 1; - if (cfg->monarch_timeout < 0) - cfg->monarch_timeout = 0; - if (cfg->bootlog != 0) - cfg->panic_timeout = 30; + /* + * Skylake, Cascacde Lake and Cooper Lake require a quirk on + * rep movs. + */ + if (c->x86_vfm == INTEL_SKYLAKE_X) + mce_flags.skx_repmov_quirk = 1; +} - return 0; +static void zhaoxin_apply_global_quirks(struct cpuinfo_x86 *c) +{ + /* + * All newer Zhaoxin CPUs support MCE broadcasting. Enable + * synchronization with a one second timeout. + */ + if (c->x86 > 6 || (c->x86_model == 0x19 || c->x86_model == 0x1f)) { + if (mca_cfg.monarch_timeout < 0) + mca_cfg.monarch_timeout = USEC_PER_SEC; + } } static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) @@ -1978,19 +1970,6 @@ static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) return 0; } -/* - * Init basic CPU features needed for early decoding of MCEs. 
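The INTEL_* constants compared against c->x86_vfm in these quirk routines come from the vendor/family/model (VFM) scheme: all three identifiers are packed into one integer, so a single comparison, including range checks such as c->x86_vfm < INTEL_PENTIUM_PRO above, replaces separate family and model tests. A sketch of the packing, assuming the layout used by <asm/cpu_device_id.h> (model in bits 7:0, family in bits 15:8, vendor in bits 23:16):

	/* Illustrative only; the kernel's VFM_MAKE()/IFM() macros are authoritative. */
	#define SKETCH_VFM(vendor, family, model) \
		(((vendor) << 16) | ((family) << 8) | (model))

Because vendor and family occupy the high bits, all values for one vendor order naturally by family and then by model, which is what makes the ordered comparisons valid.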
- */ -static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c) -{ - if (c->x86_vendor == X86_VENDOR_AMD || c->x86_vendor == X86_VENDOR_HYGON) { - mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV); - mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR); - mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA); - mce_flags.amd_threshold = 1; - } -} - static void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c) { struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); @@ -2012,7 +1991,6 @@ static void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c) intel_init_cmci(); intel_init_lmce(); - mce_adjust_timer = cmci_intel_adjust_timer; } static void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c) @@ -2025,7 +2003,6 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) switch (c->x86_vendor) { case X86_VENDOR_INTEL: mce_intel_feature_init(c); - mce_adjust_timer = cmci_intel_adjust_timer; break; case X86_VENDOR_AMD: { @@ -2068,11 +2045,10 @@ static void mce_start_timer(struct timer_list *t) { unsigned long iv = check_interval * HZ; - if (mca_cfg.ignore_ce || !iv) - return; - - this_cpu_write(mce_next_interval, iv); - __start_timer(t, iv); + if (should_enable_timer(iv)) { + this_cpu_write(mce_next_interval, iv); + __start_timer(t, iv); + } } static void __mcheck_cpu_setup_timer(void) @@ -2166,6 +2142,53 @@ DEFINE_IDTENTRY_RAW(exc_machine_check) } #endif +void mca_bsp_init(struct cpuinfo_x86 *c) +{ + u64 cap; + + if (!mce_available(c)) + return; + + if (c->x86_vendor == X86_VENDOR_UNKNOWN) { + mca_cfg.disabled = 1; + pr_info("unknown CPU type - not enabling MCE support\n"); + return; + } + + mce_flags.overflow_recov = cpu_feature_enabled(X86_FEATURE_OVERFLOW_RECOV); + mce_flags.succor = cpu_feature_enabled(X86_FEATURE_SUCCOR); + mce_flags.smca = cpu_feature_enabled(X86_FEATURE_SMCA); + + if (mce_flags.smca) + smca_bsp_init(); + + rdmsrl(MSR_IA32_MCG_CAP, cap); + + /* Use accurate RIP reporting if available. */ + if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) + mca_cfg.rip_msr = MSR_IA32_MCG_EIP; + + if (cap & MCG_SER_P) + mca_cfg.ser = 1; + + switch (c->x86_vendor) { + case X86_VENDOR_AMD: + amd_apply_global_quirks(c); + break; + case X86_VENDOR_INTEL: + intel_apply_global_quirks(c); + break; + case X86_VENDOR_ZHAOXIN: + zhaoxin_apply_global_quirks(c); + break; + } + + if (mca_cfg.monarch_timeout < 0) + mca_cfg.monarch_timeout = 0; + if (mca_cfg.bootlog != 0) + mca_cfg.panic_timeout = 30; +} + /* * Called for each booted CPU to set up machine checks. 
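In summary, the split of responsibilities established by this patch (a sketch of the resulting call order, with mca_bsp_init() running once on the boot CPU and mcheck_cpu_init() shown below):

	/*
	 * mca_bsp_init(c)                       boot CPU only:
	 *                                       MCG_CAP bits, mce_flags,
	 *                                       vendor global quirks
	 *
	 * mcheck_cpu_init(c)                    every CPU, preempt off:
	 *   __mcheck_cpu_init_generic()
	 *   __mcheck_cpu_init_vendor(c)         per-CPU vendor features
	 *   __mcheck_cpu_init_prepare_banks()   bootlog, bank CTL/STATUS
	 *   __mcheck_cpu_setup_timer()
	 *   cr4_set_bits(X86_CR4_MCE)           enable #MC last
	 */

Note that CR4.MCE is now set as the final step of all three init paths (boot, syscore resume and restart), only after the banks have been fully prepared.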
* Must be called with preempt off: @@ -2183,12 +2206,7 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) __mcheck_cpu_cap_init(); - if (__mcheck_cpu_apply_quirks(c) < 0) { - mca_cfg.disabled = 1; - return; - } - - if (mce_gen_pool_init()) { + if (!mce_gen_pool_init()) { mca_cfg.disabled = 1; pr_emerg("Couldn't allocate MCE records pool!\n"); return; @@ -2196,12 +2214,11 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) mca_cfg.initialized = 1; - __mcheck_cpu_init_early(c); __mcheck_cpu_init_generic(); __mcheck_cpu_init_vendor(c); - __mcheck_cpu_init_clear_banks(); - __mcheck_cpu_check_banks(); + __mcheck_cpu_init_prepare_banks(); __mcheck_cpu_setup_timer(); + cr4_set_bits(X86_CR4_MCE); } /* @@ -2369,7 +2386,8 @@ static void mce_syscore_resume(void) { __mcheck_cpu_init_generic(); __mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info)); - __mcheck_cpu_init_clear_banks(); + __mcheck_cpu_init_prepare_banks(); + cr4_set_bits(X86_CR4_MCE); } static struct syscore_ops mce_syscore_ops = { @@ -2387,8 +2405,9 @@ static void mce_cpu_restart(void *data) if (!mce_available(raw_cpu_ptr(&cpu_info))) return; __mcheck_cpu_init_generic(); - __mcheck_cpu_init_clear_banks(); + __mcheck_cpu_init_prepare_banks(); __mcheck_cpu_init_timer(); + cr4_set_bits(X86_CR4_MCE); } /* Reinit MCEs after user configuration changes */ @@ -2685,8 +2704,6 @@ static void mce_reenable_cpu(void) static int mce_cpu_dead(unsigned int cpu) { - mce_intel_hcpu_update(cpu); - /* intentionally ignoring frozen here */ if (!cpuhp_tasks_frozen) cmci_rediscover(); diff --git a/arch/x86/kernel/cpu/mce/genpool.c b/arch/x86/kernel/cpu/mce/genpool.c index fbe8b61c3413037ae30203f893f545c9782d51f9..3ca9c007a6668e4b948e49a3843273d1590b9915 100644 --- a/arch/x86/kernel/cpu/mce/genpool.c +++ b/arch/x86/kernel/cpu/mce/genpool.c @@ -16,14 +16,14 @@ * used to save error information organized in a lock-less list. * * This memory pool is only to be used to save MCE records in MCE context. - * MCE events are rare, so a fixed size memory pool should be enough. Use - * 2 pages to save MCE events for now (~80 MCE records at most). + * MCE events are rare, so a fixed size memory pool should be enough. + * Allocate on a sliding scale based on number of CPUs. 
*/ -#define MCE_POOLSZ (2 * PAGE_SIZE) +#define MCE_MIN_ENTRIES 80 +#define MCE_PER_CPU 2 static struct gen_pool *mce_evt_pool; static LLIST_HEAD(mce_event_llist); -static char gen_pool_buf[MCE_POOLSZ]; /* * Compare the record "t" with each of the records on list "l" to see if @@ -31,15 +31,15 @@ static char gen_pool_buf[MCE_POOLSZ]; */ static bool is_duplicate_mce_record(struct mce_evt_llist *t, struct mce_evt_llist *l) { + struct mce_hw_err *err1, *err2; struct mce_evt_llist *node; - struct mce *m1, *m2; - m1 = &t->mce; + err1 = &t->err; llist_for_each_entry(node, &l->llnode, llnode) { - m2 = &node->mce; + err2 = &node->err; - if (!mce_cmp(m1, m2)) + if (!mce_cmp(&err1->m, &err2->m)) return true; } return false; @@ -73,8 +73,8 @@ struct llist_node *mce_gen_pool_prepare_records(void) void mce_gen_pool_process(struct work_struct *__unused) { - struct llist_node *head; struct mce_evt_llist *node, *tmp; + struct llist_node *head; struct mce *mce; head = llist_del_all(&mce_event_llist); @@ -83,7 +83,7 @@ void mce_gen_pool_process(struct work_struct *__unused) head = llist_reverse_order(head); llist_for_each_entry_safe(node, tmp, head, llnode) { - mce = &node->mce; + mce = &node->err.m; blocking_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); gen_pool_free(mce_evt_pool, (unsigned long)node, sizeof(*node)); } @@ -94,54 +94,63 @@ bool mce_gen_pool_empty(void) return llist_empty(&mce_event_llist); } -int mce_gen_pool_add(struct mce *mce) +bool mce_gen_pool_add(struct mce_hw_err *err) { struct mce_evt_llist *node; - if (filter_mce(mce)) - return -EINVAL; + if (filter_mce(&err->m)) + return false; if (!mce_evt_pool) - return -EINVAL; + return false; node = (void *)gen_pool_alloc(mce_evt_pool, sizeof(*node)); if (!node) { pr_warn_ratelimited("MCE records pool full!\n"); - return -ENOMEM; + return false; } - memcpy(&node->mce, mce, sizeof(*mce)); + memcpy(&node->err, err, sizeof(*err)); llist_add(&node->llnode, &mce_event_llist); - return 0; + return true; } -static int mce_gen_pool_create(void) +static bool mce_gen_pool_create(void) { - struct gen_pool *tmpp; - int ret = -ENOMEM; - - tmpp = gen_pool_create(ilog2(sizeof(struct mce_evt_llist)), -1); - if (!tmpp) - goto out; + int mce_numrecords, mce_poolsz, order; + struct gen_pool *gpool; + void *mce_pool; + + order = order_base_2(sizeof(struct mce_evt_llist)); + gpool = gen_pool_create(order, -1); + if (!gpool) + return false; + + mce_numrecords = max(MCE_MIN_ENTRIES, num_possible_cpus() * MCE_PER_CPU); + mce_poolsz = mce_numrecords * (1 << order); + mce_pool = kmalloc(mce_poolsz, GFP_KERNEL); + if (!mce_pool) { + gen_pool_destroy(gpool); + return false; + } - ret = gen_pool_add(tmpp, (unsigned long)gen_pool_buf, MCE_POOLSZ, -1); - if (ret) { - gen_pool_destroy(tmpp); - goto out; + if (gen_pool_add(gpool, (unsigned long)mce_pool, mce_poolsz, -1)) { + gen_pool_destroy(gpool); + kfree(mce_pool); + return false; } - mce_evt_pool = tmpp; + mce_evt_pool = gpool; -out: - return ret; + return true; } -int mce_gen_pool_init(void) +bool mce_gen_pool_init(void) { /* Just init mce_gen_pool once. 
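To make the sliding scale concrete, an illustrative sizing (assuming, hypothetically, that struct mce_evt_llist rounds up to a 256-byte power-of-two chunk on a given configuration):

	/* hypothetical machine with 512 possible CPUs */
	mce_numrecords = max(MCE_MIN_ENTRIES, 512 * MCE_PER_CPU);	/* 1024 records */
	mce_poolsz     = mce_numrecords * 256;				/* 256 KiB */

Because gen_pool_create() is given the same power-of-two order that the record size was rounded up to, every gen_pool_alloc() of one record consumes exactly one chunk, so the pool cannot fragment.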
*/ if (mce_evt_pool) - return 0; + return true; return mce_gen_pool_create(); } diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c index 94953d749475d025dd64efe5414a4cdbf83a63e4..c374e5aeba5e4459128a1dfbde033d589bb358e1 100644 --- a/arch/x86/kernel/cpu/mce/inject.c +++ b/arch/x86/kernel/cpu/mce/inject.c @@ -498,8 +498,9 @@ static void prepare_msrs(void *info) static void do_inject(void) { - u64 mcg_status = 0; unsigned int cpu = i_mce.extcpu; + struct mce_hw_err err; + u64 mcg_status = 0; u8 b = i_mce.bank; i_mce.tsc = rdtsc_ordered(); @@ -513,7 +514,8 @@ static void do_inject(void) i_mce.status |= MCI_STATUS_SYNDV; if (inj_type == SW_INJ) { - mce_log(&i_mce); + err.m = i_mce; + mce_log(&err); return; } diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index ef059fe73e81e23ee5375e967b9328fd89fbee76..0f4605518770126e25fc5ffa4122939c484707eb 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include #include @@ -41,15 +41,6 @@ */ static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); -/* - * CMCI storm detection backoff counter - * - * During storm, we reset this counter to INITIAL_CHECK_INTERVAL in case we've - * encountered an error. If not, we decrement it by one. We signal the end of - * the CMCI storm when it reaches 0. - */ -static DEFINE_PER_CPU(int, cmci_backoff_cnt); - /* * cmci_discover_lock protects against parallel discovery attempts * which could race against each other. @@ -64,21 +55,6 @@ static DEFINE_RAW_SPINLOCK(cmci_discover_lock); static DEFINE_SPINLOCK(cmci_poll_lock); #define CMCI_THRESHOLD 1 -#define CMCI_POLL_INTERVAL (30 * HZ) -#define CMCI_STORM_INTERVAL (HZ) -#define CMCI_STORM_THRESHOLD 15 - -static DEFINE_PER_CPU(unsigned long, cmci_time_stamp); -static DEFINE_PER_CPU(unsigned int, cmci_storm_cnt); -static DEFINE_PER_CPU(unsigned int, cmci_storm_state); - -enum { - CMCI_STORM_NONE, - CMCI_STORM_ACTIVE, - CMCI_STORM_SUBSIDED, -}; - -static atomic_t cmci_storm_on_cpus; static int cmci_supported(int *banks) { @@ -135,124 +111,6 @@ static bool lmce_supported(void) return tmp & FEAT_CTL_LMCE_ENABLED; } -bool mce_intel_cmci_poll(void) -{ - if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE) - return false; - - /* - * Reset the counter if we've logged an error in the last poll - * during the storm. 
- */ - if (machine_check_poll(0, this_cpu_ptr(&mce_banks_owned))) - this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL); - else - this_cpu_dec(cmci_backoff_cnt); - - return true; -} - -void mce_intel_hcpu_update(unsigned long cpu) -{ - if (per_cpu(cmci_storm_state, cpu) == CMCI_STORM_ACTIVE) - atomic_dec(&cmci_storm_on_cpus); - - per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE; -} - -static void cmci_toggle_interrupt_mode(bool on) -{ - unsigned long flags, *owned; - int bank; - u64 val; - - raw_spin_lock_irqsave(&cmci_discover_lock, flags); - owned = this_cpu_ptr(mce_banks_owned); - for_each_set_bit(bank, owned, MAX_NR_BANKS) { - rdmsrl(MSR_IA32_MCx_CTL2(bank), val); - - if (on) - val |= MCI_CTL2_CMCI_EN; - else - val &= ~MCI_CTL2_CMCI_EN; - - wrmsrl(MSR_IA32_MCx_CTL2(bank), val); - } - raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); -} - -unsigned long cmci_intel_adjust_timer(unsigned long interval) -{ - if ((this_cpu_read(cmci_backoff_cnt) > 0) && - (__this_cpu_read(cmci_storm_state) == CMCI_STORM_ACTIVE)) { - mce_notify_irq(); - return CMCI_STORM_INTERVAL; - } - - switch (__this_cpu_read(cmci_storm_state)) { - case CMCI_STORM_ACTIVE: - - /* - * We switch back to interrupt mode once the poll timer has - * silenced itself. That means no events recorded and the timer - * interval is back to our poll interval. - */ - __this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED); - if (!atomic_sub_return(1, &cmci_storm_on_cpus)) - pr_notice("CMCI storm subsided: switching to interrupt mode\n"); - - fallthrough; - - case CMCI_STORM_SUBSIDED: - /* - * We wait for all CPUs to go back to SUBSIDED state. When that - * happens we switch back to interrupt mode. - */ - if (!atomic_read(&cmci_storm_on_cpus)) { - __this_cpu_write(cmci_storm_state, CMCI_STORM_NONE); - cmci_toggle_interrupt_mode(true); - cmci_recheck(); - } - return CMCI_POLL_INTERVAL; - default: - - /* We have shiny weather. Let the poll do whatever it thinks. */ - return interval; - } -} - -static bool cmci_storm_detect(void) -{ - unsigned int cnt = __this_cpu_read(cmci_storm_cnt); - unsigned long ts = __this_cpu_read(cmci_time_stamp); - unsigned long now = jiffies; - int r; - - if (__this_cpu_read(cmci_storm_state) != CMCI_STORM_NONE) - return true; - - if (time_before_eq(now, ts + CMCI_STORM_INTERVAL)) { - cnt++; - } else { - cnt = 1; - __this_cpu_write(cmci_time_stamp, now); - } - __this_cpu_write(cmci_storm_cnt, cnt); - - if (cnt <= CMCI_STORM_THRESHOLD) - return false; - - cmci_toggle_interrupt_mode(false); - __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE); - r = atomic_add_return(1, &cmci_storm_on_cpus); - mce_timer_kick(CMCI_STORM_INTERVAL); - this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL); - - if (r == 1) - pr_notice("CMCI storm detected: switching to poll mode\n"); - return true; -} - /* * The interrupt handler. This is called on every event. * Just call the poller directly to log any events. 
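The deleted cmci_toggle_interrupt_mode() above flipped MCI_CTL2_CMCI_EN for every owned bank at once. Storms are now tracked per bank, so the equivalent hardware action would be taken one bank at a time, presumably from a vendor case in mce_handle_storm() (see threshold.c below). A sketch of such a per-bank helper (hypothetical, derived from the deleted loop body):

	static void cmci_set_bank_mode(unsigned int bank, bool on)
	{
		u64 val;

		rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
		if (on)
			val |= MCI_CTL2_CMCI_EN;	/* back to interrupt mode */
		else
			val &= ~MCI_CTL2_CMCI_EN;	/* poll while the storm lasts */
		wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
	}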
@@ -261,9 +119,6 @@ static bool cmci_storm_detect(void) */ static void intel_threshold_interrupt(void) { - if (cmci_storm_detect()) - return; - machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)); } @@ -496,10 +351,10 @@ static void intel_imc_init(struct cpuinfo_x86 *c) { u64 error_control; - switch (c->x86_model) { - case INTEL_FAM6_SANDYBRIDGE_X: - case INTEL_FAM6_IVYBRIDGE_X: - case INTEL_FAM6_HASWELL_X: + switch (c->x86_vfm) { + case INTEL_SANDYBRIDGE_X: + case INTEL_IVYBRIDGE_X: + case INTEL_HASWELL_X: if (rdmsrl_safe(MSR_ERROR_CONTROL, &error_control)) return; error_control |= 2; @@ -508,8 +363,26 @@ static void intel_imc_init(struct cpuinfo_x86 *c) } } +static void intel_apply_cpu_quirks(struct cpuinfo_x86 *c) +{ + /* + * SDM documents that on family 6 bank 0 should not be written + * because it aliases to another special BIOS controlled + * register. + * But it's not aliased anymore on model 0x1a+ + * Don't ignore bank 0 completely because there could be a + * valid event later, merely don't write CTL0. + * + * Older CPUs (prior to family 6) can't reach this point and already + * return early due to the check of __mcheck_cpu_ancient_init(). + */ + if (c->x86_vfm < INTEL_NEHALEM_EP && this_cpu_read(mce_num_banks)) + this_cpu_ptr(mce_banks_array)[0].init = false; +} + void mce_intel_feature_init(struct cpuinfo_x86 *c) { + intel_apply_cpu_quirks(c); intel_init_cmci(); intel_init_lmce(); intel_imc_init(c); @@ -526,12 +399,11 @@ bool intel_filter_mce(struct mce *m) struct cpuinfo_x86 *c = &boot_cpu_data; /* MCE errata HSD131, HSM142, HSW131, BDM48, HSM142 and SKX37 */ - if ((c->x86 == 6) && - ((c->x86_model == INTEL_FAM6_HASWELL) || - (c->x86_model == INTEL_FAM6_HASWELL_L) || - (c->x86_model == INTEL_FAM6_BROADWELL) || - (c->x86_model == INTEL_FAM6_HASWELL_G) || - (c->x86_model == INTEL_FAM6_SKYLAKE_X)) && + if ((c->x86_vfm == INTEL_HASWELL || + c->x86_vfm == INTEL_HASWELL_L || + c->x86_vfm == INTEL_BROADWELL || + c->x86_vfm == INTEL_HASWELL_G || + c->x86_vfm == INTEL_SKYLAKE_X) && (m->bank == 0) && ((m->status & 0xa0000000ffffffff) == 0x80000000000f0005)) return true; diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index bcf1b3c66c9cfdba1e74b6b70e52bd182c5e10ee..bdfae89c5a401ddf37086bc46510292a612536bd 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -26,13 +26,13 @@ extern struct blocking_notifier_head x86_mce_decoder_chain; struct mce_evt_llist { struct llist_node llnode; - struct mce mce; + struct mce_hw_err err; }; void mce_gen_pool_process(struct work_struct *__unused); bool mce_gen_pool_empty(void); -int mce_gen_pool_add(struct mce *mce); -int mce_gen_pool_init(void); +bool mce_gen_pool_add(struct mce_hw_err *err); +bool mce_gen_pool_init(void); struct llist_node *mce_gen_pool_prepare_records(void); int mce_severity(struct mce *a, struct pt_regs *regs, char **msg, bool is_excp); @@ -41,18 +41,12 @@ struct dentry *mce_get_debugfs_dir(void); extern mce_banks_t mce_banks_ce_disabled; #ifdef CONFIG_X86_MCE_INTEL -unsigned long cmci_intel_adjust_timer(unsigned long interval); -bool mce_intel_cmci_poll(void); -void mce_intel_hcpu_update(unsigned long cpu); void cmci_disable_bank(int bank); void intel_init_cmci(void); void intel_init_lmce(void); void intel_clear_lmce(void); bool intel_filter_mce(struct mce *m); #else -# define cmci_intel_adjust_timer mce_adjust_timer_default -static inline bool mce_intel_cmci_poll(void) { return false; } -static inline void mce_intel_hcpu_update(unsigned long 
cpu) { } static inline void cmci_disable_bank(int bank) { } static inline void intel_init_cmci(void) { } static inline void intel_init_lmce(void) { } @@ -60,7 +54,63 @@ static inline void intel_clear_lmce(void) { } static inline bool intel_filter_mce(struct mce *m) { return false; } #endif -void mce_timer_kick(unsigned long interval); +void mce_timer_kick(bool storm); + +#ifdef CONFIG_X86_MCE_THRESHOLD +void cmci_storm_begin(unsigned int bank); +void cmci_storm_end(unsigned int bank); +void mce_track_storm(struct mce *mce); +void mce_inherit_storm(unsigned int bank); +bool mce_get_storm_mode(void); +void mce_set_storm_mode(bool storm); +#else +static inline void cmci_storm_begin(unsigned int bank) {} +static inline void cmci_storm_end(unsigned int bank) {} +static inline void mce_track_storm(struct mce *mce) {} +static inline void mce_inherit_storm(unsigned int bank) {} +static inline bool mce_get_storm_mode(void) { return false; } +static inline void mce_set_storm_mode(bool storm) {} +#endif + +/* + * history: Bitmask tracking error occurrences. Each set bit + * represents an error seen. + * + * timestamp: Last time (in jiffies) that the bank was polled. + * in_storm_mode: Is this bank in storm mode? + * poll_only: Bank does not support CMCI, skip storm tracking. + */ +struct storm_bank { + u64 history; + u64 timestamp; + bool in_storm_mode; + bool poll_only; +}; + +#define NUM_HISTORY_BITS (sizeof(u64) * BITS_PER_BYTE) + +/* How many errors within the history buffer mark the start of a storm. */ +#define STORM_BEGIN_THRESHOLD 5 + +/* + * How many polls of a machine check bank without an error before declaring + * the storm is over. Since it is tracked by the bitmasks in the history + * field of struct storm_bank, the mask is 30 bits [0 ... 29]. + */ +#define STORM_END_POLL_THRESHOLD 29 + +/* + * banks: per-cpu, per-bank details + * stormy_bank_count: count of MC banks in storm state + * poll_mode: CPU is in poll mode + */ +struct mca_storm_desc { + struct storm_bank banks[MAX_NR_BANKS]; + u8 stormy_bank_count; + bool poll_mode; +}; + +DECLARE_PER_CPU(struct mca_storm_desc, storm_desc); #ifdef CONFIG_ACPI_APEI int apei_write_mce(struct mce *m); @@ -207,9 +257,14 @@ enum mca_msr { /* Decide whether to add MCE record to MCE event pool or filter it out. 
*/ extern bool filter_mce(struct mce *m); +void mce_prep_record_common(struct mce *m); +void mce_prep_record_per_cpu(unsigned int cpu, struct mce *m); #ifdef CONFIG_X86_MCE_AMD +void mce_threshold_create_device(unsigned int cpu); +void mce_threshold_remove_device(unsigned int cpu); extern bool amd_filter_mce(struct mce *m); +void amd_clear_bank(struct mce *m); /* * If MCA_CONFIG[McaLsbInStatusSupported] is set, extract ErrAddr in bits @@ -235,9 +290,14 @@ static __always_inline void smca_extract_err_addr(struct mce *m) m->addr &= GENMASK_ULL(55, lsb); } +void smca_bsp_init(void); #else +static inline void mce_threshold_create_device(unsigned int cpu) { } +static inline void mce_threshold_remove_device(unsigned int cpu) { } static inline bool amd_filter_mce(struct mce *m) { return false; } +static inline void amd_clear_bank(struct mce *m) { } static inline void smca_extract_err_addr(struct mce *m) { } +static inline void smca_bsp_init(void) { } #endif #ifdef CONFIG_X86_ANCIENT_MCE @@ -255,6 +315,7 @@ static __always_inline void winchip_machine_check(struct pt_regs *regs) {} #endif noinstr u64 mce_rdmsrl(u32 msr); +noinstr void mce_wrmsrq(u32 msr, u64 v); static __always_inline u32 mca_msr_reg(int bank, enum mca_msr reg) { diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c index 9c5754229d6ed37fa6b71777e970ebb3b4810de5..2235a74774360d05cd4686de951b577d55c3124e 100644 --- a/arch/x86/kernel/cpu/mce/severity.c +++ b/arch/x86/kernel/cpu/mce/severity.c @@ -12,7 +12,7 @@ #include #include -#include +#include #include #include #include @@ -39,20 +39,20 @@ static struct severity { u64 mask; u64 result; unsigned char sev; - unsigned char mcgmask; - unsigned char mcgres; + unsigned short mcgmask; + unsigned short mcgres; unsigned char ser; unsigned char context; unsigned char excp; unsigned char covered; - unsigned char cpu_model; + unsigned int cpu_vfm; unsigned char cpu_minstepping; unsigned char bank_lo, bank_hi; char *msg; } severities[] = { #define MCESEV(s, m, c...) 
{ .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c } #define BANK_RANGE(l, h) .bank_lo = l, .bank_hi = h -#define MODEL_STEPPING(m, s) .cpu_model = m, .cpu_minstepping = s +#define VFM_STEPPING(m, s) .cpu_vfm = m, .cpu_minstepping = s #define KERNEL .context = IN_KERNEL #define USER .context = IN_USER #define KERNEL_RECOV .context = IN_KERNEL_RECOV @@ -128,7 +128,7 @@ static struct severity { MCESEV( AO, "Uncorrected Patrol Scrub Error", SER, MASK(MCI_STATUS_UC|MCI_ADDR|0xffffeff0, MCI_ADDR|0x001000c0), - MODEL_STEPPING(INTEL_FAM6_SKYLAKE_X, 4), BANK_RANGE(13, 18) + VFM_STEPPING(INTEL_SKYLAKE_X, 4), BANK_RANGE(13, 18) ), /* ignore OVER for UCNA */ @@ -173,6 +173,18 @@ static struct severity { SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR), USER ), + MCESEV( + AR, "Data load error in SEAM non-root mode", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), + MCGMASK(MCG_STATUS_SEAM_NR, MCG_STATUS_SEAM_NR), + KERNEL + ), + MCESEV( + AR, "Instruction fetch error in SEAM non-root mode", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR), + MCGMASK(MCG_STATUS_SEAM_NR, MCG_STATUS_SEAM_NR), + KERNEL + ), MCESEV( PANIC, "Data load in unrecoverable area of kernel", SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), @@ -384,7 +396,7 @@ static noinstr int mce_severity_intel(struct mce *m, struct pt_regs *regs, char continue; if (s->excp && excp != s->excp) continue; - if (s->cpu_model && boot_cpu_data.x86_model != s->cpu_model) + if (s->cpu_vfm && boot_cpu_data.x86_vfm != s->cpu_vfm) continue; if (s->cpu_minstepping && boot_cpu_data.x86_stepping < s->cpu_minstepping) continue; diff --git a/arch/x86/kernel/cpu/mce/threshold.c b/arch/x86/kernel/cpu/mce/threshold.c index ef4e7bb5fd88c60a6d1d881a31aed389e8d12f18..0e1988468ee465adde626a87a571125be6f1a829 100644 --- a/arch/x86/kernel/cpu/mce/threshold.c +++ b/arch/x86/kernel/cpu/mce/threshold.c @@ -29,3 +29,115 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_threshold) trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR); apic_eoi(); } + +DEFINE_PER_CPU(struct mca_storm_desc, storm_desc); + +void mce_inherit_storm(unsigned int bank) +{ + struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc); + + /* + * Previous CPU owning this bank had put it into storm mode, + * but the precise history of that storm is unknown. Assume + * the worst (all recent polls of the bank found a valid error + * logged). This will avoid the new owner prematurely declaring + * the storm has ended. + */ + storm->banks[bank].history = ~0ull; + storm->banks[bank].timestamp = jiffies; +} + +bool mce_get_storm_mode(void) +{ + return __this_cpu_read(storm_desc.poll_mode); +} + +void mce_set_storm_mode(bool storm) +{ + __this_cpu_write(storm_desc.poll_mode, storm); +} + +static void mce_handle_storm(unsigned int bank, bool on) +{ + switch (boot_cpu_data.x86_vendor) { + } +} + +void cmci_storm_begin(unsigned int bank) +{ + struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc); + + __set_bit(bank, this_cpu_ptr(mce_poll_banks)); + storm->banks[bank].in_storm_mode = true; + + /* + * If this is the first bank on this CPU to enter storm mode + * start polling. 
+ */ + if (++storm->stormy_bank_count == 1) + mce_timer_kick(true); +} + +void cmci_storm_end(unsigned int bank) +{ + struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc); + + __clear_bit(bank, this_cpu_ptr(mce_poll_banks)); + storm->banks[bank].history = 0; + storm->banks[bank].in_storm_mode = false; + + /* If no banks left in storm mode, stop polling. */ + if (!this_cpu_dec_return(storm_desc.stormy_bank_count)) + mce_timer_kick(false); +} + +void mce_track_storm(struct mce *mce) +{ + struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc); + unsigned long now = jiffies, delta; + unsigned int shift = 1; + u64 history = 0; + + /* No tracking needed for banks that do not support CMCI */ + if (storm->banks[mce->bank].poll_only) + return; + + /* + * When a bank is in storm mode it is polled once per second and + * the history mask will record about the last minute of poll results. + * If it is not in storm mode, then the bank is only checked when + * there is a CMCI interrupt. Check how long it has been since + * this bank was last checked, and adjust the amount of "shift" + * to apply to history. + */ + if (!storm->banks[mce->bank].in_storm_mode) { + delta = now - storm->banks[mce->bank].timestamp; + shift = (delta + HZ) / HZ; + } + + /* If it has been a long time since the last poll, clear history. */ + if (shift < NUM_HISTORY_BITS) + history = storm->banks[mce->bank].history << shift; + + storm->banks[mce->bank].timestamp = now; + + /* History keeps track of corrected errors. VAL=1 && UC=0 */ + if ((mce->status & MCI_STATUS_VAL) && mce_is_correctable(mce)) + history |= 1; + + storm->banks[mce->bank].history = history; + + if (storm->banks[mce->bank].in_storm_mode) { + if (history & GENMASK_ULL(STORM_END_POLL_THRESHOLD, 0)) + return; + printk_deferred(KERN_NOTICE "CPU%d BANK%d CMCI storm subsided\n", smp_processor_id(), mce->bank); + mce_handle_storm(mce->bank, false); + cmci_storm_end(mce->bank); + } else { + if (hweight64(history) < STORM_BEGIN_THRESHOLD) + return; + printk_deferred(KERN_NOTICE "CPU%d BANK%d CMCI storm detected\n", smp_processor_id(), mce->bank); + mce_handle_storm(mce->bank, true); + cmci_storm_begin(mce->bank); + } +} diff --git a/include/trace/events/mce.h b/include/trace/events/mce.h index 1391ada0da3b95f4e73ae810ab11e51b69fcfae7..65aba1afcd070acd70b0eb7f809ddbbf53eacd4a 100644 --- a/include/trace/events/mce.h +++ b/include/trace/events/mce.h @@ -9,11 +9,19 @@ #include #include +/* + * MCE Event Record. + * + * Only very relevant and transient information which cannot be + * gathered from a system by any other means or which can only be + * acquired arduously should be added to this record. 
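The tracking above reduces to a once-per-poll shift plus two threshold tests. A compact walk-through of the history bitmask (illustrative, using the constants from internal.h):

	u64 history = 0;
	int poll;

	/* Five polls in the window that each saw a corrected error ... */
	for (poll = 0; poll < STORM_BEGIN_THRESHOLD; poll++)
		history = (history << 1) | 1;
	/* ... now hweight64(history) == 5, so the storm begins. */

	/* Thirty clean polls shift the error bits out of the low window ... */
	for (poll = 0; poll <= STORM_END_POLL_THRESHOLD; poll++)
		history <<= 1;
	/* ... now !(history & GENMASK_ULL(STORM_END_POLL_THRESHOLD, 0)): storm ends. */

The five errors that start a storm need not be consecutive (any five set bits in the 64-bit history qualify), but a storm only ends after 30 polls in a row without one.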
+ */ + TRACE_EVENT(mce_record, - TP_PROTO(struct mce *m), + TP_PROTO(struct mce_hw_err *err), - TP_ARGS(m), + TP_ARGS(err), TP_STRUCT__entry( __field( u64, mcgcap ) @@ -25,6 +33,7 @@ TRACE_EVENT(mce_record, __field( u64, ipid ) __field( u64, ip ) __field( u64, tsc ) + __field( u64, ppin ) __field( u64, walltime ) __field( u32, cpu ) __field( u32, cpuid ) @@ -33,40 +42,48 @@ TRACE_EVENT(mce_record, __field( u8, cs ) __field( u8, bank ) __field( u8, cpuvendor ) + __field( u32, microcode ) ), TP_fast_assign( - __entry->mcgcap = m->mcgcap; - __entry->mcgstatus = m->mcgstatus; - __entry->status = m->status; - __entry->addr = m->addr; - __entry->misc = m->misc; - __entry->synd = m->synd; - __entry->ipid = m->ipid; - __entry->ip = m->ip; - __entry->tsc = m->tsc; - __entry->walltime = m->time; - __entry->cpu = m->extcpu; - __entry->cpuid = m->cpuid; - __entry->apicid = m->apicid; - __entry->socketid = m->socketid; - __entry->cs = m->cs; - __entry->bank = m->bank; - __entry->cpuvendor = m->cpuvendor; + __entry->mcgcap = err->m.mcgcap; + __entry->mcgstatus = err->m.mcgstatus; + __entry->status = err->m.status; + __entry->addr = err->m.addr; + __entry->misc = err->m.misc; + __entry->synd = err->m.synd; + __entry->ipid = err->m.ipid; + __entry->ip = err->m.ip; + __entry->tsc = err->m.tsc; + __entry->ppin = err->m.ppin; + __entry->walltime = err->m.time; + __entry->cpu = err->m.extcpu; + __entry->cpuid = err->m.cpuid; + __entry->apicid = err->m.apicid; + __entry->socketid = err->m.socketid; + __entry->cs = err->m.cs; + __entry->bank = err->m.bank; + __entry->cpuvendor = err->m.cpuvendor; + __entry->microcode = err->m.microcode; ), - TP_printk("CPU: %d, MCGc/s: %llx/%llx, MC%d: %016Lx, IPID: %016Lx, ADDR/MISC/SYND: %016Lx/%016Lx/%016Lx, RIP: %02x:<%016Lx>, TSC: %llx, PROCESSOR: %u:%x, TIME: %llu, SOCKET: %u, APIC: %x", + TP_printk("CPU: %d, MCGc/s: %llx/%llx, MC%d: %016Lx, IPID: %016Lx, ADDR: %016Lx, MISC: %016Lx, SYND: %016Lx, RIP: %02x:<%016Lx>, TSC: %llx, PPIN: %llx, vendor: %u, CPUID: %x, time: %llu, socket: %u, APIC: %x, microcode: %x", __entry->cpu, __entry->mcgcap, __entry->mcgstatus, __entry->bank, __entry->status, __entry->ipid, - __entry->addr, __entry->misc, __entry->synd, + __entry->addr, + __entry->misc, + __entry->synd, __entry->cs, __entry->ip, __entry->tsc, - __entry->cpuvendor, __entry->cpuid, + __entry->ppin, + __entry->cpuvendor, + __entry->cpuid, __entry->walltime, __entry->socketid, - __entry->apicid) + __entry->apicid, + __entry->microcode) ); #endif /* _TRACE_MCE_H */
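Consumers on the decode chain still receive a plain struct mce pointer (mce_gen_pool_process() above passes &node->err.m), so a notifier that needs the surrounding record can recover it with the container_of()-style helper that accompanies struct mce_hw_err. A sketch, where my_decoder_notify() is a hypothetical callback, not part of this patch:

	static int my_decoder_notify(struct notifier_block *nb,
				     unsigned long val, void *data)
	{
		struct mce *m = data;
		struct mce_hw_err *err __maybe_unused = to_mce_hw_err(m);

		/* Additional hardware-error context would live alongside err->m. */
		return NOTIFY_OK;
	}

This is the point of the wrapper struct: extended error information can travel with the record without growing struct mce itself, which is exposed to userspace.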