diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index ac7c45889947be25bca231eff0f37a1e9f750fe9..0a1d53119ff5cd0bedb39936a3f5b0e6fbe092f4 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include @@ -813,6 +814,14 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) continue; log_it: + if (m.status & MCI_STATUS_UC) + report_fault_event(-1, NULL, SLIGHT_FAULT, + FE_MCE, "UCE hardware failure"); + else + report_fault_event(-1, NULL, SLIGHT_FAULT, + FE_MCE, "CE hardware failure"); + + error_seen = true; if (flags & MCP_DONTLOG) @@ -1282,6 +1291,7 @@ static void kill_me_maybe(struct callback_head *cb) ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags); if (!ret) { set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page); + report_fault_event(raw_smp_processor_id(), p, SLIGHT_FAULT, FE_MCE, "UCE recovered"); sync_core(); return; } @@ -1295,6 +1305,7 @@ static void kill_me_maybe(struct callback_head *cb) return; pr_err("Memory error not recovered"); + report_fault_event(raw_smp_processor_id(), p, FATAL_FAULT, FE_MCE, "UCE not recovered"); kill_me_now(cb); } diff --git a/block/blk-core.c b/block/blk-core.c index 06fb25bd24df5ec0709e4d4669c9489dcd7d5b73..958b4628ff4762ef040e526003cb941255c7ab4e 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -42,6 +42,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -228,6 +229,11 @@ int blk_status_to_errno(blk_status_t status) if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors))) return -EIO; + /* A BLK_STS_OK status is not an I/O error: report failures only. */ + if (status != BLK_STS_OK) + report_fault_event(raw_smp_processor_id(), current, + FATAL_FAULT, FE_IO_ERR, NULL); + return blk_errors[idx].errno; } EXPORT_SYMBOL_GPL(blk_status_to_errno); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c096cebcea6bf184c23d6682e10850505074a5fd..f379f40364da2f191107da1db6018e8cb912e66a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -46,6 +46,7 @@ #include #include 
#include +#include #include "ext4.h" #include "ext4_extents.h" /* Needed for trace points definition */ @@ -794,6 +795,8 @@ void __ext4_error(struct super_block *sb, const char *function, trace_ext4_error(sb, function, line); if (ext4_error_ratelimit(sb)) { + report_fault_event(raw_smp_processor_id(), current, + NORMAL_FAULT, FE_EXT4_ERR, "ext4-fs error"); /* raw_ variant: the CPU id is only logged and this path may be preemptible */ va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; @@ -817,6 +820,8 @@ void __ext4_error_inode(struct inode *inode, const char *function, trace_ext4_error(inode->i_sb, function, line); if (ext4_error_ratelimit(inode->i_sb)) { + report_fault_event(raw_smp_processor_id(), current, + NORMAL_FAULT, FE_EXT4_ERR, "ext4-fs error"); va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; @@ -850,6 +855,8 @@ void __ext4_error_file(struct file *file, const char *function, trace_ext4_error(inode->i_sb, function, line); if (ext4_error_ratelimit(inode->i_sb)) { + report_fault_event(raw_smp_processor_id(), current, + NORMAL_FAULT, FE_EXT4_ERR, "ext4-fs error"); path = file_path(file, pathname, sizeof(pathname)); if (IS_ERR(path)) path = "(unknown)"; @@ -933,6 +940,8 @@ void __ext4_std_error(struct super_block *sb, const char *function, return; if (ext4_error_ratelimit(sb)) { + report_fault_event(raw_smp_processor_id(), current, + NORMAL_FAULT, FE_EXT4_ERR, "ext4-fs error"); errstr = ext4_decode_error(sb, errno, nbuf); printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n", sb->s_id, function, line, errstr); @@ -974,6 +983,9 @@ void __ext4_warning(struct super_block *sb, const char *function, if (!ext4_warning_ratelimit(sb)) return; + report_fault_event(raw_smp_processor_id(), current, + SLIGHT_FAULT, FE_EXT4_ERR, "ext4-fs warning"); + va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; @@ -991,6 +1003,8 @@ void __ext4_warning_inode(const struct inode *inode, const char *function, if (!ext4_warning_ratelimit(inode->i_sb)) return; + report_fault_event(raw_smp_processor_id(), current, + SLIGHT_FAULT, FE_EXT4_ERR, "ext4-fs warning"); va_start(args, 
fmt); vaf.fmt = fmt; vaf.va = &args; @@ -1015,6 +1029,8 @@ __acquires(bitlock) trace_ext4_error(sb, function, line); if (ext4_error_ratelimit(sb)) { + report_fault_event(smp_processor_id(), current, + NORMAL_FAULT, FE_EXT4_ERR, "ext4-fs error"); va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; diff --git a/include/linux/fault_event.h b/include/linux/fault_event.h new file mode 100644 index 0000000000000000000000000000000000000000..dbf24acdf467b962f7abd0904989cd6a5c82ae0f --- /dev/null +++ b/include/linux/fault_event.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _FAULT_EVENT_H +#define _FAULT_EVENT_H +#include + +enum FAULT_CLASS { + SLIGHT_FAULT, + NORMAL_FAULT, + FATAL_FAULT, + FAULT_CLASSS_MAX +}; + +enum FAULT_EVENT { + /*kernel fault events*/ + FE_SOFTLOCKUP, + FE_RCUSTALL, + FE_HUNGTASK, + FE_OOM_GLOBAL, + FE_OOM_CGROUP, + FE_ALLOCFAIL, + FE_LIST_CORRUPT, + FE_MM_STATE, + FE_IO_ERR, + FE_EXT4_ERR, + FE_MCE, + FE_SIGNAL, + FE_WARN, + FE_PANIC, + FE_MAX +}; + +struct fault_event { + enum FAULT_EVENT type; + char *name; + char *module; + atomic_t count; +}; + +extern unsigned int sysctl_fault_event_enable; +extern unsigned int sysctl_fault_event_print; +extern unsigned int sysctl_panic_on_fatal_event; + +extern bool fault_monitor_enable(void); +extern void report_fault_event(int cpu, struct task_struct *tsk, + enum FAULT_CLASS class, enum FAULT_EVENT event, + const char *msg); +#endif + + diff --git a/kernel/fork.c b/kernel/fork.c index 6592d68d98ceda6ab0e2f99cf10b554d420f3ba8..11d782a7d0895dd3c4e18024f9392c36a62c2daf 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -97,6 +97,7 @@ #include #include #include +#include #include #include @@ -663,14 +664,19 @@ static void check_mm(struct mm_struct *mm) long x = atomic_long_read(&mm->rss_stat.count[i]); - if (unlikely(x)) + if (unlikely(x)) { /* braces keep pr_alert() conditional on a non-zero counter */ + report_fault_event(-1, NULL, FATAL_FAULT, + FE_MM_STATE, "Bad rss-counter"); pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n", mm, resident_page_types[i], x); + } } 
- if (mm_pgtables_bytes(mm)) + if (mm_pgtables_bytes(mm)) { /* page-table bytes still accounted on an mm being freed */ + report_fault_event(-1, NULL, FATAL_FAULT, + FE_MM_STATE, "non-zero pgtables_bytes"); pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n", mm_pgtables_bytes(mm)); - + } #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS VM_BUG_ON_MM(mm->pmd_huge_pte, mm); #endif diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 396ebaebea3fea3578fe295c73cf041530a91d93..fe280c5ee6434f799a9bf8a8fabb07e155100c2f 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -21,6 +21,7 @@ #include #include #include +#include #include @@ -119,6 +120,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) hung_task_call_panic = true; } + report_fault_event(-1, t, NORMAL_FAULT, FE_HUNGTASK, NULL); /* count the hung task; no CPU is supplied (-1) */ + /* * Ok, the task did not get scheduled for more than 2 minutes, * complain: diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index b9a6f4658f8981f0e2cc92f941854ac779cdfb45..bf0c7f0f16fc6e6354646a65de775efce8436da6 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -76,8 +77,11 @@ int kexec_should_crash(struct task_struct *p) * There are 4 panic() calls in do_exit() path, each of which * corresponds to each of these 4 conditions. */ - if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops) + if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops) { /* braces added so the crash is reported before returning 1 */ + report_fault_event(smp_processor_id(), + current, FATAL_FAULT, FE_PANIC, "kernel crash"); return 1; + } return 0; } diff --git a/kernel/panic.c b/kernel/panic.c index d991c3b1b5591dd2aaa933a727cc91e9bee4644b..29080dd0630b4588a61001bd2494856a814fcf0e 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -32,6 +32,9 @@ #include #include #include +#include +#include +#include #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 @@ -215,6 +218,8 @@ void panic(const char *fmt, ...) 
console_verbose(); bust_spinlocks(1); + report_fault_event(smp_processor_id(), current, FATAL_FAULT, + FE_PANIC, NULL); va_start(args, fmt); len = vscnprintf(buf, sizeof(buf), fmt, args); va_end(args); @@ -594,6 +599,159 @@ void oops_exit(void) print_oops_end_marker(); kmsg_dump(KMSG_DUMP_OOPS); } +unsigned int sysctl_fault_event_enable = 1; +unsigned int sysctl_fault_event_print; +unsigned int sysctl_panic_on_fatal_event; +static atomic_t tot_fault_cnt; +static atomic_t class_fault_cnt[FAULT_CLASSS_MAX]; + +static char *fault_class_name[FAULT_CLASSS_MAX] = { + "Slight", + "Normal", + "Fatal" +}; + +static struct fault_event fevents[FE_MAX] = { + {FE_SOFTLOCKUP, "soft lockup", "general", {0} }, + {FE_RCUSTALL, "rcu stall", "general", {0} }, + {FE_HUNGTASK, "hung task", "general", {0} }, + {FE_OOM_GLOBAL, "global oom", "mem", {0} }, + {FE_OOM_CGROUP, "cgroup oom", "mem", {0} }, + {FE_ALLOCFAIL, "alloc failed", "mem", {0} }, + {FE_LIST_CORRUPT, "list corruption", "general", {0} }, + {FE_MM_STATE, "bad mm_struct", "mem", {0} }, + {FE_IO_ERR, "io error", "io", {0} }, + {FE_EXT4_ERR, "ext4 fs error", "fs", {0} }, + {FE_MCE, "mce", "hardware", {0} }, + {FE_SIGNAL, "fatal signal", "general", {0} }, + {FE_WARN, "warning", "general", {0} }, + {FE_PANIC, "panic", "general", {0} }, +}; + +bool fault_monitor_enable(void) +{ + return sysctl_fault_event_enable; +} + +static const char *get_task_cmdline(struct task_struct *tsk, char *buff, + int size) +{ + struct mm_struct *mm; + char *p = buff, c; + int i, len, count = 0; + + if (!tsk) + return "nil"; + + if (tsk->tgid != current->tgid || !tsk->mm + || (tsk->flags & PF_KTHREAD)) + goto use_comm; + + mm = tsk->mm; + len = mm->arg_end - mm->arg_start; + len = min(len, size); + if (len <= 0) + goto use_comm; + + if (copy_from_user_nofault(p, (const void __user *)mm->arg_start, len)) /* non-faulting, range-checked copy: callers may be atomic */ + goto use_comm; + + if (copy_from_user_nofault(&c, (const void __user *)(mm->arg_end - 1), 1)) + goto use_comm; + + count += len; + if (c == '\0' || len == size) + goto 
out; + + p = buff + len; + len = mm->env_end - mm->env_start; + len = min(len, size - count); + if (len <= 0) + goto out; + + if (!copy_from_user_nofault(p, (const void __user *)mm->env_start, len)) /* non-faulting, range-checked copy: callers may be atomic */ + count += len; + +out: + for (i = 0; i < count-1; i++) { + if (buff[i] == '\0') + buff[i] = ' '; + } + buff[count - 1] = '\0'; + + return buff; + +use_comm: + return tsk->comm; +} + +void report_fault_event(int cpu, struct task_struct *tsk, + enum FAULT_CLASS class, enum FAULT_EVENT event, + const char *msg) +{ + unsigned int evt_cnt; + char tsk_cmdline[256]; + + if (!sysctl_fault_event_enable) + return; + + if (class >= FAULT_CLASSS_MAX || event >= FE_MAX) + return; + + evt_cnt = atomic_inc_return(&fevents[event].count); + atomic_inc(&class_fault_cnt[class]); + atomic_inc(&tot_fault_cnt); + + if (!sysctl_fault_event_print) + goto may_panic; + + printk_ratelimited(KERN_EMERG "%s fault event[%s:%s]: %s. " + "At cpu %d task %d(%s). Total: %d\n", + fault_class_name[class], fevents[event].module, + fevents[event].name, msg ? msg : "", cpu, + tsk ? 
tsk->pid : -1, + get_task_cmdline(tsk, tsk_cmdline, 256), evt_cnt); + +may_panic: + if (sysctl_panic_on_fatal_event && class == FATAL_FAULT && + event != FE_PANIC) { + sysctl_fault_event_enable = false; + panic("kernel fault event"); + } +} +EXPORT_SYMBOL(report_fault_event); + +static int fault_events_show(struct seq_file *m, void *v) +{ + unsigned int evt_cnt, class_cnt, total; + int i; + + total = atomic_read(&tot_fault_cnt); + seq_printf(m, "\nTotal fault events: %d\n\n", total); + + for (i = 0; i < FAULT_CLASSS_MAX; i++) { + class_cnt = atomic_read(&class_fault_cnt[i]); + seq_printf(m, "%s: %d\n", fault_class_name[i], + class_cnt); + } + + seq_puts(m, "\n"); + for (i = 0; i < FE_MAX; i++) { + evt_cnt = atomic_read(&fevents[i].count); + seq_printf(m, "%s: %d\n", fevents[i].name, + evt_cnt); + } + + return 0; +} + +static int fault_events_init(void) +{ + proc_create_single("fault_events", 0, NULL, fault_events_show); + + return 0; +} +module_init(fault_events_init); struct warn_args { const char *fmt; @@ -613,6 +771,13 @@ void __warn(const char *file, int line, void *caller, unsigned taint, pr_warn("WARNING: CPU: %d PID: %d at %pS\n", raw_smp_processor_id(), current->pid, caller); + if (file && strstr(file, "list_debug.c")) /* file is NULL when the warning carries no file/line info */ + report_fault_event(raw_smp_processor_id(), current, + FATAL_FAULT, FE_LIST_CORRUPT, NULL); + else + report_fault_event(raw_smp_processor_id(), current, + SLIGHT_FAULT, FE_WARN, "kernel warning"); + if (args) vprintk(args->fmt, args->args); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0142855482f8b674fe52ae2db65e5d07906e47fe..6e681a48d7a657e4f09b35932dd23e749a1b3fde 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -60,6 +60,7 @@ #include #include #include +#include #include "../time/tick-internal.h" #include "tree.h" diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 251a9af3709afd6c44e90dcfa7b926c6bc330434..84353dc057c88275b7cc96ee2065d153fcaad3fe 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ 
-8,7 +8,7 @@ */ #include - +#include ////////////////////////////////////////////////////////////////////////////// // // Controlling CPU stall warnings, including delay calculation. @@ -477,7 +477,10 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) int ndetected = 0; struct rcu_node *rnp; long totqlen = 0; - + enum FAULT_CLASS class = SLIGHT_FAULT; /* raised to FATAL_FAULT below if more than one CPU stalls */ + int first_cpu = -1; /* first CPU found with its qsmask bit still set; stays -1 if none */ + unsigned int stall_cpus = 0; + lockdep_assert_irqs_disabled(); /* Kick and suppress, if so configured. */ @@ -498,12 +501,19 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) { print_cpu_stall_info(cpu); ndetected++; + if (first_cpu == -1) + first_cpu = cpu; + stall_cpus++; /* counts the CPUs passed to print_cpu_stall_info() */ } } ndetected += rcu_print_task_stall(rnp, flags); // Releases rnp->lock. lockdep_assert_irqs_disabled(); } + if (stall_cpus > 1) + class = FATAL_FAULT; + report_fault_event(first_cpu, NULL, class, FE_RCUSTALL, NULL); /* single report after the scan, attributed to first_cpu */ + for_each_possible_cpu(cpu) totqlen += rcu_get_n_cbs_cpu(cpu); pr_cont("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n", @@ -554,6 +564,9 @@ static void print_cpu_stall(unsigned long gps) if (rcu_stall_is_suppressed()) return; + report_fault_event(smp_processor_id(), current, SLIGHT_FAULT, + FE_RCUSTALL, NULL); /* self-detected stall: reported against the current CPU and task */ + /* * OK, time to rat on ourselves... 
* See Documentation/RCU/stallwarn.rst for info on how to debug diff --git a/kernel/signal.c b/kernel/signal.c index cf498d949f2f3ab1c101ec93567c93901765ac9d..587b1b4f5701b08cae24b86d840ec3a365791438 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -46,6 +46,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -2746,6 +2747,11 @@ bool get_signal(struct ksignal *ksig) current->flags |= PF_SIGNALED; if (sig_kernel_coredump(signr)) { + char msg[32]; + snprintf(msg, sizeof(msg), "sig%d exit", signr); /* bounded write into msg[] */ + report_fault_event(raw_smp_processor_id(), current, + NORMAL_FAULT, FE_SIGNAL, msg); + if (print_fatal_signals) print_fatal_signal(ksig->info.si_signo); proc_coredump_connector(current); @@ -2760,6 +2766,10 @@ bool get_signal(struct ksignal *ksig) do_coredump(&ksig->info); } + if (ksig->info.si_signo == SIGKILL && ksig->info.si_code == SI_KERNEL) + report_fault_event(raw_smp_processor_id(), current, + NORMAL_FAULT, FE_SIGNAL, "sigkill kernel"); /* raw_ variant: the CPU id is only logged and this path may be preemptible */ + /* * Death signals, no core dump. */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 60079815aafdeb795a7dc18c0dc46d493749bf8b..380bb952e35b7f3cfd8563b746d1b5f104974fda 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -71,6 +71,7 @@ #include #include #include +#include #include "../lib/kstrtox.h" @@ -2740,6 +2741,35 @@ static struct ctl_table kern_table[] = { .extra2 = &one_hundred, }, #endif + { + .procname = "fault_event_enable", + .data = &sysctl_fault_event_enable, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#if defined CONFIG_PRINTK + { + .procname = "fault_event_print", + .data = &sysctl_fault_event_print, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif + { + .procname = "panic_on_fatal_event", + .data = &sysctl_panic_on_fatal_event, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = 
proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, { } }; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 0f77eb4c6644ee60df5ae8d275d48bde1c511567..506ff5b65a4d6f047b9daed7d4d4fabac2fd15da 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include "internal.h" @@ -962,6 +963,9 @@ static void __oom_kill_process(struct task_struct *victim, const char *message) */ do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID); mark_oom_victim(victim); + report_fault_event(raw_smp_processor_id(), victim, + NORMAL_FAULT, FE_SIGNAL, "sigkill by oom"); + pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u pgtables:%lukB oom_score_adj:%hd\n", message, task_pid_nr(victim), victim->comm, K(mm->total_vm), K(get_mm_counter(mm, MM_ANONPAGES)), @@ -1001,6 +1005,8 @@ static void __oom_kill_process(struct task_struct *victim, const char *message) if (unlikely(p->flags & PF_KTHREAD)) continue; do_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_TGID); + report_fault_event(raw_smp_processor_id(), p, NORMAL_FAULT, + FE_SIGNAL, "sigkill by oom"); } rcu_read_unlock(); @@ -1206,6 +1212,9 @@ bool out_of_memory(struct oom_control *oc) oc->constraint = constrained_alloc(oc); if (oc->constraint != CONSTRAINT_MEMORY_POLICY) oc->nodemask = NULL; + report_fault_event(raw_smp_processor_id(), current, NORMAL_FAULT, + is_memcg_oom(oc) ? FE_OOM_CGROUP : FE_OOM_GLOBAL, NULL); /* raw_ variant: the CPU id is only logged and this path may be preemptible */ + check_panic_on_oom(oc); if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task && diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 12da70f39e0cbb6fb2d5a0ea5190448c5482fca7..4a4350f2852ba995db2441e3821374156f98e4a9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -77,6 +77,7 @@ #include #include #include +#include #include "internal.h" #include "shuffle.h" #include "page_reporting.h" @@ -4037,6 +4038,8 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) 
((gfp_mask & __GFP_DMA) && !has_managed_dma())) return; + report_fault_event(raw_smp_processor_id(), current, + NORMAL_FAULT, FE_ALLOCFAIL, NULL); /* raw_ variant: the CPU id is only logged and this path may be preemptible */ va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args;