diff --git a/arch/Kconfig b/arch/Kconfig
index 0fc9c6d591b8bb91779768a1aa1a912dd40d0c4e..d27eb1800cd9b4cb12a68bc1ee3094f4f10e5b70 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1186,4 +1186,85 @@ source "kernel/gcov/Kconfig"
 
 source "scripts/gcc-plugins/Kconfig"
 
+config ARCH_SUPPORTS_FAST_SYSCALL
+	bool
+
+config FAST_SYSCALL
+	bool "Fast syscall support"
+	depends on ARCH_SUPPORTS_FAST_SYSCALL
+	default n
+	help
+	  This enables the fast syscall feature.
+	  The svc exception handling path, which includes auxiliary
+	  functions for debug/trace and core functions like KPTI, has
+	  been identified as overly lengthy. The overhead is especially
+	  noticeable for short syscalls such as lseek() and getpid(),
+	  where the syscall function itself is only a small fraction of
+	  the instructions executed. To address this, fast syscall
+	  provides a shortened svc exception handling path that keeps
+	  only the necessary steps: security, context save and restore.
+
+config XCALL_PREFETCH
+	bool "Xcall prefetch support"
+	depends on FAST_SYSCALL
+	depends on EPOLL
+	default n
+	help
+	  This enables the xcall prefetch feature.
+	  Xcall prefetch implements customized epoll_wait() and read()
+	  system calls that prefetch data in the background. Under a
+	  large number of concurrent connections this improves the
+	  parallelism of read() and increases system throughput.
+	  The feature targets workloads that use the epoll I/O
+	  multiplexing mechanism, spend a large share of their time in
+	  read(), and handle many concurrent connections.
+
+config ARCH_SUPPORTS_FAST_IRQ
+	bool
+
+config FAST_IRQ
+	bool "Fast irq support"
+	depends on ARCH_SUPPORTS_FAST_IRQ
+	default n
+	help
+	  The irq handling path, which includes auxiliary functions for
+	  debug/trace and core functions like KPTI, interrupt time
+	  accounting, feeding interrupts to the random number pool,
+	  interrupt affinity updates and interrupt race handling, as
+	  well as spurious and unhandled interrupt debugging, has been
+	  identified as overly lengthy. To address this, fast irq adds
+	  a lightweight interrupt handling path for latency-sensitive
+	  interrupts that keeps only the necessary steps: security,
+	  context save and restore.
+
+config DEBUG_FEATURE_BYPASS
+	bool "Bypass debug features in fast syscall"
+	depends on FAST_SYSCALL || FAST_IRQ
+	default y
+	help
+	  This bypasses the debug features on the fast syscall path.
+	  The svc exception handling path, which includes auxiliary
+	  functions for debug/trace and core functions like KPTI, has
+	  been identified as overly lengthy. Fast syscall keeps only
+	  the necessary steps. Disable this option to keep the debug
+	  features on the fast syscall path.
+
+config SECURITY_FEATURE_BYPASS
+	bool "Bypass security features in fast syscall"
+	depends on FAST_SYSCALL || FAST_IRQ
+	default y
+	help
+	  This bypasses the security features on the fast syscall path.
+	  The svc exception handling path, which includes auxiliary
+	  functions for debug/trace and core functions like KPTI, has
+	  been identified as overly lengthy. Fast syscall keeps only
+	  the necessary steps. Disable this option to keep the security
+	  features on the fast syscall path.
+
 endmenu
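The options above are also runtime-gated: the arch code only activates the fast paths when the kernel boots with the `xcall`/`xint` parameters, and each task then opts individual syscalls in through the /proc/<pid>/xcall interface added in fs/proc/base.c further down. A minimal userspace sketch of that opt-in (illustrative only, error handling trimmed):

/* Enable xcall for getpid() in the current task.
 * Writing "N" enables syscall N, "!N" disables it, and "@N" additionally
 * selects an already-enabled syscall for xcall prefetch.
 */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
	char path[64], buf[16];
	int fd, len;

	snprintf(path, sizeof(path), "/proc/%d/xcall", getpid());
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return 1;
	len = snprintf(buf, sizeof(buf), "%d", (int)SYS_getpid);
	if (write(fd, buf, len) != len)
		perror("write");
	close(fd);
	return 0;
}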
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 76f07a283d4e048ee97875a79ccd0f73453fe2b5..eb30ef59aca296a7cb2e581c476f0eddc444173c 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -220,6 +220,8 @@ config ARM64
 	select THREAD_INFO_IN_TASK
 	select HAVE_LIVEPATCH_WO_FTRACE
 	select THP_NUMA_CONTROL if ARM64_64K_PAGES && NUMA_BALANCING && TRANSPARENT_HUGEPAGE
+	select ARCH_SUPPORTS_FAST_SYSCALL if !ARM64_MTE && !KASAN_HW_TAGS
+	select ARCH_SUPPORTS_FAST_IRQ if ARM_GIC_V3 && !ARM64_MTE && !KASAN_HW_TAGS
 	help
 	  ARM 64-bit (AArch64) Linux support.
 
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig
index 97f400b031db4365d660eb6af6bf38187f4f5303..f6d1d7d4f2a431c9a0e2e478141b06f92eb20f6e 100644
--- a/arch/arm64/configs/openeuler_defconfig
+++ b/arch/arm64/configs/openeuler_defconfig
@@ -922,6 +922,13 @@ CONFIG_ARCH_HAS_GCOV_PROFILE_ALL=y
 # end of GCOV-based kernel profiling
 
 CONFIG_HAVE_GCC_PLUGINS=y
+CONFIG_ARCH_SUPPORTS_FAST_SYSCALL=y
+CONFIG_FAST_SYSCALL=y
+CONFIG_XCALL_PREFETCH=y
+CONFIG_ARCH_SUPPORTS_FAST_IRQ=y
+CONFIG_FAST_IRQ=y
+CONFIG_DEBUG_FEATURE_BYPASS=y
+CONFIG_SECURITY_FEATURE_BYPASS=y
 # end of General architecture-dependent options
 
 CONFIG_RT_MUTEXES=y
diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
index ce9fbf260a3cf20c0cbab1ede4f5982dbf3c7ac2..e2a2b3e40c94f9ac3ad913a0cae5663e15119a6d 100644
--- a/arch/arm64/include/asm/cpucaps.h
+++ b/arch/arm64/include/asm/cpucaps.h
@@ -81,6 +81,8 @@
 #define ARM64_HAS_PBHA_STAGE2			73
 #define ARM64_SME				74
 #define ARM64_SME_FA64				75
+#define ARM64_HAS_XCALL				76
+#define ARM64_HAS_XINT				77
 
 #define ARM64_NCAPS				80
 
diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h
index d38d526d084e52452c0023a11243e520ad818349..4b7994bd2b94f3a188ae00bc8a2031e584289ea1 100644
--- a/arch/arm64/include/asm/exception.h
+++ b/arch/arm64/include/asm/exception.h
@@ -48,6 +48,9 @@ void do_el0_sys(unsigned long esr, struct pt_regs *regs);
 void do_sp_pc_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs);
 void bad_el0_sync(struct pt_regs *regs, int reason, unsigned int esr);
 void do_el0_cp15(unsigned long esr, struct pt_regs *regs);
+#ifdef CONFIG_FAST_SYSCALL
+void do_el0_xcall(struct pt_regs *regs);
+#endif
 void do_el0_svc(struct pt_regs *regs);
 void do_el0_svc_compat(struct pt_regs *regs);
 void do_el0_fpac(struct pt_regs *regs, unsigned long esr);
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
index c247e11130db7d88fcff5c3b237864f8b87855ca..7c6ad4b1667b585c222b1f30412c01e010a26505 100644
--- a/arch/arm64/kernel/asm-offsets.c
+++ b/arch/arm64/kernel/asm-offsets.c
@@ -26,6 +26,9 @@
 int main(void)
 {
+#ifdef CONFIG_FAST_SYSCALL
+  DEFINE(TSK_XCALL,		offsetof(struct task_struct, xcall_enable));
+#endif
   DEFINE(TSK_ACTIVE_MM,		offsetof(struct task_struct, active_mm));
   BLANK();
   DEFINE(TSK_TI_CPU,		offsetof(struct task_struct, thread_info.cpu));
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index dee049d27c745b0935d07e7521ebf53d46d41908..9b4a315e96bc0781cfd8be8b220108327a9fa6ea 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -2155,6 +2155,44 @@ static bool can_clearpage_use_stnp(const struct arm64_cpu_capabilities *entry,
 	return use_clearpage_stnp && has_mor_nontemporal(entry);
 }
 
+#ifdef CONFIG_FAST_SYSCALL
+static bool is_xcall_support;
+static int __init xcall_setup(char *str)
+{
+	is_xcall_support = true;
+	return 1;
+}
+__setup("xcall", xcall_setup);
+
+bool fast_syscall_enabled(void)
+{
+	return is_xcall_support;
+}
+
+static bool has_xcall_support(const struct arm64_cpu_capabilities *entry, int __unused)
+{
+	return is_xcall_support;
+}
+#endif
+
+#ifdef CONFIG_FAST_IRQ
+bool is_xint_support;
+static int __init xint_setup(char *str)
+{
+	if (!cpus_have_cap(ARM64_HAS_SYSREG_GIC_CPUIF))
+		return 1;
+
+	is_xint_support = true;
+	return 1;
+}
+__setup("xint", xint_setup);
+
+static bool has_xint_support(const struct arm64_cpu_capabilities *entry, int __unused)
+{
+	return is_xint_support;
+}
+#endif
+
 static const struct arm64_cpu_capabilities arm64_features[] = {
 	{
 		.desc = "GIC system register CPU interface",
@@ -2701,6 +2739,22 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
 		.cpu_enable = fa64_kernel_enable,
 	},
 #endif /* CONFIG_ARM64_SME */
+#ifdef CONFIG_FAST_SYSCALL
+	{
+		.desc = "Xcall Support",
+		.capability = ARM64_HAS_XCALL,
+		.type = ARM64_CPUCAP_SYSTEM_FEATURE,
+		.matches = has_xcall_support,
+	},
+#endif
+#ifdef CONFIG_FAST_IRQ
+	{
+		.desc = "Xint Support",
+		.capability = ARM64_HAS_XINT,
+		.type = ARM64_CPUCAP_SYSTEM_FEATURE,
+		.matches = has_xint_support,
+	},
+#endif
 	{},
 };
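ARM64_HAS_XCALL and ARM64_HAS_XINT are ordinary SYSTEM_FEATURE capabilities whose .matches callbacks merely report whether the corresponding boot parameter was seen, so the entry code can patch itself with the usual alternative_if_not mechanism and C code can test them with the standard helpers. A hypothetical caller might gate on them like this (sketch, not part of the patch):

#include <asm/cpufeature.h>

/* True once "xcall" was on the command line and the capability has been
 * finalized for all CPUs; the helper name is illustrative.
 */
static inline bool xcall_path_active(void)
{
	return IS_ENABLED(CONFIG_FAST_SYSCALL) &&
	       cpus_have_const_cap(ARM64_HAS_XCALL);
}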
diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c
index 02cd5d57edb6e7eec4087798e1b9daf0c646fde5..3e59cbdedc4cd01640af6223a6781e41421e0789 100644
--- a/arch/arm64/kernel/entry-common.c
+++ b/arch/arm64/kernel/entry-common.c
@@ -388,6 +388,28 @@ static void noinstr el0_fpac(struct pt_regs *regs, unsigned long esr)
 	do_el0_fpac(regs, esr);
 }
 
+#if defined(CONFIG_FAST_SYSCALL) || defined(CONFIG_FAST_IRQ)
+asmlinkage void noinstr fast_enter_from_user_mode(void)
+{
+#ifndef CONFIG_DEBUG_FEATURE_BYPASS
+	lockdep_hardirqs_off(CALLER_ADDR0);
+	CT_WARN_ON(ct_state() != CONTEXT_USER);
+#endif
+	user_exit_irqoff();
+#ifndef CONFIG_DEBUG_FEATURE_BYPASS
+	trace_hardirqs_off_finish();
+#endif
+}
+#endif
+
+#ifdef CONFIG_FAST_SYSCALL
+asmlinkage void noinstr el0_xcall_handler(struct pt_regs *regs)
+{
+	fast_enter_from_user_mode();
+	do_el0_xcall(regs);
+}
+#endif
+
 asmlinkage void noinstr el0_sync_handler(struct pt_regs *regs)
 {
 	unsigned long esr = read_sysreg(esr_el1);
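The entry.S changes that follow decide, before the full kernel_entry runs, whether the pending exception may take the shortened path. The check_xcall_enable macro below is byte-granular bit arithmetic over the per-task xcall_enable bitmap; in rough C it amounts to this (illustrative translation, not part of the patch):

/* Test bit (nr % 8) of byte (nr / 8) in the task's xcall_enable bitmap,
 * which is what check_xcall_enable computes in registers x20/x21.
 */
static inline bool xcall_enabled(const unsigned long *xcall_enable,
				 unsigned int nr)
{
	const unsigned char *bytes = (const unsigned char *)xcall_enable;

	return bytes[nr >> 3] & (1U << (nr & 7));
}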
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 1290f36c83713a65c22d7f61dc839e763a9e10cb..5ed8b8e8e58ec1d69c5143eb67d283204b24fe43 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -182,7 +182,7 @@ alternative_else_nop_endif
 #endif
 	.endm
 
-	.macro	kernel_entry, el, regsize = 64
+	.macro	kernel_entry, el, regsize = 64, fast_mode = std
 	.if	\regsize == 32
 	mov	w0, w0				// zero upper 32 bits of x0
 	.endif
@@ -212,12 +212,19 @@ alternative_else_nop_endif
 	 * Ensure MDSCR_EL1.SS is clear, since we can unmask debug exceptions
 	 * when scheduling.
 	 */
+	.if	\fast_mode == std
 	ldr	x19, [tsk, #TSK_TI_FLAGS]
 	disable_step_tsk x19, x20
+	.endif
 
 	/* Check for asynchronous tag check faults in user space */
+	.if	\fast_mode == std
 	check_mte_async_tcf x22, x23
+	.endif
+
+	.if	\fast_mode == std
 	apply_ssbd 1, x22, x23
+	.endif
 
 	ptrauth_keys_install_kernel tsk, x20, x22, x23
 
@@ -243,9 +250,11 @@ alternative_else_nop_endif
 	add	x29, sp, #S_STACKFRAME
 
 #ifdef CONFIG_ARM64_SW_TTBR0_PAN
+.if	\fast_mode == std
 alternative_if_not ARM64_HAS_PAN
 	bl	__swpan_entry_el\el
 alternative_else_nop_endif
+.endif
 #endif
 
 	stp	x22, x23, [sp, #S_PC]
@@ -268,9 +277,11 @@ alternative_else_nop_endif
 
 	/* Re-enable tag checking (TCO set on exception entry) */
 #ifdef CONFIG_ARM64_MTE
+.if	\fast_mode == std
 alternative_if ARM64_MTE
 	SET_PSTATE_TCO(0)
 alternative_else_nop_endif
+.endif
 #endif
 
 	/*
@@ -283,7 +294,7 @@ alternative_else_nop_endif
 	 */
 	.endm
 
-	.macro	kernel_exit, el
+	.macro	kernel_exit, el, fast_mode = std
 	.if	\el != 0
 	disable_daif
 	.endif
@@ -303,14 +314,18 @@ alternative_else_nop_endif
 	ldp	x21, x22, [sp, #S_PC]		// load ELR, SPSR
 
 #ifdef CONFIG_ARM64_SW_TTBR0_PAN
+.if	\fast_mode == std
 alternative_if_not ARM64_HAS_PAN
 	bl	__swpan_exit_el\el
 alternative_else_nop_endif
+.endif
 #endif
 
 	.if	\el == 0
 	ldr	x23, [sp, #S_SP]		// load return stack pointer
 	msr	sp_el0, x23
+
+	.if	\fast_mode == std
 	tst	x22, #PSR_MODE32_BIT		// native task?
 	b.eq	3f
 
@@ -325,13 +340,17 @@ alternative_if ARM64_WORKAROUND_845719
 alternative_else_nop_endif
 #endif
 3:
+	.endif
+
 	scs_save tsk, x0
 
 	/* No kernel C function calls after this as user keys are set. */
 	ptrauth_keys_install_user tsk, x0, x1, x2
 
+	.if	\fast_mode == std
 	apply_ssbd 0, x0, x1
 	.endif
+	.endif
 
 	msr	elr_el1, x21			// set up the return data
 	msr	spsr_el1, x22
@@ -675,11 +694,91 @@ SYM_CODE_START_LOCAL_NOALIGN(el1_irq)
 	kernel_exit 1
SYM_CODE_END(el1_irq)
 
+#ifdef CONFIG_FAST_SYSCALL
+	.macro	check_esr_el1_ec_svc64
+	/* Only support SVC64 for now */
+	mrs	x20, esr_el1
+	lsr	w20, w20, #ESR_ELx_EC_SHIFT
+	cmp	x20, #ESR_ELx_EC_SVC64
+	.endm
+
+	.macro	check_syscall_nr
+	cmp	x8, __NR_syscalls
+	.endm
+
+	.macro	check_xcall_enable
+	/* x21 = task_struct->xcall_enable */
+	ldr_this_cpu x20, __entry_task, x21
+	ldr	x21, [x20, #TSK_XCALL]
+	/* x20 = sc_no / 8 */
+	lsr	x20, x8, 3
+	ldr	x21, [x21, x20]
+	/* x8 = sc_no % 8 */
+	and	x8, x8, 7
+	mov	x20, 1
+	lsl	x20, x20, x8
+	and	x21, x21, x20
+	cmp	x21, 0
+	.endm
+
+	.macro	check_xcall_pre_kernel_entry
+	stp	x20, x21, [sp, #0]
+	/* is ESR_ELx_EC_SVC64 */
+	check_esr_el1_ec_svc64
+	bne	.Lskip_xcall\@
+	/* x8 >= __NR_syscalls */
+	check_syscall_nr
+	bhs	.Lskip_xcall\@
+	str	x8, [sp, #16]
+	/* is xcall enabled */
+	check_xcall_enable
+	ldr	x8, [sp, #16]
+	beq	.Lskip_xcall\@
+	ldp	x20, x21, [sp, #0]
+	/* do xcall */
+#ifdef CONFIG_SECURITY_FEATURE_BYPASS
+	kernel_entry 0, 64, xcall
+#else
+	kernel_entry 0, 64
+#endif
+	mov	x0, sp
+	bl	el0_xcall_handler
+#ifdef CONFIG_SECURITY_FEATURE_BYPASS
+	disable_daif
+	gic_prio_kentry_setup tmp=x3
+	ldr	x19, [tsk, #TSK_TI_FLAGS]
+	and	x2, x19, #_TIF_WORK_MASK
+	cbnz	x2, fast_work_pending\@
+fast_finish_ret_to_user\@:
+	user_enter_irqoff
+	kernel_exit 0, xcall
+fast_work_pending\@:
+	mov	x0, sp				// 'regs'
+	mov	x1, x19
+	bl	do_notify_resume
+	ldr	x19, [tsk, #TSK_TI_FLAGS]	// re-check for single-step
+	b	fast_finish_ret_to_user\@
+#else
+	b	ret_to_user
+#endif
+.Lskip_xcall\@:
+	ldp	x20, x21, [sp, #0]
+	.endm
+#endif
+
 /*
  * EL0 mode handlers.
  */
 	.align 6
SYM_CODE_START_LOCAL_NOALIGN(el0_sync)
+#ifdef CONFIG_FAST_SYSCALL
+	/* Only support el0 aarch64 sync exception */
+alternative_if_not ARM64_HAS_XCALL
+	b	.Lret_to_kernel_entry
+alternative_else_nop_endif
+	check_xcall_pre_kernel_entry
+.Lret_to_kernel_entry:
+#endif
 	kernel_entry 0
 	mov	x0, sp
 	bl	el0_sync_handler
@@ -705,10 +804,90 @@ SYM_CODE_START_LOCAL_NOALIGN(el0_error_compat)
 	kernel_entry 0, 32
 	b	el0_error_naked
SYM_CODE_END(el0_error_compat)
 #endif
+
+#ifdef CONFIG_FAST_IRQ
+.macro	el0_xint_handler, handler:req
+#if defined(CONFIG_CONTEXT_TRACKING) || defined(CONFIG_TRACE_IRQFLAGS)
+	bl	fast_enter_from_user_mode
+#endif
+	enable_da_f
+#ifndef CONFIG_SECURITY_FEATURE_BYPASS
+	tbz	x22, #55, 1f
+	bl	do_el0_irq_bp_hardening
+1:
+#endif
+	irq_handler	\handler
+.endm
+
+.macro check_xint_pre_kernel_entry
+	stp	x0, x1, [sp, #0]
+	stp	x2, x3, [sp, #16]
+
+	ldr	x0, =irqnr_xint_map
+	/* get the highest-priority pending interrupt number */
+	mrs_s	x1, SYS_ICC_HPPIR1_EL1
+
+	/* xint hwirq can not exceed 1020 */
+	cmp	x1, 1020
+	b.ge	.Lskip_xint\@
+
+	/* x2 = irqnr % 8 */
+	and	x2, x1, #7
+	/* x3 = irqnr / 8 */
+	lsr	x3, x1, #3
+	/* x1 is the byte of irqnr in irqnr_xint_map */
+	ldr	x1, [x0, x3]
+
+	/* Get the check mask */
+	mov	x3, #1
+	/* x3 = 1 << (irqnr % 8) */
+	lsl	x3, x3, x2
+
+	/* x1 = x1 & x3 */
+	ands	x1, x1, x3
+	b.eq	.Lskip_xint\@
+
+	ldp	x0, x1, [sp, #0]
+	ldp	x2, x3, [sp, #16]
+#ifdef CONFIG_SECURITY_FEATURE_BYPASS
+	kernel_entry 0, 64, xint
+	el0_xint_handler handle_arch_irq
+	disable_daif
+	gic_prio_kentry_setup tmp=x3
+	ldr	x19, [tsk, #TSK_TI_FLAGS]
+	and	x2, x19, #_TIF_WORK_MASK
+	cbnz	x2, xint_fast_work_pending\@
+xint_fast_finish_ret_to_user\@:
+	user_enter_irqoff
+	kernel_exit 0, xint
+xint_fast_work_pending\@:
+	mov	x0, sp				// 'regs'
+	mov	x1, x19
+	bl	do_notify_resume
+	b	xint_fast_finish_ret_to_user\@
+#else
+	kernel_entry 0, 64
+	el0_xint_handler handle_arch_irq
+	b	ret_to_user
+#endif
+
+.Lskip_xint\@:
+	ldp	x0, x1, [sp, #0]
+	ldp	x2, x3, [sp, #16]
+.endm
+#endif
 
 	.align 6
SYM_CODE_START_LOCAL_NOALIGN(el0_irq)
+#ifdef CONFIG_FAST_IRQ
+	/* Only support el0 aarch64 irq */
+alternative_if_not ARM64_HAS_XINT
+	b	.Lskip_check_xint
+alternative_else_nop_endif
+	check_xint_pre_kernel_entry
+.Lskip_check_xint:
+#endif
 	kernel_entry 0
el0_irq_naked:
 	el0_interrupt_handler handle_arch_irq
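check_xint_pre_kernel_entry can peek at the pending interrupt this early because ICC_HPPIR1_EL1 is a non-destructive read: unlike ICC_IAR1_EL1 it does not acknowledge the interrupt, so the normal flow is undisturbed when the fast path is declined. The decision it implements is roughly this C (sketch only; the real check runs in assembly before kernel_entry):

static bool take_xint_path(void)
{
	u64 hwirq = read_sysreg_s(SYS_ICC_HPPIR1_EL1);

	/* only INTIDs below 1020 are eligible for the xint map */
	if (hwirq >= 1020)
		return false;
	return test_bit(hwirq, irqnr_xint_map);
}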
diff --git a/arch/arm64/kernel/syscall.c b/arch/arm64/kernel/syscall.c
index 9bd304568d90349164ff1926d100a833ac9c7399..2d73eaaf9bc28d9b24979f6fafd8d7fbb2f817f0 100644
--- a/arch/arm64/kernel/syscall.c
+++ b/arch/arm64/kernel/syscall.c
@@ -106,6 +106,46 @@ static void cortex_a76_erratum_1463225_svc_handler(void)
 static void cortex_a76_erratum_1463225_svc_handler(void) { }
 #endif /* CONFIG_ARM64_ERRATUM_1463225 */
 
+#ifdef CONFIG_FAST_SYSCALL
+static void el0_xcall_common(struct pt_regs *regs, int scno, int sc_nr,
+			     const syscall_fn_t syscall_table[])
+{
+	unsigned long flags = read_thread_flags();
+
+	regs->orig_x0 = regs->regs[0];
+	regs->syscallno = scno;
+
+#ifndef CONFIG_SECURITY_FEATURE_BYPASS
+	cortex_a76_erratum_1463225_svc_handler();
+#endif
+	local_daif_restore(DAIF_PROCCTX);
+
+	if (system_supports_mte() && (flags & _TIF_MTE_ASYNC_FAULT)) {
+		syscall_set_return_value(current, regs, -ERESTARTNOINTR, 0);
+		return;
+	}
+
+	if (has_syscall_work(flags)) {
+		if (scno == NO_SYSCALL)
+			syscall_set_return_value(current, regs, -ENOSYS, 0);
+		scno = syscall_trace_enter(regs);
+		if (scno == NO_SYSCALL)
+			goto trace_exit;
+	}
+
+	invoke_syscall(regs, scno, sc_nr, syscall_table);
+
+	if (!has_syscall_work(flags) && !IS_ENABLED(CONFIG_DEBUG_RSEQ)) {
+		flags = read_thread_flags();
+		if (!has_syscall_work(flags) && !(flags & _TIF_SINGLESTEP))
+			return;
+	}
+
+trace_exit:
+	syscall_trace_exit(regs);
+}
+#endif
+
 static void el0_svc_common(struct pt_regs *regs, int scno, int sc_nr,
 			   const syscall_fn_t syscall_table[])
 {
@@ -237,6 +277,23 @@ static inline void delouse_pt_regs(struct pt_regs *regs)
 }
 #endif
 
+#ifdef CONFIG_FAST_SYSCALL
+void do_el0_xcall(struct pt_regs *regs)
+{
+	const syscall_fn_t *t = sys_call_table;
+
+#ifdef CONFIG_ARM64_ILP32
+	if (is_ilp32_compat_task()) {
+		t = ilp32_sys_call_table;
+		delouse_pt_regs(regs);
+	}
+#endif
+
+	fp_user_discard();
+	el0_xcall_common(regs, regs->regs[8], __NR_syscalls, t);
+}
+#endif
+
 void do_el0_svc(struct pt_regs *regs)
 {
 	const syscall_fn_t *t = sys_call_table;
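Because the win here is fixed per-syscall overhead, the easiest end-to-end sanity check is a tight loop over a trivial syscall, run once with xcall enabled for it and once without. A hedged userspace harness (illustrative, not from the patch):

#include <stdio.h>
#include <time.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
	enum { N = 10 * 1000 * 1000 };
	struct timespec t0, t1;

	clock_gettime(CLOCK_MONOTONIC, &t0);
	for (int i = 0; i < N; i++)
		syscall(SYS_getpid);	/* raw syscall, bypasses glibc caching */
	clock_gettime(CLOCK_MONOTONIC, &t1);

	printf("%.1f ns/syscall\n",
	       ((t1.tv_sec - t0.tv_sec) * 1e9 +
		(t1.tv_nsec - t0.tv_nsec)) / (double)N);
	return 0;
}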
diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c
index 87af452d82dc866dd4f75c9dee2168f01c70eb33..ef2f1ca54a42f07e599cf9c75ea79d2c279ac363 100644
--- a/drivers/irqchip/irq-gic-v3.c
+++ b/drivers/irqchip/irq-gic-v3.c
@@ -31,6 +31,10 @@
 
 #include "irq-gic-common.h"
 
+#ifdef CONFIG_FAST_IRQ
+#include "../../../kernel/irq/internals.h"
+#endif
+
 #define GICD_INT_NMI_PRI	(GICD_INT_DEF_PRI & ~0x80)
 
 #define FLAGS_WORKAROUND_GICR_WAKER_MSM8996	(1ULL << 0)
@@ -720,6 +724,132 @@ static asmlinkage void __exception_irq_entry gic_handle_irq(struct pt_regs *regs)
 	}
 }
 
+#ifdef CONFIG_FAST_IRQ
+DECLARE_BITMAP(irqnr_xint_map, 1024);
+
+static bool can_set_xint(unsigned int hwirq)
+{
+	if (__get_intid_range(hwirq) == SGI_RANGE ||
+	    __get_intid_range(hwirq) == SPI_RANGE)
+		return true;
+
+	return false;
+}
+
+static bool xint_transform(int irqno, enum xint_op op)
+{
+	struct irq_data *data = irq_get_irq_data(irqno);
+	int hwirq;
+
+	while (data->parent_data)
+		data = data->parent_data;
+
+	hwirq = data->hwirq;
+
+	if (!can_set_xint(hwirq))
+		return false;
+
+	switch (op) {
+	case IRQ_TO_XINT:
+		set_bit(hwirq, irqnr_xint_map);
+		return true;
+	case XINT_TO_IRQ:
+		clear_bit(hwirq, irqnr_xint_map);
+		return false;
+	case XINT_SET_CHECK:
+		return test_bit(hwirq, irqnr_xint_map);
+	case XINT_RANGE_CHECK:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static ssize_t xint_proc_write(struct file *file,
+		const char __user *buffer, size_t count, loff_t *pos)
+{
+	int irq = (int)(long)PDE_DATA(file_inode(file));
+	bool xint_state = false;
+	enum xint_op switch_type;
+	unsigned long val;
+	char *buf = NULL;
+
+	if (!xint_transform(irq, XINT_RANGE_CHECK))
+		return -EPERM;
+
+	buf = memdup_user_nul(buffer, count);
+	if (IS_ERR(buf))
+		return PTR_ERR(buf);
+
+	if (kstrtoul(buf, 0, &val) || (val != 0 && val != 1)) {
+		kfree(buf);
+		return -EINVAL;
+	}
+
+	xint_state = xint_transform(irq, XINT_SET_CHECK);
+	if (xint_state == val) {
+		kfree(buf);
+		return -EBUSY;
+	}
+
+	if (xint_state) {
+		switch_type = XINT_TO_IRQ;
+		xint_remove_debugfs_entry(irq);
+	} else {
+		switch_type = IRQ_TO_XINT;
+		xint_add_debugfs_entry(irq);
+	}
+
+	disable_irq(irq);
+	local_irq_disable();
+
+	xint_transform(irq, switch_type);
+
+	local_irq_enable();
+	enable_irq(irq);
+
+	kfree(buf);
+
+	return count;
+}
+
+static int xint_proc_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, "%d\n", xint_transform((long)m->private, XINT_SET_CHECK));
+	return 0;
+}
+
+static int xint_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, xint_proc_show, PDE_DATA(inode));
+}
+
+static const struct proc_ops xint_proc_ops = {
+	.proc_open	= xint_proc_open,
+	.proc_read	= seq_read,
+	.proc_lseek	= seq_lseek,
+	.proc_release	= single_release,
+	.proc_write	= xint_proc_write,
+};
+
+void register_irqchip_proc(struct irq_desc *desc, void *irqp)
+{
+	if (!is_xint_support)
+		return;
+
+	/* create /proc/irq/<irq>/xint */
+	proc_create_data("xint", 0644, desc->dir, &xint_proc_ops, irqp);
+}
+
+void unregister_irqchip_proc(struct irq_desc *desc)
+{
+	if (!is_xint_support)
+		return;
+
+	remove_proc_entry("xint", desc->dir);
+}
+#endif /* CONFIG_FAST_IRQ */
+
 static u32 gic_get_pribits(void)
 {
 	u32 pribits;
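With the GICv3 glue in place, an interrupt is switched onto the fast path at runtime through /proc/irq/<n>/xint: writing 1 marks the hwirq in irqnr_xint_map, 0 reverts it, re-writing the current state returns -EBUSY, and only SGI- and SPI-range interrupts are accepted. A small sketch (the IRQ number is made up):

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/irq/45/xint", "w");	/* 45 is illustrative */

	if (!f) {
		perror("fopen");
		return 1;
	}
	fputs("1", f);		/* "0" would switch the IRQ back */
	fclose(f);
	return 0;
}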
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 5ce1ea1f452b1e010e50da04730608a3e4f95ccd..0793174a5c697f0edca6e7984ca7600029278511 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -768,6 +768,404 @@ static void epi_rcu_free(struct rcu_head *head)
 	kmem_cache_free(epi_cache, epi);
 }
 
+#ifdef CONFIG_XCALL_PREFETCH
+#define CREATE_TRACE_POINTS
+#include <trace/events/xcall.h>
+
+#define XCALL_CACHE_PAGE_ORDER	2
+#define XCALL_CACHE_BUF_SIZE	((1 << XCALL_CACHE_PAGE_ORDER) * PAGE_SIZE)
+DEFINE_PER_CPU_ALIGNED(unsigned long, xcall_cache_hit);
+DEFINE_PER_CPU_ALIGNED(unsigned long, xcall_cache_miss);
+
+#define PREFETCH_ITEM_HASH_BITS 6
+static DEFINE_HASHTABLE(xcall_item_table, PREFETCH_ITEM_HASH_BITS);
+static DEFINE_RWLOCK(xcall_table_lock);
+static struct workqueue_struct *rc_work;
+static struct cpumask xcall_mask;
+
+static ssize_t xcall_mask_proc_write(struct file *file, const char __user *buf,
+				     size_t count, loff_t *ppos)
+{
+	struct cpumask tmp;
+	int err;
+
+	err = cpumask_parselist_user(buf, count, &tmp);
+	if (err)
+		return err;
+
+	if (!cpumask_intersects(&tmp, cpu_online_mask)) {
+		pr_warn("cpu %*pbl is not online.\n", cpumask_pr_args(&tmp));
+		return -EINVAL;
+	}
+
+	cpumask_copy(&xcall_mask, &tmp);
+	return count;
+}
+
+static int xcall_mask_proc_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, "%*pbl\n", cpumask_pr_args(&xcall_mask));
+	return 0;
+}
+
+static int xcall_mask_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, xcall_mask_proc_show, PDE_DATA(inode));
+}
+
+static const struct proc_ops xcall_mask_fops = {
+	.proc_open	= xcall_mask_proc_open,
+	.proc_read	= seq_read,
+	.proc_lseek	= seq_lseek,
+	.proc_release	= single_release,
+	.proc_write	= xcall_mask_proc_write,
+};
+
+static ssize_t xcall_prefetch_write(struct file *file, const char __user *buf,
+				    size_t count, loff_t *pos)
+{
+	int cpu;
+
+	for_each_cpu(cpu, cpu_online_mask) {
+		*per_cpu_ptr(&xcall_cache_hit, cpu) = 0;
+		*per_cpu_ptr(&xcall_cache_miss, cpu) = 0;
+	}
+
+	return count;
+}
+
+static int xcall_prefetch_show(struct seq_file *m, void *v)
+{
+	unsigned long hit = 0, miss = 0;
+	unsigned int cpu;
+	u64 percent;
+
+	for_each_cpu(cpu, cpu_online_mask) {
+		hit = *per_cpu_ptr(&xcall_cache_hit, cpu);
+		miss = *per_cpu_ptr(&xcall_cache_miss, cpu);
+
+		if (hit == 0 && miss == 0)
+			continue;
+
+		percent = DIV_ROUND_CLOSEST(hit * 100ULL, hit + miss);
+		seq_printf(m, "cpu%u epoll cache_{hit,miss}: %lu,%lu, hit ratio: %llu%%\n",
+			   cpu, hit, miss, percent);
+	}
+	return 0;
+}
+
+static int xcall_prefetch_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, xcall_prefetch_show, NULL);
+}
+
+static const struct proc_ops xcall_prefetch_fops = {
+	.proc_open	= xcall_prefetch_open,
+	.proc_read	= seq_read,
+	.proc_write	= xcall_prefetch_write,
+	.proc_lseek	= seq_lseek,
+	.proc_release	= single_release
+};
+
+extern bool fast_syscall_enabled(void);
+static int __init init_xcall_prefetch_procfs(void)
+{
+	struct proc_dir_entry *xcall_proc_dir, *prefetch_dir, *xcall_mask_dir;
+
+	if (!fast_syscall_enabled())
+		return 0;
+
+	xcall_proc_dir = proc_mkdir("xcall", NULL);
+	if (!xcall_proc_dir)
+		return -ENOMEM;
+	prefetch_dir = proc_create("prefetch", 0644, xcall_proc_dir, &xcall_prefetch_fops);
+	if (!prefetch_dir)
+		goto rm_xcall_proc_dir;
+	xcall_mask_dir = proc_create("cpu_list", 0644, xcall_proc_dir,
+				     &xcall_mask_fops);
+	if (!xcall_mask_dir)
+		goto rm_prefetch_dir;
+
+	cpumask_copy(&xcall_mask, cpu_online_mask);
+	return 0;
+
+rm_prefetch_dir:
+	proc_remove(prefetch_dir);
+rm_xcall_proc_dir:
+	proc_remove(xcall_proc_dir);
+	return -ENOMEM;
+}
+device_initcall(init_xcall_prefetch_procfs);
+
+static inline bool transition_state(struct prefetch_item *pfi,
+				    enum cache_state old, enum cache_state new)
+{
+	return atomic_cmpxchg(&pfi->state, old, new) == old;
+}
+
+static void xcall_prefetch_init(void)
+{
+	rc_work = alloc_workqueue("eventpoll_rc", 0, 0);
+	if (!rc_work)
+		pr_warn("alloc eventpoll_rc workqueue failed.\n");
+
+	hash_init(xcall_item_table);
+}
+
+static struct prefetch_item *find_prefetch_item(struct file *file)
+{
+	unsigned int hash = hash_64((u64)file, PREFETCH_ITEM_HASH_BITS);
+	struct prefetch_item *found = NULL;
+
+	read_lock(&xcall_table_lock);
+	hash_for_each_possible(xcall_item_table, found, node, hash) {
+		if (found->file == file)
+			break;
+	}
+	read_unlock(&xcall_table_lock);
+
+	return found;
+}
+
+static void prefetch_work_fn(struct work_struct *work)
+{
+	struct prefetch_item *pfi = container_of(work, struct prefetch_item, work);
+
+	if (!transition_state(pfi, XCALL_CACHE_NONE, XCALL_CACHE_PREFETCH))
+		return;
+
+	trace_epoll_rc_prefetch(pfi->file);
+	pfi->pos = 0;
+	pfi->len = kernel_read(pfi->file, pfi->cache,
+			       XCALL_CACHE_BUF_SIZE, &pfi->file->f_pos);
+	transition_state(pfi, XCALL_CACHE_PREFETCH, XCALL_CACHE_READY);
+	trace_epoll_rc_ready(pfi->file, pfi->len);
+}
+
+static void set_prefetch_numa_cpu(struct prefetch_item *pfi, int fd)
+{
+	int cur_cpu = smp_processor_id();
+	struct cpumask tmp;
+	int cpu;
+
+	cpumask_copy(&tmp, &xcall_mask);
+	cpumask_and(&pfi->related_cpus, cpu_cpu_mask(cur_cpu), cpu_online_mask);
+	if (cpumask_intersects(&tmp, &pfi->related_cpus))
+		cpumask_and(&pfi->related_cpus, &pfi->related_cpus, &tmp);
+	cpu = cpumask_next(fd % cpumask_weight(&pfi->related_cpus),
+			   &pfi->related_cpus);
+	if (cpu > cpumask_last(&pfi->related_cpus))
+		cpu = cpumask_first(&pfi->related_cpus);
+	pfi->cpu = cpu;
+}
+
+static struct prefetch_item *alloc_prefetch_item(struct epitem *epi)
+{
+	struct file *tfile = epi->ffd.file;
+	struct prefetch_item *pfi;
+	int fd = epi->ffd.fd;
+	unsigned int hash;
+
+	pfi = kmalloc(sizeof(struct prefetch_item), GFP_KERNEL);
+	if (!pfi)
+		return NULL;
+
+	pfi->cache_pages = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO,
+				       XCALL_CACHE_PAGE_ORDER);
+	if (!pfi->cache_pages) {
+		kfree(pfi);
+		return NULL;
+	}
+
+	pfi->cache = page_address(pfi->cache_pages);
+	atomic_set(&pfi->state, XCALL_CACHE_NONE);
+	INIT_WORK(&pfi->work, prefetch_work_fn);
+	INIT_HLIST_NODE(&pfi->node);
+	pfi->file = tfile;
+	pfi->len = 0;
+	pfi->pos = 0;
+	set_prefetch_numa_cpu(pfi, fd);
+
+	write_lock(&xcall_table_lock);
+	hash = hash_64((u64)tfile, PREFETCH_ITEM_HASH_BITS);
+	hash_add(xcall_item_table, &pfi->node, hash);
+	write_unlock(&xcall_table_lock);
+
+	return pfi;
+}
+
+void free_prefetch_item(struct file *file)
+{
+	unsigned int hash = hash_64((u64)file, PREFETCH_ITEM_HASH_BITS);
+	struct prefetch_item *pfi = NULL;
+	struct hlist_node *next;
+	bool has_pfi = false;
+
+	if (!fast_syscall_enabled())
+		return;
+
+	write_lock(&xcall_table_lock);
+	hash_for_each_possible_safe(xcall_item_table, pfi, next, node, hash) {
+		if (pfi->file == file) {
+			if (!hlist_unhashed(&pfi->node))
+				hlist_del_init(&pfi->node);
+			has_pfi = true;
+			break;
+		}
+	}
+	write_unlock(&xcall_table_lock);
+	if (!has_pfi)
+		return;
+
+	cancel_work_sync(&pfi->work);
+	__free_pages(pfi->cache_pages, XCALL_CACHE_PAGE_ORDER);
+	pfi->cache = NULL;
+	kfree(pfi);
+}
+
+static int xcall_read(struct prefetch_item *pfi, char __user *buf, size_t count)
+{
+	ssize_t copy_len = 0;
+
+	/*
+	 * Whenever we memcpy from the prefetch buffer, the pfi state must
+	 * be held at "CANCEL" to avoid racing on the prefetch buffer
+	 * between the prefetch worker calling kernel_read() and other
+	 * threads calling copy_to_user(), and to avoid racing on the
+	 * prefetched file between the prefetch worker calling
+	 * kernel_read() and other threads calling vfs_read().
+	 */
+	while (!transition_state(pfi, XCALL_CACHE_READY, XCALL_CACHE_CANCEL)) {
+		/*
+		 * If the prefetch worker returned an error or the prefetch
+		 * has not started yet, there is no point busy-waiting here;
+		 * fall back to a slow vfs_read() to make sure no newly
+		 * arrived data is missed.
+		 */
+		if (transition_state(pfi, XCALL_CACHE_NONE, XCALL_CACHE_CANCEL))
+			goto slow_read;
+	}
+
+	copy_len = pfi->len;
+	if (unlikely(copy_len < 0))
+		goto slow_read;
+
+	if (copy_len == 0) {
+		this_cpu_inc(xcall_cache_hit);
+		trace_epoll_rc_hit(pfi->file, 0);
+		transition_state(pfi, XCALL_CACHE_CANCEL, XCALL_CACHE_NONE);
+		return 0;
+	}
+
+	copy_len = (copy_len >= count) ? count : copy_len;
+	copy_len -= copy_to_user(buf, (void *)(pfi->cache + pfi->pos), copy_len);
+	pfi->len -= copy_len;
+	pfi->pos += copy_len;
+	if (pfi->len == 0)
+		transition_state(pfi, XCALL_CACHE_CANCEL, XCALL_CACHE_NONE);
+	else if (pfi->len > 0)
+		transition_state(pfi, XCALL_CACHE_CANCEL, XCALL_CACHE_READY);
+
+	this_cpu_inc(xcall_cache_hit);
+	trace_epoll_rc_hit(pfi->file, copy_len);
+	return copy_len;
+
+slow_read:
+	this_cpu_inc(xcall_cache_miss);
+	trace_epoll_rc_miss(pfi->file);
+	pfi->len = 0;
+	pfi->pos = 0;
+	cancel_work(&pfi->work);
+
+	return -EAGAIN;
+}
+
+int xcall_read_begin(struct file *file, char __user *buf, size_t count)
+{
+	struct prefetch_item *pfi = NULL;
+
+	if (!current->xcall_select ||
+	    !test_bit(__NR_epoll_pwait, current->xcall_select))
+		return -EAGAIN;
+
+	pfi = find_prefetch_item(file);
+	if (!pfi)
+		return -EAGAIN;
+
+	return xcall_read(pfi, buf, count);
+}
+
+void xcall_read_end(struct file *file)
+{
+	struct prefetch_item *pfi = NULL;
+
+	if (!current->xcall_select ||
+	    !test_bit(__NR_epoll_pwait, current->xcall_select))
+		return;
+
+	pfi = find_prefetch_item(file);
+	if (!pfi)
+		return;
+
+	transition_state(pfi, XCALL_CACHE_CANCEL, XCALL_CACHE_NONE);
+}
+
+static int get_async_prefetch_cpu(struct prefetch_item *pfi)
+{
+	int cpu;
+
+	if (pfi->cpu != smp_processor_id())
+		return pfi->cpu;
+
+	cpu = cpumask_next(pfi->cpu, &pfi->related_cpus);
+	if (cpu > cpumask_last(&pfi->related_cpus))
+		cpu = cpumask_first(&pfi->related_cpus);
+	pfi->cpu = cpu;
+	return pfi->cpu;
+}
+
+static void ep_prefetch_item_enqueue(struct eventpoll *ep, struct epitem *epi)
+{
+	struct prefetch_item *pfi;
+	int cpu, err;
+
+	if (unlikely(!rc_work) || !current->xcall_select ||
+	    !test_bit(__NR_epoll_pwait, current->xcall_select))
+		return;
+
+	if (!(epi->event.events & EPOLLIN) ||
+	    !sock_from_file(epi->ffd.file, &err) ||
+	    !(epi->ffd.file->f_mode & FMODE_READ))
+		return;
+
+	pfi = find_prefetch_item(epi->ffd.file);
+	if (unlikely(!pfi)) {
+		pfi = alloc_prefetch_item(epi);
+		if (unlikely(!pfi))
+			return;
+	}
+
+	if (atomic_read(&pfi->state) != XCALL_CACHE_NONE)
+		return;
+
+	cpu = get_async_prefetch_cpu(pfi);
+	queue_work_on(cpu, rc_work, &pfi->work);
+	trace_epoll_rc_queue(pfi->file, cpu);
+}
+
+static void xcall_cancel_work(struct file *file)
+{
+	struct prefetch_item *pfi;
+
+	if (!current->xcall_select ||
+	    !test_bit(__NR_epoll_pwait, current->xcall_select))
+		return;
+
+	pfi = find_prefetch_item(file);
+	if (pfi)
+		cancel_work_sync(&pfi->work);
+}
+#endif
+
 /*
  * Removes a "struct epitem" from the eventpoll RB tree and deallocates
  * all the associated resources. Must be called with "mtx" held.
@@ -782,6 +1180,9 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
 	 * Removes poll wait queue hooks.
 	 */
 	ep_unregister_pollwait(ep, epi);
+#ifdef CONFIG_XCALL_PREFETCH
+	xcall_cancel_work(file);
+#endif
 
 	/* Remove the current item from the list of epoll hooks */
 	spin_lock(&file->f_lock);
@@ -1751,6 +2152,10 @@ static __poll_t ep_send_events_proc(struct eventpoll *ep, struct list_head *head
 		if (!revents)
 			continue;
 
+#ifdef CONFIG_XCALL_PREFETCH
+		ep_prefetch_item_enqueue(ep, epi);
+#endif
+
 		if (__put_user(revents, &uevent->events) ||
 		    __put_user(epi->event.data, &uevent->data)) {
 			list_add(&epi->rdllink, head);
@@ -2454,6 +2859,10 @@ static int __init eventpoll_init(void)
 	pwq_cache = kmem_cache_create("eventpoll_pwq",
 		sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);
 
+#ifdef CONFIG_XCALL_PREFETCH
+	xcall_prefetch_init();
+#endif
+
 	return 0;
 }
 fs_initcall(eventpoll_init);
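On the consumer side nothing changes syntactically; the target workload is the classic epoll loop sketched below. Once the task has selected epoll_pwait via "@" in /proc/<pid>/xcall, ep_send_events_proc() queues prefetch work for every ready socket, and the subsequent read() is served from the per-file cache on a hit (illustrative userspace fragment, not from the patch):

#include <sys/epoll.h>
#include <unistd.h>

void serve_ready(int epfd)
{
	struct epoll_event evs[64];
	char buf[4096];
	int n = epoll_wait(epfd, evs, 64, -1);

	for (int i = 0; i < n; i++) {
		if (!(evs[i].events & EPOLLIN))
			continue;
		/* on a cache hit this copies straight from the prefetch buffer */
		while (read(evs[i].data.fd, buf, sizeof(buf)) > 0)
			;
	}
}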
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 4e0054a37c4c54fa808264b1130ff2074d292bad..507cc47ab19359541b75b0c8ffe6018b37dcd168 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3589,6 +3589,155 @@ static const struct file_operations proc_pid_sg_level_operations = {
 };
 #endif
 
+#ifdef CONFIG_FAST_SYSCALL
+bool fast_syscall_enabled(void);
+
+static int xcall_show(struct seq_file *m, void *v)
+{
+	struct inode *inode = m->private;
+	struct task_struct *p;
+	unsigned int rs, re, sc_no;
+
+	if (!fast_syscall_enabled())
+		return -EACCES;
+
+	p = get_proc_task(inode);
+	if (!p)
+		return -ESRCH;
+
+	if (!p->xcall_enable)
+		goto out;
+
+	seq_printf(m, "Enabled Total[%d/%d]:", bitmap_weight(p->xcall_enable, __NR_syscalls),
+		   __NR_syscalls);
+
+	for (rs = 0, bitmap_next_set_region(p->xcall_enable, &rs, &re, __NR_syscalls);
+	     rs < re; rs = re + 1,
+	     bitmap_next_set_region(p->xcall_enable, &rs, &re, __NR_syscalls)) {
+		rs == (re - 1) ? seq_printf(m, "%d,", rs) :
+				 seq_printf(m, "%d-%d,", rs, re - 1);
+	}
+	seq_puts(m, "\nAvailable:\n");
+
+	for (sc_no = 0; sc_no < __NR_syscalls; sc_no++) {
+		if (p->xcall_select && test_bit(sc_no, p->xcall_select)) {
+			seq_printf(m, "NR_syscall: %3d: enabled: %d ",
+				   sc_no, test_bit(sc_no, p->xcall_enable));
+			seq_printf(m, "xcall_select: %d\n",
+				   test_bit(sc_no, p->xcall_select));
+		}
+	}
+out:
+	put_task_struct(p);
+
+	return 0;
+}
+
+static int xcall_open(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, xcall_show, inode);
+}
+
+static int xcall_enable_one(struct task_struct *p, unsigned int sc_no)
+{
+	if (p->xcall_select && test_bit(sc_no, p->xcall_select))
+		return -EINVAL;
+
+	if (!bitmap_weight(p->xcall_enable, __NR_syscalls)) {
+		p->xcall_select = bitmap_zalloc(__NR_syscalls, GFP_KERNEL);
+		if (!p->xcall_select)
+			return -ENOMEM;
+	}
+
+	bitmap_set(p->xcall_enable, sc_no, 1);
+	return 0;
+}
+
+static int xcall_disable_one(struct task_struct *p, unsigned int sc_no)
+{
+	if (p->xcall_select && test_bit(sc_no, p->xcall_select))
+		return -EINVAL;
+
+	bitmap_clear(p->xcall_enable, sc_no, 1);
+	return 0;
+}
+
+static int xcall_select_table(struct task_struct *p, unsigned int sc_no)
+{
+	if (!p->xcall_select || !test_bit(sc_no, p->xcall_enable)) {
+		pr_err("Please enable NR_syscall: %d for xcall first.\n", sc_no);
+		return -EINVAL;
+	}
+
+	if (test_bit(sc_no, p->xcall_select))
+		return -EINVAL;
+
+	bitmap_set(p->xcall_select, sc_no, 1);
+
+	return 0;
+}
+
+static ssize_t xcall_write(struct file *file, const char __user *buf,
+			   size_t count, loff_t *offset)
+{
+	struct inode *inode = file_inode(file);
+	struct task_struct *p;
+	char buffer[TASK_COMM_LEN];
+	const size_t maxlen = sizeof(buffer) - 1;
+	unsigned int sc_no = __NR_syscalls;
+	int ret = 0;
+	int is_clear = 0, is_switch = 0;
+
+	if (!fast_syscall_enabled())
+		return -EACCES;
+
+	memset(buffer, 0, sizeof(buffer));
+	if (!count || copy_from_user(buffer, buf, count > maxlen ? maxlen : count))
+		return -EFAULT;
+
+	p = get_proc_task(inode);
+	if (!p || !p->xcall_enable)
+		return -ESRCH;
+
+	if (buffer[0] == '!')
+		is_clear = 1;
+	else if (buffer[0] == '@')
+		is_switch = 1;
+
+	if (kstrtouint(buffer + is_clear + is_switch, 10, &sc_no)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (sc_no >= __NR_syscalls) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (is_switch && test_bit(sc_no, p->xcall_enable))
+		ret = xcall_select_table(p, sc_no);
+	else if (!is_switch && !is_clear && !test_bit(sc_no, p->xcall_enable))
+		ret = xcall_enable_one(p, sc_no);
+	else if (!is_switch && is_clear && test_bit(sc_no, p->xcall_enable))
+		ret = xcall_disable_one(p, sc_no);
+	else
+		ret = -EINVAL;
+
+out:
+	put_task_struct(p);
+
+	return ret ? ret : count;
+}
+
+static const struct file_operations proc_pid_xcall_operations = {
+	.open		= xcall_open,
+	.read		= seq_read,
+	.write		= xcall_write,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+#endif
+
 /*
  * Thread groups
  */
@@ -3615,6 +3764,9 @@ static const struct pid_entry tgid_base_stuff[] = {
 #ifdef CONFIG_QOS_SCHED_SMART_GRID
 	REG("smart_grid_level", 0644, proc_pid_sg_level_operations),
 #endif
+#ifdef CONFIG_FAST_SYSCALL
+	REG("xcall", 0644, proc_pid_xcall_operations),
+#endif
 #ifdef CONFIG_SCHED_AUTOGROUP
 	REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations),
 #endif
diff --git a/fs/read_write.c b/fs/read_write.c
index da03b3e65cf3be6ab98bc26302e8ac9109ebef8f..63e1f4c2fd1ba2423fe3b226c263f9a3e04f2f72 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -623,7 +623,15 @@ ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
 	ssize_t ret = -EBADF;
 
 	if (f.file) {
-		loff_t pos, *ppos = file_ppos(f.file);
+		loff_t pos, *ppos;
+
+		ret = xcall_read_begin(f.file, buf, count);
+		if (ret != -EAGAIN) {
+			fdput_pos(f);
+			return ret;
+		}
+
+		ppos = file_ppos(f.file);
 		if (ppos) {
 			pos = *ppos;
 			ppos = &pos;
@@ -632,6 +640,7 @@ ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
 		if (ret >= 0 && ppos)
 			f.file->f_pos = pos;
 		fdput_pos(f);
+		xcall_read_end(f.file);
 	}
 	return ret;
 }
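The hook keeps ksys_read() almost untouched because -EAGAIN is reserved as the "no opinion" sentinel: any other return value from xcall_read_begin() is the final answer for read(), while -EAGAIN means there is no prefetch item or the cache missed, and the normal vfs path runs. Restated as a self-contained sketch (hypothetical helper name, not part of the patch):

static ssize_t read_via_xcall_or_vfs(struct file *file, char __user *buf,
				     size_t count, loff_t *ppos)
{
	ssize_t ret = xcall_read_begin(file, buf, count);

	if (ret != -EAGAIN)
		return ret;		/* served from the prefetch cache */

	ret = vfs_read(file, buf, count, ppos);
	xcall_read_end(file);		/* re-arm the cache state machine */
	return ret;
}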
diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h
index 0df0de0cf45e3442f4ec386b58c8484dbb5582b7..21669f07d24b1284e50df89a157e362b21860535 100644
--- a/include/linux/eventpoll.h
+++ b/include/linux/eventpoll.h
@@ -41,7 +41,7 @@ void eventpoll_release_file(struct file *file);
  */
 static inline void eventpoll_release(struct file *file)
 {
-
+	free_prefetch_item(file);
 	/*
 	 * Fast check to avoid the get/release of the semaphore. Since
 	 * we're doing this outside the semaphore lock, it might return
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a0ea6b64c45d081297340c0be79453ad10a59df8..221b4d4e38891da9fadb90a648110c61c103b77b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3750,4 +3750,39 @@ static inline bool cachefiles_ondemand_is_enabled(void)
 }
 #endif
 
+#ifdef CONFIG_XCALL_PREFETCH
+enum cache_state {
+	XCALL_CACHE_NONE = 0,
+	XCALL_CACHE_PREFETCH,
+	XCALL_CACHE_READY,
+	XCALL_CACHE_CANCEL
+};
+
+struct prefetch_item {
+	struct file *file;
+	struct work_struct work;
+	int cpu;
+	cpumask_t related_cpus;
+	struct page *cache_pages;
+	char *cache;
+	ssize_t len;
+	/* cache state in epoll_wait */
+	atomic_t state;
+	loff_t pos;
+	struct hlist_node node;
+};
+
+int xcall_read_begin(struct file *file, char __user *buf, size_t count);
+void xcall_read_end(struct file *file);
+void free_prefetch_item(struct file *file);
+#else
+static inline int xcall_read_begin(struct file *file, char __user *buf,
+				   size_t count)
+{
+	return -EAGAIN;
+}
+static inline void xcall_read_end(struct file *file) {}
+static inline void free_prefetch_item(struct file *file) {}
+#endif
+
 #endif /* _LINUX_FS_H */
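All transitions between the four cache states above go through a single atomic cmpxchg (transition_state() in fs/eventpoll.c), which is what serializes the prefetch worker against concurrent readers: NONE → PREFETCH → READY as the worker fills the buffer, READY → CANCEL while a reader copies, then CANCEL → NONE (buffer drained) or CANCEL → READY (data left). The locking idiom, as a sketch:

/* Same idiom as transition_state(): claim the buffer for copying only if
 * the prefetch has completed; returns false while the worker still owns it.
 */
static inline bool cache_try_lock_for_copy(atomic_t *state)
{
	return atomic_cmpxchg(state, XCALL_CACHE_READY,
			      XCALL_CACHE_CANCEL) == XCALL_CACHE_READY;
}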
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 754f67ac4326a7f9cd33efe117911abc305be978..ad08a37f3bc0791a9817c1a62c848d43d8aa03ca 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -86,6 +86,11 @@ void irq_exit(void);
  */
 void irq_exit_rcu(void);
 
+#ifdef CONFIG_FAST_IRQ
+void xint_enter(void);
+void xint_exit(void);
+#endif
+
 #ifndef arch_nmi_enter
 #define arch_nmi_enter()	do { } while (0)
 #define arch_nmi_exit()		do { } while (0)
diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h
index 88b02e3b81da7e4510a7418444666f578a9a41d3..d94b013a091c76b914e73bd6c02510a8ab3292cd 100644
--- a/include/linux/irqchip/arm-gic-v3.h
+++ b/include/linux/irqchip/arm-gic-v3.h
@@ -762,6 +762,19 @@ static inline enum gic_intid_range __get_intid_range(irq_hw_number_t hwirq)
 	}
 }
 
+#ifdef CONFIG_FAST_IRQ
+extern bool is_xint_support;
+
+enum xint_op {
+	XINT_TO_IRQ,
+	IRQ_TO_XINT,
+	XINT_SET_CHECK,
+	XINT_RANGE_CHECK,
+};
+
+void register_irqchip_proc(struct irq_desc *desc, void *irqp);
+void unregister_irqchip_proc(struct irq_desc *desc);
+#endif
 #endif
 
 #endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index cf7b3520ac2a953aec5a9313a809e66752eca946..e0b18d1c2261e192cecdd41073cd3017638ddceb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1477,8 +1477,13 @@ struct task_struct {
 #else
 	KABI_RESERVE(14)
 #endif
+#if defined(CONFIG_FAST_SYSCALL)
+	KABI_USE(15, unsigned long *xcall_enable)
+	KABI_USE(16, unsigned long *xcall_select)
+#else
 	KABI_RESERVE(15)
 	KABI_RESERVE(16)
+#endif
 	KABI_AUX_PTR(task_struct)
 
 	/* CPU-specific state of this task: */
diff --git a/include/trace/events/xcall.h b/include/trace/events/xcall.h
new file mode 100644
index 0000000000000000000000000000000000000000..524a21e433ba51b20793cf10854567176877cd52
--- /dev/null
+++ b/include/trace/events/xcall.h
@@ -0,0 +1,106 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM xcall
+
+#if !defined(_TRACE_XCALL_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_XCALL_H
+
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(epoll_rc_queue,
+
+	TP_PROTO(struct file *file, int cpu),
+
+	TP_ARGS(file, cpu),
+
+	TP_STRUCT__entry(
+		__field(struct file *, file)
+		__field(int, cpu)
+	),
+
+	TP_fast_assign(
+		__entry->file = file;
+		__entry->cpu = cpu;
+	),
+
+	TP_printk("0x%p on cpu %d", __entry->file, __entry->cpu)
+);
+
+TRACE_EVENT(epoll_rc_prefetch,
+
+	TP_PROTO(struct file *file),
+
+	TP_ARGS(file),
+
+	TP_STRUCT__entry(
+		__field(struct file *, file)
+	),
+
+	TP_fast_assign(
+		__entry->file = file;
+	),
+
+	TP_printk("0x%p", __entry->file)
+);
+
+TRACE_EVENT(epoll_rc_ready,
+
+	TP_PROTO(struct file *file, int len),
+
+	TP_ARGS(file, len),
+
+	TP_STRUCT__entry(
+		__field(struct file *, file)
+		__field(int, len)
+	),
+
+	TP_fast_assign(
+		__entry->file = file;
+		__entry->len = len;
+	),
+
+	TP_printk("0x%p, len %d", __entry->file, __entry->len)
+);
+
+TRACE_EVENT(epoll_rc_hit,
+
+	TP_PROTO(struct file *file, int len),
+
+	TP_ARGS(file, len),
+
+	TP_STRUCT__entry(
+		__field(struct file *, file)
+		__field(int, len)
+	),
+
+	TP_fast_assign(
+		__entry->file = file;
+		__entry->len = len;
+	),
+
+	TP_printk("0x%p, len: %d", __entry->file, __entry->len)
+);
+
+TRACE_EVENT(epoll_rc_miss,
+
+	TP_PROTO(struct file *file),
+
+	TP_ARGS(file),
+
+	TP_STRUCT__entry(
+		__field(struct file *, file)
+	),
+
+	TP_fast_assign(
+		__entry->file = file;
+	),
+
+	TP_printk("0x%p", __entry->file)
+);
+
+#endif /* _TRACE_XCALL_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/kernel/fork.c b/kernel/fork.c
index 9b1ea79deaa52ee5d6c56390264216e6f86c42cc..b884ac9cdece2d58c829356f77bdfeabeb116656 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -479,6 +479,15 @@ void free_task(struct task_struct *tsk)
 #endif
 	if (task_relationship_used())
 		sched_relationship_free(tsk);
+
+#ifdef CONFIG_FAST_SYSCALL
+	if (tsk->xcall_enable)
+		bitmap_free(tsk->xcall_enable);
+
+	if (tsk->xcall_select)
+		bitmap_free(tsk->xcall_select);
+#endif
+
 	free_task_struct(tsk);
 }
 EXPORT_SYMBOL(free_task);
@@ -1007,6 +1016,12 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 #ifdef CONFIG_MEMCG
 	tsk->active_memcg = NULL;
 #endif
+
+#ifdef CONFIG_FAST_SYSCALL
+	tsk->xcall_enable = NULL;
+	tsk->xcall_select = NULL;
+#endif
+
 	return tsk;
 
 free_stack:
@@ -2085,6 +2100,23 @@ static __latent_entropy struct task_struct *copy_process(
 
 	rt_mutex_init_task(p);
 
+#ifdef CONFIG_FAST_SYSCALL
+	p->xcall_enable = bitmap_zalloc(__NR_syscalls, GFP_KERNEL);
+	if (!p->xcall_enable)
+		goto bad_fork_free;
+
+	if (current->xcall_enable)
+		bitmap_copy(p->xcall_enable, current->xcall_enable, __NR_syscalls);
+
+	if (current->xcall_select) {
+		p->xcall_select = bitmap_zalloc(__NR_syscalls, GFP_KERNEL);
+		if (!p->xcall_select)
+			goto bad_fork_free;
+
+		bitmap_copy(p->xcall_select, current->xcall_select, __NR_syscalls);
+	}
+#endif
+
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
 	retval = sched_prefer_cpus_fork(p, current->prefer_cpus);
 	if (retval)
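Since copy_process() duplicates both bitmaps, xcall settings are inherited across fork(). A quick userspace check (illustrative only):

#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pid_t pid = fork();

	if (pid == 0) {
		char path[64];

		/* the child's bitmap should mirror the parent's settings */
		snprintf(path, sizeof(path), "/proc/%d/xcall", getpid());
		execlp("cat", "cat", path, (char *)NULL);
		_exit(1);
	}
	waitpid(pid, NULL, 0);
	return 0;
}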
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index e4cff358b437ebf4afef594d4d927846990904db..a4a7f87eab39faedc2d3138eeeb64ce2fdacefde 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -236,6 +236,34 @@ void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc)
 			    &dfs_irq_ops);
 }
 
+#ifdef CONFIG_FAST_IRQ
+static struct dentry *xint_dir;
+
+void xint_add_debugfs_entry(unsigned int irq)
+{
+	char name[10];
+	char buf[100];
+
+	if (!xint_dir)
+		return;
+
+	sprintf(name, "%d", irq);
+	sprintf(buf, "../irqs/%d", irq);
+	debugfs_create_symlink(name, xint_dir, buf);
+}
+
+void xint_remove_debugfs_entry(unsigned int irq)
+{
+	char name[10];
+
+	if (!xint_dir)
+		return;
+
+	sprintf(name, "%d", irq);
+	debugfs_lookup_and_remove(name, xint_dir);
+}
+#endif
+
 static int __init irq_debugfs_init(void)
 {
 	struct dentry *root_dir;
@@ -247,6 +275,11 @@ static int __init irq_debugfs_init(void)
 
 	irq_dir = debugfs_create_dir("irqs", root_dir);
 
+#ifdef CONFIG_FAST_IRQ
+	if (is_xint_support)
+		xint_dir = debugfs_create_dir("xints", root_dir);
+#endif
+
 	irq_lock_sparse();
 	for_each_active_irq(irq)
 		irq_add_debugfs_entry(irq, irq_to_desc(irq));
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 48d6aa8cdbed7f64086e68d67e14414fda9b8be5..d725d8ef5ce7fe1b1f51264e7b538178b72af93e 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -492,6 +492,14 @@ static inline void irq_remove_debugfs_entry(struct irq_desc *desc)
 	debugfs_remove(desc->debugfs_file);
 	kfree(desc->dev_name);
 }
+
+#ifdef CONFIG_FAST_IRQ
+extern bool is_xint_support;
+
+void xint_add_debugfs_entry(unsigned int irq);
+void xint_remove_debugfs_entry(unsigned int irq);
+#endif
+
 void irq_debugfs_copy_devname(int irq, struct device *dev);
 # ifdef CONFIG_IRQ_DOMAIN
 void irq_domain_debugfs_init(struct dentry *root);
@@ -507,6 +515,16 @@ static inline void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *d)
 static inline void irq_remove_debugfs_entry(struct irq_desc *d)
 {
 }
+
+#ifdef CONFIG_FAST_IRQ
+static inline void xint_add_debugfs_entry(unsigned int irq)
+{
+}
+static inline void xint_remove_debugfs_entry(unsigned int irq)
+{
+}
+#endif
+
 static inline void irq_debugfs_copy_devname(int irq, struct device *dev)
 {
 }
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 6c009a033c73fbfa010a96e8e9c2502c8cc8d5f4..5dc976d32c7443539a3038ef54b6f20c01fedced 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -658,6 +658,10 @@ int generic_handle_irq(unsigned int irq)
 EXPORT_SYMBOL_GPL(generic_handle_irq);
 
 #ifdef CONFIG_HANDLE_DOMAIN_IRQ
+#ifdef CONFIG_FAST_IRQ
+extern DECLARE_BITMAP(irqnr_xint_map, 1024);
+#endif
+
 /**
  * __handle_domain_irq - Invoke the handler for a HW irq belonging to a domain
  * @domain:	The domain where to perform the lookup
@@ -673,8 +677,16 @@ int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq,
 	struct pt_regs *old_regs = set_irq_regs(regs);
 	unsigned int irq = hwirq;
 	int ret = 0;
+#ifdef CONFIG_FAST_IRQ
+	int is_xint = test_bit(hwirq, irqnr_xint_map);
 
+	if (is_xint)
+		xint_enter();
+	else
+		irq_enter();
+#else
 	irq_enter();
+#endif
 
 #ifdef CONFIG_IRQ_DOMAIN
 	if (lookup)
@@ -692,7 +704,14 @@ int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq,
 		generic_handle_irq(irq);
 	}
 
+#ifdef CONFIG_FAST_IRQ
+	if (is_xint)
+		xint_exit();
+	else
+		irq_exit();
+#else
 	irq_exit();
+#endif
 	set_irq_regs(old_regs);
 	return ret;
 }
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 0df62a3a1f3742a6f8c6e00e81477b85d1ac0113..64805a37c57698f38330e1e3c3b0dc1191781fc4 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -13,6 +13,10 @@
 #include <linux/kernel_stat.h>
 #include <linux/mutex.h>
 
+#ifdef CONFIG_FAST_IRQ
+#include <linux/irqchip/arm-gic-v3.h>
+#endif
+
 #include "internals.h"
 
 /*
@@ -331,6 +335,9 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)
 	action->dir = proc_mkdir(name, desc->dir);
 }
 
+void __weak register_irqchip_proc(struct irq_desc *desc, void *irqp) { }
+void __weak unregister_irqchip_proc(struct irq_desc *desc) { }
+
 #undef MAX_NAMELEN
 
 #define MAX_NAMELEN 10
@@ -385,6 +392,7 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
 #endif
 	proc_create_single_data("spurious", 0444, desc->dir,
 				irq_spurious_proc_show, (void *)(long)irq);
+	register_irqchip_proc(desc, irqp);
 
 out_unlock:
 	mutex_unlock(&register_lock);
@@ -408,6 +416,8 @@ void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
 #endif
 	remove_proc_entry("spurious", desc->dir);
 
+	unregister_irqchip_proc(desc);
+
 	sprintf(name, "%u", irq);
 	remove_proc_entry(name, root_irq_dir);
 }
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 4196b9f84690066a75f6e67bdefbe4ff79eb0336..9fc69e6e2c11734a8033a0f8d9cc0867d1154381 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -345,6 +345,42 @@ asmlinkage __visible void do_softirq(void)
 	local_irq_restore(flags);
 }
 
+#ifdef CONFIG_FAST_IRQ
+/**
+ * xint_enter_rcu - Copy of irq_enter_rcu(), minus the bypassed debug hooks
+ */
+void xint_enter_rcu(void)
+{
+	if (tick_nohz_full_cpu(smp_processor_id()) ||
+	    (is_idle_task(current) && !in_interrupt())) {
+		/*
+		 * Prevent raise_softirq from needlessly waking up ksoftirqd
+		 * here, as softirq will be serviced on return from interrupt.
+		 */
+		local_bh_disable();
+		tick_irq_enter();
+		_local_bh_enable();
+	}
+
+#ifndef CONFIG_DEBUG_FEATURE_BYPASS
+	account_irq_enter_time(current);
+#endif
+	preempt_count_add(HARDIRQ_OFFSET);
+#ifndef CONFIG_DEBUG_FEATURE_BYPASS
+	lockdep_hardirq_enter();
+#endif
+}
+
+/**
+ * xint_enter - Copy of irq_enter()
+ */
+void xint_enter(void)
+{
+	rcu_irq_enter();
+	xint_enter_rcu();
+}
+#endif
+
 /**
  * irq_enter_rcu - Enter an interrupt context with RCU watching
  */
@@ -411,6 +447,43 @@ static inline void tick_irq_exit(void)
 #endif
 }
 
+#ifdef CONFIG_FAST_IRQ
+static inline void __xint_exit_rcu(void)
+{
+#ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED
+	local_irq_disable();
+#else
+#ifndef CONFIG_DEBUG_FEATURE_BYPASS
+	lockdep_assert_irqs_disabled();
+#endif
+#endif
+
+#ifndef CONFIG_DEBUG_FEATURE_BYPASS
+	account_irq_exit_time(current);
+#endif
+	preempt_count_sub(HARDIRQ_OFFSET);
+	if (!in_interrupt() && local_softirq_pending())
+		invoke_softirq();
+
+	tick_irq_exit();
+}
+
+/**
+ * xint_exit - Copy of irq_exit()
+ *
+ * Also processes softirqs if needed and possible.
+ */
+void xint_exit(void)
+{
+	__xint_exit_rcu();
+	rcu_irq_exit();
+	/* must be last! */
+#ifndef CONFIG_DEBUG_FEATURE_BYPASS
+	lockdep_hardirq_exit();
+#endif
+}
+#endif
+
 static inline void __irq_exit_rcu(void)
 {
 #ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED