diff --git a/arch/Kconfig b/arch/Kconfig
index 0fc9c6d591b8bb91779768a1aa1a912dd40d0c4e..9a07425c0e1a9bd101914c40f8ca00a8a50521fa 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1186,4 +1186,84 @@ source "kernel/gcov/Kconfig"
 
 source "scripts/gcc-plugins/Kconfig"
 
+config ARCH_SUPPORTS_FAST_SYSCALL
+	bool
+
+config FAST_SYSCALL
+	bool "Fast Syscall support"
+	depends on ARCH_SUPPORTS_FAST_SYSCALL
+	default n
+	help
+	  This enables the fast syscall feature.
+	  The svc exception handling process, which includes auxiliary
+	  functions for debug/trace and core functions like KPTI, has
+	  been identified as overly "lengthy".
+	  This inefficiency is particularly noticeable in short syscalls
+	  such as lseek() and getpid(), where the syscall function itself
+	  comprises only a small percentage of the total instructions
+	  executed.
+	  To address this, we introduce the concept of a fast syscall: a
+	  fast svc exception handling path that keeps only necessary
+	  features such as security, context saving, and recovery.
+
+config XCALL_PREFETCH
+	bool "Xcall prefetch support"
+	select FAST_SYSCALL
+	default n
+	help
+	  This enables the xcall prefetch feature.
+	  The xcall prefetch feature implements customized epoll_wait()
+	  and read() system calls that enable data prefetching.
+	  In high-concurrency connection scenarios, this improves the
+	  parallel execution efficiency of the read() system call and
+	  increases the system's business throughput.
+	  The xcall prefetch feature is suitable for workloads that use
+	  the epoll I/O multiplexing mechanism, spend a large proportion
+	  of their time in the read() system call, and handle a large
+	  number of concurrent connections.
+
+config ARCH_SUPPORTS_FAST_IRQ
+	bool
+
+config FAST_IRQ
+	bool "Fast IRQ support"
+	depends on ARCH_SUPPORTS_FAST_IRQ
+	default n
+	help
+	  The irq handling process, which includes auxiliary functions
+	  for debug/trace and core functions like KPTI, interrupt time
+	  accounting, interrupt processing as a random number source,
+	  interrupt affinity modification and interrupt processing races,
+	  as well as spurious and unhandled interrupt debugging, has been
+	  identified as overly "lengthy".
+	  To address this, we introduce the concept of a fast irq: a fast
+	  interrupt handling path that keeps only necessary features such
+	  as security, context saving and recovery. It adds a lightweight
+	  interrupt processing framework for latency-sensitive interrupts.
+
+config DEBUG_FEATURE_BYPASS
+	bool "Bypass debug features in fast syscall"
+	depends on FAST_SYSCALL || FAST_IRQ
+	default y
+	help
+	  This bypasses the debug features in the fast syscall and fast
+	  irq paths.
+	  The svc exception handling process, which includes auxiliary
+	  functions for debug/trace and core functions like KPTI, has
+	  been identified as overly "lengthy".
+	  The fast paths keep only necessary features.
+	  Disable this option to keep the debug features in the fast paths.
+
+config SECURITY_FEATURE_BYPASS
+	bool "Bypass security features in fast syscall"
+	depends on FAST_SYSCALL || FAST_IRQ
+	default y
+	help
+	  This bypasses the security features in the fast syscall and
+	  fast irq paths.
+	  The svc exception handling process, which includes auxiliary
+	  functions for debug/trace and core functions like KPTI, has
+	  been identified as overly "lengthy".
+	  The fast paths keep only necessary features.
+	  Disable this option to keep the security features in the fast
+	  paths.
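The options above are driven at runtime through the per-task /proc/<pid>/xcall file added later in this patch (fs/proc/base.c). The following userspace sketch is illustrative only and is not part of the patch; it assumes an arm64 kernel built with CONFIG_FAST_SYSCALL and CONFIG_XCALL_PREFETCH, booted with the "xcall" command-line parameter, and uses the arm64 syscall numbers (__NR_epoll_pwait == 22, __NR_read == 63).

```c
/*
 * Hypothetical usage sketch for /proc/<pid>/xcall (see fs/proc/base.c
 * below): a plain number enables the fast path for that syscall, a
 * '!' prefix clears it, and an '@' prefix selects an already enabled
 * syscall for prefetching.
 */
#include <stdio.h>
#include <stdlib.h>

static void xcall_ctl(const char *cmd)
{
	FILE *f = fopen("/proc/self/xcall", "w");

	if (!f || fputs(cmd, f) == EOF)
		perror(cmd);
	if (f)
		fclose(f);
}

int main(void)
{
	char line[256];
	FILE *f;

	xcall_ctl("63");	/* route read() through the xcall fast path  */
	xcall_ctl("22");	/* enable epoll_pwait() for xcall ...         */
	xcall_ctl("@22");	/* ... and select it for prefetch            */
	/* xcall_ctl("!63");	would clear the read() entry again        */

	f = fopen("/proc/self/xcall", "r");	/* dump the enabled bitmap */
	if (!f)
		return EXIT_FAILURE;
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}
```

Reading the file back prints the "Enabled Total[...]" summary produced by xcall_show(), which is a quick way to verify that the write was accepted.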
+ endmenu diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 93ced97f8c6c4dc926b32ed7ea63acc11dbb8dd8..ad38ba5be5901d3b7a7fe6c912d69083f3734f58 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -219,6 +219,8 @@ config ARM64 select THREAD_INFO_IN_TASK select HAVE_LIVEPATCH_WO_FTRACE select THP_NUMA_CONTROL if ARM64_64K_PAGES && NUMA_BALANCING && TRANSPARENT_HUGEPAGE + select ARCH_SUPPORTS_FAST_SYSCALL if !ARM64_MTE && !KASAN_HW_TAGS + select ARCH_SUPPORTS_FAST_IRQ if ARM_GIC_V3 && !ARM64_MTE && !KASAN_HW_TAGS help ARM 64-bit (AArch64) Linux support. diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 4a11c9f061cc9ec97bc50b25c8337d80ef5f3b75..ffb33e2991f6c5f3c60d0794688ec60ce003581c 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -921,6 +921,10 @@ CONFIG_ARCH_HAS_GCOV_PROFILE_ALL=y # end of GCOV-based kernel profiling CONFIG_HAVE_GCC_PLUGINS=y +CONFIG_ARCH_SUPPORTS_FAST_SYSCALL=y +# CONFIG_FAST_SYSCALL is not set +CONFIG_ARCH_SUPPORTS_FAST_IRQ=y +# CONFIG_FAST_IRQ is not set # end of General architecture-dependent options CONFIG_RT_MUTEXES=y diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h index ce9fbf260a3cf20c0cbab1ede4f5982dbf3c7ac2..e2a2b3e40c94f9ac3ad913a0cae5663e15119a6d 100644 --- a/arch/arm64/include/asm/cpucaps.h +++ b/arch/arm64/include/asm/cpucaps.h @@ -81,6 +81,8 @@ #define ARM64_HAS_PBHA_STAGE2 73 #define ARM64_SME 74 #define ARM64_SME_FA64 75 +#define ARM64_HAS_XCALL 76 +#define ARM64_HAS_XINT 77 #define ARM64_NCAPS 80 diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h index d38d526d084e52452c0023a11243e520ad818349..4b7994bd2b94f3a188ae00bc8a2031e584289ea1 100644 --- a/arch/arm64/include/asm/exception.h +++ b/arch/arm64/include/asm/exception.h @@ -48,6 +48,9 @@ void do_el0_sys(unsigned long esr, struct pt_regs *regs); void do_sp_pc_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs); void bad_el0_sync(struct pt_regs *regs, int reason, unsigned int esr); void do_el0_cp15(unsigned long esr, struct pt_regs *regs); +#ifdef CONFIG_FAST_SYSCALL +void do_el0_xcall(struct pt_regs *regs); +#endif void do_el0_svc(struct pt_regs *regs); void do_el0_svc_compat(struct pt_regs *regs); void do_el0_fpac(struct pt_regs *regs, unsigned long esr); diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index c247e11130db7d88fcff5c3b237864f8b87855ca..7c6ad4b1667b585c222b1f30412c01e010a26505 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -26,6 +26,9 @@ int main(void) { +#ifdef CONFIG_FAST_SYSCALL + DEFINE(TSK_XCALL, offsetof(struct task_struct, xcall_enable)); +#endif DEFINE(TSK_ACTIVE_MM, offsetof(struct task_struct, active_mm)); BLANK(); DEFINE(TSK_TI_CPU, offsetof(struct task_struct, thread_info.cpu)); diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index dee049d27c745b0935d07e7521ebf53d46d41908..9b4a315e96bc0781cfd8be8b220108327a9fa6ea 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -2155,6 +2155,44 @@ static bool can_clearpage_use_stnp(const struct arm64_cpu_capabilities *entry, return use_clearpage_stnp && has_mor_nontemporal(entry); } +#ifdef CONFIG_FAST_SYSCALL +static bool is_xcall_support; +static int __init xcall_setup(char *str) +{ + is_xcall_support = true; + return 1; +} +__setup("xcall", xcall_setup); + +bool fast_syscall_enabled(void) +{ + return is_xcall_support; 
+} + +static bool has_xcall_support(const struct arm64_cpu_capabilities *entry, int __unused) +{ + return is_xcall_support; +} +#endif + +#ifdef CONFIG_FAST_IRQ +bool is_xint_support; +static int __init xint_setup(char *str) +{ + if (!cpus_have_cap(ARM64_HAS_SYSREG_GIC_CPUIF)) + return 1; + + is_xint_support = true; + return 1; +} +__setup("xint", xint_setup); + +static bool has_xint_support(const struct arm64_cpu_capabilities *entry, int __unused) +{ + return is_xint_support; +} +#endif + static const struct arm64_cpu_capabilities arm64_features[] = { { .desc = "GIC system register CPU interface", @@ -2701,6 +2739,22 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .cpu_enable = fa64_kernel_enable, }, #endif /* CONFIG_ARM64_SME */ +#ifdef CONFIG_FAST_SYSCALL + { + .desc = "Xcall Support", + .capability = ARM64_HAS_XCALL, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_xcall_support, + }, +#endif +#ifdef CONFIG_FAST_IRQ + { + .desc = "Xint Support", + .capability = ARM64_HAS_XINT, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_xint_support, + }, +#endif {}, }; diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index 02cd5d57edb6e7eec4087798e1b9daf0c646fde5..3e59cbdedc4cd01640af6223a6781e41421e0789 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -388,6 +388,28 @@ static void noinstr el0_fpac(struct pt_regs *regs, unsigned long esr) do_el0_fpac(regs, esr); } +#if defined(CONFIG_FAST_SYSCALL) || defined(CONFIG_FAST_IRQ) +asmlinkage void noinstr fast_enter_from_user_mode(void) +{ +#ifndef CONFIG_DEBUG_FEATURE_BYPASS + lockdep_hardirqs_off(CALLER_ADDR0); + CT_WARN_ON(ct_state() != CONTEXT_USER); +#endif + user_exit_irqoff(); +#ifndef CONFIG_DEBUG_FEATURE_BYPASS + trace_hardirqs_off_finish(); +#endif +} +#endif + +#ifdef CONFIG_FAST_SYSCALL +asmlinkage void noinstr el0_xcall_handler(struct pt_regs *regs) +{ + fast_enter_from_user_mode(); + do_el0_xcall(regs); +} +#endif + asmlinkage void noinstr el0_sync_handler(struct pt_regs *regs) { unsigned long esr = read_sysreg(esr_el1); diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 1290f36c83713a65c22d7f61dc839e763a9e10cb..5ed8b8e8e58ec1d69c5143eb67d283204b24fe43 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -182,7 +182,7 @@ alternative_else_nop_endif #endif .endm - .macro kernel_entry, el, regsize = 64 + .macro kernel_entry, el, regsize = 64, fast_mode = std .if \regsize == 32 mov w0, w0 // zero upper 32 bits of x0 .endif @@ -212,12 +212,19 @@ alternative_else_nop_endif * Ensure MDSCR_EL1.SS is clear, since we can unmask debug exceptions * when scheduling. 
*/ + .if \fast_mode == std ldr x19, [tsk, #TSK_TI_FLAGS] disable_step_tsk x19, x20 + .endif /* Check for asynchronous tag check faults in user space */ + .if \fast_mode == std check_mte_async_tcf x22, x23 + .endif + + .if \fast_mode == std apply_ssbd 1, x22, x23 + .endif ptrauth_keys_install_kernel tsk, x20, x22, x23 @@ -243,9 +250,11 @@ alternative_else_nop_endif add x29, sp, #S_STACKFRAME #ifdef CONFIG_ARM64_SW_TTBR0_PAN +.if \fast_mode == std alternative_if_not ARM64_HAS_PAN bl __swpan_entry_el\el alternative_else_nop_endif +.endif #endif stp x22, x23, [sp, #S_PC] @@ -268,9 +277,11 @@ alternative_else_nop_endif /* Re-enable tag checking (TCO set on exception entry) */ #ifdef CONFIG_ARM64_MTE +.if \fast_mode == std alternative_if ARM64_MTE SET_PSTATE_TCO(0) alternative_else_nop_endif +.endif #endif /* @@ -283,7 +294,7 @@ alternative_else_nop_endif */ .endm - .macro kernel_exit, el + .macro kernel_exit, el, fast_mode = std .if \el != 0 disable_daif .endif @@ -303,14 +314,18 @@ alternative_else_nop_endif ldp x21, x22, [sp, #S_PC] // load ELR, SPSR #ifdef CONFIG_ARM64_SW_TTBR0_PAN +.if \fast_mode == std alternative_if_not ARM64_HAS_PAN bl __swpan_exit_el\el alternative_else_nop_endif +.endif #endif .if \el == 0 ldr x23, [sp, #S_SP] // load return stack pointer msr sp_el0, x23 + + .if \fast_mode == std tst x22, #PSR_MODE32_BIT // native task? b.eq 3f @@ -325,13 +340,17 @@ alternative_if ARM64_WORKAROUND_845719 alternative_else_nop_endif #endif 3: + .endif + scs_save tsk, x0 /* No kernel C function calls after this as user keys are set. */ ptrauth_keys_install_user tsk, x0, x1, x2 + .if \fast_mode == std apply_ssbd 0, x0, x1 .endif + .endif msr elr_el1, x21 // set up the return data msr spsr_el1, x22 @@ -675,11 +694,91 @@ SYM_CODE_START_LOCAL_NOALIGN(el1_irq) kernel_exit 1 SYM_CODE_END(el1_irq) +#ifdef CONFIG_FAST_SYSCALL + .macro check_esr_el1_ec_svc64 + /* Only support SVC64 for now */ + mrs x20, esr_el1 + lsr w20, w20, #ESR_ELx_EC_SHIFT + cmp x20, #ESR_ELx_EC_SVC64 + .endm + + .macro check_syscall_nr + cmp x8, __NR_syscalls + .endm + + .macro check_xcall_enable + /* x21 = task_struct->xcall_enable */ + ldr_this_cpu x20, __entry_task, x21 + ldr x21, [x20, #TSK_XCALL] + /* x20 = sc_no / 8 */ + lsr x20, x8, 3 + ldr x21, [x21, x20] + /* x8 = sc_no % 8 */ + and x8, x8, 7 + mov x20, 1 + lsl x20, x20, x8 + and x21, x21, x20 + cmp x21, 0 + .endm + + .macro check_xcall_pre_kernel_entry + stp x20, x21, [sp, #0] + /* is ESR_ELx_EC_SVC64 */ + check_esr_el1_ec_svc64 + bne .Lskip_xcall\@ + /* x8 >= __NR_syscalls */ + check_syscall_nr + bhs .Lskip_xcall\@ + str x8, [sp, #16] + /* is xcall enabled */ + check_xcall_enable + ldr x8, [sp, #16] + beq .Lskip_xcall\@ + ldp x20, x21, [sp, #0] + /* do xcall */ +#ifdef CONFIG_SECURITY_FEATURE_BYPASS + kernel_entry 0, 64, xcall +#else + kernel_entry 0, 64 +#endif + mov x0, sp + bl el0_xcall_handler +#ifdef CONFIG_SECURITY_FEATURE_BYPASS + disable_daif + gic_prio_kentry_setup tmp=x3 + ldr x19, [tsk, #TSK_TI_FLAGS] + and x2, x19, #_TIF_WORK_MASK + cbnz x2, fast_work_pending\@ +fast_finish_ret_to_user\@: + user_enter_irqoff + kernel_exit 0 xcall +fast_work_pending\@: + mov x0, sp // 'regs' + mov x1, x19 + bl do_notify_resume + ldr x19, [tsk, #TSK_TI_FLAGS] // re-check for single-step + b fast_finish_ret_to_user\@ +#else + b ret_to_user +#endif +.Lskip_xcall\@: + ldp x20, x21, [sp, #0] + .endm +#endif + /* * EL0 mode handlers. 
*/ .align 6 SYM_CODE_START_LOCAL_NOALIGN(el0_sync) +#ifdef CONFIG_FAST_SYSCALL + /* Only support el0 aarch64 sync exception */ + alternative_if_not ARM64_HAS_XCALL + b .Lret_to_kernel_entry + alternative_else_nop_endif + check_xcall_pre_kernel_entry + .Lret_to_kernel_entry: +#endif kernel_entry 0 mov x0, sp bl el0_sync_handler @@ -705,10 +804,90 @@ SYM_CODE_START_LOCAL_NOALIGN(el0_error_compat) kernel_entry 0, 32 b el0_error_naked SYM_CODE_END(el0_error_compat) +#endif + +#ifdef CONFIG_FAST_IRQ +.macro el0_xint_handler, handler:req +#if defined(CONFIG_CONTEXT_TRACKING) || defined(CONFIG_TRACE_IRQFLAGS) + bl fast_enter_from_user_mode +#endif + enable_da_f +#ifndef CONFIG_SECURITY_FEATURE_BYPASS + tbz x22, #55, 1f + bl do_el0_irq_bp_hardening +1: +#endif + irq_handler \handler +.endm + +.macro check_xint_pre_kernel_entry + stp x0, x1, [sp, #0] + stp x2, x3, [sp, #16] + + ldr x0, =irqnr_xint_map + /* get hpp irqnr */ + mrs_s x1, SYS_ICC_HPPIR1_EL1 + + /* xint hwirq can not exceed 1020 */ + cmp x1, 1020 + b.ge .Lskip_xint\@ + + /* x2 = irqnr % 8 */ + and x2, x1, #7 + /* x3 = irqnr / 8 */ + lsr x3, x1, #3 + /* x1 is the byte of irqnr in irqnr_xint_map */ + ldr x1, [x0, x3] + + /* Get the check mask */ + mov x3, #1 + /* x3 = 1 << (irqnr % 8) */ + lsl x3, x3, x2 + + /* x1 = x1 & x3 */ + ands x1, x1, x3 + b.eq .Lskip_xint\@ + + ldp x0, x1, [sp, #0] + ldp x2, x3, [sp, #16] +#ifdef CONFIG_SECURITY_FEATURE_BYPASS + kernel_entry 0, 64, xint + el0_xint_handler handle_arch_irq + disable_daif + gic_prio_kentry_setup tmp=x3 + ldr x19, [tsk, #TSK_TI_FLAGS] + and x2, x19, #_TIF_WORK_MASK + cbnz x2, xint_fast_work_pending\@ +xint_fast_finish_ret_to_user\@: + user_enter_irqoff + kernel_exit 0 xint +xint_fast_work_pending\@: + mov x0, sp // 'regs' + mov x1, x19 + bl do_notify_resume + b xint_fast_finish_ret_to_user\@ +#else + kernel_entry 0, 64 + el0_xint_handler handle_arch_irq + b ret_to_user +#endif + +.Lskip_xint\@: + ldp x0, x1, [sp, #0] + ldp x2, x3, [sp, #16] +.endm #endif .align 6 SYM_CODE_START_LOCAL_NOALIGN(el0_irq) +#ifdef CONFIG_FAST_IRQ + /* Only support el0 aarch64 irq */ + alternative_if_not ARM64_HAS_XINT + b .Lskip_check_xint + alternative_else_nop_endif + check_xint_pre_kernel_entry +.Lskip_check_xint: +#endif kernel_entry 0 el0_irq_naked: el0_interrupt_handler handle_arch_irq diff --git a/arch/arm64/kernel/syscall.c b/arch/arm64/kernel/syscall.c index 9bd304568d90349164ff1926d100a833ac9c7399..2d73eaaf9bc28d9b24979f6fafd8d7fbb2f817f0 100644 --- a/arch/arm64/kernel/syscall.c +++ b/arch/arm64/kernel/syscall.c @@ -106,6 +106,46 @@ static void cortex_a76_erratum_1463225_svc_handler(void) static void cortex_a76_erratum_1463225_svc_handler(void) { } #endif /* CONFIG_ARM64_ERRATUM_1463225 */ +#ifdef CONFIG_FAST_SYSCALL +static void el0_xcall_common(struct pt_regs *regs, int scno, int sc_nr, + const syscall_fn_t syscall_table[]) +{ + unsigned long flags = read_thread_flags(); + + regs->orig_x0 = regs->regs[0]; + regs->syscallno = scno; + +#ifndef CONFIG_SECURITY_FEATURE_BYPASS + cortex_a76_erratum_1463225_svc_handler(); +#endif + local_daif_restore(DAIF_PROCCTX); + + if (system_supports_mte() && (flags & _TIF_MTE_ASYNC_FAULT)) { + syscall_set_return_value(current, regs, -ERESTARTNOINTR, 0); + return; + } + + if (has_syscall_work(flags)) { + if (scno == NO_SYSCALL) + syscall_set_return_value(current, regs, -ENOSYS, 0); + scno = syscall_trace_enter(regs); + if (scno == NO_SYSCALL) + goto trace_exit; + } + + invoke_syscall(regs, scno, sc_nr, syscall_table); + + if (!has_syscall_work(flags) && 
!IS_ENABLED(CONFIG_DEBUG_RSEQ)) { + flags = read_thread_flags(); + if (!has_syscall_work(flags) && !(flags & _TIF_SINGLESTEP)) + return; + } + +trace_exit: + syscall_trace_exit(regs); +} +#endif + static void el0_svc_common(struct pt_regs *regs, int scno, int sc_nr, const syscall_fn_t syscall_table[]) { @@ -237,6 +277,23 @@ static inline void delouse_pt_regs(struct pt_regs *regs) } #endif +#ifdef CONFIG_FAST_SYSCALL +void do_el0_xcall(struct pt_regs *regs) +{ + const syscall_fn_t *t = sys_call_table; + +#ifdef CONFIG_ARM64_ILP32 + if (is_ilp32_compat_task()) { + t = ilp32_sys_call_table; + delouse_pt_regs(regs); + } +#endif + + fp_user_discard(); + el0_xcall_common(regs, regs->regs[8], __NR_syscalls, t); +} +#endif + void do_el0_svc(struct pt_regs *regs) { const syscall_fn_t *t = sys_call_table; diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index 87af452d82dc866dd4f75c9dee2168f01c70eb33..7293732b5f72d9ebd3dd80ff361acf8d86885cba 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -31,6 +31,10 @@ #include "irq-gic-common.h" +#ifdef CONFIG_FAST_IRQ +#include "../../../kernel/irq/internals.h" +#endif + #define GICD_INT_NMI_PRI (GICD_INT_DEF_PRI & ~0x80) #define FLAGS_WORKAROUND_GICR_WAKER_MSM8996 (1ULL << 0) @@ -720,6 +724,125 @@ static asmlinkage void __exception_irq_entry gic_handle_irq(struct pt_regs *regs } } +#ifdef CONFIG_FAST_IRQ +DECLARE_BITMAP(irqnr_xint_map, 1024); + +static bool can_set_xint(unsigned int hwirq) +{ + if (__get_intid_range(hwirq) == SGI_RANGE || + __get_intid_range(hwirq) == SPI_RANGE) + return true; + + return false; +} + +static bool xint_transform(int irqno, enum xint_op op) +{ + struct irq_data *data = irq_get_irq_data(irqno); + int hwirq; + + while (data->parent_data) + data = data->parent_data; + + hwirq = data->hwirq; + + if (!can_set_xint(hwirq)) + return false; + + switch (op) { + case IRQ_TO_XINT: + set_bit(hwirq, irqnr_xint_map); + xint_add_debugfs_entry(irqno); + return true; + case XINT_TO_IRQ: + clear_bit(hwirq, irqnr_xint_map); + xint_remove_debugfs_entry(irqno); + return false; + case XINT_SET_CHECK: + return test_bit(hwirq, irqnr_xint_map); + case XINT_RANGE_CHECK: + return true; + default: + return false; + } +} + +static ssize_t xint_proc_write(struct file *file, + const char __user *buffer, size_t count, loff_t *pos) +{ + int irq = (int)(long)PDE_DATA(file_inode(file)); + bool xint_state = false; + unsigned long val; + char *buf = NULL; + + if (!xint_transform(irq, XINT_RANGE_CHECK)) + return -EPERM; + + buf = memdup_user_nul(buffer, count); + if (IS_ERR(buf)) + return PTR_ERR(buf); + + if (kstrtoul(buf, 0, &val) || (val != 0 && val != 1)) { + kfree(buf); + return -EINVAL; + } + + xint_state = xint_transform(irq, XINT_SET_CHECK); + if (xint_state == val) { + kfree(buf); + return -EBUSY; + } + + local_irq_disable(); + disable_irq(irq); + + xint_transform(irq, xint_state ? 
XINT_TO_IRQ : IRQ_TO_XINT); + + enable_irq(irq); + local_irq_enable(); + + kfree(buf); + + return count; +} + +static int xint_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%d\n", xint_transform((long)m->private, XINT_SET_CHECK)); + return 0; +} + +static int xint_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, xint_proc_show, PDE_DATA(inode)); +} + +static const struct proc_ops xint_proc_ops = { + .proc_open = xint_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, + .proc_write = xint_proc_write, +}; + +void register_irqchip_proc(struct irq_desc *desc, void *irqp) +{ + if (!is_xint_support) + return; + + /* create /proc/irq//xint */ + proc_create_data("xint", 0644, desc->dir, &xint_proc_ops, irqp); +} + +void unregister_irqchip_proc(struct irq_desc *desc) +{ + if (!is_xint_support) + return; + + remove_proc_entry("xint", desc->dir); +} +#endif /* CONFIG_FAST_IRQ */ + static u32 gic_get_pribits(void) { u32 pribits; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 5ce1ea1f452b1e010e50da04730608a3e4f95ccd..09428ad339d69973d0feea50c6b839ab937c41e0 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -38,6 +38,7 @@ #include #include #include +#include /* * LOCKING: @@ -768,6 +769,67 @@ static void epi_rcu_free(struct rcu_head *head) kmem_cache_free(epi_cache, epi); } +#ifdef CONFIG_XCALL_PREFETCH +DEFINE_PER_CPU_ALIGNED(unsigned long, xcall_cache_hit); +DEFINE_PER_CPU_ALIGNED(unsigned long, xcall_cache_miss); + +#define PREFETCH_ITEM_HASH_BITS 6 +#define PREFETCH_ITEM_TABLE_SIZE (1 << PREFETCH_ITEM_HASH_BITS) +static DEFINE_HASHTABLE(xcall_item_table, PREFETCH_ITEM_HASH_BITS); +static DEFINE_RWLOCK(xcall_table_lock); +static struct workqueue_struct *rc_work; +int cache_pages_order; + +static struct prefetch_item *find_prefetch_item(struct file *file) +{ + struct prefetch_item *found = NULL; + unsigned int hash = 0; + + hash = hash_64((u64)file, PREFETCH_ITEM_HASH_BITS); + read_lock(&xcall_table_lock); + hash_for_each_possible(xcall_item_table, found, node, hash) { + if (found->file == file) + break; + } + read_unlock(&xcall_table_lock); + + return found; +} + +void free_prefetch_item(struct file *file) +{ + struct prefetch_item *pfi = find_prefetch_item(file); + + if (!pfi) + return; + + write_lock(&xcall_table_lock); + if (!hlist_unhashed(&pfi->node)) + hlist_del_init(&pfi->node); + write_unlock(&xcall_table_lock); + if (pfi->cache_pages) { + __free_pages(pfi->cache_pages, cache_pages_order); + pfi->cache = NULL; + } + kfree(pfi); +} + +static void xcall_cancel_work(struct file *file) +{ + struct prefetch_item *pfi; + + if (!current->xcall_select || + !test_bit(__NR_epoll_pwait, current->xcall_select)) + return; + + pfi = find_prefetch_item(file); + if (pfi) + cancel_work_sync(&pfi->work); +} +#else +static inline void xcall_cancel_work(struct file *file) {} +#endif + /* * Removes a "struct epitem" from the eventpoll RB tree and deallocates * all the associated resources. Must be called with "mtx" held. @@ -782,6 +844,7 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) * Removes poll wait queue hooks. 
*/ ep_unregister_pollwait(ep, epi); + xcall_cancel_work(file); /* Remove the current item from the list of epoll hooks */ spin_lock(&file->f_lock); @@ -1191,6 +1254,207 @@ static inline bool chain_epi_lockless(struct epitem *epi) return true; } +#ifdef CONFIG_XCALL_PREFETCH +static inline bool transition_state(struct prefetch_item *pfi, + enum cache_state old, enum cache_state new) +{ + return atomic_cmpxchg(&pfi->state, old, new) == old; +} + +static int xcall_read(struct prefetch_item *pfi, unsigned int fd, + char __user *buf, size_t count) +{ + ssize_t copy_ret = -1; + ssize_t copy_len = 0; + + while (!transition_state(pfi, XCALL_CACHE_READY, XCALL_CACHE_CANCEL)) { + if (transition_state(pfi, XCALL_CACHE_NONE, XCALL_CACHE_CANCEL)) + goto reset_pfi_and_retry_vfs_read; + } + + copy_len = pfi->len; + if (unlikely(copy_len < 0)) + goto reset_pfi_and_retry_vfs_read; + + if (copy_len == 0) { + copy_ret = 0; + transition_state(pfi, XCALL_CACHE_CANCEL, XCALL_CACHE_NONE); + goto hit_return; + } + + copy_len = (copy_len >= count) ? count : copy_len; + copy_ret = copy_to_user(buf, (void *)(pfi->cache + pfi->pos), copy_len); + pfi->len -= copy_len; + if (pfi->len <= 0) { + pfi->len = 0; + pfi->pos = 0; + transition_state(pfi, XCALL_CACHE_CANCEL, XCALL_CACHE_NONE); + } else if (pfi->len > 0) { + pfi->pos += copy_len; + transition_state(pfi, XCALL_CACHE_CANCEL, XCALL_CACHE_READY); + } +hit_return: + this_cpu_inc(xcall_cache_hit); + trace_epoll_rc_hit(fd, copy_len); + if (copy_ret == 0) + return copy_len; + else + return -EBADF; + +reset_pfi_and_retry_vfs_read: + this_cpu_inc(xcall_cache_miss); + trace_epoll_rc_miss(fd); + pfi->len = 0; + pfi->pos = 0; + cancel_work(&pfi->work); + + return -EAGAIN; +} + +int xcall_read_begin(struct file *file, unsigned int fd, char __user *buf, + size_t count) +{ + struct prefetch_item *pfi = NULL; + + if (!current->xcall_select || + !test_bit(__NR_epoll_pwait, current->xcall_select)) + return -EAGAIN; + + if (!file) + return -EAGAIN; + + pfi = find_prefetch_item(file); + if (!pfi) + return -EAGAIN; + + return xcall_read(pfi, fd, buf, count); +} + +void xcall_read_end(struct file *file) +{ + struct prefetch_item *pfi = NULL; + + if (!current->xcall_select || + !test_bit(__NR_epoll_pwait, current->xcall_select)) + return; + + if (!file) + return; + + pfi = find_prefetch_item(file); + if (!pfi) + return; + + transition_state(pfi, XCALL_CACHE_CANCEL, XCALL_CACHE_NONE); +} + +static void prefetch_work_fn(struct work_struct *work) +{ + struct prefetch_item *pfi = container_of(work, struct prefetch_item, work); + + if (!transition_state(pfi, XCALL_CACHE_NONE, XCALL_CACHE_PREFETCH)) + return; + + trace_epoll_rc_prefetch(pfi->fd, smp_processor_id()); + pfi->len = kernel_read(pfi->file, pfi->cache, + (1UL << cache_pages_order) * PAGE_SIZE, + &pfi->file->f_pos); + transition_state(pfi, XCALL_CACHE_PREFETCH, XCALL_CACHE_READY); + trace_epoll_rc_ready(pfi->fd, pfi->len); +} + +static void set_prefetch_numa_cpu(struct prefetch_item *pfi, int fd) +{ + int cpu = smp_processor_id(); + int node = numa_node_id(); + + cpumask_and(&pfi->related_cpus, cpu_cpu_mask(cpu), cpu_online_mask); + mutex_lock(&xcall_numa_entries[node].lock); + cpumask_and(&pfi->related_cpus, &pfi->related_cpus, + xcall_numa_entries[node].mask); + mutex_unlock(&xcall_numa_entries[node].lock); + pfi->cpu = cpumask_next(fd % cpumask_weight(&pfi->related_cpus), + &pfi->related_cpus); +} + +static int get_async_prefetch_cpu(struct prefetch_item *pfi) +{ + int cpu; + + if (pfi->cpu != smp_processor_id()) + return 
pfi->cpu; + + cpu = cpumask_next(pfi->cpu, &pfi->related_cpus); + if (cpu > cpumask_last(&pfi->related_cpus)) + cpu = cpumask_first(&pfi->related_cpus); + pfi->cpu = cpu; + return pfi->cpu; +} + +static struct prefetch_item *alloc_prefetch_item(struct epitem *epi) +{ + struct file *tfile = epi->ffd.file; + struct prefetch_item *pfi; + int fd = epi->ffd.fd; + + pfi = kmalloc(sizeof(struct prefetch_item), GFP_KERNEL); + if (!pfi) + return NULL; + + pfi->cache_pages = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, + cache_pages_order); + if (!pfi->cache_pages) { + kfree(pfi); + return NULL; + } + + pfi->cache = page_address(pfi->cache_pages); + atomic_set(&pfi->state, XCALL_CACHE_NONE); + INIT_WORK(&pfi->work, prefetch_work_fn); + INIT_HLIST_NODE(&pfi->node); + pfi->fd = fd; + pfi->file = tfile; + pfi->len = 0; + pfi->pos = 0; + set_prefetch_numa_cpu(pfi, fd); + + write_lock(&xcall_table_lock); + hash_add(xcall_item_table, &pfi->node, hash_64((u64)tfile, PREFETCH_ITEM_HASH_BITS)); + write_unlock(&xcall_table_lock); + + return pfi; +} + +static void ep_prefetch_item_enqueue(struct eventpoll *ep, struct epitem *epi) +{ + struct prefetch_item *pfi; + int cpu; + + if (!current->xcall_select || + !test_bit(__NR_epoll_pwait, current->xcall_select)) + return; + + pfi = find_prefetch_item(epi->ffd.file); + if (unlikely(!pfi)) { + pfi = alloc_prefetch_item(epi); + if (unlikely(!pfi)) + return; + } + + if (!pfi->cache || !(epi->event.events & EPOLLIN) || + atomic_read(&pfi->state) != XCALL_CACHE_NONE) + return; + + cpu = get_async_prefetch_cpu(pfi); + queue_work_on(cpu, rc_work, &pfi->work); + trace_epoll_rc_queue(epi->ffd.fd, cpu); +} +#else +static void ep_prefetch_item_enqueue(struct eventpoll *ep, struct epitem *epi) +{ +} +#endif + /* * This is the callback that is passed to the wait queue wakeup * mechanism. 
It is called by the stored file descriptors when they @@ -1751,6 +2015,8 @@ static __poll_t ep_send_events_proc(struct eventpoll *ep, struct list_head *head if (!revents) continue; + ep_prefetch_item_enqueue(ep, epi); + if (__put_user(revents, &uevent->events) || __put_user(epi->event.data, &uevent->data)) { list_add(&epi->rdllink, head); @@ -2454,6 +2720,14 @@ static int __init eventpoll_init(void) pwq_cache = kmem_cache_create("eventpoll_pwq", sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL); +#ifdef CONFIG_XCALL_PREFETCH + rc_work = alloc_workqueue("eventpoll_rc", 0, 0); + if (!rc_work) + return -ENOMEM; + + hash_init(xcall_item_table); +#endif + return 0; } fs_initcall(eventpoll_init); diff --git a/fs/open.c b/fs/open.c index 96de0d3f1a8b500ca39a9e46856ce8c87e17d801..381d9ec6e52cd6d208086de69a7789a9607880f4 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1287,6 +1287,8 @@ int filp_close(struct file *filp, fl_owner_t id) return 0; } + free_prefetch_item(filp); + if (filp->f_op->flush) retval = filp->f_op->flush(filp, id); diff --git a/fs/proc/base.c b/fs/proc/base.c index 4e0054a37c4c54fa808264b1130ff2074d292bad..a759c660e8bf4f706f48ae86652a58dbac768975 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3589,6 +3589,361 @@ static const struct file_operations proc_pid_sg_level_operations = { }; #endif +#ifdef CONFIG_XCALL_PREFETCH +static atomic_t epoll_wait_select_count = ATOMIC_INIT(0); +void update_epoll_wait_select_count(struct task_struct *p, + unsigned int sc_no, bool add) +{ + if (sc_no != __NR_epoll_pwait) + return; + + if (!p->xcall_select || !test_bit(sc_no, p->xcall_select)) + return; + + if (add) { + atomic_inc(&epoll_wait_select_count); + pr_info("epoll_wait_select count add: %ld, %s\n", + atomic_read(&epoll_wait_select_count), p->comm); + } else { + atomic_dec(&epoll_wait_select_count); + pr_info("epoll_wait_select count sub: %ld, %s\n", + atomic_read(&epoll_wait_select_count), p->comm); + } +} + +int proc_adjust_cache_pages_order(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + if (write && atomic_read(&epoll_wait_select_count) > 0) + return -EPERM; + + return proc_dointvec_minmax(table, write, buffer, lenp, ppos); +} +#endif + +#ifdef CONFIG_FAST_SYSCALL +bool fast_syscall_enabled(void); + +static int xcall_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct task_struct *p; + unsigned int rs, re, sc_no; + + if (!fast_syscall_enabled()) + return -EACCES; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + if (!p->xcall_enable) + goto out; + + seq_printf(m, "Enabled Total[%d/%d]:", bitmap_weight(p->xcall_enable, __NR_syscalls), + __NR_syscalls); + + for (rs = 0, bitmap_next_set_region(p->xcall_enable, &rs, &re, __NR_syscalls); + rs < re; rs = re + 1, + bitmap_next_set_region(p->xcall_enable, &rs, &re, __NR_syscalls)) { + rs == (re - 1) ? 
seq_printf(m, "%d,", rs) : + seq_printf(m, "%d-%d,", rs, re - 1); + } + seq_puts(m, "\nAvailable:\n"); + + for (sc_no = 0; sc_no < __NR_syscalls; sc_no++) { + if (p->xcall_select && test_bit(sc_no, p->xcall_select)) { + seq_printf(m, "NR_syscall: %3d: enabled: %d ", + sc_no, test_bit(sc_no, p->xcall_enable)); + seq_printf(m, "xcall_select: %d\n", + test_bit(sc_no, p->xcall_select)); + } + } +out: + put_task_struct(p); + + return 0; +} + +static int xcall_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, xcall_show, inode); +} + +static int xcall_enable_one(struct task_struct *p, unsigned int sc_no) +{ + if (p->xcall_select && test_bit(sc_no, p->xcall_select)) + return -EINVAL; + + if (!bitmap_weight(p->xcall_enable, __NR_syscalls)) { + p->xcall_select = bitmap_zalloc(__NR_syscalls, GFP_KERNEL); + if (!p->xcall_select) + return -ENOMEM; + } + + bitmap_set(p->xcall_enable, sc_no, 1); + return 0; +} + +static int xcall_disable_one(struct task_struct *p, unsigned int sc_no) +{ + if (p->xcall_select && test_bit(sc_no, p->xcall_select)) + return -EINVAL; + + bitmap_clear(p->xcall_enable, sc_no, 1); + return 0; +} + +static int xcall_select_table(struct task_struct *p, unsigned int sc_no) +{ + if (!p->xcall_select || !test_bit(sc_no, p->xcall_enable)) { + pr_err("Please enable NR_syscall: %d to xcall first.\n", sc_no); + return -EINVAL; + } + + if (p->xcall_select && test_bit(sc_no, p->xcall_select)) + return -EINVAL; + + bitmap_set(p->xcall_select, sc_no, 1); + update_epoll_wait_select_count(p, sc_no, true); + + return 0; +} + +static ssize_t xcall_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + struct inode *inode = file_inode(file); + struct task_struct *p; + char buffer[TASK_COMM_LEN]; + const size_t maxlen = sizeof(buffer) - 1; + unsigned int sc_no = __NR_syscalls; + int ret = 0; + int is_clear = 0, is_switch = 0; + + if (!fast_syscall_enabled()) + return -EACCES; + + memset(buffer, 0, sizeof(buffer)); + if (!count || copy_from_user(buffer, buf, count > maxlen ? maxlen : count)) + return -EFAULT; + + p = get_proc_task(inode); + if (!p || !p->xcall_enable) + return -ESRCH; + + if (buffer[0] == '!') + is_clear = 1; + else if ((buffer[0] == '@')) + is_switch = 1; + + if (kstrtouint(buffer + is_clear + is_switch, 10, &sc_no)) { + ret = -EINVAL; + goto out; + } + + if (sc_no >= __NR_syscalls) { + ret = -EINVAL; + goto out; + } + + if (is_switch && test_bit(sc_no, p->xcall_enable)) + ret = xcall_select_table(p, sc_no); + else if (!is_switch && !is_clear && !test_bit(sc_no, p->xcall_enable)) + ret = xcall_enable_one(p, sc_no); + else if (!is_switch && is_clear && test_bit(sc_no, p->xcall_enable)) + ret = xcall_disable_one(p, sc_no); + else + ret = -EINVAL; + +out: + put_task_struct(p); + + return ret ? 
ret : count; +} + +static const struct file_operations proc_pid_xcall_operations = { + .open = xcall_open, + .read = seq_read, + .write = xcall_write, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + +#ifdef CONFIG_XCALL_PREFETCH +static ssize_t xcall_stats_write(struct file *file, const char __user *buf, + size_t count, loff_t *pos) +{ + int cpu; + + for_each_cpu(cpu, cpu_online_mask) { + *per_cpu_ptr(&xcall_cache_hit, cpu) = 0; + *per_cpu_ptr(&xcall_cache_miss, cpu) = 0; + } + + return count; +} + +static int xcall_stats_show(struct seq_file *m, void *v) +{ + unsigned long hit = 0, miss = 0; + unsigned int cpu; + u64 percent; + + for_each_cpu(cpu, cpu_online_mask) { + hit = *per_cpu_ptr(&xcall_cache_hit, cpu); + miss = *per_cpu_ptr(&xcall_cache_miss, cpu); + + if (hit == 0 && miss == 0) + continue; + + percent = (hit * 10000ULL) / (hit + miss); + seq_printf(m, "cpu%d epoll cache_{hit,miss}: %ld,%ld, hit ratio: %3llu.%02llu%%\n", + cpu, hit, miss, percent / 100, percent % 100); + } + return 0; +} + +static int xcall_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, xcall_stats_show, NULL); +} + +static const struct proc_ops xcall_stats_fops = { + .proc_open = xcall_stats_open, + .proc_read = seq_read, + .proc_write = xcall_stats_write, + .proc_lseek = seq_lseek, + .proc_release = single_release +}; + +struct numa_mask_entry *xcall_numa_entries; +static int nr_numa_nodes; + +static ssize_t xcall_numa_masks_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + char *input, *token, *tmp; + struct cpumask tmp_mask; + int node, ret = 0; + + input = kzalloc(count + 1, GFP_KERNEL); + if (!input) + return -ENOMEM; + + if (copy_from_user(input, buf, count)) { + ret = -EFAULT; + goto out; + } + + input[count] = '\0'; + tmp = input; + + for_each_online_node(node) { + token = strsep(&tmp, ":"); + if (!token || node >= nr_numa_nodes) { + ret = -EINVAL; + goto out; + } + + if (cpulist_parse(token, &tmp_mask)) { + ret = -EINVAL; + goto out; + } + cpumask_and(&tmp_mask, &tmp_mask, cpumask_of_node(node)); + if (cpumask_empty(&tmp_mask)) { + ret = -EINVAL; + goto out; + } + + mutex_lock(&xcall_numa_entries[node].lock); + cpumask_copy(xcall_numa_entries[node].mask, &tmp_mask); + mutex_unlock(&xcall_numa_entries[node].lock); + } + +out: + kfree(input); + return ret ? 
ret : count; +} + +static ssize_t xcall_numa_masks_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + char *output, *tmp; + int node, ret = 0; + size_t len = 0; + + tmp = kzalloc(num_possible_cpus() * 5, GFP_KERNEL); + output = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!output) + return -ENOMEM; + + for_each_online_node(node) { + mutex_lock(&xcall_numa_entries[node].lock); + cpumap_print_to_pagebuf(true, tmp, xcall_numa_entries[node].mask); + mutex_unlock(&xcall_numa_entries[node].lock); + + len += scnprintf(output + len, PAGE_SIZE - len, + "numa %d: %s", node, tmp); + if (len >= PAGE_SIZE) + break; + } + + ret = simple_read_from_buffer(buf, count, ppos, output, len); + kfree(output); + kfree(tmp); + return ret; +} + +static const struct proc_ops xcall_numa_mask_fops = { + .proc_read = xcall_numa_masks_read, + .proc_write = xcall_numa_masks_write, +}; + +static int __init xcall_numa_masks_init(void) +{ + int node; + + nr_numa_nodes = nr_online_nodes; + xcall_numa_entries = kcalloc(nr_numa_nodes, sizeof(*xcall_numa_entries), + GFP_KERNEL); + if (!xcall_numa_entries) + return -ENOMEM; + + for_each_online_node(node) { + if (!alloc_cpumask_var(&xcall_numa_entries[node].mask, GFP_KERNEL)) + goto err_free; + cpumask_copy(xcall_numa_entries[node].mask, cpumask_of_node(node)); + mutex_init(&xcall_numa_entries[node].lock); + } + + return 0; + +err_free: + while (--node >= 0) + free_cpumask_var(xcall_numa_entries[node].mask); + kfree(xcall_numa_entries); + return -ENOMEM; +} + +static int __init init_xcall_stats_procfs(void) +{ + struct proc_dir_entry *xcall_proc_dir; + + if (!fast_syscall_enabled()) + return 0; + + xcall_proc_dir = proc_mkdir("xcall", NULL); + proc_create("stats", 0444, xcall_proc_dir, &xcall_stats_fops); + proc_create("numa_mask", 0644, xcall_proc_dir, &xcall_numa_mask_fops); + xcall_numa_masks_init(); + + return 0; +} +device_initcall(init_xcall_stats_procfs); +#endif + /* * Thread groups */ @@ -3615,6 +3970,9 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_QOS_SCHED_SMART_GRID REG("smart_grid_level", 0644, proc_pid_sg_level_operations), #endif +#ifdef CONFIG_FAST_SYSCALL + REG("xcall", 0644, proc_pid_xcall_operations), +#endif #ifdef CONFIG_SCHED_AUTOGROUP REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations), #endif diff --git a/fs/read_write.c b/fs/read_write.c index da03b3e65cf3be6ab98bc26302e8ac9109ebef8f..72f75ab23f579d471b905efe4a50cbfb9231960f 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -622,6 +622,13 @@ ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count) struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; +#ifdef CONFIG_XCALL_PREFETCH + ret = xcall_read_begin(f.file, fd, buf, count); + if (ret != -EAGAIN) { + fdput_pos(f); + return ret; + } +#endif if (f.file) { loff_t pos, *ppos = file_ppos(f.file); if (ppos) { @@ -632,6 +639,7 @@ ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count) if (ret >= 0 && ppos) f.file->f_pos = pos; fdput_pos(f); + xcall_read_end(f.file); } return ret; } diff --git a/include/linux/fs.h b/include/linux/fs.h index a0ea6b64c45d081297340c0be79453ad10a59df8..86fc23c96c31f6b37596c4ae0e5a7ffff95d1df1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3750,4 +3750,53 @@ static inline bool cachefiles_ondemand_is_enabled(void) } #endif +#ifdef CONFIG_XCALL_PREFETCH +enum cache_state { + XCALL_CACHE_NONE = 0, + XCALL_CACHE_QUEUED, + XCALL_CACHE_PREFETCH, + XCALL_CACHE_READY, + XCALL_CACHE_CANCEL +}; + +struct prefetch_item { + struct file *file; + 
int fd; + struct work_struct work; + int cpu; + cpumask_t related_cpus; + struct page *cache_pages; + char *cache; + ssize_t len; + /* cache state in epoll_wait */ + atomic_t state; + loff_t pos; + struct hlist_node node; +}; + +struct numa_mask_entry { + cpumask_var_t mask; + struct mutex lock; +}; + +DECLARE_PER_CPU_ALIGNED(unsigned long, xcall_cache_hit); +DECLARE_PER_CPU_ALIGNED(unsigned long, xcall_cache_miss); + +extern int cache_pages_order; +extern struct numa_mask_entry *xcall_numa_entries; +int xcall_read_begin(struct file *file, unsigned int fd, char __user *buf, + size_t count); +void xcall_read_end(struct file *file); +void free_prefetch_item(struct file *file); +int proc_adjust_cache_pages_order(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); +void update_epoll_wait_select_count(struct task_struct *p, unsigned int sc_no, + bool add); +#else +static inline void xcall_read_end(struct file *file) {} +static inline void free_prefetch_item(struct file *file) {} +static inline void update_epoll_wait_select_count(struct task_struct *p, + unsigned int sc_no, bool add) {} +#endif + #endif /* _LINUX_FS_H */ diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 754f67ac4326a7f9cd33efe117911abc305be978..ad08a37f3bc0791a9817c1a62c848d43d8aa03ca 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -86,6 +86,11 @@ void irq_exit(void); */ void irq_exit_rcu(void); +#ifdef CONFIG_FAST_IRQ +void xint_enter(void); +void xint_exit(void); +#endif + #ifndef arch_nmi_enter #define arch_nmi_enter() do { } while (0) #define arch_nmi_exit() do { } while (0) diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 88b02e3b81da7e4510a7418444666f578a9a41d3..d94b013a091c76b914e73bd6c02510a8ab3292cd 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -762,6 +762,19 @@ static inline enum gic_intid_range __get_intid_range(irq_hw_number_t hwirq) } } +#ifdef CONFIG_FAST_IRQ +extern bool is_xint_support; + +enum xint_op { + XINT_TO_IRQ, + IRQ_TO_XINT, + XINT_SET_CHECK, + XINT_RANGE_CHECK, +}; + +void register_irqchip_proc(struct irq_desc *desc, void *irqp); +void unregister_irqchip_proc(struct irq_desc *desc); +#endif #endif #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index e3170b7f81fab260de0506e3f5ea136c8902be6f..a377bae2064e11aecfcb795af507f649025850fc 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1477,8 +1477,13 @@ struct task_struct { #else KABI_RESERVE(14) #endif +#if defined(CONFIG_FAST_SYSCALL) + KABI_USE(15, unsigned long *xcall_enable) + KABI_USE(16, unsigned long *xcall_select) +#else KABI_RESERVE(15) KABI_RESERVE(16) +#endif KABI_AUX_PTR(task_struct) /* CPU-specific state of this task: */ diff --git a/include/trace/events/fs.h b/include/trace/events/fs.h index ee82dad9d9dadc1f7d9bfe715a1e489ed5a2bd9c..d1796e9469abc98e3a1c722286e8abf7921b4957 100644 --- a/include/trace/events/fs.h +++ b/include/trace/events/fs.h @@ -29,5 +29,98 @@ DECLARE_TRACE(fs_file_release, #endif /* _TRACE_FS_H */ +TRACE_EVENT(epoll_rc_queue, + + TP_PROTO(int fd, int cpu), + + TP_ARGS(fd, cpu), + + TP_STRUCT__entry( + __field(int, fd) + __field(int, cpu) + ), + + TP_fast_assign( + __entry->fd = fd; + __entry->cpu = cpu; + ), + + TP_printk("%d on cpu %d", __entry->fd, __entry->cpu) +); + +TRACE_EVENT(epoll_rc_prefetch, + + TP_PROTO(int fd, int cpu), + + TP_ARGS(fd, cpu), + + TP_STRUCT__entry( + __field(int, fd) + __field(int, cpu) + ), + + 
TP_fast_assign( + __entry->fd = fd; + __entry->cpu = cpu; + ), + + TP_printk("%d on cpu %d", __entry->fd, __entry->cpu) +); + +TRACE_EVENT(epoll_rc_ready, + + TP_PROTO(int fd, ssize_t len), + + TP_ARGS(fd, len), + + TP_STRUCT__entry( + __field(int, fd) + __field(ssize_t, len) + ), + + TP_fast_assign( + __entry->fd = fd; + __entry->len = len; + ), + + TP_printk("%d, len %d", __entry->fd, __entry->len) +); + +TRACE_EVENT(epoll_rc_hit, + + TP_PROTO(int fd, ssize_t len), + + TP_ARGS(fd, len), + + TP_STRUCT__entry( + __field(int, fd) + __field(ssize_t, len) + ), + + TP_fast_assign( + __entry->fd = fd; + __entry->len = len; + ), + + TP_printk("%d, len: %d", __entry->fd, __entry->len) +); + +TRACE_EVENT(epoll_rc_miss, + + TP_PROTO(int fd), + + TP_ARGS(fd), + + TP_STRUCT__entry( + __field(int, fd) + ), + + TP_fast_assign( + __entry->fd = fd; + ), + + TP_printk("%d", __entry->fd) +); + /* This part must be outside protection */ #include diff --git a/kernel/fork.c b/kernel/fork.c index 9b1ea79deaa52ee5d6c56390264216e6f86c42cc..afd84458f37c7ff010529a3b0c4ed38267b4f7be 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -479,6 +479,16 @@ void free_task(struct task_struct *tsk) #endif if (task_relationship_used()) sched_relationship_free(tsk); + +#ifdef CONFIG_FAST_SYSCALL + if (tsk->xcall_enable) + bitmap_free(tsk->xcall_enable); + + if (tsk->xcall_select) + bitmap_free(tsk->xcall_select); +#endif + update_epoll_wait_select_count(tsk, __NR_epoll_pwait, false); + free_task_struct(tsk); } EXPORT_SYMBOL(free_task); @@ -1007,6 +1017,12 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) #ifdef CONFIG_MEMCG tsk->active_memcg = NULL; #endif + +#ifdef CONFIG_FAST_SYSCALL + tsk->xcall_enable = NULL; + tsk->xcall_select = NULL; +#endif + return tsk; free_stack: @@ -2085,6 +2101,23 @@ static __latent_entropy struct task_struct *copy_process( rt_mutex_init_task(p); +#ifdef CONFIG_FAST_SYSCALL + p->xcall_enable = bitmap_zalloc(__NR_syscalls, GFP_KERNEL); + if (!p->xcall_enable) + goto bad_fork_free; + + if (current->xcall_enable) + bitmap_copy(p->xcall_enable, current->xcall_enable, __NR_syscalls); + + if (current->xcall_select) { + p->xcall_select = bitmap_zalloc(__NR_syscalls, GFP_KERNEL); + if (!p->xcall_select) + goto bad_fork_free; + + bitmap_copy(p->xcall_select, current->xcall_select, __NR_syscalls); + } +#endif + #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY retval = sched_prefer_cpus_fork(p, current->prefer_cpus); if (retval) diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index e4cff358b437ebf4afef594d4d927846990904db..a4a7f87eab39faedc2d3138eeeb64ce2fdacefde 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -236,6 +236,34 @@ void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc) &dfs_irq_ops); } +#ifdef CONFIG_FAST_IRQ +static struct dentry *xint_dir; + +void xint_add_debugfs_entry(unsigned int irq) +{ + char name[10]; + char buf[100]; + + if (!xint_dir) + return; + + sprintf(name, "%d", irq); + sprintf(buf, "../irqs/%d", irq); + debugfs_create_symlink(name, xint_dir, buf); +} + +void xint_remove_debugfs_entry(unsigned int irq) +{ + char name[10]; + + if (!xint_dir) + return; + + sprintf(name, "%d", irq); + debugfs_lookup_and_remove(name, xint_dir); +} +#endif + static int __init irq_debugfs_init(void) { struct dentry *root_dir; @@ -247,6 +275,11 @@ static int __init irq_debugfs_init(void) irq_dir = debugfs_create_dir("irqs", root_dir); +#ifdef CONFIG_FAST_IRQ + if (is_xint_support) + xint_dir = debugfs_create_dir("xints", root_dir); +#endif + 
irq_lock_sparse(); for_each_active_irq(irq) irq_add_debugfs_entry(irq, irq_to_desc(irq)); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 48d6aa8cdbed7f64086e68d67e14414fda9b8be5..d725d8ef5ce7fe1b1f51264e7b538178b72af93e 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -492,6 +492,14 @@ static inline void irq_remove_debugfs_entry(struct irq_desc *desc) debugfs_remove(desc->debugfs_file); kfree(desc->dev_name); } + +#ifdef CONFIG_FAST_IRQ +extern bool is_xint_support; + +void xint_add_debugfs_entry(unsigned int irq); +void xint_remove_debugfs_entry(unsigned int irq); +#endif + void irq_debugfs_copy_devname(int irq, struct device *dev); # ifdef CONFIG_IRQ_DOMAIN void irq_domain_debugfs_init(struct dentry *root); @@ -507,6 +515,16 @@ static inline void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *d) static inline void irq_remove_debugfs_entry(struct irq_desc *d) { } + +#ifdef CONFIG_FAST_IRQ +static inline void xint_add_debugfs_entry(unsigned int irq) +{ +} +static inline void xint_remove_debugfs_entry(unsigned int irq) +{ +} +#endif + static inline void irq_debugfs_copy_devname(int irq, struct device *dev) { } diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 6c009a033c73fbfa010a96e8e9c2502c8cc8d5f4..5dc976d32c7443539a3038ef54b6f20c01fedced 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -658,6 +658,10 @@ int generic_handle_irq(unsigned int irq) EXPORT_SYMBOL_GPL(generic_handle_irq); #ifdef CONFIG_HANDLE_DOMAIN_IRQ +#ifdef CONFIG_FAST_IRQ +extern DECLARE_BITMAP(irqnr_xint_map, 1024); +#endif + /** * __handle_domain_irq - Invoke the handler for a HW irq belonging to a domain * @domain: The domain where to perform the lookup @@ -673,8 +677,16 @@ int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq, struct pt_regs *old_regs = set_irq_regs(regs); unsigned int irq = hwirq; int ret = 0; +#ifdef CONFIG_FAST_IRQ + int is_xint = test_bit(hwirq, irqnr_xint_map); + if (is_xint) + xint_enter(); + else + irq_enter(); +#else irq_enter(); +#endif #ifdef CONFIG_IRQ_DOMAIN if (lookup) @@ -692,7 +704,14 @@ int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq, generic_handle_irq(irq); } +#ifdef CONFIG_FAST_IRQ + if (is_xint) + xint_exit(); + else + irq_exit(); +#else irq_exit(); +#endif set_irq_regs(old_regs); return ret; } diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 0df62a3a1f3742a6f8c6e00e81477b85d1ac0113..64805a37c57698f38330e1e3c3b0dc1191781fc4 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -13,6 +13,10 @@ #include #include +#ifdef CONFIG_FAST_IRQ +#include +#endif + #include "internals.h" /* @@ -331,6 +335,9 @@ void register_handler_proc(unsigned int irq, struct irqaction *action) action->dir = proc_mkdir(name, desc->dir); } +void __weak register_irqchip_proc(struct irq_desc *desc, void *irqp) { } +void __weak unregister_irqchip_proc(struct irq_desc *desc) { } + #undef MAX_NAMELEN #define MAX_NAMELEN 10 @@ -385,6 +392,7 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) #endif proc_create_single_data("spurious", 0444, desc->dir, irq_spurious_proc_show, (void *)(long)irq); + register_irqchip_proc(desc, irqp); out_unlock: mutex_unlock(®ister_lock); @@ -408,6 +416,8 @@ void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) #endif remove_proc_entry("spurious", desc->dir); + unregister_irqchip_proc(desc); + sprintf(name, "%u", irq); remove_proc_entry(name, root_irq_dir); } diff --git a/kernel/softirq.c b/kernel/softirq.c index 
4196b9f84690066a75f6e67bdefbe4ff79eb0336..9fc69e6e2c11734a8033a0f8d9cc0867d1154381 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -345,6 +345,42 @@ asmlinkage __visible void do_softirq(void) local_irq_restore(flags); } +#ifdef CONFIG_FAST_IRQ +/** + * xint_enter_rcu - Copy from irq_enter_rcu + */ +void xint_enter_rcu(void) +{ + if (tick_nohz_full_cpu(smp_processor_id()) || + (is_idle_task(current) && !in_interrupt())) { + /* + * Prevent raise_softirq from needlessly waking up ksoftirqd + * here, as softirq will be serviced on return from interrupt. + */ + local_bh_disable(); + tick_irq_enter(); + _local_bh_enable(); + } + +#ifndef CONFIG_DEBUG_FEATURE_BYPASS + account_irq_enter_time(current); +#endif + preempt_count_add(HARDIRQ_OFFSET); +#ifndef CONFIG_DEBUG_FEATURE_BYPASS + lockdep_hardirq_enter(); +#endif +} + +/** + * irq_enter - Copy from irq_enter + */ +void xint_enter(void) +{ + rcu_irq_enter(); + xint_enter_rcu(); +} +#endif + /** * irq_enter_rcu - Enter an interrupt context with RCU watching */ @@ -411,6 +447,43 @@ static inline void tick_irq_exit(void) #endif } +#ifdef CONFIG_FAST_IRQ +static inline void __xint_exit_rcu(void) +{ +#ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED + local_irq_disable(); +#else +#ifndef CONFIG_DEBUG_FEATURE_BYPASS + lockdep_assert_irqs_disabled(); +#endif +#endif + +#ifndef CONFIG_DEBUG_FEATURE_BYPASS + account_irq_exit_time(current); +#endif + preempt_count_sub(HARDIRQ_OFFSET); + if (!in_interrupt() && local_softirq_pending()) + invoke_softirq(); + + tick_irq_exit(); +} + +/** + * xint_exit - Copy from irq_exit + * + * Also processes softirqs if needed and possible. + */ +void xint_exit(void) +{ + __xint_exit_rcu(); + rcu_irq_exit(); + /* must be last! */ +#ifndef CONFIG_DEBUG_FEATURE_BYPASS + lockdep_hardirq_exit(); +#endif +} +#endif + static inline void __irq_exit_rcu(void) { #ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b4b36f8a3149193bca94a074d4ca2563c090463c..8f07ac8c86922f79cdd61d708b34c5edddcfaf77 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2861,6 +2861,16 @@ static struct ctl_table kern_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = &hundred_thousand, }, +#endif +#ifdef CONFIG_XCALL_PREFETCH + { .procname = "xcall_cache_pages_order", + .data = &cache_pages_order, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_adjust_cache_pages_order, + .extra1 = SYSCTL_ZERO, + .extra2 = &four, + }, #endif { } };
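For completeness, a similarly hedged sketch of the fast-IRQ side: per-interrupt switching is done through /proc/irq/<N>/xint (created by register_irqchip_proc() in irq-gic-v3.c), and the prefetch hit/miss counters are exposed via /proc/xcall/stats (fs/proc/base.c). This example is not part of the patch; it assumes CONFIG_FAST_IRQ and CONFIG_XCALL_PREFETCH, a GICv3 system booted with "xint", and a hypothetical interrupt number that falls in the SGI/SPI ranges accepted by can_set_xint().

```c
/*
 * Hypothetical usage sketch for the xint and stats proc interfaces
 * added above. IRQ_NUM is an assumed, illustrative interrupt number.
 */
#include <stdio.h>

#define IRQ_NUM 45	/* hypothetical Linux interrupt number */

int main(void)
{
	char path[64], line[256];
	FILE *f;

	/* Mark IRQ_NUM as an xint so it takes the lightweight EL0 path. */
	snprintf(path, sizeof(path), "/proc/irq/%d/xint", IRQ_NUM);
	f = fopen(path, "w");
	if (f) {
		fputs("1", f);	/* writing "0" switches it back to a normal irq */
		fclose(f);
	}

	/* Dump the per-cpu prefetch cache hit/miss counters. */
	f = fopen("/proc/xcall/stats", "r");
	if (f) {
		while (fgets(line, sizeof(line), f))
			fputs(line, stdout);
		fclose(f);
	}
	return 0;
}
```

Note that xint_proc_write() rejects values other than 0/1 with -EINVAL, returns -EBUSY if the interrupt is already in the requested state, and -EPERM for interrupts outside the supported ranges, so a robust tool should check the write result.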